import gradio as gr
import spaces
import torch
import torchaudio
import io
import base64
import uuid
import os
import time
import re
import threading
import gc
import random
import numpy as np
from einops import rearrange
from huggingface_hub import login
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond
from gradio_client import Client, handle_file
from contextlib import contextmanager

# MelodyFlow Variations - extracted from variations.py
MELODYFLOW_VARIATIONS = {
    # Acoustic Instruments
    'accordion_folk': "Lively accordion music with a European folk feeling, perfect for a travel documentary about traditional culture and street performances in Paris",
    'banjo_bluegrass': "Authentic bluegrass banjo band performance with rich picking patterns, ideal for a heartfelt documentary about American rural life and traditional crafts",
    'piano_classical': "Expressive classical piano performance with dynamic range and emotional depth, ideal for a luxury brand commercial",
    'celtic': "Traditional Celtic arrangement with fiddle and flute, perfect for a documentary about Ireland's stunning landscapes and ancient traditions",
    'strings_quartet': "Elegant string quartet arrangement with rich harmonies and expressive dynamics, perfect for wedding ceremony music",

    # Synthesizer Variations
    'synth_retro': "1980s style synthesizer melody with warm analog pads and arpeggios, perfect for a nostalgic sci-fi movie soundtrack",
    'synth_modern': "Modern electronic production with crisp digital synthesizer arpeggios and vocoder effects, ideal for a tech product launch video",
    'synth_ambient': "Atmospheric synthesizer pads with reverb and delay, perfect for a meditation app or wellness commercial",
    'synth_edm': "High-energy EDM synth saw leads with sidechain compression, pitch bends, perfect for sports highlights or action sequences",

    # Band Arrangements
    'rock_band': "Full rock band arrangement with electric guitars, bass, and drums, perfect for an action movie trailer",

    # Hybrid/Special
    'cinematic_epic': "Epic orchestral arrangement with modern hybrid elements, synthesizers, and percussion, perfect for movie trailers",
    'lofi_chill': "Lo-fi hip hop style with vinyl crackle, mellow piano, and tape saturation, perfect for study or focus playlists",
    'synth_bass': "Deep analog synthesizer bassline with modern production and subtle modulation, perfect for electronic music production",
    'retro_rpg': "16-bit era JRPG soundtrack with bright melodic synthesizers, orchestral elements, and adventurous themes, perfect for a fantasy video game battle scene or overworld exploration",
    'steel_drums': "Vibrant Caribbean steel drum ensemble with tropical percussion and uplifting melodies, perfect for a beach resort commercial or travel documentary",
    'chiptune': "8-bit video game soundtrack with arpeggiated melodies and classic NES-style square waves, perfect for a retro platformer or action game",
    'gamelan_fusion': "Indonesian gamelan ensemble with metallic percussion, gongs, and ethereal textures, perfect for a meditation app or spiritual documentary",
    'music_box': "Delicate music box melody with gentle bell tones and ethereal ambiance, perfect for a children's lullaby or magical fantasy scene",

    # Hip Hop / Trap Percussion
    'trap_808': "808 bass",
    'lo_fi_drums': "lofi hiphop percussion",
    'boom_bap': "Classic 90s boom bap hip hop drums with punchy kicks, crisp snares, and jazz sample chops, perfect for documentary footage of urban street scenes and skateboarding",
    'percussion_ensemble': "Rich percussive ensemble with djembe, congas, shakers, and tribal drums creating complex polyrhythms, perfect for nature documentaries about rainforests or ancient cultural rituals",

    # Enhanced Electronic Music
    'future_bass': "Energetic future bass with filtered supersaws, pitch-bending lead synths, heavy sidechain, and chopped vocal samples, perfect for extreme sports highlights or uplifting motivational content",
    'synthwave_retro': "80s retrofuturistic synthwave with gated reverb drums, analog arpeggios, neon-bright lead synths and driving bass, perfect for cyberpunk-themed technology showcases or retro gaming montages",
    'melodic_techno': "Hypnotic melodic techno with pulsing bass, atmospheric pads, and evolving synthesizer sequences with subtle filter modulation, ideal for timelapse footage of urban nightscapes or architectural showcases",
    'dubstep_wobble': "Heavy dubstep with aggressive wobble bass, metallic synthesizers, distorted drops, and tension-building risers, perfect for action sequence transitions or gaming highlight reels",

    # Glitchy Effects
    'glitch_hop': "Glitch hop with stuttering sample slices, bit-crushed percussion, granular synthesis textures and digital artifacts, perfect for technology malfunction scenes or data visualization animations",
    'digital_disruption': "Heavily glitched soundscape with digital artifacts, buffer errors, granular time stretching, and corrupted audio samples, ideal for cybersecurity themes or digital distortion transitions in tech presentations",
    'circuit_bent': "Circuit-bent toy sounds with unpredictable pitch shifts, broken electronic tones, and hardware malfunction artifacts, perfect for creative coding demonstrations or innovative technology exhibitions",

    # Experimental Hybrids
    'orchestral_glitch': "Cinematic orchestral elements disrupted by digital glitches, granular textures, and temporal distortions, perfect for science fiction trailers or futuristic product reveals with contrasting classical and modern elements",
    'vapor_drums': "Vaporwave drum processing with extreme pitch and time manipulation, reverb-drenched samples, and retro commercial music elements, ideal for nostalgic internet culture documentaries or retrofuturistic art installations",
    'industrial_textures': "Harsh industrial soundscape with mechanical percussion, factory recordings, metallic impacts, and distorted synth drones, perfect for manufacturing process videos or dystopian urban environments",
    'jungle_breaks': "High-energy jungle drum breaks with choppy breakbeat samples, deep sub bass, and dub reggae influences, perfect for fast-paced urban chase scenes or extreme sports montages"
}

# Global model storage
model_cache = {}
model_lock = threading.Lock()


@contextmanager  # decorator was missing; the docstring and the contextlib import make the intent clear
def resource_cleanup():
    """Lightweight context manager - let zerogpu handle memory management."""
    try:
        yield
    finally:
        # Minimal cleanup - let zerogpu handle the heavy lifting
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        # Removed aggressive empty_cache() and gc.collect() calls
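
# Note: resource_cleanup() is not actually invoked anywhere in this file. A minimal
# usage sketch of how a @contextmanager helper like this would be used (illustrative
# only, not something the original app does):
#
#   with resource_cleanup():
#       output = generate_diffusion_cond(model, steps=8, ...)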


def load_stable_audio_model():
    """Load stable-audio-open-small model if not already loaded."""
    with model_lock:
        if 'stable_audio_model' not in model_cache:
            print("🔄 Loading stable-audio-open-small model...")
            load_start = time.time()

            # Authenticate with HF
            hf_token = os.getenv('HF_TOKEN')
            if hf_token:
                login(token=hf_token)
                print("✅ HF authenticated")

            # Load model
            model, config = get_pretrained_model("stabilityai/stable-audio-open-small")
            device = "cuda" if torch.cuda.is_available() else "cpu"
            model = model.to(device)
            if device == "cuda":
                model = model.half()

            load_time = time.time() - load_start
            print(f"✅ Model loaded on {device} in {load_time:.2f}s")

            # Aggressive model persistence - warm up with dummy generation
            print("🔥 Warming up model...")
            warmup_start = time.time()
            try:
                dummy_conditioning = [{"prompt": "test", "seconds_total": 12}]
                with torch.no_grad():
                    _ = generate_diffusion_cond(
                        model,
                        steps=1,  # Minimal steps for warmup
                        cfg_scale=1.0,
                        conditioning=dummy_conditioning,
                        sample_size=config["sample_size"],
                        sampler_type="pingpong",
                        device=device,
                        seed=42
                    )
                warmup_time = time.time() - warmup_start
                print(f"🔥 Model warmed up in {warmup_time:.2f}s")
            except Exception as e:
                print(f"⚠️ Warmup failed (but continuing): {e}")

            model_cache['stable_audio_model'] = model
            model_cache['stable_audio_config'] = config
            model_cache['stable_audio_device'] = device
            print("✅ Stable Audio model ready for fast generation!")
        else:
            print("♻️ Using cached model (should be fast!)")

    return (model_cache['stable_audio_model'],
            model_cache['stable_audio_config'],
            model_cache['stable_audio_device'])


@spaces.GPU  # assumption: ZeroGPU spaces need this decorator for CUDA access; the otherwise-unused `spaces` import suggests it was intended here
def generate_stable_audio_loop(prompt, loop_type, bpm, bars, steps, cfg_scale, seed=-1):
    """Generate a BPM-aware loop using stable-audio-open-small."""
    try:
        total_start = time.time()

        # Model loading timing
        load_start = time.time()
        model, config, device = load_stable_audio_model()
        load_time = time.time() - load_start

        # Calculate loop duration based on BPM and bars
        seconds_per_beat = 60.0 / bpm
        seconds_per_bar = seconds_per_beat * 4  # 4/4 time
        target_loop_duration = seconds_per_bar * bars

        # Enhance prompt based on loop type and BPM - minimal modification
        if loop_type == "drums":
            enhanced_prompt = f"{prompt} {bpm}bpm"
            # Comprehensive negative prompting for drums - exclude all melodic/harmonic content
            negative_prompt = "melody, harmony, pitched instruments, vocals, singing, piano, guitar, bass, synth, strings, horns, woodwinds, flute, saxophone, violin, cello, organ, keyboard, chords, notes, musical scale, tonal, melodic, harmonic"
        else:  # instruments
            enhanced_prompt = f"{prompt} {bpm}bpm"
            # Comprehensive negative prompting for instruments - exclude all percussive content
            negative_prompt = "drums, percussion, kick, snare, hi-hat, cymbals, tom, drum kit, rhythm section, beats, drumming, percussive, drum machine, 808, trap drums, boom bap drums, breakbeat, drum breaks, kick drum, snare drum"

        # Set seed
        if seed == -1:
            seed = random.randint(0, 2**32 - 1)
        torch.manual_seed(seed)
        if device == "cuda":
            torch.cuda.manual_seed(seed)

        print(f"🎵 Generating {loop_type} loop:")
        print(f"   Enhanced prompt: {enhanced_prompt}")
        print(f"   Target duration: {target_loop_duration:.2f}s ({bars} bars at {bpm}bpm)")
        print(f"   Steps: {steps}, CFG Scale: {cfg_scale}")
        print(f"   Seed: {seed}")

        # Prepare conditioning
        conditioning_start = time.time()
        conditioning = [{
            "prompt": enhanced_prompt,
            "seconds_total": 12  # Model generates 12s max
        }]
        negative_conditioning = [{
            "prompt": negative_prompt,
            "seconds_total": 12
        }]
        conditioning_time = time.time() - conditioning_start

        # Generation timing
        generation_start = time.time()

        # Clear GPU cache once before generation (not after)
        # if device == "cuda":
        #     torch.cuda.empty_cache()

        with torch.cuda.amp.autocast(enabled=(device == "cuda")):
            output = generate_diffusion_cond(
                model,
                steps=steps,  # User-configurable steps
                cfg_scale=cfg_scale,  # User-configurable CFG scale
                conditioning=conditioning,
                negative_conditioning=negative_conditioning,
                sample_size=config["sample_size"],
                sampler_type="pingpong",
                device=device,
                seed=seed
            )

        generation_time = time.time() - generation_start

        # Post-processing timing
        postproc_start = time.time()

        # Post-process audio
        output = rearrange(output, "b d n -> d (b n)")  # (2, N) stereo
        output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1)

        # Extract the loop portion
        sample_rate = config["sample_rate"]
        loop_samples = int(target_loop_duration * sample_rate)
        available_samples = output.shape[1]

        if loop_samples > available_samples:
            loop_samples = available_samples
            actual_duration = available_samples / sample_rate
            print(f"⚠️ Requested {target_loop_duration:.2f}s, got {actual_duration:.2f}s")

        # Extract loop from beginning (cleanest beat alignment)
        loop_output = output[:, :loop_samples]
        loop_output_int16 = loop_output.mul(32767).to(torch.int16).cpu()

        # Save to temporary file
        loop_filename = f"loop_{loop_type}_{bpm}bpm_{bars}bars_{seed}.wav"
        torchaudio.save(loop_filename, loop_output_int16, sample_rate)

        postproc_time = time.time() - postproc_start
        total_time = time.time() - total_start
        actual_duration = loop_samples / sample_rate

        # Detailed timing breakdown
        print("⏱️ Timing breakdown:")
        print(f"   Model load: {load_time:.2f}s")
        print(f"   Conditioning: {conditioning_time:.3f}s")
        print(f"   Generation: {generation_time:.2f}s")
        print(f"   Post-processing: {postproc_time:.3f}s")
        print(f"   Total: {total_time:.2f}s")
        print(f"✅ {loop_type.title()} loop: {actual_duration:.2f}s audio in {total_time:.2f}s")

        return loop_filename, f"Generated {actual_duration:.2f}s {loop_type} loop at {bpm}bpm ({bars} bars) in {total_time:.2f}s (steps: {steps}, cfg: {cfg_scale})"

    except Exception as e:
        print(f"❌ Generation error: {str(e)}")
        return None, f"Error: {str(e)}"


def combine_loops(drums_audio, instruments_audio, bpm, bars, num_repeats):
    """Combine drum and instrument loops with specified repetitions."""
    try:
        if not drums_audio and not instruments_audio:
            return None, "No audio files to combine"

        # Calculate timing
        seconds_per_beat = 60.0 / bpm
        seconds_per_bar = seconds_per_beat * 4
        loop_duration = seconds_per_bar * bars
        total_duration = loop_duration * num_repeats

        print("🎛️ Combining loops:")
        print(f"   Loop duration: {loop_duration:.2f}s ({bars} bars)")
        print(f"   Repeats: {num_repeats}")
        print(f"   Total duration: {total_duration:.2f}s")

        combined_audio = None
        sample_rate = None

        # Process each audio file
        for audio_path, audio_type in [(drums_audio, "drums"), (instruments_audio, "instruments")]:
            if audio_path:
                # Load audio
                waveform, sr = torchaudio.load(audio_path)
                if sample_rate is None:
                    sample_rate = sr

                # Ensure we have the exact loop duration
                target_samples = int(loop_duration * sr)
                if waveform.shape[1] > target_samples:
                    waveform = waveform[:, :target_samples]
                elif waveform.shape[1] < target_samples:
                    # Pad if necessary
                    padding = target_samples - waveform.shape[1]
                    waveform = torch.cat([waveform, torch.zeros(waveform.shape[0], padding)], dim=1)

                # Repeat the loop
                repeated_waveform = waveform.repeat(1, num_repeats)
                print(f"   {audio_type}: {waveform.shape[1]/sr:.2f}s repeated {num_repeats}x = {repeated_waveform.shape[1]/sr:.2f}s")

                # Add to combined audio
                if combined_audio is None:
                    combined_audio = repeated_waveform
                else:
                    combined_audio = combined_audio + repeated_waveform

        if combined_audio is None:
            return None, "No valid audio to combine"

        # Normalize to prevent clipping
        combined_audio = combined_audio / torch.max(torch.abs(combined_audio))
        combined_audio = combined_audio.clamp(-1, 1)

        # Convert to int16 and save
        combined_audio_int16 = combined_audio.mul(32767).to(torch.int16)
        combined_filename = f"combined_{bpm}bpm_{bars}bars_{num_repeats}loops_{random.randint(1000, 9999)}.wav"
        torchaudio.save(combined_filename, combined_audio_int16, sample_rate)

        actual_duration = combined_audio.shape[1] / sample_rate
        status = f"Combined into {actual_duration:.2f}s audio ({num_repeats} × {bars} bars at {bpm}bpm)"
        print(f"✅ {status}")

        return combined_filename, status

    except Exception as e:
        print(f"❌ Combine error: {str(e)}")
        return None, f"Combine error: {str(e)}"


def transform_with_melodyflow_api(audio_path, prompt, solver="euler", flowstep=0.12):
    """Transform audio using the facebook/MelodyFlow space API."""
    if audio_path is None:
        return None, "❌ No audio file provided"

    try:
        # Initialize client for Facebook MelodyFlow space
        client = Client("facebook/MelodyFlow")

        # Set steps based on solver
        if solver == "midpoint":
            base_steps = 128
            effective_steps = base_steps // 2  # 64 effective steps
        else:  # euler
            base_steps = 125
            effective_steps = base_steps // 5  # 25 effective steps

        print("🎛️ MelodyFlow transformation:")
        print(f"   Prompt: {prompt}")
        print(f"   Solver: {solver} ({effective_steps} effective steps)")
        print(f"   Flowstep: {flowstep}")

        # Call the MelodyFlow API
        result = client.predict(
            model="facebook/melodyflow-t24-30secs",
            text=prompt,
            solver=solver,
            steps=base_steps,
            target_flowstep=flowstep,
            regularize=solver == "euler",
            regularization_strength=0.2,
            duration=30,
            melody=handle_file(audio_path),
            api_name="/predict"
        )

        if result and len(result) > 0 and result[0]:
            # Save the result locally
            output_filename = f"melodyflow_transformed_{random.randint(1000, 9999)}.wav"
            import shutil
            shutil.copy2(result[0], output_filename)

            status_msg = f"✅ Transformed with prompt: '{prompt}' (flowstep: {flowstep}, {effective_steps} steps)"
            return output_filename, status_msg
        else:
            return None, "❌ MelodyFlow API returned no results"

    except Exception as e:
        return None, f"❌ MelodyFlow API error: {str(e)}"


def extend_with_musicgen_api(audio_path, prompt_duration, musicgen_model, output_duration):
    """Extend audio using the micro-slot-machine space API."""
    if audio_path is None:
        return None, "❌ No audio file provided"

    try:
        # Initialize client for micro-slot-machine space
        client = Client("thepatch/micro-slot-machine")

        print("🎼 MusicGen extension:")
        print(f"   Prompt duration: {prompt_duration} (type: {type(prompt_duration)})")
        print(f"   Model: {musicgen_model}")
        print(f"   Output duration: {output_duration} (type: {type(output_duration)})")

        # Call the continue_music API
        result = client.predict(
            input_audio_path=handle_file(audio_path),
            prompt_duration=prompt_duration,  # Integer from dropdown
            musicgen_model=musicgen_model,
            output_duration=float(output_duration),  # Ensure it's a float
            api_name="/continue_music"
        )

        if result:
            # Save the result locally
            output_filename = f"musicgen_extended_{random.randint(1000, 9999)}.wav"
            import shutil
            shutil.copy2(result, output_filename)

            status_msg = f"✅ Extended with {musicgen_model} (prompt: {prompt_duration}s, output: {output_duration}s)"
            return output_filename, status_msg
        else:
            return None, "❌ MusicGen API returned no results"

    except Exception as e:
        return None, f"❌ MusicGen API error: {str(e)}"


def calculate_optimal_bars(bpm):
    """Calculate optimal bar count for given BPM to fit in ~10s."""
    seconds_per_beat = 60.0 / bpm
    seconds_per_bar = seconds_per_beat * 4
    max_duration = 10.0

    for bars in [8, 4, 2, 1]:
        if seconds_per_bar * bars <= max_duration:
            return bars
    return 1
| """Calculate optimal bar count for given BPM to fit in ~10s""" | |
| seconds_per_beat = 60.0 / bpm | |
| seconds_per_bar = seconds_per_beat * 4 | |
| max_duration = 10.0 | |
| for bars in [8, 4, 2, 1]: | |
| if seconds_per_bar * bars <= max_duration: | |
| return bars | |
| return 1 | |
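
# Worked example of the bar math above (illustrative only, not part of the original app):
# one 4/4 bar lasts (60 / bpm) * 4 seconds, and the function picks the largest bar count
# from [8, 4, 2, 1] that keeps the loop at or under 10 seconds.
#   calculate_optimal_bars(120)  # one bar = 2.00s -> 4 bars = 8.0s   -> returns 4
#   calculate_optimal_bars(90)   # one bar = 2.67s -> 4 bars = 10.67s -> returns 2
#   calculate_optimal_bars(150)  # one bar = 1.60s -> 8 bars = 12.8s  -> returns 4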


def update_transform_prompt(variation_choice):
    """Update the transformation prompt based on variation selection."""
    if variation_choice == "custom":
        return gr.update(value="", placeholder="enter your custom transformation prompt", interactive=True)
    elif variation_choice in MELODYFLOW_VARIATIONS:
        return gr.update(value=MELODYFLOW_VARIATIONS[variation_choice], interactive=True)
    else:
        return gr.update(value="", placeholder="select a variation or enter custom prompt", interactive=True)


# ========== GRADIO INTERFACE ==========
with gr.Blocks(title="stable-melodyflow") as iface:
    gr.Markdown("# stable-melodyflow (aka jerry and terry)")
    gr.Markdown("**generate synchronized drum and instrument loops with stable-audio-open-small (jerry), then transform with melodyflow (terry)!**")

    # ========== MODELS & PROJECT INFO ==========
    with gr.Accordion("some info about these models", open=False):

        with gr.Accordion("🚀 stable-audio-open-small", open=False):
            gr.Markdown("""
            **stable-audio-open-small** is an incredibly fast model from the zachs and friends at Stability AI. It's capable of generating 12 seconds of audio in under a second, which opens up a lot of very interesting kinds of UX.

            **note about generation speed in this zerogpu space:** you'll notice generation times are a little slower here than if you were to use the model on a local gpu. that's just a result of the way zerogpu spaces work i think... let me know if there's a way to keep the model loaded in a zerogpu space!

            **links:**
            - 🤗 [model on HuggingFace](https://huggingface.co/stabilityai/stable-audio-open-small)

            there's a docker container at this repo that can be spun up as a standalone api specifically for stable-audio-open-small:
            - [stable-audio-api](https://github.com/betweentwomidnights/stable-audio-api)
            """)

        with gr.Accordion("🎛️ melodyflow", open=False):
            gr.Markdown("""
            **MelodyFlow** is a model by meta that uses regularized latent inversion to transform input audio.

            it's not officially part of the audiocraft repo yet, but we use it as a docker container in the backend for gary4live. i really enjoy turning my guitar riffs into an orchestra.

            **links:**
            - 🤗 [Official MelodyFlow Space](https://huggingface.co/spaces/Facebook/MelodyFlow)
            - [our melodyflow api](https://github.com/betweentwomidnights/melodyflow)
            """)

        with gr.Accordion("gary4live project", open=False):
            gr.Markdown("""
            **gary4live** is a free/open-source project that uses these models, along with musicGen, inside of ableton live to iterate on your projects with you. i run a backend myself so that we can all experiment with it, but you can also spin the backend up locally using docker-compose with our repo.

            **project links:**
            - [frontend repo](https://github.com/betweentwomidnights/gary4live)
            - [backend repo](https://github.com/betweentwomidnights/gary-backend-combined)

            **installers:**
            - [p.c. & mac installers on gumroad](https://thepatch.gumroad.com/l/gary4live)
            """)

    with gr.Accordion("how this works", open=False):
        gr.Markdown("""
        **workflow:**
        1. **set global bpm and bars** - affects both drum and instrument generation
        2. **generate drum loop** - creates BPM-aware percussion, with negative prompting to try to keep instruments out
        3. **generate instrument loop** - creates melodic/harmonic content, with negative prompting to try to keep drums out
        4. **combine loops** - layer them together with repetitions (up to 30s)
        5. **transform** - use melodyflow to stylistically transform the combined result

        **features:**
        - bpm-aware generation keeps the loops in sync (most of the time lol)
        - negative prompting separates drums from instruments (most of the time)
        - smart bar calculation optimizes loop length for the BPM
        - preset transformation styles for braindead ease of use
        """)

    # ========== GLOBAL CONTROLS ==========
    gr.Markdown("## 🎛️ global settings")

    with gr.Row():
        global_bpm = gr.Dropdown(
            label="global bpm",
            choices=[90, 100, 110, 120, 130, 140, 150],
            value=120,
            info="bpm applied to both drum and instrument generation. keep this the same for the combine step to work correctly"
        )
        global_bars = gr.Dropdown(
            label="loop length (bars)",
            choices=[1, 2, 4],
            value=4,
            info="number of bars for each loop. keep this the same for both pieces of audio"
        )
        base_prompt = gr.Textbox(
            label="base prompt",
            value="lofi hiphop with pianos",
            placeholder="e.g., 'aggressive techno', 'lofi hiphop', 'chillwave', 'liquid drum and bass'",
            info="prompt applied to either loop. make it more drum/instrument specific for best results"
        )

    with gr.Row():
        generation_steps = gr.Slider(
            label="generation steps",
            minimum=4,
            maximum=16,
            step=1,
            value=8,
            info="more steps = higher quality but slower generation"
        )
        cfg_scale = gr.Slider(
            label="cfg scale",
            minimum=0.5,
            maximum=2.0,
            step=0.1,
            value=1.0,
            info="higher values = more prompt adherence but potentially less natural"
        )

    # Auto-suggest optimal bars based on BPM
    def update_suggested_bars(bpm):
        optimal = calculate_optimal_bars(bpm)
        return gr.update(info=f"Suggested: {optimal} bars for {bpm}bpm (≤10s)")

    global_bpm.change(update_suggested_bars, inputs=[global_bpm], outputs=[global_bars])

    # ========== LOOP GENERATION ==========
    gr.Markdown("## step one: generate individual loops")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### drums")
            generate_drums_btn = gr.Button("generate drums", variant="primary", size="lg")
            drums_audio = gr.Audio(label="drum loop", type="filepath", show_download_button=True)
            drums_status = gr.Textbox(label="status", value="ready to generate")

        with gr.Column():
            gr.Markdown("### instruments")
            generate_instruments_btn = gr.Button("generate instruments", variant="secondary", size="lg")
            instruments_audio = gr.Audio(label="instrument loop", type="filepath", show_download_button=True)
            instruments_status = gr.Textbox(label="status", value="ready to generate")

    # Seed controls
    with gr.Row():
        drums_seed = gr.Number(label="drums seed", value=-1, info="-1 for random")
        instruments_seed = gr.Number(label="instruments seed", value=-1, info="-1 for random")

    # ========== COMBINATION ==========
    gr.Markdown("## step two: combine loops")

    with gr.Row():
        num_repeats = gr.Slider(
            label="number of repetitions",
            minimum=1,
            maximum=5,
            step=1,
            value=2,
            info="how many times to repeat each loop (creates longer audio). aim for 30 seconds max"
        )

    combine_btn = gr.Button("combine", variant="primary", size="lg")
    combined_audio = gr.Audio(label="combined loops", type="filepath", show_download_button=True)
    combine_status = gr.Textbox(label="status", value="Generate loops first")

    # ========== MELODYFLOW TRANSFORMATION ==========
    gr.Markdown("## step three: transform with melodyflow")

    with gr.Row():
        with gr.Column():
            # Variation dropdown
            variation_choice = gr.Dropdown(
                label="transformation style preset",
                choices=["custom"] + list(MELODYFLOW_VARIATIONS.keys()),
                value="custom",
                info="select a preset style or choose 'custom' for your own prompt"
            )
            transform_prompt = gr.Textbox(
                label="transformation prompt",
                value="",
                placeholder="enter your custom transformation prompt",
                lines=3,
                info="describes the style transformation to apply"
            )

        with gr.Column():
            transform_solver = gr.Dropdown(
                label="solver",
                choices=["euler", "midpoint"],
                value="euler",
                info="EULER: faster (25 steps), MIDPOINT: slower (64 steps)"
            )
            transform_flowstep = gr.Slider(
                label="transform intensity",
                minimum=0.0,
                maximum=0.15,
                step=0.01,
                value=0.12,
                info="Lower = more dramatic transformation"
            )

    transform_btn = gr.Button("transform audio", variant="secondary", size="lg")
    transformed_audio = gr.Audio(label="transformed audio", type="filepath", show_download_button=True)
    transform_status = gr.Textbox(label="status", value="Combine audio first")

    # ========== MUSICGEN EXTENSION ==========
    gr.Markdown("## step four (optional): extend with musicgen")

    with gr.Row():
        with gr.Column():
            musicgen_prompt_duration = gr.Dropdown(
                label="prompt duration (seconds)",
                choices=[3, 5, 7, 10],  # Integers, since the API function expects numbers
                value=5,
                info="how much of the end to use as prompt for continuation"
            )
            musicgen_output_duration = gr.Slider(
                label="extension duration (seconds)",
                minimum=10.0,
                maximum=30.0,
                step=1.0,
                value=20.0,
                info="how much new audio to generate"
            )

        with gr.Column():
            musicgen_model_choice = gr.Dropdown(
                label="musicgen model",
                choices=[
                    "thepatch/vanya_ai_dnb_0.1 (small)",
                    "thepatch/bleeps-medium (medium)",
                    "thepatch/hoenn_lofi (large)"
                ],
                value="thepatch/vanya_ai_dnb_0.1 (small)",
                info="various musicgen fine-tunes for different styles"
            )

    extend_btn = gr.Button("extend with musicgen", variant="primary", size="lg")
    extended_audio = gr.Audio(label="extended audio", type="filepath")
    extend_status = gr.Textbox(label="status", value="Transform audio first")

    # ========== EVENT HANDLERS ==========

    # Update transform prompt when variation is selected
    variation_choice.change(
        update_transform_prompt,
        inputs=[variation_choice],
        outputs=[transform_prompt]
    )

    # Generate drums
    generate_drums_btn.click(
        generate_stable_audio_loop,
        inputs=[base_prompt, gr.State("drums"), global_bpm, global_bars, generation_steps, cfg_scale, drums_seed],
        outputs=[drums_audio, drums_status]
    )

    # Generate instruments
    generate_instruments_btn.click(
        generate_stable_audio_loop,
        inputs=[base_prompt, gr.State("instruments"), global_bpm, global_bars, generation_steps, cfg_scale, instruments_seed],
        outputs=[instruments_audio, instruments_status]
    )

    # Combine loops
    combine_btn.click(
        combine_loops,
        inputs=[drums_audio, instruments_audio, global_bpm, global_bars, num_repeats],
        outputs=[combined_audio, combine_status]
    )

    # Transform with MelodyFlow
    transform_btn.click(
        transform_with_melodyflow_api,
        inputs=[combined_audio, transform_prompt, transform_solver, transform_flowstep],
        outputs=[transformed_audio, transform_status]
    )

    # Extend with MusicGen
    extend_btn.click(
        extend_with_musicgen_api,
        inputs=[transformed_audio, musicgen_prompt_duration, musicgen_model_choice, musicgen_output_duration],
        outputs=[extended_audio, extend_status]
    )


if __name__ == "__main__":
    iface.launch()