Spaces:
Sleeping
Sleeping
Commit
·
11f5aeb
1
Parent(s):
cfd186d
cfg and steps params added
Browse files
app.py
CHANGED
|
@@ -19,6 +19,58 @@ from stable_audio_tools.inference.generation import generate_diffusion_cond
|
|
| 19 |
from gradio_client import Client, handle_file
|
| 20 |
from contextlib import contextmanager
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Global model storage
|
| 23 |
model_cache = {}
|
| 24 |
model_lock = threading.Lock()
|
|
@@ -90,7 +142,7 @@ def load_stable_audio_model():
|
|
| 90 |
model_cache['stable_audio_device'])
|
| 91 |
|
| 92 |
@spaces.GPU(duration=12)
|
| 93 |
-
def generate_stable_audio_loop(prompt, loop_type, bpm, bars, seed=-1):
|
| 94 |
"""Generate a BPM-aware loop using stable-audio-open-small"""
|
| 95 |
try:
|
| 96 |
total_start = time.time()
|
|
@@ -105,7 +157,6 @@ def generate_stable_audio_loop(prompt, loop_type, bpm, bars, seed=-1):
|
|
| 105 |
seconds_per_bar = seconds_per_beat * 4 # 4/4 time
|
| 106 |
target_loop_duration = seconds_per_bar * bars
|
| 107 |
|
| 108 |
-
|
| 109 |
# Enhance prompt based on loop type and BPM - minimal modification
|
| 110 |
if loop_type == "drums":
|
| 111 |
enhanced_prompt = f"{prompt} {bpm}bpm"
|
|
@@ -127,6 +178,7 @@ def generate_stable_audio_loop(prompt, loop_type, bpm, bars, seed=-1):
|
|
| 127 |
print(f"🎵 Generating {loop_type} loop:")
|
| 128 |
print(f" Enhanced prompt: {enhanced_prompt}")
|
| 129 |
print(f" Target duration: {target_loop_duration:.2f}s ({bars} bars at {bpm}bpm)")
|
|
|
|
| 130 |
print(f" Seed: {seed}")
|
| 131 |
|
| 132 |
# Prepare conditioning
|
|
@@ -145,7 +197,6 @@ def generate_stable_audio_loop(prompt, loop_type, bpm, bars, seed=-1):
|
|
| 145 |
# Generation timing
|
| 146 |
generation_start = time.time()
|
| 147 |
|
| 148 |
-
# Removed aggressive resource cleanup wrapper
|
| 149 |
# Clear GPU cache once before generation (not after)
|
| 150 |
# if device == "cuda":
|
| 151 |
# torch.cuda.empty_cache()
|
|
@@ -153,8 +204,8 @@ def generate_stable_audio_loop(prompt, loop_type, bpm, bars, seed=-1):
|
|
| 153 |
with torch.cuda.amp.autocast(enabled=(device == "cuda")):
|
| 154 |
output = generate_diffusion_cond(
|
| 155 |
model,
|
| 156 |
-
steps=
|
| 157 |
-
cfg_scale=
|
| 158 |
conditioning=conditioning,
|
| 159 |
negative_conditioning=negative_conditioning,
|
| 160 |
sample_size=config["sample_size"],
|
|
@@ -203,7 +254,7 @@ def generate_stable_audio_loop(prompt, loop_type, bpm, bars, seed=-1):
|
|
| 203 |
print(f" Total: {total_time:.2f}s")
|
| 204 |
print(f"✅ {loop_type.title()} loop: {actual_duration:.2f}s audio in {total_time:.2f}s")
|
| 205 |
|
| 206 |
-
return loop_filename, f"Generated {actual_duration:.2f}s {loop_type} loop at {bpm}bpm ({bars} bars) in {total_time:.2f}s"
|
| 207 |
|
| 208 |
except Exception as e:
|
| 209 |
print(f"❌ Generation error: {str(e)}")
|
|
@@ -340,6 +391,15 @@ def calculate_optimal_bars(bpm):
|
|
| 340 |
return bars
|
| 341 |
return 1
|
| 342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
# ========== GRADIO INTERFACE ==========
|
| 344 |
|
| 345 |
with gr.Blocks(title="stable-melodyflow") as iface:
|
|
@@ -398,6 +458,7 @@ with gr.Blocks(title="stable-melodyflow") as iface:
|
|
| 398 |
- bpm-aware generation ensures perfect sync between loops (most the time lol)
|
| 399 |
- negative prompting separates drums from instruments (most the time)
|
| 400 |
- smart bar calculation optimizes loop length for the BPM
|
|
|
|
| 401 |
""")
|
| 402 |
|
| 403 |
# ========== GLOBAL CONTROLS ==========
|
|
@@ -425,6 +486,25 @@ with gr.Blocks(title="stable-melodyflow") as iface:
|
|
| 425 |
info="prompt applied to either loop. make it more drum/instrument specific for best results"
|
| 426 |
)
|
| 427 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
# Auto-suggest optimal bars based on BPM
|
| 429 |
def update_suggested_bars(bpm):
|
| 430 |
optimal = calculate_optimal_bars(bpm)
|
|
@@ -475,11 +555,20 @@ with gr.Blocks(title="stable-melodyflow") as iface:
|
|
| 475 |
|
| 476 |
with gr.Row():
|
| 477 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
transform_prompt = gr.Textbox(
|
| 479 |
label="transformation prompt",
|
| 480 |
-
value="
|
| 481 |
-
placeholder="
|
| 482 |
-
lines=
|
|
|
|
| 483 |
)
|
| 484 |
|
| 485 |
with gr.Column():
|
|
@@ -504,17 +593,24 @@ with gr.Blocks(title="stable-melodyflow") as iface:
|
|
| 504 |
|
| 505 |
# ========== EVENT HANDLERS ==========
|
| 506 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
# Generate drums
|
| 508 |
generate_drums_btn.click(
|
| 509 |
generate_stable_audio_loop,
|
| 510 |
-
inputs=[base_prompt, gr.State("drums"), global_bpm, global_bars, drums_seed],
|
| 511 |
outputs=[drums_audio, drums_status]
|
| 512 |
)
|
| 513 |
|
| 514 |
# Generate instruments
|
| 515 |
generate_instruments_btn.click(
|
| 516 |
generate_stable_audio_loop,
|
| 517 |
-
inputs=[base_prompt, gr.State("instruments"), global_bpm, global_bars, instruments_seed],
|
| 518 |
outputs=[instruments_audio, instruments_status]
|
| 519 |
)
|
| 520 |
|
|
@@ -531,20 +627,6 @@ with gr.Blocks(title="stable-melodyflow") as iface:
|
|
| 531 |
inputs=[combined_audio, transform_prompt, transform_solver, transform_flowstep],
|
| 532 |
outputs=[transformed_audio, transform_status]
|
| 533 |
)
|
| 534 |
-
|
| 535 |
-
# # ========== EXAMPLES ==========
|
| 536 |
-
# gr.Markdown("## 🎯 Example Workflows")
|
| 537 |
-
|
| 538 |
-
# examples = gr.Examples(
|
| 539 |
-
# examples=[
|
| 540 |
-
# ["techno", 128, 4, "aggressive industrial techno"],
|
| 541 |
-
# ["jazz", 110, 2, "smooth lo-fi jazz with vinyl crackle"],
|
| 542 |
-
# ["ambient", 90, 8, "ethereal ambient soundscape"],
|
| 543 |
-
# ["hip-hop", 100, 4, "classic boom bap hip-hop"],
|
| 544 |
-
# ["drum and bass", 140, 4, "liquid drum and bass"],
|
| 545 |
-
# ],
|
| 546 |
-
# inputs=[base_prompt, global_bpm, global_bars, transform_prompt],
|
| 547 |
-
# )
|
| 548 |
|
| 549 |
if __name__ == "__main__":
|
| 550 |
iface.launch()
|
|
|
|
| 19 |
from gradio_client import Client, handle_file
|
| 20 |
from contextlib import contextmanager
|
| 21 |
|
| 22 |
+
# MelodyFlow Variations - extracted from variations.py
|
| 23 |
+
MELODYFLOW_VARIATIONS = {
|
| 24 |
+
# Acoustic Instruments
|
| 25 |
+
'accordion_folk': "Lively accordion music with a European folk feeling, perfect for a travel documentary about traditional culture and street performances in Paris",
|
| 26 |
+
'banjo_bluegrass': "Authentic bluegrass banjo band performance with rich picking patterns, ideal for a heartfelt documentary about American rural life and traditional crafts",
|
| 27 |
+
'piano_classical': "Expressive classical piano performance with dynamic range and emotional depth, ideal for a luxury brand commercial",
|
| 28 |
+
'celtic': "Traditional Celtic arrangement with fiddle and flute, perfect for a documentary about Ireland's stunning landscapes and ancient traditions",
|
| 29 |
+
'strings_quartet': "Elegant string quartet arrangement with rich harmonies and expressive dynamics, perfect for wedding ceremony music",
|
| 30 |
+
|
| 31 |
+
# Synthesizer Variations
|
| 32 |
+
'synth_retro': "1980s style synthesizer melody with warm analog pads and arpeggios, perfect for a nostalgic sci-fi movie soundtrack",
|
| 33 |
+
'synth_modern': "Modern electronic production with crisp digital synthesizer arpeggios and vocoder effects, ideal for a tech product launch video",
|
| 34 |
+
'synth_ambient': "Atmospheric synthesizer pads with reverb and delay, perfect for a meditation app or wellness commercial",
|
| 35 |
+
'synth_edm': "High-energy EDM synth saw leads with sidechain compression, pitch bends, perfect for sports highlights or action sequences",
|
| 36 |
+
|
| 37 |
+
# Band Arrangements
|
| 38 |
+
'rock_band': "Full rock band arrangement with electric guitars, bass, and drums, perfect for an action movie trailer",
|
| 39 |
+
|
| 40 |
+
# Hybrid/Special
|
| 41 |
+
'cinematic_epic': "Epic orchestral arrangement with modern hybrid elements, synthesizers, and percussion, perfect for movie trailers",
|
| 42 |
+
'lofi_chill': "Lo-fi hip hop style with vinyl crackle, mellow piano, and tape saturation, perfect for study or focus playlists",
|
| 43 |
+
'synth_bass': "Deep analog synthesizer bassline with modern production and subtle modulation, perfect for electronic music production",
|
| 44 |
+
'retro_rpg': "16-bit era JRPG soundtrack with bright melodic synthesizers, orchestral elements, and adventurous themes, perfect for a fantasy video game battle scene or overworld exploration",
|
| 45 |
+
'steel_drums': "Vibrant Caribbean steel drum ensemble with tropical percussion and uplifting melodies, perfect for a beach resort commercial or travel documentary",
|
| 46 |
+
'chiptune': "8-bit video game soundtrack with arpeggiated melodies and classic NES-style square waves, perfect for a retro platformer or action game",
|
| 47 |
+
'gamelan_fusion': "Indonesian gamelan ensemble with metallic percussion, gongs, and ethereal textures, perfect for a meditation app or spiritual documentary",
|
| 48 |
+
'music_box': "Delicate music box melody with gentle bell tones and ethereal ambiance, perfect for a children's lullaby or magical fantasy scene",
|
| 49 |
+
|
| 50 |
+
# Hip Hop / Trap Percussion
|
| 51 |
+
'trap_808': "808 bass",
|
| 52 |
+
'lo_fi_drums': "lofi hiphop percussion",
|
| 53 |
+
'boom_bap': "Classic 90s boom bap hip hop drums with punchy kicks, crisp snares, and jazz sample chops, perfect for documentary footage of urban street scenes and skateboarding",
|
| 54 |
+
'percussion_ensemble': "Rich percussive ensemble with djembe, congas, shakers, and tribal drums creating complex polyrhythms, perfect for nature documentaries about rainforests or ancient cultural rituals",
|
| 55 |
+
|
| 56 |
+
# Enhanced Electronic Music
|
| 57 |
+
'future_bass': "Energetic future bass with filtered supersaws, pitch-bending lead synths, heavy sidechain, and chopped vocal samples, perfect for extreme sports highlights or uplifting motivational content",
|
| 58 |
+
'synthwave_retro': "80s retrofuturistic synthwave with gated reverb drums, analog arpeggios, neon-bright lead synths and driving bass, perfect for cyberpunk-themed technology showcases or retro gaming montages",
|
| 59 |
+
'melodic_techno': "Hypnotic melodic techno with pulsing bass, atmospheric pads, and evolving synthesizer sequences with subtle filter modulation, ideal for timelapse footage of urban nightscapes or architectural showcases",
|
| 60 |
+
'dubstep_wobble': "Heavy dubstep with aggressive wobble bass, metallic synthesizers, distorted drops, and tension-building risers, perfect for action sequence transitions or gaming highlight reels",
|
| 61 |
+
|
| 62 |
+
# Glitchy Effects
|
| 63 |
+
'glitch_hop': "Glitch hop with stuttering sample slices, bit-crushed percussion, granular synthesis textures and digital artifacts, perfect for technology malfunction scenes or data visualization animations",
|
| 64 |
+
'digital_disruption': "Heavily glitched soundscape with digital artifacts, buffer errors, granular time stretching, and corrupted audio samples, ideal for cybersecurity themes or digital distortion transitions in tech presentations",
|
| 65 |
+
'circuit_bent': "Circuit-bent toy sounds with unpredictable pitch shifts, broken electronic tones, and hardware malfunction artifacts, perfect for creative coding demonstrations or innovative technology exhibitions",
|
| 66 |
+
|
| 67 |
+
# Experimental Hybrids
|
| 68 |
+
'orchestral_glitch': "Cinematic orchestral elements disrupted by digital glitches, granular textures, and temporal distortions, perfect for science fiction trailers or futuristic product reveals with contrasting classical and modern elements",
|
| 69 |
+
'vapor_drums': "Vaporwave drum processing with extreme pitch and time manipulation, reverb-drenched samples, and retro commercial music elements, ideal for nostalgic internet culture documentaries or retrofuturistic art installations",
|
| 70 |
+
'industrial_textures': "Harsh industrial soundscape with mechanical percussion, factory recordings, metallic impacts, and distorted synth drones, perfect for manufacturing process videos or dystopian urban environments",
|
| 71 |
+
'jungle_breaks': "High-energy jungle drum breaks with choppy breakbeat samples, deep sub bass, and dub reggae influences, perfect for fast-paced urban chase scenes or extreme sports montages"
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
# Global model storage
|
| 75 |
model_cache = {}
|
| 76 |
model_lock = threading.Lock()
|
|
|
|
| 142 |
model_cache['stable_audio_device'])
|
| 143 |
|
| 144 |
@spaces.GPU(duration=12)
|
| 145 |
+
def generate_stable_audio_loop(prompt, loop_type, bpm, bars, steps, cfg_scale, seed=-1):
|
| 146 |
"""Generate a BPM-aware loop using stable-audio-open-small"""
|
| 147 |
try:
|
| 148 |
total_start = time.time()
|
|
|
|
| 157 |
seconds_per_bar = seconds_per_beat * 4 # 4/4 time
|
| 158 |
target_loop_duration = seconds_per_bar * bars
|
| 159 |
|
|
|
|
| 160 |
# Enhance prompt based on loop type and BPM - minimal modification
|
| 161 |
if loop_type == "drums":
|
| 162 |
enhanced_prompt = f"{prompt} {bpm}bpm"
|
|
|
|
| 178 |
print(f"🎵 Generating {loop_type} loop:")
|
| 179 |
print(f" Enhanced prompt: {enhanced_prompt}")
|
| 180 |
print(f" Target duration: {target_loop_duration:.2f}s ({bars} bars at {bpm}bpm)")
|
| 181 |
+
print(f" Steps: {steps}, CFG Scale: {cfg_scale}")
|
| 182 |
print(f" Seed: {seed}")
|
| 183 |
|
| 184 |
# Prepare conditioning
|
|
|
|
| 197 |
# Generation timing
|
| 198 |
generation_start = time.time()
|
| 199 |
|
|
|
|
| 200 |
# Clear GPU cache once before generation (not after)
|
| 201 |
# if device == "cuda":
|
| 202 |
# torch.cuda.empty_cache()
|
|
|
|
| 204 |
with torch.cuda.amp.autocast(enabled=(device == "cuda")):
|
| 205 |
output = generate_diffusion_cond(
|
| 206 |
model,
|
| 207 |
+
steps=steps, # User-configurable steps
|
| 208 |
+
cfg_scale=cfg_scale, # User-configurable CFG scale
|
| 209 |
conditioning=conditioning,
|
| 210 |
negative_conditioning=negative_conditioning,
|
| 211 |
sample_size=config["sample_size"],
|
|
|
|
| 254 |
print(f" Total: {total_time:.2f}s")
|
| 255 |
print(f"✅ {loop_type.title()} loop: {actual_duration:.2f}s audio in {total_time:.2f}s")
|
| 256 |
|
| 257 |
+
return loop_filename, f"Generated {actual_duration:.2f}s {loop_type} loop at {bpm}bpm ({bars} bars) in {total_time:.2f}s (steps: {steps}, cfg: {cfg_scale})"
|
| 258 |
|
| 259 |
except Exception as e:
|
| 260 |
print(f"❌ Generation error: {str(e)}")
|
|
|
|
| 391 |
return bars
|
| 392 |
return 1
|
| 393 |
|
| 394 |
+
def update_transform_prompt(variation_choice):
    """Return the Gradio update for the transformation prompt textbox.

    A key of MELODYFLOW_VARIATIONS fills the textbox with that preset's
    prompt text. The "custom" choice, or any value that is not a preset key,
    empties the textbox and sets a placeholder telling the user what to
    type. The textbox is marked interactive in every case.
    """
    # "custom" is tested before the preset lookup so it always means an empty box.
    is_custom = variation_choice == "custom"
    if not is_custom and variation_choice in MELODYFLOW_VARIATIONS:
        preset_text = MELODYFLOW_VARIATIONS[variation_choice]
        # Only the value is replaced here; the placeholder is not passed.
        return gr.update(value=preset_text, interactive=True)

    if is_custom:
        hint = "enter your custom transformation prompt"
    else:
        hint = "select a variation or enter custom prompt"
    return gr.update(value="", placeholder=hint, interactive=True)
|
| 402 |
+
|
| 403 |
# ========== GRADIO INTERFACE ==========
|
| 404 |
|
| 405 |
with gr.Blocks(title="stable-melodyflow") as iface:
|
|
|
|
| 458 |
- bpm-aware generation ensures perfect sync between loops (most the time lol)
|
| 459 |
- negative prompting separates drums from instruments (most the time)
|
| 460 |
- smart bar calculation optimizes loop length for the BPM
|
| 461 |
+
- preset transformation styles for braindead ease of use
|
| 462 |
""")
|
| 463 |
|
| 464 |
# ========== GLOBAL CONTROLS ==========
|
|
|
|
| 486 |
info="prompt applied to either loop. make it more drum/instrument specific for best results"
|
| 487 |
)
|
| 488 |
|
| 489 |
+
with gr.Row():
|
| 490 |
+
generation_steps = gr.Slider(
|
| 491 |
+
label="generation steps",
|
| 492 |
+
minimum=4,
|
| 493 |
+
maximum=16,
|
| 494 |
+
step=1,
|
| 495 |
+
value=8,
|
| 496 |
+
info="more steps = higher quality but slower generation"
|
| 497 |
+
)
|
| 498 |
+
|
| 499 |
+
cfg_scale = gr.Slider(
|
| 500 |
+
label="cfg scale",
|
| 501 |
+
minimum=0.5,
|
| 502 |
+
maximum=2.0,
|
| 503 |
+
step=0.1,
|
| 504 |
+
value=1.0,
|
| 505 |
+
info="higher values = more prompt adherence but potentially less natural"
|
| 506 |
+
)
|
| 507 |
+
|
| 508 |
# Auto-suggest optimal bars based on BPM
|
| 509 |
def update_suggested_bars(bpm):
|
| 510 |
optimal = calculate_optimal_bars(bpm)
|
|
|
|
| 555 |
|
| 556 |
with gr.Row():
|
| 557 |
with gr.Column():
|
| 558 |
+
# Variation dropdown
|
| 559 |
+
variation_choice = gr.Dropdown(
|
| 560 |
+
label="transformation style preset",
|
| 561 |
+
choices=["custom"] + list(MELODYFLOW_VARIATIONS.keys()),
|
| 562 |
+
value="custom",
|
| 563 |
+
info="select a preset style or choose 'custom' for your own prompt"
|
| 564 |
+
)
|
| 565 |
+
|
| 566 |
transform_prompt = gr.Textbox(
|
| 567 |
label="transformation prompt",
|
| 568 |
+
value="",
|
| 569 |
+
placeholder="enter your custom transformation prompt",
|
| 570 |
+
lines=3,
|
| 571 |
+
info="describes the style transformation to apply"
|
| 572 |
)
|
| 573 |
|
| 574 |
with gr.Column():
|
|
|
|
| 593 |
|
| 594 |
# ========== EVENT HANDLERS ==========
|
| 595 |
|
| 596 |
+
# Update transform prompt when variation is selected
|
| 597 |
+
variation_choice.change(
|
| 598 |
+
update_transform_prompt,
|
| 599 |
+
inputs=[variation_choice],
|
| 600 |
+
outputs=[transform_prompt]
|
| 601 |
+
)
|
| 602 |
+
|
| 603 |
# Generate drums
|
| 604 |
generate_drums_btn.click(
|
| 605 |
generate_stable_audio_loop,
|
| 606 |
+
inputs=[base_prompt, gr.State("drums"), global_bpm, global_bars, generation_steps, cfg_scale, drums_seed],
|
| 607 |
outputs=[drums_audio, drums_status]
|
| 608 |
)
|
| 609 |
|
| 610 |
# Generate instruments
|
| 611 |
generate_instruments_btn.click(
|
| 612 |
generate_stable_audio_loop,
|
| 613 |
+
inputs=[base_prompt, gr.State("instruments"), global_bpm, global_bars, generation_steps, cfg_scale, instruments_seed],
|
| 614 |
outputs=[instruments_audio, instruments_status]
|
| 615 |
)
|
| 616 |
|
|
|
|
| 627 |
inputs=[combined_audio, transform_prompt, transform_solver, transform_flowstep],
|
| 628 |
outputs=[transformed_audio, transform_status]
|
| 629 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 630 |
|
| 631 |
if __name__ == "__main__":
|
| 632 |
iface.launch()
|