Spaces: Running on Zero
Update app.py
Browse files

app.py CHANGED
@@ -5,6 +5,7 @@ import logging
 import os
 from pathlib import Path
 from datetime import datetime
+import re
 
 import torch
 import numpy as np
@@ -15,6 +16,7 @@ from diffusers import AutoModel
 import gradio as gr
 import tempfile
 from huggingface_hub import hf_hub_download
+import traceback
 
 # Patch for scaled_dot_product_attention to fix enable_gqa issue
 import torch.nn.functional as F
@@ -73,7 +75,7 @@ MIN_FRAMES_MODEL = 8
 MAX_FRAMES_MODEL = 129
 
 DEFAULT_NAG_NEGATIVE_PROMPT = "Static, motionless, still, ugly, bad quality, worst quality, poorly drawn, low resolution, blurry, lack of details"
-DEFAULT_AUDIO_NEGATIVE_PROMPT = "music"
+DEFAULT_AUDIO_NEGATIVE_PROMPT = "music, speech, voice, singing, narration"
 
 # NAG Model Settings
 MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
@@ -136,17 +138,83 @@ except Exception as e:
     print(f"Error loading MMAudio Model: {e}")
     audio_net = None
 
+# Function that converts the video prompt into an audio prompt
+def extract_audio_description(video_prompt):
+    """Extract/convert audio-related descriptions from the video prompt"""
+
+    # Keyword mapping
+    audio_keywords = {
+        'car': 'car engine sound, vehicle noise',
+        'porsche': 'sports car engine roar, exhaust sound',
+        'guitar': 'electric guitar playing, guitar music',
+        'concert': 'crowd cheering, live music, applause',
+        'motorcycle': 'motorcycle engine sound, motor rumble',
+        'highway': 'traffic noise, road ambience',
+        'rain': 'rain sounds, water drops',
+        'wind': 'wind blowing sound',
+        'ocean': 'ocean waves, water sounds',
+        'city': 'urban ambience, city traffic sounds',
+        'singer': 'singing voice, vocals',
+        'crowd': 'crowd noise, people talking',
+        'flames': 'fire crackling sound',
+        'pyro': 'fire whoosh, flame burst sound',
+        'explosion': 'explosion sound, blast',
+        'countryside': 'nature ambience, birds chirping',
+        'wheat fields': 'wind through grass, rural ambience',
+        'engine': 'motor sound, mechanical noise',
+        'flat-six engine': 'sports car engine sound',
+        'roaring': 'loud engine roar',
+        'thunderous': 'loud booming sound',
+        'child': 'children playing sounds',
+        'running': 'footsteps sound',
+        'woman': 'ambient sounds',
+        'phone': 'subtle electronic ambience',
+        'advertisement': 'modern ambient sounds'
+    }
+
+    # Simple keyword-based conversion
+    audio_descriptions = []
+    lower_prompt = video_prompt.lower()
+
+    for key, value in audio_keywords.items():
+        if key in lower_prompt:
+            audio_descriptions.append(value)
+
+    # Fall back to defaults
+    if not audio_descriptions:
+        # Check whether the prompt contains an explicit audio description
+        if 'sound' in lower_prompt or 'audio' in lower_prompt or 'noise' in lower_prompt:
+            # Extract only the audio-related parts of the prompt
+            audio_pattern = r'([^.]*(?:sound|audio|noise|music|voice|roar|rumble)[^.]*)'
+            matches = re.findall(audio_pattern, lower_prompt, re.IGNORECASE)
+            if matches:
+                return ', '.join(matches)
+
+        # Default ambient sound
+        return "ambient environmental sounds matching the scene"
+
+    return ', '.join(audio_descriptions)
+
 # Audio generation function
 @torch.inference_mode()
-def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, audio_cfg_strength, duration):
+def add_audio_to_video(video_path, prompt, audio_custom_prompt, audio_negative_prompt, audio_steps, audio_cfg_strength, duration):
     """Generate and add audio to video using MMAudio"""
     if audio_net is None:
         print("MMAudio model not loaded, returning video without audio")
         return video_path
 
     try:
+        # Use the custom audio prompt if provided; otherwise derive one from the video prompt
+        if audio_custom_prompt and audio_custom_prompt.strip():
+            audio_prompt = audio_custom_prompt.strip()
+        else:
+            audio_prompt = extract_audio_description(prompt)
+
+        print(f"Original prompt: {prompt}")
+        print(f"Audio prompt: {audio_prompt}")
+
         rng = torch.Generator(device=device)
-        rng.
+        rng.manual_seed(random.randint(0, 2**32 - 1))  # more explicit random seed
         fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=audio_steps)
 
         video_info = load_video(video_path, duration)
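A quick sanity check makes the new keyword mapping concrete. The sketch below is illustrative rather than part of the commit; it assumes extract_audio_description is in scope, and note that the rng.manual_seed(random.randint(...)) line above relies on `random`, which this commit does not add to the imports, so it is presumably already imported in the unshown top of app.py.

# Illustrative inputs; outputs follow from the keyword table above.
print(extract_audio_description("A vintage Porsche roaring down a highway"))
# -> "sports car engine roar, exhaust sound, traffic noise, road ambience, loud engine roar"
print(extract_audio_description("A quiet empty hallway"))
# -> "ambient environmental sounds matching the scene"

Note that the matching is plain substring containment, so 'car' also fires on words like "scarf" or "carpet"; acceptable for a demo, but worth knowing when writing prompts.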
@@ -158,9 +226,12 @@ def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, a
         audio_seq_cfg.duration = duration
         audio_net.update_seq_lengths(audio_seq_cfg.latent_seq_len, audio_seq_cfg.clip_seq_len, audio_seq_cfg.sync_seq_len)
 
+        # Enhanced negative prompt
+        enhanced_negative = f"{audio_negative_prompt}, distortion, static noise, silence, random beeps"
+
         audios = mmaudio_generate(clip_frames,
-                                  sync_frames, [
-                                  negative_text=[
+                                  sync_frames, [audio_prompt],  # use the converted audio prompt
+                                  negative_text=[enhanced_negative],
                                   feature_utils=audio_feature_utils,
                                   net=audio_net,
                                   fm=fm,
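For reference, the f-string above expands the new default negative prompt as follows (both literals are taken from this diff):

# Expansion of the enhanced negative prompt with the new default value.
audio_negative_prompt = "music, speech, voice, singing, narration"
enhanced_negative = f"{audio_negative_prompt}, distortion, static noise, silence, random beeps"
# -> "music, speech, voice, singing, narration, distortion, static noise, silence, random beeps"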
@@ -175,12 +246,13 @@ def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, a
         return video_with_audio_path
     except Exception as e:
         print(f"Error in audio generation: {e}")
+        traceback.print_exc()
         return video_path
 
 # Combined generation function
 def get_duration(prompt, nag_negative_prompt, nag_scale, height, width, duration_seconds,
-                 steps, seed, randomize_seed, enable_audio,
-                 audio_steps, audio_cfg_strength):
+                 steps, seed, randomize_seed, enable_audio, audio_custom_prompt,
+                 audio_negative_prompt, audio_steps, audio_cfg_strength):
     # Calculate total duration including audio processing if enabled
     video_duration = int(duration_seconds) * int(steps) * 2.25 + 5
     audio_duration = 30 if enable_audio else 0  # Additional time for audio processing
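To make the time estimate concrete, here is a worked example with illustrative inputs (the constants 2.25 and 5 come from the hunk above; the flat 30 seconds is the audio surcharge):

# Worked example of the GPU-time budget (illustrative inputs).
duration_seconds, steps, enable_audio = 4, 8, True
video_duration = int(duration_seconds) * int(steps) * 2.25 + 5   # 77.0
audio_duration = 30 if enable_audio else 0                       # 30
# total allocation ~= 107 seconds; the two are summed just past this hunk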
@@ -193,8 +265,9 @@ def generate_video_with_audio(
         height=DEFAULT_H_SLIDER_VALUE, width=DEFAULT_W_SLIDER_VALUE, duration_seconds=DEFAULT_DURATION_SECONDS,
         steps=DEFAULT_STEPS,
         seed=DEFAULT_SEED, randomize_seed=False,
-        enable_audio=True,
-
+        enable_audio=True, audio_custom_prompt="",
+        audio_negative_prompt=DEFAULT_AUDIO_NEGATIVE_PROMPT,
+        audio_steps=30, audio_cfg_strength=4.5,
 ):
     if pipe is None:
         return None, DEFAULT_SEED
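As a usage sketch, a call exercising the new keyword defaults might look like this (hypothetical: the full signature has more parameters than this hunk shows, and the prompt is invented):

# Hypothetical invocation; returns a (video_path, seed) pair, matching the
# early-exit `return None, DEFAULT_SEED` above.
video_path, used_seed = generate_video_with_audio(
    prompt="A Porsche roaring through wheat fields",
    enable_audio=True,
    audio_custom_prompt="",   # empty, so extract_audio_description derives one
    audio_negative_prompt=DEFAULT_AUDIO_NEGATIVE_PROMPT,
    audio_steps=30,
    audio_cfg_strength=4.5,
)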
@@ -235,7 +308,8 @@ def generate_video_with_audio(
         print("Adding audio to video...")
         final_video_path = add_audio_to_video(
             temp_video_path,
-            prompt,
+            prompt,
+            audio_custom_prompt,
             audio_negative_prompt,
             audio_steps,
             audio_cfg_strength,
@@ -270,8 +344,9 @@ def set_example(prompt, nag_negative_prompt, nag_scale):
         DEFAULT_SEED,
         True, # randomize_seed
         True, # enable_audio
+        "", # audio_custom_prompt
         DEFAULT_AUDIO_NEGATIVE_PROMPT,
-
+        30, # audio_steps
         4.5 # audio_cfg_strength
     )
 
@@ -430,10 +505,15 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
                 )
 
                 with gr.Column(visible=True) as audio_settings_group:
+                    audio_custom_prompt = gr.Textbox(
+                        label="Custom Audio Prompt (Optional)",
+                        placeholder="Leave empty to auto-generate from video prompt, or specify custom audio description (e.g., 'car engine sound, traffic noise')",
+                        value="",
+                    )
                     audio_negative_prompt = gr.Textbox(
                         label="Audio Negative Prompt",
                         value=DEFAULT_AUDIO_NEGATIVE_PROMPT,
-                        placeholder="Elements to avoid in audio
+                        placeholder="Elements to avoid in audio",
                     )
 
                     with gr.Row():
@@ -441,7 +521,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
                             minimum=10,
                             maximum=50,
                             step=5,
-                            value=
+                            value=30,
                             label="🎚️ Audio Steps",
                             info="More steps = better quality"
                         )
@@ -478,8 +558,8 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
 
     gr.HTML("""
     <div style="text-align: center; margin-top: 20px; color: #6b7280;">
-        <p>💡 Tip: 
-        <p>🔧 
+        <p>💡 Tip: For better audio, use Custom Audio Prompt with sound descriptions!</p>
+        <p>🔧 Examples: "car engine sound", "crowd cheering", "nature ambience"</p>
     </div>
     """)
 
@@ -501,7 +581,8 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
         height_input, width_input, duration_seconds_input,
         steps_slider,
         seed_input, randomize_seed_checkbox,
-        enable_audio,
+        enable_audio, audio_custom_prompt, audio_negative_prompt,
+        audio_steps, audio_cfg_strength,
     ]
 
     generate_button.click(
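Gradio passes the inputs list positionally, so the order here must stay aligned with the parameter order of generate_video_with_audio (and of get_duration, which mirrors it for the GPU-time estimate). A minimal sketch of the wiring; the first three component names and the outputs list are assumptions, not shown in this diff:

# Sketch only: prompt_input / nag_* / video_output are assumed names.
ui_inputs = [
    prompt_input, nag_negative_prompt_input, nag_scale_input,
    height_input, width_input, duration_seconds_input,
    steps_slider,
    seed_input, randomize_seed_checkbox,
    enable_audio, audio_custom_prompt, audio_negative_prompt,
    audio_steps, audio_cfg_strength,
]
generate_button.click(
    fn=generate_video_with_audio,
    inputs=ui_inputs,
    outputs=[video_output, seed_input],  # assumed output components
)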
|