Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -711,7 +711,55 @@ def start_session(request: gr.Request):
 def check_box_clicked(adapative_tick):
     print("checkbox clicked")
     return gr.update(interactive=not adapative_tick)
-
+
+def preprocess_audio_first_5s_librosa(audio_path, limit_on, session_id=None):
+    """
+    If the uploaded audio is < 5s, return it unchanged.
+    If it's >= 5s, trim to the first 5s and return the trimmed WAV path.
+    """
+
+    if not limit_on:
+        return audio_path
+    if not audio_path:
+        return None
+
+    # Robust duration check (librosa changed arg name across versions)
+    try:
+        dur = librosa.get_duration(path=audio_path)
+    except TypeError:
+        dur = librosa.get_duration(filename=audio_path)
+
+    # Small tolerance to avoid re-encoding 4.9999s files
+    if dur < 5.0 - 1e-3:
+        return audio_path
+
+    if session_id is None:
+        session_id = uuid.uuid4().hex
+
+    # Where we'll store per-session processed audio
+    output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
+    audio_dir = os.path.join(output_dir, "audio")
+    os.makedirs(audio_dir, exist_ok=True)
+
+    trimmed_path = os.path.join(audio_dir, "audio_input_5s.wav")
+    sr = getattr(args, "sample_rate", 16000)
+
+    # Load exactly the first 5s as mono at target sample rate
+    y, _ = librosa.load(audio_path, sr=sr, mono=True, duration=5.0)
+
+    # Save as 16-bit PCM mono WAV
+    waveform = torch.from_numpy(y).unsqueeze(0)  # [1, num_samples]
+    torchaudio.save(
+        trimmed_path,
+        waveform,
+        sr,
+        encoding="PCM_S",
+        bits_per_sample=16,
+        format="wav",
+    )
+
+    return trimmed_path
+

 css = """
 #col-container {
@@ -762,6 +810,7 @@ with gr.Blocks(css=css) as demo:

             image_input = gr.Image(label="Reference Image", type="filepath", height=512)
             audio_input = ExtendedAudio(label="Input Audio", type="filepath", options=["EMPTY"], show_download_button=True)
+            gr.Markdown("*A 5-second limit is applied to audio files to shorten generation time. You can turn this off in Advanced Settings*")


         with gr.Column():
@@ -771,8 +820,10 @@ with gr.Blocks(css=css) as demo:

             time_required = gr.Text(value="⌚ Zero GPU Required: --", show_label=False)
             infer_btn = gr.Button("🦜 Avatar Me", variant="primary")
-
-
+            with gr.Accordion("Advanced Settings", open=False):
+                limit_on = gr.Checkbox(label="Limit Audio files to 5 seconds", value=True)
+                adaptive_text = gr.Checkbox(label="Adaptive Video Prompt", value=True)
+                text_input = gr.Textbox(show_label=False, lines=6, elem_classes=["stateful"], interactive=False, value=ADAPTIVE_PROMPT_TEMPLATES[1])

         with gr.Column():

@@ -875,6 +926,11 @@ with gr.Blocks(css=css) as demo:
     audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps], outputs=[time_required])
     num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, text_input, num_steps, adaptive_text], outputs=[time_required, text_input])
     adaptive_text.change(fn=check_box_clicked, inputs=[adaptive_text], outputs=[text_input])
+    audio_input.upload(
+        fn=preprocess_audio_first_5s_librosa,
+        inputs=[audio_input, limit_on, session_state],
+        outputs=[audio_input],
+    )

 if __name__ == "__main__":
     demo.unload(cleanup)
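For context, here is a minimal standalone sketch of the trimming step the new upload hook performs. It is not part of the commit: it drops the Space's session/state plumbing, and the helper name trim_to_first_5s, the "processed" output directory, and the 16 kHz default are illustrative assumptions (the Space itself reads the rate from args and the directory from the PROCESSED_RESULTS environment variable).

# Standalone sketch of the "keep only the first 5 seconds" step (assumptions noted above).
import os
import librosa
import torch
import torchaudio

def trim_to_first_5s(audio_path: str, out_dir: str = "processed", sr: int = 16000) -> str:
    """Return audio_path if it is shorter than 5 s, else the path of a trimmed 16-bit mono WAV."""
    dur = librosa.get_duration(path=audio_path)  # librosa >= 0.10 keyword; older versions use filename=
    if dur < 5.0 - 1e-3:
        return audio_path  # already short enough, pass it through untouched

    os.makedirs(out_dir, exist_ok=True)
    trimmed_path = os.path.join(out_dir, "audio_input_5s.wav")

    # Decode only the first 5 s, downmixed to mono at the target sample rate.
    y, _ = librosa.load(audio_path, sr=sr, mono=True, duration=5.0)

    # torchaudio expects a [channels, samples] tensor; save as 16-bit PCM WAV.
    torchaudio.save(
        trimmed_path,
        torch.from_numpy(y).unsqueeze(0),
        sr,
        encoding="PCM_S",
        bits_per_sample=16,
        format="wav",
    )
    return trimmed_path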
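The wiring pattern in the last hunk can also be exercised on its own. The sketch below is an assumption-laden reduction of the commit's approach (plain gr.Audio and gr.Checkbox instead of the Space's ExtendedAudio and session_state, Gradio 4.x event API): the upload event feeds the component back as its own output, so downstream steps only ever see the trimmed file.

# Minimal wiring sketch of the upload hook, using the helper from the previous sketch.
import gradio as gr

def maybe_trim(audio_path, limit_on):
    # Stand-in for preprocess_audio_first_5s_librosa: pass through when the
    # limit is off or nothing was uploaded, otherwise trim to 5 s.
    if not limit_on or not audio_path:
        return audio_path
    return trim_to_first_5s(audio_path)

with gr.Blocks() as demo:
    limit_on = gr.Checkbox(label="Limit Audio files to 5 seconds", value=True)
    audio_input = gr.Audio(label="Input Audio", type="filepath")
    # Re-feeding the component as its own output replaces the uploaded file
    # with the trimmed copy in the UI.
    audio_input.upload(fn=maybe_trim, inputs=[audio_input, limit_on], outputs=[audio_input])

demo.launch()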