Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -711,7 +711,55 @@ def start_session(request: gr.Request):
 def check_box_clicked(adapative_tick):
     print("checkbox clicked")
     return gr.update(interactive=not adapative_tick)
-
+
+def preprocess_audio_first_5s_librosa(audio_path, limit_on, session_id=None):
+    """
+    If the uploaded audio is < 5s, return it unchanged.
+    If it's >= 5s, trim to the first 5s and return the trimmed WAV path.
+    """
+
+    if not limit_on:
+        return audio_path
+    if not audio_path:
+        return None
+
+    # Robust duration check (librosa changed arg name across versions)
+    try:
+        dur = librosa.get_duration(path=audio_path)
+    except TypeError:
+        dur = librosa.get_duration(filename=audio_path)
+
+    # Small tolerance to avoid re-encoding 4.9999s files
+    if dur < 5.0 - 1e-3:
+        return audio_path
+
+    if session_id is None:
+        session_id = uuid.uuid4().hex
+
+    # Where we'll store per-session processed audio
+    output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
+    audio_dir = os.path.join(output_dir, "audio")
+    os.makedirs(audio_dir, exist_ok=True)
+
+    trimmed_path = os.path.join(audio_dir, "audio_input_5s.wav")
+    sr = getattr(args, "sample_rate", 16000)
+
+    # Load exactly the first 5s as mono at target sample rate
+    y, _ = librosa.load(audio_path, sr=sr, mono=True, duration=5.0)
+
+    # Save as 16-bit PCM mono WAV
+    waveform = torch.from_numpy(y).unsqueeze(0)  # [1, num_samples]
+    torchaudio.save(
+        trimmed_path,
+        waveform,
+        sr,
+        encoding="PCM_S",
+        bits_per_sample=16,
+        format="wav",
+    )
+
+    return trimmed_path
+

 css = """
 #col-container {
@@ -762,6 +810,7 @@ with gr.Blocks(css=css) as demo:

             image_input = gr.Image(label="Reference Image", type="filepath", height=512)
             audio_input = ExtendedAudio(label="Input Audio", type="filepath", options=["EMPTY"], show_download_button=True)
+            gr.Markdown("*A 5-second limit is applied to audio files to shorten generation time. You can turn this off in Advanced Settings*")


         with gr.Column():
@@ -771,8 +820,10 @@ with gr.Blocks(css=css) as demo:

             time_required = gr.Text(value="⌚ Zero GPU Required: --", show_label=False)
             infer_btn = gr.Button("🦜 Avatar Me", variant="primary")
-
-
+            with gr.Accordion("Advanced Settings", open=False):
+                limit_on = gr.Checkbox(label="Limit Audio files to 5 seconds", value=True)
+                adaptive_text = gr.Checkbox(label="Adaptive Video Prompt", value=True)
+                text_input = gr.Textbox(show_label=False, lines=6, elem_classes=["stateful"], interactive=False, value=ADAPTIVE_PROMPT_TEMPLATES[1])

         with gr.Column():

@@ -875,6 +926,11 @@ with gr.Blocks(css=css) as demo:
     audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps], outputs=[time_required])
     num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, text_input, num_steps, adaptive_text], outputs=[time_required, text_input])
     adaptive_text.change(fn=check_box_clicked, inputs=[adaptive_text], outputs=[text_input])
+    audio_input.upload(
+        fn=preprocess_audio_first_5s_librosa,
+        inputs=[audio_input, limit_on, session_state],
+        outputs=[audio_input],
+    )

 if __name__ == "__main__":
     demo.unload(cleanup)
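For context, here is a minimal standalone sketch of the trimming step the new upload hook performs. It is not part of the commit: it drops the Space's session/state plumbing, and the helper name trim_to_first_5s, the "processed" output directory, and the 16 kHz default are illustrative assumptions (the Space itself reads the rate from args and the directory from the PROCESSED_RESULTS environment variable).

# Standalone sketch of the "keep only the first 5 seconds" step (assumptions noted above).
import os
import librosa
import torch
import torchaudio

def trim_to_first_5s(audio_path: str, out_dir: str = "processed", sr: int = 16000) -> str:
    """Return audio_path if it is shorter than 5 s, else the path of a trimmed 16-bit mono WAV."""
    dur = librosa.get_duration(path=audio_path)  # librosa >= 0.10 keyword; older versions use filename=
    if dur < 5.0 - 1e-3:
        return audio_path  # already short enough, pass it through untouched

    os.makedirs(out_dir, exist_ok=True)
    trimmed_path = os.path.join(out_dir, "audio_input_5s.wav")

    # Decode only the first 5 s, downmixed to mono at the target sample rate.
    y, _ = librosa.load(audio_path, sr=sr, mono=True, duration=5.0)

    # torchaudio expects a [channels, samples] tensor; save as 16-bit PCM WAV.
    torchaudio.save(
        trimmed_path,
        torch.from_numpy(y).unsqueeze(0),
        sr,
        encoding="PCM_S",
        bits_per_sample=16,
        format="wav",
    )
    return trimmed_path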
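The wiring pattern in the last hunk can also be exercised on its own. The sketch below is an assumption-laden reduction of the commit's approach (plain gr.Audio and gr.Checkbox instead of the Space's ExtendedAudio and session_state, Gradio 4.x event API): the upload event feeds the component back as its own output, so downstream steps only ever see the trimmed file.

# Minimal wiring sketch of the upload hook, using the helper from the previous sketch.
import gradio as gr

def maybe_trim(audio_path, limit_on):
    # Stand-in for preprocess_audio_first_5s_librosa: pass through when the
    # limit is off or nothing was uploaded, otherwise trim to 5 s.
    if not limit_on or not audio_path:
        return audio_path
    return trim_to_first_5s(audio_path)

with gr.Blocks() as demo:
    limit_on = gr.Checkbox(label="Limit Audio files to 5 seconds", value=True)
    audio_input = gr.Audio(label="Input Audio", type="filepath")
    # Re-feeding the component as its own output replaces the uploaded file
    # with the trimmed copy in the UI.
    audio_input.upload(fn=maybe_trim, inputs=[audio_input, limit_on], outputs=[audio_input])

demo.launch()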