Spaces:
Running
on
Zero
Running
on
Zero
James Zhou
committed on
Commit
·
49611df
1
Parent(s):
3fff971
[update] neg prompt
Browse files- app.py +14 -6
- hunyuanvideo_foley/utils/feature_utils.py +4 -3
app.py
CHANGED
|
@@ -120,6 +120,7 @@ def auto_load_models() -> str:
|
|
| 120 |
def infer_single_video(
|
| 121 |
video_file,
|
| 122 |
text_prompt: str,
|
|
|
|
| 123 |
guidance_scale: float = 4.5,
|
| 124 |
num_inference_steps: int = 50,
|
| 125 |
sample_nums: int = 1
|
|
@@ -147,7 +148,8 @@ def infer_single_video(
|
|
| 147 |
video_file,
|
| 148 |
text_prompt,
|
| 149 |
model_dict,
|
| 150 |
-
cfg
|
|
|
|
| 151 |
)
|
| 152 |
|
| 153 |
# Denoising process to generate multiple audio samples
|
|
@@ -566,6 +568,12 @@ def create_gradio_interface():
|
|
| 566 |
placeholder="A person walks on frozen ice",
|
| 567 |
lines=3,
|
| 568 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
|
| 570 |
with gr.Row():
|
| 571 |
guidance_scale = gr.Slider(
|
|
@@ -748,10 +756,10 @@ def create_gradio_interface():
|
|
| 748 |
example_buttons.append((example_btn, example))
|
| 749 |
|
| 750 |
# Event handlers
|
| 751 |
-
def process_inference(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
|
| 752 |
# Generate videos
|
| 753 |
video_list, status_msg = infer_single_video(
|
| 754 |
-
video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
|
| 755 |
)
|
| 756 |
# Update outputs with proper visibility
|
| 757 |
return update_video_outputs(video_list, status_msg)
|
|
@@ -777,7 +785,7 @@ def create_gradio_interface():
|
|
| 777 |
|
| 778 |
generate_btn.click(
|
| 779 |
fn=process_inference,
|
| 780 |
-
inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
|
| 781 |
outputs=[
|
| 782 |
video_output_1, # Sample 1 value
|
| 783 |
video_output_2, # Sample 2 value
|
|
@@ -810,12 +818,12 @@ def create_gradio_interface():
|
|
| 810 |
if not result_video:
|
| 811 |
status_msg += f"\n⚠️ Result video not found: {ex['result_path']}"
|
| 812 |
|
| 813 |
-
return video_file, ex['caption'], result_video, status_msg
|
| 814 |
return handler
|
| 815 |
|
| 816 |
btn.click(
|
| 817 |
fn=create_example_handler(example),
|
| 818 |
-
outputs=[video_input, text_input, video_output_1, result_text]
|
| 819 |
)
|
| 820 |
|
| 821 |
# Footer
|
|
|
|
| 120 |
def infer_single_video(
|
| 121 |
video_file,
|
| 122 |
text_prompt: str,
|
| 123 |
+
neg_prompt: str = None,
|
| 124 |
guidance_scale: float = 4.5,
|
| 125 |
num_inference_steps: int = 50,
|
| 126 |
sample_nums: int = 1
|
|
|
|
| 148 |
video_file,
|
| 149 |
text_prompt,
|
| 150 |
model_dict,
|
| 151 |
+
cfg,
|
| 152 |
+
neg_prompt=neg_prompt
|
| 153 |
)
|
| 154 |
|
| 155 |
# Denoising process to generate multiple audio samples
|
|
|
|
| 568 |
placeholder="A person walks on frozen ice",
|
| 569 |
lines=3,
|
| 570 |
)
|
| 571 |
+
|
| 572 |
+
neg_prompt_input = gr.Textbox(
|
| 573 |
+
label="🚫 Negative Prompt",
|
| 574 |
+
placeholder="noisy, harsh",
|
| 575 |
+
lines=2,
|
| 576 |
+
)
|
| 577 |
|
| 578 |
with gr.Row():
|
| 579 |
guidance_scale = gr.Slider(
|
|
|
|
| 756 |
example_buttons.append((example_btn, example))
|
| 757 |
|
| 758 |
# Event handlers
|
| 759 |
+
def process_inference(video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, sample_nums):
|
| 760 |
# Generate videos
|
| 761 |
video_list, status_msg = infer_single_video(
|
| 762 |
+
video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, int(sample_nums)
|
| 763 |
)
|
| 764 |
# Update outputs with proper visibility
|
| 765 |
return update_video_outputs(video_list, status_msg)
|
|
|
|
| 785 |
|
| 786 |
generate_btn.click(
|
| 787 |
fn=process_inference,
|
| 788 |
+
inputs=[video_input, text_input, neg_prompt_input, guidance_scale, inference_steps, sample_nums],
|
| 789 |
outputs=[
|
| 790 |
video_output_1, # Sample 1 value
|
| 791 |
video_output_2, # Sample 2 value
|
|
|
|
| 818 |
if not result_video:
|
| 819 |
status_msg += f"\n⚠️ Result video not found: {ex['result_path']}"
|
| 820 |
|
| 821 |
+
return video_file, ex['caption'], "noisy, harsh", result_video, status_msg
|
| 822 |
return handler
|
| 823 |
|
| 824 |
btn.click(
|
| 825 |
fn=create_example_handler(example),
|
| 826 |
+
outputs=[video_input, text_input, neg_prompt_input, video_output_1, result_text]
|
| 827 |
)
|
| 828 |
|
| 829 |
# Footer
|
hunyuanvideo_foley/utils/feature_utils.py
CHANGED
|
@@ -10,7 +10,7 @@ from typing import Any, Dict, List, Union, Tuple
|
|
| 10 |
from loguru import logger
|
| 11 |
|
| 12 |
from .config_utils import AttributeDict
|
| 13 |
-
from ..constants import FPS_VISUAL, MAX_VIDEO_DURATION_SECONDS
|
| 14 |
|
| 15 |
|
| 16 |
class FeatureExtractionError(Exception):
|
|
@@ -134,9 +134,10 @@ def encode_text_feat(text: List[str], model_dict):
|
|
| 134 |
return outputs.last_hidden_state, outputs.attentions
|
| 135 |
|
| 136 |
|
| 137 |
-
def feature_process(video_path, prompt, model_dict, cfg):
|
| 138 |
visual_feats, audio_len_in_s = encode_video_features(video_path, model_dict)
|
| 139 |
-
neg_prompt
|
|
|
|
| 140 |
prompts = [neg_prompt, prompt]
|
| 141 |
text_feat_res, text_feat_mask = encode_text_feat(prompts, model_dict)
|
| 142 |
|
|
|
|
| 10 |
from loguru import logger
|
| 11 |
|
| 12 |
from .config_utils import AttributeDict
|
| 13 |
+
from ..constants import FPS_VISUAL, MAX_VIDEO_DURATION_SECONDS, DEFAULT_NEGATIVE_PROMPT
|
| 14 |
|
| 15 |
|
| 16 |
class FeatureExtractionError(Exception):
|
|
|
|
| 134 |
return outputs.last_hidden_state, outputs.attentions
|
| 135 |
|
| 136 |
|
| 137 |
+
def feature_process(video_path, prompt, model_dict, cfg, neg_prompt=None):
|
| 138 |
visual_feats, audio_len_in_s = encode_video_features(video_path, model_dict)
|
| 139 |
+
if neg_prompt is None:
|
| 140 |
+
neg_prompt = DEFAULT_NEGATIVE_PROMPT # 使用常量中的默认值
|
| 141 |
prompts = [neg_prompt, prompt]
|
| 142 |
text_feat_res, text_feat_mask = encode_text_feat(prompts, model_dict)
|
| 143 |
|