Spaces:

alexnasa
/

OmniAvatar

Running on Zero

App Files Files Community

alexnasa committed on Aug 2

Commit

35abee1

verified ·

1 Parent(s): 7babf45

Update args_config.yaml

Browse files

Files changed (1) hide show

args_config.yaml +76 -76

args_config.yaml CHANGED Viewed

@@ -1,77 +1,77 @@
-config: configs/inference.yaml
-input_file: examples/infer_samples.txt
-debug: null
-infer: false
-hparams: ''
-dtype: bf16
-exp_path: pretrained_models/OmniAvatar-14B
-text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth
-image_encoder_path: None
-dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
-vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth
-# exp_path: pretrained_models/OmniAvatar-1.3B
-# text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
-# image_encoder_path: None
-# dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
-# vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
-wav2vec_path: pretrained_models/wav2vec2-base-960h
-num_persistent_param_in_dit:
-reload_cfg: true
-sp_size: 1
-seed: 42
-image_sizes_720:
-- - 400
-  - 720
-# - - 720 commented out due to the duration needed on HF
-#   - 720
-- - 720
-  - 400
-image_sizes_1280:
-- - 720
-  - 720
-- - 528
-  - 960
-- - 960
-  - 528
-- - 720
-  - 1280
-- - 1280
-  - 720
-max_hw: 720
-max_tokens: 40000
-seq_len: 200
-overlap_frame: 13
-guidance_scale: 4.5
-audio_scale: null
-num_steps: 8
-fps: 24
-sample_rate: 16000
-negative_prompt: Vivid color tones, background/camera moving quickly, screen switching,
-  subtitles and special effects, mutation, overexposed, static, blurred details, subtitles,
-  style, work, painting, image, still, overall grayish, worst quality, low quality,
-  JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly
-  drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image,
-  chaotic background, three legs, crowded background with many people, walking backward
-silence_duration_s: 0.0
-use_fsdp: false
-tea_cache_l1_thresh: 0
-rank: 0
-world_size: 1
-local_rank: 0
-device: cuda
-num_nodes: 1
-i2v: true
-use_audio: true
-random_prefix_frames: true
-model_config:
-  in_dim: 33
-  audio_hidden_size: 32
-train_architecture: lora
-lora_target_modules: q,k,v,o,ffn.0,ffn.2
-init_lora_weights: kaiming
-lora_rank: 128
 lora_alpha: 64.0

+config: configs/inference.yaml
+input_file: examples/infer_samples.txt
+debug: null
+infer: false
+hparams: ''
+dtype: bf16
+exp_path: pretrained_models/OmniAvatar-14B
+text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth
+image_encoder_path: None
+dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
+vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth
+# exp_path: pretrained_models/OmniAvatar-1.3B
+# text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
+# image_encoder_path: None
+# dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
+# vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
+wav2vec_path: pretrained_models/wav2vec2-base-960h
+num_persistent_param_in_dit:
+reload_cfg: true
+sp_size: 1
+seed: 42
+image_sizes_720:
+- - 400
+  - 720
+# - - 720 commented out due to the duration needed on HF
+#   - 720
+# - - 720
+#   - 400
+image_sizes_1280:
+- - 720
+  - 720
+- - 528
+  - 960
+- - 960
+  - 528
+- - 720
+  - 1280
+- - 1280
+  - 720
+max_hw: 720
+max_tokens: 40000
+seq_len: 200
+overlap_frame: 13
+guidance_scale: 4.5
+audio_scale: null
+num_steps: 8
+fps: 24
+sample_rate: 16000
+negative_prompt: Vivid color tones, background/camera moving quickly, screen switching,
+  subtitles and special effects, mutation, overexposed, static, blurred details, subtitles,
+  style, work, painting, image, still, overall grayish, worst quality, low quality,
+  JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly
+  drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image,
+  chaotic background, three legs, crowded background with many people, walking backward
+silence_duration_s: 0.0
+use_fsdp: false
+tea_cache_l1_thresh: 0
+rank: 0
+world_size: 1
+local_rank: 0
+device: cuda
+num_nodes: 1
+i2v: true
+use_audio: true
+random_prefix_frames: true
+model_config:
+  in_dim: 33
+  audio_hidden_size: 32
+train_architecture: lora
+lora_target_modules: q,k,v,o,ffn.0,ffn.2
+init_lora_weights: kaiming
+lora_rank: 128
 lora_alpha: 64.0