Spaces:
Runtime error
Runtime error
| # pytorch_lightning==2.2.2 | |
| seed_everything: 33 | |
| trainer: | |
| accelerator: auto | |
| strategy: auto | |
| devices: '1' | |
| num_nodes: 1 | |
| precision: 16-mixed | |
| logger: False | |
| model: | |
| class_path: diffusion_trainer.streaming_svd.StreamingSVD | |
| init_args: | |
| vfi: | |
| class_path: modules.params.vfi.VFIParams | |
| init_args: | |
| ckpt_path_local: checkpoint/VFI/ours.pkl | |
| ckpt_path_global: https://drive.google.com/file/d/1XCNoyhA1RX3m8W-XJK8H8inH47l36kxP/view?usp=sharing | |
| i2v_enhance: | |
| class_path: modules.params.i2v_enhance.I2VEnhanceParams | |
| init_args: | |
| ckpt_path_local: checkpoint/i2v_enhance/ | |
| ckpt_path_global: ali-vilab/i2vgen-xl | |
| module_loader: | |
| class_path: modules.loader.module_loader.GenericModuleLoader | |
| init_args: | |
| pipeline_repo: stabilityai/stable-video-diffusion-img2vid-xt | |
| pipeline_obj: streamingt2v_pipeline | |
| set_prediction_type: '' | |
| module_names: | |
| - network_config | |
| - model | |
| - controlnet | |
| - denoiser | |
| - conditioner | |
| - first_stage_model | |
| - sampler | |
| - svd_pipeline | |
| module_config: | |
| controlnet: | |
| class_path: modules.loader.module_loader_config.ModuleLoaderConfig | |
| init_args: | |
| loader_cls_path: models.control.controlnet.ControlNet | |
| cls_func: from_unet | |
| cls_func_fast_dev_run: '' | |
| kwargs_diffusers: null | |
| model_params: | |
| merging_mode: addition | |
| zero_conv_mode: Identity | |
| frame_expansion: none | |
| downsample_controlnet_cond: true | |
| use_image_encoder_normalization: true | |
| use_controlnet_mask: false | |
| condition_encoder: '' | |
| conditioning_embedding_out_channels: | |
| - 32 | |
| - 96 | |
| - 256 | |
| - 512 | |
| kwargs_diff_trainer_params: null | |
| args: [] | |
| dependent_modules: | |
| model: model | |
| dependent_modules_cloned: null | |
| state_dict_path: '' | |
| strict_loading: true | |
| state_dict_filters: [] | |
| network_config: | |
| class_path: models.diffusion.video_model.VideoUNet | |
| init_args: | |
| in_channels: 8 | |
| model_channels: 320 | |
| out_channels: 4 | |
| num_res_blocks: 2 | |
| num_conditional_frames: null | |
| attention_resolutions: | |
| - 4 | |
| - 2 | |
| - 1 | |
| dropout: 0.0 | |
| channel_mult: | |
| - 1 | |
| - 2 | |
| - 4 | |
| - 4 | |
| conv_resample: true | |
| dims: 2 | |
| num_classes: sequential | |
| use_checkpoint: False | |
| num_heads: -1 | |
| num_head_channels: 64 | |
| num_heads_upsample: -1 | |
| use_scale_shift_norm: false | |
| resblock_updown: false | |
| transformer_depth: 1 | |
| transformer_depth_middle: null | |
| context_dim: 1024 | |
| time_downup: false | |
| time_context_dim: null | |
| extra_ff_mix_layer: true | |
| use_spatial_context: true | |
| merge_strategy: learned_with_images | |
| merge_factor: 0.5 | |
| spatial_transformer_attn_type: softmax-xformers | |
| video_kernel_size: | |
| - 3 | |
| - 1 | |
| - 1 | |
| use_linear_in_transformer: true | |
| adm_in_channels: 768 | |
| disable_temporal_crossattention: false | |
| max_ddpm_temb_period: 10000 | |
| merging_mode: attention_cross_attention | |
| controlnet_mode: true | |
| use_apm: false | |
| model: | |
| class_path: modules.loader.module_loader_config.ModuleLoaderConfig | |
| init_args: | |
| loader_cls_path: models.svd.sgm.modules.diffusionmodules.wrappers.OpenAIWrapper | |
| cls_func: '' | |
| cls_func_fast_dev_run: '' | |
| kwargs_diffusers: | |
| compile_model: false | |
| model_params: null | |
| model_params_fast_dev_run: null | |
| kwargs_diff_trainer_params: null | |
| args: [] | |
| dependent_modules: | |
| diffusion_model: network_config | |
| dependent_modules_cloned: null | |
| state_dict_path: '' | |
| strict_loading: true | |
| state_dict_filters: [] | |
| denoiser: | |
| class_path: models.svd.sgm.modules.diffusionmodules.denoiser.Denoiser | |
| init_args: | |
| scaling_config: | |
| target: models.svd.sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise | |
| sampler: | |
| class_path: models.svd.sgm.modules.diffusionmodules.sampling.EulerEDMSampler | |
| init_args: | |
| s_churn: 0.0 | |
| s_tmin: 0.0 | |
| s_tmax: .inf | |
| s_noise: 1.0 | |
| discretization_config: | |
| target: models.diffusion.discretizer.AlignYourSteps | |
| params: | |
| sigma_max: 700.0 | |
| num_steps: 30 | |
| guider_config: | |
| target: models.svd.sgm.modules.diffusionmodules.guiders.LinearPredictionGuider | |
| params: | |
| max_scale: 3.0 | |
| min_scale: 1.5 | |
| num_frames: 25 | |
| verbose: false | |
| device: cuda | |
| conditioner: | |
| class_path: models.svd.sgm.modules.GeneralConditioner | |
| init_args: | |
| emb_models: | |
| - is_trainable: false | |
| input_key: cond_frames_without_noise | |
| target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder | |
| params: | |
| n_cond_frames: 1 | |
| n_copies: 1 | |
| open_clip_embedding_config: | |
| target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder | |
| params: | |
| freeze: true | |
| - input_key: fps_id | |
| is_trainable: false | |
| target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND | |
| params: | |
| outdim: 256 | |
| - input_key: motion_bucket_id | |
| is_trainable: false | |
| target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND | |
| params: | |
| outdim: 256 | |
| - input_key: cond_frames | |
| is_trainable: false | |
| target: models.svd.sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder | |
| params: | |
| disable_encoder_autocast: true | |
| n_cond_frames: 1 | |
| n_copies: 1 | |
| is_ae: true | |
| encoder_config: | |
| target: models.svd.sgm.models.autoencoder.AutoencoderKLModeOnly | |
| params: | |
| embed_dim: 4 | |
| monitor: val/rec_loss | |
| ddconfig: | |
| attn_type: vanilla-xformers | |
| double_z: true | |
| z_channels: 4 | |
| resolution: 256 | |
| in_channels: 3 | |
| out_ch: 3 | |
| ch: 128 | |
| ch_mult: | |
| - 1 | |
| - 2 | |
| - 4 | |
| - 4 | |
| num_res_blocks: 2 | |
| attn_resolutions: [] | |
| dropout: 0.0 | |
| lossconfig: | |
| target: torch.nn.Identity | |
| - input_key: cond_aug | |
| is_trainable: false | |
| target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND | |
| params: | |
| outdim: 256 | |
| first_stage_model: | |
| class_path: models.svd.sgm.AutoencodingEngine | |
| init_args: | |
| encoder_config: | |
| target: models.svd.sgm.modules.diffusionmodules.model.Encoder | |
| params: | |
| attn_type: vanilla | |
| double_z: true | |
| z_channels: 4 | |
| resolution: 256 | |
| in_channels: 3 | |
| out_ch: 3 | |
| ch: 128 | |
| ch_mult: | |
| - 1 | |
| - 2 | |
| - 4 | |
| - 4 | |
| num_res_blocks: 2 | |
| attn_resolutions: [] | |
| dropout: 0.0 | |
| decoder_config: | |
| target: models.svd.sgm.modules.autoencoding.temporal_ae.VideoDecoder | |
| params: | |
| attn_type: vanilla | |
| double_z: true | |
| z_channels: 4 | |
| resolution: 256 | |
| in_channels: 3 | |
| out_ch: 3 | |
| ch: 128 | |
| ch_mult: | |
| - 1 | |
| - 2 | |
| - 4 | |
| - 4 | |
| num_res_blocks: 2 | |
| attn_resolutions: [] | |
| dropout: 0.0 | |
| video_kernel_size: | |
| - 3 | |
| - 1 | |
| - 1 | |
| loss_config: | |
| target: torch.nn.Identity | |
| regularizer_config: | |
| target: models.svd.sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer | |
| optimizer_config: null | |
| lr_g_factor: 1.0 | |
| trainable_ae_params: null | |
| ae_optimizer_args: null | |
| trainable_disc_params: null | |
| disc_optimizer_args: null | |
| disc_start_iter: 0 | |
| diff_boost_factor: 3.0 | |
| ckpt_engine: null | |
| ckpt_path: null | |
| additional_decode_keys: null | |
| ema_decay: null | |
| monitor: null | |
| input_key: jpg | |
| svd_pipeline: | |
| class_path: modules.loader.module_loader_config.ModuleLoaderConfig | |
| init_args: | |
| loader_cls_path: diffusers.StableVideoDiffusionPipeline | |
| cls_func: from_pretrained | |
| cls_func_fast_dev_run: '' | |
| kwargs_diffusers: | |
| torch_dtype: torch.float16 | |
| variant: fp16 | |
| use_safetensors: true | |
| model_params: null | |
| model_params_fast_dev_run: null | |
| kwargs_diff_trainer_params: null | |
| args: | |
| - stabilityai/stable-video-diffusion-img2vid-xt | |
| dependent_modules: null | |
| dependent_modules_cloned: null | |
| state_dict_path: '' | |
| strict_loading: true | |
| state_dict_filters: [] | |
| root_cls: null | |
| diff_trainer_params: | |
| class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.DiffusionTrainerParams | |
| init_args: | |
| scale_factor: 0.18215 | |
| streamingsvd_ckpt: | |
| class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.CheckpointDescriptor | |
| init_args: | |
| ckpt_path_local: checkpoint/StreamingSVD/model.safetensors | |
| ckpt_path_global: PAIR/StreamingSVD/resolve/main/model.safetensors | |
| disable_first_stage_autocast: true | |
| inference_params: | |
| class_path: modules.params.diffusion.inference_params.T2VInferenceParams | |
| init_args: | |
| n_autoregressive_generations: 2 # Number of autoregressive generations performed by StreamingSVD | |
| num_conditional_frames: 7 # TODO: confirm whether this parameter is still used | |
| anchor_frames: '6' # The (N+1)-th frame, where N is this value, is taken as the CLIP encoding for StreamingSVD | |
| reset_seed_per_generation: true # If true, the seed is reset on every generation | |