diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..8049651d2658ff02f940bb7c84cc46be2b69dee4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,47 @@
+.*
+*.py[cod]
+# *.jpg
+*.jpeg
+# *.png
+*.gif
+*.bmp
+*.mp4
+*.mov
+*.mkv
+*.log
+*.zip
+*.pt
+*.pth
+*.ckpt
+*.safetensors
+#*.json
+# *.txt
+*.backup
+*.pkl
+*.html
+*.pdf
+*.whl
+*.exe
+cache
+__pycache__/
+storage/
+samples/
+!.gitignore
+!requirements.txt
+.DS_Store
+*DS_Store
+google/
+Wan2.1-T2V-14B/
+Wan2.1-T2V-1.3B/
+Wan2.1-I2V-14B-480P/
+Wan2.1-I2V-14B-720P/
+outputs/
+outputs2/
+gradio_outputs/
+ckpts/
+loras/
+loras_i2v/
+
+settings/
+
+wgp_config.json
diff --git a/Custom Resolutions Instructions.txt b/Custom Resolutions Instructions.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c11f25dc3d29d2142b1cb4254e9bc7562ec1835e
--- /dev/null
+++ b/Custom Resolutions Instructions.txt	
@@ -0,0 +1,16 @@
+You can override the choice of Resolutions offered by WanGP by creating a file "resolutions.json" in the main WanGP folder.
+This file is composed of a list of two-element sublists. Each sublist should have the format ["Label", "WxH"], where W and H are respectively the Width and Height of the resolution. Please make sure that W and H are multiples of 16, and that the letter "x" is placed between the two dimensions.
+
+Below is a sample "resolutions.json" file:
+
+[
+	["1280x720 (16:9, 720p)", "1280x720"],
+	["720x1280 (9:16, 720p)", "720x1280"], 
+	["1024x1024 (1:1, 720p)", "1024x1024"],
+	["1280x544 (21:9, 720p)", "1280x544"],
+	["544x1280 (9:21, 720p)", "544x1280"],
+	["1104x832 (4:3, 720p)", "1104x832"],
+	["832x1104 (3:4, 720p)", "832x1104"],
+    ["960x960 (1:1, 720p)", "960x960"],
+    ["832x480 (16:9, 480p)", "832x480"]
+]
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..927c579fd9240970de1a43a049ca0d29411cfecf
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,92 @@
+FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+
+# Build arg for GPU architectures - specify which CUDA compute capabilities to compile for
+# Common values:
+#   7.0  - Tesla V100
+#   7.5  - RTX 2060, 2070, 2080, Titan RTX
+#   8.0  - A100, A800 (Ampere data center)
+#   8.6  - RTX 3060, 3070, 3080, 3090 (Ampere consumer)
+#   8.9  - RTX 4070, 4080, 4090 (Ada Lovelace)
+#   9.0  - H100, H800 (Hopper data center)
+#   12.0 - RTX 5070, 5080, 5090 (Blackwell) - Note: sm_120 architecture
+#
+# Examples:
+#   RTX 3060: --build-arg CUDA_ARCHITECTURES="8.6"
+#   RTX 4090: --build-arg CUDA_ARCHITECTURES="8.9"
+#   Multiple: --build-arg CUDA_ARCHITECTURES="8.0;8.6;8.9"
+#
+# Note: Including 8.9 or 9.0 may cause compilation issues on some setups
+# Default includes 8.0 and 8.6 for broad Ampere compatibility
+ARG CUDA_ARCHITECTURES="8.0;8.6"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies
+RUN apt update && \
+    apt install -y \
+    python3 python3-pip git wget curl cmake ninja-build \
+    libgl1 libglib2.0-0 ffmpeg && \
+    apt clean
+
+WORKDIR /workspace
+
+COPY requirements.txt .
+
+# Upgrade pip first
+RUN pip install --upgrade pip setuptools wheel
+
+# Install Python dependencies from requirements.txt
+RUN pip install -r requirements.txt
+
+# Install PyTorch with CUDA support
+RUN pip install --extra-index-url https://download.pytorch.org/whl/cu124 \
+    torch==2.6.0+cu124 torchvision==0.21.0+cu124
+
+# Install SageAttention from git (patch GPU detection)
+ENV TORCH_CUDA_ARCH_LIST="${CUDA_ARCHITECTURES}"
+ENV FORCE_CUDA="1"
+ENV MAX_JOBS="1"
+
+COPY <
+Made with ❤️ by DeepBeepMeep +
diff --git a/defaults/ReadMe.txt b/defaults/ReadMe.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c98ee2ec959c9fca2bf66d3f5d63a91bc4f5c337
--- /dev/null
+++ b/defaults/ReadMe.txt
@@ -0,0 +1,13 @@
+Please do not modify any file in this Folder.
+
+If you want to change a property of a default model, copy the corresponding model file into the ./finetunes folder and modify the properties you want to change in the new file.
+If a property is not in the new file, it will be inherited automatically from the default file with the same name.
+
+For instance, to hide a model:
+
+{
+    "model":
+    {
+        "visible": false
+    }
+}
diff --git a/defaults/animate.json b/defaults/animate.json
new file mode 100644
index 0000000000000000000000000000000000000000..bdcb6fefd14e4d2d2b98345c9b91e465c8c461bf
--- /dev/null
+++ b/defaults/animate.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Wan2.2 Animate 14B",
+        "architecture": "animate",
+        "description": "Wan-Animate takes a video and a character image as input, and generates a video in either 'Animation' or 'Replacement' mode. A Sliding Window of at least 81 frames is recommended to obtain the best Style continuity.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_animate_14B_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_animate_14B_quanto_fp16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_animate_14B_quanto_bf16_int8.safetensors"
+        ],
+        "preload_URLs" :
+        [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_animate_relighting_lora.safetensors"
+        ],
+        "group": "wan2_2"
+    }
+}
\ No newline at end of file
diff --git a/defaults/fantasy.json b/defaults/fantasy.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc09cee9fd76acc378d7c3829a6f34eed7bc8ff2
--- /dev/null
+++ b/defaults/fantasy.json
@@ -0,0 +1,11 @@
+{
+    "model":
+    {
+        "name": "Fantasy Talking 720p 14B",
+        "architecture" : "fantasy",
+        "modules": [ ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_fantasy_speaking_14B_bf16.safetensors"]],
+        "description": "The Fantasy Talking model corresponds to the original Wan image 2 video model combined with the Fantasy Speaking module to process an audio Input.",
+        "URLs": "i2v_720p"
+    },
+    "resolution": "1280x720"
+}
diff --git a/defaults/flf2v_720p.json b/defaults/flf2v_720p.json
new file mode 100644
index 0000000000000000000000000000000000000000..b25c4387a2904774d54ae26095560d0d429ee38a
--- /dev/null
+++ b/defaults/flf2v_720p.json
@@ -0,0 +1,16 @@
+{
+    "model":
+    {
+        "name": "First Last Frame to Video 720p (FLF2V) 14B",
+        "architecture" : "flf2v_720p",
+        "visible" : true,
+        "description": "The First Last Frame 2 Video model is the official Image 2 Video model that supports Start and End frames.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_FLF2V_720p_14B_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_FLF2V_720p_14B_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_FLF2V_720p_14B_quanto_mfp16_int8.safetensors"
+        ],
+        "auto_quantize": true
+    },
+    "resolution": "1280x720"
+}
\ No newline at end of file
diff --git a/defaults/flux.json b/defaults/flux.json
new file mode 100644
index 0000000000000000000000000000000000000000..724ec1abb1efc27fd78956a82e8a2cc2f40aaf52
--- /dev/null
+++ b/defaults/flux.json
@@ -0,0 +1,15 @@
+{
+    "model": {
+        "name": "Flux 1 Dev 12B",
+        "architecture": "flux",
+        "description": "FLUX.1 Dev is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev_quanto_bf16_int8.safetensors"
+        ],
+        "image_outputs": true
+    },
+    "prompt": "draw a hat",
+    "resolution": "1280x720",
+    "batch_size": 1
+}
\ No newline at end of file
diff --git a/defaults/flux_chroma.json b/defaults/flux_chroma.json
new file mode 100644
index 0000000000000000000000000000000000000000..ebb8076be67fca39c6123124abc1a85548717e7d
--- /dev/null
+++ b/defaults/flux_chroma.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Flux 1 Chroma 1 HD 8.9B",
+        "architecture": "flux_chroma",
+        "description": "FLUX.1 Chroma is an 8.9 billion parameter model. As a base model, Chroma1 is intentionally designed to be an excellent starting point for finetuning. It provides a strong, neutral foundation for developers, researchers, and artists to create specialized models.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-chroma_hd_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-chroma_hd_quanto_bf16_int8.safetensors"
+        ],
+        "image_outputs": true
+    },
+    "prompt": "draw a hat",
+    "resolution": "1280x720",
+    "guidance_scale": 3.0,
+    "num_inference_steps": 20,
+    "batch_size": 1
+}
\ No newline at end of file
diff --git a/defaults/flux_dev_kontext.json b/defaults/flux_dev_kontext.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf3631a1f51d7c2a88b7e0d860220ced0afd2cb6
--- /dev/null
+++ b/defaults/flux_dev_kontext.json
@@ -0,0 +1,16 @@
+{
+    "model": {
+        "name": "Flux 1 Dev Kontext 12B",
+        "architecture": "flux_dev_kontext",
+        "description": "FLUX.1 Kontext is a 12 billion parameter rectified flow transformer capable of editing images based on instructions stored in the Prompt. Please be aware that Flux Kontext is picky about the resolution of the input image and the output dimensions may not match the dimensions of the input image.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_quanto_bf16_int8.safetensors"
+        ]
+    },
+    "prompt": "add a hat",
+    "resolution": "1280x720",
+    "batch_size": 1
+}
+
+
\ No newline at end of file
diff --git a/defaults/flux_dev_umo.json b/defaults/flux_dev_umo.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5e8e7d922e776a51cd7e73f61b30d12c369d083
--- /dev/null
+++ b/defaults/flux_dev_umo.json
@@ -0,0 +1,23 @@
+{
+    "model": {
+        "name": "Flux 1 UMO Dev 12B",
+        "architecture": "flux_dev_umo",
+        "description": "FLUX.1 UMO Dev is a model that can Edit Images with a specialization in combining multiple image references (resized internally at 512x512 max) to produce an Image output. Best Image preservation at 768x768 Resolution Output.",
+        "URLs": "flux",
+        "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-UMO_dit_lora_bf16.safetensors"],
+        "resolutions": [ ["1024x1024 (1:1)", "1024x1024"],
+                        ["768x1024 (3:4)", "768x1024"],
+                        ["1024x768 (4:3)", "1024x768"],
+                        ["512x1024 (1:2)", "512x1024"],
+                        ["1024x512 (2:1)", "1024x512"],
+                        ["768x768 (1:1)", "768x768"],
+                        ["768x512 (3:2)", "768x512"],
+                        ["512x768 (2:3)", "512x768"]]
+    },
+    "prompt": "the man is wearing a hat",
+    "embedded_guidance_scale": 4,
+    "resolution": "768x768",
+    "batch_size": 1
+}
+
+
\ No newline at end of file
diff --git a/defaults/flux_dev_uso.json b/defaults/flux_dev_uso.json
new file mode 100644
index 0000000000000000000000000000000000000000..4b429210ef50820cd84a0aa6ef572cf713c218d3
--- /dev/null
+++ b/defaults/flux_dev_uso.json
@@ -0,0 +1,16 @@
+{
+    "model": {
+        "name": "Flux 1 USO Dev 12B",
+        "architecture": "flux_dev_uso",
+        "description": "FLUX.1 USO Dev is a model that can Edit Images with a specialization in Style Transfers (up to two).",
+        "modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]],
+        "URLs": "flux",
+        "loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"]
+    },
+    "prompt": "the man is wearing a hat",
+    "embedded_guidance_scale": 4,
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
+
+
\ No newline at end of file
diff --git a/defaults/flux_krea.json b/defaults/flux_krea.json
new file mode 100644
index 0000000000000000000000000000000000000000..669e1a595de29ea4d2bc6679f3e62ddf1b8fbb1c
--- /dev/null
+++ b/defaults/flux_krea.json
@@ -0,0 +1,15 @@
+{
+    "model": {
+        "name": "Flux 1 Dev Krea 12B",
+        "architecture": "flux",
+        "description": "Cutting-edge output quality, with a focus on aesthetic photography.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-krea-dev_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-krea-dev_quanto_bf16_int8.safetensors"
+        ],
+        "image_outputs": true
+    },
+    "prompt": "draw a hat",
+    "resolution": "1280x720",
+    "batch_size": 1
+}
\ No newline at end of file
diff --git a/defaults/flux_schnell.json b/defaults/flux_schnell.json
new file mode 100644
index 0000000000000000000000000000000000000000..1645a86ab56251a5bdb39bce24befc327ac0eaac
--- /dev/null
+++ b/defaults/flux_schnell.json
@@ -0,0 +1,16 @@
+{
+    "model": {
+        "name": "Flux 1 Schnell 12B",
+        "architecture": "flux_schnell",
+        "description": "FLUX.1 Schnell is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. As a distilled model it requires fewer denoising steps.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-schnell_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-schnell_quanto_bf16_int8.safetensors"
+        ],
+        "image_outputs": true
+    },
+    "prompt": "draw a hat",
+    "resolution": "1280x720",
+    "num_inference_steps": 10,
+    "batch_size": 1
+}
\ No newline at end of file
diff --git a/defaults/flux_srpo.json b/defaults/flux_srpo.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b1c447e98ba63575615be70289f2117d18f7821
--- /dev/null
+++ b/defaults/flux_srpo.json
@@ -0,0 +1,14 @@
+{
+    "model": {
+        "name": "Flux 1 Dev SRPO 12B",
+        "architecture": "flux",
+        "description": "By fine-tuning the FLUX.1.dev model with optimized denoising and online reward adjustment, SRPO improves its human-evaluated realism and aesthetic quality by over 3x.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_quanto_bf16_int8.safetensors"
+        ]
+    },
+    "prompt": "draw a hat",
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
\ No newline at end of file
diff --git a/defaults/flux_srpo_uso.json b/defaults/flux_srpo_uso.json
new file mode 100644
index 0000000000000000000000000000000000000000..eed67bd36d97f4c151021360753f500003d30371
--- /dev/null
+++ b/defaults/flux_srpo_uso.json
@@ -0,0 +1,16 @@
+{
+    "model": {
+        "name": "Flux 1 USO SRPO 12B",
+        "architecture": "flux_dev_uso",
+        "description": "FLUX.1 USO SRPO is a model that can Edit Images with a specialization in Style Transfers (up to two). It leverages the improved Image quality brought by the SRPO process.",
+        "modules": [ "flux_dev_uso"],
+        "URLs": "flux_srpo",
+        "loras": "flux_dev_uso"
+    },
+    "prompt": "the man is wearing a hat",
+    "embedded_guidance_scale": 4,
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
+
+
\ No newline at end of file
diff --git a/defaults/fun_inp.json b/defaults/fun_inp.json
new file mode 100644
index 0000000000000000000000000000000000000000..65330cd128661c6271705697997bd9780a93617c
--- /dev/null
+++ b/defaults/fun_inp.json
@@ -0,0 +1,13 @@
+{
+    "model":
+    {
+        "name": "Fun InP image2video 14B",
+        "architecture" : "fun_inp",
+        "description": "The Fun model is an alternative image 2 video model that supports out of the box End Image fixing (contrary to the original Wan image 2 video model).",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Fun_InP_14B_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Fun_InP_14B_quanto_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Fun_InP_14B_quanto_fp16_int8.safetensors"
+        ]
+    }
+}
diff --git a/defaults/fun_inp_1.3B.json b/defaults/fun_inp_1.3B.json
new file mode 100644
index 0000000000000000000000000000000000000000..9d60e63e081c129f1744e8700d279d417de5d705
--- /dev/null
+++ b/defaults/fun_inp_1.3B.json
@@ -0,0 +1,11 @@
+{
+    "model":
+    {
+        "name": "Fun InP image2video 1.3B",
+        "architecture" : "fun_inp_1.3B",
+        "description": "The Fun model is an alternative image 2 video model that supports out of the box End Image fixing (contrary to the original Wan image 2 video model). It also adds image 2 video capability to the 1.3B model.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Fun_InP_1.3B_bf16.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/hunyuan.json b/defaults/hunyuan.json
new file mode 100644
index 0000000000000000000000000000000000000000..a6ba832b82cbeb9974c983436ad2adb614643124
--- /dev/null
+++ b/defaults/hunyuan.json
@@ -0,0 +1,12 @@
+{
+    "model":
+    {
+        "name": "Hunyuan Video Text2video 720p 13B",
+        "architecture" : "hunyuan",
+        "description": "Probably the best text 2 video model available.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_720_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_720_quanto_int8.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/hunyuan_avatar.json b/defaults/hunyuan_avatar.json
new file mode 100644
index 0000000000000000000000000000000000000000..d01c318fde0702b7e81f2d7478df3d260592ceb4
--- /dev/null
+++ b/defaults/hunyuan_avatar.json
@@ -0,0 +1,12 @@
+{
+    "model":
+    {
+        "name": "Hunyuan Video Avatar 720p 13B",
+        "architecture" : "hunyuan_avatar",
+        "description": "With the Hunyuan Video Avatar model you can animate a person based on the content of an audio input. Please note that the video generator works by processing 128 frame segments at a time (even if you ask for less). The good news is that it will concatenate multiple segments for long video generation (max 3 segments recommended, as the quality will get worse).",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_avatar_720_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_avatar_720_quanto_bf16_int8.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/hunyuan_custom.json b/defaults/hunyuan_custom.json
new file mode 100644
index 0000000000000000000000000000000000000000..d6217e9f5c6fdb2bef0a16f9fe9de6a18afa7563
--- /dev/null
+++ b/defaults/hunyuan_custom.json
@@ -0,0 +1,12 @@
+{
+    "model":
+    {
+        "name": "Hunyuan Video Custom 720p 13B",
+        "architecture" : "hunyuan_custom",
+        "description": "The Hunyuan Video Custom model is probably the best model to transfer people (only people for the moment) as it is quite good at keeping their identity. However it is slow since, to get good results, you need to generate 720p videos with 30 steps.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_custom_720_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_custom_720_quanto_bf16_int8.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/hunyuan_custom_audio.json b/defaults/hunyuan_custom_audio.json
new file mode 100644
index 0000000000000000000000000000000000000000..f5c4d52345d24b83f83cb0c503965d064e50356e
--- /dev/null
+++ b/defaults/hunyuan_custom_audio.json
@@ -0,0 +1,12 @@
+{
+    "model":
+    {
+        "name": "Hunyuan Video Custom Audio 720p 13B",
+        "architecture" : "hunyuan_custom_audio",
+        "description": "The Hunyuan Video Custom Audio model can be used to generate scenes of a person speaking given a Reference Image and a Recorded Voice or Song. The reference image is not a start image, and therefore one can represent the person in a different context. The video length can be anything up to 10s. It is also quite good at generating Videos without sound based on a person.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_custom_audio_720_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_custom_audio_720_quanto_bf16_int8.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/hunyuan_custom_edit.json b/defaults/hunyuan_custom_edit.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf037e7eb1e927293488da57f2d2dcee51af1dd
--- /dev/null
+++ b/defaults/hunyuan_custom_edit.json
@@ -0,0 +1,12 @@
+{
+    "model":
+    {
+        "name": "Hunyuan Video Custom Edit 720p 13B",
+        "architecture" : "hunyuan_custom_edit",
+        "description": "The Hunyuan Video Custom Edit model can be used to do Video inpainting on a person (add accessories or completely replace the person). You will in any case need to define a Video Mask which indicates which area of the Video should be edited.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_custom_edit_720_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_custom_edit_720_quanto_bf16_int8.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/hunyuan_i2v.json b/defaults/hunyuan_i2v.json
new file mode 100644
index 0000000000000000000000000000000000000000..44722da6b4445c79a7349eab72ff6681c62f1be7
--- /dev/null
+++ b/defaults/hunyuan_i2v.json
@@ -0,0 +1,12 @@
+{
+    "model":
+    {
+        "name": "Hunyuan Video Image2video 720p 13B",
+        "architecture" : "hunyuan_i2v",
+        "description": "A good looking image 2 video model, but not so good at prompt adherence.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_i2v_720_bf16v2.safetensors",
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/hunyuan_video_i2v_720_quanto_int8v2.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/hunyuan_t2v_accvideo.json b/defaults/hunyuan_t2v_accvideo.json
new file mode 100644
index 0000000000000000000000000000000000000000..2da984a8da95167ef76a4f82d3ca219ff72f9972
--- /dev/null
+++ b/defaults/hunyuan_t2v_accvideo.json
@@ -0,0 +1,30 @@
+{
+    "model": {
+        "name": "Hunyuan Video Text2video 720p AccVideo 13B",
+        "architecture": "hunyuan",
+        "description": "AccVideo is a novel efficient distillation method to accelerate video diffusion models with a synthetic dataset. Our method is 8.5x faster than HunyuanVideo.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/accvideo_hunyuan_video_720_quanto_int8.safetensors"
+        ],
+        "preload_URLs": [
+        ],
+        "auto_quantize": true
+    },
+    "negative_prompt": "",
+    "resolution": "832x480",
+    "video_length": 81,
+    "seed": 42,
+    "num_inference_steps": 5,
+    "flow_shift": 7,
+    "embedded_guidance_scale": 6,
+    "repeat_generation": 1,
+    "loras_multipliers": "",
+    "temporal_upsampling": "",
+    "spatial_upsampling": "",
+    "RIFLEx_setting": 0,
+    "slg_start_perc": 10,
+    "slg_end_perc": 90,
+    "prompt_enhancer": "",
+    "activated_loras": [
+    ]
+}
\ No newline at end of file
diff --git a/defaults/hunyuan_t2v_fast.json b/defaults/hunyuan_t2v_fast.json
new file mode 100644
index 0000000000000000000000000000000000000000..4019e24ef7c0b32a02ab0b232c4abc75c5a40ec7
--- /dev/null
+++ b/defaults/hunyuan_t2v_fast.json
@@ -0,0 +1,32 @@
+{
+    "model": {
+        "name": "Hunyuan Video Text2video 720p FastHunyuan 13B",
+        "architecture": "hunyuan",
+        "description": "Fast Hunyuan is an accelerated HunyuanVideo model. It can sample high quality videos with 6 diffusion steps.",
+        "settings_dir": [ "" ],
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/fast_hunyuan_video_720_quanto_int8.safetensors"
+        ],
+        "preload_URLs": [
+            "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/fast_hunyuan_video_720_quanto_int8_map.json"
+        ],
+        "auto_quantize": true
+    },
+    "negative_prompt": "",
+    "resolution": "832x480",
+    "video_length": 81,
+    "seed": 42,
+    "num_inference_steps": 6,
+    "flow_shift": 17,
+    "embedded_guidance_scale": 6,
+    "repeat_generation": 1,
+    "loras_multipliers": "",
+    "temporal_upsampling": "",
+    "spatial_upsampling": "",
+    "RIFLEx_setting": 0,
+    "slg_start_perc": 10,
+    "slg_end_perc": 90,
+    "prompt_enhancer": "",
+    "activated_loras": [
+    ]
+}
\ No newline at end of file
diff --git a/defaults/i2v.json b/defaults/i2v.json
new file mode 100644
index 0000000000000000000000000000000000000000..ba10691483c09a0ed34ff8769ad429ae182fb18b
--- /dev/null
+++ b/defaults/i2v.json
@@ -0,0 +1,13 @@
+{
+    "model":
+    {
+        "name": "Wan2.1 Image2video 480p 14B",
+        "architecture" : "i2v",
+        "description": "The standard Wan Image 2 Video model specialized to generate 480p videos. It also offers Start and End Image support (End Image is not supported in the original model but seems to work well).",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_image2video_480p_14B_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_image2video_480p_14B_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_image2video_480p_14B_quanto_mfp16_int8.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/i2v_2_2.json b/defaults/i2v_2_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..a032333eb44e54d1095eaf11887c558aa84cb923
--- /dev/null
+++ b/defaults/i2v_2_2.json
@@ -0,0 +1,25 @@
+{
+    "model":
+    {
+        "name": "Wan2.2 Image2video 14B",
+        "architecture" : "i2v_2_2",
+        "description": "Wan 2.2 Image 2 Video model. Contrary to the Wan Image2video 2.1, this model is structurally close to the t2v model. Consequently you will need to store Loras for this model in the t2v Lora Folder.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_high_quanto_mfp16_int8.safetensors"
+        ],
+        "URLs2": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_image2video_14B_low_quanto_mfp16_int8.safetensors"
+        ],
+        "group": "wan2_2"
+    },
+    "guidance_phases": 2,
+    "switch_threshold" : 900,
+    "guidance_scale" : 3.5,
+    "guidance2_scale" : 3.5,
+    "flow_shift" : 5
+
+}
\ No newline at end of file
diff --git a/defaults/i2v_2_2_multitalk.json b/defaults/i2v_2_2_multitalk.json
new file mode 100644
index 0000000000000000000000000000000000000000..9326469ffb2300a0a4427a971cba14ba9c0d543f
--- /dev/null
+++ b/defaults/i2v_2_2_multitalk.json
@@ -0,0 +1,18 @@
+{
+    "model":
+    {
+        "name": "Wan2.2 Multitalk 14B",
+        "architecture" : "i2v_2_2_multitalk",
+        "description": "The Multitalk module of Wan 2.1 has been combined with the Wan 2.2 image 2 video model. It lets up to two people have a conversation.",
+        "modules": ["multitalk"],
+        "URLs": "i2v_2_2",
+        "URLs2": "i2v_2_2",
+        "group": "wan2_2",
+        "visible": false
+    },
+    "switch_threshold" : 900,
+    "guidance_scale" : 3.5,
+    "guidance2_scale" : 3.5,
+    "flow_shift" : 5
+
+}
\ No newline at end of file
diff --git a/defaults/i2v_720p.json b/defaults/i2v_720p.json
new file mode 100644
index 0000000000000000000000000000000000000000..844aab9884efe22aaeb1c9b4aa1b38dc656e5098
--- /dev/null
+++ b/defaults/i2v_720p.json
@@ -0,0 +1,14 @@
+{
+    "model":
+    {
+        "name": "Wan2.1 Image2video 720p 14B",
+        "architecture" : "i2v",
+        "description": "The standard Wan Image 2 Video model specialized to generate 720p videos. It also offers Start and End Image support (End Image is not supported in the original model but seems to work well).",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_image2video_720p_14B_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_image2video_720p_14B_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_image2video_720p_14B_quanto_mfp16_int8.safetensors"
+        ]
+    },
+    "resolution": "1280x720"
+}
\ No newline at end of file
diff --git a/defaults/i2v_fusionix.json b/defaults/i2v_fusionix.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b0a8af548e1b7c3db5eab10a599d509efbc6b19
--- /dev/null
+++ b/defaults/i2v_fusionix.json
@@ -0,0 +1,11 @@
+{
+    "model":
+    {
+        "name": "Wan2.1 Image2video 480p FusioniX 14B",
+        "architecture" : "i2v",
+        "description": "A powerful merged image-to-video model based on the original WAN 2.1 I2V model, enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail.",
+        "URLs": "i2v",
+        "settings_dir": [ "" ],
+        "loras": ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan2.1_I2V_14B_FusionX_LoRA.safetensors"]
+    }
+}
\ No newline at end of file
diff --git a/defaults/i2v_palingenesis_2_2.json b/defaults/i2v_palingenesis_2_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..129ef654dc8b9c88f9875f4c0dd40c9acf1e6801
--- /dev/null
+++ b/defaults/i2v_palingenesis_2_2.json
@@ -0,0 +1,18 @@
+{
+    "model":
+    {
+        "name": "Wan2.2 Image2video Palingenesis 14B",
+        "architecture" : "i2v_2_2",
+        "description": "Wan 2.2 Image 2 Video model. Contrary to the Wan Image2video 2.1, this model is structurally close to the t2v model. Palingenesis is a finetune praised for its high quality.",
+        "URLs": [ "https://huggingface.co/eddy1111111/WAN22.XX_Palingenesis/resolve/main/WAN22.XX_Palingenesis_high_i2v_fix.safetensors"],
+        "URLs2": [ "https://huggingface.co/eddy1111111/WAN22.XX_Palingenesis/resolve/main/WAN22.XX_Palingenesis_low_i2v_fix.safetensors"],
+        "group": "wan2_2"
+    },
+    "ignore_unused_weights": true,
+    "guidance_phases": 2,
+    "switch_threshold" : 900,
+    "guidance_scale" : 3.5,
+    "guidance2_scale" : 3.5,
+    "flow_shift" : 5
+
+}
\ No newline at end of file
diff --git a/defaults/infinitetalk.json b/defaults/infinitetalk.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc28d96e5e1021ac1e9102fe5048401305106f70
--- /dev/null
+++ b/defaults/infinitetalk.json
@@ -0,0 +1,16 @@
+{
+    "model": {
+        "name": "Infinitetalk Single Speaker 480p 14B",
+        "architecture": "infinitetalk",
+        "modules": [
+            [
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_single_14B_mbf16.safetensors",
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_single_14B_quanto_mbf16_int8.safetensors",
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_single_14B_quanto_mfp16_int8.safetensors"
+            ]
+        ],
+        "description": "The Infinitetalk model is an improved version of Multitalk that supports very long videos. This is the single speaker version. The Sliding Window size must be 81 frames to get smooth transitions between shots.",
+        "one_speaker_only": true,
+        "URLs": "i2v"
+    }
+}
\ No newline at end of file
diff --git a/defaults/infinitetalk_multi.json b/defaults/infinitetalk_multi.json
new file mode 100644
index 0000000000000000000000000000000000000000..229ecc778bd7495f5575f3ca02ee7f0c5da2d0ef
--- /dev/null
+++ b/defaults/infinitetalk_multi.json
@@ -0,0 +1,16 @@
+{
+    "model": {
+        "name": "Infinitetalk Multi Speakers 480p 14B",
+        "architecture": "infinitetalk",
+        "modules": [
+            [
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_multi_14B_mbf16.safetensors",
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_multi_14B_quanto_mfp16_int8.safetensors",
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_infinitetalk_multi_14B_quanto_mbf16_int8.safetensors"
+            ]
+        ],
+        "description": "The Infinitetalk model is an improved version of Multitalk that supports very long videos. This is the multi speaker version. The Sliding Window size must be 81 frames to get smooth transitions between shots.",
+        "multi_speakers_only": true,
+        "URLs": "i2v"
+    }
+}
\ No newline at end of file
diff --git a/defaults/ltxv_13B.json b/defaults/ltxv_13B.json
new file mode 100644
index 0000000000000000000000000000000000000000..639442e1aa2989d86a3d1574357d0cb3348afe18
--- /dev/null
+++ b/defaults/ltxv_13B.json
@@ -0,0 +1,19 @@
+{
+    "model":
+    {
+        "name": "LTX Video 0.9.8 13B",
+        "architecture" : "ltxv_13B",
+        "description": "LTX Video is a fast model that can be used to generate very long videos (up to 1800 frames!). It is recommended to keep the number of steps at 30, or you will need to update the file 'ltxv_video/configs/ltxv-13b-0.9.8-dev.yaml'. The LTX Video model expects very long prompts, so don't hesitate to use the Prompt Enhancer.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.8_13B_dev_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.8_13B_dev_quanto_bf16_int8.safetensors"
+        ],
+        "preload_URLs" : [
+            "https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv-097-ic-lora-pose-control-diffusers.safetensors",
+            "https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv-097-ic-lora-depth-control-diffusers.safetensors",
+            "https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv-097-ic-lora-canny-control-diffusers.safetensors"
+        ],
+        "LTXV_config": "models/ltx_video/configs/ltxv-13b-0.9.8-dev.yaml"
+    },
+    "num_inference_steps": 30
+}
diff --git a/defaults/ltxv_distilled.json b/defaults/ltxv_distilled.json
new file mode 100644
index 0000000000000000000000000000000000000000..c570057289f03596ccc9fffe547f7ce7d407680b
--- /dev/null
+++ b/defaults/ltxv_distilled.json
@@ -0,0 +1,15 @@
+{
+    "model":
+    {
+        "name": "LTX Video 0.9.8 Distilled 13B",
+        "architecture" : "ltxv_13B",
+        "description": "LTX Video is a fast model that can be used to generate very long videos (up to 1800 frames!). This distilled version is very fast and retains a high level of quality. The LTX Video model expects very long prompts, so don't hesitate to use the Prompt Enhancer.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.8_13B_distilled_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/LTX_Video/resolve/main/ltxv_0.9.8_13B_distilled_quanto_bf16_int8.safetensors"
+        ],
+        "preload_URLs" : "ltxv_13B",
+        "LTXV_config": "models/ltx_video/configs/ltxv-13b-0.9.8-distilled.yaml"
+    },
+    "num_inference_steps": 6
+}
diff --git a/defaults/lucy_edit.json b/defaults/lucy_edit.json
new file mode 100644
index 0000000000000000000000000000000000000000..57d3d958ab6f82da6f32dba5cbcc1e60df4c473c
--- /dev/null
+++ b/defaults/lucy_edit.json
@@ -0,0 +1,20 @@
+{
+    "model": {
+        "name": "Wan2.2 Lucy Edit 5B",
+        "architecture": "lucy_edit",
+        "description": "Lucy Edit is a video editing model that performs instruction-guided edits on videos using free-text prompts. It supports a variety of edits, such as clothing & accessory changes, character changes, object insertions, and scene replacements, while preserving the motion and composition perfectly.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_lucy_edit_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_lucy_edit_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_lucy_edit_quanto_mfp16_int8.safetensors"
+        ],
+        "settings_dir": "ti2v_2_2",
+        "group": "wan2_2"
+    },
+    "prompt": "change the clothes to red",
+    "video_length": 81,
+    "guidance_scale": 5,
+    "flow_shift": 5,
+    "num_inference_steps": 30,
+    "resolution": "1280x720"
+}
\ No newline at end of file
diff --git a/defaults/lucy_edit_fastwan.json b/defaults/lucy_edit_fastwan.json
new file mode 100644
index 0000000000000000000000000000000000000000..de2830c9fb6fe92ae78ba77e673ab9c02025e51d
--- /dev/null
+++ b/defaults/lucy_edit_fastwan.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Wan2.2 Lucy Edit FastWan 5B",
+        "architecture": "lucy_edit",
+        "description": "Lucy Edit is a video editing model that performs instruction-guided edits on videos using free-text prompts. It supports a variety of edits, such as clothing & accessory changes, character changes, object insertions, and scene replacements, while preserving the motion and composition perfectly. This is the FastWan version for faster generation.",
+        "URLs": "lucy_edit",
+        "group": "wan2_2",
+        "settings_dir": [ "" ],
+        "loras": "ti2v_2_2_fastwan"
+    },
+    "prompt": "change the clothes to red",
+    "video_length": 81,
+    "guidance_scale": 1,
+    "flow_shift": 3,
+    "num_inference_steps": 5,
+    "resolution": "1280x720"
+}
\ No newline at end of file
diff --git a/defaults/lynx.json b/defaults/lynx.json
new file mode 100644
index 0000000000000000000000000000000000000000..528f5ef68300306279f6098bace1ceb24958a79e
--- /dev/null
+++ b/defaults/lynx.json
@@ -0,0 +1,18 @@
+{
+    "model": {
+        "name": "Wan2.1 Lynx 14B",
+        "modules": [
+            [
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_lynx_full_module_14B_bf16.safetensors",
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_lynx_full_module_14B_quanto_bf16_int8.safetensors",
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_lynx_full_module_14B_quanto_fp16_int8.safetensors"
+            ]
+        ],
+        "architecture": "lynx",
+        "description": "The Lynx ControlNet offers State of the Art Identity Preservation. You need to provide a Reference Image which is a close up of a person's face to transfer this person into the Video.",
+        "URLs": "t2v",
+        "preload_URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_lynx_full_arc_resampler.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/moviigen.json b/defaults/moviigen.json
new file mode 100644
index 0000000000000000000000000000000000000000..96a04f8842e4183c6860bf7937eec4c9adf490af
--- /dev/null
+++ b/defaults/moviigen.json
@@ -0,0 +1,16 @@
+{
+    "model":
+    {
+        "name": "MoviiGen 1080p 14B",
+        "architecture" : "t2v",
+        "description": "MoviiGen 1.1 is a cutting-edge video generation model that excels in cinematic aesthetics and visual quality. Use it to generate videos in 720p or 1080p in the 21:9 ratio.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_moviigen1.1_14B_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_moviigen1.1_14B_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_moviigen1.1_14B_quanto_mfp16_int8.safetensors"
+        ],
+        "auto_quantize": true
+    },
+    "resolution": "1280x720",
+    "video_length": 81
+}
\ No newline at end of file
diff --git a/defaults/multitalk.json b/defaults/multitalk.json
new file mode 100644
index 0000000000000000000000000000000000000000..41699b58458233444abd25a94be25bc112c4489d
--- /dev/null
+++ b/defaults/multitalk.json
@@ -0,0 +1,15 @@
+{
+    "model":
+    {
+        "name": "Multitalk 480p 14B",
+        "architecture" : "multitalk",
+        "modules": [
+            ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_multitalk_14B_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_multitalk_14B_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_multitalk_14B_quanto_mfp16_int8.safetensors"]
+        ],
+        "description": "The Multitalk model corresponds to the original Wan image 2 video model combined with the Multitalk module. It lets up to two people have a conversation.",
+        "URLs": "i2v",
+        "teacache_coefficients" : [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01]
+    }
+}
\ No newline at end of file
diff --git a/defaults/multitalk_720p.json b/defaults/multitalk_720p.json
new file mode 100644
index 0000000000000000000000000000000000000000..f18bebc01907c668c30b7daa58d69829f1eaf76f
--- /dev/null
+++ b/defaults/multitalk_720p.json
@@ -0,0 +1,13 @@
+{
+    "model":
+    {
+        "name": "Multitalk 720p 14B",
+        "architecture" : "multitalk",
+        "modules": ["multitalk"],
+        "description": "The Multitalk model corresponds to the original Wan image 2 video 720p model combined with the Multitalk module. It lets up to two people have a conversation.",
+        "URLs": "i2v_720p",
+        "teacache_coefficients" : [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683],
+        "auto_quantize": true
+    },
+    "resolution": "1280x720"
+}
diff --git a/defaults/phantom_1.3B.json b/defaults/phantom_1.3B.json
new file mode 100644
index 0000000000000000000000000000000000000000..5be31daf4aafcbf5ce7333447a3a626d37eeb6f4
--- /dev/null
+++ b/defaults/phantom_1.3B.json
@@ -0,0 +1,11 @@
+{
+    "model":
+    {
+        "name": "Phantom 1.3B",
+        "architecture" : "phantom_1.3B",
+        "description": "The Phantom model is specialized in transferring people or objects of your choice into a generated Video. It produces very nice results when used at 720p.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2_1_phantom_1.3B_mbf16.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/phantom_14B.json b/defaults/phantom_14B.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6ec6147af60469654b0692ff2d1f0bb4d724563
--- /dev/null
+++ b/defaults/phantom_14B.json
@@ -0,0 +1,13 @@
+{
+    "model":
+    {
+        "name": "Phantom 14B",
+        "architecture" : "phantom_14B",
+        "description": "The Phantom model is specialized in transferring people or objects of your choice into a generated Video. It produces very nice results when used at 720p.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_phantom_14B_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_phantom_14B_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_phantom_14B_quanto_mfp16_int8.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/qwen_image_20B.json b/defaults/qwen_image_20B.json
new file mode 100644
index 0000000000000000000000000000000000000000..27bee20289fd1c1847bfb7c2570bdde2f7313b89
--- /dev/null
+++ b/defaults/qwen_image_20B.json
@@ -0,0 +1,21 @@
+{
+    "model": {
+        "name": "Qwen Image 20B",
+        "architecture": "qwen_image_20B",
+        "description": "Qwen Image is a generative model that will generate very high quality images. It is one of the few models capable of generating very long texts inside the image.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_20B_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_20B_quanto_bf16_int8.safetensors"
+        ],
+        "xresolutions": [ ["1328x1328 (1:1)", "1328x1328"],
+                        ["1664x928 (16:9)", "1664x928"],
+                        ["928x1664 (9:16)", "928x1664"],
+                        ["1472x1140 (4:3)", "1472x1140"],
+                        ["1140x1472 (3:4)", "1140x1472"]],
+        "attention": {"<89" : "sdpa"},
+        "image_outputs": true
+    },
+    "prompt": "draw a hat",
+    "resolution": "1280x720",
+    "batch_size": 1
+}
\ No newline at end of file
diff --git a/defaults/qwen_image_edit_20B.json b/defaults/qwen_image_edit_20B.json
new file mode 100644
index 0000000000000000000000000000000000000000..04fc573fbcaf50fb26936855975af2cde768c9cc
--- /dev/null
+++ b/defaults/qwen_image_edit_20B.json
@@ -0,0 +1,18 @@
+{
+    "model": {
+        "name": "Qwen Image Edit 20B",
+        "architecture": "qwen_image_edit_20B",
+        "description": "Qwen Image Edit is a generative model that can generate very high quality images with long texts in them. Best results will be at 720p. Use it to edit a Subject or combine multiple Subjects.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_20B_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_20B_quanto_bf16_int8.safetensors"
+        ],
+        "preload_URLs": ["https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_inpainting.safetensors"],
+        "attention": {
+            "<89": "sdpa"
+        }
+    },
+    "prompt": "add a hat",
+    "resolution": "1280x720",
+    "batch_size": 1
+}
\ No newline at end of file
diff --git a/defaults/qwen_image_edit_plus_20B.json b/defaults/qwen_image_edit_plus_20B.json
new file mode 100644
index 0000000000000000000000000000000000000000..e10deb24b871b58a79fdabb2c2dc797008500f6b
--- /dev/null
+++ b/defaults/qwen_image_edit_plus_20B.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Qwen Image Edit Plus 20B",
+        "architecture": "qwen_image_edit_plus_20B",
+        "description": "Qwen Image Edit Plus is a generative model that can generate very high quality images with long texts in them. Best results will be at 720p. This model is optimized to combine multiple Subjects & Objects.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_plus_20B_quanto_bf16_int8.safetensors"
+        ],
+        "preload_URLs": "qwen_image_edit_20B",
+        "attention": {
+            "<89": "sdpa"
+        }
+    },
+    "prompt": "add a hat",
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
\ No newline at end of file
diff --git a/defaults/recam_1.3B.json b/defaults/recam_1.3B.json
new file mode 100644
index 0000000000000000000000000000000000000000..e65d1b251b0ca71eb8c3239112d3b8dca1b35967
--- /dev/null
+++ b/defaults/recam_1.3B.json
@@ -0,0 +1,11 @@
+{
+    "model":
+    {
+        "name": "ReCamMaster 1.3B",
+        "architecture" : "recam_1.3B",
+        "description": "The Recam Master in theory should allow you to replay a video by applying a different camera movement. The model supports only videos that are at least 81 frames long (any frames beyond that will be ignored).",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_recammaster_1.3B_bf16.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/sky_df_1.3B.json b/defaults/sky_df_1.3B.json
new file mode 100644
index 0000000000000000000000000000000000000000..61e118d213633dcdcac1c598b80c492dde09b53f
--- /dev/null
+++ b/defaults/sky_df_1.3B.json
@@ -0,0 +1,11 @@
+{
+    "model":
+    {
+        "name": "SkyReels2 Diffusion Forcing 1.3B",
+        "architecture" : "sky_df_1.3B",
+        "description": "The SkyReels 2 Diffusion Forcing model has been designed to generate very long videos that exceed the usual 5s limit. You can also use this model to extend any existing video.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/sky_reels2_diffusion_forcing_1.3B_mbf16.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/sky_df_14B.json b/defaults/sky_df_14B.json
new file mode 100644
index 0000000000000000000000000000000000000000..e9d7bd52e0fb8f5ec45dba723d2faa3bfbc66c28
--- /dev/null
+++ b/defaults/sky_df_14B.json
@@ -0,0 +1,13 @@
+{
+    "model":
+    {
+        "name": "SkyReels2 Diffusion Forcing 540p 14B",
+        "architecture" : "sky_df_14B",
+        "description": "The SkyReels 2 Diffusion Forcing model has been designed to generate very long videos that exceed the usual 5s limit. You can also use this model to extend any existing video.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/sky_reels2_diffusion_forcing_14B_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/sky_reels2_diffusion_forcing_14B_quanto_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/sky_reels2_diffusion_forcing_14B_quanto_fp16_int8.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/sky_df_720p_14B.json b/defaults/sky_df_720p_14B.json
new file mode 100644
index 0000000000000000000000000000000000000000..6bae6661689fb893ed8402efed376690c70625b7
--- /dev/null
+++ b/defaults/sky_df_720p_14B.json
@@ -0,0 +1,14 @@
+{
+    "model":
+    {
+        "name": "SkyReels2 Diffusion Forcing 720p 14B",
+        "architecture" : "sky_df_14B",
+        "description": "The SkyReels 2 Diffusion Forcing model has been designed to generate very long videos that exceed the usual 5s limit. You can also use this model to extend any existing video.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/sky_reels2_diffusion_forcing_720p_14B_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/sky_reels2_diffusion_forcing_720p_14B_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/sky_reels2_diffusion_forcing_720p_14B_quanto_mfp16_int8.safetensors"
+        ]
+    },
+    "resolution": "1280x720"
+}
\ No newline at end of file
diff --git a/defaults/standin.json b/defaults/standin.json
new file mode 100644
index 0000000000000000000000000000000000000000..09298e97afa5383ce45ebad8cfe084d0740c3956
--- /dev/null
+++ b/defaults/standin.json
@@ -0,0 +1,10 @@
+{
+    "model":
+    {
+        "name": "Wan2.1 Standin 14B",
+        "modules": [ ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Stand-In_wan2.1_T2V_14B_ver1.0_bf16.safetensors"]],
+        "architecture" : "standin",
+        "description": "The original Wan Text 2 Video model combined with the StandIn module to improve Identity Preservation. You need to provide a Reference Image with a white background which is a close up of a person's face to transfer this person into the Video.",
+        "URLs": "t2v"
+    }
+}
\ No newline at end of file
diff --git a/defaults/t2v.json b/defaults/t2v.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef7f2409ee9384462ff0f997a22bd08dab3958a3
--- /dev/null
+++ b/defaults/t2v.json
@@ -0,0 +1,13 @@
+{
+    "model":
+    {
+        "name": "Wan2.1 Text2video 14B",
+        "architecture" : "t2v",
+        "description": "The original Wan Text 2 Video model. Most other models have been built on top of it.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_text2video_14B_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_text2video_14B_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_text2video_14B_quanto_mfp16_int8.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/t2v_1.3B.json b/defaults/t2v_1.3B.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca88bd92bb814448ee79894fbda32f9cb20caab6
--- /dev/null
+++ b/defaults/t2v_1.3B.json
@@ -0,0 +1,11 @@
+{
+    "model":
+    {
+        "name": "Wan2.1 Text2video 1.3B",
+        "architecture" : "t2v_1.3B",
+        "description": "The light version of the original Wan Text 2 Video model. Most other models have been built on top of it.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_text2video_1.3B_mbf16.safetensors"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/defaults/t2v_2_2.json b/defaults/t2v_2_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..806a1bfe6a15133ee7aa2e8fff0d4a78af0cacea
--- /dev/null
+++ b/defaults/t2v_2_2.json
@@ -0,0 +1,25 @@
+{
+    "model":
+    {
+        "name": "Wan2.2 Text2video 14B",
+        "architecture" : "t2v_2_2",
+        "description": "Wan 2.2 Text 2 Video model.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_quanto_mfp16_int8.safetensors"
+        ],
+        "URLs2": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_quanto_mfp16_int8.safetensors"
+        ],
+        "group": "wan2_2"
+    },
+    "guidance_phases": 2,
+    "switch_threshold" : 875,
+    "guidance_scale" : 4,
+    "guidance2_scale" : 3,
+    "flow_shift" : 12
+
+}
\ No newline at end of file
diff --git a/defaults/t2v_fusionix.json b/defaults/t2v_fusionix.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ecdf0c1227dec68bf4fdd8dae2e3180ae50e43f
--- /dev/null
+++ b/defaults/t2v_fusionix.json
@@ -0,0 +1,38 @@
+{
+    "model":
+    {
+        "name": "Wan2.1 Text2video FusioniX 14B",
+        "architecture" : "t2v",
+        "description": "A powerful merged text-to-video model based on the original WAN 2.1 T2V model, enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_fp16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_fp16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_bf16_int8.safetensors"
+        ],
+        "auto_quantize": true
+    },
+    "negative_prompt": "",
+    "prompt": "",
+    "resolution": "832x480",
+    "video_length": 81,
+    "seed": -1,
+    "num_inference_steps": 8,
+    "guidance_scale": 1,
+    "flow_shift": 5,
+    "embedded_guidance_scale": 6,
+    "repeat_generation": 1,
+    "multi_images_gen_type": 0,
+    "tea_cache_setting": 0,
+    "tea_cache_start_step_perc": 0,
+    "loras_multipliers": "",
+    "temporal_upsampling": "",
+    "spatial_upsampling": "",
+    "RIFLEx_setting": 0,
+    "slg_switch": 0,
+    "slg_start_perc": 10,
+    "slg_end_perc": 90,
+    "cfg_star_switch": 0,
+    "cfg_zero_step": -1,
+    "prompt_enhancer": "",
+    "activated_loras": []
+}
\ No newline at end of file
diff --git a/defaults/t2v_lighting_palingenesis_2_2.json b/defaults/t2v_lighting_palingenesis_2_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..f14f03aea4e13c8e9faf8973263e8cce512779fd
--- /dev/null
+++ b/defaults/t2v_lighting_palingenesis_2_2.json
@@ -0,0 +1,25 @@
+{
+    "model":
+    {
+        "name": "Wan2.2 Text2video Lightning Palingenesis 14B",
+        "architecture" : "t2v_2_2",
+        "description": "Wan 2.2 Text 2 Video Lightning Dyno model. Palingenesis, a finetune praised for its high quality, is used for the Low Noise model, whereas the High Noise model uses the Lightning finetune which natively offers Lora accelerators.",
+        "URLs": [
+            "https://huggingface.co/lightx2v/Wan2.2-Lightning/resolve/main/Wan2.2-T2V-A14B-4steps-250928-dyno/Wan2.2-T2V-A14B-4steps-250928-dyno-high-lightx2v.safetensors"
+        ],
+        "URLs2": ["https://huggingface.co/eddy1111111/WAN22.XX_Palingenesis/resolve/main/WAN22.XX_Palingenesis_low_t2v.safetensors"],
+        "loras_multipliers": ["0;1"],
+        "activated_loras": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan22_A14B_T2V_LOW_Lightning_4steps_lora_250928_rank64_fp16.safetensors"
+        ],
+        "profiles_dir": [ "" ],
+        "group": "wan2_2"
+    },
+    "guidance_phases": 2,
+    "switch_threshold" : 875,
+    "guidance_scale" : 1,
+    "guidance2_scale" : 1,
+    "num_inference_steps": 4,
+    "flow_shift" : 3
+
+}
\ No newline at end of file
diff --git a/defaults/t2v_palingenesis_2_2.json b/defaults/t2v_palingenesis_2_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..e742cffab379b679582e357832252b91c1719a2b
--- /dev/null
+++ b/defaults/t2v_palingenesis_2_2.json
@@ -0,0 +1,15 @@
+{
+    "model":
+    {
+        "name": "Wan2.2 Text2video Palingenesis 14B",
+        "architecture" : "t2v_2_2",
+        "description": "Wan 2.2 Text 2 Video Palingenesis, a finetune praised for its high quality.",
+        "URLs": ["https://huggingface.co/eddy1111111/WAN22.XX_Palingenesis/resolve/main/WAN22.XX_Palingenesis_high_t2v.safetensors"],
+        "URLs2": ["https://huggingface.co/eddy1111111/WAN22.XX_Palingenesis/resolve/main/WAN22.XX_Palingenesis_low_t2v.safetensors"],
+        "group": "wan2_2"
+    },
+    "guidance_phases": 2,
+    "switch_threshold" : 875,
+    "flow_shift" : 3
+
+}
\ No newline at end of file
diff --git a/defaults/t2v_sf.json b/defaults/t2v_sf.json
new file mode 100644
index 0000000000000000000000000000000000000000..2131413dda979d4ce47e2e12246c9f5332440281
--- /dev/null
+++ b/defaults/t2v_sf.json
@@ -0,0 +1,38 @@
+{
+    "model": {
+        "name": "Wan2.1 Text2video Self-Forcing 14B",
+        "architecture": "t2v",
+        "description": "This model is an advanced text-to-video generation model. The Self-Forcing approach allows the model to generate videos with significantly fewer inference steps (4 or 8 steps) and without classifier-free guidance, substantially reducing video generation time while maintaining high quality outputs.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_quanto_bf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_quanto_fp16_int8.safetensors"
+        ],
+        "author": "https://huggingface.co/lightx2v/Wan2.1-T2V-14B-StepDistill-CfgDistill",
+        "auto_quantize": true
+    },
+    "negative_prompt": "",
+    "prompt": "",
+    "resolution": "832x480",
+    "video_length": 81,
+    "seed": -1,
+    "num_inference_steps": 4,
+    "guidance_scale": 1,
+    "flow_shift": 3,
+    "embedded_guidance_scale": 6,
+    "repeat_generation": 1,
+    "multi_images_gen_type": 0,
+    "tea_cache_setting": 0,
+    "tea_cache_start_step_perc": 0,
+    "loras_multipliers": "",
+    "temporal_upsampling": "",
+    "spatial_upsampling": "",
+    "RIFLEx_setting": 0,
+    "slg_switch": 0,
+    "slg_start_perc": 10,
+    "slg_end_perc": 90,
+    "cfg_star_switch": 0,
+    "cfg_zero_step": -1,
+    "prompt_enhancer": "",
+    "activated_loras": []
+}
\ No newline at end of file
diff --git a/defaults/ti2v_2_2.json b/defaults/ti2v_2_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..91e90634766b8d1468f4e851f79b4bd49344eba0
--- /dev/null
+++ b/defaults/ti2v_2_2.json
@@ -0,0 +1,18 @@
+{
+    "model": {
+        "name": "Wan2.2 TextImage2video 5B",
+        "architecture": "ti2v_2_2",
+        "description": "Wan 2.2 Text 2 Video model 5B.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_5B_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_5B_quanto_mbf16_int8.safetensors"
+        ],
+        "settings_dir": [ "wan2_2_5B" ],
+        "group": "wan2_2"
+    },
+    "video_length": 121,
+    "guidance_scale": 5,
+    "flow_shift": 5,
+    "num_inference_steps": 50,
+    "resolution": "1280x720"
+}
\ No newline at end of file
diff --git a/defaults/ti2v_2_2_fastwan.json b/defaults/ti2v_2_2_fastwan.json
new file mode 100644
index 0000000000000000000000000000000000000000..eeb2cffbf8f028d787eed7e6a7ed3cc6fb515055
--- /dev/null
+++ b/defaults/ti2v_2_2_fastwan.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Wan2.2 TextImage2video FastWan 5B",
+        "architecture": "ti2v_2_2",
+        "description": "FastWan2.2-TI2V-5B-Full-Diffusers is built upon Wan-AI/Wan2.2-TI2V-5B-Diffusers. It supports efficient 3-step inference and produces high-quality videos at 121×704×1280 resolution.",
+        "URLs": "ti2v_2_2",
+        "settings_dir": [ "" ],
+        "loras": ["https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2_2_5B_FastWanFullAttn_lora_rank_128_bf16.safetensors"],
+        "group": "wan2_2"
+    },
+    "prompt" : "Put the person into a clown outfit.",
+    "video_length": 121,
+    "guidance_scale": 1,
+    "flow_shift": 3,
+    "num_inference_steps": 5,
+    "resolution": "1280x720"
+}
\ No newline at end of file
diff --git a/defaults/vace_1.3B.json b/defaults/vace_1.3B.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b18f45843b0a50197e647a017d12243077193c3
--- /dev/null
+++ b/defaults/vace_1.3B.json
@@ -0,0 +1,12 @@
+{
+    "model":
+    {
+        "name": "Vace 1.3B",
+        "architecture" : "vace_1.3B",
+        "modules": [
+            ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_1_3B_module.safetensors"]
+        ],
+        "description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based on additional custom data: pose or depth videos, images or objects you want to see in the video.",
+        "URLs": "t2v_1.3B"
+    }
+}
\ No newline at end of file
diff --git a/defaults/vace_14B.json b/defaults/vace_14B.json
new file mode 100644
index 0000000000000000000000000000000000000000..0304db71b018a1120cc0ed2c015d23b24c953493
--- /dev/null
+++ b/defaults/vace_14B.json
@@ -0,0 +1,13 @@
+{
+    "model": {
+        "name": "Vace 14B",
+        "architecture": "vace_14B",
+        "modules": [
+            ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_14B_module_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_14B_module_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_Vace_14B_module_quanto_mfp16_int8.safetensors"]
+        ],
+        "description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based on additional custom data: pose or depth videos, images or objects you want to see in the video.",
+        "URLs": "t2v"
+    }
+}
\ No newline at end of file
diff --git a/defaults/vace_14B_2_2.json b/defaults/vace_14B_2_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..e93f96aaa19920d4449e009b55a7e5710b938480
--- /dev/null
+++ b/defaults/vace_14B_2_2.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Wan2.2 Vace 14B",
+        "architecture": "vace_14B_2_2",
+        "modules": [
+            "vace_14B"
+        ],
+        "description": "There is so far only PARTIAL support, as the Vace 2.1 module is currently used.",
+        "URLs": "t2v_2_2",
+        "URLs2": "t2v_2_2"
+    },
+    "guidance_phases": 2,
+    "guidance_scale": 1,
+    "guidance2_scale": 1,
+    "flow_shift": 2,
+    "switch_threshold" : 875
+}
\ No newline at end of file
diff --git a/defaults/vace_14B_cocktail.json b/defaults/vace_14B_cocktail.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ab7a625e8623217fd35f95120aa4165eae5fcc9
--- /dev/null
+++ b/defaults/vace_14B_cocktail.json
@@ -0,0 +1,22 @@
+{
+    "model": {
+        "name": "Vace Cocktail 14B",
+        "architecture": "vace_14B",
+        "modules": [
+            "vace_14B"
+        ],
+        "description": "This model has been created on the fly using the Wan text 2 video model and the Loras of FusioniX. The weight of the Detail Enhancer Lora has been reduced to improve identity preservation. Copy the model definition into the finetunes folder to change the Cocktail composition.",
+        "URLs": "t2v",
+        "settings_dir": [ "" ],
+        "loras": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_CausVid_14B_T2V_lora_rank32_v2.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/DetailEnhancerV1.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_AccVid_T2V_14B_lora_rank32_fp16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_T2V_14B_MoviiGen_lora_rank32_fp16.safetensors"
+        ],
+        "loras_multipliers": [1, 0.5, 0.5, 0.5]
+    },
+    "num_inference_steps": 10,
+    "guidance_scale": 1,
+    "flow_shift": 2
+}
\ No newline at end of file
diff --git a/defaults/vace_14B_cocktail_2_2.json b/defaults/vace_14B_cocktail_2_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..a704241189ec3fcd80246a8f153e67dd867b4379
--- /dev/null
+++ b/defaults/vace_14B_cocktail_2_2.json
@@ -0,0 +1,27 @@
+{
+    "model": {
+        "name": "Wan2.2 Vace Experimental Cocktail 14B",
+        "architecture": "vace_14B_2_2",
+        "modules": [
+            "vace_14B"
+        ],
+        "description": "This model has been created on the fly using the Wan text 2.2 video model and the Loras of FusioniX. The weight of the Detail Enhancer Lora has been reduced to improve identity preservation. There is so far only PARTIAL support, as the Vace 2.1 module is currently used.",
+        "URLs": "t2v_2_2",
+        "URLs2": "t2v_2_2",
+        "loras": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_CausVid_14B_T2V_lora_rank32_v2.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/DetailEnhancerV1.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_AccVid_T2V_14B_lora_rank32_fp16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_T2V_14B_MoviiGen_lora_rank32_fp16.safetensors"
+        ],
+        "profiles_dir": [ "" ],
+        "loras_multipliers": [1, 0.2, 0.5, 0.5],
+        "group": "wan2_2"
+    },
+    "guidance_phases": 2,
+    "num_inference_steps": 10,
+    "guidance_scale": 1,
+    "guidance2_scale": 1,
+    "flow_shift": 2,
+    "switch_threshold" : 875
+}
\ No newline at end of file
diff --git a/defaults/vace_14B_fusionix.json b/defaults/vace_14B_fusionix.json
new file mode 100644
index 0000000000000000000000000000000000000000..95639f4044a7d542e6ee8c5d53bebb17dccf1d71
--- /dev/null
+++ b/defaults/vace_14B_fusionix.json
@@ -0,0 +1,36 @@
+{
+    "model": {
+        "name": "Vace FusioniX 14B",
+        "architecture": "vace_14B",
+        "modules": [
+            "vace_14B"
+        ],
+        "description": "Vace control model enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail.",
+        "profiles_dir": [ "" ],
+        "URLs": "t2v_fusionix"
+    },
+    "negative_prompt": "",
+    "prompt": "",
+    "resolution": "832x480",
+    "video_length": 81,
+    "seed": -1,
+    "num_inference_steps": 10,
+    "guidance_scale": 1,
+    "flow_shift": 2,
+    "embedded_guidance_scale": 6,
+    "repeat_generation": 1,
+    "multi_images_gen_type": 0,
+    "tea_cache_setting": 0,
+    "tea_cache_start_step_perc": 0,
+    "loras_multipliers": "",
+    "temporal_upsampling": "",
+    "spatial_upsampling": "",
+    "RIFLEx_setting": 0,
+    "slg_switch": 0,
+    "slg_start_perc": 10,
+    "slg_end_perc": 90,
+    "cfg_star_switch": 0,
+    "cfg_zero_step": -1,
+    "prompt_enhancer": "",
+    "activated_loras": []
+}
\ No newline at end of file
diff --git 
a/defaults/vace_14B_lightning_3p_2_2.json b/defaults/vace_14B_lightning_3p_2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..00bca667affb77a719fec36988e47c3df3c1b0ef --- /dev/null +++ b/defaults/vace_14B_lightning_3p_2_2.json @@ -0,0 +1,29 @@ +{ + "model": { + "name": "Wan2.2 Vace Lightning 3 Phases 14B", + "architecture": "vace_14B_2_2", + "modules": [ + "vace_14B" + ], + "description": "This finetune uses the Lightning 4 steps Loras Accelerators for Wan 2.2 but extends them to 8 steps in order to insert a CFG phase before the 2 accelerated phases, which have no Guidance. The ultimate goal is to reduce the slow motion effect of these Loras Accelerators.", + "URLs": "t2v_2_2", + "URLs2": "t2v_2_2", + "loras": [ + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2.2-Lightning_T2V-v1.1-A14B-4steps-lora_HIGH_fp16.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2.2-Lightning_T2V-v1.1-A14B-4steps-lora_LOW_fp16.safetensors" + ], + "loras_multipliers": ["0;1;0", "0;0;1"], + "lock_guidance_phases": true, + "group": "wan2_2" + }, + "num_inference_steps": 8, + "guidance_phases": 3, + "guidance_scale": 3.5, + "guidance2_scale": 1, + "guidance3_scale": 1, + "switch_threshold": 965, + "switch_threshold2": 800, + "model_switch_phase": 2, + "flow_shift": 3, + "sample_solver": "euler" +} \ No newline at end of file diff --git a/defaults/vace_14B_sf.json b/defaults/vace_14B_sf.json new file mode 100644 index 0000000000000000000000000000000000000000..7dc495d54127bc0bec354f5ce8d6182623562950 --- /dev/null +++ b/defaults/vace_14B_sf.json @@ -0,0 +1,41 @@ +{ + "model": { + "name": "Vace Self-Forcing 14B", + "architecture": "vace_14B", + "modules": [ + "vace_14B" + ], + "description": "This model is a combination of Vace and an advanced text-to-video generation model. 
This approach allows the model to generate videos with significantly fewer inference steps (4 or 8 steps) and without classifier-free guidance, substantially reducing video generation time while maintaining high quality outputs.", + "URLs": [ + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_bf16.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_quanto_bf16_int8.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_quanto_fp16_int8.safetensors" + ], + "author": "https://huggingface.co/lightx2v/Wan2.1-T2V-14B-StepDistill-CfgDistill", + "auto_quantize": true + }, + "negative_prompt": "", + "prompt": "", + "resolution": "832x480", + "video_length": 81, + "seed": -1, + "num_inference_steps": 4, + "guidance_scale": 1, + "flow_shift": 3, + "embedded_guidance_scale": 6, + "repeat_generation": 1, + "multi_images_gen_type": 0, + "tea_cache_setting": 0, + "tea_cache_start_step_perc": 0, + "loras_multipliers": "", + "temporal_upsampling": "", + "spatial_upsampling": "", + "RIFLEx_setting": 0, + "slg_switch": 0, + "slg_start_perc": 10, + "slg_end_perc": 90, + "cfg_star_switch": 0, + "cfg_zero_step": -1, + "prompt_enhancer": "", + "activated_loras": [] +} \ No newline at end of file diff --git a/defaults/vace_fun_14B_2_2.json b/defaults/vace_fun_14B_2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..febf5ea816f160c6861d24be611596f0599e2e18 --- /dev/null +++ b/defaults/vace_fun_14B_2_2.json @@ -0,0 +1,24 @@ +{ + "model": { + "name": "Wan2.2 Vace Fun 14B", + "architecture": "vace_14B_2_2", + "description": "This is the Fun Vace 2.2 version, that is not the official Vace 2.2", + "URLs": [ + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_HIGH_mbf16.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_HIGH_quanto_mbf16_int8.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_HIGH_quanto_mfp16_int8.safetensors" + ], + "URLs2": [ + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_LOW_mbf16.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_LOW_quanto_mbf16_int8.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/Wan2_2_Fun_VACE_A14B_LOW_quanto_mfp16_int8.safetensors" + ], + "group": "wan2_2" + }, + "guidance_phases": 2, + "num_inference_steps": 30, + "guidance_scale": 1, + "guidance2_scale": 1, + "flow_shift": 2, + "switch_threshold": 875 +} \ No newline at end of file diff --git a/defaults/vace_fun_14B_cocktail_2_2.json b/defaults/vace_fun_14B_cocktail_2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b50ddda350cf949f96328862785dc014a2c4706b --- /dev/null +++ b/defaults/vace_fun_14B_cocktail_2_2.json @@ -0,0 +1,29 @@ +{ + "model": { + "name": "Wan2.2 Vace Fun Cocktail 14B", + "architecture": "vace_14B_2_2", + "description": "This model has been created on the fly using the Wan text 2.2 video model and the Loras of FusioniX. The weight of the Detail Enhancer Lora has been reduced to improve identity preservation. 
This is the Fun Vace 2.2, that is not the official Vace 2.2", + "URLs": "vace_fun_14B_2_2", + "URLs2": "vace_fun_14B_2_2", + "loras": [ + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_CausVid_14B_T2V_lora_rank32_v2.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/DetailEnhancerV1.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_AccVid_T2V_14B_lora_rank32_fp16.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/loras_accelerators/Wan21_T2V_14B_MoviiGen_lora_rank32_fp16.safetensors" + ], + "loras_multipliers": [ + 1, + 0.2, + 0.5, + 0.5 + ], + "profiles_dir": [""], + "group": "wan2_2" + }, + "guidance_phases": 2, + "num_inference_steps": 10, + "guidance_scale": 1, + "guidance2_scale": 1, + "flow_shift": 2, + "switch_threshold": 875 +} \ No newline at end of file diff --git a/defaults/vace_lynx_14B.json b/defaults/vace_lynx_14B.json new file mode 100644 index 0000000000000000000000000000000000000000..3584aa99fb0b0d856e0ffe056bd069d60da24bc6 --- /dev/null +++ b/defaults/vace_lynx_14B.json @@ -0,0 +1,10 @@ +{ + "model": { + "name": "Vace Lynx 14B", + "architecture": "vace_lynx_14B", + "modules": [ "vace_14B", "lynx"], + "description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based of additional custom data : pose or depth video, images or objects you want to see in the video. The Lynx version is specialized in identity transfer, so the last Image Ref should always contain a close up of the Face of a Person to transfer.", + "URLs": "t2v", + "preload_URLs": "lynx" + } +} \ No newline at end of file diff --git a/defaults/vace_multitalk_14B.json b/defaults/vace_multitalk_14B.json new file mode 100644 index 0000000000000000000000000000000000000000..c35a04809139026f503b0ddecf80ea5a00433a58 --- /dev/null +++ b/defaults/vace_multitalk_14B.json @@ -0,0 +1,41 @@ +{ + "model": { + "name": "Vace Multitalk FusioniX 14B", + "architecture": "vace_multitalk_14B", + "modules": [ + "vace_14B", + "multitalk" + ], + "description": "Vace control model enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail. 
And if that's not sufficient, Vace is combined with Multitalk.", + "URLs": [ + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_fp16.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_bf16_int8.safetensors", + "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_fp16_int8.safetensors" + ], + "auto_quantize": true + }, + "negative_prompt": "", + "prompt": "", + "resolution": "832x480", + "video_length": 81, + "seed": -1, + "num_inference_steps": 10, + "guidance_scale": 1, + "flow_shift": 5, + "embedded_guidance_scale": 6, + "repeat_generation": 1, + "multi_images_gen_type": 0, + "tea_cache_setting": 0, + "tea_cache_start_step_perc": 0, + "loras_multipliers": "", + "temporal_upsampling": "", + "spatial_upsampling": "", + "RIFLEx_setting": 0, + "slg_switch": 0, + "slg_start_perc": 10, + "slg_end_perc": 90, + "cfg_star_switch": 0, + "cfg_zero_step": -1, + "prompt_enhancer": "", + "activated_loras": [] +} \ No newline at end of file diff --git a/defaults/vace_standin_14B.json b/defaults/vace_standin_14B.json new file mode 100644 index 0000000000000000000000000000000000000000..b6f6af099763c32932170a28103d35ca636eaaa3 --- /dev/null +++ b/defaults/vace_standin_14B.json @@ -0,0 +1,9 @@ +{ + "model": { + "name": "Vace Standin 14B", + "architecture": "vace_standin_14B", + "modules": [ "vace_14B", "standin"], + "description": "The Vace ControlNet model is a powerful model that allows you to control the content of the generated video based on additional custom data : pose or depth video, images or objects you want to see in the video. The Standin version is specialized in identity transfer, so the last Image Ref should always contain a close up of the Face of a Person to transfer.", + "URLs": "t2v" + } +} \ No newline at end of file diff --git a/docs/AMD-INSTALLATION.md b/docs/AMD-INSTALLATION.md new file mode 100644 index 0000000000000000000000000000000000000000..4f05589eb8718c4f08d97c0d7bd62b56f889b721 --- /dev/null +++ b/docs/AMD-INSTALLATION.md @@ -0,0 +1,146 @@ +# Installation Guide + +This guide covers installation for specific RDNA3 and RDNA3.5 AMD CPUs (APUs) and GPUs +running under Windows. + +tl;dr: Radeon RX 7900 GOOD, RX 9700 BAD, RX 6800 BAD. (I know, life isn't fair). + +Currently supported (but not necessarily tested): + +**gfx110x**: + +* Radeon RX 7600 +* Radeon RX 7700 XT +* Radeon RX 7800 XT +* Radeon RX 7900 GRE +* Radeon RX 7900 XT +* Radeon RX 7900 XTX + +**gfx1151**: + +* Ryzen 7000 series APUs (Phoenix) +* Ryzen Z1 (e.g., handheld devices like the ROG Ally) + +**gfx1201**: + +* Ryzen 8000 series APUs (Strix Point) +* A [frame.work](https://frame.work/au/en/desktop) desktop/laptop + + +## Requirements + +- Python 3.11 (3.12 might work, 3.10 definitely will not!) + +## Installation Environment + +This installation uses PyTorch 2.7.0 because that's what is currently available in +terms of pre-compiled wheels. + +### Installing Python + +Download Python 3.11 from [python.org/downloads/windows](https://www.python.org/downloads/windows/). Hit Ctrl+F and search for "3.11". Don't use this direct link: [https://www.python.org/ftp/python/3.11.9/python-3.11.9-amd64.exe](https://www.python.org/ftp/python/3.11.9/python-3.11.9-amd64.exe) -- that was an IQ test. + +After installing, make sure `python --version` works in your terminal and returns 3.11.x. + +If not, you probably need to fix your PATH. 
Go to: + +* Windows + Pause/Break +* Advanced System Settings +* Environment Variables +* Edit your `Path` under User Variables + +Example correct entries: + +```cmd +C:\Users\YOURNAME\AppData\Local\Programs\Python\Launcher\ +C:\Users\YOURNAME\AppData\Local\Programs\Python\Python311\Scripts\ +C:\Users\YOURNAME\AppData\Local\Programs\Python\Python311\ +``` + +If that doesn't work, scream into a bucket. + +### Installing Git + +Get Git from [git-scm.com/downloads/win](https://git-scm.com/downloads/win). Default install is fine. + + +## Install (Windows, using `venv`) + +### Step 1: Download and Set Up Environment + +```cmd +:: Navigate to your desired install directory +cd \your-path-to-wan2gp + +:: Clone the repository +git clone https://github.com/deepbeepmeep/Wan2GP.git +cd Wan2GP + +:: Create virtual environment using Python 3.11 +python -m venv wan2gp-env + +:: Activate the virtual environment +wan2gp-env\Scripts\activate +``` + +### Step 2: Install PyTorch + +The pre-compiled wheels you need are hosted at [scottt's rocm-TheRock releases](https://github.com/scottt/rocm-TheRock/releases). Find the heading that says: + +**Pytorch wheels for gfx110x, gfx1151, and gfx1201** + +Don't click this link: [https://github.com/scottt/rocm-TheRock/releases/tag/v6.5.0rc-pytorch-gfx110x](https://github.com/scottt/rocm-TheRock/releases/tag/v6.5.0rc-pytorch-gfx110x). It's just here to check if you're skimming. + +Copy the links of the closest binaries to the ones in the example below (adjust if you're not running Python 3.11), then hit enter. + +```cmd +pip install ^ + https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch-gfx110x/torch-2.7.0a0+rocm_git3f903c3-cp311-cp311-win_amd64.whl ^ + https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch-gfx110x/torchaudio-2.7.0a0+52638ef-cp311-cp311-win_amd64.whl ^ + https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch-gfx110x/torchvision-0.22.0+9eb57cd-cp311-cp311-win_amd64.whl +``` + +### Step 3: Install Dependencies + +```cmd +:: Install core dependencies +pip install -r requirements.txt +``` + +## Attention Modes + +WanGP supports several attention implementations, only one of which will work for you: + +- **SDPA** (default): Available by default with PyTorch. This uses the built-in aotriton acceleration library, so it is actually pretty fast. + +## Performance Profiles + +Choose a profile based on your hardware: + +- **Profile 3 (LowRAM_HighVRAM)**: Loads entire model in VRAM, requires 24GB VRAM for 8-bit quantized 14B model +- **Profile 4 (LowRAM_LowVRAM)**: Default, loads model parts as needed, slower but lower VRAM requirement + +## Running Wan2GP + +In future sessions, you will need to do this: + +```cmd +cd \your-path-to-wan2gp\Wan2GP +wan2gp-env\Scripts\activate.bat +python wgp.py +``` + +For now, you should just be able to type `python wgp.py` (because you're already in the virtual environment). + +## Troubleshooting + +- If you use a HIGH VRAM mode, don't be a fool. Make sure you use VAE Tiled Decoding. 
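+
+If nothing generates at all, it is worth checking that the ROCm build of PyTorch is the one that actually got installed and that it can see your GPU. Below is a minimal sanity check (a throwaway sketch; `check_rocm.py` is just a suggested name, not part of WanGP):
+
+```python
+# check_rocm.py -- throwaway sanity check, not part of WanGP.
+# Confirms that the ROCm build of PyTorch is installed and that it detects the GPU.
+import torch
+
+print("torch version :", torch.__version__)                    # should mention rocm, e.g. 2.7.0a0+rocm_git...
+print("HIP runtime   :", getattr(torch.version, "hip", None))  # None means a CPU-only or CUDA build slipped in
+print("GPU available :", torch.cuda.is_available())            # ROCm exposes the GPU through the torch.cuda API
+if torch.cuda.is_available():
+    print("Device name   :", torch.cuda.get_device_name(0))    # e.g. an RDNA3 card such as the RX 7900 XTX
+```
+
+If `GPU available` comes back `False`, the wheels probably do not match your gfx architecture, or a plain CPU build of torch has overwritten them.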
+ +### Memory Issues + +- Use lower resolution or shorter videos +- Enable quantization (default) +- Use Profile 4 for lower VRAM usage +- Consider using 1.3B models instead of 14B models + +For more troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..b0eeae34b72140a1622331ccb0e78fdf4ca1eb51 --- /dev/null +++ b/docs/CHANGELOG.md @@ -0,0 +1,417 @@ +# Changelog + +## 🔥 Latest News +### August 29 2025: WanGP v8.21 - Here Goes Your Weekend + +- **InfiniteTalk Video to Video**: this feature can be used for Video Dubbing. Keep in mind that it is a *Sparse Video to Video*, that is, internally only one image is used per Sliding Window. However thanks to the new *Smooth Transition* mode, each new clip is connected to the previous one and all the camera work is done by InfiniteTalk. If you don't get any transition, increase the number of frames of a Sliding Window (81 frames recommended) + +- **StandIn**: very light model specialized in Identity Transfer. I have provided two versions of StandIn: a basic one derived from the text 2 video model and another based on Vace. If used with Vace, the last reference frame given to Vace will also be used for StandIn + +- **Flux ESO**: a new Flux-derived *Image Editing tool*, but this one is specialized both in *Identity Transfer* and *Style Transfer*. Style has to be understood in its broadest sense: give a reference picture of a person and another one of Sushis and you will turn this person into Sushis + +### August 24 2025: WanGP v8.1 - the RAM Liberator + +- **Reserved RAM entirely freed when switching models**: you should get far fewer RAM-related out-of-memory errors. I have also added a button in *Configuration / Performance* that will release most of the RAM used by WanGP if you want to use another application without quitting WanGP +- **InfiniteTalk** support: improved version of Multitalk that supposedly supports very long video generations based on an audio track. Exists in two flavors (*Single Speaker* and *Multi Speakers*) but doesn't seem to be compatible with Vace. One key new feature compared to Multitalk is that you can have different visual shots associated with the same audio: each Reference frame you provide will be associated with a new Sliding Window. If only one Reference frame is provided, it will be used for all windows. When Continuing a video, you can either continue the current shot (no Reference Frame) or add new shots (one or more Reference Frames).\ +If you are not into audio, you can still use this model to generate infinitely long image2video, just select "no speaker". Last but not least, InfiniteTalk works with all the Loras accelerators. +- **Flux Chroma 1 HD** support: an uncensored Flux-based model that is lighter than Flux (8.9B versus 12B) and can fit entirely in VRAM with only 16 GB of VRAM. Unfortunately it is not distilled, so you will need CFG and at least 20 steps + +### August 21 2025: WanGP v8.01 - the killer of seven + +- **Qwen Image Edit** : Flux Kontext challenger (prompt driven image editing). Best results (including Identity preservation) will be obtained at 720p. Beyond that you may get image outpainting and / or lose identity preservation. Below 720p prompt adherence will be worse. Qwen Image Edit works with Qwen Lora Lightning 4 steps. I have also unlocked all the resolutions for Qwen models. Bonus Zone: support for multiple image compositions but identity preservation won't be as good. 
+- **On demand Prompt Enhancer** (needs to be enabled in Configuration Tab) that you can use to Enhance a Text Prompt before starting a Generation. You can refine the Enhanced Prompt or change the original Prompt. +- Choice of a **Non censored Prompt Enhancer**. Beware, this one is VRAM hungry and will require 12 GB of VRAM to work +- **Memory Profile customizable per model** : useful to set for instance Profile 3 (preload the model entirely in VRAM) with only Image Generation models, if you have 24 GB of VRAM. In that case Generation will be much faster because with Image generators (contrary to Video generators) a lot of time is otherwise wasted in offloading +- **Expert Guidance Mode**: change the Guidance during the generation up to 2 times. Very useful with Wan 2.2 Lightning to reduce the slow motion effect. The idea is to insert a CFG phase before the 2 accelerated phases that follow and have no Guidance. I have added the finetune *Wan2.2 Vace Lightning 3 Phases 14B* with a prebuilt configuration. Please note that it is an 8 steps process although the Lightning lora is 4 steps. This expert guidance mode is also available with Wan 2.1. + +*WanGP 8.01 update, improved Qwen Image Edit Identity Preservation* +### August 12 2025: WanGP v7.7777 - Lucky Day(s) + +This is your lucky day ! Thanks to new configuration options that let you store generated Videos and Images in lossless compressed formats, you will find that they look two times better without doing anything ! + +Just kidding, they will be only marginally better, but at least this opens the way to professional editing. + +Support: +- Video: x264, x264 lossless, x265 +- Images: jpeg, png, webp, webp lossless +Generation Settings are stored in each of the above regardless of the format (that was the hard part). + +Also you can now choose different output directories for images and videos. + +Unexpected luck: fixed Lightning 8 steps for Qwen and Lightning 4 steps for Wan 2.2; now you just need a 1x multiplier, no weird numbers. +*update 7.777 : oops, got a crash with FastWan ? Luck comes and goes, try a new update, maybe you will have a better chance this time* +*update 7.7777 : Sometimes good luck seems to last forever. For instance what if Qwen Lightning 4 steps could also work with WanGP ?* +- https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-4steps-V1.0-bf16.safetensors (Qwen Lightning 4 steps) +- https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-8steps-V1.1-bf16.safetensors (new improved version of Qwen Lightning 8 steps) + + +### August 10 2025: WanGP v7.76 - Faster than the VAE ... +We have a funny one here today: FastWan 2.2 5B, the Fastest Video Generator, only 20s to generate 121 frames at 720p. The snag is that the VAE is twice as slow... +Thanks to Kijai for extracting the Lora that is used to build the corresponding finetune. + +*WanGP 7.76: fixed the mess I made with i2v models (loras path was wrong for Wan2.2 and Clip was broken)* + +### August 9 2025: WanGP v7.74 - Qwen Rebirth part 2 +Added support for the Qwen Lightning lora for an 8 steps generation (https://huggingface.co/lightx2v/Qwen-Image-Lightning/blob/main/Qwen-Image-Lightning-8steps-V1.0.safetensors). The Lora is not normalized and you can use a multiplier around 0.1. + +Mag Cache support for all the Wan2.2 models. Don't forget to set guidance to 1 and 8 denoising steps, and your gen will be 7x faster ! 
+ +### August 8 2025: WanGP v7.73 - Qwen Rebirth +Ever wondered what impact not using Guidance has on a model that expects it ? Just look at Qween Image in WanGP 7.71 whose outputs were erratic. Somehow I had convinced myself that Qwen was a distilled model. In fact Qwen was dying for a negative prompt. And in WanGP 7.72 there is at last one for him. + +As Qwen is not so picky after all I have added also quantized text encoder which reduces the RAM requirements of Qwen by 10 GB (the text encoder quantized version produced garbage before) + +Unfortunately still the Sage bug for older GPU architectures. Added Sdpa fallback for these architectures. + +*7.73 update: still Sage / Sage2 bug for GPUs before RTX40xx. I have added a detection mechanism that forces Sdpa attention if that's the case* + + +### August 6 2025: WanGP v7.71 - Picky, picky + +This release comes with two new models : +- Qwen Image: a Commercial grade Image generator capable to inject full sentences in the generated Image while still offering incredible visuals +- Wan 2.2 TextImage to Video 5B: the last Wan 2.2 needed if you want to complete your Wan 2.2 collection (loras for this folder can be stored in "\loras\5B" ) + +There is catch though, they are very picky if you want to get good generations: first they both need lots of steps (50 ?) to show what they have to offer. Then for Qwen Image I had to hardcode the supported resolutions, because if you try anything else, you will get garbage. Likewise Wan 2.2 5B will remind you of Wan 1.0 if you don't ask for at least 720p. + +*7.71 update: Added VAE Tiling for both Qwen Image and Wan 2.2 TextImage to Video 5B, for low VRAM during a whole gen.* + + +### August 4 2025: WanGP v7.6 - Remuxed + +With this new version you won't have any excuse if there is no sound in your video. + +*Continue Video* now works with any video that has already some sound (hint: Multitalk ). + +Also, on top of MMaudio and the various sound driven models I have added the ability to use your own soundtrack. + +As a result you can apply a different sound source on each new video segment when doing a *Continue Video*. + +For instance: +- first video part: use Multitalk with two people speaking +- second video part: you apply your own soundtrack which will gently follow the multitalk conversation +- third video part: you use Vace effect and its corresponding control audio will be concatenated to the rest of the audio + +To multiply the combinations I have also implemented *Continue Video* with the various image2video models. + +Also: +- End Frame support added for LTX Video models +- Loras can now be targetted specifically at the High noise or Low noise models with Wan 2.2, check the Loras and Finetune guides +- Flux Krea Dev support + +### July 30 2025: WanGP v7.5: Just another release ... Wan 2.2 part 2 +Here is now Wan 2.2 image2video a very good model if you want to set Start and End frames. Two Wan 2.2 models delivered, only one to go ... + +Please note that although it is an image2video model it is structurally very close to Wan 2.2 text2video (same layers with only a different initial projection). Given that Wan 2.1 image2video loras don't work too well (half of their tensors are not supported), I have decided that this model will look for its loras in the text2video loras folder instead of the image2video folder. + +I have also optimized RAM management with Wan 2.2 so that loras and modules will be loaded only once in RAM and Reserved RAM, this saves up to 5 GB of RAM which can make a difference... 
+ +And this time I really removed Vace Cocktail Light which gave a blurry vision. + +### July 29 2025: WanGP v7.4: Just another release ... Wan 2.2 Preview +Wan 2.2 is here. The good news is that WanGP wont require a single byte of extra VRAM to run it and it will be as fast as Wan 2.1. The bad news is that you will need much more RAM if you want to leverage entirely this new model since it has twice has many parameters. + +So here is a preview version of Wan 2.2 that is without the 5B model and Wan 2.2 image to video for the moment. + +However as I felt bad to deliver only half of the wares, I gave you instead .....** Wan 2.2 Vace Experimental Cocktail** ! + +Very good surprise indeed, the loras and Vace partially work with Wan 2.2. We will need to wait for the official Vace 2.2 release since some Vace features are broken like identity preservation + +Bonus zone: Flux multi images conditions has been added, or maybe not if I broke everything as I have been distracted by Wan... + +7.4 update: I forgot to update the version number. I also removed Vace Cocktail light which didnt work well. + +### July 27 2025: WanGP v7.3 : Interlude +While waiting for Wan 2.2, you will appreciate the model selection hierarchy which is very useful to collect even more models. You will also appreciate that WanGP remembers which model you used last in each model family. + +### July 26 2025: WanGP v7.2 : Ode to Vace +I am really convinced that Vace can do everything the other models can do and in a better way especially as Vace can be combined with Multitalk. + +Here are some new Vace improvements: +- I have provided a default finetune named *Vace Cocktail* which is a model created on the fly using the Wan text 2 video model and the Loras used to build FusioniX. The weight of the *Detail Enhancer* Lora has been reduced to improve identity preservation. Copy the model definition in *defaults/vace_14B_cocktail.json* in the *finetunes/* folder to change the Cocktail composition. Cocktail contains already some Loras acccelerators so no need to add on top a Lora Accvid, Causvid or Fusionix, ... . The whole point of Cocktail is to be able to build you own FusioniX (which originally is a combination of 4 loras) but without the inconvenient of FusioniX. +- Talking about identity preservation, it tends to go away when one generates a single Frame instead of a Video which is shame for our Vace photoshop. But there is a solution : I have added an Advanced Quality option, that tells WanGP to generate a little more than a frame (it will still keep only the first frame). It will be a little slower but you will be amazed how Vace Cocktail combined with this option will preserve identities (bye bye *Phantom*). +- As in practise I have observed one switches frequently between *Vace text2video* and *Vace text2image* I have put them in the same place they are now just one tab away, no need to reload the model. Likewise *Wan text2video* and *Wan tex2image* have been merged. +- Color fixing when using Sliding Windows. A new postprocessing *Color Correction* applied automatically by default (you can disable it in the *Advanced tab Sliding Window*) will try to match the colors of the new window with that of the previous window. It doesnt fix all the unwanted artifacts of the new window but at least this makes the transition smoother. Thanks to the multitalk team for the original code. + +Also you will enjoy our new real time statistics (CPU / GPU usage, RAM / VRAM used, ... ). 
Many thanks to **Redtash1** for providing the framework for this new feature ! You need to go in the Config tab to enable real time stats. + + +### July 21 2025: WanGP v7.12 +- Flux Family Reunion : *Flux Dev* and *Flux Schnell* have been invited aboard WanGP. To celebrate that, Loras support for the Flux *diffusers* format has also been added. + +- LTX Video upgraded to version 0.9.8: you can now generate 1800 frames (1 min of video !) in one go without a sliding window. With the distilled model it will take only 5 minutes with a RTX 4090 (you will need 22 GB of VRAM though). I have added options to select higher humber frames if you want to experiment (go to Configuration Tab / General / Increase the Max Number of Frames, change the value and restart the App) + +- LTX Video ControlNet : it is a Control Net that allows you for instance to transfer a Human motion or Depth from a control video. It is not as powerful as Vace but can produce interesting things especially as now you can generate quickly a 1 min video. Under the scene IC-Loras (see below) for Pose, Depth and Canny are automatically loaded for you, no need to add them. + +- LTX IC-Lora support: these are special Loras that consumes a conditional image or video +Beside the pose, depth and canny IC-Loras transparently loaded there is the *detailer* (https://huggingface.co/Lightricks/LTX-Video-ICLoRA-detailer-13b-0.9.8) which is basically an upsampler. Add the *detailer* as a Lora and use LTX Raw Format as control net choice to use it. + +- Matanyone is now also for the GPU Poor as its VRAM requirements have been divided by 2! (7.12 shadow update) + +- Easier way to select video resolution + +### July 15 2025: WanGP v7.0 is an AI Powered Photoshop +This release turns the Wan models into Image Generators. This goes way more than allowing to generate a video made of single frame : +- Multiple Images generated at the same time so that you can choose the one you like best.It is Highly VRAM optimized so that you can generate for instance 4 720p Images at the same time with less than 10 GB +- With the *image2image* the original text2video WanGP becomes an image upsampler / restorer +- *Vace image2image* comes out of the box with image outpainting, person / object replacement, ... +- You can use in one click a newly Image generated as Start Image or Reference Image for a Video generation + +And to complete the full suite of AI Image Generators, Ladies and Gentlemen please welcome for the first time in WanGP : **Flux Kontext**.\ +As a reminder Flux Kontext is an image editor : give it an image and a prompt and it will do the change for you.\ +This highly optimized version of Flux Kontext will make you feel that you have been cheated all this time as WanGP Flux Kontext requires only 8 GB of VRAM to generate 4 images at the same time with no need for quantization. + +WanGP v7 comes with *Image2image* vanilla and *Vace FusinoniX*. However you can build your own finetune where you will combine a text2video or Vace model with any combination of Loras. + +Also in the news: +- You can now enter the *Bbox* for each speaker in *Multitalk* to precisely locate who is speaking. And to save some headaches the *Image Mask generator* will give you the *Bbox* coordinates of an area you have selected. 
+- *Film Grain* post processing to add a vintage look at your video +- *First Last Frame to Video* model should work much better now as I have discovered rencently its implementation was not complete +- More power for the finetuners, you can now embed Loras directly in the finetune definition. You can also override the default models (titles, visibility, ...) with your own finetunes. Check the doc that has been updated. + + +### July 10 2025: WanGP v6.7, is NAG a game changer ? you tell me +Maybe you knew that already but most *Loras accelerators* we use today (Causvid, FusioniX) don't use *Guidance* at all (that it is *CFG* is set to 1). This helps to get much faster generations but the downside is that *Negative Prompts* are completely ignored (including the default ones set by the models). **NAG** (https://github.com/ChenDarYen/Normalized-Attention-Guidance) aims to solve that by injecting the *Negative Prompt* during the *attention* processing phase. + +So WanGP 6.7 gives you NAG, but not any NAG, a *Low VRAM* implementation, the default one ends being VRAM greedy. You will find NAG in the *General* advanced tab for most Wan models. + +Use NAG especially when Guidance is set to 1. To turn it on set the **NAG scale** to something around 10. There are other NAG parameters **NAG tau** and **NAG alpha** which I recommend to change only if you don't get good results by just playing with the NAG scale. Don't hesitate to share on this discord server the best combinations for these 3 parameters. + +The authors of NAG claim that NAG can also be used when using a Guidance (CFG > 1) and to improve the prompt adherence. + +### July 8 2025: WanGP v6.6, WanGP offers you **Vace Multitalk Dual Voices Fusionix Infinite** : +**Vace** our beloved super Control Net has been combined with **Multitalk** the new king in town that can animate up to two people speaking (**Dual Voices**). It is accelerated by the **Fusionix** model and thanks to *Sliding Windows* support and *Adaptive Projected Guidance* (much slower but should reduce the reddish effect with long videos) your two people will be able to talk for very a long time (which is an **Infinite** amount of time in the field of video generation). + +Of course you will get as well *Multitalk* vanilla and also *Multitalk 720p* as a bonus. + +And since I am mister nice guy I have enclosed as an exclusivity an *Audio Separator* that will save you time to isolate each voice when using Multitalk with two people. + +As I feel like resting a bit I haven't produced yet a nice sample Video to illustrate all these new capabilities. But here is the thing, I ams sure you will publish in the *Share Your Best Video* channel your *Master Pieces*. The best ones will be added to the *Announcements Channel* and will bring eternal fame to its authors. + +But wait, there is more: +- Sliding Windows support has been added anywhere with Wan models, so imagine with text2video recently upgraded in 6.5 into a video2video, you can now upsample very long videos regardless of your VRAM. The good old image2video model can now reuse the last image to produce new videos (as requested by many of you) +- I have added also the capability to transfer the audio of the original control video (Misc. advanced tab) and an option to preserve the fps into the generated video, so from now on you will be to upsample / restore your old families video and keep the audio at their original pace. 
Be aware that the duration will be limited to 1000 frames as I still need to add streaming support for unlimited video sizes. + +Also, of interest too: +- Extract video info from Videos that have not been generated by WanGP, even better you can also apply post processing (Upsampling / MMAudio) on non WanGP videos +- Force the generated video fps to your liking, works wery well with Vace when using a Control Video +- Ability to chain URLs of Finetune models (for instance put the URLs of a model in your main finetune and reference this finetune in other finetune models to save time) + +### July 2 2025: WanGP v6.5.1, WanGP takes care of you: lots of quality of life features: +- View directly inside WanGP the properties (seed, resolutions, length, most settings...) of the past generations +- In one click use the newly generated video as a Control Video or Source Video to be continued +- Manage multiple settings for the same model and switch between them using a dropdown box +- WanGP will keep the last generated videos in the Gallery and will remember the last model you used if you restart the app but kept the Web page open +- Custom resolutions : add a file in the WanGP folder with the list of resolutions you want to see in WanGP (look at the instruction readme in this folder) + +Taking care of your life is not enough, you want new stuff to play with ? +- MMAudio directly inside WanGP : add an audio soundtrack that matches the content of your video. By the way it is a low VRAM MMAudio and 6 GB of VRAM should be sufficient. You will need to go in the *Extensions* tab of the WanGP *Configuration* to enable MMAudio +- Forgot to upsample your video during the generation ? want to try another MMAudio variation ? Fear not you can also apply upsampling or add an MMAudio track once the video generation is done. Even better you can ask WangGP for multiple variations of MMAudio to pick the one you like best +- MagCache support: a new step skipping approach, supposed to be better than TeaCache. Makes a difference if you usually generate with a high number of steps +- SageAttention2++ support : not just the compatibility but also a slightly reduced VRAM usage +- Video2Video in Wan Text2Video : this is the paradox, a text2video can become a video2video if you start the denoising process later on an existing video +- FusioniX upsampler: this is an illustration of Video2Video in Text2Video. Use the FusioniX text2video model with an output resolution of 1080p and a denoising strength of 0.25 and you will get one of the best upsamplers (in only 2/3 steps, you will need lots of VRAM though). Increase the denoising strength and you will get one of the best Video Restorer +- Choice of Wan Samplers / Schedulers +- More Lora formats support + +**If you had upgraded to v6.5 please upgrade again to 6.5.1 as this will fix a bug that ignored Loras beyond the first one** + +### June 23 2025: WanGP v6.3, Vace Unleashed. Thought we couldnt squeeze Vace even more ? +- Multithreaded preprocessing when possible for faster generations +- Multithreaded frames Lanczos Upsampling as a bonus +- A new Vace preprocessor : *Flow* to extract fluid motion +- Multi Vace Controlnets: you can now transfer several properties at the same time. This opens new possibilities to explore, for instance if you transfer *Human Movement* and *Shapes* at the same time for some reasons the lighting of your character will take into account much more the environment of your character. 
+- Injected Frames Outpainting, in case you missed it in WanGP 6.21 + +Don't know how to use all of the Vace features ? Check the Vace Guide embedded in WanGP as it has also been updated. + + +### June 19 2025: WanGP v6.2, Vace even more Powercharged +👋 Have I told you that I am a big fan of Vace ? Here are more goodies to unleash its power: +- If you ever wanted to watch Star Wars in 4:3, just use the new *Outpainting* feature and it will add the missing bits of image at the top and the bottom of the screen. The best thing is *Outpainting* can be combined with all the other Vace modifications, for instance you can change the main character of your favorite movie at the same time +- More processing can combined at the same time (for instance the depth process can be applied outside the mask) +- Upgraded the depth extractor to Depth Anything 2 which is much more detailed + +As a bonus, I have added two finetunes based on the Safe-Forcing technology (which requires only 4 steps to generate a video): Wan 2.1 text2video Self-Forcing and Vace Self-Forcing. I know there is Lora around but the quality of the Lora is worse (at least with Vace) compared to the full model. Don't hesitate to share your opinion about this on the discord server. +### June 17 2025: WanGP v6.1, Vace Powercharged +👋 Lots of improvements for Vace the Mother of all Models: +- masks can now be combined with on the fly processing of a control video, for instance you can extract the motion of a specific person defined by a mask +- on the fly modification of masks : reversed masks (with the same mask you can modify the background instead of the people covered by the masks), enlarged masks (you can cover more area if for instance the person you are trying to inject is larger than the one in the mask), ... +- view these modified masks directly inside WanGP during the video generation to check they are really as expected +- multiple frames injections: multiples frames can be injected at any location of the video +- expand past videos in on click: just select one generated video to expand it + +Of course all these new stuff work on all Vace finetunes (including Vace Fusionix). + +Thanks also to Reevoy24 for adding a Notfication sound at the end of a generation and for fixing the background color of the current generation summary. + +### June 12 2025: WanGP v6.0 +👋 *Finetune models*: You find the 20 models supported by WanGP not sufficient ? Too impatient to wait for the next release to get the support for a newly released model ? Your prayers have been answered: if a new model is compatible with a model architecture supported by WanGP, you can add by yourself the support for this model in WanGP by just creating a finetune model definition. You can then store this model in the cloud (for instance in Huggingface) and the very light finetune definition file can be easily shared with other users. WanGP will download automatically the finetuned model for them. + +To celebrate the new finetunes support, here are a few finetune gifts (directly accessible from the model selection menu): +- *Fast Hunyuan Video* : generate model t2v in only 6 steps +- *Hunyuan Vido AccVideo* : generate model t2v in only 5 steps +- *Wan FusioniX*: it is a combo of AccVideo / CausVid ans other models and can generate high quality Wan videos in only 8 steps + +One more thing... + +The new finetune system can be used to combine complementaty models : what happens when you combine Fusionix Text2Video and Vace Control Net ? 
+ +You get **Vace FusioniX**: the Ultimate Vace Model, Fast (10 steps, no need for guidance) and with a much better quality Video than the original slower model (despite being the best Control Net out there). Here goes one more finetune... + +Check the *Finetune Guide* to create finetune models definitions and share them on the WanGP discord server. + +### June 11 2025: WanGP v5.5 +👋 *Hunyuan Video Custom Audio*: it is similar to Hunyuan Video Avatar excpet there isn't any lower limit on the number of frames and you can use your reference images in a different context than the image itself\ +*Hunyuan Video Custom Edit*: Hunyuan Video Controlnet, use it to do inpainting and replace a person in a video while still keeping his poses. Similar to Vace but less restricted than the Wan models in terms of content... + +### June 6 2025: WanGP v5.41 +👋 Bonus release: Support for **AccVideo** Lora to speed up x2 Video generations in Wan models. Check the Loras documentation to get the usage instructions of AccVideo. + +### June 6 2025: WanGP v5.4 +👋 World Exclusive : Hunyuan Video Avatar Support ! You won't need 80 GB of VRAM nor 32 GB oF VRAM, just 10 GB of VRAM will be sufficient to generate up to 15s of high quality speech / song driven Video at a high speed with no quality degradation. Support for TeaCache included. + +### May 26, 2025: WanGP v5.3 +👋 Happy with a Video generation and want to do more generations using the same settings but you can't remember what you did or you find it too hard to copy/paste one per one each setting from the file metadata? Rejoice! There are now multiple ways to turn this tedious process into a one click task: +- Select one Video recently generated in the Video Gallery and click *Use Selected Video Settings* +- Click *Drop File Here* and select a Video you saved somewhere, if the settings metadata have been saved with the Video you will be able to extract them automatically +- Click *Export Settings to File* to save on your harddrive the current settings. You will be able to use them later again by clicking *Drop File Here* and select this time a Settings json file + +### May 23, 2025: WanGP v5.21 +👋 Improvements for Vace: better transitions between Sliding Windows, Support for Image masks in Matanyone, new Extend Video for Vace, different types of automated background removal + +### May 20, 2025: WanGP v5.2 +👋 Added support for Wan CausVid which is a distilled Wan model that can generate nice looking videos in only 4 to 12 steps. The great thing is that Kijai (Kudos to him!) has created a CausVid Lora that can be combined with any existing Wan t2v model 14B like Wan Vace 14B. See [LORAS.md](LORAS.md) for instructions on how to use CausVid. + +Also as an experiment I have added support for the MoviiGen, the first model that claims to be capable of generating 1080p videos (if you have enough VRAM (20GB...) and be ready to wait for a long time...). Don't hesitate to share your impressions on the Discord server. + +### May 18, 2025: WanGP v5.1 +👋 Bonus Day, added LTX Video 13B Distilled: generate in less than one minute, very high quality Videos! + +### May 17, 2025: WanGP v5.0 +👋 One App to Rule Them All! Added support for the other great open source architectures: +- **Hunyuan Video**: text 2 video (one of the best, if not the best t2v), image 2 video and the recently released Hunyuan Custom (very good identity preservation when injecting a person into a video) +- **LTX Video 13B** (released last week): very long video support and fast 720p generation. 
Wan GP version has been greatly optimized and reduced LTX Video VRAM requirements by 4! + +Also: +- Added support for the best Control Video Model, released 2 days ago: Vace 14B +- New Integrated prompt enhancer to increase the quality of the generated videos + +*You will need one more `pip install -r requirements.txt`* + +### May 5, 2025: WanGP v4.5 +👋 FantasySpeaking model, you can animate a talking head using a voice track. This works not only on people but also on objects. Also better seamless transitions between Vace sliding windows for very long videos. New high quality processing features (mixed 16/32 bits calculation and 32 bits VAE) + +### April 27, 2025: WanGP v4.4 +👋 Phantom model support, very good model to transfer people or objects into video, works quite well at 720p and with the number of steps > 30 + +### April 25, 2025: WanGP v4.3 +👋 Added preview mode and support for Sky Reels v2 Diffusion Forcing for high quality "infinite length videos". Note that Skyreel uses causal attention that is only supported by Sdpa attention so even if you choose another type of attention, some of the processes will use Sdpa attention. + +### April 18, 2025: WanGP v4.2 +👋 FLF2V model support, official support from Wan for image2video start and end frames specialized for 720p. + +### April 17, 2025: WanGP v4.1 +👋 Recam Master model support, view a video from a different angle. The video to process must be at least 81 frames long and you should set at least 15 steps denoising to get good results. + +### April 13, 2025: WanGP v4.0 +👋 Lots of goodies for you! +- A new UI, tabs were replaced by a Dropdown box to easily switch models +- A new queuing system that lets you stack in a queue as many text2video, image2video tasks, ... as you want. Each task can rely on complete different generation parameters (different number of frames, steps, loras, ...). Many thanks to **Tophness** for being a big contributor on this new feature +- Temporal upsampling (Rife) and spatial upsampling (Lanczos) for a smoother video (32 fps or 64 fps) and to enlarge your video by x2 or x4. Check these new advanced options. +- Wan Vace Control Net support: with Vace you can inject in the scene people or objects, animate a person, perform inpainting or outpainting, continue a video, ... See [VACE.md](VACE.md) for introduction guide. +- Integrated *Matanyone* tool directly inside WanGP so that you can create easily inpainting masks used in Vace +- Sliding Window generation for Vace, create windows that can last dozens of seconds +- New optimizations for old generation GPUs: Generate 5s (81 frames, 15 steps) of Vace 1.3B with only 5GB and in only 6 minutes on a RTX 2080Ti and 5s of t2v 14B in less than 10 minutes. + +### March 27, 2025 +👋 Added support for the new Wan Fun InP models (image2video). The 14B Fun InP has probably better end image support but unfortunately existing loras do not work so well with it. The great novelty is the Fun InP image2 1.3B model: Image 2 Video is now accessible to even lower hardware configuration. It is not as good as the 14B models but very impressive for its size. Many thanks to the VideoX-Fun team (https://github.com/aigc-apps/VideoX-Fun) + +### March 26, 2025 +👋 Good news! Official support for RTX 50xx please check the [installation instructions](INSTALLATION.md). + +### March 24, 2025: Wan2.1GP v3.2 +👋 +- Added Classifier-Free Guidance Zero Star. The video should match better the text prompt (especially with text2video) at no performance cost: many thanks to the **CFG Zero * Team**. 
Don't hesitate to give them a star if you appreciate the results: https://github.com/WeichenFan/CFG-Zero-star +- Added back support for PyTorch compilation with Loras. It seems it had been broken for some time +- Added possibility to keep a number of pregenerated videos in the Video Gallery (useful to compare outputs of different settings) + +*You will need one more `pip install -r requirements.txt`* + +### March 19, 2025: Wan2.1GP v3.1 +👋 Faster launch and RAM optimizations (should require less RAM to run) + +*You will need one more `pip install -r requirements.txt`* + +### March 18, 2025: Wan2.1GP v3.0 +👋 +- New Tab based interface, you can switch from i2v to t2v conversely without restarting the app +- Experimental Dual Frames mode for i2v, you can also specify an End frame. It doesn't always work, so you will need a few attempts. +- You can save default settings in the files *i2v_settings.json* and *t2v_settings.json* that will be used when launching the app (you can also specify the path to different settings files) +- Slight acceleration with loras + +*You will need one more `pip install -r requirements.txt`* + +Many thanks to *Tophness* who created the framework (and did a big part of the work) of the multitabs and saved settings features + +### March 18, 2025: Wan2.1GP v2.11 +👋 Added more command line parameters to prefill the generation settings + customizable output directory and choice of type of metadata for generated videos. Many thanks to *Tophness* for his contributions. + +*You will need one more `pip install -r requirements.txt` to reflect new dependencies* + +### March 18, 2025: Wan2.1GP v2.1 +👋 More Loras!: added support for 'Safetensors' and 'Replicate' Lora formats. + +*You will need to refresh the requirements with a `pip install -r requirements.txt`* + +### March 17, 2025: Wan2.1GP v2.0 +👋 The Lora festival continues: +- Clearer user interface +- Download 30 Loras in one click to try them all (expand the info section) +- Very easy to use Loras as now Lora presets can input the subject (or other needed terms) of the Lora so that you don't have to modify manually a prompt +- Added basic macro prompt language to prefill prompts with different values. With one prompt template, you can generate multiple prompts. +- New Multiple images prompts: you can now combine any number of images with any number of text prompts (need to launch the app with --multiple-images) +- New command lines options to launch directly the 1.3B t2v model or the 14B t2v model + +### March 14, 2025: Wan2.1GP v1.7 +👋 +- Lora Fest special edition: very fast loading/unload of loras for those Loras collectors around. You can also now add/remove loras in the Lora folder without restarting the app. +- Added experimental Skip Layer Guidance (advanced settings), that should improve the image quality at no extra cost. Many thanks to the *AmericanPresidentJimmyCarter* for the original implementation + +*You will need to refresh the requirements `pip install -r requirements.txt`* + +### March 13, 2025: Wan2.1GP v1.6 +👋 Better Loras support, accelerated loading Loras. 
+
+*You will need to refresh the requirements with a `pip install -r requirements.txt`*
+
+### March 10, 2025: Wan2.1GP v1.5
+👋 Official TeaCache support + Smart TeaCache (automatically finds the best parameters for a requested speed multiplier), 10% speed boost with no quality loss, improved lora presets (they can now include prompts and comments to guide the user)
+
+### March 7, 2025: Wan2.1GP v1.4
+👋 Fixed PyTorch compilation, it is now really 20% faster when activated
+
+### March 4, 2025: Wan2.1GP v1.3
+👋 Support for Image to Video with multiple images for different images/prompts combinations (requires the *--multiple-images* switch), and added the command line option *--preload x* to preload x MB of the main diffusion model in VRAM if you find there is too much unused VRAM and you want to (slightly) accelerate the generation process.
+
+*If you upgrade you will need to do a `pip install -r requirements.txt` again.*
+
+### March 4, 2025: Wan2.1GP v1.2
+👋 Implemented tiling on VAE encoding and decoding. No more VRAM peaks at the beginning and at the end
+
+### March 3, 2025: Wan2.1GP v1.1
+👋 Added TeaCache support for faster generations: an optimization of kijai's implementation (https://github.com/kijai/ComfyUI-WanVideoWrapper/) of TeaCache (https://github.com/ali-vilab/TeaCache)
+
+### March 2, 2025: Wan2.1GP by DeepBeepMeep v1
+👋 Brings:
+- Support for all Wan models, including the Image to Video model
+- Memory consumption reduced by a factor of 2, with the possibility to generate more than 10s of video at 720p with an RTX 4090 and 10s of video at 480p with less than 12GB of VRAM. Many thanks to RIFLEx (https://github.com/thu-ml/RIFLEx) for their algorithm that allows generating nice-looking videos longer than 5s.
+- The usual perks: web interface, multiple generations, loras support, sage attention, auto download of models, ...
+
+## Original Wan Releases
+
+### February 25, 2025
+👋 We've released the inference code and weights of Wan2.1.
+
+### February 27, 2025
+👋 Wan2.1 has been integrated into [ComfyUI](https://comfyanonymous.github.io/ComfyUI_examples/wan/). Enjoy!
\ No newline at end of file
diff --git a/docs/CLI.md b/docs/CLI.md
new file mode 100644
index 0000000000000000000000000000000000000000..38538b38e0178449999243f58c46041654eb73c4
--- /dev/null
+++ b/docs/CLI.md
@@ -0,0 +1,226 @@
+# Command Line Reference
+
+This document covers all available command line options for WanGP.
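+
+If you just want to check which switches your local copy actually supports, the built-in help listing is a quick reference (this assumes the launcher exposes the standard `--help` flag of its argument parser):
+
+```bash
+# Print every available option together with its description
+python wgp.py --help
+```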
+ +## Basic Usage + +```bash +# Default launch +python wgp.py + +# Specific model modes +python wgp.py --i2v # Image-to-video +python wgp.py --t2v # Text-to-video (default) +python wgp.py --t2v-14B # 14B text-to-video model +python wgp.py --t2v-1-3B # 1.3B text-to-video model +python wgp.py --i2v-14B # 14B image-to-video model +python wgp.py --i2v-1-3B # Fun InP 1.3B image-to-video model +python wgp.py --vace-1-3B # VACE ControlNet 1.3B model +``` + +## Model and Performance Options + +### Model Configuration +```bash +--quantize-transformer BOOL # Enable/disable transformer quantization (default: True) +--compile # Enable PyTorch compilation (requires Triton) +--attention MODE # Force attention mode: sdpa, flash, sage, sage2 +--profile NUMBER # Performance profile 1-5 (default: 4) +--preload NUMBER # Preload N MB of diffusion model in VRAM +--fp16 # Force fp16 instead of bf16 models +--gpu DEVICE # Run on specific GPU device (e.g., "cuda:1") +``` + +### Performance Profiles +- **Profile 1**: Load entire current model in VRAM and keep all unused models in reserved RAM for fast VRAM tranfers +- **Profile 2**: Load model parts as needed, keep all unused models in reserved RAM for fast VRAM tranfers +- **Profile 3**: Load entire current model in VRAM (requires 24GB for 14B model) +- **Profile 4**: Default and recommended, load model parts as needed, most flexible option +- **Profile 5**: Minimum RAM usage + +### Memory Management +```bash +--perc-reserved-mem-max FLOAT # Max percentage of RAM for reserved memory (< 0.5) +``` + +## Lora Configuration + +```bash +--lora-dir PATH # Path to Wan t2v loras directory +--lora-dir-i2v PATH # Path to Wan i2v loras directory +--lora-dir-hunyuan PATH # Path to Hunyuan t2v loras directory +--lora-dir-hunyuan-i2v PATH # Path to Hunyuan i2v loras directory +--lora-dir-ltxv PATH # Path to LTX Video loras directory +--lora-preset PRESET # Load lora preset file (.lset) on startup +--check-loras # Filter incompatible loras (slower startup) +``` + +## Generation Settings + +### Basic Generation +```bash +--seed NUMBER # Set default seed value +--frames NUMBER # Set default number of frames to generate +--steps NUMBER # Set default number of denoising steps +--advanced # Launch with advanced mode enabled +``` + +### Advanced Generation +```bash +--teacache MULTIPLIER # TeaCache speed multiplier: 0, 1.5, 1.75, 2.0, 2.25, 2.5 +``` + +## Interface and Server Options + +### Server Configuration +```bash +--server-port PORT # Gradio server port (default: 7860) +--server-name NAME # Gradio server name (default: localhost) +--listen # Make server accessible on network +--share # Create shareable HuggingFace URL for remote access +--open-browser # Open browser automatically when launching +``` + +### Interface Options +```bash +--lock-config # Prevent modifying video engine configuration from interface +--theme THEME_NAME # UI theme: "default" or "gradio" +``` + +## File and Directory Options + +```bash +--settings PATH # Path to folder containing default settings for all models +--verbose LEVEL # Information level 0-2 (default: 1) +``` + +## Examples + +### Basic Usage Examples +```bash +# Launch with specific model and loras +python wgp.py --t2v-14B --lora-preset mystyle.lset + +# High-performance setup with compilation +python wgp.py --compile --attention sage2 --profile 3 + +# Low VRAM setup +python wgp.py --t2v-1-3B --profile 4 --attention sdpa + +# Multiple images with custom lora directory +python wgp.py --i2v --multiple-images --lora-dir /path/to/shared/loras 
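+
+# Switches can be freely combined; for example, a network-accessible server
+# that preloads part of the diffusion model in VRAM (illustrative values,
+# adjust --preload to your hardware)
+python wgp.py --t2v-14B --listen --preload 1000 --attention sage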
+``` + +### Server Configuration Examples +```bash +# Network accessible server +python wgp.py --listen --server-port 8080 + +# Shareable server with custom theme +python wgp.py --share --theme gradio --open-browser + +# Locked configuration for public use +python wgp.py --lock-config --share +``` + +### Advanced Performance Examples +```bash +# Maximum performance (requires high-end GPU) +python wgp.py --compile --attention sage2 --profile 3 --preload 2000 + +# Optimized for RTX 2080Ti +python wgp.py --profile 4 --attention sdpa --teacache 2.0 + +# Memory-efficient setup +python wgp.py --fp16 --profile 4 --perc-reserved-mem-max 0.3 +``` + +### TeaCache Configuration +```bash +# Different speed multipliers +python wgp.py --teacache 1.5 # 1.5x speed, minimal quality loss +python wgp.py --teacache 2.0 # 2x speed, some quality loss +python wgp.py --teacache 2.5 # 2.5x speed, noticeable quality loss +python wgp.py --teacache 0 # Disable TeaCache +``` + +## Attention Modes + +### SDPA (Default) +```bash +python wgp.py --attention sdpa +``` +- Available by default with PyTorch +- Good compatibility with all GPUs +- Moderate performance + +### Sage Attention +```bash +python wgp.py --attention sage +``` +- Requires Triton installation +- 30% faster than SDPA +- Small quality cost + +### Sage2 Attention +```bash +python wgp.py --attention sage2 +``` +- Requires Triton and SageAttention 2.x +- 40% faster than SDPA +- Best performance option + +### Flash Attention +```bash +python wgp.py --attention flash +``` +- May require CUDA kernel compilation +- Good performance +- Can be complex to install on Windows + +## Troubleshooting Command Lines + +### Fallback to Basic Setup +```bash +# If advanced features don't work +python wgp.py --attention sdpa --profile 4 --fp16 +``` + +### Debug Mode +```bash +# Maximum verbosity for troubleshooting +python wgp.py --verbose 2 --check-loras +``` + +### Memory Issue Debugging +```bash +# Minimal memory usage +python wgp.py --profile 4 --attention sdpa --perc-reserved-mem-max 0.2 +``` + + + +## Configuration Files + +### Settings Files +Load custom settings: +```bash +python wgp.py --settings /path/to/settings/folder +``` + +### Lora Presets +Create and share lora configurations: +```bash +# Load specific preset +python wgp.py --lora-preset anime_style.lset + +# With custom lora directory +python wgp.py --lora-preset mystyle.lset --lora-dir /shared/loras +``` + +## Environment Variables + +While not command line options, these environment variables can affect behavior: +- `CUDA_VISIBLE_DEVICES` - Limit visible GPUs +- `PYTORCH_CUDA_ALLOC_CONF` - CUDA memory allocation settings +- `TRITON_CACHE_DIR` - Triton cache directory (for Sage attention) \ No newline at end of file diff --git a/docs/FINETUNES.md b/docs/FINETUNES.md new file mode 100644 index 0000000000000000000000000000000000000000..7f9dc3f44699c58ff6bed7ad3f3900a2ada37297 --- /dev/null +++ b/docs/FINETUNES.md @@ -0,0 +1,131 @@ +# FINETUNES + +A Finetuned model is model that shares the same architecture of one specific model but has derived weights from this model. Some finetuned models have been created by combining multiple finetuned models. + +As there are potentially an infinite number of finetunes, specific finetuned models are not known by default by WanGP. However you can create a finetuned model definition that will tell WanGP about the existence of this finetuned model and WanGP will do as usual all the work for you: autodownload the model and build the user interface. 
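+
+For orientation, the pieces involved end up laid out roughly as follows (the json file name is the *hunyuan_t2v_fast* example used later in this guide; the folder names are the ones WanGP uses):
+
+```
+Wan2GP/
+├── finetunes/
+│   └── hunyuan_t2v_fast.json    <- your finetune definition file
+└── ckpts/                       <- downloaded model weights land here
+```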
+
+The WanGP finetune system can also be used to tweak default models: for instance you can add on top of an existing model some loras that will always be applied transparently.
+
+Finetune model definitions are lightweight json files that can be easily shared. You can find some of them on the WanGP *discord* server https://discord.gg/g7efUW9jGV
+
+All the finetune definition files should be stored in the *finetunes/* subfolder.
+
+Finetuned models have been tested so far with Wan2.1 text2video, Wan2.1 image2video and Hunyuan Video text2video. There isn't currently any support for LTX Video finetunes.
+
+
+
+## Create a new Finetune Model Definition
+All the finetune model definitions are json files stored in the **finetunes/** subfolder. All the corresponding finetune model weights, once downloaded, will be stored in the *ckpts/* subfolder and will sit next to the base models.
+
+All the models used by WanGP are also described using the finetunes json format and can be found in the **defaults/** subfolder. Please don't modify any file in the **defaults/** folder.
+
+However you can use these files as starting points for new definition files and to get an idea of the structure of a definition file. If you want to change how a base model is handled (title, default settings, path to model weights, …) you may override any property of the default finetune definition file by creating a new file in the finetunes folder with the same name. Everything will happen as if the two definitions were merged property by property, with a higher priority given to the finetune model definition.
+
+A definition is built from a *settings file* that contains all the default parameters for a video generation. On top of this file, a subtree named **model** contains all the information regarding the finetune (URLs to download the model, corresponding base model id, ...).
+
+You can obtain a settings file in several ways:
+- In the subfolder **settings**, get the json file that corresponds to the base model of your finetune (see the next section for the list of ids of base models)
+- From the user interface, select the base model for which you want to create a finetune and click **export settings**
+
+Here are the steps:
+1) Create a *settings file*
+2) Add a **model** subtree with the finetune description
+3) Save this file in the subfolder **finetunes**. The name used for the file will be used as its id. It is good practice to prefix the name of this file with the base model. For instance, a finetune named **Fast** based on the Hunyuan Text 2 Video model could be saved as *hunyuan_t2v_fast.json*. In this example the id is *hunyuan_t2v_fast*.
+4) Restart WanGP
+
+## Architecture Models Ids
+A finetune is derived from a base model and will inherit all the user interface and corresponding model capabilities. Here are some architecture ids:
+- *t2v*: Wan 2.1 Video text 2 video
+- *i2v*: Wan 2.1 Video image 2 video 480p and 720p
+- *vace_14B*: Wan 2.1 Vace 14B
+- *hunyuan*: Hunyuan Video text 2 video
+- *hunyuan_i2v*: Hunyuan Video image 2 video
+
+Any file name in the defaults subfolder (without the json extension) corresponds to an architecture id.
+
+Please note that the weights of some architectures are a combination of the weights of another architecture, completed by the weights of one or more modules.
+
+A module is a set of weights that is not sufficient to form a model by itself but that can be added to an existing model to extend its capabilities.
+
+For instance, if one adds the module *vace_14B* on top of a model with the *t2v* architecture, one gets a model with the *vace_14B* architecture. Here *vace_14B* stands for both an architecture name and a module name. The module system allows you to reuse shared weights between models.
+
+
+## The Model Subtree
+- *name*: name of the finetune, used to select it
+- *architecture*: architecture id of the base model of the finetune (see previous section)
+- *description*: description of the finetune that will appear at the top
+- *URLs*: URLs of all the finetune versions (quantized / non quantized). WanGP will pick the version that is closest to the user preferences. You will need to follow a naming convention to help WanGP identify the content of each version (see next section). Right now WanGP supports only 8-bit quantized models that have been quantized using **quanto**. WanGP offers a command switch to easily build such a quantized model (see below). *URLs* can also contain paths to local files to allow testing.
+- *URLs2*: URLs of all the finetune versions (quantized / non quantized) of the weights used for the second phase of a model. For instance with Wan 2.2, the first phase contains the High Noise model weights and the second phase contains the Low Noise model weights. This feature can be used with models other than Wan 2.2 to combine different model weights during the same video generation.
+- *modules*: this is a list of modules to be combined with the models referenced by the URLs. A module is a model extension that is merged with a model to expand its capabilities. Supported modules so far are *vace_14B* and *multitalk*. For instance the full Vace model is the fusion of a Wan text 2 video model and the Vace module.
+- *preload_URLs*: URLs of files to download no matter what (used to load quantization maps for instance)
+- *loras*: URLs of Loras that will be applied before any other Lora specified by the user. These will quite often be Lora accelerators. For instance, if you specify here the FusioniX Lora you will be able to reduce the number of generation steps to 10
+- *loras_multipliers*: a list of float numbers or strings that defines the weight of each Lora mentioned in *loras*. The string syntax is used if you want your lora multiplier to change over the steps (please check the Loras doc) or if you want a multiplier to be applied only during a specific High Noise or Low Noise phase of a Wan 2.2 model. For instance, in the example below the multiplier is only applied during the High Noise phase: for half of the steps of this phase the multiplier is 1 and for the other half it is 1.1.
+```
+"loras" : [ "my_lora.safetensors"],
+"loras_multipliers" : [ "1,1.1;0"]
+```
+
+- *auto_quantize*: if set to True and no quantized model URL is provided, WanGP will perform on-the-fly quantization if the user has requested a quantized model
+- *visible*: assumed to be true by default. If set to false the model will no longer be visible. This can be useful if you create a finetune to override a default model and hide it.
+- *image_outputs*: turns any model that generates a video into a model that generates images. In fact it will adapt the user interface for image generation and ask the model to generate a video with a single frame.
+
+In order to favor reusability, the *URLs*, *modules*, *loras* and *preload_URLs* properties can contain, instead of a list of URLs, a single string which corresponds to the id of a finetune or default model to reuse. Instead of:
+```
+    "URLs": [
+        "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_mbf16.safetensors",
+        "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_quanto_mbf16_int8.safetensors",
+        "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_high_quanto_mfp16_int8.safetensors"
+    ],
+    "URLs2": [
+        "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_mbf16.safetensors",
+        "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_quanto_mbf16_int8.safetensors",
+        "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_text2video_14B_low_quanto_mfp16_int8.safetensors"
+    ],
+```
+You can write:
+```
+    "URLs": "t2v_2_2",
+    "URLs2": "t2v_2_2",
+```
+
+
+Example of a **model** subtree:
+```
+    "model":
+    {
+        "name": "Wan text2video FusioniX 14B",
+        "architecture" : "t2v",
+        "description": "A powerful merged text-to-video model based on the original WAN 2.1 T2V model, enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_fp16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_fp16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_bf16_int8.safetensors"
+        ],
+        "preload_URLs": [
+        ],
+        "auto_quantize": true
+    },
+```
+
+## Finetune Model Naming Convention
+If a model is not quantized, it is assumed to be mostly 16 bits (with maybe a few 32-bit weights), so *bf16* or *fp16* should appear somewhere in the name. If you need examples, just look at the **ckpts** subfolder; the naming convention for the base models is the same.
+
+If a model is quantized, the term *quanto* should also be included since WanGP supports for the moment only *quanto* quantized models. More specifically, you should replace *fp16* by *quanto_fp16_int8* or *bf16* by *quanto_bf16_int8*.
+
+Please note it is important that *bf16*, *fp16* and *quanto* are all in lowercase letters.
+
+## Creating a Quanto Quantized file
+If you launch the app with the *--save-quantized* switch, WanGP will create a quantized file in the **ckpts** subfolder just after the model has been loaded. Please note that the model will be *bf16* or *fp16* quantized depending on what you chose in the configuration menu.
+
+1) Make sure that in the finetune definition json file there is only a URL or filepath that points to the non-quantized model
+2) Launch WanGP with *python wgp.py --save-quantized*
+3) In the configuration menu, for the *Transformer Data Type* property choose either *BF16* or *FP16*
+4) Launch a video generation (the settings used do not matter). As soon as the model is loaded, a new quantized model will be created in the **ckpts** subfolder if it doesn't already exist.
+5) WanGP will automatically update the finetune definition file with the local path of the newly created quantized file (the "URLs" list will have an extra value such as *"ckpts/finetune_quanto_fp16_int8.safetensors"*)
+6) Remove *--save-quantized*, restart WanGP and select *Scaled Int8 Quantization* in the *Transformer Model Quantization* property
+7) Launch a new generation and verify in the terminal window that the right quantized model is loaded
+8) In order to share the finetune definition file you will need to store the finetune model weights in the cloud. You can upload them for instance to *Hugging Face*. You can then replace the local path in the finetune definition file with a URL (on Hugging Face, to get the URL of the model file, click *Copy download link* when viewing the model file)
+
+You need to create a quantized model specifically for *bf16* or *fp16* as these cannot be converted on the fly. However, there is no need to do this for a non-quantized model, as it can be converted on the fly while being loaded.
+
+Wan models support both the *fp16* and *bf16* data types, although *fp16* delivers in theory better quality. In contrast, Hunyuan and LTXV support only *bf16*.
diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md
new file mode 100644
index 0000000000000000000000000000000000000000..2449e4f263f7d897165bfc7eb2b072bc1ab23ae5
--- /dev/null
+++ b/docs/GETTING_STARTED.md
@@ -0,0 +1,194 @@
+# Getting Started with WanGP
+
+This guide will help you get started with WanGP video generation quickly and easily.
+
+## Prerequisites
+
+Before starting, ensure you have:
+- A compatible GPU (RTX 10XX or newer recommended)
+- Python 3.10.9 installed
+- At least 6GB of VRAM for basic models
+- Internet connection for model downloads
+
+## Quick Setup
+
+### Option 1: One-Click Installation (Recommended)
+Use [Pinokio App](https://pinokio.computer/) for the easiest installation experience.
+
+### Option 2: Manual Installation
+```bash
+git clone https://github.com/deepbeepmeep/Wan2GP.git
+cd Wan2GP
+conda create -n wan2gp python=3.10.9
+conda activate wan2gp
+pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
+pip install -r requirements.txt
+```
+
+For detailed installation instructions, see [INSTALLATION.md](INSTALLATION.md).
+
+## First Launch
+
+### Basic Launch
+```bash
+python wgp.py
+```
+This launches the WanGP generator with default settings. You will be able to pick the model you want to use from a dropdown menu.
+
+### Alternative Modes
+```bash
+python wgp.py --i2v       # Wan image-to-video mode
+python wgp.py --t2v-1-3B  # Smaller, faster Wan model
+```
+
+## Understanding the Interface
+
+When you launch WanGP, you'll see a web interface with several sections:
+
+### Main Generation Panel
+- **Model Selection**: Dropdown to choose between different models
+- **Prompt**: Text description of what you want to generate
+- **Generate Button**: Start the video generation process
+
+### Advanced Settings (click checkbox to enable)
+- **Generation Settings**: Steps, guidance, seeds
+- **Loras**: Additional style customizations
+- **Sliding Window**: For longer videos
+
+## Your First Video
+
+Let's generate a simple text-to-video:
+
+1. **Launch WanGP**: `python wgp.py`
+2. **Open Browser**: Navigate to `http://localhost:7860`
+3. **Enter Prompt**: "A cat walking in a garden"
+4. **Click Generate**: Wait for the video to be created
+5. 
**View Result**: The video will appear in the output section + +### Recommended First Settings +- **Model**: Wan 2.1 text2video 1.3B (faster, lower VRAM) +- **Frames**: 49 (about 2 seconds) +- **Steps**: 20 (good balance of speed/quality) + +## Model Selection + +### Text-to-Video Models +- **Wan 2.1 T2V 1.3B**: Fastest, lowest VRAM (6GB), good quality +- **Wan 2.1 T2V 14B**: Best quality, requires more VRAM (12GB+) +- **Hunyuan Video**: Excellent quality, slower generation +- **LTX Video**: Good for longer videos + +### Image-to-Video Models +- **Wan Fun InP 1.3B**: Fast image animation +- **Wan Fun InP 14B**: Higher quality image animation +- **VACE**: Advanced control over video generation + +### Choosing the Right Model +- **Low VRAM (6-8GB)**: Use 1.3B models +- **Medium VRAM (10-12GB)**: Use 14B models or Hunyuan +- **High VRAM (16GB+)**: Any model, longer videos + +## Basic Settings Explained + +### Generation Settings +- **Frames**: Number of frames (more = longer video) + - 25 frames ≈ 1 second + - 49 frames ≈ 2 seconds + - 73 frames ≈ 3 seconds + +- **Steps**: Quality vs Speed tradeoff + - 15 steps: Fast, lower quality + - 20 steps: Good balance + - 30+ steps: High quality, slower + +- **Guidance Scale**: How closely to follow the prompt + - 3-5: More creative interpretation + - 7-10: Closer to prompt description + - 12+: Very literal interpretation + +### Seeds +- **Random Seed**: Different result each time +- **Fixed Seed**: Reproducible results +- **Use same seed + prompt**: Generate variations + +## Common Beginner Issues + +### "Out of Memory" Errors +1. Use smaller models (1.3B instead of 14B) +2. Reduce frame count +3. Lower resolution in advanced settings +4. Enable quantization (usually on by default) + +### Slow Generation +1. Use 1.3B models for speed +2. Reduce number of steps +3. Install Sage attention (see [INSTALLATION.md](INSTALLATION.md)) +4. Enable TeaCache: `python wgp.py --teacache 2.0` + +### Poor Quality Results +1. Increase number of steps (25-30) +2. Improve prompt description +3. Use 14B models if you have enough VRAM +4. Enable Skip Layer Guidance in advanced settings + +## Writing Good Prompts + +### Basic Structure +``` +[Subject] [Action] [Setting] [Style/Quality modifiers] +``` + +### Examples +``` +A red sports car driving through a mountain road at sunset, cinematic, high quality + +A woman with long hair walking on a beach, waves in the background, realistic, detailed + +A cat sitting on a windowsill watching rain, cozy atmosphere, soft lighting +``` + +### Tips +- Be specific about what you want +- Include style descriptions (cinematic, realistic, etc.) +- Mention lighting and atmosphere +- Describe the setting in detail +- Use quality modifiers (high quality, detailed, etc.) + +## Next Steps + +Once you're comfortable with basic generation: + +1. **Explore Advanced Features**: + - [Loras Guide](LORAS.md) - Customize styles and characters + - [VACE ControlNet](VACE.md) - Advanced video control + - [Command Line Options](CLI.md) - Optimize performance + +2. **Improve Performance**: + - Install better attention mechanisms + - Optimize memory settings + - Use compilation for speed + +3. 
**Join the Community**: + - [Discord Server](https://discord.gg/g7efUW9jGV) - Get help and share videos + - Share your best results + - Learn from other users + +## Troubleshooting First Steps + +### Installation Issues +- Ensure Python 3.10.9 is used +- Check CUDA version compatibility +- See [INSTALLATION.md](INSTALLATION.md) for detailed steps + +### Generation Issues +- Check GPU compatibility +- Verify sufficient VRAM +- Try basic settings first +- See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for specific issues + +### Performance Issues +- Use appropriate model for your hardware +- Enable performance optimizations +- Check [CLI.md](CLI.md) for optimization flags + +Remember: Start simple and gradually explore more advanced features as you become comfortable with the basics! \ No newline at end of file diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md new file mode 100644 index 0000000000000000000000000000000000000000..361f266848be4bcb61ebbdc8500bcd95ea25fb60 --- /dev/null +++ b/docs/INSTALLATION.md @@ -0,0 +1,118 @@ +# Installation Guide + +This guide covers installation for different GPU generations and operating systems. + +## Requirements + +- Python 3.10.9 +- Conda or Python venv +- Compatible GPU (RTX 10XX or newer recommended) + +## Installation for RTX 10XX to RTX 50XX (Stable) + +This installation uses PyTorch 2.7.0 which is well-tested and stable. + +### Step 1: Download and Setup Environment + +```shell +# Clone the repository +git clone https://github.com/deepbeepmeep/Wan2GP.git +cd Wan2GP + +# Create Python 3.10.9 environment using conda +conda create -n wan2gp python=3.10.9 +conda activate wan2gp +``` + +### Step 2: Install PyTorch + +```shell +# Install PyTorch 2.7.0 with CUDA 12.8 +pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128 +``` + +### Step 3: Install Dependencies + +```shell +# Install core dependencies +pip install -r requirements.txt +``` + +### Step 4: Optional Performance Optimizations + +#### Sage Attention (30% faster), don't install with RTX 50xx as it is not compatible + +```shell +# Windows only: Install Triton +pip install triton-windows + +# For both Windows and Linux +pip install sageattention==1.0.6 +``` + +#### Sage 2 Attention (40% faster) + +```shell +# Windows +pip install triton-windows +pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu126torch2.6.0-cp310-cp310-win_amd64.whl + +# Linux (manual compilation required) +python -m pip install "setuptools<=75.8.2" --force-reinstall +git clone https://github.com/thu-ml/SageAttention +cd SageAttention +pip install -e . 
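+
+# Optional sanity check - this assumes the package installs a module named
+# "sageattention"; if the import fails, revisit the steps above
+python -c "import sageattention; print('SageAttention import OK')"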
+```
+
+#### Flash Attention
+
+```shell
+# May require CUDA kernel compilation on Windows
+pip install flash-attn==2.7.2.post1
+```
+
+
+## Attention Modes
+
+WanGP supports several attention implementations:
+
+- **SDPA** (default): Available by default with PyTorch
+- **Sage**: 30% speed boost with small quality cost
+- **Sage2**: 40% speed boost
+- **Flash**: Good performance, may be complex to install on Windows
+
+### Attention GPU Compatibility
+
+- RTX 10XX, 20XX: SDPA
+- RTX 30XX, 40XX: SDPA, Flash Attention, Xformers, Sage, Sage2
+- RTX 50XX: SDPA, Flash Attention, Xformers, Sage2
+
+## Performance Profiles
+
+Choose a profile based on your hardware:
+
+- **Profile 3 (LowRAM_HighVRAM)**: Loads the entire model in VRAM, requires 24GB VRAM for an 8-bit quantized 14B model
+- **Profile 4 (LowRAM_LowVRAM)**: Default, loads model parts as needed, slower but lower VRAM requirement
+
+## Troubleshooting
+
+### Sage Attention Issues
+
+If Sage attention doesn't work:
+
+1. Check if Triton is properly installed
+2. Clear the Triton cache
+3. Fall back to SDPA attention:
+   ```bash
+   python wgp.py --attention sdpa
+   ```
+
+### Memory Issues
+
+- Use lower resolution or shorter videos
+- Enable quantization (default)
+- Use Profile 4 for lower VRAM usage
+- Consider using 1.3B models instead of 14B models
+
+
+For more troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md)
diff --git a/docs/LORAS.md b/docs/LORAS.md
new file mode 100644
index 0000000000000000000000000000000000000000..f17b51f0bc9c0a649e0b2fd4c7ec164fedc547db
--- /dev/null
+++ b/docs/LORAS.md
@@ -0,0 +1,387 @@
+# Loras Guide
+
+Loras (Low-Rank Adaptations) allow you to customize video generation models by adding specific styles, characters, or effects to your videos.
+
+## Directory Structure
+
+Loras are organized in different folders based on the model they're designed for:
+
+### Wan Text-to-Video Models
+- `loras/` - General t2v loras for Wan 2.1 (t2v only) and for all Wan 2.2 models
+Optional subfolders:
+- `loras/1.3B/` - Loras specifically for 1.3B models
+- `loras/5B/` - Loras specifically for 5B models
+- `loras/14B/` - Loras specifically for 14B models
+
+### Wan Image-to-Video Models
+- `loras_i2v/` - Image-to-video loras for Wan 2.1
+
+### Other Models
+- `loras_hunyuan/` - Hunyuan Video t2v loras
+- `loras_hunyuan_i2v/` - Hunyuan Video i2v loras
+- `loras_ltxv/` - LTX Video loras
+- `loras_flux/` - Flux loras
+- `loras_qwen/` - Qwen loras
+
+## Custom Lora Directory
+
+You can specify custom lora directories when launching the app:
+
+```bash
+# Use a shared lora directory for both t2v and i2v
+python wgp.py --lora-dir /path/to/shared/loras --lora-dir-i2v /path/to/shared/loras
+
+# Specify different directories for different models
+python wgp.py --lora-dir-hunyuan /path/to/hunyuan/loras --lora-dir-ltxv /path/to/ltx/loras
+```
+
+## Using Loras
+
+### Basic Usage
+
+1. Place your lora files in the appropriate directory
+2. Launch WanGP
+3. In the Advanced Tab, select the "Loras" section
+4. Check the loras you want to activate
+5. Set multipliers for each lora (the default is 1.0 if no multiplier is mentioned)
+
+If you add loras to the loras folder after WanGP has been launched, click the *Refresh* button at the top so that they become selectable.
+
+### Autodownload of Loras
+WanGP will try to remember where a Lora was obtained and will store the corresponding Download URL in the Generation settings that are embedded in the Generated Video. This is useful to share this information or to easily recover lost loras after a reinstall.
+
+This works very well if the Loras are stored in repositories such as *Hugging Face*, but it won't work for the moment for Loras that require a login (like *CivitAi*) to be downloaded.
+
+WanGP will update its internal URL Lora Cache whenever one of these events occurs:
+- when applying or importing an *Accelerator Profile*, *Settings* or *Lset* file that contains Loras with full URLs (not just local paths)
+- when extracting the settings of a Video that was generated with Loras and that contained the full Lora URLs
+- when manually downloading a Lora using the *Download Lora* button at the bottom
+
+So the more you use WanGP, the more the URL cache file will be updated. The file is *loras_url_cache.json* and is located in the root folder of WanGP.
+
+You can delete this file without any risk if needed, or share it with friends to save them time locating the Loras. You will need to restart WanGP if you manually modify or delete this file.
+
+### Lora Multipliers
+
+Multipliers control the strength of each lora's effect:
+
+#### Simple Multipliers
+```
+1.2 0.8
+```
+- First lora: 1.2 strength
+- Second lora: 0.8 strength
+
+#### Time-based and Phase-based Multipliers
+For dynamic effects over generation steps, use comma-separated values:
+```
+0.9,0.8,0.7
+1.2,1.1,1.0
+```
+- For 30 steps: steps 0-9 use the first value, 10-19 the second, 20-29 the third
+- First lora: 0.9 → 0.8 → 0.7
+- Second lora: 1.2 → 1.1 → 1.0
+
+With models like Wan 2.2 that internally use two diffusion models (*High Noise* / *Low Noise*), you can specify which Loras you want to be applied for a specific phase by separating the phases with a ";".
+
+For instance, if you want to disable a lora for the *High Noise* phase and enable it only for the *Low Noise* phase:
+```
+0;1
+```
+
+Also with Wan 2.2, if you have two loras and you want the first one to be applied only during the High Noise phase and the second one during the Low Noise phase:
+```
+1;0 0;1
+```
+
+As usual, you can use any float for a multiplier and have a multiplier vary throughout one phase for one Lora:
+```
+0.9,0.8;1.2,1.1,1
+```
+In this example, multipliers 0.9 and 0.8 will be used during the *High Noise* phase and 1.2, 1.1 and 1 during the *Low Noise* phase.
+
+Here is another example for two loras:
+```
+0.9,0.8;1.2,1.1,1
+0.5;0,0.7
+```
+
+If one or several of your Lora multipliers are phase-based (that is, with a ";") and there are also Lora multipliers that are only time-based (no ";" but a ","), the time-only multipliers will ignore the phases. For instance, let's assume we have a 6-step denoising process in the following example:
+
+```
+1;0
+0;1
+0.8,0.7,0.5
+```
+Here the first lora will, as expected, only be used with the High Noise model and the second lora only with the Low Noise model. However, for the third lora: for steps 1-2 the multiplier will be (regardless of the phase) 0.8, then for steps 3-4 the multiplier will be 0.7 and finally for steps 5-6 the multiplier will be 0.5.
+
+You can use phased Lora multipliers even if you have a single model (that is, without any High / Low models) as Lora multiplier phases are aligned with Guidance phases. Let's assume you have defined 3 guidance phases (for instance guidance=3, then guidance=1.5 and finally guidance=1):
+```
+0;1;0
+0;0;1
+```
+In that case no lora will be applied during the first phase when guidance is 3. Then the first lora will only be used when guidance is 1.5 and the second lora only when guidance is 1.
+
+Best of all, you can combine 3 guidance phases with High / Low models. Let's take this practical example with the *Lightning 4/8 steps lora accelerators for Wan 2.2*, where we want to increase the motion by adding some guidance at the very beginning (in that case a first phase that lasts only 1 step should be sufficient):
+```
+Guidances: 3.5, 1 and 1
+Model transition: Phase 2-3
+Loras Multipliers: 0;1;0 0;0;1
+```
+Here, during the first phase with guidance 3.5, the High model will be used but there won't be any lora at all. Then during phase 2 only the High lora will be used (which requires setting the guidance to 1). Finally, in phase 3 WanGP will switch to the Low model and then only the Low lora will be used.
+
+*Note that the syntax for multipliers can also be used in a Finetune model definition file (except that each multiplier definition is a string in a json list)*
+
+## Lora Presets (.lset file)
+
+Lora Presets contain all the information needed to use a Lora or a combination of Loras:
+- The full download URLs of the Loras
+- Default Lora multipliers
+- A sample prompt to use the Loras with their corresponding *Trigger Words* (usually as comments)
+Optionally, they may contain advanced prompts with macros to automatically generate prompts from keywords.
+
+A Lora Preset is a text file of only a few kilobytes and can be easily shared between users. Don't hesitate to use this format if you have created a Lora.
+
+### Creating Presets
+1. Configure your loras and multipliers
+2. Write a prompt with comment lines starting with # that contain instructions
+3. Save as a preset with the `.lset` extension by clicking the *Save* button at the top, selecting *Save Only Loras & Full Prompt* and finally clicking *Go Ahead Save it!*
+
+### Example Lora Preset Prompt
+```
+# Use the keyword "ohnvx" to trigger the lora
+A ohnvx character is driving a car through the city
+```
+
+Using a macro (check the doc below), the user just has to enter two words and the prompt will be generated for them:
+```
+! {Person}="man" : {Object}="car"
+This {Person} is cleaning his {Object}.
+```
+
+
+### Managing Loras Presets (.lset Files)
+- Edit, save, or delete presets directly from the web interface
+- Presets include comments with usage instructions
+- Share `.lset` files with other users (make sure the full Lora URLs are in them)
+
+A *.lset* file may contain only local paths to the Loras if WanGP doesn't know where you got them. You can edit the .lset file with a text editor and replace the local path with its URL. If you store your Lora on Hugging Face, you can easily obtain its URL by selecting the file and clicking *Copy Download Link*.
+
+To share a *.lset* file you will need (for the moment) to grab it directly from the Lora folder where it is stored.
+
+## Supported Formats
+
+WanGP supports most common lora formats:
+- **Safetensors** (.safetensors)
+- **Replicate** format
+- ...
+
+
+## Loras Accelerators
+Most Loras are used to apply a specific style or to alter the content of the generated video.
+However, some Loras have been designed to transform a model into a distilled model, which requires fewer steps to generate a video.
+Lora accelerators usually require setting the Guidance to 1. Don't forget to do it: otherwise not only will the quality of the generated video be bad, but generation will also be two times slower.
+
+You will find most *Loras Accelerators* below:
+- Wan 2.1
+https://huggingface.co/DeepBeepMeep/Wan2.1/tree/main/loras_accelerators
+- Wan 2.2
+https://huggingface.co/DeepBeepMeep/Wan2.2/tree/main/loras_accelerators
+- Qwen:
+https://huggingface.co/DeepBeepMeep/Qwen_image/tree/main/loras_accelerators
+
+
+### Setup Instructions
+There are three ways to set up Lora accelerators:
+1) **Finetune with Embedded Loras Accelerators**
+Some model finetunes such as *Vace FusioniX* or *Vace Cocktail* have the Lora accelerators already set up in their own definition, and you won't have to do anything as they will be downloaded with the finetune.
+
+2) **Accelerators Profiles**
+Predefined *Accelerator Profiles* can be selected using the *Settings* dropdown box at the top. The choice of accelerators depends on the model. No accelerator will be offered if the finetune / model is already accelerated. Just click *Apply* and the accelerator Loras will be set up in the Loras tab at the bottom. Any missing Lora will be downloaded automatically the first time you try to generate a video. Be aware that when applying an *Accelerator Profile*, inputs such as *Activated Loras*, *Number of Inference Steps*, ... will be updated. However, if you already have Loras set up (that are not Lora accelerators) they will be preserved so that you can easily switch between Accelerator Profiles.
+
+You will see the "|" character at the end of the Multipliers text input associated with Lora accelerators. It plays the same role as the space character to separate multipliers, except it tells WanGP where the Lora accelerator multipliers end so that it can merge Lora accelerators with non-accelerator Loras.
+
+3) **Manual Install**
+- Download the Lora
+- Place it in the Lora directory of the corresponding model
+- Configure the Lora multipliers and CFG as described in the sections below
+
+## FusioniX (or FusionX) Lora for Wan 2.1 / Wan 2.2
+If you need just one Lora accelerator, use this one. It is a combination of multiple Lora accelerators (including CausVid below) and style loras. It will not only accelerate the video generation but will also improve the quality. There are two versions of this lora, depending on whether you use it for t2v or i2v.
+
+### Usage
+1. Select a Wan t2v model (e.g., Wan 2.1 text2video 14B or Vace 14B)
+2. Enable Advanced Mode
+3. In the Advanced Generation Tab:
+   - Set Guidance Scale = 1
+   - Set Shift Scale = 2
+4. In the Advanced Lora Tab:
+   - Select the FusioniX Lora
+   - Set multiplier to 1
+5. Set generation steps to 8-10
+6. Generate!
+
+## Self-Forcing lightx2v Lora (Video Generation Accelerator) for Wan 2.1 / Wan 2.2
+The Self-Forcing lora has been created by Kijai from the Self-Forcing lightx2v distilled Wan model. It can generate videos with only 2 steps and also offers a 2x speed improvement since it doesn't require classifier-free guidance. It works on both t2v and i2v models.
+You will find it under the name *Wan21_T2V_14B_lightx2v_cfg_step_distill_lora_rank32.safetensors*
+
+### Usage
+1. Select a Wan t2v or i2v model (e.g., Wan 2.1 text2video 14B or Vace 14B)
+2. Enable Advanced Mode
+3. In the Advanced Generation Tab:
+   - Set Guidance Scale = 1
+   - Set Shift Scale = 5
+4. In the Advanced Lora Tab:
+   - Select the Lora above
+   - Set multiplier to 1
+5. Set generation steps to 2-8
+6. Generate!
+
+
+## CausVid Lora (Video Generation Accelerator) for Wan 2.1 / Wan 2.2
+CausVid is a distilled Wan model that generates videos in 4-12 steps with a 2x speed improvement.
+
+### Usage
+1. Select a Wan t2v model (e.g., Wan 2.1 text2video 14B or Vace 14B)
+2. Enable Advanced Mode
+3. In the Advanced Generation Tab:
+   - Set Guidance Scale = 1
+   - Set Shift Scale = 7
+4. In the Advanced Lora Tab:
+   - Select the CausVid Lora
+   - Set multiplier to 0.3
+5. Set generation steps to 12
+6. Generate!
+
+### CausVid Step/Multiplier Relationship
+- **12 steps**: 0.3 multiplier (recommended)
+- **8 steps**: 0.5-0.7 multiplier
+- **4 steps**: 0.8-1.0 multiplier
+
+*Note: Fewer steps = lower quality (especially motion)*
+
+
+## AccVid Lora (Video Generation Accelerator) for Wan 2.1 / Wan 2.2
+
+AccVid is a distilled Wan model that generates videos with a 2x speed improvement since classifier-free guidance is no longer needed (that is, cfg = 1).
+
+### Usage
+1. Select a Wan t2v model (e.g., Wan 2.1 text2video 14B or Vace 14B) or a Wan i2v model
+2. Enable Advanced Mode
+3. In the Advanced Generation Tab:
+   - Set Guidance Scale = 1
+   - Set Shift Scale = 5
+4. The number of steps remains unchanged compared to what you would use with the original model, but generation will be two times faster since classifier-free guidance is not needed
+
+## Lightx2v 4 steps Lora (Video Generation Accelerator) for Wan 2.2
+This lora is in fact composed of two loras, one for the High model and one for the Low Wan 2.2 model.
+
+You need to select these two loras and set the following Lora multipliers:
+
+```
+1;0 0;1 (the High lora should only be enabled when the High model is loaded, same for the Low lora)
+```
+
+Don't forget to set guidance to 1!
+
+## Qwen Image Lightning 4 steps / Lightning 8 steps
+A very powerful lora that you can use to reduce the number of steps from 30 to only 4!
+Just install the lora in the *loras_qwen* folder, select the lora and set Guidance to 1 and the number of steps to 4 or 8.
+
+
+
+https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_T2V_14B_lightx2v_cfg_step_distill_lora_rank32.safetensors
+
+## Performance Tips
+
+### Fast Loading/Unloading
+- Loras can be added/removed without restarting the app
+- Use the "Refresh" button to detect new loras
+- Enable `--check-loras` to filter incompatible loras (slower startup)
+
+### Memory Management
+- Loras are loaded on-demand to save VRAM
+- Multiple loras can be used simultaneously
+- Time-based multipliers don't use extra memory
+- The order of Loras doesn't matter (as long as the lora multipliers are in the right order, of course!)
+
+## Finding Loras
+
+### Sources
+- **[Civitai](https://civitai.com/)** - Large community collection
+- **HuggingFace** - Official and community loras
+- **Discord Server** - Community recommendations
+
+### Creating Loras
+- **Kohya** - Popular training tool
+- **OneTrainer** - Alternative training solution
+- **Custom datasets** - Train on your own content
+
+## Macro System (Advanced)
+
+Create multiple prompts from templates using macros. This allows you to generate variations of a sentence by defining lists of values for different variables.
+
+**Syntax Rule:**
+
+Define your variables on a single line starting with `!`. Each complete variable definition, including its name and values, **must be separated by a colon (`:`)**.
+
+**Format:**
+
+```
+! {Variable1}="valueA","valueB" : {Variable2}="valueC","valueD"
+This is a template using {Variable1} and {Variable2}.
+```
+
+**Example:**
+
+The following macro will generate three distinct prompts by cycling through the values for each variable.
+
+**Macro Definition:**
+
+```
+! {Subject}="cat","woman","man" : {Location}="forest","lake","city" : {Possessive}="its","her","his"
{Subject}="cat","woman","man" : {Location}="forest","lake","city" : {Possessive}="its","her","his" +In the video, a {Subject} is presented. The {Subject} is in a {Location} and looks at {Possessive} watch. +``` + +**Generated Output:** + +``` +In the video, a cat is presented. The cat is in a forest and looks at its watch. +In the video, a woman is presented. The woman is in a lake and looks at her watch. +In the video, a man is presented. The man is in a city and looks at his watch. +``` + + +## Troubleshooting + +### Lora Not Working +0. If it is a lora accelerator, Guidance should be set to 1 +1. Check if lora is compatible with your model size (1.3B vs 14B) +2. Verify lora format is supported +3. Try different multiplier values +4. Check the lora was trained for your model type (t2v vs i2v) + +### Performance Issues +1. Reduce number of active loras +2. Lower multiplier values +3. Use `--check-loras` to filter incompatible files +4. Clear lora cache if issues persist + +### Memory Errors +1. Use fewer loras simultaneously +2. Reduce model size (use 1.3B instead of 14B) +3. Lower video resolution or frame count +4. Enable quantization if not already active + +## Command Line Options + +```bash +# Lora-related command line options +--lora-dir path # Path to t2v loras directory +--lora-dir-i2v path # Path to i2v loras directory +--lora-dir-hunyuan path # Path to Hunyuan t2v loras +--lora-dir-hunyuan-i2v path # Path to Hunyuan i2v loras +--lora-dir-ltxv path # Path to LTX Video loras +--lora-dir-flux path # Path to Flux loras +--lora-dir-qwen path # Path to Qwen loras +--lora-preset preset # Load preset on startup +--check-loras # Filter incompatible loras +``` \ No newline at end of file diff --git a/docs/MODELS.md b/docs/MODELS.md new file mode 100644 index 0000000000000000000000000000000000000000..720cb7398b8acc87d54f4849fd729023e3b092ca --- /dev/null +++ b/docs/MODELS.md @@ -0,0 +1,267 @@ +# Models Overview + +WanGP supports multiple video generation models, each optimized for different use cases and hardware configurations. + +Most models can combined with Loras Accelerators (check the Lora guide) to accelerate the generation of a video x2 or x3 with little quality loss + + +## Wan 2.1 Text2Video Models +Please note that that the term *Text2Video* refers to the underlying Wan architecture but as it has been greatly improved overtime many derived Text2Video models can now generate videos using images. 
+
+#### Wan 2.1 Text2Video 1.3B
+- **Size**: 1.3 billion parameters
+- **VRAM**: 6GB minimum
+- **Speed**: Fast generation
+- **Quality**: Good quality for the size
+- **Best for**: Quick iterations, lower-end hardware
+- **Command**: `python wgp.py --t2v-1-3B`
+
+#### Wan 2.1 Text2Video 14B
+- **Size**: 14 billion parameters
+- **VRAM**: 12GB+ recommended
+- **Speed**: Slower but higher quality
+- **Quality**: Excellent detail and coherence
+- **Best for**: Final production videos
+- **Command**: `python wgp.py --t2v-14B`
+
+#### Wan Vace 1.3B
+- **Type**: ControlNet for advanced video control
+- **VRAM**: 6GB minimum
+- **Features**: Motion transfer, object injection, inpainting
+- **Best for**: Advanced video manipulation
+- **Command**: `python wgp.py --vace-1-3B`
+
+#### Wan Vace 14B
+- **Type**: Large ControlNet model
+- **VRAM**: 12GB+ recommended
+- **Features**: All Vace features with higher quality
+- **Best for**: Professional video editing workflows
+
+#### MoviiGen (Experimental)
+- **Resolution**: Claims 1080p capability
+- **VRAM**: 20GB+ required
+- **Speed**: Very slow generation
+- **Features**: Should generate cinema-like video, specialized for 2.1:1 aspect ratios
+- **Status**: Experimental, feedback welcome
+
+| " + prompt + "" + thumbnails + " |