Upload folder using huggingface_hub
- .gitattributes +8 -0
- README.md +51 -0
- README_from_modelscope.md +82 -0
- assets/image.jpg +3 -0
- assets/image_1_full.jpg +3 -0
- assets/image_1_original.jpg +0 -0
- assets/image_1_ours.jpg +3 -0
- assets/image_2_full.jpg +3 -0
- assets/image_2_original.jpg +0 -0
- assets/image_2_ours.jpg +3 -0
- assets/image_3_full.jpg +3 -0
- assets/image_3_original.jpg +3 -0
- assets/image_3_ours.jpg +3 -0
- assets/prompts.txt +4 -0
- assets/title.jpg +0 -0
- config.json +18 -0
- configuration.json +1 -0
- diffusion_pytorch_model-00001-of-00009.safetensors +3 -0
- diffusion_pytorch_model-00002-of-00009.safetensors +3 -0
- diffusion_pytorch_model-00003-of-00009.safetensors +3 -0
- diffusion_pytorch_model-00004-of-00009.safetensors +3 -0
- diffusion_pytorch_model-00005-of-00009.safetensors +3 -0
- diffusion_pytorch_model-00006-of-00009.safetensors +3 -0
- diffusion_pytorch_model-00007-of-00009.safetensors +3 -0
- diffusion_pytorch_model-00008-of-00009.safetensors +3 -0
- diffusion_pytorch_model-00009-of-00009.safetensors +3 -0
- diffusion_pytorch_model.safetensors.index.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/image.jpg filter=lfs diff=lfs merge=lfs -text
+assets/image_1_full.jpg filter=lfs diff=lfs merge=lfs -text
+assets/image_1_ours.jpg filter=lfs diff=lfs merge=lfs -text
+assets/image_2_full.jpg filter=lfs diff=lfs merge=lfs -text
+assets/image_2_ours.jpg filter=lfs diff=lfs merge=lfs -text
+assets/image_3_full.jpg filter=lfs diff=lfs merge=lfs -text
+assets/image_3_original.jpg filter=lfs diff=lfs merge=lfs -text
+assets/image_3_ours.jpg filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,51 @@
---
license: apache-2.0
---
# Qwen-Image Full Distillation Accelerated Model

![](./assets/title.jpg)

## Model Introduction

This model is a distilled and accelerated version of [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image). The original model requires 40 inference steps with classifier-free guidance (CFG) enabled, for a total of 80 forward passes. The distilled model needs only 15 inference steps without CFG, i.e. just 15 forward passes, **achieving roughly a 5x speedup**. The number of inference steps can be reduced further if needed, at some cost in generation quality.

The training framework is built on [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio). The training data consists of 16,000 images generated by the original model from prompts randomly sampled from [DiffusionDB](https://www.modelscope.cn/datasets/AI-ModelScope/diffusiondb). Training ran on 8 * MI308X GPUs and took roughly one day.

## Performance Comparison

||Original Model (40 steps)|Original Model (15 steps)|Accelerated Model|
|-|-|-|-|
|Inference Steps|40|15|15|
|CFG Scale|4|1|1|
|Forward Passes|80|15|15|
|Example 1|![](./assets/image_1_full.jpg)|![](./assets/image_1_original.jpg)|![](./assets/image_1_ours.jpg)|
|Example 2|![](./assets/image_2_full.jpg)|![](./assets/image_2_original.jpg)|![](./assets/image_2_ours.jpg)|
|Example 3|![](./assets/image_3_full.jpg)|![](./assets/image_3_original.jpg)|![](./assets/image_3_ours.jpg)|

## Inference Code

```shell
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

```python
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
import torch


pipe = QwenImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Distill-Full", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
)
prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。"
image = pipe(prompt, seed=0, num_inference_steps=15, cfg_scale=1)
image.save("image.jpg")
```
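The model card notes that the step count can be pushed below 15 at some further cost in quality. The following is a minimal sketch of how one might probe that trade-off, reusing the `pipe` and `prompt` objects from the example above; the chosen step values and the timing are illustrative only and are not part of this repository:

```python
# Sketch: vary the step count of the distilled model to see the speed/quality trade-off.
# Assumes `pipe` and `prompt` were created exactly as in the README example above.
import time

for steps in (15, 10, 8):  # fewer steps run faster but lose some quality
    start = time.perf_counter()
    image = pipe(prompt, seed=0, num_inference_steps=steps, cfg_scale=1)
    image.save(f"image_{steps}_steps.jpg")
    print(f"{steps} steps took {time.perf_counter() - start:.1f}s")
```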
README_from_modelscope.md
ADDED
@@ -0,0 +1,82 @@
---
frameworks:
- Pytorch
license: Apache License 2.0
tasks:
- text-to-image-synthesis

#model-type:
## e.g. gpt, phi, llama, chatglm, baichuan, etc.
#- gpt

#domain:
## e.g. nlp, cv, audio, multi-modal
#- nlp

#language:
## language code list: https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
#- cn

#metrics:
## e.g. CIDEr, BLEU, ROUGE, etc.
#- CIDEr

#tags:
## custom tags such as pretrained, fine-tuned, instruction-tuned, RL-tuned, and other training methods
#- pretrained

#tools:
## e.g. vllm, fastchat, llamacpp, AdaSeq, etc.
#- vllm
base_model_relation: finetune
base_model:
- Qwen/Qwen-Image
---
# Qwen-Image Full Distillation Accelerated Model

![](./assets/title.jpg)

## Model Introduction

This model is a distilled and accelerated version of [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image). The original model requires 40 inference steps with classifier-free guidance (CFG) enabled, for a total of 80 forward passes. The distilled accelerated model needs only 15 inference steps without CFG, for a total of 15 forward passes, **achieving roughly a 5x speedup**. The number of inference steps can be reduced further if needed, at some cost in generation quality.

The training framework is built on [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio). The training data consists of 16,000 images generated by the original model from prompts randomly sampled from [DiffusionDB](https://www.modelscope.cn/datasets/AI-ModelScope/diffusiondb), and training ran on 8 * MI308X GPUs for about one day.

## Results

||Original Model (40 steps)|Original Model (15 steps)|Accelerated Model|
|-|-|-|-|
|Inference Steps|40|15|15|
|CFG scale|4|1|1|
|Forward Passes|80|15|15|
|Example 1|![](./assets/image_1_full.jpg)|![](./assets/image_1_original.jpg)|![](./assets/image_1_ours.jpg)|
|Example 2|![](./assets/image_2_full.jpg)|![](./assets/image_2_original.jpg)|![](./assets/image_2_ours.jpg)|
|Example 3|![](./assets/image_3_full.jpg)|![](./assets/image_3_original.jpg)|![](./assets/image_3_ours.jpg)|

## Inference Code

```shell
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

```python
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
import torch


pipe = QwenImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Distill-Full", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
)
prompt = "精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。"
image = pipe(prompt, seed=0, num_inference_steps=15, cfg_scale=1)
image.save("image.jpg")
```
assets/image.jpg
ADDED (Git LFS)
assets/image_1_full.jpg
ADDED (Git LFS)
assets/image_1_original.jpg
ADDED
assets/image_1_ours.jpg
ADDED (Git LFS)
assets/image_2_full.jpg
ADDED (Git LFS)
assets/image_2_original.jpg
ADDED
assets/image_2_ours.jpg
ADDED (Git LFS)
assets/image_3_full.jpg
ADDED (Git LFS)
assets/image_3_original.jpg
ADDED (Git LFS)
assets/image_3_ours.jpg
ADDED (Git LFS)
assets/prompts.txt
ADDED
@@ -0,0 +1,4 @@
动漫风格,一个漂亮的少女在教室里,身后右边的黑板上写着“Qwen-Image-Distill 更快速的生图”以及“DiffSynth-Studio Team”
精致肖像,水下少女,蓝裙飘逸,发丝轻扬,光影透澈,气泡环绕,面容恬静,细节精致,梦幻唯美。
唯美动漫画面,一位二次元美少女,坐在公园的长椅上,落日的霞光洒在少女脸上,少女露出动人的微笑,整体色调为橙色
绿意盎然的森林间,皮克斯风2.5D渲染,一辆小车悠然驶过辽阔草原,光影柔和,画面温暖梦幻。
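These four prompts are the ones behind the demo images in the model cards. A small sketch, assuming the `pipe` object is constructed exactly as in README.md, that regenerates one image per prompt with the distilled settings (15 steps, no CFG); the output file names are arbitrary:

```python
# Sketch: render one image per prompt in assets/prompts.txt with the distilled model.
# Assumes `pipe` was constructed as in the README inference example.
with open("assets/prompts.txt", encoding="utf-8") as f:
    prompts = [line.strip() for line in f if line.strip()]

for i, prompt in enumerate(prompts, start=1):
    image = pipe(prompt, seed=0, num_inference_steps=15, cfg_scale=1)
    image.save(f"prompt_{i}.jpg")  # arbitrary output names
```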
assets/title.jpg
ADDED
config.json
ADDED
@@ -0,0 +1,18 @@
{
  "_class_name": "QwenImageTransformer2DModel",
  "_diffusers_version": "0.34.0.dev0",
  "attention_head_dim": 128,
  "axes_dims_rope": [
    16,
    56,
    56
  ],
  "guidance_embeds": false,
  "in_channels": 64,
  "joint_attention_dim": 3584,
  "num_attention_heads": 24,
  "num_layers": 60,
  "out_channels": 16,
  "patch_size": 2,
  "pooled_projection_dim": 768
}
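A few architecture figures can be read straight out of this config. The sketch below only parses the JSON with the standard library and does not depend on diffusers; the hidden-dimension product follows the usual diffusers convention (heads times head dim) and is stated here as an assumption:

```python
# Sketch: derive basic transformer dimensions from config.json (standard library only).
import json

with open("config.json") as f:
    cfg = json.load(f)

# Assumption: inner width = attention_head_dim * num_attention_heads (diffusers convention).
hidden_dim = cfg["attention_head_dim"] * cfg["num_attention_heads"]  # 128 * 24 = 3072
print("class:     ", cfg["_class_name"])     # QwenImageTransformer2DModel
print("layers:    ", cfg["num_layers"])      # 60
print("hidden dim:", hidden_dim)
print("RoPE axes: ", cfg["axes_dims_rope"])  # [16, 56, 56]
```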
configuration.json
ADDED
@@ -0,0 +1 @@
{"framework":"Pytorch","task":"text-to-image-synthesis"}
diffusion_pytorch_model-00001-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:34ef1f7afa6de7430d6a8c500dfa43a2d8deaf0c283a6a9d0c6242fe1fda722d
size 4989364288
diffusion_pytorch_model-00002-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:367bec7efd4e2dec1a921dc1fadf4ac10669e636ad120b2c8c616ed57aae37ce
size 4984214128
diffusion_pytorch_model-00003-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:089fba35c38d09962c8f17ea2142d31cffc131b5b62e3be014c82e04c34b6130
size 4946469968
diffusion_pytorch_model-00004-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e71c5ae8ee05df492bd30306cafb568e635fc93a039458c8abae11faa52dc691
size 4984213704
diffusion_pytorch_model-00005-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:219d73821abb460065d8d4643b9945e7ab0df421565149961c7d52f08f4e8347
size 4946471864
diffusion_pytorch_model-00006-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2a4fab29389d9a59bffcdc3d73c31d344093ebd0e197eb320e55c14804903edd
size 4946451528
diffusion_pytorch_model-00007-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a9f8c442fb03a304adb32b44cd3cc2d6a5d3248885b057f1c7d74ac3c878abe7
size 4908690488
diffusion_pytorch_model-00008-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5256cd950aa19c14e654cf4f90688d34227b677bea90a0718e37ee754fbf36d7
size 4984232824
diffusion_pytorch_model-00009-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e6b0da40b5e0a48a5045fdd7d2148baa1ce4068249a8011f35c5b4246c87c467
size 1170918816
diffusion_pytorch_model.safetensors.index.json
ADDED
The diff for this file is too large to render.
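The nine shard files above are committed as Git LFS pointers, so a downloaded copy can be checked against the listed `oid`/`size` pairs. A minimal sketch using only the standard library; only the first shard is spelled out, and the remaining entries follow the same pattern from the pointers above:

```python
# Sketch: verify downloaded shards against the Git LFS pointers in this commit.
import hashlib
import os

shards = {
    "diffusion_pytorch_model-00001-of-00009.safetensors": (
        "34ef1f7afa6de7430d6a8c500dfa43a2d8deaf0c283a6a9d0c6242fe1fda722d",
        4989364288,
    ),
    # ... add the remaining eight shards with their oid/size from the pointers above
}

for name, (oid, size) in shards.items():
    assert os.path.getsize(name) == size, f"size mismatch: {name}"
    sha = hashlib.sha256()
    with open(name, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    assert sha.hexdigest() == oid, f"sha256 mismatch: {name}"
    print(name, "OK")
```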