Borys Tymchenko committed on Nov 28, 2023

Commit

ae2e28c

1 Parent(s): 1545cf6

Initial commit

Files changed (19) hide show

.gitignore +192 -0
feature_extractor/preprocessor_config.json +28 -0
flexible_unet/config.json +125 -0
flexible_unet/diffusion_pytorch_model.safetensors +3 -0
model_index.json +34 -0
pipeline.py +1010 -0
safety_checker/config.json +168 -0
safety_checker/model.safetensors +3 -0
scheduler/scheduler_config.json +19 -0
text_encoder/config.json +25 -0
text_encoder/model.safetensors +3 -0
tokenizer/merges.txt +0 -0
tokenizer/special_tokens_map.json +24 -0
tokenizer/tokenizer_config.json +33 -0
tokenizer/vocab.json +0 -0
unet/config.json +68 -0
unet/diffusion_pytorch_model.safetensors +3 -0
vae/config.json +32 -0
vae/diffusion_pytorch_model.safetensors +3 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,192 @@

+# Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python
+# Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python
+### Linux ###
+*~
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+# KDE directory preferences
+.directory
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+*/build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+documentation/_build/
+documentation/build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+.idea
+docs/.doctrees
+# End of https://www.toptal.com/developers/gitignore/api/linux,macos,python

feature_extractor/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

flexible_unet/config.json ADDED Viewed

	@@ -0,0 +1,125 @@

+{
+  "_class_name": "FlexibleUNet2DConditionModel",
+  "_diffusers_version": "0.23.0",
+  "_name_or_path": "/home/borys.tymchenko/qcomdiffusion/checkpoint-286000-2050048000/pipeline/unet",
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "configurations": {
+    "add_downsample": [
+      true,
+      true,
+      false
+    ],
+    "add_upsample": [
+      true,
+      true,
+      false
+    ],
+    "add_upsample_mid_block": null,
+    "cross_attention_dim": 768,
+    "down_blocks_in_channels": [
+      320,
+      320,
+      640
+    ],
+    "down_blocks_num_attentions": [
+      0,
+      1,
+      3
+    ],
+    "down_blocks_num_resnets": [
+      2,
+      2,
+      1
+    ],
+    "down_blocks_out_channels": [
+      320,
+      640,
+      1280
+    ],
+    "mid_num_attentions": 0,
+    "mid_num_resnets": 0,
+    "mix_block_in_forward": true,
+    "num_attention_heads": 8,
+    "prev_output_channels": [
+      1280,
+      1280,
+      640
+    ],
+    "resnet_act_fn": "silu",
+    "resnet_eps": 1e-05,
+    "sample_size": 64,
+    "temb_dim": 1280,
+    "up_blocks_num_attentions": [
+      5,
+      3,
+      0
+    ],
+    "up_blocks_num_resnets": [
+      2,
+      3,
+      3
+    ]
+  },
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 768,
+  "cross_attention_norm": null,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": false
+}

flexible_unet/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:337322d55ebf3ad224f25121b3ab439e3406f5517bdb61b252d1d2aaea06024d
+size 2101170216

model_index.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "_class_name": "DeciDiffusionPipeline",
+  "_diffusers_version": "0.21.4",
+  "_name_or_path": "Deci/DeciDiffusion-v2-0",
+  "feature_extractor": [
+    "transformers",
+    "CLIPImageProcessor"
+  ],
+  "requires_safety_checker": true,
+  "safety_checker": [
+    "stable_diffusion",
+    "StableDiffusionSafetyChecker"
+  ],
+  "scheduler": [
+    "diffusers",
+    "DDIMScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}

pipeline.py ADDED Viewed

	@@ -0,0 +1,1010 @@

+import itertools
+from functools import partial
+from typing import Any, Dict, Tuple, Callable
+from typing import Union, Optional, List
+import numpy as np
+import torch
+from diffusers import DPMSolverMultistepScheduler
+from diffusers import StableDiffusionPipeline, AutoencoderKL
+from diffusers import Transformer2DModel, ModelMixin, ConfigMixin
+from diffusers import UNet2DConditionModel
+from diffusers.configuration_utils import register_to_config
+from diffusers.models.attention import BasicTransformerBlock
+from diffusers.models.resnet import ResnetBlock2D, Downsample2D, Upsample2D
+from diffusers.models.transformer_2d import Transformer2DModelOutput
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker, StableDiffusionPipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import replace_example_docstring
+from torch import nn
+from transformers import CLIPTextModel, CLIPTokenizer, CLIPImageProcessor
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Blend the guided prediction `noise_cfg` with a copy of itself whose per-sample standard deviation has been
+    matched to that of `noise_pred_text`; `guidance_rescale` is the weight given to the matched copy.
+    Based on findings of [Common Diffusion Noise Schedules and Sample Steps are
+    Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    def _per_sample_std(tensor):
+        # standard deviation over every axis except the leading one, kept broadcastable
+        return tensor.std(dim=list(range(1, tensor.ndim)), keepdim=True)
+    text_std = _per_sample_std(noise_pred_text)
+    cfg_std = _per_sample_std(noise_cfg)
+    # std-matched guidance result (fixes overexposure)
+    matched = noise_cfg * (text_std / cfg_std)
+    # keep a share of the untouched guidance result to avoid "plain looking" images
+    return guidance_rescale * matched + (1 - guidance_rescale) * noise_cfg
+def custom_sort_order(obj):
+    """
+    Key function for sorting order of execution in forward methods: resnet blocks get rank 0 and
+    transformer blocks get rank 1, so a sort with this key places resnets first.
+    :param obj: the module to rank; the lookup uses the exact class of `obj` (subclasses are not matched)
+    :return: the integer rank of `obj`
+    :raises TypeError: if the class of `obj` has no rank
+    """
+    ranks = {ResnetBlock2D: 0, Transformer2DModel: 1, FlexibleTransformer2DModel: 1}
+    rank = ranks.get(obj.__class__)
+    if rank is None:
+        # Returning None would make `sorted` fail later with an opaque "'<' not supported" error
+        # (or pass silently for a single-element list), so name the offending class right away.
+        raise TypeError(f"Cannot order module of type {obj.__class__.__name__}: no execution rank is defined for it")
+    return rank
+def squeeze_to_len_n_starting_from_index_i(n, i, timestep_spacing):
+    """
+    Shorten a timestep schedule to `n` entries: the first `i` entries are copied from `timestep_spacing` and the
+    remaining `n - i` entries are the multiples `(n - i) * k, ..., 2k, k` of a step `k` derived from the last kept entry.
+    :param timestep_spacing: the timestep_spacing array we want to squeeze (needs at least `i` entries)
+    :param n: the size of the squeezed array
+    :param i: the index we start squeezing from, must satisfy 1 <= i < n
+    :return: squeezed timestep_spacing (integer numpy array of length `n`)
+    :raises ValueError: if `i < 1` or `timestep_spacing` has fewer than `i` entries
+    Example:
+    timesteps = np.array([967, 907, 846, 786, 725, 665, 604, 544, 484, 423, 363, 302, 242, 181, 121, 60]) (len=16)
+    n = 10, i = 6
+    Expected:
+    [967, 907, 846, 786, 725, 665, 4k, 3k, 2k, k], and if we define 665=5k => k = 133
+    """
+    assert i < n
+    if i < 1:
+        # With i == 0 the anchor `squeezed[i - 1]` wraps around to the last element (which is 1), so k becomes 0
+        # and the whole schedule collapses to zeros; negative i mis-slices in the same way.
+        raise ValueError(f"`i` must be at least 1 so that a kept timestep can anchor the squeezed tail, got i={i}")
+    head = timestep_spacing[:i]
+    if len(head) < i:
+        raise ValueError(f"`timestep_spacing` must contain at least i={i} entries, got {len(head)}")
+    squeezed = np.flip(np.arange(n)) + 1  # [n, n-1, ..., 2, 1]
+    squeezed[:i] = head
+    # The last kept timestep is treated as (n - i + 1) * k, which fixes the step of the squeezed tail
+    k = squeezed[i - 1] // (n - i + 1)
+    squeezed[i:] *= k
+    return squeezed
+# Maps a squeeze mode "<n>,<i>" to `squeeze_to_len_n_starting_from_index_i` with `n` and `i` already bound,
+# so each value only needs the timesteps array.
+# Tested with DPM 16-steps (reduced 16 -> 10 or 11 steps)
+PREDEFINED_TIMESTEP_SQUEEZERS = {
+    f"{target_len},{start_index}": partial(squeeze_to_len_n_starting_from_index_i, target_len, start_index)
+    for target_len, start_index in ((10, 6), (11, 7))
+}
+# Architecture description read by `FlexibleUNet2DConditionModel.__init__` (through its `configurations` attribute).
+# The per-block lists are consumed with `zip`, so the k-th entry of every down-block list describes the k-th
+# down block, and likewise for the up-block lists.
+FlexibleUnetConfigurations = {
+    # General parameters for all blocks
+    "sample_size": 64,
+    "temb_dim": 320 * 4,
+    "resnet_eps": 1e-5,
+    "resnet_act_fn": "silu",
+    "num_attention_heads": 8,
+    "cross_attention_dim": 768,
+    # Controls modules execute order in unet's forward
+    # (when falsy, each block re-orders its modules with `custom_sort_order`)
+    "mix_block_in_forward": True,
+    # Down blocks parameters
+    "down_blocks_in_channels": [320, 320, 640],
+    "down_blocks_out_channels": [320, 640, 1280],
+    "down_blocks_num_attentions": [0, 1, 3],
+    "down_blocks_num_resnets": [2, 2, 1],
+    "add_downsample": [True, True, False],
+    # Middle block parameters
+    # (with both counts at 0 the model uses `FlexibleIdentityBlock` as its mid block)
+    "add_upsample_mid_block": None,
+    "mid_num_resnets": 0,
+    "mid_num_attentions": 0,
+    # Up block parameters
+    "prev_output_channels": [1280, 1280, 640],
+    "up_blocks_num_attentions": [5, 3, 0],
+    "up_blocks_num_resnets": [2, 3, 3],
+    "add_upsample": [True, True, False],
+}
+class SqueezedDPMSolverMultistepScheduler(DPMSolverMultistepScheduler):
+    """
+    A `DPMSolverMultistepScheduler` from Diffusers with minor differences:
+    * Defaults are modified to accommodate DeciDiffusion
+    * It supports a squeezer to squeeze the number of inference steps to a smaller number
+    IMPORTANT: when a squeezer is configured, the actual number of inference steps is deduced by the squeezer,
+    and not the pipeline!
+    """
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "squaredcos_cap_v2",  # NOTE THIS DEFAULT VALUE
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        solver_order: int = 2,
+        prediction_type: str = "v_prediction",  # NOTE THIS DEFAULT VALUE
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "dpmsolver++",
+        solver_type: str = "heun",  # NOTE THIS DEFAULT VALUE
+        lower_order_final: bool = True,
+        use_karras_sigmas: Optional[bool] = False,
+        lambda_min_clipped: float = -3.0,  # NOTE THIS DEFAULT VALUE
+        variance_type: Optional[str] = None,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 1,
+        squeeze_mode: Optional[str] = None,  # NOTE THIS ADDITION. Supports keys from `PREDEFINED_TIMESTEP_SQUEEZERS` defined above
+    ):
+        """
+        Forwards every argument except `squeeze_mode` to the parent scheduler (with `use_karras_sigmas` forced to False).
+        Args:
+            squeeze_mode (`str`, *optional*):
+                Key of `PREDEFINED_TIMESTEP_SQUEEZERS` selecting the squeezer applied in `set_timesteps`.
+                `None` disables squeezing.
+        Raises:
+            ValueError: if `squeeze_mode` is neither `None` nor a key of `PREDEFINED_TIMESTEP_SQUEEZERS`.
+            NotImplementedError: if `use_karras_sigmas` is truthy.
+        """
+        if squeeze_mode is not None and squeeze_mode not in PREDEFINED_TIMESTEP_SQUEEZERS:
+            # A typo in `squeeze_mode` used to fall back silently to the un-squeezed schedule
+            raise ValueError(
+                f"Unknown `squeeze_mode` {squeeze_mode!r}. Supported values: None, "
+                + ", ".join(repr(key) for key in PREDEFINED_TIMESTEP_SQUEEZERS)
+            )
+        self._squeezer = PREDEFINED_TIMESTEP_SQUEEZERS.get(squeeze_mode)
+        if use_karras_sigmas:
+            raise NotImplementedError("Squeezing isn't tested with `use_karras_sigmas`. Please provide `use_karras_sigmas=False`")
+        super().__init__(
+            num_train_timesteps=num_train_timesteps,
+            beta_start=beta_start,
+            beta_end=beta_end,
+            beta_schedule=beta_schedule,
+            trained_betas=trained_betas,
+            solver_order=solver_order,
+            prediction_type=prediction_type,
+            thresholding=thresholding,
+            dynamic_thresholding_ratio=dynamic_thresholding_ratio,
+            sample_max_value=sample_max_value,
+            algorithm_type=algorithm_type,
+            solver_type=solver_type,
+            lower_order_final=lower_order_final,
+            use_karras_sigmas=False,
+            lambda_min_clipped=lambda_min_clipped,
+            variance_type=variance_type,
+            timestep_spacing=timestep_spacing,
+            steps_offset=steps_offset,
+        )
+    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        When a squeezer is configured, the timesteps produced by the parent scheduler are passed through it,
+        the sigmas are re-interpolated at the squeezed timesteps, and `num_inference_steps` is overwritten with
+        the squeezed length.
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        super().set_timesteps(num_inference_steps=num_inference_steps, device=device)
+        if self._squeezer is not None:
+            timesteps = self._squeezer(self.timesteps.cpu())
+            # Sigma of every training timestep, then sampled at the squeezed timesteps
+            sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+            sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+            # Trailing sigma appended after the last step, computed from the first training timestep
+            sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
+            sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
+            self.sigmas = torch.from_numpy(sigmas)
+            self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
+            self.num_inference_steps = len(timesteps)
+class FlexibleIdentityBlock(nn.Module):
+    """
+    No-op block: `forward` returns `hidden_states` untouched.
+    `FlexibleUNet2DConditionModel` uses it as the mid block when both `mid_num_resnets` and `mid_num_attentions` are 0.
+    """
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ):
+        """
+        Return `hidden_states` as is; every other argument is accepted and ignored.
+        """
+        return hidden_states
+class FlexibleUNet2DConditionModel(UNet2DConditionModel, ModelMixin):
+    """
+    `UNet2DConditionModel` whose `down_blocks`, `mid_block` and `up_blocks` are rebuilt from the class-level
+    `configurations` dict (by default `FlexibleUnetConfigurations`).
+    """
+    # Class attribute read by `__init__`; the constructor itself takes no arguments
+    configurations = FlexibleUnetConfigurations
+    @register_to_config
+    def __init__(self):
+        """
+        Initialise the parent model with `sample_size` and `cross_attention_dim` from `configurations`, then
+        assign freshly built flexible down, mid and up blocks.
+        """
+        # NOTE(review): presumably the parent constructor builds its default blocks here and they are
+        # overwritten below — confirm against the installed diffusers version
+        super().__init__(
+            sample_size=self.configurations.get("sample_size", FlexibleUnetConfigurations["sample_size"]),
+            cross_attention_dim=self.configurations.get("cross_attention_dim", FlexibleUnetConfigurations["cross_attention_dim"]),
+        )
+        # Parameters shared by every block; `.get` yields None for a missing key
+        num_attention_heads = self.configurations.get("num_attention_heads")
+        cross_attention_dim = self.configurations.get("cross_attention_dim")
+        mix_block_in_forward = self.configurations.get("mix_block_in_forward")
+        resnet_act_fn = self.configurations.get("resnet_act_fn")
+        resnet_eps = self.configurations.get("resnet_eps")
+        temb_dim = self.configurations.get("temb_dim")
+        ###############
+        # Down blocks #
+        ###############
+        down_blocks_num_attentions = self.configurations.get("down_blocks_num_attentions")
+        down_blocks_out_channels = self.configurations.get("down_blocks_out_channels")
+        down_blocks_in_channels = self.configurations.get("down_blocks_in_channels")
+        down_blocks_num_resnets = self.configurations.get("down_blocks_num_resnets")
+        add_downsample = self.configurations.get("add_downsample")
+        self.down_blocks = nn.ModuleList()
+        # One down block per position of the per-block lists; `zip` stops at the shortest list
+        for i, (in_c, out_c, n_res, n_att, add_down) in enumerate(
+            zip(down_blocks_in_channels, down_blocks_out_channels, down_blocks_num_resnets, down_blocks_num_attentions, add_downsample)
+        ):
+            # The last down block does not add its downsampled output to the skip connections
+            last_block = i == len(down_blocks_in_channels) - 1
+            self.down_blocks.append(
+                FlexibleCrossAttnDownBlock2D(
+                    in_channels=in_c,
+                    out_channels=out_c,
+                    temb_channels=temb_dim,
+                    num_resnets=n_res,
+                    num_attentions=n_att,
+                    resnet_eps=resnet_eps,
+                    resnet_act_fn=resnet_act_fn,
+                    num_attention_heads=num_attention_heads,
+                    cross_attention_dim=cross_attention_dim,
+                    add_downsample=add_down,
+                    last_block=last_block,
+                    mix_block_in_forward=mix_block_in_forward,
+                )
+            )
+        ###############
+        # Mid blocks  #
+        ###############
+        mid_block_add_upsample = self.configurations.get("add_upsample_mid_block")
+        mid_num_attentions = self.configurations.get("mid_num_attentions")
+        mid_num_resnets = self.configurations.get("mid_num_resnets")
+        # With no resnets and no attentions configured, the mid block is a pass-through
+        if mid_num_resnets == mid_num_attentions == 0:
+            self.mid_block = FlexibleIdentityBlock()
+        else:
+            self.mid_block = FlexibleUNetMidBlock2DCrossAttn(
+                in_channels=down_blocks_out_channels[-1],
+                temb_channels=temb_dim,
+                resnet_act_fn=resnet_act_fn,
+                resnet_eps=resnet_eps,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads,
+                num_resnets=mid_num_resnets,
+                num_attentions=mid_num_attentions,
+                mix_block_in_forward=mix_block_in_forward,
+                add_upsample=mid_block_add_upsample,
+            )
+        ###############
+        #  Up blocks  #
+        ###############
+        up_blocks_num_attentions = self.configurations.get("up_blocks_num_attentions")
+        up_blocks_num_resnets = self.configurations.get("up_blocks_num_resnets")
+        prev_output_channels = self.configurations.get("prev_output_channels")
+        up_upsample = self.configurations.get("add_upsample")
+        self.up_blocks = nn.ModuleList()
+        # The up path walks the down-block channel lists in reverse order; `zip` stops at the shortest list
+        for in_c, out_c, prev_out, n_res, n_att, add_up in zip(
+            reversed(down_blocks_in_channels),
+            reversed(down_blocks_out_channels),
+            prev_output_channels,
+            up_blocks_num_resnets,
+            up_blocks_num_attentions,
+            up_upsample,
+        ):
+            self.up_blocks.append(
+                FlexibleCrossAttnUpBlock2D(
+                    in_channels=in_c,
+                    out_channels=out_c,
+                    prev_output_channel=prev_out,
+                    temb_channels=temb_dim,
+                    num_resnets=n_res,
+                    num_attentions=n_att,
+                    resnet_eps=resnet_eps,
+                    resnet_act_fn=resnet_act_fn,
+                    num_attention_heads=num_attention_heads,
+                    cross_attention_dim=cross_attention_dim,
+                    add_upsample=add_up,
+                    mix_block_in_forward=mix_block_in_forward,
+                )
+            )
+class FlexibleCrossAttnDownBlock2D(nn.Module):
+    """
+    Down block holding an ordered list of `ResnetBlock2D` and `FlexibleTransformer2DModel` modules, optionally
+    followed by one `Downsample2D`.
+    The k-th resnet and the k-th transformer are created in the same iteration, so they alternate while both
+    kinds remain; with a falsy `mix_block_in_forward` the list is re-ordered with `custom_sort_order`.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_resnets: int = 1,
+        num_attentions: int = 1,
+        transformer_layers_per_block: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        downsample_padding: int = 1,
+        add_downsample: bool = True,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        last_block: bool = False,
+        mix_block_in_forward: bool = True,
+    ):
+        super().__init__()
+        self.last_block = last_block
+        self.mix_block_in_forward = mix_block_in_forward
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        layers = []
+        # Position `idx` gets a resnet while idx < num_resnets and a transformer while idx < num_attentions
+        for idx in range(max(num_resnets, num_attentions)):
+            if idx < num_resnets:
+                # Only the very first position sees the block's input width
+                resnet_in_channels = in_channels if idx == 0 else out_channels
+                resnet = ResnetBlock2D(
+                    in_channels=resnet_in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+                layers.append(resnet)
+            if idx < num_attentions:
+                transformer = FlexibleTransformer2DModel(
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=out_channels // num_attention_heads,
+                    in_channels=out_channels,
+                    num_layers=transformer_layers_per_block,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+                layers.append(transformer)
+        if not mix_block_in_forward:
+            layers.sort(key=custom_sort_order)
+        self.modules_list = nn.ModuleList(layers)
+        self.downsamplers = (
+            nn.ModuleList([Downsample2D(out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op")])
+            if add_downsample
+            else None
+        )
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ):
+        """
+        Run the modules in list order and return `(hidden_states, output_states)`.
+        `output_states` collects the output of every resnet and, when the block downsamples and is not the
+        last block, the downsampled result as well.
+        """
+        skip_states = ()
+        for layer in self.modules_list:
+            if isinstance(layer, ResnetBlock2D):
+                hidden_states = layer(hidden_states, temb)
+                # Only resnet outputs are recorded as skip states
+                skip_states = skip_states + (hidden_states,)
+                continue
+            if not isinstance(layer, (FlexibleTransformer2DModel, Transformer2DModel)):
+                raise ValueError(f"Got an unexpected module in modules list! {type(layer)}")
+            transformer_out = layer(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                attention_mask=attention_mask,
+                encoder_attention_mask=encoder_attention_mask,
+                return_dict=False,
+            )
+            hidden_states = transformer_out[0]
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+            if not self.last_block:
+                skip_states = skip_states + (hidden_states,)
+        return hidden_states, skip_states
+class FlexibleCrossAttnUpBlock2D(nn.Module):
+    """
+    Up block holding an ordered list of `ResnetBlock2D` and `FlexibleTransformer2DModel` modules, optionally
+    followed by one `Upsample2D`.
+    The k-th resnet and the k-th transformer are created in the same iteration, so they alternate while both
+    kinds remain; with a falsy `mix_block_in_forward` the list is re-ordered with `custom_sort_order`.
+    In `forward`, every resnet consumes one skip connection taken from the end of `res_hidden_states_tuple`.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_resnets: int = 1,
+        num_attentions: int = 1,
+        transformer_layers_per_block: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        cross_attention_dim: int = 1280,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        mix_block_in_forward: bool = True,
+    ):
+        super().__init__()
+        modules = []
+        # WARNING: This parameter is filled with number of resnets and used within StableDiffusionPipeline
+        self.resnets = []
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        add_resnets = [True] * num_resnets
+        add_cross_attentions = [True] * num_attentions
+        for i, (add_resnet, add_cross_attention) in enumerate(itertools.zip_longest(add_resnets, add_cross_attentions, fillvalue=False)):
+            # The skip connection of the last resnet has `in_channels` channels, the others `out_channels`
+            res_skip_channels = in_channels if (i == len(add_resnets) - 1) else out_channels
+            # The first resnet receives the previous block's output, later ones this block's own width
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            if add_resnet:
+                self.resnets += [True]
+                modules.append(
+                    ResnetBlock2D(
+                        in_channels=resnet_in_channels + res_skip_channels,
+                        out_channels=out_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                    )
+                )
+            if add_cross_attention:
+                modules.append(
+                    FlexibleTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                    )
+                )
+        if not mix_block_in_forward:
+            modules = sorted(modules, key=custom_sort_order)
+        self.modules_list = nn.ModuleList(modules)
+        self.upsamplers = None
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ):
+        """
+        Run the modules in list order, then the upsampler if there is one, and return the resulting hidden states.
+        Raises:
+            ValueError: if `modules_list` contains a module that is neither a resnet nor a transformer.
+        """
+        for module in self.modules_list:
+            if isinstance(module, ResnetBlock2D):
+                # Pop the most recent skip connection and concatenate it along dim 1
+                res_hidden_states = res_hidden_states_tuple[-1]
+                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+                hidden_states = module(hidden_states, temb)
+            elif isinstance(module, (FlexibleTransformer2DModel, Transformer2DModel)):
+                hidden_states = module(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            else:
+                # Same guard as FlexibleCrossAttnDownBlock2D.forward: fail loudly instead of skipping the module
+                raise ValueError(f"Got an unexpected module in modules list! {type(module)}")
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+        return hidden_states
+class FlexibleUNetMidBlock2DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_resnets: int = 1,
+        num_attentions: int = 1,
+        transformer_layers_per_block: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads: int = 1,
+        output_scale_factor: float = 1.0,
+        cross_attention_dim: int = 1280,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        mix_block_in_forward: bool = True,
+        add_upsample: bool = True,
+    ):
+        super().__init__()
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        # There is always at least one resnet
+        modules = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        add_resnets = [True] * num_resnets
+        add_cross_attentions = [True] * num_attentions
+        for i, (add_resnet, add_cross_attention) in enumerate(itertools.zip_longest(add_resnets, add_cross_attentions, fillvalue=False)):
+            if add_cross_attention:
+                modules.append(
+                    FlexibleTransformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=transformer_layers_per_block,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                    )
+                )
+            if add_resnet:
+                modules.append(
+                    ResnetBlock2D(
+                        in_channels=in_channels,
+                        out_channels=in_channels,
+                        temb_channels=temb_channels,
+                        eps=resnet_eps,
+                        groups=resnet_groups,
+                        dropout=dropout,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=resnet_act_fn,
+                        output_scale_factor=output_scale_factor,
+                        pre_norm=resnet_pre_norm,
+                    )
+                )
+        if not mix_block_in_forward:
+            modules = sorted(modules, key=custom_sort_order)
+        self.modules_list = nn.ModuleList(modules)
+        self.upsamplers = nn.ModuleList([nn.Identity()])
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(in_channels, use_conv=True, out_channels=in_channels)])
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        hidden_states = self.modules_list[0](hidden_states, temb)
+        for module in self.modules_list:
+            if isinstance(module, (FlexibleTransformer2DModel, Transformer2DModel)):
+                hidden_states = module(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+            elif isinstance(module, ResnetBlock2D):
+                hidden_states = module(hidden_states, temb)
+        for upsampler in self.upsamplers:
+            hidden_states = upsampler(hidden_states)
+        return hidden_states
+class FlexibleTransformer2DModel(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        only_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        norm_type: str = "layer_norm",
+        norm_elementwise_affine: bool = True,
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.in_channels = in_channels
+        inner_dim = num_attention_heads * attention_head_dim
+        # Define input layers
+        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+        self.use_linear_projection = use_linear_projection
+        if self.use_linear_projection:
+            self.proj_in = nn.Linear(in_channels, inner_dim)
+        else:
+            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        # Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        # Define output layers
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if self.use_linear_projection:
+            self.proj_out = nn.Linear(inner_dim, in_channels)
+        else:
+            self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = False,
+    ):
+        # 1. Input
+        batch, _, height, width = hidden_states.shape
+        residual = hidden_states
+        hidden_states = self.norm(hidden_states)
+        if not self.use_linear_projection:
+            hidden_states = self.proj_in(hidden_states)
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+        else:
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+            hidden_states = self.proj_in(hidden_states)
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            hidden_states = block(
+                hidden_states,
+                attention_mask=attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                timestep=timestep,
+                cross_attention_kwargs=cross_attention_kwargs,
+                class_labels=class_labels,
+            )
+        # 3. Output
+        if not self.use_linear_projection:
+            hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+            hidden_states = self.proj_out(hidden_states)
+        else:
+            hidden_states = self.proj_out(hidden_states)
+            hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+        output = hidden_states + residual
+        if return_dict:
+            return (output,)
+        return Transformer2DModelOutput(sample=output)
+class DeciDiffusionPipeline(StableDiffusionPipeline):
+    """`StableDiffusionPipeline` subclass that swaps in its own UNet and scheduler.
+    The constructor ignores the `unet` and `scheduler` it is given and instead builds a
+    `FlexibleUNet2DConditionModel` and a `SqueezedDPMSolverMultistepScheduler`. `__call__`
+    runs the denoising loop with classifier-free guidance and guidance rescaling.
+    """
+    # Squeeze mode string handed to SqueezedDPMSolverMultistepScheduler in __init__.
+    deci_default_squeeze_mode = "10,6"
+    # NOTE(review): the two values below equal the `__call__` defaults
+    # (`num_inference_steps=16`, `guidance_rescale=0.7`) but are not read anywhere in this class.
+    deci_default_number_of_iterations = 16
+    deci_default_guidance_rescale = 0.7
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        # Replace UNet with Deci's UNet
+        # The supplied `unet` is discarded; the replacement is constructed with no arguments.
+        # Presumably its weights are loaded afterwards by the caller - TODO confirm.
+        del unet
+        unet = FlexibleUNet2DConditionModel()
+        # Replace with custom scheduler
+        del scheduler
+        scheduler = SqueezedDPMSolverMultistepScheduler(squeeze_mode=self.deci_default_squeeze_mode)
+        super().__init__(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            requires_safety_checker=requires_safety_checker,
+        )
+        # NOTE(review): the same objects were just passed to super().__init__; this second
+        # registration looks redundant - confirm before removing.
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 16,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.7,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 16):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale <= 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.7):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+        # 2. Define call parameters
+        # Batch size comes from the prompt when one is given, otherwise from `prompt_embeds`.
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # 3. Encode input prompt
+        # An optional "scale" entry in `cross_attention_kwargs` is forwarded as `lora_scale`.
+        text_encoder_lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=text_encoder_lora_scale,
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=len(timesteps)) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+                # perform guidance
+                # The two halves follow the [negative, positive] order of the concatenated embeddings.
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                if do_classifier_free_guidance and guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                # call the callback, if provided
+                # The progress bar advances on the last step, and after warmup on every `scheduler.order`-th step.
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+        # Decode latents and run the safety checker unless raw latents were requested.
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+        # Images flagged by the safety checker are not denormalized.
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image, has_nsfw_concept)
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

safety_checker/config.json ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+  "_commit_hash": "1d0c4ebf6ff58a5caecab40fa1406526bca4b5b9",
+  "_name_or_path": "/home/borys.tymchenko/.cache/huggingface/hub/models--runwayml--stable-diffusion-v1-5/snapshots/1d0c4ebf6ff58a5caecab40fa1406526bca4b5b9/safety_checker",
+  "architectures": [
+    "StableDiffusionSafetyChecker"
+  ],
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.6592,
+  "model_type": "clip",
+  "projection_dim": 768,
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
+    "model_type": "clip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.30.2",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vocab_size": 49408
+  },
+  "torch_dtype": "float32",
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.30.2",
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  }
+}

safety_checker/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:11cfe53105625af8c00faac32a430626641cce686454f3c39d837f14397d858b
+size 1215981832

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "_class_name": "DDIMScheduler",
+  "_diffusers_version": "0.23.0",
+  "beta_end": 0.012,
+  "beta_schedule": "squaredcos_cap_v2",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "clip_sample_range": 1.0,
+  "dynamic_thresholding_ratio": 0.995,
+  "num_train_timesteps": 1000,
+  "prediction_type": "v_prediction",
+  "rescale_betas_zero_snr": true,
+  "sample_max_value": 1.0,
+  "set_alpha_to_one": true,
+  "steps_offset": 1,
+  "thresholding": false,
+  "timestep_spacing": "trailing",
+  "trained_betas": null
+}

text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_name_or_path": "/home/borys.tymchenko/qcomdiffusion/checkpoint-286000-2050048000/pipeline/text_encoder",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 768,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "projection_dim": 768,
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.2",
+  "vocab_size": 49408
+}

text_encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22928c6a6a99759e4a19648ba56e044d1df47b650f7879470501b71ec996a3ef
+size 492265880

tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": true,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 77,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

unet/config.json ADDED Viewed

	@@ -0,0 +1,68 @@

+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.23.0",
+  "_name_or_path": "/home/borys.tymchenko/.cache/huggingface/hub/models--runwayml--stable-diffusion-v1-5/snapshots/1d0c4ebf6ff58a5caecab40fa1406526bca4b5b9/unet",
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 768,
+  "cross_attention_norm": null,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": false
+}

unet/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d27cd69d4a0aa32105087a619f32a51bc087e133be93fe23da92f3c0bcc07d79
+size 3438167536

vae/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.23.0",
+  "_name_or_path": "stabilityai/stable-diffusion-2-1",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 768,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2aa1f43011b553a4cba7f37456465cdbd48aab7b54b9348b890e8058ea7683ec
+size 334643268