neuralvfx committed (verified)
Commit 87644bf · 1 Parent(s): 081b83c

Initial upload of LibreFlux ControlNet pipeline

Files changed (4):
  1. __init__.py +2 -1
  2. controlnet/net.py +227 -0
  3. pipeline.py +51 -52
  4. transformer/trans.py +510 -1
__init__.py CHANGED
@@ -2,4 +2,5 @@ from .pipeline import (
2
  LibreFluxControlNetPipeline,
3
  LibreFluxTransformer2DModel,
4
  LibreFluxControlNetModel,
5
- )
5
+ )
6
+ from .transformer.trans import *
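For reference, a minimal import sketch of the names this __init__.py now exposes; the top-level package name `libreflux` is illustrative and depends on how the repository is installed or vendored:

# Hypothetical package name -- adjust to wherever this repo sits on PYTHONPATH.
from libreflux import (
    LibreFluxControlNetPipeline,   # end-to-end ControlNet pipeline (pipeline.py)
    LibreFluxTransformer2DModel,   # Flux transformer with the merged attention processors
    LibreFluxControlNetModel,      # ControlNet branch
)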
controlnet/net.py CHANGED
@@ -1,3 +1,4 @@
1
+
2
  # Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -50,6 +51,210 @@ from diffusers.models.embeddings import apply_rotary_emb
51
 
52
 
53
 
54
+ def fa3_sdpa(
55
+ q,
56
+ k,
57
+ v,
58
+ ):
59
+ # flash attention 3 sdpa drop-in replacement
60
+ q, k, v = [x.permute(0, 2, 1, 3) for x in [q, k, v]]
61
+ out = flash_attn_func(q, k, v)[0]
62
+ return out.permute(0, 2, 1, 3)
63
+
64
+
65
+ class FluxSingleAttnProcessor3_0:
66
+ r"""
67
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
68
+ """
69
+
70
+ def __init__(self):
71
+ if not hasattr(F, "scaled_dot_product_attention"):
72
+ raise ImportError(
73
+ "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
74
+ )
75
+
76
+ def __call__(
77
+ self,
78
+ attn,
79
+ hidden_states: Tensor,
80
+ encoder_hidden_states: Tensor = None,
81
+ attention_mask: FloatTensor = None,
82
+ image_rotary_emb: Tensor = None,
83
+ ) -> Tensor:
84
+ input_ndim = hidden_states.ndim
85
+
86
+ if input_ndim == 4:
87
+ batch_size, channel, height, width = hidden_states.shape
88
+ hidden_states = hidden_states.view(
89
+ batch_size, channel, height * width
90
+ ).transpose(1, 2)
91
+
92
+ batch_size, _, _ = (
93
+ hidden_states.shape
94
+ if encoder_hidden_states is None
95
+ else encoder_hidden_states.shape
96
+ )
97
+
98
+ query = attn.to_q(hidden_states)
99
+ if encoder_hidden_states is None:
100
+ encoder_hidden_states = hidden_states
101
+
102
+ key = attn.to_k(encoder_hidden_states)
103
+ value = attn.to_v(encoder_hidden_states)
104
+
105
+ inner_dim = key.shape[-1]
106
+ head_dim = inner_dim // attn.heads
107
+
108
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
109
+
110
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
111
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
112
+
113
+ if attn.norm_q is not None:
114
+ query = attn.norm_q(query)
115
+ if attn.norm_k is not None:
116
+ key = attn.norm_k(key)
117
+
118
+ # Apply RoPE if needed
119
+ if image_rotary_emb is not None:
120
+ query = apply_rotary_emb(query, image_rotary_emb)
121
+ key = apply_rotary_emb(key, image_rotary_emb)
122
+
123
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
124
+ # TODO: add support for attn.scale when we move to Torch 2.1
125
+ # hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
126
+ hidden_states = fa3_sdpa(query, key, value)
127
+ hidden_states = rearrange(hidden_states, "B H L D -> B L (H D)")
128
+
129
+ hidden_states = hidden_states.transpose(1, 2).reshape(
130
+ batch_size, -1, attn.heads * head_dim
131
+ )
132
+ hidden_states = hidden_states.to(query.dtype)
133
+
134
+ if input_ndim == 4:
135
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
136
+ batch_size, channel, height, width
137
+ )
138
+
139
+ return hidden_states
140
+
141
+
142
+ class FluxAttnProcessor3_0:
143
+ """Attention processor used typically in processing the SD3-like self-attention projections."""
144
+
145
+ def __init__(self):
146
+ if not hasattr(F, "scaled_dot_product_attention"):
147
+ raise ImportError(
148
+ "FluxAttnProcessor3_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
149
+ )
150
+
151
+ def __call__(
152
+ self,
153
+ attn,
154
+ hidden_states: FloatTensor,
155
+ encoder_hidden_states: FloatTensor = None,
156
+ attention_mask: FloatTensor = None,
157
+ image_rotary_emb: Tensor = None,
158
+ ) -> FloatTensor:
159
+ input_ndim = hidden_states.ndim
160
+ if input_ndim == 4:
161
+ batch_size, channel, height, width = hidden_states.shape
162
+ hidden_states = hidden_states.view(
163
+ batch_size, channel, height * width
164
+ ).transpose(1, 2)
165
+ context_input_ndim = encoder_hidden_states.ndim
166
+ if context_input_ndim == 4:
167
+ batch_size, channel, height, width = encoder_hidden_states.shape
168
+ encoder_hidden_states = encoder_hidden_states.view(
169
+ batch_size, channel, height * width
170
+ ).transpose(1, 2)
171
+
172
+ batch_size = encoder_hidden_states.shape[0]
173
+
174
+ # `sample` projections.
175
+ query = attn.to_q(hidden_states)
176
+ key = attn.to_k(hidden_states)
177
+ value = attn.to_v(hidden_states)
178
+
179
+ inner_dim = key.shape[-1]
180
+ head_dim = inner_dim // attn.heads
181
+
182
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
183
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
184
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
185
+
186
+ if attn.norm_q is not None:
187
+ query = attn.norm_q(query)
188
+ if attn.norm_k is not None:
189
+ key = attn.norm_k(key)
190
+
191
+ # `context` projections.
192
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
193
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
194
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
195
+
196
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
197
+ batch_size, -1, attn.heads, head_dim
198
+ ).transpose(1, 2)
199
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
200
+ batch_size, -1, attn.heads, head_dim
201
+ ).transpose(1, 2)
202
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
203
+ batch_size, -1, attn.heads, head_dim
204
+ ).transpose(1, 2)
205
+
206
+ if attn.norm_added_q is not None:
207
+ encoder_hidden_states_query_proj = attn.norm_added_q(
208
+ encoder_hidden_states_query_proj
209
+ )
210
+ if attn.norm_added_k is not None:
211
+ encoder_hidden_states_key_proj = attn.norm_added_k(
212
+ encoder_hidden_states_key_proj
213
+ )
214
+
215
+ # attention
216
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
217
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
218
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
219
+
220
+ if image_rotary_emb is not None:
221
+
222
+ query = apply_rotary_emb(query, image_rotary_emb)
223
+ key = apply_rotary_emb(key, image_rotary_emb)
224
+
225
+ # hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
226
+ hidden_states = fa3_sdpa(query, key, value)
227
+ hidden_states = rearrange(hidden_states, "B H L D -> B L (H D)")
228
+
229
+ hidden_states = hidden_states.transpose(1, 2).reshape(
230
+ batch_size, -1, attn.heads * head_dim
231
+ )
232
+ hidden_states = hidden_states.to(query.dtype)
233
+
234
+ encoder_hidden_states, hidden_states = (
235
+ hidden_states[:, : encoder_hidden_states.shape[1]],
236
+ hidden_states[:, encoder_hidden_states.shape[1] :],
237
+ )
238
+
239
+ # linear proj
240
+ hidden_states = attn.to_out[0](hidden_states)
241
+ # dropout
242
+ hidden_states = attn.to_out[1](hidden_states)
243
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
244
+
245
+ if input_ndim == 4:
246
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
247
+ batch_size, channel, height, width
248
+ )
249
+ if context_input_ndim == 4:
250
+ encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(
251
+ batch_size, channel, height, width
252
+ )
253
+
254
+ return hidden_states, encoder_hidden_states
255
+
256
+
257
+
258
  class FluxFusedSDPAProcessor:
259
  """
260
  Fused QKV processor using PyTorch's scaled_dot_product_attention.
@@ -1070,6 +1275,27 @@ class LibreFluxTransformer2DModel(
1275
 
1276
  return Transformer2DModelOutput(sample=output)
1277
 
1278
+ ###################################
1279
+ # END TRANS MERGE
1280
+ ####################################
1281
+
1282
+ # Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
1283
+ #
1284
+ # Licensed under the Apache License, Version 2.0 (the "License");
1285
+ # you may not use this file except in compliance with the License.
1286
+ # You may obtain a copy of the License at
1287
+ #
1288
+ # http://www.apache.org/licenses/LICENSE-2.0
1289
+ #
1290
+ # Unless required by applicable law or agreed to in writing, software
1291
+ # distributed under the License is distributed on an "AS IS" BASIS,
1292
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1293
+ # See the License for the specific language governing permissions and
1294
+ # limitations under the License.
1295
+ #
1296
+ # This was modified from the ControlNet repo
1297
+
1298
+
1299
  ####################################
1300
  ##### CONTROL NET MODEL MERGE ######
1301
  ####################################
@@ -1505,3 +1731,4 @@ class LibreFluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
1731
  controlnet_block_samples=controlnet_block_samples,
1732
  controlnet_single_block_samples=controlnet_single_block_samples,
1733
  )
1734
+
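The new fa3_sdpa helper above permutes (batch, heads, seq_len, head_dim) tensors into the (batch, seq_len, heads, head_dim) layout that FlashAttention expects, but the hunks shown never import flash_attn_func. A hedged sketch of the same contract with a plain SDPA fallback; the flash_attn_interface import path and the wrapper name are assumptions, not part of this commit:

import torch.nn.functional as F

try:
    # FlashAttention-3 interface (assumed source of flash_attn_func in this repo).
    from flash_attn_interface import flash_attn_func
    _HAS_FA3 = True
except ImportError:
    _HAS_FA3 = False

def fa3_sdpa_or_fallback(q, k, v):
    """q, k, v: (batch, heads, seq_len, head_dim), as produced by the processors above."""
    if _HAS_FA3:
        q, k, v = [x.permute(0, 2, 1, 3) for x in (q, k, v)]  # B H L D -> B L H D
        out = flash_attn_func(q, k, v)[0]                      # assumed to return (out, softmax_lse)
        return out.permute(0, 2, 1, 3)                         # back to B H L D
    # PyTorch SDPA already takes (batch, heads, seq_len, head_dim).
    return F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=False)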
pipeline.py CHANGED
@@ -749,62 +749,61 @@ class LibreFluxControlNetPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSi
749
  else:
750
  inner_module = self.controlnet
751
 
752
- if isinstance(inner_module, LibreFluxControlNetModel):
753
- control_image = self.prepare_image(
754
- image=control_image,
755
- width=width,
756
- height=height,
757
- batch_size=batch_size * num_images_per_prompt,
758
- num_images_per_prompt=num_images_per_prompt,
759
- device=device,
760
- dtype=dtype,
761
- )
762
 
763
- if control_image_undo_centering:
764
- if not self.image_processor.do_normalize:
765
- raise ValueError(
766
- "`control_image_undo_centering` only makes sense if `do_normalize==True` in the image processor"
767
- )
768
- control_image = control_image*0.5 + 0.5
769
-
770
- height, width = control_image.shape[-2:]
771
-
772
- #logger.warning(
773
- # f"pipeline_flux_controlnet, control_image: {control_image.min()} {control_image.max()}"
774
- #)
775
-
776
- # vae encode
777
- control_image = _maybe_to(control_image, device=self.vae.device)
778
- control_image = self.vae.encode(control_image).latent_dist.sample()
779
- control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
780
- control_image = _maybe_to(control_image, device=device)
781
- # pack
782
- height_control_image, width_control_image = control_image.shape[2:]
783
- control_image = self._pack_latents(
784
- control_image,
785
- batch_size * num_images_per_prompt,
786
- num_channels_latents,
787
- height_control_image,
788
- width_control_image,
789
- )
790
 
791
- # set control mode
792
- if control_mode is not None:
793
- control_mode = torch.tensor(control_mode).to(device, dtype=torch.long)
794
- control_mode = control_mode.reshape([-1, 1])
795
-
796
-
797
- # set control mode
798
- control_mode_ = []
799
- if isinstance(control_mode, list):
800
- for cmode in control_mode:
801
- if cmode is None:
802
- control_mode_.append(-1)
803
- else:
804
- control_mode_.append(cmode)
805
- control_mode = torch.tensor(control_mode_).to(device, dtype=torch.long)
752
+ control_image = self.prepare_image(
753
+ image=control_image,
754
+ width=width,
755
+ height=height,
756
+ batch_size=batch_size * num_images_per_prompt,
757
+ num_images_per_prompt=num_images_per_prompt,
758
+ device=device,
759
+ dtype=dtype,
760
+ )
 
761
 
762
+ if control_image_undo_centering:
763
+ if not self.image_processor.do_normalize:
764
+ raise ValueError(
765
+ "`control_image_undo_centering` only makes sense if `do_normalize==True` in the image processor"
766
+ )
767
+ control_image = control_image*0.5 + 0.5
768
+
769
+ height, width = control_image.shape[-2:]
770
+
771
+ #logger.warning(
772
+ # f"pipeline_flux_controlnet, control_image: {control_image.min()} {control_image.max()}"
773
+ #)
774
+
775
+ # vae encode
776
+ control_image = _maybe_to(control_image, device=self.vae.device)
777
+ control_image = self.vae.encode(control_image).latent_dist.sample()
778
+ control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
779
+ control_image = _maybe_to(control_image, device=device)
780
+ # pack
781
+ height_control_image, width_control_image = control_image.shape[2:]
782
+ control_image = self._pack_latents(
783
+ control_image,
784
+ batch_size * num_images_per_prompt,
785
+ num_channels_latents,
786
+ height_control_image,
787
+ width_control_image,
788
+ )
789
 
790
+ # set control mode
791
+ if control_mode is not None:
792
+ control_mode = torch.tensor(control_mode).to(device, dtype=torch.long)
793
  control_mode = control_mode.reshape([-1, 1])
794
 
795
+
796
+ # set control mode
797
+ control_mode_ = []
798
+ if isinstance(control_mode, list):
799
+ for cmode in control_mode:
800
+ if cmode is None:
801
+ control_mode_.append(-1)
802
+ else:
803
+ control_mode_.append(cmode)
804
+ control_mode = torch.tensor(control_mode_).to(device, dtype=torch.long)
805
+ control_mode = control_mode.reshape([-1, 1])
806
+
807
  # 4. Prepare latent variables
808
  num_channels_latents = self.transformer.config.in_channels // 4
809
  latents, latent_image_ids = self.prepare_latents(
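A hedged call sketch for the reworked control-image path: control_image, control_mode, and control_image_undo_centering are the arguments touched by this diff, while the checkpoint path, prompt, and remaining keywords are illustrative and follow the usual diffusers Flux ControlNet conventions rather than anything shown here.

import torch
from diffusers.utils import load_image

# Assumes the pipeline class is importable as in __init__.py; the checkpoint id is hypothetical.
pipe = LibreFluxControlNetPipeline.from_pretrained(
    "path/to/libreflux-controlnet", torch_dtype=torch.bfloat16
).to("cuda")

control = load_image("canny.png")        # any conditioning image
result = pipe(
    prompt="a photo of a dancer on a rooftop",
    control_image=control,               # VAE-encoded and packed unconditionally after this change
    control_mode=[0],                    # a list is accepted; None entries are mapped to -1
    control_image_undo_centering=False,  # only valid when the image processor has do_normalize=True
    height=1024,
    width=1024,
)
image = result.images[0]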
transformer/trans.py CHANGED
@@ -1,3 +1,512 @@
1
+ # Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This was modified from the ControlNet repo
16
+
17
+
18
+ import inspect
19
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
20
+
21
+ from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
22
+
23
+ import numpy as np
24
+ import torch
25
+ from transformers import (
26
+ CLIPTextModel,
27
+ CLIPTokenizer,
28
+ T5EncoderModel,
29
+ T5TokenizerFast,
30
+ )
31
+
32
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
33
+ from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin
34
+ from diffusers.models.autoencoders import AutoencoderKL
35
+ ### MERGING THESE ###
36
+ # from src.models.transformer import FluxTransformer2DModel
37
+ # from src.models.controlnet_flux import FluxControlNetModel
38
+ #############
39
+
40
+ ##########################################
41
+ ########### ATTENTION MERGE ##############
42
+ ##########################################
43
+
44
+ import torch
45
+ from torch import Tensor, FloatTensor
46
+ from torch.nn import functional as F
47
+ from einops import rearrange
48
+ from diffusers.models.attention_processor import Attention
49
+ from diffusers.models.embeddings import apply_rotary_emb
50
+
51
+
52
+
53
+ def fa3_sdpa(
54
+ q,
55
+ k,
56
+ v,
57
+ ):
58
+ # flash attention 3 sdpa drop-in replacement
59
+ q, k, v = [x.permute(0, 2, 1, 3) for x in [q, k, v]]
60
+ out = flash_attn_func(q, k, v)[0]
61
+ return out.permute(0, 2, 1, 3)
62
+
63
+
64
+ class FluxSingleAttnProcessor3_0:
65
+ r"""
66
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
67
+ """
68
+
69
+ def __init__(self):
70
+ if not hasattr(F, "scaled_dot_product_attention"):
71
+ raise ImportError(
72
+ "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
73
+ )
74
+
75
+ def __call__(
76
+ self,
77
+ attn,
78
+ hidden_states: Tensor,
79
+ encoder_hidden_states: Tensor = None,
80
+ attention_mask: FloatTensor = None,
81
+ image_rotary_emb: Tensor = None,
82
+ ) -> Tensor:
83
+ input_ndim = hidden_states.ndim
84
+
85
+ if input_ndim == 4:
86
+ batch_size, channel, height, width = hidden_states.shape
87
+ hidden_states = hidden_states.view(
88
+ batch_size, channel, height * width
89
+ ).transpose(1, 2)
90
+
91
+ batch_size, _, _ = (
92
+ hidden_states.shape
93
+ if encoder_hidden_states is None
94
+ else encoder_hidden_states.shape
95
+ )
96
+
97
+ query = attn.to_q(hidden_states)
98
+ if encoder_hidden_states is None:
99
+ encoder_hidden_states = hidden_states
100
+
101
+ key = attn.to_k(encoder_hidden_states)
102
+ value = attn.to_v(encoder_hidden_states)
103
+
104
+ inner_dim = key.shape[-1]
105
+ head_dim = inner_dim // attn.heads
106
+
107
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
108
+
109
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
110
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
111
+
112
+ if attn.norm_q is not None:
113
+ query = attn.norm_q(query)
114
+ if attn.norm_k is not None:
115
+ key = attn.norm_k(key)
116
+
117
+ # Apply RoPE if needed
118
+ if image_rotary_emb is not None:
119
+ query = apply_rotary_emb(query, image_rotary_emb)
120
+ key = apply_rotary_emb(key, image_rotary_emb)
121
+
122
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
123
+ # TODO: add support for attn.scale when we move to Torch 2.1
124
+ # hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
125
+ hidden_states = fa3_sdpa(query, key, value)
126
+ hidden_states = rearrange(hidden_states, "B H L D -> B L (H D)")
127
+
128
+ hidden_states = hidden_states.transpose(1, 2).reshape(
129
+ batch_size, -1, attn.heads * head_dim
130
+ )
131
+ hidden_states = hidden_states.to(query.dtype)
132
+
133
+ if input_ndim == 4:
134
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
135
+ batch_size, channel, height, width
136
+ )
137
+
138
+ return hidden_states
139
+
140
+
141
+ class FluxAttnProcessor3_0:
142
+ """Attention processor used typically in processing the SD3-like self-attention projections."""
143
+
144
+ def __init__(self):
145
+ if not hasattr(F, "scaled_dot_product_attention"):
146
+ raise ImportError(
147
+ "FluxAttnProcessor3_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
148
+ )
149
+
150
+ def __call__(
151
+ self,
152
+ attn,
153
+ hidden_states: FloatTensor,
154
+ encoder_hidden_states: FloatTensor = None,
155
+ attention_mask: FloatTensor = None,
156
+ image_rotary_emb: Tensor = None,
157
+ ) -> FloatTensor:
158
+ input_ndim = hidden_states.ndim
159
+ if input_ndim == 4:
160
+ batch_size, channel, height, width = hidden_states.shape
161
+ hidden_states = hidden_states.view(
162
+ batch_size, channel, height * width
163
+ ).transpose(1, 2)
164
+ context_input_ndim = encoder_hidden_states.ndim
165
+ if context_input_ndim == 4:
166
+ batch_size, channel, height, width = encoder_hidden_states.shape
167
+ encoder_hidden_states = encoder_hidden_states.view(
168
+ batch_size, channel, height * width
169
+ ).transpose(1, 2)
170
+
171
+ batch_size = encoder_hidden_states.shape[0]
172
+
173
+ # `sample` projections.
174
+ query = attn.to_q(hidden_states)
175
+ key = attn.to_k(hidden_states)
176
+ value = attn.to_v(hidden_states)
177
+
178
+ inner_dim = key.shape[-1]
179
+ head_dim = inner_dim // attn.heads
180
+
181
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
182
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
183
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
184
+
185
+ if attn.norm_q is not None:
186
+ query = attn.norm_q(query)
187
+ if attn.norm_k is not None:
188
+ key = attn.norm_k(key)
189
+
190
+ # `context` projections.
191
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
192
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
193
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
194
+
195
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
196
+ batch_size, -1, attn.heads, head_dim
197
+ ).transpose(1, 2)
198
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
199
+ batch_size, -1, attn.heads, head_dim
200
+ ).transpose(1, 2)
201
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
202
+ batch_size, -1, attn.heads, head_dim
203
+ ).transpose(1, 2)
204
+
205
+ if attn.norm_added_q is not None:
206
+ encoder_hidden_states_query_proj = attn.norm_added_q(
207
+ encoder_hidden_states_query_proj
208
+ )
209
+ if attn.norm_added_k is not None:
210
+ encoder_hidden_states_key_proj = attn.norm_added_k(
211
+ encoder_hidden_states_key_proj
212
+ )
213
+
214
+ # attention
215
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
216
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
217
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
218
+
219
+ if image_rotary_emb is not None:
220
+
221
+ query = apply_rotary_emb(query, image_rotary_emb)
222
+ key = apply_rotary_emb(key, image_rotary_emb)
223
+
224
+ # hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
225
+ hidden_states = fa3_sdpa(query, key, value)
226
+ hidden_states = rearrange(hidden_states, "B H L D -> B L (H D)")
227
+
228
+ hidden_states = hidden_states.transpose(1, 2).reshape(
229
+ batch_size, -1, attn.heads * head_dim
230
+ )
231
+ hidden_states = hidden_states.to(query.dtype)
232
+
233
+ encoder_hidden_states, hidden_states = (
234
+ hidden_states[:, : encoder_hidden_states.shape[1]],
235
+ hidden_states[:, encoder_hidden_states.shape[1] :],
236
+ )
237
+
238
+ # linear proj
239
+ hidden_states = attn.to_out[0](hidden_states)
240
+ # dropout
241
+ hidden_states = attn.to_out[1](hidden_states)
242
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
243
+
244
+ if input_ndim == 4:
245
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
246
+ batch_size, channel, height, width
247
+ )
248
+ if context_input_ndim == 4:
249
+ encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(
250
+ batch_size, channel, height, width
251
+ )
252
+
253
+ return hidden_states, encoder_hidden_states
254
+
255
+
256
+
257
+ class FluxFusedSDPAProcessor:
258
+ """
259
+ Fused QKV processor using PyTorch's scaled_dot_product_attention.
260
+ Uses fused projections but splits for attention computation.
261
+ """
262
+
263
+ def __init__(self):
264
+ if not hasattr(F, "scaled_dot_product_attention"):
265
+ raise ImportError(
266
+ "FluxFusedSDPAProcessor requires PyTorch 2.0+ for scaled_dot_product_attention"
267
+ )
268
+
269
+ def __call__(
270
+ self,
271
+ attn,
272
+ hidden_states: FloatTensor,
273
+ encoder_hidden_states: FloatTensor = None,
274
+ attention_mask: FloatTensor = None,
275
+ image_rotary_emb: Tensor = None,
276
+ ) -> FloatTensor:
277
+ input_ndim = hidden_states.ndim
278
+ if input_ndim == 4:
279
+ batch_size, channel, height, width = hidden_states.shape
280
+ hidden_states = hidden_states.view(
281
+ batch_size, channel, height * width
282
+ ).transpose(1, 2)
283
+
284
+ context_input_ndim = (
285
+ encoder_hidden_states.ndim if encoder_hidden_states is not None else None
286
+ )
287
+ if context_input_ndim == 4:
288
+ batch_size, channel, height, width = encoder_hidden_states.shape
289
+ encoder_hidden_states = encoder_hidden_states.view(
290
+ batch_size, channel, height * width
291
+ ).transpose(1, 2)
292
+
293
+ batch_size = (
294
+ encoder_hidden_states.shape[0]
295
+ if encoder_hidden_states is not None
296
+ else hidden_states.shape[0]
297
+ )
298
+
299
+ # Single attention case (no encoder states)
300
+ if encoder_hidden_states is None:
301
+ # Use fused QKV projection
302
+ qkv = attn.to_qkv(hidden_states) # (batch, seq_len, 3 * inner_dim)
303
+ inner_dim = qkv.shape[-1] // 3
304
+ head_dim = inner_dim // attn.heads
305
+ seq_len = hidden_states.shape[1]
306
+
307
+ # Split and reshape
308
+ qkv = qkv.view(batch_size, seq_len, 3, attn.heads, head_dim)
309
+ query, key, value = qkv.unbind(
310
+ dim=2
311
+ ) # Each is (batch, seq_len, heads, head_dim)
312
+
313
+ # Transpose to (batch, heads, seq_len, head_dim)
314
+ query = query.transpose(1, 2)
315
+ key = key.transpose(1, 2)
316
+ value = value.transpose(1, 2)
317
+
318
+ # Apply norms if needed
319
+ if attn.norm_q is not None:
320
+ query = attn.norm_q(query)
321
+ if attn.norm_k is not None:
322
+ key = attn.norm_k(key)
323
+
324
+ # Apply RoPE if needed
325
+ if image_rotary_emb is not None:
326
+ query = apply_rotary_emb(query, image_rotary_emb)
327
+ key = apply_rotary_emb(key, image_rotary_emb)
328
+
329
+ # SDPA
330
+ hidden_states = F.scaled_dot_product_attention(
331
+ query,
332
+ key,
333
+ value,
334
+ attn_mask=attention_mask,
335
+ dropout_p=0.0,
336
+ is_causal=False,
337
+ )
338
+
339
+ # Reshape back
340
+ hidden_states = hidden_states.transpose(1, 2).reshape(
341
+ batch_size, -1, attn.heads * head_dim
342
+ )
343
+ hidden_states = hidden_states.to(query.dtype)
344
+
345
+ if input_ndim == 4:
346
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
347
+ batch_size, channel, height, width
348
+ )
349
+
350
+ return hidden_states
351
+
352
+ # Joint attention case (with encoder states)
353
+ else:
354
+ # Process self-attention QKV
355
+ qkv = attn.to_qkv(hidden_states)
356
+ inner_dim = qkv.shape[-1] // 3
357
+ head_dim = inner_dim // attn.heads
358
+ seq_len = hidden_states.shape[1]
359
+
360
+ qkv = qkv.view(batch_size, seq_len, 3, attn.heads, head_dim)
361
+ query, key, value = qkv.unbind(dim=2)
362
+
363
+ # Transpose to (batch, heads, seq_len, head_dim)
364
+ query = query.transpose(1, 2)
365
+ key = key.transpose(1, 2)
366
+ value = value.transpose(1, 2)
367
+
368
+ # Apply norms if needed
369
+ if attn.norm_q is not None:
370
+ query = attn.norm_q(query)
371
+ if attn.norm_k is not None:
372
+ key = attn.norm_k(key)
373
+
374
+ # Process encoder QKV
375
+ encoder_seq_len = encoder_hidden_states.shape[1]
376
+ encoder_qkv = attn.to_added_qkv(encoder_hidden_states)
377
+ encoder_qkv = encoder_qkv.view(
378
+ batch_size, encoder_seq_len, 3, attn.heads, head_dim
379
+ )
380
+ encoder_query, encoder_key, encoder_value = encoder_qkv.unbind(dim=2)
381
+
382
+ # Transpose to (batch, heads, seq_len, head_dim)
383
+ encoder_query = encoder_query.transpose(1, 2)
384
+ encoder_key = encoder_key.transpose(1, 2)
385
+ encoder_value = encoder_value.transpose(1, 2)
386
+
387
+ # Apply encoder norms if needed
388
+ if attn.norm_added_q is not None:
389
+ encoder_query = attn.norm_added_q(encoder_query)
390
+ if attn.norm_added_k is not None:
391
+ encoder_key = attn.norm_added_k(encoder_key)
392
+
393
+ # Concatenate encoder and self-attention
394
+ query = torch.cat([encoder_query, query], dim=2)
395
+ key = torch.cat([encoder_key, key], dim=2)
396
+ value = torch.cat([encoder_value, value], dim=2)
397
+
398
+ # Apply RoPE if needed
399
+ if image_rotary_emb is not None:
400
+ query = apply_rotary_emb(query, image_rotary_emb)
401
+ key = apply_rotary_emb(key, image_rotary_emb)
402
+
403
+ # SDPA
404
+ hidden_states = F.scaled_dot_product_attention(
405
+ query,
406
+ key,
407
+ value,
408
+ attn_mask=attention_mask,
409
+ dropout_p=0.0,
410
+ is_causal=False,
411
+ )
412
+
413
+ # Reshape: (batch, heads, seq_len, head_dim) -> (batch, seq_len, heads * head_dim)
414
+ hidden_states = hidden_states.transpose(1, 2).reshape(
415
+ batch_size, -1, attn.heads * head_dim
416
+ )
417
+ hidden_states = hidden_states.to(query.dtype)
418
+
419
+ # Split encoder and self outputs
420
+ encoder_hidden_states = hidden_states[:, :encoder_seq_len]
421
+ hidden_states = hidden_states[:, encoder_seq_len:]
422
+
423
+ # Output projections
424
+ hidden_states = attn.to_out[0](hidden_states)
425
+ hidden_states = attn.to_out[1](hidden_states) # dropout
426
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
427
+
428
+ # Reshape if needed
429
+ if input_ndim == 4:
430
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
431
+ batch_size, channel, height, width
432
+ )
433
+ if context_input_ndim == 4:
434
+ encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(
435
+ batch_size, channel, height, width
436
+ )
437
+
438
+ return hidden_states, encoder_hidden_states
439
+
440
+
441
+ class FluxSingleFusedSDPAProcessor:
442
+ """
443
+ Fused QKV processor for single attention (no encoder states).
444
+ Simpler version for self-attention only blocks.
445
+ """
446
+
447
+ def __init__(self):
448
+ if not hasattr(F, "scaled_dot_product_attention"):
449
+ raise ImportError(
450
+ "FluxSingleFusedSDPAProcessor requires PyTorch 2.0+ for scaled_dot_product_attention"
451
+ )
452
+
453
+ def __call__(
454
+ self,
455
+ attn,
456
+ hidden_states: Tensor,
457
+ encoder_hidden_states: Tensor = None,
458
+ attention_mask: FloatTensor = None,
459
+ image_rotary_emb: Tensor = None,
460
+ ) -> Tensor:
461
+ input_ndim = hidden_states.ndim
462
+ if input_ndim == 4:
463
+ batch_size, channel, height, width = hidden_states.shape
464
+ hidden_states = hidden_states.view(
465
+ batch_size, channel, height * width
466
+ ).transpose(1, 2)
467
+
468
+ batch_size, seq_len, _ = hidden_states.shape
469
+
470
+ # Use fused QKV projection
471
+ qkv = attn.to_qkv(hidden_states) # (batch, seq_len, 3 * inner_dim)
472
+ inner_dim = qkv.shape[-1] // 3
473
+ head_dim = inner_dim // attn.heads
474
+
475
+ # Split and reshape in one go
476
+ qkv = qkv.view(batch_size, seq_len, 3, attn.heads, head_dim)
477
+ qkv = qkv.permute(2, 0, 3, 1, 4) # (3, B, H, L, D) – still strided
478
+ query, key, value = [
479
+ t.contiguous() for t in qkv.unbind(0) # make each view dense
480
+ ]
481
+ # Now each is (batch, heads, seq_len, head_dim)
482
+
483
+ # Apply norms if needed
484
+ if attn.norm_q is not None:
485
+ query = attn.norm_q(query)
486
+ if attn.norm_k is not None:
487
+ key = attn.norm_k(key)
488
+
489
+ # Apply RoPE if needed
490
+ if image_rotary_emb is not None:
491
+ query = apply_rotary_emb(query, image_rotary_emb)
492
+ key = apply_rotary_emb(key, image_rotary_emb)
493
+
494
+ # SDPA
495
+ hidden_states = F.scaled_dot_product_attention(
496
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
497
+ )
498
+
499
+ # Reshape back
500
+ hidden_states = rearrange(hidden_states, "B H L D -> B L (H D)")
501
+ hidden_states = hidden_states.to(query.dtype)
502
+
503
+ if input_ndim == 4:
504
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
505
+ batch_size, channel, height, width
506
+ )
507
+
508
+ return hidden_states
509
+
510
  #################################
511
  ##### TRANSFORMER MERGE #########
512
  #################################
@@ -763,4 +1272,4 @@ class LibreFluxTransformer2DModel(
1272
  if not return_dict:
1273
  return (output,)
1274
 
766
- return Transformer2DModelOutput(sample=output)
1275
+ return Transformer2DModelOutput(sample=output)
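To exercise the fused processors added in this file, one would typically swap them in through the standard diffusers attention-processor API. A hedged sketch; whether LibreFluxTransformer2DModel exposes fuse_qkv_projections and set_attn_processor exactly like the upstream FluxTransformer2DModel, and the import paths used, are assumptions:

import torch
from transformer.trans import (
    LibreFluxTransformer2DModel,    # assumed to be defined/exported by this module
    FluxFusedSDPAProcessor,
    FluxSingleFusedSDPAProcessor,
)

transformer = LibreFluxTransformer2DModel.from_pretrained(
    "path/to/libreflux", subfolder="transformer", torch_dtype=torch.bfloat16
)

# The fused processors read attn.to_qkv / attn.to_added_qkv, so the QKV
# projections must be fused before they are installed.
transformer.fuse_qkv_projections()

processors = {}
for name in transformer.attn_processors:
    # Flux single-stream blocks have no added (context) projections.
    if "single_transformer_blocks" in name:
        processors[name] = FluxSingleFusedSDPAProcessor()
    else:
        processors[name] = FluxFusedSDPAProcessor()
transformer.set_attn_processor(processors)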