Spaces:

alexShangeeth
/

huny_02

Runtime error

App Files Files Community

huny_02 / comfy /ldm /flux /controlnet.py

alexShangeeth

Initial commit for ComfyUI files

26ca9da 9 months ago

raw

history blame contribute delete

9.04 kB

	#Original code can be found on: https://github.com/XLabs-AI/x-flux/blob/main/src/flux/controlnet.py
	#modified to support different types of flux controlnets

	import torch
	import math
	from torch import Tensor, nn
	from einops import rearrange, repeat

	from .layers import (timestep_embedding)

	from .model import Flux
	import comfy.ldm.common_dit

	class MistolineCondDownsamplBlock(nn.Module):
	    """Convolutional encoder for a 3-channel conditioning image.

	    The encoder is a stack of ten ``Conv2d`` layers, each producing 16
	    channels, with a ``SiLU`` after every convolution except the last one.
	    Three of the 3x3 convolutions use ``stride=2``.

	    Args:
	        dtype: Forwarded to every ``operations.Conv2d`` constructor.
	        device: Forwarded to every ``operations.Conv2d`` constructor.
	        operations: Namespace that provides the ``Conv2d`` layer class.
	    """

	    def __init__(self, dtype=None, device=None, operations=None):
	        super().__init__()

	        # (in_channels, kernel_size, stride) of each convolution, in order.
	        conv_specs = (
	            (3, 3, 1),
	            (16, 1, 1),
	            (16, 3, 1),
	            (16, 3, 2),
	            (16, 3, 1),
	            (16, 3, 2),
	            (16, 3, 1),
	            (16, 3, 2),
	            (16, 1, 1),
	            (16, 3, 1),
	        )

	        layers = []
	        for in_ch, kernel, stride in conv_specs:
	            # An activation separates consecutive convolutions; nothing
	            # follows the final convolution.
	            if layers:
	                layers.append(nn.SiLU())

	            conv_kwargs = {"dtype": dtype, "device": device}
	            if kernel == 3:
	                # Only the 3x3 convolutions are padded.
	                conv_kwargs["padding"] = 1
	            if stride != 1:
	                conv_kwargs["stride"] = stride
	            layers.append(operations.Conv2d(in_ch, 16, kernel, **conv_kwargs))

	        self.encoder = nn.Sequential(*layers)

	    def forward(self, x):
	        """Run ``x`` through the encoder stack and return the result."""
	        encoded = self.encoder(x)
	        return encoded

	class MistolineControlnetBlock(nn.Module):
	    """Linear projection of width ``hidden_size`` followed by SiLU.

	    Args:
	        hidden_size: Input and output feature size of the linear layer.
	        dtype: Forwarded to ``operations.Linear``.
	        device: Forwarded to ``operations.Linear``.
	        operations: Namespace that provides the ``Linear`` layer class.
	    """

	    def __init__(self, hidden_size, dtype=None, device=None, operations=None):
	        super().__init__()
	        self.linear = operations.Linear(
	            hidden_size,
	            hidden_size,
	            dtype=dtype,
	            device=device,
	        )
	        self.act = nn.SiLU()

	    def forward(self, x):
	        """Return ``act(linear(x))``."""
	        projected = self.linear(x)
	        return self.act(projected)


	class ControlNetFlux(Flux):
	    """Flux-based ControlNet that returns per-block control outputs.

	    The parent ``Flux`` is constructed with ``final_layer=False``. On top of
	    it this class adds one projection per double block and per single block
	    (``controlnet_blocks`` / ``controlnet_single_blocks``), an input
	    projection for the control signal (``pos_embed_input``), an optional
	    embedding for "union" control modes, and, unless ``latent_input`` is
	    set, a small convolutional encoder for the image hint.
	    """

	    def __init__(self, latent_input=False, num_union_modes=0, mistoline=False, control_latent_channels=None, image_model=None, dtype=None, device=None, operations=None, **kwargs):
	        """Build the ControlNet-specific layers.

	        Args:
	            latent_input: When true, ``forward`` treats ``hint`` as already
	                being in the layout expected by the patchify step and no
	                hint encoder is created.
	            num_union_modes: Size of the control-mode embedding table; no
	                embedding is created when it is ``0``.
	            mistoline: Selects ``MistolineControlnetBlock`` projections and
	                the ``MistolineCondDownsamplBlock`` hint encoder.
	            control_latent_channels: Channel count of the control input
	                before 2x2 patchification. ``None`` falls back to
	                ``self.in_channels`` unchanged.
	            image_model: Accepted but not used by this constructor.
	            dtype: Forwarded to the parent and to every created layer.
	            device: Forwarded to the parent and to every created layer.
	            operations: Namespace providing ``Linear``, ``Conv2d`` and
	                ``Embedding`` layer classes.
	            **kwargs: Forwarded to ``Flux.__init__``.
	        """
	        super().__init__(final_layer=False, dtype=dtype, device=device, operations=operations, **kwargs)

	        # Lengths the "input" / "output" tuples returned by forward_orig are
	        # expanded and truncated to (presumably the block counts of the
	        # main model these outputs are fed into).
	        self.main_model_double = 19
	        self.main_model_single = 38

	        self.mistoline = mistoline

	        # add ControlNet blocks
	        def control_block():
	            if self.mistoline:
	                return MistolineControlnetBlock(self.hidden_size, dtype=dtype, device=device, operations=operations)
	            return operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)

	        self.controlnet_blocks = nn.ModuleList([])
	        for _ in range(self.params.depth):
	            self.controlnet_blocks.append(control_block())

	        self.controlnet_single_blocks = nn.ModuleList([])
	        for _ in range(self.params.depth_single_blocks):
	            self.controlnet_single_blocks.append(control_block())

	        self.num_union_modes = num_union_modes
	        self.controlnet_mode_embedder = None
	        if self.num_union_modes > 0:
	            self.controlnet_mode_embedder = operations.Embedding(self.num_union_modes, self.hidden_size, dtype=dtype, device=device)

	        self.gradient_checkpointing = False
	        self.latent_input = latent_input
	        if control_latent_channels is None:
	            control_latent_channels = self.in_channels
	        else:
	            # forward() folds every 2x2 patch into the channel axis, so each
	            # token carries channels * 2 * 2 features.
	            control_latent_channels *= 2 * 2  # patch size

	        self.pos_embed_input = operations.Linear(control_latent_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
	        if not self.latent_input:
	            if self.mistoline:
	                self.input_cond_block = MistolineCondDownsamplBlock(dtype=dtype, device=device, operations=operations)
	            else:
	                self.input_hint_block = nn.Sequential(
	                    operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
	                    nn.SiLU(),
	                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
	                    nn.SiLU(),
	                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
	                    nn.SiLU(),
	                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
	                    nn.SiLU(),
	                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
	                    nn.SiLU(),
	                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
	                    nn.SiLU(),
	                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
	                    nn.SiLU(),
	                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
	                )

	    def _spread_outputs(self, outputs, target_len):
	        """Expand ``outputs`` to ``target_len`` entries.

	        With ``latent_input`` each entry is repeated consecutively
	        (``a, a, b, b``); otherwise the whole tuple is tiled
	        (``a, b, a, b``). The result is truncated to ``target_len``.
	        Raises ``ZeroDivisionError`` when ``outputs`` is empty.
	        """
	        n_repeat = math.ceil(target_len / len(outputs))
	        if self.latent_input:
	            spread = ()
	            for entry in outputs:
	                spread += (entry,) * n_repeat
	        else:
	            spread = outputs * n_repeat
	        return spread[:target_len]

	    def forward_orig(
	        self,
	        img: Tensor,
	        img_ids: Tensor,
	        controlnet_cond: Tensor,
	        txt: Tensor,
	        txt_ids: Tensor,
	        timesteps: Tensor,
	        y: Tensor,
	        guidance: Tensor = None,
	        control_type: Tensor = None,
	    ) -> Tensor:
	        """Run the transformer blocks and collect the control outputs.

	        Args:
	            img: Patchified image tokens; must have 3 dimensions.
	            img_ids: Position ids for the image tokens.
	            controlnet_cond: Patchified control signal; it is projected by
	                ``pos_embed_input`` and added to the embedded ``img``.
	            txt: Text tokens; must have 3 dimensions.
	            txt_ids: Position ids for the text tokens.
	            timesteps: Timesteps fed through ``timestep_embedding``.
	            y: Vector conditioning passed through ``vector_in``.
	            guidance: Guidance values; only read when
	                ``params.guidance_embed`` is set.
	            control_type: Control-mode indices for the mode embedder.
	                ``None`` or an empty sequence disables mode tokens.

	        Returns:
	            A dict with key ``"input"`` (outputs of the double-block
	            projections expanded to ``main_model_double`` entries) and,
	            when there are single blocks, ``"output"`` (expanded to
	            ``main_model_single`` entries).

	        Raises:
	            ValueError: If ``img`` or ``txt`` is not 3-dimensional.
	        """
	        if img.ndim != 3 or txt.ndim != 3:
	            raise ValueError("Input img and txt tensors must have 3 dimensions.")

	        # running on sequences img
	        img = self.img_in(img)

	        controlnet_cond = self.pos_embed_input(controlnet_cond)
	        img = img + controlnet_cond
	        vec = self.time_in(timestep_embedding(timesteps, 256))
	        if self.params.guidance_embed:
	            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
	        vec = vec + self.vector_in(y)
	        txt = self.txt_in(txt)

	        # control_type defaults to None, so guard before taking its length.
	        if self.controlnet_mode_embedder is not None and control_type is not None and len(control_type) > 0:
	            control_cond = self.controlnet_mode_embedder(torch.tensor(control_type, device=img.device), out_dtype=img.dtype).unsqueeze(0).repeat((txt.shape[0], 1, 1))
	            # Mode tokens go in front of the text tokens; the ids of the
	            # first text position are duplicated to keep txt_ids aligned.
	            txt = torch.cat([control_cond, txt], dim=1)
	            txt_ids = torch.cat([txt_ids[:,:1], txt_ids], dim=1)

	        ids = torch.cat((txt_ids, img_ids), dim=1)
	        pe = self.pe_embedder(ids)

	        controlnet_double = ()

	        for i, block in enumerate(self.double_blocks):
	            img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
	            controlnet_double = controlnet_double + (self.controlnet_blocks[i](img),)

	        img = torch.cat((txt, img), 1)

	        controlnet_single = ()

	        for i, block in enumerate(self.single_blocks):
	            img = block(img, vec=vec, pe=pe)
	            # Only the image part (after the text tokens) is projected.
	            controlnet_single = controlnet_single + (self.controlnet_single_blocks[i](img[:, txt.shape[1] :, ...]),)

	        out = {"input": self._spread_outputs(controlnet_double, self.main_model_double)}
	        if len(controlnet_single) > 0:
	            out["output"] = self._spread_outputs(controlnet_single, self.main_model_single)
	        return out

	    def forward(self, x, timesteps, context, y, guidance=None, hint=None, **kwargs):
	        """Patchify ``x`` and ``hint``, build position ids and run ``forward_orig``.

	        Args:
	            x: 4-D input unpacked as ``(batch, channels, height, width)``.
	            timesteps: Forwarded to ``forward_orig``.
	            context: Text tokens; ``context.shape[1]`` sets the number of
	                text position ids.
	            y: Forwarded to ``forward_orig``.
	            guidance: Forwarded to ``forward_orig``.
	            hint: Control input. With ``latent_input`` it is only padded to
	                the patch size; otherwise it is mapped with ``* 2.0 - 1.0``
	                and passed through the hint encoder.
	            **kwargs: Only ``control_type`` is read (default ``[]``).

	        Returns:
	            The dict produced by ``forward_orig``.
	        """
	        patch_size = 2
	        if self.latent_input:
	            hint = comfy.ldm.common_dit.pad_to_patch_size(hint, (patch_size, patch_size))
	        elif self.mistoline:
	            hint = hint * 2.0 - 1.0
	            hint = self.input_cond_block(hint)
	        else:
	            hint = hint * 2.0 - 1.0
	            hint = self.input_hint_block(hint)

	        # Fold every patch_size x patch_size patch into the channel axis.
	        hint = rearrange(hint, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)

	        bs, c, h, w = x.shape
	        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))

	        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)

	        # Patch-grid size, rounded up so padded rows/columns are counted.
	        h_len = ((h + (patch_size // 2)) // patch_size)
	        w_len = ((w + (patch_size // 2)) // patch_size)
	        # Channel 1 holds the patch row index, channel 2 the column index;
	        # channel 0 stays zero.
	        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
	        img_ids[..., 1] = img_ids[..., 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype)[:, None]
	        img_ids[..., 2] = img_ids[..., 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype)[None, :]
	        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)

	        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
	        return self.forward_orig(img, img_ids, hint, context, txt_ids, timesteps, y, guidance, control_type=kwargs.get("control_type", []))