import os
import datetime
from typing import Any, Tuple

import torch
import torch.distributed as dist
from torch import Tensor
from flash_attn.flash_attn_interface import flash_attn_varlen_func


class COMM_INFO:

    def __init__(self):
        self.group = None
        self.sp_size = 1
        self.global_rank = 0
        self.rank_within_group = 0
        self.group_id = 0


nccl_info = COMM_INFO()
_SEQUENCE_PARALLEL_STATE = False


def get_cu_seqlens(text_mask, img_len):
    """Calculate cu_seqlens_q, cu_seqlens_kv using text_mask and img_len.

    Args:
        text_mask (torch.Tensor): the mask of text
        img_len (int): the length of image

    Returns:
        torch.Tensor: the calculated cu_seqlens for flash attention
    """
    batch_size = text_mask.shape[0]
    text_len = text_mask.sum(dim=1)
    max_len = text_mask.shape[1] + img_len

    cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda")

    for i in range(batch_size):
        s = text_len[i] + img_len
        s1 = i * max_len + s
        s2 = (i + 1) * max_len
        cu_seqlens[2 * i + 1] = s1
        cu_seqlens[2 * i + 2] = s2

    return cu_seqlens
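# Illustrative example (added for clarity, not part of the original module):
# for a batch of 2 with max text length 4, img_len = 6, and text_mask rows
# [1, 1, 1, 0] and [1, 1, 0, 0], max_len = 10 and the returned cu_seqlens is
#     [0, 9, 10, 18, 20]
# i.e. every sample contributes two segments: its valid (image + text) tokens,
# then its padding; flash_attn_varlen_func treats these as separate
# variable-length sequences.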
def initialize_sequence_parallel_state(sequence_parallel_size):
    global _SEQUENCE_PARALLEL_STATE
    if sequence_parallel_size > 1:
        _SEQUENCE_PARALLEL_STATE = True
        initialize_sequence_parallel_group(sequence_parallel_size)
    else:
        nccl_info.sp_size = 1
        nccl_info.global_rank = int(os.getenv("RANK", "0"))
        nccl_info.rank_within_group = 0
        nccl_info.group_id = int(os.getenv("RANK", "0"))


def get_sequence_parallel_state():
    return _SEQUENCE_PARALLEL_STATE


def initialize_sequence_parallel_group(sequence_parallel_size):
    """Initialize the sequence parallel group."""
    rank = int(os.getenv("RANK", "0"))
    world_size = int(os.getenv("WORLD_SIZE", "1"))
    assert (
        world_size % sequence_parallel_size == 0
    ), "world_size must be divisible by sequence_parallel_size, but got world_size: {}, sequence_parallel_size: {}".format(
        world_size, sequence_parallel_size)
    nccl_info.sp_size = sequence_parallel_size
    nccl_info.global_rank = rank
    num_sequence_parallel_groups: int = world_size // sequence_parallel_size
    for i in range(num_sequence_parallel_groups):
        ranks = range(i * sequence_parallel_size, (i + 1) * sequence_parallel_size)
        group = dist.new_group(ranks)
        if rank in ranks:
            nccl_info.group = group
            nccl_info.rank_within_group = rank - i * sequence_parallel_size
            nccl_info.group_id = i
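# Illustrative grouping (added for clarity): with WORLD_SIZE = 8 and
# sequence_parallel_size = 4, two groups are created, ranks {0, 1, 2, 3} and
# ranks {4, 5, 6, 7}; rank 5 then gets nccl_info.group_id = 1 and
# nccl_info.rank_within_group = 1. Every rank must call dist.new_group for
# every group, including the ones it does not belong to, which is why the loop
# is not short-circuited.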
def initialize_distributed(seed):
    # Note: RANK is used as the CUDA device index, which assumes a single-node
    # launch where RANK equals the local device index.
    local_rank = int(os.getenv("RANK", "0"))
    world_size = int(os.getenv("WORLD_SIZE", "1"))
    torch.cuda.set_device(local_rank)
    dist.init_process_group(
        backend="nccl",
        init_method="env://",
        timeout=datetime.timedelta(seconds=2**31 - 1),
        world_size=world_size,
        rank=local_rank,
    )
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    initialize_sequence_parallel_state(world_size)
def _all_to_all_4D(input: torch.Tensor, scatter_idx: int = 2, gather_idx: int = 1, group=None) -> torch.Tensor:
    """
    all-to-all for QKV

    Args:
        input (torch.Tensor): a tensor sharded along the scatter dim
        scatter_idx (int): default 2
        gather_idx (int): default 1
        group: torch process group

    Returns:
        torch.Tensor: resharded tensor, (bs, seqlen, hc/P, hs) when
        scatter_idx == 2 and (bs, seqlen/P, hc, hs) when scatter_idx == 1
    """
    assert (input.dim() == 4), f"input must be 4D tensor, got {input.dim()} and shape {input.shape}"

    seq_world_size = dist.get_world_size(group)

    if scatter_idx == 2 and gather_idx == 1:
        # input: a tensor sharded along dim 1 (bs, seqlen/P, hc, hs); output: (bs, seqlen, hc/P, hs)
        bs, shard_seqlen, hc, hs = input.shape
        seqlen = shard_seqlen * seq_world_size
        shard_hc = hc // seq_world_size

        # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
        # (bs, seqlen/P, hc, hs) -reshape-> (bs, seq_len/P, P, hc/P, hs) -transpose(0,2)-> (P, seq_len/P, bs, hc/P, hs)
        input_t = (input.reshape(bs, shard_seqlen, seq_world_size, shard_hc, hs).transpose(0, 2).contiguous())

        output = torch.empty_like(input_t)
        # https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
        # (P, seq_len/P, bs, hc/P, hs) scatter seqlen -all2all-> (P, seq_len/P, bs, hc/P, hs) scatter head
        if seq_world_size > 1:
            dist.all_to_all_single(output, input_t, group=group)
            torch.cuda.synchronize()
        else:
            output = input_t

        # merge the P and seqlen/P dims back into the full sequence length
        output = output.reshape(seqlen, bs, shard_hc, hs)

        # (seq_len, bs, hc/P, hs) -reshape-> (bs, seq_len, hc/P, hs)
        output = output.transpose(0, 1).contiguous().reshape(bs, seqlen, shard_hc, hs)

        return output

    elif scatter_idx == 1 and gather_idx == 2:
        # input: a tensor sharded along dim 2 (bs, seqlen, hc/P, hs); output: (bs, seqlen/P, hc, hs)
        bs, seqlen, shard_hc, hs = input.shape
        hc = shard_hc * seq_world_size
        shard_seqlen = seqlen // seq_world_size

        # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
        # (bs, seqlen, hc/P, hs) -reshape-> (bs, P, seq_len/P, hc/P, hs) -transpose(0, 3)-> (hc/P, P, seqlen/P, bs, hs)
        # -transpose(0, 1)-> (P, hc/P, seqlen/P, bs, hs)
        input_t = (input.reshape(bs, seq_world_size, shard_seqlen, shard_hc, hs)
                   .transpose(0, 3)
                   .transpose(0, 1)
                   .contiguous()
                   .reshape(seq_world_size, shard_hc, shard_seqlen, bs, hs))

        output = torch.empty_like(input_t)
        # https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
        # (P, hc/P, seqlen/P, bs, hs) scatter head -all2all-> (P, hc/P, seqlen/P, bs, hs) scatter seqlen
        if seq_world_size > 1:
            dist.all_to_all_single(output, input_t, group=group)
            torch.cuda.synchronize()
        else:
            output = input_t

        # merge the P and hc/P dims back into the full head count
        output = output.reshape(hc, shard_seqlen, bs, hs)

        # (hc, seqlen/P, bs, hs) -transpose(0, 2)-> (bs, seqlen/P, hc, hs)
        output = output.transpose(0, 2).contiguous().reshape(bs, shard_seqlen, hc, hs)

        return output
    else:
        raise RuntimeError("scatter_idx must be 1 or 2 and gather_idx must be 1 or 2")
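# Worked shape example (added for clarity, assuming a sequence-parallel group
# of size 8): with scatter_idx=2 / gather_idx=1, a per-rank tensor of shape
# (2, 4032, 24, 128) becomes (2, 32256, 3, 128): every rank ends up with the
# full 32256-token sequence but only 3 of the 24 heads. The scatter_idx=1 /
# gather_idx=2 branch is the exact inverse and restores (2, 4032, 24, 128).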
class SeqAllToAll4D(torch.autograd.Function):

    @staticmethod
    def forward(
        ctx: Any,
        group: dist.ProcessGroup,
        input: Tensor,
        scatter_idx: int,
        gather_idx: int,
    ) -> Tensor:
        ctx.group = group
        ctx.scatter_idx = scatter_idx
        ctx.gather_idx = gather_idx
        return _all_to_all_4D(input, scatter_idx, gather_idx, group=group)

    @staticmethod
    def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]:
        return (
            None,
            SeqAllToAll4D.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx),
            None,
            None,
        )


def all_to_all_4D(
    input_: torch.Tensor,
    scatter_dim: int = 2,
    gather_dim: int = 1,
):
    return SeqAllToAll4D.apply(nccl_info.group, input_, scatter_dim, gather_dim)
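# Minimal usage sketch (illustrative, assumes initialize_sequence_parallel_state
# has already been called with sp_size > 1 so nccl_info.group is set):
#
#   q_local = torch.randn(2, 4032, 24, 128, device="cuda", dtype=torch.bfloat16)
#   q_full_seq = all_to_all_4D(q_local, scatter_dim=2, gather_dim=1)   # (2, 32256, 3, 128)
#   q_back = all_to_all_4D(q_full_seq, scatter_dim=1, gather_dim=2)    # (2, 4032, 24, 128)
#
# Because SeqAllToAll4D.backward swaps scatter_idx and gather_idx, gradients
# flow back through the same collective in reverse.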
def _all_to_all(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    scatter_dim: int,
    gather_dim: int,
):
    input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
    output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
    dist.all_to_all(output_list, input_list, group=group)
    return torch.cat(output_list, dim=gather_dim).contiguous()


class _AllToAll(torch.autograd.Function):
    """All-to-all communication.

    Args:
        input_: input matrix
        process_group: communication group
        scatter_dim: scatter dimension
        gather_dim: gather dimension
    """

    @staticmethod
    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
        ctx.process_group = process_group
        ctx.scatter_dim = scatter_dim
        ctx.gather_dim = gather_dim
        ctx.world_size = dist.get_world_size(process_group)
        output = _all_to_all(input_, ctx.world_size, process_group, scatter_dim, gather_dim)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        grad_output = _all_to_all(
            grad_output,
            ctx.world_size,
            ctx.process_group,
            ctx.gather_dim,
            ctx.scatter_dim,
        )
        return (
            grad_output,
            None,
            None,
            None,
        )


def all_to_all(
    input_: torch.Tensor,
    scatter_dim: int = 2,
    gather_dim: int = 1,
):
    return _AllToAll.apply(input_, nccl_info.group, scatter_dim, gather_dim)
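# Illustrative shape example for all_to_all (added for clarity, assuming
# sp_size = 8): with scatter_dim=2 and gather_dim=1, a per-rank tensor of
# shape (2, 4032, 24, 128) is split into 8 chunks of 3 heads each, exchanged
# with dist.all_to_all, and concatenated along dim 1 into (2, 32256, 3, 128),
# the same logical resharding that all_to_all_4D performs with a single fused
# dist.all_to_all_single call.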
class _AllGather(torch.autograd.Function):
    """All-gather communication with autograd support.

    Args:
        input_: input tensor
        dim: dimension along which to concatenate
    """

    @staticmethod
    def forward(ctx, input_, dim):
        ctx.dim = dim
        world_size = nccl_info.sp_size
        group = nccl_info.group
        input_size = list(input_.size())
        ctx.input_size = input_size[dim]

        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
        input_ = input_.contiguous()
        dist.all_gather(tensor_list, input_, group=group)

        output = torch.cat(tensor_list, dim=dim)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        world_size = nccl_info.sp_size
        rank = nccl_info.rank_within_group
        dim = ctx.dim
        input_size = ctx.input_size

        sizes = [input_size] * world_size
        grad_input_list = torch.split(grad_output, sizes, dim=dim)
        grad_input = grad_input_list[rank]
        return grad_input, None


def all_gather(input_: torch.Tensor, dim: int = 1):
    """Performs an all-gather operation on the input tensor along the specified dimension.

    Args:
        input_ (torch.Tensor): Input tensor of shape [B, H, S, D].
        dim (int, optional): Dimension along which to concatenate. Defaults to 1.

    Returns:
        torch.Tensor: Output tensor after all-gather operation, concatenated along 'dim'.
    """
    return _AllGather.apply(input_, dim)
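# Illustrative example (added for clarity, assuming sp_size = 8): gathering a
# per-rank tensor of shape (2, 256, 3, 128) with dim=2 yields (2, 256, 24, 128)
# on every rank. In the backward pass each rank keeps only its own
# (2, 256, 3, 128) slice of the incoming gradient, matching the forward layout.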
def parallel_attention(q, k, v, img_q_len, img_kv_len, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv):
    """
    img_q_len, img_kv_len: 32256
    text_mask: 2x256
    query: [2, 32256, 24, 128]
    encoder_query: [2, 256, 24, 128]
    """
    query, encoder_query = q
    key, encoder_key = k
    value, encoder_value = v

    if get_sequence_parallel_state():
        # gather the full image sequence on every rank and shard the heads instead
        query = all_to_all_4D(query, scatter_dim=2, gather_dim=1)  # [2, 32256, 24, 128]
        key = all_to_all_4D(key, scatter_dim=2, gather_dim=1)
        value = all_to_all_4D(value, scatter_dim=2, gather_dim=1)

        def shrink_head(encoder_state, dim):
            local_heads = encoder_state.shape[dim] // nccl_info.sp_size
            return encoder_state.narrow(dim, nccl_info.rank_within_group * local_heads, local_heads)

        encoder_query = shrink_head(encoder_query, dim=2)
        encoder_key = shrink_head(encoder_key, dim=2)
        encoder_value = shrink_head(encoder_value, dim=2)

    sequence_length = query.size(1)  # 32256
    encoder_sequence_length = encoder_query.size(1)  # 256

    query = torch.cat([query, encoder_query], dim=1)
    key = torch.cat([key, encoder_key], dim=1)
    value = torch.cat([value, encoder_value], dim=1)

    bsz = query.shape[0]
    head = query.shape[-2]
    head_dim = query.shape[-1]

    # flatten (B, S, H, D) into (B * S, H, D) for the varlen kernel
    query, key, value = [
        x.view(x.shape[0] * x.shape[1], *x.shape[2:])
        for x in [query, key, value]
    ]

    hidden_states = flash_attn_varlen_func(
        query,
        key,
        value,
        cu_seqlens_q,
        cu_seqlens_kv,
        max_seqlen_q,
        max_seqlen_kv,
    )

    # back to (B, S, H, D)
    hidden_states = hidden_states.view(bsz, max_seqlen_q, head, head_dim).contiguous()
    hidden_states, encoder_hidden_states = hidden_states.split_with_sizes((sequence_length, encoder_sequence_length),
                                                                          dim=1)

    if get_sequence_parallel_state():
        # reshard: scatter the image sequence again and gather the heads back
        hidden_states = all_to_all_4D(hidden_states, scatter_dim=1, gather_dim=2)
        encoder_hidden_states = all_gather(encoder_hidden_states, dim=2).contiguous()

    hidden_states = hidden_states.to(query.dtype)
    encoder_hidden_states = encoder_hidden_states.to(query.dtype)

    attn = torch.cat([hidden_states, encoder_hidden_states], dim=1)
    b, s, _, _ = attn.shape
    attn = attn.reshape(b, s, -1)
    return attn, None
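# End-to-end usage sketch (illustrative only; the variable names and shapes
# below are assumptions, not part of the original module). When sequence
# parallelism is active, img_q/img_k/img_v are this rank's shard of the image
# tokens, while cu_seqlens and the max_seqlen values refer to the gathered
# per-sample lengths:
#
#   initialize_distributed(seed=42)  # sets up NCCL and sequence parallelism
#   cu_seqlens = get_cu_seqlens(text_mask, img_len=32256)
#   attn, _ = parallel_attention(
#       (img_q, txt_q), (img_k, txt_k), (img_v, txt_v),
#       img_q_len=32256, img_kv_len=32256,
#       cu_seqlens_q=cu_seqlens, cu_seqlens_kv=cu_seqlens,
#       max_seqlen_q=32256 + 256, max_seqlen_kv=32256 + 256,
#   )
#
# With sequence parallelism, attn comes back per rank as
# (batch, img_len / sp_size + text_len, heads * head_dim), i.e. the image
# tokens are resharded across the sequence-parallel group.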