# Copyright 2025 Xiaomi Corp. (authors: Han Zhu)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

from zipvoice.models.modules.zipformer_two_stream import TTSZipformerTwoStream
from zipvoice.models.zipvoice import ZipVoice
from zipvoice.utils.common import condition_time_mask_suffix, make_pad_mask, pad_labels

class ZipVoiceDialog(ZipVoice):
    """The ZipVoice-Dialog model."""

    def __init__(
        self,
        fm_decoder_downsampling_factor: List[int] = [1, 2, 4, 2, 1],
        fm_decoder_num_layers: List[int] = [2, 2, 4, 4, 4],
        fm_decoder_cnn_module_kernel: List[int] = [31, 15, 7, 15, 31],
        fm_decoder_feedforward_dim: int = 1536,
        fm_decoder_num_heads: int = 4,
        fm_decoder_dim: int = 512,
        text_encoder_num_layers: int = 4,
        text_encoder_feedforward_dim: int = 512,
        text_encoder_cnn_module_kernel: int = 9,
        text_encoder_num_heads: int = 4,
        text_encoder_dim: int = 192,
        time_embed_dim: int = 192,
        text_embed_dim: int = 192,
        query_head_dim: int = 32,
        value_head_dim: int = 12,
        pos_head_dim: int = 4,
        pos_dim: int = 48,
        feat_dim: int = 100,
        vocab_size: int = 26,
        pad_id: int = 0,
        spk_a_id: int = 360,
        spk_b_id: int = 361,
    ):
        """
        Initialize the model with specified configuration parameters.

        Args:
            fm_decoder_downsampling_factor: List of downsampling factors for each layer
                in the flow-matching decoder.
            fm_decoder_num_layers: List of the number of layers for each block in the
                flow-matching decoder.
            fm_decoder_cnn_module_kernel: List of kernel sizes for CNN modules in the
                flow-matching decoder.
            fm_decoder_feedforward_dim: Dimension of the feedforward network in the
                flow-matching decoder.
            fm_decoder_num_heads: Number of attention heads in the flow-matching
                decoder.
            fm_decoder_dim: Hidden dimension of the flow-matching decoder.
            text_encoder_num_layers: Number of layers in the text encoder.
            text_encoder_feedforward_dim: Dimension of the feedforward network in the
                text encoder.
            text_encoder_cnn_module_kernel: Kernel size for the CNN module in the
                text encoder.
            text_encoder_num_heads: Number of attention heads in the text encoder.
            text_encoder_dim: Hidden dimension of the text encoder.
            time_embed_dim: Dimension of the time embedding.
            text_embed_dim: Dimension of the text embedding.
            query_head_dim: Dimension of the query attention head.
            value_head_dim: Dimension of the value attention head.
            pos_head_dim: Dimension of the position attention head.
            pos_dim: Dimension of the positional encoding.
            feat_dim: Dimension of the acoustic features.
            vocab_size: Size of the vocabulary.
            pad_id: ID used for padding tokens.
            spk_a_id: ID of speaker A / [S1].
            spk_b_id: ID of speaker B / [S2].
        """
        super().__init__(
            fm_decoder_downsampling_factor=fm_decoder_downsampling_factor,
            fm_decoder_num_layers=fm_decoder_num_layers,
            fm_decoder_cnn_module_kernel=fm_decoder_cnn_module_kernel,
            fm_decoder_feedforward_dim=fm_decoder_feedforward_dim,
            fm_decoder_num_heads=fm_decoder_num_heads,
            fm_decoder_dim=fm_decoder_dim,
            text_encoder_num_layers=text_encoder_num_layers,
            text_encoder_feedforward_dim=text_encoder_feedforward_dim,
            text_encoder_cnn_module_kernel=text_encoder_cnn_module_kernel,
            text_encoder_num_heads=text_encoder_num_heads,
            text_encoder_dim=text_encoder_dim,
            time_embed_dim=time_embed_dim,
            text_embed_dim=text_embed_dim,
            query_head_dim=query_head_dim,
            value_head_dim=value_head_dim,
            pos_head_dim=pos_head_dim,
            pos_dim=pos_dim,
            feat_dim=feat_dim,
            vocab_size=vocab_size,
            pad_id=pad_id,
        )
        self.spk_a_id = spk_a_id
        self.spk_b_id = spk_b_id
        self.spk_embed = nn.Embedding(2, feat_dim)
        torch.nn.init.normal_(self.spk_embed.weight, mean=0, std=0.1)

    def extract_spk_indices(self, tensor):
        turn_mask = ((tensor == self.spk_a_id) | (tensor == self.spk_b_id)).long()
        turn_counts = turn_mask.cumsum(dim=1)
        spk_mask = turn_counts % 2
        spk_mask = torch.where(tensor == self.pad_id, -1, spk_mask)
        spk_a_indices = torch.where(spk_mask == 0)
        spk_b_indices = torch.where(spk_mask == 1)
        return spk_a_indices, spk_b_indices
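
    # Worked illustration (not executed; hypothetical token ids): with
    # spk_a_id=360 ([S1]), spk_b_id=361 ([S2]) and pad_id=0, the padded row
    #     [360, 11, 12, 361, 13, 0]
    # gives turn_mask   = [1, 0, 0, 1, 0, 0]
    #       turn_counts = [1, 1, 1, 2, 2, 2]
    #       spk_mask    = [1, 1, 1, 0, 0, -1]
    # so positions alternate between the two speaker-embedding indices at each
    # [S1]/[S2] turn token, while padding positions (-1) fall in neither group.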
    def forward_text_embed(
        self,
        tokens: List[List[int]],
    ):
        """
        Get the text embeddings.

        Args:
            tokens: a list of lists of token ids.
        Returns:
            embed: the text embeddings, shape (batch, seq_len, emb_dim).
            tokens_lens: the length of each token sequence, shape (batch,).
        """
        device = (
            self.device if isinstance(self, DDP) else next(self.parameters()).device
        )
        tokens_padded = pad_labels(tokens, pad_id=self.pad_id, device=device)  # (B, S)
        embed = self.embed(tokens_padded)  # (B, S, C)
        spk_a_indices, spk_b_indices = self.extract_spk_indices(tokens_padded)
        tokens_lens = torch.tensor(
            [len(token) for token in tokens], dtype=torch.int64, device=device
        )
        tokens_padding_mask = make_pad_mask(tokens_lens, embed.shape[1])  # (B, S)
        embed = self.text_encoder(
            x=embed, t=None, padding_mask=tokens_padding_mask
        )  # (B, S, C)
        embed[spk_a_indices] += self.spk_embed(torch.tensor(0, device=device)).to(
            embed.dtype
        )
        embed[spk_b_indices] += self.spk_embed(torch.tensor(1, device=device)).to(
            embed.dtype
        )
        return embed, tokens_lens
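
    # Shape sketch (illustrative, hypothetical sizes): for tokens = [[360, 11],
    # [360, 12, 13]] the padded batch is (2, 3), tokens_lens is tensor([2, 3]),
    # and embed comes back from the text encoder as (2, 3, C).  The in-place
    # additions broadcast a (feat_dim,) speaker-embedding row onto the selected
    # positions, which presumes the encoder output dimension C equals feat_dim.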
    def forward(
        self,
        tokens: List[List[int]],
        features: torch.Tensor,
        features_lens: torch.Tensor,
        noise: torch.Tensor,
        t: torch.Tensor,
        condition_drop_ratio: float = 0.0,
    ) -> torch.Tensor:
        """Forward pass of the model for training.

        Args:
            tokens: a list of lists of token ids.
            features: the acoustic features, with the shape (batch, seq_len, feat_dim).
            features_lens: the length of each acoustic feature sequence, shape (batch,).
            noise: the initial noise, with the shape (batch, seq_len, feat_dim).
            t: the time step, with the shape (batch, 1, 1).
            condition_drop_ratio: the ratio of dropped text condition.
        Returns:
            fm_loss: the flow-matching loss.
        """
        (text_condition, padding_mask,) = self.forward_text_train(
            tokens=tokens,
            features_lens=features_lens,
        )
        speech_condition_mask = condition_time_mask_suffix(
            features_lens=features_lens,
            mask_percent=(0.5, 1.0),
            max_len=features.size(1),
        )
        speech_condition = torch.where(speech_condition_mask.unsqueeze(-1), 0, features)
        if condition_drop_ratio > 0.0:
            drop_mask = (
                torch.rand(text_condition.size(0), 1, 1).to(text_condition.device)
                > condition_drop_ratio
            )
            text_condition = text_condition * drop_mask
        xt = features * t + noise * (1 - t)
        ut = features - noise  # (B, T, F)
        vt = self.forward_fm_decoder(
            t=t,
            xt=xt,
            text_condition=text_condition,
            speech_condition=speech_condition,
            padding_mask=padding_mask,
        )
        loss_mask = speech_condition_mask & (~padding_mask)
        fm_loss = torch.mean((vt[loss_mask] - ut[loss_mask]) ** 2)
        return fm_loss
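
    # Flow-matching sketch: writing x1 = features and x0 = noise, the decoder
    # is fed the linear interpolation xt = t * x1 + (1 - t) * x0 and trained to
    # predict the constant velocity ut = x1 - x0 = d(xt)/dt.  The MSE is taken
    # only over frames that are both inside speech_condition_mask (the span to
    # be generated) and not padding, so conditioning frames carry no loss.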

class ZipVoiceDialogStereo(ZipVoiceDialog):
    """The stereo (two-channel) variant of the ZipVoice-Dialog model."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        required_params = {
            "feat_dim",
            "fm_decoder_downsampling_factor",
            "fm_decoder_num_layers",
            "fm_decoder_cnn_module_kernel",
            "fm_decoder_dim",
            "fm_decoder_feedforward_dim",
            "fm_decoder_num_heads",
            "query_head_dim",
            "pos_head_dim",
            "value_head_dim",
            "pos_dim",
            "time_embed_dim",
        }
        missing = [p for p in required_params if p not in kwargs]
        if missing:
            raise ValueError(f"Missing required parameters: {', '.join(missing)}")
        self.fm_decoder = TTSZipformerTwoStream(
            in_dim=(kwargs["feat_dim"] * 5, kwargs["feat_dim"] * 3),
            out_dim=(kwargs["feat_dim"] * 2, kwargs["feat_dim"]),
            downsampling_factor=kwargs["fm_decoder_downsampling_factor"],
            num_encoder_layers=kwargs["fm_decoder_num_layers"],
            cnn_module_kernel=kwargs["fm_decoder_cnn_module_kernel"],
            encoder_dim=kwargs["fm_decoder_dim"],
            feedforward_dim=kwargs["fm_decoder_feedforward_dim"],
            num_heads=kwargs["fm_decoder_num_heads"],
            query_head_dim=kwargs["query_head_dim"],
            pos_head_dim=kwargs["pos_head_dim"],
            value_head_dim=kwargs["value_head_dim"],
            pos_dim=kwargs["pos_dim"],
            use_time_embed=True,
            time_embed_dim=kwargs["time_embed_dim"],
        )
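
    # Construction note (illustration, hypothetical values): the parameters in
    # required_params must be supplied as keyword arguments, e.g.
    #     model = ZipVoiceDialogStereo(
    #         feat_dim=100,
    #         fm_decoder_downsampling_factor=[1, 2, 4, 2, 1],
    #         fm_decoder_num_layers=[2, 2, 4, 4, 4],
    #         ...,
    #     )
    # because the check inspects **kwargs only; relying on the parent defaults
    # (or passing these values positionally) raises the "Missing required
    # parameters" ValueError even though ZipVoiceDialog defines defaults.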
    def forward(
        self,
        tokens: List[List[int]],
        features: torch.Tensor,
        features_lens: torch.Tensor,
        noise: torch.Tensor,
        t: torch.Tensor,
        condition_drop_ratio: float = 0.0,
        se_weight: float = 1.0,
    ) -> torch.Tensor:
        """Forward pass of the model for training.

        Args:
            tokens: a list of lists of token ids.
            features: the acoustic features, with the shape (batch, seq_len, feat_dim).
            features_lens: the length of each acoustic feature sequence, shape (batch,).
            noise: the initial noise, with the shape (batch, seq_len, feat_dim).
            t: the time step, with the shape (batch, 1, 1).
            condition_drop_ratio: the ratio of dropped text condition.
            se_weight: the weight of the speaker-exclusive loss.
        Returns:
            loss: the flow-matching loss, plus the weighted speaker-exclusive
                loss when se_weight > 0.
        """
        (text_condition, padding_mask,) = self.forward_text_train(
            tokens=tokens,
            features_lens=features_lens,
        )
        speech_condition_mask = condition_time_mask_suffix(
            features_lens=features_lens,
            mask_percent=(0.5, 1.0),
            max_len=features.size(1),
        )
        speech_condition = torch.where(speech_condition_mask.unsqueeze(-1), 0, features)
        if condition_drop_ratio > 0.0:
            drop_mask = (
                torch.rand(text_condition.size(0), 1, 1).to(text_condition.device)
                > condition_drop_ratio
            )
            text_condition = text_condition * drop_mask
        xt = features * t + noise * (1 - t)
        ut = features - noise  # (B, T, F)
        vt = self.forward_fm_decoder(
            t=t,
            xt=xt,
            text_condition=text_condition,
            speech_condition=speech_condition,
            padding_mask=padding_mask,
        )
        loss_mask = speech_condition_mask & (~padding_mask)
        fm_loss = torch.mean((vt[loss_mask] - ut[loss_mask]) ** 2)
        if se_weight > 0:
            target = xt + vt * (1 - t)
            fbank_1 = target[:, :, : self.feat_dim]
            fbank_2 = target[:, :, self.feat_dim :]
            energy_loss = torch.mean(
                self.energy_based_loss(fbank_1, fbank_2, features)[loss_mask]
            )
            loss = fm_loss + energy_loss * se_weight
        else:
            loss = fm_loss
        return loss
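
    # Note on the speaker-exclusive term: xt + vt * (1 - t) is the model's
    # current one-step estimate of the clean features (it equals x1 exactly
    # when vt matches the true velocity x1 - x0).  That estimate is split into
    # the two per-speaker fbank channels and penalised, via energy_based_loss
    # below, on frames where both channels are predicted to be loud at once.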
    def energy_based_loss(self, fbank1, fbank2, gt_fbank):
        # Per-frame energies of the two predicted channels.
        energy1 = self.energy(fbank1)
        energy2 = self.energy(fbank2)
        # Per-utterance energy threshold estimated from both ground-truth
        # channels, concatenated along the time axis.
        energy_thresholds = self.adaptive_threshold_from_gt(
            torch.cat(
                [
                    gt_fbank[:, :, : self.feat_dim],
                    gt_fbank[:, :, self.feat_dim :],
                ],
                dim=1,
            )
        )
        # Penalise frames where both channels exceed the threshold, scaled by
        # how far each channel is above it.
        both_speaking = (
            (energy1 > energy_thresholds) & (energy2 > energy_thresholds)
        ).float()
        penalty = (
            both_speaking
            * (energy1 - energy_thresholds)
            * (energy2 - energy_thresholds)
        )
        return penalty

    def energy(self, fbank):
        # Frame-level energy proxy: mean over the feature bins.
        return torch.mean(fbank, dim=-1)

    def adaptive_threshold_from_gt(self, gt_fbank, percentile=50):
        # Per-utterance threshold: the given percentile (default: the median)
        # of the ground-truth frame energies.
        frame_energies = self.energy(gt_fbank)
        thresholds = torch.quantile(frame_energies, q=percentile / 100, dim=1)
        return thresholds.unsqueeze(1)
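

# Minimal numeric sketch (illustration only): this mirrors the energy-based
# penalty above with plain tensors instead of a model instance, and assumes
# nothing beyond torch itself; all sizes below are made up.
if __name__ == "__main__":
    feat_dim = 4
    # Predicted per-speaker fbank channels and stereo ground truth (random).
    fbank1 = torch.randn(1, 10, feat_dim)
    fbank2 = torch.randn(1, 10, feat_dim)
    gt_fbank = torch.randn(1, 10, 2 * feat_dim)

    def frame_energy(fbank: torch.Tensor) -> torch.Tensor:
        return torch.mean(fbank, dim=-1)

    # Median ground-truth frame energy over both channels -> one threshold per
    # utterance, broadcast over time.
    gt_energy = frame_energy(
        torch.cat([gt_fbank[:, :, :feat_dim], gt_fbank[:, :, feat_dim:]], dim=1)
    )
    threshold = torch.quantile(gt_energy, q=0.5, dim=1).unsqueeze(1)

    energy1, energy2 = frame_energy(fbank1), frame_energy(fbank2)
    both_speaking = ((energy1 > threshold) & (energy2 > threshold)).float()
    penalty = both_speaking * (energy1 - threshold) * (energy2 - threshold)
    print(penalty.shape)  # torch.Size([1, 10]); non-zero only on overlap frames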