# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Compression models or wrapper around existing models.
Also defines the main interface that a model must follow to be usable as an audio tokenizer.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
import logging
import math
from pathlib import Path
import typing as tp
import warnings

import numpy as np
import torch
from torch import nn
from torch import einsum
import torch.nn.functional as F
from torch.nn.utils import spectral_norm, weight_norm
from einops import rearrange, repeat
import omegaconf

try:
    import flashy  # only needed to sync buffers across workers during training
except ImportError:
    flashy = None

CONV_NORMALIZATIONS = frozenset(['none', 'weight_norm', 'spectral_norm',
                                 'time_group_norm'])


def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
    """Convenience function to map an omegaconf configuration to a dictionary.

    Args:
        cfg (omegaconf.DictConfig): Original configuration to map to dict.
    Returns:
        dict: Config as dictionary object.
    """
    dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
    assert isinstance(dct, dict)
    return dct
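

# Illustrative usage sketch (not part of the original module): `dict_from_config`
# resolves omegaconf interpolations and returns a plain dict, e.g.:
#
#     cfg = omegaconf.OmegaConf.create({'sample_rate': 16000, 'hop': '${sample_rate}'})
#     assert dict_from_config(cfg) == {'sample_rate': 16000, 'hop': 16000}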


@dataclass
class QuantizedResult:
    x: torch.Tensor
    codes: torch.Tensor
    bandwidth: torch.Tensor  # bandwidth in kb/s used, per batch item.
    penalty: tp.Optional[torch.Tensor] = None
    metrics: dict = field(default_factory=dict)


class BaseQuantizer(nn.Module):
    """Base class for quantizers."""

    def forward(self, x: torch.Tensor, frame_rate: int) -> QuantizedResult:
        """
        Given input tensor x, returns first the quantized (or approximately quantized)
        representation along with quantized codes, bandwidth, and any penalty term for the loss.
        Finally, this returns a dict of metrics to update logging etc.
        Frame rate must be passed so that the bandwidth is properly computed.
        """
        raise NotImplementedError()

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a given input tensor with the specified sample rate at the given bandwidth."""
        raise NotImplementedError()

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode the given codes to the quantized representation."""
        raise NotImplementedError()

    @property
    def total_codebooks(self):
        """Total number of codebooks."""
        raise NotImplementedError()

    @property
    def num_codebooks(self):
        """Number of active codebooks."""
        raise NotImplementedError()

    def set_num_codebooks(self, n: int):
        """Set the number of active codebooks."""
        raise NotImplementedError()


class CompressionModel(ABC, nn.Module):
    """Base API for all compression models that aim at being used as audio tokenizers
    with a language model.
    """

    @abstractmethod
    def forward(self, x: torch.Tensor) -> QuantizedResult:
        ...

    @abstractmethod
    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
        """See `EncodecModel.encode`."""
        ...

    @abstractmethod
    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
        """See `EncodecModel.decode`."""
        ...

    @abstractmethod
    def decode_latent(self, codes: torch.Tensor):
        """Decode from the discrete codes to continuous latent space."""
        ...

    @property
    @abstractmethod
    def channels(self) -> int:
        ...

    @property
    @abstractmethod
    def frame_rate(self) -> float:
        ...

    @property
    @abstractmethod
    def sample_rate(self) -> int:
        ...

    @property
    @abstractmethod
    def cardinality(self) -> int:
        ...

    @property
    @abstractmethod
    def num_codebooks(self) -> int:
        ...

    @property
    @abstractmethod
    def total_codebooks(self) -> int:
        ...

    @abstractmethod
    def set_num_codebooks(self, n: int):
        """Set the active number of codebooks used by the quantizer."""
        ...


def apply_parametrization_norm(module: nn.Module, norm: str = 'none'):
    assert norm in CONV_NORMALIZATIONS
    if norm == 'weight_norm':
        return weight_norm(module)
    elif norm == 'spectral_norm':
        return spectral_norm(module)
    else:
        # We already checked that `norm` is in CONV_NORMALIZATIONS, so any other
        # choice doesn't need reparametrization.
        return module


def get_norm_module(module: nn.Module, causal: bool = False, norm: str = 'none', **norm_kwargs):
    """Return the proper normalization module. If causal is True, this will ensure the returned
    module is causal, or return an error if the normalization doesn't support causal evaluation.
    """
    assert norm in CONV_NORMALIZATIONS
    if norm == 'time_group_norm':
        if causal:
            raise ValueError("GroupNorm doesn't support causal evaluation.")
        assert isinstance(module, nn.modules.conv._ConvNd)
        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
    else:
        return nn.Identity()


def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
                                 padding_total: int = 0) -> int:
    """See `pad_for_conv1d`."""
    length = x.shape[-1]
    n_frames = (length - kernel_size + padding_total) / stride + 1
    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
    return ideal_length - length


def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0):
    """Pad for a convolution to make sure that the last window is full.
    Extra padding is added at the end. This is required to ensure that we can rebuild
    an output of the same length, as otherwise, even with padding, some time steps
    might get removed.
    For instance, with total padding = 4, kernel size = 4, stride = 2:
        0 0 1 2 3 4 5 0 0   # (0s are padding)
        1   2   3           # (output frames of a convolution, last 0 is never used)
        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
        1 2 3 4             # once you removed padding, we are missing one time step!
    """
    extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
    return F.pad(x, (0, extra_padding))
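

# Worked example (illustrative, not part of the original module): with
# length T = 11, kernel_size = 4, stride = 2 and padding_total = 2, we get
# n_frames = (11 - 4 + 2) / 2 + 1 = 5.5, so the last window is partial.
# ideal_length = (ceil(5.5) - 1) * 2 + (4 - 2) = 12, hence one extra sample of
# right padding makes every window full:
#
#     x = torch.zeros(1, 1, 11)
#     assert get_extra_padding_for_conv1d(x, 4, 2, 2) == 1
#     assert pad_for_conv1d(x, 4, 2, 2).shape[-1] == 12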


def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
    If this is the case, we insert extra 0 padding to the right before the reflection happens.
    """
    length = x.shape[-1]
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    if mode == 'reflect':
        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            x = F.pad(x, (0, extra_pad))
        padded = F.pad(x, paddings, mode, value)
        end = padded.shape[-1] - extra_pad
        return padded[..., :end]
    else:
        return F.pad(x, paddings, mode, value)
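

# Illustrative sketch (not part of the original module): reflect padding in
# PyTorch requires the input to be longer than the padding, so `pad1d` first
# zero-extends short inputs. A direct F.pad would raise here:
#
#     x = torch.ones(1, 1, 2)
#     y = pad1d(x, (3, 3), mode='reflect')
#     assert y.shape[-1] == 2 + 3 + 3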


def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
    """Remove padding from x, handling properly zero padding. Only for 1d!"""
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    assert (padding_left + padding_right) <= x.shape[-1]
    end = x.shape[-1] - padding_right
    return x[..., padding_left: end]


class NormConv1d(nn.Module):
    """Wrapper around Conv1d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """
    def __init__(self, *args, causal: bool = False, norm: str = 'none',
                 norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
        super().__init__()
        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.conv(x)
        x = self.norm(x)
        return x


class NormConv2d(nn.Module):
    """Wrapper around Conv2d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """
    def __init__(self, *args, norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
        super().__init__()
        self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.conv(x)
        x = self.norm(x)
        return x


class NormConvTranspose1d(nn.Module):
    """Wrapper around ConvTranspose1d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """
    def __init__(self, *args, causal: bool = False, norm: str = 'none',
                 norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
        super().__init__()
        self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.convtr(x)
        x = self.norm(x)
        return x


class NormConvTranspose2d(nn.Module):
    """Wrapper around ConvTranspose2d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """
    def __init__(self, *args, norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
        super().__init__()
        self.convtr = apply_parametrization_norm(nn.ConvTranspose2d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)

    def forward(self, x):
        x = self.convtr(x)
        x = self.norm(x)
        return x


class StreamableConv1d(nn.Module):
    """Conv1d with some builtin handling of asymmetric or causal padding
    and normalization.
    """
    def __init__(self, in_channels: int, out_channels: int,
                 kernel_size: int, stride: int = 1, dilation: int = 1,
                 groups: int = 1, bias: bool = True, causal: bool = False,
                 norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {},
                 pad_mode: str = 'reflect'):
        super().__init__()
        # warn user on unusual setup between dilation and stride
        if stride > 1 and dilation > 1:
            warnings.warn("StreamableConv1d has been initialized with stride > 1 and dilation > 1"
                          f" (kernel_size={kernel_size} stride={stride}, dilation={dilation}).")
        self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
                               dilation=dilation, groups=groups, bias=bias, causal=causal,
                               norm=norm, norm_kwargs=norm_kwargs)
        self.causal = causal
        self.pad_mode = pad_mode

    def forward(self, x):
        B, C, T = x.shape
        kernel_size = self.conv.conv.kernel_size[0]
        stride = self.conv.conv.stride[0]
        dilation = self.conv.conv.dilation[0]
        kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
        padding_total = kernel_size - stride
        extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
        if self.causal:
            # Left padding for causal
            x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
        else:
            # Asymmetric padding required for odd strides
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right
            x = pad1d(x, (padding_left, padding_right + extra_padding), mode=self.pad_mode)
        return self.conv(x)
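

# Illustrative sketch (not part of the original module): a causal
# StreamableConv1d pads only on the left, and the extra-padding logic keeps
# exactly ceil(T / stride) output frames regardless of kernel size:
#
#     conv = StreamableConv1d(1, 8, kernel_size=7, stride=2, causal=True)
#     y = conv(torch.randn(1, 1, 100))
#     assert y.shape == (1, 8, 50)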


class StreamableConvTranspose1d(nn.Module):
    """ConvTranspose1d with some builtin handling of asymmetric or causal padding
    and normalization.
    """
    def __init__(self, in_channels: int, out_channels: int,
                 kernel_size: int, stride: int = 1, causal: bool = False,
                 norm: str = 'none', trim_right_ratio: float = 1.,
                 norm_kwargs: tp.Dict[str, tp.Any] = {}):
        super().__init__()
        self.convtr = NormConvTranspose1d(in_channels, out_channels, kernel_size, stride,
                                          causal=causal, norm=norm, norm_kwargs=norm_kwargs)
        self.causal = causal
        self.trim_right_ratio = trim_right_ratio
        assert self.causal or self.trim_right_ratio == 1., \
            "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
        assert self.trim_right_ratio >= 0. and self.trim_right_ratio <= 1.

    def forward(self, x):
        kernel_size = self.convtr.convtr.kernel_size[0]
        stride = self.convtr.convtr.stride[0]
        padding_total = kernel_size - stride
        y = self.convtr(x)
        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
        # removed at the very end, when keeping only the right length for the output,
        # as removing it here would require also passing the length at the matching layer
        # in the encoder.
        if self.causal:
            # Trim the padding on the right according to the specified ratio
            # if trim_right_ratio = 1.0, trim everything from right
            padding_right = math.ceil(padding_total * self.trim_right_ratio)
            padding_left = padding_total - padding_right
            y = unpad1d(y, (padding_left, padding_right))
        else:
            # Asymmetric padding required for odd strides
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right
            y = unpad1d(y, (padding_left, padding_right))
        return y


class StreamableLSTM(nn.Module):
    """LSTM without worrying about the hidden state, nor the layout of the data.
    Expects input as convolutional layout.
    """
    def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
        super().__init__()
        self.skip = skip
        self.lstm = nn.LSTM(dimension, dimension, num_layers)

    def forward(self, x):
        x = x.permute(2, 0, 1)
        y, _ = self.lstm(x)
        if self.skip:
            y = y + x
        y = y.permute(1, 2, 0)
        return y


class SEANetResnetBlock(nn.Module):
    """Residual block from SEANet model.

    Args:
        dim (int): Dimension of the input/output.
        kernel_sizes (list): List of kernel sizes for the convolutions.
        dilations (list): List of dilations for the convolutions.
        activation (str): Activation function.
        activation_params (dict): Parameters to provide to the activation function.
        norm (str): Normalization method.
        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
        causal (bool): Whether to use fully causal convolution.
        pad_mode (str): Padding mode for the convolutions.
        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
        true_skip (bool): Whether to use true skip connection or a simple
            (streamable) convolution as the skip connection.
    """
    def __init__(self, dim: int, kernel_sizes: tp.List[int] = [3, 1], dilations: tp.List[int] = [1, 1],
                 activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, causal: bool = False,
                 pad_mode: str = 'reflect', compress: int = 2, true_skip: bool = True):
        super().__init__()
        assert len(kernel_sizes) == len(dilations), 'Number of kernel sizes should match number of dilations'
        act = getattr(nn, activation)
        hidden = dim // compress
        block = []
        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
            in_chs = dim if i == 0 else hidden
            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
            block += [
                act(**activation_params),
                StreamableConv1d(in_chs, out_chs, kernel_size=kernel_size, dilation=dilation,
                                 norm=norm, norm_kwargs=norm_params,
                                 causal=causal, pad_mode=pad_mode),
            ]
        self.block = nn.Sequential(*block)
        self.shortcut: nn.Module
        if true_skip:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = StreamableConv1d(dim, dim, kernel_size=1, norm=norm, norm_kwargs=norm_params,
                                             causal=causal, pad_mode=pad_mode)

    def forward(self, x):
        return self.shortcut(x) + self.block(x)


class SEANetEncoder(nn.Module):
    """SEANet encoder.

    Args:
        channels (int): Audio channels.
        dimension (int): Intermediate representation dimension.
        n_filters (int): Base width for the model.
        n_residual_layers (int): nb of residual layers.
        ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of
            upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here
            that must match the decoder order. We use the decoder order as some models may only employ the decoder.
        activation (str): Activation function.
        activation_params (dict): Parameters to provide to the activation function.
        norm (str): Normalization method.
        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
        kernel_size (int): Kernel size for the initial convolution.
        last_kernel_size (int): Kernel size for the last convolution.
        residual_kernel_size (int): Kernel size for the residual layers.
        dilation_base (int): How much to increase the dilation with each layer.
        causal (bool): Whether to use fully causal convolution.
        pad_mode (str): Padding mode for the convolutions.
        true_skip (bool): Whether to use true skip connection or a simple
            (streamable) convolution as the skip connection in the residual network blocks.
        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
        lstm (int): Number of LSTM layers at the end of the encoder.
        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
            For the encoder, it corresponds to the N first blocks.
    """
    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
                 disable_norm_outer_blocks: int = 0):
        super().__init__()
        self.channels = channels
        self.dimension = dimension
        self.n_filters = n_filters
        self.ratios = list(reversed(ratios))
        del ratios
        self.n_residual_layers = n_residual_layers
        self.hop_length = np.prod(self.ratios)
        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
        self.disable_norm_outer_blocks = disable_norm_outer_blocks
        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
            "Number of blocks for which to disable norm is invalid. " \
            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."

        act = getattr(nn, activation)
        mult = 1
        model: tp.List[nn.Module] = [
            StreamableConv1d(channels, mult * n_filters, kernel_size,
                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
        ]
        # Downsample to raw audio scale
        for i, ratio in enumerate(self.ratios):
            block_norm = 'none' if self.disable_norm_outer_blocks >= i + 2 else norm
            # Add residual layers
            for j in range(n_residual_layers):
                model += [
                    SEANetResnetBlock(mult * n_filters, kernel_sizes=[residual_kernel_size, 1],
                                      dilations=[dilation_base ** j, 1],
                                      norm=block_norm, norm_params=norm_params,
                                      activation=activation, activation_params=activation_params,
                                      causal=causal, pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
            # Add downsampling layers
            model += [
                act(**activation_params),
                StreamableConv1d(mult * n_filters, mult * n_filters * 2,
                                 kernel_size=ratio * 2, stride=ratio,
                                 norm=block_norm, norm_kwargs=norm_params,
                                 causal=causal, pad_mode=pad_mode),
            ]
            mult *= 2

        if lstm:
            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]

        model += [
            act(**activation_params),
            StreamableConv1d(mult * n_filters, dimension, last_kernel_size,
                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
        ]

        self.model = nn.Sequential(*model)

    def forward(self, x):
        return self.model(x)
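

# Illustrative sketch (not part of the original module): with the default
# ratios [8, 5, 4, 2] the encoder downsamples by hop_length = 8 * 5 * 4 * 2 = 320,
# mapping 1 s of 16 kHz mono audio to 50 latent frames of size `dimension`:
#
#     enc = SEANetEncoder(channels=1, dimension=128, ratios=[8, 5, 4, 2])
#     emb = enc(torch.randn(1, 1, 16000))
#     assert emb.shape == (1, 128, 50)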


class SEANetDecoder(nn.Module):
    """SEANet decoder.

    Args:
        channels (int): Audio channels.
        dimension (int): Intermediate representation dimension.
        n_filters (int): Base width for the model.
        n_residual_layers (int): nb of residual layers.
        ratios (Sequence[int]): kernel size and stride ratios.
        activation (str): Activation function.
        activation_params (dict): Parameters to provide to the activation function.
        final_activation (str): Final activation function after all convolutions.
        final_activation_params (dict): Parameters to provide to the activation function.
        norm (str): Normalization method.
        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
        kernel_size (int): Kernel size for the initial convolution.
        last_kernel_size (int): Kernel size for the last convolution.
        residual_kernel_size (int): Kernel size for the residual layers.
        dilation_base (int): How much to increase the dilation with each layer.
        causal (bool): Whether to use fully causal convolution.
        pad_mode (str): Padding mode for the convolutions.
        true_skip (bool): Whether to use true skip connection or a simple
            (streamable) convolution as the skip connection in the residual network blocks.
        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
        lstm (int): Number of LSTM layers at the start of the decoder.
        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
            For the decoder, it corresponds to the N last blocks.
        trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup.
            If equal to 1.0, it means that all the trimming is done at the right.
    """
    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
                 final_activation: tp.Optional[str] = None, final_activation_params: tp.Optional[dict] = None,
                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
                 disable_norm_outer_blocks: int = 0, trim_right_ratio: float = 1.0):
        super().__init__()
        self.dimension = dimension
        self.channels = channels
        self.n_filters = n_filters
        self.ratios = ratios
        del ratios
        self.n_residual_layers = n_residual_layers
        self.hop_length = np.prod(self.ratios)
        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
        self.disable_norm_outer_blocks = disable_norm_outer_blocks
        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
            "Number of blocks for which to disable norm is invalid. " \
            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."

        act = getattr(nn, activation)
        mult = int(2 ** len(self.ratios))
        model: tp.List[nn.Module] = [
            StreamableConv1d(dimension, mult * n_filters, kernel_size,
                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
        ]

        if lstm:
            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]

        # Upsample to raw audio scale
        for i, ratio in enumerate(self.ratios):
            block_norm = 'none' if self.disable_norm_outer_blocks >= self.n_blocks - (i + 1) else norm
            # Add upsampling layers
            model += [
                act(**activation_params),
                StreamableConvTranspose1d(mult * n_filters, mult * n_filters // 2,
                                          kernel_size=ratio * 2, stride=ratio,
                                          norm=block_norm, norm_kwargs=norm_params,
                                          causal=causal, trim_right_ratio=trim_right_ratio),
            ]
            # Add residual layers
            for j in range(n_residual_layers):
                model += [
                    SEANetResnetBlock(mult * n_filters // 2, kernel_sizes=[residual_kernel_size, 1],
                                      dilations=[dilation_base ** j, 1],
                                      activation=activation, activation_params=activation_params,
                                      norm=block_norm, norm_params=norm_params, causal=causal,
                                      pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
            mult //= 2

        # Add final layers
        model += [
            act(**activation_params),
            StreamableConv1d(n_filters, channels, last_kernel_size,
                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
        ]
        # Add optional final activation to decoder (eg. tanh)
        if final_activation is not None:
            final_act = getattr(nn, final_activation)
            final_activation_params = final_activation_params or {}
            model += [
                final_act(**final_activation_params)
            ]
        self.model = nn.Sequential(*model)

    def forward(self, z):
        y = self.model(z)
        return y
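

# Illustrative sketch (not part of the original module): the decoder mirrors
# the encoder, upsampling 50 latent frames back to 50 * 320 = 16000 samples:
#
#     dec = SEANetDecoder(channels=1, dimension=128, ratios=[8, 5, 4, 2])
#     wav = dec(torch.randn(1, 128, 50))
#     assert wav.shape == (1, 1, 16000)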


def exists(val: tp.Optional[tp.Any]) -> bool:
    return val is not None


def default(val: tp.Any, d: tp.Any) -> tp.Any:
    return val if exists(val) else d


def l2norm(t):
    return F.normalize(t, p=2, dim=-1)


def ema_inplace(moving_avg, new, decay: float):
    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))


def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
    return (x + epsilon) / (x.sum() + n_categories * epsilon)


def uniform_init(*shape: int):
    t = torch.empty(shape)
    nn.init.kaiming_uniform_(t)
    return t


def sample_vectors(samples, num: int):
    num_samples, device = samples.shape[0], samples.device
    if num_samples >= num:
        indices = torch.randperm(num_samples, device=device)[:num]
    else:
        indices = torch.randint(0, num_samples, (num,), device=device)
    return samples[indices]


def kmeans(samples, num_clusters: int, num_iters: int = 10):
    dim, dtype = samples.shape[-1], samples.dtype
    means = sample_vectors(samples, num_clusters)

    for _ in range(num_iters):
        diffs = rearrange(samples, "n d -> n () d") - rearrange(means, "c d -> () c d")
        dists = -(diffs ** 2).sum(dim=-1)

        buckets = dists.max(dim=-1).indices
        bins = torch.bincount(buckets, minlength=num_clusters)
        zero_mask = bins == 0
        bins_min_clamped = bins.masked_fill(zero_mask, 1)

        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
        new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
        new_means = new_means / bins_min_clamped[..., None]

        means = torch.where(zero_mask[..., None], means, new_means)

    return means, bins
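

# Illustrative sketch (not part of the original module): k-means over random
# vectors returns `num_clusters` centroids and their bin counts; it is used to
# seed the codebook from the first training batch when `kmeans_init` is set:
#
#     samples = torch.randn(1000, 16)
#     means, bins = kmeans(samples, num_clusters=8, num_iters=5)
#     assert means.shape == (8, 16) and bins.shape == (8,)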


def orthogonal_loss_fn(t):
    # eq (2) from https://arxiv.org/abs/2112.00384
    n = t.shape[0]
    normed_codes = l2norm(t)
    identity = torch.eye(n, device=t.device)
    cosine_sim = einsum("i d, j d -> i j", normed_codes, normed_codes)
    return ((cosine_sim - identity) ** 2).sum() / (n ** 2)
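

# Illustrative sketch (not part of the original module): the regularizer
# penalizes pairwise cosine similarity between codes, so perfectly orthogonal
# codes give zero loss while identical codes do not:
#
#     assert orthogonal_loss_fn(torch.eye(4)).item() == 0.0
#     assert orthogonal_loss_fn(torch.ones(4, 4)).item() > 0.0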


class EuclideanCodebook(nn.Module):
    """Codebook with Euclidean distance.

    Args:
        dim (int): Dimension.
        codebook_size (int): Codebook size.
        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
            If set to true, run the k-means algorithm on the first training batch and use
            the learned centroids as initialization.
        kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
        decay (float): Decay for exponential moving average over the codebooks.
        epsilon (float): Epsilon value for numerical stability.
        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
            that have an exponential moving average cluster size less than the specified threshold with
            randomly selected vector from the current batch.
    """
    def __init__(
        self,
        dim: int,
        codebook_size: int,
        kmeans_init: bool = False,
        kmeans_iters: int = 10,
        decay: float = 0.8,
        epsilon: float = 1e-5,
        threshold_ema_dead_code: int = 2,
    ):
        super().__init__()
        self.decay = decay
        init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
        embed = init_fn(codebook_size, dim)

        self.codebook_size = codebook_size

        self.kmeans_iters = kmeans_iters
        self.epsilon = epsilon
        self.threshold_ema_dead_code = threshold_ema_dead_code

        self.register_buffer("inited", torch.Tensor([not kmeans_init]))
        self.register_buffer("cluster_size", torch.zeros(codebook_size))
        self.register_buffer("embed", embed)
        self.register_buffer("embed_avg", embed.clone())

    def init_embed_(self, data):
        if self.inited:
            return

        embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
        self.embed.data.copy_(embed)
        self.embed_avg.data.copy_(embed.clone())
        self.cluster_size.data.copy_(cluster_size)
        self.inited.data.copy_(torch.Tensor([True]))
        # Make sure all buffers across workers are in sync after initialization
        if flashy is not None:
            flashy.distrib.broadcast_tensors(self.buffers())

    def replace_(self, samples, mask):
        modified_codebook = torch.where(
            mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
        )
        self.embed.data.copy_(modified_codebook)

    def expire_codes_(self, batch_samples):
        if self.threshold_ema_dead_code == 0:
            return

        expired_codes = self.cluster_size < self.threshold_ema_dead_code
        if not torch.any(expired_codes):
            return

        batch_samples = rearrange(batch_samples, "... d -> (...) d")
        self.replace_(batch_samples, mask=expired_codes)
        if flashy is not None:
            flashy.distrib.broadcast_tensors(self.buffers())

    def preprocess(self, x):
        x = rearrange(x, "... d -> (...) d")
        return x

    def quantize(self, x):
        embed = self.embed.t()
        dist = -(
            x.pow(2).sum(1, keepdim=True)
            - 2 * x @ embed
            + embed.pow(2).sum(0, keepdim=True)
        )
        embed_ind = dist.max(dim=-1).indices
        return embed_ind

    def postprocess_emb(self, embed_ind, shape):
        return embed_ind.view(*shape[:-1])

    def dequantize(self, embed_ind):
        quantize = F.embedding(embed_ind, self.embed)
        return quantize

    def encode(self, x):
        shape = x.shape
        # pre-process
        x = self.preprocess(x)
        # quantize
        embed_ind = self.quantize(x)
        # post-process
        embed_ind = self.postprocess_emb(embed_ind, shape)
        return embed_ind

    def decode(self, embed_ind):
        quantize = self.dequantize(embed_ind)
        return quantize

    def forward(self, x):
        # Training-time forward (EMA codebook updates) is disabled in this
        # inference-oriented copy; the original update logic is kept below for reference.
        raise NotImplementedError()
        shape, dtype = x.shape, x.dtype
        x = self.preprocess(x)
        self.init_embed_(x)

        embed_ind = self.quantize(x)
        embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
        embed_ind = self.postprocess_emb(embed_ind, shape)
        quantize = self.dequantize(embed_ind)

        if self.training:
            # We do the expiry of code at that point as buffers are in sync
            # and all the workers will take the same decision.
            self.expire_codes_(x)
            ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
            embed_sum = x.t() @ embed_onehot
            ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
            cluster_size = (
                laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
                * self.cluster_size.sum()
            )
            embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
            self.embed.data.copy_(embed_normalized)

        return quantize, embed_ind


class VectorQuantization(nn.Module):
    """Vector quantization implementation.
    Currently supports only euclidean distance.

    Args:
        dim (int): Dimension
        codebook_size (int): Codebook size
        codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
        decay (float): Decay for exponential moving average over the codebooks.
        epsilon (float): Epsilon value for numerical stability.
        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
        kmeans_iters (int): Number of iterations used for kmeans initialization.
        channels_last (bool): Channels are the last dimension in the input tensors.
        commitment_weight (float): Weight for commitment loss.
        orthogonal_reg_weight (float): Orthogonal regularization weights.
        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
            for orthogonal regularization.
        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
            that have an exponential moving average cluster size less than the specified threshold with
            randomly selected vector from the current batch.
    """
    def __init__(
        self,
        dim: int,
        codebook_size: int,
        codebook_dim: tp.Optional[int] = None,
        decay: float = 0.8,
        epsilon: float = 1e-5,
        kmeans_init: bool = False,
        kmeans_iters: int = 10,
        threshold_ema_dead_code: int = 2,
        channels_last: bool = False,
        commitment_weight: float = 1.,
        orthogonal_reg_weight: float = 0.0,
        orthogonal_reg_active_codes_only: bool = False,
        orthogonal_reg_max_codes: tp.Optional[int] = None,
    ):
        super().__init__()
        _codebook_dim: int = default(codebook_dim, dim)

        requires_projection = _codebook_dim != dim
        self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity())
        self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity())

        self.epsilon = epsilon
        self.commitment_weight = commitment_weight

        self.orthogonal_reg_weight = orthogonal_reg_weight
        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes

        self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size,
                                           kmeans_init=kmeans_init, kmeans_iters=kmeans_iters,
                                           decay=decay, epsilon=epsilon,
                                           threshold_ema_dead_code=threshold_ema_dead_code)
        self.codebook_size = codebook_size

        self.channels_last = channels_last

    @property
    def codebook(self):
        return self._codebook.embed

    @property
    def inited(self):
        return self._codebook.inited

    def _preprocess(self, x):
        if not self.channels_last:
            x = rearrange(x, "b d n -> b n d")
        return x

    def _postprocess(self, quantize):
        if not self.channels_last:
            quantize = rearrange(quantize, "b n d -> b d n")
        return quantize

    def encode(self, x):
        x = self._preprocess(x)
        x = self.project_in(x)
        embed_in = self._codebook.encode(x)
        return embed_in

    def decode(self, embed_ind):
        quantize = self._codebook.decode(embed_ind)
        quantize = self.project_out(quantize)
        quantize = self._postprocess(quantize)
        return quantize

    def forward(self, x):
        device = x.device
        x = self._preprocess(x)
        x = self.project_in(x)
        quantize, embed_ind = self._codebook(x)

        if self.training:
            quantize = x + (quantize - x).detach()

        loss = torch.tensor([0.0], device=device, requires_grad=self.training)

        if self.training:
            if self.commitment_weight > 0:
                commit_loss = F.mse_loss(quantize.detach(), x)
                loss = loss + commit_loss * self.commitment_weight

            if self.orthogonal_reg_weight > 0:
                codebook = self.codebook
                if self.orthogonal_reg_active_codes_only:
                    # only calculate orthogonal loss for the activated codes for this batch
                    unique_code_ids = torch.unique(embed_ind)
                    codebook = codebook[unique_code_ids]
                num_codes = codebook.shape[0]
                if exists(self.orthogonal_reg_max_codes) and num_codes > self.orthogonal_reg_max_codes:
                    rand_ids = torch.randperm(num_codes, device=device)[:self.orthogonal_reg_max_codes]
                    codebook = codebook[rand_ids]
                orthogonal_reg_loss = orthogonal_loss_fn(codebook)
                loss = loss + orthogonal_reg_loss * self.orthogonal_reg_weight

        quantize = self.project_out(quantize)
        quantize = self._postprocess(quantize)
        return quantize, embed_ind, loss
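

# Illustrative sketch (not part of the original module): a single VQ layer maps
# [B, D, T] latents to integer codes and back via nearest-neighbor lookup.
# (The training `forward` relies on EuclideanCodebook.forward, which is disabled
# in this inference-oriented copy, so only encode/decode are shown.)
#
#     vq = VectorQuantization(dim=32, codebook_size=256)
#     x = torch.randn(2, 32, 50)
#     codes = vq.encode(x)       # [B, T] integer indices
#     x_hat = vq.decode(codes)   # [B, D, T] codebook vectors
#     assert codes.shape == (2, 50) and x_hat.shape == x.shape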


class ResidualVectorQuantization(nn.Module):
    """Residual vector quantization implementation.
    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
    """
    def __init__(self, *, num_quantizers, **kwargs):
        super().__init__()
        codebook_size = kwargs.pop('codebook_size', None)
        if codebook_size is None:
            raise ValueError("codebook_size must be provided in kwargs")
        if not isinstance(codebook_size, list):
            codebook_size = [codebook_size] * num_quantizers
        self.layers = nn.ModuleList(
            [VectorQuantization(codebook_size=cur_codebook_size, **kwargs)
             for _, cur_codebook_size in zip(range(num_quantizers), codebook_size)]
        )
        # self.layers = nn.ModuleList(
        #     [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
        # )

    def forward(self, x, n_q: tp.Optional[int] = None):
        quantized_out = 0.0
        residual = x

        all_losses = []
        all_indices = []

        n_q = n_q or len(self.layers)

        for i, layer in enumerate(self.layers[:n_q]):
            quantized, indices, loss = layer(residual)
            residual = residual - quantized
            quantized_out = quantized_out + quantized
            all_indices.append(indices)
            all_losses.append(loss)

        out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
        return quantized_out, out_indices, out_losses

    def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
        residual = x
        all_indices = []
        n_q = n_q or len(self.layers)
        for layer in self.layers[:n_q]:
            indices = layer.encode(residual)
            quantized = layer.decode(indices)
            # The original EnCodec code subtracted `quantized.detach()` here. Since the
            # straight-through estimator `quantize = x + (quantize - x).detach()` already
            # carries the gradient of the residual, detaching again makes the commitment
            # loss zero for every codebook except the first one, see
            # https://github.com/facebookresearch/encodec/issues/25 -- therefore we
            # subtract `quantized` directly.
            residual = residual - quantized
            # residual = residual - quantized.detach()
            # Since the commitment loss is averaged, the scale of the loss does not
            # change (not as said in the issue above).
            all_indices.append(indices)
        out_indices = torch.stack(all_indices)
        return out_indices

    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
        quantized_out = torch.tensor(0.0, device=q_indices.device)
        for i, indices in enumerate(q_indices):
            layer = self.layers[i]
            quantized = layer.decode(indices)
            quantized_out = quantized_out + quantized
        return quantized_out
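

# Illustrative sketch (not part of the original module): each layer quantizes
# the residual left over by the previous layers, and decoding sums the
# per-layer reconstructions:
#
#     rvq = ResidualVectorQuantization(num_quantizers=4, dim=32, codebook_size=256)
#     x = torch.randn(2, 32, 50)
#     codes = rvq.encode(x)       # [K, B, T] = [4, 2, 50]
#     x_hat = rvq.decode(codes)   # sum of per-layer reconstructions, [B, D, T]
#     assert codes.shape == (4, 2, 50) and x_hat.shape == x.shape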


class ResidualVectorQuantizer(BaseQuantizer):
    """Residual Vector Quantizer.

    Args:
        dimension (int): Dimension of the codebooks.
        n_q (int): Number of residual vector quantizers used.
        q_dropout (bool): Random quantizer drop out at train time.
        bins (int): Codebook size.
        decay (float): Decay for exponential moving average over the codebooks.
        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
        kmeans_iters (int): Number of iterations used for kmeans initialization.
        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
            that have an exponential moving average cluster size less than the specified threshold with
            randomly selected vector from the current batch.
        orthogonal_reg_weight (float): Orthogonal regularization weights.
        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
            for orthogonal regularization.
    """
    def __init__(
        self,
        dimension: int = 256,
        n_q: int = 8,
        q_dropout: bool = False,
        bins: tp.Union[int, tp.List[int]] = 1024,
        decay: float = 0.99,
        kmeans_init: bool = True,
        kmeans_iters: int = 10,
        threshold_ema_dead_code: int = 2,
        orthogonal_reg_weight: float = 0.0,
        orthogonal_reg_active_codes_only: bool = False,
        orthogonal_reg_max_codes: tp.Optional[int] = None,
    ):
        super().__init__()
        self.max_n_q = n_q
        self.n_q = n_q
        self.q_dropout = q_dropout
        self.dimension = dimension
        self.bins = bins
        self.decay = decay
        self.kmeans_init = kmeans_init
        self.kmeans_iters = kmeans_iters
        self.threshold_ema_dead_code = threshold_ema_dead_code
        self.orthogonal_reg_weight = orthogonal_reg_weight
        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
        self.vq = ResidualVectorQuantization(
            dim=self.dimension,
            codebook_size=self.bins,
            num_quantizers=self.n_q,
            decay=self.decay,
            kmeans_init=self.kmeans_init,
            kmeans_iters=self.kmeans_iters,
            threshold_ema_dead_code=self.threshold_ema_dead_code,
            orthogonal_reg_weight=self.orthogonal_reg_weight,
            orthogonal_reg_active_codes_only=self.orthogonal_reg_active_codes_only,
            orthogonal_reg_max_codes=self.orthogonal_reg_max_codes,
            channels_last=False
        )

    def forward(self, x: torch.Tensor, frame_rate: int):
        n_q = self.n_q
        if self.training and self.q_dropout:
            n_q = int(torch.randint(1, self.n_q + 1, (1,)).item())
        if isinstance(self.bins, list):
            bins = self.bins
        else:
            bins = [self.bins] * self.n_q
        bw_per_q = [math.log2(b) * frame_rate / 1000 for b in bins]
        bw = torch.tensor(sum(bw_per_q)).to(x)
        quantized, codes, commit_loss = self.vq(x, n_q=n_q)
        codes = codes.transpose(0, 1)
        # codes is [B, K, T], with T frames, K nb of codebooks.
        return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss))

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a given input tensor with the specified frame rate at the given bandwidth.
        The RVQ encode method sets the appropriate number of quantizers to use
        and returns indices for each quantizer.
        """
        n_q = self.n_q
        codes = self.vq.encode(x, n_q=n_q)
        codes = codes.transpose(0, 1)
        # codes is [B, K, T], with T frames, K nb of codebooks.
        return codes

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode the given codes to the quantized representation."""
        # codes is [B, K, T], with T frames, K nb of codebooks, vq.decode expects [K, B, T].
        codes = codes.transpose(0, 1)
        quantized = self.vq.decode(codes)
        return quantized

    @property
    def total_codebooks(self):
        return self.max_n_q

    @property
    def num_codebooks(self):
        return self.n_q

    def set_num_codebooks(self, n: int):
        assert n > 0 and n <= self.max_n_q
        self.n_q = n
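

# Worked example (illustrative, not part of the original module): with
# bins = 1024 each code carries log2(1024) = 10 bits, so at a 50 Hz frame rate
# one codebook costs 10 * 50 / 1000 = 0.5 kbps and n_q = 8 codebooks cost 4 kbps:
#
#     rvq = ResidualVectorQuantizer(dimension=32, n_q=8, bins=1024, kmeans_init=False)
#     codes = rvq.encode(torch.randn(2, 32, 50))   # [B, K, T] = [2, 8, 50]
#     assert codes.shape == (2, 8, 50)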


class DummyQuantizer(BaseQuantizer):
    """Fake quantizer that actually does not perform any quantization."""
    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor, frame_rate: int):
        q = x.unsqueeze(1)
        return QuantizedResult(x, q, torch.tensor(q.numel() * 32 * frame_rate / 1000 / len(x)).to(x))

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a given input tensor with the specified sample rate at the given bandwidth.
        In the case of the DummyQuantizer, the codes are actually identical
        to the input and resulting quantized representation as no quantization is done.
        """
        return x.unsqueeze(1)

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode the given codes to the quantized representation.
        In the case of the DummyQuantizer, the codes are actually identical
        to the input and resulting quantized representation as no quantization is done.
        """
        return codes.squeeze(1)

    @property
    def total_codebooks(self):
        """Total number of codebooks."""
        return 1

    @property
    def num_codebooks(self):
        """Total number of codebooks."""
        return self.total_codebooks

    def set_num_codebooks(self, n: int):
        """Set the number of active codebooks."""
        raise AttributeError("Cannot override the number of codebooks for the dummy quantizer")


class EncodecModel(CompressionModel):
    """Encodec model operating on the raw waveform.

    Args:
        encoder (nn.Module): Encoder network.
        decoder (nn.Module): Decoder network.
        quantizer (BaseQuantizer): Quantizer network.
        frame_rate (int): Frame rate for the latent representation.
        sample_rate (int): Audio sample rate.
        channels (int): Number of audio channels.
        causal (bool): Whether to use a causal version of the model.
        renormalize (bool): Whether to renormalize the audio before running the model.
    """
    # we need assignment to override the property in the abstract class,
    # I couldn't find a better way...
    frame_rate: float = 0
    sample_rate: int = 0
    channels: int = 0

    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 quantizer: BaseQuantizer,
                 frame_rate: int,
                 sample_rate: int,
                 channels: int,
                 causal: bool = False,
                 renormalize: bool = False):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.quantizer = quantizer
        self.frame_rate = frame_rate
        self.sample_rate = sample_rate
        self.channels = channels
        self.renormalize = renormalize
        self.causal = causal
        if self.causal:
            # we force disabling here to avoid handling linear overlap of segments
            # as supported in original EnCodec codebase.
            assert not self.renormalize, 'Causal model does not support renormalize'

    @property
    def total_codebooks(self):
        """Total number of quantizer codebooks available."""
        return self.quantizer.total_codebooks

    @property
    def num_codebooks(self):
        """Active number of codebooks used by the quantizer."""
        return self.quantizer.num_codebooks

    def set_num_codebooks(self, n: int):
        """Set the active number of codebooks used by the quantizer."""
        self.quantizer.set_num_codebooks(n)

    @property
    def cardinality(self):
        """Cardinality of each codebook."""
        return self.quantizer.bins

    def preprocess(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
        scale: tp.Optional[torch.Tensor]
        if self.renormalize:
            mono = x.mean(dim=1, keepdim=True)
            volume = mono.pow(2).mean(dim=2, keepdim=True).sqrt()
            scale = 1e-8 + volume
            x = x / scale
            scale = scale.view(-1, 1)
        else:
            scale = None
        return x, scale

    def postprocess(self,
                    x: torch.Tensor,
                    scale: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
        if scale is not None:
            assert self.renormalize
            x = x * scale.view(-1, 1, 1)
        return x

    def forward(self, x: torch.Tensor, encode=False) -> QuantizedResult:
        if encode:
            return self.encode(x)
        else:
            raise NotImplementedError("model forward and training is not supported.")
        # Unreachable: the original training forward is kept below for reference.
        assert x.dim() == 3
        length = x.shape[-1]
        x, scale = self.preprocess(x)

        emb = self.encoder(x)
        q_res = self.quantizer(emb, self.frame_rate)
        out = self.decoder(q_res.x)

        # remove extra padding added by the encoder and decoder
        assert out.shape[-1] >= length, (out.shape[-1], length)
        out = out[..., :length]

        q_res.x = self.postprocess(out, scale)
        return q_res

    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
        """Encode the given input tensor to quantized representation along with scale parameter.

        Args:
            x (torch.Tensor): Float tensor of shape [B, C, T]

        Returns:
            codes, scale (tuple of torch.Tensor, torch.Tensor): Tuple composed of:
                codes: an int tensor of shape [B, K, T] with K the number of codebooks used and T the timestep.
                scale: a float tensor containing the scale for audio renormalization.
        """
        assert x.dim() == 3
        x, scale = self.preprocess(x)
        emb = self.encoder(x)
        codes = self.quantizer.encode(emb)
        return codes, scale

    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
        """Decode the given codes to a reconstructed representation, using the scale to perform
        audio denormalization if needed.

        Args:
            codes (torch.Tensor): Int tensor of shape [B, K, T]
            scale (torch.Tensor, optional): Float tensor containing the scale value.

        Returns:
            out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
        """
        emb = self.decode_latent(codes)
        out = self.decoder(emb)
        out = self.postprocess(out, scale)
        # out contains extra padding added by the encoder and decoder
        return out

    def decode_latent(self, codes: torch.Tensor):
        """Decode from the discrete codes to continuous latent space."""
        return self.quantizer.decode(codes)


class EncodecModel_encode_only(CompressionModel):
    """Encodec model operating on the raw waveform. Encode only, so no decoder.

    Args:
        encoder (nn.Module): Encoder network.
        quantizer (BaseQuantizer): Quantizer network.
        frame_rate (int): Frame rate for the latent representation.
        sample_rate (int): Audio sample rate.
        channels (int): Number of audio channels.
        causal (bool): Whether to use a causal version of the model.
        renormalize (bool): Whether to renormalize the audio before running the model.
    """
    # we need assignment to override the property in the abstract class,
    # I couldn't find a better way...
    frame_rate: float = 0
    sample_rate: int = 0
    channels: int = 0

    def __init__(self,
                 encoder: nn.Module,
                 quantizer: BaseQuantizer,
                 frame_rate: int,
                 sample_rate: int,
                 channels: int,
                 causal: bool = False,
                 renormalize: bool = False):
        super().__init__()
        self.encoder = encoder
        self.quantizer = quantizer
        self.frame_rate = frame_rate
        self.sample_rate = sample_rate
        self.channels = channels
        self.renormalize = renormalize
        self.causal = causal
        if self.causal:
            # we force disabling here to avoid handling linear overlap of segments
            # as supported in original EnCodec codebase.
            assert not self.renormalize, 'Causal model does not support renormalize'

    @property
    def total_codebooks(self):
        """Total number of quantizer codebooks available."""
        return self.quantizer.total_codebooks

    @property
    def num_codebooks(self):
        """Active number of codebooks used by the quantizer."""
        return self.quantizer.num_codebooks

    def set_num_codebooks(self, n: int):
        """Set the active number of codebooks used by the quantizer."""
        self.quantizer.set_num_codebooks(n)

    @property
    def cardinality(self):
        """Cardinality of each codebook."""
        return self.quantizer.bins

    def preprocess(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
        scale: tp.Optional[torch.Tensor]
        if self.renormalize:
            mono = x.mean(dim=1, keepdim=True)
            volume = mono.pow(2).mean(dim=2, keepdim=True).sqrt()
            scale = 1e-8 + volume
            x = x / scale
            scale = scale.view(-1, 1)
        else:
            scale = None
        return x, scale

    def postprocess(self,
                    x: torch.Tensor,
                    scale: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
        if scale is not None:
            assert self.renormalize
            x = x * scale.view(-1, 1, 1)
        return x

    def forward(self, x: torch.Tensor, encode=False) -> QuantizedResult:
        if encode:
            return self.encode(x)
        else:
            raise NotImplementedError("model forward and training is not supported.")
        # Unreachable: kept verbatim from EncodecModel for reference (note that
        # this encode-only class does not define `self.decoder`).
        assert x.dim() == 3
        length = x.shape[-1]
        x, scale = self.preprocess(x)

        emb = self.encoder(x)
        q_res = self.quantizer(emb, self.frame_rate)
        out = self.decoder(q_res.x)

        # remove extra padding added by the encoder and decoder
        assert out.shape[-1] >= length, (out.shape[-1], length)
        out = out[..., :length]

        q_res.x = self.postprocess(out, scale)
        return q_res

    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
        """Encode the given input tensor to quantized representation along with scale parameter.

        Args:
            x (torch.Tensor): Float tensor of shape [B, C, T]

        Returns:
            codes, scale (tuple of torch.Tensor, torch.Tensor): Tuple composed of:
                codes: an int tensor of shape [B, K, T] with K the number of codebooks used and T the timestep.
                scale: a float tensor containing the scale for audio renormalization.
        """
        assert x.dim() == 3
        x, scale = self.preprocess(x)
        emb = self.encoder(x)
        codes = self.quantizer.encode(emb)
        return codes, scale

    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
        """Decode the given codes to a reconstructed representation, using the scale to perform
        audio denormalization if needed.

        Args:
            codes (torch.Tensor): Int tensor of shape [B, K, T]
            scale (torch.Tensor, optional): Float tensor containing the scale value.

        Returns:
            out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
        """
        raise NotImplementedError("Decode is not supported for encode only model")

    def decode_latent(self, codes: torch.Tensor):
        """Decode from the discrete codes to continuous latent space."""
        raise NotImplementedError("Decode is not supported for encode only model")


def get_quantizer(quantizer: str, cfg: omegaconf.DictConfig, dimension: int) -> BaseQuantizer:
    klass = {
        'no_quant': DummyQuantizer,
        'rvq': ResidualVectorQuantizer
    }[quantizer]
    kwargs = dict_from_config(getattr(cfg, quantizer))
    if quantizer != 'no_quant':
        kwargs['dimension'] = dimension
    return klass(**kwargs)


def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.DictConfig):
    if encoder_name == 'seanet':
        kwargs = dict_from_config(getattr(cfg, 'seanet'))
        encoder_override_kwargs = kwargs.pop('encoder')
        decoder_override_kwargs = kwargs.pop('decoder')
        encoder_kwargs = {**kwargs, **encoder_override_kwargs}
        decoder_kwargs = {**kwargs, **decoder_override_kwargs}
        encoder = SEANetEncoder(**encoder_kwargs)
        decoder = SEANetDecoder(**decoder_kwargs)
        return encoder, decoder
    else:
        raise KeyError(f"Unexpected compression model {cfg.compression_model}")
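

# Illustrative config sketch (an assumption, not the exact training config):
# the loaders above expect an omegaconf tree roughly shaped like this, with
# shared SEANet kwargs plus per-network overrides under 'encoder'/'decoder':
#
#     cfg = omegaconf.OmegaConf.create({
#         'compression_model': 'encodec',
#         'sample_rate': 16000,
#         'encodec': {'autoencoder': 'seanet', 'quantizer': 'rvq',
#                     'sample_rate': 16000, 'channels': 1,
#                     'causal': False, 'renormalize': False},
#         'seanet': {'dimension': 128, 'ratios': [8, 5, 4, 2],
#                    'encoder': {}, 'decoder': {}},
#         'rvq': {'n_q': 4, 'bins': 2048},
#     })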


def get_compression_model(ckpt_fn, encode_only=False, device="cpu") -> CompressionModel:
    """Instantiate a compression model."""
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    state = torch.load(ckpt_fn, map_location='cpu')
    cfg = state['xp.cfg']
    cfg.device = str(device)
    weights = state['best_state']['model']
    assert cfg.compression_model == 'encodec', "Only Encodec model is supported for now."
    if encode_only:
        all_keys = list(weights.keys())
        for key in all_keys:
            if key.startswith('decoder'):
                del weights[key]
        kwargs = dict_from_config(getattr(cfg, 'encodec'))
        encoder_name = kwargs.pop('autoencoder')
        quantizer_name = kwargs.pop('quantizer')
        encoder, _ = get_encodec_autoencoder(encoder_name, cfg)
        quantizer = get_quantizer(quantizer_name, cfg, encoder.dimension)
        frame_rate = kwargs['sample_rate'] // encoder.hop_length
        renormalize = kwargs.pop('renormalize', False)
        # deprecated params
        kwargs.pop('renorm', None)
        compression_model = EncodecModel_encode_only(
            encoder, quantizer,
            frame_rate=frame_rate, renormalize=renormalize, **kwargs).to(cfg.device)
        assert compression_model.sample_rate == cfg.sample_rate, "Compression model sample rate should match"
        compression_model.load_state_dict(weights)
        compression_model.eval()
        return compression_model
    else:
        kwargs = dict_from_config(getattr(cfg, 'encodec'))
        encoder_name = kwargs.pop('autoencoder')
        quantizer_name = kwargs.pop('quantizer')
        encoder, decoder = get_encodec_autoencoder(encoder_name, cfg)
        quantizer = get_quantizer(quantizer_name, cfg, encoder.dimension)
        frame_rate = kwargs['sample_rate'] // encoder.hop_length
        renormalize = kwargs.pop('renormalize', False)
        # deprecated params
        kwargs.pop('renorm', None)
        compression_model = EncodecModel(
            encoder, decoder, quantizer,
            frame_rate=frame_rate, renormalize=renormalize, **kwargs).to(cfg.device)
        assert compression_model.sample_rate == cfg.sample_rate, "Compression model sample rate should match"
        compression_model.load_state_dict(weights)
        compression_model.eval()
        return compression_model


if __name__ == "__main__":
    import torchaudio
    ckpt_fn = "/home/pyp/BoostedVoiceEditor/pretrained/encodec_6f79c6a8.th"
    audio_in_fns = [
        "/home/pyp/BoostedVoiceEditor/demo/pam.wav",
        "/home/pyp/BoostedVoiceEditor/demo/ray.wav",
        "/home/pyp/BoostedVoiceEditor/demo/84_121550_000074_000000.wav",
        "/home/pyp/BoostedVoiceEditor/demo/caribbean.wav",
        "/home/pyp/BoostedVoiceEditor/demo/bible.wav",
        "/home/pyp/BoostedVoiceEditor/demo/miley.wav",
    ]
    audio_out_fns = [
        "/home/pyp/BoostedVoiceEditor/demo/pam_encodecTest.wav",
        "/home/pyp/BoostedVoiceEditor/demo/ray_encodecTest.wav",
        "/home/pyp/BoostedVoiceEditor/demo/84_121550_000074_000000_encodecTest.wav",
        "/home/pyp/BoostedVoiceEditor/demo/caribbean_encodecTest.wav",
        "/home/pyp/BoostedVoiceEditor/demo/bible_encodecTest.wav",
        "/home/pyp/BoostedVoiceEditor/demo/miley_encodecTest.wav",
    ]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = get_compression_model(ckpt_fn, device=device)
    for audio_in_fn, audio_out_fn in zip(audio_in_fns, audio_out_fns):
        audio_in, sr = torchaudio.load(audio_in_fn)
        if sr != model.sample_rate:
            audio_in = torchaudio.transforms.Resample(sr, model.sample_rate)(audio_in)
        if audio_in.shape[0] == 2:
            audio_in = audio_in.mean(dim=0, keepdim=True)
        audio_in = audio_in.unsqueeze(0)
        audio_in = audio_in.to(torch.float32).to(device)
        codes = model.encode(audio_in)[0]
        audio_out = model.decode(codes)[0].cpu()
        torchaudio.save(audio_out_fn, audio_out, model.sample_rate)