Spaces:
Running
Running
"""
Common miscellaneous functions.

AI Music Technology Group, Sony Group Corporation
AI Speech and Sound Group, Sony Europe

This implementation originally belongs to Sony Group Corporation,
and was introduced in the work "Automatic music mixing with deep learning and out-of-domain data".
Original repo link: https://github.com/sony/FxNorm-automix
"""
| import os | |
| import psutil | |
| import sys | |
| import numpy as np | |
| import librosa | |
| import torch | |
| import math | |
def uprint(s):
    """
    Print `s` to stdout and immediately flush both output streams.

    Flushing stderr as well keeps a log file that captures both
    streams in sync with the printed output.

    Args:
        s: string to print
    """
    print(s)
    for stream in (sys.stdout, sys.stderr):
        stream.flush()
def recursive_getattr(obj, attr):
    """
    Resolve a dotted attribute path on `obj` (e.g. `fc1.weight`).

    Args:
        obj: object to start the lookup from
        attr: dotted attribute path

    Returns:
        the object found at the end of the attribute path
    """
    current = obj
    parts = attr.split('.')
    index = 0
    while index < len(parts):
        current = getattr(current, parts[index])
        index += 1
    return current
def compute_stft(samples, hop_length, fft_size, stft_window):
    """
    Compute the per-channel STFT of `samples`.

    An analysis window of size `fft_size` (also the FFT length) is
    shifted by `hop_length` samples between frames; no centering or
    padding is applied (`center=False`).

    Args:
        samples: num samples x channels
        hop_length: window shift in samples
        fft_size: FFT size which is also the window size
        stft_window: STFT analysis window

    Returns:
        stft: frames x channels x freqbins
    """
    num_channels = samples.shape[1]
    num_frames = 1 + int((samples.shape[0] - fft_size) / hop_length)
    result = np.empty((num_frames, num_channels, fft_size // 2 + 1),
                      dtype=np.complex64)
    # f_contiguous layout makes each [:, ch] slice c_contiguous for librosa
    samples = np.asfortranarray(samples)
    for ch in range(num_channels):
        # librosa returns `freqbins x frames`; transpose into our layout
        channel_stft = librosa.stft(samples[:, ch],
                                    n_fft=fft_size,
                                    hop_length=hop_length,
                                    window=stft_window,
                                    center=False)
        result[:, ch, :] = channel_stft.T
    return result
def compute_istft(stft, hop_length, stft_window):
    """
    Compute the inverse STFT of `stft`.

    Args:
        stft: frames x channels x freqbins
        hop_length: window shift in samples
        stft_window: STFT synthesis window

    Returns:
        samples: num samples x channels
    """
    channels = []
    for ch in range(stft.shape[1]):
        # librosa expects `freqbins x frames`, hence the transpose
        channels.append(librosa.istft(stft[:, ch, :].T,
                                      hop_length=hop_length,
                                      window=stft_window,
                                      center=False))
    # column_stack always yields a 2d array, even for a single channel
    return np.column_stack(channels)
def get_size(obj):
    """
    Recursively find size of objects (in bytes).

    `sys.getsizeof` is shallow (it does not follow references into
    contained objects), so this walks into the common containers —
    dict, `functools.partial` keywords, list, tuple, set, frozenset —
    and accumulates the sizes of the contained objects as well.

    NOTE: objects referenced more than once are counted each time they
    are reached, and cyclic containers will recurse without bound.

    Args:
        obj: object

    Returns:
        size of object in bytes
    """
    import functools

    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        size += sum(get_size(k) + get_size(v) for k, v in obj.items())
    elif isinstance(obj, functools.partial):
        size += sum(get_size(k) + get_size(v) for k, v in obj.keywords.items())
    elif isinstance(obj, (list, tuple, set, frozenset)):
        size += sum(get_size(item) for item in obj)
    return size
def get_process_memory():
    """
    Return memory consumption in GBytes.

    Returns:
        resident set size (RSS) of the current process, in GiB
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 ** 3
def check_complete_convolution(input_size, kernel_size, stride=1,
                               padding=0, dilation=1, note=''):
    """
    Check whether the convolution is complete.

    "Complete" means a Conv1d with these parameters leaves no input
    time steps neglected. Prints `note` followed by the result.

    Args:
        input_size: size of input
        kernel_size: size of kernel
        stride: stride
        padding: padding
        dilation: dilation
        note: string for additional notes

    Returns:
        True if no time steps are left over in a Conv1d, else False
    """
    # standard Conv1d output-length formula; the convolution is complete
    # exactly when the resulting length is an integer
    output_len = (input_size + 2 * padding
                  - dilation * (kernel_size - 1) - 1) / stride + 1
    is_complete = output_len.is_integer()
    uprint(f'{note} {is_complete}')
    # BUGFIX: the docstring always promised a result, but the function
    # previously returned None; now the computed flag is returned.
    return is_complete
def pad_to_shape(x: torch.Tensor, y: int) -> torch.Tensor:
    """
    Right-pad or right-trim `x` along its last dimension to length `y`.

    When `y` is smaller than `x.shape[-1]` the pad amount is negative,
    which `torch.nn.functional.pad` interprets as a right-side trim, so
    the result always has last dimension `y`.

    Args:
        x: Tensor to be padded.
        y: Size to pad/trim x last dimension to

    Returns:
        `x` padded to match `y`'s dimension.
    """
    delta = y - x.shape[-1]
    return torch.nn.functional.pad(x, [0, delta])
def valid_length(input_size, kernel_size, stride=1, padding=0, dilation=1):
    """
    Return the nearest valid upper length for a 1d convolution.

    "Valid" means the distance between the (padded) input length and the
    effective kernel span is an exact multiple of `stride`, so the
    convolution neglects and discards no trailing time steps.

    Args:
        input_size: size of input
        kernel_size: size of kernel
        stride: stride
        padding: padding
        dilation: dilation

    Returns:
        valid length for convolution
    """
    # effective extent of the (possibly dilated) kernel
    span = dilation * (kernel_size - 1) + 1
    # number of output frames needed to cover the whole padded input
    n_frames = math.ceil((input_size + 2 * padding - span) / stride) + 1
    # smallest input length that produces exactly that many frames
    return int((n_frames - 1) * stride - 2 * padding + span)
def td_length_from_fd(fd_length: int, fft_size: int, fft_hop: int) -> int:
    """
    Return the length in time domain, given the length in frequency domain.

    Computes how many samples a signal must have so that an unpadded STFT
    with window size `fft_size` and hop `fft_hop` yields exactly
    `fd_length` frames: one full window plus one hop per extra frame.

    Args:
        fd_length: length in frequency domain (number of frames)
        fft_size: size of FFT
        fft_hop: hop length

    Returns:
        length in time domain
    """
    extra_hops = fd_length - 1
    return extra_hops * fft_hop + fft_size