import numpy as np
import librosa
def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250):
    """
    Splits an audio signal into non-silent segments using librosa.effects.split.

    Parameters:
    - audio (np.ndarray): The audio signal to split.
    - sr (int): The sample rate of the input audio (default 16000).
    - silence_thresh (int): Silence threshold in dB (default -60).
    - min_silence_len (int): Minimum silence duration in ms (default 250).

    Returns:
    - list of np.ndarray: The non-silent audio segments.
    - np.ndarray: The (start, end) sample intervals where the audio was split.
    """
    # Derive the analysis frame from the minimum silence duration;
    # the hop is half a frame for 50% overlap.
    frame_length = int(min_silence_len / 1000 * sr)
    hop_length = frame_length // 2
    # librosa expects a positive top_db, so negate the dB threshold.
    intervals = librosa.effects.split(
        audio, top_db=-silence_thresh, frame_length=frame_length, hop_length=hop_length
    )
    audio_segments = [audio[start:end] for start, end in intervals]
    return audio_segments, intervals
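

# A minimal usage sketch for process_audio (the file name and printout are
# illustrative assumptions, not part of the original code):
#
#     y, sr = librosa.load("speech.wav", sr=16000)  # hypothetical input file
#     segments, intervals = process_audio(y, sr=sr)
#     for (start, end), seg in zip(intervals, segments):
#         print(f"{start / sr:.2f}s - {end / sr:.2f}s: {len(seg)} samples")
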
def merge_audio(audio_segments_org, audio_segments_new, intervals, sr_orig, sr_new):
    """
    Merges processed audio segments back into a single signal, restoring the
    silent gaps between them. Assumes audio_segments_new is already at sr_new.
    Trailing silence after the last interval is not restored.

    Parameters:
    - audio_segments_org (list of np.ndarray): The non-silent segments at sr_orig.
    - audio_segments_new (list of np.ndarray): The processed segments at sr_new.
    - intervals (np.ndarray): The sample intervals used to split the original audio.
    - sr_orig (int): The sample rate of the original audio.
    - sr_new (int): The sample rate of the processed (model output) audio.

    Returns:
    - np.ndarray: The merged audio signal with silent gaps restored.
    """
    dtype = audio_segments_new[0].dtype
    sr_ratio = sr_new / sr_orig
    pieces = []
    for i, (start, end) in enumerate(intervals):
        # Map the original interval boundaries onto the new sample rate.
        start_new = int(start * sr_ratio)
        end_new = int(end * sr_ratio)
        # Compare segment durations (in seconds) before and after processing.
        original_duration = len(audio_segments_org[i]) / sr_orig
        new_duration = len(audio_segments_new[i]) / sr_new
        duration_diff = new_duration - original_duration
        # Leading silence before the first non-silent interval.
        if i == 0 and start_new > 0:
            pieces.append(np.zeros(start_new, dtype=dtype))
        pieces.append(audio_segments_new[i])
        # If the processed segment came back shorter than the original, pad
        # with silence so overall timing stays aligned. A longer segment
        # cannot be shortened without cutting audio, so it is left as-is.
        if duration_diff < 0:
            pieces.append(np.zeros(int(-duration_diff * sr_new), dtype=dtype))
        # Restore the silent gap between this interval and the next one.
        if i < len(intervals) - 1:
            next_start_new = int(intervals[i + 1][0] * sr_ratio)
            gap = next_start_new - end_new
            if gap > 0:
                pieces.append(np.zeros(gap, dtype=dtype))
    return np.concatenate(pieces) if pieces else np.array([], dtype=dtype)
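

# Round-trip sketch of the intended split/process/merge flow. This demo is an
# assumption about usage, not part of the original Space: "input.wav" is a
# hypothetical file, and librosa.resample merely stands in for whatever model
# produces audio at sr_new.
if __name__ == "__main__":
    sr_orig, sr_new = 16000, 24000
    y, _ = librosa.load("input.wav", sr=sr_orig)  # hypothetical input file
    segments_org, intervals = process_audio(y, sr=sr_orig)
    # Resampling plays the role of the model that emits sr_new audio.
    segments_new = [
        librosa.resample(seg, orig_sr=sr_orig, target_sr=sr_new)
        for seg in segments_org
    ]
    merged = merge_audio(segments_org, segments_new, intervals, sr_orig, sr_new)
    print(f"merged: {len(merged) / sr_new:.2f}s at {sr_new} Hz")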