Build error

Update audio_foundation_models.py

audio_foundation_models.py  +2 -109  CHANGED
@@ -94,17 +94,7 @@ def select_best_audio(prompt,wav_list):
     print(score_list,max_index)
     return wav_list[max_index]
 
-
-    merged_signal = []
-    sr_1, signal_1 = wavfile.read(audio_path_1)
-    sr_2, signal_2 = wavfile.read(audio_path_2)
-    merged_signal.append(signal_1)
-    merged_signal.append(signal_2)
-    merged_signal = np.hstack(merged_signal)
-    merged_signal = np.asarray(merged_signal, dtype=np.int16)
-    audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-    wavfile.write(audio_filename, sr_1, merged_signal)
-    return audio_filename
+
 class T2I:
     def __init__(self, device):
         print("Initializing T2I to %s" % device)
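The lines dropped in this hunk read like the body of a merge-audio helper, and Speech_SS.inference (removed in the last hunk below) still called merge_audio(...). For anyone who wants to keep that helper around, here is a minimal self-contained sketch; the function name and two-path signature are inferred from the removed call site and are an assumption, not something this commit defines.

# Minimal sketch of the helper the removed lines appear to belong to.
# Signature inferred from the removed call merge_audio(audio_filename_1,
# audio_filename_2); treat it as an assumption, not part of this diff.
import os
import uuid

import numpy as np
from scipy.io import wavfile


def merge_audio(audio_path_1, audio_path_2):
    # Read both WAV files (sample rate, int16 samples).
    sr_1, signal_1 = wavfile.read(audio_path_1)
    sr_2, signal_2 = wavfile.read(audio_path_2)
    # Concatenate the two signals end to end along the time axis.
    merged_signal = np.hstack([signal_1, signal_2])
    merged_signal = np.asarray(merged_signal, dtype=np.int16)
    # Write the result under audio/ (the directory must already exist) with a
    # short random name, keeping the first file's sample rate as the original
    # code did (no resampling).
    audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
    wavfile.write(audio_filename, sr_1, merged_signal)
    return audio_filename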
@@ -525,10 +515,6 @@ class ASR:
         options = whisper.DecodingOptions()
         result = whisper.decode(self.model, mel, options)
         return result.text
-
-    def translate_english(self, audio_path):
-        audio = self.model.transcribe(audio_path, language='English')
-        return audio['text']
 
 class A2T:
     def __init__(self, device):
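The removed ASR.translate_english was a thin wrapper around Whisper's transcribe call. If that behaviour is still needed outside the class, a rough standalone equivalent is sketched below; the "base" checkpoint and the input path are placeholders, and the actual ASR class may load a different Whisper model.

# Rough standalone equivalent of the removed translate_english helper.
# "base" and the input path are placeholders only.
import whisper

model = whisper.load_model("base")
# language='English' pins the decoding language, mirroring the removed method;
# the result dict carries the recognised text under 'text'.
result = model.transcribe("audio/example.wav", language='English')
print(result['text'])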
@@ -818,97 +804,4 @@ class TargetSoundDetection:
         ans = ''
         for i,item in enumerate(time_predictions):
             ans = ans + 'segment' + str(i+1) + ' start_time: ' + str(item['onset']) + ' end_time: ' + str(item['offset']) + '\t'
-        return ans
-
-class Speech_Enh_SC:
-    """Speech Enhancement or Separation in single-channel
-    Example usage:
-        enh_model = Speech_Enh_SS("cuda")
-        enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
-    """
-    def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw"):
-        self.model_name = model_name
-        self.device = device
-        print("Initializing ESPnet Enh to %s" % device)
-        self._initialize_model()
-
-    def _initialize_model(self):
-        from espnet_model_zoo.downloader import ModelDownloader
-        from espnet2.bin.enh_inference import SeparateSpeech
-
-        d = ModelDownloader()
-
-        cfg = d.download_and_unpack(self.model_name)
-        self.separate_speech = SeparateSpeech(
-            train_config=cfg["train_config"],
-            model_file=cfg["model_file"],
-            # for segment-wise process on long speech
-            segment_size=2.4,
-            hop_size=0.8,
-            normalize_segment_scale=False,
-            show_progressbar=True,
-            ref_channel=None,
-            normalize_output_wav=True,
-            device=self.device,
-        )
-
-    @prompts(name="Speech Enhancement In Single-Channel",
-             description="useful for when you want to enhance the quality of the speech signal by reducing background noise (single-channel), "
-                         "receives audio_path as input."
-                         "The input to this tool should be a string, "
-                         "representing the audio_path. " )
-
-    def inference(self, speech_path, ref_channel=0):
-        speech, sr = soundfile.read(speech_path)
-        speech = speech[:, ref_channel]
-        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
-        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-        soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
-        return audio_filename
-
-class Speech_SS:
-    def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
-        self.model_name = model_name
-        self.device = device
-        print("Initializing ESPnet SS to %s" % device)
-        self._initialize_model()
-
-    def _initialize_model(self):
-        from espnet_model_zoo.downloader import ModelDownloader
-        from espnet2.bin.enh_inference import SeparateSpeech
-
-        d = ModelDownloader()
-
-        cfg = d.download_and_unpack(self.model_name)
-        self.separate_speech = SeparateSpeech(
-            train_config=cfg["train_config"],
-            model_file=cfg["model_file"],
-            # for segment-wise process on long speech
-            segment_size=2.4,
-            hop_size=0.8,
-            normalize_segment_scale=False,
-            show_progressbar=True,
-            ref_channel=None,
-            normalize_output_wav=True,
-            device=self.device,
-        )
-
-    @prompts(name="Speech Separation",
-             description="useful for when you want to separate each speech from the speech mixture, "
-                         "receives audio_path as input."
-                         "The input to this tool should be a string, "
-                         "representing the audio_path. " )
-
-    def inference(self, speech_path):
-        speech, sr = soundfile.read(speech_path)
-        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
-        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-        if len(enh_speech) == 1:
-            soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
-        else:
-            audio_filename_1 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-            soundfile.write(audio_filename_1, enh_speech[0].squeeze(), samplerate=sr)
-            audio_filename_2 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-            soundfile.write(audio_filename_2, enh_speech[1].squeeze(), samplerate=sr)
-            audio_filename = merge_audio(audio_filename_1, audio_filename_2)
-        return audio_filename
+        return ans
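Since the Speech_Enh_SC and Speech_SS tool classes are gone after this commit, here is a self-contained sketch of the ESPnet call they wrapped, kept for reference. The model name and SeparateSpeech arguments are copied from the deleted _initialize_model()/inference() code; the input and output paths are placeholders.

# Self-contained sketch of the ESPnet enhancement call the removed classes
# wrapped. Arguments mirror the deleted code; file paths are placeholders.
import soundfile
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.enh_inference import SeparateSpeech

cfg = ModelDownloader().download_and_unpack(
    "espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw")
separate_speech = SeparateSpeech(
    train_config=cfg["train_config"],
    model_file=cfg["model_file"],
    segment_size=2.4,   # segment-wise processing for long recordings
    hop_size=0.8,
    normalize_segment_scale=False,
    show_progressbar=True,
    ref_channel=None,
    normalize_output_wav=True,
    device="cuda",
)

speech, sr = soundfile.read("audio/noisy_example.wav")   # placeholder input
enh_speech = separate_speech(speech[None, ...], fs=sr)   # list of enhanced signals
soundfile.write("audio/enhanced.wav", enh_speech[0].squeeze(), samplerate=sr)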