Spaces:
Runtime error
Runtime error
Change app.py
Browse files
- .DS_Store +0 -0
- app.py +81 -79
- audio--1504190171-headset.flac → audio_slurp.flac +0 -0
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
app.py
CHANGED
|
@@ -17,99 +17,101 @@ speech2text = Speech2Text.from_pretrained(
|
|
| 17 |
)
|
| 18 |
# Confirm the sampling rate is equal to that of the training corpus.
|
| 19 |
# If not, you need to resample the audio data before inputting to speech2text
|
| 20 |
-
speech, rate = soundfile.read("audio--1504190171-headset.flac")
|
| 21 |
-
nbests = speech2text(speech)
|
| 22 |
|
| 23 |
-
text, *_ = nbests[0]
|
| 24 |
-
print(text)
|
| 25 |
-
exit()
|
| 26 |
|
| 27 |
-
text2speechen = Text2Speech.from_pretrained(
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
)
|
| 45 |
|
| 46 |
|
| 47 |
-
tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
|
| 48 |
-
vocoder_tagjp = 'none'
|
| 49 |
|
| 50 |
-
text2speechjp = Text2Speech.from_pretrained(
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
)
|
| 68 |
|
| 69 |
-
tagch = 'kan-bayashi/csmsc_full_band_vits'
|
| 70 |
-
vocoder_tagch = "none"
|
| 71 |
|
| 72 |
-
text2speechch = Text2Speech.from_pretrained(
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
)
|
| 90 |
|
| 91 |
-
def inference(
|
| 92 |
with torch.no_grad():
|
| 93 |
if lang == "english":
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
-
examples=[['
|
| 108 |
|
|
|
|
| 109 |
gr.Interface(
|
| 110 |
inference,
|
| 111 |
-
[gr.inputs.
|
| 112 |
-
gr.outputs.
|
| 113 |
title=title,
|
| 114 |
description=description,
|
| 115 |
article=article,
|
|
|
|
| 17 |
)
|
| 18 |
# Confirm the sampling rate is equal to that of the training corpus.
|
| 19 |
# If not, you need to resample the audio data before inputting to speech2text
|
| 20 |
+
# speech, rate = soundfile.read("audio--1504190171-headset.flac")
|
| 21 |
+
# nbests = speech2text(speech)
|
| 22 |
|
| 23 |
+
# text, *_ = nbests[0]
|
| 24 |
+
# print(text)
|
| 25 |
+
# exit()
|
| 26 |
|
| 27 |
+
# text2speechen = Text2Speech.from_pretrained(
|
| 28 |
+
# model_tag=str_or_none(tagen),
|
| 29 |
+
# vocoder_tag=str_or_none(vocoder_tagen),
|
| 30 |
+
# device="cpu",
|
| 31 |
+
# # Only for Tacotron 2 & Transformer
|
| 32 |
+
# threshold=0.5,
|
| 33 |
+
# # Only for Tacotron 2
|
| 34 |
+
# minlenratio=0.0,
|
| 35 |
+
# maxlenratio=10.0,
|
| 36 |
+
# use_att_constraint=False,
|
| 37 |
+
# backward_window=1,
|
| 38 |
+
# forward_window=3,
|
| 39 |
+
# # Only for FastSpeech & FastSpeech2 & VITS
|
| 40 |
+
# speed_control_alpha=1.0,
|
| 41 |
+
# # Only for VITS
|
| 42 |
+
# noise_scale=0.333,
|
| 43 |
+
# noise_scale_dur=0.333,
|
| 44 |
+
# )
|
| 45 |
|
| 46 |
|
| 47 |
+
# tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
|
| 48 |
+
# vocoder_tagjp = 'none'
|
| 49 |
|
| 50 |
+
# text2speechjp = Text2Speech.from_pretrained(
|
| 51 |
+
# model_tag=str_or_none(tagjp),
|
| 52 |
+
# vocoder_tag=str_or_none(vocoder_tagjp),
|
| 53 |
+
# device="cpu",
|
| 54 |
+
# # Only for Tacotron 2 & Transformer
|
| 55 |
+
# threshold=0.5,
|
| 56 |
+
# # Only for Tacotron 2
|
| 57 |
+
# minlenratio=0.0,
|
| 58 |
+
# maxlenratio=10.0,
|
| 59 |
+
# use_att_constraint=False,
|
| 60 |
+
# backward_window=1,
|
| 61 |
+
# forward_window=3,
|
| 62 |
+
# # Only for FastSpeech & FastSpeech2 & VITS
|
| 63 |
+
# speed_control_alpha=1.0,
|
| 64 |
+
# # Only for VITS
|
| 65 |
+
# noise_scale=0.333,
|
| 66 |
+
# noise_scale_dur=0.333,
|
| 67 |
+
# )
|
| 68 |
|
| 69 |
+
# tagch = 'kan-bayashi/csmsc_full_band_vits'
|
| 70 |
+
# vocoder_tagch = "none"
|
| 71 |
|
| 72 |
+
# text2speechch = Text2Speech.from_pretrained(
|
| 73 |
+
# model_tag=str_or_none(tagch),
|
| 74 |
+
# vocoder_tag=str_or_none(vocoder_tagch),
|
| 75 |
+
# device="cpu",
|
| 76 |
+
# # Only for Tacotron 2 & Transformer
|
| 77 |
+
# threshold=0.5,
|
| 78 |
+
# # Only for Tacotron 2
|
| 79 |
+
# minlenratio=0.0,
|
| 80 |
+
# maxlenratio=10.0,
|
| 81 |
+
# use_att_constraint=False,
|
| 82 |
+
# backward_window=1,
|
| 83 |
+
# forward_window=3,
|
| 84 |
+
# # Only for FastSpeech & FastSpeech2 & VITS
|
| 85 |
+
# speed_control_alpha=1.0,
|
| 86 |
+
# # Only for VITS
|
| 87 |
+
# noise_scale=0.333,
|
| 88 |
+
# noise_scale_dur=0.333,
|
| 89 |
+
# )
|
| 90 |
|
| 91 |
+
def inference(wav,lang):
|
| 92 |
with torch.no_grad():
|
| 93 |
if lang == "english":
|
| 94 |
+
speech, rate = soundfile.read("audio--1504190171-headset.flac")
|
| 95 |
+
nbests = speech2text(speech)
|
| 96 |
+
text, *_ = nbests[0]
|
| 97 |
+
# if lang == "chinese":
|
| 98 |
+
# wav = text2speechch(text)["wav"]
|
| 99 |
+
# scipy.io.wavfile.write("out.wav",text2speechch.fs , wav.view(-1).cpu().numpy())
|
| 100 |
+
# if lang == "japanese":
|
| 101 |
+
# wav = text2speechjp(text)["wav"]
|
| 102 |
+
# scipy.io.wavfile.write("out.wav",text2speechjp.fs , wav.view(-1).cpu().numpy())
|
| 103 |
+
return text
|
| 104 |
+
title = "ESPnet2-SLU"
|
| 105 |
+
description = "Gradio demo for ESPnet2-SLU: Extending the Edge of SLU Research. To use it, simply record your audio. Read more at the links below."
|
| 106 |
+
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
|
| 107 |
|
| 108 |
+
examples=[['audio-_slurp.flac',"english"]]
|
| 109 |
|
| 110 |
+
# gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
|
| 111 |
gr.Interface(
|
| 112 |
inference,
|
| 113 |
+
[gr.inputs.Audio(label="input audio"),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")],
|
| 114 |
+
gr.outputs.Textbox(type="str", label="Output"),
|
| 115 |
title=title,
|
| 116 |
description=description,
|
| 117 |
article=article,
|
audio--1504190171-headset.flac → audio_slurp.flac
RENAMED
|
File without changes
|