Spaces:
Running
Running
import logging
import time
import warnings

import gradio as gr
import librosa
# import torchaudio
from transformers import pipeline
from transformers.utils.logging import disable_progress_bar
# Suppress all Python warnings and the transformers progress bars so the
# application log stays readable.
warnings.filterwarnings("ignore")
disable_progress_bar()

# The timestamp format below ends in a literal "Z" (the UTC designator), but
# logging renders local time by default. Switch the formatter clock to UTC so
# the suffix is truthful on hosts that are not running in UTC.
logging.Formatter.converter = time.gmtime
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Checkpoint loaded into the speech-recognition pipeline.
MODEL_NAME = "bofenghuang/asr-wav2vec2-ctc-french"
# Sampling rate (Hz) that input audio is converted to before transcription.
SAMPLE_RATE = 16_000

# Built once at import time and shared by every transcription request.
pipe = pipeline(model=MODEL_NAME)
logger.info("ASR pipeline has been initialized")
def process_audio_file(audio_file):
    """Load an audio file as a mono waveform sampled at ``SAMPLE_RATE``.

    Args:
        audio_file: Path of the audio file to decode; passed straight to
            ``librosa.load``.

    Returns:
        The waveform produced by ``librosa.load``, at ``SAMPLE_RATE`` Hz.
    """
    # Ask librosa for the target rate directly. Without ``sr`` it first
    # resamples to its 22 050 Hz default, and a second resample to
    # SAMPLE_RATE would then be needed, costing both time and audio quality.
    waveform, _ = librosa.load(audio_file, sr=SAMPLE_RATE, mono=True)
    return waveform
def transcribe(microphone_audio_file, uploaded_audio_file):
    """Transcribe the microphone recording or, failing that, the uploaded file.

    Args:
        microphone_audio_file: Path of the microphone recording, or ``None``.
        uploaded_audio_file: Path of the uploaded audio file, or ``None``.

    Returns:
        The transcription text, prefixed with a warning when both inputs were
        supplied (the recording is used in that case), or an error message
        when neither input was supplied.
    """
    has_recording = microphone_audio_file is not None
    has_upload = uploaded_audio_file is not None

    # Nothing to transcribe: report it to the UI instead of failing.
    if not has_recording and not has_upload:
        return "ERROR: You have to either use the microphone or upload an audio file"

    notice = ""
    if has_recording and has_upload:
        notice = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording always takes precedence over the upload.
    audio_file = microphone_audio_file if has_recording else uploaded_audio_file

    samples = process_audio_file(audio_file)

    # Chunked inference (30 s chunks, 5 s stride) is requested from the pipeline.
    text = pipe(samples, chunk_length_s=30, stride_length_s=5)["text"]

    logger.info(f"Transcription for {audio_file}: {text}")

    return notice + text
# Gradio UI: two optional audio inputs (microphone recording and file upload),
# both handed to `transcribe` as file paths; the result is rendered as text.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Record something...", optional=True),
        gr.Audio(source="upload", type="filepath", label="Upload some audio file...", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    # theme="huggingface",
    title="Speech-to-Text in French",
    # User-facing blurb: duplicated word ("the the") removed and the
    # mis-encoded emoji restored.
    description=(
        "Realtime demo for French automatic speech recognition. "
        f"Demo uses the fine-tuned checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) "
        "and 🤗 Transformers to transcribe audio files of arbitrary length."
    ),
    allow_flagging="never",
)

# iface.launch(server_name="0.0.0.0", debug=True, share=True)
iface.launch(enable_queue=True)