Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from wenet.cli.model import load_model | |
| import os | |
| from huggingface_hub import login | |
| # Load the API token from the environment variables | |
| api_token = os.getenv('HUGGINGFACE_API_TOKEN') | |
| if not api_token: | |
| raise ValueError("No Hugging Face API token found. Please set the HUGGING_FACE_API_TOKEN environment variable.") | |
| # Login to Hugging Face Hub | |
| login(token=api_token, add_to_git_credential=True) | |
| def process_cat_embs(cat_embs): | |
| device = "cpu" | |
| cat_embs = torch.tensor( | |
| [float(c) for c in cat_embs.split(',')]).to(device) | |
| return cat_embs | |
| def download_rev_models(): | |
| from huggingface_hub import hf_hub_download | |
| import joblib | |
| REPO_ID = "Revai/reverb-asr" | |
| files = ['reverb_asr_v1.jit.zip', 'tk.units.txt'] | |
| downloaded_files = [hf_hub_download(repo_id=REPO_ID, filename=f) for f in files] | |
| model = load_model(downloaded_files[0], downloaded_files[1]) | |
| return model | |
| model = download_rev_models() | |
| def recognition(audio, style=0): | |
| if audio is None: | |
| return "Input Error! Please enter one audio!" | |
| cat_embs = ','.join([str(s) for s in (style, 1-style)]) | |
| cat_embs = process_cat_embs(cat_embs) | |
| ans = model.transcribe(audio, cat_embs = cat_embs) | |
| if ans is None: | |
| return "ERROR! No text output! Please try again!" | |
| txt = ans['text'] | |
| txt = txt.replace('β', ' ') | |
| return txt | |
| audio_input = gr.Audio(type="filepath", label="Upload or Record Audio") | |
| style_slider = gr.Slider(0, 1, value=0, step=0.1, label="Transcription Style", | |
| info="Adjust the transcription style: 0 (casual) to 1 (formal).") | |
| output_textbox = gr.Textbox(label="Transcription Output") | |
| text = "ASR Transcription Opensource Demo-CPU" | |
| # description | |
| description = ( | |
| " Opensource Automatic Speech Recognition in English" | |
| "Verbatim Transcript style(1) refers to word to word-to-word transcription of an audio" | |
| "Non Verbatim Transcript style(0) refers to just conserving the message of the original audio" | |
| ) | |
| iface = gr.Interface( | |
| fn=recognition, | |
| inputs=[audio_input, style_slider], | |
| outputs=output_textbox, | |
| title=text, | |
| description=description, | |
| theme='default', | |
| ) | |
| iface.launch() | |