# Hugging Face Space: Kabyle speech-to-text demo (Gradio + NVIDIA NeMo).
import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import torch

# Load the pre-trained Kabyle ASR model (Conformer-Transducer, large).
# Downloads the checkpoint from NGC/HF on first run, so this can take a while.
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "nvidia/stt_kab_conformer_transducer_large"
)
| # Function to transcribe the audio input | |
def transcribe(audio):
    """Transcribe a Gradio audio input with the Kabyle ASR model.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        Gradio's numpy audio format: ``(sample_rate, samples)``. Samples are
        typically int16, mono ``(n,)`` or stereo ``(n, channels)``. ``None``
        when the user submits without recording anything.

    Returns
    -------
    str
        The transcription text, or a short error message for invalid input.
    """
    # Gradio passes None when the form is submitted with no recording.
    if audio is None:
        return "No audio received"

    sample_rate, audio_data = audio

    if isinstance(audio_data, torch.Tensor):
        # Move off any accelerator before converting to numpy.
        audio_data = audio_data.cpu().numpy()
    elif not isinstance(audio_data, np.ndarray):
        return "Invalid audio format"

    # Downmix stereo (n, channels) to mono — the model expects a 1-D signal.
    if audio_data.ndim == 2:
        audio_data = audio_data.mean(axis=1)

    # Convert integer PCM (Gradio records int16) to float32 in [-1, 1],
    # the sample format NeMo's transcribe() expects for raw signals.
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)

    # Resample to the model's 16 kHz training rate if the mic used another rate;
    # feeding a mismatched rate silently produces garbage transcriptions.
    target_rate = 16000
    if sample_rate != target_rate:
        from scipy.signal import resample_poly

        g = np.gcd(int(sample_rate), target_rate)
        audio_data = resample_poly(audio_data, target_rate // g, int(sample_rate) // g)

    # transcribe() returns a list with one result per input signal; unwrap it
    # so the UI shows plain text instead of "['...']".
    result = asr_model.transcribe([audio_data])[0]
    # Newer NeMo versions return Hypothesis objects; older ones return str.
    return getattr(result, "text", result)
# Build the web UI: microphone/file audio in, transcription text out.
# The plain "audio" input shorthand delivers (sample_rate, np.ndarray) tuples,
# which is the format transcribe() expects.
iface = gr.Interface(fn=transcribe, inputs="audio", outputs="text")

# Launch the Gradio app (blocks until the server is stopped).
iface.launch()