import tempfile

import gradio as gr
import torch
import torchaudio
from pydub import AudioSegment, silence
from transformers import WhisperProcessor, WhisperForConditionalGeneration
MODEL_NAME = "dataprizma/whisper-large-v3-turbo"

# Load the fine-tuned Whisper model and its processor once at startup.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
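# Optional sketch (not what this Space does): on a CUDA device the weights
# could be loaded in float16 to roughly halve GPU memory, e.g.
#   model = WhisperForConditionalGeneration.from_pretrained(
#       MODEL_NAME, torch_dtype=torch.float16
#   ).to(device)
# The code below keeps the default float32 weights, which also run on CPU.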
def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh=-40):
    """Split `audio` into chunks of min_len..max_len milliseconds, preferring
    to cut at detected silences rather than mid-word."""
    # Midpoints (in ms) of every silent stretch of at least 500 ms.
    silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
    silences = [(start + end) // 2 for start, end in silences]

    chunks = []
    start = 0
    while start < len(audio):
        end = min(start + max_len, len(audio))
        # Pick the latest silence midpoint that still keeps the chunk >= min_len;
        # if no silence falls in that window, cut hard at max_len.
        candidates = [s for s in silences if start + min_len <= s <= end]
        split_point = candidates[-1] if candidates else end
        chunks.append(audio[start:split_point])
        start = split_point
    return chunks
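# Illustrative usage (hypothetical file name): a 60 s clip comes back as chunks
# of roughly 15-25 s each, cut at the quietest points pydub detects.
#   clip = AudioSegment.from_file("sample.wav")
#   parts = split_on_silence_with_duration_control(clip, min_len=15000, max_len=25000)
#   print([len(p) / 1000 for p in parts])  # chunk lengths in seconds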
def transcribe(audio_file):
    # Load the uploaded file with pydub (ffmpeg handles the container format).
    audio = AudioSegment.from_file(audio_file)

    # Whisper expects mono audio at 16 kHz.
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != 16000:
        audio = audio.set_frame_rate(16000)

    # Split into 15-25 s chunks at silences so each fits Whisper's 30 s window.
    chunks = split_on_silence_with_duration_control(
        audio, min_len=15000, max_len=25000, silence_thresh=-40
    )

    # Transcribe each chunk and join the results.
    results = []
    for chunk in chunks:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
            chunk.export(tmpfile.name, format="wav")
            waveform, _ = torchaudio.load(tmpfile.name)
        input_features = processor(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt",
        ).input_features.to(device)
        with torch.no_grad():
            # The target language is a generation argument, not a
            # feature-extraction argument, so it belongs in generate().
            predicted_ids = model.generate(input_features, language="uz", task="transcribe")
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        results.append(transcription)
    return " ".join(results)
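# Illustrative usage (hypothetical path), independent of the Gradio UI:
#   text = transcribe("uzbek_interview.mp3")
#   print(text)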
demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Audio file"),
    outputs="text",
    title="Whisper Large V3 Turbo: Transcribe Audio",
    description="Whisper Large V3 Turbo fine-tuned for the Uzbek language by Dataprizma",
)

with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])

demo.launch()
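# To run this Space locally (assuming gradio, transformers, torch, torchaudio,
# pydub, and an ffmpeg binary for pydub are installed):
#   python app.py
# then open the local URL that Gradio prints.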