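# Gradio demo: transcribe uploaded audio with Voxtral Mini (3B) or Voxtral Small (24B)
# using the transformers ports of the Mistral Voxtral checkpoints.
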
import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Load both Voxtral model/processor pairs up front so they stay resident in memory
voxtral_mini_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Mini-3B-2507-transformers")
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Mini-3B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
voxtral_small_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers")
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
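
# UI language choices mapped to the ISO 639-1 codes passed to the processor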
LANGUAGES = {
"English": "en",
"French": "fr",
"German": "de",
"Spanish": "es",
"Italian": "it",
"Portuguese": "pt",
"Dutch": "nl",
"Russian": "ru",
"Chinese": "zh",
"Japanese": "ja",
"Arabic": "ar",
}
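
# @spaces.GPU requests GPU time per call on Hugging Face ZeroGPU hardware;
# outside of a ZeroGPU Space the decorator has no effect.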
@spaces.GPU()
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
"""Process audio with selected Voxtral model and return the generated response"""
if not audio_path:
return "Please upload an audio file."
if model_name == "Voxtral Mini (3B)":
model = voxtral_mini_model
processor = voxtral_mini_processor
repo_id = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
elif model_name == "Voxtral Small (24B)":
model = voxtral_small_model
processor = voxtral_small_processor
repo_id = "MohamedRashad/Voxtral-Small-24B-2507-transformers"
else:
return "Invalid model selected."
language = LANGUAGES[lang_name]
inputs = processor.apply_transcrition_request(language=language, audio=audio_path, model_id=repo_id)
inputs = inputs.to(device, dtype=torch.bfloat16)
outputs = model.generate(**inputs, max_new_tokens=max_tokens)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
return decoded_outputs[0]
# Define Gradio interface
with gr.Blocks(title="Voxtral Demo") as demo:
gr.Markdown("# Voxtral Transcription Demo")
gr.Markdown("Upload an audio file and get a transcription from Voxtral.")
gr.Markdown("You can find the `transformers` version of Voxtral here: [3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), [24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)")
with gr.Row():
with gr.Column():
audio_input = gr.Audio(type="filepath", label="Upload Audio")
model_selector = gr.Dropdown(
choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
value="Voxtral Mini (3B)",
label="Select Model"
)
language = gr.Dropdown(
choices=list(LANGUAGES.keys()),
value="English",
label="Language"
)
max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
submit_btn = gr.Button("Extract Transcription", variant="primary")
with gr.Column():
output_text = gr.Textbox(label="Generated Response", lines=10)
submit_btn.click(
fn=process_audio,
inputs=[audio_input, model_selector, language, max_tokens],
outputs=output_text
)
gr.Examples(
examples=[
["examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", 500],
["examples/french_mathis_voice_intro.mp3", "Voxtral Mini (3B)", "French", 500],
["examples/german_spehr_voice_intro.mp3", "Voxtral Mini (3B)", "German", 500],
["examples/japanese_ann01_announcement.mp3", "Voxtral Mini (3B)", "Japanese", 500],
],
inputs=[audio_input, model_selector, language, max_tokens],
        example_labels=[
            "Neil Armstrong's 'small step' (English, 24s)",
            "Rémi Mathis voice intro (French, 16s)",
            "Christoph Spehr voice intro (German, 28s)",
            "Ann01 announcement (Japanese, 22s)"
        ]
    )
# Launch the app
if __name__ == "__main__":
    demo.queue().launch(share=False)