Voxtral / app.py
victor's picture
victor HF Staff
Add audio examples with Git LFS support
486be98
raw
history blame
4.38 kB
import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load both Voxtral checkpoints once at import time so every request reuses
# them. Weights are loaded in bfloat16 and placed directly on `device`.
voxtral_mini_processor = AutoProcessor.from_pretrained(
    "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
)
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained(
    "MohamedRashad/Voxtral-Mini-3B-2507-transformers",
    torch_dtype=torch.bfloat16,
    device_map=device,
)

voxtral_small_processor = AutoProcessor.from_pretrained(
    "MohamedRashad/Voxtral-Small-24B-2507-transformers"
)
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained(
    "MohamedRashad/Voxtral-Small-24B-2507-transformers",
    torch_dtype=torch.bfloat16,
    device_map=device,
)
# Display name shown in the language dropdown -> language code handed to the
# processor when building a transcription request. Insertion order is the
# order in which the choices appear in the UI.
LANGUAGES: dict[str, str] = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Arabic": "ar",
}
@spaces.GPU()
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
    """Transcribe an audio file with the selected Voxtral model.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (``None`` or an empty string) means nothing was uploaded.
        model_name: Display name of the model to use, either
            ``"Voxtral Mini (3B)"`` or ``"Voxtral Small (24B)"``.
        lang_name: Display name of the spoken language; must be a key of
            ``LANGUAGES``.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded transcription, or a short human-readable message when
        the audio, model, or language selection is invalid.
    """
    if not audio_path:
        return "Please upload an audio file."

    if model_name == "Voxtral Mini (3B)":
        model = voxtral_mini_model
        processor = voxtral_mini_processor
        repo_id = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
    elif model_name == "Voxtral Small (24B)":
        model = voxtral_small_model
        processor = voxtral_small_processor
        repo_id = "MohamedRashad/Voxtral-Small-24B-2507-transformers"
    else:
        return "Invalid model selected."

    # Report an unknown language the same way as the other invalid inputs
    # instead of letting a KeyError escape to the caller.
    language = LANGUAGES.get(lang_name)
    if language is None:
        return "Invalid language selected."

    # NOTE(review): the method name used here originally,
    # `apply_transcrition_request`, is misspelled; newer transformers
    # releases are understood to provide `apply_transcription_request` and
    # deprecate the old spelling -- confirm against the pinned version.
    # Prefer the corrected name and fall back to the legacy one.
    build_request = getattr(processor, "apply_transcription_request", None)
    if build_request is None:
        build_request = processor.apply_transcrition_request

    inputs = build_request(language=language, audio=audio_path, model_id=repo_id)
    inputs = inputs.to(device, dtype=torch.bfloat16)

    # The token budget may arrive as a float (e.g. from a UI slider or an API
    # call); generation expects an integer count.
    outputs = model.generate(**inputs, max_new_tokens=int(max_tokens))

    # Skip the prompt tokens so that only the newly generated text is decoded.
    prompt_length = inputs.input_ids.shape[1]
    decoded_outputs = processor.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )
    return decoded_outputs[0]
# Define the Gradio interface: inputs on the left, transcription on the right,
# followed by a set of clickable example clips.
# NOTE(review): the nesting below is reconstructed from statement order; confirm
# that the button belongs in the input column and the examples at Blocks level.
with gr.Blocks(title="Voxtral Demo") as demo:
    gr.Markdown("# Voxtral Transcription Demo")
    gr.Markdown("Upload an audio file and get a transcription from Voxtral.")
    gr.Markdown("You can find the `transformers` version of Voxtral here: [3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), [24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)")

    with gr.Row():
        with gr.Column():
            # `type="filepath"` hands process_audio a path on disk.
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            model_selector = gr.Dropdown(
                choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
                value="Voxtral Mini (3B)",
                label="Select Model",
            )
            language = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language",
            )
            max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
            submit_btn = gr.Button("Extract Transcription", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Generated Response", lines=10)

    # Component order here must match process_audio's positional parameters.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, model_selector, language, max_tokens],
        outputs=output_text,
    )

    # The keyword is `example_labels` (singular "example"); the previous
    # spelling `examples_labels` is not a parameter of gr.Examples.
    # One label per example row, in the same order.
    gr.Examples(
        examples=[
            ["examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", 500],
            ["examples/french_mathis_voice_intro.mp3", "Voxtral Mini (3B)", "French", 500],
            ["examples/german_spehr_voice_intro.mp3", "Voxtral Mini (3B)", "German", 500],
            ["examples/japanese_ann01_announcement.mp3", "Voxtral Mini (3B)", "Japanese", 500],
        ],
        inputs=[audio_input, model_selector, language, max_tokens],
        example_labels=[
            "Neil Armstrong's 'small step' (English, 24s)",
            "Rémi Mathis voice intro (French, 16s)",
            "Christoph Spehr voice intro (German, 28s)",
            "Ann01 announcement (Japanese, 22s)",
        ],
    )
# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    # Enable request queuing, then launch without a public share link.
    queued_app = demo.queue()
    queued_app.launch(share=False)