File size: 4,384 Bytes
f0d5b79
 
 
 
 
 
 
 
 
aa14541
 
f0d5b79
aa14541
 
f0d5b79
ba6a1e9
 
 
 
 
 
 
 
 
 
 
 
 
 
f0d5b79
ba6a1e9
f0d5b79
 
 
 
 
 
 
aa14541
f0d5b79
 
 
aa14541
f0d5b79
 
 
ba6a1e9
346a58e
f0d5b79
 
 
 
 
 
 
ba6a1e9
 
f0d5b79
 
e76aabe
 
1e39ba8
f0d5b79
 
 
 
 
 
 
 
 
 
 
 
ba6a1e9
de4881c
f0d5b79
 
 
 
ba6a1e9
f0d5b79
 
 
 
 
 
 
 
 
 
486be98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0d5b79
 
1e39ba8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration

# Pick the torch device once at import time: CUDA when torch reports it as
# available, CPU otherwise. The same value is reused for model placement and
# for moving inputs inside the request handler.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load both checkpoints (processor first, then weights) eagerly at start-up so
# that a request only has to choose between already-initialised objects.
voxtral_mini_processor = AutoProcessor.from_pretrained(
    "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
)
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained(
    "MohamedRashad/Voxtral-Mini-3B-2507-transformers",
    torch_dtype=torch.bfloat16,
    device_map=device,
)

voxtral_small_processor = AutoProcessor.from_pretrained(
    "MohamedRashad/Voxtral-Small-24B-2507-transformers"
)
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained(
    "MohamedRashad/Voxtral-Small-24B-2507-transformers",
    torch_dtype=torch.bfloat16,
    device_map=device,
)

# Maps the language name shown in the UI dropdown to the short code that is
# handed to the processor as `language=`. Insertion order is the order in
# which the dropdown lists its choices.
LANGUAGES = dict(
    English="en",
    French="fr",
    German="de",
    Spanish="es",
    Italian="it",
    Portuguese="pt",
    Dutch="nl",
    Russian="ru",
    Chinese="zh",
    Japanese="ja",
    Arabic="ar",
)

@spaces.GPU()
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
    """Transcribe an audio file with the selected Voxtral model.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (nothing uploaded) short-circuits with a prompt message.
        model_name: UI label of the model to use, either
            "Voxtral Mini (3B)" or "Voxtral Small (24B)".
        lang_name: Key of ``LANGUAGES`` naming the language of the audio.
        max_tokens: Upper bound on newly generated tokens; coerced to ``int``
            before being passed to ``generate`` as ``max_new_tokens``.

    Returns:
        The decoded transcription string, or a human-readable message when
        the audio, model, or language selection is missing/invalid.
    """
    if not audio_path:
        return "Please upload an audio file."

    if model_name == "Voxtral Mini (3B)":
        model = voxtral_mini_model
        processor = voxtral_mini_processor
        repo_id = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
    elif model_name == "Voxtral Small (24B)":
        model = voxtral_small_model
        processor = voxtral_small_processor
        repo_id = "MohamedRashad/Voxtral-Small-24B-2507-transformers"
    else:
        return "Invalid model selected."

    # Mirror the model validation above: an unknown (or cleared) language
    # yields a message instead of an unhandled KeyError.
    language = LANGUAGES.get(lang_name)
    if language is None:
        return "Invalid language selected."

    # NOTE(review): the processor method was originally published with the
    # misspelled name `apply_transcrition_request`; later transformers
    # releases are understood to add the correctly spelled name and deprecate
    # the typo - confirm against the installed version. Prefer the correct
    # spelling when present and fall back to the original one otherwise.
    build_request = getattr(processor, "apply_transcription_request", None)
    if build_request is None:
        build_request = processor.apply_transcrition_request
    inputs = build_request(language=language, audio=audio_path, model_id=repo_id)
    inputs = inputs.to(device, dtype=torch.bfloat16)

    outputs = model.generate(**inputs, max_new_tokens=int(max_tokens))

    # Drop the prompt tokens so only the newly generated continuation is
    # decoded.
    prompt_length = inputs.input_ids.shape[1]
    decoded_outputs = processor.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )

    return decoded_outputs[0]



# Define Gradio interface
# Build the UI: inputs (audio, model, language, token budget) in the left
# column, the transcription text box in the right column, plus a set of
# clickable example rows that pre-fill the inputs.
with gr.Blocks(title="Voxtral Demo") as demo:
    gr.Markdown("# Voxtral Transcription Demo")
    gr.Markdown("Upload an audio file and get a transcription from Voxtral.")
    gr.Markdown("You can find the `transformers` version of Voxtral here: [3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), [24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)")

    with gr.Row():
        with gr.Column():
            # type="filepath" hands the handler a path string rather than
            # decoded samples.
            audio_input = gr.Audio(type="filepath", label="Upload Audio")

            # The choice labels must match the strings compared in
            # process_audio.
            model_selector = gr.Dropdown(
                choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
                value="Voxtral Mini (3B)",
                label="Select Model"
            )

            language = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language"
            )

            max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
            submit_btn = gr.Button("Extract Transcription", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(label="Generated Response", lines=10)

    # Component order here matches process_audio's positional parameters.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, model_selector, language, max_tokens],
        outputs=output_text
    )

    # Each example row supplies one value per input component, in order.
    # NOTE(review): the keyword is `example_labels` per the Gradio Examples
    # API (the previous `examples_labels` spelling is not a documented
    # parameter) - confirm against the installed Gradio version.
    gr.Examples(
        examples=[
            ["examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", 500],
            ["examples/french_mathis_voice_intro.mp3", "Voxtral Mini (3B)", "French", 500],
            ["examples/german_spehr_voice_intro.mp3", "Voxtral Mini (3B)", "German", 500],
            ["examples/japanese_ann01_announcement.mp3", "Voxtral Mini (3B)", "Japanese", 500],
        ],
        inputs=[audio_input, model_selector, language, max_tokens],
        example_labels=[
            "Neil Armstrong's 'small step' (English, 24s)",
            "Rémi Mathis voice intro (French, 16s)",
            "Christoph Spehr voice intro (German, 28s)",
            "Ann01 announcement (Japanese, 22s)"
        ]
    )

# Start the Gradio server only when this file is executed as a script,
# with request queuing enabled and no public share link.
if __name__ == "__main__":
    queued_demo = demo.queue()
    queued_demo.launch(share=False)