import os
import tempfile

import librosa
import numpy as np
import soundfile as sf
import torch
import gradio as gr
from transformers import pipeline

# Load models globally so they are initialized once, not on every request
device = 0 if torch.cuda.is_available() else -1  # transformers convention: GPU index, or -1 for CPU
models = {
    # chunk_length_s=30 lets Whisper transcribe audio longer than its 30-second window
    'transcriber': pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en", device=device, chunk_length_s=30),
    'summarizer': pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
}
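
# Note: whisper-tiny.en trades accuracy for speed; swapping in a larger
# checkpoint such as "openai/whisper-base.en" (same pipeline call) should
# improve transcription quality at the cost of memory and latency.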

def load_and_convert_audio(audio_path):
    """Resample audio to 16 kHz mono with librosa and write it to a temporary WAV file.

    Returns the temp file path; the caller is responsible for deleting it.
    """
    audio_data, sample_rate = librosa.load(audio_path, sr=16000)  # Whisper expects 16 kHz mono
    audio_data = audio_data.astype(np.float32)

    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_wav:
        sf.write(temp_wav.name, audio_data, sample_rate, format='WAV')
        return temp_wav.name

def process_audio(audio_path):
    """Process an audio file and return its transcription and summary"""
    results = {}
    temp_wav_path = None  # keeps the finally block safe if conversion fails

    try:
        temp_wav_path = load_and_convert_audio(audio_path)

        # Transcription
        transcription = models['transcriber'](temp_wav_path, return_timestamps=True)
        results['transcription'] = transcription['text'] if isinstance(transcription, dict) else ' '.join(chunk['text'] for chunk in transcription)

        # Summarization: DistilBART accepts only ~1024 input tokens, so split the
        # transcript into ~1000-word chunks and summarize each one separately
        text = results['transcription']
        words = text.split()
        chunk_size = 1000
        chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

        summaries = [models['summarizer'](chunk, max_length=200, min_length=50, truncation=True)[0]['summary_text'] for chunk in chunks]
        results['summary'] = ' '.join(summaries)

    except Exception as e:
        return {'error': str(e)}  # surface the failure to the UI instead of crashing

    finally:
        if temp_wav_path and os.path.exists(temp_wav_path):
            os.unlink(temp_wav_path)

    return results

def gradio_interface(audio):
    """Gradio interface function; `audio` is a filepath string from gr.Audio"""
    if audio is None:
        return {'error': 'No audio file provided'}
    return process_audio(audio)

# Create Gradio interface (Gradio 4.x API: gr.inputs.* was removed, and
# type="filepath" passes the function a path string rather than a file object)
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File"),
    outputs=gr.JSON(label="Results"),
    title="Audio Transcription and Summarization",
    description="Upload an audio file to get its transcription and summary."
)

if __name__ == "__main__":
    iface.launch()
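
# A minimal sketch of exercising the pipeline without the UI, assuming a local
# recording named "sample.wav" (the filename is hypothetical):
#
#     result = process_audio("sample.wav")
#     print(result.get('error') or result['transcription'])
#     print(result.get('summary', ''))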