Lum4yx committed
Commit 68728a0 · verified · 1 Parent(s): 48c3d6a

Update app.py

Files changed (1)
  1. app.py +28 -67
app.py CHANGED
@@ -2,11 +2,9 @@ import gradio as gr
 from textblob import TextBlob
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import torch
-import base64
 import numpy as np
-import ffmpeg
 import os
-import glob # Imported to find example files
+import glob
 
 # 1. Set up device and data type for optimized performance
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -41,7 +39,6 @@ def sentiment_analysis(text: str) -> dict:
     """
     blob = TextBlob(text)
     sentiment = blob.sentiment
-
     return {
         "transcript": text,
         "polarity": round(sentiment.polarity, 2),
@@ -49,66 +46,35 @@ def sentiment_analysis(text: str) -> dict:
         "assessment": "positive" if sentiment.polarity > 0 else "negative" if sentiment.polarity < 0 else "neutral"
     }
 
-def process_audio(audio_path: str) -> dict:
+# NEW: Simplified main function to process audio from a NumPy array
+def analyze_audio(audio: tuple) -> dict:
     """
-    Processes an audio file from a local path, transcribes it, and analyzes its sentiment.
+    Processes audio data from a NumPy array, transcribes it, and analyzes its sentiment.
+    Gradio provides the audio as a tuple (sample_rate, data).
     """
-    if not audio_path or not os.path.exists(audio_path):
-        return {"error": "Invalid or non-existent file path provided."}
-
-    try:
-        out, _ = (
-            ffmpeg
-            .input(audio_path)
-            .output('pipe:1', format='s16le', ac=1, ar=16000)
-            .run(capture_stdout=True, capture_stderr=True)
-        )
-        audio_np = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
-        transcription_result = pipe(audio_np)
-        transcript_text = transcription_result["text"]
-    except Exception as e:
-        return {"error": f"Failed to process audio file: {str(e)}"}
+    if audio is None:
+        return {"error": "No audio provided. Please upload, record, or select an example."}
 
-    return sentiment_analysis(transcript_text)
-
-def process_base64_audio(base64_data_uri: str) -> dict:
-    """
-    Decodes a Base64 audio data URI, processes it in-memory, transcribes it, and analyzes its sentiment.
-    """
-    if not isinstance(base64_data_uri, str) or "base64," not in base64_data_uri:
-        return {"error": "Invalid or empty Base64 data URI provided."}
+    # Unpack the audio tuple
+    sample_rate, audio_data = audio
+
+    # Convert the audio data to the format the model expects (float32)
+    audio_float32 = audio_data.astype(np.float32) / 32768.0
 
     try:
-        _, encoded_data = base64_data_uri.split(',', 1)
-        audio_data = base64.b64decode(encoded_data)
-        out, _ = (
-            ffmpeg
-            .input('pipe:0')
-            .output('pipe:1', format='s16le', ac=1, ar=16000)
-            .run(input=audio_data, capture_stdout=True, capture_stderr=True)
-        )
-        audio_np = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
-        transcription_result = pipe(audio_np)
-        transcript_text = transcription_result["text"]
+        # Transcribe the audio
+        transcription_result = pipe(audio_float32)
+        transcript_text = transcription_result["text"].strip()
+
+        if not transcript_text:
+            return {"error": "Transcription failed or audio was silent."}
+
     except Exception as e:
-        return {"error": f"Failed to process Base64 audio: {str(e)}"}
+        return {"error": f"Failed to transcribe audio: {str(e)}"}
 
+    # Perform sentiment analysis on the transcript
     return sentiment_analysis(transcript_text)
 
-def analyze_audio_input(audio_input: str) -> dict:
-    """
-    Router function to handle both file paths and Base64 strings.
-    This allows the Gradio UI to use file uploads and the API to use Base64.
-    """
-    # Check if the input is a valid file path provided by the Gradio component
-    if audio_input and os.path.exists(audio_input):
-        return process_audio(audio_input)
-    # Otherwise, assume it's a Base64 string from an API call
-    elif isinstance(audio_input, str):
-        return process_base64_audio(audio_input)
-    else:
-        return {"error": f"Invalid input type: {type(audio_input)}"}
-
 
 # --- Code to find and load examples ---
 examples_dir = "examples"
@@ -127,26 +93,21 @@ examples_list = [[file] for file in example_files]
 
 # Create the Gradio interface
 demo = gr.Interface(
-    fn=analyze_audio_input, # Point to the main router function
-    inputs=gr.Audio(type="filepath", label="Upload Audio File or Record"),
+    fn=analyze_audio,  # CHANGED: Point to the new, simplified function
+    inputs=gr.Audio(type="numpy", label="Upload Audio File or Record"),  # CHANGED: type="numpy"
     outputs=gr.JSON(label="Analysis Result"),
     title="🎙️ Audio Sentiment Analysis (Whisper Small)",
-    description="""
-    Analyze the sentiment of spoken words.
-    **UI**: Upload an audio file, record directly, or click an example.
-    **API**: The endpoint also accepts a Base64 encoded audio data URI as input.
-    """,
+    description="Analyze the sentiment of spoken words. Upload an audio file, record directly, or click an example below.",
     examples=examples_list,
     article="""
    ### How it Works
-    This tool uses a speech-to-text model (`openai/whisper-small`) to transcribe audio, then TextBlob analyzes the text sentiment.
-    The server can handle both local file paths (from the UI) and Base64 strings (from API calls).
+    This tool uses OpenAI's **Whisper Small** model to transcribe audio into text.
+    Then, **TextBlob** is used to perform sentiment analysis on the resulting transcript.
+    By using `type="numpy"`, the interface directly processes audio data, making it more reliable.
    """,
    theme='huggingface'
)
 
-# Launch the interface and MCP server
+# Launch the interface
 if __name__ == "__main__":
-    # Ensure ffmpeg is installed on your system.
-    # pip install gradio textblob "transformers[torch]" accelerate safetensors ffmpeg-python numpy
     demo.launch(mcp_server=True)
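Note on the new input format: `gr.Audio(type="numpy")` hands `analyze_audio` a `(sample_rate, data)` tuple, typically with int16 samples, which is why the commit divides by 32768.0. Below is a minimal sketch of that input shape; the generated test tone and the commented-out `pipe(...)` call are illustrative assumptions, not part of the commit, and the dict form shown is one way the transformers ASR pipeline can be told the actual sampling rate instead of receiving a bare array.

```python
import numpy as np

# What gr.Audio(type="numpy") passes to analyze_audio(): (sample_rate, samples),
# where samples is usually an int16 NumPy array.
sample_rate = 48000
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
tone = (0.3 * np.sin(2 * np.pi * 440.0 * t) * 32767).astype(np.int16)  # 1 s, 440 Hz test tone
audio_input = (sample_rate, tone)

# The commit's conversion: scale int16 samples into the [-1.0, 1.0] float32 range.
sr, samples = audio_input
audio_float32 = samples.astype(np.float32) / 32768.0

# Passing only the array leaves the pipeline to assume its default sampling rate;
# the ASR pipeline also accepts a dict that carries the true rate, e.g.:
# result = pipe({"raw": audio_float32, "sampling_rate": sr})
```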
 
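The sentiment step itself is untouched by this commit. For reference, here is a standalone sketch of how TextBlob produces the polarity score that `sentiment_analysis` maps to positive/negative/neutral; the sample sentence is made up, and the subjectivity field is TextBlob's second score rather than something taken from the diff.

```python
from textblob import TextBlob

text = "I really enjoyed the demo, it worked great."  # made-up transcript
sentiment = TextBlob(text).sentiment

print({
    "transcript": text,
    "polarity": round(sentiment.polarity, 2),          # -1.0 (negative) to 1.0 (positive)
    "subjectivity": round(sentiment.subjectivity, 2),  # 0.0 (objective) to 1.0 (subjective)
    "assessment": "positive" if sentiment.polarity > 0
    else "negative" if sentiment.polarity < 0
    else "neutral",
})
```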