shukdevdattaEX commited on
Commit
f40b063
·
verified ·
1 Parent(s): 5ceab5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -41
app.py CHANGED
@@ -13,6 +13,8 @@ from typing import List, Tuple, Optional
13
  import json
14
  import pydub
15
  from pydub import AudioSegment
 
 
16
 
17
  class MultimodalChatbot:
18
  def __init__(self, api_key: str):
@@ -22,7 +24,18 @@ class MultimodalChatbot:
22
  )
23
  self.model = "google/gemma-3n-e2b-it:free"
24
  self.conversation_history = []
25
-
 
 
 
 
 
 
 
 
 
 
 
26
  def encode_image_to_base64(self, image) -> str:
27
  """Convert PIL Image or file path to base64 string"""
28
  try:
@@ -105,8 +118,8 @@ class MultimodalChatbot:
105
  except Exception as e:
106
  return f"Error transcribing audio: {str(e)}"
107
 
108
- def process_video(self, video_file) -> Tuple[List[str], str]:
109
- """Process video file (metadata only, no visual analysis)"""
110
  try:
111
  if isinstance(video_file, str):
112
  video_path = video_file
@@ -117,17 +130,21 @@ class MultimodalChatbot:
117
 
118
  cap = cv2.VideoCapture(video_path)
119
  if not cap.isOpened():
120
- return [], "Error: Could not open video file"
121
 
122
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
123
- fps = cap.get(cv2.CAP_PROP_FPS)
124
- duration = total_frames / fps if fps > 0 else 0
 
 
125
  cap.release()
126
-
127
- description = f"Video metadata: {total_frames} frames, {duration:.1f} seconds. Visual analysis not supported by the current model."
128
- return [], description
 
 
129
  except Exception as e:
130
- return [], f"Error processing video: {str(e)}"
131
 
132
  def create_multimodal_message(self,
133
  text_input: str = "",
@@ -152,28 +169,44 @@ class MultimodalChatbot:
152
  content_parts.append({"type": "text", "text": f"Audio Transcription:\n{audio_text}"})
153
  processing_info.append("🎀 Audio transcribed")
154
 
155
- if image_file is not None:
156
- if isinstance(image_file, Image.Image):
157
- width, height = image_file.size
158
- mode = image_file.mode
159
- content_parts.append({
160
- "type": "text",
161
- "text": f"Image uploaded: {width}x{height} pixels, mode: {mode}. Visual analysis not supported by the current model. Please describe the image for further assistance."
162
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  else:
164
- content_parts.append({
165
- "type": "text",
166
- "text": "Image uploaded. Visual analysis not supported by the current model. Please describe the image for further assistance."
167
- })
168
- processing_info.append("🖼️ Image received (metadata only)")
169
-
170
- if video_file is not None:
171
- _, video_desc = self.process_video(video_file)
172
- content_parts.append({
173
- "type": "text",
174
- "text": f"Video uploaded: {video_desc}. Please describe the video for further assistance."
175
- })
176
- processing_info.append("🎥 Video processed (metadata only)")
177
 
178
  return {"role": "user", "content": content_parts}, processing_info
179
 
@@ -239,8 +272,8 @@ def create_interface():
239
  - **Text**: Regular text messages
240
  - **PDF**: Extract and analyze document content
241
  - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
242
- - **Images**: Upload images (metadata only; visual analysis not supported)
243
- - **Video**: Upload videos (metadata only; visual analysis not supported)
244
 
245
  **Setup**: Enter your OpenRouter API key below to get started
246
  """)
@@ -492,7 +525,7 @@ def create_interface():
492
  )
493
  text_input.submit(
494
  process_text_input,
495
- inputs=[api_key_input, text_input, text_chatbot],
496
  outputs=[text_chatbot, text_input]
497
  )
498
  text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])
@@ -546,11 +579,11 @@ def create_interface():
546
  - Supports: WAV, MP3, M4A, FLAC, OGG formats
547
  - Best results with clear speech and minimal background noise
548
 
549
- **πŸ–ΌοΈ Image Chat**: Upload images (metadata only; visual analysis not supported)
550
- - Provide a text description of the image for further assistance
551
 
552
- **🎥 Video Chat**: Upload videos (metadata only; visual analysis not supported)
553
- - Provide a text description of the video for further assistance
554
 
555
  **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
556
 
@@ -562,8 +595,8 @@ def create_interface():
562
  5. Copy and paste it in the field above
563
 
564
  ### ⚠️ Current Limitations:
565
- - Image and video visual analysis not supported by the free Gemma 3n model
566
- - Audio transcription requires internet connection for best results
567
  - Large files may take longer to process
568
  """)
569
 
@@ -578,7 +611,9 @@ if __name__ == "__main__":
578
  "SpeechRecognition",
579
  "opencv-python",
580
  "numpy",
581
- "pydub"
 
 
582
  ]
583
 
584
  print("🚀 Multimodal Chatbot with Gemma 3n")
 
13
  import json
14
  import pydub
15
  from pydub import AudioSegment
16
+ from transformers import pipeline
17
+ import torch
18
 
19
  class MultimodalChatbot:
20
  def __init__(self, api_key: str):
 
24
  )
25
  self.model = "google/gemma-3n-e2b-it:free"
26
  self.conversation_history = []
27
+ # Initialize the pipeline for image-text-to-text processing
28
+ try:
29
+ self.pipe = pipeline(
30
+ "image-text-to-text",
31
+ model="google/gemma-3n-e2b",
32
+ device="cpu", # Optimized for CPU in HF Spaces
33
+ torch_dtype=torch.float32, # Use float32 for CPU compatibility
34
+ )
35
+ except Exception as e:
36
+ print(f"Error initializing pipeline: {e}")
37
+ self.pipe = None
38
+
39
  def encode_image_to_base64(self, image) -> str:
40
  """Convert PIL Image or file path to base64 string"""
41
  try:
 
118
  except Exception as e:
119
  return f"Error transcribing audio: {str(e)}"
120
 
121
+ def extract_video_frame(self, video_file, frame_number=None):
122
+ """Extract a frame from the video"""
123
  try:
124
  if isinstance(video_file, str):
125
  video_path = video_file
 
130
 
131
  cap = cv2.VideoCapture(video_path)
132
  if not cap.isOpened():
133
+ return None
134
 
135
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
136
+ if frame_number is None:
137
+ frame_number = total_frames // 2 # Extract middle frame
138
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
139
+ ret, frame = cap.read()
140
  cap.release()
141
+ if ret:
142
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
143
+ return Image.fromarray(frame)
144
+ else:
145
+ return None
146
  except Exception as e:
147
+ return None
148
 
149
  def create_multimodal_message(self,
150
  text_input: str = "",
 
169
  content_parts.append({"type": "text", "text": f"Audio Transcription:\n{audio_text}"})
170
  processing_info.append("🎀 Audio transcribed")
171
 
172
+ if image_file is not None and self.pipe is not None:
173
+ try:
174
+ if isinstance(image_file, str):
175
+ image = Image.open(image_file)
176
+ else:
177
+ image = image_file
178
+ # Use user's text input as prompt, or default if none provided
179
+ prompt = f"<image_soft_token> {text_input}" if text_input else "<image_soft_token> Describe this image"
180
+ output = self.pipe(image, text=prompt)
181
+ description = output[0]['generated_text']
182
+ content_parts.append({"type": "text", "text": f"Image analysis: {description}"})
183
+ processing_info.append("πŸ–ΌοΈ Image analyzed")
184
+ except Exception as e:
185
+ content_parts.append({"type": "text", "text": f"Error analyzing image: {str(e)}"})
186
+ processing_info.append("πŸ–ΌοΈ Image analysis failed")
187
+ elif image_file is not None:
188
+ content_parts.append({"type": "text", "text": "Image uploaded. Analysis failed due to model initialization error."})
189
+ processing_info.append("πŸ–ΌοΈ Image received (analysis failed)")
190
+
191
+ if video_file is not None and self.pipe is not None:
192
+ frame = self.extract_video_frame(video_file)
193
+ if frame:
194
+ try:
195
+ # Use user's text input with context, or default for frame
196
+ prompt = f"<image_soft_token> This is a frame from the video. {text_input}" if text_input else "<image_soft_token> Describe this frame from the video"
197
+ output = self.pipe(frame, text=prompt)
198
+ description = output[0]['generated_text']
199
+ content_parts.append({"type": "text", "text": f"Video frame analysis: {description}. Please describe the video for further assistance."})
200
+ processing_info.append("🎥 Video frame analyzed")
201
+ except Exception as e:
202
+ content_parts.append({"type": "text", "text": f"Error analyzing video frame: {str(e)}"})
203
+ processing_info.append("🎥 Video frame analysis failed")
204
  else:
205
+ content_parts.append({"type": "text", "text": "Could not extract frame from video. Please describe the video."})
206
+ processing_info.append("🎥 Video processing failed")
207
+ elif video_file is not None:
208
+ content_parts.append({"type": "text", "text": "Video uploaded. Analysis failed due to model initialization error."})
209
+ processing_info.append("🎥 Video received (analysis failed)")
 
 
 
 
 
 
 
 
210
 
211
  return {"role": "user", "content": content_parts}, processing_info
212
 
 
272
  - **Text**: Regular text messages
273
  - **PDF**: Extract and analyze document content
274
  - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
275
+ - **Images**: Upload images for analysis using Gemma 3n
276
+ - **Video**: Upload videos for basic frame analysis using Gemma 3n
277
 
278
  **Setup**: Enter your OpenRouter API key below to get started
279
  """)
 
525
  )
526
  text_input.submit(
527
  process_text_input,
528
+ inputs=[api_key_input, text_input, text_chatbot],
529
  outputs=[text_chatbot, text_input]
530
  )
531
  text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])
 
579
  - Supports: WAV, MP3, M4A, FLAC, OGG formats
580
  - Best results with clear speech and minimal background noise
581
 
582
+ **πŸ–ΌοΈ Image Chat**: Upload images for analysis using Gemma 3n
583
+ - Provide a text prompt to guide the analysis (e.g., "What is in this image?")
584
 
585
+ **🎥 Video Chat**: Upload videos for basic frame analysis using Gemma 3n
586
+ - Analysis is based on a single frame; provide a text description for full video context
587
 
588
  **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
589
 
 
595
  5. Copy and paste it in the field above
596
 
597
  ### ⚠️ Current Limitations:
598
+ - Image and video analysis may be slow on CPU in Hugging Face Spaces
599
+ - Video analysis is limited to a single frame due to CPU constraints
600
  - Large files may take longer to process
601
  """)
602
 
 
611
  "SpeechRecognition",
612
  "opencv-python",
613
  "numpy",
614
+ "pydub",
615
+ "transformers", # Added for image and video analysis
616
+ "torch" # Added for transformers compatibility
617
  ]
618
 
619
  print("🚀 Multimodal Chatbot with Gemma 3n")