shukdevdattaEX committed (verified)
Commit 207b913 · Parent: 8c4798d

Update app.py

Files changed (1): app.py (+23 -106)
app.py CHANGED
@@ -27,13 +27,10 @@ class MultimodalChatbot:
         """Convert PIL Image to base64 string"""
         try:
             if isinstance(image, str):
-                # If it's a file path
                 with open(image, "rb") as img_file:
                     return base64.b64encode(img_file.read()).decode('utf-8')
             else:
-                # If it's a PIL Image
                 buffered = io.BytesIO()
-                # Convert to RGB if it's RGBA
                 if image.mode == 'RGBA':
                     image = image.convert('RGB')
                 image.save(buffered, format="JPEG", quality=85)
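A note on this hunk: the removed comments described the two input branches of `encode_image_to_base64`. As a standalone sketch of the PIL branch (the final base64 return is inferred from the file-path branch above; the rest follows the diff), the RGBA check matters because JPEG has no alpha channel and `save(..., format="JPEG")` raises on RGBA input:

```python
import base64
import io

from PIL import Image

def encode_pil_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as a base64 JPEG string."""
    buffered = io.BytesIO()
    if image.mode == 'RGBA':
        image = image.convert('RGB')  # JPEG cannot store an alpha channel
    image.save(buffered, format="JPEG", quality=85)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
```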
@@ -44,18 +41,19 @@ class MultimodalChatbot:
     def extract_pdf_text(self, pdf_file) -> str:
         """Extract text from PDF file"""
         try:
-            if hasattr(pdf_file, 'name'):
-                # Gradio file object
+            if isinstance(pdf_file, str):
+                pdf_path = pdf_file
+            elif hasattr(pdf_file, 'name'):
                 pdf_path = pdf_file.name
             else:
-                pdf_path = pdf_file
+                raise ValueError("Invalid PDF file input")
 
             text = ""
             with open(pdf_path, 'rb') as file:
                 pdf_reader = PyPDF2.PdfReader(file)
                 for page_num, page in enumerate(pdf_reader.pages):
                     page_text = page.extract_text()
-                    if page_text.strip():
+                    if page_text and page_text.strip():
                         text += f"Page {page_num + 1}:\n{page_text}\n\n"
             return text.strip() if text.strip() else "No text could be extracted from this PDF."
         except Exception as e:
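This hunk replaces the duck-typed `hasattr` check with explicit `isinstance` dispatch, a pattern the commit repeats for the audio and video handlers below. A minimal standalone sketch (helper name hypothetical): accept a plain path string (what Gradio's `type="filepath"` delivers) or a file-like wrapper exposing `.name`, and reject anything else before it reaches `open`:

```python
def resolve_upload_path(file_input) -> str:
    """Normalize a Gradio upload to a filesystem path (hypothetical helper)."""
    if isinstance(file_input, str):
        return file_input          # type="filepath" hands back a plain path
    if hasattr(file_input, 'name'):
        return file_input.name     # older file wrappers expose .name
    raise ValueError(f"Unsupported file input: {type(file_input)!r}")
```

The old code already fell through to plain strings in its `else` branch; the rewrite makes each case explicit and fails with a clear error instead of passing an arbitrary object to `open`.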
@@ -64,65 +62,56 @@ class MultimodalChatbot:
     def convert_audio_to_wav(self, audio_file) -> str:
         """Convert audio file to WAV format for speech recognition"""
         try:
-            if hasattr(audio_file, 'name'):
+            if isinstance(audio_file, str):
+                audio_path = audio_file
+            elif hasattr(audio_file, 'name'):
                 audio_path = audio_file.name
             else:
-                audio_path = audio_file
+                raise ValueError("Invalid audio file input")
 
-            # Get file extension
             file_ext = os.path.splitext(audio_path)[1].lower()
-
-            # If already WAV, return as is
             if file_ext == '.wav':
                 return audio_path
 
-            # Convert to WAV using pydub
             audio = AudioSegment.from_file(audio_path)
-            # Export as WAV with proper settings for speech recognition
             wav_path = tempfile.mktemp(suffix='.wav')
             audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
             return wav_path
-
         except Exception as e:
-            raise Exception(f"Error converting audio: {str(e)}")
+            return f"Error converting audio: {str(e)}"
 
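For the conversion step itself, a self-contained sketch (assuming ffmpeg is available to pydub): downmix to mono and resample to 16 kHz, the layout speech_recognition handles best. The diff keeps `tempfile.mktemp`; the `NamedTemporaryFile(delete=False)` variant shown here is a drop-in alternative that avoids `mktemp`'s well-known race condition:

```python
import tempfile

from pydub import AudioSegment  # needs ffmpeg on PATH for non-WAV inputs

def to_recognizer_wav(audio_path: str) -> str:
    """Convert any ffmpeg-readable audio file to mono 16 kHz WAV."""
    audio = AudioSegment.from_file(audio_path)
    # delete=False keeps the file on disk for the recognizer to reopen later
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        wav_path = tmp.name
    audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
    return wav_path
```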
     def transcribe_audio(self, audio_file) -> str:
         """Transcribe audio file to text"""
         try:
             recognizer = sr.Recognizer()
-
-            # Convert audio to WAV format
             wav_path = self.convert_audio_to_wav(audio_file)
 
             with sr.AudioFile(wav_path) as source:
-                # Adjust for ambient noise
                 recognizer.adjust_for_ambient_noise(source, duration=0.2)
                 audio_data = recognizer.record(source)
-
-            # Try Google Speech Recognition
             try:
                 text = recognizer.recognize_google(audio_data)
                 return text
             except sr.UnknownValueError:
                 return "Could not understand the audio. Please try with clearer audio."
             except sr.RequestError as e:
-                # Fallback to offline recognition if available
                 try:
                     text = recognizer.recognize_sphinx(audio_data)
                     return text
                 except:
                     return f"Speech recognition service error: {str(e)}"
-
         except Exception as e:
             return f"Error transcribing audio: {str(e)}"
 
     def process_video(self, video_file) -> Tuple[List[str], str]:
         """Extract frames from video and convert to base64"""
         try:
-            if hasattr(video_file, 'name'):
+            if isinstance(video_file, str):
+                video_path = video_file
+            elif hasattr(video_file, 'name'):
                 video_path = video_file.name
             else:
-                video_path = video_file
+                raise ValueError("Invalid video file input")
 
             cap = cv2.VideoCapture(video_path)
             if not cap.isOpened():
@@ -133,33 +122,26 @@ class MultimodalChatbot:
             frame_count = 0
             total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
             fps = cap.get(cv2.CAP_PROP_FPS)
-
-            # Extract frames (every 60 frames or every 2 seconds)
             frame_interval = max(60, int(fps * 2)) if fps > 0 else 60
 
-            while cap.read()[0] and len(frames) < 5:  # Limit to 5 frames
+            while True:
                 ret, frame = cap.read()
-                if ret and frame_count % frame_interval == 0:
-                    # Convert BGR to RGB
+                if not ret or len(frames) >= 5:
+                    break
+                if frame_count % frame_interval == 0:
                     rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     pil_image = Image.fromarray(rgb_frame)
-
-                    # Resize image to reduce size
                     pil_image.thumbnail((800, 600), Image.Resampling.LANCZOS)
-
                     base64_frame = self.encode_image_to_base64(pil_image)
                     if not base64_frame.startswith("Error"):
                         frames.append(base64_frame)
                         timestamp = frame_count / fps if fps > 0 else frame_count
                         frame_descriptions.append(f"Frame at {timestamp:.1f}s")
-
                 frame_count += 1
 
             cap.release()
-
             description = f"Video processed: {len(frames)} frames extracted from {total_frames} total frames"
             return frames, description
-
         except Exception as e:
             return [], f"Error processing video: {str(e)}"
 
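This is the one behavioral bug fix in the commit: the old condition `while cap.read()[0] and len(frames) < 5` consumed a frame in the test and then `cap.read()` inside the body consumed another, so every second frame was silently dropped and `frame_count` no longer tracked real frame indices. The new loop reads exactly once per iteration; a minimal sketch of that pattern:

```python
import cv2

def sample_frames(video_path: str, max_frames: int = 5, interval: int = 60) -> list:
    """Collect every `interval`-th frame, up to `max_frames` frames."""
    cap = cv2.VideoCapture(video_path)
    frames, frame_count = [], 0
    while True:
        ret, frame = cap.read()  # single read per iteration
        if not ret or len(frames) >= max_frames:
            break
        if frame_count % interval == 0:
            frames.append(frame)
        frame_count += 1
    cap.release()
    return frames
```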
@@ -170,36 +152,24 @@ class MultimodalChatbot:
                                    image_file=None,
                                    video_file=None) -> dict:
         """Create a multimodal message for the API"""
-
         content_parts = []
         processing_info = []
 
-        # Add text content
         if text_input:
             content_parts.append({"type": "text", "text": text_input})
 
-        # Process PDF
         if pdf_file is not None:
             pdf_text = self.extract_pdf_text(pdf_file)
-            content_parts.append({
-                "type": "text",
-                "text": f"PDF Content:\n{pdf_text}"
-            })
+            content_parts.append({"type": "text", "text": f"PDF Content:\n{pdf_text}"})
             processing_info.append("📄 PDF processed")
 
-        # Process Audio
         if audio_file is not None:
             audio_text = self.transcribe_audio(audio_file)
-            content_parts.append({
-                "type": "text",
-                "text": f"Audio Transcription:\n{audio_text}"
-            })
+            content_parts.append({"type": "text", "text": f"Audio Transcription:\n{audio_text}"})
             processing_info.append("🎤 Audio transcribed")
 
-        # Process Image - Use text-only approach since vision isn't supported
         if image_file is not None:
-            # Since vision isn't supported, we'll describe what we can about the image
-            if hasattr(image_file, 'size'):
+            if isinstance(image_file, Image.Image):
                 width, height = image_file.size
                 mode = image_file.mode
                 content_parts.append({
@@ -213,7 +183,6 @@ class MultimodalChatbot:
                 })
             processing_info.append("🖼️ Image received (metadata only)")
 
-        # Process Video - Use text-only approach since vision isn't supported
         if video_file is not None:
             frames, video_desc = self.process_video(video_file)
             content_parts.append({
@@ -232,12 +201,10 @@ class MultimodalChatbot:
              video_file=None,
              history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
         """Main chat function"""
-
         if history is None:
             history = []
 
         try:
-            # Create user message summary for display
             user_message_parts = []
             if text_input:
                 user_message_parts.append(f"Text: {text_input}")
@@ -251,20 +218,14 @@ class MultimodalChatbot:
                 user_message_parts.append("🎥 Video uploaded")
 
             user_display = " | ".join(user_message_parts)
-
-            # Create multimodal message
             user_message, processing_info = self.create_multimodal_message(
                 text_input, pdf_file, audio_file, image_file, video_file
             )
 
-            # Add processing info to display
             if processing_info:
                 user_display += f"\n{' | '.join(processing_info)}"
 
-            # Add to conversation history
             messages = [user_message]
-
-            # Get response from Gemma
             completion = self.client.chat.completions.create(
                 extra_headers={
                     "HTTP-Referer": "https://multimodal-chatbot.local",
@@ -277,12 +238,8 @@ class MultimodalChatbot:
             )
 
             bot_response = completion.choices[0].message.content
-
-            # Update history
             history.append((user_display, bot_response))
-
             return history, ""
-
         except Exception as e:
             error_msg = f"Error: {str(e)}"
             history.append((user_display if 'user_display' in locals() else "Error in input", error_msg))
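The `'user_display' in locals()` guard survives this commit: it protects the `except` block when the failure happens before `user_display` is bound. Binding a fallback before the `try` achieves the same without introspecting `locals()`; a sketch of that alternative (not what the commit does; `call_model` is a hypothetical stand-in):

```python
def call_model(prompt: str) -> str:
    """Hypothetical stand-in for the OpenRouter completion call."""
    return f"(model reply to: {prompt})"

def chat(text_input: str, history: list) -> tuple:
    user_display = "Error in input"  # bound before try, so except can use it
    try:
        user_display = f"Text: {text_input}"
        history.append((user_display, call_model(text_input)))
        return history, ""
    except Exception as e:
        history.append((user_display, f"Error: {str(e)}"))
        return history, ""
```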
@@ -290,7 +247,6 @@
 
 def create_interface():
     """Create the Gradio interface"""
-
     with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🤖 Multimodal Chatbot with Gemma 3n
@@ -305,7 +261,6 @@ def create_interface():
         **Setup**: Enter your OpenRouter API key below to get started
         """)
 
-        # API Key Input Section
         with gr.Row():
             with gr.Column():
                 api_key_input = gr.Textbox(
@@ -320,9 +275,7 @@ def create_interface():
                     interactive=False
                 )
 
-        # Tabbed Interface
         with gr.Tabs():
-            # Text Chat Tab
             with gr.TabItem("💬 Text Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -333,7 +286,6 @@ def create_interface():
                         )
                         text_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         text_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         text_chatbot = gr.Chatbot(
                             label="Text Chat History",
@@ -342,7 +294,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # PDF Chat Tab
             with gr.TabItem("📄 PDF Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -358,7 +309,6 @@ def create_interface():
                         )
                         pdf_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         pdf_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         pdf_chatbot = gr.Chatbot(
                             label="PDF Chat History",
@@ -367,7 +317,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # Audio Chat Tab
             with gr.TabItem("🎤 Audio Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -383,7 +332,6 @@ def create_interface():
                         )
                         audio_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         audio_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         audio_chatbot = gr.Chatbot(
                             label="Audio Chat History",
@@ -392,7 +340,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # Image Chat Tab
             with gr.TabItem("🖼️ Image Chat"):
                with gr.Row():
                     with gr.Column(scale=1):
@@ -407,7 +354,6 @@ def create_interface():
                        )
                         image_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         image_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         image_chatbot = gr.Chatbot(
                             label="Image Chat History",
@@ -416,7 +362,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # Video Chat Tab
             with gr.TabItem("🎥 Video Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -432,7 +377,6 @@ def create_interface():
                        )
                         video_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         video_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         video_chatbot = gr.Chatbot(
                             label="Video Chat History",
@@ -441,7 +385,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # Combined Chat Tab
             with gr.TabItem("🌟 Combined Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -450,33 +393,27 @@ def create_interface():
                             placeholder="Type your message here...",
                             lines=3
                         )
-
                         combined_pdf_input = gr.File(
                             label="📄 PDF Upload",
                             file_types=[".pdf"],
                             type="filepath"
                         )
-
                         combined_audio_input = gr.File(
                             label="🎤 Audio Upload",
                             file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                             type="filepath"
                         )
-
                         combined_image_input = gr.Image(
                             label="🖼️ Image Upload",
                             type="pil"
                         )
-
                         combined_video_input = gr.File(
                             label="🎥 Video Upload",
                             file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                             type="filepath"
                         )
-
                         combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                         combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-
                     with gr.Column(scale=2):
                         combined_chatbot = gr.Chatbot(
                             label="Combined Chat History",
@@ -485,13 +422,10 @@ def create_interface():
                             show_copy_button=True
                         )
 
-        # Event handlers
         def validate_api_key(api_key):
             if not api_key or len(api_key.strip()) == 0:
                 return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(6)]
-
             try:
-                # Test the API key by creating a client
                 test_client = OpenAI(
                     base_url="https://openrouter.ai/api/v1",
                     api_key=api_key.strip(),
@@ -506,7 +440,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, history=history)
 
@@ -516,7 +449,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
 
@@ -526,7 +458,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, audio_file=audio, history=history)
 
@@ -536,7 +467,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, image_file=image, history=history)
 
@@ -546,7 +476,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, video_file=video, history=history)
 
@@ -556,9 +485,8 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
-            return chatbot.chat(text, pdf, audio, image, video, history)
+            return chatbot.chat(text_input=text, pdf_file=pdf, audio_file=audio, image_file=image, video_file=video, history=history)
 
         def clear_chat():
             return [], ""
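The combined-chat fix just above is defensive rather than corrective: the positional call happened to match `chat`'s parameter order, but with five same-shaped optional file arguments, any future reordering of the signature would silently route, say, a PDF into the audio path. A small illustration (the `chat` stub below is hypothetical):

```python
def chat(text_input=None, pdf_file=None, audio_file=None,
         image_file=None, video_file=None, history=None):
    """Hypothetical stand-in for MultimodalChatbot.chat."""
    return history or [], ""

text, pdf, audio, image, video, history = "hi", None, None, None, None, []

# Positional: correctness depends entirely on the signature's order.
chat(text, pdf, audio, image, video, history)

# Keyword (what the commit switches to): survives signature reordering.
chat(text_input=text, pdf_file=pdf, audio_file=audio,
     image_file=image, video_file=video, history=history)
```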
@@ -566,7 +494,6 @@ def create_interface():
         def clear_all_inputs():
             return [], "", None, None, None, None
 
-        # API Key validation
         api_key_input.change(
             validate_api_key,
             inputs=[api_key_input],
@@ -574,7 +501,6 @@ def create_interface():
                      image_submit_btn, video_submit_btn, combined_submit_btn]
         )
 
-        # Text chat events
         text_submit_btn.click(
             process_text_input,
             inputs=[api_key_input, text_input, text_chatbot],
@@ -587,7 +513,6 @@ def create_interface():
         )
         text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])
 
-        # PDF chat events
         pdf_submit_btn.click(
             process_pdf_input,
             inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot],
@@ -595,7 +520,6 @@ def create_interface():
         )
         pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input])
 
-        # Audio chat events
         audio_submit_btn.click(
             process_audio_input,
             inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot],
@@ -603,7 +527,6 @@ def create_interface():
         )
         audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
 
-        # Image chat events
         image_submit_btn.click(
             process_image_input,
             inputs=[api_key_input, image_input, image_text_input, image_chatbot],
@@ -611,7 +534,6 @@ def create_interface():
         )
         image_clear_btn.click(lambda: ([], "", None), outputs=[image_chatbot, image_text_input, image_input])
 
-        # Video chat events
         video_submit_btn.click(
             process_video_input,
             inputs=[api_key_input, video_input, video_text_input, video_chatbot],
@@ -619,7 +541,6 @@ def create_interface():
         )
         video_clear_btn.click(lambda: ([], "", None), outputs=[video_chatbot, video_text_input, video_input])
 
-        # Combined chat events
         combined_submit_btn.click(
             process_combined_input,
             inputs=[api_key_input, combined_text_input, combined_pdf_input,
@@ -630,7 +551,6 @@ def create_interface():
             outputs=[combined_chatbot, combined_text_input, combined_pdf_input,
                      combined_audio_input, combined_image_input, combined_video_input])
 
-        # Examples and Instructions
         gr.Markdown("""
         ### 🎯 How to Use Each Tab:
 
@@ -664,7 +584,6 @@ def create_interface():
     return demo
 
 if __name__ == "__main__":
-    # Required packages (install with pip):
     required_packages = [
         "gradio",
         "openai",
@@ -687,6 +606,4 @@ if __name__ == "__main__":
     print("💡 Enter your API key in the web interface when it loads")
 
     demo = create_interface()
-    demo.launch(
-        share=True
-    )
+    demo.launch(share=True)