Update app.py
app.py CHANGED
@@ -27,13 +27,10 @@ class MultimodalChatbot:
         """Convert PIL Image to base64 string"""
         try:
             if isinstance(image, str):
-                # If it's a file path
                 with open(image, "rb") as img_file:
                     return base64.b64encode(img_file.read()).decode('utf-8')
             else:
-                # If it's a PIL Image
                 buffered = io.BytesIO()
-                # Convert to RGB if it's RGBA
                 if image.mode == 'RGBA':
                     image = image.convert('RGB')
                 image.save(buffered, format="JPEG", quality=85)
@@ -44,18 +41,19 @@ class MultimodalChatbot:
     def extract_pdf_text(self, pdf_file) -> str:
         """Extract text from PDF file"""
         try:
-            if hasattr(pdf_file, 'name'):
-
+            if isinstance(pdf_file, str):
+                pdf_path = pdf_file
+            elif hasattr(pdf_file, 'name'):
                 pdf_path = pdf_file.name
             else:
-                pdf_path = pdf_file
+                raise ValueError("Invalid PDF file input")
 
             text = ""
             with open(pdf_path, 'rb') as file:
                 pdf_reader = PyPDF2.PdfReader(file)
                 for page_num, page in enumerate(pdf_reader.pages):
                     page_text = page.extract_text()
-                    if page_text.strip():
+                    if page_text and page_text.strip():
                         text += f"Page {page_num + 1}:\n{page_text}\n\n"
             return text.strip() if text.strip() else "No text could be extracted from this PDF."
         except Exception as e:
@@ -64,65 +62,56 @@ class MultimodalChatbot:
     def convert_audio_to_wav(self, audio_file) -> str:
         """Convert audio file to WAV format for speech recognition"""
         try:
-            if hasattr(audio_file, 'name'):
+            if isinstance(audio_file, str):
+                audio_path = audio_file
+            elif hasattr(audio_file, 'name'):
                 audio_path = audio_file.name
             else:
-                audio_path = audio_file
+                raise ValueError("Invalid audio file input")
 
-            # Get file extension
             file_ext = os.path.splitext(audio_path)[1].lower()
-
-            # If already WAV, return as is
             if file_ext == '.wav':
                 return audio_path
 
-            # Convert to WAV using pydub
             audio = AudioSegment.from_file(audio_path)
-            # Export as WAV with proper settings for speech recognition
             wav_path = tempfile.mktemp(suffix='.wav')
             audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
             return wav_path
-
         except Exception as e:
-
+            return f"Error converting audio: {str(e)}"
 
     def transcribe_audio(self, audio_file) -> str:
         """Transcribe audio file to text"""
         try:
             recognizer = sr.Recognizer()
-
-            # Convert audio to WAV format
             wav_path = self.convert_audio_to_wav(audio_file)
 
             with sr.AudioFile(wav_path) as source:
-                # Adjust for ambient noise
                 recognizer.adjust_for_ambient_noise(source, duration=0.2)
                 audio_data = recognizer.record(source)
-
-            # Try Google Speech Recognition
             try:
                 text = recognizer.recognize_google(audio_data)
                 return text
             except sr.UnknownValueError:
                 return "Could not understand the audio. Please try with clearer audio."
             except sr.RequestError as e:
-                # Fallback to offline recognition if available
                 try:
                     text = recognizer.recognize_sphinx(audio_data)
                     return text
                 except:
                     return f"Speech recognition service error: {str(e)}"
-
         except Exception as e:
             return f"Error transcribing audio: {str(e)}"
 
     def process_video(self, video_file) -> Tuple[List[str], str]:
         """Extract frames from video and convert to base64"""
         try:
-            if hasattr(video_file, 'name'):
+            if isinstance(video_file, str):
+                video_path = video_file
+            elif hasattr(video_file, 'name'):
                 video_path = video_file.name
             else:
-                video_path = video_file
+                raise ValueError("Invalid video file input")
 
             cap = cv2.VideoCapture(video_path)
             if not cap.isOpened():
@@ -133,33 +122,26 @@ class MultimodalChatbot:
             frame_count = 0
             total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
             fps = cap.get(cv2.CAP_PROP_FPS)
-
-            # Extract frames (every 60 frames or every 2 seconds)
             frame_interval = max(60, int(fps * 2)) if fps > 0 else 60
 
-            while
+            while True:
                 ret, frame = cap.read()
-                if ret
-
+                if not ret or len(frames) >= 5:
+                    break
+                if frame_count % frame_interval == 0:
                     rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     pil_image = Image.fromarray(rgb_frame)
-
-                    # Resize image to reduce size
                     pil_image.thumbnail((800, 600), Image.Resampling.LANCZOS)
-
                     base64_frame = self.encode_image_to_base64(pil_image)
                     if not base64_frame.startswith("Error"):
                         frames.append(base64_frame)
                         timestamp = frame_count / fps if fps > 0 else frame_count
                         frame_descriptions.append(f"Frame at {timestamp:.1f}s")
-
                 frame_count += 1
 
             cap.release()
-
             description = f"Video processed: {len(frames)} frames extracted from {total_frames} total frames"
             return frames, description
-
         except Exception as e:
             return [], f"Error processing video: {str(e)}"
 
@@ -170,36 +152,24 @@ class MultimodalChatbot:
                                   image_file=None,
                                   video_file=None) -> dict:
         """Create a multimodal message for the API"""
-
         content_parts = []
         processing_info = []
 
-        # Add text content
         if text_input:
             content_parts.append({"type": "text", "text": text_input})
 
-        # Process PDF
         if pdf_file is not None:
             pdf_text = self.extract_pdf_text(pdf_file)
-            content_parts.append({
-                "type": "text",
-                "text": f"PDF Content:\n{pdf_text}"
-            })
+            content_parts.append({"type": "text", "text": f"PDF Content:\n{pdf_text}"})
             processing_info.append("📄 PDF processed")
 
-        # Process Audio
         if audio_file is not None:
             audio_text = self.transcribe_audio(audio_file)
-            content_parts.append({
-                "type": "text",
-                "text": f"Audio Transcription:\n{audio_text}"
-            })
+            content_parts.append({"type": "text", "text": f"Audio Transcription:\n{audio_text}"})
             processing_info.append("🎤 Audio transcribed")
 
-        # Process Image - Use text-only approach since vision isn't supported
         if image_file is not None:
-
-            if hasattr(image_file, 'size'):
+            if isinstance(image_file, Image.Image):
                 width, height = image_file.size
                 mode = image_file.mode
                 content_parts.append({
@@ -213,7 +183,6 @@ class MultimodalChatbot:
                 })
                 processing_info.append("🖼️ Image received (metadata only)")
 
-        # Process Video - Use text-only approach since vision isn't supported
         if video_file is not None:
             frames, video_desc = self.process_video(video_file)
             content_parts.append({
@@ -232,12 +201,10 @@ class MultimodalChatbot:
              video_file=None,
              history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
         """Main chat function"""
-
         if history is None:
             history = []
 
         try:
-            # Create user message summary for display
             user_message_parts = []
             if text_input:
                 user_message_parts.append(f"Text: {text_input}")
@@ -251,20 +218,14 @@ class MultimodalChatbot:
                 user_message_parts.append("🎥 Video uploaded")
 
             user_display = " | ".join(user_message_parts)
-
-            # Create multimodal message
             user_message, processing_info = self.create_multimodal_message(
                 text_input, pdf_file, audio_file, image_file, video_file
             )
 
-            # Add processing info to display
             if processing_info:
                 user_display += f"\n{' | '.join(processing_info)}"
 
-            # Add to conversation history
             messages = [user_message]
-
-            # Get response from Gemma
            completion = self.client.chat.completions.create(
                extra_headers={
                    "HTTP-Referer": "https://multimodal-chatbot.local",
@@ -277,12 +238,8 @@ class MultimodalChatbot:
            )
 
            bot_response = completion.choices[0].message.content
-
-            # Update history
            history.append((user_display, bot_response))
-
            return history, ""
-
        except Exception as e:
            error_msg = f"Error: {str(e)}"
            history.append((user_display if 'user_display' in locals() else "Error in input", error_msg))
@@ -290,7 +247,6 @@ class MultimodalChatbot:
 
 def create_interface():
     """Create the Gradio interface"""
-
     with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🤖 Multimodal Chatbot with Gemma 3n
@@ -305,7 +261,6 @@ def create_interface():
         **Setup**: Enter your OpenRouter API key below to get started
         """)
 
-        # API Key Input Section
         with gr.Row():
             with gr.Column():
                 api_key_input = gr.Textbox(
@@ -320,9 +275,7 @@ def create_interface():
                     interactive=False
                 )
 
-        # Tabbed Interface
         with gr.Tabs():
-            # Text Chat Tab
             with gr.TabItem("💬 Text Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -333,7 +286,6 @@ def create_interface():
                         )
                         text_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         text_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         text_chatbot = gr.Chatbot(
                             label="Text Chat History",
@@ -342,7 +294,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # PDF Chat Tab
             with gr.TabItem("📄 PDF Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -358,7 +309,6 @@ def create_interface():
                         )
                         pdf_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         pdf_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         pdf_chatbot = gr.Chatbot(
                             label="PDF Chat History",
@@ -367,7 +317,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # Audio Chat Tab
             with gr.TabItem("🎤 Audio Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -383,7 +332,6 @@ def create_interface():
                         )
                         audio_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         audio_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         audio_chatbot = gr.Chatbot(
                             label="Audio Chat History",
@@ -392,7 +340,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # Image Chat Tab
             with gr.TabItem("🖼️ Image Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -407,7 +354,6 @@ def create_interface():
                         )
                         image_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         image_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         image_chatbot = gr.Chatbot(
                             label="Image Chat History",
@@ -416,7 +362,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # Video Chat Tab
             with gr.TabItem("🎥 Video Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -432,7 +377,6 @@ def create_interface():
                         )
                         video_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
                         video_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
-
                     with gr.Column(scale=2):
                         video_chatbot = gr.Chatbot(
                             label="Video Chat History",
@@ -441,7 +385,6 @@ def create_interface():
                             show_copy_button=True
                         )
 
-            # Combined Chat Tab
             with gr.TabItem("🔄 Combined Chat"):
                 with gr.Row():
                     with gr.Column(scale=1):
@@ -450,33 +393,27 @@ def create_interface():
                             placeholder="Type your message here...",
                             lines=3
                         )
-
                         combined_pdf_input = gr.File(
                             label="📄 PDF Upload",
                             file_types=[".pdf"],
                             type="filepath"
                         )
-
                         combined_audio_input = gr.File(
                             label="🎤 Audio Upload",
                             file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                             type="filepath"
                         )
-
                         combined_image_input = gr.Image(
                             label="🖼️ Image Upload",
                             type="pil"
                         )
-
                         combined_video_input = gr.File(
                             label="🎥 Video Upload",
                             file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                             type="filepath"
                         )
-
                         combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                         combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-
                     with gr.Column(scale=2):
                         combined_chatbot = gr.Chatbot(
                             label="Combined Chat History",
@@ -485,13 +422,10 @@ def create_interface():
                             show_copy_button=True
                         )
 
-        # Event handlers
         def validate_api_key(api_key):
             if not api_key or len(api_key.strip()) == 0:
                 return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(6)]
-
             try:
-                # Test the API key by creating a client
                 test_client = OpenAI(
                     base_url="https://openrouter.ai/api/v1",
                     api_key=api_key.strip(),
@@ -506,7 +440,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, history=history)
 
@@ -516,7 +449,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
 
@@ -526,7 +458,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, audio_file=audio, history=history)
 
@@ -536,7 +467,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, image_file=image, history=history)
 
@@ -546,7 +476,6 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, video_file=video, history=history)
 
@@ -556,9 +485,8 @@ def create_interface():
                 history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
-
             chatbot = MultimodalChatbot(api_key.strip())
-            return chatbot.chat(text, pdf, audio, image, video, history)
+            return chatbot.chat(text_input=text, pdf_file=pdf, audio_file=audio, image_file=image, video_file=video, history=history)
 
         def clear_chat():
             return [], ""
@@ -566,7 +494,6 @@ def create_interface():
         def clear_all_inputs():
             return [], "", None, None, None, None
 
-        # API Key validation
         api_key_input.change(
             validate_api_key,
             inputs=[api_key_input],
@@ -574,7 +501,6 @@ def create_interface():
                      image_submit_btn, video_submit_btn, combined_submit_btn]
         )
 
-        # Text chat events
         text_submit_btn.click(
             process_text_input,
             inputs=[api_key_input, text_input, text_chatbot],
@@ -587,7 +513,6 @@ def create_interface():
         )
         text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])
 
-        # PDF chat events
        pdf_submit_btn.click(
            process_pdf_input,
            inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot],
@@ -595,7 +520,6 @@ def create_interface():
        )
        pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input])
 
-        # Audio chat events
        audio_submit_btn.click(
            process_audio_input,
            inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot],
@@ -603,7 +527,6 @@ def create_interface():
        )
        audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
 
-        # Image chat events
        image_submit_btn.click(
            process_image_input,
            inputs=[api_key_input, image_input, image_text_input, image_chatbot],
@@ -611,7 +534,6 @@ def create_interface():
        )
        image_clear_btn.click(lambda: ([], "", None), outputs=[image_chatbot, image_text_input, image_input])
 
-        # Video chat events
        video_submit_btn.click(
            process_video_input,
            inputs=[api_key_input, video_input, video_text_input, video_chatbot],
@@ -619,7 +541,6 @@ def create_interface():
        )
        video_clear_btn.click(lambda: ([], "", None), outputs=[video_chatbot, video_text_input, video_input])
 
-        # Combined chat events
        combined_submit_btn.click(
            process_combined_input,
            inputs=[api_key_input, combined_text_input, combined_pdf_input,
@@ -630,7 +551,6 @@ def create_interface():
            outputs=[combined_chatbot, combined_text_input, combined_pdf_input,
                     combined_audio_input, combined_image_input, combined_video_input])
 
-        # Examples and Instructions
        gr.Markdown("""
        ### 🎯 How to Use Each Tab:
 
@@ -664,7 +584,6 @@ def create_interface():
    return demo
 
 if __name__ == "__main__":
-    # Required packages (install with pip):
    required_packages = [
        "gradio",
        "openai",
@@ -687,6 +606,4 @@ if __name__ == "__main__":
    print("💡 Enter your API key in the web interface when it loads")
 
    demo = create_interface()
-    demo.launch(
-        share=True
-    )
+    demo.launch(share=True)
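
The recurring fix in this diff is input normalization: each media handler now accepts either a plain filesystem path (what gr.File(type="filepath") passes in) or a file-like object exposing .name, and raises ValueError up front instead of failing later with an opaque AttributeError. A minimal standalone sketch of that pattern, assuming the same two input shapes; the helper name resolve_upload_path is illustrative and not part of app.py:

def resolve_upload_path(file_input) -> str:
    """Normalize a Gradio upload to a filesystem path."""
    if isinstance(file_input, str):
        return file_input           # already a path string (type="filepath")
    if hasattr(file_input, 'name'):
        return file_input.name      # tempfile-style wrapper object
    raise ValueError("Invalid file input")

# Usage, mirroring extract_pdf_text, convalert_audio_to_wav is not the name — mirroring
# extract_pdf_text, convert_audio_to_wav, and process_video:
# pdf_path = resolve_upload_path(pdf_file)

The change to "if page_text and page_text.strip():" follows the same defensive idea: PyPDF2's extract_text() can come back empty (or, in some versions, falsy) for pages without a text layer, and the added truthiness check keeps .strip() from being called on a missing result.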