Update app.py
app.py CHANGED
@@ -13,6 +13,8 @@ from typing import List, Tuple, Optional
 import json
 import pydub
 from pydub import AudioSegment
+from transformers import pipeline
+import torch

 class MultimodalChatbot:
     def __init__(self, api_key: str):
@@ -22,7 +24,18 @@ class MultimodalChatbot:
         )
         self.model = "google/gemma-3n-e2b-it:free"
         self.conversation_history = []
-
+        # Initialize the pipeline for image-text-to-text processing
+        try:
+            self.pipe = pipeline(
+                "image-text-to-text",
+                model="google/gemma-3n-e2b",
+                device="cpu",  # Optimized for CPU in HF Spaces
+                torch_dtype=torch.float32,  # Use float32 for CPU compatibility
+            )
+        except Exception as e:
+            print(f"Error initializing pipeline: {e}")
+            self.pipe = None
+
     def encode_image_to_base64(self, image) -> str:
         """Convert PIL Image or file path to base64 string"""
         try:
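For reference, the pipeline added above can be exercised on its own. This is a minimal sketch mirroring the model name, device settings, and call shape used in this commit; `sample.jpg` is a placeholder, and the `[{"generated_text": ...}]` output shape is the commit's own assumption:

```python
# Minimal sketch mirroring the commit's pipeline setup and call shape.
# "sample.jpg" is a placeholder path, not a file from this repo.
from PIL import Image
from transformers import pipeline
import torch

pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3n-e2b",
    device="cpu",
    torch_dtype=torch.float32,
)
image = Image.open("sample.jpg")
result = pipe(image, text="<image_soft_token> Describe this image")
print(result[0]["generated_text"])
```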
@@ -105,8 +118,8 @@ class MultimodalChatbot:
         except Exception as e:
             return f"Error transcribing audio: {str(e)}"

-    def process_video(self, video_file):
-        """
+    def extract_video_frame(self, video_file, frame_number=None):
+        """Extract a frame from the video"""
         try:
             if isinstance(video_file, str):
                 video_path = video_file
@@ -117,17 +130,21 @@
 
         cap = cv2.VideoCapture(video_path)
         if not cap.isOpened():
-                return
+                return None

         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-
+            if frame_number is None:
+                frame_number = total_frames // 2  # Extract middle frame
+            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
+            ret, frame = cap.read()
             cap.release()
-
-
-
+            if ret:
+                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                return Image.fromarray(frame)
+            else:
+                return None
         except Exception as e:
-            return
+            return None

     def create_multimodal_message(self,
                                   text_input: str = "",
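The new `extract_video_frame` helper defaults to the middle frame and returns a `PIL.Image`, or `None` when the video cannot be opened or read. A hypothetical usage sketch (the API key and file names are placeholders):

```python
# Hypothetical usage; "sk-..." and "clip.mp4" are placeholders.
bot = MultimodalChatbot(api_key="sk-...")
frame = bot.extract_video_frame("clip.mp4")                  # middle frame
first = bot.extract_video_frame("clip.mp4", frame_number=0)  # first frame
if frame is not None:
    frame.save("middle_frame.jpg")
```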
@@ -152,28 +169,44 @@ class MultimodalChatbot:
             content_parts.append({"type": "text", "text": f"Audio Transcription:\n{audio_text}"})
             processing_info.append("🎤 Audio transcribed")

-        if image_file is not None:
-
-
-
-
-
-
-            }
+        if image_file is not None and self.pipe is not None:
+            try:
+                if isinstance(image_file, str):
+                    image = Image.open(image_file)
+                else:
+                    image = image_file
+                # Use user's text input as prompt, or default if none provided
+                prompt = f"<image_soft_token> {text_input}" if text_input else "<image_soft_token> Describe this image"
+                output = self.pipe(image, text=prompt)
+                description = output[0]['generated_text']
+                content_parts.append({"type": "text", "text": f"Image analysis: {description}"})
+                processing_info.append("🖼️ Image analyzed")
+            except Exception as e:
+                content_parts.append({"type": "text", "text": f"Error analyzing image: {str(e)}"})
+                processing_info.append("🖼️ Image analysis failed")
+        elif image_file is not None:
+            content_parts.append({"type": "text", "text": "Image uploaded. Analysis failed due to model initialization error."})
+            processing_info.append("🖼️ Image received (analysis failed)")
+
+        if video_file is not None and self.pipe is not None:
+            frame = self.extract_video_frame(video_file)
+            if frame:
+                try:
+                    # Use user's text input with context, or default for frame
+                    prompt = f"<image_soft_token> This is a frame from the video. {text_input}" if text_input else "<image_soft_token> Describe this frame from the video"
+                    output = self.pipe(frame, text=prompt)
+                    description = output[0]['generated_text']
+                    content_parts.append({"type": "text", "text": f"Video frame analysis: {description}. Please describe the video for further assistance."})
+                    processing_info.append("🎥 Video frame analyzed")
+                except Exception as e:
+                    content_parts.append({"type": "text", "text": f"Error analyzing video frame: {str(e)}"})
+                    processing_info.append("🎥 Video frame analysis failed")
             else:
-            content_parts.append({
-
-
-
-            processing_info.append("
-
-        if video_file is not None:
-            _, video_desc = self.process_video(video_file)
-            content_parts.append({
-                "type": "text",
-                "text": f"Video uploaded: {video_desc}. Please describe the video for further assistance."
-            })
-            processing_info.append("🎥 Video processed (metadata only)")
+                content_parts.append({"type": "text", "text": "Could not extract frame from video. Please describe the video."})
+                processing_info.append("🎥 Video processing failed")
+        elif video_file is not None:
+            content_parts.append({"type": "text", "text": "Video uploaded. Analysis failed due to model initialization error."})
+            processing_info.append("🎥 Video received (analysis failed)")

         return {"role": "user", "content": content_parts}, processing_info

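With these branches in place, `create_multimodal_message` still returns an OpenAI-style user message plus a processing log, but image and video inputs now contribute text parts produced by the local Gemma pipeline. Roughly, assuming the text input is appended as its own part earlier in the method (illustrative values throughout):

```python
# Illustrative call; "bot" and "photo.jpg" are placeholders.
message, info = bot.create_multimodal_message(
    text_input="What is shown here?",
    image_file="photo.jpg",
)
# message is roughly:
# {"role": "user",
#  "content": [
#      {"type": "text", "text": "What is shown here?"},
#      {"type": "text", "text": "Image analysis: ..."}]}
# info is roughly: ["🖼️ Image analyzed"]
```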
@@ -239,8 +272,8 @@ def create_interface():
        - **Text**: Regular text messages
        - **PDF**: Extract and analyze document content
        - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
-       - **Images**: Upload images
-       - **Video**: Upload videos
+       - **Images**: Upload images for analysis using Gemma 3n
+       - **Video**: Upload videos for basic frame analysis using Gemma 3n

        **Setup**: Enter your OpenRouter API key below to get started
        """)
@@ -492,7 +525,7 @@ def create_interface():
        )
        text_input.submit(
            process_text_input,
-           inputs=[api_key_input, text_input],
+           inputs=[api_key_input, text_input, text_chatbot],
            outputs=[text_chatbot, text_input]
        )
        text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])
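Adding `text_chatbot` to `inputs` means Gradio now passes the current chat history to the handler as a third positional argument, and the handler's two return values map onto `outputs`. A sketch of the expected handler shape (the body is illustrative, not the app's actual implementation):

```python
def process_text_input(api_key, message, history):
    history = history or []
    # ... call the model with `message` here; the reply below is a placeholder ...
    history.append((message, "model reply"))
    return history, ""  # updated chat history, cleared textbox
```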
@@ -546,11 +579,11 @@ def create_interface():
        - Supports: WAV, MP3, M4A, FLAC, OGG formats
        - Best results with clear speech and minimal background noise

-       **🖼️ Image Chat**: Upload images
-       - Provide a text
+       **🖼️ Image Chat**: Upload images for analysis using Gemma 3n
+       - Provide a text prompt to guide the analysis (e.g., "What is in this image?")

-       **🎥 Video Chat**: Upload videos
-       -
+       **🎥 Video Chat**: Upload videos for basic frame analysis using Gemma 3n
+       - Analysis is based on a single frame; provide a text description for full video context

        **🔗 Combined Chat**: Use multiple input types together for comprehensive analysis

@@ -562,8 +595,8 @@ def create_interface():
        5. Copy and paste it in the field above

        ### ⚠️ Current Limitations:
-       - Image and video
-       -
+       - Image and video analysis may be slow on CPU in Hugging Face Spaces
+       - Video analysis is limited to a single frame due to CPU constraints
        - Large files may take longer to process
        """)

@@ -578,7 +611,9 @@ if __name__ == "__main__":
        "SpeechRecognition",
        "opencv-python",
        "numpy",
-        "pydub"
+        "pydub",
+        "transformers",  # Added for image and video analysis
+        "torch"  # Added for transformers compatibility
    ]

    print("🚀 Multimodal Chatbot with Gemma 3n")
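Since the dependency list now includes `transformers` and `torch`, a quick import check before launch can catch a missing install early (a hypothetical snippet, not part of app.py):

```python
import importlib

# Confirm the two dependencies added by this commit are importable.
for pkg in ("transformers", "torch"):
    try:
        importlib.import_module(pkg)
        print(f"{pkg}: OK")
    except ImportError:
        print(f"{pkg}: missing, install it before launching the app")
```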