Spaces:

shukdevdattaEX
/

Gemma-3n-Multi-modal-chatbot

Sleeping

App Files Files Community

Gemma-3n-Multi-modal-chatbot / app.py

shukdevdattaEX

Create app.py

38b2ece verified 4 months ago

raw

history blame

11.8 kB

	import gradio as gr
	import base64
	import io
	import os
	from openai import OpenAI
	import PyPDF2
	from PIL import Image
	import speech_recognition as sr
	import tempfile
	import cv2
	import numpy as np
	from typing import List, Tuple, Optional
	import json

	class MultimodalChatbot:
	    """Chat client that bundles text, PDF, audio, image and video inputs into one request.

	    Each call to :meth:`chat` builds a single user message (text parts plus
	    base64 ``image_url`` parts) and sends it through the OpenAI-compatible
	    client to the model named in ``self.model``.

	    Only the current message is sent to the model; the ``history`` list handled
	    by :meth:`chat` is used for display and is not replayed to the API.
	    """

	    def __init__(self, api_key: str):
	        """Create the API client pointed at the OpenRouter endpoint.

	        Args:
	            api_key: Key passed to the OpenAI-compatible client.
	        """
	        self.client = OpenAI(
	            base_url="https://openrouter.ai/api/v1",
	            api_key=api_key,
	        )
	        self.model = "google/gemma-3n-e2b-it:free"
	        # Kept for interface compatibility; no method in this class reads or writes it.
	        self.conversation_history = []

	    @staticmethod
	    def _resolve_path(file_obj):
	        """Return ``file_obj.name`` for file-like uploads, else ``file_obj`` itself."""
	        if hasattr(file_obj, 'name'):
	            return file_obj.name
	        return file_obj

	    def encode_image_to_base64(self, image) -> str:
	        """Return the base64 text for an image.

	        Args:
	            image: Either a file path (its raw bytes are encoded as-is) or an
	                object with a PIL-style ``save(buffer, format=...)`` method
	                (serialized as PNG first).

	        Returns:
	            The base64-encoded payload as a UTF-8 string.
	        """
	        if isinstance(image, str):
	            # File path: encode the bytes on disk without re-encoding the image.
	            with open(image, "rb") as img_file:
	                return base64.b64encode(img_file.read()).decode('utf-8')

	        # In-memory image: serialize to PNG, then encode.
	        buffered = io.BytesIO()
	        image.save(buffered, format="PNG")
	        return base64.b64encode(buffered.getvalue()).decode('utf-8')

	    def extract_pdf_text(self, pdf_file) -> str:
	        """Extract the text of every page of a PDF.

	        Args:
	            pdf_file: A path, or an object whose ``name`` attribute is a path.

	        Returns:
	            The concatenated page text (one page per line group), or a string
	            starting with ``"Error extracting PDF:"`` if anything fails.
	        """
	        try:
	            pdf_path = self._resolve_path(pdf_file)
	            with open(pdf_path, 'rb') as file:
	                pdf_reader = PyPDF2.PdfReader(file)
	                # Guard against pages that yield no text so one such page does
	                # not abort the whole extraction.
	                page_texts = [
	                    (page.extract_text() or "") + "\n"
	                    for page in pdf_reader.pages
	                ]
	            return "".join(page_texts).strip()
	        except Exception as e:
	            return f"Error extracting PDF: {str(e)}"

	    def transcribe_audio(self, audio_file) -> str:
	        """Transcribe an audio file to text.

	        The file is read with ``sr.AudioFile`` and recognized with
	        ``Recognizer.recognize_google``.

	        Args:
	            audio_file: A path, or an object whose ``name`` attribute is a path.

	        Returns:
	            The recognized text, or a string starting with
	            ``"Error transcribing audio:"`` if anything fails.
	        """
	        try:
	            recognizer = sr.Recognizer()
	            audio_path = self._resolve_path(audio_file)

	            with sr.AudioFile(audio_path) as source:
	                audio_data = recognizer.record(source)
	            return recognizer.recognize_google(audio_data)
	        except Exception as e:
	            return f"Error transcribing audio: {str(e)}"

	    def process_video(self, video_file, max_frames: int = 10, frame_interval: int = 30) -> List[str]:
	        """Sample frames from a video and return them as base64 PNG strings.

	        Every ``frame_interval``-th frame (starting with the first) is kept
	        until ``max_frames`` frames have been collected or the video ends.

	        Args:
	            video_file: A path, or an object whose ``name`` attribute is a path.
	            max_frames: Upper bound on the number of frames returned.
	            frame_interval: Distance, in frames, between sampled frames.

	        Returns:
	            A list of base64 strings, or a one-element list whose string starts
	            with ``"Error processing video:"`` if anything fails.
	        """
	        try:
	            video_path = self._resolve_path(video_file)
	            step = max(1, frame_interval)  # avoid modulo-by-zero on bad input

	            cap = cv2.VideoCapture(video_path)
	            frames = []
	            try:
	                if not cap.isOpened():
	                    raise ValueError(f"could not open video: {video_path}")

	                frame_index = 0
	                while len(frames) < max_frames:
	                    # Read exactly once per iteration so no frame is skipped.
	                    ret, frame = cap.read()
	                    if not ret:
	                        break
	                    if frame_index % step == 0:
	                        # OpenCV delivers BGR; PIL expects RGB.
	                        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	                        pil_image = Image.fromarray(rgb_frame)
	                        frames.append(self.encode_image_to_base64(pil_image))
	                    frame_index += 1
	            finally:
	                # Release the capture even when decoding or encoding fails.
	                cap.release()
	            return frames
	        except Exception as e:
	            return [f"Error processing video: {str(e)}"]

	    def create_multimodal_message(self,
	                                  text_input: str = "",
	                                  pdf_file=None,
	                                  audio_file=None,
	                                  image_file=None,
	                                  video_file=None) -> dict:
	        """Build the user message dict sent to the chat completions API.

	        Text, extracted PDF text and the audio transcription become ``text``
	        parts; the image and each sampled video frame become ``image_url``
	        parts using a ``data:image/png;base64,...`` URL.

	        NOTE: an image given as a file path is labeled ``image/png`` whatever
	        its actual format, because its bytes are encoded without conversion.

	        Returns:
	            ``{"role": "user", "content": [...]}`` with one entry per part.
	        """
	        content_parts = []

	        if text_input:
	            content_parts.append({"type": "text", "text": text_input})

	        if pdf_file is not None:
	            pdf_text = self.extract_pdf_text(pdf_file)
	            content_parts.append({
	                "type": "text",
	                "text": f"PDF Content:\n{pdf_text}"
	            })

	        if audio_file is not None:
	            audio_text = self.transcribe_audio(audio_file)
	            content_parts.append({
	                "type": "text",
	                "text": f"Audio Transcription:\n{audio_text}"
	            })

	        if image_file is not None:
	            image_base64 = self.encode_image_to_base64(image_file)
	            content_parts.append({
	                "type": "image_url",
	                "image_url": {
	                    "url": f"data:image/png;base64,{image_base64}"
	                }
	            })

	        if video_file is not None:
	            for frame_base64 in self.process_video(video_file):
	                if frame_base64.startswith("Error"):
	                    # Surface the failure as text, like PDF and audio errors,
	                    # instead of dropping it silently.
	                    content_parts.append({
	                        "type": "text",
	                        "text": f"Video Processing:\n{frame_base64}"
	                    })
	                else:
	                    content_parts.append({
	                        "type": "image_url",
	                        "image_url": {
	                            "url": f"data:image/png;base64,{frame_base64}"
	                        }
	                    })

	        return {"role": "user", "content": content_parts}

	    def chat(self,
	             text_input: str = "",
	             pdf_file=None,
	             audio_file=None,
	             image_file=None,
	             video_file=None,
	             history: Optional[List[Tuple[str, str]]] = None) -> Tuple[List[Tuple[str, str]], str]:
	        """Send one multimodal turn to the model and append it to ``history``.

	        Args:
	            text_input: The typed message.
	            pdf_file, audio_file, image_file, video_file: Optional attachments,
	                forwarded to :meth:`create_multimodal_message`.
	            history: List of ``(user_display, bot_response)`` pairs; a new list
	                is created when ``None``. It is appended to in place.

	        Returns:
	            ``(history, "")`` - the updated history and an empty string (the
	            caller uses it to clear the text box). API or processing errors are
	            reported as the bot side of a new history entry rather than raised.
	        """
	        if history is None:
	            history = []

	        # Short summary of what the user sent, shown in the chat window.
	        user_message_parts = []
	        if text_input:
	            user_message_parts.append(f"Text: {text_input}")
	        if pdf_file is not None:
	            user_message_parts.append("📄 PDF uploaded")
	        if audio_file is not None:
	            user_message_parts.append("🎤 Audio uploaded")
	        if image_file is not None:
	            user_message_parts.append("🖼️ Image uploaded")
	        if video_file is not None:
	            user_message_parts.append("🎥 Video uploaded")

	        if not user_message_parts:
	            # Nothing to send: skip the API call instead of posting empty content.
	            return history, ""

	        user_display = " | ".join(user_message_parts)

	        try:
	            user_message = self.create_multimodal_message(
	                text_input, pdf_file, audio_file, image_file, video_file
	            )

	            # Only the current turn is sent to the model.
	            messages = [user_message]

	            completion = self.client.chat.completions.create(
	                extra_headers={
	                    "HTTP-Referer": "https://multimodal-chatbot.local",
	                    "X-Title": "Multimodal Chatbot",
	                },
	                model=self.model,
	                messages=messages,
	                max_tokens=1024,
	                temperature=0.7
	            )

	            bot_response = completion.choices[0].message.content
	            history.append((user_display, bot_response))
	        except Exception as e:
	            # Report failures in the chat window rather than crashing the UI.
	            history.append((user_display, f"Error: {str(e)}"))

	        return history, ""

	def create_interface():
	    """Build and return the Gradio Blocks UI for the multimodal chatbot.

	    The API key is read from the ``OPENROUTER_API_KEY`` environment variable
	    (falling back to a placeholder string) and used to construct one
	    ``MultimodalChatbot`` that every event handler shares.

	    Returns:
	        The ``gr.Blocks`` app, not yet launched.
	    """
	    api_key = os.getenv("OPENROUTER_API_KEY", "your_api_key_here")
	    chatbot = MultimodalChatbot(api_key)

	    with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
	        gr.Markdown("""
	        # 🤖 Multimodal Chatbot with Gemma 3n

	        This chatbot can process multiple types of input:
	        - Text: Regular text messages
	        - PDF: Extract and analyze document content
	        - Audio: Transcribe speech to text
	        - Images: Analyze visual content
	        - Video: Extract frames and analyze video content

	        Setup: Set your OpenRouter API key as an environment variable `OPENROUTER_API_KEY`
	        """)

	        with gr.Row():
	            with gr.Column(scale=1):
	                # Input components
	                text_input = gr.Textbox(
	                    label="💬 Text Input",
	                    placeholder="Type your message here...",
	                    lines=3
	                )

	                pdf_input = gr.File(
	                    label="📄 PDF Upload",
	                    file_types=[".pdf"],
	                    type="filepath"
	                )

	                # Only offer formats that sr.AudioFile (used for transcription)
	                # can read; .mp3/.m4a uploads would always fail to transcribe.
	                audio_input = gr.File(
	                    label="🎤 Audio Upload",
	                    file_types=[".wav", ".aiff", ".aif", ".flac"],
	                    type="filepath"
	                )

	                image_input = gr.Image(
	                    label="🖼️ Image Upload",
	                    type="pil"
	                )

	                video_input = gr.File(
	                    label="🎥 Video Upload",
	                    file_types=[".mp4", ".avi", ".mov", ".mkv"],
	                    type="filepath"
	                )

	                submit_btn = gr.Button("🚀 Send", variant="primary", size="lg")
	                clear_btn = gr.Button("🗑️ Clear", variant="secondary")

	            with gr.Column(scale=2):
	                # Chat interface
	                chatbot_interface = gr.Chatbot(
	                    label="Chat History",
	                    height=600,
	                    bubble_full_width=False
	                )

	        # Event handlers
	        def process_input(text, pdf, audio, image, video, history):
	            """Forward the current inputs and chat history to the chatbot."""
	            return chatbot.chat(text, pdf, audio, image, video, history)

	        def clear_all():
	            """Reset the chat history, the text box and all four upload widgets."""
	            return [], "", None, None, None, None

	        # The Send button and the Enter key share one wiring definition.
	        chat_inputs = [
	            text_input, pdf_input, audio_input, image_input, video_input,
	            chatbot_interface,
	        ]
	        chat_outputs = [chatbot_interface, text_input]

	        # Button events
	        submit_btn.click(process_input, inputs=chat_inputs, outputs=chat_outputs)

	        clear_btn.click(
	            clear_all,
	            outputs=[chatbot_interface, text_input, pdf_input, audio_input, image_input, video_input]
	        )

	        # Enter key support
	        text_input.submit(process_input, inputs=chat_inputs, outputs=chat_outputs)

	        # Examples
	        gr.Markdown("""
	        ### 🎯 Example Usage:
	        - Upload a PDF and ask "Summarize this document"
	        - Upload an image and ask "What do you see in this image?"
	        - Record audio and ask "What did I say?"
	        - Upload a video and ask "Describe what's happening"
	        - Combine multiple inputs: "Compare this image with the PDF content"
	        """)

	    return demo

	if __name__ == "__main__":
	    # Packages this script depends on; printed as a reminder before launch.
	    required_packages = [
	        "gradio",
	        "openai",
	        "PyPDF2",
	        "Pillow",
	        "SpeechRecognition",
	        "opencv-python",
	        "numpy",
	    ]
	    package_summary = ", ".join(required_packages)
	    install_command = "pip install " + " ".join(required_packages)

	    print("Required packages:", package_summary)
	    print("\nTo install: " + install_command)
	    print("\nDon't forget to set your OPENROUTER_API_KEY environment variable!")

	    # Build the UI and serve it with a public share link.
	    demo = create_interface()
	    demo.launch(share=True)