Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,6 +17,7 @@ try:
|
|
| 17 |
import numpy as np
|
| 18 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 19 |
import google.generativeai as genai
|
|
|
|
| 20 |
RAG_DEPENDENCIES_AVAILABLE = True
|
| 21 |
except ImportError as e:
|
| 22 |
print(f"RAG dependencies not available: {e}")
|
|
@@ -338,21 +339,21 @@ if RAG_DEPENDENCIES_AVAILABLE:
|
|
| 338 |
gemini_api_key = os.getenv('GEMINI_API_KEY')
|
| 339 |
if gemini_api_key:
|
| 340 |
genai.configure(api_key=gemini_api_key)
|
| 341 |
-
|
| 342 |
print("✅ Gemini API configured successfully")
|
| 343 |
else:
|
| 344 |
print("❌ GEMINI_API_KEY not found in environment")
|
| 345 |
-
|
| 346 |
except Exception as e:
|
| 347 |
print(f"❌ Error loading models: {e}")
|
| 348 |
import traceback
|
| 349 |
traceback.print_exc()
|
| 350 |
embedding_model = None
|
| 351 |
-
|
| 352 |
else:
|
| 353 |
print("❌ RAG dependencies not available")
|
| 354 |
embedding_model = None
|
| 355 |
-
|
| 356 |
|
| 357 |
# Model management functions
|
| 358 |
def load_dolphin_model():
|
|
@@ -388,12 +389,12 @@ def unload_dolphin_model():
|
|
| 388 |
torch.cuda.empty_cache()
|
| 389 |
print("✅ DOLPHIN model unloaded")
|
| 390 |
|
| 391 |
-
def
|
| 392 |
-
"""Initialize Gemini API
|
| 393 |
-
global
|
| 394 |
|
| 395 |
-
if
|
| 396 |
-
return
|
| 397 |
|
| 398 |
try:
|
| 399 |
gemini_api_key = os.getenv('GEMINI_API_KEY')
|
|
@@ -401,35 +402,41 @@ def initialize_gemini_model():
|
|
| 401 |
print("❌ GEMINI_API_KEY not found in environment")
|
| 402 |
return None
|
| 403 |
|
| 404 |
-
print("Initializing Gemini API...")
|
| 405 |
-
genai.configure(api_key=gemini_api_key)
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
return gemini_model
|
| 409 |
except Exception as e:
|
| 410 |
-
print(f"β Error initializing Gemini
|
| 411 |
import traceback
|
| 412 |
traceback.print_exc()
|
| 413 |
return None
|
| 414 |
|
| 415 |
|
| 416 |
def generate_alt_text_for_image(pil_image):
|
| 417 |
-
"""Generate alt text for an image using Gemma 3n model"""
|
| 418 |
try:
|
| 419 |
-
# Initialize Gemini
|
| 420 |
-
|
| 421 |
-
if
|
| 422 |
-
print("β Gemini
|
| 423 |
return "Image description unavailable"
|
| 424 |
|
| 425 |
# Debug: Check image format and properties
|
| 426 |
print(f"π Image format: {pil_image.format}, mode: {pil_image.mode}, size: {pil_image.size}")
|
| 427 |
|
| 428 |
-
# Ensure image is in RGB mode
|
| 429 |
if pil_image.mode != 'RGB':
|
| 430 |
print(f"Converting image from {pil_image.mode} to RGB")
|
| 431 |
pil_image = pil_image.convert('RGB')
|
| 432 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
# Create a detailed prompt for alt text generation
|
| 434 |
prompt = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.
|
| 435 |
|
|
@@ -441,36 +448,23 @@ Focus on:
|
|
| 441 |
|
| 442 |
Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""
|
| 443 |
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
| 449 |
|
| 450 |
print(f"📡 API response received: {type(response)}")
|
| 451 |
-
print(f"📡 Response attributes: {dir(response)}")
|
| 452 |
|
| 453 |
if hasattr(response, 'text') and response.text:
|
| 454 |
alt_text = response.text.strip()
|
| 455 |
print(f"✅ Alt text generated: {alt_text[:100]}...")
|
| 456 |
else:
|
| 457 |
print(f"❌ No text in response. Response: {response}")
|
| 458 |
-
|
| 459 |
-
if hasattr(response, 'candidates') and response.candidates:
|
| 460 |
-
candidate = response.candidates[0]
|
| 461 |
-
if hasattr(candidate, 'content') and candidate.content:
|
| 462 |
-
if hasattr(candidate.content, 'parts') and candidate.content.parts:
|
| 463 |
-
alt_text = candidate.content.parts[0].text.strip()
|
| 464 |
-
print(f"✅ Alt text from candidates: {alt_text[:100]}...")
|
| 465 |
-
else:
|
| 466 |
-
print(f"❌ No parts in content")
|
| 467 |
-
return "Image description unavailable"
|
| 468 |
-
else:
|
| 469 |
-
print(f"❌ No content in candidate")
|
| 470 |
-
return "Image description unavailable"
|
| 471 |
-
else:
|
| 472 |
-
print(f"❌ No candidates in response")
|
| 473 |
-
return "Image description unavailable"
|
| 474 |
|
| 475 |
# Clean up the alt text
|
| 476 |
alt_text = alt_text.replace('\n', ' ').replace('\r', ' ')
|
|
@@ -498,7 +492,7 @@ document_embeddings = None
|
|
| 498 |
|
| 499 |
# Global model state
|
| 500 |
dolphin_model = None
|
| 501 |
-
|
| 502 |
current_model = None # Track which model is currently loaded
|
| 503 |
|
| 504 |
|
|
@@ -668,7 +662,7 @@ with gr.Blocks(
|
|
| 668 |
# Home Tab
|
| 669 |
with gr.TabItem("🏠 Home", id="home"):
|
| 670 |
embedding_status = "✅ RAG ready" if embedding_model else "❌ RAG not loaded"
|
| 671 |
-
gemini_status = "β
Gemini API ready" if
|
| 672 |
current_status = f"Currently loaded: {current_model or 'None'}"
|
| 673 |
gr.Markdown(
|
| 674 |
"# Scholar Express - Alt Text Enhanced\n"
|
|
@@ -786,11 +780,11 @@ with gr.Blocks(
|
|
| 786 |
return history + [[message, "❌ Please process a PDF document first before asking questions."]]
|
| 787 |
|
| 788 |
try:
|
| 789 |
-
# Initialize Gemini
|
| 790 |
-
|
| 791 |
|
| 792 |
-
if
|
| 793 |
-
return history + [[message, "β Failed to initialize Gemini
|
| 794 |
|
| 795 |
# Use RAG to get relevant chunks from markdown (balanced for performance vs quota)
|
| 796 |
if document_chunks and len(document_chunks) > 0:
|
|
@@ -821,7 +815,7 @@ Please provide a clear and helpful answer based on the context provided."""
|
|
| 821 |
|
| 822 |
for attempt in range(max_retries):
|
| 823 |
try:
|
| 824 |
-
response =
|
| 825 |
response_text = response.text if hasattr(response, 'text') else str(response)
|
| 826 |
return history + [[message, response_text]]
|
| 827 |
except Exception as api_error:
|
|
|
|
| 17 |
import numpy as np
|
| 18 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 19 |
import google.generativeai as genai
|
| 20 |
+
from google.generativeai import types
|
| 21 |
RAG_DEPENDENCIES_AVAILABLE = True
|
| 22 |
except ImportError as e:
|
| 23 |
print(f"RAG dependencies not available: {e}")
|
|
|
|
| 339 |
gemini_api_key = os.getenv('GEMINI_API_KEY')
|
| 340 |
if gemini_api_key:
|
| 341 |
genai.configure(api_key=gemini_api_key)
|
| 342 |
+
gemini_client = True # Just mark as configured
|
| 343 |
print("✅ Gemini API configured successfully")
|
| 344 |
else:
|
| 345 |
print("❌ GEMINI_API_KEY not found in environment")
|
| 346 |
+
gemini_client = None
|
| 347 |
except Exception as e:
|
| 348 |
print(f"❌ Error loading models: {e}")
|
| 349 |
import traceback
|
| 350 |
traceback.print_exc()
|
| 351 |
embedding_model = None
|
| 352 |
+
gemini_client = None
|
| 353 |
else:
|
| 354 |
print("❌ RAG dependencies not available")
|
| 355 |
embedding_model = None
|
| 356 |
+
gemini_client = None
|
| 357 |
|
| 358 |
# Model management functions
|
| 359 |
def load_dolphin_model():
|
|
|
|
| 389 |
torch.cuda.empty_cache()
|
| 390 |
print("✅ DOLPHIN model unloaded")
|
| 391 |
|
| 392 |
+
def initialize_gemini_client():
|
| 393 |
+
"""Initialize Gemini API client"""
|
| 394 |
+
global gemini_client
|
| 395 |
|
| 396 |
+
if gemini_client is not None:
|
| 397 |
+
return gemini_client
|
| 398 |
|
| 399 |
try:
|
| 400 |
gemini_api_key = os.getenv('GEMINI_API_KEY')
|
|
|
|
| 402 |
print("❌ GEMINI_API_KEY not found in environment")
|
| 403 |
return None
|
| 404 |
|
| 405 |
+
print("Initializing Gemini API client...")
|
| 406 |
+
gemini_client = genai.configure(api_key=gemini_api_key)
|
| 407 |
+
print("✅ Gemini API client ready for gemma-3n-e4b-it")
|
| 408 |
+
return gemini_client
|
|
|
|
| 409 |
except Exception as e:
|
| 410 |
+
print(f"❌ Error initializing Gemini client: {e}")
|
| 411 |
import traceback
|
| 412 |
traceback.print_exc()
|
| 413 |
return None
|
| 414 |
|
| 415 |
|
| 416 |
def generate_alt_text_for_image(pil_image):
|
| 417 |
+
"""Generate alt text for an image using Gemma 3n model via Google AI API"""
|
| 418 |
try:
|
| 419 |
+
# Initialize Gemini client
|
| 420 |
+
client = initialize_gemini_client()
|
| 421 |
+
if client is None:
|
| 422 |
+
print("❌ Gemini client not initialized for alt text generation")
|
| 423 |
return "Image description unavailable"
|
| 424 |
|
| 425 |
# Debug: Check image format and properties
|
| 426 |
print(f"π Image format: {pil_image.format}, mode: {pil_image.mode}, size: {pil_image.size}")
|
| 427 |
|
| 428 |
+
# Ensure image is in RGB mode
|
| 429 |
if pil_image.mode != 'RGB':
|
| 430 |
print(f"Converting image from {pil_image.mode} to RGB")
|
| 431 |
pil_image = pil_image.convert('RGB')
|
| 432 |
|
| 433 |
+
# Convert PIL image to bytes
|
| 434 |
+
buffered = io.BytesIO()
|
| 435 |
+
pil_image.save(buffered, format="JPEG")
|
| 436 |
+
image_bytes = buffered.getvalue()
|
| 437 |
+
|
| 438 |
+
print(f"π Generating alt text for image with Gemma 3n...")
|
| 439 |
+
|
| 440 |
# Create a detailed prompt for alt text generation
|
| 441 |
prompt = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.
|
| 442 |
|
|
|
|
| 448 |
|
| 449 |
Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""
|
| 450 |
|
| 451 |
+
# Use the Google AI API client with proper format
|
| 452 |
+
response = genai.GenerativeModel('gemma-3n-e4b-it').generate_content([
|
| 453 |
+
types.Part.from_bytes(
|
| 454 |
+
data=image_bytes,
|
| 455 |
+
mime_type='image/jpeg',
|
| 456 |
+
),
|
| 457 |
+
prompt
|
| 458 |
+
])
|
| 459 |
|
| 460 |
print(f"📡 API response received: {type(response)}")
|
|
|
|
| 461 |
|
| 462 |
if hasattr(response, 'text') and response.text:
|
| 463 |
alt_text = response.text.strip()
|
| 464 |
print(f"✅ Alt text generated: {alt_text[:100]}...")
|
| 465 |
else:
|
| 466 |
print(f"❌ No text in response. Response: {response}")
|
| 467 |
+
return "Image description unavailable"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
# Clean up the alt text
|
| 470 |
alt_text = alt_text.replace('\n', ' ').replace('\r', ' ')
|
|
|
|
| 492 |
|
| 493 |
# Global model state
|
| 494 |
dolphin_model = None
|
| 495 |
+
gemini_client = None
|
| 496 |
current_model = None # Track which model is currently loaded
|
| 497 |
|
| 498 |
|
|
|
|
| 662 |
# Home Tab
|
| 663 |
with gr.TabItem("🏠 Home", id="home"):
|
| 664 |
embedding_status = "✅ RAG ready" if embedding_model else "❌ RAG not loaded"
|
| 665 |
+
gemini_status = "✅ Gemini API ready" if gemini_client else "❌ Gemini API not configured"
|
| 666 |
current_status = f"Currently loaded: {current_model or 'None'}"
|
| 667 |
gr.Markdown(
|
| 668 |
"# Scholar Express - Alt Text Enhanced\n"
|
|
|
|
| 780 |
return history + [[message, "❌ Please process a PDF document first before asking questions."]]
|
| 781 |
|
| 782 |
try:
|
| 783 |
+
# Initialize Gemini client
|
| 784 |
+
client = initialize_gemini_client()
|
| 785 |
|
| 786 |
+
if client is None:
|
| 787 |
+
return history + [[message, "❌ Failed to initialize Gemini client. Please check your GEMINI_API_KEY."]]
|
| 788 |
|
| 789 |
# Use RAG to get relevant chunks from markdown (balanced for performance vs quota)
|
| 790 |
if document_chunks and len(document_chunks) > 0:
|
|
|
|
| 815 |
|
| 816 |
for attempt in range(max_retries):
|
| 817 |
try:
|
| 818 |
+
response = genai.GenerativeModel('gemma-3n-e4b-it').generate_content(prompt)
|
| 819 |
response_text = response.text if hasattr(response, 'text') else str(response)
|
| 820 |
return history + [[message, response_text]]
|
| 821 |
except Exception as api_error:
|