Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
-
DOLPHIN PDF Document AI -
|
| 3 |
Optimized for HuggingFace Spaces NVIDIA T4 Small deployment
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import gradio as gr
|
|
@@ -219,6 +220,9 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
|
|
| 219 |
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
|
| 220 |
pil_crop = crop_margin(pil_crop)
|
| 221 |
|
|
|
|
|
|
|
|
|
|
| 222 |
buffered = io.BytesIO()
|
| 223 |
pil_crop.save(buffered, format="PNG")
|
| 224 |
img_base64 = base64.b64encode(buffered.getvalue()).decode()
|
|
@@ -226,9 +230,10 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
|
|
| 226 |
|
| 227 |
figure_results.append({
|
| 228 |
"label": label,
|
| 229 |
-
"text": f"![
|
| 230 |
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
|
| 231 |
"reading_order": reading_order,
|
|
|
|
| 232 |
})
|
| 233 |
else:
|
| 234 |
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
|
|
@@ -305,6 +310,7 @@ def generate_fallback_markdown(recognition_results):
|
|
| 305 |
elif element["label"] in ["para", "title", "sec", "sub_sec"]:
|
| 306 |
markdown_content += f"{element['text']}\n\n"
|
| 307 |
elif element["label"] == "fig":
|
|
|
|
| 308 |
markdown_content += f"{element['text']}\n\n"
|
| 309 |
return markdown_content
|
| 310 |
|
|
@@ -407,6 +413,45 @@ def initialize_gemini_model():
|
|
| 407 |
return None
|
| 408 |
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
# Global state for managing tabs
|
| 411 |
processed_markdown = ""
|
| 412 |
show_results_tab = False
|
|
@@ -588,11 +633,12 @@ with gr.Blocks(
|
|
| 588 |
gemini_status = "✅ Gemini API ready" if gemini_model else "❌ Gemini API not configured"
|
| 589 |
current_status = f"Currently loaded: {current_model or 'None'}"
|
| 590 |
gr.Markdown(
|
| 591 |
-
"# Scholar Express\n"
|
| 592 |
-
"### Upload a research paper to get a web-friendly version
|
| 593 |
f"**System:** {model_status}\n"
|
| 594 |
f"**RAG System:** {embedding_status}\n"
|
| 595 |
f"**Gemini API:** {gemini_status}\n"
|
|
|
|
| 596 |
f"**Status:** {current_status}"
|
| 597 |
)
|
| 598 |
|
|
|
|
| 1 |
"""
|
| 2 |
+
DOLPHIN PDF Document AI - Alt Text Enhanced Version
|
| 3 |
Optimized for HuggingFace Spaces NVIDIA T4 Small deployment
|
| 4 |
+
Features: AI-generated alt text for accessibility using Gemma 3n
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
|
|
|
| 220 |
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
|
| 221 |
pil_crop = crop_margin(pil_crop)
|
| 222 |
|
| 223 |
+
# Generate alt text for accessibility
|
| 224 |
+
alt_text = generate_alt_text_for_image(pil_crop)
|
| 225 |
+
|
| 226 |
buffered = io.BytesIO()
|
| 227 |
pil_crop.save(buffered, format="PNG")
|
| 228 |
img_base64 = base64.b64encode(buffered.getvalue()).decode()
|
|
|
|
| 230 |
|
| 231 |
figure_results.append({
|
| 232 |
"label": label,
|
| 233 |
+
"text": f"\n\n*{alt_text}*",
|
| 234 |
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
|
| 235 |
"reading_order": reading_order,
|
| 236 |
+
"alt_text": alt_text,
|
| 237 |
})
|
| 238 |
else:
|
| 239 |
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
|
|
|
|
| 310 |
elif element["label"] in ["para", "title", "sec", "sub_sec"]:
|
| 311 |
markdown_content += f"{element['text']}\n\n"
|
| 312 |
elif element["label"] == "fig":
|
| 313 |
+
# Image should already have alt text from processing
|
| 314 |
markdown_content += f"{element['text']}\n\n"
|
| 315 |
return markdown_content
|
| 316 |
|
|
|
|
| 413 |
return None
|
| 414 |
|
| 415 |
|
| 416 |
+
# Returned whenever no usable description can be produced.
_ALT_TEXT_FALLBACK = "Image description unavailable"

# Lead-in phrases the prompt asks the model to avoid; stripped if they appear anyway.
_ALT_TEXT_PREFIXES = (
    "This image shows",
    "The image shows",
    "This shows",
    "The figure shows",
)

# Instruction sent to the model together with the image.
_ALT_TEXT_PROMPT = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.

Focus on:
- Main subject or content of the image
- Important details, text, or data shown
- Layout and structure if relevant (charts, diagrams, tables)
- Context that would help someone understand the image's purpose

Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""


def _clean_alt_text(raw_text):
    """Return *raw_text* as a single tidy line without a boilerplate lead-in."""
    # split()/join collapses every whitespace run (including "\r\n") into one
    # space, so line breaks never leave doubled spaces behind.
    text = " ".join((raw_text or "").split())
    for prefix in _ALT_TEXT_PREFIXES:
        # Compare case-insensitively: models often vary the capitalisation.
        if text[:len(prefix)].lower() == prefix.lower():
            # Drop punctuation left dangling after the prefix ("...shows: a chart").
            text = text[len(prefix):].lstrip(" :,")
            # The remainder now starts the sentence, so capitalise it.
            text = text[:1].upper() + text[1:]
            break
    return text


def generate_alt_text_for_image(pil_image):
    """Generate a one-line alt-text description for an image.

    The image is sent, together with an accessibility-focused prompt, to the
    model object returned by ``initialize_gemini_model()`` via its
    ``generate_content`` method. The reply is flattened to a single line and a
    leading "This image shows"-style phrase is removed.

    NOTE(review): the UI text elsewhere mentions "Gemma 3n" while this code
    goes through ``initialize_gemini_model()`` -- confirm which model is
    actually configured.

    Args:
        pil_image: The image to describe; passed through unchanged to
            ``model.generate_content`` (presumably a PIL image -- confirm
            against the caller).

    Returns:
        The cleaned description, or "Image description unavailable" when the
        model is not available, the reply is empty, or any error occurs.
    """
    try:
        model = initialize_gemini_model()
        if model is None:
            return _ALT_TEXT_FALLBACK

        response = model.generate_content([_ALT_TEXT_PROMPT, pil_image])
        alt_text = _clean_alt_text(getattr(response, "text", None))
        return alt_text or _ALT_TEXT_FALLBACK
    except Exception as e:
        # Best-effort boundary: any failure yields the fallback text instead
        # of propagating to the caller.
        print(f"Error generating alt text: {e}")
        return _ALT_TEXT_FALLBACK
|
| 453 |
+
|
| 454 |
+
|
| 455 |
# Global state for managing tabs
|
| 456 |
processed_markdown = ""
|
| 457 |
show_results_tab = False
|
|
|
|
| 633 |
gemini_status = "✅ Gemini API ready" if gemini_model else "❌ Gemini API not configured"
|
| 634 |
current_status = f"Currently loaded: {current_model or 'None'}"
|
| 635 |
gr.Markdown(
|
| 636 |
+
"# Scholar Express - Alt Text Enhanced\n"
|
| 637 |
+
"### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot powered by Gemini API.\n"
|
| 638 |
f"**System:** {model_status}\n"
|
| 639 |
f"**RAG System:** {embedding_status}\n"
|
| 640 |
f"**Gemini API:** {gemini_status}\n"
|
| 641 |
+
f"**Alt Text:** Gemma 3n generates descriptive alt text for images\n"
|
| 642 |
f"**Status:** {current_status}"
|
| 643 |
)
|
| 644 |
|