Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from fastapi import FastAPI, File, UploadFile
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
import openpyxl
|
| 5 |
from pptx import Presentation
|
| 6 |
import torch
|
|
@@ -16,11 +16,8 @@ import easyocr
|
|
| 16 |
# Initialize FastAPI
|
| 17 |
app = FastAPI()
|
| 18 |
|
| 19 |
-
# Load AI Model for Question Answering (
|
| 20 |
-
qa_pipeline = pipeline("
|
| 21 |
-
|
| 22 |
-
# Initialize Translator for Multilingual Support
|
| 23 |
-
translator = pipeline("translation", model="facebook/m2m100_418M")
|
| 24 |
|
| 25 |
# Load Pretrained Object Detection Model (if needed)
|
| 26 |
model = fasterrcnn_resnet50_fpn(pretrained=True)
|
|
@@ -48,25 +45,21 @@ def truncate_text(text, max_tokens=450):
|
|
| 48 |
words = text.split()
|
| 49 |
return " ".join(words[:max_tokens])
|
| 50 |
|
| 51 |
-
# Text Extraction Functions
|
| 52 |
def extract_text_from_pdf(pdf_file):
|
| 53 |
-
text = ""
|
| 54 |
try:
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
if page_text:
|
| 59 |
-
text += page_text + "\n"
|
| 60 |
except Exception as e:
|
| 61 |
return f"Error reading PDF: {str(e)}"
|
| 62 |
-
return text.strip() if text else "No text found."
|
| 63 |
|
| 64 |
-
def
|
| 65 |
try:
|
| 66 |
-
|
| 67 |
-
return "
|
| 68 |
except Exception as e:
|
| 69 |
-
return f"Error reading
|
| 70 |
|
| 71 |
def extract_text_from_pptx(pptx_file):
|
| 72 |
try:
|
|
@@ -99,9 +92,6 @@ def extract_text_from_image(image_file):
|
|
| 99 |
result = reader.readtext(np.array(image))
|
| 100 |
return " ".join([res[1] for res in result]) if result else "No text found."
|
| 101 |
|
| 102 |
-
def translate_text(text, target_lang="en"):
|
| 103 |
-
return translator(text, src_lang="auto", tgt_lang=target_lang)[0]["translation_text"]
|
| 104 |
-
|
| 105 |
# Function to answer questions based on document content
|
| 106 |
def answer_question_from_document(file, question):
|
| 107 |
validation_error = validate_file_type(file)
|
|
@@ -111,10 +101,8 @@ def answer_question_from_document(file, question):
|
|
| 111 |
file_ext = file.name.split(".")[-1].lower()
|
| 112 |
if file_ext == "pdf":
|
| 113 |
text = extract_text_from_pdf(file)
|
| 114 |
-
elif file_ext
|
| 115 |
-
text =
|
| 116 |
-
elif file_ext == "pptx":
|
| 117 |
-
text = extract_text_from_pptx(file)
|
| 118 |
elif file_ext == "xlsx":
|
| 119 |
text = extract_text_from_excel(file)
|
| 120 |
else:
|
|
@@ -123,22 +111,20 @@ def answer_question_from_document(file, question):
|
|
| 123 |
if not text:
|
| 124 |
return "No text extracted from the document."
|
| 125 |
|
| 126 |
-
text = translate_text(text) # Translate non-English text to English
|
| 127 |
truncated_text = truncate_text(text)
|
| 128 |
-
response = qa_pipeline(
|
| 129 |
|
| 130 |
-
return response["
|
| 131 |
|
| 132 |
def answer_question_from_image(image, question):
|
| 133 |
image_text = extract_text_from_image(image)
|
| 134 |
if not image_text:
|
| 135 |
return "No meaningful content detected in the image."
|
| 136 |
|
| 137 |
-
image_text = translate_text(image_text) # Translate non-English text to English
|
| 138 |
truncated_text = truncate_text(image_text)
|
| 139 |
-
response = qa_pipeline(
|
| 140 |
|
| 141 |
-
return response["
|
| 142 |
|
| 143 |
# Gradio UI for Document & Image QA
|
| 144 |
doc_interface = gr.Interface(
|
|
|
|
| 1 |
from fastapi import FastAPI, File, UploadFile
|
| 2 |
+
import fitz # PyMuPDF for PDF parsing
|
| 3 |
+
from tika import parser # Apache Tika for document parsing
|
| 4 |
import openpyxl
|
| 5 |
from pptx import Presentation
|
| 6 |
import torch
|
|
|
|
| 16 |
# Initialize FastAPI
|
| 17 |
app = FastAPI()
|
| 18 |
|
| 19 |
+
# Load AI Model for Question Answering (DeepSeek-V2-Chat)
|
| 20 |
+
qa_pipeline = pipeline("text-generation", model="deepseek-ai/DeepSeek-V2-Chat")
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# Load Pretrained Object Detection Model (if needed)
|
| 23 |
model = fasterrcnn_resnet50_fpn(pretrained=True)
|
|
|
|
| 45 |
words = text.split()
|
| 46 |
return " ".join(words[:max_tokens])
|
| 47 |
|
| 48 |
+
# Document Text Extraction Functions
|
| 49 |
def extract_text_from_pdf(pdf_file):
|
|
|
|
| 50 |
try:
|
| 51 |
+
doc = fitz.open(pdf_file)
|
| 52 |
+
text = "\n".join([page.get_text("text") for page in doc])
|
| 53 |
+
return text if text else "No text found."
|
|
|
|
|
|
|
| 54 |
except Exception as e:
|
| 55 |
return f"Error reading PDF: {str(e)}"
|
|
|
|
| 56 |
|
| 57 |
+
def extract_text_with_tika(file):
|
| 58 |
try:
|
| 59 |
+
parsed = parser.from_buffer(file)
|
| 60 |
+
return parsed.get("content", "No text found.").strip()
|
| 61 |
except Exception as e:
|
| 62 |
+
return f"Error reading document: {str(e)}"
|
| 63 |
|
| 64 |
def extract_text_from_pptx(pptx_file):
|
| 65 |
try:
|
|
|
|
| 92 |
result = reader.readtext(np.array(image))
|
| 93 |
return " ".join([res[1] for res in result]) if result else "No text found."
|
| 94 |
|
|
|
|
|
|
|
|
|
|
| 95 |
# Function to answer questions based on document content
|
| 96 |
def answer_question_from_document(file, question):
|
| 97 |
validation_error = validate_file_type(file)
|
|
|
|
| 101 |
file_ext = file.name.split(".")[-1].lower()
|
| 102 |
if file_ext == "pdf":
|
| 103 |
text = extract_text_from_pdf(file)
|
| 104 |
+
elif file_ext in ["docx", "pptx"]:
|
| 105 |
+
text = extract_text_with_tika(file)
|
|
|
|
|
|
|
| 106 |
elif file_ext == "xlsx":
|
| 107 |
text = extract_text_from_excel(file)
|
| 108 |
else:
|
|
|
|
| 111 |
if not text:
|
| 112 |
return "No text extracted from the document."
|
| 113 |
|
|
|
|
| 114 |
truncated_text = truncate_text(text)
|
| 115 |
+
response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
|
| 116 |
|
| 117 |
+
return response[0]["generated_text"]
|
| 118 |
|
| 119 |
def answer_question_from_image(image, question):
|
| 120 |
image_text = extract_text_from_image(image)
|
| 121 |
if not image_text:
|
| 122 |
return "No meaningful content detected in the image."
|
| 123 |
|
|
|
|
| 124 |
truncated_text = truncate_text(image_text)
|
| 125 |
+
response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
|
| 126 |
|
| 127 |
+
return response[0]["generated_text"]
|
| 128 |
|
| 129 |
# Gradio UI for Document & Image QA
|
| 130 |
doc_interface = gr.Interface(
|