Spaces:

ikraamkb
/

qtAnswering

Sleeping

App Files Community

ikraamkb committed on Mar 24

Commit

2852c90

verified ·

1 Parent(s): 73efb2c

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -32

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from fastapi import FastAPI, File, UploadFile
-import pdfplumber
-import docx
 import openpyxl
 from pptx import Presentation
 import torch
@@ -16,11 +16,8 @@ import easyocr
 # Initialize FastAPI
 app = FastAPI()
-# Load AI Model for Question Answering (Proper Extractive QA Model)
-qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
-# Initialize Translator for Multilingual Support
-translator = pipeline("translation", model="facebook/m2m100_418M")
 # Load Pretrained Object Detection Model (if needed)
 model = fasterrcnn_resnet50_fpn(pretrained=True)
@@ -48,25 +45,21 @@ def truncate_text(text, max_tokens=450):
     words = text.split()
     return " ".join(words[:max_tokens])
-# Text Extraction Functions
 def extract_text_from_pdf(pdf_file):
-    text = ""
     try:
-        with pdfplumber.open(pdf_file) as pdf:
-            for page in pdf.pages:
-                page_text = page.extract_text()
-                if page_text:
-                    text += page_text + "\n"
     except Exception as e:
         return f"Error reading PDF: {str(e)}"
-    return text.strip() if text else "No text found."
-def extract_text_from_docx(docx_file):
     try:
-        doc = docx.Document(docx_file)
-        return "\n".join([para.text for para in doc.paragraphs])
     except Exception as e:
-        return f"Error reading DOCX: {str(e)}"
 def extract_text_from_pptx(pptx_file):
     try:
@@ -99,9 +92,6 @@ def extract_text_from_image(image_file):
     result = reader.readtext(np.array(image))
     return " ".join([res[1] for res in result]) if result else "No text found."
-def translate_text(text, target_lang="en"):
-    return translator(text, src_lang="auto", tgt_lang=target_lang)[0]["translation_text"]
 # Function to answer questions based on document content
 def answer_question_from_document(file, question):
     validation_error = validate_file_type(file)
@@ -111,10 +101,8 @@ def answer_question_from_document(file, question):
     file_ext = file.name.split(".")[-1].lower()
     if file_ext == "pdf":
         text = extract_text_from_pdf(file)
-    elif file_ext == "docx":
-        text = extract_text_from_docx(file)
-    elif file_ext == "pptx":
-        text = extract_text_from_pptx(file)
     elif file_ext == "xlsx":
         text = extract_text_from_excel(file)
     else:
@@ -123,22 +111,20 @@ def answer_question_from_document(file, question):
     if not text:
         return "No text extracted from the document."
-    text = translate_text(text)  # Translate non-English text to English
     truncated_text = truncate_text(text)
-    response = qa_pipeline({"question": question, "context": truncated_text})
-    return response["answer"]
 def answer_question_from_image(image, question):
     image_text = extract_text_from_image(image)
     if not image_text:
         return "No meaningful content detected in the image."
-    image_text = translate_text(image_text)  # Translate non-English text to English
     truncated_text = truncate_text(image_text)
-    response = qa_pipeline({"question": question, "context": truncated_text})
-    return response["answer"]
 # Gradio UI for Document & Image QA
 doc_interface = gr.Interface(

 from fastapi import FastAPI, File, UploadFile
+import fitz  # PyMuPDF for PDF parsing
+from tika import parser  # Apache Tika for document parsing
 import openpyxl
 from pptx import Presentation
 import torch
 # Initialize FastAPI
 app = FastAPI()
+# Load AI Model for Question Answering (DeepSeek-V2-Chat)
+qa_pipeline = pipeline("text-generation", model="deepseek-ai/DeepSeek-V2-Chat")
 # Load Pretrained Object Detection Model (if needed)
 model = fasterrcnn_resnet50_fpn(pretrained=True)
     words = text.split()
     return " ".join(words[:max_tokens])
+# Document Text Extraction Functions
 def extract_text_from_pdf(pdf_file):
     try:
+        doc = fitz.open(pdf_file)
+        text = "\n".join([page.get_text("text") for page in doc])
+        return text if text else "No text found."
     except Exception as e:
         return f"Error reading PDF: {str(e)}"
+def extract_text_with_tika(file):
     try:
+        parsed = parser.from_buffer(file)
+        return parsed.get("content", "No text found.").strip()
     except Exception as e:
+        return f"Error reading document: {str(e)}"
 def extract_text_from_pptx(pptx_file):
     try:
     result = reader.readtext(np.array(image))
     return " ".join([res[1] for res in result]) if result else "No text found."
 # Function to answer questions based on document content
 def answer_question_from_document(file, question):
     validation_error = validate_file_type(file)
     file_ext = file.name.split(".")[-1].lower()
     if file_ext == "pdf":
         text = extract_text_from_pdf(file)
+    elif file_ext in ["docx", "pptx"]:
+        text = extract_text_with_tika(file)
     elif file_ext == "xlsx":
         text = extract_text_from_excel(file)
     else:
     if not text:
         return "No text extracted from the document."
     truncated_text = truncate_text(text)
+    response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
+    return response[0]["generated_text"]
 def answer_question_from_image(image, question):
     image_text = extract_text_from_image(image)
     if not image_text:
         return "No meaningful content detected in the image."
     truncated_text = truncate_text(image_text)
+    response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
+    return response[0]["generated_text"]
 # Gradio UI for Document & Image QA
 doc_interface = gr.Interface(