Spaces:

ikraamkb
/

qtAnswering

Sleeping

App Files Files Community

ikraamkb committed on Mar 25

Commit

ebf76ba

verified ·

1 Parent(s): da390cd

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -1

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from fastapi import FastAPI, File, UploadFile
 import fitz  # PyMuPDF for PDF parsing
 from tika import parser  # Apache Tika for document parsing
 import openpyxl
@@ -127,6 +127,142 @@ doc_interface = gr.Interface(fn=answer_question_from_document, inputs=[gr.File()
 demo = gr.TabbedInterface([doc_interface], ["Document QA"])
 app = gr.mount_gradio_app(app, demo, path="/")
 @app.get("/")
 def home():
     return RedirectResponse(url="/")

+"""from fastapi import FastAPI, File, UploadFile
 import fitz  # PyMuPDF for PDF parsing
 from tika import parser  # Apache Tika for document parsing
 import openpyxl
 demo = gr.TabbedInterface([doc_interface], ["Document QA"])
 app = gr.mount_gradio_app(app, demo, path="/")
+@app.get("/")
+def home():
+    return RedirectResponse(url="/")
+"""
+from fastapi import FastAPI, File, UploadFile
+import fitz  # PyMuPDF for PDF parsing
+import openpyxl
+from pptx import Presentation
+import torch
+from torchvision import transforms
+from torchvision.models.detection import fasterrcnn_resnet50_fpn
+from PIL import Image
+from transformers import pipeline
+import gradio as gr
+from fastapi.responses import RedirectResponse
+import numpy as np
+import docx
+# Initialize FastAPI
+print("🚀 FastAPI server is starting...")
+app = FastAPI()
+# Load AI model for question answering (TinyLlama-1.1B-Chat, see qa_pipeline below)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# Preload Hugging Face model
+# Startup log line. NOTE(review): the f-string has no placeholders, so a
+# plain string literal would be equivalent.
+print(f"🔄 Loading models")
+# Text-generation pipeline used by answer_question_from_document. It is built
+# once at import time so every request reuses the loaded model.
+# NOTE(review): device=-1 presumably selects CPU per the transformers
+# pipeline convention - confirm against the installed version.
+qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=-1)
+# Load Pretrained Object Detection Model (Torchvision)
+# NOTE(review): `model` and `transform` are not referenced by any function in
+# the visible part of this file; confirm they are still needed before paying
+# the load cost at startup.
+from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
+weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
+model = fasterrcnn_resnet50_fpn(weights=weights)
+# Switch the detector to evaluation mode.
+model.eval()
+# Image Transformations
+# Single-step pipeline that converts an input image to a tensor.
+transform = transforms.Compose([
+    transforms.ToTensor()
+])
+# Allowed File Extensions
+ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}
+def validate_file_type(file):
+    """Check that an uploaded file has a supported extension.
+
+    Args:
+        file: The upload handed over by the UI: either an object exposing
+            its path as ``.name`` or a plain path string. ``None`` means
+            nothing was uploaded.
+
+    Returns:
+        ``None`` when the extension is in ``ALLOWED_EXTENSIONS``, otherwise
+        a user-facing error message.
+    """
+    if file is None:
+        # No upload: report it instead of failing on ``None.name``.
+        return "❌ No file provided."
+    filename = str(getattr(file, "name", file))
+    # rpartition leaves ``dot`` empty for dot-less names, so a bare name such
+    # as "README" is not mistaken for its own extension.
+    _, dot, suffix = filename.rpartition(".")
+    ext = suffix.lower() if dot else ""
+    print(f"🔍 Validating file type: {ext}")
+    if ext not in ALLOWED_EXTENSIONS:
+        return f"❌ Unsupported file format: {ext}"
+    return None
+# Function to truncate text to at most 450 whitespace-separated words (by default)
+def truncate_text(text, max_tokens=450):
+    """Keep only the leading ``max_tokens`` whitespace-separated words.
+
+    Words are re-joined with single spaces, so runs of whitespace in the
+    input are collapsed. The log line is printed on every call, whether or
+    not anything was actually cut.
+    """
+    kept_words = text.split()[:max_tokens]
+    print(f"✂️ Truncated text to {max_tokens} tokens.")
+    return " ".join(kept_words)
+# Document Text Extraction Functions
+def extract_text_from_pdf(pdf_file):
+    """Extract the plain text of every page of a PDF.
+
+    Args:
+        pdf_file: Path to the PDF, or an upload object exposing the path as
+            ``.name``.
+
+    Returns:
+        The page texts joined by newlines; "⚠️ No text found." when the
+        pages contain nothing but whitespace; or an "❌ Error reading PDF:
+        ..." message when anything fails (errors are reported as text, never
+        raised).
+    """
+    try:
+        print("📄 Extracting text from PDF...")
+        # fitz.open() expects a path, so unwrap upload objects to their path.
+        pdf_path = getattr(pdf_file, "name", pdf_file)
+        # The context manager closes the document even if a page fails.
+        with fitz.open(pdf_path) as doc:
+            text = "\n".join(page.get_text("text") for page in doc)
+        print("✅ PDF text extraction completed.")
+        # Joining empty pages yields a newline-only string, which is truthy;
+        # strip before deciding whether any text was found.
+        return text if text.strip() else "⚠️ No text found."
+    except Exception as e:
+        return f"❌ Error reading PDF: {str(e)}"
+def extract_text_from_docx(docx_file):
+    """Return the paragraph text of a DOCX file, one paragraph per line.
+
+    Gives "⚠️ No text found." when the joined text is empty, and an
+    "❌ Error reading DOCX: ..." message instead of raising on failure.
+    """
+    try:
+        print("📝 Extracting text from DOCX...")
+        document = docx.Document(docx_file)
+        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
+        contents = "\n".join(paragraph_texts)
+        print("✅ DOCX text extraction completed.")
+        return contents or "⚠️ No text found."
+    except Exception as err:
+        return f"❌ Error reading DOCX: {str(err)}"
+def extract_text_from_pptx(pptx_file):
+    """Collect the text of every shape that has a ``text`` attribute.
+
+    Shapes are visited slide by slide and their texts joined with newlines.
+    Gives "⚠️ No text found." when no shape carried text, and an
+    "❌ Error reading PPTX: ..." message instead of raising on failure.
+    """
+    try:
+        print("📊 Extracting text from PPTX...")
+        deck = Presentation(pptx_file)
+        shape_texts = [
+            shape.text
+            for slide in deck.slides
+            for shape in slide.shapes
+            if hasattr(shape, "text")
+        ]
+        print("✅ PPTX text extraction completed.")
+        if not shape_texts:
+            return "⚠️ No text found."
+        return "\n".join(shape_texts)
+    except Exception as err:
+        return f"❌ Error reading PPTX: {str(err)}"
+def extract_text_from_excel(excel_file):
+    """Extract cell values from every worksheet of an Excel workbook.
+
+    Each non-empty row becomes one line, its non-empty cell values separated
+    by single spaces.
+
+    Args:
+        excel_file: Workbook path or file object, passed straight to
+            ``openpyxl.load_workbook``.
+
+    Returns:
+        The rows joined by newlines; "⚠️ No text found." when no row held a
+        value; or an "❌ Error reading Excel: ..." message when anything
+        fails (errors are reported as text, never raised).
+    """
+    try:
+        print("📊 Extracting text from Excel...")
+        wb = openpyxl.load_workbook(excel_file, read_only=True)
+        try:
+            text = []
+            for sheet in wb.worksheets:
+                for row in sheet.iter_rows(values_only=True):
+                    # Empty cells come back as None; drop them so the
+                    # literal word "None" does not leak into the text.
+                    cells = [str(cell) for cell in row if cell is not None]
+                    if cells:
+                        text.append(" ".join(cells))
+        finally:
+            # Read-only workbooks keep the file open until closed explicitly.
+            wb.close()
+        print("✅ Excel text extraction completed.")
+        return "\n".join(text) if text else "⚠️ No text found."
+    except Exception as e:
+        return f"❌ Error reading Excel: {str(e)}"
+def answer_question_from_document(file, question):
+    """Answer a question using the text of an uploaded document.
+
+    The document is validated, its text is extracted according to its
+    extension, the text is truncated, and the result is sent to the
+    text-generation pipeline together with the question.
+
+    Args:
+        file: Upload object exposing its path as ``.name``; ``None`` when
+            nothing was uploaded.
+        question: The user's question.
+
+    Returns:
+        The generated answer, or a user-facing status message when the file
+        is missing, unsupported, unreadable, or contains no text.
+    """
+    print("📂 Processing document for QA...")
+    if file is None:
+        # No upload: report it instead of failing on ``None.name``.
+        return "❌ No file provided."
+    validation_error = validate_file_type(file)
+    if validation_error:
+        return validation_error
+    extractors = {
+        "pdf": extract_text_from_pdf,
+        "docx": extract_text_from_docx,
+        "pptx": extract_text_from_pptx,
+        "xlsx": extract_text_from_excel,
+    }
+    file_ext = file.name.split(".")[-1].lower()
+    extractor = extractors.get(file_ext)
+    if extractor is None:
+        # Only reachable if the allowed extensions and this table diverge.
+        return "❌ Unsupported file format!"
+    text = extractor(file)
+    if not text:
+        return "⚠️ No text extracted from the document."
+    # Extractors report problems as text; surface those messages to the user
+    # instead of handing them to the model as if they were document content.
+    if text == "⚠️ No text found." or text.startswith("❌ Error reading "):
+        return text
+    truncated_text = truncate_text(text)
+    print("🤖 Generating response...")
+    # return_full_text=False keeps the prompt (question plus context) out of
+    # the returned answer.
+    response = qa_pipeline(
+        f"Question: {question}\nContext: {truncated_text}",
+        return_full_text=False,
+    )
+    print("✅ AI response generated.")
+    return response[0]["generated_text"]
+print("✅ Models loaded successfully.")
+# Gradio UI: a single "Document QA" tab whose inputs are a file upload and a
+# question text box, wired to answer_question_from_document.
+doc_interface = gr.Interface(fn=answer_question_from_document, inputs=[gr.File(), gr.Textbox()], outputs="text")
+demo = gr.TabbedInterface([doc_interface], ["Document QA"])
+# Serve the Gradio UI from the FastAPI app at the root path.
+app = gr.mount_gradio_app(app, demo, path="/")
 # NOTE(review): this handler is registered for "/" and redirects to "/", i.e.
 # to itself. Presumably the Gradio mount above takes the request first so the
 # handler never runs - confirm, then remove it or point it at another path.
 @app.get("/")
 def home():
     return RedirectResponse(url="/")