Spaces:
Sleeping
Sleeping
| """from fastapi import FastAPI, File, UploadFile | |
| import fitz # PyMuPDF for PDF parsing | |
| from tika import parser # Apache Tika for document parsing | |
| import openpyxl | |
| from pptx import Presentation | |
| import torch | |
| from torchvision import transforms | |
| from torchvision.models.detection import fasterrcnn_resnet50_fpn | |
| from PIL import Image | |
| from transformers import pipeline | |
| import gradio as gr | |
| from fastapi.responses import RedirectResponse | |
| import numpy as np | |
| # Initialize FastAPI | |
| print("π FastAPI server is starting...") | |
| app = FastAPI() | |
| # Load AI Model for Question Answering (DeepSeek-V2-Chat) | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| # Preload Hugging Face model | |
| print(f"π Loading models") | |
| qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=-1) | |
| # Load Pretrained Object Detection Model (Torchvision) | |
| from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights | |
| weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT | |
| model = fasterrcnn_resnet50_fpn(weights=weights) | |
| model.eval() | |
| # Image Transformations | |
| transform = transforms.Compose([ | |
| transforms.ToTensor() | |
| ]) | |
| # Allowed File Extensions | |
| ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"} | |
| def validate_file_type(file): | |
| ext = file.name.split(".")[-1].lower() | |
| print(f"π Validating file type: {ext}") | |
| if ext not in ALLOWED_EXTENSIONS: | |
| return f"β Unsupported file format: {ext}" | |
| return None | |
| # Function to truncate text to 450 tokens | |
| def truncate_text(text, max_tokens=450): | |
| words = text.split() | |
| truncated = " ".join(words[:max_tokens]) | |
| print(f"βοΈ Truncated text to {max_tokens} tokens.") | |
| return truncated | |
| # Document Text Extraction Functions | |
| def extract_text_from_pdf(pdf_file): | |
| try: | |
| print("π Extracting text from PDF...") | |
| doc = fitz.open(pdf_file) | |
| text = "\n".join([page.get_text("text") for page in doc]) | |
| print("β PDF text extraction completed.") | |
| return text if text else "β οΈ No text found." | |
| except Exception as e: | |
| return f"β Error reading PDF: {str(e)}" | |
| def extract_text_with_tika(file): | |
| try: | |
| print("π Extracting text with Tika...") | |
| parsed = parser.from_buffer(file) | |
| print("β Tika text extraction completed.") | |
| return parsed.get("content", "β οΈ No text found.").strip() | |
| except Exception as e: | |
| return f"β Error reading document: {str(e)}" | |
| def extract_text_from_pptx(pptx_file): | |
| try: | |
| print("π Extracting text from PPTX...") | |
| ppt = Presentation(pptx_file) | |
| text = [] | |
| for slide in ppt.slides: | |
| for shape in slide.shapes: | |
| if hasattr(shape, "text"): | |
| text.append(shape.text) | |
| print("β PPTX text extraction completed.") | |
| return "\n".join(text) if text else "β οΈ No text found." | |
| except Exception as e: | |
| return f"β Error reading PPTX: {str(e)}" | |
| def extract_text_from_excel(excel_file): | |
| try: | |
| print("π Extracting text from Excel...") | |
| wb = openpyxl.load_workbook(excel_file, read_only=True) | |
| text = [] | |
| for sheet in wb.worksheets: | |
| for row in sheet.iter_rows(values_only=True): | |
| text.append(" ".join(map(str, row))) | |
| print("β Excel text extraction completed.") | |
| return "\n".join(text) if text else "β οΈ No text found." | |
| except Exception as e: | |
| return f"β Error reading Excel: {str(e)}" | |
| def answer_question_from_document(file, question): | |
| print("π Processing document for QA...") | |
| validation_error = validate_file_type(file) | |
| if validation_error: | |
| return validation_error | |
| file_ext = file.name.split(".")[-1].lower() | |
| if file_ext == "pdf": | |
| text = extract_text_from_pdf(file) | |
| elif file_ext in ["docx", "pptx"]: | |
| text = extract_text_with_tika(file) | |
| elif file_ext == "xlsx": | |
| text = extract_text_from_excel(file) | |
| else: | |
| return "β Unsupported file format!" | |
| if not text: | |
| return "β οΈ No text extracted from the document." | |
| truncated_text = truncate_text(text) | |
| print("π€ Generating response...") | |
| response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}") | |
| print("β AI response generated.") | |
| return response[0]["generated_text"] | |
| print("β Models loaded successfully.") | |
| doc_interface = gr.Interface(fn=answer_question_from_document, inputs=[gr.File(), gr.Textbox()], outputs="text") | |
| demo = gr.TabbedInterface([doc_interface], ["Document QA"]) | |
| app = gr.mount_gradio_app(app, demo, path="/") | |
| @app.get("/") | |
| def home(): | |
| return RedirectResponse(url="/") | |
| """ | |
| from fastapi import FastAPI, File, UploadFile | |
| import fitz # PyMuPDF for PDF parsing | |
| import openpyxl | |
| from pptx import Presentation | |
| import torch | |
| from torchvision import transforms | |
| from torchvision.models.detection import fasterrcnn_resnet50_fpn | |
| from PIL import Image | |
| from transformers import pipeline | |
| import gradio as gr | |
| from fastapi.responses import RedirectResponse | |
| import numpy as np | |
| import docx | |
# Create the FastAPI application object that the Gradio UI is mounted on later.
print("π FastAPI server is starting...")
app = FastAPI()

# These two classes are imported here but the question-answering model below
# is built through `pipeline`, not through them.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Build the text-generation pipeline once, at import time, so every request
# reuses the same TinyLlama chat checkpoint (device=-1 keeps it on the CPU).
print("π Loading models")
qa_pipeline = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device=-1,
)

# Pretrained torchvision Faster R-CNN detector, switched to inference mode.
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights

weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights)
model.eval()

# Image preprocessing pipeline: a single ToTensor conversion step.
transform = transforms.Compose([transforms.ToTensor()])
# File extensions (lower-case, without the dot) accepted for question answering.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}


def validate_file_type(file):
    """Check that an uploaded file has a supported extension.

    Args:
        file: Upload object exposing the file path/name as ``file.name``.
            May be ``None`` when nothing was uploaded.

    Returns:
        ``None`` when the extension is in ``ALLOWED_EXTENSIONS``; otherwise a
        human-readable error message.
    """
    # A missing upload (or an object without a usable name) used to raise
    # AttributeError here; report it as a validation failure instead.
    name = getattr(file, "name", None)
    if not name:
        return "No file provided. Please upload a PDF, DOCX, PPTX or XLSX file."

    # The extension is whatever follows the last dot, compared case-insensitively.
    ext = str(name).split(".")[-1].lower()
    print(f"π Validating file type: {ext}")
    if ext not in ALLOWED_EXTENSIONS:
        return f"β Unsupported file format: {ext}"
    return None
# Keep only the leading part of a text so the prompt stays short.
def truncate_text(text, max_tokens=450):
    """Return the first ``max_tokens`` whitespace-separated words of ``text``.

    "Tokens" here are plain words produced by ``str.split()``, not model
    tokens. Runs of whitespace are collapsed to single spaces in the result.
    """
    kept_words = text.split()[:max_tokens]
    print(f"βοΈ Truncated text to {max_tokens} tokens.")
    return " ".join(kept_words)
# Document Text Extraction Functions
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Value passed straight to ``fitz.open``.

    Returns:
        The page texts joined with newlines, a "No text found" message when
        the document contains no non-whitespace text, or an "Error reading
        PDF" message when opening or parsing fails. Errors are reported
        in-band as strings; this function does not raise.
    """
    try:
        print("π Extracting text from PDF...")
        # The context manager closes the document even if a page fails to
        # parse; previously the handle was never released.
        with fitz.open(pdf_file) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        print("β PDF text extraction completed.")
        # Pages without a text layer yield "", so the join can be made of
        # newlines only; treat whitespace-only output as "no text".
        return text if text.strip() else "β οΈ No text found."
    except Exception as e:
        return f"β Error reading PDF: {str(e)}"
def extract_text_from_docx(docx_file):
    """Extract the paragraph text of a DOCX document.

    Args:
        docx_file: Value passed straight to ``docx.Document``.

    Returns:
        The paragraph texts joined with newlines, a "No text found" message
        when the paragraphs contain no non-whitespace text, or an "Error
        reading DOCX" message when opening or parsing fails. Errors are
        reported in-band as strings; this function does not raise.
    """
    try:
        print("π Extracting text from DOCX...")
        doc = docx.Document(docx_file)
        text = "\n".join(para.text for para in doc.paragraphs)
        print("β DOCX text extraction completed.")
        # Empty paragraphs still contribute newlines to the join, so check
        # for real content rather than plain truthiness.
        return text if text.strip() else "β οΈ No text found."
    except Exception as e:
        return f"β Error reading DOCX: {str(e)}"
def extract_text_from_pptx(pptx_file):
    """Extract the text of every text-bearing shape in a PPTX presentation.

    Args:
        pptx_file: Value passed straight to ``Presentation``.

    Returns:
        The shape texts joined with newlines (slide order, then shape order),
        a "No text found" message when no shape carries non-whitespace text,
        or an "Error reading PPTX" message when opening or parsing fails.
        Errors are reported in-band as strings; this function does not raise.
    """
    try:
        print("π Extracting text from PPTX...")
        ppt = Presentation(pptx_file)
        # Skip shapes whose text is empty: they used to make the list truthy
        # and produced blank lines instead of the "No text found" fallback.
        text = [
            shape.text
            for slide in ppt.slides
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]
        print("β PPTX text extraction completed.")
        return "\n".join(text) if text else "β οΈ No text found."
    except Exception as e:
        return f"β Error reading PPTX: {str(e)}"
def extract_text_from_excel(excel_file):
    """Extract the cell values of every worksheet in an XLSX workbook.

    Each non-empty row becomes one line holding its non-empty cell values
    separated by single spaces.

    Args:
        excel_file: Value passed straight to ``openpyxl.load_workbook``.

    Returns:
        The row lines joined with newlines, a "No text found" message when no
        cell holds a value, or an "Error reading Excel" message when opening
        or parsing fails. Errors are reported in-band as strings; this
        function does not raise.
    """
    wb = None
    try:
        print("π Extracting text from Excel...")
        wb = openpyxl.load_workbook(excel_file, read_only=True)
        text = []
        for sheet in wb.worksheets:
            for row in sheet.iter_rows(values_only=True):
                # Empty cells come back as None; rendering them as the literal
                # "None" only adds noise to the model context.
                cells = [str(value) for value in row if value is not None]
                if cells:
                    text.append(" ".join(cells))
        print("β Excel text extraction completed.")
        return "\n".join(text) if text else "β οΈ No text found."
    except Exception as e:
        return f"β Error reading Excel: {str(e)}"
    finally:
        # Read-only workbooks keep the underlying file open until closed.
        if wb is not None:
            wb.close()
def _extraction_failed(text):
    """Return True when ``text`` is an in-band failure message from an extractor."""
    import re  # local import keeps this block self-contained

    # Extractors report problems as "<marker> Error reading <KIND>: ..." or
    # "<marker> No text found."; tolerate any short non-ASCII-letter marker.
    pattern = (
        r"[^A-Za-z]{0,8}"
        r"(?:Error reading (?:PDF|DOCX|PPTX|Excel): |No text found\.\s*$)"
    )
    return re.match(pattern, text) is not None


def answer_question_from_document(file, question):
    """Answer a question about an uploaded document with the QA pipeline.

    Args:
        file: Upload object exposing the file path/name as ``file.name``;
            ``None`` when nothing was uploaded.
        question: The user's question.

    Returns:
        The generated text from ``qa_pipeline``, or a human-readable message
        when the input is missing, the format is unsupported, or text
        extraction failed.
    """
    print("π Processing document for QA...")

    # Guard against an empty form: previously a missing upload crashed inside
    # validation and a blank question was still sent to the model.
    if file is None:
        return "No document uploaded. Please upload a PDF, DOCX, PPTX or XLSX file."
    if not question or not question.strip():
        return "Please enter a question about the document."

    validation_error = validate_file_type(file)
    if validation_error:
        return validation_error

    # One extractor per supported extension.
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "pptx": extract_text_from_pptx,
        "xlsx": extract_text_from_excel,
    }
    file_ext = file.name.split(".")[-1].lower()
    extractor = extractors.get(file_ext)
    if extractor is None:
        return "β Unsupported file format!"

    text = extractor(file)
    if not text or not text.strip():
        return "β οΈ No text extracted from the document."
    # Extractors return their error/empty messages as ordinary strings, so
    # surface those to the user instead of feeding them to the model as context.
    if _extraction_failed(text):
        return text

    truncated_text = truncate_text(text)
    print("π€ Generating response...")
    response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
    print("β AI response generated.")
    return response[0]["generated_text"]
print("β Models loaded successfully.")

# Gradio form: a file upload plus a question box, answered as plain text.
doc_interface = gr.Interface(
    fn=answer_question_from_document,
    inputs=[gr.File(), gr.Textbox()],
    outputs="text",
)

# Wrap the form in a tabbed layout with a single "Document QA" tab.
demo = gr.TabbedInterface([doc_interface], ["Document QA"])

# Mount the Gradio UI on the FastAPI app at the root path.
app = gr.mount_gradio_app(app, demo, path="/")
def home():
    """Return a redirect response pointing at the root path ("/").

    NOTE(review): no route decorator is attached to this function here, so it
    does not appear to be registered with the app — confirm whether that is
    intentional.
    """
    root_redirect = RedirectResponse(url="/")
    return root_redirect