Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
| 2 |
import pdfplumber
|
| 3 |
-
import pytesseract
|
| 4 |
-
from PIL import Image
|
| 5 |
import easyocr
|
| 6 |
import docx
|
| 7 |
import openpyxl
|
|
@@ -21,14 +19,16 @@ app = FastAPI()
|
|
| 21 |
vqa_pipeline = pipeline("image-to-text", model="Salesforce/blip-vqa-base")
|
| 22 |
code_generator = pipeline("text-generation", model="openai-community/gpt2-medium")
|
| 23 |
table_analyzer = pipeline("table-question-answering", model="google/tapas-large-finetuned-wtq")
|
| 24 |
-
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# ✅ Functions for Document & Image QA
|
| 27 |
def extract_text_from_pdf(pdf_file):
|
| 28 |
text = ""
|
| 29 |
with pdfplumber.open(pdf_file) as pdf:
|
| 30 |
for page in pdf.pages:
|
| 31 |
-
text += page.extract_text() + "\n"
|
| 32 |
return text.strip()
|
| 33 |
|
| 34 |
def extract_text_from_docx(docx_file):
|
|
@@ -49,7 +49,7 @@ def extract_text_from_excel(excel_file):
|
|
| 49 |
text = []
|
| 50 |
for sheet in wb.worksheets:
|
| 51 |
for row in sheet.iter_rows(values_only=True):
|
| 52 |
-
text.append(" ".join(
|
| 53 |
return "\n".join(text)
|
| 54 |
|
| 55 |
def extract_text_from_image(image_file):
|
|
@@ -74,7 +74,7 @@ def answer_question_from_document(file, question):
|
|
| 74 |
if not text:
|
| 75 |
return "No text extracted from the document."
|
| 76 |
|
| 77 |
-
response = qa_pipeline(question
|
| 78 |
return response["answer"]
|
| 79 |
|
| 80 |
def answer_question_from_image(image, question):
|
|
@@ -82,7 +82,7 @@ def answer_question_from_image(image, question):
|
|
| 82 |
if not image_text:
|
| 83 |
return "No text detected in the image."
|
| 84 |
|
| 85 |
-
response = qa_pipeline(question
|
| 86 |
return response["answer"]
|
| 87 |
|
| 88 |
# ✅ Gradio UI for Document & Image QA
|
|
@@ -124,6 +124,9 @@ def generate_visualization(excel_file, viz_type, user_request):
|
|
| 124 |
else:
|
| 125 |
generated_code = "Error: Model did not return valid code."
|
| 126 |
|
|
|
|
|
|
|
|
|
|
| 127 |
try:
|
| 128 |
exec_globals = {"plt": plt, "sns": sns, "pd": pd, "df": df, "io": io}
|
| 129 |
exec(generated_code, exec_globals)
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
import pdfplumber
|
|
|
|
|
|
|
| 3 |
import easyocr
|
| 4 |
import docx
|
| 5 |
import openpyxl
|
|
|
|
| 19 |
vqa_pipeline = pipeline("image-to-text", model="Salesforce/blip-vqa-base")
|
| 20 |
code_generator = pipeline("text-generation", model="openai-community/gpt2-medium")
|
| 21 |
table_analyzer = pipeline("table-question-answering", model="google/tapas-large-finetuned-wtq")
|
| 22 |
+
|
| 23 |
+
# ✅ Corrected Question-Answering Model
|
| 24 |
+
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
|
| 25 |
|
| 26 |
# ✅ Functions for Document & Image QA
|
| 27 |
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of all pages in a PDF.

    Each page that yields text contributes that text followed by a newline;
    pages for which ``extract_text()`` returns a falsy value (``None`` or an
    empty string) contribute nothing. Leading and trailing whitespace is
    stripped from the combined result.

    Args:
        pdf_file: Passed straight to ``pdfplumber.open``.
            NOTE(review): presumably a path or a binary file object —
            confirm against the caller.

    Returns:
        The stripped, newline-separated page text, or ``""`` when no page
        produced any text.
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Extract once per page and reuse the result, instead of calling
            # extract_text() a second time just to test for emptiness.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text + "\n")
    # Single join instead of repeated string += inside the loop.
    return "".join(page_texts).strip()
|
| 33 |
|
| 34 |
def extract_text_from_docx(docx_file):
|
|
|
|
| 49 |
text = []
|
| 50 |
for sheet in wb.worksheets:
|
| 51 |
for row in sheet.iter_rows(values_only=True):
|
| 52 |
+
text.append(" ".join([str(cell) if cell is not None else "" for cell in row]))
|
| 53 |
return "\n".join(text)
|
| 54 |
|
| 55 |
def extract_text_from_image(image_file):
|
|
|
|
| 74 |
if not text:
|
| 75 |
return "No text extracted from the document."
|
| 76 |
|
| 77 |
+
response = qa_pipeline({"question": question, "context": text})
|
| 78 |
return response["answer"]
|
| 79 |
|
| 80 |
def answer_question_from_image(image, question):
|
|
|
|
| 82 |
if not image_text:
|
| 83 |
return "No text detected in the image."
|
| 84 |
|
| 85 |
+
response = qa_pipeline({"question": question, "context": image_text})
|
| 86 |
return response["answer"]
|
| 87 |
|
| 88 |
# ✅ Gradio UI for Document & Image QA
|
|
|
|
| 124 |
else:
|
| 125 |
generated_code = "Error: Model did not return valid code."
|
| 126 |
|
| 127 |
+
if "plt" not in generated_code or "sns" not in generated_code:
|
| 128 |
+
return generated_code, "Generated code seems incorrect."
|
| 129 |
+
|
| 130 |
try:
|
| 131 |
exec_globals = {"plt": plt, "sns": sns, "pd": pd, "df": df, "io": io}
|
| 132 |
exec(generated_code, exec_globals)
|