Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from fastapi import FastAPI
|
| 2 |
import os
|
| 3 |
-
import fitz # PyMuPDF
|
| 4 |
from pptx import Presentation
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
import torch
|
|
@@ -31,7 +31,7 @@ os.makedirs(IMAGE_FOLDER, exist_ok=True)
|
|
| 31 |
# Extract Text from PDF
|
| 32 |
def extract_text_from_pdf(pdf_path):
|
| 33 |
try:
|
| 34 |
-
doc = fitz.open(pdf_path)
|
| 35 |
text = " ".join(page.get_text() for page in doc)
|
| 36 |
return text.strip() if text else None
|
| 37 |
except Exception as e:
|
|
@@ -53,7 +53,7 @@ def extract_text_from_pptx(pptx_path):
|
|
| 53 |
# Extract Images from PDF
|
| 54 |
def extract_images_from_pdf(pdf_path):
|
| 55 |
try:
|
| 56 |
-
doc = fitz.open(pdf_path)
|
| 57 |
images = []
|
| 58 |
for i, page in enumerate(doc):
|
| 59 |
for img_index, img in enumerate(page.get_images(full=True)):
|
|
|
|
| 1 |
from fastapi import FastAPI
|
| 2 |
import os
|
| 3 |
+
import pymupdf # PyMuPDF
|
| 4 |
from pptx import Presentation
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
import torch
|
|
|
|
| 31 |
# Extract Text from PDF
|
| 32 |
def extract_text_from_pdf(pdf_path):
|
| 33 |
try:
|
| 34 |
+
doc = pymupdf.open(pdf_path)
|
| 35 |
text = " ".join(page.get_text() for page in doc)
|
| 36 |
return text.strip() if text else None
|
| 37 |
except Exception as e:
|
|
|
|
| 53 |
# Extract Images from PDF
|
| 54 |
def extract_images_from_pdf(pdf_path):
|
| 55 |
try:
|
| 56 |
+
doc = pymupdf.open(pdf_path)
|
| 57 |
images = []
|
| 58 |
for i, page in enumerate(doc):
|
| 59 |
for img_index, img in enumerate(page.get_images(full=True)):
|