Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from fastapi import FastAPI
|
| 2 |
import os
|
| 3 |
-
import
|
| 4 |
from pptx import Presentation # PowerPoint
|
| 5 |
from sentence_transformers import SentenceTransformer # Text embeddings
|
| 6 |
import torch
|
|
@@ -29,7 +29,7 @@ os.makedirs(IMAGE_FOLDER, exist_ok=True)
|
|
| 29 |
|
| 30 |
# Extract text from PDF
|
| 31 |
def extract_text_from_pdf(pdf_path):
|
| 32 |
-
return " ".join([page.get_text() for page in
|
| 33 |
|
| 34 |
# Extract text from PowerPoint
|
| 35 |
def extract_text_from_pptx(pptx_path):
|
|
@@ -38,7 +38,7 @@ def extract_text_from_pptx(pptx_path):
|
|
| 38 |
# Extract images from PDF
|
| 39 |
def extract_images_from_pdf(pdf_path):
|
| 40 |
images = []
|
| 41 |
-
doc =
|
| 42 |
for i, page in enumerate(doc):
|
| 43 |
for img_index, img in enumerate(page.get_images(full=True)):
|
| 44 |
xref = img[0]
|
|
|
|
| 1 |
from fastapi import FastAPI
|
| 2 |
import os
|
| 3 |
+
import pymupdf
|
| 4 |
from pptx import Presentation # PowerPoint
|
| 5 |
from sentence_transformers import SentenceTransformer # Text embeddings
|
| 6 |
import torch
|
|
|
|
| 29 |
|
| 30 |
# Extract text from PDF
|
| 31 |
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in the PDF at *pdf_path* as one string.

    Each page's text is obtained with ``page.get_text()``; the per-page
    strings are joined with a single space and the result is stripped of
    leading and trailing whitespace.
    """
    # Open the document in a ``with`` block so it is closed (and its file
    # handle released) even if text extraction raises; previously the
    # document object was left open after the call returned.
    with pymupdf.open(pdf_path) as doc:
        return " ".join([page.get_text() for page in doc]).strip()
|
| 33 |
|
| 34 |
# Extract text from PowerPoint
|
| 35 |
def extract_text_from_pptx(pptx_path):
|
|
|
|
| 38 |
# Extract images from PDF
|
| 39 |
def extract_images_from_pdf(pdf_path):
|
| 40 |
images = []
|
| 41 |
+
doc = pymupdf.open(pdf_path)
|
| 42 |
for i, page in enumerate(doc):
|
| 43 |
for img_index, img in enumerate(page.get_images(full=True)):
|
| 44 |
xref = img[0]
|