restapitrial_vectordb

Sleeping

App Files Files Community

Redmind committed on Feb 13

Commit

bbe1084

verified ·

1 Parent(s): 5419df9

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -91

app.py CHANGED Viewed

@@ -1,55 +1,40 @@
 from fastapi import FastAPI
 import os
 import pymupdf
-from pptx import Presentation  # python-pptx for PowerPoint
 from sentence_transformers import SentenceTransformer  # Text embeddings
 import torch
 from transformers import CLIPProcessor, CLIPModel  # Image embeddings
 from PIL import Image
 import chromadb
 app = FastAPI()
 client = chromadb.PersistentClient(path="/data/chroma_db")
 collection = client.get_or_create_collection(name="knowledge_base")
-print("Created collection with 512 dimensions!")
-pdf_file="Sutures and Suturing techniques.pdf"
-pptx_file="impalnt 1.pptx"
-collection = client.get_collection(name="knowledge_base")
-print("Collection Embedding Dimension:", collection.metadata)
 # Initialize models
 text_model = SentenceTransformer('all-MiniLM-L6-v2')
-# Folder for extracted images
 IMAGE_FOLDER = "/data/extracted_images"
 os.makedirs(IMAGE_FOLDER, exist_ok=True)
-### Step 1: Extract Text from PDF ###
 def extract_text_from_pdf(pdf_path):
-    text = ""
-    doc = pymupdf.open(pdf_path)
-    for page in doc:
-        text += page.get_text() + "\n"
     return text.strip()
-### Step 2: Extract Text from PowerPoint ###
 def extract_text_from_pptx(pptx_path):
-    text = ""
-    prs = Presentation(pptx_path)
-    for slide in prs.slides:
-        for shape in slide.shapes:
-            if hasattr(shape, "text"):
-                text += shape.text + "\n"
-    return text.strip()
-### Step 3: Extract Images from PDF ###
 def extract_images_from_pdf(pdf_path):
     images = []
     doc = pymupdf.open(pdf_path)
@@ -57,102 +42,73 @@ def extract_images_from_pdf(pdf_path):
         for img_index, img in enumerate(page.get_images(full=True)):
             xref = img[0]
             image = doc.extract_image(xref)
-            img_bytes = image["image"]
-            img_ext = image["ext"]
-            img_path = f"{IMAGE_FOLDER}/pdf_image_{i}_{img_index}.{img_ext}"
             with open(img_path, "wb") as f:
-                f.write(img_bytes)
             images.append(img_path)
     return images
-### Step 4: Extract Images from PowerPoint ###
 def extract_images_from_pptx(pptx_path):
     images = []
     prs = Presentation(pptx_path)
     for i, slide in enumerate(prs.slides):
         for shape in slide.shapes:
-            if shape.shape_type == 13:  # Picture shape type
-                image = shape.image
-                img_bytes = image.blob
-                img_ext = image.ext
-                img_path = f"{IMAGE_FOLDER}/pptx_image_{i}.{img_ext}"
                 with open(img_path, "wb") as f:
-                    f.write(img_bytes)
                 images.append(img_path)
     return images
-### Step 5: Convert Text to Embeddings ###
 def get_text_embedding(text):
     return text_model.encode(text).tolist()
-from transformers import CLIPProcessor, CLIPModel
-import torch
-import numpy as np
-from sklearn.decomposition import PCA
-# ✅ Load CLIP (512-dimensional output)
-model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 def get_image_embedding(image_path):
-    """Extracts image embedding and reduces to 384 dimensions"""
-    from PIL import Image
     image = Image.open(image_path)
     inputs = processor(images=image, return_tensors="pt")
     with torch.no_grad():
-        image_embedding = model.get_image_features(**inputs)  # Shape: (1, 512)
-    image_embedding = image_embedding.numpy().flatten()  # Convert to NumPy (512,)
-    # ✅ Reduce to 384 dimensions using PCA
-    pca = PCA(n_components=384)
-    image_embedding_384 = pca.fit_transform(image_embedding.reshape(1, -1))
-    return image_embedding_384.flatten().tolist()
-### Step 7: Store Data in ChromaDB ###
 def store_data(texts, image_paths):
-    # Store text embeddings
     for i, text in enumerate(texts):
-        text_embedding = get_text_embedding(text)
-        print("Embedding Dimension:", len(text_embedding))
-        collection.add(ids=[f"text_{i}"], embeddings=[text_embedding], documents=[text])
-    # Store image embeddings
-    for j, image_path in enumerate(image_paths):
-        image_embedding = get_image_embedding(image_path)
-        collection.add(ids=[f"image_{j}"], embeddings=[image_embedding], documents=[image_path])
     print("Data stored successfully!")
-### Step 8: Process and Store from Files ###
 def process_and_store(pdf_path=None, pptx_path=None):
     texts, images = [], []
     if pdf_path:
-        print(f"Processing PDF: {pdf_path}")
         texts.append(extract_text_from_pdf(pdf_path))
         images.extend(extract_images_from_pdf(pdf_path))
     if pptx_path:
-        print(f"Processing PPTX: {pptx_path}")
         texts.append(extract_text_from_pptx(pptx_path))
         images.extend(extract_images_from_pptx(pptx_path))
     store_data(texts, images)
 process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
 @app.get("/")
 def greet_json():
     return {"Hello": "World!"}
 @app.get("/test")
@@ -162,11 +118,5 @@ def greet_json():
 @app.get("/search/")
 def search(query: str):
     query_embedding = get_text_embedding(query)
-    results = collection.query(
-        query_embeddings=[query_embedding],
-        n_results=5
-    )
     return {"results": results["documents"]}

 from fastapi import FastAPI
 import os
 import pymupdf
+from pptx import Presentation  # PowerPoint
 from sentence_transformers import SentenceTransformer  # Text embeddings
 import torch
 from transformers import CLIPProcessor, CLIPModel  # Image embeddings
 from PIL import Image
 import chromadb
+import numpy as np
+from sklearn.decomposition import PCA
 # FastAPI application that the route decorators further down attach to
 app = FastAPI()
 # Chroma client that persists its data on disk under /data/chroma_db
 client = chromadb.PersistentClient(path="/data/chroma_db")
 # Collection that receives both the text entries and the image entries
 collection = client.get_or_create_collection(name="knowledge_base")
 # Source documents handed to process_and_store when this module is loaded
+pdf_file = "Sutures and Suturing techniques.pdf"
+pptx_file = "impalnt 1.pptx"
 # Initialize models
 text_model = SentenceTransformer('all-MiniLM-L6-v2')
 # CLIP model and its matching processor, used by get_image_embedding
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 # Folder that receives the images extracted from the documents;
 # it is created here if it does not exist yet
 IMAGE_FOLDER = "/data/extracted_images"
 os.makedirs(IMAGE_FOLDER, exist_ok=True)
+# Extract text from PDF
 def extract_text_from_pdf(pdf_path):
     """Return the concatenated plain text of every page in a PDF.

     Args:
         pdf_path: Path of the PDF file to read.

     Returns:
         The text of all pages joined in page order, with leading and
         trailing whitespace stripped.
     """
     # Open the document in a ``with`` block so it is closed as soon as the
     # page text has been collected, instead of staying open indefinitely.
     with pymupdf.open(pdf_path) as doc:
         text = "".join(page.get_text() for page in doc)
     return text.strip()
+# Extract text from PowerPoint
 def extract_text_from_pptx(pptx_path):
     """Return the text of every text-bearing shape in a PowerPoint file.

     Args:
         pptx_path: Path of the .pptx file to read.

     Returns:
         The ``text`` of each shape that has one, in slide order, one shape
         per line, with leading and trailing whitespace stripped.
     """
     shape_texts = []
     for slide in Presentation(pptx_path).slides:
         for shape in slide.shapes:
             # Shapes without a ``text`` attribute are skipped
             if hasattr(shape, "text"):
                 shape_texts.append(shape.text)
     # Separate shapes with a newline; joining on "" would glue the last
     # word of one shape to the first word of the next.
     return "\n".join(shape_texts).strip()
+# Extract images from PDF
 def extract_images_from_pdf(pdf_path):
     images = []
     doc = pymupdf.open(pdf_path)
         for img_index, img in enumerate(page.get_images(full=True)):
             xref = img[0]
             image = doc.extract_image(xref)
+            img_path = f"{IMAGE_FOLDER}/pdf_image_{i}_{img_index}.{image['ext']}"
             with open(img_path, "wb") as f:
+                f.write(image["image"])
             images.append(img_path)
     return images
+# Extract images from PowerPoint
 def extract_images_from_pptx(pptx_path):
     """Save every picture shape of a PowerPoint file into IMAGE_FOLDER.

     Files are named ``pptx_image_<slide>_<picture>.<ext>``, where
     ``<picture>`` counts the pictures within one slide.

     Args:
         pptx_path: Path of the .pptx file to read.

     Returns:
         A list with the path of each image file written, in slide order.
     """
     picture_shape_type = 13  # shape_type value that marks a picture shape
     images = []
     prs = Presentation(pptx_path)
     for i, slide in enumerate(prs.slides):
         picture_index = 0
         for shape in slide.shapes:
             if shape.shape_type != picture_shape_type:
                 continue
             # Number the pictures within the slide so that several pictures
             # on one slide do not overwrite each other's file.
             img_path = (
                 f"{IMAGE_FOLDER}/pptx_image_{i}_{picture_index}"
                 f".{shape.image.ext}"
             )
             picture_index += 1
             with open(img_path, "wb") as f:
                 f.write(shape.image.blob)
             images.append(img_path)
     return images
+# Convert text to embeddings
 def get_text_embedding(text):
     """Encode ``text`` with the module-level text model.

     Args:
         text: The string to embed.

     Returns:
         The encoded vector converted to a plain Python list.
     """
     vector = text_model.encode(text)
     return vector.tolist()
+# Extract image embeddings
 def get_image_embedding(image_path):
     """Return the CLIP image-feature vector of an image file as a flat list.

     Args:
         image_path: Path of the image file to embed.

     Returns:
         The flattened output of ``model.get_image_features`` as a list of
         floats.
     """
     # Keep the file open only while the processor reads the pixels, so the
     # file handle is released before the model runs.
     with Image.open(image_path) as image:
         inputs = processor(images=image, return_tensors="pt")
     # Inference only: no gradients are needed
     with torch.no_grad():
         image_embedding = model.get_image_features(**inputs).numpy().flatten()
     return image_embedding.tolist()
+# Store Data in ChromaDB
 def store_data(texts, image_paths):
     """Embed texts and images and write them to the ChromaDB collection.

     Text ``i`` is stored under the id ``text_<i>`` with the text itself as
     its document; image ``j`` is stored under ``image_<j>`` with its path
     as its document.

     Args:
         texts: Strings to embed with get_text_embedding.
         image_paths: Paths of image files to embed with get_image_embedding.
     """
     # The ids are positional and the collection is persistent, so running
     # this again must overwrite the existing entries rather than try to add
     # the same ids a second time; hence upsert instead of add.
     if texts:
         collection.upsert(
             ids=[f"text_{i}" for i, _ in enumerate(texts)],
             embeddings=[get_text_embedding(text) for text in texts],
             documents=list(texts),
         )

     if image_paths:
         # Collect all image embeddings first so PCA can be fitted on them
         all_embeddings = np.array(
             [get_image_embedding(img_path) for img_path in image_paths]
         )
         # PCA cannot return more components than there are samples, so it
         # is applied only when at least 384 images are available.
         if all_embeddings.shape[0] >= 384:
             transformed_embeddings = PCA(n_components=384).fit_transform(
                 all_embeddings
             )
         else:
             transformed_embeddings = all_embeddings  # Use original embeddings
         # NOTE(review): when PCA is skipped the image vectors keep the size
         # produced by get_image_embedding, which may differ from the size of
         # the text vectors stored above -- confirm the collection accepts
         # both before relying on this path.
         collection.upsert(
             ids=[f"image_{j}" for j, _ in enumerate(image_paths)],
             embeddings=transformed_embeddings.tolist(),
             documents=list(image_paths),
         )

     print("Data stored successfully!")
+# Process and store from files
 def process_and_store(pdf_path=None, pptx_path=None):
     """Extract text and images from the given files and store them.

     Args:
         pdf_path: Optional path of a PDF file; skipped when falsy.
         pptx_path: Optional path of a PowerPoint file; skipped when falsy.
     """
     collected_texts = []
     collected_images = []
     # Each source pairs a path with its text reader and its image reader;
     # the PDF is handled before the PowerPoint file.
     sources = (
         (pdf_path, extract_text_from_pdf, extract_images_from_pdf),
         (pptx_path, extract_text_from_pptx, extract_images_from_pptx),
     )
     for path, read_text, read_images in sources:
         if not path:
             continue
         collected_texts.append(read_text(path))
         collected_images.extend(read_images(path))
     store_data(collected_texts, collected_images)
 process_and_store(pdf_path=pdf_file, pptx_path=pptx_file)
 @app.get("/")
 def greet_json():
     """Answer the root route with a fixed JSON greeting."""
     greeting = {"Hello": "World!"}
     return greeting
 @app.get("/test")
 @app.get("/search/")
 def search(query: str):
     """Query the collection with the embedding of a text query.

     Args:
         query: The search text supplied by the caller.

     Returns:
         A dict whose ``results`` entry holds the ``documents`` field of the
         collection's answer for up to five matches.
     """
     matches = collection.query(
         query_embeddings=[get_text_embedding(query)],
         n_results=5,
     )
     return {"results": matches["documents"]}