import gradio as gr
import faiss
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
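# The imports above map to these pip packages (assumed install line, adjust as needed):
#   pip install gradio faiss-cpu numpy pypdf sentence-transformers transformers torch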
# ---- Models (CPU-friendly) ----
# Hugging Face's free Spaces tier provides only 2 vCPUs and 16 GB of RAM,
# so both models must stay lightweight and CPU-only.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # small & fast on CPU
GEN_MODEL_NAME = "google/flan-t5-small"  # text2text model that runs on CPU
embedder = SentenceTransformer(EMBED_MODEL_NAME)
generator = pipeline("text2text-generation", model=GEN_MODEL_NAME)
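# Note: both checkpoints are downloaded from the Hugging Face Hub on first run and
# cached afterwards (roughly 90 MB for MiniLM, ~300 MB for flan-t5-small).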
# ---- PDF to text ----
def pdfs_to_texts(files):
    texts = []
    for f in files:
        # Gradio v5's gr.Files passes file paths (type="filepath" is the default),
        # and pypdf's PdfReader accepts a path directly.
        reader = PdfReader(f)
        pages = [page.extract_text() or "" for page in reader.pages]
        texts.append("\n".join(pages))
    return texts
# ---- Chunking ----
def chunk_text(text, chunk_size=600, overlap=120):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i + chunk_size]
        chunks.append(" ".join(chunk))
        i += chunk_size - overlap
    return chunks
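# Worked example with the defaults: the window advances chunk_size - overlap = 480
# words per step, so chunk 0 covers words[0:600], chunk 1 covers words[480:1080],
# and consecutive chunks share 120 words of context.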
# ---- Build FAISS index from uploaded PDFs ----
index = None
corpus_chunks = []

def build_index(files, progress=gr.Progress()):
    global index, corpus_chunks
    texts = pdfs_to_texts(files)
    # basic cleanup + chunk
    corpus_chunks = []
    for t in texts:
        if not t.strip():
            continue
        corpus_chunks += chunk_text(t)
    if not corpus_chunks:
        return "No text extracted from PDFs.", None
    progress(0.3, desc="Embedding chunks…")
    embeddings = embedder.encode(corpus_chunks, convert_to_numpy=True, show_progress_bar=False)
    d = embeddings.shape[1]
    progress(0.6, desc="Creating FAISS index…")
    index = faiss.IndexFlatIP(d)  # cosine via inner product on normalized vectors
    # normalize to unit length so inner product equals cosine similarity
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-10
    embeddings = embeddings / norms
    index.add(embeddings.astype(np.float32))
    return f"Indexed {len(corpus_chunks)} chunks.", len(corpus_chunks)
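# IndexFlatIP does exact brute-force inner-product search and needs no training
# step; on unit-length vectors the inner product equals cosine similarity, and
# exact search stays fast at the few thousand chunks a handful of PDFs produces.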
# ---- RAG query -> retrieve -> generate ----
def answer_question(question, top_k=5, max_new_tokens=256):
    if index is None or not corpus_chunks:
        return "Index not built yet. Upload PDFs and click **Build Index** first."
    # embed query (normalize for inner product)
    q = embedder.encode([question], convert_to_numpy=True)
    q = q / (np.linalg.norm(q, axis=1, keepdims=True) + 1e-10)
    D, I = index.search(q.astype(np.float32), int(top_k))
    # FAISS pads missing results with -1, so guard against negative indices too
    retrieved = [corpus_chunks[i] for i in I[0] if 0 <= i < len(corpus_chunks)]
    context = "\n\n".join(retrieved)
    prompt = (
        "You are a helpful study assistant. Using ONLY the context, answer the question.\n"
        "If the answer isn't in the context, say you don't have enough information.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )
    # greedy decoding: temperature is ignored unless do_sample=True, so be explicit
    out = generator(prompt, max_new_tokens=int(max_new_tokens), do_sample=False)
    return out[0]["generated_text"].strip()
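# Caveat: flan-t5 was trained on 512-token inputs, so a large top_k can push the
# prompt past what the model handles well; passing truncation=True to the pipeline
# call (or lowering top_k) is one way to keep the context in bounds.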
# ---- Gradio v5 UI (Blocks) ----
with gr.Blocks(title="Group 5 Study Helper (RAG)") as demo:
    gr.Markdown("# Group 5 Study Helper (RAG)\nUpload PDFs → Build Index → Ask questions.")
    with gr.Row():
        file_in = gr.Files(file_types=[".pdf"], label="Upload PDF files")
    with gr.Row():
        build_btn = gr.Button("Build Index", variant="primary")
        status = gr.Markdown()
        chunk_count = gr.Number(label="Chunk count", interactive=False)
    with gr.Row():
        question = gr.Textbox(label="Your question")
    with gr.Row():
        topk = gr.Slider(1, 10, value=5, step=1, label="Top-K passages")
        max_tokens = gr.Slider(64, 512, value=256, step=16, label="Max new tokens")
    with gr.Row():
        ask_btn = gr.Button("Ask", variant="primary")
    with gr.Row():
        answer = gr.Markdown(label="Answer")

    def _build(files):
        msg, n = build_index(files)
        return msg, n or 0

    build_btn.click(_build, inputs=[file_in], outputs=[status, chunk_count])
    ask_btn.click(answer_question, inputs=[question, topk, max_tokens], outputs=[answer])

demo.launch()
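# To run locally (assuming this file is saved as app.py): `python app.py`, then open
# the URL Gradio prints (http://127.0.0.1:7860 by default). On a Hugging Face Space,
# naming the file app.py makes it the entry point.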