# Spaces: Sleeping
import logging
import os
import pickle
import re

import faiss
import numpy as np
import pdfplumber
from sentence_transformers import SentenceTransformer
# Book title -> absolute path of the source PDF for that book.
# The titles are reused as the keys of the pickled section data and, with
# spaces replaced by underscores, as the stem of each FAISS index file name.
BOOKS = {
    "Pakistan Penal Code": r"D:\FYP WEBSITE\summerization-app\RAG books\pakistan penal code_removed.pdf",
    "Code of Criminal Procedure": r"D:\FYP WEBSITE\summerization-app\RAG books\code of criminal procedure_removed.pdf",
    "Code of Civil Procedure": r"D:\FYP WEBSITE\summerization-app\RAG books\code of civil procedure_removed_removed.pdf",
    "Constitution of Pakistan": r"D:\FYP WEBSITE\summerization-app\RAG books\constitution of pakistan_removed.pdf",
    "Rules of Business": r"D:\FYP WEBSITE\summerization-app\RAG books\rules of business_removed.pdf",
}
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    This is a best-effort reader: a missing file or any extraction failure
    yields an empty string instead of raising, so a batch run can continue.
    Both situations are logged so the failure is not silent.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The page texts joined by ``"\\n"``; pages for which no text is
        extracted are skipped. ``""`` if the file does not exist or cannot
        be read.
    """
    log = logging.getLogger(__name__)
    if not os.path.exists(pdf_path):
        log.warning("PDF not found: %s", pdf_path)
        return ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract each page exactly once; the result is reused for both
            # the emptiness filter and the join below.
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception:
        # Deliberately broad (this is the best-effort boundary), but unlike a
        # bare ``except:`` it lets KeyboardInterrupt/SystemExit propagate.
        log.exception("Failed to extract text from %s", pdf_path)
        return ""
    return "\n".join(text for text in page_texts if text)
def clean_text(text):
    """Remove page-number artifacts from extracted text.

    Three substitutions are applied in sequence, each deleting its matches:
    ``Page X of Y`` markers, bare ``Page X`` markers, and ``X / Y`` counters.
    Leading and trailing whitespace is then stripped.

    Args:
        text: Raw text to clean.

    Returns:
        The text with the matched markers removed and the ends trimmed.
        Whitespace that surrounded a removed marker is left in place.
    """
    # Order matters: "Page X of Y" must be removed before the bare "Page X"
    # rule, which would otherwise leave a dangling " of Y" behind.
    # NOTE(review): the last rule removes any "digits / digits" token, which
    # would also hit fractions or dates in the body text - confirm intended.
    patterns = (
        r'Page\s*\d+\s*of\s*\d+',
        r'\bPage\s*\d+\b',
        r'\b\d+\s*/\s*\d+\b',
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text.strip()
def split_sections_by_number(text):
    """Split text into numbered sections.

    A section header is a number, optionally followed by one capital letter
    and/or a parenthesised number, then a dot (``12.``, ``3A.``, ``302(1).``).
    A section's content runs until the next header that starts a new line,
    or until the end of the text for the last section.

    Args:
        text: Text to split.

    Returns:
        A list of ``{"section_id": ..., "content": ...}`` dicts in document
        order, both values whitespace-stripped. Empty if no header is found.
    """
    # The ``|\Z`` alternative lets the final section terminate at the end of
    # the text. Without it the last section has no following header to
    # satisfy the lookahead and is silently dropped.
    pattern = (
        r"(\d+[A-Z]?(?:\(\d+\))?)\.\s*(.*?)"
        r"(?=\n\d+[A-Z]?(?:\(\d+\))?\.|\Z)"
    )
    return [
        {"section_id": match.group(1).strip(), "content": match.group(2).strip()}
        for match in re.finditer(pattern, text, re.DOTALL)
    ]
def create_faiss_index(structured_data, model):
    """Embed section contents and build an exact L2 FAISS index over them.

    Args:
        structured_data: Iterable of dicts, each with a ``'content'`` entry
            holding the text to embed.
        model: Object with an ``encode(list_of_str, show_progress_bar=...)``
            method returning a 2-D array of embeddings, one row per input.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per section, in the
        same order as ``structured_data``.

    Raises:
        ValueError: If ``structured_data`` is empty. There is then nothing to
            index and no embedding from which to read the dimensionality.
    """
    corpus = [section['content'] for section in structured_data]
    if not corpus:
        # Fail with a clear message instead of an IndexError on ``shape[1]``.
        raise ValueError("structured_data is empty; nothing to index")

    embeddings = model.encode(corpus, show_progress_bar=True)
    # FAISS expects a C-contiguous float32 matrix; this is a no-copy pass
    # when the encoder already returns one.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index
if __name__ == "__main__":
    # Build one FAISS index per book in BOOKS, then persist the section data
    # (a single pickle) and the indices (one file per book) under ./data.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    all_data = {}
    all_indices = {}
    for name, path in BOOKS.items():
        print(f"Processing {name}...")
        text = extract_text_from_pdf(path)
        clean = clean_text(text)
        sections = split_sections_by_number(clean)
        if not sections:
            # A missing/unreadable PDF or a text with no numbered sections
            # yields nothing to embed; skip it instead of aborting the run.
            print(f"No sections found for {name}; skipping.")
            continue
        all_data[name] = sections
        all_indices[name] = create_faiss_index(sections, model)

    # The output directory may not exist on a fresh checkout.
    os.makedirs("data", exist_ok=True)
    with open("data/legal_data.pkl", "wb") as f:
        pickle.dump(all_data, f)
    for name, index in all_indices.items():
        faiss.write_index(index, f"data/{name.replace(' ', '_')}_faiss.index")
    print("All data processed and saved.")