import os

import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
from groq import Groq


# Hugging Face model id used to embed both the document chunks and the queries.
LEGAL_BERT_MODEL = "nlpaueb/legal-bert-base-uncased"

# (pdf_path, label) pairs. The paths are relative, and the label is prefixed
# to every chunk taken from that PDF before it is embedded.
DOCS = [
    ("bns_full.pdf", "Bharatiya Nyaya Sanhita 2023"),
    ("bns_ipc_mapping.pdf", "BNS-IPC Comparative Mapping"),
]

MAX_CHUNK_SIZE = 1000  # characters per chunk
OVERLAP = 200  # characters shared by two consecutive chunks
TOP_K = 5  # number of chunks retrieved for each query
LLAMA_MODEL = 'llama-3.3-70b-versatile'  # model name sent to the Groq chat API

# The key is read from the environment so it never has to live in the source.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
groq_client = Groq(api_key=GROQ_API_KEY)


class LegalBERTEmbedder:
    """Embed text with the first-token ([CLS]) output of a BERT-style model.

    The tokenizer and model are loaded once in the constructor, moved to the
    GPU when CUDA is available, and switched to eval mode.
    """

    def __init__(self, model_name=LEGAL_BERT_MODEL):
        """Load the tokenizer and model identified by *model_name*."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    def embed(self, texts, batch_size=16):
        """Return one embedding row per input text.

        Args:
            texts: A sequence of strings. A single string is treated as a
                one-element sequence rather than being iterated per character.
            batch_size: Number of texts sent through the model per forward
                pass. Must be at least 1.

        Returns:
            A C-contiguous float32 array of shape (len(texts), hidden_size).
            An empty input yields an array with zero rows instead of the
            ValueError that np.vstack raises on an empty list.

        Raises:
            ValueError: If *batch_size* is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        texts = [texts] if isinstance(texts, str) else list(texts)
        if not texts:
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        batch_outputs = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                # padding=True lets texts of different token lengths share a
                # batch; the attention mask keeps the pad tokens from being
                # attended to. Inputs beyond 512 tokens are truncated.
                inputs = self.tokenizer(
                    batch,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512,
                ).to(self.device)
                outputs = self.model(**inputs)
                # Position 0 of the last hidden state is the [CLS] token.
                cls_embed = outputs.last_hidden_state[:, 0, :]
                batch_outputs.append(cls_embed.cpu().numpy())

        return np.ascontiguousarray(np.vstack(batch_outputs), dtype=np.float32)


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Pages for which ``extract_text()`` returns nothing are skipped; the text
    of every other page is followed by a newline.
    """
    reader = PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in reader.pages)
    # join() builds the result in one pass instead of repeated `+=`.
    return "".join(f"{text}\n" for text in page_texts if text)


def chunk_text(text, max_chunk_size=MAX_CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into overlapping, fixed-size character windows.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters each chunk shares with the previous one.

    Returns:
        A list of chunks in document order; an empty *text* yields ``[]``.

    Raises:
        ValueError: If *max_chunk_size* is not positive, or *overlap* is
            negative or not smaller than *max_chunk_size*. A non-positive
            step would otherwise never advance and loop forever.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    if not 0 <= overlap < max_chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chunk_size")

    step = max_chunk_size - overlap
    length = len(text)
    chunks = []
    start = 0
    while start < length:
        end = min(start + max_chunk_size, length)
        chunks.append(text[start:end])
        if end == length:
            # The window already reached the end of the text; another window
            # would only repeat a suffix of the chunk just emitted.
            break
        start += step
    return chunks


def build_faiss_index(embeddings):
    """Build an inner-product FAISS index over L2-normalised embeddings.

    Because the vectors are normalised before being added, inner-product
    scores returned by the index equal cosine similarity.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim).

    Returns:
        A ``faiss.IndexFlatIP`` holding the normalised vectors.

    Raises:
        ValueError: If *embeddings* is not 2-D.
    """
    # normalize_L2 works in place; operate on a float32, C-contiguous copy so
    # the caller's array is left untouched.
    vectors = np.array(embeddings, dtype=np.float32, order="C")
    if vectors.ndim != 2:
        raise ValueError("embeddings must be a 2-D array of shape (n_vectors, dim)")

    index = faiss.IndexFlatIP(vectors.shape[1])
    faiss.normalize_L2(vectors)
    index.add(vectors)
    return index


def query_faiss(index, query_embed, k=TOP_K):
    """Return the *k* nearest stored vectors for each query vector.

    Args:
        index: The FAISS index to search.
        query_embed: Query vector(s); a 1-D vector is treated as one query.
        k: Number of neighbours requested per query.

    Returns:
        ``(distances, indices)`` exactly as returned by ``index.search``.
        NOTE(review): FAISS pads ``indices`` with -1 when the index holds
        fewer than *k* vectors, so callers must not use the values as list
        indices without checking them — confirm against the FAISS docs.
    """
    # Normalise a float32 copy so the caller's query array is not modified.
    queries = np.array(query_embed, dtype=np.float32, order="C", ndmin=2)
    faiss.normalize_L2(queries)
    distances, indices = index.search(queries, k)
    return distances, indices


# Build the retrieval corpus once at import time: read every configured PDF,
# chunk it, embed the chunks, and index the embeddings.
print("Loading and processing multiple legal documents...")

embedder = LegalBERTEmbedder()
all_chunks = []  # label-prefixed chunk texts; row i of the index maps to all_chunks[i]
metadata = []  # (label, raw chunk) pairs, in the same order as all_chunks

print("Extracting and chunking text from all PDFs...")
for pdf_path, act_label in DOCS:
    # Best effort per document: an unreadable PDF is reported and skipped so
    # the remaining documents can still be indexed.
    try:
        raw_text = extract_text_from_pdf(pdf_path)
        print(f"Extracted {len(raw_text)} characters from {act_label}")

        chunks = chunk_text(raw_text)
        print(f"Created {len(chunks)} chunks from {act_label}")

        # The label is embedded together with the text of each chunk.
        all_chunks.extend(f"[{act_label}] {chunk}" for chunk in chunks)
        metadata.extend((act_label, chunk) for chunk in chunks)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")

print(f"Total chunks created: {len(all_chunks)}")

if not all_chunks:
    # Without this check the failure would surface later as an opaque error
    # from stacking an empty list of embeddings.
    raise RuntimeError(
        "No text could be extracted from any of the configured PDFs: "
        + ", ".join(path for path, _ in DOCS)
    )

print("Embedding all text chunks with Legal-BERT...")
chunk_embeddings = embedder.embed(all_chunks)
print("Embeddings created successfully")

print("Building FAISS index...")
faiss_index = build_faiss_index(chunk_embeddings)
print("FAISS index built successfully")


# System message sent with every chat completion; it fixes the four-section
# layout (context, BNS sections, IPC sections, summary) of each answer.
SYSTEM_PROMPT = """You are a senior Indian legal expert specializing in the Bharatiya Nyaya Sanhita 2023 (BNS) and its correspondence with the Indian Penal Code 1860 (IPC).
When answering any question, you MUST use this exact format:
CONTEXT/SITUATION:
[Provide detailed explanation of the legal context and situation]
BNS SECTIONS:
[List the specific BNS sections and subsections that apply, with proper citations]
IPC SECTIONS (if applicable):
[List the corresponding IPC sections based on mappings, with proper citations]
SUMMARY:
[Provide a clear one-sentence summary highlighting the applicable BNS and IPC sections in **bold** format]
Always cite specific sections when available and ensure your response covers relevant BNS provisions and mapped IPC equivalents."""


def build_user_prompt(context, question):
    """Return the user message that pairs retrieved extracts with the question.

    Args:
        context: Retrieved document extracts, already joined into one string.
        question: The user's question, inserted verbatim.

    Returns:
        A four-part prompt: an introduction, *context*, the question, and a
        reminder to follow the format given in the system instructions.
    """
    # Assembled from explicit lines so the text sent to the model does not
    # depend on how this function happens to be indented.
    return "\n".join(
        [
            "Based on the following relevant extracts from BNS and IPC legislation:",
            f"{context}",
            f"Question: {question}",
            "Please provide a comprehensive legal answer following the exact "
            "format specified in the system instructions.",
        ]
    )


def answer_query(user_query):
    """Answer a legal question using retrieved extracts and the Groq chat API.

    The query is embedded, the closest chunks are looked up in the FAISS
    index, and those chunks are passed to the model together with the
    question.

    Args:
        user_query: The question typed by the user.

    Returns:
        The model's answer with surrounding whitespace stripped, a short
        prompt when the query is blank, or an error message if anything
        fails along the way. This function never raises, so the UI always
        has text to display.
    """
    # A blank query cannot retrieve anything useful; skip the model call.
    if not user_query or not user_query.strip():
        return "Please enter a legal query."

    try:
        query_embed = embedder.embed([user_query])

        _, indices = query_faiss(faiss_index, query_embed, k=TOP_K)
        # Only in-range indices are used: a negative index (FAISS reports a
        # missing neighbour as -1) would otherwise silently pick a chunk from
        # the end of all_chunks.
        retrieved_chunks = [
            all_chunks[i] for i in indices[0] if 0 <= i < len(all_chunks)
        ]

        context = "\n\n".join(retrieved_chunks)

        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": build_user_prompt(context, user_query),
                },
            ],
            model=LLAMA_MODEL,
            temperature=0.1,
            max_tokens=1024,
        )

        answer = chat_completion.choices[0].message.content
        if not answer:
            return "The model returned an empty response. Please try rephrasing your question."
        return answer.strip()

    except Exception as e:
        # UI boundary: report the failure as text instead of propagating it.
        return f"Error processing query: {e}\n\nPlease check your Groq API key and internet connection."


# Gradio front end: a query box with submit/clear buttons, a Markdown answer
# pane, and a list of example questions.
# NOTE(review): the original indentation was lost, so the nesting of the
# button row inside the column is a reconstruction — confirm the layout.
# NOTE(review): the glyphs in front of the title and labels look like
# mis-decoded emoji; they are kept exactly as found — restore from the
# original file.
with gr.Blocks(title="IPC & BNS Legal Assistant") as demo:
    # Text shown in the answer pane before a query is run and after "Clear".
    placeholder_md = "*Submit your question to get a structured legal analysis...*"

    # Written as explicit lines so the rendered Markdown does not depend on
    # how this block is indented.
    gr.Markdown(
        "\n"
        "# ποΈ IPC & BNS Legal Assistant\n"
        "\n"
        "**Comprehensive Legal Q&A System covering:**\n"
        "- Bharatiya Nyaya Sanhita 2023 (BNS)\n"
        "- Corresponding Indian Penal Code 1860 (IPC) sections\n"
        "\n"
        "Ask any question about Indian criminal legislation and get structured legal answers with proper citations.\n"
    )

    with gr.Row():
        with gr.Column():
            query_input = gr.Textbox(
                label="πΌ Enter your legal query",
                placeholder="e.g., What are the penalties for murder under BNS? What is the IPC equivalent for theft?",
                lines=4,
                max_lines=8,
            )

            with gr.Row():
                submit_btn = gr.Button("π Get Legal Answer", variant="primary", scale=2)
                clear_btn = gr.Button("ποΈ Clear", scale=1)

    with gr.Row():
        answer_output = gr.Markdown(
            label="π Legal Analysis",
            value="*Submit your question to get a structured legal analysis...*",
        )

    # The button and the textbox's submit event both run the same handler.
    submit_btn.click(answer_query, inputs=query_input, outputs=answer_output)
    query_input.submit(answer_query, inputs=query_input, outputs=answer_output)
    # Clearing empties the query box and restores the placeholder text.
    clear_btn.click(
        lambda: ("", placeholder_md),
        outputs=[query_input, answer_output],
    )

    # cache_examples=False: examples are not pre-computed when the app starts.
    gr.Examples(
        examples=[
            ["What are the penalties for murder under BNS?"],
            ["What is the IPC equivalent for BNS Section 103?"],
            ["What constitutes theft according to BNS legislation?"],
            ["How are punishments defined for assault in BNS?"],
            ["What are the legal provisions for robbery under IPC and BNS?"],
        ],
        inputs=query_input,
        outputs=answer_output,
        fn=answer_query,
        cache_examples=False,
    )


# Start the web UI only when the file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch(
        share=False,
        debug=True,
        show_error=True,
    )