Spaces:

tdnathmlenthusiast
/

Legal_OCR

Sleeping

App Files Files Community

tdnathmlenthusiast committed on May 10

Commit

8c6bdbd

verified ·

1 Parent(s): 2ddd8a4

solved poppler-utils

Browse files

Files changed (1) hide show

app.py +123 -121

app.py CHANGED Viewed

@@ -1,121 +1,123 @@
-import os
-import fitz  # PyMuPDF
-from paddleocr import PPStructure
-from pdf2image import convert_from_path
-import numpy as np
-import json
-import re
-import spacy
-from spacy.matcher import Matcher
-from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
-import gradio as gr
-from tqdm.auto import tqdm
-# --- Initialization ---
-structure_engine = PPStructure(table=True, ocr=True, layout=True)
-nlp = spacy.load("en_core_web_sm")
-matcher = Matcher(nlp.vocab)
-# Regex & matcher setup
-date_pattern = r"\d{2}-[A-Za-z]{3}-\d{2}|\d{2}\.\d{2}\.\d{2}"
-party_pattern = r"M/s [A-Za-z\s&-]+(?:Consortium)?"
-pattern = [{"LOWER": "claimant"}, {"IS_PUNCT": True, "OP": "?"}, {"ENT_TYPE": "ORG"}]
-matcher.add("CLAIMANT", [pattern])
-# Load Legal-BERT pipelines
-ner_model = "nlpaueb/legal-bert-base-uncased"
-token_model = AutoModelForTokenClassification.from_pretrained(ner_model)
-tokenizer = AutoTokenizer.from_pretrained(ner_model)
-ner_pipeline = pipeline("ner", model=token_model, tokenizer=tokenizer, aggregation_strategy="simple")
-clf_pipeline = pipeline("text-classification", model=ner_model)
-# Helper functions
-def extract_text_from_pdf(pdf_path):
-    doc = fitz.open(pdf_path)
-    pages = []
-    for i in range(len(doc)):
-        page = doc[i]
-        pages.append({"page": i + 1, "text": page.get_text("text") or ""})
-    doc.close()
-    return pages
-def extract_content_from_images(pdf_path):
-    images = convert_from_path(pdf_path)
-    results = []
-    for i, img in enumerate(images, start=1):
-        img_np = np.array(img)
-        res = structure_engine(img_np)
-        text_lines, tables = [], []
-        for block in res:
-            if block['type'] == 'text':
-                text_lines += [line['text'] for line in block['res'] if 'text' in line]
-            elif block['type'] == 'table' and 'html' in block['res']:
-                tables.append(block['res']['html'])
-        results.append({"page": i, "ocr_text": " ".join(text_lines), "tables_html": tables})
-    return results
-def extract_metadata(text):
-    meta = {"dates": [], "parties": [], "claimants": [], "tribunals": [], "relationships": [], "clauses": []}
-    # Regex
-    meta['dates'] = re.findall(date_pattern, text)
-    meta['parties'] = re.findall(party_pattern, text)
-    # SpaCy
-    doc = nlp(text)
-    for ent in doc.ents:
-        if ent.label_ == 'ORG' and ent.text not in meta['parties']:
-            meta['parties'].append(ent.text)
-        if ent.label_ == 'GPE':
-            meta['tribunals'].append(ent.text)
-    for match_id, start, end in matcher(doc):
-        meta['claimants'].append(doc[start:end].text)
-    # Legal-BERT NER
-    for ent in ner_pipeline(text):
-        grp = ent['entity_group']
-        if grp in ('ORG','PARTY') and ent['word'] not in meta['parties']:
-            meta['parties'].append(ent['word'])
-        if grp == 'GPE' and ent['word'] not in meta['tribunals']:
-            meta['tribunals'].append(ent['word'])
-    # Clause classification
-    for sent in text.split('. '):
-        if len(sent) < 10: continue
-        try:
-            res = clf_pipeline(sent)[0]
-            if res['score'] > 0.7:
-                meta['clauses'].append({'type': res['label'], 'text': sent})
-        except:
-            pass
-    return meta
-def process_pdf(file_obj):
-    # Save uploaded file
-    pdf_path = file_obj.name
-    # 1. Text
-    text_pages = extract_text_from_pdf(pdf_path)
-    # 2. OCR & tables
-    img_content = extract_content_from_images(pdf_path)
-    # 3. Metadata
-    metadata = []
-    for page in text_pages:
-        metadata.append({"page": page['page'], "metadata": extract_metadata(page['text'])})
-    # Combine
-    output = {
-        "text_pages": text_pages,
-        "image_content": img_content,
-        "metadata": metadata
-    }
-    return output
-# Gradio Interface
-iface = gr.Interface(
-    fn=process_pdf,
-    inputs=gr.File(label="Upload PDF", file_types=['.pdf']),
-    outputs=gr.JSON(label="Extraction Result"),
-    title="PDF OCR & Metadata Extractor",
-    description="Upload a PDF, wait for processing, and view structured JSON output including text, OCR, tables, and metadata."
-)
-if __name__ == '__main__':
-    iface.launch()

+import os
+import fitz  # PyMuPDF
+from paddleocr import PPStructure
+from pdf2image import convert_from_path
+import numpy as np
+import json
+import re
+import spacy
+from spacy.matcher import Matcher
+from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
+import gradio as gr
+from tqdm.auto import tqdm
+import os
+# Ensure Poppler is available
+os.system("apt-get update -y && apt-get install -y poppler-utils")
+# --- Initialization ---
+structure_engine = PPStructure(table=True, ocr=True, layout=True)
+nlp = spacy.load("en_core_web_sm")
+matcher = Matcher(nlp.vocab)
+# Regex & matcher setup
+date_pattern = r"\d{2}-[A-Za-z]{3}-\d{2}|\d{2}\.\d{2}\.\d{2}"
+party_pattern = r"M/s [A-Za-z\s&-]+(?:Consortium)?"
+pattern = [{"LOWER": "claimant"}, {"IS_PUNCT": True, "OP": "?"}, {"ENT_TYPE": "ORG"}]
+matcher.add("CLAIMANT", [pattern])
+# Load Legal-BERT pipelines
+ner_model = "nlpaueb/legal-bert-base-uncased"
+token_model = AutoModelForTokenClassification.from_pretrained(ner_model)
+tokenizer = AutoTokenizer.from_pretrained(ner_model)
+ner_pipeline = pipeline("ner", model=token_model, tokenizer=tokenizer, aggregation_strategy="simple")
+clf_pipeline = pipeline("text-classification", model=ner_model)
+# Helper functions
+def extract_text_from_pdf(pdf_path):
+    doc = fitz.open(pdf_path)
+    pages = []
+    for i in range(len(doc)):
+        page = doc[i]
+        pages.append({"page": i + 1, "text": page.get_text("text") or ""})
+    doc.close()
+    return pages
+def extract_content_from_images(pdf_path):
+    images = convert_from_path(pdf_path)
+    results = []
+    for i, img in enumerate(images, start=1):
+        img_np = np.array(img)
+        res = structure_engine(img_np)
+        text_lines, tables = [], []
+        for block in res:
+            if block['type'] == 'text':
+                text_lines += [line['text'] for line in block['res'] if 'text' in line]
+            elif block['type'] == 'table' and 'html' in block['res']:
+                tables.append(block['res']['html'])
+        results.append({"page": i, "ocr_text": " ".join(text_lines), "tables_html": tables})
+    return results
+def extract_metadata(text):
+    meta = {"dates": [], "parties": [], "claimants": [], "tribunals": [], "relationships": [], "clauses": []}
+    # Regex
+    meta['dates'] = re.findall(date_pattern, text)
+    meta['parties'] = re.findall(party_pattern, text)
+    # SpaCy
+    doc = nlp(text)
+    for ent in doc.ents:
+        if ent.label_ == 'ORG' and ent.text not in meta['parties']:
+            meta['parties'].append(ent.text)
+        if ent.label_ == 'GPE':
+            meta['tribunals'].append(ent.text)
+    for match_id, start, end in matcher(doc):
+        meta['claimants'].append(doc[start:end].text)
+    # Legal-BERT NER
+    for ent in ner_pipeline(text):
+        grp = ent['entity_group']
+        if grp in ('ORG','PARTY') and ent['word'] not in meta['parties']:
+            meta['parties'].append(ent['word'])
+        if grp == 'GPE' and ent['word'] not in meta['tribunals']:
+            meta['tribunals'].append(ent['word'])
+    # Clause classification
+    for sent in text.split('. '):
+        if len(sent) < 10: continue
+        try:
+            res = clf_pipeline(sent)[0]
+            if res['score'] > 0.7:
+                meta['clauses'].append({'type': res['label'], 'text': sent})
+        except:
+            pass
+    return meta
+def process_pdf(file_obj):
+    # Save uploaded file
+    pdf_path = file_obj.name
+    # 1. Text
+    text_pages = extract_text_from_pdf(pdf_path)
+    # 2. OCR & tables
+    img_content = extract_content_from_images(pdf_path)
+    # 3. Metadata
+    metadata = []
+    for page in text_pages:
+        metadata.append({"page": page['page'], "metadata": extract_metadata(page['text'])})
+    # Combine
+    output = {
+        "text_pages": text_pages,
+        "image_content": img_content,
+        "metadata": metadata
+    }
+    return output
+# Gradio Interface
+iface = gr.Interface(
+    fn=process_pdf,
+    inputs=gr.File(label="Upload PDF", file_types=['.pdf']),
+    outputs=gr.JSON(label="Extraction Result"),
+    title="PDF OCR & Metadata Extractor",
+    description="Upload a PDF, wait for processing, and view structured JSON output including text, OCR, tables, and metadata."
+)
+if __name__ == '__main__':
+    iface.launch()