Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

App Files Files Community

DrishtiSharma committed on Feb 14

Commit

22c44a9

verified ·

1 Parent(s): 456dd99

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -86

app.py CHANGED Viewed

@@ -74,94 +74,53 @@ def extract_title_manually(text):
     return "Unknown"
 # ----------------- Metadata Extraction -----------------
-def extract_metadata_llm(pdf_path):
-    """Extracts metadata using LLM with improved title detection and JSON handling."""
     with pdfplumber.open(pdf_path) as pdf:
-        first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
-    # Apply text cleaning
-    cleaned_text = clean_extracted_text(first_page_text)
-    # Attempt manual title extraction before LLM
-    pre_extracted_title = extract_title_manually(cleaned_text)
-    # Streamlit Debugging: Show extracted text
-    st.subheader("📄 Extracted First Page Text (Cleaned)")
-    st.text_area("Cleaned Text:", cleaned_text, height=200)
-    # Define metadata prompt
-    metadata_prompt = PromptTemplate(
-        input_variables=["text", "pre_title"],
-        template="""
-        Given the first page of a research paper, extract metadata **strictly in JSON format**.
-        - The title is typically in the first few lines and is often in a larger font or bold.
-        - If a phrase like "Short Paper:" appears, the actual title follows.
-        - If no clear title is found, use the pre-extracted title: "{pre_title}".
-        - If a field is missing, return `"Unknown"`.
-        - Ensure the JSON format is **valid**.
-        Example output:
-        {{
-            "Title": "Example Paper Title",
-            "Author": "John Doe, Jane Smith",
-            "Emails": "john.doe@example.com, jane.smith@example.com",
-            "Affiliations": "School of AI, University of Example"
-        }}
-        Now, extract metadata from this document:
-        {text}
-        """
-    )
-    # Run LLM Metadata Extraction
-    metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
-    # Debugging: Log the LLM input
-    st.subheader("🔍 LLM Input for Metadata Extraction")
-    st.json({"text": cleaned_text, "pre_title": pre_extracted_title})
-    try:
-        metadata_response = metadata_chain.invoke({"text": cleaned_text, "pre_title": pre_extracted_title})
-        # Debugging: Log raw LLM response
-        st.subheader("🔍 Raw LLM Response")
-        st.json(metadata_response)
-        # Handle JSON extraction from LLM response
-        try:
-            metadata_dict = json.loads(metadata_response["metadata"])
-        except json.JSONDecodeError:
-            try:
-                # Attempt to clean up JSON if needed
-                metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
-            except json.JSONDecodeError:
-                metadata_dict = {
-                    "Title": pre_extracted_title,  # Use pre-extracted title as fallback
-                    "Author": "Unknown",
-                    "Emails": "No emails found",
-                    "Affiliations": "No affiliations found"
-                }
-    except Exception as e:
-        st.error(f"❌ LLM Metadata Extraction Failed: {e}")
-        metadata_dict = {
-            "Title": pre_extracted_title,  # Use pre-extracted title
-            "Author": "Unknown",
-            "Emails": "No emails found",
-            "Affiliations": "No affiliations found"
         }
-    # Ensure all required fields exist
-    required_fields = ["Title", "Author", "Emails", "Affiliations"]
-    for field in required_fields:
-        metadata_dict.setdefault(field, "Unknown")
-    # Streamlit Debugging: Display Final Extracted Metadata
-    st.subheader("✅ Extracted Metadata")
-    st.json(metadata_dict)
-    return metadata_dict
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
@@ -204,7 +163,7 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
         st.json(docs[0].metadata)
         # Extract metadata
-        metadata = extract_metadata_llm(st.session_state.pdf_path)
         # Display extracted-metadata
         if isinstance(metadata, dict):
@@ -214,7 +173,7 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
             st.write(f"**Emails:** {metadata.get('Emails', 'No emails found')}")
             st.write(f"**Affiliations:** {metadata.get('Affiliations', 'No affiliations found')}")
         else:
-            st.error("Metadata extraction failed. Check the LLM response format.")
         # Embedding Model
         model_name = "nomic-ai/modernbert-embed-base"

     return "Unknown"
 # ----------------- Metadata Extraction -----------------
+# ----------------- Metadata Extraction -----------------
+def extract_metadata(pdf_path):
+    """Extract title, authors, emails and affiliations from a PDF's first page.
+
+    Only regex and line-position heuristics are used; no LLM is called.
+
+    Args:
+        pdf_path: Path of the PDF file to open with ``pdfplumber``.
+
+    Returns:
+        A dict with the keys "Title", "Author", "Emails" and "Affiliations".
+        A field that cannot be determined keeps its placeholder value
+        ("Unknown", "No emails found" or "No affiliations found").
+    """
+    metadata = {
+        "Title": "Unknown",
+        "Author": "Unknown",
+        "Emails": "No emails found",
+        "Affiliations": "No affiliations found",
+    }
     with pdfplumber.open(pdf_path) as pdf:
+        # Only the first page is inspected, so the file is closed right after.
+        first_page_text = pdf.pages[0].extract_text() if pdf.pages else None
+    if not first_page_text:
+        # No pages or no extractable text: return the placeholders instead of
+        # running the heuristics on a dummy string.
+        return metadata
+    cleaned_text = clean_extracted_text(first_page_text)
+    # Split once and reuse; assumes clean_extracted_text keeps line breaks
+    # (TODO confirm against its implementation).
+    lines = cleaned_text.split("\n")
+    # Extract Title
+    metadata["Title"] = extract_title_manually(cleaned_text)
+    # Extract Authors: first comma-separated run of name-like tokens.
+    author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+")
+    for line in lines:
+        if "@" in line:
+            # E-mail lines would match as fragments such as "com, b".
+            continue
+        match = author_pattern.search(line)
+        if match:
+            metadata["Author"] = match.group(0).strip()
+            break
+    # Extract Emails
+    email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
+    found_emails = email_pattern.findall(cleaned_text)
+    if found_emails:
+        metadata["Emails"] = ", ".join(found_emails)
+    # Extract Affiliations: first non-blank, non-e-mail line that follows the
+    # first line containing an e-mail address.
+    for index, line in enumerate(lines):
+        if "@" in line:
+            candidates = (text.strip() for text in lines[index + 1:])
+            metadata["Affiliations"] = next(
+                (text for text in candidates if text and "@" not in text),
+                "No affiliations found",
+            )
+            break
+    return metadata
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
         st.json(docs[0].metadata)
         # Extract metadata
+        metadata = extract_metadata(st.session_state.pdf_path)
         # Display extracted-metadata
         if isinstance(metadata, dict):
             st.write(f"**Emails:** {metadata.get('Emails', 'No emails found')}")
             st.write(f"**Affiliations:** {metadata.get('Affiliations', 'No affiliations found')}")
         else:
+            st.error("Metadata extraction failed.")
         # Embedding Model
         model_name = "nomic-ai/modernbert-embed-base"