nvtitan committed on
Commit e884643 · verified · 1 Parent(s): 5cc12a8

Upload 24 files

.dockerignore ADDED
@@ -0,0 +1,92 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+
11
+ # Virtual Environments
12
+ venv/
13
+ env/
14
+ .venv/
15
+ ENV/
16
+
17
+ # Environment Variables
18
+ .env
19
+ .env.*
20
+ !.env.example
21
+
22
+ # Data (exclude from image - will be created at runtime)
23
+ data/
24
+ uploads/
25
+ *.pdf
26
+ *.pkl
27
+ *.faiss
28
+ *.index
29
+
30
+ # Logs (exclude from image)
31
+ logs/
32
+ *.log
33
+
34
+ # Cache
35
+ cache/
36
+ .cache/
37
+ __pycache__/
38
+
39
+ # IDEs
40
+ .vscode/
41
+ .idea/
42
+ *.swp
43
+ *.swo
44
+ *.sublime-*
45
+
46
+ # OS
47
+ .DS_Store
48
+ Thumbs.db
49
+ desktop.ini
50
+
51
+ # Testing
52
+ .coverage
53
+ htmlcov/
54
+ .pytest_cache/
55
+ .tox/
56
+ *.cover
57
+ tests/
58
+
59
+ # Database files
60
+ *.db
61
+ *.sqlite
62
+ *.sqlite3
63
+
64
+ # Git
65
+ .git/
66
+ .gitignore
67
+ .gitattributes
68
+
69
+ # Documentation
70
+ docs/
71
+ *.md
72
+ !README.md
73
+
74
+ # Deployment configs (not needed in container)
75
+ railway.toml
76
+ nixpacks.toml
77
+ Procfile
78
+ modal_app.py
79
+ fly.toml
80
+ vercel.json
81
+ heroku.yml
82
+ docker-compose*.yml
83
+
84
+ # CI/CD
85
+ .github/
86
+ .gitlab-ci.yml
87
+ .travis.yml
88
+
89
+ # Misc
90
+ *.bak
91
+ *.tmp
92
+ *.temp
.gitignore ADDED
@@ -0,0 +1,72 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ ENV/
26
+ env/
27
+ .venv
28
+
29
+ # Environment Variables
30
+ .env
31
+ .env.local
32
+
33
+ # Data & Uploads
34
+ data/
35
+ uploads/
36
+ *.pdf
37
+ *.faiss
38
+ *.index
39
+
40
+ # Logs
41
+ logs/
42
+ *.log
43
+
44
+ # Cache
45
+ cache/
46
+ .cache/
47
+ *.cache
48
+
49
+ # IDEs
50
+ .vscode/
51
+ .idea/
52
+ *.swp
53
+ *.swo
54
+ *~
55
+
56
+ # OS
57
+ .DS_Store
58
+ Thumbs.db
59
+
60
+ # Testing
61
+ .coverage
62
+ htmlcov/
63
+ .pytest_cache/
64
+ .tox/
65
+
66
+ # Database
67
+ *.db
68
+ *.sqlite
69
+ *.sqlite3
70
+
71
+ # Neo4j
72
+ neo4j/
.railwayignore ADDED
@@ -0,0 +1,26 @@
1
+ # Ignore local development files
2
+ venv/
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ .Python
8
+ *.so
9
+ *.egg
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+ .env.local
14
+ .DS_Store
15
+
16
+ # Ignore local data (will be created on Railway)
17
+ data/
18
+ uploads/
19
+ logs/
20
+ cache/
21
+
22
+ # Ignore development artifacts
23
+ .pytest_cache/
24
+ .coverage
25
+ htmlcov/
26
+ *.log
Dockerfile ADDED
@@ -0,0 +1,51 @@
1
+ # GraphLLM - Hugging Face Spaces Deployment
2
+ # Optimized Docker image for HF Spaces
3
+
4
+ FROM python:3.12-slim
5
+
6
+ # Set environment variables
7
+ ENV PYTHONUNBUFFERED=1 \
8
+ PYTHONDONTWRITEBYTECODE=1 \
9
+ PIP_NO_CACHE_DIR=1 \
10
+ PIP_DISABLE_PIP_VERSION_CHECK=1 \
11
+ DEBIAN_FRONTEND=noninteractive \
12
+ API_PORT=7860 \
13
+ HF_HOME=/app/cache \
14
+ TRANSFORMERS_CACHE=/app/cache \
15
+ SENTENCE_TRANSFORMERS_HOME=/app/cache
16
+
17
+ # Set working directory
18
+ WORKDIR /app
19
+
20
+ # Install system dependencies (minimal set for HF Spaces)
21
+ RUN apt-get update && apt-get install -y --no-install-recommends \
22
+ build-essential \
23
+ curl \
24
+ tesseract-ocr \
25
+ ghostscript \
26
+ && rm -rf /var/lib/apt/lists/* \
27
+ && apt-get clean
28
+
29
+ # Copy requirements first (for better layer caching)
30
+ COPY requirements.txt .
31
+
32
+ # Install Python dependencies
33
+ RUN pip install --no-cache-dir -r requirements.txt
34
+
35
+ # Copy application code
36
+ COPY . .
37
+
38
+ # Create data directories with proper permissions (777 for HF Spaces non-root user)
39
+ RUN mkdir -p data uploads logs cache data/faiss_index && \
40
+ chmod -R 777 data uploads logs cache
41
+
42
+ # Expose Hugging Face Spaces default port
43
+ EXPOSE 7860
44
+
45
+ # Health check
46
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
47
+ CMD curl -f http://localhost:7860/ || exit 1
48
+
49
+ # Run the application
50
+ # HF Spaces expects the app to listen on 0.0.0.0:7860
51
+ CMD ["python3", "main.py"]
Procfile ADDED
@@ -0,0 +1 @@
1
+ web: python main.py
README.md CHANGED
@@ -1,12 +1,94 @@
1
- ---
2
- title: GraphRAG
3
- emoji: 🦀
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: graphRAG for PDFs
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+
2
+
3
+ # 🕸️ GraphLLM - PDF Knowledge Graph + RAG System
4
+
5
+ Transform PDFs into interactive knowledge graphs with AI-powered Q&A.
6
+
7
+ ## 🚀 Features
8
+
9
+ - **📄 PDF Processing:** Extract text, tables, and images from PDFs
10
+ - **🕸️ Knowledge Graph Generation:** Build semantic graphs using Gemini AI
11
+ - **🔍 Vector Search:** FAISS-powered semantic search with sentence transformers
12
+ - **💬 RAG Chat:** Ask questions and get answers with source citations
13
+ - **🎨 Interactive Visualization:** Explore knowledge graphs in your browser
14
+
15
+ ## 🛠️ Technology Stack
16
+
17
+ - **LLM:** Google Gemini (gemini-2.5-flash)
18
+ - **Embeddings:** sentence-transformers/all-MiniLM-L6-v2
19
+ - **Vector Store:** FAISS with HNSW index
20
+ - **Graph:** NetworkX (in-memory)
21
+ - **Backend:** FastAPI + Uvicorn
22
+ - **Frontend:** Vanilla JS with D3.js/Cytoscape
23
+
24
+ ## 📋 Setup
25
+
26
+ ### Required: Gemini API Key
27
+
28
+ This app requires a Google Gemini API key:
29
+
30
+ 1. Get your API key from [Google AI Studio](https://makersuite.google.com/app/apikey)
31
+ 2. Add it as a **Secret** in Hugging Face Spaces settings:
32
+ - Name: `GEMINI_API_KEY`
33
+ - Value: Your API key
34
+
35
+ ### Configuration (Optional)
36
+
37
+ You can set these environment variables in Space Settings:
38
+
39
+ ```bash
40
+ # LLM Settings
41
+ GEMINI_MODEL=gemini-2.5-flash # Gemini model
42
+ LLM_TEMPERATURE=0.0 # Temperature for extraction
43
+
44
+ # Embedding Settings
45
+ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
46
+
47
+ # Environment
48
+ ENVIRONMENT=production
49
+ LOG_LEVEL=INFO
50
+ ```
51
+
52
+ ## 🎯 Usage
53
+
54
+ 1. **Upload PDF:** Click "Upload PDF" and select your document
55
+ 2. **Wait for Processing:** The system will:
56
+ - Extract text chunks
57
+ - Generate embeddings
58
+ - Build knowledge graph with Gemini
59
+ 3. **Explore Graph:** Click nodes to see details and related concepts
60
+ 4. **Ask Questions:** Use the chat interface for Q&A with citations
61
+
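The steps above describe the browser flow. The same workflow can also be scripted against the HTTP endpoints the bundled frontend calls (`/upload`, `/status/{pdf_id}`, `/chat`). The snippet below is a minimal sketch using the `requests` library; the Space URL and PDF path are placeholders, and the payload fields mirror what `frontend/app.js` sends:

```python
import time
import requests

BASE = "https://your-space-url.hf.space"  # placeholder: your Space URL

# 1. Upload a PDF (multipart form field named "file", as in the frontend)
with open("document.pdf", "rb") as f:  # placeholder path
    pdf_id = requests.post(f"{BASE}/upload", files={"file": f}).json()["pdf_id"]

# 2. Poll processing status until the graph has been built
while True:
    status = requests.get(f"{BASE}/status/{pdf_id}").json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(2)

# 3. Ask a question with citations
reply = requests.post(f"{BASE}/chat", json={
    "query": "What is the main contribution of this document?",
    "pdf_id": pdf_id,
    "include_citations": True,
    "max_sources": 5,
}).json()
print(reply["answer"])
for src in reply.get("sources", []):
    print(f'p.{src["page_number"]}: {src["snippet"]}')
```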
62
+ ## 📊 Graph Generation
63
+
64
+ - **Per-Page Extraction:** Max 2 concepts per page (quality over quantity)
65
+ - **Parallel Processing:** All pages processed concurrently via Gemini API
66
+ - **Strict Filtering:** Only technical/domain-specific concepts
67
+ - **Co-occurrence Relationships:** Concepts on same page are linked
68
+
69
+ ## 🎨 Frontend
70
+
71
+ The frontend is a single-page application located in `/frontend/`:
72
+ - `index.html` - Main UI
73
+ - `app.js` - Graph visualization & API calls
74
+ - `styles.css` - Styling
75
+
76
+ Access it at: `http://your-space-url.hf.space/frontend/`
77
+
78
+
79
+ ## 📦 Docker
80
+
81
+ This Space uses Docker for deployment:
82
+ - Base: `python:3.12-slim`
83
+ - Port: 7860 (HF Spaces default)
84
+ - Health check enabled
85
+ - Persistent data directory
86
+
87
+ ## 🤝 Credits
88
+
89
+ - **LLM:** Google Gemini
90
+ - **Embeddings:** Hugging Face sentence-transformers
91
+
92
+
93
  ---
94
 
 
config.py ADDED
@@ -0,0 +1,127 @@
1
+ """
2
+ Configuration management for GraphLLM system
3
+ """
4
+ from pydantic_settings import BaseSettings
5
+ from pydantic import Field, field_validator
6
+ from typing import Optional
7
+ import os
8
+
9
+
10
+ class Settings(BaseSettings):
11
+ """Application settings loaded from environment variables"""
12
+
13
+ # Application
14
+ app_name: str = "GraphLLM"
15
+ app_version: str = "1.0.0"
16
+ environment: str = "development"
17
+ debug: bool = True
18
+
19
+ # API
20
+ api_host: str = "0.0.0.0"
21
+ api_port: int = 8000
22
+ api_workers: int = 4
23
+
24
+ # LLM Settings - Gemini (Primary)
25
+ gemini_api_key: str = Field(default="", env="GEMINI_API_KEY")
26
+ gemini_model: str = "gemini-2.5-flash"
27
+
28
+ # LLM Settings - Mistral (Fallback)
29
+ mistral_api_key: str = Field(default="", env="MISTRAL_API_KEY")
30
+ mistral_model: str = "mistral-7b-instruct-v0.1"
31
+
32
+ # LLM Parameters
33
+ llm_temperature: float = 0.0
34
+ llm_max_tokens: int = 2048
35
+ llm_timeout: int = 120
36
+
37
+ # Embedding Settings
38
+ embedding_model: str = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
39
+ embedding_dimension: int = 384
40
+ embedding_batch_size: int = 32
41
+
42
+ # FAISS Vector DB
43
+ faiss_index_path: str = "./data/faiss_index"
44
+ faiss_metric: str = "cosine"
45
+
46
+ # Neo4j Graph DB
47
+ neo4j_uri: str = "bolt://localhost:7687"
48
+ neo4j_user: str = "neo4j"
49
+ neo4j_password: str = Field(default="", env="NEO4J_PASSWORD")
50
+ neo4j_database: str = "neo4j"
51
+
52
+ # PostgreSQL
53
+ postgres_host: str = "localhost"
54
+ postgres_port: int = 5432
55
+ postgres_db: str = "graphllm"
56
+ postgres_user: str = "postgres"
57
+ postgres_password: str = Field(default="", env="POSTGRES_PASSWORD")
58
+
59
+ # MongoDB (optional)
60
+ mongodb_uri: str = "mongodb://localhost:27017"
61
+ mongodb_database: str = "graphllm"
62
+
63
+ # Chunking
64
+ chunk_size: int = 512
65
+ chunk_overlap: int = 128
66
+ min_chunk_size: int = 100
67
+
68
+ # Triplet Extraction
69
+ triplet_confidence_threshold: float = 0.6
70
+ entity_similarity_threshold: float = 0.85
71
+ max_triples_per_chunk: int = 10
72
+
73
+ # Graph Pruning
74
+ node_importance_threshold: float = 0.3
75
+ edge_confidence_threshold: float = 0.5
76
+ min_node_mentions: int = 2
77
+
78
+ # RAG
79
+ rag_top_k: int = 10
80
+ rag_rerank_top_k: int = 5
81
+ max_context_length: int = 4000
82
+
83
+ # File Upload
84
+ max_file_size_mb: int = 50
85
+ allowed_extensions: str = "pdf"
86
+ upload_dir: str = "./data/uploads"
87
+
88
+ # Storage
89
+ data_dir: str = "./data"
90
+ logs_dir: str = "./logs"
91
+ cache_dir: str = "./cache"
92
+
93
+ # Monitoring
94
+ enable_metrics: bool = True
95
+ metrics_port: int = 9090
96
+ log_level: str = "INFO"
97
+
98
+ @property
99
+ def postgres_url(self) -> str:
100
+ """Build PostgreSQL connection URL"""
101
+ return f"postgresql://{self.postgres_user}:{self.postgres_password}@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
102
+
103
+ @property
104
+ def max_file_size_bytes(self) -> int:
105
+ """Convert MB to bytes"""
106
+ return self.max_file_size_mb * 1024 * 1024
107
+
108
+ class Config:
109
+ env_file = ".env"
110
+ case_sensitive = False
111
+
112
+
113
+ # Global settings instance
114
+ settings = Settings()
115
+
116
+
117
+ def ensure_directories():
118
+ """Ensure all required directories exist"""
119
+ dirs = [
120
+ settings.data_dir,
121
+ settings.upload_dir,
122
+ settings.logs_dir,
123
+ settings.cache_dir,
124
+ settings.faiss_index_path,
125
+ ]
126
+ for directory in dirs:
127
+ os.makedirs(directory, exist_ok=True)
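Because `Settings` is a `pydantic-settings` model with `env_file = ".env"` and `case_sensitive = False`, every field above can be overridden through ordinary environment variables or a local `.env` file. A minimal sketch of how the module is consumed (the override values are illustrative only):

```python
import os

# Illustrative overrides; set before importing config so Settings() picks them up
os.environ["GEMINI_MODEL"] = "gemini-2.5-flash"
os.environ["LLM_TEMPERATURE"] = "0.0"

from config import settings, ensure_directories

ensure_directories()                 # creates ./data, ./data/uploads, ./logs, ./cache, ./data/faiss_index
print(settings.gemini_model)         # "gemini-2.5-flash"
print(settings.max_file_size_bytes)  # 50 MB default -> 52428800
print(settings.postgres_url)         # postgresql://postgres:...@localhost:5432/graphllm
```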
docker-compose.yml ADDED
@@ -0,0 +1,68 @@
1
+ # GraphLLM Docker Compose Configuration
2
+ # Simple standalone deployment with persistent storage
3
+
4
+ version: '3.8'
5
+
6
+ services:
7
+ # Main GraphLLM Application
8
+ graphllm:
9
+ build:
10
+ context: .
11
+ dockerfile: Dockerfile
12
+ container_name: graphllm
13
+ image: graphllm:latest
14
+ ports:
15
+ - "8000:8000"
16
+ volumes:
17
+ # Persistent storage for data, uploads, and logs
18
+ - graphllm-data:/app/data
19
+ - graphllm-uploads:/app/uploads
20
+ - graphllm-logs:/app/logs
21
+ - graphllm-cache:/app/cache
22
+ environment:
23
+ # Gemini API Configuration
24
+ - GEMINI_API_KEY=${GEMINI_API_KEY}
25
+ - GEMINI_MODEL=${GEMINI_MODEL:-gemini-1.5-flash}
26
+
27
+ # Application Settings
28
+ - ENVIRONMENT=${ENVIRONMENT:-production}
29
+ - LOG_LEVEL=${LOG_LEVEL:-INFO}
30
+ - DEBUG=false
31
+
32
+ # LLM Settings
33
+ - LLM_TEMPERATURE=${LLM_TEMPERATURE:-0.7}
34
+ - LLM_MAX_TOKENS=${LLM_MAX_TOKENS:-2048}
35
+
36
+ # Embedding Settings
37
+ - EMBEDDING_MODEL=${EMBEDDING_MODEL:-all-MiniLM-L6-v2}
38
+ - EMBEDDING_BATCH_SIZE=${EMBEDDING_BATCH_SIZE:-128}
39
+
40
+ # API Settings
41
+ - API_HOST=0.0.0.0
42
+ - API_PORT=8000
43
+ - MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
44
+
45
+ restart: unless-stopped
46
+ healthcheck:
47
+ test: ["CMD", "curl", "-f", "http://localhost:8000/"]
48
+ interval: 30s
49
+ timeout: 10s
50
+ retries: 3
51
+ start_period: 60s
52
+ networks:
53
+ - graphllm-network
54
+
55
+ volumes:
56
+ # Named volumes for persistent storage
57
+ graphllm-data:
58
+ driver: local
59
+ graphllm-uploads:
60
+ driver: local
61
+ graphllm-logs:
62
+ driver: local
63
+ graphllm-cache:
64
+ driver: local
65
+
66
+ networks:
67
+ graphllm-network:
68
+ driver: bridge
embedding_service.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ Embedding & Vector Index Service
3
+ Handles embedding generation and FAISS vector store management
4
+ """
5
+ # Import SentenceTransformer lazily to avoid hanging on startup
6
+ import faiss
7
+ import numpy as np
8
+ from typing import List, Dict, Any, Tuple, Optional
9
+ from loguru import logger
10
+ import pickle
11
+ import os
12
+ from models import Chunk, EmbeddingEntry
13
+ from config import settings
14
+ import json
15
+
16
+
17
+ class EmbeddingService:
18
+ """
19
+ Service for creating embeddings and managing FAISS vector index
20
+ Uses lazy loading for the embedding model (loads on first use)
21
+ """
22
+
23
+ def __init__(self):
24
+ logger.info(f"EmbeddingService initialized (model will load on first use)")
25
+ self._model = None # Lazy-loaded
26
+ self.dimension = settings.embedding_dimension
27
+ self.index: Optional[faiss.Index] = None
28
+ self.chunk_metadata: Dict[int, Dict[str, Any]] = {} # index_id -> metadata
29
+ self._initialize_index()
30
+
31
+ @property
32
+ def model(self):
33
+ """Lazy-load the embedding model on first access"""
34
+ if self._model is None:
35
+ logger.info(f"Loading embedding model: {settings.embedding_model}")
36
+ # Import only when needed to avoid hanging on startup
37
+ from sentence_transformers import SentenceTransformer
38
+ self._model = SentenceTransformer(settings.embedding_model)
39
+ logger.info(f"✓ Embedding model loaded successfully")
40
+ return self._model
41
+
42
+ def _initialize_index(self):
43
+ """Initialize or load FAISS index"""
44
+ index_path = os.path.join(settings.faiss_index_path, "index.faiss")
45
+ metadata_path = os.path.join(settings.faiss_index_path, "metadata.pkl")
46
+
47
+ if os.path.exists(index_path) and os.path.exists(metadata_path):
48
+ logger.info("Loading existing FAISS index")
49
+ self.index = faiss.read_index(index_path)
50
+ with open(metadata_path, 'rb') as f:
51
+ self.chunk_metadata = pickle.load(f)
52
+ logger.info(f"Loaded index with {self.index.ntotal} vectors")
53
+ else:
54
+ logger.info("Creating new FAISS index (optimized)")
55
+ # Use HNSW for better performance on larger datasets
56
+ # HNSW is ~10x faster than flat index with 99%+ accuracy
57
+ # M=32 is good balance (higher M = more accurate but slower)
58
+ self.index = faiss.IndexHNSWFlat(self.dimension, 32)
59
+ # Set ef construction (higher = better quality, slower build)
60
+ self.index.hnsw.efConstruction = 40
61
+ # Set ef search (higher = better recall, slower search)
62
+ self.index.hnsw.efSearch = 16
63
+ self.chunk_metadata = {}
64
+ logger.info("Using HNSW index for faster approximate search")
65
+
66
+ def create_embeddings(self, chunks: List[Chunk]) -> List[EmbeddingEntry]:
67
+ """
68
+ ⚡ OPTIMIZED: Create embeddings with larger batches and parallel processing
69
+
70
+ Args:
71
+ chunks: List of Chunk objects
72
+
73
+ Returns:
74
+ List of EmbeddingEntry objects
75
+ """
76
+ texts = [chunk.text for chunk in chunks]
77
+ logger.info(f"⚡ Creating embeddings for {len(texts)} chunks (batch_size={settings.embedding_batch_size})")
78
+
79
+ import time
80
+ start = time.time()
81
+
82
+ # Batch encode with optimized settings
83
+ embeddings = self.model.encode(
84
+ texts,
85
+ batch_size=settings.embedding_batch_size,
86
+ show_progress_bar=False, # Disable for less overhead
87
+ convert_to_numpy=True,
88
+ normalize_embeddings=True # Built-in normalization is faster
89
+ )
90
+
91
+ elapsed = time.time() - start
92
+ logger.info(f"✓ Created {len(embeddings)} embeddings in {elapsed:.2f}s ({len(embeddings)/elapsed:.1f} chunks/sec)")
93
+
94
+ # Create embedding entries
95
+ embedding_entries = []
96
+ for chunk, embedding in zip(chunks, embeddings):
97
+ entry = EmbeddingEntry(
98
+ chunk_id=chunk.chunk_id,
99
+ embedding=embedding.tolist(),
100
+ metadata={
101
+ "pdf_id": chunk.pdf_id,
102
+ "page_number": chunk.page_number,
103
+ "type": chunk.type.value,
104
+ "char_range": chunk.char_range
105
+ }
106
+ )
107
+ embedding_entries.append(entry)
108
+
109
+ return embedding_entries
110
+
111
+ def add_to_index(self, chunks: List[Chunk], embeddings: List[EmbeddingEntry]):
112
+ """
113
+ Add chunks and their embeddings to FAISS index
114
+
115
+ Args:
116
+ chunks: List of chunks
117
+ embeddings: Corresponding embeddings
118
+ """
119
+ if len(chunks) != len(embeddings):
120
+ raise ValueError("Chunks and embeddings must have same length")
121
+
122
+ # Convert embeddings to numpy array
123
+ embedding_array = np.array([e.embedding for e in embeddings]).astype('float32')
124
+
125
+ # Get current index size (starting ID for new chunks)
126
+ start_id = self.index.ntotal
127
+
128
+ # Add to FAISS index
129
+ self.index.add(embedding_array)
130
+
131
+ # Store metadata mapping
132
+ for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
133
+ idx = start_id + i
134
+ self.chunk_metadata[idx] = {
135
+ "chunk_id": chunk.chunk_id,
136
+ "pdf_id": chunk.pdf_id,
137
+ "page_number": chunk.page_number,
138
+ "type": chunk.type.value,
139
+ "text": chunk.text,
140
+ "char_range": chunk.char_range,
141
+ "metadata": chunk.metadata
142
+ }
143
+
144
+ logger.info(f"Added {len(chunks)} chunks to index. Total: {self.index.ntotal}")
145
+
146
+ def search(
147
+ self,
148
+ query: str,
149
+ top_k: int = 10,
150
+ filter_pdf_id: Optional[str] = None
151
+ ) -> List[Tuple[Dict[str, Any], float]]:
152
+ """
153
+ Search for similar chunks
154
+
155
+ Args:
156
+ query: Query string
157
+ top_k: Number of results to return
158
+ filter_pdf_id: Optional PDF ID to filter results
159
+
160
+ Returns:
161
+ List of (chunk_metadata, score) tuples
162
+ """
163
+ # Encode and normalize query
164
+ query_embedding = self.model.encode([query], convert_to_numpy=True)
165
+ faiss.normalize_L2(query_embedding)
166
+
167
+ # Search
168
+ # Fetch more if we need to filter
169
+ k = top_k * 10 if filter_pdf_id else top_k
170
+ scores, indices = self.index.search(query_embedding, k)
171
+
172
+ # Retrieve metadata
173
+ results = []
174
+ for score, idx in zip(scores[0], indices[0]):
175
+ if idx == -1: # FAISS returns -1 for empty results
176
+ continue
177
+
178
+ metadata = self.chunk_metadata.get(idx)
179
+ if metadata is None:
180
+ continue
181
+
182
+ # Apply filter if specified
183
+ if filter_pdf_id and metadata.get("pdf_id") != filter_pdf_id:
184
+ continue
185
+
186
+ results.append((metadata, float(score)))
187
+
188
+ if len(results) >= top_k:
189
+ break
190
+
191
+ return results
192
+
193
+ def search_by_chunk_ids(self, chunk_ids: List[str], top_k: int = 5) -> List[Tuple[Dict[str, Any], float]]:
194
+ """
195
+ Find similar chunks to a set of chunk IDs (for node expansion)
196
+
197
+ Args:
198
+ chunk_ids: List of chunk IDs
199
+ top_k: Number of similar chunks per input chunk
200
+
201
+ Returns:
202
+ List of (chunk_metadata, score) tuples
203
+ """
204
+ # Find the chunks in metadata
205
+ chunk_indices = []
206
+ for idx, meta in self.chunk_metadata.items():
207
+ if meta["chunk_id"] in chunk_ids:
208
+ chunk_indices.append(idx)
209
+
210
+ if not chunk_indices:
211
+ return []
212
+
213
+ # Get embeddings for these chunks
214
+ # Note: FAISS doesn't have a direct "get vector" API for IndexFlatIP
215
+ # We'll search from the index using reconstruct (if supported)
216
+ results = []
217
+ for idx in chunk_indices:
218
+ # Reconstruct vector (works for Flat indices)
219
+ try:
220
+ vector = self.index.reconstruct(idx)
221
+ vector = vector.reshape(1, -1)
222
+ scores, indices = self.index.search(vector, top_k + 1) # +1 to exclude self
223
+
224
+ for score, res_idx in zip(scores[0], indices[0]):
225
+ if res_idx == idx: # Skip self
226
+ continue
227
+ if res_idx == -1:
228
+ continue
229
+
230
+ metadata = self.chunk_metadata.get(res_idx)
231
+ if metadata:
232
+ results.append((metadata, float(score)))
233
+ except Exception as e:
234
+ logger.warning(f"Could not reconstruct vector for index {idx}: {e}")
235
+
236
+ # Sort by score and return top
237
+ results.sort(key=lambda x: x[1], reverse=True)
238
+ return results[:top_k]
239
+
240
+ def save(self):
241
+ """Save FAISS index and metadata to disk"""
242
+ os.makedirs(settings.faiss_index_path, exist_ok=True)
243
+
244
+ index_path = os.path.join(settings.faiss_index_path, "index.faiss")
245
+ metadata_path = os.path.join(settings.faiss_index_path, "metadata.pkl")
246
+
247
+ faiss.write_index(self.index, index_path)
248
+ with open(metadata_path, 'wb') as f:
249
+ pickle.dump(self.chunk_metadata, f)
250
+
251
+ logger.info(f"Saved FAISS index with {self.index.ntotal} vectors")
252
+
253
+ def clear(self):
254
+ """Clear the index and metadata"""
255
+ self.index = faiss.IndexFlatIP(self.dimension)
256
+ self.chunk_metadata = {}
257
+ logger.info("Cleared FAISS index")
258
+
259
+ def get_stats(self) -> Dict[str, Any]:
260
+ """Get index statistics"""
261
+ return {
262
+ "total_vectors": self.index.ntotal,
263
+ "dimension": self.dimension,
264
+ "index_type": type(self.index).__name__,
265
+ "num_chunks": len(self.chunk_metadata)
266
+ }
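For context, a rough sketch of querying the service once an index has been built and saved under `./data/faiss_index` (the question string is a placeholder, and the score semantics follow whichever FAISS index type is active):

```python
from embedding_service import EmbeddingService

service = EmbeddingService()   # loads the persisted FAISS index and metadata if they exist
print(service.get_stats())     # e.g. {'total_vectors': ..., 'dimension': 384, 'index_type': ..., 'num_chunks': ...}

# Retrieve the chunks most similar to a question, optionally scoped to a single PDF
for meta, score in service.search("How is the knowledge graph built?", top_k=5):
    print(f"p.{meta['page_number']}  score={score:.3f}  {meta['text'][:80]!r}")
```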
frontend/app.js ADDED
@@ -0,0 +1,539 @@
1
+ /**
2
+ * GraphLLM Frontend JavaScript
3
+ * Handles user interactions, API calls, and dynamic UI updates
4
+ */
5
+
6
+ // ========== Global State ==========
7
+ let currentPdfId = null;
8
+ let graphData = { nodes: [], edges: [] };
9
+ let selectedNodeId = null;
10
+
11
+ // ========== API Configuration ==========
12
+ const API_BASE = window.location.origin;
13
+
14
+ // ========== Processing Overlay Functions ==========
15
+ function showProcessingOverlay(title = 'Processing PDF', message = 'Starting...', percent = 0) {
16
+ const overlay = document.getElementById('processing-overlay');
17
+ const titleEl = document.getElementById('processing-title');
18
+ const messageEl = document.getElementById('processing-message');
19
+ const percentEl = document.getElementById('processing-percent');
20
+ const progressFill = document.getElementById('progress-fill');
21
+
22
+ titleEl.textContent = title;
23
+ messageEl.textContent = message;
24
+ percentEl.textContent = `${percent}%`;
25
+ progressFill.style.width = `${percent}%`;
26
+
27
+ overlay.hidden = false;
28
+ }
29
+
30
+ function updateProcessingOverlay(message, percent) {
31
+ const messageEl = document.getElementById('processing-message');
32
+ const percentEl = document.getElementById('processing-percent');
33
+ const progressFill = document.getElementById('progress-fill');
34
+
35
+ messageEl.textContent = message;
36
+ percentEl.textContent = `${percent}%`;
37
+ progressFill.style.width = `${percent}%`;
38
+ }
39
+
40
+ function hideProcessingOverlay() {
41
+ const overlay = document.getElementById('processing-overlay');
42
+ overlay.hidden = true;
43
+ }
44
+
45
+ // ========== Utility Functions ==========
46
+ async function apiCall(endpoint, options = {}) {
47
+ try {
48
+ const response = await fetch(`${API_BASE}${endpoint}`, options);
49
+ if (!response.ok) {
50
+ throw new Error(`API Error: ${response.statusText}`);
51
+ }
52
+ return await response.json();
53
+ } catch (error) {
54
+ console.error('API call failed:', error);
55
+ showNotification(error.message, 'error');
56
+ throw error;
57
+ }
58
+ }
59
+
60
+ function showNotification(message, type = 'info') {
61
+ const statusEl = document.getElementById('upload-status');
62
+ statusEl.textContent = message;
63
+ statusEl.style.color = type === 'error' ? '#f44336' : type === 'success' ? '#4caf50' : '#4f9eff';
64
+
65
+ setTimeout(() => {
66
+ statusEl.textContent = '';
67
+ }, 5000);
68
+ }
69
+
70
+ // ========== PDF Upload ==========
71
+ document.getElementById('pdf-upload').addEventListener('change', async (e) => {
72
+ const file = e.target.files[0];
73
+ if (!file) return;
74
+
75
+ // Show overlay immediately
76
+ showProcessingOverlay('Uploading PDF', `Uploading ${file.name}...`, 0);
77
+
78
+ const formData = new FormData();
79
+ formData.append('file', file);
80
+
81
+ try {
82
+ const result = await apiCall('/upload', {
83
+ method: 'POST',
84
+ body: formData
85
+ });
86
+
87
+ currentPdfId = result.pdf_id;
88
+ updateProcessingOverlay('Upload complete, starting processing...', 5);
89
+
90
+ // Poll for completion
91
+ pollProcessingStatus(result.pdf_id);
92
+
93
+ } catch (error) {
94
+ hideProcessingOverlay();
95
+ showNotification('Upload failed', 'error');
96
+ }
97
+ });
98
+
99
+ async function pollProcessingStatus(pdfId) {
100
+ const interval = setInterval(async () => {
101
+ try {
102
+ // Fetch detailed status for this PDF
103
+ const status = await apiCall(`/status/${pdfId}`);
104
+
105
+ // Update overlay with progress
106
+ if (status.progress) {
107
+ const { message, percent } = status.progress;
108
+ updateProcessingOverlay(message, percent);
109
+ }
110
+
111
+ // Check if processing is complete
112
+ if (status.status === 'completed') {
113
+ clearInterval(interval);
114
+
115
+ // Show completion message briefly
116
+ updateProcessingOverlay(
117
+ `✓ Complete! ${status.num_nodes} nodes, ${status.num_edges} edges`,
118
+ 100
119
+ );
120
+
121
+ // Load graph and hide overlay
122
+ setTimeout(async () => {
123
+ hideProcessingOverlay();
124
+ await loadGraph();
125
+ await updateStats();
126
+ showNotification(`✓ Graph loaded: ${status.num_nodes} nodes, ${status.num_edges} edges`, 'success');
127
+ }, 1500); // Show completion for 1.5s
128
+
129
+ } else if (status.status === 'failed') {
130
+ clearInterval(interval);
131
+ hideProcessingOverlay();
132
+ showNotification(`Error: ${status.error}`, 'error');
133
+ }
134
+ } catch (error) {
135
+ clearInterval(interval);
136
+ hideProcessingOverlay();
137
+ showNotification('Failed to check status', 'error');
138
+ }
139
+ }, 1000); // Poll every 1 second for responsive updates
140
+
141
+ // Stop polling after 5 minutes
142
+ setTimeout(() => {
143
+ clearInterval(interval);
144
+ hideProcessingOverlay();
145
+ showNotification('Processing timeout', 'error');
146
+ }, 300000);
147
+ }
148
+
149
+ // ========== Graph Loading ==========
150
+ let network = null;
151
+
152
+ async function loadGraph() {
153
+ try {
154
+ const data = await apiCall('/graph');
155
+ graphData = data;
156
+
157
+ // Render interactive graph visualization
158
+ renderGraph(data);
159
+
160
+ } catch (error) {
161
+ console.error('Failed to load graph:', error);
162
+ }
163
+ }
164
+
165
+ function renderGraph(data) {
166
+ const container = document.getElementById('graph-container');
167
+
168
+ // Clear any existing content
169
+ container.innerHTML = '';
170
+
171
+ console.log(`Rendering graph: ${data.nodes.length} nodes, ${data.edges.length} edges`);
172
+
173
+ // Get actual container dimensions
174
+ const rect = container.getBoundingClientRect();
175
+ const containerHeight = rect.height || 600; // Fallback to 600px
176
+ const containerWidth = rect.width || 800; // Fallback to 800px
177
+
178
+ // Set explicit container styles to prevent overflow
179
+ container.style.position = 'relative';
180
+ container.style.width = containerWidth + 'px';
181
+ container.style.height = containerHeight + 'px';
182
+ container.style.overflow = 'hidden';
183
+
184
+ // Prepare nodes for vis.js
185
+ const visNodes = data.nodes.map(node => ({
186
+ id: node.node_id,
187
+ label: node.label,
188
+ title: `${node.label}\nType: ${node.type}\nImportance: ${node.importance_score.toFixed(2)}`,
189
+ value: node.importance_score * 20, // Size based on importance
190
+ group: node.type,
191
+ font: { color: '#e6eef8' }
192
+ }));
193
+
194
+ // Prepare edges for vis.js (thin, bright green, no arrows - undirected graph)
195
+ const visEdges = data.edges.map(edge => ({
196
+ from: edge.from || edge.from_node, // Handle both alias and field name
197
+ to: edge.to || edge.to_node, // Handle both alias and field name
198
+ label: edge.relation,
199
+ title: `${edge.relation} (${edge.confidence.toFixed(2)})`,
200
+ width: 1.5, // Thin edges
201
+ // No arrows for undirected graph
202
+ color: {
203
+ color: '#00ff00', // BRIGHT NEON GREEN (most visible)
204
+ highlight: '#ff00ff', // Neon magenta when highlighted
205
+ hover: '#ffff00', // Yellow on hover
206
+ opacity: 1.0 // Full opacity
207
+ },
208
+ font: {
209
+ size: 12,
210
+ color: '#ffffff',
211
+ strokeWidth: 3,
212
+ strokeColor: '#000000',
213
+ background: 'rgba(0, 0, 0, 0.8)',
214
+ bold: true
215
+ }
216
+ }));
217
+
218
+ // Create vis.js network
219
+ const graphData = {
220
+ nodes: new vis.DataSet(visNodes),
221
+ edges: new vis.DataSet(visEdges)
222
+ };
223
+
224
+ const options = {
225
+ nodes: {
226
+ shape: 'dot',
227
+ scaling: {
228
+ min: 10,
229
+ max: 30
230
+ },
231
+ font: {
232
+ size: 12,
233
+ face: 'Arial',
234
+ color: '#e6eef8'
235
+ },
236
+ borderWidth: 2,
237
+ shadow: true
238
+ },
239
+ edges: {
240
+ width: 1.5, // Thin edges
241
+ color: {
242
+ color: '#00ff00', // BRIGHT NEON GREEN (most visible against dark bg)
243
+ highlight: '#ff00ff', // Neon magenta when highlighted
244
+ hover: '#ffff00', // Yellow on hover
245
+ opacity: 1.0 // Full opacity
246
+ },
247
+ arrows: {
248
+ to: { enabled: false } // No arrows - undirected graph
249
+ },
250
+ smooth: {
251
+ type: 'continuous',
252
+ roundness: 0.2 // Less curved = more visible
253
+ },
254
+ font: {
255
+ size: 12, // Moderate text size
256
+ color: '#ffffff', // White text
257
+ strokeWidth: 3, // Moderate outline
258
+ strokeColor: '#000000', // Black outline for readability
259
+ align: 'top', // Position above edge
260
+ bold: true,
261
+ background: 'rgba(0, 0, 0, 0.8)' // Dark background for label
262
+ },
263
+ selectionWidth: 3, // Moderately thicker when selected
264
+ hoverWidth: 2.5, // Slightly thicker on hover
265
+ shadow: {
266
+ enabled: true,
267
+ color: 'rgba(0, 255, 0, 0.5)', // Green glow
268
+ size: 5,
269
+ x: 0,
270
+ y: 0
271
+ }
272
+ },
273
+ groups: {
274
+ concept: { color: { background: '#4f9eff', border: '#3d8ae6' } },
275
+ function: { color: { background: '#9c27b0', border: '#7b1fa2' } },
276
+ class: { color: { background: '#ff5722', border: '#e64a19' } },
277
+ term: { color: { background: '#4caf50', border: '#388e3c' } },
278
+ person: { color: { background: '#ff9800', border: '#f57c00' } },
279
+ method: { color: { background: '#00bcd4', border: '#0097a7' } },
280
+ entity: { color: { background: '#607d8b', border: '#455a64' } }
281
+ },
282
+ physics: {
283
+ stabilization: { iterations: 200 },
284
+ barnesHut: {
285
+ gravitationalConstant: -8000,
286
+ springConstant: 0.04,
287
+ springLength: 95
288
+ }
289
+ },
290
+ interaction: {
291
+ hover: true,
292
+ navigationButtons: true,
293
+ keyboard: true
294
+ },
295
+ autoResize: false, // Disable auto-resize to prevent infinite stretching
296
+ height: containerHeight + 'px',
297
+ width: containerWidth + 'px'
298
+ };
299
+
300
+ // Create network
301
+ network = new vis.Network(container, graphData, options);
302
+
303
+ // Prevent any further resize attempts
304
+ if (network) {
305
+ network.setOptions({ autoResize: false });
306
+ }
307
+
308
+ // Add click handler for nodes
309
+ network.on('click', function(params) {
310
+ if (params.nodes.length > 0) {
311
+ const nodeId = params.nodes[0];
312
+ selectNode(nodeId);
313
+ }
314
+ });
315
+ }
316
+
317
+ // ========== Node Selection ==========
318
+ window.selectNode = async function(nodeId) {
319
+ selectedNodeId = nodeId;
320
+
321
+ try {
322
+ const nodeData = await apiCall(`/node/${nodeId}`);
323
+ displayNodeDetails(nodeData);
324
+ } catch (error) {
325
+ console.error('Failed to load node details:', error);
326
+ }
327
+ }
328
+
329
+ function displayNodeDetails(nodeData) {
330
+ const content = document.getElementById('node-content');
331
+
332
+ const sourcesHtml = nodeData.sources.map((source, i) => `
333
+ <li>p.${source.page_number} - "${source.snippet}" <span style="color: #8b92a0;">(${source.chunk_id})</span></li>
334
+ `).join('');
335
+
336
+ const relatedHtml = nodeData.related_nodes.map(related => `
337
+ <li onclick="selectNode('${related.node_id}')" style="cursor: pointer; padding: 0.5rem; background: #23262e; border-radius: 6px; margin-bottom: 0.25rem;">
338
+ <strong>${related.label}</strong> - ${related.relation} (confidence: ${related.confidence.toFixed(2)})
339
+ </li>
340
+ `).join('');
341
+
342
+ content.innerHTML = `
343
+ <div class="node-info">
344
+ <h3 class="node-label">${nodeData.label}</h3>
345
+ <span class="badge">${nodeData.type}</span>
346
+
347
+ <div class="node-summary">
348
+ <h4>Summary</h4>
349
+ <p>${nodeData.summary}</p>
350
+ </div>
351
+
352
+ <div class="node-sources">
353
+ <h4>Sources</h4>
354
+ <button class="expand-toggle" onclick="toggleSources()">Show Sources</button>
355
+ <ul class="sources-list" id="sources-list" hidden>
356
+ ${sourcesHtml}
357
+ </ul>
358
+ </div>
359
+
360
+ ${nodeData.related_nodes.length > 0 ? `
361
+ <div class="related-nodes">
362
+ <h4>Related Nodes</h4>
363
+ <ul class="related-list">
364
+ ${relatedHtml}
365
+ </ul>
366
+ </div>
367
+ ` : ''}
368
+ </div>
369
+ `;
370
+ }
371
+
372
+ window.toggleSources = function() {
373
+ const sourcesList = document.getElementById('sources-list');
374
+ const toggle = document.querySelector('.expand-toggle');
375
+
376
+ if (sourcesList.hidden) {
377
+ sourcesList.hidden = false;
378
+ toggle.textContent = 'Hide Sources';
379
+ } else {
380
+ sourcesList.hidden = true;
381
+ toggle.textContent = 'Show Sources';
382
+ }
383
+ }
384
+
385
+ document.getElementById('close-node-detail').addEventListener('click', () => {
386
+ document.getElementById('node-content').innerHTML = '<p class="placeholder-text">Click a node in the graph to view details</p>';
387
+ selectedNodeId = null;
388
+ });
389
+
390
+ // ========== Chat ==========
391
+ document.getElementById('send-btn').addEventListener('click', sendMessage);
392
+ document.getElementById('chat-input').addEventListener('keydown', (e) => {
393
+ if (e.key === 'Enter' && !e.shiftKey) {
394
+ e.preventDefault();
395
+ sendMessage();
396
+ }
397
+ });
398
+
399
+ async function sendMessage() {
400
+ const input = document.getElementById('chat-input');
401
+ const query = input.value.trim();
402
+
403
+ if (!query) return;
404
+ if (!currentPdfId) {
405
+ showNotification('Please upload a PDF first', 'error');
406
+ return;
407
+ }
408
+
409
+ // Add user message to chat
410
+ addMessageToChat('user', query);
411
+ input.value = '';
412
+
413
+ try {
414
+ const includeCitations = document.getElementById('include-citations').checked;
415
+
416
+ const response = await apiCall('/chat', {
417
+ method: 'POST',
418
+ headers: { 'Content-Type': 'application/json' },
419
+ body: JSON.stringify({
420
+ query,
421
+ pdf_id: currentPdfId,
422
+ include_citations: includeCitations,
423
+ max_sources: 5
424
+ })
425
+ });
426
+
427
+ // Add assistant response
428
+ addMessageToChat('assistant', response.answer, response.sources);
429
+
430
+ } catch (error) {
431
+ addMessageToChat('assistant', 'Sorry, I encountered an error processing your question.');
432
+ }
433
+ }
434
+
435
+ function addMessageToChat(role, content, sources = []) {
436
+ const messagesContainer = document.getElementById('chat-messages');
437
+
438
+ const messageDiv = document.createElement('div');
439
+ messageDiv.className = `message ${role}`;
440
+
441
+ let html = `<p>${content}</p>`;
442
+
443
+ if (sources && sources.length > 0) {
444
+ html += '<div style="margin-top: 0.5rem; padding-top: 0.5rem; border-top: 1px solid rgba(255,255,255,0.1);">';
445
+ html += '<strong style="font-size: 0.875rem;">Sources:</strong><ul style="margin-top: 0.25rem; font-size: 0.875rem;">';
446
+ sources.forEach(source => {
447
+ html += `<li>p.${source.page_number}: "${source.snippet}"</li>`;
448
+ });
449
+ html += '</ul></div>';
450
+ }
451
+
452
+ messageDiv.innerHTML = html;
453
+ messagesContainer.appendChild(messageDiv);
454
+
455
+ // Scroll to bottom
456
+ messagesContainer.scrollTop = messagesContainer.scrollHeight;
457
+ }
458
+
459
+ // ========== Stats Update ==========
460
+ async function updateStats() {
461
+ try {
462
+ const status = await apiCall('/admin/status');
463
+
464
+ document.getElementById('stats-nodes').textContent = `Nodes: ${status.total_nodes}`;
465
+ document.getElementById('stats-edges').textContent = `Edges: ${status.total_edges}`;
466
+ document.getElementById('stats-chunks').textContent = `Chunks: ${status.total_chunks}`;
467
+ } catch (error) {
468
+ console.error('Failed to update stats:', error);
469
+ }
470
+ }
471
+
472
+ // ========== Admin Controls ==========
473
+ document.getElementById('reindex-btn').addEventListener('click', async () => {
474
+ if (!currentPdfId) {
475
+ showNotification('No PDF to reindex', 'error');
476
+ return;
477
+ }
478
+
479
+ if (!confirm('Reindex current PDF? This will take some time.')) return;
480
+
481
+ try {
482
+ // Show overlay for reindexing
483
+ showProcessingOverlay('Reindexing PDF', 'Starting reindex...', 0);
484
+
485
+ await apiCall(`/admin/reindex?pdf_id=${currentPdfId}`, { method: 'POST' });
486
+
487
+ // Poll for completion
488
+ pollProcessingStatus(currentPdfId);
489
+ } catch (error) {
490
+ hideProcessingOverlay();
491
+ showNotification('Reindex failed', 'error');
492
+ }
493
+ });
494
+
495
+ document.getElementById('clear-btn').addEventListener('click', async () => {
496
+ if (!confirm('Clear all data? This cannot be undone!')) return;
497
+
498
+ try {
499
+ await apiCall('/admin/clear', { method: 'POST' });
500
+ showNotification('All data cleared', 'success');
501
+
502
+ // Reset UI
503
+ currentPdfId = null;
504
+ graphData = { nodes: [], edges: [] };
505
+ document.getElementById('graph-container').innerHTML = '<div class="graph-placeholder"><p>Upload a PDF to generate a knowledge graph</p></div>';
506
+ document.getElementById('node-content').innerHTML = '<p class="placeholder-text">Click a node in the graph to view details</p>';
507
+ document.getElementById('chat-messages').innerHTML = '<div class="message system"><p>Ask questions about your uploaded PDF. Answers will cite page numbers.</p></div>';
508
+ await updateStats();
509
+ } catch (error) {
510
+ showNotification('Clear failed', 'error');
511
+ }
512
+ });
513
+
514
+ // ========== Graph Controls ==========
515
+ document.getElementById('zoom-in-btn').addEventListener('click', () => {
516
+ if (network) {
517
+ const scale = network.getScale();
518
+ network.moveTo({ scale: scale * 1.2 });
519
+ }
520
+ });
521
+
522
+ document.getElementById('zoom-out-btn').addEventListener('click', () => {
523
+ if (network) {
524
+ const scale = network.getScale();
525
+ network.moveTo({ scale: scale * 0.8 });
526
+ }
527
+ });
528
+
529
+ document.getElementById('reset-view-btn').addEventListener('click', () => {
530
+ if (network) {
531
+ network.fit();
532
+ }
533
+ });
534
+
535
+ // ========== Initialization ==========
536
+ document.addEventListener('DOMContentLoaded', () => {
537
+ updateStats();
538
+ console.log('GraphLLM Frontend Initialized');
539
+ });
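The rendering code above consumes two read endpoints, `/graph` (full node and edge lists) and `/node/{node_id}` (summary, sources, related nodes). A small sketch of inspecting the same data outside the browser, assuming the response shapes this frontend already relies on:

```python
import requests

BASE = "http://localhost:8000"  # placeholder: local backend or Space URL

graph = requests.get(f"{BASE}/graph").json()

# Print the five most important concepts, mirroring the node sizing used in renderGraph()
top_nodes = sorted(graph["nodes"], key=lambda n: n["importance_score"], reverse=True)[:5]
for node in top_nodes:
    detail = requests.get(f"{BASE}/node/{node['node_id']}").json()
    print(f"{node['label']} ({node['type']}): {detail['summary'][:100]}")
```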
frontend/index.html ADDED
@@ -0,0 +1,176 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>GraphLLM - PDF Knowledge Graph & RAG</title>
7
+ <link rel="stylesheet" href="/static/styles.css">
8
+ </head>
9
+ <body>
10
+ <!-- Header -->
11
+ <header class="app-header">
12
+ <div class="header-content">
13
+ <h1 class="app-title">GraphLLM</h1>
14
+ <p class="app-subtitle">PDF Knowledge Graph & RAG System</p>
15
+ </div>
16
+
17
+ <div class="header-controls">
18
+ <div class="upload-section">
19
+ <input type="file" id="pdf-upload" accept=".pdf" hidden>
20
+ <button id="upload-btn" class="btn btn-primary" onclick="document.getElementById('pdf-upload').click()">
21
+ Upload PDF
22
+ </button>
23
+ <span id="upload-status" class="status-text"></span>
24
+ </div>
25
+
26
+ <button id="reindex-btn" class="btn btn-secondary">Reindex</button>
27
+ <button id="clear-btn" class="btn btn-danger">Clear All</button>
28
+ </div>
29
+ </header>
30
+
31
+ <!-- Main Content Area -->
32
+ <main class="main-container">
33
+ <!-- Left Pane: Graph Visualization -->
34
+ <aside id="graph-pane" class="graph-pane" role="region" aria-label="Knowledge Graph Visualization">
35
+ <div class="pane-header">
36
+ <h2>Knowledge Graph</h2>
37
+ <div class="graph-controls">
38
+ <button id="zoom-in-btn" class="icon-btn" aria-label="Zoom In">+</button>
39
+ <button id="zoom-out-btn" class="icon-btn" aria-label="Zoom Out">-</button>
40
+ <button id="reset-view-btn" class="icon-btn" aria-label="Reset View">⟲</button>
41
+ </div>
42
+ </div>
43
+
44
+ <div id="graph-container" class="graph-container" role="img" aria-label="Interactive knowledge graph">
45
+ <!-- Graph visualization will be rendered here via JavaScript -->
46
+ <div class="graph-placeholder">
47
+ <p>Upload a PDF to generate a knowledge graph</p>
48
+ <p class="help-text">Graph visualization requires JavaScript for interactivity</p>
49
+ </div>
50
+ </div>
51
+
52
+ <div class="graph-legend">
53
+ <h3>Legend</h3>
54
+ <div class="legend-items">
55
+ <div class="legend-item">
56
+ <span class="legend-color concept"></span>
57
+ <span>Concept</span>
58
+ </div>
59
+ <div class="legend-item">
60
+ <span class="legend-color function"></span>
61
+ <span>Function/Method</span>
62
+ </div>
63
+ <div class="legend-item">
64
+ <span class="legend-color class"></span>
65
+ <span>Class/Type</span>
66
+ </div>
67
+ <div class="legend-item">
68
+ <span class="legend-color term"></span>
69
+ <span>Term/Definition</span>
70
+ </div>
71
+ </div>
72
+ </div>
73
+ </aside>
74
+
75
+ <!-- Right Pane: Node Details & Chat -->
76
+ <section id="detail-pane" class="detail-pane">
77
+ <!-- Node Detail Section -->
78
+ <div id="node-detail" class="node-detail card">
79
+ <div class="card-header">
80
+ <h2>Node Details</h2>
81
+ <button id="close-node-detail" class="icon-btn" aria-label="Close">✕</button>
82
+ </div>
83
+
84
+ <div id="node-content" class="node-content">
85
+ <p class="placeholder-text">Click a node in the graph to view details</p>
86
+ </div>
87
+
88
+ <!-- Node detail will be populated dynamically:
89
+ <div class="node-info">
90
+ <h3 class="node-label">[Node Label]</h3>
91
+ <span class="node-type badge">[Type]</span>
92
+ <span class="node-importance">Importance: [Score]</span>
93
+
94
+ <div class="node-summary">
95
+ <h4>Summary</h4>
96
+ <p>[AI-generated summary with (p. N) citations]</p>
97
+ </div>
98
+
99
+ <div class="node-sources">
100
+ <h4>Sources</h4>
101
+ <button class="expand-toggle">Show Sources</button>
102
+ <ul class="sources-list" hidden>
103
+ <li>p.12 - "Exact snippet..." (chunk_id)</li>
104
+ </ul>
105
+ </div>
106
+
107
+ <div class="related-nodes">
108
+ <h4>Related Nodes</h4>
109
+ <ul class="related-list">
110
+ <li>[Node] - [relation] - [confidence]</li>
111
+ </ul>
112
+ </div>
113
+ </div>
114
+ -->
115
+ </div>
116
+
117
+ <!-- Chat Section -->
118
+ <div id="chat" class="chat-section card">
119
+ <div class="card-header">
120
+ <h2>Chat with Document</h2>
121
+ <label class="checkbox-label">
122
+ <input type="checkbox" id="include-citations" checked>
123
+ <span>Include Citations</span>
124
+ </label>
125
+ </div>
126
+
127
+ <div id="chat-messages" class="chat-messages" role="log" aria-live="polite">
128
+ <div class="message system">
129
+ <p>Ask questions about your uploaded PDF. Answers will cite page numbers.</p>
130
+ </div>
131
+ </div>
132
+
133
+ <div class="chat-input-area">
134
+ <textarea
135
+ id="chat-input"
136
+ class="chat-input"
137
+ placeholder="Ask a question about the document..."
138
+ rows="3"
139
+ aria-label="Chat input"
140
+ ></textarea>
141
+ <button id="send-btn" class="btn btn-primary" aria-label="Send message">Send</button>
142
+ </div>
143
+ </div>
144
+ </section>
145
+ </main>
146
+
147
+ <!-- Footer -->
148
+ <footer class="app-footer">
149
+ <div class="footer-content">
150
+ <div class="stats">
151
+ <span id="stats-nodes">Nodes: 0</span>
152
+ <span id="stats-edges">Edges: 0</span>
153
+ <span id="stats-chunks">Chunks: 0</span>
154
+ </div>
155
+ <p class="footer-text">GraphLLM v1.0 | Powered by Gemini & Mistral</p>
156
+ </div>
157
+ </footer>
158
+
159
+ <!-- Processing Overlay -->
160
+ <div id="processing-overlay" class="processing-overlay" hidden>
161
+ <div class="processing-modal">
162
+ <div class="spinner"></div>
163
+ <h2 id="processing-title">Processing PDF</h2>
164
+ <p id="processing-message">Starting...</p>
165
+ <div class="progress-bar">
166
+ <div id="progress-fill" class="progress-fill"></div>
167
+ </div>
168
+ <p id="processing-percent" class="processing-percent">0%</p>
169
+ </div>
170
+ </div>
171
+
172
+ <!-- JavaScript Libraries -->
173
+ <script type="text/javascript" src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
174
+ <script src="/static/app.js"></script>
175
+ </body>
176
+ </html>
frontend/styles.css ADDED
@@ -0,0 +1,800 @@
1
+ /* GraphLLM Stylesheet - Dark Sleek Theme */
2
+
3
+ /* ========== CSS Reset & Base Styles ========== */
4
+ * {
5
+ margin: 0;
6
+ padding: 0;
7
+ box-sizing: border-box;
8
+ }
9
+
10
+ html, body {
11
+ height: 100%;
12
+ width: 100%;
13
+ overflow-x: hidden;
14
+ }
15
+
16
+ :root {
17
+ /* Color Palette */
18
+ --bg-primary: #0f1115;
19
+ --bg-secondary: #12151a;
20
+ --bg-card: #1a1d24;
21
+ --bg-hover: #23262e;
22
+
23
+ --text-primary: #e6eef8;
24
+ --text-secondary: #cfd8e3;
25
+ --text-muted: #8b92a0;
26
+
27
+ --accent-primary: #4f9eff;
28
+ --accent-hover: #3d8ae6;
29
+ --accent-glow: rgba(79, 158, 255, 0.3);
30
+
31
+ --success: #4caf50;
32
+ --warning: #ff9800;
33
+ --danger: #f44336;
34
+
35
+ --border-color: #2a2f3a;
36
+ --shadow-sm: 0 2px 4px rgba(0, 0, 0, 0.3);
37
+ --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.4);
38
+ --shadow-lg: 0 8px 24px rgba(0, 0, 0, 0.5);
39
+
40
+ /* Graph Node Colors */
41
+ --node-concept: #4f9eff;
42
+ --node-function: #9c27b0;
43
+ --node-class: #ff5722;
44
+ --node-term: #4caf50;
45
+
46
+ /* Spacing */
47
+ --spacing-xs: 0.25rem;
48
+ --spacing-sm: 0.5rem;
49
+ --spacing-md: 1rem;
50
+ --spacing-lg: 1.5rem;
51
+ --spacing-xl: 2rem;
52
+
53
+ /* Border Radius */
54
+ --radius-sm: 6px;
55
+ --radius-md: 12px;
56
+ --radius-lg: 16px;
57
+
58
+ /* Transitions */
59
+ --transition-fast: 0.15s ease;
60
+ --transition-normal: 0.3s ease;
61
+ }
62
+
63
+ /* ========== Typography ========== */
64
+ body {
65
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
66
+ 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif;
67
+ font-size: 16px;
68
+ line-height: 1.6;
69
+ color: var(--text-primary);
70
+ background-color: var(--bg-primary);
71
+ overflow-x: hidden;
72
+ }
73
+
74
+ h1, h2, h3, h4, h5, h6 {
75
+ font-weight: 600;
76
+ line-height: 1.2;
77
+ margin-bottom: var(--spacing-md);
78
+ color: var(--text-primary);
79
+ }
80
+
81
+ h1 { font-size: 2rem; }
82
+ h2 { font-size: 1.5rem; }
83
+ h3 { font-size: 1.25rem; }
84
+ h4 { font-size: 1.1rem; }
85
+
86
+ code, pre {
87
+ font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
88
+ background-color: var(--bg-secondary);
89
+ padding: 0.2em 0.4em;
90
+ border-radius: var(--radius-sm);
91
+ }
92
+
93
+ /* ========== Layout ========== */
94
+ body {
95
+ display: flex;
96
+ flex-direction: column;
97
+ min-height: 100vh;
98
+ }
99
+
100
+ /* Header */
101
+ .app-header {
102
+ background-color: var(--bg-secondary);
103
+ border-bottom: 1px solid var(--border-color);
104
+ padding: var(--spacing-lg) var(--spacing-xl);
105
+ display: flex;
106
+ justify-content: space-between;
107
+ align-items: center;
108
+ box-shadow: var(--shadow-sm);
109
+ }
110
+
111
+ .header-content {
112
+ flex: 1;
113
+ }
114
+
115
+ .app-title {
116
+ margin: 0;
117
+ font-size: 1.75rem;
118
+ background: linear-gradient(135deg, var(--accent-primary), #9c27b0);
119
+ -webkit-background-clip: text;
120
+ -webkit-text-fill-color: transparent;
121
+ background-clip: text;
122
+ }
123
+
124
+ .app-subtitle {
125
+ font-size: 0.875rem;
126
+ color: var(--text-muted);
127
+ margin: 0;
128
+ }
129
+
130
+ .header-controls {
131
+ display: flex;
132
+ gap: var(--spacing-md);
133
+ align-items: center;
134
+ }
135
+
136
+ .upload-section {
137
+ display: flex;
138
+ gap: var(--spacing-sm);
139
+ align-items: center;
140
+ }
141
+
142
+ .status-text {
143
+ font-size: 0.875rem;
144
+ color: var(--text-secondary);
145
+ }
146
+
147
+ /* Main Container */
148
+ .main-container {
149
+ flex: 1;
150
+ display: grid;
151
+ grid-template-columns: 65% 35%;
152
+ gap: var(--spacing-lg);
153
+ padding: var(--spacing-lg);
154
+ overflow: hidden;
155
+ height: calc(100vh - 180px); /* Account for header and footer */
156
+ max-height: calc(100vh - 180px);
157
+ }
158
+
159
+ /* ========== Graph Pane (Left) ========== */
160
+ .graph-pane {
161
+ background-color: var(--bg-card);
162
+ border-radius: var(--radius-md);
163
+ padding: var(--spacing-lg);
164
+ display: flex;
165
+ flex-direction: column;
166
+ box-shadow: var(--shadow-md);
167
+ overflow: hidden;
168
+ height: 100%;
169
+ max-height: 100%;
170
+ }
171
+
172
+ .pane-header {
173
+ display: flex;
174
+ justify-content: space-between;
175
+ align-items: center;
176
+ margin-bottom: var(--spacing-md);
177
+ }
178
+
179
+ .pane-header h2 {
180
+ margin: 0;
181
+ font-size: 1.25rem;
182
+ }
183
+
184
+ .graph-controls {
185
+ display: flex;
186
+ gap: var(--spacing-sm);
187
+ }
188
+
189
+ .graph-container {
190
+ flex: 1;
191
+ background-color: var(--bg-secondary);
192
+ border-radius: var(--radius-sm);
193
+ position: relative;
194
+ overflow: hidden;
195
+ border: 1px solid var(--border-color);
196
+ min-height: 500px;
197
+ height: 100%;
198
+ width: 100%;
199
+ }
200
+
201
+ .graph-placeholder {
202
+ display: flex;
203
+ flex-direction: column;
204
+ align-items: center;
205
+ justify-content: center;
206
+ height: 100%;
207
+ color: var(--text-muted);
208
+ text-align: center;
209
+ padding: var(--spacing-xl);
210
+ }
211
+
212
+ .graph-placeholder p {
213
+ margin: var(--spacing-sm) 0;
214
+ }
215
+
216
+ .help-text {
217
+ font-size: 0.875rem;
218
+ color: var(--text-muted);
219
+ opacity: 0.7;
220
+ }
221
+
222
+ /* Graph Legend */
223
+ .graph-legend {
224
+ margin-top: var(--spacing-md);
225
+ padding: var(--spacing-md);
226
+ background-color: var(--bg-secondary);
227
+ border-radius: var(--radius-sm);
228
+ }
229
+
230
+ .graph-legend h3 {
231
+ font-size: 0.875rem;
232
+ margin-bottom: var(--spacing-sm);
233
+ color: var(--text-secondary);
234
+ }
235
+
236
+ .legend-items {
237
+ display: grid;
238
+ grid-template-columns: repeat(2, 1fr);
239
+ gap: var(--spacing-sm);
240
+ }
241
+
242
+ .legend-item {
243
+ display: flex;
244
+ align-items: center;
245
+ gap: var(--spacing-sm);
246
+ font-size: 0.875rem;
247
+ }
248
+
249
+ .legend-color {
250
+ width: 16px;
251
+ height: 16px;
252
+ border-radius: 50%;
253
+ border: 2px solid currentColor;
254
+ }
255
+
256
+ .legend-color.concept { color: var(--node-concept); }
257
+ .legend-color.function { color: var(--node-function); }
258
+ .legend-color.class { color: var(--node-class); }
259
+ .legend-color.term { color: var(--node-term); }
260
+
261
+ /* ========== Detail Pane (Right) ========== */
262
+ .detail-pane {
263
+ display: flex;
264
+ flex-direction: column;
265
+ gap: var(--spacing-lg);
266
+ overflow-y: auto;
267
+ overflow-x: hidden;
268
+ height: 100%;
269
+ max-height: 100%;
270
+ padding-right: var(--spacing-sm); /* Space for scrollbar */
271
+ }
272
+
273
+ .card {
274
+ background-color: var(--bg-card);
275
+ border-radius: var(--radius-md);
276
+ padding: var(--spacing-lg);
277
+ box-shadow: var(--shadow-md);
278
+ width: 100%;
279
+ }
280
+
281
+ .card-header {
282
+ display: flex;
283
+ justify-content: space-between;
284
+ align-items: center;
285
+ margin-bottom: var(--spacing-md);
286
+ padding-bottom: var(--spacing-sm);
287
+ border-bottom: 1px solid var(--border-color);
288
+ }
289
+
290
+ .card-header h2 {
291
+ margin: 0;
292
+ font-size: 1.25rem;
293
+ }
294
+
295
+ /* Node Detail Card */
296
+ .node-detail {
297
+ flex-shrink: 0; /* Never shrink - always show full content */
298
+ overflow-y: visible; /* Don't scroll the card itself */
299
+ display: block;
300
+ margin-bottom: var(--spacing-lg);
301
+ }
302
+
303
+ .node-content {
304
+ color: var(--text-secondary);
305
+ max-height: none; /* No height restriction */
306
+ }
307
+
308
+ .placeholder-text {
309
+ color: var(--text-muted);
310
+ text-align: center;
311
+ padding: var(--spacing-xl);
312
+ }
313
+
314
+ .node-info {
315
+ display: flex;
316
+ flex-direction: column;
317
+ gap: var(--spacing-md);
318
+ width: 100%;
319
+ padding-bottom: var(--spacing-lg);
320
+ }
321
+
322
+ .node-label {
323
+ font-size: 1.5rem;
324
+ color: var(--accent-primary);
325
+ margin-bottom: var(--spacing-xs);
326
+ }
327
+
328
+ .badge {
329
+ display: inline-block;
330
+ padding: 0.25rem 0.5rem;
331
+ background-color: var(--accent-primary);
332
+ color: white;
333
+ border-radius: var(--radius-sm);
334
+ font-size: 0.75rem;
335
+ font-weight: 600;
336
+ text-transform: uppercase;
337
+ margin-right: var(--spacing-sm);
338
+ }
339
+
340
+ .node-importance {
341
+ font-size: 0.875rem;
342
+ color: var(--text-muted);
343
+ }
344
+
345
+ .node-summary, .node-sources, .related-nodes {
346
+ padding: var(--spacing-md);
347
+ background-color: var(--bg-secondary);
348
+ border-radius: var(--radius-sm);
349
+ border-left: 3px solid var(--accent-primary);
350
+ margin-bottom: var(--spacing-md);
351
+ }
352
+
353
+ .node-summary p {
354
+ line-height: 1.7;
355
+ color: var(--text-secondary);
356
+ }
357
+
358
+ .expand-toggle {
359
+ background: none;
360
+ border: none;
361
+ color: var(--accent-primary);
362
+ cursor: pointer;
363
+ font-size: 0.875rem;
364
+ padding: var(--spacing-xs) 0;
365
+ transition: color var(--transition-fast);
366
+ }
367
+
368
+ .expand-toggle:hover {
369
+ color: var(--accent-hover);
370
+ text-decoration: underline;
371
+ }
372
+
373
+ .sources-list, .related-list {
374
+ list-style: none;
375
+ margin-top: var(--spacing-sm);
376
+ }
377
+
378
+ .sources-list li, .related-list li {
379
+ padding: var(--spacing-sm);
380
+ margin-bottom: var(--spacing-xs);
381
+ background-color: var(--bg-hover);
382
+ border-radius: var(--radius-sm);
383
+ font-size: 0.875rem;
384
+ }
385
+
386
+ /* Chat Section */
387
+ .chat-section {
388
+ flex: 0 1 auto; /* Can shrink but don't grow */
389
+ display: flex;
390
+ flex-direction: column;
391
+ min-height: 300px;
392
+ height: 400px; /* Fixed height */
393
+ }
394
+
395
+ .checkbox-label {
396
+ display: flex;
397
+ align-items: center;
398
+ gap: var(--spacing-sm);
399
+ font-size: 0.875rem;
400
+ color: var(--text-secondary);
401
+ cursor: pointer;
402
+ }
403
+
404
+ .checkbox-label input[type="checkbox"] {
405
+ width: 18px;
406
+ height: 18px;
407
+ cursor: pointer;
408
+ }
409
+
410
+ .chat-messages {
411
+ flex: 1;
412
+ overflow-y: auto;
413
+ padding: var(--spacing-md);
414
+ background-color: var(--bg-secondary);
415
+ border-radius: var(--radius-sm);
416
+ margin-bottom: var(--spacing-md);
417
+ min-height: 200px;
418
+ }
419
+
420
+ .message {
421
+ margin-bottom: var(--spacing-md);
422
+ padding: var(--spacing-md);
423
+ border-radius: var(--radius-sm);
424
+ line-height: 1.6;
425
+ }
426
+
427
+ .message.user {
428
+ background-color: var(--accent-primary);
429
+ color: white;
430
+ align-self: flex-end;
431
+ max-width: 80%;
432
+ margin-left: auto;
433
+ }
434
+
435
+ .message.assistant {
436
+ background-color: var(--bg-hover);
437
+ color: var(--text-primary);
438
+ border-left: 3px solid var(--accent-primary);
439
+ }
440
+
441
+ .message.system {
442
+ background-color: transparent;
443
+ color: var(--text-muted);
444
+ font-size: 0.875rem;
445
+ text-align: center;
446
+ border: none;
447
+ }
448
+
449
+ .chat-input-area {
450
+ display: flex;
451
+ gap: var(--spacing-sm);
452
+ }
453
+
454
+ .chat-input {
455
+ flex: 1;
456
+ background-color: var(--bg-secondary);
457
+ border: 1px solid var(--border-color);
458
+ border-radius: var(--radius-sm);
459
+ padding: var(--spacing-md);
460
+ color: var(--text-primary);
461
+ font-family: inherit;
462
+ font-size: 0.95rem;
463
+ resize: vertical;
464
+ transition: border-color var(--transition-fast);
465
+ }
466
+
467
+ .chat-input:focus {
468
+ outline: none;
469
+ border-color: var(--accent-primary);
470
+ box-shadow: 0 0 0 3px var(--accent-glow);
471
+ }
472
+
473
+ /* ========== Buttons ========== */
474
+ .btn {
475
+ padding: 0.625rem 1.25rem;
476
+ border: none;
477
+ border-radius: var(--radius-sm);
478
+ font-size: 0.9rem;
479
+ font-weight: 600;
480
+ cursor: pointer;
481
+ transition: all var(--transition-fast);
482
+ white-space: nowrap;
483
+ }
484
+
485
+ .btn-primary {
486
+ background-color: var(--accent-primary);
487
+ color: white;
488
+ }
489
+
490
+ .btn-primary:hover {
491
+ background-color: var(--accent-hover);
492
+ box-shadow: 0 0 12px var(--accent-glow);
493
+ }
494
+
495
+ .btn-secondary {
496
+ background-color: var(--bg-hover);
497
+ color: var(--text-primary);
498
+ border: 1px solid var(--border-color);
499
+ }
500
+
501
+ .btn-secondary:hover {
502
+ background-color: var(--bg-card);
503
+ }
504
+
505
+ .btn-danger {
506
+ background-color: var(--danger);
507
+ color: white;
508
+ }
509
+
510
+ .btn-danger:hover {
511
+ background-color: #d32f2f;
512
+ }
513
+
514
+ .icon-btn {
515
+ width: 36px;
516
+ height: 36px;
517
+ padding: 0;
518
+ background-color: var(--bg-hover);
519
+ border: 1px solid var(--border-color);
520
+ border-radius: var(--radius-sm);
521
+ color: var(--text-primary);
522
+ cursor: pointer;
523
+ font-size: 1.2rem;
524
+ display: flex;
525
+ align-items: center;
526
+ justify-content: center;
527
+ transition: all var(--transition-fast);
528
+ }
529
+
530
+ .icon-btn:hover {
531
+ background-color: var(--bg-card);
532
+ border-color: var(--accent-primary);
533
+ }
534
+
535
+ /* ========== Footer ========== */
536
+ .app-footer {
537
+ background-color: var(--bg-secondary);
538
+ border-top: 1px solid var(--border-color);
539
+ padding: var(--spacing-md) var(--spacing-xl);
540
+ }
541
+
542
+ .footer-content {
543
+ display: flex;
544
+ justify-content: space-between;
545
+ align-items: center;
546
+ }
547
+
548
+ .stats {
549
+ display: flex;
550
+ gap: var(--spacing-lg);
551
+ font-size: 0.875rem;
552
+ color: var(--text-secondary);
553
+ }
554
+
555
+ .footer-text {
556
+ font-size: 0.875rem;
557
+ color: var(--text-muted);
558
+ }
559
+
560
+ /* ========== Responsive Design ========== */
561
+ @media (max-width: 1024px) {
562
+ .main-container {
563
+ grid-template-columns: 1fr;
564
+ grid-template-rows: auto auto;
565
+ }
566
+
567
+ .graph-pane {
568
+ min-height: 400px;
569
+ }
570
+ }
571
+
572
+ @media (max-width: 768px) {
573
+ .app-header {
574
+ flex-direction: column;
575
+ gap: var(--spacing-md);
576
+ align-items: flex-start;
577
+ }
578
+
579
+ .header-controls {
580
+ width: 100%;
581
+ flex-wrap: wrap;
582
+ }
583
+
584
+ .main-container {
585
+ padding: var(--spacing-sm);
586
+ gap: var(--spacing-sm);
587
+ }
588
+
589
+ .footer-content {
590
+ flex-direction: column;
591
+ gap: var(--spacing-sm);
592
+ text-align: center;
593
+ }
594
+
595
+ .stats {
596
+ flex-direction: column;
597
+ gap: var(--spacing-sm);
598
+ }
599
+ }
600
+
601
+ /* ========== Accessibility ========== */
602
+ /* Focus styles for keyboard navigation */
603
+ button:focus-visible,
604
+ input:focus-visible,
605
+ textarea:focus-visible {
606
+ outline: 2px solid var(--accent-primary);
607
+ outline-offset: 2px;
608
+ }
609
+
610
+ /* Hidden but accessible */
611
+ .sr-only {
612
+ position: absolute;
613
+ width: 1px;
614
+ height: 1px;
615
+ padding: 0;
616
+ margin: -1px;
617
+ overflow: hidden;
618
+ clip: rect(0, 0, 0, 0);
619
+ white-space: nowrap;
620
+ border-width: 0;
621
+ }
622
+
623
+ /* Reduce motion for accessibility */
624
+ @media (prefers-reduced-motion: reduce) {
625
+ *,
626
+ *::before,
627
+ *::after {
628
+ animation-duration: 0.01ms !important;
629
+ animation-iteration-count: 1 !important;
630
+ transition-duration: 0.01ms !important;
631
+ }
632
+ }
633
+
634
+ /* ========== Graph Node Styles (for JS visualization) ========== */
635
+ /* These classes will be used by the graph visualization library */
636
+ .graph-node {
637
+ cursor: pointer;
638
+ transition: all var(--transition-fast);
639
+ }
640
+
641
+ .graph-node.concept circle { fill: var(--node-concept); }
642
+ .graph-node.function circle { fill: var(--node-function); }
643
+ .graph-node.class circle { fill: var(--node-class); }
644
+ .graph-node.term circle { fill: var(--node-term); }
645
+
646
+ .graph-node:hover circle {
647
+ stroke-width: 3px;
648
+ filter: brightness(1.2);
649
+ }
650
+
651
+ .graph-node.selected circle {
652
+ stroke: var(--accent-primary);
653
+ stroke-width: 4px;
654
+ animation: pulse 1.5s infinite;
655
+ }
656
+
657
+ @keyframes pulse {
658
+ 0%, 100% {
659
+ box-shadow: 0 0 0 0 var(--accent-glow);
660
+ }
661
+ 50% {
662
+ box-shadow: 0 0 0 10px rgba(79, 158, 255, 0);
663
+ }
664
+ }
665
+
666
+ .graph-edge {
667
+ stroke: var(--text-muted);
668
+ stroke-width: 1.5px;
669
+ fill: none;
670
+ opacity: 0.6;
671
+ }
672
+
673
+ .graph-edge.highlighted {
674
+ stroke: var(--accent-primary);
675
+ opacity: 1;
676
+ stroke-width: 2px;
677
+ }
678
+
679
+ /* Vis.js Network Canvas Constraints */
680
+ .graph-container canvas {
681
+ max-width: 100% !important;
682
+ max-height: 100% !important;
683
+ }
684
+
685
+ .graph-container > div {
686
+ width: 100% !important;
687
+ height: 100% !important;
688
+ max-height: 100% !important;
689
+ }
690
+
691
+ /* ========== Processing Overlay ========== */
692
+ .processing-overlay {
693
+ position: fixed;
694
+ top: 0;
695
+ left: 0;
696
+ width: 100%;
697
+ height: 100%;
698
+ background: rgba(15, 17, 21, 0.95);
699
+ backdrop-filter: blur(8px);
700
+ display: flex;
701
+ align-items: center;
702
+ justify-content: center;
703
+ z-index: 10000;
704
+ animation: fadeIn 0.3s ease-in-out;
705
+ }
706
+
707
+ .processing-overlay[hidden] {
708
+ display: none;
709
+ }
710
+
711
+ @keyframes fadeIn {
712
+ from {
713
+ opacity: 0;
714
+ }
715
+ to {
716
+ opacity: 1;
717
+ }
718
+ }
719
+
720
+ .processing-modal {
721
+ background: var(--bg-card);
722
+ border: 1px solid var(--border-color);
723
+ border-radius: 16px;
724
+ padding: 3rem 4rem;
725
+ box-shadow: var(--shadow-lg);
726
+ text-align: center;
727
+ min-width: 400px;
728
+ animation: slideUp 0.4s ease-out;
729
+ }
730
+
731
+ @keyframes slideUp {
732
+ from {
733
+ transform: translateY(30px);
734
+ opacity: 0;
735
+ }
736
+ to {
737
+ transform: translateY(0);
738
+ opacity: 1;
739
+ }
740
+ }
741
+
742
+ /* Spinner Animation */
743
+ .spinner {
744
+ width: 80px;
745
+ height: 80px;
746
+ margin: 0 auto 2rem;
747
+ border: 6px solid var(--border-color);
748
+ border-top: 6px solid var(--accent-primary);
749
+ border-radius: 50%;
750
+ animation: spin 1s linear infinite;
751
+ }
752
+
753
+ @keyframes spin {
754
+ 0% {
755
+ transform: rotate(0deg);
756
+ }
757
+ 100% {
758
+ transform: rotate(360deg);
759
+ }
760
+ }
761
+
762
+ #processing-title {
763
+ color: var(--text-primary);
764
+ font-size: 1.75rem;
765
+ font-weight: 600;
766
+ margin-bottom: 1rem;
767
+ }
768
+
769
+ #processing-message {
770
+ color: var(--text-secondary);
771
+ font-size: 1rem;
772
+ margin-bottom: 1.5rem;
773
+ min-height: 1.5rem;
774
+ }
775
+
776
+ /* Progress Bar */
777
+ .progress-bar {
778
+ width: 100%;
779
+ height: 8px;
780
+ background: var(--bg-secondary);
781
+ border-radius: 4px;
782
+ overflow: hidden;
783
+ margin-bottom: 1rem;
784
+ }
785
+
786
+ .progress-fill {
787
+ height: 100%;
788
+ background: linear-gradient(90deg, var(--accent-primary), var(--accent-hover));
789
+ border-radius: 4px;
790
+ transition: width 0.3s ease-out;
791
+ width: 0%;
792
+ box-shadow: 0 0 10px var(--accent-glow);
793
+ }
794
+
795
+ .processing-percent {
796
+ color: var(--text-muted);
797
+ font-size: 0.875rem;
798
+ font-weight: 500;
799
+ letter-spacing: 0.5px;
800
+ }
gemini_extractor.py ADDED
@@ -0,0 +1,612 @@
1
+ """
2
+ Gemini-based Knowledge Graph Extraction
3
+ Simple LLM-powered extraction using Google Gemini (cheapest option)
4
+ """
5
+ from typing import List, Dict, Any, Optional
6
+ from loguru import logger
7
+ from models import Chunk, CanonicalTriple, RelationType
8
+ from config import settings
9
+ import json
10
+ import asyncio
11
+
12
+
13
+ class GeminiExtractor:
14
+ """
15
+ Extract key nodes and relationships using Gemini LLM
16
+ Simple, cost-effective approach for knowledge graph generation
17
+ """
18
+
19
+ def __init__(self, llm_service=None):
20
+ """Initialize Gemini extractor"""
21
+ logger.info("Initializing GeminiExtractor")
22
+
23
+ # Import litellm for API calls
24
+ try:
25
+ import litellm
26
+ self.litellm = litellm
27
+
28
+ # Configure litellm for Gemini
29
+ self.model_name = f"gemini/{settings.gemini_model}"
30
+ self.api_key = settings.gemini_api_key
31
+
32
+ logger.info(f"✓ GeminiExtractor initialized with model: {self.model_name}")
33
+
34
+ except ImportError as e:
35
+ logger.error("litellm not installed. Install with: pip install litellm")
36
+ raise RuntimeError("litellm required for Gemini") from e
37
+
38
+ # Comprehensive list of generic terms to REJECT
39
+ self.generic_stopwords = {
40
+ # Generic nouns
41
+ 'system', 'systems', 'data', 'information', 'value', 'values',
42
+ 'method', 'methods', 'approach', 'approaches', 'technique', 'techniques',
43
+ 'result', 'results', 'study', 'studies', 'paper', 'papers',
44
+ 'section', 'sections', 'figure', 'figures', 'table', 'tables',
45
+ 'example', 'examples', 'case', 'cases', 'type', 'types',
46
+ 'way', 'ways', 'thing', 'things', 'part', 'parts',
47
+ 'model', 'models', 'framework', 'frameworks', # Too generic unless specific
48
+ 'process', 'processes', 'analysis', 'problem', 'problems',
49
+ 'solution', 'solutions', 'set', 'sets', 'group', 'groups',
50
+ 'element', 'elements', 'component', 'components',
51
+ 'feature', 'features', 'property', 'properties',
52
+ 'aspect', 'aspects', 'factor', 'factors', 'parameter', 'parameters',
53
+ 'concept', 'concepts', 'idea', 'ideas', 'theory', 'theories',
54
+ 'field', 'fields', 'area', 'areas', 'domain', 'domains',
55
+ 'task', 'tasks', 'goal', 'goals', 'objective', 'objectives',
56
+ 'input', 'inputs', 'output', 'outputs', 'function', 'functions',
57
+ 'operation', 'operations', 'step', 'steps', 'stage', 'stages',
58
+ 'phase', 'phases', 'level', 'levels', 'layer', 'layers',
59
+ 'number', 'numbers', 'amount', 'amounts', 'size', 'sizes',
60
+ 'performance', 'accuracy', 'quality', 'efficiency',
61
+ 'document', 'documents', 'text', 'texts', 'word', 'words',
62
+ 'sentence', 'sentences', 'paragraph', 'paragraphs',
63
+ 'item', 'items', 'object', 'objects', 'entity', 'entities',
64
+ 'relation', 'relations', 'relationship', 'relationships',
65
+
66
+ # Generic verbs/actions
67
+ 'use', 'uses', 'using', 'used', 'usage',
68
+ 'apply', 'applies', 'applying', 'applied', 'application', 'applications',
69
+ 'work', 'works', 'working', 'worked',
70
+ 'provide', 'provides', 'providing', 'provided',
71
+ 'show', 'shows', 'showing', 'shown',
72
+ 'present', 'presents', 'presenting', 'presented', 'presentation',
73
+
74
+ # Generic adjectives
75
+ 'new', 'novel', 'existing', 'current', 'previous',
76
+ 'different', 'similar', 'same', 'other', 'another',
77
+ 'various', 'several', 'multiple', 'single',
78
+ 'important', 'significant', 'main', 'key', 'major',
79
+ 'good', 'better', 'best', 'high', 'low',
80
+ 'large', 'small', 'big', 'little',
81
+
82
+ # Research-specific generic terms
83
+ 'experiment', 'experiments', 'evaluation', 'evaluations',
84
+ 'test', 'tests', 'testing', 'validation',
85
+ 'comparison', 'comparisons', 'benchmark', 'benchmarks',
86
+ 'baseline', 'baselines', 'metric', 'metrics',
87
+ 'dataset', 'datasets', 'corpus', 'corpora',
88
+
89
+ # Time/sequence terms
90
+ 'time', 'times', 'period', 'periods', 'year', 'years',
91
+ 'first', 'second', 'third', 'last', 'final',
92
+ 'next', 'previous', 'current', 'recent',
93
+
94
+ # Common prepositions/articles (shouldn't appear but just in case)
95
+ 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
96
+
97
+ # Additional generic ML/AI terms (too broad)
98
+ 'neural network', 'deep learning', 'machine learning',
99
+ 'training', 'testing', 'prediction', 'classification',
100
+ 'regression', 'clustering', 'optimization',
101
+ 'network', 'networks', 'algorithm', 'algorithms',
102
+ 'learning', 'training data', 'test data',
103
+ 'feature extraction', 'preprocessing',
104
+ 'hyperparameter', 'hyperparameters',
105
+ 'loss', 'error', 'gradient',
106
+ }
107
+
108
+ async def extract_from_chunks(
109
+ self,
110
+ chunks: List[Chunk],
111
+ use_llm: bool = True
112
+ ) -> List[CanonicalTriple]:
113
+ """
114
+ Extract knowledge graph - PER PAGE with HARD CAP of 2 concepts per page
115
+
116
+ Args:
117
+ chunks: List of text chunks
118
+ use_llm: Always True for Gemini extraction
119
+
120
+ Returns:
121
+ List of canonical triples
122
+ """
123
+ logger.info(f"\n{'='*80}")
124
+ logger.info(f"{'GEMINI PER-PAGE EXTRACTION - 2 CONCEPTS MAX PER PAGE':^80}")
125
+ logger.info(f"{'='*80}")
126
+
127
+ all_triples = []
128
+
129
+ # Filter text chunks
130
+ text_chunks = [c for c in chunks if c.type.value in ["paragraph", "code"]]
131
+
132
+ if not text_chunks:
133
+ logger.warning("No text chunks to process")
134
+ return []
135
+
136
+ # GROUP CHUNKS BY PAGE
137
+ from collections import defaultdict
138
+ chunks_by_page = defaultdict(list)
139
+ for chunk in text_chunks:
140
+ page_num = chunk.page_number or 0
141
+ chunks_by_page[page_num].append(chunk)
142
+
143
+ logger.info(f"Processing {len(chunks_by_page)} pages in PARALLEL")
144
+
145
+ # ⚡ PARALLEL PROCESSING: Create tasks for all pages
146
+ tasks = []
147
+ page_numbers = []
148
+ for page_num in sorted(chunks_by_page.keys()):
149
+ page_chunks = chunks_by_page[page_num]
150
+ combined_text = "\n\n".join([chunk.text for chunk in page_chunks])
151
+
152
+ logger.info(f"📄 PAGE {page_num}: {len(page_chunks)} chunks, {len(combined_text)} chars")
153
+
154
+ # Create async task for this page
155
+ tasks.append(self._extract_with_gemini(combined_text, page_num))
156
+ page_numbers.append(page_num)
157
+
158
+ # Execute all Gemini calls in parallel
159
+ logger.info(f"\n🚀 Launching {len(tasks)} parallel Gemini API calls...")
160
+ import time
161
+ start_time = time.time()
162
+
163
+ results = await asyncio.gather(*tasks, return_exceptions=True)
164
+
165
+ elapsed = time.time() - start_time
166
+ logger.info(f"✓ All {len(tasks)} Gemini calls completed in {elapsed:.2f}s (parallel)")
167
+ logger.info(f" Average: {elapsed/len(tasks):.2f}s per page (would be {elapsed*len(tasks):.2f}s sequential)")
168
+
169
+ # Process results
170
+ for page_num, page_triples in zip(page_numbers, results):
171
+ if isinstance(page_triples, Exception):
172
+ logger.error(f" ❌ Page {page_num} failed: {page_triples}")
173
+ continue
174
+
175
+ if page_triples:
176
+ all_triples.extend(page_triples)
177
+ logger.info(f" ✓ Page {page_num}: Extracted {len(page_triples)} triples")
178
+ for t in page_triples:
179
+ relation_value = t.relation.value if hasattr(t.relation, 'value') else t.relation
180
+ logger.info(f" → {t.subject_label} --[{relation_value}]--> {t.object_label}")
181
+ else:
182
+ logger.warning(f" ⚠️ Page {page_num}: NO TRIPLES EXTRACTED!")
183
+
184
+ # Summary
185
+ unique_concepts = set()
186
+ concepts_by_page = {}
187
+ for triple in all_triples:
188
+ unique_concepts.add(triple.subject_label)
189
+ unique_concepts.add(triple.object_label)
190
+ page = triple.page_number
191
+ if page not in concepts_by_page:
192
+ concepts_by_page[page] = set()
193
+ concepts_by_page[page].add(triple.subject_label)
194
+ concepts_by_page[page].add(triple.object_label)
195
+
196
+ logger.info(f"\n{'='*80}")
197
+ logger.info(f"{'EXTRACTION SUMMARY':^80}")
198
+ logger.info(f"{'='*80}")
199
+ logger.info(f"Pages processed: {len(chunks_by_page)}")
200
+ logger.info(f"Total triples: {len(all_triples)}")
201
+ logger.info(f"Unique concepts: {len(unique_concepts)} (max {len(chunks_by_page) * 2})")
202
+
203
+ if len(all_triples) == 0:
204
+ logger.error(f"\n❌❌❌ CRITICAL ERROR: ZERO TRIPLES EXTRACTED! ❌❌❌")
205
+ logger.error(f"This means:")
206
+ logger.error(f" - Either Gemini returned no concepts")
207
+ logger.error(f" - Or all concepts were rejected by filters")
208
+ logger.error(f" - Or there was an API error")
209
+ logger.error(f"Check the logs above for details!")
210
+ else:
211
+ logger.info(f"\nConcepts per page:")
212
+ for page in sorted(concepts_by_page.keys()):
213
+ logger.info(f" Page {page}: {list(concepts_by_page[page])}")
214
+
215
+ logger.info(f"{'='*80}\n")
216
+
217
+ return all_triples
218
+
219
+ async def _extract_with_gemini(self, text: str, page_number: int) -> List[CanonicalTriple]:
220
+ """
221
+ Call Gemini API to extract technical concepts (nodes) from THIS PAGE
222
+
223
+ Args:
224
+ text: Text from single page
225
+ page_number: Page number
226
+
227
+ Returns:
228
+ List of canonical triples
229
+ """
230
+ # Specialized technical concept extraction prompt
231
+ prompt = f"""You are an expert in technical information extraction and knowledge graph construction.
232
+ Your task is to identify only the most meaningful *technical concepts* from the given text.
233
+ Concepts must represent scientific, mathematical, algorithmic, or methodological entities
234
+ that could exist as standalone nodes in a knowledge graph.
235
+ Ignore generic words, section titles, variable names, and everyday terms.
236
+ Focus on high-value, domain-specific terminology relevant to the text.
237
+
238
+ Extract all important technical concepts from the following text that would form the
239
+ nodes of a knowledge graph.
240
+
241
+ ⚙️ Rules:
242
+ • Each concept should represent a self-contained technical idea, model, method, metric, loss, theorem, or process
243
+ • Keep only multi-word phrases when possible ("gradient descent", "convolutional neural network", "cross-entropy loss")
244
+ • Skip single, contextless nouns ("data", "model", "value", "equation", "result")
245
+ • Merge synonymous terms (e.g., "SGD", "stochastic gradient descent" → one entry)
246
+ • Do not include equations, numeric values, figure names, or symbols
247
+ • Do not repeat concepts
248
+ • Maintain consistent naming conventions (lowercase, hyphen-separated words)
249
+ • Extract MAXIMUM 4-5 concepts from this page (quality over quantity)
250
+
251
+ Return output strictly as JSON with "nodes" key:
252
+ {{
253
+ "nodes": [
254
+ "gradient descent",
255
+ "neural network",
256
+ "cross entropy loss"
257
+ ]
258
+ }}
259
+
260
+ PAGE {page_number} TEXT:
261
+ {text}
262
+
263
+ CRITICAL: Return ONLY the JSON. If no technical concepts found, return {{"nodes": []}}"""
264
+
265
+ logger.info(f" 🚀 Starting Gemini extraction for page {page_number}...")
266
+ logger.info(f" Text length: {len(text)} characters")
267
+
268
+ try:
269
+ # Call Gemini via litellm
270
+ logger.info(f" 📡 Calling Gemini API for page {page_number}...")
271
+
272
+ response = await asyncio.to_thread(
273
+ self.litellm.completion,
274
+ model=self.model_name,
275
+ api_key=self.api_key,
276
+ messages=[{
277
+ "role": "user",
278
+ "content": prompt
279
+ }],
280
+ temperature=0.0,
281
+ max_tokens=settings.llm_max_tokens,
282
+ timeout=settings.llm_timeout
283
+ )
284
+
285
+ # Extract response text
286
+ response_text = response.choices[0].message.content.strip()
287
+ logger.info(f" 📥 Gemini response ({len(response_text)} chars):")
288
+ logger.info(f" {response_text[:500]}")
289
+
290
+ # Strip markdown code fences if Gemini wrapped the JSON in them
291
+ if "```json" in response_text:
292
+ response_text = response_text.split("```json")[1].split("```")[0].strip()
293
+ elif "```" in response_text:
294
+ response_text = response_text.split("```")[1].split("```")[0].strip()
295
+
296
+ data = json.loads(response_text)
297
+
298
+
299
+ if isinstance(data, dict) and "nodes" in data:
300
+ nodes = data["nodes"]
301
+ elif isinstance(data, list):
302
+ # Fallback: if Gemini returned a list directly
303
+ nodes = data
304
+ else:
305
+ logger.warning(f" ❌ Gemini returned unexpected format: {type(data)}")
306
+ return []
307
+
308
+ if not isinstance(nodes, list):
309
+ logger.warning(f" ❌ Nodes is not a list, got: {type(nodes)}")
310
+ return []
311
+
312
+ logger.info(f" ✓ Gemini extracted {len(nodes)} nodes from page {page_number}")
313
+ logger.info(f" Raw nodes: {nodes}")
314
+
315
+ # Validate and filter nodes
316
+ valid_nodes = []
317
+ rejected_nodes = []
318
+
319
+ for node in nodes:
320
+ if not isinstance(node, str):
321
+ logger.warning(f" ⚠️ Skipping non-string node: {node}")
322
+ continue
323
+
324
+ node = node.strip()
325
+ if not node:
326
+ continue
327
+
328
+ logger.info(f" Validating node: '{node}'")
329
+
330
+ # FILTER: Validate node is a technical concept
331
+ if not self._is_technical_concept(node):
332
+ rejected_nodes.append(node)
333
+ logger.warning(f" ✗ REJECTED node '{node}' - not technical enough")
334
+ continue
335
+
336
+ logger.info(f" ✅ ACCEPTED node: '{node}'")
337
+ valid_nodes.append(node.lower())
338
+
339
+ # Summary of rejections
340
+ if rejected_nodes:
341
+ logger.warning(f" 📊 Rejected {len(rejected_nodes)} nodes: {rejected_nodes}")
342
+
343
+ if not valid_nodes:
344
+ logger.warning(f" ⚠️ ALL {len(nodes)} NODES REJECTED for page {page_number}")
345
+ logger.warning(f" No valid technical concepts found. Returning empty list.")
346
+ return []
347
+
348
+
349
+ selected_nodes = valid_nodes[:2]  # HARD CAP: keep at most 2 concepts per page
350
+ logger.info(f" 🎯 Selected {len(selected_nodes)} nodes (hard cap = 2): {selected_nodes}")
351
+
352
+
353
+ page_triples = []
354
+
355
+ if len(selected_nodes) == 1:
356
+ # Only one node - a relationship needs two concepts, so skip this page
357
+ logger.info(f" ℹ️ Only 1 node on page {page_number}, cannot create relationships")
358
+
359
+ return []
360
+
361
+ elif len(selected_nodes) == 2:
362
+ # Use LLM to determine actual relationship between nodes
363
+ node1, node2 = selected_nodes[0], selected_nodes[1]
364
+
365
+ # Extract relationship using LLM with page context
366
+ logger.info(f" 🔍 Extracting relationship between: {node1} ↔ {node2}")
367
+ relationship_triple = await self._extract_relationship_with_gemini(
368
+ text=text,
369
+ node1=node1,
370
+ node2=node2,
371
+ page_number=page_number
372
+ )
373
+
374
+ if relationship_triple:
375
+ page_triples.append(relationship_triple)
376
+ logger.info(f" ✅ Created directed edge:")
377
+ logger.info(f" → {relationship_triple.subject_label} --[{relationship_triple.relation.value}]--> {relationship_triple.object_label}")
378
+ logger.info(f" Justification: {relationship_triple.justification}")
379
+ else:
380
+ logger.warning(f" ⚠️ Could not extract relationship for {node1} ↔ {node2}")
381
+
382
+ logger.info(f" ✅ Returning {len(page_triples)} triples for page {page_number}")
383
+ return page_triples
384
+
385
+ except json.JSONDecodeError as e:
386
+ logger.error(f" ❌ JSON PARSE ERROR for page {page_number}: {e}")
387
+ logger.error(f" Response was: {response_text[:500]}")
388
+ return []
389
+
390
+ except Exception as e:
391
+ logger.error(f" ❌ GEMINI API FAILED for page {page_number}: {e}")
392
+ logger.error(f" Exception type: {type(e).__name__}")
393
+ logger.error(f" Full trace:", exc_info=True)
394
+ return []
395
+
396
+ async def _extract_relationship_with_gemini(self, text: str, node1: str, node2: str, page_number: int) -> Optional[CanonicalTriple]:
397
+ """
398
+ Use Gemini to determine the actual relationship between two nodes based on page context
399
+
400
+ Args:
401
+ text: Full page text for context
402
+ node1: First node/concept
403
+ node2: Second node/concept
404
+ page_number: Page number
405
+
406
+ Returns:
407
+ CanonicalTriple with proper relationship, or None if extraction fails
408
+ """
409
+ # List all available relation types for the LLM
410
+ available_relations = [r.value for r in RelationType]
411
+
412
+ prompt = f"""You are an expert at extracting knowledge graph relationships from technical text.
413
+
414
+ Given two concepts and the text they appear in, determine the most accurate relationship between them.
415
+
416
+ **Concepts:**
417
+ - Concept A: "{node1}"
418
+ - Concept B: "{node2}"
419
+
420
+ **Context (page {page_number}):**
421
+ {text[:3000]}
422
+
423
+ **Available Relationship Types:**
424
+ {', '.join(available_relations)}
425
+
426
+ **Instructions:**
427
+ 1. Analyze how these two concepts relate in the given context
428
+ 2. Choose the MOST SPECIFIC relationship type from the list above
429
+ 3. Determine the direction: which concept is the subject and which is the object
430
+ 4. Provide a brief justification from the text
431
+
432
+ **Output Format (JSON):**
433
+ {{
434
+ "subject": "<node1 or node2>",
435
+ "object": "<node1 or node2>",
436
+ "relation": "<one of the available relationship types>",
437
+ "confidence": <0.0-1.0>,
438
+ "justification": "<brief explanation from text>"
439
+ }}
440
+
441
+ **Rules:**
442
+ - Use the exact concept names provided
443
+ - Choose only ONE relation type from the available list
444
+ - If no clear relationship exists, use "related_to"
445
+ - Direction matters: subject performs/has the relation to the object
446
+ """
447
+
448
+ try:
449
+ # Call Gemini API
450
+ response_text = await self.litellm.acompletion(
451
+ model=self.model_name,
452
+ messages=[
453
+ {"role": "system", "content": "You are an expert at knowledge graph relationship extraction. Always output valid JSON."},
454
+ {"role": "user", "content": prompt}
455
+ ],
456
+ api_key=self.api_key,
457
+ temperature=0.1, # Low temperature for consistent relationship extraction
458
+ response_format={"type": "json_object"}
459
+ )
460
+
461
+ response_content = response_text.choices[0].message.content
462
+ data = json.loads(response_content)
463
+
464
+ # Validate response
465
+ subject = data.get("subject", "").strip()
466
+ obj = data.get("object", "").strip()
467
+ relation_str = data.get("relation", "related_to").lower().strip().replace(" ", "_")
468
+ confidence = float(data.get("confidence", 0.7))
469
+ justification = data.get("justification", f"Relationship extracted from page {page_number}")
470
+
471
+ # Map relation string to enum
472
+ try:
473
+ relation = RelationType(relation_str)
474
+ except ValueError:
475
+ logger.warning(f" ⚠️ Invalid relation '{relation_str}', defaulting to RELATED_TO")
476
+ relation = RelationType.RELATED_TO
477
+
478
+ # Create triple
479
+ triple = CanonicalTriple(
480
+ subject_label=subject,
481
+ object_label=obj,
482
+ relation=relation,
483
+ confidence=confidence,
484
+ justification=justification,
485
+ page_number=page_number
486
+ )
487
+
488
+ return triple
489
+
490
+ except json.JSONDecodeError as e:
491
+ logger.error(f" ❌ JSON parse error in relationship extraction: {e}")
492
+ return None
493
+ except Exception as e:
494
+ logger.error(f" ❌ Relationship extraction failed: {e}")
495
+ return None
496
+
497
+ def _is_technical_concept(self, concept: str) -> bool:
498
+ """
499
+ Validate that a concept is a specific technical term (reject generic words).
500
+ Args:
501
+ concept: Concept string to validate
502
+
503
+ Returns:
504
+ True if highly technical/specific, False otherwise
505
+ """
506
+ concept_lower = concept.lower().strip()
507
+
508
+ # RULE 1: Reject if in stopwords
509
+ if concept_lower in self.generic_stopwords:
510
+ logger.debug(f"Rejected '{concept}' - in stopword list")
511
+ return False
512
+
513
+ # RULE 2: Reject if any word is a generic stopword (stricter)
514
+ words = concept_lower.split()
515
+ for word in words:
516
+ if word in self.generic_stopwords:
517
+ # Allow if it's part of a specific multi-word technical term
518
+ # e.g., "convolutional neural network" has "network" but is specific
519
+ if len(words) < 2:
520
+ logger.debug(f"Rejected '{concept}' - contains generic word '{word}'")
521
+ return False
522
+
523
+ # RULE 3: Single-word concepts must have SOME specificity (RELAXED)
524
+ if len(words) == 1:
525
+ # Accept if ANY of these are true:
526
+ # - Has uppercase (BERT, Adam, PyTorch)
527
+ # - Has numbers (VGG16, GPT3)
528
+ # - Has special chars (t-SNE, bi-LSTM)
529
+ # - Longish word (8+ chars like "backpropagation")
530
+ has_uppercase = any(c.isupper() for c in concept)
531
+ has_numbers = any(c.isdigit() for c in concept)
532
+ has_special = '-' in concept or '_' in concept
533
+ is_longish = len(concept) >= 8 # RELAXED from 10
534
+
535
+ if not (has_uppercase or has_numbers or has_special or is_longish):
536
+ logger.debug(f"Rejected '{concept}' - single word not specific enough")
537
+ return False
538
+
539
+ # RULE 4: Multi-word phrases - very lenient
540
+ if len(words) >= 2:
541
+ # Just check that it's not ALL generic words
542
+ # At least one word should be non-generic or have caps/numbers
543
+ has_caps = any(c.isupper() for c in concept)
544
+ has_numbers = any(c.isdigit() for c in concept)
545
+ has_hyphen = '-' in concept
546
+
547
+ # Count non-generic words
548
+ non_generic_count = sum(1 for w in words if w not in self.generic_stopwords)
549
+
550
+ # Accept if ANY of these:
551
+ # - Has caps/numbers/hyphen
552
+ # - At least one word is non-generic
553
+ # - 3+ words (likely specific enough)
554
+ if not (has_caps or has_numbers or has_hyphen or non_generic_count > 0 or len(words) >= 3):
555
+ logger.debug(f"Rejected '{concept}' - multi-word phrase too generic")
556
+ return False
557
+
558
+ # RULE 5: Reject very short terms (1-2 chars) unless they're known acronyms (all caps)
559
+ if len(concept) <= 2 and concept.upper() != concept:
560
+ logger.debug(f"Rejected '{concept}' - too short")
561
+ return False
562
+
563
+ # RULE 6: Must contain at least one alphanumeric character
564
+ if not any(c.isalnum() for c in concept):
565
+ logger.debug(f"Rejected '{concept}' - no alphanumeric chars")
566
+ return False
567
+
568
+ # RULE 7: Reject if it's just a generic category with a modifier
569
+ # e.g., "new algorithm", "proposed method", "our model"
570
+ generic_patterns = [
571
+ 'new ', 'novel ', 'proposed ', 'our ', 'this ', 'that ',
572
+ 'these ', 'those ', 'such ', 'other ', 'another ',
573
+ 'existing ', 'current ', 'previous ', 'standard '
574
+ ]
575
+ for pattern in generic_patterns:
576
+ if concept_lower.startswith(pattern):
577
+ logger.debug(f"Rejected '{concept}' - generic pattern")
578
+ return False
579
+
580
+ # Passed all strict filters
581
+ return True
582
+
583
+ def _map_relation(self, relation_str: str) -> RelationType:
584
+ """Map relation string to RelationType enum"""
585
+ relation_lower = relation_str.lower().strip()
586
+
587
+ # Direct mapping
588
+ mapping = {
589
+ "uses": RelationType.USES,
590
+ "implements": RelationType.IMPLEMENTS,
591
+ "is_a": RelationType.IS_A,
592
+ "is a": RelationType.IS_A,
593
+ "part_of": RelationType.PART_OF,
594
+ "part of": RelationType.PART_OF,
595
+ "requires": RelationType.REQUIRES,
596
+ "produces": RelationType.PRODUCES,
597
+ "enables": RelationType.ENABLES,
598
+ "improves": RelationType.IMPROVES,
599
+ "enhances": RelationType.ENHANCES,
600
+ "contains": RelationType.CONTAINS,
601
+ "depends_on": RelationType.DEPENDS_ON,
602
+ "depends on": RelationType.DEPENDS_ON,
603
+ "related_to": RelationType.RELATED_TO,
604
+ "related to": RelationType.RELATED_TO,
605
+ }
606
+
607
+ if relation_lower in mapping:
608
+ return mapping[relation_lower]
609
+
610
+ # Fallback
611
+ logger.debug(f"Unknown relation '{relation_str}', using 'related_to'")
612
+ return RelationType.RELATED_TO
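A minimal usage sketch for GeminiExtractor (not part of the uploaded files): it assumes Chunk objects produced by the ingestion pipeline elsewhere in this repo and a GEMINI_API_KEY configured in settings; the run_extraction helper below is illustrative only.

import asyncio
from gemini_extractor import GeminiExtractor

async def run_extraction(chunks):
    # One Gemini call per page, launched in parallel by extract_from_chunks;
    # after the technical-concept filter, at most one triple survives per page.
    extractor = GeminiExtractor()
    triples = await extractor.extract_from_chunks(chunks)
    for t in triples:
        relation = t.relation.value if hasattr(t.relation, "value") else t.relation
        print(f"{t.subject_label} --[{relation}]--> {t.object_label} (page {t.page_number})")
    return triples

# triples = asyncio.run(run_extraction(chunks))  # 'chunks' come from the PDF ingester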
graph_builder.py ADDED
@@ -0,0 +1,268 @@
1
+ """
2
+ Graph Builder - constructs knowledge graph from canonical triples
3
+ Handles entity canonicalization, node/edge creation, and graph pruning
4
+ """
5
+ from typing import List, Dict, Any, Set, Tuple
6
+ from loguru import logger
7
+ from models import CanonicalTriple, GraphNode, GraphEdge, SupportingChunk, NodeType
8
+ from graph_store import GraphStore
9
+ from embedding_service import EmbeddingService
10
+ from config import settings
11
+ import numpy as np
12
+ from collections import defaultdict
13
+
14
+
15
+ class GraphBuilder:
16
+ """
17
+ Builds and refines knowledge graph from canonical triples
18
+ Implements entity canonicalization, deduplication, and pruning
19
+ """
20
+
21
+ def __init__(self, graph_store: GraphStore, embedding_service: EmbeddingService):
22
+ self.graph_store = graph_store
23
+ self.embedding_service = embedding_service
24
+ self.entity_embeddings: Dict[str, np.ndarray] = {}
25
+
26
+ async def build_graph(self, triples: List[CanonicalTriple]) -> Tuple[int, int]:
27
+ """
28
+ Build graph from canonical triples
29
+
30
+ Args:
31
+ triples: List of canonical triples
32
+
33
+ Returns:
34
+ Tuple of (num_nodes_added, num_edges_added)
35
+ """
36
+ logger.info(f"Building graph from {len(triples)} triples")
37
+
38
+ # Step 1: Entity canonicalization - merge similar entities
39
+ entity_map = await self._canonicalize_entities(triples)
40
+
41
+ # Step 2: Create nodes
42
+ nodes_created = 0
43
+ logger.info(f"Creating nodes from {len(entity_map)} canonical entities")
44
+
45
+ for entity_label in entity_map.keys():
46
+ node = await self._create_node(entity_label, entity_map, triples)
47
+ if self.graph_store.add_node(node):
48
+ nodes_created += 1
49
+ logger.debug(f"Created node: {node.label} (type: {node.type.value})")
50
+
51
+ logger.info(f"✓ Successfully created {nodes_created} nodes")
52
+
53
+ # Step 3: Create edges
54
+ edges_created = 0
55
+ for triple in triples:
56
+ # Map to canonical entities
57
+ canonical_subject = entity_map.get(triple.subject_label, triple.subject_label)
58
+ canonical_object = entity_map.get(triple.object_label, triple.object_label)
59
+
60
+ # Skip self-loops
61
+ if canonical_subject == canonical_object:
62
+ continue
63
+
64
+ # Get node IDs
65
+ subject_node = self.graph_store.get_node_by_label(canonical_subject)
66
+ object_node = self.graph_store.get_node_by_label(canonical_object)
67
+
68
+ if not subject_node or not object_node:
69
+ continue
70
+
71
+ # Create edge
72
+ edge = self._create_edge(subject_node, object_node, triple)
73
+ if self.graph_store.add_edge(edge):
74
+ edges_created += 1
75
+
76
+ logger.info(f"Created {nodes_created} nodes and {edges_created} edges")
77
+
78
+ # Step 4: Compute importance scores
79
+ self._compute_importance_scores()
80
+
81
+ # Step 5: Prune low-importance nodes and edges
82
+ pruned_nodes, pruned_edges = self._prune_graph()
83
+
84
+ logger.info(f"Pruned {pruned_nodes} nodes and {pruned_edges} edges")
85
+ logger.info(f"Final graph: {nodes_created - pruned_nodes} nodes, {edges_created - pruned_edges} edges")
86
+
87
+ return nodes_created - pruned_nodes, edges_created - pruned_edges
88
+
89
+ async def _canonicalize_entities(self, triples: List[CanonicalTriple]) -> Dict[str, str]:
90
+ """
91
+ ⚡ OPTIMIZATION: Skip expensive canonicalization (identity mapping)
92
+
93
+ With 2 nodes per page hard cap and strict technical filtering,
94
+ we have very few duplicates and highly specific entities.
95
+ Embedding computation + O(n²) similarity checks not worth the cost.
96
+
97
+ Args:
98
+ triples: List of triples
99
+
100
+ Returns:
101
+ Dict mapping entity_label -> canonical_label (identity map)
102
+ """
103
+ # Collect all unique entities
104
+ entities = set()
105
+ for triple in triples:
106
+ entities.add(triple.subject_label)
107
+ entities.add(triple.object_label)
108
+
109
+ # DETERMINISTIC: Sort entities for consistent ordering across runs
110
+ entities_list = sorted(list(entities))
111
+ logger.info(f"⚡ FAST MODE: Skipping entity canonicalization for {len(entities_list)} unique entities")
112
+ logger.info(f"Each entity maps to itself (no merging)")
113
+
114
+ # Return identity mapping - each entity maps to itself
115
+ entity_map = {entity: entity for entity in entities_list}
116
+
117
+ logger.info(f"✓ Identity mapping created (0 merges, {len(entities_list)} canonical entities)")
118
+
119
+ return entity_map
120
+
121
+ def _entity_to_text(self, entity: str) -> str:
122
+ """Convert entity label to text for embedding"""
123
+ # Simple approach: use the label as-is
124
+ return entity
125
+
126
+ async def _create_node(
127
+ self,
128
+ label: str,
129
+ entity_map: Dict[str, str],
130
+ triples: List[CanonicalTriple]
131
+ ) -> GraphNode:
132
+ """
133
+ Create a graph node for an entity
134
+
135
+ Args:
136
+ label: Canonical entity label
137
+ entity_map: Entity canonicalization map
138
+ triples: All triples (to find supporting chunks)
139
+
140
+ Returns:
141
+ GraphNode
142
+ """
143
+ # Find all triples mentioning this entity
144
+ supporting_chunks = []
145
+ aliases = []
146
+
147
+ for original_label, canonical_label in entity_map.items():
148
+ if canonical_label == label:
149
+ if original_label != label:
150
+ aliases.append(original_label)
151
+
152
+ # Collect supporting chunks from triples
153
+ chunk_scores = defaultdict(float)
154
+ for triple in triples:
155
+ canonical_subject = entity_map.get(triple.subject_label, triple.subject_label)
156
+ canonical_object = entity_map.get(triple.object_label, triple.object_label)
157
+
158
+ if canonical_subject == label or canonical_object == label:
159
+ # This triple supports the node
160
+ chunk_key = (triple.page_number, triple.justification[:100]) # Use justification as proxy
161
+ chunk_scores[chunk_key] += triple.confidence
162
+
163
+ # Convert to SupportingChunk objects
164
+ for (page_number, snippet), score in chunk_scores.items():
165
+ supporting_chunks.append(SupportingChunk(
166
+ chunk_id=f"page_{page_number}", # Placeholder
167
+ score=score,
168
+ page_number=page_number,
169
+ snippet=snippet
170
+ ))
171
+
172
+ # DETERMINISTIC: Sort by score (desc) then page_number (asc) for stable ordering
173
+ supporting_chunks.sort(key=lambda x: (-x.score, x.page_number))
174
+ supporting_chunks = supporting_chunks[:10]
175
+
176
+ # Infer node type (simple heuristic)
177
+ node_type = self._infer_node_type(label)
178
+
179
+ node = GraphNode(
180
+ label=label,
181
+ type=node_type,
182
+ aliases=aliases,
183
+ supporting_chunks=supporting_chunks,
184
+ importance_score=0.0 # Will be computed later
185
+ )
186
+
187
+ return node
188
+
189
+ def _infer_node_type(self, label: str) -> NodeType:
190
+ """Infer node type from label (simple heuristics)"""
191
+ label_lower = label.lower()
192
+
193
+ # Check for common patterns
194
+ if any(word in label_lower for word in ["function", "method", "algorithm"]):
195
+ return NodeType.FUNCTION
196
+ elif any(word in label_lower for word in ["class", "type", "struct"]):
197
+ return NodeType.CLASS
198
+ elif label[0].isupper() and " " not in label: # Capitalized single word
199
+ return NodeType.PERSON
200
+ elif any(word in label_lower for word in ["definition", "term", "concept"]):
201
+ return NodeType.TERM
202
+ else:
203
+ return NodeType.CONCEPT
204
+
205
+ def _create_edge(
206
+ self,
207
+ from_node: GraphNode,
208
+ to_node: GraphNode,
209
+ triple: CanonicalTriple
210
+ ) -> GraphEdge:
211
+ """Create a graph edge from a triple"""
212
+ supporting_chunk = SupportingChunk(
213
+ chunk_id=f"page_{triple.page_number}",
214
+ score=triple.confidence,
215
+ page_number=triple.page_number,
216
+ snippet=triple.justification
217
+ )
218
+
219
+ edge = GraphEdge(
220
+ from_node=from_node.node_id,
221
+ to_node=to_node.node_id,
222
+ relation=triple.relation,
223
+ confidence=triple.confidence,
224
+ supporting_chunks=[supporting_chunk]
225
+ )
226
+
227
+ return edge
228
+
229
+ def _compute_importance_scores(self):
230
+ """
231
+ ⚡ OPTIMIZATION: Simplified importance scoring (skip expensive PageRank)
232
+
233
+ Since we're not pruning, we only need basic scores for display purposes.
234
+ """
235
+ logger.info("⚡ FAST MODE: Computing simplified importance scores (no PageRank)")
236
+
237
+ # Update node importance with simple metric (just degree centrality)
238
+ for node in self.graph_store.get_all_nodes():
239
+ # Simple importance = number of connections (fast to compute)
240
+ num_neighbors = len(self.graph_store.get_neighbors(node.node_id))
241
+
242
+ # Normalize to 0-1 range (assume max 10 connections)
243
+ importance = min(num_neighbors / 10.0, 1.0)
244
+
245
+ node.importance_score = importance
246
+
247
+ # Update in store (for NetworkX)
248
+ if not self.graph_store.use_neo4j:
249
+ self.graph_store.nodes_dict[node.node_id] = node
250
+
251
+ logger.info(f"✓ Importance scores computed (based on degree centrality only)")
252
+
253
+ def _prune_graph(self) -> Tuple[int, int]:
254
+ """
255
+ ⚡ OPTIMIZATION: Skip pruning (we already filter at extraction)
256
+
257
+ Pruning is expensive (PageRank + multiple graph traversals).
258
+ With strict filtering at extraction (technical concepts only, 2 per page),
259
+ we don't need additional pruning.
260
+
261
+ Returns:
262
+ Tuple of (nodes_removed, edges_removed) - always (0, 0)
263
+ """
264
+ logger.info(f"⚡ FAST MODE: Skipping graph pruning")
265
+ logger.info(f"Nodes already filtered at extraction with strict technical validation")
266
+ logger.info(f"Final graph: {len(self.graph_store.get_all_nodes())} nodes, {len(self.graph_store.get_all_edges())} edges")
267
+
268
+ return 0, 0
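A minimal wiring sketch for GraphBuilder (not part of the uploaded files); it assumes EmbeddingService, defined elsewhere in this commit, can be constructed with its defaults, and that triples come from the extractor above.

import asyncio
from embedding_service import EmbeddingService
from graph_builder import GraphBuilder
from graph_store import GraphStore

async def build(triples):
    # NetworkX backend; in fast mode canonicalization and pruning are no-ops,
    # so the final graph mirrors the extractor's triples one-to-one.
    store = GraphStore(use_neo4j=False)
    builder = GraphBuilder(store, EmbeddingService())
    nodes, edges = await builder.build_graph(triples)
    print(f"graph built: {nodes} nodes, {edges} edges")
    return store

# store = asyncio.run(build(triples))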
graph_store.py ADDED
@@ -0,0 +1,347 @@
1
+ """
2
+ Knowledge Graph Store
3
+ Manages nodes, edges, and graph operations
4
+ Supports both NetworkX (local) and Neo4j (production)
5
+ """
6
+ import networkx as nx
7
+ from neo4j import GraphDatabase
8
+ from typing import List, Dict, Any, Optional, Tuple, Set
9
+ from loguru import logger
10
+ from models import GraphNode, GraphEdge, CanonicalTriple, SupportingChunk, NodeType, RelationType
11
+ from config import settings
12
+ import json
13
+ import pickle
14
+ from collections import defaultdict
15
+ from embedding_service import EmbeddingService
16
+
17
+
18
+ class GraphStore:
19
+ """
20
+ Manages the knowledge graph with nodes and edges
21
+ Supports multiple backends: NetworkX (default) or Neo4j
22
+ """
23
+
24
+ def __init__(self, use_neo4j: bool = False, embedding_service: Optional[EmbeddingService] = None):
25
+ self.use_neo4j = use_neo4j
26
+ self.embedding_service = embedding_service
27
+
28
+ if use_neo4j:
29
+ self._init_neo4j()
30
+ else:
31
+ self.graph = nx.MultiGraph() # Undirected graph (no arrows)
32
+ self.nodes_dict: Dict[str, GraphNode] = {} # node_id -> GraphNode
33
+ self.edges_dict: Dict[str, GraphEdge] = {} # edge_id -> GraphEdge
34
+
35
+ logger.info(f"Initialized GraphStore (backend: {'Neo4j' if use_neo4j else 'NetworkX'}, undirected graph)")
36
+
37
+ def _init_neo4j(self):
38
+ """Initialize Neo4j connection"""
39
+ try:
40
+ self.driver = GraphDatabase.driver(
41
+ settings.neo4j_uri,
42
+ auth=(settings.neo4j_user, settings.neo4j_password)
43
+ )
44
+ # Test connection
45
+ with self.driver.session() as session:
46
+ session.run("RETURN 1")
47
+ logger.info("Connected to Neo4j successfully")
48
+ except Exception as e:
49
+ logger.error(f"Failed to connect to Neo4j: {e}")
50
+ logger.info("Falling back to NetworkX (undirected)")
51
+ self.use_neo4j = False
52
+ self.graph = nx.MultiGraph() # Undirected graph
53
+ self.nodes_dict = {}
54
+ self.edges_dict = {}
55
+
56
+ def add_node(self, node: GraphNode) -> bool:
57
+ """
58
+ Add a node to the graph
59
+
60
+ Args:
61
+ node: GraphNode to add
62
+
63
+ Returns:
64
+ True if added, False if already exists
65
+ """
66
+ if self.use_neo4j:
67
+ return self._add_node_neo4j(node)
68
+ else:
69
+ if node.node_id in self.nodes_dict:
70
+ return False
71
+
72
+ self.nodes_dict[node.node_id] = node
73
+ # Handle both enum and string for type field
74
+ node_type = node.type.value if hasattr(node.type, 'value') else node.type
75
+ self.graph.add_node(
76
+ node.node_id,
77
+ label=node.label,
78
+ type=node_type,
79
+ importance=node.importance_score
80
+ )
81
+ return True
82
+
83
+ def add_edge(self, edge: GraphEdge) -> bool:
84
+ """
85
+ Add an edge to the graph
86
+
87
+ Args:
88
+ edge: GraphEdge to add
89
+
90
+ Returns:
91
+ True if added successfully
92
+ """
93
+ if self.use_neo4j:
94
+ return self._add_edge_neo4j(edge)
95
+ else:
96
+ self.edges_dict[edge.edge_id] = edge
97
+ # Handle both enum and string for relation field
98
+ relation_value = edge.relation.value if hasattr(edge.relation, 'value') else edge.relation
99
+ self.graph.add_edge(
100
+ edge.from_node,
101
+ edge.to_node,
102
+ key=edge.edge_id,
103
+ relation=relation_value,
104
+ confidence=edge.confidence
105
+ )
106
+ return True
107
+
108
+ def get_node(self, node_id: str) -> Optional[GraphNode]:
109
+ """Get node by ID"""
110
+ if self.use_neo4j:
111
+ return self._get_node_neo4j(node_id)
112
+ else:
113
+ return self.nodes_dict.get(node_id)
114
+
115
+ def update_node(self, node: GraphNode) -> bool:
116
+ """
117
+ Update an existing node in the graph
118
+
119
+ Args:
120
+ node: GraphNode with updated data
121
+
122
+ Returns:
123
+ True if updated successfully, False if node doesn't exist
124
+ """
125
+ if node.node_id not in self.nodes_dict:
126
+ return False
127
+
128
+ # Update in dictionary
129
+ self.nodes_dict[node.node_id] = node
130
+
131
+ # Update NetworkX graph attributes
132
+ if node.node_id in self.graph:
133
+ node_type = node.type.value if hasattr(node.type, 'value') else node.type
134
+ self.graph.nodes[node.node_id]['label'] = node.label
135
+ self.graph.nodes[node.node_id]['type'] = node_type
136
+ self.graph.nodes[node.node_id]['importance'] = node.importance_score
137
+
138
+ return True
139
+
140
+ def get_node_by_label(self, label: str) -> Optional[GraphNode]:
141
+ """Get node by label (case-insensitive)"""
142
+ label_lower = label.lower()
143
+ for node in self.nodes_dict.values():
144
+ if node.label.lower() == label_lower or label_lower in [a.lower() for a in node.aliases]:
145
+ return node
146
+ return None
147
+
148
+ def get_neighbors(self, node_id: str) -> List[Tuple[GraphNode, GraphEdge]]:
149
+ """
150
+ Get neighboring nodes and connecting edges (undirected graph)
151
+
152
+ Args:
153
+ node_id: Node to get neighbors for
154
+
155
+ Returns:
156
+ List of (neighbor_node, edge) tuples
157
+ """
158
+ if self.use_neo4j:
159
+ return self._get_neighbors_neo4j(node_id)
160
+ else:
161
+ neighbors = []
162
+ # For undirected graph, just get all neighbors
163
+ for neighbor_id in self.graph.neighbors(node_id):
164
+ edges = self.graph.get_edge_data(node_id, neighbor_id)
165
+ if edges:
166
+ for edge_key, edge_data in edges.items():
167
+ edge = self.edges_dict.get(edge_key)
168
+ neighbor_node = self.nodes_dict.get(neighbor_id)
169
+ if edge and neighbor_node:
170
+ neighbors.append((neighbor_node, edge))
171
+
172
+ return neighbors
173
+
174
+ def get_all_nodes(self) -> List[GraphNode]:
175
+ """Get all nodes in graph"""
176
+ if self.use_neo4j:
177
+ return self._get_all_nodes_neo4j()
178
+ else:
179
+ return list(self.nodes_dict.values())
180
+
181
+ def get_all_edges(self) -> List[GraphEdge]:
182
+ """Get all edges in graph"""
183
+ if self.use_neo4j:
184
+ return self._get_all_edges_neo4j()
185
+ else:
186
+ return list(self.edges_dict.values())
187
+
188
+ def remove_node(self, node_id: str):
189
+ """Remove node and its edges"""
190
+ if self.use_neo4j:
191
+ self._remove_node_neo4j(node_id)
192
+ else:
193
+ if node_id in self.nodes_dict:
194
+ del self.nodes_dict[node_id]
195
+ self.graph.remove_node(node_id)
196
+
197
+ def remove_edge(self, edge_id: str):
198
+ """Remove edge"""
199
+ if self.use_neo4j:
200
+ self._remove_edge_neo4j(edge_id)
201
+ else:
202
+ if edge_id in self.edges_dict:
203
+ edge = self.edges_dict[edge_id]
204
+ del self.edges_dict[edge_id]
205
+ if self.graph.has_edge(edge.from_node, edge.to_node, key=edge_id):
206
+ self.graph.remove_edge(edge.from_node, edge.to_node, key=edge_id)
207
+
208
+ def compute_centrality(self) -> Dict[str, float]:
209
+ """
210
+ Compute node centrality scores (degree centrality for undirected graph)
211
+
212
+ Returns:
213
+ Dict mapping node_id -> centrality score
214
+ """
215
+ if self.use_neo4j:
216
+ # Use Neo4j's centrality algorithm
217
+ return self._compute_centrality_neo4j()
218
+ else:
219
+ try:
220
+ # Use degree centrality for undirected graph (simpler and faster)
221
+ centrality = nx.degree_centrality(self.graph)
222
+ return centrality
223
+ except Exception as e:
224
+ logger.error(f"Failed to compute centrality: {e}")
225
+ return {}
226
+
227
+ def save(self, filepath: str):
228
+ """Save graph to file (NetworkX only)"""
229
+ if self.use_neo4j:
230
+ logger.info("Neo4j graphs are persisted automatically")
231
+ return
232
+
233
+ data = {
234
+ "nodes": [node.dict() for node in self.nodes_dict.values()],
235
+ "edges": [edge.dict() for edge in self.edges_dict.values()],
236
+ }
237
+
238
+ with open(filepath, 'wb') as f:
239
+ pickle.dump(data, f)
240
+
241
+ logger.info(f"Saved graph with {len(self.nodes_dict)} nodes and {len(self.edges_dict)} edges to {filepath}")
242
+
243
+ def load(self, filepath: str):
244
+ """Load graph from file (NetworkX only)"""
245
+ if self.use_neo4j:
246
+ logger.warning("Cannot load into Neo4j from file")
247
+ return
248
+
249
+ with open(filepath, 'rb') as f:
250
+ data = pickle.load(f)
251
+
252
+ # Reconstruct nodes
253
+ for node_data in data["nodes"]:
254
+ node = GraphNode(**node_data)
255
+ self.add_node(node)
256
+
257
+ # Reconstruct edges
258
+ for edge_data in data["edges"]:
259
+ edge = GraphEdge(**edge_data)
260
+ self.add_edge(edge)
261
+
262
+ logger.info(f"Loaded graph with {len(self.nodes_dict)} nodes and {len(self.edges_dict)} edges")
263
+
264
+ def clear(self):
265
+ """Clear all nodes and edges"""
266
+ if self.use_neo4j:
267
+ self._clear_neo4j()
268
+ else:
269
+ self.graph.clear()
270
+ self.nodes_dict.clear()
271
+ self.edges_dict.clear()
272
+
273
+ # Neo4j implementations (placeholders - implement as needed)
274
+
275
+ def _add_node_neo4j(self, node: GraphNode) -> bool:
276
+ """Add node to Neo4j"""
277
+ with self.driver.session() as session:
278
+ # Handle both enum and string for type field
279
+ node_type = node.type.value if hasattr(node.type, 'value') else node.type
280
+ result = session.run(
281
+ """
282
+ MERGE (n:Entity {node_id: $node_id})
283
+ ON CREATE SET n.label = $label, n.type = $type,
284
+ n.importance = $importance, n.created_at = datetime()
285
+ RETURN n
286
+ """,
287
+ node_id=node.node_id,
288
+ label=node.label,
289
+ type=node_type,
290
+ importance=node.importance_score
291
+ )
292
+ return result.single() is not None
293
+
294
+ def _add_edge_neo4j(self, edge: GraphEdge) -> bool:
295
+ """Add edge to Neo4j"""
296
+ with self.driver.session() as session:
297
+ # Handle both enum and string for relation field
298
+ relation_value = edge.relation.value if hasattr(edge.relation, 'value') else edge.relation
299
+ session.run(
300
+ """
301
+ MATCH (a:Entity {node_id: $from_node})
302
+ MATCH (b:Entity {node_id: $to_node})
303
+ CREATE (a)-[r:RELATES {edge_id: $edge_id, relation: $relation,
304
+ confidence: $confidence}]->(b)
305
+ """,
306
+ from_node=edge.from_node,
307
+ to_node=edge.to_node,
308
+ edge_id=edge.edge_id,
309
+ relation=relation_value,
310
+ confidence=edge.confidence
311
+ )
312
+ return True
313
+
314
+ def _get_node_neo4j(self, node_id: str) -> Optional[GraphNode]:
315
+ """Get node from Neo4j"""
316
+ # Implementation omitted for brevity
317
+ pass
318
+
319
+ def _get_neighbors_neo4j(self, node_id: str) -> List[Tuple[GraphNode, GraphEdge]]:
320
+ """Get neighbors from Neo4j"""
321
+ # Implementation omitted for brevity
322
+ pass
323
+
324
+ def _get_all_nodes_neo4j(self) -> List[GraphNode]:
325
+ """Get all nodes from Neo4j"""
326
+ pass
327
+
328
+ def _get_all_edges_neo4j(self) -> List[GraphEdge]:
329
+ """Get all edges from Neo4j"""
330
+ pass
331
+
332
+ def _remove_node_neo4j(self, node_id: str):
333
+ """Remove node from Neo4j"""
334
+ pass
335
+
336
+ def _remove_edge_neo4j(self, edge_id: str):
337
+ """Remove edge from Neo4j"""
338
+ pass
339
+
340
+ def _compute_centrality_neo4j(self) -> Dict[str, float]:
341
+ """Compute centrality in Neo4j"""
342
+ pass
343
+
344
+ def _clear_neo4j(self):
345
+ """Clear Neo4j database"""
346
+ with self.driver.session() as session:
347
+ session.run("MATCH (n) DETACH DELETE n")
llm_service.py ADDED
@@ -0,0 +1,491 @@
1
+ """
2
+ LLM Inference Layer
3
+ Handles all LLM calls for extraction, summarization, and chat
4
+ Uses Gemini (via litellm) with structured prompt templates
5
+ """
6
+ from typing import List, Dict, Any, Optional
7
+ from loguru import logger
8
+ from config import settings
9
+ import json
10
+ import httpx
11
+ from tenacity import retry, stop_after_attempt, wait_exponential
12
+ from models import Triple, CanonicalTriple, RelationType
13
+
14
+
15
+ class PromptTemplates:
16
+ """Centralized prompt templates following the manual"""
17
+
18
+ @staticmethod
19
+ def triplet_canonicalization(passage: str, triple: Triple) -> str:
20
+ """Template for canonicalizing extracted triples"""
21
+ return f"""Given the passage and an extracted triple, return a cleaned, canonical version.
22
+
23
+ Passage (from page {triple.page_number}):
24
+ {passage}
25
+
26
+ Extracted Triple:
27
+ - Subject: {triple.subject}
28
+ - Relation: {triple.predicate}
29
+ - Object: {triple.object}
30
+
31
+ CRITICAL INSTRUCTION: You MUST select the "relation" field from this EXACT list of 25 canonical relations.
32
+ Copy the exact string - do NOT create variations, synonyms, or modifications.
33
+
34
+ ALLOWED RELATIONS (choose exactly one):
35
+ 1. is_a - for type/class relationships (e.g., "X is a Y")
36
+ 2. part_of - for component relationships (e.g., "X is part of Y")
37
+ 3. uses - for utilization (use "uses" for: utilizes, employs, applies)
38
+ 4. causes - for causality (e.g., "X causes Y")
39
+ 5. defined_as - for definitions (use "defined_as" for: defines, is defined as)
40
+ 6. related_to - ONLY if no other relation fits
41
+ 7. method_of - for methodological relationships
42
+ 8. depends_on - for dependencies (e.g., "X depends on Y")
43
+ 9. implements - for implementation (e.g., "X implements Y")
44
+ 10. similar_to - for similarity
45
+ 11. observes - for observation (use "observes" for: captures, records, detects, monitors)
46
+ 12. measures - for measurement
47
+ 13. produces - for production/generation (use "produces" for: makes, creates, generates, builds)
48
+ 14. contains - for containment
49
+ 15. affects - for influence (use "affects" for: influences, impacts, modifies, changes)
50
+ 16. enables - for enablement (use "enables" for: facilitates, allows, permits)
51
+ 17. requires - for requirements
52
+ 18. interacts_with - for interactions
53
+ 19. enriches - for enrichment
54
+ 20. enhances - for enhancement (use "enhances" for: improves, optimizes, extends)
55
+ 21. supports - for support (use "supports" for: contributes, helps, aids)
56
+ 22. describes - for description (use "describes" for: proposes, suggests, presents, introduces)
57
+ 23. explains - for explanation (use "explains" for: clarifies, demonstrates, shows, disentangles)
58
+ 24. refers_to - for reference (use "refers_to" for: aims, targets, addresses, focuses on)
59
+ 25. associated_with - for associations
60
+
61
+ EXAMPLES OF WHAT TO DO:
62
+ - If input has "utilizes" → use "uses"
63
+ - If input has "proposes" → use "describes"
64
+ - If input has "contributes to" → use "supports"
65
+ - If input has "aims at" → use "refers_to"
66
+
67
+ DO NOT USE: utilizes, proposes, contributes, aims, makes, captures, defines, or any other variations.
68
+ USE ONLY: The exact 25 strings listed above.
69
+
70
+ Return JSON in this exact format:
71
+ {{
72
+ "subject_label": "cleaned subject name",
73
+ "object_label": "cleaned object name",
74
+ "relation": "one_of_the_25_exact_strings_above",
75
+ "confidence": 0.85,
76
+ "justification": "brief explanation referencing page {triple.page_number}"
77
+ }}
78
+
79
+ Output ONLY the JSON, no other text:
80
+ """
81
+
82
+ @staticmethod
83
+ def node_summarization(node_label: str, chunks: List[Dict[str, Any]]) -> str:
84
+ """Template for node summarization with citations"""
85
+ chunks_text = "\n\n".join([
86
+ f"[Chunk from p.{chunk['page_number']}]\n{chunk['text']}"
87
+ for chunk in chunks
88
+ ])
89
+
90
+ return f"""Summarize the key facts about "{node_label}" using ONLY the following supporting chunks.
91
+
92
+ Requirements:
93
+ - Produce a concise summary (3-6 sentences)
94
+ - After any sentence that directly relies on a chunk, append (p. N) where N is the page number
95
+ - Do not invent information not present in the chunks
96
+ - Focus on the most important facts
97
+
98
+ Supporting Chunks:
99
+ {chunks_text}
100
+
101
+ Summary:
102
+ """
103
+
104
+ @staticmethod
105
+ def rag_chat(user_query: str, context_chunks: List[Dict[str, Any]]) -> str:
106
+ """Template for RAG chat with citations"""
107
+ context_text = "\n\n".join([
108
+ f"[Source {i+1}, p.{chunk['page_number']}]\n{chunk['text']}"
109
+ for i, chunk in enumerate(context_chunks)
110
+ ])
111
+
112
+ return f"""You are an assistant that answers questions using ONLY the provided document context.
113
+
114
+ Context from document:
115
+ {context_text}
116
+
117
+ User Question: {user_query}
118
+
119
+ Instructions:
120
+ - Answer in friendly, concise language
121
+ - Include inline citations (p. N) for statements supported by chunks
122
+ - If you cannot find direct support, say "I cannot confirm this from the document"
123
+ - At the end, add a "Sources:" section listing page numbers and short snippets
124
+
125
+ Answer:
126
+ """
127
+
128
+ @staticmethod
129
+ def system_message() -> str:
130
+ """System message for chat"""
131
+ return """You are a helpful assistant that answers questions strictly based on provided document context.
132
+ You always cite page numbers for factual statements. If information is not in the context, you say so clearly."""
133
+
134
+
135
+ class LLMService:
136
+ """
137
+ Service for LLM inference using Gemini API (via litellm)
138
+ Handles generation, extraction, summarization, and agent synthesis
139
+ """
140
+
141
+ def __init__(self):
142
+ # Use Gemini instead of Mistral
143
+ self.api_key = settings.gemini_api_key
144
+ self.model = f"gemini/{settings.gemini_model}"
145
+ self.temperature = settings.llm_temperature
146
+ self.max_tokens = settings.llm_max_tokens
147
+ self.timeout = settings.llm_timeout
148
+
149
+ # Import litellm for Gemini
150
+ try:
151
+ import litellm
152
+ self.litellm = litellm
153
+ logger.info(f"✓ LLMService initialized with Gemini ({settings.gemini_model})")
154
+ except ImportError:
155
+ logger.error("litellm not installed. Install with: pip install litellm")
156
+ raise
157
+
158
+ if not self.api_key:
159
+ logger.warning("No Gemini API key configured. LLM features will not work.")
160
+
161
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
162
+ async def _call_api(
163
+ self,
164
+ messages: List[Dict[str, str]],
165
+ temperature: Optional[float] = None,
166
+ max_tokens: Optional[int] = None,
167
+ json_mode: bool = False
168
+ ) -> str:
169
+ """
170
+ Call Gemini API via litellm with retry logic
171
+
172
+ Args:
173
+ messages: List of message dicts with 'role' and 'content'
174
+ temperature: Override default temperature
175
+ max_tokens: Override default max tokens
176
+ json_mode: Request JSON output
177
+
178
+ Returns:
179
+ Generated text
180
+ """
181
+ if not self.api_key:
182
+ raise ValueError("Gemini API key not configured")
183
+
184
+ try:
185
+ # Use litellm for Gemini API calls
186
+ import asyncio
187
+
188
+ kwargs = {
189
+ "model": self.model,
190
+ "api_key": self.api_key,
191
+ "messages": messages,
192
+ "temperature": temperature or self.temperature,
193
+ "max_tokens": max_tokens or self.max_tokens,
194
+ }
195
+
196
+ if json_mode:
197
+ kwargs["response_format"] = {"type": "json_object"}
198
+
199
+ # litellm.completion is synchronous, wrap in asyncio.to_thread
200
+ response = await asyncio.to_thread(
201
+ self.litellm.completion,
202
+ **kwargs
203
+ )
204
+
205
+ return response.choices[0].message.content
206
+
207
+ except Exception as e:
208
+ logger.error(f"Gemini API error: {str(e)}")
209
+ raise
210
+
211
+ async def canonicalize_triple(
212
+ self,
213
+ triple: Triple,
214
+ passage: str
215
+ ) -> Optional[CanonicalTriple]:
216
+ """
217
+ Canonicalize a raw triple using LLM
218
+
219
+ Args:
220
+ triple: Raw extracted triple
221
+ passage: Surrounding text passage
222
+
223
+ Returns:
224
+ CanonicalTriple or None if LLM fails
225
+ """
226
+ prompt = PromptTemplates.triplet_canonicalization(passage, triple)
227
+
228
+ messages = [
229
+ {"role": "system", "content": "You are an expert at extracting and canonicalizing knowledge graph triples. Always output valid JSON."},
230
+ {"role": "user", "content": prompt}
231
+ ]
232
+
233
+ try:
234
+ response = await self._call_api(messages, temperature=0.1, json_mode=True)
235
+ data = json.loads(response)
236
+
237
+ # Map string relation to enum
238
+ relation_str = data.get("relation", "related_to").lower().strip()
239
+
240
+ # Auto-correct common variations and map semantically similar verbs
241
+ relation_corrections = {
242
+ # Exact variations
243
+ "defines_as": "defined_as",
244
+ "defines": "defined_as",
245
+ "is_part_of": "part_of",
246
+ "used_by": "uses",
247
+ "caused_by": "causes",
248
+ "methods_of": "method_of",
249
+ "depending_on": "depends_on",
250
+ "implemented_by": "implements",
251
+ "similar": "similar_to",
252
+ "observed_by": "observes",
253
+ "measured_by": "measures",
254
+ "produced_by": "produces",
255
+ "contained_in": "contains",
256
+ "affected_by": "affects",
257
+ "enabled_by": "enables",
258
+ "required_by": "requires",
259
+ "interact_with": "interacts_with",
260
+ "enriched_by": "enriches",
261
+ "enhanced_by": "enhances",
262
+ "supported_by": "supports",
263
+ "described_by": "describes",
264
+ "explained_by": "explains",
265
+ "refer_to": "refers_to",
266
+
267
+ # Semantic mappings for common verbs
268
+ "utilizes": "uses",
269
+ "utilize": "uses",
270
+ "employs": "uses",
271
+ "applies": "uses",
272
+ "makes": "produces",
273
+ "creates": "produces",
274
+ "generates": "produces",
275
+ "builds": "produces",
276
+ "proposes": "describes",
277
+ "suggests": "describes",
278
+ "presents": "describes",
279
+ "introduces": "describes",
280
+ "captures": "observes",
281
+ "records": "observes",
282
+ "detects": "observes",
283
+ "monitors": "observes",
284
+ "aims": "refers_to",
285
+ "targets": "refers_to",
286
+ "focuses_on": "refers_to",
287
+ "addresses": "refers_to",
288
+ "disentangles": "explains",
289
+ "clarifies": "explains",
290
+ "demonstrates": "explains",
291
+ "shows": "explains",
292
+ "contributes": "supports",
293
+ "contributes_to": "supports",
294
+ "helps": "supports",
295
+ "aids": "supports",
296
+ "facilitates": "enables",
297
+ "allows": "enables",
298
+ "permits": "enables",
299
+ "improves": "enhances",
300
+ "betters": "enhances",
301
+ "optimizes": "enhances",
302
+ "extends": "enhances",
303
+ "influences": "affects",
304
+ "impacts": "affects",
305
+ "modifies": "affects",
306
+ "changes": "affects",
307
+ }
308
+
309
+ relation_str = relation_corrections.get(relation_str, relation_str)
310
+
311
+ try:
312
+ relation = RelationType(relation_str)
313
+ except ValueError:
314
+ logger.warning(f"Invalid relation '{relation_str}', defaulting to 'related_to'")
315
+ relation = RelationType.RELATED_TO
316
+
317
+ return CanonicalTriple(
318
+ subject_label=data["subject_label"],
319
+ object_label=data["object_label"],
320
+ relation=relation,
321
+ confidence=data["confidence"],
322
+ justification=data["justification"],
323
+ page_number=triple.page_number or 0
324
+ )
325
+ except Exception as e:
326
+ logger.error(f"Failed to canonicalize triple: {e}")
327
+ return None
328
+
329
+ async def summarize_node(
330
+ self,
331
+ node_label: str,
332
+ supporting_chunks: List[Dict[str, Any]]
333
+ ) -> str:
334
+ """
335
+ Generate summary for a graph node with citations
336
+
337
+ Args:
338
+ node_label: Name of the node
339
+ supporting_chunks: List of chunk metadata dicts
340
+
341
+ Returns:
342
+ Summary text with inline citations
343
+ """
344
+ prompt = PromptTemplates.node_summarization(node_label, supporting_chunks)
345
+
346
+ messages = [
347
+ {"role": "system", "content": PromptTemplates.system_message()},
348
+ {"role": "user", "content": prompt}
349
+ ]
350
+
351
+ try:
352
+ # Use faster settings for node summaries
353
+ summary = await self._call_api(
354
+ messages,
355
+ temperature=0.3,
356
+ max_tokens=3072 # Shorter summaries = faster response
357
+ )
358
+ return summary.strip()
359
+ except Exception as e:
360
+ logger.error(f"Failed to summarize node: {e}")
361
+ return f"Unable to generate summary for {node_label}."
362
+
363
+ async def rag_chat(
364
+ self,
365
+ query: str,
366
+ context_chunks: List[Dict[str, Any]]
367
+ ) -> str:
368
+ """
369
+ Answer user query using RAG with citations
370
+
371
+ Args:
372
+ query: User question
373
+ context_chunks: Retrieved context chunks
374
+
375
+ Returns:
376
+ Answer with citations and sources
377
+ """
378
+ prompt = PromptTemplates.rag_chat(query, context_chunks)
379
+
380
+ messages = [
381
+ {"role": "system", "content": PromptTemplates.system_message()},
382
+ {"role": "user", "content": prompt}
383
+ ]
384
+
385
+ try:
386
+ answer = await self._call_api(messages, temperature=0.3)
387
+ return answer.strip()
388
+ except Exception as e:
389
+ logger.error(f"Failed to generate RAG response: {e}")
390
+ return "I encountered an error while processing your question. Please try again."
391
+
392
+ async def agent_synthesize(
393
+ self,
394
+ query: str,
395
+ context: str
396
+ ) -> str:
397
+ """
398
+ Synthesize answer for agent-based RAG from tool results
399
+
400
+ Args:
401
+ query: User question
402
+ context: Combined context from tool executions
403
+
404
+ Returns:
405
+ Synthesized answer with citations
406
+ """
407
+ prompt = f"""You are an assistant that answers questions using the provided context from multiple tools.
408
+
409
+ Context from tools:
410
+ {context}
411
+
412
+ User Question: {query}
413
+
414
+ Instructions:
415
+ - Answer in friendly, concise language
416
+ - Include inline citations (p. N) for statements supported by sources
417
+ - If you cannot find direct support, say "I cannot confirm this from the available information"
418
+ - Synthesize information from different tools (vector search, graph search, etc.) cohesively
419
+
420
+ Answer:
421
+ """
422
+
423
+ messages = [
424
+ {"role": "system", "content": PromptTemplates.system_message()},
425
+ {"role": "user", "content": prompt}
426
+ ]
427
+
428
+ try:
429
+ answer = await self._call_api(messages, temperature=0.3)
430
+ return answer.strip()
431
+ except Exception as e:
432
+ logger.error(f"Failed to synthesize agent response: {e}")
433
+ return "I encountered an error while processing your question. Please try again."
434
+
435
+ async def extract_triples_llm(
436
+ self,
437
+ text: str,
438
+ page_number: int,
439
+ chunk_id: str
440
+ ) -> List[Triple]:
441
+ """
442
+ Use LLM to extract triples directly (alternative to OpenIE)
443
+
444
+ Args:
445
+ text: Text to extract from
446
+ page_number: Page number
447
+ chunk_id: Chunk identifier
448
+
449
+ Returns:
450
+ List of extracted triples
451
+ """
452
+ prompt = f"""Extract key relationships from this text as subject-predicate-object triples.
453
+ Focus on important concepts, methods, definitions, and relationships.
454
+
455
+ Text (from page {page_number}):
456
+ {text}
457
+
458
+ Return a JSON array of triples, each with:
459
+ - subject: The subject entity
460
+ - predicate: The relationship/action
461
+ - object: The object entity
462
+ - confidence: Your confidence (0-1)
463
+
464
+ Output ONLY valid JSON array:
465
+ """
466
+
467
+ messages = [
468
+ {"role": "system", "content": "You are an expert at knowledge extraction. Always output valid JSON."},
469
+ {"role": "user", "content": prompt}
470
+ ]
471
+
472
+ try:
473
+ response = await self._call_api(messages, temperature=0.2, json_mode=True)
474
+ data = json.loads(response)
475
+
476
+ triples = []
477
+ for item in data if isinstance(data, list) else data.get("triples", []):
478
+ triple = Triple(
479
+ subject=item["subject"],
480
+ predicate=item["predicate"],
481
+ object=item["object"],
482
+ confidence=item.get("confidence", 0.7),
483
+ source_chunk_id=chunk_id,
484
+ page_number=page_number
485
+ )
486
+ triples.append(triple)
487
+
488
+ return triples
489
+ except Exception as e:
490
+ logger.error(f"Failed to extract triples: {e}")
491
+ return []
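
A hedged sketch of calling the canonicalization step above, assuming a valid Gemini API key in settings and litellm installed; the Triple fields follow models.py later in this commit.

# Sketch only: normalize a raw predicate ("utilizes") into a canonical relation ("uses")
import asyncio
from llm_service import LLMService
from models import Triple

async def demo():
    llm = LLMService()
    raw = Triple(subject="GraphLLM", predicate="utilizes", object="FAISS",
                 page_number=3, source_chunk_id="chunk-1")
    canonical = await llm.canonicalize_triple(
        raw, passage="GraphLLM utilizes FAISS for vector search.")
    if canonical:
        print(canonical.relation.value, canonical.confidence)  # expected: "uses" plus a confidence score

asyncio.run(demo())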
main.py ADDED
@@ -0,0 +1,550 @@
1
+ """
2
+ FastAPI Backend - Main Application
3
+ Provides REST API for PDF upload, graph retrieval, chat, and node details
4
+ """
5
+ # Suppress PyTorch JIT warnings (harmless, just noisy during import)
6
+ import warnings
7
+ warnings.filterwarnings("ignore", category=UserWarning, module="torch")
8
+ warnings.filterwarnings("ignore", message="Unable to retrieve source")
9
+
10
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.staticfiles import StaticFiles
13
+ from fastapi.responses import FileResponse, JSONResponse
14
+ from loguru import logger
15
+ import sys
16
+ from pathlib import Path
17
+ import os
18
+ import uuid
19
+ import pickle
20
+ from datetime import datetime
21
+ from typing import List, Dict, Any, Optional
22
+
23
+ from config import settings, ensure_directories
24
+ from models import (
25
+ UploadResponse, GraphResponse, ChatRequest, ChatResponse,
26
+ NodeDetailResponse, AdminStatus, SourceCitation, GraphNode, GraphEdge
27
+ )
28
+ from pdf_processor import PDFProcessor
29
+ from embedding_service import EmbeddingService
30
+ from llm_service import LLMService
31
+ from gemini_extractor import GeminiExtractor
32
+ from graph_store import GraphStore
33
+ from graph_builder import GraphBuilder
34
+ from rag_agent import RAGAgent
35
+
36
+
37
+ # Configure logging
38
+ logger.remove()
39
+ logger.add(
40
+ sys.stderr,
41
+ level=settings.log_level,
42
+ format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan> | <level>{message}</level>"
43
+ )
44
+ logger.add(
45
+ f"{settings.logs_dir}/app.log",
46
+ rotation="500 MB",
47
+ retention="10 days",
48
+ level=settings.log_level
49
+ )
50
+
51
+ # Initialize services
52
+ ensure_directories()
53
+
54
+ app = FastAPI(
55
+ title=settings.app_name,
56
+ version=settings.app_version,
57
+ description="PDF Knowledge Graph and RAG System"
58
+ )
59
+
60
+ # CORS middleware
61
+ app.add_middleware(
62
+ CORSMiddleware,
63
+ allow_origins=["*"], # Configure appropriately for production
64
+ allow_credentials=True,
65
+ allow_methods=["*"],
66
+ allow_headers=["*"],
67
+ )
68
+
69
+ # Global service instances
70
+ logger.info("Initializing PDFProcessor...")
71
+ pdf_processor = PDFProcessor()
72
+
73
+ logger.info("Initializing EmbeddingService...")
74
+ embedding_service = EmbeddingService()
75
+
76
+ logger.info("Initializing LLMService...")
77
+ llm_service = LLMService()
78
+
79
+ logger.info("Initializing GeminiExtractor (direct Gemini API)...")
80
+ triplet_extractor = GeminiExtractor(llm_service)
81
+
82
+ logger.info("Initializing GraphStore...")
83
+ graph_store = GraphStore(use_neo4j=False, embedding_service=embedding_service)
84
+
85
+ logger.info("Initializing GraphBuilder...")
86
+ graph_builder = GraphBuilder(graph_store, embedding_service)
87
+
88
+ logger.info("Initializing RAGAgent (LangGraph-based)...")
89
+ rag_agent = RAGAgent(graph_store, embedding_service, llm_service)
90
+
91
+ logger.info("✓ All services initialized successfully")
92
+
93
+ # In-memory storage for PDF metadata (use database in production)
94
+ pdf_metadata_store: Dict[str, Dict[str, Any]] = {}
95
+
96
+
97
+ @app.on_event("startup")
98
+ async def startup_event():
99
+ """Run on application startup"""
100
+ logger.info(f"Starting {settings.app_name} v{settings.app_version}")
101
+ logger.info(f"Environment: {settings.environment}")
102
+
103
+ # Try to load existing graph
104
+ graph_path = os.path.join(settings.data_dir, "knowledge_graph.pkl")
105
+ if os.path.exists(graph_path):
106
+ try:
107
+ graph_store.load(graph_path)
108
+ logger.info("Loaded existing knowledge graph")
109
+ except Exception as e:
110
+ logger.warning(f"Failed to load existing graph: {e}")
111
+
112
+
113
+ @app.on_event("shutdown")
114
+ async def shutdown_event():
115
+ """Run on application shutdown"""
116
+ logger.info("Shutting down application")
117
+
118
+ # Save graph
119
+ graph_path = os.path.join(settings.data_dir, "knowledge_graph.pkl")
120
+ try:
121
+ graph_store.save(graph_path)
122
+ logger.info("Saved knowledge graph")
123
+ except Exception as e:
124
+ logger.error(f"Failed to save graph: {e}")
125
+
126
+ # Save FAISS index
127
+ try:
128
+ embedding_service.save()
129
+ logger.info("Saved FAISS index")
130
+ except Exception as e:
131
+ logger.error(f"Failed to save FAISS index: {e}")
132
+
133
+
134
+ @app.get("/")
135
+ async def root():
136
+ """Serve the frontend HTML"""
137
+ return FileResponse("frontend/index.html")
138
+
139
+
140
+ @app.post("/upload", response_model=UploadResponse)
141
+ async def upload_pdf(
142
+ file: UploadFile = File(...),
143
+ background_tasks: BackgroundTasks = BackgroundTasks()
144
+ ):
145
+ """
146
+ Upload a PDF and trigger ingestion pipeline
147
+
148
+ Returns immediately with pdf_id, processes in background
149
+ """
150
+ # Validate file
151
+ if not file.filename.endswith('.pdf'):
152
+ raise HTTPException(status_code=400, detail="Only PDF files are allowed")
153
+
154
+ file_size = 0
155
+ content = await file.read()
156
+ file_size = len(content)
157
+
158
+ if file_size > settings.max_file_size_bytes:
159
+ raise HTTPException(
160
+ status_code=400,
161
+ detail=f"File size exceeds maximum of {settings.max_file_size_mb}MB"
162
+ )
163
+
164
+ # Generate PDF ID
165
+ pdf_id = str(uuid.uuid4())
166
+
167
+ # Save file
168
+ filepath = os.path.join(settings.upload_dir, f"{pdf_id}.pdf")
169
+ with open(filepath, 'wb') as f:
170
+ f.write(content)
171
+
172
+ logger.info(f"Uploaded PDF: {file.filename} (ID: {pdf_id})")
173
+
174
+ # Store metadata with detailed progress tracking
175
+ pdf_metadata_store[pdf_id] = {
176
+ "filename": file.filename,
177
+ "filepath": filepath,
178
+ "status": "processing",
179
+ "progress": {
180
+ "stage": "starting",
181
+ "message": "Upload complete, starting processing...",
182
+ "percent": 0
183
+ }
184
+ }
185
+
186
+ # Trigger background processing
187
+ background_tasks.add_task(process_pdf_pipeline, pdf_id, filepath)
188
+
189
+ return UploadResponse(
190
+ pdf_id=pdf_id,
191
+ filename=file.filename,
192
+ status="processing",
193
+ message="PDF uploaded successfully. Processing started in background."
194
+ )
195
+
196
+
197
+ async def process_pdf_pipeline(pdf_id: str, filepath: str):
198
+ """
199
+ ⚡ OPTIMIZED: Full ingestion pipeline with progress tracking
200
+
201
+ Steps:
202
+ 0. Clear existing graph and index (FRESH START)
203
+ 1. Extract chunks from PDF
204
+ 2. Create embeddings
205
+ 3. Add to vector index
206
+ 4. Extract triples (PARALLEL)
207
+ 5. Build knowledge graph (NO PRUNING)
208
+ """
209
+ def update_progress(stage: str, message: str, percent: int):
210
+ """Update progress in metadata store"""
211
+ if pdf_id in pdf_metadata_store:
212
+ pdf_metadata_store[pdf_id]["progress"] = {
213
+ "stage": stage,
214
+ "message": message,
215
+ "percent": percent
216
+ }
217
+
218
+ try:
219
+ logger.info(f"Starting ingestion pipeline for PDF {pdf_id}")
220
+
221
+ # Step 0: CLEAR EVERYTHING for fresh extraction
222
+ update_progress("clearing", "Clearing previous data...", 5)
223
+ logger.info("Step 0: Clearing existing graph and embeddings for fresh extraction")
224
+ graph_store.clear()
225
+ embedding_service.clear()
226
+ logger.info("✓ Cleared all existing data")
227
+
228
+ # Step 1: Extract chunks (with caching)
229
+ cache_path = os.path.join(settings.data_dir, f"chunks_{pdf_id}.pkl")
230
+
231
+ if os.path.exists(cache_path):
232
+ # Load cached chunks (saves 2-3s on reindex)
233
+ update_progress("extraction", "Loading cached text extraction...", 15)
234
+ logger.info("⚡ Step 1: Loading cached chunks from previous extraction")
235
+ with open(cache_path, 'rb') as f:
236
+ cache_data = pickle.load(f)
237
+ refined_chunks = cache_data['chunks']
238
+ metadata = cache_data['metadata']
239
+ logger.info(f"✓ Loaded {len(refined_chunks)} cached chunks (skipped PDF processing)")
240
+ update_progress("extraction", f"Loaded {len(refined_chunks)} cached chunks", 25)
241
+ else:
242
+ # Extract and cache chunks for future reindexing
243
+ update_progress("extraction", "Extracting text from PDF...", 15)
244
+ logger.info("Step 1: Extracting chunks from PDF")
245
+ chunks, metadata = pdf_processor.process_pdf(filepath, pdf_id)
246
+ refined_chunks = pdf_processor.chunk_text(chunks)
247
+
248
+ # Cache for future use
249
+ with open(cache_path, 'wb') as f:
250
+ pickle.dump({'chunks': refined_chunks, 'metadata': metadata}, f)
251
+ logger.info(f"✓ Cached {len(refined_chunks)} chunks for future reindexing")
252
+ update_progress("extraction", f"Extracted {len(refined_chunks)} chunks", 25)
253
+
254
+ # Step 2: Create embeddings
255
+ update_progress("embeddings", f"Creating embeddings for {len(refined_chunks)} chunks...", 35)
256
+ logger.info(f"Step 2: Creating embeddings for {len(refined_chunks)} chunks")
257
+ embeddings = embedding_service.create_embeddings(refined_chunks)
258
+ update_progress("embeddings", "Embeddings created", 50)
259
+
260
+ # Step 3: Add to vector index
261
+ update_progress("indexing", "Building vector index...", 55)
262
+ logger.info("Step 3: Adding to vector index")
263
+ embedding_service.add_to_index(refined_chunks, embeddings)
264
+ embedding_service.save()
265
+ update_progress("indexing", "Vector index complete", 60)
266
+
267
+ # Step 4: Extract triples using Gemini (direct API - PARALLEL)
268
+ update_progress("extraction", "Extracting concepts with AI (parallel)...", 65)
269
+ logger.info("Step 4: Extracting triples using Gemini (PARALLEL per-page, 2 concepts max)")
270
+ canonical_triples = await triplet_extractor.extract_from_chunks(
271
+ refined_chunks,
272
+ use_llm=True # Direct Gemini API calls
273
+ )
274
+ update_progress("extraction", f"Extracted {len(canonical_triples)} relationships", 80)
275
+
276
+ # Step 5: Build graph
277
+ update_progress("graph", "Building knowledge graph...", 85)
278
+ logger.info("Step 5: Building knowledge graph")
279
+ num_nodes, num_edges = await graph_builder.build_graph(canonical_triples)
280
+ update_progress("graph", f"Graph complete: {num_nodes} nodes, {num_edges} edges", 95)
281
+
282
+ # Save graph
283
+ update_progress("saving", "Saving graph to disk...", 98)
284
+ graph_path = os.path.join(settings.data_dir, "knowledge_graph.pkl")
285
+ graph_store.save(graph_path)
286
+
287
+ # Update metadata
288
+ update_progress("completed", f"✓ Complete! {num_nodes} nodes, {num_edges} edges", 100)
289
+ pdf_metadata_store[pdf_id]["status"] = "completed"
290
+ pdf_metadata_store[pdf_id]["num_chunks"] = len(refined_chunks)
291
+ pdf_metadata_store[pdf_id]["num_nodes"] = num_nodes
292
+ pdf_metadata_store[pdf_id]["num_edges"] = num_edges
293
+
294
+ logger.info(f"✓ Completed ingestion for PDF {pdf_id}: {num_nodes} nodes, {num_edges} edges")
295
+
296
+ except Exception as e:
297
+ logger.error(f"❌ Failed to process PDF {pdf_id}: {e}", exc_info=True)
298
+ pdf_metadata_store[pdf_id]["status"] = "failed"
299
+ pdf_metadata_store[pdf_id]["error"] = str(e)
300
+ update_progress("error", f"Error: {str(e)[:100]}", 0)
301
+
302
+
303
+ @app.get("/graph", response_model=GraphResponse)
304
+ async def get_graph(pdf_id: Optional[str] = None):
305
+ """
306
+ Get the knowledge graph
307
+
308
+ Args:
309
+ pdf_id: Optional filter by PDF ID
310
+
311
+ Returns:
312
+ Graph nodes and edges
313
+ """
314
+ nodes = graph_store.get_all_nodes()
315
+ edges = graph_store.get_all_edges()
316
+
317
+ logger.info(f"Returning {len(nodes)} nodes, {len(edges)} edges")
318
+
319
+ # Filter by PDF if specified
320
+ if pdf_id:
321
+ # Filter nodes and edges that belong to this PDF
322
+ # This requires tracking PDF ID in supporting chunks
323
+ pass
324
+
325
+ return GraphResponse(
326
+ nodes=nodes,
327
+ edges=edges,
328
+ metadata={
329
+ "total_nodes": len(nodes),
330
+ "total_edges": len(edges)
331
+ }
332
+ )
333
+
334
+
335
+ @app.get("/node/{node_id}", response_model=NodeDetailResponse)
336
+ async def get_node_details(node_id: str):
337
+ """
338
+ Get detailed information about a node
339
+
340
+ Includes:
341
+ - Node metadata
342
+ - LLM-generated summary with citations
343
+ - Supporting chunks
344
+ - Related nodes
345
+ """
346
+ node = graph_store.get_node(node_id)
347
+ if not node:
348
+ raise HTTPException(status_code=404, detail="Node not found")
349
+
350
+ # Check if summary is cached in node metadata
351
+ if "cached_summary" in node.metadata:
352
+ logger.info(f"✓ Using cached summary for node {node.label}")
353
+ summary = node.metadata["cached_summary"]
354
+ search_results = None # Use node's supporting chunks for sources
355
+ else:
356
+ # Generate summary (first time)
357
+ logger.info(f"⏳ Generating summary for node {node.label}...")
358
+
359
+ # Get supporting chunks using semantic search on the node label
360
+ # This finds chunks that are semantically similar to the concept
361
+ search_results = embedding_service.search(
362
+ query=node.label,
363
+ top_k=3 # Reduced from 5 to 3 for faster processing
364
+ )
365
+
366
+ # Prepare chunks for LLM
367
+ chunks_for_llm = []
368
+ if search_results:
369
+ chunks_for_llm = [
370
+ {
371
+ "page_number": meta.get("page_number", 0),
372
+ "text": meta.get("text", "")
373
+ }
374
+ for meta, score in search_results
375
+ ]
376
+
377
+ # Fallback: if no chunks found, create a basic summary
378
+ if not chunks_for_llm:
379
+ logger.warning(f"No chunks found for node {node.label}, using basic summary")
380
+ chunks_for_llm = [
381
+ {
382
+ "page_number": chunk.page_number or 0,
383
+ "text": chunk.snippet or ""
384
+ }
385
+ for chunk in node.supporting_chunks[:3]
386
+ ]
387
+
388
+ # Generate summary
389
+ summary = await llm_service.summarize_node(node.label, chunks_for_llm)
390
+
391
+ # Cache summary in node metadata (don't cache search_results - they're not serializable)
392
+ node.metadata["cached_summary"] = summary
393
+ node.metadata["cache_timestamp"] = str(datetime.utcnow())
394
+
395
+ # Update the node in the graph store
396
+ graph_store.update_node(node)
397
+ logger.info(f"✓ Cached summary for node {node.label}")
398
+
399
+ # Get related nodes
400
+ neighbors = graph_store.get_neighbors(node_id)
401
+ related_nodes = [
402
+ {
403
+ "node_id": neighbor.node_id,
404
+ "label": neighbor.label,
405
+ "relation": edge.relation.value,
406
+ "confidence": edge.confidence
407
+ }
408
+ for neighbor, edge in neighbors[:10] # Limit to top 10
409
+ ]
410
+
411
+ # Build source citations
412
+ sources = []
413
+ if search_results is not None:
414
+ # Use search results (freshly generated summary)
415
+ for meta, score in search_results[:5]:
416
+ text = meta.get("text", "")
417
+ snippet = text[:120] + "..." if len(text) > 120 else text
418
+ sources.append(SourceCitation(
419
+ page_number=meta.get("page_number", 0),
420
+ snippet=snippet,
421
+ chunk_id=meta.get("chunk_id", ""),
422
+ score=score
423
+ ))
424
+ else:
425
+ # Use node's supporting chunks (cached summary)
426
+ sources = [
427
+ SourceCitation(
428
+ page_number=chunk.page_number or 0,
429
+ snippet=chunk.snippet or "",
430
+ chunk_id=chunk.chunk_id,
431
+ score=chunk.score
432
+ )
433
+ for chunk in node.supporting_chunks[:5]
434
+ ]
435
+
436
+ return NodeDetailResponse(
437
+ node_id=node.node_id,
438
+ label=node.label,
439
+ type=node.type,
440
+ summary=summary,
441
+ sources=sources,
442
+ related_nodes=related_nodes
443
+ )
444
+
445
+
446
+ @app.post("/chat", response_model=ChatResponse)
447
+ async def chat(request: ChatRequest):
448
+ """
449
+ Agent-based RAG chat endpoint
450
+
451
+ Uses LangGraph agent with multiple tools:
452
+ - vector_search: Semantic search through chunks
453
+ - graph_search: Find concepts in knowledge graph
454
+ - get_node_details: Get detailed node information
455
+ - get_related_nodes: Graph traversal for relationships
456
+ - get_chunk_by_id: Retrieve specific chunks
457
+
458
+ The agent intelligently decides which tools to use based on the query
459
+ """
460
+ logger.info(f"🤖 Agent chat request: '{request.query}'")
461
+
462
+ # Use agent-based RAG
463
+ response = await rag_agent.chat(
464
+ query=request.query,
465
+ pdf_id=request.pdf_id,
466
+ include_citations=True
467
+ )
468
+
469
+ # Limit sources to requested max
470
+ if len(response.sources) > request.max_sources:
471
+ response.sources = response.sources[:request.max_sources]
472
+
473
+ return response
474
+
475
+
476
+ @app.get("/status/{pdf_id}")
477
+ async def get_pdf_status(pdf_id: str):
478
+ """Get processing status for a specific PDF"""
479
+ if pdf_id not in pdf_metadata_store:
480
+ raise HTTPException(status_code=404, detail="PDF not found")
481
+
482
+ metadata = pdf_metadata_store[pdf_id]
483
+ return {
484
+ "pdf_id": pdf_id,
485
+ "filename": metadata.get("filename"),
486
+ "status": metadata.get("status"),
487
+ "progress": metadata.get("progress", {}),
488
+ "num_nodes": metadata.get("num_nodes", 0),
489
+ "num_edges": metadata.get("num_edges", 0),
490
+ "error": metadata.get("error")
491
+ }
492
+
493
+
494
+ @app.get("/admin/status", response_model=AdminStatus)
495
+ async def admin_status():
496
+ """Get system status and statistics"""
497
+ faiss_stats = embedding_service.get_stats()
498
+
499
+ return AdminStatus(
500
+ total_pdfs=len(pdf_metadata_store),
501
+ total_chunks=faiss_stats["num_chunks"],
502
+ total_nodes=len(graph_store.get_all_nodes()),
503
+ total_edges=len(graph_store.get_all_edges()),
504
+ vector_index_size=faiss_stats["total_vectors"],
505
+ recent_logs=[] # Would fetch from logs in production
506
+ )
507
+
508
+
509
+ @app.post("/admin/reindex")
510
+ async def admin_reindex(pdf_id: str):
511
+ """Re-run ingestion for a PDF"""
512
+ if pdf_id not in pdf_metadata_store:
513
+ raise HTTPException(status_code=404, detail="PDF not found")
514
+
515
+ filepath = pdf_metadata_store[pdf_id]["filepath"]
516
+
517
+ # Clear existing data for this PDF (would need better tracking)
518
+ # For now, just re-run the pipeline
519
+
520
+ await process_pdf_pipeline(pdf_id, filepath)
521
+
522
+ return {"message": "Reindexing started", "pdf_id": pdf_id}
523
+
524
+
525
+ @app.post("/admin/clear")
526
+ async def admin_clear():
527
+ """Clear all data"""
528
+ graph_store.clear()
529
+ embedding_service.clear()
530
+ pdf_metadata_store.clear()
531
+
532
+ logger.warning("All data cleared by admin")
533
+
534
+ return {"message": "All data cleared"}
535
+
536
+
537
+ # Mount static files for frontend
538
+ if os.path.exists("frontend"):
539
+ app.mount("/static", StaticFiles(directory="frontend"), name="static")
540
+
541
+
542
+ if __name__ == "__main__":
543
+ import uvicorn
544
+ uvicorn.run(
545
+ "main:app",
546
+ host=settings.api_host,
547
+ port=settings.api_port,
548
+ reload=settings.debug,
549
+ log_level=settings.log_level.lower()
550
+ )
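
A small client-side sketch of the upload, status, and chat flow exposed above. The base URL is an assumption (adjust to settings.api_host / settings.api_port); field names follow the request/response models in models.py.

# Client sketch only: upload a PDF, poll the background pipeline, then ask a question
import time
import httpx

BASE = "http://localhost:8000"  # assumption: server running locally on port 8000

with httpx.Client(timeout=60) as client:
    with open("paper.pdf", "rb") as f:
        up = client.post(f"{BASE}/upload",
                         files={"file": ("paper.pdf", f, "application/pdf")}).json()
    pdf_id = up["pdf_id"]

    while True:  # poll /status until the ingestion pipeline finishes
        status = client.get(f"{BASE}/status/{pdf_id}").json()
        if status["status"] in ("completed", "failed"):
            break
        time.sleep(2)

    reply = client.post(f"{BASE}/chat",
                        json={"query": "What does the document describe?", "pdf_id": pdf_id}).json()
    print(reply["answer"])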
modal_app.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ GraphLLM - Modal Deployment
3
+ Serverless ML deployment with auto-scaling
4
+ """
5
+ import modal
6
+
7
+ # Create Modal app
8
+ app = modal.App("graphllm")
9
+
10
+ # Define the container image with all dependencies
11
+ image = (
12
+ modal.Image.debian_slim(python_version="3.12")
13
+ .apt_install("tesseract-ocr", "ghostscript", "gcc", "g++")
14
+ .pip_install_from_requirements("requirements.txt")
15
+ )
16
+
17
+ # Create persistent volume for data storage
18
+ volume = modal.Volume.from_name("graphllm-data", create_if_missing=True)
19
+
20
+ # Mount FastAPI app
21
+ @app.function(
22
+ image=image,
23
+ gpu=None, # Use CPU (cheaper)
24
+ memory=4096, # 4GB RAM
25
+ timeout=600, # 10 min timeout
26
+ volumes={"/app/data": volume},
27
+ secrets=[modal.Secret.from_name("graphllm-secrets")], # GEMINI_API_KEY
28
+ )
29
+ @modal.asgi_app()
30
+ def fastapi_app():
31
+ """
32
+ Mount the FastAPI application
33
+ """
34
+ import sys
35
+ sys.path.insert(0, "/root")
36
+
37
+ # Import main FastAPI app
38
+ from main import app as fastapi_app
39
+
40
+ return fastapi_app
41
+
42
+
43
+ # Local testing endpoint
44
+ @app.local_entrypoint()
45
+ def main():
46
+ """
47
+ Test the deployment locally
48
+ """
49
+ print("GraphLLM deployed to Modal!")
50
+ print("Access your app at: https://YOUR_USERNAME--graphllm-fastapi-app.modal.run")
models.py ADDED
@@ -0,0 +1,236 @@
1
+ """
2
+ Data models for GraphLLM system following the manual specifications
3
+ """
4
+ from pydantic import BaseModel, Field
5
+ from typing import Optional, List, Dict, Any, Literal
6
+ from datetime import datetime
7
+ from enum import Enum
8
+ import uuid
9
+
10
+
11
+ # Enums
12
+ class ChunkType(str, Enum):
13
+ """Types of chunks extracted from PDF"""
14
+ PARAGRAPH = "paragraph"
15
+ CODE = "code"
16
+ TABLE = "table"
17
+ IMAGE = "image"
18
+ IMAGE_TEXT = "image_text"
19
+
20
+
21
+ class NodeType(str, Enum):
22
+ """Types of graph nodes"""
23
+ CONCEPT = "concept"
24
+ PERSON = "person"
25
+ METHOD = "method"
26
+ TERM = "term"
27
+ CLASS = "class"
28
+ FUNCTION = "function"
29
+ ENTITY = "entity"
30
+
31
+
32
+ class RelationType(str, Enum):
33
+ """Canonical relation types for edges"""
34
+ IS_A = "is_a"
35
+ PART_OF = "part_of"
36
+ METHOD_OF = "method_of"
37
+ CAUSES = "causes"
38
+ USES = "uses"
39
+ RELATED_TO = "related_to"
40
+ DEFINED_AS = "defined_as"
41
+ DEPENDS_ON = "depends_on"
42
+ IMPLEMENTS = "implements"
43
+ SIMILAR_TO = "similar_to"
44
+ OBSERVES = "observes"
45
+ MEASURES = "measures"
46
+ PRODUCES = "produces"
47
+ CONTAINS = "contains"
48
+ AFFECTS = "affects"
49
+ ENABLES = "enables"
50
+ REQUIRES = "requires"
51
+ INTERACTS_WITH = "interacts_with"
52
+ ENRICHES = "enriches"
53
+ ENHANCES = "enhances"
54
+ SUPPORTS = "supports"
55
+ DESCRIBES = "describes"
56
+ EXPLAINS = "explains"
57
+ REFERS_TO = "refers_to"
58
+ ASSOCIATED_WITH = "associated_with"
59
+
60
+
61
+ # Core Data Models
62
+
63
+ class Chunk(BaseModel):
64
+ """Individual chunk of text/content from PDF"""
65
+ chunk_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
66
+ pdf_id: str
67
+ page_number: int
68
+ char_range: tuple[int, int]
69
+ type: ChunkType
70
+ text: str
71
+ table_json: Optional[Dict[str, Any]] = None
72
+ image_id: Optional[str] = None
73
+ metadata: Dict[str, Any] = Field(default_factory=dict)
74
+ created_at: datetime = Field(default_factory=datetime.utcnow)
75
+
76
+
77
+ class EmbeddingEntry(BaseModel):
78
+ """Vector embedding for a chunk"""
79
+ chunk_id: str
80
+ embedding: List[float]
81
+ created_at: datetime = Field(default_factory=datetime.utcnow)
82
+ metadata: Dict[str, Any] = Field(default_factory=dict)
83
+
84
+
85
+ class SupportingChunk(BaseModel):
86
+ """Reference to a chunk supporting a node or edge"""
87
+ chunk_id: str
88
+ score: float
89
+ page_number: Optional[int] = None
90
+ snippet: Optional[str] = None
91
+
92
+
93
+ class GraphNode(BaseModel):
94
+ """Node in the knowledge graph"""
95
+ node_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
96
+ label: str
97
+ type: NodeType
98
+ aliases: List[str] = Field(default_factory=list)
99
+ supporting_chunks: List[SupportingChunk] = Field(default_factory=list)
100
+ importance_score: float = 0.0
101
+ metadata: Dict[str, Any] = Field(default_factory=dict)
102
+ created_at: datetime = Field(default_factory=datetime.utcnow)
103
+
104
+
105
+ class GraphEdge(BaseModel):
106
+ """Edge in the knowledge graph"""
107
+ edge_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
108
+ from_node: str = Field(alias="from")
109
+ to_node: str = Field(alias="to")
110
+ relation: RelationType
111
+ confidence: float
112
+ supporting_chunks: List[SupportingChunk] = Field(default_factory=list)
113
+ metadata: Dict[str, Any] = Field(default_factory=dict)
114
+ created_at: datetime = Field(default_factory=datetime.utcnow)
115
+
116
+ class Config:
117
+ populate_by_name = True
118
+ # FastAPI automatically serializes enums as their string values in JSON
119
+
120
+
121
+ class Triple(BaseModel):
122
+ """Extracted triple from text"""
123
+ subject: str
124
+ predicate: str
125
+ object: str
126
+ confidence: float = 1.0
127
+ source_chunk_id: Optional[str] = None
128
+ page_number: Optional[int] = None
129
+ justification: Optional[str] = None
130
+
131
+
132
+ class CanonicalTriple(BaseModel):
133
+ """LLM-canonicalized triple"""
134
+ subject_label: str
135
+ object_label: str
136
+ relation: RelationType
137
+ confidence: float
138
+ justification: str
139
+ page_number: int
140
+
141
+
142
+ # API Request/Response Models
143
+
144
+ class UploadResponse(BaseModel):
145
+ """Response from PDF upload"""
146
+ pdf_id: str
147
+ filename: str
148
+ status: str
149
+ message: str
150
+ num_pages: Optional[int] = None
151
+ num_chunks: Optional[int] = None
152
+
153
+
154
+ class GraphResponse(BaseModel):
155
+ """Response containing graph data"""
156
+ nodes: List[GraphNode]
157
+ edges: List[GraphEdge]
158
+ metadata: Dict[str, Any] = Field(default_factory=dict)
159
+
160
+
161
+ class SourceCitation(BaseModel):
162
+ """Source citation with page number and snippet"""
163
+ page_number: int
164
+ snippet: str
165
+ chunk_id: str
166
+ score: Optional[float] = None
167
+
168
+
169
+ class NodeDetailResponse(BaseModel):
170
+ """Response for node detail request"""
171
+ node_id: str
172
+ label: str
173
+ type: NodeType
174
+ summary: str
175
+ sources: List[SourceCitation]
176
+ related_nodes: List[Dict[str, Any]] = Field(default_factory=list)
177
+ raw_chunks: Optional[List[Chunk]] = None
178
+
179
+
180
+ class ChatMessage(BaseModel):
181
+ """Chat message"""
182
+ role: Literal["user", "assistant", "system"]
183
+ content: str
184
+ sources: Optional[List[SourceCitation]] = None
185
+ timestamp: datetime = Field(default_factory=datetime.utcnow)
186
+
187
+
188
+ class ChatRequest(BaseModel):
189
+ """Chat request"""
190
+ query: str
191
+ pdf_id: str
192
+ include_citations: bool = True
193
+ max_sources: int = 5
194
+
195
+
196
+ class ChatResponse(BaseModel):
197
+ """Chat response with answer and citations"""
198
+ answer: str
199
+ sources: List[SourceCitation]
200
+ context_chunks: Optional[List[str]] = None
201
+
202
+
203
+ class PDFMetadata(BaseModel):
204
+ """Metadata for uploaded PDF"""
205
+ pdf_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
206
+ filename: str
207
+ filepath: str
208
+ num_pages: int
209
+ file_size_bytes: int
210
+ upload_timestamp: datetime = Field(default_factory=datetime.utcnow)
211
+ processing_status: str = "pending"
212
+ num_chunks: int = 0
213
+ num_nodes: int = 0
214
+ num_edges: int = 0
215
+ metadata: Dict[str, Any] = Field(default_factory=dict)
216
+
217
+
218
+ class IngestionLog(BaseModel):
219
+ """Log entry for ingestion process"""
220
+ log_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
221
+ pdf_id: str
222
+ timestamp: datetime = Field(default_factory=datetime.utcnow)
223
+ stage: str
224
+ status: str
225
+ message: str
226
+ details: Optional[Dict[str, Any]] = None
227
+
228
+
229
+ class AdminStatus(BaseModel):
230
+ """Admin status response"""
231
+ total_pdfs: int
232
+ total_chunks: int
233
+ total_nodes: int
234
+ total_edges: int
235
+ vector_index_size: int
236
+ recent_logs: List[IngestionLog]
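
GraphEdge aliases from_node/to_node to "from"/"to" and enables populate_by_name, so both spellings build the same edge (assuming a Pydantic version that honors the Config shown above); a quick sketch:

# Both construction styles are equivalent because populate_by_name = True
from models import GraphEdge, RelationType

e1 = GraphEdge(**{"from": "n1", "to": "n2", "relation": RelationType.USES, "confidence": 0.8})
e2 = GraphEdge(from_node="n1", to_node="n2", relation=RelationType.USES, confidence=0.8)

assert e1.from_node == e2.from_node == "n1"
print(e1.dict(by_alias=True)["from"])  # serializing by alias restores the "from"/"to" keys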
pdf_processor.py ADDED
@@ -0,0 +1,325 @@
1
+ """
2
+ PDF Ingestion & Preprocessing Module
3
+ Handles extraction of text, tables, code blocks, and images from PDFs
4
+ """
5
+ import fitz # PyMuPDF
6
+ import pdfplumber
7
+ import pytesseract
8
+ from PIL import Image
9
+ import io
10
+ import re
11
+ from typing import List, Dict, Any, Optional, Tuple
12
+ from loguru import logger
13
+ from models import Chunk, ChunkType, PDFMetadata
14
+ from config import settings
15
+ import uuid
16
+
17
+
18
+ class PDFProcessor:
19
+ """
20
+ Comprehensive PDF processor that extracts:
21
+ - Page-level text with character ranges
22
+ - Tables (structured)
23
+ - Code blocks (detected heuristically)
24
+ - Images (with OCR)
25
+ """
26
+
27
+ def __init__(self):
28
+ self.code_patterns = [
29
+ re.compile(r'```[\s\S]*?```'), # Markdown code blocks
30
+ re.compile(r'def\s+\w+\s*\('), # Python functions
31
+ re.compile(r'class\s+\w+\s*[:\(]'), # Python/Java classes
32
+ re.compile(r'function\s+\w+\s*\('), # JavaScript functions
33
+ re.compile(r'public\s+class\s+\w+'), # Java classes
34
+ ]
35
+
36
+ def process_pdf(self, filepath: str, pdf_id: str) -> Tuple[List[Chunk], PDFMetadata]:
37
+ """
38
+ Main entry point: process entire PDF and return chunks + metadata
39
+
40
+ Args:
41
+ filepath: Path to PDF file
42
+ pdf_id: Unique identifier for this PDF
43
+
44
+ Returns:
45
+ Tuple of (chunks list, metadata object)
46
+ """
47
+ logger.info(f"Processing PDF: {filepath}")
48
+
49
+ chunks: List[Chunk] = []
50
+
51
+ # Open with PyMuPDF for text and images
52
+ pdf_doc = fitz.open(filepath)
53
+ num_pages = len(pdf_doc)
54
+
55
+ # Open with pdfplumber for tables
56
+ with pdfplumber.open(filepath) as plumber_pdf:
57
+ for page_num in range(num_pages):
58
+ logger.debug(f"Processing page {page_num + 1}/{num_pages}")
59
+
60
+ # Extract from PyMuPDF
61
+ fitz_page = pdf_doc[page_num]
62
+ page_chunks = self._process_page(
63
+ fitz_page=fitz_page,
64
+ plumber_page=plumber_pdf.pages[page_num],
65
+ page_num=page_num + 1, # 1-indexed
66
+ pdf_id=pdf_id
67
+ )
68
+ chunks.extend(page_chunks)
69
+
70
+ pdf_doc.close()
71
+
72
+ # Create metadata
73
+ import os
74
+ file_size = os.path.getsize(filepath)
75
+ metadata = PDFMetadata(
76
+ pdf_id=pdf_id,
77
+ filename=os.path.basename(filepath),
78
+ filepath=filepath,
79
+ num_pages=num_pages,
80
+ file_size_bytes=file_size,
81
+ num_chunks=len(chunks),
82
+ processing_status="completed"
83
+ )
84
+
85
+ logger.info(f"Extracted {len(chunks)} chunks from {num_pages} pages")
86
+ return chunks, metadata
87
+
88
+ def _process_page(
89
+ self,
90
+ fitz_page,
91
+ plumber_page,
92
+ page_num: int,
93
+ pdf_id: str
94
+ ) -> List[Chunk]:
95
+ """Process a single page and return all chunks"""
96
+ chunks: List[Chunk] = []
97
+
98
+ # 1. Extract raw text with character positions
99
+ page_text = fitz_page.get_text("text")
100
+
101
+ # 2. Extract tables
102
+ table_chunks = self._extract_tables(plumber_page, page_num, pdf_id)
103
+ chunks.extend(table_chunks)
104
+
105
+ # 3. Extract code blocks
106
+ code_chunks = self._extract_code_blocks(page_text, page_num, pdf_id)
107
+ chunks.extend(code_chunks)
108
+
109
+ # 4. Extract images and run OCR
110
+ image_chunks = self._extract_images(fitz_page, page_num, pdf_id)
111
+ chunks.extend(image_chunks)
112
+
113
+ # 5. Extract remaining text as paragraphs
114
+ # Remove table and code regions from text before creating paragraph chunks
115
+ cleaned_text = self._remove_extracted_regions(
116
+ page_text,
117
+ [c.text for c in code_chunks]
118
+ )
119
+
120
+ if cleaned_text.strip():
121
+ para_chunk = Chunk(
122
+ chunk_id=str(uuid.uuid4()),
123
+ pdf_id=pdf_id,
124
+ page_number=page_num,
125
+ char_range=(0, len(cleaned_text)),
126
+ type=ChunkType.PARAGRAPH,
127
+ text=cleaned_text,
128
+ metadata={"source": "text_extraction"}
129
+ )
130
+ chunks.append(para_chunk)
131
+
132
+ return chunks
133
+
134
+ def _extract_tables(self, plumber_page, page_num: int, pdf_id: str) -> List[Chunk]:
135
+ """Extract tables from page using pdfplumber"""
136
+ chunks = []
137
+ tables = plumber_page.extract_tables()
138
+
139
+ for idx, table in enumerate(tables):
140
+ if not table:
141
+ continue
142
+
143
+ # Convert table to structured JSON
144
+ table_json = self._table_to_json(table)
145
+
146
+ # Convert table to text representation
147
+ table_text = self._table_to_text(table)
148
+
149
+ chunk = Chunk(
150
+ chunk_id=str(uuid.uuid4()),
151
+ pdf_id=pdf_id,
152
+ page_number=page_num,
153
+ char_range=(0, len(table_text)),
154
+ type=ChunkType.TABLE,
155
+ text=table_text,
156
+ table_json=table_json,
157
+ metadata={"table_index": idx, "num_rows": len(table)}
158
+ )
159
+ chunks.append(chunk)
160
+
161
+ logger.debug(f"Extracted {len(chunks)} tables from page {page_num}")
162
+ return chunks
163
+
164
+ def _table_to_json(self, table: List[List[str]]) -> Dict[str, Any]:
165
+ """Convert table to structured JSON"""
166
+ if not table or len(table) < 2:
167
+ return {"headers": [], "rows": []}
168
+
169
+ headers = table[0]
170
+ rows = table[1:]
171
+
172
+ return {
173
+ "headers": headers,
174
+ "rows": [
175
+ {headers[i]: cell for i, cell in enumerate(row) if i < len(headers)}
176
+ for row in rows
177
+ ]
178
+ }
179
+
180
+ def _table_to_text(self, table: List[List[str]]) -> str:
181
+ """Convert table to readable text"""
182
+ return "\n".join([" | ".join([str(cell) for cell in row]) for row in table])
183
+
184
+ def _extract_code_blocks(self, text: str, page_num: int, pdf_id: str) -> List[Chunk]:
185
+ """Extract code blocks using heuristic patterns"""
186
+ chunks = []
187
+
188
+ # Look for code patterns
189
+ for pattern in self.code_patterns:
190
+ matches = pattern.finditer(text)
191
+ for match in matches:
192
+ code_text = match.group(0)
193
+ if len(code_text) < 20: # Skip very short matches
194
+ continue
195
+
196
+ chunk = Chunk(
197
+ chunk_id=str(uuid.uuid4()),
198
+ pdf_id=pdf_id,
199
+ page_number=page_num,
200
+ char_range=(match.start(), match.end()),
201
+ type=ChunkType.CODE,
202
+ text=code_text,
203
+ metadata={
204
+ "pattern": pattern.pattern,
205
+ "detected_language": self._detect_language(code_text)
206
+ }
207
+ )
208
+ chunks.append(chunk)
209
+
210
+ # Also detect monospace font regions (if PDF has font info)
211
+ # This is more advanced and would require font analysis
212
+
213
+ logger.debug(f"Extracted {len(chunks)} code blocks from page {page_num}")
214
+ return chunks
215
+
216
+ def _detect_language(self, code: str) -> str:
217
+ """Heuristically detect programming language"""
218
+ if 'def ' in code and ':' in code:
219
+ return 'python'
220
+ elif 'function' in code or 'const' in code or 'let' in code:
221
+ return 'javascript'
222
+ elif 'public class' in code or 'private' in code:
223
+ return 'java'
224
+ elif '#include' in code:
225
+ return 'c++'
226
+ else:
227
+ return 'unknown'
228
+
229
+ def _extract_images(self, fitz_page, page_num: int, pdf_id: str) -> List[Chunk]:
230
+ """Extract images and run OCR"""
231
+ chunks = []
232
+ image_list = fitz_page.get_images()
233
+
234
+ for img_index, img in enumerate(image_list):
235
+ try:
236
+ xref = img[0]
237
+ base_image = fitz_page.parent.extract_image(xref)
238
+ image_bytes = base_image["image"]
239
+
240
+ # Convert to PIL Image
241
+ image = Image.open(io.BytesIO(image_bytes))
242
+
243
+ # Run OCR
244
+ ocr_text = pytesseract.image_to_string(image)
245
+
246
+ if ocr_text.strip():
247
+ image_id = f"{pdf_id}_p{page_num}_img{img_index}"
248
+
249
+ chunk = Chunk(
250
+ chunk_id=str(uuid.uuid4()),
251
+ pdf_id=pdf_id,
252
+ page_number=page_num,
253
+ char_range=(0, len(ocr_text)),
254
+ type=ChunkType.IMAGE_TEXT,
255
+ text=ocr_text,
256
+ image_id=image_id,
257
+ metadata={
258
+ "image_format": base_image["ext"],
259
+ "image_index": img_index
260
+ }
261
+ )
262
+ chunks.append(chunk)
263
+ except Exception as e:
264
+ logger.warning(f"Failed to extract image {img_index} on page {page_num}: {e}")
265
+
266
+ logger.debug(f"Extracted {len(chunks)} images from page {page_num}")
267
+ return chunks
268
+
269
+ def _remove_extracted_regions(self, text: str, code_blocks: List[str]) -> str:
270
+ """Remove already-extracted code blocks from text"""
271
+ for code in code_blocks:
272
+ text = text.replace(code, "")
273
+ return text
274
+
275
+ def chunk_text(self, chunks: List[Chunk]) -> List[Chunk]:
276
+ """
277
+ Further chunk large text blocks into smaller overlapping chunks
278
+
279
+ Args:
280
+ chunks: Initial chunks from PDF extraction
281
+
282
+ Returns:
283
+ Refined chunks with proper overlap
284
+ """
285
+ refined_chunks = []
286
+
287
+ for chunk in chunks:
288
+ # Keep table and code chunks intact; only paragraph-style text is re-chunked below
289
+ if chunk.type in [ChunkType.TABLE, ChunkType.CODE]:
290
+ refined_chunks.append(chunk)
291
+ continue
292
+
293
+ # Split long paragraphs into smaller chunks with overlap
294
+ text = chunk.text
295
+ chunk_size = settings.chunk_size
296
+ overlap = settings.chunk_overlap
297
+
298
+ if len(text) <= chunk_size:
299
+ refined_chunks.append(chunk)
300
+ continue
301
+
302
+ # Create overlapping windows
303
+ for i in range(0, len(text), chunk_size - overlap):
304
+ chunk_text = text[i:i + chunk_size]
305
+
306
+ if len(chunk_text) < settings.min_chunk_size:
307
+ continue
308
+
309
+ new_chunk = Chunk(
310
+ chunk_id=str(uuid.uuid4()),
311
+ pdf_id=chunk.pdf_id,
312
+ page_number=chunk.page_number,
313
+ char_range=(i, i + len(chunk_text)),
314
+ type=chunk.type,
315
+ text=chunk_text,
316
+ metadata={
317
+ **chunk.metadata,
318
+ "parent_chunk_id": chunk.chunk_id,
319
+ "window_index": i // (chunk_size - overlap)
320
+ }
321
+ )
322
+ refined_chunks.append(new_chunk)
323
+
324
+ logger.info(f"Refined {len(chunks)} chunks into {len(refined_chunks)} chunks")
325
+ return refined_chunks
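
For orientation, here is a small sketch of what the overlapping-window split in chunk_text and the table helpers above produce. The concrete numbers assume settings.chunk_size=1000, settings.chunk_overlap=200 and settings.min_chunk_size=100, which are illustrative values, not necessarily the defaults shipped in config.py:

    # Hypothetical illustration only -- not part of the uploaded files.
    text = "x" * 2500
    chunk_size, overlap, min_chunk_size = 1000, 200, 100
    windows = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
    # window start offsets: 0, 800, 1600, 2400
    # window lengths:       1000, 1000, 900, 100
    # The trailing 100-char window survives only because it meets min_chunk_size;
    # anything shorter is dropped by the len(chunk_text) < settings.min_chunk_size check.

    # For a pdfplumber table (list of rows), the conversion helpers behave like this:
    table = [["Name", "Score"], ["alpha", "0.9"], ["beta", "0.7"]]
    # _table_to_json -> {"headers": ["Name", "Score"],
    #                    "rows": [{"Name": "alpha", "Score": "0.9"},
    #                             {"Name": "beta", "Score": "0.7"}]}
    # _table_to_text -> "Name | Score\nalpha | 0.9\nbeta | 0.7"
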
rag_agent.py ADDED
@@ -0,0 +1,485 @@
1
+ """
2
+ Agent-Based RAG System using LangGraph
3
+ Provides intelligent query answering with tool use and multi-hop reasoning
4
+ """
5
+ from typing import List, Dict, Any
6
+ from typing_extensions import TypedDict
7
+ from langgraph.graph import StateGraph, END, START
8
+ from langgraph.prebuilt import ToolNode
9
+ from langchain_core.tools import tool
10
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage
11
+ from loguru import logger
12
+ import asyncio
13
+
14
+ from models import SourceCitation, ChatResponse
15
+ from graph_store import GraphStore
16
+ from embedding_service import EmbeddingService
17
+ from llm_service import LLMService
18
+
19
+
20
+ class AgentState(TypedDict):
21
+ """State for the RAG agent workflow"""
22
+ messages: List # Conversation history
23
+ query: str # Current user question
24
+ pdf_id: str # PDF context
25
+ tool_results: Dict[str, Any] # Results from tool executions
26
+ reasoning_steps: List[str] # Agent's reasoning process
27
+ final_answer: str # Synthesized answer
28
+ citations: List[SourceCitation] # Supporting citations
29
+ next_action: str # What to do next
30
+
31
+
32
+ class RAGAgent:
33
+ """
34
+ Intelligent RAG agent that uses multiple tools to answer questions
35
+
36
+ Tools available:
37
+ 1. vector_search - Semantic search through document chunks
38
+ 2. graph_search - Find concepts in knowledge graph
39
+ 3. get_node_details - Get detailed info about a graph node
40
+ 4. get_related_nodes - Traverse graph relationships
41
+ 5. get_chunk_by_id - Retrieve specific chunks for citations
42
+ """
43
+
44
+ def __init__(self,
45
+ graph_store: GraphStore,
46
+ embedding_service: EmbeddingService,
47
+ llm_service: LLMService):
48
+ """Initialize the RAG agent with necessary services"""
49
+ self.graph_store = graph_store
50
+ self.embedding_service = embedding_service
51
+ self.llm_service = llm_service
52
+
53
+ # Build LangGraph workflow
54
+ self.workflow = self._build_workflow()
55
+ self.app = self.workflow.compile()
56
+
57
+ logger.info("✓ RAG Agent initialized with LangGraph workflow")
58
+
59
+ def _create_tools(self):
60
+ """Create tool functions for the agent"""
61
+
62
+ @tool
63
+ def vector_search(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
64
+ """
65
+ Search document chunks using semantic similarity.
66
+
67
+ Args:
68
+ query: The search query
69
+ top_k: Number of results to return
70
+
71
+ Returns:
72
+ List of relevant chunks with metadata and scores
73
+ """
74
+ logger.info(f"🔍 Tool: vector_search('{query}', top_k={top_k})")
75
+
76
+ try:
77
+ results = self.embedding_service.search(
78
+ query=query,
79
+ top_k=top_k
80
+ )
81
+
82
+ formatted_results = []
83
+ for metadata, score in results:
84
+ formatted_results.append({
85
+ "text": metadata.get("text", ""),
86
+ "page_number": metadata.get("page_number", 0),
87
+ "chunk_id": metadata.get("chunk_id", ""),
88
+ "score": float(score)
89
+ })
90
+
91
+ logger.info(f" ✓ Found {len(formatted_results)} chunks")
92
+ return formatted_results
93
+
94
+ except Exception as e:
95
+ logger.error(f" ✗ vector_search failed: {e}")
96
+ return []
97
+
98
+ @tool
99
+ def graph_search(concept: str) -> Dict[str, Any]:
100
+ """
101
+ Find a concept node in the knowledge graph.
102
+
103
+ Args:
104
+ concept: The concept to search for
105
+
106
+ Returns:
107
+ Node information if found, None otherwise
108
+ """
109
+ logger.info(f"🔍 Tool: graph_search('{concept}')")
110
+
111
+ try:
112
+ node = self.graph_store.get_node_by_label(concept)
113
+
114
+ if node:
115
+ logger.info(f" ✓ Found node: {node.label}")
116
+ return {
117
+ "node_id": node.node_id,
118
+ "label": node.label,
119
+ "type": node.type.value if hasattr(node.type, 'value') else node.type,
120
+ "importance": node.importance_score
121
+ }
122
+ else:
123
+ logger.info(f" ✗ No node found for '{concept}'")
124
+ return None
125
+
126
+ except Exception as e:
127
+ logger.error(f" ✗ graph_search failed: {e}")
128
+ return None
129
+
130
+ @tool
131
+ def get_node_details(node_id: str) -> Dict[str, Any]:
132
+ """
133
+ Get detailed information about a graph node.
134
+
135
+ Args:
136
+ node_id: The ID of the node
137
+
138
+ Returns:
139
+ Detailed node information including supporting chunks
140
+ """
141
+ logger.info(f"🔍 Tool: get_node_details('{node_id}')")
142
+
143
+ try:
144
+ node = self.graph_store.get_node(node_id)
145
+
146
+ if not node:
147
+ logger.info(f" ✗ Node not found")
148
+ return None
149
+
150
+ # Get supporting chunks
151
+ chunks = []
152
+ for chunk in node.supporting_chunks[:5]: # Top 5
153
+ chunks.append({
154
+ "page_number": chunk.page_number,
155
+ "snippet": chunk.snippet,
156
+ "score": chunk.score
157
+ })
158
+
159
+ logger.info(f" ✓ Got details for {node.label}")
160
+ return {
161
+ "label": node.label,
162
+ "type": node.type.value if hasattr(node.type, 'value') else node.type,
163
+ "importance": node.importance_score,
164
+ "supporting_chunks": chunks
165
+ }
166
+
167
+ except Exception as e:
168
+ logger.error(f" ✗ get_node_details failed: {e}")
169
+ return None
170
+
171
+ @tool
172
+ def get_related_nodes(node_id: str, max_neighbors: int = 5) -> List[Dict[str, Any]]:
173
+ """
174
+ Get nodes related to a given node (graph traversal).
175
+
176
+ Args:
177
+ node_id: The ID of the starting node
178
+ max_neighbors: Maximum number of related nodes to return
179
+
180
+ Returns:
181
+ List of related nodes with relationship information
182
+ """
183
+ logger.info(f"🔍 Tool: get_related_nodes('{node_id}', max={max_neighbors})")
184
+
185
+ try:
186
+ neighbors = self.graph_store.get_neighbors(node_id)
187
+
188
+ related = []
189
+ for neighbor_node, edge in neighbors[:max_neighbors]:
190
+ relation_value = edge.relation.value if hasattr(edge.relation, 'value') else edge.relation
191
+ related.append({
192
+ "node_id": neighbor_node.node_id,
193
+ "label": neighbor_node.label,
194
+ "relation": relation_value,
195
+ "confidence": edge.confidence
196
+ })
197
+
198
+ logger.info(f" ✓ Found {len(related)} related nodes")
199
+ return related
200
+
201
+ except Exception as e:
202
+ logger.error(f" ✗ get_related_nodes failed: {e}")
203
+ return []
204
+
205
+ @tool
206
+ def get_chunk_by_id(chunk_id: str) -> Dict[str, Any]:
207
+ """
208
+ Retrieve a specific chunk by its ID (for detailed citations).
209
+
210
+ Args:
211
+ chunk_id: The chunk identifier
212
+
213
+ Returns:
214
+ Chunk content and metadata
215
+ """
216
+ logger.info(f"🔍 Tool: get_chunk_by_id('{chunk_id}')")
217
+
218
+ try:
219
+ # Search by chunk_id in metadata
220
+ # This is a simplified version - you may need to implement proper chunk lookup
221
+ results = self.embedding_service.search_by_chunk_ids([chunk_id], top_k=1)
222
+
223
+ if results:
224
+ metadata, score = results[0]
225
+ logger.info(f" ✓ Found chunk")
226
+ return {
227
+ "text": metadata.get("text", ""),
228
+ "page_number": metadata.get("page_number", 0),
229
+ "chunk_id": chunk_id
230
+ }
231
+ else:
232
+ logger.info(f" ✗ Chunk not found")
233
+ return None
234
+
235
+ except Exception as e:
236
+ logger.error(f" ✗ get_chunk_by_id failed: {e}")
237
+ return None
238
+
239
+ return [vector_search, graph_search, get_node_details, get_related_nodes, get_chunk_by_id]
240
+
241
+ def _build_workflow(self) -> StateGraph:
242
+ """Build the LangGraph workflow for the agent"""
243
+
244
+ workflow = StateGraph(AgentState)
245
+
246
+ # Define workflow nodes
247
+ workflow.add_node("plan", self._plan_node)
248
+ workflow.add_node("execute_tools", self._execute_tools_node)
249
+ workflow.add_node("synthesize", self._synthesize_node)
250
+
251
+ # Define edges
252
+ workflow.add_edge(START, "plan")
253
+ workflow.add_conditional_edges(
254
+ "plan",
255
+ self._should_use_tools,
256
+ {
257
+ "tools": "execute_tools",
258
+ "direct": "synthesize"
259
+ }
260
+ )
261
+ workflow.add_edge("execute_tools", "synthesize")
262
+ workflow.add_edge("synthesize", END)
263
+
264
+ return workflow
265
+
266
+ def _plan_node(self, state: AgentState) -> AgentState:
267
+ """Agent decides which tools to use"""
268
+ logger.info("🤖 Agent: Planning which tools to use...")
269
+
270
+ query = state["query"]
271
+
272
+ # Simple heuristic-based planning (can be enhanced with LLM)
273
+ tools_to_use = []
274
+ reasoning = []
275
+
276
+ # Always use vector search for semantic matching
277
+ tools_to_use.append("vector_search")
278
+ reasoning.append("Use vector search for semantic document retrieval")
279
+
280
+ # Check if query mentions specific concepts (use graph)
281
+ if any(word in query.lower() for word in ["relate", "connection", "link", "between"]):
282
+ tools_to_use.append("graph_search")
283
+ reasoning.append("Query asks about relationships - use graph search")
284
+
285
+ # Check if asking about a specific concept
286
+ if any(word in query.lower() for word in ["what is", "define", "explain"]):
287
+ tools_to_use.append("graph_search")
288
+ reasoning.append("Query asks for concept definition - check graph")
289
+
290
+ state["tool_results"] = {"planned_tools": tools_to_use}
291
+ state["reasoning_steps"] = reasoning
292
+ state["next_action"] = "tools" if tools_to_use else "direct"
293
+
294
+ logger.info(f" Plan: {tools_to_use}")
295
+ return state
296
+
297
+ def _should_use_tools(self, state: AgentState) -> str:
298
+ """Decide if tools are needed"""
299
+ return state.get("next_action", "direct")
300
+
301
+ def _execute_tools_node(self, state: AgentState) -> AgentState:
302
+ """Execute the planned tools"""
303
+ logger.info("🔧 Agent: Executing tools...")
304
+
305
+ query = state["query"]
306
+ planned_tools = state["tool_results"].get("planned_tools", [])
307
+ results = {}
308
+
309
+ # Create tools
310
+ tools_map = {}
311
+ for tool_fn in self._create_tools():
312
+ tools_map[tool_fn.name] = tool_fn
313
+
314
+ # Execute tools
315
+ if "vector_search" in planned_tools:
316
+ vector_tool = tools_map["vector_search"]
317
+ results["vector_results"] = vector_tool.invoke({"query": query, "top_k": 5})
318
+
319
+ if "graph_search" in planned_tools:
320
+ # Extract main concept from query (simplified)
321
+ # In production, use NER or LLM to extract concept
322
+ words = query.lower().split()
323
+ potential_concepts = [w for w in words if len(w) > 4 and w not in ["what", "how", "does", "relate"]]
324
+
325
+ for concept in potential_concepts[:2]: # Try first 2
326
+ graph_tool = tools_map["graph_search"]
327
+ node_result = graph_tool.invoke({"concept": concept})
328
+ if node_result:
329
+ results[f"graph_node_{concept}"] = node_result
330
+
331
+ # Get related nodes
332
+ related_tool = tools_map["get_related_nodes"]
333
+ related = related_tool.invoke({"node_id": node_result["node_id"], "max_neighbors": 3})
334
+ results[f"related_{concept}"] = related
335
+ break
336
+
337
+ state["tool_results"].update(results)
338
+ logger.info(f" ✓ Executed {len(planned_tools)} tools, got {len(results)} results")
339
+ return state
340
+
341
+ async def _synthesize_node(self, state: AgentState) -> AgentState:
342
+ """Synthesize final answer from tool results"""
343
+ logger.info("🎯 Agent: Synthesizing answer...")
344
+
345
+ query = state["query"]
346
+ tool_results = state["tool_results"]
347
+
348
+ # Prepare context from tool results
349
+ context_parts = []
350
+ citations = []
351
+
352
+ # Add vector search results
353
+ if "vector_results" in tool_results:
354
+ vector_results = tool_results["vector_results"]
355
+ for i, result in enumerate(vector_results[:3]): # Top 3
356
+ context_parts.append(f"[Source {i+1}, p.{result['page_number']}]: {result['text']}")
357
+ citations.append(SourceCitation(
358
+ page_number=result["page_number"],
359
+ snippet=result["text"][:120] + "..." if len(result["text"]) > 120 else result["text"],
360
+ chunk_id=result["chunk_id"],
361
+ score=result["score"]
362
+ ))
363
+
364
+ # Add graph results
365
+ for key, value in tool_results.items():
366
+ if key.startswith("graph_node_"):
367
+ concept = key.replace("graph_node_", "")
368
+ context_parts.append(f"[Graph Node]: '{value['label']}' is a {value['type']} (importance: {value['importance']:.2f})")
369
+ elif key.startswith("related_"):
370
+ concept = key.replace("related_", "")
371
+ if value:
372
+ relations = ", ".join([f"{r['label']} ({r['relation']})" for r in value])
373
+ context_parts.append(f"[Related Concepts]: {relations}")
374
+
375
+ # Create context for LLM
376
+ context = "\n\n".join(context_parts)
377
+
378
+ # Generate answer using Gemini
379
+ answer = await self.llm_service.agent_synthesize(query, context)
380
+
381
+ state["final_answer"] = answer
382
+ state["citations"] = citations
383
+
384
+ logger.info(" ✓ Answer synthesized")
385
+ return state
386
+
387
+ async def chat(self, query: str, pdf_id: str = None, include_citations: bool = True) -> ChatResponse:
388
+ """
389
+ Main entry point for agent-based chat
390
+
391
+ Args:
392
+ query: User's question
393
+ pdf_id: Optional PDF context
394
+ include_citations: Whether to include source citations
395
+
396
+ Returns:
397
+ ChatResponse with answer and citations
398
+ """
399
+ logger.info(f"\n{'='*80}")
400
+ logger.info(f"🤖 Agent-Based RAG Query: '{query}'")
401
+ logger.info(f"{'='*80}")
402
+
403
+ # Initialize state
404
+ initial_state = {
405
+ "messages": [HumanMessage(content=query)],
406
+ "query": query,
407
+ "pdf_id": pdf_id or "",
408
+ "tool_results": {},
409
+ "reasoning_steps": [],
410
+ "final_answer": "",
411
+ "citations": [],
412
+ "next_action": ""
413
+ }
414
+
415
+ try:
416
+ # Run workflow
417
+ final_state = await self.app.ainvoke(initial_state)
418
+
419
+ # Extract results
420
+ answer = final_state.get("final_answer", "I couldn't generate an answer.")
421
+ citations = final_state.get("citations", [])
422
+
423
+ if not include_citations:
424
+ citations = []
425
+
426
+ logger.info(f"✓ Agent completed successfully")
427
+ logger.info(f" Answer length: {len(answer)} chars")
428
+ logger.info(f" Citations: {len(citations)}")
429
+ logger.info(f"{'='*80}\n")
430
+
431
+ return ChatResponse(
432
+ answer=answer,
433
+ sources=citations[:5] # Top 5 citations
434
+ )
435
+
436
+ except Exception as e:
437
+ logger.exception(f"❌ Agent failed: {e}")
438
+
439
+ # Fallback to simple vector search
440
+ logger.warning("Falling back to simple RAG...")
441
+ return await self._fallback_simple_rag(query, pdf_id)
442
+
443
+ async def _fallback_simple_rag(self, query: str, pdf_id: str = None) -> ChatResponse:
444
+ """Fallback to simple RAG if agent fails"""
445
+ try:
446
+ results = self.embedding_service.search(query=query, top_k=5, filter_pdf_id=pdf_id)
447
+
448
+ if not results:
449
+ return ChatResponse(
450
+ answer="I couldn't find relevant information to answer your question.",
451
+ sources=[]
452
+ )
453
+
454
+ # Prepare context
455
+ context_chunks = [
456
+ {
457
+ "page_number": meta.get("page_number", 0),
458
+ "text": meta.get("text", "")
459
+ }
460
+ for meta, score in results[:3]
461
+ ]
462
+
463
+ # Generate answer
464
+ answer = await self.llm_service.rag_chat(query, context_chunks)
465
+
466
+ # Format sources
467
+ sources = []
468
+ for meta, score in results[:5]:
469
+ text = meta.get("text", "")
470
+ snippet = text[:120] + "..." if len(text) > 120 else text
471
+ sources.append(SourceCitation(
472
+ page_number=meta.get("page_number", 0),
473
+ snippet=snippet,
474
+ chunk_id=meta.get("chunk_id", ""),
475
+ score=score
476
+ ))
477
+
478
+ return ChatResponse(answer=answer, sources=sources)
479
+
480
+ except Exception as e:
481
+ logger.error(f"Fallback RAG also failed: {e}")
482
+ return ChatResponse(
483
+ answer="I encountered an error processing your question.",
484
+ sources=[]
485
+ )
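
A minimal wiring sketch for the agent above. It assumes the no-argument constructors implied elsewhere in the repo: GraphStore(use_neo4j=False) mirrors the in-memory mode used in tests/test_basic.py, while EmbeddingService() and LLMService() are assumed to be constructible without arguments, so adjust to the actual signatures:

    # Hypothetical usage sketch -- not part of the uploaded files.
    import asyncio

    from graph_store import GraphStore
    from embedding_service import EmbeddingService
    from llm_service import LLMService
    from rag_agent import RAGAgent

    async def main():
        agent = RAGAgent(
            graph_store=GraphStore(use_neo4j=False),
            embedding_service=EmbeddingService(),
            llm_service=LLMService(),
        )
        # chat() runs the plan -> execute_tools -> synthesize workflow and
        # falls back to plain vector-search RAG if the workflow raises.
        response = await agent.chat("How do the main concepts in this PDF relate?", pdf_id=None)
        print(response.answer)
        for src in response.sources:
            print(f"p.{src.page_number}: {src.snippet}")

    asyncio.run(main())
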
requirements.txt ADDED
@@ -0,0 +1,59 @@
1
+ # PDF Processing
2
+ PyMuPDF
3
+ pdfplumber
4
+ pytesseract
5
+ Pillow
6
+ camelot-py[cv]
7
+ tabula-py
8
+
9
+ # NLP & Embeddings
10
+ sentence-transformers
11
+ transformers
12
+ torch>=2.2.0
13
+ keybert
14
+ yake
15
+
16
+ # Knowledge Graph Generation
17
+ kg-gen
18
+ litellm
19
+
20
+ # Multi-Agent System with LangGraph
21
+ langgraph
22
+ langchain
23
+ langchain-core
24
+ langchain-community
25
+
26
+ # Vector Store & Search
27
+ faiss-cpu
28
+
29
+ # Graph Database & Processing
30
+ neo4j
31
+ networkx
32
+
33
+ # Backend & API
34
+ fastapi
35
+ uvicorn[standard]
36
+ python-multipart
37
+ pydantic
38
+ pydantic-settings
39
+
40
+ # Database
41
+ sqlalchemy
42
+ psycopg2-binary
43
+ pymongo
44
+
45
+ # Utilities
46
+ python-dotenv
47
+ loguru
48
+ tenacity
49
+ httpx
50
+ aiofiles
51
+
52
+ # Monitoring & DevOps
53
+ prometheus-client
54
+ python-json-logger
55
+
56
+ # Testing
57
+ pytest
58
+ pytest-asyncio
59
+ pytest-cov
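
Note that the list above is almost entirely unpinned: apart from the torch>=2.2.0 lower bound, every rebuild of the Space resolves fresh versions. If a build ever needs to be reproduced exactly, one pragmatic option (an optional workflow, not something this commit sets up) is to freeze the resolved set once the image works:

    pip install -r requirements.txt
    pip freeze > requirements.lock.txt
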
tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Tests package
tests/test_basic.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ Basic tests for GraphLLM components
3
+ """
4
+ import pytest
5
+ from models import Chunk, ChunkType, GraphNode, GraphEdge, Triple, NodeType, RelationType
6
+ from config import settings
7
+
8
+
9
+ def test_chunk_creation():
10
+ """Test chunk model creation"""
11
+ chunk = Chunk(
12
+ pdf_id="test-pdf",
13
+ page_number=1,
14
+ char_range=(0, 100),
15
+ type=ChunkType.PARAGRAPH,
16
+ text="This is a test chunk."
17
+ )
18
+
19
+ assert chunk.pdf_id == "test-pdf"
20
+ assert chunk.page_number == 1
21
+ assert chunk.type == ChunkType.PARAGRAPH
22
+ assert chunk.text == "This is a test chunk."
23
+
24
+
25
+ def test_graph_node_creation():
26
+ """Test graph node creation"""
27
+ node = GraphNode(
28
+ label="Test Concept",
29
+ type=NodeType.CONCEPT,
30
+ aliases=["test", "concept"],
31
+ supporting_chunks=[],
32
+ importance_score=0.75
33
+ )
34
+
35
+ assert node.label == "Test Concept"
36
+ assert node.type == NodeType.CONCEPT
37
+ assert node.importance_score == 0.75
38
+
39
+
40
+ def test_graph_edge_creation():
41
+ """Test graph edge creation"""
42
+ edge = GraphEdge(
43
+ from_node="node1",
44
+ to_node="node2",
45
+ relation=RelationType.USES,
46
+ confidence=0.8,
47
+ supporting_chunks=[]
48
+ )
49
+
50
+ assert edge.from_node == "node1"
51
+ assert edge.to_node == "node2"
52
+ assert edge.relation == RelationType.USES
53
+ assert edge.confidence == 0.8
54
+
55
+
56
+ def test_triple_creation():
57
+ """Test triple model"""
58
+ triple = Triple(
59
+ subject="Machine Learning",
60
+ predicate="uses",
61
+ object="Neural Networks",
62
+ confidence=0.9,
63
+ page_number=5
64
+ )
65
+
66
+ assert triple.subject == "Machine Learning"
67
+ assert triple.predicate == "uses"
68
+ assert triple.object == "Neural Networks"
69
+ assert triple.confidence == 0.9
70
+
71
+
72
+ def test_settings_load():
73
+ """Test configuration loading"""
74
+ assert settings.app_name == "GraphLLM"
75
+ assert settings.chunk_size > 0
76
+ assert settings.embedding_model is not None
77
+
78
+
79
+ @pytest.mark.asyncio
80
+ async def test_pdf_processor_import():
81
+ """Test PDF processor can be imported"""
82
+ from pdf_processor import PDFProcessor
83
+ processor = PDFProcessor()
84
+ assert processor is not None
85
+
86
+
87
+ @pytest.mark.asyncio
88
+ async def test_embedding_service_import():
89
+ """Test embedding service can be imported"""
90
+ from embedding_service import EmbeddingService
91
+ # Note: This will load the model, may take time
92
+ # service = EmbeddingService()
93
+ # assert service is not None
94
+ pass
95
+
96
+
97
+ @pytest.mark.asyncio
98
+ async def test_graph_store_import():
99
+ """Test graph store can be imported"""
100
+ from graph_store import GraphStore
101
+ store = GraphStore(use_neo4j=False)
102
+ assert store is not None
103
+
104
+
105
+ if __name__ == "__main__":
106
+ pytest.main([__file__, "-v"])
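
With pytest, pytest-asyncio and pytest-cov already listed in requirements.txt, the suite can be run from the repository root; the coverage flags are optional:

    pytest tests/ -v
    pytest tests/ -v --cov=. --cov-report=term-missing
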