nvtitan committed on
Commit e884643 · verified · 1 Parent(s): 5cc12a8

Upload 24 files

.dockerignore ADDED
@@ -0,0 +1,92 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+
11
+ # Virtual Environments
12
+ venv/
13
+ env/
14
+ .venv/
15
+ ENV/
16
+
17
+ # Environment Variables
18
+ .env
19
+ .env.*
20
+ !.env.example
21
+
22
+ # Data (exclude from image - will be created at runtime)
23
+ data/
24
+ uploads/
25
+ *.pdf
26
+ *.pkl
27
+ *.faiss
28
+ *.index
29
+
30
+ # Logs (exclude from image)
31
+ logs/
32
+ *.log
33
+
34
+ # Cache
35
+ cache/
36
+ .cache/
37
+ __pycache__/
38
+
39
+ # IDEs
40
+ .vscode/
41
+ .idea/
42
+ *.swp
43
+ *.swo
44
+ *.sublime-*
45
+
46
+ # OS
47
+ .DS_Store
48
+ Thumbs.db
49
+ desktop.ini
50
+
51
+ # Testing
52
+ .coverage
53
+ htmlcov/
54
+ .pytest_cache/
55
+ .tox/
56
+ *.cover
57
+ tests/
58
+
59
+ # Database files
60
+ *.db
61
+ *.sqlite
62
+ *.sqlite3
63
+
64
+ # Git
65
+ .git/
66
+ .gitignore
67
+ .gitattributes
68
+
69
+ # Documentation
70
+ docs/
71
+ *.md
72
+ !README.md
73
+
74
+ # Deployment configs (not needed in container)
75
+ railway.toml
76
+ nixpacks.toml
77
+ Procfile
78
+ modal_app.py
79
+ fly.toml
80
+ vercel.json
81
+ heroku.yml
82
+ docker-compose*.yml
83
+
84
+ # CI/CD
85
+ .github/
86
+ .gitlab-ci.yml
87
+ .travis.yml
88
+
89
+ # Misc
90
+ *.bak
91
+ *.tmp
92
+ *.temp
.gitignore ADDED
@@ -0,0 +1,72 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ ENV/
26
+ env/
27
+ .venv
28
+
29
+ # Environment Variables
30
+ .env
31
+ .env.local
32
+
33
+ # Data & Uploads
34
+ data/
35
+ uploads/
36
+ *.pdf
37
+ *.faiss
38
+ *.index
39
+
40
+ # Logs
41
+ logs/
42
+ *.log
43
+
44
+ # Cache
45
+ cache/
46
+ .cache/
47
+ *.cache
48
+
49
+ # IDEs
50
+ .vscode/
51
+ .idea/
52
+ *.swp
53
+ *.swo
54
+ *~
55
+
56
+ # OS
57
+ .DS_Store
58
+ Thumbs.db
59
+
60
+ # Testing
61
+ .coverage
62
+ htmlcov/
63
+ .pytest_cache/
64
+ .tox/
65
+
66
+ # Database
67
+ *.db
68
+ *.sqlite
69
+ *.sqlite3
70
+
71
+ # Neo4j
72
+ neo4j/
.railwayignore ADDED
@@ -0,0 +1,26 @@
1
+ # Ignore local development files
2
+ venv/
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ .Python
8
+ *.so
9
+ *.egg
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+ .env.local
14
+ .DS_Store
15
+
16
+ # Ignore local data (will be created on Railway)
17
+ data/
18
+ uploads/
19
+ logs/
20
+ cache/
21
+
22
+ # Ignore development artifacts
23
+ .pytest_cache/
24
+ .coverage
25
+ htmlcov/
26
+ *.log
Dockerfile ADDED
@@ -0,0 +1,51 @@
1
+ # GraphLLM - Hugging Face Spaces Deployment
2
+ # Optimized Docker image for HF Spaces
3
+
4
+ FROM python:3.12-slim
5
+
6
+ # Set environment variables
7
+ ENV PYTHONUNBUFFERED=1 \
8
+ PYTHONDONTWRITEBYTECODE=1 \
9
+ PIP_NO_CACHE_DIR=1 \
10
+ PIP_DISABLE_PIP_VERSION_CHECK=1 \
11
+ DEBIAN_FRONTEND=noninteractive \
12
+ API_PORT=7860 \
13
+ HF_HOME=/app/cache \
14
+ TRANSFORMERS_CACHE=/app/cache \
15
+ SENTENCE_TRANSFORMERS_HOME=/app/cache
16
+
17
+ # Set working directory
18
+ WORKDIR /app
19
+
20
+ # Install system dependencies (minimal set for HF Spaces)
21
+ RUN apt-get update && apt-get install -y --no-install-recommends \
22
+ build-essential \
23
+ curl \
24
+ tesseract-ocr \
25
+ ghostscript \
26
+ && rm -rf /var/lib/apt/lists/* \
27
+ && apt-get clean
28
+
29
+ # Copy requirements first (for better layer caching)
30
+ COPY requirements.txt .
31
+
32
+ # Install Python dependencies
33
+ RUN pip install --no-cache-dir -r requirements.txt
34
+
35
+ # Copy application code
36
+ COPY . .
37
+
38
+ # Create data directories with proper permissions (777 for HF Spaces non-root user)
39
+ RUN mkdir -p data uploads logs cache data/faiss_index && \
40
+ chmod -R 777 data uploads logs cache
41
+
42
+ # Expose Hugging Face Spaces default port
43
+ EXPOSE 7860
44
+
45
+ # Health check
46
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
47
+ CMD curl -f http://localhost:7860/ || exit 1
48
+
49
+ # Run the application
50
+ # HF Spaces expects the app to listen on 0.0.0.0:7860
51
+ CMD ["python3", "main.py"]
Procfile ADDED
@@ -0,0 +1 @@
1
+ web: python main.py
README.md CHANGED
@@ -1,12 +1,94 @@
1
- ---
2
- title: GraphRAG
3
- emoji: 🦀
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: graphRAG for PDFs
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+
2
+
3
+ # 🕸️ GraphLLM - PDF Knowledge Graph + RAG System
4
+
5
+ Transform PDFs into interactive knowledge graphs with AI-powered Q&A.
6
+
7
+ ## 🚀 Features
8
+
9
+ - **📄 PDF Processing:** Extract text, tables, and images from PDFs
10
+ - **🕸️ Knowledge Graph Generation:** Build semantic graphs using Gemini AI
11
+ - **🔍 Vector Search:** FAISS-powered semantic search with sentence transformers
12
+ - **💬 RAG Chat:** Ask questions and get answers with source citations
13
+ - **🎨 Interactive Visualization:** Explore knowledge graphs in your browser
14
+
15
+ ## 🛠️ Technology Stack
16
+
17
+ - **LLM:** Google Gemini (gemini-2.5-flash)
18
+ - **Embeddings:** sentence-transformers/all-MiniLM-L6-v2
19
+ - **Vector Store:** FAISS with HNSW index
20
+ - **Graph:** NetworkX (in-memory)
21
+ - **Backend:** FastAPI + Uvicorn
22
+ - **Frontend:** Vanilla JS with D3.js/Cytoscape
23
+
24
+ ## 📋 Setup
25
+
26
+ ### Required: Gemini API Key
27
+
28
+ This app requires a Google Gemini API key:
29
+
30
+ 1. Get your API key from [Google AI Studio](https://makersuite.google.com/app/apikey)
31
+ 2. Add it as a **Secret** in Hugging Face Spaces settings:
32
+ - Name: `GEMINI_API_KEY`
33
+ - Value: Your API key
34
+
35
+ ### Configuration (Optional)
36
+
37
+ You can set these environment variables in Space Settings:
38
+
39
+ ```bash
40
+ # LLM Settings
41
+ GEMINI_MODEL=gemini-2.5-flash # Gemini model
42
+ LLM_TEMPERATURE=0.0 # Temperature for extraction
43
+
44
+ # Embedding Settings
45
+ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
46
+
47
+ # Environment
48
+ ENVIRONMENT=production
49
+ LOG_LEVEL=INFO
50
+ ```
51
+
52
+ ## 🎯 Usage
53
+
54
+ 1. **Upload PDF:** Click "Upload PDF" and select your document
55
+ 2. **Wait for Processing:** The system will:
56
+ - Extract text chunks
57
+ - Generate embeddings
58
+ - Build knowledge graph with Gemini
59
+ 3. **Explore Graph:** Click nodes to see details and related concepts
60
+ 4. **Ask Questions:** Use the chat interface for Q&A with citations
61
+
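The steps above describe the browser flow. The same workflow can also be scripted against the HTTP endpoints the bundled frontend calls (`/upload`, `/status/{pdf_id}`, `/chat`). The snippet below is a minimal sketch using the `requests` library; the Space URL and PDF path are placeholders, and the payload fields mirror what `frontend/app.js` sends:

```python
import time
import requests

BASE = "https://your-space-url.hf.space"  # placeholder: your Space URL

# 1. Upload a PDF (multipart form field named "file", as in the frontend)
with open("document.pdf", "rb") as f:  # placeholder path
    pdf_id = requests.post(f"{BASE}/upload", files={"file": f}).json()["pdf_id"]

# 2. Poll processing status until the graph has been built
while True:
    status = requests.get(f"{BASE}/status/{pdf_id}").json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(2)

# 3. Ask a question with citations
reply = requests.post(f"{BASE}/chat", json={
    "query": "What is the main contribution of this document?",
    "pdf_id": pdf_id,
    "include_citations": True,
    "max_sources": 5,
}).json()
print(reply["answer"])
for src in reply.get("sources", []):
    print(f'p.{src["page_number"]}: {src["snippet"]}')
```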
62
+ ## 📊 Graph Generation
63
+
64
+ - **Per-Page Extraction:** Max 2 concepts per page (quality over quantity)
65
+ - **Parallel Processing:** All pages processed concurrently via Gemini API
66
+ - **Strict Filtering:** Only technical/domain-specific concepts
67
+ - **Co-occurrence Relationships:** Concepts on same page are linked
68
+
69
+ ## 🎨 Frontend
70
+
71
+ The frontend is a single-page application located in `/frontend/`:
72
+ - `index.html` - Main UI
73
+ - `app.js` - Graph visualization & API calls
74
+ - `styles.css` - Styling
75
+
76
+ Access it at: `http://your-space-url.hf.space/frontend/`
77
+
78
+
79
+ ## 📦 Docker
80
+
81
+ This Space uses Docker for deployment:
82
+ - Base: `python:3.12-slim`
83
+ - Port: 7860 (HF Spaces default)
84
+ - Health check enabled
85
+ - Persistent data directory
86
+
87
+ ## 🤝 Credits
88
+
89
+ - **LLM:** Google Gemini
90
+ - **Embeddings:** Hugging Face sentence-transformers
91
+
92
+
93
  ---
94
 
 
config.py ADDED
@@ -0,0 +1,127 @@
1
+ """
2
+ Configuration management for GraphLLM system
3
+ """
4
+ from pydantic_settings import BaseSettings
5
+ from pydantic import Field, field_validator
6
+ from typing import Optional
7
+ import os
8
+
9
+
10
+ class Settings(BaseSettings):
11
+ """Application settings loaded from environment variables"""
12
+
13
+ # Application
14
+ app_name: str = "GraphLLM"
15
+ app_version: str = "1.0.0"
16
+ environment: str = "development"
17
+ debug: bool = True
18
+
19
+ # API
20
+ api_host: str = "0.0.0.0"
21
+ api_port: int = 8000
22
+ api_workers: int = 4
23
+
24
+ # LLM Settings - Gemini (Primary)
25
+ gemini_api_key: str = Field(default="", env="GEMINI_API_KEY")
26
+ gemini_model: str = "gemini-2.5-flash"
27
+
28
+ # LLM Settings - Mistral (Fallback)
29
+ mistral_api_key: str = Field(default="", env="MISTRAL_API_KEY")
30
+ mistral_model: str = "mistral-7b-instruct-v0.1"
31
+
32
+ # LLM Parameters
33
+ llm_temperature: float = 0.0
34
+ llm_max_tokens: int = 2048
35
+ llm_timeout: int = 120
36
+
37
+ # Embedding Settings
38
+ embedding_model: str = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
39
+ embedding_dimension: int = 384
40
+ embedding_batch_size: int = 32
41
+
42
+ # FAISS Vector DB
43
+ faiss_index_path: str = "./data/faiss_index"
44
+ faiss_metric: str = "cosine"
45
+
46
+ # Neo4j Graph DB
47
+ neo4j_uri: str = "bolt://localhost:7687"
48
+ neo4j_user: str = "neo4j"
49
+ neo4j_password: str = Field(default="", env="NEO4J_PASSWORD")
50
+ neo4j_database: str = "neo4j"
51
+
52
+ # PostgreSQL
53
+ postgres_host: str = "localhost"
54
+ postgres_port: int = 5432
55
+ postgres_db: str = "graphllm"
56
+ postgres_user: str = "postgres"
57
+ postgres_password: str = Field(default="", env="POSTGRES_PASSWORD")
58
+
59
+ # MongoDB (optional)
60
+ mongodb_uri: str = "mongodb://localhost:27017"
61
+ mongodb_database: str = "graphllm"
62
+
63
+ # Chunking
64
+ chunk_size: int = 512
65
+ chunk_overlap: int = 128
66
+ min_chunk_size: int = 100
67
+
68
+ # Triplet Extraction
69
+ triplet_confidence_threshold: float = 0.6
70
+ entity_similarity_threshold: float = 0.85
71
+ max_triples_per_chunk: int = 10
72
+
73
+ # Graph Pruning
74
+ node_importance_threshold: float = 0.3
75
+ edge_confidence_threshold: float = 0.5
76
+ min_node_mentions: int = 2
77
+
78
+ # RAG
79
+ rag_top_k: int = 10
80
+ rag_rerank_top_k: int = 5
81
+ max_context_length: int = 4000
82
+
83
+ # File Upload
84
+ max_file_size_mb: int = 50
85
+ allowed_extensions: str = "pdf"
86
+ upload_dir: str = "./data/uploads"
87
+
88
+ # Storage
89
+ data_dir: str = "./data"
90
+ logs_dir: str = "./logs"
91
+ cache_dir: str = "./cache"
92
+
93
+ # Monitoring
94
+ enable_metrics: bool = True
95
+ metrics_port: int = 9090
96
+ log_level: str = "INFO"
97
+
98
+ @property
99
+ def postgres_url(self) -> str:
100
+ """Build PostgreSQL connection URL"""
101
+ return f"postgresql://{self.postgres_user}:{self.postgres_password}@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"
102
+
103
+ @property
104
+ def max_file_size_bytes(self) -> int:
105
+ """Convert MB to bytes"""
106
+ return self.max_file_size_mb * 1024 * 1024
107
+
108
+ class Config:
109
+ env_file = ".env"
110
+ case_sensitive = False
111
+
112
+
113
+ # Global settings instance
114
+ settings = Settings()
115
+
116
+
117
+ def ensure_directories():
118
+ """Ensure all required directories exist"""
119
+ dirs = [
120
+ settings.data_dir,
121
+ settings.upload_dir,
122
+ settings.logs_dir,
123
+ settings.cache_dir,
124
+ settings.faiss_index_path,
125
+ ]
126
+ for directory in dirs:
127
+ os.makedirs(directory, exist_ok=True)
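Because `Settings` is a `pydantic-settings` model with `env_file = ".env"` and `case_sensitive = False`, every field above can be overridden through ordinary environment variables or a local `.env` file. A minimal sketch of how the module is consumed (the override values are illustrative only):

```python
import os

# Illustrative overrides; set before importing config so Settings() picks them up
os.environ["GEMINI_MODEL"] = "gemini-2.5-flash"
os.environ["LLM_TEMPERATURE"] = "0.0"

from config import settings, ensure_directories

ensure_directories()                 # creates ./data, ./data/uploads, ./logs, ./cache, ./data/faiss_index
print(settings.gemini_model)         # "gemini-2.5-flash"
print(settings.max_file_size_bytes)  # 50 MB default -> 52428800
print(settings.postgres_url)         # postgresql://postgres:...@localhost:5432/graphllm
```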
docker-compose.yml ADDED
@@ -0,0 +1,68 @@
1
+ # GraphLLM Docker Compose Configuration
2
+ # Simple standalone deployment with persistent storage
3
+
4
+ version: '3.8'
5
+
6
+ services:
7
+ # Main GraphLLM Application
8
+ graphllm:
9
+ build:
10
+ context: .
11
+ dockerfile: Dockerfile
12
+ container_name: graphllm
13
+ image: graphllm:latest
14
+ ports:
15
+ - "8000:8000"
16
+ volumes:
17
+ # Persistent storage for data, uploads, and logs
18
+ - graphllm-data:/app/data
19
+ - graphllm-uploads:/app/uploads
20
+ - graphllm-logs:/app/logs
21
+ - graphllm-cache:/app/cache
22
+ environment:
23
+ # Gemini API Configuration
24
+ - GEMINI_API_KEY=${GEMINI_API_KEY}
25
+ - GEMINI_MODEL=${GEMINI_MODEL:-gemini-1.5-flash}
26
+
27
+ # Application Settings
28
+ - ENVIRONMENT=${ENVIRONMENT:-production}
29
+ - LOG_LEVEL=${LOG_LEVEL:-INFO}
30
+ - DEBUG=false
31
+
32
+ # LLM Settings
33
+ - LLM_TEMPERATURE=${LLM_TEMPERATURE:-0.7}
34
+ - LLM_MAX_TOKENS=${LLM_MAX_TOKENS:-2048}
35
+
36
+ # Embedding Settings
37
+ - EMBEDDING_MODEL=${EMBEDDING_MODEL:-all-MiniLM-L6-v2}
38
+ - EMBEDDING_BATCH_SIZE=${EMBEDDING_BATCH_SIZE:-128}
39
+
40
+ # API Settings
41
+ - API_HOST=0.0.0.0
42
+ - API_PORT=8000
43
+ - MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
44
+
45
+ restart: unless-stopped
46
+ healthcheck:
47
+ test: ["CMD", "curl", "-f", "http://localhost:8000/"]
48
+ interval: 30s
49
+ timeout: 10s
50
+ retries: 3
51
+ start_period: 60s
52
+ networks:
53
+ - graphllm-network
54
+
55
+ volumes:
56
+ # Named volumes for persistent storage
57
+ graphllm-data:
58
+ driver: local
59
+ graphllm-uploads:
60
+ driver: local
61
+ graphllm-logs:
62
+ driver: local
63
+ graphllm-cache:
64
+ driver: local
65
+
66
+ networks:
67
+ graphllm-network:
68
+ driver: bridge
embedding_service.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ Embedding & Vector Index Service
3
+ Handles embedding generation and FAISS vector store management
4
+ """
5
+ # Import SentenceTransformer lazily to avoid hanging on startup
6
+ import faiss
7
+ import numpy as np
8
+ from typing import List, Dict, Any, Tuple, Optional
9
+ from loguru import logger
10
+ import pickle
11
+ import os
12
+ from models import Chunk, EmbeddingEntry
13
+ from config import settings
14
+ import json
15
+
16
+
17
+ class EmbeddingService:
18
+ """
19
+ Service for creating embeddings and managing FAISS vector index
20
+ Uses lazy loading for the embedding model (loads on first use)
21
+ """
22
+
23
+ def __init__(self):
24
+ logger.info(f"EmbeddingService initialized (model will load on first use)")
25
+ self._model = None # Lazy-loaded
26
+ self.dimension = settings.embedding_dimension
27
+ self.index: Optional[faiss.Index] = None
28
+ self.chunk_metadata: Dict[int, Dict[str, Any]] = {} # index_id -> metadata
29
+ self._initialize_index()
30
+
31
+ @property
32
+ def model(self):
33
+ """Lazy-load the embedding model on first access"""
34
+ if self._model is None:
35
+ logger.info(f"Loading embedding model: {settings.embedding_model}")
36
+ # Import only when needed to avoid hanging on startup
37
+ from sentence_transformers import SentenceTransformer
38
+ self._model = SentenceTransformer(settings.embedding_model)
39
+ logger.info(f"✓ Embedding model loaded successfully")
40
+ return self._model
41
+
42
+ def _initialize_index(self):
43
+ """Initialize or load FAISS index"""
44
+ index_path = os.path.join(settings.faiss_index_path, "index.faiss")
45
+ metadata_path = os.path.join(settings.faiss_index_path, "metadata.pkl")
46
+
47
+ if os.path.exists(index_path) and os.path.exists(metadata_path):
48
+ logger.info("Loading existing FAISS index")
49
+ self.index = faiss.read_index(index_path)
50
+ with open(metadata_path, 'rb') as f:
51
+ self.chunk_metadata = pickle.load(f)
52
+ logger.info(f"Loaded index with {self.index.ntotal} vectors")
53
+ else:
54
+ logger.info("Creating new FAISS index (optimized)")
55
+ # Use HNSW for better performance on larger datasets
56
+ # HNSW is ~10x faster than flat index with 99%+ accuracy
57
+ # M=32 is good balance (higher M = more accurate but slower)
58
+ self.index = faiss.IndexHNSWFlat(self.dimension, 32)
59
+ # Set ef construction (higher = better quality, slower build)
60
+ self.index.hnsw.efConstruction = 40
61
+ # Set ef search (higher = better recall, slower search)
62
+ self.index.hnsw.efSearch = 16
63
+ self.chunk_metadata = {}
64
+ logger.info("Using HNSW index for faster approximate search")
65
+
66
+ def create_embeddings(self, chunks: List[Chunk]) -> List[EmbeddingEntry]:
67
+ """
68
+ ⚡ OPTIMIZED: Create embeddings with larger batches and parallel processing
69
+
70
+ Args:
71
+ chunks: List of Chunk objects
72
+
73
+ Returns:
74
+ List of EmbeddingEntry objects
75
+ """
76
+ texts = [chunk.text for chunk in chunks]
77
+ logger.info(f"⚡ Creating embeddings for {len(texts)} chunks (batch_size={settings.embedding_batch_size})")
78
+
79
+ import time
80
+ start = time.time()
81
+
82
+ # Batch encode with optimized settings
83
+ embeddings = self.model.encode(
84
+ texts,
85
+ batch_size=settings.embedding_batch_size,
86
+ show_progress_bar=False, # Disable for less overhead
87
+ convert_to_numpy=True,
88
+ normalize_embeddings=True # Built-in normalization is faster
89
+ )
90
+
91
+ elapsed = time.time() - start
92
+ logger.info(f"✓ Created {len(embeddings)} embeddings in {elapsed:.2f}s ({len(embeddings)/elapsed:.1f} chunks/sec)")
93
+
94
+ # Create embedding entries
95
+ embedding_entries = []
96
+ for chunk, embedding in zip(chunks, embeddings):
97
+ entry = EmbeddingEntry(
98
+ chunk_id=chunk.chunk_id,
99
+ embedding=embedding.tolist(),
100
+ metadata={
101
+ "pdf_id": chunk.pdf_id,
102
+ "page_number": chunk.page_number,
103
+ "type": chunk.type.value,
104
+ "char_range": chunk.char_range
105
+ }
106
+ )
107
+ embedding_entries.append(entry)
108
+
109
+ return embedding_entries
110
+
111
+ def add_to_index(self, chunks: List[Chunk], embeddings: List[EmbeddingEntry]):
112
+ """
113
+ Add chunks and their embeddings to FAISS index
114
+
115
+ Args:
116
+ chunks: List of chunks
117
+ embeddings: Corresponding embeddings
118
+ """
119
+ if len(chunks) != len(embeddings):
120
+ raise ValueError("Chunks and embeddings must have same length")
121
+
122
+ # Convert embeddings to numpy array
123
+ embedding_array = np.array([e.embedding for e in embeddings]).astype('float32')
124
+
125
+ # Get current index size (starting ID for new chunks)
126
+ start_id = self.index.ntotal
127
+
128
+ # Add to FAISS index
129
+ self.index.add(embedding_array)
130
+
131
+ # Store metadata mapping
132
+ for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
133
+ idx = start_id + i
134
+ self.chunk_metadata[idx] = {
135
+ "chunk_id": chunk.chunk_id,
136
+ "pdf_id": chunk.pdf_id,
137
+ "page_number": chunk.page_number,
138
+ "type": chunk.type.value,
139
+ "text": chunk.text,
140
+ "char_range": chunk.char_range,
141
+ "metadata": chunk.metadata
142
+ }
143
+
144
+ logger.info(f"Added {len(chunks)} chunks to index. Total: {self.index.ntotal}")
145
+
146
+ def search(
147
+ self,
148
+ query: str,
149
+ top_k: int = 10,
150
+ filter_pdf_id: Optional[str] = None
151
+ ) -> List[Tuple[Dict[str, Any], float]]:
152
+ """
153
+ Search for similar chunks
154
+
155
+ Args:
156
+ query: Query string
157
+ top_k: Number of results to return
158
+ filter_pdf_id: Optional PDF ID to filter results
159
+
160
+ Returns:
161
+ List of (chunk_metadata, score) tuples
162
+ """
163
+ # Encode and normalize query
164
+ query_embedding = self.model.encode([query], convert_to_numpy=True)
165
+ faiss.normalize_L2(query_embedding)
166
+
167
+ # Search
168
+ # Fetch more if we need to filter
169
+ k = top_k * 10 if filter_pdf_id else top_k
170
+ scores, indices = self.index.search(query_embedding, k)
171
+
172
+ # Retrieve metadata
173
+ results = []
174
+ for score, idx in zip(scores[0], indices[0]):
175
+ if idx == -1: # FAISS returns -1 for empty results
176
+ continue
177
+
178
+ metadata = self.chunk_metadata.get(idx)
179
+ if metadata is None:
180
+ continue
181
+
182
+ # Apply filter if specified
183
+ if filter_pdf_id and metadata.get("pdf_id") != filter_pdf_id:
184
+ continue
185
+
186
+ results.append((metadata, float(score)))
187
+
188
+ if len(results) >= top_k:
189
+ break
190
+
191
+ return results
192
+
193
+ def search_by_chunk_ids(self, chunk_ids: List[str], top_k: int = 5) -> List[Tuple[Dict[str, Any], float]]:
194
+ """
195
+ Find similar chunks to a set of chunk IDs (for node expansion)
196
+
197
+ Args:
198
+ chunk_ids: List of chunk IDs
199
+ top_k: Number of similar chunks per input chunk
200
+
201
+ Returns:
202
+ List of (chunk_metadata, score) tuples
203
+ """
204
+ # Find the chunks in metadata
205
+ chunk_indices = []
206
+ for idx, meta in self.chunk_metadata.items():
207
+ if meta["chunk_id"] in chunk_ids:
208
+ chunk_indices.append(idx)
209
+
210
+ if not chunk_indices:
211
+ return []
212
+
213
+ # Get embeddings for these chunks
214
+ # Note: FAISS doesn't have a direct "get vector" API for IndexFlatIP
215
+ # We'll search from the index using reconstruct (if supported)
216
+ results = []
217
+ for idx in chunk_indices:
218
+ # Reconstruct vector (works for Flat indices)
219
+ try:
220
+ vector = self.index.reconstruct(idx)
221
+ vector = vector.reshape(1, -1)
222
+ scores, indices = self.index.search(vector, top_k + 1) # +1 to exclude self
223
+
224
+ for score, res_idx in zip(scores[0], indices[0]):
225
+ if res_idx == idx: # Skip self
226
+ continue
227
+ if res_idx == -1:
228
+ continue
229
+
230
+ metadata = self.chunk_metadata.get(res_idx)
231
+ if metadata:
232
+ results.append((metadata, float(score)))
233
+ except Exception as e:
234
+ logger.warning(f"Could not reconstruct vector for index {idx}: {e}")
235
+
236
+ # Sort by score and return top
237
+ results.sort(key=lambda x: x[1], reverse=True)
238
+ return results[:top_k]
239
+
240
+ def save(self):
241
+ """Save FAISS index and metadata to disk"""
242
+ os.makedirs(settings.faiss_index_path, exist_ok=True)
243
+
244
+ index_path = os.path.join(settings.faiss_index_path, "index.faiss")
245
+ metadata_path = os.path.join(settings.faiss_index_path, "metadata.pkl")
246
+
247
+ faiss.write_index(self.index, index_path)
248
+ with open(metadata_path, 'wb') as f:
249
+ pickle.dump(self.chunk_metadata, f)
250
+
251
+ logger.info(f"Saved FAISS index with {self.index.ntotal} vectors")
252
+
253
+ def clear(self):
254
+ """Clear the index and metadata"""
255
+ self.index = faiss.IndexFlatIP(self.dimension)
256
+ self.chunk_metadata = {}
257
+ logger.info("Cleared FAISS index")
258
+
259
+ def get_stats(self) -> Dict[str, Any]:
260
+ """Get index statistics"""
261
+ return {
262
+ "total_vectors": self.index.ntotal,
263
+ "dimension": self.dimension,
264
+ "index_type": type(self.index).__name__,
265
+ "num_chunks": len(self.chunk_metadata)
266
+ }
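For context, a rough sketch of querying the service once an index has been built and saved under `./data/faiss_index` (the question string is a placeholder, and the score semantics follow whichever FAISS index type is active):

```python
from embedding_service import EmbeddingService

service = EmbeddingService()   # loads the persisted FAISS index and metadata if they exist
print(service.get_stats())     # e.g. {'total_vectors': ..., 'dimension': 384, 'index_type': ..., 'num_chunks': ...}

# Retrieve the chunks most similar to a question, optionally scoped to a single PDF
for meta, score in service.search("How is the knowledge graph built?", top_k=5):
    print(f"p.{meta['page_number']}  score={score:.3f}  {meta['text'][:80]!r}")
```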
frontend/app.js ADDED
@@ -0,0 +1,539 @@
1
+ /**
2
+ * GraphLLM Frontend JavaScript
3
+ * Handles user interactions, API calls, and dynamic UI updates
4
+ */
5
+
6
+ // ========== Global State ==========
7
+ let currentPdfId = null;
8
+ let graphData = { nodes: [], edges: [] };
9
+ let selectedNodeId = null;
10
+
11
+ // ========== API Configuration ==========
12
+ const API_BASE = window.location.origin;
13
+
14
+ // ========== Processing Overlay Functions ==========
15
+ function showProcessingOverlay(title = 'Processing PDF', message = 'Starting...', percent = 0) {
16
+ const overlay = document.getElementById('processing-overlay');
17
+ const titleEl = document.getElementById('processing-title');
18
+ const messageEl = document.getElementById('processing-message');
19
+ const percentEl = document.getElementById('processing-percent');
20
+ const progressFill = document.getElementById('progress-fill');
21
+
22
+ titleEl.textContent = title;
23
+ messageEl.textContent = message;
24
+ percentEl.textContent = `${percent}%`;
25
+ progressFill.style.width = `${percent}%`;
26
+
27
+ overlay.hidden = false;
28
+ }
29
+
30
+ function updateProcessingOverlay(message, percent) {
31
+ const messageEl = document.getElementById('processing-message');
32
+ const percentEl = document.getElementById('processing-percent');
33
+ const progressFill = document.getElementById('progress-fill');
34
+
35
+ messageEl.textContent = message;
36
+ percentEl.textContent = `${percent}%`;
37
+ progressFill.style.width = `${percent}%`;
38
+ }
39
+
40
+ function hideProcessingOverlay() {
41
+ const overlay = document.getElementById('processing-overlay');
42
+ overlay.hidden = true;
43
+ }
44
+
45
+ // ========== Utility Functions ==========
46
+ async function apiCall(endpoint, options = {}) {
47
+ try {
48
+ const response = await fetch(`${API_BASE}${endpoint}`, options);
49
+ if (!response.ok) {
50
+ throw new Error(`API Error: ${response.statusText}`);
51
+ }
52
+ return await response.json();
53
+ } catch (error) {
54
+ console.error('API call failed:', error);
55
+ showNotification(error.message, 'error');
56
+ throw error;
57
+ }
58
+ }
59
+
60
+ function showNotification(message, type = 'info') {
61
+ const statusEl = document.getElementById('upload-status');
62
+ statusEl.textContent = message;
63
+ statusEl.style.color = type === 'error' ? '#f44336' : type === 'success' ? '#4caf50' : '#4f9eff';
64
+
65
+ setTimeout(() => {
66
+ statusEl.textContent = '';
67
+ }, 5000);
68
+ }
69
+
70
+ // ========== PDF Upload ==========
71
+ document.getElementById('pdf-upload').addEventListener('change', async (e) => {
72
+ const file = e.target.files[0];
73
+ if (!file) return;
74
+
75
+ // Show overlay immediately
76
+ showProcessingOverlay('Uploading PDF', `Uploading ${file.name}...`, 0);
77
+
78
+ const formData = new FormData();
79
+ formData.append('file', file);
80
+
81
+ try {
82
+ const result = await apiCall('/upload', {
83
+ method: 'POST',
84
+ body: formData
85
+ });
86
+
87
+ currentPdfId = result.pdf_id;
88
+ updateProcessingOverlay('Upload complete, starting processing...', 5);
89
+
90
+ // Poll for completion
91
+ pollProcessingStatus(result.pdf_id);
92
+
93
+ } catch (error) {
94
+ hideProcessingOverlay();
95
+ showNotification('Upload failed', 'error');
96
+ }
97
+ });
98
+
99
+ async function pollProcessingStatus(pdfId) {
100
+ const interval = setInterval(async () => {
101
+ try {
102
+ // Fetch detailed status for this PDF
103
+ const status = await apiCall(`/status/${pdfId}`);
104
+
105
+ // Update overlay with progress
106
+ if (status.progress) {
107
+ const { message, percent } = status.progress;
108
+ updateProcessingOverlay(message, percent);
109
+ }
110
+
111
+ // Check if processing is complete
112
+ if (status.status === 'completed') {
113
+ clearInterval(interval);
114
+
115
+ // Show completion message briefly
116
+ updateProcessingOverlay(
117
+ `✓ Complete! ${status.num_nodes} nodes, ${status.num_edges} edges`,
118
+ 100
119
+ );
120
+
121
+ // Load graph and hide overlay
122
+ setTimeout(async () => {
123
+ hideProcessingOverlay();
124
+ await loadGraph();
125
+ await updateStats();
126
+ showNotification(`✓ Graph loaded: ${status.num_nodes} nodes, ${status.num_edges} edges`, 'success');
127
+ }, 1500); // Show completion for 1.5s
128
+
129
+ } else if (status.status === 'failed') {
130
+ clearInterval(interval);
131
+ hideProcessingOverlay();
132
+ showNotification(`Error: ${status.error}`, 'error');
133
+ }
134
+ } catch (error) {
135
+ clearInterval(interval);
136
+ hideProcessingOverlay();
137
+ showNotification('Failed to check status', 'error');
138
+ }
139
+ }, 1000); // Poll every 1 second for responsive updates
140
+
141
+ // Stop polling after 5 minutes
142
+ setTimeout(() => {
143
+ clearInterval(interval);
144
+ hideProcessingOverlay();
145
+ showNotification('Processing timeout', 'error');
146
+ }, 300000);
147
+ }
148
+
149
+ // ========== Graph Loading ==========
150
+ let network = null;
151
+
152
+ async function loadGraph() {
153
+ try {
154
+ const data = await apiCall('/graph');
155
+ graphData = data;
156
+
157
+ // Render interactive graph visualization
158
+ renderGraph(data);
159
+
160
+ } catch (error) {
161
+ console.error('Failed to load graph:', error);
162
+ }
163
+ }
164
+
165
+ function renderGraph(data) {
166
+ const container = document.getElementById('graph-container');
167
+
168
+ // Clear any existing content
169
+ container.innerHTML = '';
170
+
171
+ console.log(`Rendering graph: ${data.nodes.length} nodes, ${data.edges.length} edges`);
172
+
173
+ // Get actual container dimensions
174
+ const rect = container.getBoundingClientRect();
175
+ const containerHeight = rect.height || 600; // Fallback to 600px
176
+ const containerWidth = rect.width || 800; // Fallback to 800px
177
+
178
+ // Set explicit container styles to prevent overflow
179
+ container.style.position = 'relative';
180
+ container.style.width = containerWidth + 'px';
181
+ container.style.height = containerHeight + 'px';
182
+ container.style.overflow = 'hidden';
183
+
184
+ // Prepare nodes for vis.js
185
+ const visNodes = data.nodes.map(node => ({
186
+ id: node.node_id,
187
+ label: node.label,
188
+ title: `${node.label}\nType: ${node.type}\nImportance: ${node.importance_score.toFixed(2)}`,
189
+ value: node.importance_score * 20, // Size based on importance
190
+ group: node.type,
191
+ font: { color: '#e6eef8' }
192
+ }));
193
+
194
+ // Prepare edges for vis.js (thin, bright green, no arrows - undirected graph)
195
+ const visEdges = data.edges.map(edge => ({
196
+ from: edge.from || edge.from_node, // Handle both alias and field name
197
+ to: edge.to || edge.to_node, // Handle both alias and field name
198
+ label: edge.relation,
199
+ title: `${edge.relation} (${edge.confidence.toFixed(2)})`,
200
+ width: 1.5, // Thin edges
201
+ // No arrows for undirected graph
202
+ color: {
203
+ color: '#00ff00', // BRIGHT NEON GREEN (most visible)
204
+ highlight: '#ff00ff', // Neon magenta when highlighted
205
+ hover: '#ffff00', // Yellow on hover
206
+ opacity: 1.0 // Full opacity
207
+ },
208
+ font: {
209
+ size: 12,
210
+ color: '#ffffff',
211
+ strokeWidth: 3,
212
+ strokeColor: '#000000',
213
+ background: 'rgba(0, 0, 0, 0.8)',
214
+ bold: true
215
+ }
216
+ }));
217
+
218
+ // Create vis.js network
219
+ const graphData = {
220
+ nodes: new vis.DataSet(visNodes),
221
+ edges: new vis.DataSet(visEdges)
222
+ };
223
+
224
+ const options = {
225
+ nodes: {
226
+ shape: 'dot',
227
+ scaling: {
228
+ min: 10,
229
+ max: 30
230
+ },
231
+ font: {
232
+ size: 12,
233
+ face: 'Arial',
234
+ color: '#e6eef8'
235
+ },
236
+ borderWidth: 2,
237
+ shadow: true
238
+ },
239
+ edges: {
240
+ width: 1.5, // Thin edges
241
+ color: {
242
+ color: '#00ff00', // BRIGHT NEON GREEN (most visible against dark bg)
243
+ highlight: '#ff00ff', // Neon magenta when highlighted
244
+ hover: '#ffff00', // Yellow on hover
245
+ opacity: 1.0 // Full opacity
246
+ },
247
+ arrows: {
248
+ to: { enabled: false } // No arrows - undirected graph
249
+ },
250
+ smooth: {
251
+ type: 'continuous',
252
+ roundness: 0.2 // Less curved = more visible
253
+ },
254
+ font: {
255
+ size: 12, // Moderate text size
256
+ color: '#ffffff', // White text
257
+ strokeWidth: 3, // Moderate outline
258
+ strokeColor: '#000000', // Black outline for readability
259
+ align: 'top', // Position above edge
260
+ bold: true,
261
+ background: 'rgba(0, 0, 0, 0.8)' // Dark background for label
262
+ },
263
+ selectionWidth: 3, // Moderately thicker when selected
264
+ hoverWidth: 2.5, // Slightly thicker on hover
265
+ shadow: {
266
+ enabled: true,
267
+ color: 'rgba(0, 255, 0, 0.5)', // Green glow
268
+ size: 5,
269
+ x: 0,
270
+ y: 0
271
+ }
272
+ },
273
+ groups: {
274
+ concept: { color: { background: '#4f9eff', border: '#3d8ae6' } },
275
+ function: { color: { background: '#9c27b0', border: '#7b1fa2' } },
276
+ class: { color: { background: '#ff5722', border: '#e64a19' } },
277
+ term: { color: { background: '#4caf50', border: '#388e3c' } },
278
+ person: { color: { background: '#ff9800', border: '#f57c00' } },
279
+ method: { color: { background: '#00bcd4', border: '#0097a7' } },
280
+ entity: { color: { background: '#607d8b', border: '#455a64' } }
281
+ },
282
+ physics: {
283
+ stabilization: { iterations: 200 },
284
+ barnesHut: {
285
+ gravitationalConstant: -8000,
286
+ springConstant: 0.04,
287
+ springLength: 95
288
+ }
289
+ },
290
+ interaction: {
291
+ hover: true,
292
+ navigationButtons: true,
293
+ keyboard: true
294
+ },
295
+ autoResize: false, // Disable auto-resize to prevent infinite stretching
296
+ height: containerHeight + 'px',
297
+ width: containerWidth + 'px'
298
+ };
299
+
300
+ // Create network
301
+ network = new vis.Network(container, graphData, options);
302
+
303
+ // Prevent any further resize attempts
304
+ if (network) {
305
+ network.setOptions({ autoResize: false });
306
+ }
307
+
308
+ // Add click handler for nodes
309
+ network.on('click', function(params) {
310
+ if (params.nodes.length > 0) {
311
+ const nodeId = params.nodes[0];
312
+ selectNode(nodeId);
313
+ }
314
+ });
315
+ }
316
+
317
+ // ========== Node Selection ==========
318
+ window.selectNode = async function(nodeId) {
319
+ selectedNodeId = nodeId;
320
+
321
+ try {
322
+ const nodeData = await apiCall(`/node/${nodeId}`);
323
+ displayNodeDetails(nodeData);
324
+ } catch (error) {
325
+ console.error('Failed to load node details:', error);
326
+ }
327
+ }
328
+
329
+ function displayNodeDetails(nodeData) {
330
+ const content = document.getElementById('node-content');
331
+
332
+ const sourcesHtml = nodeData.sources.map((source, i) => `
333
+ <li>p.${source.page_number} - "${source.snippet}" <span style="color: #8b92a0;">(${source.chunk_id})</span></li>
334
+ `).join('');
335
+
336
+ const relatedHtml = nodeData.related_nodes.map(related => `
337
+ <li onclick="selectNode('${related.node_id}')" style="cursor: pointer; padding: 0.5rem; background: #23262e; border-radius: 6px; margin-bottom: 0.25rem;">
338
+ <strong>${related.label}</strong> - ${related.relation} (confidence: ${related.confidence.toFixed(2)})
339
+ </li>
340
+ `).join('');
341
+
342
+ content.innerHTML = `
343
+ <div class="node-info">
344
+ <h3 class="node-label">${nodeData.label}</h3>
345
+ <span class="badge">${nodeData.type}</span>
346
+
347
+ <div class="node-summary">
348
+ <h4>Summary</h4>
349
+ <p>${nodeData.summary}</p>
350
+ </div>
351
+
352
+ <div class="node-sources">
353
+ <h4>Sources</h4>
354
+ <button class="expand-toggle" onclick="toggleSources()">Show Sources</button>
355
+ <ul class="sources-list" id="sources-list" hidden>
356
+ ${sourcesHtml}
357
+ </ul>
358
+ </div>
359
+
360
+ ${nodeData.related_nodes.length > 0 ? `
361
+ <div class="related-nodes">
362
+ <h4>Related Nodes</h4>
363
+ <ul class="related-list">
364
+ ${relatedHtml}
365
+ </ul>
366
+ </div>
367
+ ` : ''}
368
+ </div>
369
+ `;
370
+ }
371
+
372
+ window.toggleSources = function() {
373
+ const sourcesList = document.getElementById('sources-list');
374
+ const toggle = document.querySelector('.expand-toggle');
375
+
376
+ if (sourcesList.hidden) {
377
+ sourcesList.hidden = false;
378
+ toggle.textContent = 'Hide Sources';
379
+ } else {
380
+ sourcesList.hidden = true;
381
+ toggle.textContent = 'Show Sources';
382
+ }
383
+ }
384
+
385
+ document.getElementById('close-node-detail').addEventListener('click', () => {
386
+ document.getElementById('node-content').innerHTML = '<p class="placeholder-text">Click a node in the graph to view details</p>';
387
+ selectedNodeId = null;
388
+ });
389
+
390
+ // ========== Chat ==========
391
+ document.getElementById('send-btn').addEventListener('click', sendMessage);
392
+ document.getElementById('chat-input').addEventListener('keydown', (e) => {
393
+ if (e.key === 'Enter' && !e.shiftKey) {
394
+ e.preventDefault();
395
+ sendMessage();
396
+ }
397
+ });
398
+
399
+ async function sendMessage() {
400
+ const input = document.getElementById('chat-input');
401
+ const query = input.value.trim();
402
+
403
+ if (!query) return;
404
+ if (!currentPdfId) {
405
+ showNotification('Please upload a PDF first', 'error');
406
+ return;
407
+ }
408
+
409
+ // Add user message to chat
410
+ addMessageToChat('user', query);
411
+ input.value = '';
412
+
413
+ try {
414
+ const includeCitations = document.getElementById('include-citations').checked;
415
+
416
+ const response = await apiCall('/chat', {
417
+ method: 'POST',
418
+ headers: { 'Content-Type': 'application/json' },
419
+ body: JSON.stringify({
420
+ query,
421
+ pdf_id: currentPdfId,
422
+ include_citations: includeCitations,
423
+ max_sources: 5
424
+ })
425
+ });
426
+
427
+ // Add assistant response
428
+ addMessageToChat('assistant', response.answer, response.sources);
429
+
430
+ } catch (error) {
431
+ addMessageToChat('assistant', 'Sorry, I encountered an error processing your question.');
432
+ }
433
+ }
434
+
435
+ function addMessageToChat(role, content, sources = []) {
436
+ const messagesContainer = document.getElementById('chat-messages');
437
+
438
+ const messageDiv = document.createElement('div');
439
+ messageDiv.className = `message ${role}`;
440
+
441
+ let html = `<p>${content}</p>`;
442
+
443
+ if (sources && sources.length > 0) {
444
+ html += '<div style="margin-top: 0.5rem; padding-top: 0.5rem; border-top: 1px solid rgba(255,255,255,0.1);">';
445
+ html += '<strong style="font-size: 0.875rem;">Sources:</strong><ul style="margin-top: 0.25rem; font-size: 0.875rem;">';
446
+ sources.forEach(source => {
447
+ html += `<li>p.${source.page_number}: "${source.snippet}"</li>`;
448
+ });
449
+ html += '</ul></div>';
450
+ }
451
+
452
+ messageDiv.innerHTML = html;
453
+ messagesContainer.appendChild(messageDiv);
454
+
455
+ // Scroll to bottom
456
+ messagesContainer.scrollTop = messagesContainer.scrollHeight;
457
+ }
458
+
459
+ // ========== Stats Update ==========
460
+ async function updateStats() {
461
+ try {
462
+ const status = await apiCall('/admin/status');
463
+
464
+ document.getElementById('stats-nodes').textContent = `Nodes: ${status.total_nodes}`;
465
+ document.getElementById('stats-edges').textContent = `Edges: ${status.total_edges}`;
466
+ document.getElementById('stats-chunks').textContent = `Chunks: ${status.total_chunks}`;
467
+ } catch (error) {
468
+ console.error('Failed to update stats:', error);
469
+ }
470
+ }
471
+
472
+ // ========== Admin Controls ==========
473
+ document.getElementById('reindex-btn').addEventListener('click', async () => {
474
+ if (!currentPdfId) {
475
+ showNotification('No PDF to reindex', 'error');
476
+ return;
477
+ }
478
+
479
+ if (!confirm('Reindex current PDF? This will take some time.')) return;
480
+
481
+ try {
482
+ // Show overlay for reindexing
483
+ showProcessingOverlay('Reindexing PDF', 'Starting reindex...', 0);
484
+
485
+ await apiCall(`/admin/reindex?pdf_id=${currentPdfId}`, { method: 'POST' });
486
+
487
+ // Poll for completion
488
+ pollProcessingStatus(currentPdfId);
489
+ } catch (error) {
490
+ hideProcessingOverlay();
491
+ showNotification('Reindex failed', 'error');
492
+ }
493
+ });
494
+
495
+ document.getElementById('clear-btn').addEventListener('click', async () => {
496
+ if (!confirm('Clear all data? This cannot be undone!')) return;
497
+
498
+ try {
499
+ await apiCall('/admin/clear', { method: 'POST' });
500
+ showNotification('All data cleared', 'success');
501
+
502
+ // Reset UI
503
+ currentPdfId = null;
504
+ graphData = { nodes: [], edges: [] };
505
+ document.getElementById('graph-container').innerHTML = '<div class="graph-placeholder"><p>Upload a PDF to generate a knowledge graph</p></div>';
506
+ document.getElementById('node-content').innerHTML = '<p class="placeholder-text">Click a node in the graph to view details</p>';
507
+ document.getElementById('chat-messages').innerHTML = '<div class="message system"><p>Ask questions about your uploaded PDF. Answers will cite page numbers.</p></div>';
508
+ await updateStats();
509
+ } catch (error) {
510
+ showNotification('Clear failed', 'error');
511
+ }
512
+ });
513
+
514
+ // ========== Graph Controls ==========
515
+ document.getElementById('zoom-in-btn').addEventListener('click', () => {
516
+ if (network) {
517
+ const scale = network.getScale();
518
+ network.moveTo({ scale: scale * 1.2 });
519
+ }
520
+ });
521
+
522
+ document.getElementById('zoom-out-btn').addEventListener('click', () => {
523
+ if (network) {
524
+ const scale = network.getScale();
525
+ network.moveTo({ scale: scale * 0.8 });
526
+ }
527
+ });
528
+
529
+ document.getElementById('reset-view-btn').addEventListener('click', () => {
530
+ if (network) {
531
+ network.fit();
532
+ }
533
+ });
534
+
535
+ // ========== Initialization ==========
536
+ document.addEventListener('DOMContentLoaded', () => {
537
+ updateStats();
538
+ console.log('GraphLLM Frontend Initialized');
539
+ });
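The rendering code above consumes two read endpoints, `/graph` (full node and edge lists) and `/node/{node_id}` (summary, sources, related nodes). A small sketch of inspecting the same data outside the browser, assuming the response shapes this frontend already relies on:

```python
import requests

BASE = "http://localhost:8000"  # placeholder: local backend or Space URL

graph = requests.get(f"{BASE}/graph").json()

# Print the five most important concepts, mirroring the node sizing used in renderGraph()
top_nodes = sorted(graph["nodes"], key=lambda n: n["importance_score"], reverse=True)[:5]
for node in top_nodes:
    detail = requests.get(f"{BASE}/node/{node['node_id']}").json()
    print(f"{node['label']} ({node['type']}): {detail['summary'][:100]}")
```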
frontend/index.html ADDED
@@ -0,0 +1,176 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>GraphLLM - PDF Knowledge Graph & RAG</title>
7
+ <link rel="stylesheet" href="/static/styles.css">
8
+ </head>
9
+ <body>
10
+ <!-- Header -->
11
+ <header class="app-header">
12
+ <div class="header-content">
13
+ <h1 class="app-title">GraphLLM</h1>
14
+ <p class="app-subtitle">PDF Knowledge Graph & RAG System</p>
15
+ </div>
16
+
17
+ <div class="header-controls">
18
+ <div class="upload-section">
19
+ <input type="file" id="pdf-upload" accept=".pdf" hidden>
20
+ <button id="upload-btn" class="btn btn-primary" onclick="document.getElementById('pdf-upload').click()">
21
+ Upload PDF
22
+ </button>
23
+ <span id="upload-status" class="status-text"></span>
24
+ </div>
25
+
26
+ <button id="reindex-btn" class="btn btn-secondary">Reindex</button>
27
+ <button id="clear-btn" class="btn btn-danger">Clear All</button>
28
+ </div>
29
+ </header>
30
+
31
+ <!-- Main Content Area -->
32
+ <main class="main-container">
33
+ <!-- Left Pane: Graph Visualization -->
34
+ <aside id="graph-pane" class="graph-pane" role="region" aria-label="Knowledge Graph Visualization">
35
+ <div class="pane-header">
36
+ <h2>Knowledge Graph</h2>
37
+ <div class="graph-controls">
38
+ <button id="zoom-in-btn" class="icon-btn" aria-label="Zoom In">+</button>
39
+ <button id="zoom-out-btn" class="icon-btn" aria-label="Zoom Out">-</button>
40
+ <button id="reset-view-btn" class="icon-btn" aria-label="Reset View">⟲</button>
41
+ </div>
42
+ </div>
43
+
44
+ <div id="graph-container" class="graph-container" role="img" aria-label="Interactive knowledge graph">
45
+ <!-- Graph visualization will be rendered here via JavaScript -->
46
+ <div class="graph-placeholder">
47
+ <p>Upload a PDF to generate a knowledge graph</p>
48
+ <p class="help-text">Graph visualization requires JavaScript for interactivity</p>
49
+ </div>
50
+ </div>
51
+
52
+ <div class="graph-legend">
53
+ <h3>Legend</h3>
54
+ <div class="legend-items">
55
+ <div class="legend-item">
56
+ <span class="legend-color concept"></span>
57
+ <span>Concept</span>
58
+ </div>
59
+ <div class="legend-item">
60
+ <span class="legend-color function"></span>
61
+ <span>Function/Method</span>
62
+ </div>
63
+ <div class="legend-item">
64
+ <span class="legend-color class"></span>
65
+ <span>Class/Type</span>
66
+ </div>
67
+ <div class="legend-item">
68
+ <span class="legend-color term"></span>
69
+ <span>Term/Definition</span>
70
+ </div>
71
+ </div>
72
+ </div>
73
+ </aside>
74
+
75
+ <!-- Right Pane: Node Details & Chat -->
76
+ <section id="detail-pane" class="detail-pane">
77
+ <!-- Node Detail Section -->
78
+ <div id="node-detail" class="node-detail card">
79
+ <div class="card-header">
80
+ <h2>Node Details</h2>
81
+ <button id="close-node-detail" class="icon-btn" aria-label="Close">✕</button>
82
+ </div>
83
+
84
+ <div id="node-content" class="node-content">
85
+ <p class="placeholder-text">Click a node in the graph to view details</p>
86
+ </div>
87
+
88
+ <!-- Node detail will be populated dynamically:
89
+ <div class="node-info">
90
+ <h3 class="node-label">[Node Label]</h3>
91
+ <span class="node-type badge">[Type]</span>
92
+ <span class="node-importance">Importance: [Score]</span>
93
+
94
+ <div class="node-summary">
95
+ <h4>Summary</h4>
96
+ <p>[AI-generated summary with (p. N) citations]</p>
97
+ </div>
98
+
99
+ <div class="node-sources">
100
+ <h4>Sources</h4>
101
+ <button class="expand-toggle">Show Sources</button>
102
+ <ul class="sources-list" hidden>
103
+ <li>p.12 - "Exact snippet..." (chunk_id)</li>
104
+ </ul>
105
+ </div>
106
+
107
+ <div class="related-nodes">
108
+ <h4>Related Nodes</h4>
109
+ <ul class="related-list">
110
+ <li>[Node] - [relation] - [confidence]</li>
111
+ </ul>
112
+ </div>
113
+ </div>
114
+ -->
115
+ </div>
116
+
117
+ <!-- Chat Section -->
118
+ <div id="chat" class="chat-section card">
119
+ <div class="card-header">
120
+ <h2>Chat with Document</h2>
121
+ <label class="checkbox-label">
122
+ <input type="checkbox" id="include-citations" checked>
123
+ <span>Include Citations</span>
124
+ </label>
125
+ </div>
126
+
127
+ <div id="chat-messages" class="chat-messages" role="log" aria-live="polite">
128
+ <div class="message system">
129
+ <p>Ask questions about your uploaded PDF. Answers will cite page numbers.</p>
130
+ </div>
131
+ </div>
132
+
133
+ <div class="chat-input-area">
134
+ <textarea
135
+ id="chat-input"
136
+ class="chat-input"
137
+ placeholder="Ask a question about the document..."
138
+ rows="3"
139
+ aria-label="Chat input"
140
+ ></textarea>
141
+ <button id="send-btn" class="btn btn-primary" aria-label="Send message">Send</button>
142
+ </div>
143
+ </div>
144
+ </section>
145
+ </main>
146
+
147
+ <!-- Footer -->
148
+ <footer class="app-footer">
149
+ <div class="footer-content">
150
+ <div class="stats">
151
+ <span id="stats-nodes">Nodes: 0</span>
152
+ <span id="stats-edges">Edges: 0</span>
153
+ <span id="stats-chunks">Chunks: 0</span>
154
+ </div>
155
+ <p class="footer-text">GraphLLM v1.0 | Powered by Gemini & Mistral</p>
156
+ </div>
157
+ </footer>
158
+
159
+ <!-- Processing Overlay -->
160
+ <div id="processing-overlay" class="processing-overlay" hidden>
161
+ <div class="processing-modal">
162
+ <div class="spinner"></div>
163
+ <h2 id="processing-title">Processing PDF</h2>
164
+ <p id="processing-message">Starting...</p>
165
+ <div class="progress-bar">
166
+ <div id="progress-fill" class="progress-fill"></div>
167
+ </div>
168
+ <p id="processing-percent" class="processing-percent">0%</p>
169
+ </div>
170
+ </div>
171
+
172
+ <!-- JavaScript Libraries -->
173
+ <script type="text/javascript" src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
174
+ <script src="/static/app.js"></script>
175
+ </body>
176
+ </html>
frontend/styles.css ADDED
@@ -0,0 +1,800 @@
1
+ /* GraphLLM Stylesheet - Dark Sleek Theme */
2
+
3
+ /* ========== CSS Reset & Base Styles ========== */
4
+ * {
5
+ margin: 0;
6
+ padding: 0;
7
+ box-sizing: border-box;
8
+ }
9
+
10
+ html, body {
11
+ height: 100%;
12
+ width: 100%;
13
+ overflow-x: hidden;
14
+ }
15
+
16
+ :root {
17
+ /* Color Palette */
18
+ --bg-primary: #0f1115;
19
+ --bg-secondary: #12151a;
20
+ --bg-card: #1a1d24;
21
+ --bg-hover: #23262e;
22
+
23
+ --text-primary: #e6eef8;
24
+ --text-secondary: #cfd8e3;
25
+ --text-muted: #8b92a0;
26
+
27
+ --accent-primary: #4f9eff;
28
+ --accent-hover: #3d8ae6;
29
+ --accent-glow: rgba(79, 158, 255, 0.3);
30
+
31
+ --success: #4caf50;
32
+ --warning: #ff9800;
33
+ --danger: #f44336;
34
+
35
+ --border-color: #2a2f3a;
36
+ --shadow-sm: 0 2px 4px rgba(0, 0, 0, 0.3);
37
+ --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.4);
38
+ --shadow-lg: 0 8px 24px rgba(0, 0, 0, 0.5);
39
+
40
+ /* Graph Node Colors */
41
+ --node-concept: #4f9eff;
42
+ --node-function: #9c27b0;
43
+ --node-class: #ff5722;
44
+ --node-term: #4caf50;
45
+
46
+ /* Spacing */
47
+ --spacing-xs: 0.25rem;
48
+ --spacing-sm: 0.5rem;
49
+ --spacing-md: 1rem;
50
+ --spacing-lg: 1.5rem;
51
+ --spacing-xl: 2rem;
52
+
53
+ /* Border Radius */
54
+ --radius-sm: 6px;
55
+ --radius-md: 12px;
56
+ --radius-lg: 16px;
57
+
58
+ /* Transitions */
59
+ --transition-fast: 0.15s ease;
60
+ --transition-normal: 0.3s ease;
61
+ }
62
+
63
+ /* ========== Typography ========== */
64
+ body {
65
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
66
+ 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif;
67
+ font-size: 16px;
68
+ line-height: 1.6;
69
+ color: var(--text-primary);
70
+ background-color: var(--bg-primary);
71
+ overflow-x: hidden;
72
+ }
73
+
74
+ h1, h2, h3, h4, h5, h6 {
75
+ font-weight: 600;
76
+ line-height: 1.2;
77
+ margin-bottom: var(--spacing-md);
78
+ color: var(--text-primary);
79
+ }
80
+
81
+ h1 { font-size: 2rem; }
82
+ h2 { font-size: 1.5rem; }
83
+ h3 { font-size: 1.25rem; }
84
+ h4 { font-size: 1.1rem; }
85
+
86
+ code, pre {
87
+ font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
88
+ background-color: var(--bg-secondary);
89
+ padding: 0.2em 0.4em;
90
+ border-radius: var(--radius-sm);
91
+ }
92
+
93
+ /* ========== Layout ========== */
94
+ body {
95
+ display: flex;
96
+ flex-direction: column;
97
+ min-height: 100vh;
98
+ }
99
+
100
+ /* Header */
101
+ .app-header {
102
+ background-color: var(--bg-secondary);
103
+ border-bottom: 1px solid var(--border-color);
104
+ padding: var(--spacing-lg) var(--spacing-xl);
105
+ display: flex;
106
+ justify-content: space-between;
107
+ align-items: center;
108
+ box-shadow: var(--shadow-sm);
109
+ }
110
+
111
+ .header-content {
112
+ flex: 1;
113
+ }
114
+
115
+ .app-title {
116
+ margin: 0;
117
+ font-size: 1.75rem;
118
+ background: linear-gradient(135deg, var(--accent-primary), #9c27b0);
119
+ -webkit-background-clip: text;
120
+ -webkit-text-fill-color: transparent;
121
+ background-clip: text;
122
+ }
123
+
124
+ .app-subtitle {
125
+ font-size: 0.875rem;
126
+ color: var(--text-muted);
127
+ margin: 0;
128
+ }
129
+
130
+ .header-controls {
131
+ display: flex;
132
+ gap: var(--spacing-md);
133
+ align-items: center;
134
+ }
135
+
136
+ .upload-section {
137
+ display: flex;
138
+ gap: var(--spacing-sm);
139
+ align-items: center;
140
+ }
141
+
142
+ .status-text {
143
+ font-size: 0.875rem;
144
+ color: var(--text-secondary);
145
+ }
146
+
147
+ /* Main Container */
148
+ .main-container {
149
+ flex: 1;
150
+ display: grid;
151
+ grid-template-columns: 65% 35%;
152
+ gap: var(--spacing-lg);
153
+ padding: var(--spacing-lg);
154
+ overflow: hidden;
155
+ height: calc(100vh - 180px); /* Account for header and footer */
156
+ max-height: calc(100vh - 180px);
157
+ }
158
+
159
+ /* ========== Graph Pane (Left) ========== */
160
+ .graph-pane {
161
+ background-color: var(--bg-card);
162
+ border-radius: var(--radius-md);
163
+ padding: var(--spacing-lg);
164
+ display: flex;
165
+ flex-direction: column;
166
+ box-shadow: var(--shadow-md);
167
+ overflow: hidden;
168
+ height: 100%;
169
+ max-height: 100%;
170
+ }
171
+
172
+ .pane-header {
173
+ display: flex;
174
+ justify-content: space-between;
175
+ align-items: center;
176
+ margin-bottom: var(--spacing-md);
177
+ }
178
+
179
+ .pane-header h2 {
180
+ margin: 0;
181
+ font-size: 1.25rem;
182
+ }
183
+
184
+ .graph-controls {
185
+ display: flex;
186
+ gap: var(--spacing-sm);
187
+ }
188
+
189
+ .graph-container {
190
+ flex: 1;
191
+ background-color: var(--bg-secondary);
192
+ border-radius: var(--radius-sm);
193
+ position: relative;
194
+ overflow: hidden;
195
+ border: 1px solid var(--border-color);
196
+ min-height: 500px;
197
+ height: 100%;
198
+ width: 100%;
199
+ }
200
+
201
+ .graph-placeholder {
202
+ display: flex;
203
+ flex-direction: column;
204
+ align-items: center;
205
+ justify-content: center;
206
+ height: 100%;
207
+ color: var(--text-muted);
208
+ text-align: center;
209
+ padding: var(--spacing-xl);
210
+ }
211
+
212
+ .graph-placeholder p {
213
+ margin: var(--spacing-sm) 0;
214
+ }
215
+
216
+ .help-text {
217
+ font-size: 0.875rem;
218
+ color: var(--text-muted);
219
+ opacity: 0.7;
220
+ }
221
+
222
+ /* Graph Legend */
223
+ .graph-legend {
224
+ margin-top: var(--spacing-md);
225
+ padding: var(--spacing-md);
226
+ background-color: var(--bg-secondary);
227
+ border-radius: var(--radius-sm);
228
+ }
229
+
230
+ .graph-legend h3 {
231
+ font-size: 0.875rem;
232
+ margin-bottom: var(--spacing-sm);
233
+ color: var(--text-secondary);
234
+ }
235
+
236
+ .legend-items {
237
+ display: grid;
238
+ grid-template-columns: repeat(2, 1fr);
239
+ gap: var(--spacing-sm);
240
+ }
241
+
242
+ .legend-item {
243
+ display: flex;
244
+ align-items: center;
245
+ gap: var(--spacing-sm);
246
+ font-size: 0.875rem;
247
+ }
248
+
249
+ .legend-color {
250
+ width: 16px;
251
+ height: 16px;
252
+ border-radius: 50%;
253
+ border: 2px solid currentColor;
254
+ }
255
+
256
+ .legend-color.concept { color: var(--node-concept); }
257
+ .legend-color.function { color: var(--node-function); }
258
+ .legend-color.class { color: var(--node-class); }
259
+ .legend-color.term { color: var(--node-term); }
260
+
261
+ /* ========== Detail Pane (Right) ========== */
262
+ .detail-pane {
263
+ display: flex;
264
+ flex-direction: column;
265
+ gap: var(--spacing-lg);
266
+ overflow-y: auto;
267
+ overflow-x: hidden;
268
+ height: 100%;
269
+ max-height: 100%;
270
+ padding-right: var(--spacing-sm); /* Space for scrollbar */
271
+ }
272
+
273
+ .card {
274
+ background-color: var(--bg-card);
275
+ border-radius: var(--radius-md);
276
+ padding: var(--spacing-lg);
277
+ box-shadow: var(--shadow-md);
278
+ width: 100%;
279
+ }
280
+
281
+ .card-header {
282
+ display: flex;
283
+ justify-content: space-between;
284
+ align-items: center;
285
+ margin-bottom: var(--spacing-md);
286
+ padding-bottom: var(--spacing-sm);
287
+ border-bottom: 1px solid var(--border-color);
288
+ }
289
+
290
+ .card-header h2 {
291
+ margin: 0;
292
+ font-size: 1.25rem;
293
+ }
294
+
295
+ /* Node Detail Card */
296
+ .node-detail {
297
+ flex-shrink: 0; /* Never shrink - always show full content */
298
+ overflow-y: visible; /* Don't scroll the card itself */
299
+ display: block;
300
+ margin-bottom: var(--spacing-lg);
301
+ }
302
+
303
+ .node-content {
304
+ color: var(--text-secondary);
305
+ max-height: none; /* No height restriction */
306
+ }
307
+
308
+ .placeholder-text {
309
+ color: var(--text-muted);
310
+ text-align: center;
311
+ padding: var(--spacing-xl);
312
+ }
313
+
314
+ .node-info {
315
+ display: flex;
316
+ flex-direction: column;
317
+ gap: var(--spacing-md);
318
+ width: 100%;
319
+ padding-bottom: var(--spacing-lg);
320
+ }
321
+
322
+ .node-label {
323
+ font-size: 1.5rem;
324
+ color: var(--accent-primary);
325
+ margin-bottom: var(--spacing-xs);
326
+ }
327
+
328
+ .badge {
329
+ display: inline-block;
330
+ padding: 0.25rem 0.5rem;
331
+ background-color: var(--accent-primary);
332
+ color: white;
333
+ border-radius: var(--radius-sm);
334
+ font-size: 0.75rem;
335
+ font-weight: 600;
336
+ text-transform: uppercase;
337
+ margin-right: var(--spacing-sm);
338
+ }
339
+
340
+ .node-importance {
341
+ font-size: 0.875rem;
342
+ color: var(--text-muted);
343
+ }
344
+
345
+ .node-summary, .node-sources, .related-nodes {
346
+ padding: var(--spacing-md);
347
+ background-color: var(--bg-secondary);
348
+ border-radius: var(--radius-sm);
349
+ border-left: 3px solid var(--accent-primary);
350
+ margin-bottom: var(--spacing-md);
351
+ }
352
+
353
+ .node-summary p {
354
+ line-height: 1.7;
355
+ color: var(--text-secondary);
356
+ }
357
+
358
+ .expand-toggle {
359
+ background: none;
360
+ border: none;
361
+ color: var(--accent-primary);
362
+ cursor: pointer;
363
+ font-size: 0.875rem;
364
+ padding: var(--spacing-xs) 0;
365
+ transition: color var(--transition-fast);
366
+ }
367
+
368
+ .expand-toggle:hover {
369
+ color: var(--accent-hover);
370
+ text-decoration: underline;
371
+ }
372
+
373
+ .sources-list, .related-list {
374
+ list-style: none;
375
+ margin-top: var(--spacing-sm);
376
+ }
377
+
378
+ .sources-list li, .related-list li {
379
+ padding: var(--spacing-sm);
380
+ margin-bottom: var(--spacing-xs);
381
+ background-color: var(--bg-hover);
382
+ border-radius: var(--radius-sm);
383
+ font-size: 0.875rem;
384
+ }
385
+
386
+ /* Chat Section */
387
+ .chat-section {
388
+ flex: 0 1 auto; /* Can shrink but don't grow */
389
+ display: flex;
390
+ flex-direction: column;
391
+ min-height: 300px;
392
+ height: 400px; /* Fixed height */
393
+ }
394
+
395
+ .checkbox-label {
396
+ display: flex;
397
+ align-items: center;
398
+ gap: var(--spacing-sm);
399
+ font-size: 0.875rem;
400
+ color: var(--text-secondary);
401
+ cursor: pointer;
402
+ }
403
+
404
+ .checkbox-label input[type="checkbox"] {
405
+ width: 18px;
406
+ height: 18px;
407
+ cursor: pointer;
408
+ }
409
+
410
+ .chat-messages {
411
+ flex: 1;
412
+ overflow-y: auto;
413
+ padding: var(--spacing-md);
414
+ background-color: var(--bg-secondary);
415
+ border-radius: var(--radius-sm);
416
+ margin-bottom: var(--spacing-md);
417
+ min-height: 200px;
418
+ }
419
+
420
+ .message {
421
+ margin-bottom: var(--spacing-md);
422
+ padding: var(--spacing-md);
423
+ border-radius: var(--radius-sm);
424
+ line-height: 1.6;
425
+ }
426
+
427
+ .message.user {
428
+ background-color: var(--accent-primary);
429
+ color: white;
430
+ align-self: flex-end;
431
+ max-width: 80%;
432
+ margin-left: auto;
433
+ }
434
+
435
+ .message.assistant {
436
+ background-color: var(--bg-hover);
437
+ color: var(--text-primary);
438
+ border-left: 3px solid var(--accent-primary);
439
+ }
440
+
441
+ .message.system {
442
+ background-color: transparent;
443
+ color: var(--text-muted);
444
+ font-size: 0.875rem;
445
+ text-align: center;
446
+ border: none;
447
+ }
448
+
449
+ .chat-input-area {
450
+ display: flex;
451
+ gap: var(--spacing-sm);
452
+ }
453
+
454
+ .chat-input {
455
+ flex: 1;
456
+ background-color: var(--bg-secondary);
457
+ border: 1px solid var(--border-color);
458
+ border-radius: var(--radius-sm);
459
+ padding: var(--spacing-md);
460
+ color: var(--text-primary);
461
+ font-family: inherit;
462
+ font-size: 0.95rem;
463
+ resize: vertical;
464
+ transition: border-color var(--transition-fast);
465
+ }
466
+
467
+ .chat-input:focus {
468
+ outline: none;
469
+ border-color: var(--accent-primary);
470
+ box-shadow: 0 0 0 3px var(--accent-glow);
471
+ }
472
+
473
+ /* ========== Buttons ========== */
474
+ .btn {
475
+ padding: 0.625rem 1.25rem;
476
+ border: none;
477
+ border-radius: var(--radius-sm);
478
+ font-size: 0.9rem;
479
+ font-weight: 600;
480
+ cursor: pointer;
481
+ transition: all var(--transition-fast);
482
+ white-space: nowrap;
483
+ }
484
+
485
+ .btn-primary {
486
+ background-color: var(--accent-primary);
487
+ color: white;
488
+ }
489
+
490
+ .btn-primary:hover {
491
+ background-color: var(--accent-hover);
492
+ box-shadow: 0 0 12px var(--accent-glow);
493
+ }
494
+
495
+ .btn-secondary {
496
+ background-color: var(--bg-hover);
497
+ color: var(--text-primary);
498
+ border: 1px solid var(--border-color);
499
+ }
500
+
501
+ .btn-secondary:hover {
502
+ background-color: var(--bg-card);
503
+ }
504
+
505
+ .btn-danger {
506
+ background-color: var(--danger);
507
+ color: white;
508
+ }
509
+
510
+ .btn-danger:hover {
511
+ background-color: #d32f2f;
512
+ }
513
+
514
+ .icon-btn {
515
+ width: 36px;
516
+ height: 36px;
517
+ padding: 0;
518
+ background-color: var(--bg-hover);
519
+ border: 1px solid var(--border-color);
520
+ border-radius: var(--radius-sm);
521
+ color: var(--text-primary);
522
+ cursor: pointer;
523
+ font-size: 1.2rem;
524
+ display: flex;
525
+ align-items: center;
526
+ justify-content: center;
527
+ transition: all var(--transition-fast);
528
+ }
529
+
530
+ .icon-btn:hover {
531
+ background-color: var(--bg-card);
532
+ border-color: var(--accent-primary);
533
+ }
534
+
535
+ /* ========== Footer ========== */
536
+ .app-footer {
537
+ background-color: var(--bg-secondary);
538
+ border-top: 1px solid var(--border-color);
539
+ padding: var(--spacing-md) var(--spacing-xl);
540
+ }
541
+
542
+ .footer-content {
543
+ display: flex;
544
+ justify-content: space-between;
545
+ align-items: center;
546
+ }
547
+
548
+ .stats {
549
+ display: flex;
550
+ gap: var(--spacing-lg);
551
+ font-size: 0.875rem;
552
+ color: var(--text-secondary);
553
+ }
554
+
555
+ .footer-text {
556
+ font-size: 0.875rem;
557
+ color: var(--text-muted);
558
+ }
559
+
560
+ /* ========== Responsive Design ========== */
561
+ @media (max-width: 1024px) {
562
+ .main-container {
563
+ grid-template-columns: 1fr;
564
+ grid-template-rows: auto auto;
565
+ }
566
+
567
+ .graph-pane {
568
+ min-height: 400px;
569
+ }
570
+ }
571
+
572
+ @media (max-width: 768px) {
573
+ .app-header {
574
+ flex-direction: column;
575
+ gap: var(--spacing-md);
576
+ align-items: flex-start;
577
+ }
578
+
579
+ .header-controls {
580
+ width: 100%;
581
+ flex-wrap: wrap;
582
+ }
583
+
584
+ .main-container {
585
+ padding: var(--spacing-sm);
586
+ gap: var(--spacing-sm);
587
+ }
588
+
589
+ .footer-content {
590
+ flex-direction: column;
591
+ gap: var(--spacing-sm);
592
+ text-align: center;
593
+ }
594
+
595
+ .stats {
596
+ flex-direction: column;
597
+ gap: var(--spacing-sm);
598
+ }
599
+ }
600
+
601
+ /* ========== Accessibility ========== */
602
+ /* Focus styles for keyboard navigation */
603
+ button:focus-visible,
604
+ input:focus-visible,
605
+ textarea:focus-visible {
606
+ outline: 2px solid var(--accent-primary);
607
+ outline-offset: 2px;
608
+ }
609
+
610
+ /* Hidden but accessible */
611
+ .sr-only {
612
+ position: absolute;
613
+ width: 1px;
614
+ height: 1px;
615
+ padding: 0;
616
+ margin: -1px;
617
+ overflow: hidden;
618
+ clip: rect(0, 0, 0, 0);
619
+ white-space: nowrap;
620
+ border-width: 0;
621
+ }
622
+
623
+ /* Reduce motion for accessibility */
624
+ @media (prefers-reduced-motion: reduce) {
625
+ *,
626
+ *::before,
627
+ *::after {
628
+ animation-duration: 0.01ms !important;
629
+ animation-iteration-count: 1 !important;
630
+ transition-duration: 0.01ms !important;
631
+ }
632
+ }
633
+
634
+ /* ========== Graph Node Styles (for JS visualization) ========== */
635
+ /* These classes will be used by the graph visualization library */
636
+ .graph-node {
637
+ cursor: pointer;
638
+ transition: all var(--transition-fast);
639
+ }
640
+
641
+ .graph-node.concept circle { fill: var(--node-concept); }
642
+ .graph-node.function circle { fill: var(--node-function); }
643
+ .graph-node.class circle { fill: var(--node-class); }
644
+ .graph-node.term circle { fill: var(--node-term); }
645
+
646
+ .graph-node:hover circle {
647
+ stroke-width: 3px;
648
+ filter: brightness(1.2);
649
+ }
650
+
651
+ .graph-node.selected circle {
652
+ stroke: var(--accent-primary);
653
+ stroke-width: 4px;
654
+ animation: pulse 1.5s infinite;
655
+ }
656
+
657
+ @keyframes pulse {
658
+ 0%, 100% {
659
+ box-shadow: 0 0 0 0 var(--accent-glow);
660
+ }
661
+ 50% {
662
+ box-shadow: 0 0 0 10px rgba(79, 158, 255, 0);
663
+ }
664
+ }
665
+
666
+ .graph-edge {
667
+ stroke: var(--text-muted);
668
+ stroke-width: 1.5px;
669
+ fill: none;
670
+ opacity: 0.6;
671
+ }
672
+
673
+ .graph-edge.highlighted {
674
+ stroke: var(--accent-primary);
675
+ opacity: 1;
676
+ stroke-width: 2px;
677
+ }
678
+
679
+ /* Vis.js Network Canvas Constraints */
680
+ .graph-container canvas {
681
+ max-width: 100% !important;
682
+ max-height: 100% !important;
683
+ }
684
+
685
+ .graph-container > div {
686
+ width: 100% !important;
687
+ height: 100% !important;
688
+ max-height: 100% !important;
689
+ }
690
+
691
+ /* ========== Processing Overlay ========== */
692
+ .processing-overlay {
693
+ position: fixed;
694
+ top: 0;
695
+ left: 0;
696
+ width: 100%;
697
+ height: 100%;
698
+ background: rgba(15, 17, 21, 0.95);
699
+ backdrop-filter: blur(8px);
700
+ display: flex;
701
+ align-items: center;
702
+ justify-content: center;
703
+ z-index: 10000;
704
+ animation: fadeIn 0.3s ease-in-out;
705
+ }
706
+
707
+ .processing-overlay[hidden] {
708
+ display: none;
709
+ }
710
+
711
+ @keyframes fadeIn {
712
+ from {
713
+ opacity: 0;
714
+ }
715
+ to {
716
+ opacity: 1;
717
+ }
718
+ }
719
+
720
+ .processing-modal {
721
+ background: var(--bg-card);
722
+ border: 1px solid var(--border-color);
723
+ border-radius: 16px;
724
+ padding: 3rem 4rem;
725
+ box-shadow: var(--shadow-lg);
726
+ text-align: center;
727
+ min-width: 400px;
728
+ animation: slideUp 0.4s ease-out;
729
+ }
730
+
731
+ @keyframes slideUp {
732
+ from {
733
+ transform: translateY(30px);
734
+ opacity: 0;
735
+ }
736
+ to {
737
+ transform: translateY(0);
738
+ opacity: 1;
739
+ }
740
+ }
741
+
742
+ /* Spinner Animation */
743
+ .spinner {
744
+ width: 80px;
745
+ height: 80px;
746
+ margin: 0 auto 2rem;
747
+ border: 6px solid var(--border-color);
748
+ border-top: 6px solid var(--accent-primary);
749
+ border-radius: 50%;
750
+ animation: spin 1s linear infinite;
751
+ }
752
+
753
+ @keyframes spin {
754
+ 0% {
755
+ transform: rotate(0deg);
756
+ }
757
+ 100% {
758
+ transform: rotate(360deg);
759
+ }
760
+ }
761
+
762
+ #processing-title {
763
+ color: var(--text-primary);
764
+ font-size: 1.75rem;
765
+ font-weight: 600;
766
+ margin-bottom: 1rem;
767
+ }
768
+
769
+ #processing-message {
770
+ color: var(--text-secondary);
771
+ font-size: 1rem;
772
+ margin-bottom: 1.5rem;
773
+ min-height: 1.5rem;
774
+ }
775
+
776
+ /* Progress Bar */
777
+ .progress-bar {
778
+ width: 100%;
779
+ height: 8px;
780
+ background: var(--bg-secondary);
781
+ border-radius: 4px;
782
+ overflow: hidden;
783
+ margin-bottom: 1rem;
784
+ }
785
+
786
+ .progress-fill {
787
+ height: 100%;
788
+ background: linear-gradient(90deg, var(--accent-primary), var(--accent-hover));
789
+ border-radius: 4px;
790
+ transition: width 0.3s ease-out;
791
+ width: 0%;
792
+ box-shadow: 0 0 10px var(--accent-glow);
793
+ }
794
+
795
+ .processing-percent {
796
+ color: var(--text-muted);
797
+ font-size: 0.875rem;
798
+ font-weight: 500;
799
+ letter-spacing: 0.5px;
800
+ }
gemini_extractor.py ADDED
@@ -0,0 +1,612 @@
1
+ """
2
+ Gemini-based Knowledge Graph Extraction
3
+ Simple LLM-powered extraction using Google Gemini (cheapest option)
4
+ """
5
+ from typing import List, Dict, Any, Optional
6
+ from loguru import logger
7
+ from models import Chunk, CanonicalTriple, RelationType
8
+ from config import settings
9
+ import json
10
+ import asyncio
11
+
12
+
13
+ class GeminiExtractor:
14
+ """
15
+ Extract key nodes and relationships using Gemini LLM
16
+ Simple, cost-effective approach for knowledge graph generation
17
+ """
18
+
19
+ def __init__(self, llm_service=None):
20
+ """Initialize Gemini extractor"""
21
+ logger.info("Initializing GeminiExtractor")
22
+
23
+ # Import litellm for API calls
24
+ try:
25
+ import litellm
26
+ self.litellm = litellm
27
+
28
+ # Configure litellm for Gemini
29
+ self.model_name = f"gemini/{settings.gemini_model}"
30
+ self.api_key = settings.gemini_api_key
31
+
32
+ logger.info(f"✓ GeminiExtractor initialized with model: {self.model_name}")
33
+
34
+ except ImportError as e:
35
+ logger.error("litellm not installed. Install with: pip install litellm")
36
+ raise RuntimeError("litellm required for Gemini") from e
37
+
38
+ # Comprehensive list of generic terms to REJECT
39
+ self.generic_stopwords = {
40
+ # Generic nouns
41
+ 'system', 'systems', 'data', 'information', 'value', 'values',
42
+ 'method', 'methods', 'approach', 'approaches', 'technique', 'techniques',
43
+ 'result', 'results', 'study', 'studies', 'paper', 'papers',
44
+ 'section', 'sections', 'figure', 'figures', 'table', 'tables',
45
+ 'example', 'examples', 'case', 'cases', 'type', 'types',
46
+ 'way', 'ways', 'thing', 'things', 'part', 'parts',
47
+ 'model', 'models', 'framework', 'frameworks', # Too generic unless specific
48
+ 'process', 'processes', 'analysis', 'problem', 'problems',
49
+ 'solution', 'solutions', 'set', 'sets', 'group', 'groups',
50
+ 'element', 'elements', 'component', 'components',
51
+ 'feature', 'features', 'property', 'properties',
52
+ 'aspect', 'aspects', 'factor', 'factors', 'parameter', 'parameters',
53
+ 'concept', 'concepts', 'idea', 'ideas', 'theory', 'theories',
54
+ 'field', 'fields', 'area', 'areas', 'domain', 'domains',
55
+ 'task', 'tasks', 'goal', 'goals', 'objective', 'objectives',
56
+ 'input', 'inputs', 'output', 'outputs', 'function', 'functions',
57
+ 'operation', 'operations', 'step', 'steps', 'stage', 'stages',
58
+ 'phase', 'phases', 'level', 'levels', 'layer', 'layers',
59
+ 'number', 'numbers', 'amount', 'amounts', 'size', 'sizes',
60
+ 'performance', 'accuracy', 'quality', 'efficiency',
61
+ 'document', 'documents', 'text', 'texts', 'word', 'words',
62
+ 'sentence', 'sentences', 'paragraph', 'paragraphs',
63
+ 'item', 'items', 'object', 'objects', 'entity', 'entities',
64
+ 'relation', 'relations', 'relationship', 'relationships',
65
+
66
+ # Generic verbs/actions
67
+ 'use', 'uses', 'using', 'used', 'usage',
68
+ 'apply', 'applies', 'applying', 'applied', 'application', 'applications',
69
+ 'work', 'works', 'working', 'worked',
70
+ 'provide', 'provides', 'providing', 'provided',
71
+ 'show', 'shows', 'showing', 'shown',
72
+ 'present', 'presents', 'presenting', 'presented', 'presentation',
73
+
74
+ # Generic adjectives
75
+ 'new', 'novel', 'existing', 'current', 'previous',
76
+ 'different', 'similar', 'same', 'other', 'another',
77
+ 'various', 'several', 'multiple', 'single',
78
+ 'important', 'significant', 'main', 'key', 'major',
79
+ 'good', 'better', 'best', 'high', 'low',
80
+ 'large', 'small', 'big', 'little',
81
+
82
+ # Research-specific generic terms
83
+ 'experiment', 'experiments', 'evaluation', 'evaluations',
84
+ 'test', 'tests', 'testing', 'validation',
85
+ 'comparison', 'comparisons', 'benchmark', 'benchmarks',
86
+ 'baseline', 'baselines', 'metric', 'metrics',
87
+ 'dataset', 'datasets', 'corpus', 'corpora',
88
+
89
+ # Time/sequence terms
90
+ 'time', 'times', 'period', 'periods', 'year', 'years',
91
+ 'first', 'second', 'third', 'last', 'final',
92
+ 'next', 'previous', 'current', 'recent',
93
+
94
+ # Common prepositions/articles (shouldn't appear but just in case)
95
+ 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
96
+
97
+ # Additional generic ML/AI terms (too broad)
98
+ 'neural network', 'deep learning', 'machine learning',
99
+ 'training', 'testing', 'prediction', 'classification',
100
+ 'regression', 'clustering', 'optimization',
101
+ 'network', 'networks', 'algorithm', 'algorithms',
102
+ 'learning', 'training data', 'test data',
103
+ 'feature extraction', 'preprocessing',
104
+ 'hyperparameter', 'hyperparameters',
105
+ 'loss', 'error', 'gradient',
106
+ }
107
+
108
+ async def extract_from_chunks(
109
+ self,
110
+ chunks: List[Chunk],
111
+ use_llm: bool = True
112
+ ) -> List[CanonicalTriple]:
113
+ """
114
+ Extract knowledge graph - PER PAGE with HARD CAP of 2 concepts per page
115
+
116
+ Args:
117
+ chunks: List of text chunks
118
+ use_llm: Always True for Gemini extraction
119
+
120
+ Returns:
121
+ List of canonical triples
122
+ """
123
+ logger.info(f"\n{'='*80}")
124
+ logger.info(f"{'GEMINI PER-PAGE EXTRACTION - 2 CONCEPTS MAX PER PAGE':^80}")
125
+ logger.info(f"{'='*80}")
126
+
127
+ all_triples = []
128
+
129
+ # Filter text chunks
130
+ text_chunks = [c for c in chunks if c.type.value in ["paragraph", "code"]]
131
+
132
+ if not text_chunks:
133
+ logger.warning("No text chunks to process")
134
+ return []
135
+
136
+ # GROUP CHUNKS BY PAGE
137
+ from collections import defaultdict
138
+ chunks_by_page = defaultdict(list)
139
+ for chunk in text_chunks:
140
+ page_num = chunk.page_number or 0
141
+ chunks_by_page[page_num].append(chunk)
142
+
143
+ logger.info(f"Processing {len(chunks_by_page)} pages in PARALLEL")
144
+
145
+ # ⚡ PARALLEL PROCESSING: Create tasks for all pages
146
+ tasks = []
147
+ page_numbers = []
148
+ for page_num in sorted(chunks_by_page.keys()):
149
+ page_chunks = chunks_by_page[page_num]
150
+ combined_text = "\n\n".join([chunk.text for chunk in page_chunks])
151
+
152
+ logger.info(f"📄 PAGE {page_num}: {len(page_chunks)} chunks, {len(combined_text)} chars")
153
+
154
+ # Create async task for this page
155
+ tasks.append(self._extract_with_gemini(combined_text, page_num))
156
+ page_numbers.append(page_num)
157
+
158
+ # Execute all Gemini calls in parallel
159
+ logger.info(f"\n🚀 Launching {len(tasks)} parallel Gemini API calls...")
160
+ import time
161
+ start_time = time.time()
162
+
163
+ results = await asyncio.gather(*tasks, return_exceptions=True)
164
+
165
+ elapsed = time.time() - start_time
166
+ logger.info(f"✓ All {len(tasks)} Gemini calls completed in {elapsed:.2f}s (parallel)")
167
+ logger.info(f" Average: {elapsed/len(tasks):.2f}s per page (would be {elapsed*len(tasks):.2f}s sequential)")
168
+
169
+ # Process results
170
+ for page_num, page_triples in zip(page_numbers, results):
171
+ if isinstance(page_triples, Exception):
172
+ logger.error(f" ❌ Page {page_num} failed: {page_triples}")
173
+ continue
174
+
175
+ if page_triples:
176
+ all_triples.extend(page_triples)
177
+ logger.info(f" ✓ Page {page_num}: Extracted {len(page_triples)} triples")
178
+ for t in page_triples:
179
+ relation_value = t.relation.value if hasattr(t.relation, 'value') else t.relation
180
+ logger.info(f" → {t.subject_label} --[{relation_value}]--> {t.object_label}")
181
+ else:
182
+ logger.warning(f" ⚠️ Page {page_num}: NO TRIPLES EXTRACTED!")
183
+
184
+ # Summary
185
+ unique_concepts = set()
186
+ concepts_by_page = {}
187
+ for triple in all_triples:
188
+ unique_concepts.add(triple.subject_label)
189
+ unique_concepts.add(triple.object_label)
190
+ page = triple.page_number
191
+ if page not in concepts_by_page:
192
+ concepts_by_page[page] = set()
193
+ concepts_by_page[page].add(triple.subject_label)
194
+ concepts_by_page[page].add(triple.object_label)
195
+
196
+ logger.info(f"\n{'='*80}")
197
+ logger.info(f"{'EXTRACTION SUMMARY':^80}")
198
+ logger.info(f"{'='*80}")
199
+ logger.info(f"Pages processed: {len(chunks_by_page)}")
200
+ logger.info(f"Total triples: {len(all_triples)}")
201
+ logger.info(f"Unique concepts: {len(unique_concepts)} (max {len(chunks_by_page) * 2})")
202
+
203
+ if len(all_triples) == 0:
204
+ logger.error(f"\n❌❌❌ CRITICAL ERROR: ZERO TRIPLES EXTRACTED! ❌❌❌")
205
+ logger.error(f"This means:")
206
+ logger.error(f" - Either Gemini returned no concepts")
207
+ logger.error(f" - Or all concepts were rejected by filters")
208
+ logger.error(f" - Or there was an API error")
209
+ logger.error(f"Check the logs above for details!")
210
+ else:
211
+ logger.info(f"\nConcepts per page:")
212
+ for page in sorted(concepts_by_page.keys()):
213
+ logger.info(f" Page {page}: {list(concepts_by_page[page])}")
214
+
215
+ logger.info(f"{'='*80}\n")
216
+
217
+ return all_triples
218
+
219
+ async def _extract_with_gemini(self, text: str, page_number: int) -> List[CanonicalTriple]:
220
+ """
221
+ Call Gemini API to extract technical concepts (nodes) from THIS PAGE
222
+
223
+ Args:
224
+ text: Text from single page
225
+ page_number: Page number
226
+
227
+ Returns:
228
+ List of canonical triples
229
+ """
230
+ # Specialized technical concept extraction prompt
231
+ prompt = f"""You are an expert in technical information extraction and knowledge graph construction.
232
+ Your task is to identify only the most meaningful *technical concepts* from the given text.
233
+ Concepts must represent scientific, mathematical, algorithmic, or methodological entities
234
+ that could exist as standalone nodes in a knowledge graph.
235
+ Ignore generic words, section titles, variable names, and everyday terms.
236
+ Focus on high-value, domain-specific terminology relevant to the text.
237
+
238
+ Extract all important technical concepts from the following text that would form the
239
+ nodes of a knowledge graph.
240
+
241
+ ⚙️ Rules:
242
+ • Each concept should represent a self-contained technical idea, model, method, metric, loss, theorem, or process
243
+ • Keep only multi-word phrases when possible ("gradient descent", "convolutional neural network", "cross-entropy loss")
244
+ • Skip single, contextless nouns ("data", "model", "value", "equation", "result")
245
+ • Merge synonymous terms (e.g., "SGD", "stochastic gradient descent" → one entry)
246
+ • Do not include equations, numeric values, figure names, or symbols
247
+ • Do not repeat concepts
248
+ • Maintain consistent naming conventions (lowercase, hyphen-separated words)
249
+ • Extract MAXIMUM 4-5 concepts from this page (quality over quantity)
250
+
251
+ Return output strictly as JSON with "nodes" key:
252
+ {{
253
+ "nodes": [
254
+ "gradient descent",
255
+ "neural network",
256
+ "cross entropy loss"
257
+ ]
258
+ }}
259
+
260
+ PAGE {page_number} TEXT:
261
+ {text}
262
+
263
+ CRITICAL: Return ONLY the JSON. If no technical concepts found, return {{"nodes": []}}"""
264
+
265
+ logger.info(f" 🚀 Starting Gemini extraction for page {page_number}...")
266
+ logger.info(f" Text length: {len(text)} characters")
267
+
268
+ try:
269
+ # Call Gemini via litellm
270
+ logger.info(f" 📡 Calling Gemini API for page {page_number}...")
271
+
272
+ response = await asyncio.to_thread(
273
+ self.litellm.completion,
274
+ model=self.model_name,
275
+ api_key=self.api_key,
276
+ messages=[{
277
+ "role": "user",
278
+ "content": prompt
279
+ }],
280
+ temperature=0.0,
281
+ max_tokens=settings.llm_max_tokens,
282
+ timeout=settings.llm_timeout
283
+ )
284
+
285
+ # Extract response text
286
+ response_text = response.choices[0].message.content.strip()
287
+ logger.info(f" 📥 Gemini response ({len(response_text)} chars):")
288
+ logger.info(f" {response_text[:500]}")
289
+
290
+ # Strip markdown code fences if Gemini wrapped the JSON in them
291
+ if "```json" in response_text:
292
+ response_text = response_text.split("```json")[1].split("```")[0].strip()
293
+ elif "```" in response_text:
294
+ response_text = response_text.split("```")[1].split("```")[0].strip()
295
+
296
+ data = json.loads(response_text)
297
+
298
+
299
+ if isinstance(data, dict) and "nodes" in data:
300
+ nodes = data["nodes"]
301
+ elif isinstance(data, list):
302
+ # Fallback: if Gemini returned a list directly
303
+ nodes = data
304
+ else:
305
+ logger.warning(f" ❌ Gemini returned unexpected format: {type(data)}")
306
+ return []
307
+
308
+ if not isinstance(nodes, list):
309
+ logger.warning(f" ❌ Nodes is not a list, got: {type(nodes)}")
310
+ return []
311
+
312
+ logger.info(f" ✓ Gemini extracted {len(nodes)} nodes from page {page_number}")
313
+ logger.info(f" Raw nodes: {nodes}")
314
+
315
+ # Validate and filter nodes
316
+ valid_nodes = []
317
+ rejected_nodes = []
318
+
319
+ for node in nodes:
320
+ if not isinstance(node, str):
321
+ logger.warning(f" ⚠️ Skipping non-string node: {node}")
322
+ continue
323
+
324
+ node = node.strip()
325
+ if not node:
326
+ continue
327
+
328
+ logger.info(f" Validating node: '{node}'")
329
+
330
+ # FILTER: Validate node is a technical concept
331
+ if not self._is_technical_concept(node):
332
+ rejected_nodes.append(node)
333
+ logger.warning(f" ✗ REJECTED node '{node}' - not technical enough")
334
+ continue
335
+
336
+ logger.info(f" ✅ ACCEPTED node: '{node}'")
337
+ valid_nodes.append(node.lower())
338
+
339
+ # Summary of rejections
340
+ if rejected_nodes:
341
+ logger.warning(f" 📊 Rejected {len(rejected_nodes)} nodes: {rejected_nodes}")
342
+
343
+ if not valid_nodes:
344
+ logger.warning(f" ⚠️ ALL {len(nodes)} NODES REJECTED for page {page_number}")
345
+ logger.warning(f" No valid technical concepts found. Returning empty list.")
346
+ return []
347
+
348
+
349
+ selected_nodes = valid_nodes[:2]  # HARD CAP: keep at most 2 concepts per page
350
+ logger.info(f" 🎯 Selected {len(selected_nodes)} nodes (hard cap = 2): {selected_nodes}")
351
+
352
+
353
+ page_triples = []
354
+
355
+ if len(selected_nodes) == 1:
356
+ # Only one node - a relationship needs two concepts, so skip this page
357
+ logger.info(f" ℹ️ Only 1 node on page {page_number}, cannot create relationships")
358
+
359
+ return []
360
+
361
+ elif len(selected_nodes) == 2:
362
+ # Use LLM to determine actual relationship between nodes
363
+ node1, node2 = selected_nodes[0], selected_nodes[1]
364
+
365
+ # Extract relationship using LLM with page context
366
+ logger.info(f" 🔍 Extracting relationship between: {node1} ↔ {node2}")
367
+ relationship_triple = await self._extract_relationship_with_gemini(
368
+ text=text,
369
+ node1=node1,
370
+ node2=node2,
371
+ page_number=page_number
372
+ )
373
+
374
+ if relationship_triple:
375
+ page_triples.append(relationship_triple)
376
+ logger.info(f" ✅ Created directed edge:")
377
+ logger.info(f" → {relationship_triple.subject_label} --[{relationship_triple.relation.value}]--> {relationship_triple.object_label}")
378
+ logger.info(f" Justification: {relationship_triple.justification}")
379
+ else:
380
+ logger.warning(f" ⚠️ Could not extract relationship for {node1} ↔ {node2}")
381
+
382
+ logger.info(f" ✅ Returning {len(page_triples)} triples for page {page_number}")
383
+ return page_triples
384
+
385
+ except json.JSONDecodeError as e:
386
+ logger.error(f" ❌ JSON PARSE ERROR for page {page_number}: {e}")
387
+ logger.error(f" Response was: {response_text[:500]}")
388
+ return []
389
+
390
+ except Exception as e:
391
+ logger.error(f" ❌ GEMINI API FAILED for page {page_number}: {e}")
392
+ logger.error(f" Exception type: {type(e).__name__}")
393
+ logger.error(f" Full trace:", exc_info=True)
394
+ return []
395
+
396
+ async def _extract_relationship_with_gemini(self, text: str, node1: str, node2: str, page_number: int) -> Optional[CanonicalTriple]:
397
+ """
398
+ Use Gemini to determine the actual relationship between two nodes based on page context
399
+
400
+ Args:
401
+ text: Full page text for context
402
+ node1: First node/concept
403
+ node2: Second node/concept
404
+ page_number: Page number
405
+
406
+ Returns:
407
+ CanonicalTriple with proper relationship, or None if extraction fails
408
+ """
409
+ # List all available relation types for the LLM
410
+ available_relations = [r.value for r in RelationType]
411
+
412
+ prompt = f"""You are an expert at extracting knowledge graph relationships from technical text.
413
+
414
+ Given two concepts and the text they appear in, determine the most accurate relationship between them.
415
+
416
+ **Concepts:**
417
+ - Concept A: "{node1}"
418
+ - Concept B: "{node2}"
419
+
420
+ **Context (page {page_number}):**
421
+ {text[:3000]}
422
+
423
+ **Available Relationship Types:**
424
+ {', '.join(available_relations)}
425
+
426
+ **Instructions:**
427
+ 1. Analyze how these two concepts relate in the given context
428
+ 2. Choose the MOST SPECIFIC relationship type from the list above
429
+ 3. Determine the direction: which concept is the subject and which is the object
430
+ 4. Provide a brief justification from the text
431
+
432
+ **Output Format (JSON):**
433
+ {{
434
+ "subject": "<node1 or node2>",
435
+ "object": "<node1 or node2>",
436
+ "relation": "<one of the available relationship types>",
437
+ "confidence": <0.0-1.0>,
438
+ "justification": "<brief explanation from text>"
439
+ }}
440
+
441
+ **Rules:**
442
+ - Use the exact concept names provided
443
+ - Choose only ONE relation type from the available list
444
+ - If no clear relationship exists, use "related_to"
445
+ - Direction matters: subject performs/has the relation to the object
446
+ """
447
+
448
+ try:
449
+ # Call Gemini API
450
+ response_text = await self.litellm.acompletion(
451
+ model=self.model_name,
452
+ messages=[
453
+ {"role": "system", "content": "You are an expert at knowledge graph relationship extraction. Always output valid JSON."},
454
+ {"role": "user", "content": prompt}
455
+ ],
456
+ api_key=self.api_key,
457
+ temperature=0.1, # Low temperature for consistent relationship extraction
458
+ response_format={"type": "json_object"}
459
+ )
460
+
461
+ response_content = response_text.choices[0].message.content
462
+ data = json.loads(response_content)
463
+
464
+ # Validate response
465
+ subject = data.get("subject", "").strip()
466
+ obj = data.get("object", "").strip()
467
+ relation_str = data.get("relation", "related_to").lower().strip().replace(" ", "_")
468
+ confidence = float(data.get("confidence", 0.7))
469
+ justification = data.get("justification", f"Relationship extracted from page {page_number}")
470
+
471
+ # Map relation string to enum
472
+ try:
473
+ relation = RelationType(relation_str)
474
+ except ValueError:
475
+ logger.warning(f" ⚠️ Invalid relation '{relation_str}', defaulting to RELATED_TO")
476
+ relation = RelationType.RELATED_TO
477
+
478
+ # Create triple
479
+ triple = CanonicalTriple(
480
+ subject_label=subject,
481
+ object_label=obj,
482
+ relation=relation,
483
+ confidence=confidence,
484
+ justification=justification,
485
+ page_number=page_number
486
+ )
487
+
488
+ return triple
489
+
490
+ except json.JSONDecodeError as e:
491
+ logger.error(f" ❌ JSON parse error in relationship extraction: {e}")
492
+ return None
493
+ except Exception as e:
494
+ logger.error(f" ❌ Relationship extraction failed: {e}")
495
+ return None
496
+
497
+ def _is_technical_concept(self, concept: str) -> bool:
498
+ """
499
+ Validate that a concept is a specific technical term (reject generic words).
500
+ Args:
501
+ concept: Concept string to validate
502
+
503
+ Returns:
504
+ True if highly technical/specific, False otherwise
505
+ """
506
+ concept_lower = concept.lower().strip()
507
+
508
+ # RULE 1: Reject if in stopwords
509
+ if concept_lower in self.generic_stopwords:
510
+ logger.debug(f"Rejected '{concept}' - in stopword list")
511
+ return False
512
+
513
+ # RULE 2: Reject if any word is a generic stopword (stricter)
514
+ words = concept_lower.split()
515
+ for word in words:
516
+ if word in self.generic_stopwords:
517
+ # Allow if it's part of a specific multi-word technical term
518
+ # e.g., "convolutional neural network" has "network" but is specific
519
+ if len(words) < 2:
520
+ logger.debug(f"Rejected '{concept}' - contains generic word '{word}'")
521
+ return False
522
+
523
+ # RULE 3: Single-word concepts must have SOME specificity (RELAXED)
524
+ if len(words) == 1:
525
+ # Accept if ANY of these are true:
526
+ # - Has uppercase (BERT, Adam, PyTorch)
527
+ # - Has numbers (VGG16, GPT3)
528
+ # - Has special chars (t-SNE, bi-LSTM)
529
+ # - Longish word (8+ chars like "backpropagation")
530
+ has_uppercase = any(c.isupper() for c in concept)
531
+ has_numbers = any(c.isdigit() for c in concept)
532
+ has_special = '-' in concept or '_' in concept
533
+ is_longish = len(concept) >= 8 # RELAXED from 10
534
+
535
+ if not (has_uppercase or has_numbers or has_special or is_longish):
536
+ logger.debug(f"Rejected '{concept}' - single word not specific enough")
537
+ return False
538
+
539
+ # RULE 4: Multi-word phrases - very lenient
540
+ if len(words) >= 2:
541
+ # Just check that it's not ALL generic words
542
+ # At least one word should be non-generic or have caps/numbers
543
+ has_caps = any(c.isupper() for c in concept)
544
+ has_numbers = any(c.isdigit() for c in concept)
545
+ has_hyphen = '-' in concept
546
+
547
+ # Count non-generic words
548
+ non_generic_count = sum(1 for w in words if w not in self.generic_stopwords)
549
+
550
+ # Accept if ANY of these:
551
+ # - Has caps/numbers/hyphen
552
+ # - At least one word is non-generic
553
+ # - 3+ words (likely specific enough)
554
+ if not (has_caps or has_numbers or has_hyphen or non_generic_count > 0 or len(words) >= 3):
555
+ logger.debug(f"Rejected '{concept}' - multi-word phrase too generic")
556
+ return False
557
+
558
+ # RULE 5: Reject very short terms (1-2 chars) unless they're known acronyms (all caps)
559
+ if len(concept) <= 2 and concept.upper() != concept:
560
+ logger.debug(f"Rejected '{concept}' - too short")
561
+ return False
562
+
563
+ # RULE 6: Must contain at least one alphanumeric character
564
+ if not any(c.isalnum() for c in concept):
565
+ logger.debug(f"Rejected '{concept}' - no alphanumeric chars")
566
+ return False
567
+
568
+ # RULE 7: Reject if it's just a generic category with a modifier
569
+ # e.g., "new algorithm", "proposed method", "our model"
570
+ generic_patterns = [
571
+ 'new ', 'novel ', 'proposed ', 'our ', 'this ', 'that ',
572
+ 'these ', 'those ', 'such ', 'other ', 'another ',
573
+ 'existing ', 'current ', 'previous ', 'standard '
574
+ ]
575
+ for pattern in generic_patterns:
576
+ if concept_lower.startswith(pattern):
577
+ logger.debug(f"Rejected '{concept}' - generic pattern")
578
+ return False
579
+
580
+ # Passed all strict filters
581
+ return True
582
+
583
+ def _map_relation(self, relation_str: str) -> RelationType:
584
+ """Map relation string to RelationType enum"""
585
+ relation_lower = relation_str.lower().strip()
586
+
587
+ # Direct mapping
588
+ mapping = {
589
+ "uses": RelationType.USES,
590
+ "implements": RelationType.IMPLEMENTS,
591
+ "is_a": RelationType.IS_A,
592
+ "is a": RelationType.IS_A,
593
+ "part_of": RelationType.PART_OF,
594
+ "part of": RelationType.PART_OF,
595
+ "requires": RelationType.REQUIRES,
596
+ "produces": RelationType.PRODUCES,
597
+ "enables": RelationType.ENABLES,
598
+ "improves": RelationType.IMPROVES,
599
+ "enhances": RelationType.ENHANCES,
600
+ "contains": RelationType.CONTAINS,
601
+ "depends_on": RelationType.DEPENDS_ON,
602
+ "depends on": RelationType.DEPENDS_ON,
603
+ "related_to": RelationType.RELATED_TO,
604
+ "related to": RelationType.RELATED_TO,
605
+ }
606
+
607
+ if relation_lower in mapping:
608
+ return mapping[relation_lower]
609
+
610
+ # Fallback
611
+ logger.debug(f"Unknown relation '{relation_str}', using 'related_to'")
612
+ return RelationType.RELATED_TO
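A minimal usage sketch for GeminiExtractor (not part of the uploaded files): it assumes Chunk objects produced by the ingestion pipeline elsewhere in this repo and a GEMINI_API_KEY configured in settings; the run_extraction helper below is illustrative only.

import asyncio
from gemini_extractor import GeminiExtractor

async def run_extraction(chunks):
    # One Gemini call per page, launched in parallel by extract_from_chunks;
    # after the technical-concept filter, at most one triple survives per page.
    extractor = GeminiExtractor()
    triples = await extractor.extract_from_chunks(chunks)
    for t in triples:
        relation = t.relation.value if hasattr(t.relation, "value") else t.relation
        print(f"{t.subject_label} --[{relation}]--> {t.object_label} (page {t.page_number})")
    return triples

# triples = asyncio.run(run_extraction(chunks))  # 'chunks' come from the PDF ingester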
graph_builder.py ADDED
@@ -0,0 +1,268 @@
1
+ """
2
+ Graph Builder - constructs knowledge graph from canonical triples
3
+ Handles entity canonicalization, node/edge creation, and graph pruning
4
+ """
5
+ from typing import List, Dict, Any, Set, Tuple
6
+ from loguru import logger
7
+ from models import CanonicalTriple, GraphNode, GraphEdge, SupportingChunk, NodeType
8
+ from graph_store import GraphStore
9
+ from embedding_service import EmbeddingService
10
+ from config import settings
11
+ import numpy as np
12
+ from collections import defaultdict
13
+
14
+
15
+ class GraphBuilder:
16
+ """
17
+ Builds and refines knowledge graph from canonical triples
18
+ Implements entity canonicalization, deduplication, and pruning
19
+ """
20
+
21
+ def __init__(self, graph_store: GraphStore, embedding_service: EmbeddingService):
22
+ self.graph_store = graph_store
23
+ self.embedding_service = embedding_service
24
+ self.entity_embeddings: Dict[str, np.ndarray] = {}
25
+
26
+ async def build_graph(self, triples: List[CanonicalTriple]) -> Tuple[int, int]:
27
+ """
28
+ Build graph from canonical triples
29
+
30
+ Args:
31
+ triples: List of canonical triples
32
+
33
+ Returns:
34
+ Tuple of (num_nodes_added, num_edges_added)
35
+ """
36
+ logger.info(f"Building graph from {len(triples)} triples")
37
+
38
+ # Step 1: Entity canonicalization - merge similar entities
39
+ entity_map = await self._canonicalize_entities(triples)
40
+
41
+ # Step 2: Create nodes
42
+ nodes_created = 0
43
+ logger.info(f"Creating nodes from {len(entity_map)} canonical entities")
44
+
45
+ for entity_label in entity_map.keys():
46
+ node = await self._create_node(entity_label, entity_map, triples)
47
+ if self.graph_store.add_node(node):
48
+ nodes_created += 1
49
+ logger.debug(f"Created node: {node.label} (type: {node.type.value})")
50
+
51
+ logger.info(f"✓ Successfully created {nodes_created} nodes")
52
+
53
+ # Step 3: Create edges
54
+ edges_created = 0
55
+ for triple in triples:
56
+ # Map to canonical entities
57
+ canonical_subject = entity_map.get(triple.subject_label, triple.subject_label)
58
+ canonical_object = entity_map.get(triple.object_label, triple.object_label)
59
+
60
+ # Skip self-loops
61
+ if canonical_subject == canonical_object:
62
+ continue
63
+
64
+ # Get node IDs
65
+ subject_node = self.graph_store.get_node_by_label(canonical_subject)
66
+ object_node = self.graph_store.get_node_by_label(canonical_object)
67
+
68
+ if not subject_node or not object_node:
69
+ continue
70
+
71
+ # Create edge
72
+ edge = self._create_edge(subject_node, object_node, triple)
73
+ if self.graph_store.add_edge(edge):
74
+ edges_created += 1
75
+
76
+ logger.info(f"Created {nodes_created} nodes and {edges_created} edges")
77
+
78
+ # Step 4: Compute importance scores
79
+ self._compute_importance_scores()
80
+
81
+ # Step 5: Prune low-importance nodes and edges
82
+ pruned_nodes, pruned_edges = self._prune_graph()
83
+
84
+ logger.info(f"Pruned {pruned_nodes} nodes and {pruned_edges} edges")
85
+ logger.info(f"Final graph: {nodes_created - pruned_nodes} nodes, {edges_created - pruned_edges} edges")
86
+
87
+ return nodes_created - pruned_nodes, edges_created - pruned_edges
88
+
89
+ async def _canonicalize_entities(self, triples: List[CanonicalTriple]) -> Dict[str, str]:
90
+ """
91
+ ⚡ OPTIMIZATION: Skip expensive canonicalization (identity mapping)
92
+
93
+ With 2 nodes per page hard cap and strict technical filtering,
94
+ we have very few duplicates and highly specific entities.
95
+ Embedding computation + O(n²) similarity checks not worth the cost.
96
+
97
+ Args:
98
+ triples: List of triples
99
+
100
+ Returns:
101
+ Dict mapping entity_label -> canonical_label (identity map)
102
+ """
103
+ # Collect all unique entities
104
+ entities = set()
105
+ for triple in triples:
106
+ entities.add(triple.subject_label)
107
+ entities.add(triple.object_label)
108
+
109
+ # DETERMINISTIC: Sort entities for consistent ordering across runs
110
+ entities_list = sorted(list(entities))
111
+ logger.info(f"⚡ FAST MODE: Skipping entity canonicalization for {len(entities_list)} unique entities")
112
+ logger.info(f"Each entity maps to itself (no merging)")
113
+
114
+ # Return identity mapping - each entity maps to itself
115
+ entity_map = {entity: entity for entity in entities_list}
116
+
117
+ logger.info(f"✓ Identity mapping created (0 merges, {len(entities_list)} canonical entities)")
118
+
119
+ return entity_map
120
+
121
+ def _entity_to_text(self, entity: str) -> str:
122
+ """Convert entity label to text for embedding"""
123
+ # Simple approach: use the label as-is
124
+ return entity
125
+
126
+ async def _create_node(
127
+ self,
128
+ label: str,
129
+ entity_map: Dict[str, str],
130
+ triples: List[CanonicalTriple]
131
+ ) -> GraphNode:
132
+ """
133
+ Create a graph node for an entity
134
+
135
+ Args:
136
+ label: Canonical entity label
137
+ entity_map: Entity canonicalization map
138
+ triples: All triples (to find supporting chunks)
139
+
140
+ Returns:
141
+ GraphNode
142
+ """
143
+ # Find all triples mentioning this entity
144
+ supporting_chunks = []
145
+ aliases = []
146
+
147
+ for original_label, canonical_label in entity_map.items():
148
+ if canonical_label == label:
149
+ if original_label != label:
150
+ aliases.append(original_label)
151
+
152
+ # Collect supporting chunks from triples
153
+ chunk_scores = defaultdict(float)
154
+ for triple in triples:
155
+ canonical_subject = entity_map.get(triple.subject_label, triple.subject_label)
156
+ canonical_object = entity_map.get(triple.object_label, triple.object_label)
157
+
158
+ if canonical_subject == label or canonical_object == label:
159
+ # This triple supports the node
160
+ chunk_key = (triple.page_number, triple.justification[:100]) # Use justification as proxy
161
+ chunk_scores[chunk_key] += triple.confidence
162
+
163
+ # Convert to SupportingChunk objects
164
+ for (page_number, snippet), score in chunk_scores.items():
165
+ supporting_chunks.append(SupportingChunk(
166
+ chunk_id=f"page_{page_number}", # Placeholder
167
+ score=score,
168
+ page_number=page_number,
169
+ snippet=snippet
170
+ ))
171
+
172
+ # DETERMINISTIC: Sort by score (desc) then page_number (asc) for stable ordering
173
+ supporting_chunks.sort(key=lambda x: (-x.score, x.page_number))
174
+ supporting_chunks = supporting_chunks[:10]
175
+
176
+ # Infer node type (simple heuristic)
177
+ node_type = self._infer_node_type(label)
178
+
179
+ node = GraphNode(
180
+ label=label,
181
+ type=node_type,
182
+ aliases=aliases,
183
+ supporting_chunks=supporting_chunks,
184
+ importance_score=0.0 # Will be computed later
185
+ )
186
+
187
+ return node
188
+
189
+ def _infer_node_type(self, label: str) -> NodeType:
190
+ """Infer node type from label (simple heuristics)"""
191
+ label_lower = label.lower()
192
+
193
+ # Check for common patterns
194
+ if any(word in label_lower for word in ["function", "method", "algorithm"]):
195
+ return NodeType.FUNCTION
196
+ elif any(word in label_lower for word in ["class", "type", "struct"]):
197
+ return NodeType.CLASS
198
+ elif label[0].isupper() and " " not in label: # Capitalized single word
199
+ return NodeType.PERSON
200
+ elif any(word in label_lower for word in ["definition", "term", "concept"]):
201
+ return NodeType.TERM
202
+ else:
203
+ return NodeType.CONCEPT
204
+
205
+ def _create_edge(
206
+ self,
207
+ from_node: GraphNode,
208
+ to_node: GraphNode,
209
+ triple: CanonicalTriple
210
+ ) -> GraphEdge:
211
+ """Create a graph edge from a triple"""
212
+ supporting_chunk = SupportingChunk(
213
+ chunk_id=f"page_{triple.page_number}",
214
+ score=triple.confidence,
215
+ page_number=triple.page_number,
216
+ snippet=triple.justification
217
+ )
218
+
219
+ edge = GraphEdge(
220
+ from_node=from_node.node_id,
221
+ to_node=to_node.node_id,
222
+ relation=triple.relation,
223
+ confidence=triple.confidence,
224
+ supporting_chunks=[supporting_chunk]
225
+ )
226
+
227
+ return edge
228
+
229
+ def _compute_importance_scores(self):
230
+ """
231
+ ⚡ OPTIMIZATION: Simplified importance scoring (skip expensive PageRank)
232
+
233
+ Since we're not pruning, we only need basic scores for display purposes.
234
+ """
235
+ logger.info("⚡ FAST MODE: Computing simplified importance scores (no PageRank)")
236
+
237
+ # Update node importance with simple metric (just degree centrality)
238
+ for node in self.graph_store.get_all_nodes():
239
+ # Simple importance = number of connections (fast to compute)
240
+ num_neighbors = len(self.graph_store.get_neighbors(node.node_id))
241
+
242
+ # Normalize to 0-1 range (assume max 10 connections)
243
+ importance = min(num_neighbors / 10.0, 1.0)
244
+
245
+ node.importance_score = importance
246
+
247
+ # Update in store (for NetworkX)
248
+ if not self.graph_store.use_neo4j:
249
+ self.graph_store.nodes_dict[node.node_id] = node
250
+
251
+ logger.info(f"✓ Importance scores computed (based on degree centrality only)")
252
+
253
+ def _prune_graph(self) -> Tuple[int, int]:
254
+ """
255
+ ⚡ OPTIMIZATION: Skip pruning (we already filter at extraction)
256
+
257
+ Pruning is expensive (PageRank + multiple graph traversals).
258
+ With strict filtering at extraction (technical concepts only, 2 per page),
259
+ we don't need additional pruning.
260
+
261
+ Returns:
262
+ Tuple of (nodes_removed, edges_removed) - always (0, 0)
263
+ """
264
+ logger.info(f"⚡ FAST MODE: Skipping graph pruning")
265
+ logger.info(f"Nodes already filtered at extraction with strict technical validation")
266
+ logger.info(f"Final graph: {len(self.graph_store.get_all_nodes())} nodes, {len(self.graph_store.get_all_edges())} edges")
267
+
268
+ return 0, 0
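A minimal wiring sketch for GraphBuilder (not part of the uploaded files); it assumes EmbeddingService, defined elsewhere in this commit, can be constructed with its defaults, and that triples come from the extractor above.

import asyncio
from embedding_service import EmbeddingService
from graph_builder import GraphBuilder
from graph_store import GraphStore

async def build(triples):
    # NetworkX backend; in fast mode canonicalization and pruning are no-ops,
    # so the final graph mirrors the extractor's triples one-to-one.
    store = GraphStore(use_neo4j=False)
    builder = GraphBuilder(store, EmbeddingService())
    nodes, edges = await builder.build_graph(triples)
    print(f"graph built: {nodes} nodes, {edges} edges")
    return store

# store = asyncio.run(build(triples))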
graph_store.py ADDED
@@ -0,0 +1,347 @@
1
+ """
2
+ Knowledge Graph Store
3
+ Manages nodes, edges, and graph operations
4
+ Supports both NetworkX (local) and Neo4j (production)
5
+ """
6
+ import networkx as nx
7
+ from neo4j import GraphDatabase
8
+ from typing import List, Dict, Any, Optional, Tuple, Set
9
+ from loguru import logger
10
+ from models import GraphNode, GraphEdge, CanonicalTriple, SupportingChunk, NodeType, RelationType
11
+ from config import settings
12
+ import json
13
+ import pickle
14
+ from collections import defaultdict
15
+ from embedding_service import EmbeddingService
16
+
17
+
18
+ class GraphStore:
19
+ """
20
+ Manages the knowledge graph with nodes and edges
21
+ Supports multiple backends: NetworkX (default) or Neo4j
22
+ """
23
+
24
+ def __init__(self, use_neo4j: bool = False, embedding_service: Optional[EmbeddingService] = None):
25
+ self.use_neo4j = use_neo4j
26
+ self.embedding_service = embedding_service
27
+
28
+ if use_neo4j:
29
+ self._init_neo4j()
30
+ else:
31
+ self.graph = nx.MultiGraph() # Undirected graph (no arrows)
32
+ self.nodes_dict: Dict[str, GraphNode] = {} # node_id -> GraphNode
33
+ self.edges_dict: Dict[str, GraphEdge] = {} # edge_id -> GraphEdge
34
+
35
+ logger.info(f"Initialized GraphStore (backend: {'Neo4j' if use_neo4j else 'NetworkX'}, undirected graph)")
36
+
37
+ def _init_neo4j(self):
38
+ """Initialize Neo4j connection"""
39
+ try:
40
+ self.driver = GraphDatabase.driver(
41
+ settings.neo4j_uri,
42
+ auth=(settings.neo4j_user, settings.neo4j_password)
43
+ )
44
+ # Test connection
45
+ with self.driver.session() as session:
46
+ session.run("RETURN 1")
47
+ logger.info("Connected to Neo4j successfully")
48
+ except Exception as e:
49
+ logger.error(f"Failed to connect to Neo4j: {e}")
50
+ logger.info("Falling back to NetworkX (undirected)")
51
+ self.use_neo4j = False
52
+ self.graph = nx.MultiGraph() # Undirected graph
53
+ self.nodes_dict = {}
54
+ self.edges_dict = {}
55
+
56
+ def add_node(self, node: GraphNode) -> bool:
57
+ """
58
+ Add a node to the graph
59
+
60
+ Args:
61
+ node: GraphNode to add
62
+
63
+ Returns:
64
+ True if added, False if already exists
65
+ """
66
+ if self.use_neo4j:
67
+ return self._add_node_neo4j(node)
68
+ else:
69
+ if node.node_id in self.nodes_dict:
70
+ return False
71
+
72
+ self.nodes_dict[node.node_id] = node
73
+ # Handle both enum and string for type field
74
+ node_type = node.type.value if hasattr(node.type, 'value') else node.type
75
+ self.graph.add_node(
76
+ node.node_id,
77
+ label=node.label,
78
+ type=node_type,
79
+ importance=node.importance_score
80
+ )
81
+ return True
82
+
83
+ def add_edge(self, edge: GraphEdge) -> bool:
84
+ """
85
+ Add an edge to the graph
86
+
87
+ Args:
88
+ edge: GraphEdge to add
89
+
90
+ Returns:
91
+ True if added successfully
92
+ """
93
+ if self.use_neo4j:
94
+ return self._add_edge_neo4j(edge)
95
+ else:
96
+ self.edges_dict[edge.edge_id] = edge
97
+ # Handle both enum and string for relation field
98
+ relation_value = edge.relation.value if hasattr(edge.relation, 'value') else edge.relation
99
+ self.graph.add_edge(
100
+ edge.from_node,
101
+ edge.to_node,
102
+ key=edge.edge_id,
103
+ relation=relation_value,
104
+ confidence=edge.confidence
105
+ )
106
+ return True
107
+
108
+ def get_node(self, node_id: str) -> Optional[GraphNode]:
109
+ """Get node by ID"""
110
+ if self.use_neo4j:
111
+ return self._get_node_neo4j(node_id)
112
+ else:
113
+ return self.nodes_dict.get(node_id)
114
+
115
+ def update_node(self, node: GraphNode) -> bool:
116
+ """
117
+ Update an existing node in the graph
118
+
119
+ Args:
120
+ node: GraphNode with updated data
121
+
122
+ Returns:
123
+ True if updated successfully, False if node doesn't exist
124
+ """
125
+ if node.node_id not in self.nodes_dict:
126
+ return False
127
+
128
+ # Update in dictionary
129
+ self.nodes_dict[node.node_id] = node
130
+
131
+ # Update NetworkX graph attributes
132
+ if node.node_id in self.graph:
133
+ node_type = node.type.value if hasattr(node.type, 'value') else node.type
134
+ self.graph.nodes[node.node_id]['label'] = node.label
135
+ self.graph.nodes[node.node_id]['type'] = node_type
136
+ self.graph.nodes[node.node_id]['importance'] = node.importance_score
137
+
138
+ return True
139
+
140
+ def get_node_by_label(self, label: str) -> Optional[GraphNode]:
141
+ """Get node by label (case-insensitive)"""
142
+ label_lower = label.lower()
143
+ for node in self.nodes_dict.values():
144
+ if node.label.lower() == label_lower or label_lower in [a.lower() for a in node.aliases]:
145
+ return node
146
+ return None
147
+
148
+ def get_neighbors(self, node_id: str) -> List[Tuple[GraphNode, GraphEdge]]:
149
+ """
150
+ Get neighboring nodes and connecting edges (undirected graph)
151
+
152
+ Args:
153
+ node_id: Node to get neighbors for
154
+
155
+ Returns:
156
+ List of (neighbor_node, edge) tuples
157
+ """
158
+ if self.use_neo4j:
159
+ return self._get_neighbors_neo4j(node_id)
160
+ else:
161
+ neighbors = []
162
+ # For undirected graph, just get all neighbors
163
+ for neighbor_id in self.graph.neighbors(node_id):
164
+ edges = self.graph.get_edge_data(node_id, neighbor_id)
165
+ if edges:
166
+ for edge_key, edge_data in edges.items():
167
+ edge = self.edges_dict.get(edge_key)
168
+ neighbor_node = self.nodes_dict.get(neighbor_id)
169
+ if edge and neighbor_node:
170
+ neighbors.append((neighbor_node, edge))
171
+
172
+ return neighbors
173
+
174
+ def get_all_nodes(self) -> List[GraphNode]:
175
+ """Get all nodes in graph"""
176
+ if self.use_neo4j:
177
+ return self._get_all_nodes_neo4j()
178
+ else:
179
+ return list(self.nodes_dict.values())
180
+
181
+ def get_all_edges(self) -> List[GraphEdge]:
182
+ """Get all edges in graph"""
183
+ if self.use_neo4j:
184
+ return self._get_all_edges_neo4j()
185
+ else:
186
+ return list(self.edges_dict.values())
187
+
188
+ def remove_node(self, node_id: str):
189
+ """Remove node and its edges"""
190
+ if self.use_neo4j:
191
+ self._remove_node_neo4j(node_id)
192
+ else:
193
+ if node_id in self.nodes_dict:
194
+ del self.nodes_dict[node_id]
195
+ self.graph.remove_node(node_id)
196
+
197
+ def remove_edge(self, edge_id: str):
198
+ """Remove edge"""
199
+ if self.use_neo4j:
200
+ self._remove_edge_neo4j(edge_id)
201
+ else:
202
+ if edge_id in self.edges_dict:
203
+ edge = self.edges_dict[edge_id]
204
+ del self.edges_dict[edge_id]
205
+ if self.graph.has_edge(edge.from_node, edge.to_node, key=edge_id):
206
+ self.graph.remove_edge(edge.from_node, edge.to_node, key=edge_id)
207
+
208
+ def compute_centrality(self) -> Dict[str, float]:
209
+ """
210
+ Compute node centrality scores (degree centrality for undirected graph)
211
+
212
+ Returns:
213
+ Dict mapping node_id -> centrality score
214
+ """
215
+ if self.use_neo4j:
216
+ # Use Neo4j's centrality algorithm
217
+ return self._compute_centrality_neo4j()
218
+ else:
219
+ try:
220
+ # Use degree centrality for undirected graph (simpler and faster)
221
+ centrality = nx.degree_centrality(self.graph)
222
+ return centrality
223
+ except Exception as e:
224
+ logger.error(f"Failed to compute centrality: {e}")
225
+ return {}
226
+
227
+ def save(self, filepath: str):
228
+ """Save graph to file (NetworkX only)"""
229
+ if self.use_neo4j:
230
+ logger.info("Neo4j graphs are persisted automatically")
231
+ return
232
+
233
+ data = {
234
+ "nodes": [node.dict() for node in self.nodes_dict.values()],
235
+ "edges": [edge.dict() for edge in self.edges_dict.values()],
236
+ }
237
+
238
+ with open(filepath, 'wb') as f:
239
+ pickle.dump(data, f)
240
+
241
+ logger.info(f"Saved graph with {len(self.nodes_dict)} nodes and {len(self.edges_dict)} edges to {filepath}")
242
+
243
+ def load(self, filepath: str):
244
+ """Load graph from file (NetworkX only)"""
245
+ if self.use_neo4j:
246
+ logger.warning("Cannot load into Neo4j from file")
247
+ return
248
+
249
+ with open(filepath, 'rb') as f:
250
+ data = pickle.load(f)
251
+
252
+ # Reconstruct nodes
253
+ for node_data in data["nodes"]:
254
+ node = GraphNode(**node_data)
255
+ self.add_node(node)
256
+
257
+ # Reconstruct edges
258
+ for edge_data in data["edges"]:
259
+ edge = GraphEdge(**edge_data)
260
+ self.add_edge(edge)
261
+
262
+ logger.info(f"Loaded graph with {len(self.nodes_dict)} nodes and {len(self.edges_dict)} edges")
263
+
264
+ def clear(self):
265
+ """Clear all nodes and edges"""
266
+ if self.use_neo4j:
267
+ self._clear_neo4j()
268
+ else:
269
+ self.graph.clear()
270
+ self.nodes_dict.clear()
271
+ self.edges_dict.clear()
272
+
273
+ # Neo4j implementations (placeholders - implement as needed)
274
+
275
+ def _add_node_neo4j(self, node: GraphNode) -> bool:
276
+ """Add node to Neo4j"""
277
+ with self.driver.session() as session:
278
+ # Handle both enum and string for type field
279
+ node_type = node.type.value if hasattr(node.type, 'value') else node.type
280
+ result = session.run(
281
+ """
282
+ MERGE (n:Entity {node_id: $node_id})
283
+ ON CREATE SET n.label = $label, n.type = $type,
284
+ n.importance = $importance, n.created_at = datetime()
285
+ RETURN n
286
+ """,
287
+ node_id=node.node_id,
288
+ label=node.label,
289
+ type=node_type,
290
+ importance=node.importance_score
291
+ )
292
+ return result.single() is not None
293
+
294
+ def _add_edge_neo4j(self, edge: GraphEdge) -> bool:
295
+ """Add edge to Neo4j"""
296
+ with self.driver.session() as session:
297
+ # Handle both enum and string for relation field
298
+ relation_value = edge.relation.value if hasattr(edge.relation, 'value') else edge.relation
299
+ session.run(
300
+ """
301
+ MATCH (a:Entity {node_id: $from_node})
302
+ MATCH (b:Entity {node_id: $to_node})
303
+ CREATE (a)-[r:RELATES {edge_id: $edge_id, relation: $relation,
304
+ confidence: $confidence}]->(b)
305
+ """,
306
+ from_node=edge.from_node,
307
+ to_node=edge.to_node,
308
+ edge_id=edge.edge_id,
309
+ relation=relation_value,
310
+ confidence=edge.confidence
311
+ )
312
+ return True
313
+
314
+ def _get_node_neo4j(self, node_id: str) -> Optional[GraphNode]:
315
+ """Get node from Neo4j"""
316
+ # Implementation omitted for brevity
317
+ pass
318
+
319
+ def _get_neighbors_neo4j(self, node_id: str) -> List[Tuple[GraphNode, GraphEdge]]:
320
+ """Get neighbors from Neo4j"""
321
+ # Implementation omitted for brevity
322
+ pass
323
+
324
+ def _get_all_nodes_neo4j(self) -> List[GraphNode]:
325
+ """Get all nodes from Neo4j"""
326
+ pass
327
+
328
+ def _get_all_edges_neo4j(self) -> List[GraphEdge]:
329
+ """Get all edges from Neo4j"""
330
+ pass
331
+
332
+ def _remove_node_neo4j(self, node_id: str):
333
+ """Remove node from Neo4j"""
334
+ pass
335
+
336
+ def _remove_edge_neo4j(self, edge_id: str):
337
+ """Remove edge from Neo4j"""
338
+ pass
339
+
340
+ def _compute_centrality_neo4j(self) -> Dict[str, float]:
341
+ """Compute centrality in Neo4j"""
342
+ pass
343
+
344
+ def _clear_neo4j(self):
345
+ """Clear Neo4j database"""
346
+ with self.driver.session() as session:
347
+ session.run("MATCH (n) DETACH DELETE n")
llm_service.py ADDED
@@ -0,0 +1,491 @@
1
+ """
2
+ LLM Inference Layer
3
+ Handles all LLM calls for extraction, summarization, and chat
4
+ Uses Gemini (via litellm) with structured prompt templates
5
+ """
6
+ from typing import List, Dict, Any, Optional
7
+ from loguru import logger
8
+ from config import settings
9
+ import json
10
+ import httpx
11
+ from tenacity import retry, stop_after_attempt, wait_exponential
12
+ from models import Triple, CanonicalTriple, RelationType
13
+
14
+
15
+ class PromptTemplates:
16
+ """Centralized prompt templates following the manual"""
17
+
18
+ @staticmethod
19
+ def triplet_canonicalization(passage: str, triple: Triple) -> str:
20
+ """Template for canonicalizing extracted triples"""
21
+ return f"""Given the passage and an extracted triple, return a cleaned, canonical version.
22
+
23
+ Passage (from page {triple.page_number}):
24
+ {passage}
25
+
26
+ Extracted Triple:
27
+ - Subject: {triple.subject}
28
+ - Relation: {triple.predicate}
29
+ - Object: {triple.object}
30
+
31
+ CRITICAL INSTRUCTION: You MUST select the "relation" field from this EXACT list of 25 canonical relations.
32
+ Copy the exact string - do NOT create variations, synonyms, or modifications.
33
+
34
+ ALLOWED RELATIONS (choose exactly one):
35
+ 1. is_a - for type/class relationships (e.g., "X is a Y")
36
+ 2. part_of - for component relationships (e.g., "X is part of Y")
37
+ 3. uses - for utilization (use "uses" for: utilizes, employs, applies)
38
+ 4. causes - for causality (e.g., "X causes Y")
39
+ 5. defined_as - for definitions (use "defined_as" for: defines, is defined as)
40
+ 6. related_to - ONLY if no other relation fits
41
+ 7. method_of - for methodological relationships
42
+ 8. depends_on - for dependencies (e.g., "X depends on Y")
43
+ 9. implements - for implementation (e.g., "X implements Y")
44
+ 10. similar_to - for similarity
45
+ 11. observes - for observation (use "observes" for: captures, records, detects, monitors)
46
+ 12. measures - for measurement
47
+ 13. produces - for production/generation (use "produces" for: makes, creates, generates, builds)
48
+ 14. contains - for containment
49
+ 15. affects - for influence (use "affects" for: influences, impacts, modifies, changes)
50
+ 16. enables - for enablement (use "enables" for: facilitates, allows, permits)
51
+ 17. requires - for requirements
52
+ 18. interacts_with - for interactions
53
+ 19. enriches - for enrichment
54
+ 20. enhances - for enhancement (use "enhances" for: improves, optimizes, extends)
55
+ 21. supports - for support (use "supports" for: contributes, helps, aids)
56
+ 22. describes - for description (use "describes" for: proposes, suggests, presents, introduces)
57
+ 23. explains - for explanation (use "explains" for: clarifies, demonstrates, shows, disentangles)
58
+ 24. refers_to - for reference (use "refers_to" for: aims, targets, addresses, focuses on)
59
+ 25. associated_with - for associations
60
+
61
+ EXAMPLES OF WHAT TO DO:
62
+ - If input has "utilizes" → use "uses"
63
+ - If input has "proposes" → use "describes"
64
+ - If input has "contributes to" → use "supports"
65
+ - If input has "aims at" → use "refers_to"
66
+
67
+ DO NOT USE: utilizes, proposes, contributes, aims, makes, captures, defines, or any other variations.
68
+ USE ONLY: The exact 25 strings listed above.
69
+
70
+ Return JSON in this exact format:
71
+ {{
72
+ "subject_label": "cleaned subject name",
73
+ "object_label": "cleaned object name",
74
+ "relation": "one_of_the_25_exact_strings_above",
75
+ "confidence": 0.85,
76
+ "justification": "brief explanation referencing page {triple.page_number}"
77
+ }}
78
+
79
+ Output ONLY the JSON, no other text:
80
+ """
81
+
82
+ @staticmethod
83
+ def node_summarization(node_label: str, chunks: List[Dict[str, Any]]) -> str:
84
+ """Template for node summarization with citations"""
85
+ chunks_text = "\n\n".join([
86
+ f"[Chunk from p.{chunk['page_number']}]\n{chunk['text']}"
87
+ for chunk in chunks
88
+ ])
89
+
90
+ return f"""Summarize the key facts about "{node_label}" using ONLY the following supporting chunks.
91
+
92
+ Requirements:
93
+ - Produce a concise summary (3-6 sentences)
94
+ - After any sentence that directly relies on a chunk, append (p. N) where N is the page number
95
+ - Do not invent information not present in the chunks
96
+ - Focus on the most important facts
97
+
98
+ Supporting Chunks:
99
+ {chunks_text}
100
+
101
+ Summary:
102
+ """
103
+
104
+ @staticmethod
105
+ def rag_chat(user_query: str, context_chunks: List[Dict[str, Any]]) -> str:
106
+ """Template for RAG chat with citations"""
107
+ context_text = "\n\n".join([
108
+ f"[Source {i+1}, p.{chunk['page_number']}]\n{chunk['text']}"
109
+ for i, chunk in enumerate(context_chunks)
110
+ ])
111
+
112
+ return f"""You are an assistant that answers questions using ONLY the provided document context.
113
+
114
+ Context from document:
115
+ {context_text}
116
+
117
+ User Question: {user_query}
118
+
119
+ Instructions:
120
+ - Answer in friendly, concise language
121
+ - Include inline citations (p. N) for statements supported by chunks
122
+ - If you cannot find direct support, say "I cannot confirm this from the document"
123
+ - At the end, add a "Sources:" section listing page numbers and short snippets
124
+
125
+ Answer:
126
+ """
127
+
128
+ @staticmethod
129
+ def system_message() -> str:
130
+ """System message for chat"""
131
+ return """You are a helpful assistant that answers questions strictly based on provided document context.
132
+ You always cite page numbers for factual statements. If information is not in the context, you say so clearly."""
133
+
134
+
135
+ class LLMService:
136
+ """
137
+ Service for LLM inference using Gemini API (via litellm)
138
+ Handles generation, extraction, summarization, and agent synthesis
139
+ """
140
+
141
+ def __init__(self):
142
+ # Use Gemini instead of Mistral
143
+ self.api_key = settings.gemini_api_key
144
+ self.model = f"gemini/{settings.gemini_model}"
145
+ self.temperature = settings.llm_temperature
146
+ self.max_tokens = settings.llm_max_tokens
147
+ self.timeout = settings.llm_timeout
148
+
149
+ # Import litellm for Gemini
150
+ try:
151
+ import litellm
152
+ self.litellm = litellm
153
+ logger.info(f"✓ LLMService initialized with Gemini ({settings.gemini_model})")
154
+ except ImportError:
155
+ logger.error("litellm not installed. Install with: pip install litellm")
156
+ raise
157
+
158
+ if not self.api_key:
159
+ logger.warning("No Gemini API key configured. LLM features will not work.")
160
+
161
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
162
+ async def _call_api(
163
+ self,
164
+ messages: List[Dict[str, str]],
165
+ temperature: Optional[float] = None,
166
+ max_tokens: Optional[int] = None,
167
+ json_mode: bool = False
168
+ ) -> str:
169
+ """
170
+ Call Gemini API via litellm with retry logic
171
+
172
+ Args:
173
+ messages: List of message dicts with 'role' and 'content'
174
+ temperature: Override default temperature
175
+ max_tokens: Override default max tokens
176
+ json_mode: Request JSON output
177
+
178
+ Returns:
179
+ Generated text
180
+ """
181
+ if not self.api_key:
182
+ raise ValueError("Gemini API key not configured")
183
+
184
+ try:
185
+ # Use litellm for Gemini API calls
186
+ import asyncio
187
+
188
+ kwargs = {
189
+ "model": self.model,
190
+ "api_key": self.api_key,
191
+ "messages": messages,
192
+ "temperature": temperature or self.temperature,
193
+ "max_tokens": max_tokens or self.max_tokens,
194
+ }
195
+
196
+ if json_mode:
197
+ kwargs["response_format"] = {"type": "json_object"}
198
+
199
+ # litellm.completion is synchronous, wrap in asyncio.to_thread
200
+ response = await asyncio.to_thread(
201
+ self.litellm.completion,
202
+ **kwargs
203
+ )
204
+
205
+ return response.choices[0].message.content
206
+
207
+ except Exception as e:
208
+ logger.error(f"Gemini API error: {str(e)}")
209
+ raise
210
+
211
+ async def canonicalize_triple(
212
+ self,
213
+ triple: Triple,
214
+ passage: str
215
+ ) -> Optional[CanonicalTriple]:
216
+ """
217
+ Canonicalize a raw triple using LLM
218
+
219
+ Args:
220
+ triple: Raw extracted triple
221
+ passage: Surrounding text passage
222
+
223
+ Returns:
224
+ CanonicalTriple or None if LLM fails
225
+ """
226
+ prompt = PromptTemplates.triplet_canonicalization(passage, triple)
227
+
228
+ messages = [
229
+ {"role": "system", "content": "You are an expert at extracting and canonicalizing knowledge graph triples. Always output valid JSON."},
230
+ {"role": "user", "content": prompt}
231
+ ]
232
+
233
+ try:
234
+ response = await self._call_api(messages, temperature=0.1, json_mode=True)
235
+ data = json.loads(response)
236
+
237
+ # Map string relation to enum
238
+ relation_str = data.get("relation", "related_to").lower().strip()
239
+
240
+ # Auto-correct common variations and map semantically similar verbs
241
+ relation_corrections = {
242
+ # Exact variations
243
+ "defines_as": "defined_as",
244
+ "defines": "defined_as",
245
+ "is_part_of": "part_of",
246
+ "used_by": "uses",
247
+ "caused_by": "causes",
248
+ "methods_of": "method_of",
249
+ "depending_on": "depends_on",
250
+ "implemented_by": "implements",
251
+ "similar": "similar_to",
252
+ "observed_by": "observes",
253
+ "measured_by": "measures",
254
+ "produced_by": "produces",
255
+ "contained_in": "contains",
256
+ "affected_by": "affects",
257
+ "enabled_by": "enables",
258
+ "required_by": "requires",
259
+ "interact_with": "interacts_with",
260
+ "enriched_by": "enriches",
261
+ "enhanced_by": "enhances",
262
+ "supported_by": "supports",
263
+ "described_by": "describes",
264
+ "explained_by": "explains",
265
+ "refer_to": "refers_to",
266
+
267
+ # Semantic mappings for common verbs
268
+ "utilizes": "uses",
269
+ "utilize": "uses",
270
+ "employs": "uses",
271
+ "applies": "uses",
272
+ "makes": "produces",
273
+ "creates": "produces",
274
+ "generates": "produces",
275
+ "builds": "produces",
276
+ "proposes": "describes",
277
+ "suggests": "describes",
278
+ "presents": "describes",
279
+ "introduces": "describes",
280
+ "captures": "observes",
281
+ "records": "observes",
282
+ "detects": "observes",
283
+ "monitors": "observes",
284
+ "aims": "refers_to",
285
+ "targets": "refers_to",
286
+ "focuses_on": "refers_to",
287
+ "addresses": "refers_to",
288
+ "disentangles": "explains",
289
+ "clarifies": "explains",
290
+ "demonstrates": "explains",
291
+ "shows": "explains",
292
+ "contributes": "supports",
293
+ "contributes_to": "supports",
294
+ "helps": "supports",
295
+ "aids": "supports",
296
+ "facilitates": "enables",
297
+ "allows": "enables",
298
+ "permits": "enables",
299
+ "improves": "enhances",
300
+ "betters": "enhances",
301
+ "optimizes": "enhances",
302
+ "extends": "enhances",
303
+ "influences": "affects",
304
+ "impacts": "affects",
305
+ "modifies": "affects",
306
+ "changes": "affects",
307
+ }
308
+
309
+ relation_str = relation_corrections.get(relation_str, relation_str)
310
+
311
+ try:
312
+ relation = RelationType(relation_str)
313
+ except ValueError:
314
+ logger.warning(f"Invalid relation '{relation_str}', defaulting to 'related_to'")
315
+ relation = RelationType.RELATED_TO
316
+
317
+ return CanonicalTriple(
318
+ subject_label=data["subject_label"],
319
+ object_label=data["object_label"],
320
+ relation=relation,
321
+ confidence=data["confidence"],
322
+ justification=data["justification"],
323
+ page_number=triple.page_number or 0
324
+ )
325
+ except Exception as e:
326
+ logger.error(f"Failed to canonicalize triple: {e}")
327
+ return None
328
+
329
+ async def summarize_node(
330
+ self,
331
+ node_label: str,
332
+ supporting_chunks: List[Dict[str, Any]]
333
+ ) -> str:
334
+ """
335
+ Generate summary for a graph node with citations
336
+
337
+ Args:
338
+ node_label: Name of the node
339
+ supporting_chunks: List of chunk metadata dicts
340
+
341
+ Returns:
342
+ Summary text with inline citations
343
+ """
344
+ prompt = PromptTemplates.node_summarization(node_label, supporting_chunks)
345
+
346
+ messages = [
347
+ {"role": "system", "content": PromptTemplates.system_message()},
348
+ {"role": "user", "content": prompt}
349
+ ]
350
+
351
+ try:
352
+ # Use faster settings for node summaries
353
+ summary = await self._call_api(
354
+ messages,
355
+ temperature=0.3,
356
+ max_tokens=3072 # Shorter summaries = faster response
357
+ )
358
+ return summary.strip()
359
+ except Exception as e:
360
+ logger.error(f"Failed to summarize node: {e}")
361
+ return f"Unable to generate summary for {node_label}."
362
+
363
+ async def rag_chat(
364
+ self,
365
+ query: str,
366
+ context_chunks: List[Dict[str, Any]]
367
+ ) -> str:
368
+ """
369
+ Answer user query using RAG with citations
370
+
371
+ Args:
372
+ query: User question
373
+ context_chunks: Retrieved context chunks
374
+
375
+ Returns:
376
+ Answer with citations and sources
377
+ """
378
+ prompt = PromptTemplates.rag_chat(query, context_chunks)
379
+
380
+ messages = [
381
+ {"role": "system", "content": PromptTemplates.system_message()},
382
+ {"role": "user", "content": prompt}
383
+ ]
384
+
385
+ try:
386
+ answer = await self._call_api(messages, temperature=0.3)
387
+ return answer.strip()
388
+ except Exception as e:
389
+ logger.error(f"Failed to generate RAG response: {e}")
390
+ return "I encountered an error while processing your question. Please try again."
391
+
392
+ async def agent_synthesize(
393
+ self,
394
+ query: str,
395
+ context: str
396
+ ) -> str:
397
+ """
398
+ Synthesize answer for agent-based RAG from tool results
399
+
400
+ Args:
401
+ query: User question
402
+ context: Combined context from tool executions
403
+
404
+ Returns:
405
+ Synthesized answer with citations
406
+ """
407
+ prompt = f"""You are an assistant that answers questions using the provided context from multiple tools.
408
+
409
+ Context from tools:
410
+ {context}
411
+
412
+ User Question: {query}
413
+
414
+ Instructions:
415
+ - Answer in friendly, concise language
416
+ - Include inline citations (p. N) for statements supported by sources
417
+ - If you cannot find direct support, say "I cannot confirm this from the available information"
418
+ - Synthesize information from different tools (vector search, graph search, etc.) cohesively
419
+
420
+ Answer:
421
+ """
422
+
423
+ messages = [
424
+ {"role": "system", "content": PromptTemplates.system_message()},
425
+ {"role": "user", "content": prompt}
426
+ ]
427
+
428
+ try:
429
+ answer = await self._call_api(messages, temperature=0.3)
430
+ return answer.strip()
431
+ except Exception as e:
432
+ logger.error(f"Failed to synthesize agent response: {e}")
433
+ return "I encountered an error while processing your question. Please try again."
434
+
435
+ async def extract_triples_llm(
436
+ self,
437
+ text: str,
438
+ page_number: int,
439
+ chunk_id: str
440
+ ) -> List[Triple]:
441
+ """
442
+ Use LLM to extract triples directly (alternative to OpenIE)
443
+
444
+ Args:
445
+ text: Text to extract from
446
+ page_number: Page number
447
+ chunk_id: Chunk identifier
448
+
449
+ Returns:
450
+ List of extracted triples
451
+ """
452
+ prompt = f"""Extract key relationships from this text as subject-predicate-object triples.
453
+ Focus on important concepts, methods, definitions, and relationships.
454
+
455
+ Text (from page {page_number}):
456
+ {text}
457
+
458
+ Return a JSON array of triples, each with:
459
+ - subject: The subject entity
460
+ - predicate: The relationship/action
461
+ - object: The object entity
462
+ - confidence: Your confidence (0-1)
463
+
464
+ Output ONLY valid JSON array:
465
+ """
466
+
467
+ messages = [
468
+ {"role": "system", "content": "You are an expert at knowledge extraction. Always output valid JSON."},
469
+ {"role": "user", "content": prompt}
470
+ ]
471
+
472
+ try:
473
+ response = await self._call_api(messages, temperature=0.2, json_mode=True)
474
+ data = json.loads(response)
475
+
476
+ triples = []
477
+ for item in data if isinstance(data, list) else data.get("triples", []):
478
+ triple = Triple(
479
+ subject=item["subject"],
480
+ predicate=item["predicate"],
481
+ object=item["object"],
482
+ confidence=item.get("confidence", 0.7),
483
+ source_chunk_id=chunk_id,
484
+ page_number=page_number
485
+ )
486
+ triples.append(triple)
487
+
488
+ return triples
489
+ except Exception as e:
490
+ logger.error(f"Failed to extract triples: {e}")
491
+ return []
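
A hedged sketch of calling the canonicalization step above, assuming a valid Gemini API key in settings and litellm installed; the Triple fields follow models.py later in this commit.

# Sketch only: normalize a raw predicate ("utilizes") into a canonical relation ("uses")
import asyncio
from llm_service import LLMService
from models import Triple

async def demo():
    llm = LLMService()
    raw = Triple(subject="GraphLLM", predicate="utilizes", object="FAISS",
                 page_number=3, source_chunk_id="chunk-1")
    canonical = await llm.canonicalize_triple(
        raw, passage="GraphLLM utilizes FAISS for vector search.")
    if canonical:
        print(canonical.relation.value, canonical.confidence)  # expected: "uses" plus a confidence score

asyncio.run(demo())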
main.py ADDED
@@ -0,0 +1,550 @@
1
+ """
2
+ FastAPI Backend - Main Application
3
+ Provides REST API for PDF upload, graph retrieval, chat, and node details
4
+ """
5
+ # Suppress PyTorch JIT warnings (harmless, just noisy during import)
6
+ import warnings
7
+ warnings.filterwarnings("ignore", category=UserWarning, module="torch")
8
+ warnings.filterwarnings("ignore", message="Unable to retrieve source")
9
+
10
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.staticfiles import StaticFiles
13
+ from fastapi.responses import FileResponse, JSONResponse
14
+ from loguru import logger
15
+ import sys
16
+ from pathlib import Path
17
+ import os
18
+ import uuid
19
+ import pickle
20
+ from datetime import datetime
21
+ from typing import List, Dict, Any, Optional
22
+
23
+ from config import settings, ensure_directories
24
+ from models import (
25
+ UploadResponse, GraphResponse, ChatRequest, ChatResponse,
26
+ NodeDetailResponse, AdminStatus, SourceCitation, GraphNode, GraphEdge
27
+ )
28
+ from pdf_processor import PDFProcessor
29
+ from embedding_service import EmbeddingService
30
+ from llm_service import LLMService
31
+ from gemini_extractor import GeminiExtractor
32
+ from graph_store import GraphStore
33
+ from graph_builder import GraphBuilder
34
+ from rag_agent import RAGAgent
35
+
36
+
37
+ # Configure logging
38
+ logger.remove()
39
+ logger.add(
40
+ sys.stderr,
41
+ level=settings.log_level,
42
+ format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan> | <level>{message}</level>"
43
+ )
44
+ logger.add(
45
+ f"{settings.logs_dir}/app.log",
46
+ rotation="500 MB",
47
+ retention="10 days",
48
+ level=settings.log_level
49
+ )
50
+
51
+ # Initialize services
52
+ ensure_directories()
53
+
54
+ app = FastAPI(
55
+ title=settings.app_name,
56
+ version=settings.app_version,
57
+ description="PDF Knowledge Graph and RAG System"
58
+ )
59
+
60
+ # CORS middleware
61
+ app.add_middleware(
62
+ CORSMiddleware,
63
+ allow_origins=["*"], # Configure appropriately for production
64
+ allow_credentials=True,
65
+ allow_methods=["*"],
66
+ allow_headers=["*"],
67
+ )
68
+
69
+ # Global service instances
70
+ logger.info("Initializing PDFProcessor...")
71
+ pdf_processor = PDFProcessor()
72
+
73
+ logger.info("Initializing EmbeddingService...")
74
+ embedding_service = EmbeddingService()
75
+
76
+ logger.info("Initializing LLMService...")
77
+ llm_service = LLMService()
78
+
79
+ logger.info("Initializing GeminiExtractor (direct Gemini API)...")
80
+ triplet_extractor = GeminiExtractor(llm_service)
81
+
82
+ logger.info("Initializing GraphStore...")
83
+ graph_store = GraphStore(use_neo4j=False, embedding_service=embedding_service)
84
+
85
+ logger.info("Initializing GraphBuilder...")
86
+ graph_builder = GraphBuilder(graph_store, embedding_service)
87
+
88
+ logger.info("Initializing RAGAgent (LangGraph-based)...")
89
+ rag_agent = RAGAgent(graph_store, embedding_service, llm_service)
90
+
91
+ logger.info("✓ All services initialized successfully")
92
+
93
+ # In-memory storage for PDF metadata (use database in production)
94
+ pdf_metadata_store: Dict[str, Dict[str, Any]] = {}
95
+
96
+
97
+ @app.on_event("startup")
98
+ async def startup_event():
99
+ """Run on application startup"""
100
+ logger.info(f"Starting {settings.app_name} v{settings.app_version}")
101
+ logger.info(f"Environment: {settings.environment}")
102
+
103
+ # Try to load existing graph
104
+ graph_path = os.path.join(settings.data_dir, "knowledge_graph.pkl")
105
+ if os.path.exists(graph_path):
106
+ try:
107
+ graph_store.load(graph_path)
108
+ logger.info("Loaded existing knowledge graph")
109
+ except Exception as e:
110
+ logger.warning(f"Failed to load existing graph: {e}")
111
+
112
+
113
+ @app.on_event("shutdown")
114
+ async def shutdown_event():
115
+ """Run on application shutdown"""
116
+ logger.info("Shutting down application")
117
+
118
+ # Save graph
119
+ graph_path = os.path.join(settings.data_dir, "knowledge_graph.pkl")
120
+ try:
121
+ graph_store.save(graph_path)
122
+ logger.info("Saved knowledge graph")
123
+ except Exception as e:
124
+ logger.error(f"Failed to save graph: {e}")
125
+
126
+ # Save FAISS index
127
+ try:
128
+ embedding_service.save()
129
+ logger.info("Saved FAISS index")
130
+ except Exception as e:
131
+ logger.error(f"Failed to save FAISS index: {e}")
132
+
133
+
134
+ @app.get("/")
135
+ async def root():
136
+ """Serve the frontend HTML"""
137
+ return FileResponse("frontend/index.html")
138
+
139
+
140
+ @app.post("/upload", response_model=UploadResponse)
141
+ async def upload_pdf(
142
+ file: UploadFile = File(...),
143
+ background_tasks: BackgroundTasks = BackgroundTasks()
144
+ ):
145
+ """
146
+ Upload a PDF and trigger ingestion pipeline
147
+
148
+ Returns immediately with pdf_id, processes in background
149
+ """
150
+ # Validate file
151
+ if not file.filename.endswith('.pdf'):
152
+ raise HTTPException(status_code=400, detail="Only PDF files are allowed")
153
+
154
+ file_size = 0
155
+ content = await file.read()
156
+ file_size = len(content)
157
+
158
+ if file_size > settings.max_file_size_bytes:
159
+ raise HTTPException(
160
+ status_code=400,
161
+ detail=f"File size exceeds maximum of {settings.max_file_size_mb}MB"
162
+ )
163
+
164
+ # Generate PDF ID
165
+ pdf_id = str(uuid.uuid4())
166
+
167
+ # Save file
168
+ filepath = os.path.join(settings.upload_dir, f"{pdf_id}.pdf")
169
+ with open(filepath, 'wb') as f:
170
+ f.write(content)
171
+
172
+ logger.info(f"Uploaded PDF: {file.filename} (ID: {pdf_id})")
173
+
174
+ # Store metadata with detailed progress tracking
175
+ pdf_metadata_store[pdf_id] = {
176
+ "filename": file.filename,
177
+ "filepath": filepath,
178
+ "status": "processing",
179
+ "progress": {
180
+ "stage": "starting",
181
+ "message": "Upload complete, starting processing...",
182
+ "percent": 0
183
+ }
184
+ }
185
+
186
+ # Trigger background processing
187
+ background_tasks.add_task(process_pdf_pipeline, pdf_id, filepath)
188
+
189
+ return UploadResponse(
190
+ pdf_id=pdf_id,
191
+ filename=file.filename,
192
+ status="processing",
193
+ message="PDF uploaded successfully. Processing started in background."
194
+ )
195
+
196
+
197
+ async def process_pdf_pipeline(pdf_id: str, filepath: str):
198
+ """
199
+ ⚡ OPTIMIZED: Full ingestion pipeline with progress tracking
200
+
201
+ Steps:
202
+ 0. Clear existing graph and index (FRESH START)
203
+ 1. Extract chunks from PDF
204
+ 2. Create embeddings
205
+ 3. Add to vector index
206
+ 4. Extract triples (PARALLEL)
207
+ 5. Build knowledge graph (NO PRUNING)
208
+ """
209
+ def update_progress(stage: str, message: str, percent: int):
210
+ """Update progress in metadata store"""
211
+ if pdf_id in pdf_metadata_store:
212
+ pdf_metadata_store[pdf_id]["progress"] = {
213
+ "stage": stage,
214
+ "message": message,
215
+ "percent": percent
216
+ }
217
+
218
+ try:
219
+ logger.info(f"Starting ingestion pipeline for PDF {pdf_id}")
220
+
221
+ # Step 0: CLEAR EVERYTHING for fresh extraction
222
+ update_progress("clearing", "Clearing previous data...", 5)
223
+ logger.info("Step 0: Clearing existing graph and embeddings for fresh extraction")
224
+ graph_store.clear()
225
+ embedding_service.clear()
226
+ logger.info("✓ Cleared all existing data")
227
+
228
+ # Step 1: Extract chunks (with caching)
229
+ cache_path = os.path.join(settings.data_dir, f"chunks_{pdf_id}.pkl")
230
+
231
+ if os.path.exists(cache_path):
232
+ # Load cached chunks (saves 2-3s on reindex)
233
+ update_progress("extraction", "Loading cached text extraction...", 15)
234
+ logger.info("⚡ Step 1: Loading cached chunks from previous extraction")
235
+ with open(cache_path, 'rb') as f:
236
+ cache_data = pickle.load(f)
237
+ refined_chunks = cache_data['chunks']
238
+ metadata = cache_data['metadata']
239
+ logger.info(f"✓ Loaded {len(refined_chunks)} cached chunks (skipped PDF processing)")
240
+ update_progress("extraction", f"Loaded {len(refined_chunks)} cached chunks", 25)
241
+ else:
242
+ # Extract and cache chunks for future reindexing
243
+ update_progress("extraction", "Extracting text from PDF...", 15)
244
+ logger.info("Step 1: Extracting chunks from PDF")
245
+ chunks, metadata = pdf_processor.process_pdf(filepath, pdf_id)
246
+ refined_chunks = pdf_processor.chunk_text(chunks)
247
+
248
+ # Cache for future use
249
+ with open(cache_path, 'wb') as f:
250
+ pickle.dump({'chunks': refined_chunks, 'metadata': metadata}, f)
251
+ logger.info(f"✓ Cached {len(refined_chunks)} chunks for future reindexing")
252
+ update_progress("extraction", f"Extracted {len(refined_chunks)} chunks", 25)
253
+
254
+ # Step 2: Create embeddings
255
+ update_progress("embeddings", f"Creating embeddings for {len(refined_chunks)} chunks...", 35)
256
+ logger.info(f"Step 2: Creating embeddings for {len(refined_chunks)} chunks")
257
+ embeddings = embedding_service.create_embeddings(refined_chunks)
258
+ update_progress("embeddings", "Embeddings created", 50)
259
+
260
+ # Step 3: Add to vector index
261
+ update_progress("indexing", "Building vector index...", 55)
262
+ logger.info("Step 3: Adding to vector index")
263
+ embedding_service.add_to_index(refined_chunks, embeddings)
264
+ embedding_service.save()
265
+ update_progress("indexing", "Vector index complete", 60)
266
+
267
+ # Step 4: Extract triples using Gemini (direct API - PARALLEL)
268
+ update_progress("extraction", "Extracting concepts with AI (parallel)...", 65)
269
+ logger.info("Step 4: Extracting triples using Gemini (PARALLEL per-page, 2 concepts max)")
270
+ canonical_triples = await triplet_extractor.extract_from_chunks(
271
+ refined_chunks,
272
+ use_llm=True # Direct Gemini API calls
273
+ )
274
+ update_progress("extraction", f"Extracted {len(canonical_triples)} relationships", 80)
275
+
276
+ # Step 5: Build graph
277
+ update_progress("graph", "Building knowledge graph...", 85)
278
+ logger.info("Step 5: Building knowledge graph")
279
+ num_nodes, num_edges = await graph_builder.build_graph(canonical_triples)
280
+ update_progress("graph", f"Graph complete: {num_nodes} nodes, {num_edges} edges", 95)
281
+
282
+ # Save graph
283
+ update_progress("saving", "Saving graph to disk...", 98)
284
+ graph_path = os.path.join(settings.data_dir, "knowledge_graph.pkl")
285
+ graph_store.save(graph_path)
286
+
287
+ # Update metadata
288
+ update_progress("completed", f"✓ Complete! {num_nodes} nodes, {num_edges} edges", 100)
289
+ pdf_metadata_store[pdf_id]["status"] = "completed"
290
+ pdf_metadata_store[pdf_id]["num_chunks"] = len(refined_chunks)
291
+ pdf_metadata_store[pdf_id]["num_nodes"] = num_nodes
292
+ pdf_metadata_store[pdf_id]["num_edges"] = num_edges
293
+
294
+ logger.info(f"✓ Completed ingestion for PDF {pdf_id}: {num_nodes} nodes, {num_edges} edges")
295
+
296
+ except Exception as e:
297
+ logger.error(f"❌ Failed to process PDF {pdf_id}: {e}", exc_info=True)
298
+ pdf_metadata_store[pdf_id]["status"] = "failed"
299
+ pdf_metadata_store[pdf_id]["error"] = str(e)
300
+ update_progress("error", f"Error: {str(e)[:100]}", 0)
301
+
302
+
303
+ @app.get("/graph", response_model=GraphResponse)
304
+ async def get_graph(pdf_id: Optional[str] = None):
305
+ """
306
+ Get the knowledge graph
307
+
308
+ Args:
309
+ pdf_id: Optional filter by PDF ID
310
+
311
+ Returns:
312
+ Graph nodes and edges
313
+ """
314
+ nodes = graph_store.get_all_nodes()
315
+ edges = graph_store.get_all_edges()
316
+
317
+ logger.info(f"Returning {len(nodes)} nodes, {len(edges)} edges")
318
+
319
+ # Filter by PDF if specified
320
+ if pdf_id:
321
+ # Filter nodes and edges that belong to this PDF
322
+ # This requires tracking PDF ID in supporting chunks
323
+ pass
324
+
325
+ return GraphResponse(
326
+ nodes=nodes,
327
+ edges=edges,
328
+ metadata={
329
+ "total_nodes": len(nodes),
330
+ "total_edges": len(edges)
331
+ }
332
+ )
333
+
334
+
335
+ @app.get("/node/{node_id}", response_model=NodeDetailResponse)
336
+ async def get_node_details(node_id: str):
337
+ """
338
+ Get detailed information about a node
339
+
340
+ Includes:
341
+ - Node metadata
342
+ - LLM-generated summary with citations
343
+ - Supporting chunks
344
+ - Related nodes
345
+ """
346
+ node = graph_store.get_node(node_id)
347
+ if not node:
348
+ raise HTTPException(status_code=404, detail="Node not found")
349
+
350
+ # Check if summary is cached in node metadata
351
+ if "cached_summary" in node.metadata:
352
+ logger.info(f"✓ Using cached summary for node {node.label}")
353
+ summary = node.metadata["cached_summary"]
354
+ search_results = None # Use node's supporting chunks for sources
355
+ else:
356
+ # Generate summary (first time)
357
+ logger.info(f"⏳ Generating summary for node {node.label}...")
358
+
359
+ # Get supporting chunks using semantic search on the node label
360
+ # This finds chunks that are semantically similar to the concept
361
+ search_results = embedding_service.search(
362
+ query=node.label,
363
+ top_k=3 # Reduced from 5 to 3 for faster processing
364
+ )
365
+
366
+ # Prepare chunks for LLM
367
+ chunks_for_llm = []
368
+ if search_results:
369
+ chunks_for_llm = [
370
+ {
371
+ "page_number": meta.get("page_number", 0),
372
+ "text": meta.get("text", "")
373
+ }
374
+ for meta, score in search_results
375
+ ]
376
+
377
+ # Fallback: if no chunks found, create a basic summary
378
+ if not chunks_for_llm:
379
+ logger.warning(f"No chunks found for node {node.label}, using basic summary")
380
+ chunks_for_llm = [
381
+ {
382
+ "page_number": chunk.page_number or 0,
383
+ "text": chunk.snippet or ""
384
+ }
385
+ for chunk in node.supporting_chunks[:3]
386
+ ]
387
+
388
+ # Generate summary
389
+ summary = await llm_service.summarize_node(node.label, chunks_for_llm)
390
+
391
+ # Cache summary in node metadata (don't cache search_results - they're not serializable)
392
+ node.metadata["cached_summary"] = summary
393
+ node.metadata["cache_timestamp"] = str(datetime.utcnow())
394
+
395
+ # Update the node in the graph store
396
+ graph_store.update_node(node)
397
+ logger.info(f"✓ Cached summary for node {node.label}")
398
+
399
+ # Get related nodes
400
+ neighbors = graph_store.get_neighbors(node_id)
401
+ related_nodes = [
402
+ {
403
+ "node_id": neighbor.node_id,
404
+ "label": neighbor.label,
405
+ "relation": edge.relation.value,
406
+ "confidence": edge.confidence
407
+ }
408
+ for neighbor, edge in neighbors[:10] # Limit to top 10
409
+ ]
410
+
411
+ # Build source citations
412
+ sources = []
413
+ if search_results is not None:
414
+ # Use search results (freshly generated summary)
415
+ for meta, score in search_results[:5]:
416
+ text = meta.get("text", "")
417
+ snippet = text[:120] + "..." if len(text) > 120 else text
418
+ sources.append(SourceCitation(
419
+ page_number=meta.get("page_number", 0),
420
+ snippet=snippet,
421
+ chunk_id=meta.get("chunk_id", ""),
422
+ score=score
423
+ ))
424
+ else:
425
+ # Use node's supporting chunks (cached summary)
426
+ sources = [
427
+ SourceCitation(
428
+ page_number=chunk.page_number or 0,
429
+ snippet=chunk.snippet or "",
430
+ chunk_id=chunk.chunk_id,
431
+ score=chunk.score
432
+ )
433
+ for chunk in node.supporting_chunks[:5]
434
+ ]
435
+
436
+ return NodeDetailResponse(
437
+ node_id=node.node_id,
438
+ label=node.label,
439
+ type=node.type,
440
+ summary=summary,
441
+ sources=sources,
442
+ related_nodes=related_nodes
443
+ )
444
+
445
+
446
+ @app.post("/chat", response_model=ChatResponse)
447
+ async def chat(request: ChatRequest):
448
+ """
449
+ Agent-based RAG chat endpoint
450
+
451
+ Uses LangGraph agent with multiple tools:
452
+ - vector_search: Semantic search through chunks
453
+ - graph_search: Find concepts in knowledge graph
454
+ - get_node_details: Get detailed node information
455
+ - get_related_nodes: Graph traversal for relationships
456
+ - get_chunk_by_id: Retrieve specific chunks
457
+
458
+ The agent intelligently decides which tools to use based on the query
459
+ """
460
+ logger.info(f"🤖 Agent chat request: '{request.query}'")
461
+
462
+ # Use agent-based RAG
463
+ response = await rag_agent.chat(
464
+ query=request.query,
465
+ pdf_id=request.pdf_id,
466
+ include_citations=True
467
+ )
468
+
469
+ # Limit sources to requested max
470
+ if len(response.sources) > request.max_sources:
471
+ response.sources = response.sources[:request.max_sources]
472
+
473
+ return response
474
+
475
+
476
+ @app.get("/status/{pdf_id}")
477
+ async def get_pdf_status(pdf_id: str):
478
+ """Get processing status for a specific PDF"""
479
+ if pdf_id not in pdf_metadata_store:
480
+ raise HTTPException(status_code=404, detail="PDF not found")
481
+
482
+ metadata = pdf_metadata_store[pdf_id]
483
+ return {
484
+ "pdf_id": pdf_id,
485
+ "filename": metadata.get("filename"),
486
+ "status": metadata.get("status"),
487
+ "progress": metadata.get("progress", {}),
488
+ "num_nodes": metadata.get("num_nodes", 0),
489
+ "num_edges": metadata.get("num_edges", 0),
490
+ "error": metadata.get("error")
491
+ }
492
+
493
+
494
+ @app.get("/admin/status", response_model=AdminStatus)
495
+ async def admin_status():
496
+ """Get system status and statistics"""
497
+ faiss_stats = embedding_service.get_stats()
498
+
499
+ return AdminStatus(
500
+ total_pdfs=len(pdf_metadata_store),
501
+ total_chunks=faiss_stats["num_chunks"],
502
+ total_nodes=len(graph_store.get_all_nodes()),
503
+ total_edges=len(graph_store.get_all_edges()),
504
+ vector_index_size=faiss_stats["total_vectors"],
505
+ recent_logs=[] # Would fetch from logs in production
506
+ )
507
+
508
+
509
+ @app.post("/admin/reindex")
510
+ async def admin_reindex(pdf_id: str):
511
+ """Re-run ingestion for a PDF"""
512
+ if pdf_id not in pdf_metadata_store:
513
+ raise HTTPException(status_code=404, detail="PDF not found")
514
+
515
+ filepath = pdf_metadata_store[pdf_id]["filepath"]
516
+
517
+ # Clear existing data for this PDF (would need better tracking)
518
+ # For now, just re-run the pipeline
519
+
520
+ await process_pdf_pipeline(pdf_id, filepath)
521
+
522
+ return {"message": "Reindexing started", "pdf_id": pdf_id}
523
+
524
+
525
+ @app.post("/admin/clear")
526
+ async def admin_clear():
527
+ """Clear all data"""
528
+ graph_store.clear()
529
+ embedding_service.clear()
530
+ pdf_metadata_store.clear()
531
+
532
+ logger.warning("All data cleared by admin")
533
+
534
+ return {"message": "All data cleared"}
535
+
536
+
537
+ # Mount static files for frontend
538
+ if os.path.exists("frontend"):
539
+ app.mount("/static", StaticFiles(directory="frontend"), name="static")
540
+
541
+
542
+ if __name__ == "__main__":
543
+ import uvicorn
544
+ uvicorn.run(
545
+ "main:app",
546
+ host=settings.api_host,
547
+ port=settings.api_port,
548
+ reload=settings.debug,
549
+ log_level=settings.log_level.lower()
550
+ )
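
A small client-side sketch of the upload, status, and chat flow exposed above. The base URL is an assumption (adjust to settings.api_host / settings.api_port); field names follow the request/response models in models.py.

# Client sketch only: upload a PDF, poll the background pipeline, then ask a question
import time
import httpx

BASE = "http://localhost:8000"  # assumption: server running locally on port 8000

with httpx.Client(timeout=60) as client:
    with open("paper.pdf", "rb") as f:
        up = client.post(f"{BASE}/upload",
                         files={"file": ("paper.pdf", f, "application/pdf")}).json()
    pdf_id = up["pdf_id"]

    while True:  # poll /status until the ingestion pipeline finishes
        status = client.get(f"{BASE}/status/{pdf_id}").json()
        if status["status"] in ("completed", "failed"):
            break
        time.sleep(2)

    reply = client.post(f"{BASE}/chat",
                        json={"query": "What does the document describe?", "pdf_id": pdf_id}).json()
    print(reply["answer"])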
modal_app.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ GraphLLM - Modal Deployment
3
+ Serverless ML deployment with auto-scaling
4
+ """
5
+ import modal
6
+
7
+ # Create Modal app
8
+ app = modal.App("graphllm")
9
+
10
+ # Define the container image with all dependencies
11
+ image = (
12
+ modal.Image.debian_slim(python_version="3.12")
13
+ .apt_install("tesseract-ocr", "ghostscript", "gcc", "g++")
14
+ .pip_install_from_requirements("requirements.txt")
15
+ )
16
+
17
+ # Create persistent volume for data storage
18
+ volume = modal.Volume.from_name("graphllm-data", create_if_missing=True)
19
+
20
+ # Mount FastAPI app
21
+ @app.function(
22
+ image=image,
23
+ gpu=None, # Use CPU (cheaper)
24
+ memory=4096, # 4GB RAM
25
+ timeout=600, # 10 min timeout
26
+ volumes={"/app/data": volume},
27
+ secrets=[modal.Secret.from_name("graphllm-secrets")], # GEMINI_API_KEY
28
+ )
29
+ @modal.asgi_app()
30
+ def fastapi_app():
31
+ """
32
+ Mount the FastAPI application
33
+ """
34
+ import sys
35
+ sys.path.insert(0, "/root")
36
+
37
+ # Import main FastAPI app
38
+ from main import app as fastapi_app
39
+
40
+ return fastapi_app
41
+
42
+
43
+ # Local testing endpoint
44
+ @app.local_entrypoint()
45
+ def main():
46
+ """
47
+ Test the deployment locally
48
+ """
49
+ print("GraphLLM deployed to Modal!")
50
+ print("Access your app at: https://YOUR_USERNAME--graphllm-fastapi-app.modal.run")
models.py ADDED
@@ -0,0 +1,236 @@
1
+ """
2
+ Data models for GraphLLM system following the manual specifications
3
+ """
4
+ from pydantic import BaseModel, Field
5
+ from typing import Optional, List, Dict, Any, Literal
6
+ from datetime import datetime
7
+ from enum import Enum
8
+ import uuid
9
+
10
+
11
+ # Enums
12
+ class ChunkType(str, Enum):
13
+ """Types of chunks extracted from PDF"""
14
+ PARAGRAPH = "paragraph"
15
+ CODE = "code"
16
+ TABLE = "table"
17
+ IMAGE = "image"
18
+ IMAGE_TEXT = "image_text"
19
+
20
+
21
+ class NodeType(str, Enum):
22
+ """Types of graph nodes"""
23
+ CONCEPT = "concept"
24
+ PERSON = "person"
25
+ METHOD = "method"
26
+ TERM = "term"
27
+ CLASS = "class"
28
+ FUNCTION = "function"
29
+ ENTITY = "entity"
30
+
31
+
32
+ class RelationType(str, Enum):
33
+ """Canonical relation types for edges"""
34
+ IS_A = "is_a"
35
+ PART_OF = "part_of"
36
+ METHOD_OF = "method_of"
37
+ CAUSES = "causes"
38
+ USES = "uses"
39
+ RELATED_TO = "related_to"
40
+ DEFINED_AS = "defined_as"
41
+ DEPENDS_ON = "depends_on"
42
+ IMPLEMENTS = "implements"
43
+ SIMILAR_TO = "similar_to"
44
+ OBSERVES = "observes"
45
+ MEASURES = "measures"
46
+ PRODUCES = "produces"
47
+ CONTAINS = "contains"
48
+ AFFECTS = "affects"
49
+ ENABLES = "enables"
50
+ REQUIRES = "requires"
51
+ INTERACTS_WITH = "interacts_with"
52
+ ENRICHES = "enriches"
53
+ ENHANCES = "enhances"
54
+ SUPPORTS = "supports"
55
+ DESCRIBES = "describes"
56
+ EXPLAINS = "explains"
57
+ REFERS_TO = "refers_to"
58
+ ASSOCIATED_WITH = "associated_with"
59
+
60
+
61
+ # Core Data Models
62
+
63
+ class Chunk(BaseModel):
64
+ """Individual chunk of text/content from PDF"""
65
+ chunk_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
66
+ pdf_id: str
67
+ page_number: int
68
+ char_range: tuple[int, int]
69
+ type: ChunkType
70
+ text: str
71
+ table_json: Optional[Dict[str, Any]] = None
72
+ image_id: Optional[str] = None
73
+ metadata: Dict[str, Any] = Field(default_factory=dict)
74
+ created_at: datetime = Field(default_factory=datetime.utcnow)
75
+
76
+
77
+ class EmbeddingEntry(BaseModel):
78
+ """Vector embedding for a chunk"""
79
+ chunk_id: str
80
+ embedding: List[float]
81
+ created_at: datetime = Field(default_factory=datetime.utcnow)
82
+ metadata: Dict[str, Any] = Field(default_factory=dict)
83
+
84
+
85
+ class SupportingChunk(BaseModel):
86
+ """Reference to a chunk supporting a node or edge"""
87
+ chunk_id: str
88
+ score: float
89
+ page_number: Optional[int] = None
90
+ snippet: Optional[str] = None
91
+
92
+
93
+ class GraphNode(BaseModel):
94
+ """Node in the knowledge graph"""
95
+ node_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
96
+ label: str
97
+ type: NodeType
98
+ aliases: List[str] = Field(default_factory=list)
99
+ supporting_chunks: List[SupportingChunk] = Field(default_factory=list)
100
+ importance_score: float = 0.0
101
+ metadata: Dict[str, Any] = Field(default_factory=dict)
102
+ created_at: datetime = Field(default_factory=datetime.utcnow)
103
+
104
+
105
+ class GraphEdge(BaseModel):
106
+ """Edge in the knowledge graph"""
107
+ edge_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
108
+ from_node: str = Field(alias="from")
109
+ to_node: str = Field(alias="to")
110
+ relation: RelationType
111
+ confidence: float
112
+ supporting_chunks: List[SupportingChunk] = Field(default_factory=list)
113
+ metadata: Dict[str, Any] = Field(default_factory=dict)
114
+ created_at: datetime = Field(default_factory=datetime.utcnow)
115
+
116
+ class Config:
117
+ populate_by_name = True
118
+ # FastAPI automatically serializes enums as their string values in JSON
119
+
120
+
121
+ class Triple(BaseModel):
122
+ """Extracted triple from text"""
123
+ subject: str
124
+ predicate: str
125
+ object: str
126
+ confidence: float = 1.0
127
+ source_chunk_id: Optional[str] = None
128
+ page_number: Optional[int] = None
129
+ justification: Optional[str] = None
130
+
131
+
132
+ class CanonicalTriple(BaseModel):
133
+ """LLM-canonicalized triple"""
134
+ subject_label: str
135
+ object_label: str
136
+ relation: RelationType
137
+ confidence: float
138
+ justification: str
139
+ page_number: int
140
+
141
+
142
+ # API Request/Response Models
143
+
144
+ class UploadResponse(BaseModel):
145
+ """Response from PDF upload"""
146
+ pdf_id: str
147
+ filename: str
148
+ status: str
149
+ message: str
150
+ num_pages: Optional[int] = None
151
+ num_chunks: Optional[int] = None
152
+
153
+
154
+ class GraphResponse(BaseModel):
155
+ """Response containing graph data"""
156
+ nodes: List[GraphNode]
157
+ edges: List[GraphEdge]
158
+ metadata: Dict[str, Any] = Field(default_factory=dict)
159
+
160
+
161
+ class SourceCitation(BaseModel):
162
+ """Source citation with page number and snippet"""
163
+ page_number: int
164
+ snippet: str
165
+ chunk_id: str
166
+ score: Optional[float] = None
167
+
168
+
169
+ class NodeDetailResponse(BaseModel):
170
+ """Response for node detail request"""
171
+ node_id: str
172
+ label: str
173
+ type: NodeType
174
+ summary: str
175
+ sources: List[SourceCitation]
176
+ related_nodes: List[Dict[str, Any]] = Field(default_factory=list)
177
+ raw_chunks: Optional[List[Chunk]] = None
178
+
179
+
180
+ class ChatMessage(BaseModel):
181
+ """Chat message"""
182
+ role: Literal["user", "assistant", "system"]
183
+ content: str
184
+ sources: Optional[List[SourceCitation]] = None
185
+ timestamp: datetime = Field(default_factory=datetime.utcnow)
186
+
187
+
188
+ class ChatRequest(BaseModel):
189
+ """Chat request"""
190
+ query: str
191
+ pdf_id: str
192
+ include_citations: bool = True
193
+ max_sources: int = 5
194
+
195
+
196
+ class ChatResponse(BaseModel):
197
+ """Chat response with answer and citations"""
198
+ answer: str
199
+ sources: List[SourceCitation]
200
+ context_chunks: Optional[List[str]] = None
201
+
202
+
203
+ class PDFMetadata(BaseModel):
204
+ """Metadata for uploaded PDF"""
205
+ pdf_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
206
+ filename: str
207
+ filepath: str
208
+ num_pages: int
209
+ file_size_bytes: int
210
+ upload_timestamp: datetime = Field(default_factory=datetime.utcnow)
211
+ processing_status: str = "pending"
212
+ num_chunks: int = 0
213
+ num_nodes: int = 0
214
+ num_edges: int = 0
215
+ metadata: Dict[str, Any] = Field(default_factory=dict)
216
+
217
+
218
+ class IngestionLog(BaseModel):
219
+ """Log entry for ingestion process"""
220
+ log_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
221
+ pdf_id: str
222
+ timestamp: datetime = Field(default_factory=datetime.utcnow)
223
+ stage: str
224
+ status: str
225
+ message: str
226
+ details: Optional[Dict[str, Any]] = None
227
+
228
+
229
+ class AdminStatus(BaseModel):
230
+ """Admin status response"""
231
+ total_pdfs: int
232
+ total_chunks: int
233
+ total_nodes: int
234
+ total_edges: int
235
+ vector_index_size: int
236
+ recent_logs: List[IngestionLog]
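
GraphEdge aliases from_node/to_node to "from"/"to" and enables populate_by_name, so both spellings build the same edge (assuming a Pydantic version that honors the Config shown above); a quick sketch:

# Both construction styles are equivalent because populate_by_name = True
from models import GraphEdge, RelationType

e1 = GraphEdge(**{"from": "n1", "to": "n2", "relation": RelationType.USES, "confidence": 0.8})
e2 = GraphEdge(from_node="n1", to_node="n2", relation=RelationType.USES, confidence=0.8)

assert e1.from_node == e2.from_node == "n1"
print(e1.dict(by_alias=True)["from"])  # serializing by alias restores the "from"/"to" keys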
pdf_processor.py ADDED
@@ -0,0 +1,325 @@
1
+ """
2
+ PDF Ingestion & Preprocessing Module
3
+ Handles extraction of text, tables, code blocks, and images from PDFs
4
+ """
5
+ import fitz # PyMuPDF
6
+ import pdfplumber
7
+ import pytesseract
8
+ from PIL import Image
9
+ import io
10
+ import re
11
+ from typing import List, Dict, Any, Optional, Tuple
12
+ from loguru import logger
13
+ from models import Chunk, ChunkType, PDFMetadata
14
+ from config import settings
15
+ import uuid
16
+
17
+
18
+ class PDFProcessor:
19
+ """
20
+ Comprehensive PDF processor that extracts:
21
+ - Page-level text with character ranges
22
+ - Tables (structured)
23
+ - Code blocks (detected heuristically)
24
+ - Images (with OCR)
25
+ """
26
+
27
+ def __init__(self):
28
+ self.code_patterns = [
29
+ re.compile(r'```[\s\S]*?```'), # Markdown code blocks
30
+ re.compile(r'def\s+\w+\s*\('), # Python functions
31
+ re.compile(r'class\s+\w+\s*[:\(]'), # Python/Java classes
32
+ re.compile(r'function\s+\w+\s*\('), # JavaScript functions
33
+ re.compile(r'public\s+class\s+\w+'), # Java classes
34
+ ]
35
+
36
+ def process_pdf(self, filepath: str, pdf_id: str) -> Tuple[List[Chunk], PDFMetadata]:
37
+ """
38
+ Main entry point: process entire PDF and return chunks + metadata
39
+
40
+ Args:
41
+ filepath: Path to PDF file
42
+ pdf_id: Unique identifier for this PDF
43
+
44
+ Returns:
45
+ Tuple of (chunks list, metadata object)
46
+ """
47
+ logger.info(f"Processing PDF: {filepath}")
48
+
49
+ chunks: List[Chunk] = []
50
+
51
+ # Open with PyMuPDF for text and images
52
+ pdf_doc = fitz.open(filepath)
53
+ num_pages = len(pdf_doc)
54
+
55
+ # Open with pdfplumber for tables
56
+ with pdfplumber.open(filepath) as plumber_pdf:
57
+ for page_num in range(num_pages):
58
+ logger.debug(f"Processing page {page_num + 1}/{num_pages}")
59
+
60
+ # Extract from PyMuPDF
61
+ fitz_page = pdf_doc[page_num]
62
+ page_chunks = self._process_page(
63
+ fitz_page=fitz_page,
64
+ plumber_page=plumber_pdf.pages[page_num],
65
+ page_num=page_num + 1, # 1-indexed
66
+ pdf_id=pdf_id
67
+ )
68
+ chunks.extend(page_chunks)
69
+
70
+ pdf_doc.close()
71
+
72
+ # Create metadata
73
+ import os
74
+ file_size = os.path.getsize(filepath)
75
+ metadata = PDFMetadata(
76
+ pdf_id=pdf_id,
77
+ filename=os.path.basename(filepath),
78
+ filepath=filepath,
79
+ num_pages=num_pages,
80
+ file_size_bytes=file_size,
81
+ num_chunks=len(chunks),
82
+ processing_status="completed"
83
+ )
84
+
85
+ logger.info(f"Extracted {len(chunks)} chunks from {num_pages} pages")
86
+ return chunks, metadata
87
+
88
+ def _process_page(
89
+ self,
90
+ fitz_page,
91
+ plumber_page,
92
+ page_num: int,
93
+ pdf_id: str
94
+ ) -> List[Chunk]:
95
+ """Process a single page and return all chunks"""
96
+ chunks: List[Chunk] = []
97
+
98
+ # 1. Extract raw text with character positions
99
+ page_text = fitz_page.get_text("text")
100
+
101
+ # 2. Extract tables
102
+ table_chunks = self._extract_tables(plumber_page, page_num, pdf_id)
103
+ chunks.extend(table_chunks)
104
+
105
+ # 3. Extract code blocks
106
+ code_chunks = self._extract_code_blocks(page_text, page_num, pdf_id)
107
+ chunks.extend(code_chunks)
108
+
109
+ # 4. Extract images and run OCR
110
+ image_chunks = self._extract_images(fitz_page, page_num, pdf_id)
111
+ chunks.extend(image_chunks)
112
+
113
+ # 5. Extract remaining text as paragraphs
114
+ # Remove table and code regions from text before creating paragraph chunks
115
+ cleaned_text = self._remove_extracted_regions(
116
+ page_text,
117
+ [c.text for c in code_chunks]
118
+ )
119
+
120
+ if cleaned_text.strip():
121
+ para_chunk = Chunk(
122
+ chunk_id=str(uuid.uuid4()),
123
+ pdf_id=pdf_id,
124
+ page_number=page_num,
125
+ char_range=(0, len(cleaned_text)),
126
+ type=ChunkType.PARAGRAPH,
127
+ text=cleaned_text,
128
+ metadata={"source": "text_extraction"}
129
+ )
130
+ chunks.append(para_chunk)
131
+
132
+ return chunks
133
+
134
+ def _extract_tables(self, plumber_page, page_num: int, pdf_id: str) -> List[Chunk]:
135
+ """Extract tables from page using pdfplumber"""
136
+ chunks = []
137
+ tables = plumber_page.extract_tables()
138
+
139
+ for idx, table in enumerate(tables):
140
+ if not table:
141
+ continue
142
+
143
+ # Convert table to structured JSON
144
+ table_json = self._table_to_json(table)
145
+
146
+ # Convert table to text representation
147
+ table_text = self._table_to_text(table)
148
+
149
+ chunk = Chunk(
150
+ chunk_id=str(uuid.uuid4()),
151
+ pdf_id=pdf_id,
152
+ page_number=page_num,
153
+ char_range=(0, len(table_text)),
154
+ type=ChunkType.TABLE,
155
+ text=table_text,
156
+ table_json=table_json,
157
+ metadata={"table_index": idx, "num_rows": len(table)}
158
+ )
159
+ chunks.append(chunk)
160
+
161
+ logger.debug(f"Extracted {len(chunks)} tables from page {page_num}")
162
+ return chunks
163
+
164
+ def _table_to_json(self, table: List[List[str]]) -> Dict[str, Any]:
165
+ """Convert table to structured JSON"""
166
+ if not table or len(table) < 2:
167
+ return {"headers": [], "rows": []}
168
+
169
+ headers = table[0]
170
+ rows = table[1:]
171
+
172
+ return {
173
+ "headers": headers,
174
+ "rows": [
175
+ {headers[i]: cell for i, cell in enumerate(row) if i < len(headers)}
176
+ for row in rows
177
+ ]
178
+ }
179
+
180
+ def _table_to_text(self, table: List[List[str]]) -> str:
181
+ """Convert table to readable text"""
182
+ return "\n".join([" | ".join([str(cell) for cell in row]) for row in table])
183
+
184
+ def _extract_code_blocks(self, text: str, page_num: int, pdf_id: str) -> List[Chunk]:
185
+ """Extract code blocks using heuristic patterns"""
186
+ chunks = []
187
+
188
+ # Look for code patterns
189
+ for pattern in self.code_patterns:
190
+ matches = pattern.finditer(text)
191
+ for match in matches:
192
+ code_text = match.group(0)
193
+ if len(code_text) < 20: # Skip very short matches
194
+ continue
195
+
196
+ chunk = Chunk(
197
+ chunk_id=str(uuid.uuid4()),
198
+ pdf_id=pdf_id,
199
+ page_number=page_num,
200
+ char_range=(match.start(), match.end()),
201
+ type=ChunkType.CODE,
202
+ text=code_text,
203
+ metadata={
204
+ "pattern": pattern.pattern,
205
+ "detected_language": self._detect_language(code_text)
206
+ }
207
+ )
208
+ chunks.append(chunk)
209
+
210
+ # Also detect monospace font regions (if PDF has font info)
211
+ # This is more advanced and would require font analysis
212
+
213
+ logger.debug(f"Extracted {len(chunks)} code blocks from page {page_num}")
214
+ return chunks
215
+
216
+ def _detect_language(self, code: str) -> str:
217
+ """Heuristically detect programming language"""
218
+ if 'def ' in code and ':' in code:
219
+ return 'python'
220
+ elif 'function' in code or 'const' in code or 'let' in code:
221
+ return 'javascript'
222
+ elif 'public class' in code or 'private' in code:
223
+ return 'java'
224
+ elif '#include' in code:
225
+ return 'c++'
226
+ else:
227
+ return 'unknown'
228
+
229
+ def _extract_images(self, fitz_page, page_num: int, pdf_id: str) -> List[Chunk]:
230
+ """Extract images and run OCR"""
231
+ chunks = []
232
+ image_list = fitz_page.get_images()
233
+
234
+ for img_index, img in enumerate(image_list):
235
+ try:
236
+ xref = img[0]
237
+ base_image = fitz_page.parent.extract_image(xref)
238
+ image_bytes = base_image["image"]
239
+
240
+ # Convert to PIL Image
241
+ image = Image.open(io.BytesIO(image_bytes))
242
+
243
+ # Run OCR
244
+ ocr_text = pytesseract.image_to_string(image)
245
+
246
+ if ocr_text.strip():
247
+ image_id = f"{pdf_id}_p{page_num}_img{img_index}"
248
+
249
+ chunk = Chunk(
250
+ chunk_id=str(uuid.uuid4()),
251
+ pdf_id=pdf_id,
252
+ page_number=page_num,
253
+ char_range=(0, len(ocr_text)),
254
+ type=ChunkType.IMAGE_TEXT,
255
+ text=ocr_text,
256
+ image_id=image_id,
257
+ metadata={
258
+ "image_format": base_image["ext"],
259
+ "image_index": img_index
260
+ }
261
+ )
262
+ chunks.append(chunk)
263
+ except Exception as e:
264
+ logger.warning(f"Failed to extract image {img_index} on page {page_num}: {e}")
265
+
266
+ logger.debug(f"Extracted {len(chunks)} images from page {page_num}")
267
+ return chunks
268
+
269
+ def _remove_extracted_regions(self, text: str, code_blocks: List[str]) -> str:
270
+ """Remove already-extracted code blocks from text"""
271
+ for code in code_blocks:
272
+ text = text.replace(code, "")
273
+ return text
274
+
275
+ def chunk_text(self, chunks: List[Chunk]) -> List[Chunk]:
276
+ """
277
+ Further chunk large text blocks into smaller overlapping chunks
278
+
279
+ Args:
280
+ chunks: Initial chunks from PDF extraction
281
+
282
+ Returns:
283
+ Refined chunks with proper overlap
284
+ """
285
+ refined_chunks = []
286
+
287
+ for chunk in chunks:
288
+ # Keep table and code chunks intact; only paragraph-style text is re-chunked below
289
+ if chunk.type in [ChunkType.TABLE, ChunkType.CODE]:
290
+ refined_chunks.append(chunk)
291
+ continue
292
+
293
+ # Split long paragraphs into smaller chunks with overlap
294
+ text = chunk.text
295
+ chunk_size = settings.chunk_size
296
+ overlap = settings.chunk_overlap
297
+
298
+ if len(text) <= chunk_size:
299
+ refined_chunks.append(chunk)
300
+ continue
301
+
302
+ # Create overlapping windows
303
+ for i in range(0, len(text), chunk_size - overlap):
304
+ chunk_text = text[i:i + chunk_size]
305
+
306
+ if len(chunk_text) < settings.min_chunk_size:
307
+ continue
308
+
309
+ new_chunk = Chunk(
310
+ chunk_id=str(uuid.uuid4()),
311
+ pdf_id=chunk.pdf_id,
312
+ page_number=chunk.page_number,
313
+ char_range=(i, i + len(chunk_text)),
314
+ type=chunk.type,
315
+ text=chunk_text,
316
+ metadata={
317
+ **chunk.metadata,
318
+ "parent_chunk_id": chunk.chunk_id,
319
+ "window_index": i // (chunk_size - overlap)
320
+ }
321
+ )
322
+ refined_chunks.append(new_chunk)
323
+
324
+ logger.info(f"Refined {len(chunks)} chunks into {len(refined_chunks)} chunks")
325
+ return refined_chunks
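
For orientation, here is a small sketch of what the overlapping-window split in chunk_text and the table helpers above produce. The concrete numbers assume settings.chunk_size=1000, settings.chunk_overlap=200 and settings.min_chunk_size=100, which are illustrative values, not necessarily the defaults shipped in config.py:

    # Hypothetical illustration only -- not part of the uploaded files.
    text = "x" * 2500
    chunk_size, overlap, min_chunk_size = 1000, 200, 100
    windows = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
    # window start offsets: 0, 800, 1600, 2400
    # window lengths:       1000, 1000, 900, 100
    # The trailing 100-char window survives only because it meets min_chunk_size;
    # anything shorter is dropped by the len(chunk_text) < settings.min_chunk_size check.

    # For a pdfplumber table (list of rows), the conversion helpers behave like this:
    table = [["Name", "Score"], ["alpha", "0.9"], ["beta", "0.7"]]
    # _table_to_json -> {"headers": ["Name", "Score"],
    #                    "rows": [{"Name": "alpha", "Score": "0.9"},
    #                             {"Name": "beta", "Score": "0.7"}]}
    # _table_to_text -> "Name | Score\nalpha | 0.9\nbeta | 0.7"
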
rag_agent.py ADDED
@@ -0,0 +1,485 @@
1
+ """
2
+ Agent-Based RAG System using LangGraph
3
+ Provides intelligent query answering with tool use and multi-hop reasoning
4
+ """
5
+ from typing import List, Dict, Any
6
+ from typing_extensions import TypedDict
7
+ from langgraph.graph import StateGraph, END, START
8
+ from langgraph.prebuilt import ToolNode
9
+ from langchain_core.tools import tool
10
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage
11
+ from loguru import logger
12
+ import asyncio
13
+
14
+ from models import SourceCitation, ChatResponse
15
+ from graph_store import GraphStore
16
+ from embedding_service import EmbeddingService
17
+ from llm_service import LLMService
18
+
19
+
20
+ class AgentState(TypedDict):
21
+ """State for the RAG agent workflow"""
22
+ messages: List # Conversation history
23
+ query: str # Current user question
24
+ pdf_id: str # PDF context
25
+ tool_results: Dict[str, Any] # Results from tool executions
26
+ reasoning_steps: List[str] # Agent's reasoning process
27
+ final_answer: str # Synthesized answer
28
+ citations: List[SourceCitation] # Supporting citations
29
+ next_action: str # What to do next
30
+
31
+
32
+ class RAGAgent:
33
+ """
34
+ Intelligent RAG agent that uses multiple tools to answer questions
35
+
36
+ Tools available:
37
+ 1. vector_search - Semantic search through document chunks
38
+ 2. graph_search - Find concepts in knowledge graph
39
+ 3. get_node_details - Get detailed info about a graph node
40
+ 4. get_related_nodes - Traverse graph relationships
41
+ 5. get_chunk_by_id - Retrieve specific chunks for citations
42
+ """
43
+
44
+ def __init__(self,
45
+ graph_store: GraphStore,
46
+ embedding_service: EmbeddingService,
47
+ llm_service: LLMService):
48
+ """Initialize the RAG agent with necessary services"""
49
+ self.graph_store = graph_store
50
+ self.embedding_service = embedding_service
51
+ self.llm_service = llm_service
52
+
53
+ # Build LangGraph workflow
54
+ self.workflow = self._build_workflow()
55
+ self.app = self.workflow.compile()
56
+
57
+ logger.info("✓ RAG Agent initialized with LangGraph workflow")
58
+
59
+ def _create_tools(self):
60
+ """Create tool functions for the agent"""
61
+
62
+ @tool
63
+ def vector_search(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
64
+ """
65
+ Search document chunks using semantic similarity.
66
+
67
+ Args:
68
+ query: The search query
69
+ top_k: Number of results to return
70
+
71
+ Returns:
72
+ List of relevant chunks with metadata and scores
73
+ """
74
+ logger.info(f"🔍 Tool: vector_search('{query}', top_k={top_k})")
75
+
76
+ try:
77
+ results = self.embedding_service.search(
78
+ query=query,
79
+ top_k=top_k
80
+ )
81
+
82
+ formatted_results = []
83
+ for metadata, score in results:
84
+ formatted_results.append({
85
+ "text": metadata.get("text", ""),
86
+ "page_number": metadata.get("page_number", 0),
87
+ "chunk_id": metadata.get("chunk_id", ""),
88
+ "score": float(score)
89
+ })
90
+
91
+ logger.info(f" ✓ Found {len(formatted_results)} chunks")
92
+ return formatted_results
93
+
94
+ except Exception as e:
95
+ logger.error(f" ✗ vector_search failed: {e}")
96
+ return []
97
+
98
+ @tool
99
+ def graph_search(concept: str) -> Dict[str, Any]:
100
+ """
101
+ Find a concept node in the knowledge graph.
102
+
103
+ Args:
104
+ concept: The concept to search for
105
+
106
+ Returns:
107
+ Node information if found, None otherwise
108
+ """
109
+ logger.info(f"🔍 Tool: graph_search('{concept}')")
110
+
111
+ try:
112
+ node = self.graph_store.get_node_by_label(concept)
113
+
114
+ if node:
115
+ logger.info(f" ✓ Found node: {node.label}")
116
+ return {
117
+ "node_id": node.node_id,
118
+ "label": node.label,
119
+ "type": node.type.value if hasattr(node.type, 'value') else node.type,
120
+ "importance": node.importance_score
121
+ }
122
+ else:
123
+ logger.info(f" ✗ No node found for '{concept}'")
124
+ return None
125
+
126
+ except Exception as e:
127
+ logger.error(f" ✗ graph_search failed: {e}")
128
+ return None
129
+
130
+ @tool
131
+ def get_node_details(node_id: str) -> Dict[str, Any]:
132
+ """
133
+ Get detailed information about a graph node.
134
+
135
+ Args:
136
+ node_id: The ID of the node
137
+
138
+ Returns:
139
+ Detailed node information including supporting chunks
140
+ """
141
+ logger.info(f"🔍 Tool: get_node_details('{node_id}')")
142
+
143
+ try:
144
+ node = self.graph_store.get_node(node_id)
145
+
146
+ if not node:
147
+ logger.info(f" ✗ Node not found")
148
+ return None
149
+
150
+ # Get supporting chunks
151
+ chunks = []
152
+ for chunk in node.supporting_chunks[:5]: # Top 5
153
+ chunks.append({
154
+ "page_number": chunk.page_number,
155
+ "snippet": chunk.snippet,
156
+ "score": chunk.score
157
+ })
158
+
159
+ logger.info(f" ✓ Got details for {node.label}")
160
+ return {
161
+ "label": node.label,
162
+ "type": node.type.value if hasattr(node.type, 'value') else node.type,
163
+ "importance": node.importance_score,
164
+ "supporting_chunks": chunks
165
+ }
166
+
167
+ except Exception as e:
168
+ logger.error(f" ✗ get_node_details failed: {e}")
169
+ return None
170
+
171
+ @tool
172
+ def get_related_nodes(node_id: str, max_neighbors: int = 5) -> List[Dict[str, Any]]:
173
+ """
174
+ Get nodes related to a given node (graph traversal).
175
+
176
+ Args:
177
+ node_id: The ID of the starting node
178
+ max_neighbors: Maximum number of related nodes to return
179
+
180
+ Returns:
181
+ List of related nodes with relationship information
182
+ """
183
+ logger.info(f"🔍 Tool: get_related_nodes('{node_id}', max={max_neighbors})")
184
+
185
+ try:
186
+ neighbors = self.graph_store.get_neighbors(node_id)
187
+
188
+ related = []
189
+ for neighbor_node, edge in neighbors[:max_neighbors]:
190
+ relation_value = edge.relation.value if hasattr(edge.relation, 'value') else edge.relation
191
+ related.append({
192
+ "node_id": neighbor_node.node_id,
193
+ "label": neighbor_node.label,
194
+ "relation": relation_value,
195
+ "confidence": edge.confidence
196
+ })
197
+
198
+ logger.info(f" ✓ Found {len(related)} related nodes")
199
+ return related
200
+
201
+ except Exception as e:
202
+ logger.error(f" ✗ get_related_nodes failed: {e}")
203
+ return []
204
+
205
+ @tool
206
+ def get_chunk_by_id(chunk_id: str) -> Dict[str, Any]:
207
+ """
208
+ Retrieve a specific chunk by its ID (for detailed citations).
209
+
210
+ Args:
211
+ chunk_id: The chunk identifier
212
+
213
+ Returns:
214
+ Chunk content and metadata
215
+ """
216
+ logger.info(f"🔍 Tool: get_chunk_by_id('{chunk_id}')")
217
+
218
+ try:
219
+ # Search by chunk_id in metadata
220
+ # This is a simplified version - you may need to implement proper chunk lookup
221
+ results = self.embedding_service.search_by_chunk_ids([chunk_id], top_k=1)
222
+
223
+ if results:
224
+ metadata, score = results[0]
225
+ logger.info(f" ✓ Found chunk")
226
+ return {
227
+ "text": metadata.get("text", ""),
228
+ "page_number": metadata.get("page_number", 0),
229
+ "chunk_id": chunk_id
230
+ }
231
+ else:
232
+ logger.info(f" ✗ Chunk not found")
233
+ return None
234
+
235
+ except Exception as e:
236
+ logger.error(f" ✗ get_chunk_by_id failed: {e}")
237
+ return None
238
+
239
+ return [vector_search, graph_search, get_node_details, get_related_nodes, get_chunk_by_id]
240
+
241
+ def _build_workflow(self) -> StateGraph:
242
+ """Build the LangGraph workflow for the agent"""
243
+
244
+ workflow = StateGraph(AgentState)
245
+
246
+ # Define workflow nodes
247
+ workflow.add_node("plan", self._plan_node)
248
+ workflow.add_node("execute_tools", self._execute_tools_node)
249
+ workflow.add_node("synthesize", self._synthesize_node)
250
+
251
+ # Define edges
252
+ workflow.add_edge(START, "plan")
253
+ workflow.add_conditional_edges(
254
+ "plan",
255
+ self._should_use_tools,
256
+ {
257
+ "tools": "execute_tools",
258
+ "direct": "synthesize"
259
+ }
260
+ )
261
+ workflow.add_edge("execute_tools", "synthesize")
262
+ workflow.add_edge("synthesize", END)
263
+
264
+ return workflow
265
+
266
+ def _plan_node(self, state: AgentState) -> AgentState:
267
+ """Agent decides which tools to use"""
268
+ logger.info("🤖 Agent: Planning which tools to use...")
269
+
270
+ query = state["query"]
271
+
272
+ # Simple heuristic-based planning (can be enhanced with LLM)
273
+ tools_to_use = []
274
+ reasoning = []
275
+
276
+ # Always use vector search for semantic matching
277
+ tools_to_use.append("vector_search")
278
+ reasoning.append("Use vector search for semantic document retrieval")
279
+
280
+ # Check if query mentions specific concepts (use graph)
281
+ if any(word in query.lower() for word in ["relate", "connection", "link", "between"]):
282
+ tools_to_use.append("graph_search")
283
+ reasoning.append("Query asks about relationships - use graph search")
284
+
285
+ # Check if asking about a specific concept
286
+ if any(word in query.lower() for word in ["what is", "define", "explain"]):
287
+ tools_to_use.append("graph_search")
288
+ reasoning.append("Query asks for concept definition - check graph")
289
+
290
+ state["tool_results"] = {"planned_tools": tools_to_use}
291
+ state["reasoning_steps"] = reasoning
292
+ state["next_action"] = "tools" if tools_to_use else "direct"
293
+
294
+ logger.info(f" Plan: {tools_to_use}")
295
+ return state
296
+
297
+ def _should_use_tools(self, state: AgentState) -> str:
298
+ """Decide if tools are needed"""
299
+ return state.get("next_action", "direct")
300
+
301
+ def _execute_tools_node(self, state: AgentState) -> AgentState:
302
+ """Execute the planned tools"""
303
+ logger.info("🔧 Agent: Executing tools...")
304
+
305
+ query = state["query"]
306
+ planned_tools = state["tool_results"].get("planned_tools", [])
307
+ results = {}
308
+
309
+ # Create tools
310
+ tools_map = {}
311
+ for tool_fn in self._create_tools():
312
+ tools_map[tool_fn.name] = tool_fn
313
+
314
+ # Execute tools
315
+ if "vector_search" in planned_tools:
316
+ vector_tool = tools_map["vector_search"]
317
+ results["vector_results"] = vector_tool.invoke({"query": query, "top_k": 5})
318
+
319
+ if "graph_search" in planned_tools:
320
+ # Extract main concept from query (simplified)
321
+ # In production, use NER or LLM to extract concept
322
+ words = query.lower().split()
323
+ potential_concepts = [w for w in words if len(w) > 4 and w not in ["what", "how", "does", "relate"]]
324
+
325
+ for concept in potential_concepts[:2]: # Try first 2
326
+ graph_tool = tools_map["graph_search"]
327
+ node_result = graph_tool.invoke({"concept": concept})
328
+ if node_result:
329
+ results[f"graph_node_{concept}"] = node_result
330
+
331
+ # Get related nodes
332
+ related_tool = tools_map["get_related_nodes"]
333
+ related = related_tool.invoke({"node_id": node_result["node_id"], "max_neighbors": 3})
334
+ results[f"related_{concept}"] = related
335
+ break
336
+
337
+ state["tool_results"].update(results)
338
+ logger.info(f" ✓ Executed {len(planned_tools)} tools, got {len(results)} results")
339
+ return state
340
+
341
+ async def _synthesize_node(self, state: AgentState) -> AgentState:
342
+ """Synthesize final answer from tool results"""
343
+ logger.info("🎯 Agent: Synthesizing answer...")
344
+
345
+ query = state["query"]
346
+ tool_results = state["tool_results"]
347
+
348
+ # Prepare context from tool results
349
+ context_parts = []
350
+ citations = []
351
+
352
+ # Add vector search results
353
+ if "vector_results" in tool_results:
354
+ vector_results = tool_results["vector_results"]
355
+ for i, result in enumerate(vector_results[:3]): # Top 3
356
+ context_parts.append(f"[Source {i+1}, p.{result['page_number']}]: {result['text']}")
357
+ citations.append(SourceCitation(
358
+ page_number=result["page_number"],
359
+ snippet=result["text"][:120] + "..." if len(result["text"]) > 120 else result["text"],
360
+ chunk_id=result["chunk_id"],
361
+ score=result["score"]
362
+ ))
363
+
364
+ # Add graph results
365
+ for key, value in tool_results.items():
366
+ if key.startswith("graph_node_"):
367
+ concept = key.replace("graph_node_", "")
368
+ context_parts.append(f"[Graph Node]: '{value['label']}' is a {value['type']} (importance: {value['importance']:.2f})")
369
+ elif key.startswith("related_"):
370
+ concept = key.replace("related_", "")
371
+ if value:
372
+ relations = ", ".join([f"{r['label']} ({r['relation']})" for r in value])
373
+ context_parts.append(f"[Related Concepts]: {relations}")
374
+
375
+ # Create context for LLM
376
+ context = "\n\n".join(context_parts)
377
+
378
+ # Generate answer using Gemini
379
+ answer = await self.llm_service.agent_synthesize(query, context)
380
+
381
+ state["final_answer"] = answer
382
+ state["citations"] = citations
383
+
384
+ logger.info(" ✓ Answer synthesized")
385
+ return state
386
+
387
+ async def chat(self, query: str, pdf_id: str = None, include_citations: bool = True) -> ChatResponse:
388
+ """
389
+ Main entry point for agent-based chat
390
+
391
+ Args:
392
+ query: User's question
393
+ pdf_id: Optional PDF context
394
+ include_citations: Whether to include source citations
395
+
396
+ Returns:
397
+ ChatResponse with answer and citations
398
+ """
399
+ logger.info(f"\n{'='*80}")
400
+ logger.info(f"🤖 Agent-Based RAG Query: '{query}'")
401
+ logger.info(f"{'='*80}")
402
+
403
+ # Initialize state
404
+ initial_state = {
405
+ "messages": [HumanMessage(content=query)],
406
+ "query": query,
407
+ "pdf_id": pdf_id or "",
408
+ "tool_results": {},
409
+ "reasoning_steps": [],
410
+ "final_answer": "",
411
+ "citations": [],
412
+ "next_action": ""
413
+ }
414
+
415
+ try:
416
+ # Run workflow
417
+ final_state = await self.app.ainvoke(initial_state)
418
+
419
+ # Extract results
420
+ answer = final_state.get("final_answer", "I couldn't generate an answer.")
421
+ citations = final_state.get("citations", [])
422
+
423
+ if not include_citations:
424
+ citations = []
425
+
426
+ logger.info(f"✓ Agent completed successfully")
427
+ logger.info(f" Answer length: {len(answer)} chars")
428
+ logger.info(f" Citations: {len(citations)}")
429
+ logger.info(f"{'='*80}\n")
430
+
431
+ return ChatResponse(
432
+ answer=answer,
433
+ sources=citations[:5] # Top 5 citations
434
+ )
435
+
436
+ except Exception as e:
437
+ logger.exception(f"❌ Agent failed: {e}")
438
+
439
+ # Fallback to simple vector search
440
+ logger.warning("Falling back to simple RAG...")
441
+ return await self._fallback_simple_rag(query, pdf_id)
442
+
443
+ async def _fallback_simple_rag(self, query: str, pdf_id: str = None) -> ChatResponse:
444
+ """Fallback to simple RAG if agent fails"""
445
+ try:
446
+ results = self.embedding_service.search(query=query, top_k=5, filter_pdf_id=pdf_id)
447
+
448
+ if not results:
449
+ return ChatResponse(
450
+ answer="I couldn't find relevant information to answer your question.",
451
+ sources=[]
452
+ )
453
+
454
+ # Prepare context
455
+ context_chunks = [
456
+ {
457
+ "page_number": meta.get("page_number", 0),
458
+ "text": meta.get("text", "")
459
+ }
460
+ for meta, score in results[:3]
461
+ ]
462
+
463
+ # Generate answer
464
+ answer = await self.llm_service.rag_chat(query, context_chunks)
465
+
466
+ # Format sources
467
+ sources = []
468
+ for meta, score in results[:5]:
469
+ text = meta.get("text", "")
470
+ snippet = text[:120] + "..." if len(text) > 120 else text
471
+ sources.append(SourceCitation(
472
+ page_number=meta.get("page_number", 0),
473
+ snippet=snippet,
474
+ chunk_id=meta.get("chunk_id", ""),
475
+ score=score
476
+ ))
477
+
478
+ return ChatResponse(answer=answer, sources=sources)
479
+
480
+ except Exception as e:
481
+ logger.error(f"Fallback RAG also failed: {e}")
482
+ return ChatResponse(
483
+ answer="I encountered an error processing your question.",
484
+ sources=[]
485
+ )
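
A minimal wiring sketch for the agent above. It assumes the no-argument constructors implied elsewhere in the repo: GraphStore(use_neo4j=False) mirrors the in-memory mode used in tests/test_basic.py, while EmbeddingService() and LLMService() are assumed to be constructible without arguments, so adjust to the actual signatures:

    # Hypothetical usage sketch -- not part of the uploaded files.
    import asyncio

    from graph_store import GraphStore
    from embedding_service import EmbeddingService
    from llm_service import LLMService
    from rag_agent import RAGAgent

    async def main():
        agent = RAGAgent(
            graph_store=GraphStore(use_neo4j=False),
            embedding_service=EmbeddingService(),
            llm_service=LLMService(),
        )
        # chat() runs the plan -> execute_tools -> synthesize workflow and
        # falls back to plain vector-search RAG if the workflow raises.
        response = await agent.chat("How do the main concepts in this PDF relate?", pdf_id=None)
        print(response.answer)
        for src in response.sources:
            print(f"p.{src.page_number}: {src.snippet}")

    asyncio.run(main())
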
requirements.txt ADDED
@@ -0,0 +1,59 @@
1
+ # PDF Processing
2
+ PyMuPDF
3
+ pdfplumber
4
+ pytesseract
5
+ Pillow
6
+ camelot-py[cv]
7
+ tabula-py
8
+
9
+ # NLP & Embeddings
10
+ sentence-transformers
11
+ transformers
12
+ torch>=2.2.0
13
+ keybert
14
+ yake
15
+
16
+ # Knowledge Graph Generation
17
+ kg-gen
18
+ litellm
19
+
20
+ # Multi-Agent System with LangGraph
21
+ langgraph
22
+ langchain
23
+ langchain-core
24
+ langchain-community
25
+
26
+ # Vector Store & Search
27
+ faiss-cpu
28
+
29
+ # Graph Database & Processing
30
+ neo4j
31
+ networkx
32
+
33
+ # Backend & API
34
+ fastapi
35
+ uvicorn[standard]
36
+ python-multipart
37
+ pydantic
38
+ pydantic-settings
39
+
40
+ # Database
41
+ sqlalchemy
42
+ psycopg2-binary
43
+ pymongo
44
+
45
+ # Utilities
46
+ python-dotenv
47
+ loguru
48
+ tenacity
49
+ httpx
50
+ aiofiles
51
+
52
+ # Monitoring & DevOps
53
+ prometheus-client
54
+ python-json-logger
55
+
56
+ # Testing
57
+ pytest
58
+ pytest-asyncio
59
+ pytest-cov
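
Note that the list above is almost entirely unpinned: apart from the torch>=2.2.0 lower bound, every rebuild of the Space resolves fresh versions. If a build ever needs to be reproduced exactly, one pragmatic option (an optional workflow, not something this commit sets up) is to freeze the resolved set once the image works:

    pip install -r requirements.txt
    pip freeze > requirements.lock.txt
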
tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Tests package
tests/test_basic.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ Basic tests for GraphLLM components
3
+ """
4
+ import pytest
5
+ from models import Chunk, ChunkType, GraphNode, GraphEdge, Triple, NodeType, RelationType
6
+ from config import settings
7
+
8
+
9
+ def test_chunk_creation():
10
+ """Test chunk model creation"""
11
+ chunk = Chunk(
12
+ pdf_id="test-pdf",
13
+ page_number=1,
14
+ char_range=(0, 100),
15
+ type=ChunkType.PARAGRAPH,
16
+ text="This is a test chunk."
17
+ )
18
+
19
+ assert chunk.pdf_id == "test-pdf"
20
+ assert chunk.page_number == 1
21
+ assert chunk.type == ChunkType.PARAGRAPH
22
+ assert chunk.text == "This is a test chunk."
23
+
24
+
25
+ def test_graph_node_creation():
26
+ """Test graph node creation"""
27
+ node = GraphNode(
28
+ label="Test Concept",
29
+ type=NodeType.CONCEPT,
30
+ aliases=["test", "concept"],
31
+ supporting_chunks=[],
32
+ importance_score=0.75
33
+ )
34
+
35
+ assert node.label == "Test Concept"
36
+ assert node.type == NodeType.CONCEPT
37
+ assert node.importance_score == 0.75
38
+
39
+
40
+ def test_graph_edge_creation():
41
+ """Test graph edge creation"""
42
+ edge = GraphEdge(
43
+ from_node="node1",
44
+ to_node="node2",
45
+ relation=RelationType.USES,
46
+ confidence=0.8,
47
+ supporting_chunks=[]
48
+ )
49
+
50
+ assert edge.from_node == "node1"
51
+ assert edge.to_node == "node2"
52
+ assert edge.relation == RelationType.USES
53
+ assert edge.confidence == 0.8
54
+
55
+
56
+ def test_triple_creation():
57
+ """Test triple model"""
58
+ triple = Triple(
59
+ subject="Machine Learning",
60
+ predicate="uses",
61
+ object="Neural Networks",
62
+ confidence=0.9,
63
+ page_number=5
64
+ )
65
+
66
+ assert triple.subject == "Machine Learning"
67
+ assert triple.predicate == "uses"
68
+ assert triple.object == "Neural Networks"
69
+ assert triple.confidence == 0.9
70
+
71
+
72
+ def test_settings_load():
73
+ """Test configuration loading"""
74
+ assert settings.app_name == "GraphLLM"
75
+ assert settings.chunk_size > 0
76
+ assert settings.embedding_model is not None
77
+
78
+
79
+ @pytest.mark.asyncio
80
+ async def test_pdf_processor_import():
81
+ """Test PDF processor can be imported"""
82
+ from pdf_processor import PDFProcessor
83
+ processor = PDFProcessor()
84
+ assert processor is not None
85
+
86
+
87
+ @pytest.mark.asyncio
88
+ async def test_embedding_service_import():
89
+ """Test embedding service can be imported"""
90
+ from embedding_service import EmbeddingService
91
+ # Note: This will load the model, may take time
92
+ # service = EmbeddingService()
93
+ # assert service is not None
94
+ pass
95
+
96
+
97
+ @pytest.mark.asyncio
98
+ async def test_graph_store_import():
99
+ """Test graph store can be imported"""
100
+ from graph_store import GraphStore
101
+ store = GraphStore(use_neo4j=False)
102
+ assert store is not None
103
+
104
+
105
+ if __name__ == "__main__":
106
+ pytest.main([__file__, "-v"])
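
With pytest, pytest-asyncio and pytest-cov already listed in requirements.txt, the suite can be run from the repository root; the coverage flags are optional:

    pytest tests/ -v
    pytest tests/ -v --cov=. --cov-report=term-missing
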