Upload 24 files
- .dockerignore +92 -0
- .gitignore +72 -0
- .railwayignore +26 -0
- Dockerfile +51 -0
- Procfile +1 -0
- README.md +92 -10
- config.py +127 -0
- docker-compose.yml +68 -0
- embedding_service.py +266 -0
- frontend/app.js +539 -0
- frontend/index.html +176 -0
- frontend/styles.css +800 -0
- gemini_extractor.py +612 -0
- graph_builder.py +268 -0
- graph_store.py +347 -0
- llm_service.py +491 -0
- main.py +550 -0
- modal_app.py +50 -0
- models.py +236 -0
- pdf_processor.py +325 -0
- rag_agent.py +485 -0
- requirements.txt +59 -0
- tests/__init__.py +1 -0
- tests/test_basic.py +106 -0
.dockerignore
ADDED
@@ -0,0 +1,92 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
dist/
*.egg-info/

# Virtual Environments
venv/
env/
.venv/
ENV/

# Environment Variables
.env
.env.*
!.env.example

# Data (exclude from image - will be created at runtime)
data/
uploads/
*.pdf
*.pkl
*.faiss
*.index

# Logs (exclude from image)
logs/
*.log

# Cache
cache/
.cache/
__pycache__/

# IDEs
.vscode/
.idea/
*.swp
*.swo
*.sublime-*

# OS
.DS_Store
Thumbs.db
desktop.ini

# Testing
.coverage
htmlcov/
.pytest_cache/
.tox/
*.cover
tests/

# Database files
*.db
*.sqlite
*.sqlite3

# Git
.git/
.gitignore
.gitattributes

# Documentation
docs/
*.md
!README.md

# Deployment configs (not needed in container)
railway.toml
nixpacks.toml
Procfile
modal_app.py
fly.toml
vercel.json
heroku.yml
docker-compose*.yml

# CI/CD
.github/
.gitlab-ci.yml
.travis.yml

# Misc
*.bak
*.tmp
*.temp
.gitignore
ADDED
@@ -0,0 +1,72 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
ENV/
env/
.venv

# Environment Variables
.env
.env.local

# Data & Uploads
data/
uploads/
*.pdf
*.faiss
*.index

# Logs
logs/
*.log

# Cache
cache/
.cache/
*.cache

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Testing
.coverage
htmlcov/
.pytest_cache/
.tox/

# Database
*.db
*.sqlite
*.sqlite3

# Neo4j
neo4j/
.railwayignore
ADDED
@@ -0,0 +1,26 @@
# Ignore local development files
venv/
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
*.so
*.egg
*.egg-info/
dist/
build/
.env.local
.DS_Store

# Ignore local data (will be created on Railway)
data/
uploads/
logs/
cache/

# Ignore development artifacts
.pytest_cache/
.coverage
htmlcov/
*.log
Dockerfile
ADDED
@@ -0,0 +1,51 @@
# GraphLLM - Hugging Face Spaces Deployment
# Optimized Docker image for HF Spaces

FROM python:3.12-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    DEBIAN_FRONTEND=noninteractive \
    API_PORT=7860 \
    HF_HOME=/app/cache \
    TRANSFORMERS_CACHE=/app/cache \
    SENTENCE_TRANSFORMERS_HOME=/app/cache

# Set working directory
WORKDIR /app

# Install system dependencies (minimal set for HF Spaces)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    tesseract-ocr \
    ghostscript \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Copy requirements first (for better layer caching)
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create data directories with proper permissions (777 for HF Spaces non-root user)
RUN mkdir -p data uploads logs cache data/faiss_index && \
    chmod -R 777 data uploads logs cache

# Expose Hugging Face Spaces default port
EXPOSE 7860

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:7860/ || exit 1

# Run the application
# HF Spaces expects the app to listen on 0.0.0.0:7860
CMD ["python3", "main.py"]
Procfile
ADDED
@@ -0,0 +1 @@
web: python main.py
README.md
CHANGED
@@ -1,12 +1,94 @@
-
-
-
-
-
-
-
-
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+# 🕸️ GraphLLM - PDF Knowledge Graph + RAG System
+
+Transform PDFs into interactive knowledge graphs with AI-powered Q&A.
+
+## 🚀 Features
+
+- **📄 PDF Processing:** Extract text, tables, and images from PDFs
+- **🕸️ Knowledge Graph Generation:** Build semantic graphs using Gemini AI
+- **🔍 Vector Search:** FAISS-powered semantic search with sentence transformers
+- **💬 RAG Chat:** Ask questions and get answers with source citations
+- **🎨 Interactive Visualization:** Explore knowledge graphs in your browser
+
+## 🛠️ Technology Stack
+
+- **LLM:** Google Gemini (gemini-2.5-flash)
+- **Embeddings:** sentence-transformers/all-MiniLM-L6-v2
+- **Vector Store:** FAISS with HNSW index
+- **Graph:** NetworkX (in-memory)
+- **Backend:** FastAPI + Uvicorn
+- **Frontend:** Vanilla JS with D3.js/Cytoscape
+
+## 📋 Setup
+
+### Required: Gemini API Key
+
+This app requires a Google Gemini API key:
+
+1. Get your API key from [Google AI Studio](https://makersuite.google.com/app/apikey)
+2. Add it as a **Secret** in Hugging Face Spaces settings:
+   - Name: `GEMINI_API_KEY`
+   - Value: Your API key
+
+### Configuration (Optional)
+
+You can set these environment variables in Space Settings:
+
+```bash
+# LLM Settings
+GEMINI_MODEL=gemini-2.5-flash    # Gemini model
+LLM_TEMPERATURE=0.0              # Temperature for extraction
+
+# Embedding Settings
+EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+
+# Environment
+ENVIRONMENT=production
+LOG_LEVEL=INFO
+```
+
+## 🎯 Usage
+
+1. **Upload PDF:** Click "Upload PDF" and select your document
+2. **Wait for Processing:** The system will:
+   - Extract text chunks
+   - Generate embeddings
+   - Build knowledge graph with Gemini
+3. **Explore Graph:** Click nodes to see details and related concepts
+4. **Ask Questions:** Use the chat interface for Q&A with citations
+
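The usage flow above can also be driven without the browser UI. The sketch below is illustrative and not part of the upload: the endpoint paths (`/upload`, `/status/{pdf_id}`, `/chat`) and payload fields are the ones `frontend/app.js` calls later in this diff, while the base URL and polling cadence are assumptions.

```python
# Hypothetical CLI client; endpoints mirror frontend/app.js, host is an assumption.
import time
import requests

BASE = "http://localhost:8000"  # the port docker-compose maps the API to

def upload_and_ask(pdf_path: str, question: str) -> None:
    with open(pdf_path, "rb") as f:
        pdf_id = requests.post(f"{BASE}/upload", files={"file": f}).json()["pdf_id"]

    # Mirror pollProcessingStatus(): wait for the background pipeline to finish.
    while True:
        status = requests.get(f"{BASE}/status/{pdf_id}").json()
        if status["status"] == "completed":
            break
        if status["status"] == "failed":
            raise RuntimeError(status.get("error"))
        time.sleep(1)

    reply = requests.post(f"{BASE}/chat", json={
        "query": question,
        "pdf_id": pdf_id,
        "include_citations": True,
        "max_sources": 5,
    }).json()
    print(reply["answer"])
    for src in reply.get("sources", []):
        print(f'p.{src["page_number"]}: "{src["snippet"]}"')
```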
+## 📊 Graph Generation
+
+- **Per-Page Extraction:** Max 2 concepts per page (quality over quantity)
+- **Parallel Processing:** All pages processed concurrently via Gemini API
+- **Strict Filtering:** Only technical/domain-specific concepts
+- **Co-occurrence Relationships:** Concepts on same page are linked
+
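A minimal sketch of how the co-occurrence rule above could be realized with NetworkX, the in-memory graph library named in the stack. This is illustrative only; the actual logic ships in `graph_builder.py`, which is part of this upload but not excerpted here, and the function name and input shape are assumptions.

```python
# Illustrative co-occurrence sketch, NOT the graph_builder.py implementation.
import itertools
import networkx as nx

def link_cooccurring_concepts(concepts_by_page):
    """Connect every pair of concepts extracted from the same page."""
    graph = nx.Graph()  # undirected, matching the arrow-free rendering in app.js
    for page, concepts in concepts_by_page.items():
        graph.add_nodes_from(concepts)
        for a, b in itertools.combinations(concepts, 2):
            graph.add_edge(a, b, relation="co-occurs", page=page)
    return graph

# With at most 2 concepts per page (the extraction cap above), each page
# contributes at most one edge:
g = link_cooccurring_concepts({1: ["FAISS", "HNSW"], 2: ["RAG", "FAISS"]})
print(g.number_of_nodes(), g.number_of_edges())  # -> 3 2
```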
+## 🎨 Frontend
+
+The frontend is a single-page application located in `/frontend/`:
+- `index.html` - Main UI
+- `app.js` - Graph visualization & API calls
+- `styles.css` - Styling
+
+Access it at: `http://your-space-url.hf.space/frontend/`
+
+
+## 📦 Docker
+
+This Space uses Docker for deployment:
+- Base: `python:3.12-slim`
+- Port: 7860 (HF Spaces default)
+- Health check enabled
+- Persistent data directory
+
+## 🤝 Credits
+
+- **LLM:** Google Gemini
+- **Embeddings:** Hugging Face sentence-transformers
+
+
 ---
 
config.py
ADDED
@@ -0,0 +1,127 @@
"""
Configuration management for GraphLLM system
"""
from pydantic_settings import BaseSettings
from pydantic import Field, field_validator
from typing import Optional
import os


class Settings(BaseSettings):
    """Application settings loaded from environment variables"""

    # Application
    app_name: str = "GraphLLM"
    app_version: str = "1.0.0"
    environment: str = "development"
    debug: bool = True

    # API
    api_host: str = "0.0.0.0"
    api_port: int = 8000
    api_workers: int = 4

    # LLM Settings - Gemini (Primary)
    gemini_api_key: str = Field(default="", env="GEMINI_API_KEY")
    gemini_model: str = "gemini-2.5-flash"

    # LLM Settings - Mistral (Fallback)
    mistral_api_key: str = Field(default="", env="MISTRAL_API_KEY")
    mistral_model: str = "mistral-7b-instruct-v0.1"

    # LLM Parameters
    llm_temperature: float = 0.0
    llm_max_tokens: int = 2048
    llm_timeout: int = 120

    # Embedding Settings
    embedding_model: str = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
    embedding_dimension: int = 384
    embedding_batch_size: int = 32

    # FAISS Vector DB
    faiss_index_path: str = "./data/faiss_index"
    faiss_metric: str = "cosine"

    # Neo4j Graph DB
    neo4j_uri: str = "bolt://localhost:7687"
    neo4j_user: str = "neo4j"
    neo4j_password: str = Field(default="", env="NEO4J_PASSWORD")
    neo4j_database: str = "neo4j"

    # PostgreSQL
    postgres_host: str = "localhost"
    postgres_port: int = 5432
    postgres_db: str = "graphllm"
    postgres_user: str = "postgres"
    postgres_password: str = Field(default="", env="POSTGRES_PASSWORD")

    # MongoDB (optional)
    mongodb_uri: str = "mongodb://localhost:27017"
    mongodb_database: str = "graphllm"

    # Chunking
    chunk_size: int = 512
    chunk_overlap: int = 128
    min_chunk_size: int = 100

    # Triplet Extraction
    triplet_confidence_threshold: float = 0.6
    entity_similarity_threshold: float = 0.85
    max_triples_per_chunk: int = 10

    # Graph Pruning
    node_importance_threshold: float = 0.3
    edge_confidence_threshold: float = 0.5
    min_node_mentions: int = 2

    # RAG
    rag_top_k: int = 10
    rag_rerank_top_k: int = 5
    max_context_length: int = 4000

    # File Upload
    max_file_size_mb: int = 50
    allowed_extensions: str = "pdf"
    upload_dir: str = "./data/uploads"

    # Storage
    data_dir: str = "./data"
    logs_dir: str = "./logs"
    cache_dir: str = "./cache"

    # Monitoring
    enable_metrics: bool = True
    metrics_port: int = 9090
    log_level: str = "INFO"

    @property
    def postgres_url(self) -> str:
        """Build PostgreSQL connection URL"""
        return f"postgresql://{self.postgres_user}:{self.postgres_password}@{self.postgres_host}:{self.postgres_port}/{self.postgres_db}"

    @property
    def max_file_size_bytes(self) -> int:
        """Convert MB to bytes"""
        return self.max_file_size_mb * 1024 * 1024

    class Config:
        env_file = ".env"
        case_sensitive = False


# Global settings instance
settings = Settings()


def ensure_directories():
    """Ensure all required directories exist"""
    dirs = [
        settings.data_dir,
        settings.upload_dir,
        settings.logs_dir,
        settings.cache_dir,
        settings.faiss_index_path,
    ]
    for directory in dirs:
        os.makedirs(directory, exist_ok=True)
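For context, a minimal sketch of consuming these settings at startup. `main.py` is part of this upload but not excerpted here, so the wiring below is an assumption rather than a copy of it; only `settings` and `ensure_directories` come from the module above.

```python
# Hypothetical startup wiring, assuming only the config module shown above.
from config import settings, ensure_directories

ensure_directories()  # creates ./data, ./data/uploads, ./logs, ./cache, ./data/faiss_index
print(f"{settings.app_name} v{settings.app_version} ({settings.environment})")
print(f"API on {settings.api_host}:{settings.api_port}, max upload {settings.max_file_size_mb} MB")

# BaseSettings reads the environment (and .env) case-insensitively, so
# `API_PORT=7860` -- as the Dockerfile exports -- overrides the 8000 default.
```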
docker-compose.yml
ADDED
@@ -0,0 +1,68 @@
# GraphLLM Docker Compose Configuration
# Simple standalone deployment with persistent storage

version: '3.8'

services:
  # Main GraphLLM Application
  graphllm:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: graphllm
    image: graphllm:latest
    ports:
      - "8000:8000"
    volumes:
      # Persistent storage for data, uploads, and logs
      - graphllm-data:/app/data
      - graphllm-uploads:/app/uploads
      - graphllm-logs:/app/logs
      - graphllm-cache:/app/cache
    environment:
      # Gemini API Configuration
      - GEMINI_API_KEY=${GEMINI_API_KEY}
      - GEMINI_MODEL=${GEMINI_MODEL:-gemini-1.5-flash}

      # Application Settings
      - ENVIRONMENT=${ENVIRONMENT:-production}
      - LOG_LEVEL=${LOG_LEVEL:-INFO}
      - DEBUG=false

      # LLM Settings
      - LLM_TEMPERATURE=${LLM_TEMPERATURE:-0.7}
      - LLM_MAX_TOKENS=${LLM_MAX_TOKENS:-2048}

      # Embedding Settings
      - EMBEDDING_MODEL=${EMBEDDING_MODEL:-all-MiniLM-L6-v2}
      - EMBEDDING_BATCH_SIZE=${EMBEDDING_BATCH_SIZE:-128}

      # API Settings
      - API_HOST=0.0.0.0
      - API_PORT=8000
      - MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}

    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    networks:
      - graphllm-network

volumes:
  # Named volumes for persistent storage
  graphllm-data:
    driver: local
  graphllm-uploads:
    driver: local
  graphllm-logs:
    driver: local
  graphllm-cache:
    driver: local

networks:
  graphllm-network:
    driver: bridge
embedding_service.py
ADDED
@@ -0,0 +1,266 @@
"""
Embedding & Vector Index Service
Handles embedding generation and FAISS vector store management
"""
# Import SentenceTransformer lazily to avoid hanging on startup
import faiss
import numpy as np
from typing import List, Dict, Any, Tuple, Optional
from loguru import logger
import pickle
import os
from models import Chunk, EmbeddingEntry
from config import settings
import json


class EmbeddingService:
    """
    Service for creating embeddings and managing FAISS vector index
    Uses lazy loading for the embedding model (loads on first use)
    """

    def __init__(self):
        logger.info("EmbeddingService initialized (model will load on first use)")
        self._model = None  # Lazy-loaded
        self.dimension = settings.embedding_dimension
        self.index: Optional[faiss.Index] = None
        self.chunk_metadata: Dict[int, Dict[str, Any]] = {}  # index_id -> metadata
        self._initialize_index()

    @property
    def model(self):
        """Lazy-load the embedding model on first access"""
        if self._model is None:
            logger.info(f"Loading embedding model: {settings.embedding_model}")
            # Import only when needed to avoid hanging on startup
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(settings.embedding_model)
            logger.info("✓ Embedding model loaded successfully")
        return self._model

    def _initialize_index(self):
        """Initialize or load FAISS index"""
        index_path = os.path.join(settings.faiss_index_path, "index.faiss")
        metadata_path = os.path.join(settings.faiss_index_path, "metadata.pkl")

        if os.path.exists(index_path) and os.path.exists(metadata_path):
            logger.info("Loading existing FAISS index")
            self.index = faiss.read_index(index_path)
            with open(metadata_path, 'rb') as f:
                self.chunk_metadata = pickle.load(f)
            logger.info(f"Loaded index with {self.index.ntotal} vectors")
        else:
            logger.info("Creating new FAISS index (optimized)")
            # Use HNSW for better performance on larger datasets
            # HNSW is ~10x faster than flat index with 99%+ accuracy
            # M=32 is good balance (higher M = more accurate but slower)
            self.index = faiss.IndexHNSWFlat(self.dimension, 32)
            # Set ef construction (higher = better quality, slower build)
            self.index.hnsw.efConstruction = 40
            # Set ef search (higher = better recall, slower search)
            self.index.hnsw.efSearch = 16
            self.chunk_metadata = {}
            logger.info("Using HNSW index for faster approximate search")

    def create_embeddings(self, chunks: List[Chunk]) -> List[EmbeddingEntry]:
        """
        ⚡ OPTIMIZED: Create embeddings with larger batches and parallel processing

        Args:
            chunks: List of Chunk objects

        Returns:
            List of EmbeddingEntry objects
        """
        texts = [chunk.text for chunk in chunks]
        logger.info(f"⚡ Creating embeddings for {len(texts)} chunks (batch_size={settings.embedding_batch_size})")

        import time
        start = time.time()

        # Batch encode with optimized settings
        embeddings = self.model.encode(
            texts,
            batch_size=settings.embedding_batch_size,
            show_progress_bar=False,  # Disable for less overhead
            convert_to_numpy=True,
            normalize_embeddings=True  # Built-in normalization is faster
        )

        elapsed = time.time() - start
        logger.info(f"✓ Created {len(embeddings)} embeddings in {elapsed:.2f}s ({len(embeddings)/elapsed:.1f} chunks/sec)")

        # Create embedding entries
        embedding_entries = []
        for chunk, embedding in zip(chunks, embeddings):
            entry = EmbeddingEntry(
                chunk_id=chunk.chunk_id,
                embedding=embedding.tolist(),
                metadata={
                    "pdf_id": chunk.pdf_id,
                    "page_number": chunk.page_number,
                    "type": chunk.type.value,
                    "char_range": chunk.char_range
                }
            )
            embedding_entries.append(entry)

        return embedding_entries

    def add_to_index(self, chunks: List[Chunk], embeddings: List[EmbeddingEntry]):
        """
        Add chunks and their embeddings to FAISS index

        Args:
            chunks: List of chunks
            embeddings: Corresponding embeddings
        """
        if len(chunks) != len(embeddings):
            raise ValueError("Chunks and embeddings must have same length")

        # Convert embeddings to numpy array
        embedding_array = np.array([e.embedding for e in embeddings]).astype('float32')

        # Get current index size (starting ID for new chunks)
        start_id = self.index.ntotal

        # Add to FAISS index
        self.index.add(embedding_array)

        # Store metadata mapping
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            idx = start_id + i
            self.chunk_metadata[idx] = {
                "chunk_id": chunk.chunk_id,
                "pdf_id": chunk.pdf_id,
                "page_number": chunk.page_number,
                "type": chunk.type.value,
                "text": chunk.text,
                "char_range": chunk.char_range,
                "metadata": chunk.metadata
            }

        logger.info(f"Added {len(chunks)} chunks to index. Total: {self.index.ntotal}")

    def search(
        self,
        query: str,
        top_k: int = 10,
        filter_pdf_id: Optional[str] = None
    ) -> List[Tuple[Dict[str, Any], float]]:
        """
        Search for similar chunks

        Args:
            query: Query string
            top_k: Number of results to return
            filter_pdf_id: Optional PDF ID to filter results

        Returns:
            List of (chunk_metadata, score) tuples
        """
        # Encode and normalize query
        query_embedding = self.model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_embedding)

        # Search
        # Fetch more if we need to filter
        k = top_k * 10 if filter_pdf_id else top_k
        scores, indices = self.index.search(query_embedding, k)

        # Retrieve metadata
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx == -1:  # FAISS returns -1 for empty results
                continue

            metadata = self.chunk_metadata.get(idx)
            if metadata is None:
                continue

            # Apply filter if specified
            if filter_pdf_id and metadata.get("pdf_id") != filter_pdf_id:
                continue

            results.append((metadata, float(score)))

            if len(results) >= top_k:
                break

        return results

    def search_by_chunk_ids(self, chunk_ids: List[str], top_k: int = 5) -> List[Tuple[Dict[str, Any], float]]:
        """
        Find similar chunks to a set of chunk IDs (for node expansion)

        Args:
            chunk_ids: List of chunk IDs
            top_k: Number of similar chunks per input chunk

        Returns:
            List of (chunk_metadata, score) tuples
        """
        # Find the chunks in metadata
        chunk_indices = []
        for idx, meta in self.chunk_metadata.items():
            if meta["chunk_id"] in chunk_ids:
                chunk_indices.append(idx)

        if not chunk_indices:
            return []

        # Get embeddings for these chunks
        # Note: FAISS doesn't have a direct "get vector" API for IndexFlatIP
        # We'll search from the index using reconstruct (if supported)
        results = []
        for idx in chunk_indices:
            # Reconstruct vector (works for Flat indices)
            try:
                vector = self.index.reconstruct(idx)
                vector = vector.reshape(1, -1)
                scores, indices = self.index.search(vector, top_k + 1)  # +1 to exclude self

                for score, res_idx in zip(scores[0], indices[0]):
                    if res_idx == idx:  # Skip self
                        continue
                    if res_idx == -1:
                        continue

                    metadata = self.chunk_metadata.get(res_idx)
                    if metadata:
                        results.append((metadata, float(score)))
            except Exception as e:
                logger.warning(f"Could not reconstruct vector for index {idx}: {e}")

        # Sort by score and return top
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_k]

    def save(self):
        """Save FAISS index and metadata to disk"""
        os.makedirs(settings.faiss_index_path, exist_ok=True)

        index_path = os.path.join(settings.faiss_index_path, "index.faiss")
        metadata_path = os.path.join(settings.faiss_index_path, "metadata.pkl")

        faiss.write_index(self.index, index_path)
        with open(metadata_path, 'wb') as f:
            pickle.dump(self.chunk_metadata, f)

        logger.info(f"Saved FAISS index with {self.index.ntotal} vectors")

    def clear(self):
        """Clear the index and metadata"""
        self.index = faiss.IndexFlatIP(self.dimension)
        self.chunk_metadata = {}
        logger.info("Cleared FAISS index")

    def get_stats(self) -> Dict[str, Any]:
        """Get index statistics"""
        return {
            "total_vectors": self.index.ntotal,
            "dimension": self.dimension,
            "index_type": type(self.index).__name__,
            "num_chunks": len(self.chunk_metadata)
        }
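A hedged usage sketch for `EmbeddingService`. The `Chunk` and `EmbeddingEntry` schemas live in `models.py` (listed in this upload but not excerpted), so the constructor arguments below are inferred from the fields this module reads and should be treated as assumptions.

```python
# Illustrative only; Chunk's exact signature is defined in models.py.
from embedding_service import EmbeddingService
from models import Chunk

service = EmbeddingService()  # index is created or loaded from disk here
chunks = [
    Chunk(chunk_id="c1", pdf_id="doc-1", page_number=1,
          text="FAISS builds approximate nearest-neighbour indexes."),
]
entries = service.create_embeddings(chunks)  # SentenceTransformer loads lazily here
service.add_to_index(chunks, entries)
service.save()                               # writes index.faiss and metadata.pkl

for meta, score in service.search("vector index", top_k=3, filter_pdf_id="doc-1"):
    print(f'{score:.3f}  p.{meta["page_number"]}  {meta["chunk_id"]}')
```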
frontend/app.js
ADDED
@@ -0,0 +1,539 @@
/**
 * GraphLLM Frontend JavaScript
 * Handles user interactions, API calls, and dynamic UI updates
 */

// ========== Global State ==========
let currentPdfId = null;
let graphData = { nodes: [], edges: [] };
let selectedNodeId = null;

// ========== API Configuration ==========
const API_BASE = window.location.origin;

// ========== Processing Overlay Functions ==========
function showProcessingOverlay(title = 'Processing PDF', message = 'Starting...', percent = 0) {
    const overlay = document.getElementById('processing-overlay');
    const titleEl = document.getElementById('processing-title');
    const messageEl = document.getElementById('processing-message');
    const percentEl = document.getElementById('processing-percent');
    const progressFill = document.getElementById('progress-fill');

    titleEl.textContent = title;
    messageEl.textContent = message;
    percentEl.textContent = `${percent}%`;
    progressFill.style.width = `${percent}%`;

    overlay.hidden = false;
}

function updateProcessingOverlay(message, percent) {
    const messageEl = document.getElementById('processing-message');
    const percentEl = document.getElementById('processing-percent');
    const progressFill = document.getElementById('progress-fill');

    messageEl.textContent = message;
    percentEl.textContent = `${percent}%`;
    progressFill.style.width = `${percent}%`;
}

function hideProcessingOverlay() {
    const overlay = document.getElementById('processing-overlay');
    overlay.hidden = true;
}

// ========== Utility Functions ==========
async function apiCall(endpoint, options = {}) {
    try {
        const response = await fetch(`${API_BASE}${endpoint}`, options);
        if (!response.ok) {
            throw new Error(`API Error: ${response.statusText}`);
        }
        return await response.json();
    } catch (error) {
        console.error('API call failed:', error);
        showNotification(error.message, 'error');
        throw error;
    }
}

function showNotification(message, type = 'info') {
    const statusEl = document.getElementById('upload-status');
    statusEl.textContent = message;
    statusEl.style.color = type === 'error' ? '#f44336' : type === 'success' ? '#4caf50' : '#4f9eff';

    setTimeout(() => {
        statusEl.textContent = '';
    }, 5000);
}

// ========== PDF Upload ==========
document.getElementById('pdf-upload').addEventListener('change', async (e) => {
    const file = e.target.files[0];
    if (!file) return;

    // Show overlay immediately
    showProcessingOverlay('Uploading PDF', `Uploading ${file.name}...`, 0);

    const formData = new FormData();
    formData.append('file', file);

    try {
        const result = await apiCall('/upload', {
            method: 'POST',
            body: formData
        });

        currentPdfId = result.pdf_id;
        updateProcessingOverlay('Upload complete, starting processing...', 5);

        // Poll for completion
        pollProcessingStatus(result.pdf_id);

    } catch (error) {
        hideProcessingOverlay();
        showNotification('Upload failed', 'error');
    }
});

async function pollProcessingStatus(pdfId) {
    // 5-minute safety timeout; cleared when processing completes or fails
    let timeoutId = null;

    const interval = setInterval(async () => {
        try {
            // Fetch detailed status for this PDF
            const status = await apiCall(`/status/${pdfId}`);

            // Update overlay with progress
            if (status.progress) {
                const { message, percent } = status.progress;
                updateProcessingOverlay(message, percent);
            }

            // Check if processing is complete
            if (status.status === 'completed') {
                clearInterval(interval);
                clearTimeout(timeoutId);

                // Show completion message briefly
                updateProcessingOverlay(
                    `✓ Complete! ${status.num_nodes} nodes, ${status.num_edges} edges`,
                    100
                );

                // Load graph and hide overlay
                setTimeout(async () => {
                    hideProcessingOverlay();
                    await loadGraph();
                    await updateStats();
                    showNotification(`✓ Graph loaded: ${status.num_nodes} nodes, ${status.num_edges} edges`, 'success');
                }, 1500); // Show completion for 1.5s

            } else if (status.status === 'failed') {
                clearInterval(interval);
                clearTimeout(timeoutId);
                hideProcessingOverlay();
                showNotification(`Error: ${status.error}`, 'error');
            }
        } catch (error) {
            clearInterval(interval);
            clearTimeout(timeoutId);
            hideProcessingOverlay();
            showNotification('Failed to check status', 'error');
        }
    }, 1000); // Poll every 1 second for responsive updates

    // Stop polling after 5 minutes
    timeoutId = setTimeout(() => {
        clearInterval(interval);
        hideProcessingOverlay();
        showNotification('Processing timeout', 'error');
    }, 300000);
}

// ========== Graph Loading ==========
let network = null;

async function loadGraph() {
    try {
        const data = await apiCall('/graph');
        graphData = data;

        // Render interactive graph visualization
        renderGraph(data);

    } catch (error) {
        console.error('Failed to load graph:', error);
    }
}

function renderGraph(data) {
    const container = document.getElementById('graph-container');

    // Clear any existing content
    container.innerHTML = '';

    console.log(`Rendering graph: ${data.nodes.length} nodes, ${data.edges.length} edges`);

    // Get actual container dimensions
    const rect = container.getBoundingClientRect();
    const containerHeight = rect.height || 600; // Fallback to 600px
    const containerWidth = rect.width || 800;   // Fallback to 800px

    // Set explicit container styles to prevent overflow
    container.style.position = 'relative';
    container.style.width = containerWidth + 'px';
    container.style.height = containerHeight + 'px';
    container.style.overflow = 'hidden';

    // Prepare nodes for vis.js
    const visNodes = data.nodes.map(node => ({
        id: node.node_id,
        label: node.label,
        title: `${node.label}\nType: ${node.type}\nImportance: ${node.importance_score.toFixed(2)}`,
        value: node.importance_score * 20, // Size based on importance
        group: node.type,
        font: { color: '#e6eef8' }
    }));

    // Prepare edges for vis.js (thin, bright green, no arrows - undirected graph)
    const visEdges = data.edges.map(edge => ({
        from: edge.from || edge.from_node, // Handle both alias and field name
        to: edge.to || edge.to_node,       // Handle both alias and field name
        label: edge.relation,
        title: `${edge.relation} (${edge.confidence.toFixed(2)})`,
        width: 1.5, // Thin edges
        // No arrows for undirected graph
        color: {
            color: '#00ff00',     // Bright neon green (most visible)
            highlight: '#ff00ff', // Neon magenta when highlighted
            hover: '#ffff00',     // Yellow on hover
            opacity: 1.0          // Full opacity
        },
        font: {
            size: 12,
            color: '#ffffff',
            strokeWidth: 3,
            strokeColor: '#000000',
            background: 'rgba(0, 0, 0, 0.8)',
            bold: true
        }
    }));

    // Create vis.js network
    const graphData = {
        nodes: new vis.DataSet(visNodes),
        edges: new vis.DataSet(visEdges)
    };

    const options = {
        nodes: {
            shape: 'dot',
            scaling: {
                min: 10,
                max: 30
            },
            font: {
                size: 12,
                face: 'Arial',
                color: '#e6eef8'
            },
            borderWidth: 2,
            shadow: true
        },
        edges: {
            width: 1.5, // Thin edges
            color: {
                color: '#00ff00',     // Bright neon green (most visible against dark bg)
                highlight: '#ff00ff', // Neon magenta when highlighted
                hover: '#ffff00',     // Yellow on hover
                opacity: 1.0          // Full opacity
            },
            arrows: {
                to: { enabled: false } // No arrows - undirected graph
            },
            smooth: {
                type: 'continuous',
                roundness: 0.2 // Less curved = more visible
            },
            font: {
                size: 12,                        // Moderate text size
                color: '#ffffff',                // White text
                strokeWidth: 3,                  // Moderate outline
                strokeColor: '#000000',          // Black outline for readability
                align: 'top',                    // Position above edge
                bold: true,
                background: 'rgba(0, 0, 0, 0.8)' // Dark background for label
            },
            selectionWidth: 3, // Moderately thicker when selected
            hoverWidth: 2.5,   // Slightly thicker on hover
            shadow: {
                enabled: true,
                color: 'rgba(0, 255, 0, 0.5)', // Green glow
                size: 5,
                x: 0,
                y: 0
            }
        },
        groups: {
            concept: { color: { background: '#4f9eff', border: '#3d8ae6' } },
            function: { color: { background: '#9c27b0', border: '#7b1fa2' } },
            class: { color: { background: '#ff5722', border: '#e64a19' } },
            term: { color: { background: '#4caf50', border: '#388e3c' } },
            person: { color: { background: '#ff9800', border: '#f57c00' } },
            method: { color: { background: '#00bcd4', border: '#0097a7' } },
            entity: { color: { background: '#607d8b', border: '#455a64' } }
        },
        physics: {
            stabilization: { iterations: 200 },
            barnesHut: {
                gravitationalConstant: -8000,
                springConstant: 0.04,
                springLength: 95
            }
        },
        interaction: {
            hover: true,
            navigationButtons: true,
            keyboard: true
        },
        autoResize: false, // Disable auto-resize to prevent infinite stretching
        height: containerHeight + 'px',
        width: containerWidth + 'px'
    };

    // Create network
    network = new vis.Network(container, graphData, options);

    // Prevent any further resize attempts
    if (network) {
        network.setOptions({ autoResize: false });
    }

    // Add click handler for nodes
    network.on('click', function(params) {
        if (params.nodes.length > 0) {
            const nodeId = params.nodes[0];
            selectNode(nodeId);
        }
    });
}

// ========== Node Selection ==========
window.selectNode = async function(nodeId) {
    selectedNodeId = nodeId;

    try {
        const nodeData = await apiCall(`/node/${nodeId}`);
        displayNodeDetails(nodeData);
    } catch (error) {
        console.error('Failed to load node details:', error);
    }
}

function displayNodeDetails(nodeData) {
    const content = document.getElementById('node-content');

    const sourcesHtml = nodeData.sources.map((source, i) => `
        <li>p.${source.page_number} - "${source.snippet}" <span style="color: #8b92a0;">(${source.chunk_id})</span></li>
    `).join('');

    const relatedHtml = nodeData.related_nodes.map(related => `
        <li onclick="selectNode('${related.node_id}')" style="cursor: pointer; padding: 0.5rem; background: #23262e; border-radius: 6px; margin-bottom: 0.25rem;">
            <strong>${related.label}</strong> - ${related.relation} (confidence: ${related.confidence.toFixed(2)})
        </li>
    `).join('');

    content.innerHTML = `
        <div class="node-info">
            <h3 class="node-label">${nodeData.label}</h3>
            <span class="badge">${nodeData.type}</span>

            <div class="node-summary">
                <h4>Summary</h4>
                <p>${nodeData.summary}</p>
            </div>

            <div class="node-sources">
                <h4>Sources</h4>
                <button class="expand-toggle" onclick="toggleSources()">Show Sources</button>
                <ul class="sources-list" id="sources-list" hidden>
                    ${sourcesHtml}
                </ul>
            </div>

            ${nodeData.related_nodes.length > 0 ? `
            <div class="related-nodes">
                <h4>Related Nodes</h4>
                <ul class="related-list">
                    ${relatedHtml}
                </ul>
            </div>
            ` : ''}
        </div>
    `;
}

window.toggleSources = function() {
    const sourcesList = document.getElementById('sources-list');
    const toggle = document.querySelector('.expand-toggle');

    if (sourcesList.hidden) {
        sourcesList.hidden = false;
        toggle.textContent = 'Hide Sources';
    } else {
        sourcesList.hidden = true;
        toggle.textContent = 'Show Sources';
    }
}

document.getElementById('close-node-detail').addEventListener('click', () => {
    document.getElementById('node-content').innerHTML = '<p class="placeholder-text">Click a node in the graph to view details</p>';
    selectedNodeId = null;
});

// ========== Chat ==========
document.getElementById('send-btn').addEventListener('click', sendMessage);
document.getElementById('chat-input').addEventListener('keydown', (e) => {
    if (e.key === 'Enter' && !e.shiftKey) {
        e.preventDefault();
        sendMessage();
    }
});

async function sendMessage() {
    const input = document.getElementById('chat-input');
    const query = input.value.trim();

    if (!query) return;
    if (!currentPdfId) {
        showNotification('Please upload a PDF first', 'error');
        return;
    }

    // Add user message to chat
    addMessageToChat('user', query);
    input.value = '';

    try {
        const includeCitations = document.getElementById('include-citations').checked;

        const response = await apiCall('/chat', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                query,
                pdf_id: currentPdfId,
                include_citations: includeCitations,
                max_sources: 5
            })
        });

        // Add assistant response
        addMessageToChat('assistant', response.answer, response.sources);

    } catch (error) {
        addMessageToChat('assistant', 'Sorry, I encountered an error processing your question.');
    }
}

function addMessageToChat(role, content, sources = []) {
    const messagesContainer = document.getElementById('chat-messages');

    const messageDiv = document.createElement('div');
    messageDiv.className = `message ${role}`;

    let html = `<p>${content}</p>`;

    if (sources && sources.length > 0) {
        html += '<div style="margin-top: 0.5rem; padding-top: 0.5rem; border-top: 1px solid rgba(255,255,255,0.1);">';
        html += '<strong style="font-size: 0.875rem;">Sources:</strong><ul style="margin-top: 0.25rem; font-size: 0.875rem;">';
        sources.forEach(source => {
            html += `<li>p.${source.page_number}: "${source.snippet}"</li>`;
        });
        html += '</ul></div>';
    }

    messageDiv.innerHTML = html;
    messagesContainer.appendChild(messageDiv);

    // Scroll to bottom
    messagesContainer.scrollTop = messagesContainer.scrollHeight;
}

// ========== Stats Update ==========
async function updateStats() {
    try {
        const status = await apiCall('/admin/status');

        document.getElementById('stats-nodes').textContent = `Nodes: ${status.total_nodes}`;
        document.getElementById('stats-edges').textContent = `Edges: ${status.total_edges}`;
        document.getElementById('stats-chunks').textContent = `Chunks: ${status.total_chunks}`;
    } catch (error) {
        console.error('Failed to update stats:', error);
    }
}

// ========== Admin Controls ==========
document.getElementById('reindex-btn').addEventListener('click', async () => {
    if (!currentPdfId) {
        showNotification('No PDF to reindex', 'error');
        return;
    }

    if (!confirm('Reindex current PDF? This will take some time.')) return;

    try {
        // Show overlay for reindexing
        showProcessingOverlay('Reindexing PDF', 'Starting reindex...', 0);

        await apiCall(`/admin/reindex?pdf_id=${currentPdfId}`, { method: 'POST' });

        // Poll for completion
        pollProcessingStatus(currentPdfId);
    } catch (error) {
        hideProcessingOverlay();
        showNotification('Reindex failed', 'error');
    }
});

document.getElementById('clear-btn').addEventListener('click', async () => {
    if (!confirm('Clear all data? This cannot be undone!')) return;

    try {
        await apiCall('/admin/clear', { method: 'POST' });
        showNotification('All data cleared', 'success');

        // Reset UI
        currentPdfId = null;
        graphData = { nodes: [], edges: [] };
        document.getElementById('graph-container').innerHTML = '<div class="graph-placeholder"><p>Upload a PDF to generate a knowledge graph</p></div>';
        document.getElementById('node-content').innerHTML = '<p class="placeholder-text">Click a node in the graph to view details</p>';
        document.getElementById('chat-messages').innerHTML = '<div class="message system"><p>Ask questions about your uploaded PDF. Answers will cite page numbers.</p></div>';
        await updateStats();
    } catch (error) {
        showNotification('Clear failed', 'error');
    }
});

// ========== Graph Controls ==========
document.getElementById('zoom-in-btn').addEventListener('click', () => {
    if (network) {
        const scale = network.getScale();
        network.moveTo({ scale: scale * 1.2 });
    }
});

document.getElementById('zoom-out-btn').addEventListener('click', () => {
    if (network) {
        const scale = network.getScale();
        network.moveTo({ scale: scale * 0.8 });
    }
});

document.getElementById('reset-view-btn').addEventListener('click', () => {
    if (network) {
        network.fit();
    }
});

// ========== Initialization ==========
document.addEventListener('DOMContentLoaded', () => {
    updateStats();
    console.log('GraphLLM Frontend Initialized');
});
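For reference, the response shape that `pollProcessingStatus()` above expects from `GET /status/{pdf_id}`. The field names are read directly off the JavaScript; the Pydantic model itself is illustrative, not copied from `models.py`.

```python
# Status payload as implied by the frontend poller; illustrative, not from models.py.
from typing import Optional
from pydantic import BaseModel


class Progress(BaseModel):
    message: str   # shown in the overlay
    percent: int   # drives the progress bar


class ProcessingStatus(BaseModel):
    status: str                       # "completed" or "failed" end the polling loop
    progress: Optional[Progress] = None
    num_nodes: Optional[int] = None   # read when status == "completed"
    num_edges: Optional[int] = None
    error: Optional[str] = None       # read when status == "failed"
```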
frontend/index.html
ADDED
@@ -0,0 +1,176 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>GraphLLM - PDF Knowledge Graph & RAG</title>
    <link rel="stylesheet" href="/static/styles.css">
</head>
<body>
    <!-- Header -->
    <header class="app-header">
        <div class="header-content">
            <h1 class="app-title">GraphLLM</h1>
            <p class="app-subtitle">PDF Knowledge Graph & RAG System</p>
        </div>

        <div class="header-controls">
            <div class="upload-section">
                <input type="file" id="pdf-upload" accept=".pdf" hidden>
                <button id="upload-btn" class="btn btn-primary" onclick="document.getElementById('pdf-upload').click()">
                    Upload PDF
                </button>
                <span id="upload-status" class="status-text"></span>
            </div>

            <button id="reindex-btn" class="btn btn-secondary">Reindex</button>
            <button id="clear-btn" class="btn btn-danger">Clear All</button>
        </div>
    </header>

    <!-- Main Content Area -->
    <main class="main-container">
        <!-- Left Pane: Graph Visualization -->
        <aside id="graph-pane" class="graph-pane" role="region" aria-label="Knowledge Graph Visualization">
            <div class="pane-header">
                <h2>Knowledge Graph</h2>
                <div class="graph-controls">
                    <button id="zoom-in-btn" class="icon-btn" aria-label="Zoom In">+</button>
                    <button id="zoom-out-btn" class="icon-btn" aria-label="Zoom Out">-</button>
                    <button id="reset-view-btn" class="icon-btn" aria-label="Reset View">⟲</button>
                </div>
            </div>

            <div id="graph-container" class="graph-container" role="img" aria-label="Interactive knowledge graph">
                <!-- Graph visualization will be rendered here via JavaScript -->
                <div class="graph-placeholder">
                    <p>Upload a PDF to generate a knowledge graph</p>
                    <p class="help-text">Graph visualization requires JavaScript for interactivity</p>
                </div>
            </div>

            <div class="graph-legend">
                <h3>Legend</h3>
                <div class="legend-items">
                    <div class="legend-item">
                        <span class="legend-color concept"></span>
                        <span>Concept</span>
                    </div>
                    <div class="legend-item">
                        <span class="legend-color function"></span>
                        <span>Function/Method</span>
                    </div>
                    <div class="legend-item">
                        <span class="legend-color class"></span>
                        <span>Class/Type</span>
                    </div>
                    <div class="legend-item">
                        <span class="legend-color term"></span>
                        <span>Term/Definition</span>
                    </div>
                </div>
            </div>
        </aside>

        <!-- Right Pane: Node Details & Chat -->
        <section id="detail-pane" class="detail-pane">
            <!-- Node Detail Section -->
            <div id="node-detail" class="node-detail card">
                <div class="card-header">
                    <h2>Node Details</h2>
                    <button id="close-node-detail" class="icon-btn" aria-label="Close">✕</button>
                </div>

                <div id="node-content" class="node-content">
                    <p class="placeholder-text">Click a node in the graph to view details</p>
                </div>

                <!-- Node detail will be populated dynamically:
                <div class="node-info">
                    <h3 class="node-label">[Node Label]</h3>
                    <span class="node-type badge">[Type]</span>
                    <span class="node-importance">Importance: [Score]</span>

                    <div class="node-summary">
                        <h4>Summary</h4>
                        <p>[AI-generated summary with (p. N) citations]</p>
                    </div>

                    <div class="node-sources">
                        <h4>Sources</h4>
                        <button class="expand-toggle">Show Sources</button>
                        <ul class="sources-list" hidden>
                            <li>p.12 - "Exact snippet..." (chunk_id)</li>
                        </ul>
                    </div>

                    <div class="related-nodes">
                        <h4>Related Nodes</h4>
                        <ul class="related-list">
                            <li>[Node] - [relation] - [confidence]</li>
                        </ul>
                    </div>
                </div>
                -->
            </div>

            <!-- Chat Section -->
            <div id="chat" class="chat-section card">
                <div class="card-header">
                    <h2>Chat with Document</h2>
                    <label class="checkbox-label">
                        <input type="checkbox" id="include-citations" checked>
                        <span>Include Citations</span>
                    </label>
                </div>

                <div id="chat-messages" class="chat-messages" role="log" aria-live="polite">
                    <div class="message system">
                        <p>Ask questions about your uploaded PDF. Answers will cite page numbers.</p>
                    </div>
                </div>

                <div class="chat-input-area">
                    <textarea
                        id="chat-input"
                        class="chat-input"
                        placeholder="Ask a question about the document..."
                        rows="3"
                        aria-label="Chat input"
                    ></textarea>
                    <button id="send-btn" class="btn btn-primary" aria-label="Send message">Send</button>
                </div>
            </div>
        </section>
    </main>

    <!-- Footer -->
    <footer class="app-footer">
        <div class="footer-content">
            <div class="stats">
                <span id="stats-nodes">Nodes: 0</span>
                <span id="stats-edges">Edges: 0</span>
                <span id="stats-chunks">Chunks: 0</span>
            </div>
            <p class="footer-text">GraphLLM v1.0 | Powered by Gemini & Mistral</p>
        </div>
    </footer>

    <!-- Processing Overlay -->
    <div id="processing-overlay" class="processing-overlay" hidden>
        <div class="processing-modal">
            <div class="spinner"></div>
            <h2 id="processing-title">Processing PDF</h2>
            <p id="processing-message">Starting...</p>
            <div class="progress-bar">
                <div id="progress-fill" class="progress-fill"></div>
            </div>
            <p id="processing-percent" class="processing-percent">0%</p>
        </div>
    </div>

    <!-- JavaScript Libraries -->
    <script type="text/javascript" src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
    <script src="/static/app.js"></script>
</body>
</html>
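Both assets above are referenced under /static/, and the root URL has to return this page, so the backend needs a static mount for the frontend/ directory. A minimal sketch of that wiring, assuming main.py serves the UI roughly this way (the repo's actual setup may differ):

from fastapi import FastAPI
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles

app = FastAPI()

# Expose frontend/styles.css and frontend/app.js as /static/styles.css and /static/app.js
app.mount("/static", StaticFiles(directory="frontend"), name="static")

@app.get("/")
async def index():
    # Serve the single-page UI
    return FileResponse("frontend/index.html")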
frontend/styles.css
ADDED
@@ -0,0 +1,800 @@
/* GraphLLM Stylesheet - Dark Sleek Theme */

/* ========== CSS Reset & Base Styles ========== */
* {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
}

html, body {
    height: 100%;
    width: 100%;
    overflow-x: hidden;
}

:root {
    /* Color Palette */
    --bg-primary: #0f1115;
    --bg-secondary: #12151a;
    --bg-card: #1a1d24;
    --bg-hover: #23262e;

    --text-primary: #e6eef8;
    --text-secondary: #cfd8e3;
    --text-muted: #8b92a0;

    --accent-primary: #4f9eff;
    --accent-hover: #3d8ae6;
    --accent-glow: rgba(79, 158, 255, 0.3);

    --success: #4caf50;
    --warning: #ff9800;
    --danger: #f44336;

    --border-color: #2a2f3a;
    --shadow-sm: 0 2px 4px rgba(0, 0, 0, 0.3);
    --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.4);
    --shadow-lg: 0 8px 24px rgba(0, 0, 0, 0.5);

    /* Graph Node Colors */
    --node-concept: #4f9eff;
    --node-function: #9c27b0;
    --node-class: #ff5722;
    --node-term: #4caf50;

    /* Spacing */
    --spacing-xs: 0.25rem;
    --spacing-sm: 0.5rem;
    --spacing-md: 1rem;
    --spacing-lg: 1.5rem;
    --spacing-xl: 2rem;

    /* Border Radius */
    --radius-sm: 6px;
    --radius-md: 12px;
    --radius-lg: 16px;

    /* Transitions */
    --transition-fast: 0.15s ease;
    --transition-normal: 0.3s ease;
}

/* ========== Typography ========== */
body {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
        'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif;
    font-size: 16px;
    line-height: 1.6;
    color: var(--text-primary);
    background-color: var(--bg-primary);
    overflow-x: hidden;
}

h1, h2, h3, h4, h5, h6 {
    font-weight: 600;
    line-height: 1.2;
    margin-bottom: var(--spacing-md);
    color: var(--text-primary);
}

h1 { font-size: 2rem; }
h2 { font-size: 1.5rem; }
h3 { font-size: 1.25rem; }
h4 { font-size: 1.1rem; }

code, pre {
    font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
    background-color: var(--bg-secondary);
    padding: 0.2em 0.4em;
    border-radius: var(--radius-sm);
}

/* ========== Layout ========== */
body {
    display: flex;
    flex-direction: column;
    min-height: 100vh;
}

/* Header */
.app-header {
    background-color: var(--bg-secondary);
    border-bottom: 1px solid var(--border-color);
    padding: var(--spacing-lg) var(--spacing-xl);
    display: flex;
    justify-content: space-between;
    align-items: center;
    box-shadow: var(--shadow-sm);
}

.header-content {
    flex: 1;
}

.app-title {
    margin: 0;
    font-size: 1.75rem;
    background: linear-gradient(135deg, var(--accent-primary), #9c27b0);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
}

.app-subtitle {
    font-size: 0.875rem;
    color: var(--text-muted);
    margin: 0;
}

.header-controls {
    display: flex;
    gap: var(--spacing-md);
    align-items: center;
}

.upload-section {
    display: flex;
    gap: var(--spacing-sm);
    align-items: center;
}

.status-text {
    font-size: 0.875rem;
    color: var(--text-secondary);
}

/* Main Container */
.main-container {
    flex: 1;
    display: grid;
    grid-template-columns: 65% 35%;
    gap: var(--spacing-lg);
    padding: var(--spacing-lg);
    overflow: hidden;
    height: calc(100vh - 180px); /* Account for header and footer */
    max-height: calc(100vh - 180px);
}

/* ========== Graph Pane (Left) ========== */
.graph-pane {
    background-color: var(--bg-card);
    border-radius: var(--radius-md);
    padding: var(--spacing-lg);
    display: flex;
    flex-direction: column;
    box-shadow: var(--shadow-md);
    overflow: hidden;
    height: 100%;
    max-height: 100%;
}

.pane-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: var(--spacing-md);
}

.pane-header h2 {
    margin: 0;
    font-size: 1.25rem;
}

.graph-controls {
    display: flex;
    gap: var(--spacing-sm);
}

.graph-container {
    flex: 1;
    background-color: var(--bg-secondary);
    border-radius: var(--radius-sm);
    position: relative;
    overflow: hidden;
    border: 1px solid var(--border-color);
    min-height: 500px;
    height: 100%;
    width: 100%;
}

.graph-placeholder {
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    height: 100%;
    color: var(--text-muted);
    text-align: center;
    padding: var(--spacing-xl);
}

.graph-placeholder p {
    margin: var(--spacing-sm) 0;
}

.help-text {
    font-size: 0.875rem;
    color: var(--text-muted);
    opacity: 0.7;
}

/* Graph Legend */
.graph-legend {
    margin-top: var(--spacing-md);
    padding: var(--spacing-md);
    background-color: var(--bg-secondary);
    border-radius: var(--radius-sm);
}

.graph-legend h3 {
    font-size: 0.875rem;
    margin-bottom: var(--spacing-sm);
    color: var(--text-secondary);
}

.legend-items {
    display: grid;
    grid-template-columns: repeat(2, 1fr);
    gap: var(--spacing-sm);
}

.legend-item {
    display: flex;
    align-items: center;
    gap: var(--spacing-sm);
    font-size: 0.875rem;
}

.legend-color {
    width: 16px;
    height: 16px;
    border-radius: 50%;
    border: 2px solid currentColor;
}

.legend-color.concept { color: var(--node-concept); }
.legend-color.function { color: var(--node-function); }
.legend-color.class { color: var(--node-class); }
.legend-color.term { color: var(--node-term); }

/* ========== Detail Pane (Right) ========== */
.detail-pane {
    display: flex;
    flex-direction: column;
    gap: var(--spacing-lg);
    overflow-y: auto;
    overflow-x: hidden;
    height: 100%;
    max-height: 100%;
    padding-right: var(--spacing-sm); /* Space for scrollbar */
}

.card {
    background-color: var(--bg-card);
    border-radius: var(--radius-md);
    padding: var(--spacing-lg);
    box-shadow: var(--shadow-md);
    width: 100%;
}

.card-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: var(--spacing-md);
    padding-bottom: var(--spacing-sm);
    border-bottom: 1px solid var(--border-color);
}

.card-header h2 {
    margin: 0;
    font-size: 1.25rem;
}

/* Node Detail Card */
.node-detail {
    flex-shrink: 0; /* Never shrink - always show full content */
    overflow-y: visible; /* Don't scroll the card itself */
    display: block;
    margin-bottom: var(--spacing-lg);
}

.node-content {
    color: var(--text-secondary);
    max-height: none; /* No height restriction */
}

.placeholder-text {
    color: var(--text-muted);
    text-align: center;
    padding: var(--spacing-xl);
}

.node-info {
    display: flex;
    flex-direction: column;
    gap: var(--spacing-md);
    width: 100%;
    padding-bottom: var(--spacing-lg);
}

.node-label {
    font-size: 1.5rem;
    color: var(--accent-primary);
    margin-bottom: var(--spacing-xs);
}

.badge {
    display: inline-block;
    padding: 0.25rem 0.5rem;
    background-color: var(--accent-primary);
    color: white;
    border-radius: var(--radius-sm);
    font-size: 0.75rem;
    font-weight: 600;
    text-transform: uppercase;
    margin-right: var(--spacing-sm);
}

.node-importance {
    font-size: 0.875rem;
    color: var(--text-muted);
}

.node-summary, .node-sources, .related-nodes {
    padding: var(--spacing-md);
    background-color: var(--bg-secondary);
    border-radius: var(--radius-sm);
    border-left: 3px solid var(--accent-primary);
    margin-bottom: var(--spacing-md);
}

.node-summary p {
    line-height: 1.7;
    color: var(--text-secondary);
}

.expand-toggle {
    background: none;
    border: none;
    color: var(--accent-primary);
    cursor: pointer;
    font-size: 0.875rem;
    padding: var(--spacing-xs) 0;
    transition: color var(--transition-fast);
}

.expand-toggle:hover {
    color: var(--accent-hover);
    text-decoration: underline;
}

.sources-list, .related-list {
    list-style: none;
    margin-top: var(--spacing-sm);
}

.sources-list li, .related-list li {
    padding: var(--spacing-sm);
    margin-bottom: var(--spacing-xs);
    background-color: var(--bg-hover);
    border-radius: var(--radius-sm);
    font-size: 0.875rem;
}

/* Chat Section */
.chat-section {
    flex: 0 1 auto; /* Can shrink but don't grow */
    display: flex;
    flex-direction: column;
    min-height: 300px;
    height: 400px; /* Fixed height */
}

.checkbox-label {
    display: flex;
    align-items: center;
    gap: var(--spacing-sm);
    font-size: 0.875rem;
    color: var(--text-secondary);
    cursor: pointer;
}

.checkbox-label input[type="checkbox"] {
    width: 18px;
    height: 18px;
    cursor: pointer;
}

.chat-messages {
    flex: 1;
    overflow-y: auto;
    padding: var(--spacing-md);
    background-color: var(--bg-secondary);
    border-radius: var(--radius-sm);
    margin-bottom: var(--spacing-md);
    min-height: 200px;
}

.message {
    margin-bottom: var(--spacing-md);
    padding: var(--spacing-md);
    border-radius: var(--radius-sm);
    line-height: 1.6;
}

.message.user {
    background-color: var(--accent-primary);
    color: white;
    align-self: flex-end;
    max-width: 80%;
    margin-left: auto;
}

.message.assistant {
    background-color: var(--bg-hover);
    color: var(--text-primary);
    border-left: 3px solid var(--accent-primary);
}

.message.system {
    background-color: transparent;
    color: var(--text-muted);
    font-size: 0.875rem;
    text-align: center;
    border: none;
}

.chat-input-area {
    display: flex;
    gap: var(--spacing-sm);
}

.chat-input {
    flex: 1;
    background-color: var(--bg-secondary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    padding: var(--spacing-md);
    color: var(--text-primary);
    font-family: inherit;
    font-size: 0.95rem;
    resize: vertical;
    transition: border-color var(--transition-fast);
}

.chat-input:focus {
    outline: none;
    border-color: var(--accent-primary);
    box-shadow: 0 0 0 3px var(--accent-glow);
}

/* ========== Buttons ========== */
.btn {
    padding: 0.625rem 1.25rem;
    border: none;
    border-radius: var(--radius-sm);
    font-size: 0.9rem;
    font-weight: 600;
    cursor: pointer;
    transition: all var(--transition-fast);
    white-space: nowrap;
}

.btn-primary {
    background-color: var(--accent-primary);
    color: white;
}

.btn-primary:hover {
    background-color: var(--accent-hover);
    box-shadow: 0 0 12px var(--accent-glow);
}

.btn-secondary {
    background-color: var(--bg-hover);
    color: var(--text-primary);
    border: 1px solid var(--border-color);
}

.btn-secondary:hover {
    background-color: var(--bg-card);
}

.btn-danger {
    background-color: var(--danger);
    color: white;
}

.btn-danger:hover {
    background-color: #d32f2f;
}

.icon-btn {
    width: 36px;
    height: 36px;
    padding: 0;
    background-color: var(--bg-hover);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    color: var(--text-primary);
    cursor: pointer;
    font-size: 1.2rem;
    display: flex;
    align-items: center;
    justify-content: center;
    transition: all var(--transition-fast);
}

.icon-btn:hover {
    background-color: var(--bg-card);
    border-color: var(--accent-primary);
}

/* ========== Footer ========== */
.app-footer {
    background-color: var(--bg-secondary);
    border-top: 1px solid var(--border-color);
    padding: var(--spacing-md) var(--spacing-xl);
}

.footer-content {
    display: flex;
    justify-content: space-between;
    align-items: center;
}

.stats {
    display: flex;
    gap: var(--spacing-lg);
    font-size: 0.875rem;
    color: var(--text-secondary);
}

.footer-text {
    font-size: 0.875rem;
    color: var(--text-muted);
}

/* ========== Responsive Design ========== */
@media (max-width: 1024px) {
    .main-container {
        grid-template-columns: 1fr;
        grid-template-rows: auto auto;
    }

    .graph-pane {
        min-height: 400px;
    }
}

@media (max-width: 768px) {
    .app-header {
        flex-direction: column;
        gap: var(--spacing-md);
        align-items: flex-start;
    }

    .header-controls {
        width: 100%;
        flex-wrap: wrap;
    }

    .main-container {
        padding: var(--spacing-sm);
        gap: var(--spacing-sm);
    }

    .footer-content {
        flex-direction: column;
        gap: var(--spacing-sm);
        text-align: center;
    }

    .stats {
        flex-direction: column;
        gap: var(--spacing-sm);
    }
}

/* ========== Accessibility ========== */
/* Focus styles for keyboard navigation */
button:focus-visible,
input:focus-visible,
textarea:focus-visible {
    outline: 2px solid var(--accent-primary);
    outline-offset: 2px;
}

/* Hidden but accessible */
.sr-only {
    position: absolute;
    width: 1px;
    height: 1px;
    padding: 0;
    margin: -1px;
    overflow: hidden;
    clip: rect(0, 0, 0, 0);
    white-space: nowrap;
    border-width: 0;
}

/* Reduce motion for accessibility */
@media (prefers-reduced-motion: reduce) {
    *,
    *::before,
    *::after {
        animation-duration: 0.01ms !important;
        animation-iteration-count: 1 !important;
        transition-duration: 0.01ms !important;
    }
}

/* ========== Graph Node Styles (for JS visualization) ========== */
/* These classes will be used by the graph visualization library */
.graph-node {
    cursor: pointer;
    transition: all var(--transition-fast);
}

.graph-node.concept circle { fill: var(--node-concept); }
.graph-node.function circle { fill: var(--node-function); }
.graph-node.class circle { fill: var(--node-class); }
.graph-node.term circle { fill: var(--node-term); }

.graph-node:hover circle {
    stroke-width: 3px;
    filter: brightness(1.2);
}

.graph-node.selected circle {
    stroke: var(--accent-primary);
    stroke-width: 4px;
    animation: pulse 1.5s infinite;
}

@keyframes pulse {
    0%, 100% {
        box-shadow: 0 0 0 0 var(--accent-glow);
    }
    50% {
        box-shadow: 0 0 0 10px rgba(79, 158, 255, 0);
    }
}

.graph-edge {
    stroke: var(--text-muted);
    stroke-width: 1.5px;
    fill: none;
    opacity: 0.6;
}

.graph-edge.highlighted {
    stroke: var(--accent-primary);
    opacity: 1;
    stroke-width: 2px;
}

/* Vis.js Network Canvas Constraints */
.graph-container canvas {
    max-width: 100% !important;
    max-height: 100% !important;
}

.graph-container > div {
    width: 100% !important;
    height: 100% !important;
    max-height: 100% !important;
}

/* ========== Processing Overlay ========== */
.processing-overlay {
    position: fixed;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    background: rgba(15, 17, 21, 0.95);
    backdrop-filter: blur(8px);
    display: flex;
    align-items: center;
    justify-content: center;
    z-index: 10000;
    animation: fadeIn 0.3s ease-in-out;
}

.processing-overlay[hidden] {
    display: none;
}

@keyframes fadeIn {
    from {
        opacity: 0;
    }
    to {
        opacity: 1;
    }
}

.processing-modal {
    background: var(--bg-card);
    border: 1px solid var(--border-color);
    border-radius: 16px;
    padding: 3rem 4rem;
    box-shadow: var(--shadow-lg);
    text-align: center;
    min-width: 400px;
    animation: slideUp 0.4s ease-out;
}

@keyframes slideUp {
    from {
        transform: translateY(30px);
        opacity: 0;
    }
    to {
        transform: translateY(0);
        opacity: 1;
    }
}

/* Spinner Animation */
.spinner {
    width: 80px;
    height: 80px;
    margin: 0 auto 2rem;
    border: 6px solid var(--border-color);
    border-top: 6px solid var(--accent-primary);
    border-radius: 50%;
    animation: spin 1s linear infinite;
}

@keyframes spin {
    0% {
        transform: rotate(0deg);
    }
    100% {
        transform: rotate(360deg);
    }
}

#processing-title {
    color: var(--text-primary);
    font-size: 1.75rem;
    font-weight: 600;
    margin-bottom: 1rem;
}

#processing-message {
    color: var(--text-secondary);
    font-size: 1rem;
    margin-bottom: 1.5rem;
    min-height: 1.5rem;
}

/* Progress Bar */
.progress-bar {
    width: 100%;
    height: 8px;
    background: var(--bg-secondary);
    border-radius: 4px;
    overflow: hidden;
    margin-bottom: 1rem;
}

.progress-fill {
    height: 100%;
    background: linear-gradient(90deg, var(--accent-primary), var(--accent-hover));
    border-radius: 4px;
    transition: width 0.3s ease-out;
    width: 0%;
    box-shadow: 0 0 10px var(--accent-glow);
}

.processing-percent {
    color: var(--text-muted);
    font-size: 0.875rem;
    font-weight: 500;
    letter-spacing: 0.5px;
}
gemini_extractor.py
ADDED
@@ -0,0 +1,612 @@
"""
Gemini-based Knowledge Graph Extraction
Simple LLM-powered extraction using Google Gemini (cheapest option)
"""
from typing import List, Dict, Any, Optional
from loguru import logger
from models import Chunk, CanonicalTriple, RelationType
from config import settings
import json
import asyncio


class GeminiExtractor:
    """
    Extract key nodes and relationships using Gemini LLM
    Simple, cost-effective approach for knowledge graph generation
    """

    def __init__(self, llm_service=None):
        """Initialize Gemini extractor"""
        logger.info("Initializing GeminiExtractor")

        # Import litellm for API calls
        try:
            import litellm
            self.litellm = litellm

            # Configure litellm for Gemini
            self.model_name = f"gemini/{settings.gemini_model}"
            self.api_key = settings.gemini_api_key

            logger.info(f"✓ GeminiExtractor initialized with model: {self.model_name}")

        except ImportError as e:
            logger.error("litellm not installed. Install with: pip install litellm")
            raise RuntimeError("litellm required for Gemini") from e

        # Comprehensive list of generic terms to REJECT
        self.generic_stopwords = {
            # Generic nouns
            'system', 'systems', 'data', 'information', 'value', 'values',
            'method', 'methods', 'approach', 'approaches', 'technique', 'techniques',
            'result', 'results', 'study', 'studies', 'paper', 'papers',
            'section', 'sections', 'figure', 'figures', 'table', 'tables',
            'example', 'examples', 'case', 'cases', 'type', 'types',
            'way', 'ways', 'thing', 'things', 'part', 'parts',
            'model', 'models', 'framework', 'frameworks',  # Too generic unless specific
            'process', 'processes', 'analysis', 'problem', 'problems',
            'solution', 'solutions', 'set', 'sets', 'group', 'groups',
            'element', 'elements', 'component', 'components',
            'feature', 'features', 'property', 'properties',
            'aspect', 'aspects', 'factor', 'factors', 'parameter', 'parameters',
            'concept', 'concepts', 'idea', 'ideas', 'theory', 'theories',
            'field', 'fields', 'area', 'areas', 'domain', 'domains',
            'task', 'tasks', 'goal', 'goals', 'objective', 'objectives',
            'input', 'inputs', 'output', 'outputs', 'function', 'functions',
            'operation', 'operations', 'step', 'steps', 'stage', 'stages',
            'phase', 'phases', 'level', 'levels', 'layer', 'layers',
            'number', 'numbers', 'amount', 'amounts', 'size', 'sizes',
            'performance', 'accuracy', 'quality', 'efficiency',
            'document', 'documents', 'text', 'texts', 'word', 'words',
            'sentence', 'sentences', 'paragraph', 'paragraphs',
            'item', 'items', 'object', 'objects', 'entity', 'entities',
            'relation', 'relations', 'relationship', 'relationships',

            # Generic verbs/actions
            'use', 'uses', 'using', 'used', 'usage',
            'apply', 'applies', 'applying', 'applied', 'application', 'applications',
            'work', 'works', 'working', 'worked',
            'provide', 'provides', 'providing', 'provided',
            'show', 'shows', 'showing', 'shown',
            'present', 'presents', 'presenting', 'presented', 'presentation',

            # Generic adjectives
            'new', 'novel', 'existing', 'current', 'previous',
            'different', 'similar', 'same', 'other', 'another',
            'various', 'several', 'multiple', 'single',
            'important', 'significant', 'main', 'key', 'major',
            'good', 'better', 'best', 'high', 'low',
            'large', 'small', 'big', 'little',

            # Research-specific generic terms
            'experiment', 'experiments', 'evaluation', 'evaluations',
            'test', 'tests', 'testing', 'validation',
            'comparison', 'comparisons', 'benchmark', 'benchmarks',
            'baseline', 'baselines', 'metric', 'metrics',
            'dataset', 'datasets', 'corpus', 'corpora',

            # Time/sequence terms
            'time', 'times', 'period', 'periods', 'year', 'years',
            'first', 'second', 'third', 'last', 'final',
            'next', 'previous', 'current', 'recent',

            # Common prepositions/articles (shouldn't appear but just in case)
            'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',

            # Additional generic ML/AI terms (too broad)
            'neural network', 'deep learning', 'machine learning',
            'training', 'testing', 'prediction', 'classification',
            'regression', 'clustering', 'optimization',
            'network', 'networks', 'algorithm', 'algorithms',
            'learning', 'training data', 'test data',
            'feature extraction', 'preprocessing',
            'hyperparameter', 'hyperparameters',
            'loss', 'error', 'gradient',
        }

    async def extract_from_chunks(
        self,
        chunks: List[Chunk],
        use_llm: bool = True
    ) -> List[CanonicalTriple]:
        """
        Extract knowledge graph - PER PAGE with HARD CAP of 2 concepts per page

        Args:
            chunks: List of text chunks
            use_llm: Always True for Gemini extraction

        Returns:
            List of canonical triples
        """
        logger.info(f"\n{'='*80}")
        logger.info(f"{'GEMINI PER-PAGE EXTRACTION - 2 CONCEPTS MAX PER PAGE':^80}")
        logger.info(f"{'='*80}")

        all_triples = []

        # Filter text chunks
        text_chunks = [c for c in chunks if c.type.value in ["paragraph", "code"]]

        if not text_chunks:
            logger.warning("No text chunks to process")
            return []

        # GROUP CHUNKS BY PAGE
        from collections import defaultdict
        chunks_by_page = defaultdict(list)
        for chunk in text_chunks:
            page_num = chunk.page_number or 0
            chunks_by_page[page_num].append(chunk)

        logger.info(f"Processing {len(chunks_by_page)} pages in PARALLEL")

        # ⚡ PARALLEL PROCESSING: Create tasks for all pages
        tasks = []
        page_numbers = []
        for page_num in sorted(chunks_by_page.keys()):
            page_chunks = chunks_by_page[page_num]
            combined_text = "\n\n".join([chunk.text for chunk in page_chunks])

            logger.info(f"📄 PAGE {page_num}: {len(page_chunks)} chunks, {len(combined_text)} chars")

            # Create async task for this page
            tasks.append(self._extract_with_gemini(combined_text, page_num))
            page_numbers.append(page_num)

        # Execute all Gemini calls in parallel
        logger.info(f"\n🚀 Launching {len(tasks)} parallel Gemini API calls...")
        import time
        start_time = time.time()

        results = await asyncio.gather(*tasks, return_exceptions=True)

        elapsed = time.time() - start_time
        logger.info(f"✓ All {len(tasks)} Gemini calls completed in {elapsed:.2f}s (parallel)")
        logger.info(f"  Average: {elapsed/len(tasks):.2f}s per page (would be {elapsed*len(tasks):.2f}s sequential)")

        # Process results
        for page_num, page_triples in zip(page_numbers, results):
            if isinstance(page_triples, Exception):
                logger.error(f"  ❌ Page {page_num} failed: {page_triples}")
                continue

            if page_triples:
                all_triples.extend(page_triples)
                logger.info(f"  ✓ Page {page_num}: Extracted {len(page_triples)} triples")
                for t in page_triples:
                    relation_value = t.relation.value if hasattr(t.relation, 'value') else t.relation
                    logger.info(f"    → {t.subject_label} --[{relation_value}]--> {t.object_label}")
            else:
                logger.warning(f"  ⚠️ Page {page_num}: NO TRIPLES EXTRACTED!")

        # Summary
        unique_concepts = set()
        concepts_by_page = {}
        for triple in all_triples:
            unique_concepts.add(triple.subject_label)
            unique_concepts.add(triple.object_label)
            page = triple.page_number
            if page not in concepts_by_page:
                concepts_by_page[page] = set()
            concepts_by_page[page].add(triple.subject_label)
            concepts_by_page[page].add(triple.object_label)

        logger.info(f"\n{'='*80}")
        logger.info(f"{'EXTRACTION SUMMARY':^80}")
        logger.info(f"{'='*80}")
        logger.info(f"Pages processed: {len(chunks_by_page)}")
        logger.info(f"Total triples: {len(all_triples)}")
        logger.info(f"Unique concepts: {len(unique_concepts)} (max {len(chunks_by_page) * 2})")

        if len(all_triples) == 0:
            logger.error("\n❌❌❌ CRITICAL ERROR: ZERO TRIPLES EXTRACTED! ❌❌❌")
            logger.error("This means:")
            logger.error("  - Either Gemini returned no concepts")
            logger.error("  - Or all concepts were rejected by filters")
            logger.error("  - Or there was an API error")
            logger.error("Check the logs above for details!")
        else:
            logger.info("\nConcepts per page:")
            for page in sorted(concepts_by_page.keys()):
                logger.info(f"  Page {page}: {list(concepts_by_page[page])}")

        logger.info(f"{'='*80}\n")

        return all_triples
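
    # A minimal usage sketch for the method above -- an assumption about how the
    # pipeline in main.py drives this class, not code taken from the repo:
    #
    #     extractor = GeminiExtractor()
    #     triples = await extractor.extract_from_chunks(chunks)  # chunks from pdf_processor
    #     # the resulting CanonicalTriple list presumably feeds graph_builder / graph_store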

    async def _extract_with_gemini(self, text: str, page_number: int) -> List[CanonicalTriple]:
        """
        Call Gemini API to extract technical concepts (nodes) from THIS PAGE

        Args:
            text: Text from single page
            page_number: Page number

        Returns:
            List of canonical triples
        """
        # Specialized technical concept extraction prompt
        prompt = f"""You are an expert in technical information extraction and knowledge graph construction.
Your task is to identify only the most meaningful *technical concepts* from the given text.
Concepts must represent scientific, mathematical, algorithmic, or methodological entities
that could exist as standalone nodes in a knowledge graph.
Ignore generic words, section titles, variable names, and everyday terms.
Focus on high-value, domain-specific terminology relevant to the text.

Extract all important technical concepts from the following text that would form the
nodes of a knowledge graph.

⚙️ Rules:
• Each concept should represent a self-contained technical idea, model, method, metric, loss, theorem, or process
• Keep only multi-word phrases when possible ("gradient descent", "convolutional neural network", "cross-entropy loss")
• Skip single, contextless nouns ("data", "model", "value", "equation", "result")
• Merge synonymous terms (e.g., "SGD", "stochastic gradient descent" → one entry)
• Do not include equations, numeric values, figure names, or symbols
• Do not repeat concepts
• Maintain consistent naming conventions (lowercase, hyphen-separated words)
• Extract MAXIMUM 4-5 concepts from this page (quality over quantity)

Return output strictly as JSON with "nodes" key:
{{
    "nodes": [
        "gradient descent",
        "neural network",
        "cross entropy loss"
    ]
}}

PAGE {page_number} TEXT:
{text}

CRITICAL: Return ONLY the JSON. If no technical concepts found, return {{"nodes": []}}"""

        logger.info(f"  🚀 Starting Gemini extraction for page {page_number}...")
        logger.info(f"     Text length: {len(text)} characters")

        try:
            # Call Gemini via litellm
            logger.info(f"  📡 Calling Gemini API for page {page_number}...")

            response = await asyncio.to_thread(
                self.litellm.completion,
                model=self.model_name,
                api_key=self.api_key,
                messages=[{
                    "role": "user",
                    "content": prompt
                }],
                temperature=0.0,
                max_tokens=settings.llm_max_tokens,
                timeout=settings.llm_timeout
            )

            # Extract response text
            response_text = response.choices[0].message.content.strip()
            logger.info(f"  📥 Gemini response ({len(response_text)} chars):")
            logger.info(f"     {response_text[:500]}")

            # Strip markdown code fences if the model wrapped the JSON
            if "```json" in response_text:
                response_text = response_text.split("```json")[1].split("```")[0].strip()
            elif "```" in response_text:
                response_text = response_text.split("```")[1].split("```")[0].strip()

            data = json.loads(response_text)

            # Accept either {"nodes": [...]} or a bare list
            if isinstance(data, dict) and "nodes" in data:
                nodes = data["nodes"]
            elif isinstance(data, list):
                # Fallback: if Gemini returned a list directly
                nodes = data
            else:
                logger.warning(f"  ❌ Gemini returned unexpected format: {type(data)}")
                return []

            if not isinstance(nodes, list):
                logger.warning(f"  ❌ Nodes is not a list, got: {type(nodes)}")
                return []

            logger.info(f"  ✓ Gemini extracted {len(nodes)} nodes from page {page_number}")
            logger.info(f"     Raw nodes: {nodes}")

            # Validate and filter nodes
            valid_nodes = []
            rejected_nodes = []

            for node in nodes:
                if not isinstance(node, str):
                    logger.warning(f"  ⚠️ Skipping non-string node: {node}")
                    continue

                node = node.strip()
                if not node:
                    continue

                logger.info(f"     Validating node: '{node}'")

                # FILTER: Validate node is a technical concept
                if not self._is_technical_concept(node):
                    rejected_nodes.append(node)
                    logger.warning(f"     ✗ REJECTED node '{node}' - not technical enough")
                    continue

                logger.info(f"     ✅ ACCEPTED node: '{node}'")
                valid_nodes.append(node.lower())

            # Summary of rejections
            if rejected_nodes:
                logger.warning(f"  📊 Rejected {len(rejected_nodes)} nodes: {rejected_nodes}")

            if not valid_nodes:
                logger.warning(f"  ⚠️ ALL {len(nodes)} NODES REJECTED for page {page_number}")
                logger.warning("     No valid technical concepts found. Returning empty list.")
                return []

            # HARD CAP: keep at most 2 concepts per page
            selected_nodes = valid_nodes[:2]
            logger.info(f"  🎯 Selected {len(selected_nodes)} nodes (hard cap = 2): {selected_nodes}")

            page_triples = []

            if len(selected_nodes) == 1:
                # Only one node - cannot form a relationship, so skip this page
                logger.info(f"  ℹ️ Only 1 node on page {page_number}, cannot create relationships")
                return []

            elif len(selected_nodes) == 2:
                # Use LLM to determine actual relationship between nodes
                node1, node2 = selected_nodes[0], selected_nodes[1]

                # Extract relationship using LLM with page context
                logger.info(f"  🔍 Extracting relationship between: {node1} ↔ {node2}")
                relationship_triple = await self._extract_relationship_with_gemini(
                    text=text,
                    node1=node1,
                    node2=node2,
                    page_number=page_number
                )

                if relationship_triple:
                    page_triples.append(relationship_triple)
                    logger.info("  ✅ Created directed edge:")
                    logger.info(f"     → {relationship_triple.subject_label} --[{relationship_triple.relation.value}]--> {relationship_triple.object_label}")
                    logger.info(f"     Justification: {relationship_triple.justification}")
                else:
                    logger.warning(f"  ⚠️ Could not extract relationship for {node1} ↔ {node2}")

            logger.info(f"  ✅ Returning {len(page_triples)} triples for page {page_number}")
            return page_triples

        except json.JSONDecodeError as e:
            logger.error(f"  ❌ JSON PARSE ERROR for page {page_number}: {e}")
            logger.error(f"     Response was: {response_text[:500]}")
            return []

        except Exception as e:
            logger.error(f"  ❌ GEMINI API FAILED for page {page_number}: {e}")
            logger.error(f"     Exception type: {type(e).__name__}")
            logger.exception("     Full trace:")
            return []

    async def _extract_relationship_with_gemini(self, text: str, node1: str, node2: str, page_number: int) -> Optional[CanonicalTriple]:
        """
        Use Gemini to determine the actual relationship between two nodes based on page context

        Args:
            text: Full page text for context
            node1: First node/concept
            node2: Second node/concept
            page_number: Page number

        Returns:
            CanonicalTriple with proper relationship, or None if extraction fails
        """
        # List all available relation types for the LLM
        available_relations = [r.value for r in RelationType]

        prompt = f"""You are an expert at extracting knowledge graph relationships from technical text.

Given two concepts and the text they appear in, determine the most accurate relationship between them.

**Concepts:**
- Concept A: "{node1}"
- Concept B: "{node2}"

**Context (page {page_number}):**
{text[:3000]}

**Available Relationship Types:**
{', '.join(available_relations)}

**Instructions:**
1. Analyze how these two concepts relate in the given context
2. Choose the MOST SPECIFIC relationship type from the list above
3. Determine the direction: which concept is the subject and which is the object
4. Provide a brief justification from the text

**Output Format (JSON):**
{{
    "subject": "<node1 or node2>",
    "object": "<node1 or node2>",
    "relation": "<one of the available relationship types>",
    "confidence": <0.0-1.0>,
    "justification": "<brief explanation from text>"
}}

**Rules:**
- Use the exact concept names provided
- Choose only ONE relation type from the available list
|
| 444 |
+
- If no clear relationship exists, use "related_to"
|
| 445 |
+
- Direction matters: subject performs/has the relation to the object
|
| 446 |
+
"""
|
| 447 |
+
|
| 448 |
+
try:
|
| 449 |
+
# Call Gemini API
|
| 450 |
+
response_text = await self.litellm.acompletion(
|
| 451 |
+
model=self.model_name,
|
| 452 |
+
messages=[
|
| 453 |
+
{"role": "system", "content": "You are an expert at knowledge graph relationship extraction. Always output valid JSON."},
|
| 454 |
+
{"role": "user", "content": prompt}
|
| 455 |
+
],
|
| 456 |
+
api_key=self.api_key,
|
| 457 |
+
temperature=0.1, # Low temperature for consistent relationship extraction
|
| 458 |
+
response_format={"type": "json_object"}
|
| 459 |
+
)
|
| 460 |
+
|
| 461 |
+
response_content = response_text.choices[0].message.content
|
| 462 |
+
data = json.loads(response_content)
|
| 463 |
+
|
| 464 |
+
# Validate response
|
| 465 |
+
subject = data.get("subject", "").strip()
|
| 466 |
+
obj = data.get("object", "").strip()
|
| 467 |
+
relation_str = data.get("relation", "related_to").lower().strip().replace(" ", "_")
|
| 468 |
+
confidence = float(data.get("confidence", 0.7))
|
| 469 |
+
justification = data.get("justification", f"Relationship extracted from page {page_number}")
|
| 470 |
+
|
| 471 |
+
# Map relation string to enum
|
| 472 |
+
try:
|
| 473 |
+
relation = RelationType(relation_str)
|
| 474 |
+
except ValueError:
|
| 475 |
+
logger.warning(f" ⚠️ Invalid relation '{relation_str}', defaulting to RELATED_TO")
|
| 476 |
+
relation = RelationType.RELATED_TO
|
| 477 |
+
|
| 478 |
+
# Create triple
|
| 479 |
+
triple = CanonicalTriple(
|
| 480 |
+
subject_label=subject,
|
| 481 |
+
object_label=obj,
|
| 482 |
+
relation=relation,
|
| 483 |
+
confidence=confidence,
|
| 484 |
+
justification=justification,
|
| 485 |
+
page_number=page_number
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
+
return triple
|
| 489 |
+
|
| 490 |
+
except json.JSONDecodeError as e:
|
| 491 |
+
logger.error(f" ❌ JSON parse error in relationship extraction: {e}")
|
| 492 |
+
return None
|
| 493 |
+
except Exception as e:
|
| 494 |
+
logger.error(f" ❌ Relationship extraction failed: {e}")
|
| 495 |
+
return None
|
| 496 |
+
|
| 497 |
+
def _is_technical_concept(self, concept: str) -> bool:
|
| 498 |
+
"""
|
| 499 |
+
|
| 500 |
+
Args:
|
| 501 |
+
concept: Concept string to validate
|
| 502 |
+
|
| 503 |
+
Returns:
|
| 504 |
+
True if highly technical/specific, False otherwise
|
| 505 |
+
"""
|
| 506 |
+
concept_lower = concept.lower().strip()
|
| 507 |
+
|
| 508 |
+
# RULE 1: Reject if in stopwords
|
| 509 |
+
if concept_lower in self.generic_stopwords:
|
| 510 |
+
logger.debug(f"Rejected '{concept}' - in stopword list")
|
| 511 |
+
return False
|
| 512 |
+
|
| 513 |
+
# RULE 2: Reject if any word is a generic stopword (stricter)
|
| 514 |
+
words = concept_lower.split()
|
| 515 |
+
for word in words:
|
| 516 |
+
if word in self.generic_stopwords:
|
| 517 |
+
# Allow if it's part of a specific multi-word technical term
|
| 518 |
+
# e.g., "convolutional neural network" has "network" but is specific
|
| 519 |
+
if len(words) < 2:
|
| 520 |
+
logger.debug(f"Rejected '{concept}' - contains generic word '{word}'")
|
| 521 |
+
return False
|
| 522 |
+
|
| 523 |
+
# RULE 3: Single-word concepts must have SOME specificity (RELAXED)
|
| 524 |
+
if len(words) == 1:
|
| 525 |
+
# Accept if ANY of these are true:
|
| 526 |
+
# - Has uppercase (BERT, Adam, PyTorch)
|
| 527 |
+
# - Has numbers (VGG16, GPT3)
|
| 528 |
+
# - Has special chars (t-SNE, bi-LSTM)
|
| 529 |
+
# - Longish word (8+ chars like "backpropagation")
|
| 530 |
+
has_uppercase = any(c.isupper() for c in concept)
|
| 531 |
+
has_numbers = any(c.isdigit() for c in concept)
|
| 532 |
+
has_special = '-' in concept or '_' in concept
|
| 533 |
+
is_longish = len(concept) >= 8 # RELAXED from 10
|
| 534 |
+
|
| 535 |
+
if not (has_uppercase or has_numbers or has_special or is_longish):
|
| 536 |
+
logger.debug(f"Rejected '{concept}' - single word not specific enough")
|
| 537 |
+
return False
|
| 538 |
+
|
| 539 |
+
# RULE 4: Multi-word phrases - very lenient
|
| 540 |
+
if len(words) >= 2:
|
| 541 |
+
# Just check that it's not ALL generic words
|
| 542 |
+
# At least one word should be non-generic or have caps/numbers
|
| 543 |
+
has_caps = any(c.isupper() for c in concept)
|
| 544 |
+
has_numbers = any(c.isdigit() for c in concept)
|
| 545 |
+
has_hyphen = '-' in concept
|
| 546 |
+
|
| 547 |
+
# Count non-generic words
|
| 548 |
+
non_generic_count = sum(1 for w in words if w not in self.generic_stopwords)
|
| 549 |
+
|
| 550 |
+
# Accept if ANY of these:
|
| 551 |
+
# - Has caps/numbers/hyphen
|
| 552 |
+
# - At least one word is non-generic
|
| 553 |
+
# - 3+ words (likely specific enough)
|
| 554 |
+
if not (has_caps or has_numbers or has_hyphen or non_generic_count > 0 or len(words) >= 3):
|
| 555 |
+
logger.debug(f"Rejected '{concept}' - multi-word phrase too generic")
|
| 556 |
+
return False
|
| 557 |
+
|
| 558 |
+
# RULE 5: Reject very short terms (1-2 chars) unless they're known acronyms (all caps)
|
| 559 |
+
if len(concept) <= 2 and concept.upper() != concept:
|
| 560 |
+
logger.debug(f"Rejected '{concept}' - too short")
|
| 561 |
+
return False
|
| 562 |
+
|
| 563 |
+
# RULE 6: Must contain at least one alphanumeric character
|
| 564 |
+
if not any(c.isalnum() for c in concept):
|
| 565 |
+
logger.debug(f"Rejected '{concept}' - no alphanumeric chars")
|
| 566 |
+
return False
|
| 567 |
+
|
| 568 |
+
# RULE 7: Reject if it's just a generic category with a modifier
|
| 569 |
+
# e.g., "new algorithm", "proposed method", "our model"
|
| 570 |
+
generic_patterns = [
|
| 571 |
+
'new ', 'novel ', 'proposed ', 'our ', 'this ', 'that ',
|
| 572 |
+
'these ', 'those ', 'such ', 'other ', 'another ',
|
| 573 |
+
'existing ', 'current ', 'previous ', 'standard '
|
| 574 |
+
]
|
| 575 |
+
for pattern in generic_patterns:
|
| 576 |
+
if concept_lower.startswith(pattern):
|
| 577 |
+
logger.debug(f"Rejected '{concept}' - generic pattern")
|
| 578 |
+
return False
|
| 579 |
+
|
| 580 |
+
# Passed all strict filters
|
| 581 |
+
return True
|
| 582 |
+
|
| 583 |
+
def _map_relation(self, relation_str: str) -> RelationType:
|
| 584 |
+
"""Map relation string to RelationType enum"""
|
| 585 |
+
relation_lower = relation_str.lower().strip()
|
| 586 |
+
|
| 587 |
+
# Direct mapping
|
| 588 |
+
mapping = {
|
| 589 |
+
"uses": RelationType.USES,
|
| 590 |
+
"implements": RelationType.IMPLEMENTS,
|
| 591 |
+
"is_a": RelationType.IS_A,
|
| 592 |
+
"is a": RelationType.IS_A,
|
| 593 |
+
"part_of": RelationType.PART_OF,
|
| 594 |
+
"part of": RelationType.PART_OF,
|
| 595 |
+
"requires": RelationType.REQUIRES,
|
| 596 |
+
"produces": RelationType.PRODUCES,
|
| 597 |
+
"enables": RelationType.ENABLES,
|
| 598 |
+
"improves": RelationType.IMPROVES,
|
| 599 |
+
"enhances": RelationType.ENHANCES,
|
| 600 |
+
"contains": RelationType.CONTAINS,
|
| 601 |
+
"depends_on": RelationType.DEPENDS_ON,
|
| 602 |
+
"depends on": RelationType.DEPENDS_ON,
|
| 603 |
+
"related_to": RelationType.RELATED_TO,
|
| 604 |
+
"related to": RelationType.RELATED_TO,
|
| 605 |
+
}
|
| 606 |
+
|
| 607 |
+
if relation_lower in mapping:
|
| 608 |
+
return mapping[relation_lower]
|
| 609 |
+
|
| 610 |
+
# Fallback
|
| 611 |
+
logger.debug(f"Unknown relation '{relation_str}', using 'related_to'")
|
| 612 |
+
return RelationType.RELATED_TO
|
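
The single-word rule (RULE 3) is the least obvious of the filters above; a minimal standalone sketch of just that rule, isolated from the class for illustration, shows which labels get through:

# Sketch of RULE 3 from _is_technical_concept (illustration only, not the shipped code)
def passes_single_word_rule(concept: str) -> bool:
    # A single word passes if it has uppercase, digits, '-'/'_', or is 8+ chars long
    has_uppercase = any(c.isupper() for c in concept)
    has_numbers = any(c.isdigit() for c in concept)
    has_special = '-' in concept or '_' in concept
    is_longish = len(concept) >= 8
    return has_uppercase or has_numbers or has_special or is_longish

assert passes_single_word_rule("BERT")             # uppercase acronym
assert passes_single_word_rule("VGG16")            # contains digits
assert passes_single_word_rule("t-SNE")            # hyphenated
assert passes_single_word_rule("backpropagation")  # 8+ characters
assert not passes_single_word_rule("loss")         # short, lowercase, plain
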
graph_builder.py
ADDED
@@ -0,0 +1,268 @@
"""
Graph Builder - constructs knowledge graph from canonical triples
Handles entity canonicalization, node/edge creation, and graph pruning
"""
from typing import List, Dict, Any, Set, Tuple
from loguru import logger
from models import CanonicalTriple, GraphNode, GraphEdge, SupportingChunk, NodeType
from graph_store import GraphStore
from embedding_service import EmbeddingService
from config import settings
import numpy as np
from collections import defaultdict


class GraphBuilder:
    """
    Builds and refines knowledge graph from canonical triples
    Implements entity canonicalization, deduplication, and pruning
    """

    def __init__(self, graph_store: GraphStore, embedding_service: EmbeddingService):
        self.graph_store = graph_store
        self.embedding_service = embedding_service
        self.entity_embeddings: Dict[str, np.ndarray] = {}

    async def build_graph(self, triples: List[CanonicalTriple]) -> Tuple[int, int]:
        """
        Build graph from canonical triples

        Args:
            triples: List of canonical triples

        Returns:
            Tuple of (num_nodes_added, num_edges_added)
        """
        logger.info(f"Building graph from {len(triples)} triples")

        # Step 1: Entity canonicalization - merge similar entities
        entity_map = await self._canonicalize_entities(triples)

        # Step 2: Create nodes
        nodes_created = 0
        logger.info(f"Creating nodes from {len(entity_map)} canonical entities")

        for entity_label in entity_map.keys():
            node = await self._create_node(entity_label, entity_map, triples)
            if self.graph_store.add_node(node):
                nodes_created += 1
                logger.debug(f"Created node: {node.label} (type: {node.type.value})")

        logger.info(f"✓ Successfully created {nodes_created} nodes")

        # Step 3: Create edges
        edges_created = 0
        for triple in triples:
            # Map to canonical entities
            canonical_subject = entity_map.get(triple.subject_label, triple.subject_label)
            canonical_object = entity_map.get(triple.object_label, triple.object_label)

            # Skip self-loops
            if canonical_subject == canonical_object:
                continue

            # Get node IDs
            subject_node = self.graph_store.get_node_by_label(canonical_subject)
            object_node = self.graph_store.get_node_by_label(canonical_object)

            if not subject_node or not object_node:
                continue

            # Create edge
            edge = self._create_edge(subject_node, object_node, triple)
            if self.graph_store.add_edge(edge):
                edges_created += 1

        logger.info(f"Created {nodes_created} nodes and {edges_created} edges")

        # Step 4: Compute importance scores
        self._compute_importance_scores()

        # Step 5: Prune low-importance nodes and edges
        pruned_nodes, pruned_edges = self._prune_graph()

        logger.info(f"Pruned {pruned_nodes} nodes and {pruned_edges} edges")
        logger.info(f"Final graph: {nodes_created - pruned_nodes} nodes, {edges_created - pruned_edges} edges")

        return nodes_created - pruned_nodes, edges_created - pruned_edges

    async def _canonicalize_entities(self, triples: List[CanonicalTriple]) -> Dict[str, str]:
        """
        ⚡ OPTIMIZATION: Skip expensive canonicalization (identity mapping)

        With 2 nodes per page hard cap and strict technical filtering,
        we have very few duplicates and highly specific entities.
        Embedding computation + O(n²) similarity checks not worth the cost.

        Args:
            triples: List of triples

        Returns:
            Dict mapping entity_label -> canonical_label (identity map)
        """
        # Collect all unique entities
        entities = set()
        for triple in triples:
            entities.add(triple.subject_label)
            entities.add(triple.object_label)

        # DETERMINISTIC: Sort entities for consistent ordering across runs
        entities_list = sorted(list(entities))
        logger.info(f"⚡ FAST MODE: Skipping entity canonicalization for {len(entities_list)} unique entities")
        logger.info(f"Each entity maps to itself (no merging)")

        # Return identity mapping - each entity maps to itself
        entity_map = {entity: entity for entity in entities_list}

        logger.info(f"✓ Identity mapping created (0 merges, {len(entities_list)} canonical entities)")

        return entity_map

    def _entity_to_text(self, entity: str) -> str:
        """Convert entity label to text for embedding"""
        # Simple approach: use the label as-is
        return entity

    async def _create_node(
        self,
        label: str,
        entity_map: Dict[str, str],
        triples: List[CanonicalTriple]
    ) -> GraphNode:
        """
        Create a graph node for an entity

        Args:
            label: Canonical entity label
            entity_map: Entity canonicalization map
            triples: All triples (to find supporting chunks)

        Returns:
            GraphNode
        """
        # Find all triples mentioning this entity
        supporting_chunks = []
        aliases = []

        for original_label, canonical_label in entity_map.items():
            if canonical_label == label:
                if original_label != label:
                    aliases.append(original_label)

        # Collect supporting chunks from triples
        chunk_scores = defaultdict(float)
        for triple in triples:
            canonical_subject = entity_map.get(triple.subject_label, triple.subject_label)
            canonical_object = entity_map.get(triple.object_label, triple.object_label)

            if canonical_subject == label or canonical_object == label:
                # This triple supports the node
                chunk_key = (triple.page_number, triple.justification[:100])  # Use justification as proxy
                chunk_scores[chunk_key] += triple.confidence

        # Convert to SupportingChunk objects
        for (page_number, snippet), score in chunk_scores.items():
            supporting_chunks.append(SupportingChunk(
                chunk_id=f"page_{page_number}",  # Placeholder
                score=score,
                page_number=page_number,
                snippet=snippet
            ))

        # DETERMINISTIC: Sort by score (desc) then page_number (asc) for stable ordering
        supporting_chunks.sort(key=lambda x: (-x.score, x.page_number))
        supporting_chunks = supporting_chunks[:10]

        # Infer node type (simple heuristic)
        node_type = self._infer_node_type(label)

        node = GraphNode(
            label=label,
            type=node_type,
            aliases=aliases,
            supporting_chunks=supporting_chunks,
            importance_score=0.0  # Will be computed later
        )

        return node

    def _infer_node_type(self, label: str) -> NodeType:
        """Infer node type from label (simple heuristics)"""
        label_lower = label.lower()

        # Check for common patterns
        if any(word in label_lower for word in ["function", "method", "algorithm"]):
            return NodeType.FUNCTION
        elif any(word in label_lower for word in ["class", "type", "struct"]):
            return NodeType.CLASS
        elif label[0].isupper() and " " not in label:  # Capitalized single word
            return NodeType.PERSON
        elif any(word in label_lower for word in ["definition", "term", "concept"]):
            return NodeType.TERM
        else:
            return NodeType.CONCEPT

    def _create_edge(
        self,
        from_node: GraphNode,
        to_node: GraphNode,
        triple: CanonicalTriple
    ) -> GraphEdge:
        """Create a graph edge from a triple"""
        supporting_chunk = SupportingChunk(
            chunk_id=f"page_{triple.page_number}",
            score=triple.confidence,
            page_number=triple.page_number,
            snippet=triple.justification
        )

        edge = GraphEdge(
            from_node=from_node.node_id,
            to_node=to_node.node_id,
            relation=triple.relation,
            confidence=triple.confidence,
            supporting_chunks=[supporting_chunk]
        )

        return edge

    def _compute_importance_scores(self):
        """
        ⚡ OPTIMIZATION: Simplified importance scoring (skip expensive PageRank)

        Since we're not pruning, we only need basic scores for display purposes.
        """
        logger.info("⚡ FAST MODE: Computing simplified importance scores (no PageRank)")

        # Update node importance with simple metric (just degree centrality)
        for node in self.graph_store.get_all_nodes():
            # Simple importance = number of connections (fast to compute)
            num_neighbors = len(self.graph_store.get_neighbors(node.node_id))

            # Normalize to 0-1 range (assume max 10 connections)
            importance = min(num_neighbors / 10.0, 1.0)

            node.importance_score = importance

            # Update in store (for NetworkX)
            if not self.graph_store.use_neo4j:
                self.graph_store.nodes_dict[node.node_id] = node

        logger.info(f"✓ Importance scores computed (based on degree centrality only)")

    def _prune_graph(self) -> Tuple[int, int]:
        """
        ⚡ OPTIMIZATION: Skip pruning (we already filter at extraction)

        Pruning is expensive (PageRank + multiple graph traversals).
        With strict filtering at extraction (technical concepts only, 2 per page),
        we don't need additional pruning.

        Returns:
            Tuple of (nodes_removed, edges_removed) - always (0, 0)
        """
        logger.info(f"⚡ FAST MODE: Skipping graph pruning")
        logger.info(f"Nodes already filtered at extraction with strict technical validation")
        logger.info(f"Final graph: {len(self.graph_store.get_all_nodes())} nodes, {len(self.graph_store.get_all_edges())} edges")

        return 0, 0
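
A minimal usage sketch of the fast path above. It assumes the models in models.py supply defaults such as an auto-generated node_id, and it passes None for embedding_service since the identity-mapping path never touches it:

import asyncio
from models import CanonicalTriple, RelationType
from graph_store import GraphStore
from graph_builder import GraphBuilder

async def demo() -> None:
    store = GraphStore(use_neo4j=False)
    builder = GraphBuilder(graph_store=store, embedding_service=None)  # unused in fast mode
    triples = [
        CanonicalTriple(
            subject_label="transformer",
            object_label="self-attention",
            relation=RelationType.USES,
            confidence=0.9,
            justification="Transformers rely on self-attention (p. 3)",
            page_number=3,
        )
    ]
    nodes, edges = await builder.build_graph(triples)
    print(nodes, edges)  # expected: 2 nodes, 1 edge; pruning is a no-op

asyncio.run(demo())
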
graph_store.py
ADDED
@@ -0,0 +1,347 @@
"""
Knowledge Graph Store
Manages nodes, edges, and graph operations
Supports both NetworkX (local) and Neo4j (production)
"""
import networkx as nx
from neo4j import GraphDatabase
from typing import List, Dict, Any, Optional, Tuple, Set
from loguru import logger
from models import GraphNode, GraphEdge, CanonicalTriple, SupportingChunk, NodeType, RelationType
from config import settings
import json
import pickle
from collections import defaultdict
from embedding_service import EmbeddingService


class GraphStore:
    """
    Manages the knowledge graph with nodes and edges
    Supports multiple backends: NetworkX (default) or Neo4j
    """

    def __init__(self, use_neo4j: bool = False, embedding_service: Optional[EmbeddingService] = None):
        self.use_neo4j = use_neo4j
        self.embedding_service = embedding_service

        if use_neo4j:
            self._init_neo4j()
        else:
            self.graph = nx.MultiGraph()  # Undirected graph (no arrows)
            self.nodes_dict: Dict[str, GraphNode] = {}  # node_id -> GraphNode
            self.edges_dict: Dict[str, GraphEdge] = {}  # edge_id -> GraphEdge

        logger.info(f"Initialized GraphStore (backend: {'Neo4j' if use_neo4j else 'NetworkX'}, undirected graph)")

    def _init_neo4j(self):
        """Initialize Neo4j connection"""
        try:
            self.driver = GraphDatabase.driver(
                settings.neo4j_uri,
                auth=(settings.neo4j_user, settings.neo4j_password)
            )
            # Test connection
            with self.driver.session() as session:
                session.run("RETURN 1")
            logger.info("Connected to Neo4j successfully")
        except Exception as e:
            logger.error(f"Failed to connect to Neo4j: {e}")
            logger.info("Falling back to NetworkX (undirected)")
            self.use_neo4j = False
            self.graph = nx.MultiGraph()  # Undirected graph
            self.nodes_dict = {}
            self.edges_dict = {}

    def add_node(self, node: GraphNode) -> bool:
        """
        Add a node to the graph

        Args:
            node: GraphNode to add

        Returns:
            True if added, False if already exists
        """
        if self.use_neo4j:
            return self._add_node_neo4j(node)
        else:
            if node.node_id in self.nodes_dict:
                return False

            self.nodes_dict[node.node_id] = node
            # Handle both enum and string for type field
            node_type = node.type.value if hasattr(node.type, 'value') else node.type
            self.graph.add_node(
                node.node_id,
                label=node.label,
                type=node_type,
                importance=node.importance_score
            )
            return True

    def add_edge(self, edge: GraphEdge) -> bool:
        """
        Add an edge to the graph

        Args:
            edge: GraphEdge to add

        Returns:
            True if added successfully
        """
        if self.use_neo4j:
            return self._add_edge_neo4j(edge)
        else:
            self.edges_dict[edge.edge_id] = edge
            # Handle both enum and string for relation field
            relation_value = edge.relation.value if hasattr(edge.relation, 'value') else edge.relation
            self.graph.add_edge(
                edge.from_node,
                edge.to_node,
                key=edge.edge_id,
                relation=relation_value,
                confidence=edge.confidence
            )
            return True

    def get_node(self, node_id: str) -> Optional[GraphNode]:
        """Get node by ID"""
        if self.use_neo4j:
            return self._get_node_neo4j(node_id)
        else:
            return self.nodes_dict.get(node_id)

    def update_node(self, node: GraphNode) -> bool:
        """
        Update an existing node in the graph

        Args:
            node: GraphNode with updated data

        Returns:
            True if updated successfully, False if node doesn't exist
        """
        if node.node_id not in self.nodes_dict:
            return False

        # Update in dictionary
        self.nodes_dict[node.node_id] = node

        # Update NetworkX graph attributes
        if node.node_id in self.graph:
            node_type = node.type.value if hasattr(node.type, 'value') else node.type
            self.graph.nodes[node.node_id]['label'] = node.label
            self.graph.nodes[node.node_id]['type'] = node_type
            self.graph.nodes[node.node_id]['importance'] = node.importance_score

        return True

    def get_node_by_label(self, label: str) -> Optional[GraphNode]:
        """Get node by label (case-insensitive)"""
        label_lower = label.lower()
        for node in self.nodes_dict.values():
            if node.label.lower() == label_lower or label_lower in [a.lower() for a in node.aliases]:
                return node
        return None

    def get_neighbors(self, node_id: str) -> List[Tuple[GraphNode, GraphEdge]]:
        """
        Get neighboring nodes and connecting edges (undirected graph)

        Args:
            node_id: Node to get neighbors for

        Returns:
            List of (neighbor_node, edge) tuples
        """
        if self.use_neo4j:
            return self._get_neighbors_neo4j(node_id)
        else:
            neighbors = []
            # For undirected graph, just get all neighbors
            for neighbor_id in self.graph.neighbors(node_id):
                edges = self.graph.get_edge_data(node_id, neighbor_id)
                if edges:
                    for edge_key, edge_data in edges.items():
                        edge = self.edges_dict.get(edge_key)
                        neighbor_node = self.nodes_dict.get(neighbor_id)
                        if edge and neighbor_node:
                            neighbors.append((neighbor_node, edge))

            return neighbors

    def get_all_nodes(self) -> List[GraphNode]:
        """Get all nodes in graph"""
        if self.use_neo4j:
            return self._get_all_nodes_neo4j()
        else:
            return list(self.nodes_dict.values())

    def get_all_edges(self) -> List[GraphEdge]:
        """Get all edges in graph"""
        if self.use_neo4j:
            return self._get_all_edges_neo4j()
        else:
            return list(self.edges_dict.values())

    def remove_node(self, node_id: str):
        """Remove node and its edges"""
        if self.use_neo4j:
            self._remove_node_neo4j(node_id)
        else:
            if node_id in self.nodes_dict:
                del self.nodes_dict[node_id]
                self.graph.remove_node(node_id)

    def remove_edge(self, edge_id: str):
        """Remove edge"""
        if self.use_neo4j:
            self._remove_edge_neo4j(edge_id)
        else:
            if edge_id in self.edges_dict:
                edge = self.edges_dict[edge_id]
                del self.edges_dict[edge_id]
                if self.graph.has_edge(edge.from_node, edge.to_node, key=edge_id):
                    self.graph.remove_edge(edge.from_node, edge.to_node, key=edge_id)

    def compute_centrality(self) -> Dict[str, float]:
        """
        Compute node centrality scores (degree centrality for undirected graph)

        Returns:
            Dict mapping node_id -> centrality score
        """
        if self.use_neo4j:
            # Use Neo4j's centrality algorithm
            return self._compute_centrality_neo4j()
        else:
            try:
                # Use degree centrality for undirected graph (simpler and faster)
                centrality = nx.degree_centrality(self.graph)
                return centrality
            except Exception as e:
                logger.error(f"Failed to compute centrality: {e}")
                return {}

    def save(self, filepath: str):
        """Save graph to file (NetworkX only)"""
        if self.use_neo4j:
            logger.info("Neo4j graphs are persisted automatically")
            return

        data = {
            "nodes": [node.dict() for node in self.nodes_dict.values()],
            "edges": [edge.dict() for edge in self.edges_dict.values()],
        }

        with open(filepath, 'wb') as f:
            pickle.dump(data, f)

        logger.info(f"Saved graph with {len(self.nodes_dict)} nodes and {len(self.edges_dict)} edges to {filepath}")

    def load(self, filepath: str):
        """Load graph from file (NetworkX only)"""
        if self.use_neo4j:
            logger.warning("Cannot load into Neo4j from file")
            return

        with open(filepath, 'rb') as f:
            data = pickle.load(f)

        # Reconstruct nodes
        for node_data in data["nodes"]:
            node = GraphNode(**node_data)
            self.add_node(node)

        # Reconstruct edges
        for edge_data in data["edges"]:
            edge = GraphEdge(**edge_data)
            self.add_edge(edge)

        logger.info(f"Loaded graph with {len(self.nodes_dict)} nodes and {len(self.edges_dict)} edges")

    def clear(self):
        """Clear all nodes and edges"""
        if self.use_neo4j:
            self._clear_neo4j()
        else:
            self.graph.clear()
            self.nodes_dict.clear()
            self.edges_dict.clear()

    # Neo4j implementations (placeholders - implement as needed)

    def _add_node_neo4j(self, node: GraphNode) -> bool:
        """Add node to Neo4j"""
        with self.driver.session() as session:
            # Handle both enum and string for type field
            node_type = node.type.value if hasattr(node.type, 'value') else node.type
            result = session.run(
                """
                MERGE (n:Entity {node_id: $node_id})
                ON CREATE SET n.label = $label, n.type = $type,
                              n.importance = $importance, n.created_at = datetime()
                RETURN n
                """,
                node_id=node.node_id,
                label=node.label,
                type=node_type,
                importance=node.importance_score
            )
            return result.single() is not None

    def _add_edge_neo4j(self, edge: GraphEdge) -> bool:
        """Add edge to Neo4j"""
        with self.driver.session() as session:
            # Handle both enum and string for relation field
            relation_value = edge.relation.value if hasattr(edge.relation, 'value') else edge.relation
            session.run(
                """
                MATCH (a:Entity {node_id: $from_node})
                MATCH (b:Entity {node_id: $to_node})
                CREATE (a)-[r:RELATES {edge_id: $edge_id, relation: $relation,
                                       confidence: $confidence}]->(b)
                """,
                from_node=edge.from_node,
                to_node=edge.to_node,
                edge_id=edge.edge_id,
                relation=relation_value,
                confidence=edge.confidence
            )
            return True

    def _get_node_neo4j(self, node_id: str) -> Optional[GraphNode]:
        """Get node from Neo4j"""
        # Implementation omitted for brevity
        pass

    def _get_neighbors_neo4j(self, node_id: str) -> List[Tuple[GraphNode, GraphEdge]]:
        """Get neighbors from Neo4j"""
        # Implementation omitted for brevity
        pass

    def _get_all_nodes_neo4j(self) -> List[GraphNode]:
        """Get all nodes from Neo4j"""
        pass

    def _get_all_edges_neo4j(self) -> List[GraphEdge]:
        """Get all edges from Neo4j"""
        pass

    def _remove_node_neo4j(self, node_id: str):
        """Remove node from Neo4j"""
        pass

    def _remove_edge_neo4j(self, edge_id: str):
        """Remove edge from Neo4j"""
        pass

    def _compute_centrality_neo4j(self) -> Dict[str, float]:
        """Compute centrality in Neo4j"""
        pass

    def _clear_neo4j(self):
        """Clear Neo4j database"""
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
llm_service.py
ADDED
@@ -0,0 +1,491 @@
"""
LLM Inference Layer
Handles all LLM calls for extraction, summarization, and chat
Uses Gemini (via litellm) with structured prompt templates
"""
from typing import List, Dict, Any, Optional
from loguru import logger
from config import settings
import json
import httpx
from tenacity import retry, stop_after_attempt, wait_exponential
from models import Triple, CanonicalTriple, RelationType


class PromptTemplates:
    """Centralized prompt templates following the manual"""

    @staticmethod
    def triplet_canonicalization(passage: str, triple: Triple) -> str:
        """Template for canonicalizing extracted triples"""
        return f"""Given the passage and an extracted triple, return a cleaned, canonical version.

Passage (from page {triple.page_number}):
{passage}

Extracted Triple:
- Subject: {triple.subject}
- Relation: {triple.predicate}
- Object: {triple.object}

CRITICAL INSTRUCTION: You MUST select the "relation" field from this EXACT list of 25 canonical relations.
Copy the exact string - do NOT create variations, synonyms, or modifications.

ALLOWED RELATIONS (choose exactly one):
1. is_a - for type/class relationships (e.g., "X is a Y")
2. part_of - for component relationships (e.g., "X is part of Y")
3. uses - for utilization (use "uses" for: utilizes, employs, applies)
4. causes - for causality (e.g., "X causes Y")
5. defined_as - for definitions (use "defined_as" for: defines, is defined as)
6. related_to - ONLY if no other relation fits
7. method_of - for methodological relationships
8. depends_on - for dependencies (e.g., "X depends on Y")
9. implements - for implementation (e.g., "X implements Y")
10. similar_to - for similarity
11. observes - for observation (use "observes" for: captures, records, detects, monitors)
12. measures - for measurement
13. produces - for production/generation (use "produces" for: makes, creates, generates, builds)
14. contains - for containment
15. affects - for influence (use "affects" for: influences, impacts, modifies, changes)
16. enables - for enablement (use "enables" for: facilitates, allows, permits)
17. requires - for requirements
18. interacts_with - for interactions
19. enriches - for enrichment
20. enhances - for enhancement (use "enhances" for: improves, optimizes, extends)
21. supports - for support (use "supports" for: contributes, helps, aids)
22. describes - for description (use "describes" for: proposes, suggests, presents, introduces)
23. explains - for explanation (use "explains" for: clarifies, demonstrates, shows, disentangles)
24. refers_to - for reference (use "refers_to" for: aims, targets, addresses, focuses on)
25. associated_with - for associations

EXAMPLES OF WHAT TO DO:
- If input has "utilizes" → use "uses"
- If input has "proposes" → use "describes"
- If input has "contributes to" → use "supports"
- If input has "aims at" → use "refers_to"

DO NOT USE: utilizes, proposes, contributes, aims, makes, captures, defines, or any other variations.
USE ONLY: The exact 25 strings listed above.

Return JSON in this exact format:
{{
    "subject_label": "cleaned subject name",
    "object_label": "cleaned object name",
    "relation": "one_of_the_25_exact_strings_above",
    "confidence": 0.85,
    "justification": "brief explanation referencing page {triple.page_number}"
}}

Output ONLY the JSON, no other text:
"""

    @staticmethod
    def node_summarization(node_label: str, chunks: List[Dict[str, Any]]) -> str:
        """Template for node summarization with citations"""
        chunks_text = "\n\n".join([
            f"[Chunk from p.{chunk['page_number']}]\n{chunk['text']}"
            for chunk in chunks
        ])

        return f"""Summarize the key facts about "{node_label}" using ONLY the following supporting chunks.

Requirements:
- Produce a concise summary (3-6 sentences)
- After any sentence that directly relies on a chunk, append (p. N) where N is the page number
- Do not invent information not present in the chunks
- Focus on the most important facts

Supporting Chunks:
{chunks_text}

Summary:
"""

    @staticmethod
    def rag_chat(user_query: str, context_chunks: List[Dict[str, Any]]) -> str:
        """Template for RAG chat with citations"""
        context_text = "\n\n".join([
            f"[Source {i+1}, p.{chunk['page_number']}]\n{chunk['text']}"
            for i, chunk in enumerate(context_chunks)
        ])

        return f"""You are an assistant that answers questions using ONLY the provided document context.

Context from document:
{context_text}

User Question: {user_query}

Instructions:
- Answer in friendly, concise language
- Include inline citations (p. N) for statements supported by chunks
- If you cannot find direct support, say "I cannot confirm this from the document"
- At the end, add a "Sources:" section listing page numbers and short snippets

Answer:
"""

    @staticmethod
    def system_message() -> str:
        """System message for chat"""
        return """You are a helpful assistant that answers questions strictly based on provided document context.
You always cite page numbers for factual statements. If information is not in the context, you say so clearly."""


class LLMService:
    """
    Service for LLM inference using Gemini API (via litellm)
    Handles generation, extraction, summarization, and agent synthesis
    """

    def __init__(self):
        # Use Gemini instead of Mistral
        self.api_key = settings.gemini_api_key
        self.model = f"gemini/{settings.gemini_model}"
        self.temperature = settings.llm_temperature
        self.max_tokens = settings.llm_max_tokens
        self.timeout = settings.llm_timeout

        # Import litellm for Gemini
        try:
            import litellm
            self.litellm = litellm
            logger.info(f"✓ LLMService initialized with Gemini ({settings.gemini_model})")
        except ImportError:
            logger.error("litellm not installed. Install with: pip install litellm")
            raise

        if not self.api_key:
            logger.warning("No Gemini API key configured. LLM features will not work.")

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
    async def _call_api(
        self,
        messages: List[Dict[str, str]],
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        json_mode: bool = False
    ) -> str:
        """
        Call Gemini API via litellm with retry logic

        Args:
            messages: List of message dicts with 'role' and 'content'
            temperature: Override default temperature
            max_tokens: Override default max tokens
            json_mode: Request JSON output

        Returns:
            Generated text
        """
        if not self.api_key:
            raise ValueError("Gemini API key not configured")

        try:
            # Use litellm for Gemini API calls
            import asyncio

            kwargs = {
                "model": self.model,
                "api_key": self.api_key,
                "messages": messages,
                "temperature": temperature or self.temperature,
                "max_tokens": max_tokens or self.max_tokens,
            }

            if json_mode:
                kwargs["response_format"] = {"type": "json_object"}

            # litellm.completion is synchronous, wrap in asyncio.to_thread
            response = await asyncio.to_thread(
                self.litellm.completion,
                **kwargs
            )

            return response.choices[0].message.content

        except Exception as e:
            logger.error(f"Gemini API error: {str(e)}")
            raise

    async def canonicalize_triple(
        self,
        triple: Triple,
        passage: str
    ) -> Optional[CanonicalTriple]:
        """
        Canonicalize a raw triple using LLM

        Args:
            triple: Raw extracted triple
            passage: Surrounding text passage

        Returns:
            CanonicalTriple or None if LLM fails
        """
        prompt = PromptTemplates.triplet_canonicalization(passage, triple)

        messages = [
            {"role": "system", "content": "You are an expert at extracting and canonicalizing knowledge graph triples. Always output valid JSON."},
            {"role": "user", "content": prompt}
        ]

        try:
            response = await self._call_api(messages, temperature=0.1, json_mode=True)
            data = json.loads(response)

            # Map string relation to enum
            relation_str = data.get("relation", "related_to").lower().strip()

            # Auto-correct common variations and map semantically similar verbs
            relation_corrections = {
                # Exact variations
                "defines_as": "defined_as",
                "defines": "defined_as",
                "is_part_of": "part_of",
                "used_by": "uses",
                "caused_by": "causes",
                "methods_of": "method_of",
                "depending_on": "depends_on",
                "implemented_by": "implements",
                "similar": "similar_to",
                "observed_by": "observes",
                "measured_by": "measures",
                "produced_by": "produces",
                "contained_in": "contains",
                "affected_by": "affects",
                "enabled_by": "enables",
                "required_by": "requires",
                "interact_with": "interacts_with",
                "enriched_by": "enriches",
                "enhanced_by": "enhances",
                "supported_by": "supports",
                "described_by": "describes",
                "explained_by": "explains",
                "refer_to": "refers_to",

                # Semantic mappings for common verbs
                "utilizes": "uses",
                "utilize": "uses",
                "employs": "uses",
                "applies": "uses",
                "makes": "produces",
                "creates": "produces",
                "generates": "produces",
                "builds": "produces",
                "proposes": "describes",
                "suggests": "describes",
                "presents": "describes",
                "introduces": "describes",
                "captures": "observes",
                "records": "observes",
                "detects": "observes",
                "monitors": "observes",
                "aims": "refers_to",
                "targets": "refers_to",
                "focuses_on": "refers_to",
                "addresses": "refers_to",
                "disentangles": "explains",
                "clarifies": "explains",
                "demonstrates": "explains",
                "shows": "explains",
                "contributes": "supports",
                "contributes_to": "supports",
                "helps": "supports",
                "aids": "supports",
                "facilitates": "enables",
                "allows": "enables",
                "permits": "enables",
                "improves": "enhances",
                "betters": "enhances",
                "optimizes": "enhances",
                "extends": "enhances",
                "influences": "affects",
                "impacts": "affects",
                "modifies": "affects",
                "changes": "affects",
            }

            relation_str = relation_corrections.get(relation_str, relation_str)

            try:
                relation = RelationType(relation_str)
            except ValueError:
                logger.warning(f"Invalid relation '{relation_str}', defaulting to 'related_to'")
                relation = RelationType.RELATED_TO

            return CanonicalTriple(
                subject_label=data["subject_label"],
                object_label=data["object_label"],
                relation=relation,
                confidence=data["confidence"],
                justification=data["justification"],
                page_number=triple.page_number or 0
            )
        except Exception as e:
            logger.error(f"Failed to canonicalize triple: {e}")
            return None

    async def summarize_node(
        self,
        node_label: str,
        supporting_chunks: List[Dict[str, Any]]
    ) -> str:
        """
        Generate summary for a graph node with citations

        Args:
            node_label: Name of the node
            supporting_chunks: List of chunk metadata dicts

        Returns:
            Summary text with inline citations
        """
        prompt = PromptTemplates.node_summarization(node_label, supporting_chunks)

        messages = [
            {"role": "system", "content": PromptTemplates.system_message()},
            {"role": "user", "content": prompt}
        ]

        try:
            # Use faster settings for node summaries
            summary = await self._call_api(
                messages,
                temperature=0.3,
                max_tokens=3072  # Shorter summaries = faster response
            )
            return summary.strip()
        except Exception as e:
            logger.error(f"Failed to summarize node: {e}")
            return f"Unable to generate summary for {node_label}."

    async def rag_chat(
        self,
        query: str,
        context_chunks: List[Dict[str, Any]]
    ) -> str:
        """
        Answer user query using RAG with citations

        Args:
            query: User question
            context_chunks: Retrieved context chunks

        Returns:
            Answer with citations and sources
        """
        prompt = PromptTemplates.rag_chat(query, context_chunks)

        messages = [
            {"role": "system", "content": PromptTemplates.system_message()},
            {"role": "user", "content": prompt}
        ]

        try:
            answer = await self._call_api(messages, temperature=0.3)
            return answer.strip()
        except Exception as e:
            logger.error(f"Failed to generate RAG response: {e}")
            return "I encountered an error while processing your question. Please try again."

    async def agent_synthesize(
        self,
        query: str,
        context: str
    ) -> str:
        """
        Synthesize answer for agent-based RAG from tool results

        Args:
            query: User question
            context: Combined context from tool executions

        Returns:
            Synthesized answer with citations
        """
        prompt = f"""You are an assistant that answers questions using the provided context from multiple tools.

Context from tools:
{context}

User Question: {query}

Instructions:
- Answer in friendly, concise language
- Include inline citations (p. N) for statements supported by sources
- If you cannot find direct support, say "I cannot confirm this from the available information"
- Synthesize information from different tools (vector search, graph search, etc.) cohesively

Answer:
"""

        messages = [
            {"role": "system", "content": PromptTemplates.system_message()},
            {"role": "user", "content": prompt}
        ]

        try:
            answer = await self._call_api(messages, temperature=0.3)
            return answer.strip()
        except Exception as e:
            logger.error(f"Failed to synthesize agent response: {e}")
            return "I encountered an error while processing your question. Please try again."

    async def extract_triples_llm(
        self,
        text: str,
        page_number: int,
        chunk_id: str
    ) -> List[Triple]:
        """
        Use LLM to extract triples directly (alternative to OpenIE)

        Args:
            text: Text to extract from
            page_number: Page number
            chunk_id: Chunk identifier

        Returns:
            List of extracted triples
        """
        prompt = f"""Extract key relationships from this text as subject-predicate-object triples.
Focus on important concepts, methods, definitions, and relationships.

Text (from page {page_number}):
{text}

Return a JSON array of triples, each with:
- subject: The subject entity
- predicate: The relationship/action
- object: The object entity
|
| 462 |
+
- confidence: Your confidence (0-1)
|
| 463 |
+
|
| 464 |
+
Output ONLY valid JSON array:
|
| 465 |
+
"""
|
| 466 |
+
|
| 467 |
+
messages = [
|
| 468 |
+
{"role": "system", "content": "You are an expert at knowledge extraction. Always output valid JSON."},
|
| 469 |
+
{"role": "user", "content": prompt}
|
| 470 |
+
]
|
| 471 |
+
|
| 472 |
+
try:
|
| 473 |
+
response = await self._call_api(messages, temperature=0.2, json_mode=True)
|
| 474 |
+
data = json.loads(response)
|
| 475 |
+
|
| 476 |
+
triples = []
|
| 477 |
+
for item in data if isinstance(data, list) else data.get("triples", []):
|
| 478 |
+
triple = Triple(
|
| 479 |
+
subject=item["subject"],
|
| 480 |
+
predicate=item["predicate"],
|
| 481 |
+
object=item["object"],
|
| 482 |
+
confidence=item.get("confidence", 0.7),
|
| 483 |
+
source_chunk_id=chunk_id,
|
| 484 |
+
page_number=page_number
|
| 485 |
+
)
|
| 486 |
+
triples.append(triple)
|
| 487 |
+
|
| 488 |
+
return triples
|
| 489 |
+
except Exception as e:
|
| 490 |
+
logger.error(f"Failed to extract triples: {e}")
|
| 491 |
+
return []
|
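For illustration, a minimal standalone sketch of the predicate-normalization step above (the mapping is a trimmed copy, and `normalize_predicate` is a hypothetical helper, not part of the commit):

    # Raw predicates are folded onto canonical RelationType values by the
    # correction map; unknown predicates pass through and later fall back
    # to "related_to" via the ValueError handler shown above.
    relation_corrections = {"utilizes": "uses", "creates": "produces", "improves": "enhances"}

    def normalize_predicate(raw: str) -> str:
        key = raw.strip().lower().replace(" ", "_")
        return relation_corrections.get(key, key)

    assert normalize_predicate("Utilizes") == "uses"
    assert normalize_predicate("improves") == "enhances"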
main.py
ADDED
@@ -0,0 +1,550 @@
"""
FastAPI Backend - Main Application
Provides REST API for PDF upload, graph retrieval, chat, and node details
"""
# Suppress PyTorch JIT warnings (harmless, just noisy during import)
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="torch")
warnings.filterwarnings("ignore", message="Unable to retrieve source")

from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse, JSONResponse
from loguru import logger
import sys
from pathlib import Path
import os
import uuid
import pickle
from datetime import datetime
from typing import List, Dict, Any

from config import settings, ensure_directories
from models import (
    UploadResponse, GraphResponse, ChatRequest, ChatResponse,
    NodeDetailResponse, AdminStatus, SourceCitation, GraphNode, GraphEdge
)
from pdf_processor import PDFProcessor
from embedding_service import EmbeddingService
from llm_service import LLMService
from gemini_extractor import GeminiExtractor
from graph_store import GraphStore
from graph_builder import GraphBuilder
from rag_agent import RAGAgent


# Configure logging
logger.remove()
logger.add(
    sys.stderr,
    level=settings.log_level,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan> | <level>{message}</level>"
)
logger.add(
    f"{settings.logs_dir}/app.log",
    rotation="500 MB",
    retention="10 days",
    level=settings.log_level
)

# Initialize services
ensure_directories()

app = FastAPI(
    title=settings.app_name,
    version=settings.app_version,
    description="PDF Knowledge Graph and RAG System"
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure appropriately for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global service instances
logger.info("Initializing PDFProcessor...")
pdf_processor = PDFProcessor()

logger.info("Initializing EmbeddingService...")
embedding_service = EmbeddingService()

logger.info("Initializing LLMService...")
llm_service = LLMService()

logger.info("Initializing GeminiExtractor (direct Gemini API)...")
triplet_extractor = GeminiExtractor(llm_service)

logger.info("Initializing GraphStore...")
graph_store = GraphStore(use_neo4j=False, embedding_service=embedding_service)

logger.info("Initializing GraphBuilder...")
graph_builder = GraphBuilder(graph_store, embedding_service)

logger.info("Initializing RAGAgent (LangGraph-based)...")
rag_agent = RAGAgent(graph_store, embedding_service, llm_service)

logger.info("✓ All services initialized successfully")

# In-memory storage for PDF metadata (use database in production)
pdf_metadata_store: Dict[str, Dict[str, Any]] = {}


@app.on_event("startup")
async def startup_event():
    """Run on application startup"""
    logger.info(f"Starting {settings.app_name} v{settings.app_version}")
    logger.info(f"Environment: {settings.environment}")

    # Try to load existing graph
    graph_path = os.path.join(settings.data_dir, "knowledge_graph.pkl")
    if os.path.exists(graph_path):
        try:
            graph_store.load(graph_path)
            logger.info("Loaded existing knowledge graph")
        except Exception as e:
            logger.warning(f"Failed to load existing graph: {e}")


@app.on_event("shutdown")
async def shutdown_event():
    """Run on application shutdown"""
    logger.info("Shutting down application")

    # Save graph
    graph_path = os.path.join(settings.data_dir, "knowledge_graph.pkl")
    try:
        graph_store.save(graph_path)
        logger.info("Saved knowledge graph")
    except Exception as e:
        logger.error(f"Failed to save graph: {e}")

    # Save FAISS index
    try:
        embedding_service.save()
        logger.info("Saved FAISS index")
    except Exception as e:
        logger.error(f"Failed to save FAISS index: {e}")


@app.get("/")
async def root():
    """Serve the frontend HTML"""
    return FileResponse("frontend/index.html")


@app.post("/upload", response_model=UploadResponse)
async def upload_pdf(
    file: UploadFile = File(...),
    background_tasks: BackgroundTasks = BackgroundTasks()
):
    """
    Upload a PDF and trigger ingestion pipeline

    Returns immediately with pdf_id, processes in background
    """
    # Validate file
    if not file.filename.endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    content = await file.read()
    file_size = len(content)

    if file_size > settings.max_file_size_bytes:
        raise HTTPException(
            status_code=400,
            detail=f"File size exceeds maximum of {settings.max_file_size_mb}MB"
        )

    # Generate PDF ID
    pdf_id = str(uuid.uuid4())

    # Save file
    filepath = os.path.join(settings.upload_dir, f"{pdf_id}.pdf")
    with open(filepath, 'wb') as f:
        f.write(content)

    logger.info(f"Uploaded PDF: {file.filename} (ID: {pdf_id})")

    # Store metadata with detailed progress tracking
    pdf_metadata_store[pdf_id] = {
        "filename": file.filename,
        "filepath": filepath,
        "status": "processing",
        "progress": {
            "stage": "starting",
            "message": "Upload complete, starting processing...",
            "percent": 0
        }
    }

    # Trigger background processing
    background_tasks.add_task(process_pdf_pipeline, pdf_id, filepath)

    return UploadResponse(
        pdf_id=pdf_id,
        filename=file.filename,
        status="processing",
        message="PDF uploaded successfully. Processing started in background."
    )


async def process_pdf_pipeline(pdf_id: str, filepath: str):
    """
    ⚡ OPTIMIZED: Full ingestion pipeline with progress tracking

    Steps:
    0. Clear existing graph and index (FRESH START)
    1. Extract chunks from PDF
    2. Create embeddings
    3. Add to vector index
    4. Extract triples (PARALLEL)
    5. Build knowledge graph (NO PRUNING)
    """
    def update_progress(stage: str, message: str, percent: int):
        """Update progress in metadata store"""
        if pdf_id in pdf_metadata_store:
            pdf_metadata_store[pdf_id]["progress"] = {
                "stage": stage,
                "message": message,
                "percent": percent
            }

    try:
        logger.info(f"Starting ingestion pipeline for PDF {pdf_id}")

        # Step 0: CLEAR EVERYTHING for fresh extraction
        update_progress("clearing", "Clearing previous data...", 5)
        logger.info("Step 0: Clearing existing graph and embeddings for fresh extraction")
        graph_store.clear()
        embedding_service.clear()
        logger.info("✓ Cleared all existing data")

        # Step 1: Extract chunks (with caching)
        cache_path = os.path.join(settings.data_dir, f"chunks_{pdf_id}.pkl")

        if os.path.exists(cache_path):
            # Load cached chunks (saves 2-3s on reindex)
            update_progress("extraction", "Loading cached text extraction...", 15)
            logger.info("⚡ Step 1: Loading cached chunks from previous extraction")
            with open(cache_path, 'rb') as f:
                cache_data = pickle.load(f)
            refined_chunks = cache_data['chunks']
            metadata = cache_data['metadata']
            logger.info(f"✓ Loaded {len(refined_chunks)} cached chunks (skipped PDF processing)")
            update_progress("extraction", f"Loaded {len(refined_chunks)} cached chunks", 25)
        else:
            # Extract and cache chunks for future reindexing
            update_progress("extraction", "Extracting text from PDF...", 15)
            logger.info("Step 1: Extracting chunks from PDF")
            chunks, metadata = pdf_processor.process_pdf(filepath, pdf_id)
            refined_chunks = pdf_processor.chunk_text(chunks)

            # Cache for future use
            with open(cache_path, 'wb') as f:
                pickle.dump({'chunks': refined_chunks, 'metadata': metadata}, f)
            logger.info(f"✓ Cached {len(refined_chunks)} chunks for future reindexing")
            update_progress("extraction", f"Extracted {len(refined_chunks)} chunks", 25)

        # Step 2: Create embeddings
        update_progress("embeddings", f"Creating embeddings for {len(refined_chunks)} chunks...", 35)
        logger.info(f"Step 2: Creating embeddings for {len(refined_chunks)} chunks")
        embeddings = embedding_service.create_embeddings(refined_chunks)
        update_progress("embeddings", "Embeddings created", 50)

        # Step 3: Add to vector index
        update_progress("indexing", "Building vector index...", 55)
        logger.info("Step 3: Adding to vector index")
        embedding_service.add_to_index(refined_chunks, embeddings)
        embedding_service.save()
        update_progress("indexing", "Vector index complete", 60)

        # Step 4: Extract triples using Gemini (direct API - PARALLEL)
        update_progress("extraction", "Extracting concepts with AI (parallel)...", 65)
        logger.info("Step 4: Extracting triples using Gemini (PARALLEL per-page, 2 concepts max)")
        canonical_triples = await triplet_extractor.extract_from_chunks(
            refined_chunks,
            use_llm=True  # Direct Gemini API calls
        )
        update_progress("extraction", f"Extracted {len(canonical_triples)} relationships", 80)

        # Step 5: Build graph
        update_progress("graph", "Building knowledge graph...", 85)
        logger.info("Step 5: Building knowledge graph")
        num_nodes, num_edges = await graph_builder.build_graph(canonical_triples)
        update_progress("graph", f"Graph complete: {num_nodes} nodes, {num_edges} edges", 95)

        # Save graph
        update_progress("saving", "Saving graph to disk...", 98)
        graph_path = os.path.join(settings.data_dir, "knowledge_graph.pkl")
        graph_store.save(graph_path)

        # Update metadata
        update_progress("completed", f"✓ Complete! {num_nodes} nodes, {num_edges} edges", 100)
        pdf_metadata_store[pdf_id]["status"] = "completed"
        pdf_metadata_store[pdf_id]["num_chunks"] = len(refined_chunks)
        pdf_metadata_store[pdf_id]["num_nodes"] = num_nodes
        pdf_metadata_store[pdf_id]["num_edges"] = num_edges

        logger.info(f"✓ Completed ingestion for PDF {pdf_id}: {num_nodes} nodes, {num_edges} edges")

    except Exception as e:
        logger.error(f"❌ Failed to process PDF {pdf_id}: {e}", exc_info=True)
        pdf_metadata_store[pdf_id]["status"] = "failed"
        pdf_metadata_store[pdf_id]["error"] = str(e)
        update_progress("error", f"Error: {str(e)[:100]}", 0)


@app.get("/graph", response_model=GraphResponse)
async def get_graph(pdf_id: str = None):
    """
    Get the knowledge graph

    Args:
        pdf_id: Optional filter by PDF ID

    Returns:
        Graph nodes and edges
    """
    nodes = graph_store.get_all_nodes()
    edges = graph_store.get_all_edges()

    logger.info(f"Returning {len(nodes)} nodes, {len(edges)} edges")

    # Filter by PDF if specified
    if pdf_id:
        # Filter nodes and edges that belong to this PDF
        # This requires tracking PDF ID in supporting chunks
        pass

    return GraphResponse(
        nodes=nodes,
        edges=edges,
        metadata={
            "total_nodes": len(nodes),
            "total_edges": len(edges)
        }
    )


@app.get("/node/{node_id}", response_model=NodeDetailResponse)
async def get_node_details(node_id: str):
    """
    Get detailed information about a node

    Includes:
    - Node metadata
    - LLM-generated summary with citations
    - Supporting chunks
    - Related nodes
    """
    node = graph_store.get_node(node_id)
    if not node:
        raise HTTPException(status_code=404, detail="Node not found")

    # Check if summary is cached in node metadata
    if "cached_summary" in node.metadata:
        logger.info(f"✓ Using cached summary for node {node.label}")
        summary = node.metadata["cached_summary"]
        search_results = None  # Use node's supporting chunks for sources
    else:
        # Generate summary (first time)
        logger.info(f"⏳ Generating summary for node {node.label}...")

        # Get supporting chunks using semantic search on the node label
        # This finds chunks that are semantically similar to the concept
        search_results = embedding_service.search(
            query=node.label,
            top_k=3  # Reduced from 5 to 3 for faster processing
        )

        # Prepare chunks for LLM
        chunks_for_llm = []
        if search_results:
            chunks_for_llm = [
                {
                    "page_number": meta.get("page_number", 0),
                    "text": meta.get("text", "")
                }
                for meta, score in search_results
            ]

        # Fallback: if no chunks found, create a basic summary
        if not chunks_for_llm:
            logger.warning(f"No chunks found for node {node.label}, using basic summary")
            chunks_for_llm = [
                {
                    "page_number": chunk.page_number or 0,
                    "text": chunk.snippet or ""
                }
                for chunk in node.supporting_chunks[:3]
            ]

        # Generate summary
        summary = await llm_service.summarize_node(node.label, chunks_for_llm)

        # Cache summary in node metadata (don't cache search_results - they're not serializable)
        node.metadata["cached_summary"] = summary
        node.metadata["cache_timestamp"] = str(datetime.utcnow())

        # Update the node in the graph store
        graph_store.update_node(node)
        logger.info(f"✓ Cached summary for node {node.label}")

    # Get related nodes
    neighbors = graph_store.get_neighbors(node_id)
    related_nodes = [
        {
            "node_id": neighbor.node_id,
            "label": neighbor.label,
            "relation": edge.relation.value,
            "confidence": edge.confidence
        }
        for neighbor, edge in neighbors[:10]  # Limit to top 10
    ]

    # Build source citations
    sources = []
    if search_results is not None:
        # Use search results (freshly generated summary)
        for meta, score in search_results[:5]:
            text = meta.get("text", "")
            snippet = text[:120] + "..." if len(text) > 120 else text
            sources.append(SourceCitation(
                page_number=meta.get("page_number", 0),
                snippet=snippet,
                chunk_id=meta.get("chunk_id", ""),
                score=score
            ))
    else:
        # Use node's supporting chunks (cached summary)
        sources = [
            SourceCitation(
                page_number=chunk.page_number or 0,
                snippet=chunk.snippet or "",
                chunk_id=chunk.chunk_id,
                score=chunk.score
            )
            for chunk in node.supporting_chunks[:5]
        ]

    return NodeDetailResponse(
        node_id=node.node_id,
        label=node.label,
        type=node.type,
        summary=summary,
        sources=sources,
        related_nodes=related_nodes
    )


@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """
    Agent-based RAG chat endpoint

    Uses LangGraph agent with multiple tools:
    - vector_search: Semantic search through chunks
    - graph_search: Find concepts in knowledge graph
    - get_node_details: Get detailed node information
    - get_related_nodes: Graph traversal for relationships
    - get_chunk_by_id: Retrieve specific chunks

    The agent intelligently decides which tools to use based on the query
    """
    logger.info(f"🤖 Agent chat request: '{request.query}'")

    # Use agent-based RAG
    response = await rag_agent.chat(
        query=request.query,
        pdf_id=request.pdf_id,
        include_citations=True
    )

    # Limit sources to requested max
    if len(response.sources) > request.max_sources:
        response.sources = response.sources[:request.max_sources]

    return response


@app.get("/status/{pdf_id}")
async def get_pdf_status(pdf_id: str):
    """Get processing status for a specific PDF"""
    if pdf_id not in pdf_metadata_store:
        raise HTTPException(status_code=404, detail="PDF not found")

    metadata = pdf_metadata_store[pdf_id]
    return {
        "pdf_id": pdf_id,
        "filename": metadata.get("filename"),
        "status": metadata.get("status"),
        "progress": metadata.get("progress", {}),
        "num_nodes": metadata.get("num_nodes", 0),
        "num_edges": metadata.get("num_edges", 0),
        "error": metadata.get("error")
    }


@app.get("/admin/status", response_model=AdminStatus)
async def admin_status():
    """Get system status and statistics"""
    faiss_stats = embedding_service.get_stats()

    return AdminStatus(
        total_pdfs=len(pdf_metadata_store),
        total_chunks=faiss_stats["num_chunks"],
        total_nodes=len(graph_store.get_all_nodes()),
        total_edges=len(graph_store.get_all_edges()),
        vector_index_size=faiss_stats["total_vectors"],
        recent_logs=[]  # Would fetch from logs in production
    )


@app.post("/admin/reindex")
async def admin_reindex(pdf_id: str):
    """Re-run ingestion for a PDF"""
    if pdf_id not in pdf_metadata_store:
        raise HTTPException(status_code=404, detail="PDF not found")

    filepath = pdf_metadata_store[pdf_id]["filepath"]

    # Clear existing data for this PDF (would need better tracking)
    # For now, just re-run the pipeline

    await process_pdf_pipeline(pdf_id, filepath)

    return {"message": "Reindexing started", "pdf_id": pdf_id}


@app.post("/admin/clear")
async def admin_clear():
    """Clear all data"""
    graph_store.clear()
    embedding_service.clear()
    pdf_metadata_store.clear()

    logger.warning("All data cleared by admin")

    return {"message": "All data cleared"}


# Mount static files for frontend
if os.path.exists("frontend"):
    app.mount("/static", StaticFiles(directory="frontend"), name="static")


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "main:app",
        host=settings.api_host,
        port=settings.api_port,
        reload=settings.debug,
        log_level=settings.log_level.lower()
    )
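For orientation, a minimal client-side sketch of the upload-then-poll flow these endpoints expose (the base URL, the sample file name, and the use of `requests` are assumptions for illustration, not part of the commit):

    import time
    import requests  # any HTTP client works; assumed here for brevity

    BASE = "http://localhost:8000"  # hypothetical host/port

    # POST /upload returns immediately; processing runs in the background.
    with open("paper.pdf", "rb") as f:
        pdf_id = requests.post(f"{BASE}/upload", files={"file": f}).json()["pdf_id"]

    # Poll GET /status/{pdf_id} until the pipeline reports completion or failure.
    while True:
        status = requests.get(f"{BASE}/status/{pdf_id}").json()
        print(status["progress"].get("message"))
        if status["status"] in ("completed", "failed"):
            break
        time.sleep(2)

    # POST /chat runs the agent-based RAG over the indexed document.
    reply = requests.post(f"{BASE}/chat", json={"query": "What is the main method?", "pdf_id": pdf_id}).json()
    print(reply["answer"])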
modal_app.py
ADDED
@@ -0,0 +1,50 @@
"""
GraphLLM - Modal Deployment
Serverless ML deployment with auto-scaling
"""
import modal

# Create Modal app
app = modal.App("graphllm")

# Define the container image with all dependencies
image = (
    modal.Image.debian_slim(python_version="3.12")
    .apt_install("tesseract-ocr", "ghostscript", "gcc", "g++")
    .pip_install_from_requirements("requirements.txt")
)

# Create persistent volume for data storage
volume = modal.Volume.from_name("graphllm-data", create_if_missing=True)

# Mount FastAPI app
@app.function(
    image=image,
    gpu=None,  # Use CPU (cheaper)
    memory=4096,  # 4GB RAM
    timeout=600,  # 10 min timeout
    volumes={"/app/data": volume},
    secrets=[modal.Secret.from_name("graphllm-secrets")],  # GEMINI_API_KEY
)
@modal.asgi_app()
def fastapi_app():
    """
    Mount the FastAPI application
    """
    import sys
    sys.path.insert(0, "/root")

    # Import main FastAPI app
    from main import app as fastapi_app

    return fastapi_app


# Local testing endpoint
@app.local_entrypoint()
def main():
    """
    Test the deployment locally
    """
    print("GraphLLM deployed to Modal!")
    print("Access your app at: https://YOUR_USERNAME--graphllm-fastapi-app.modal.run")
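Assuming the standard Modal workflow (not stated in the commit itself): with the Modal CLI installed and authenticated, this file would typically be published with `modal deploy modal_app.py`, or iterated on locally with `modal serve modal_app.py`. The `graphllm-secrets` secret must already exist in the Modal workspace so that GEMINI_API_KEY is available inside the container.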
models.py
ADDED
@@ -0,0 +1,236 @@
"""
Data models for GraphLLM system following the manual specifications
"""
from pydantic import BaseModel, Field
from typing import Optional, List, Dict, Any, Literal
from datetime import datetime
from enum import Enum
import uuid


# Enums
class ChunkType(str, Enum):
    """Types of chunks extracted from PDF"""
    PARAGRAPH = "paragraph"
    CODE = "code"
    TABLE = "table"
    IMAGE = "image"
    IMAGE_TEXT = "image_text"


class NodeType(str, Enum):
    """Types of graph nodes"""
    CONCEPT = "concept"
    PERSON = "person"
    METHOD = "method"
    TERM = "term"
    CLASS = "class"
    FUNCTION = "function"
    ENTITY = "entity"


class RelationType(str, Enum):
    """Canonical relation types for edges"""
    IS_A = "is_a"
    PART_OF = "part_of"
    METHOD_OF = "method_of"
    CAUSES = "causes"
    USES = "uses"
    RELATED_TO = "related_to"
    DEFINED_AS = "defined_as"
    DEPENDS_ON = "depends_on"
    IMPLEMENTS = "implements"
    SIMILAR_TO = "similar_to"
    OBSERVES = "observes"
    MEASURES = "measures"
    PRODUCES = "produces"
    CONTAINS = "contains"
    AFFECTS = "affects"
    ENABLES = "enables"
    REQUIRES = "requires"
    INTERACTS_WITH = "interacts_with"
    ENRICHES = "enriches"
    ENHANCES = "enhances"
    SUPPORTS = "supports"
    DESCRIBES = "describes"
    EXPLAINS = "explains"
    REFERS_TO = "refers_to"
    ASSOCIATED_WITH = "associated_with"


# Core Data Models

class Chunk(BaseModel):
    """Individual chunk of text/content from PDF"""
    chunk_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    pdf_id: str
    page_number: int
    char_range: tuple[int, int]
    type: ChunkType
    text: str
    table_json: Optional[Dict[str, Any]] = None
    image_id: Optional[str] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)
    created_at: datetime = Field(default_factory=datetime.utcnow)


class EmbeddingEntry(BaseModel):
    """Vector embedding for a chunk"""
    chunk_id: str
    embedding: List[float]
    created_at: datetime = Field(default_factory=datetime.utcnow)
    metadata: Dict[str, Any] = Field(default_factory=dict)


class SupportingChunk(BaseModel):
    """Reference to a chunk supporting a node or edge"""
    chunk_id: str
    score: float
    page_number: Optional[int] = None
    snippet: Optional[str] = None


class GraphNode(BaseModel):
    """Node in the knowledge graph"""
    node_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    label: str
    type: NodeType
    aliases: List[str] = Field(default_factory=list)
    supporting_chunks: List[SupportingChunk] = Field(default_factory=list)
    importance_score: float = 0.0
    metadata: Dict[str, Any] = Field(default_factory=dict)
    created_at: datetime = Field(default_factory=datetime.utcnow)


class GraphEdge(BaseModel):
    """Edge in the knowledge graph"""
    edge_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    from_node: str = Field(alias="from")
    to_node: str = Field(alias="to")
    relation: RelationType
    confidence: float
    supporting_chunks: List[SupportingChunk] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    created_at: datetime = Field(default_factory=datetime.utcnow)

    class Config:
        populate_by_name = True
        # FastAPI automatically serializes enums as their string values in JSON


class Triple(BaseModel):
    """Extracted triple from text"""
    subject: str
    predicate: str
    object: str
    confidence: float = 1.0
    source_chunk_id: Optional[str] = None
    page_number: Optional[int] = None
    justification: Optional[str] = None


class CanonicalTriple(BaseModel):
    """LLM-canonicalized triple"""
    subject_label: str
    object_label: str
    relation: RelationType
    confidence: float
    justification: str
    page_number: int


# API Request/Response Models

class UploadResponse(BaseModel):
    """Response from PDF upload"""
    pdf_id: str
    filename: str
    status: str
    message: str
    num_pages: Optional[int] = None
    num_chunks: Optional[int] = None


class GraphResponse(BaseModel):
    """Response containing graph data"""
    nodes: List[GraphNode]
    edges: List[GraphEdge]
    metadata: Dict[str, Any] = Field(default_factory=dict)


class SourceCitation(BaseModel):
    """Source citation with page number and snippet"""
    page_number: int
    snippet: str
    chunk_id: str
    score: Optional[float] = None


class NodeDetailResponse(BaseModel):
    """Response for node detail request"""
    node_id: str
    label: str
    type: NodeType
    summary: str
    sources: List[SourceCitation]
    related_nodes: List[Dict[str, Any]] = Field(default_factory=list)
    raw_chunks: Optional[List[Chunk]] = None


class ChatMessage(BaseModel):
    """Chat message"""
    role: Literal["user", "assistant", "system"]
    content: str
    sources: Optional[List[SourceCitation]] = None
    timestamp: datetime = Field(default_factory=datetime.utcnow)


class ChatRequest(BaseModel):
    """Chat request"""
    query: str
    pdf_id: str
    include_citations: bool = True
    max_sources: int = 5


class ChatResponse(BaseModel):
    """Chat response with answer and citations"""
    answer: str
    sources: List[SourceCitation]
    context_chunks: Optional[List[str]] = None


class PDFMetadata(BaseModel):
    """Metadata for uploaded PDF"""
    pdf_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    filename: str
    filepath: str
    num_pages: int
    file_size_bytes: int
    upload_timestamp: datetime = Field(default_factory=datetime.utcnow)
    processing_status: str = "pending"
    num_chunks: int = 0
    num_nodes: int = 0
    num_edges: int = 0
    metadata: Dict[str, Any] = Field(default_factory=dict)


class IngestionLog(BaseModel):
    """Log entry for ingestion process"""
    log_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    pdf_id: str
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    stage: str
    status: str
    message: str
    details: Optional[Dict[str, Any]] = None


class AdminStatus(BaseModel):
    """Admin status response"""
    total_pdfs: int
    total_chunks: int
    total_nodes: int
    total_edges: int
    vector_index_size: int
    recent_logs: List[IngestionLog]
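One detail worth a quick sketch (assuming pydantic v2, which the `populate_by_name` config suggests; the snippet is illustrative, not part of the commit): `GraphEdge` stores its endpoints as `from_node`/`to_node` in Python but serializes them under the JSON-friendly aliases `from`/`to`, and accepts either spelling on input.

    edge = GraphEdge(from_node="n1", to_node="n2",
                     relation=RelationType.USES, confidence=0.9)
    print(edge.model_dump(by_alias=True))  # {'from': 'n1', 'to': 'n2', ...}

    # populate_by_name=True lets the alias form validate as well:
    GraphEdge(**{"from": "n1", "to": "n2", "relation": "uses", "confidence": 0.9})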
pdf_processor.py
ADDED
@@ -0,0 +1,325 @@
"""
PDF Ingestion & Preprocessing Module
Handles extraction of text, tables, code blocks, and images from PDFs
"""
import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from PIL import Image
import io
import re
from typing import List, Dict, Any, Optional, Tuple
from loguru import logger
from models import Chunk, ChunkType, PDFMetadata
from config import settings
import uuid


class PDFProcessor:
    """
    Comprehensive PDF processor that extracts:
    - Page-level text with character ranges
    - Tables (structured)
    - Code blocks (detected heuristically)
    - Images (with OCR)
    """

    def __init__(self):
        self.code_patterns = [
            re.compile(r'```[\s\S]*?```'),  # Markdown code blocks
            re.compile(r'def\s+\w+\s*\('),  # Python functions
            re.compile(r'class\s+\w+\s*[:\(]'),  # Python/Java classes
            re.compile(r'function\s+\w+\s*\('),  # JavaScript functions
            re.compile(r'public\s+class\s+\w+'),  # Java classes
        ]

    def process_pdf(self, filepath: str, pdf_id: str) -> Tuple[List[Chunk], PDFMetadata]:
        """
        Main entry point: process entire PDF and return chunks + metadata

        Args:
            filepath: Path to PDF file
            pdf_id: Unique identifier for this PDF

        Returns:
            Tuple of (chunks list, metadata object)
        """
        logger.info(f"Processing PDF: {filepath}")

        chunks: List[Chunk] = []

        # Open with PyMuPDF for text and images
        pdf_doc = fitz.open(filepath)
        num_pages = len(pdf_doc)

        # Open with pdfplumber for tables
        with pdfplumber.open(filepath) as plumber_pdf:
            for page_num in range(num_pages):
                logger.debug(f"Processing page {page_num + 1}/{num_pages}")

                # Extract from PyMuPDF
                fitz_page = pdf_doc[page_num]
                page_chunks = self._process_page(
                    fitz_page=fitz_page,
                    plumber_page=plumber_pdf.pages[page_num],
                    page_num=page_num + 1,  # 1-indexed
                    pdf_id=pdf_id
                )
                chunks.extend(page_chunks)

        pdf_doc.close()

        # Create metadata
        import os
        file_size = os.path.getsize(filepath)
        metadata = PDFMetadata(
            pdf_id=pdf_id,
            filename=os.path.basename(filepath),
            filepath=filepath,
            num_pages=num_pages,
            file_size_bytes=file_size,
            num_chunks=len(chunks),
            processing_status="completed"
        )

        logger.info(f"Extracted {len(chunks)} chunks from {num_pages} pages")
        return chunks, metadata

    def _process_page(
        self,
        fitz_page,
        plumber_page,
        page_num: int,
        pdf_id: str
    ) -> List[Chunk]:
        """Process a single page and return all chunks"""
        chunks: List[Chunk] = []

        # 1. Extract raw text with character positions
        page_text = fitz_page.get_text("text")

        # 2. Extract tables
        table_chunks = self._extract_tables(plumber_page, page_num, pdf_id)
        chunks.extend(table_chunks)

        # 3. Extract code blocks
        code_chunks = self._extract_code_blocks(page_text, page_num, pdf_id)
        chunks.extend(code_chunks)

        # 4. Extract images and run OCR
        image_chunks = self._extract_images(fitz_page, page_num, pdf_id)
        chunks.extend(image_chunks)

        # 5. Extract remaining text as paragraphs
        # Remove table and code regions from text before creating paragraph chunks
        cleaned_text = self._remove_extracted_regions(
            page_text,
            [c.text for c in code_chunks]
        )

        if cleaned_text.strip():
            para_chunk = Chunk(
                chunk_id=str(uuid.uuid4()),
                pdf_id=pdf_id,
                page_number=page_num,
                char_range=(0, len(cleaned_text)),
                type=ChunkType.PARAGRAPH,
                text=cleaned_text,
                metadata={"source": "text_extraction"}
            )
            chunks.append(para_chunk)

        return chunks

    def _extract_tables(self, plumber_page, page_num: int, pdf_id: str) -> List[Chunk]:
        """Extract tables from page using pdfplumber"""
        chunks = []
        tables = plumber_page.extract_tables()

        for idx, table in enumerate(tables):
            if not table:
                continue

            # Convert table to structured JSON
            table_json = self._table_to_json(table)

            # Convert table to text representation
            table_text = self._table_to_text(table)

            chunk = Chunk(
                chunk_id=str(uuid.uuid4()),
                pdf_id=pdf_id,
                page_number=page_num,
                char_range=(0, len(table_text)),
                type=ChunkType.TABLE,
                text=table_text,
                table_json=table_json,
                metadata={"table_index": idx, "num_rows": len(table)}
            )
            chunks.append(chunk)

        logger.debug(f"Extracted {len(chunks)} tables from page {page_num}")
        return chunks

    def _table_to_json(self, table: List[List[str]]) -> Dict[str, Any]:
        """Convert table to structured JSON"""
        if not table or len(table) < 2:
            return {"headers": [], "rows": []}

        headers = table[0]
        rows = table[1:]

        return {
            "headers": headers,
            "rows": [
                {headers[i]: cell for i, cell in enumerate(row) if i < len(headers)}
                for row in rows
            ]
        }

    def _table_to_text(self, table: List[List[str]]) -> str:
        """Convert table to readable text"""
        return "\n".join([" | ".join([str(cell) for cell in row]) for row in table])

    def _extract_code_blocks(self, text: str, page_num: int, pdf_id: str) -> List[Chunk]:
        """Extract code blocks using heuristic patterns"""
        chunks = []

        # Look for code patterns
        for pattern in self.code_patterns:
            matches = pattern.finditer(text)
            for match in matches:
                code_text = match.group(0)
                if len(code_text) < 20:  # Skip very short matches
                    continue

                chunk = Chunk(
                    chunk_id=str(uuid.uuid4()),
                    pdf_id=pdf_id,
                    page_number=page_num,
                    char_range=(match.start(), match.end()),
                    type=ChunkType.CODE,
                    text=code_text,
                    metadata={
                        "pattern": pattern.pattern,
                        "detected_language": self._detect_language(code_text)
                    }
                )
                chunks.append(chunk)

        # Also detect monospace font regions (if PDF has font info)
        # This is more advanced and would require font analysis

        logger.debug(f"Extracted {len(chunks)} code blocks from page {page_num}")
        return chunks

    def _detect_language(self, code: str) -> str:
        """Heuristically detect programming language"""
        if 'def ' in code and ':' in code:
            return 'python'
        elif 'function' in code or 'const' in code or 'let' in code:
            return 'javascript'
        elif 'public class' in code or 'private' in code:
            return 'java'
        elif '#include' in code:
            return 'c++'
        else:
            return 'unknown'

    def _extract_images(self, fitz_page, page_num: int, pdf_id: str) -> List[Chunk]:
        """Extract images and run OCR"""
        chunks = []
        image_list = fitz_page.get_images()

        for img_index, img in enumerate(image_list):
            try:
                xref = img[0]
                base_image = fitz_page.parent.extract_image(xref)
                image_bytes = base_image["image"]

                # Convert to PIL Image
                image = Image.open(io.BytesIO(image_bytes))

                # Run OCR
                ocr_text = pytesseract.image_to_string(image)

                if ocr_text.strip():
                    image_id = f"{pdf_id}_p{page_num}_img{img_index}"

                    chunk = Chunk(
                        chunk_id=str(uuid.uuid4()),
                        pdf_id=pdf_id,
                        page_number=page_num,
                        char_range=(0, len(ocr_text)),
                        type=ChunkType.IMAGE_TEXT,
                        text=ocr_text,
                        image_id=image_id,
                        metadata={
                            "image_format": base_image["ext"],
                            "image_index": img_index
                        }
                    )
                    chunks.append(chunk)
            except Exception as e:
                logger.warning(f"Failed to extract image {img_index} on page {page_num}: {e}")

        logger.debug(f"Extracted {len(chunks)} images from page {page_num}")
        return chunks

    def _remove_extracted_regions(self, text: str, code_blocks: List[str]) -> str:
        """Remove already-extracted code blocks from text"""
        for code in code_blocks:
            text = text.replace(code, "")
        return text

    def chunk_text(self, chunks: List[Chunk]) -> List[Chunk]:
        """
        Further chunk large text blocks into smaller overlapping chunks

        Args:
            chunks: Initial chunks from PDF extraction

        Returns:
            Refined chunks with proper overlap
        """
        refined_chunks = []

        for chunk in chunks:
            # Skip non-text chunks (tables, images already chunked)
            if chunk.type in [ChunkType.TABLE, ChunkType.CODE]:
                refined_chunks.append(chunk)
                continue

            # Split long paragraphs into smaller chunks with overlap
            text = chunk.text
            chunk_size = settings.chunk_size
            overlap = settings.chunk_overlap

            if len(text) <= chunk_size:
                refined_chunks.append(chunk)
                continue

            # Create overlapping windows
            for i in range(0, len(text), chunk_size - overlap):
                chunk_text = text[i:i + chunk_size]

                if len(chunk_text) < settings.min_chunk_size:
                    continue

                new_chunk = Chunk(
                    chunk_id=str(uuid.uuid4()),
                    pdf_id=chunk.pdf_id,
                    page_number=chunk.page_number,
                    char_range=(i, i + len(chunk_text)),
                    type=chunk.type,
                    text=chunk_text,
                    metadata={
                        **chunk.metadata,
                        "parent_chunk_id": chunk.chunk_id,
                        "window_index": i // (chunk_size - overlap)
                    }
                )
                refined_chunks.append(new_chunk)

        logger.info(f"Refined {len(chunks)} chunks into {len(refined_chunks)} chunks")
        return refined_chunks
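The windowing in `chunk_text` steps by `chunk_size - overlap`, so consecutive windows share `overlap` characters, and tail windows shorter than `min_chunk_size` are dropped. A standalone sketch of just that arithmetic (the settings values here are made-up, not the project's defaults):

    text = "x" * 2500
    chunk_size, overlap, min_chunk = 1000, 200, 100  # hypothetical values

    windows = []
    for i in range(0, len(text), chunk_size - overlap):  # step = 800
        piece = text[i:i + chunk_size]
        if len(piece) >= min_chunk:  # mirrors the min_chunk_size filter above
            windows.append((i, i + len(piece)))

    print(windows)  # [(0, 1000), (800, 1800), (1600, 2500), (2400, 2500)]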
rag_agent.py
ADDED
@@ -0,0 +1,485 @@
"""
Agent-Based RAG System using LangGraph
Provides intelligent query answering with tool use and multi-hop reasoning
"""
from typing import List, Dict, Any, Optional
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, END, START
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage
from loguru import logger

from models import SourceCitation, ChatResponse
from graph_store import GraphStore
from embedding_service import EmbeddingService
from llm_service import LLMService


class AgentState(TypedDict):
    """State for the RAG agent workflow"""
    messages: List  # Conversation history
    query: str  # Current user question
    pdf_id: str  # PDF context
    tool_results: Dict[str, Any]  # Results from tool executions
    reasoning_steps: List[str]  # Agent's reasoning process
    final_answer: str  # Synthesized answer
    citations: List[SourceCitation]  # Supporting citations
    next_action: str  # What to do next


class RAGAgent:
    """
    Intelligent RAG agent that uses multiple tools to answer questions

    Tools available:
    1. vector_search - Semantic search through document chunks
    2. graph_search - Find concepts in knowledge graph
    3. get_node_details - Get detailed info about a graph node
    4. get_related_nodes - Traverse graph relationships
    5. get_chunk_by_id - Retrieve specific chunks for citations
    """

    def __init__(self,
                 graph_store: GraphStore,
                 embedding_service: EmbeddingService,
                 llm_service: LLMService):
        """Initialize the RAG agent with necessary services"""
        self.graph_store = graph_store
        self.embedding_service = embedding_service
        self.llm_service = llm_service

        # Build LangGraph workflow
        self.workflow = self._build_workflow()
        self.app = self.workflow.compile()

        logger.info("✓ RAG Agent initialized with LangGraph workflow")

    def _create_tools(self):
        """Create tool functions for the agent"""

        @tool
        def vector_search(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
            """
            Search document chunks using semantic similarity.

            Args:
                query: The search query
                top_k: Number of results to return

            Returns:
                List of relevant chunks with metadata and scores
            """
            logger.info(f"🔍 Tool: vector_search('{query}', top_k={top_k})")

            try:
                results = self.embedding_service.search(
                    query=query,
                    top_k=top_k
                )

                formatted_results = []
                for metadata, score in results:
                    formatted_results.append({
                        "text": metadata.get("text", ""),
                        "page_number": metadata.get("page_number", 0),
                        "chunk_id": metadata.get("chunk_id", ""),
                        "score": float(score)
                    })

                logger.info(f"  ✓ Found {len(formatted_results)} chunks")
                return formatted_results

            except Exception as e:
                logger.error(f"  ✗ vector_search failed: {e}")
                return []

        @tool
        def graph_search(concept: str) -> Optional[Dict[str, Any]]:
            """
            Find a concept node in the knowledge graph.

            Args:
                concept: The concept to search for

            Returns:
                Node information if found, None otherwise
            """
            logger.info(f"🔍 Tool: graph_search('{concept}')")

            try:
                node = self.graph_store.get_node_by_label(concept)

                if node:
                    logger.info(f"  ✓ Found node: {node.label}")
                    return {
                        "node_id": node.node_id,
                        "label": node.label,
                        "type": node.type.value if hasattr(node.type, 'value') else node.type,
                        "importance": node.importance_score
                    }
                else:
                    logger.info(f"  ✗ No node found for '{concept}'")
                    return None

            except Exception as e:
                logger.error(f"  ✗ graph_search failed: {e}")
                return None

        @tool
        def get_node_details(node_id: str) -> Optional[Dict[str, Any]]:
            """
            Get detailed information about a graph node.

            Args:
                node_id: The ID of the node

            Returns:
                Detailed node information including supporting chunks
            """
            logger.info(f"🔍 Tool: get_node_details('{node_id}')")

            try:
                node = self.graph_store.get_node(node_id)

                if not node:
                    logger.info("  ✗ Node not found")
                    return None

                # Get supporting chunks
                chunks = []
                for chunk in node.supporting_chunks[:5]:  # Top 5
                    chunks.append({
                        "page_number": chunk.page_number,
                        "snippet": chunk.snippet,
                        "score": chunk.score
                    })

                logger.info(f"  ✓ Got details for {node.label}")
                return {
                    "label": node.label,
                    "type": node.type.value if hasattr(node.type, 'value') else node.type,
                    "importance": node.importance_score,
                    "supporting_chunks": chunks
                }

            except Exception as e:
                logger.error(f"  ✗ get_node_details failed: {e}")
                return None

        @tool
        def get_related_nodes(node_id: str, max_neighbors: int = 5) -> List[Dict[str, Any]]:
            """
            Get nodes related to a given node (graph traversal).

            Args:
                node_id: The ID of the starting node
                max_neighbors: Maximum number of related nodes to return

            Returns:
                List of related nodes with relationship information
            """
            logger.info(f"🔍 Tool: get_related_nodes('{node_id}', max={max_neighbors})")

            try:
                neighbors = self.graph_store.get_neighbors(node_id)

                related = []
                for neighbor_node, edge in neighbors[:max_neighbors]:
                    relation_value = edge.relation.value if hasattr(edge.relation, 'value') else edge.relation
                    related.append({
                        "node_id": neighbor_node.node_id,
                        "label": neighbor_node.label,
                        "relation": relation_value,
                        "confidence": edge.confidence
                    })

                logger.info(f"  ✓ Found {len(related)} related nodes")
                return related

            except Exception as e:
                logger.error(f"  ✗ get_related_nodes failed: {e}")
                return []

        @tool
        def get_chunk_by_id(chunk_id: str) -> Optional[Dict[str, Any]]:
            """
            Retrieve a specific chunk by its ID (for detailed citations).

            Args:
                chunk_id: The chunk identifier

            Returns:
                Chunk content and metadata
            """
            logger.info(f"🔍 Tool: get_chunk_by_id('{chunk_id}')")

            try:
                # Search by chunk_id in metadata
                # This is a simplified version - you may need to implement proper chunk lookup
                results = self.embedding_service.search_by_chunk_ids([chunk_id], top_k=1)

                if results:
                    metadata, score = results[0]
                    logger.info("  ✓ Found chunk")
                    return {
                        "text": metadata.get("text", ""),
                        "page_number": metadata.get("page_number", 0),
                        "chunk_id": chunk_id
                    }
                else:
                    logger.info("  ✗ Chunk not found")
                    return None

            except Exception as e:
                logger.error(f"  ✗ get_chunk_by_id failed: {e}")
                return None

        return [vector_search, graph_search, get_node_details, get_related_nodes, get_chunk_by_id]

    def _build_workflow(self) -> StateGraph:
        """Build the LangGraph workflow for the agent"""

        workflow = StateGraph(AgentState)

        # Define workflow nodes
        workflow.add_node("plan", self._plan_node)
        workflow.add_node("execute_tools", self._execute_tools_node)
        workflow.add_node("synthesize", self._synthesize_node)

        # Define edges
        workflow.add_edge(START, "plan")
        workflow.add_conditional_edges(
            "plan",
            self._should_use_tools,
            {
                "tools": "execute_tools",
                "direct": "synthesize"
            }
        )
        workflow.add_edge("execute_tools", "synthesize")
        workflow.add_edge("synthesize", END)

        return workflow

    def _plan_node(self, state: AgentState) -> AgentState:
        """Agent decides which tools to use"""
        logger.info("🤖 Agent: Planning which tools to use...")

        query = state["query"]

        # Simple heuristic-based planning (can be enhanced with an LLM)
        tools_to_use = []
        reasoning = []

        # Always use vector search for semantic matching
        tools_to_use.append("vector_search")
        reasoning.append("Use vector search for semantic document retrieval")

        # Check if the query mentions relationships between concepts (use graph)
        if any(word in query.lower() for word in ["relate", "connection", "link", "between"]):
            tools_to_use.append("graph_search")
            reasoning.append("Query asks about relationships - use graph search")

        # Check if asking about a specific concept
        if (any(phrase in query.lower() for phrase in ["what is", "define", "explain"])
                and "graph_search" not in tools_to_use):
            tools_to_use.append("graph_search")
            reasoning.append("Query asks for concept definition - check graph")

        state["tool_results"] = {"planned_tools": tools_to_use}
        state["reasoning_steps"] = reasoning
        state["next_action"] = "tools" if tools_to_use else "direct"

        logger.info(f"  Plan: {tools_to_use}")
        return state

    def _should_use_tools(self, state: AgentState) -> str:
        """Decide if tools are needed"""
        return state.get("next_action", "direct")

    def _execute_tools_node(self, state: AgentState) -> AgentState:
        """Execute the planned tools"""
        logger.info("🔧 Agent: Executing tools...")

        query = state["query"]
        planned_tools = state["tool_results"].get("planned_tools", [])
        results = {}

        # Create tools (tool_fn avoids shadowing the imported @tool decorator)
        tools_map = {}
        for tool_fn in self._create_tools():
            tools_map[tool_fn.name] = tool_fn

        # Execute tools
        if "vector_search" in planned_tools:
            vector_tool = tools_map["vector_search"]
            results["vector_results"] = vector_tool.invoke({"query": query, "top_k": 5})

        if "graph_search" in planned_tools:
            # Extract main concept from query (simplified)
            # In production, use NER or an LLM to extract the concept
            words = query.lower().split()
            potential_concepts = [w for w in words if len(w) > 4 and w not in ["what", "how", "does", "relate"]]

            for concept in potential_concepts[:2]:  # Try first 2
                graph_tool = tools_map["graph_search"]
                node_result = graph_tool.invoke({"concept": concept})
                if node_result:
                    results[f"graph_node_{concept}"] = node_result

                    # Get related nodes
                    related_tool = tools_map["get_related_nodes"]
                    related = related_tool.invoke({"node_id": node_result["node_id"], "max_neighbors": 3})
                    results[f"related_{concept}"] = related
                    break

        state["tool_results"].update(results)
        logger.info(f"  ✓ Executed {len(planned_tools)} tools, got {len(results)} results")
        return state

    async def _synthesize_node(self, state: AgentState) -> AgentState:
        """Synthesize final answer from tool results"""
        logger.info("🎯 Agent: Synthesizing answer...")

        query = state["query"]
        tool_results = state["tool_results"]

        # Prepare context from tool results
        context_parts = []
        citations = []

        # Add vector search results
        if "vector_results" in tool_results:
            vector_results = tool_results["vector_results"]
            for i, result in enumerate(vector_results[:3]):  # Top 3
                context_parts.append(f"[Source {i+1}, p.{result['page_number']}]: {result['text']}")
                citations.append(SourceCitation(
                    page_number=result["page_number"],
                    snippet=result["text"][:120] + "..." if len(result["text"]) > 120 else result["text"],
                    chunk_id=result["chunk_id"],
                    score=result["score"]
                ))

        # Add graph results
        for key, value in tool_results.items():
            if key.startswith("graph_node_"):
                context_parts.append(f"[Graph Node]: '{value['label']}' is a {value['type']} (importance: {value['importance']:.2f})")
            elif key.startswith("related_"):
                if value:
                    relations = ", ".join([f"{r['label']} ({r['relation']})" for r in value])
                    context_parts.append(f"[Related Concepts]: {relations}")

        # Create context for LLM
        context = "\n\n".join(context_parts)

        # Generate answer using Gemini
        answer = await self.llm_service.agent_synthesize(query, context)

        state["final_answer"] = answer
        state["citations"] = citations

        logger.info("  ✓ Answer synthesized")
        return state

    async def chat(self, query: str, pdf_id: Optional[str] = None, include_citations: bool = True) -> ChatResponse:
        """
        Main entry point for agent-based chat

        Args:
            query: User's question
            pdf_id: Optional PDF context
            include_citations: Whether to include source citations

        Returns:
            ChatResponse with answer and citations
        """
        logger.info(f"\n{'='*80}")
        logger.info(f"🤖 Agent-Based RAG Query: '{query}'")
        logger.info(f"{'='*80}")

        # Initialize state
        initial_state = {
            "messages": [HumanMessage(content=query)],
            "query": query,
            "pdf_id": pdf_id or "",
            "tool_results": {},
            "reasoning_steps": [],
            "final_answer": "",
            "citations": [],
            "next_action": ""
        }

        try:
            # Run workflow
            final_state = await self.app.ainvoke(initial_state)

            # Extract results
            answer = final_state.get("final_answer", "I couldn't generate an answer.")
            citations = final_state.get("citations", [])

            if not include_citations:
                citations = []

            logger.info("✓ Agent completed successfully")
            logger.info(f"  Answer length: {len(answer)} chars")
            logger.info(f"  Citations: {len(citations)}")
            logger.info(f"{'='*80}\n")

            return ChatResponse(
                answer=answer,
                sources=citations[:5]  # Top 5 citations
            )

        except Exception as e:
            # logger.exception records the traceback (loguru ignores exc_info=True)
            logger.exception(f"❌ Agent failed: {e}")

            # Fallback to simple vector search
            logger.warning("Falling back to simple RAG...")
            return await self._fallback_simple_rag(query, pdf_id)

    async def _fallback_simple_rag(self, query: str, pdf_id: Optional[str] = None) -> ChatResponse:
        """Fallback to simple RAG if the agent fails"""
        try:
            results = self.embedding_service.search(query=query, top_k=5, filter_pdf_id=pdf_id)

            if not results:
                return ChatResponse(
                    answer="I couldn't find relevant information to answer your question.",
                    sources=[]
                )

            # Prepare context
            context_chunks = [
                {
                    "page_number": meta.get("page_number", 0),
                    "text": meta.get("text", "")
                }
                for meta, score in results[:3]
            ]

            # Generate answer
            answer = await self.llm_service.rag_chat(query, context_chunks)

            # Format sources
            sources = []
            for meta, score in results[:5]:
                text = meta.get("text", "")
                snippet = text[:120] + "..." if len(text) > 120 else text
                sources.append(SourceCitation(
                    page_number=meta.get("page_number", 0),
                    snippet=snippet,
                    chunk_id=meta.get("chunk_id", ""),
                    score=score
                ))

            return ChatResponse(answer=answer, sources=sources)

        except Exception as e:
            logger.error(f"Fallback RAG also failed: {e}")
            return ChatResponse(
                answer="I encountered an error processing your question.",
                sources=[]
            )
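Note: a minimal usage sketch for RAGAgent (the zero-argument EmbeddingService() and LLMService() constructors are assumptions here; GraphStore(use_neo4j=False) matches the tests below):

import asyncio

from embedding_service import EmbeddingService
from graph_store import GraphStore
from llm_service import LLMService
from rag_agent import RAGAgent

# Hypothetical wiring; assumes an already-indexed embedding store.
agent = RAGAgent(
    graph_store=GraphStore(use_neo4j=False),
    embedding_service=EmbeddingService(),
    llm_service=LLMService(),
)

response = asyncio.run(agent.chat("How does machine learning relate to neural networks?"))
print(response.answer)
for source in response.sources:
    print(f"p.{source.page_number}: {source.snippet}")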
requirements.txt
ADDED
@@ -0,0 +1,59 @@
# PDF Processing
PyMuPDF
pdfplumber
pytesseract
Pillow
camelot-py[cv]
tabula-py

# NLP & Embeddings
sentence-transformers
transformers
torch>=2.2.0
keybert
yake

# Knowledge Graph Generation
kg-gen
litellm

# Multi-Agent System with LangGraph
langgraph
langchain
langchain-core
langchain-community

# Vector Store & Search
faiss-cpu

# Graph Database & Processing
neo4j
networkx

# Backend & API
fastapi
uvicorn[standard]
python-multipart
pydantic
pydantic-settings

# Database
sqlalchemy
psycopg2-binary
pymongo

# Utilities
python-dotenv
loguru
tenacity
httpx
aiofiles

# Monitoring & DevOps
prometheus-client
python-json-logger

# Testing
pytest
pytest-asyncio
pytest-cov
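Note: only torch carries a version floor; everything else is unpinned. After pip install -r requirements.txt, a quick smoke check that the heavyweight dependencies resolve (a throwaway sketch, not part of the repo):

# Each import maps to a requirements entry; an ImportError points at a missing wheel.
import faiss              # faiss-cpu
import fitz               # PyMuPDF
import langgraph
import networkx
import torch
from fastapi import FastAPI

print("core dependencies OK; torch", torch.__version__)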
tests/__init__.py
ADDED
@@ -0,0 +1 @@
# Tests package
tests/test_basic.py
ADDED
@@ -0,0 +1,106 @@
"""
Basic tests for GraphLLM components
"""
import pytest
from models import Chunk, ChunkType, GraphNode, GraphEdge, Triple, NodeType, RelationType
from config import settings


def test_chunk_creation():
    """Test chunk model creation"""
    chunk = Chunk(
        pdf_id="test-pdf",
        page_number=1,
        char_range=(0, 100),
        type=ChunkType.PARAGRAPH,
        text="This is a test chunk."
    )

    assert chunk.pdf_id == "test-pdf"
    assert chunk.page_number == 1
    assert chunk.type == ChunkType.PARAGRAPH
    assert chunk.text == "This is a test chunk."


def test_graph_node_creation():
    """Test graph node creation"""
    node = GraphNode(
        label="Test Concept",
        type=NodeType.CONCEPT,
        aliases=["test", "concept"],
        supporting_chunks=[],
        importance_score=0.75
    )

    assert node.label == "Test Concept"
    assert node.type == NodeType.CONCEPT
    assert node.importance_score == 0.75


def test_graph_edge_creation():
    """Test graph edge creation"""
    edge = GraphEdge(
        from_node="node1",
        to_node="node2",
        relation=RelationType.USES,
        confidence=0.8,
        supporting_chunks=[]
    )

    assert edge.from_node == "node1"
    assert edge.to_node == "node2"
    assert edge.relation == RelationType.USES
    assert edge.confidence == 0.8


def test_triple_creation():
    """Test triple model"""
    triple = Triple(
        subject="Machine Learning",
        predicate="uses",
        object="Neural Networks",
        confidence=0.9,
        page_number=5
    )

    assert triple.subject == "Machine Learning"
    assert triple.predicate == "uses"
    assert triple.object == "Neural Networks"
    assert triple.confidence == 0.9


def test_settings_load():
    """Test configuration loading"""
    assert settings.app_name == "GraphLLM"
    assert settings.chunk_size > 0
    assert settings.embedding_model is not None


@pytest.mark.asyncio
async def test_pdf_processor_import():
    """Test PDF processor can be imported"""
    from pdf_processor import PDFProcessor
    processor = PDFProcessor()
    assert processor is not None


@pytest.mark.asyncio
async def test_embedding_service_import():
    """Test embedding service can be imported"""
    from embedding_service import EmbeddingService
    # Note: instantiating the service loads the model and may take time,
    # so construction is skipped here:
    # service = EmbeddingService()
    # assert service is not None
    assert EmbeddingService is not None


@pytest.mark.asyncio
async def test_graph_store_import():
    """Test graph store can be imported"""
    from graph_store import GraphStore
    store = GraphStore(use_neo4j=False)
    assert store is not None


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
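Note: pytest-cov is already in requirements.txt, so the same suite can be driven with coverage; a sketch equivalent to running pytest tests/ -v --cov=. --cov-report=term-missing from the repo root:

import pytest

raise SystemExit(pytest.main(["tests/", "-v", "--cov=.", "--cov-report=term-missing"]))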