Upload 25 files
Browse files

- .dockerignore +78 -0
- .gitattributes +1 -0
- Dockerfile +35 -9
- README_Docker.md +305 -0
- README_Streamlit_App.md +300 -0
- docker-compose.yml +28 -0
- nz-legislation.txt +3 -0
- requirements.txt +9 -3
- run_streamlit_app.py +176 -0
- streamlit_app/app.py +732 -0
- streamlit_app/core/__pycache__/cache_manager.cpython-312.pyc +0 -0
- streamlit_app/core/__pycache__/dataset_builder.cpython-312.pyc +0 -0
- streamlit_app/core/__pycache__/llm_analyzer.cpython-312.pyc +0 -0
- streamlit_app/core/__pycache__/text_processor.cpython-312.pyc +0 -0
- streamlit_app/core/cache_manager.py +505 -0
- streamlit_app/core/dataset_builder.py +649 -0
- streamlit_app/core/llm_analyzer.py +469 -0
- streamlit_app/core/text_processor.py +377 -0
- streamlit_app/utils/__pycache__/config.cpython-312.pyc +0 -0
- streamlit_app/utils/__pycache__/performance.cpython-312.pyc +0 -0
- streamlit_app/utils/__pycache__/ui_helpers.cpython-312.pyc +0 -0
- streamlit_app/utils/config.py +241 -0
- streamlit_app/utils/performance.py +271 -0
- streamlit_app/utils/ui_helpers.py +415 -0
- test_app_imports.py +178 -0
- trl copy.py +532 -0
.dockerignore
ADDED
@@ -0,0 +1,78 @@

# Version control
.git
.gitignore

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Documentation (will be copied if needed)
*.md
!README_Streamlit_App.md

# Test files (will be copied if needed)
test_app_imports.py

# Original CLI script (replaced by Streamlit app)
trl.py
trl copy.py

# Cache and temporary files
*.log
.cache
.temp

# Model files (will be mounted or downloaded at runtime)
*.gguf
*.bin

# Node modules (if any)
node_modules/

# Docker files
Dockerfile
docker-compose.yml
.dockerignore
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+nz-legislation.txt filter=lfs diff=lfs merge=lfs -text
Dockerfile
CHANGED
@@ -1,20 +1,46 @@

# Use Python 3.11 slim as base image for the NZ Legislation Loophole Analysis Streamlit App
FROM python:3.11-slim

# Install system dependencies required for llama-cpp-python compilation and general app functionality
RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    git \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements file and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire Streamlit application
COPY streamlit_app/ ./streamlit_app/

# Copy data files (if needed for testing or default data)
COPY nz-legislation.txt ./

# Create necessary directories for the Streamlit app
RUN mkdir -p \
    streamlit_app/cache \
    streamlit_app/config \
    streamlit_app/datasets \
    streamlit_app/logs \
    streamlit_app/uploads \
    nz_legislation_dataset

# Set environment variables for Streamlit
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
ENV STREAMLIT_SERVER_PORT=8501
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0

# Expose the Streamlit port
EXPOSE 8501

# Set working directory to the Streamlit app
WORKDIR /app/streamlit_app

# Set the default command to run the Streamlit application
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README_Docker.md
ADDED
@@ -0,0 +1,305 @@

# Docker Setup for NZ Legislation Loophole Analysis Streamlit App

This guide explains how to run the NZ Legislation Loophole Analysis Streamlit App using Docker.

## Prerequisites

- Docker installed on your system
- Docker Compose (recommended for easier management)
- At least 4GB of available RAM (8GB recommended for optimal performance)

## Quick Start

### Method 1: Using Docker Compose (Recommended)

```bash
# Clone or navigate to the project directory
cd /path/to/nz-legislation-analyzer

# Build and run the application
docker-compose up --build

# Or run in detached mode
docker-compose up -d --build
```

The application will be available at: **http://localhost:8501**

### Method 2: Using Docker Directly

```bash
# Build the Docker image
docker build -t nz-legislation-analyzer .

# Run the container
docker run -p 8501:8501 \
  -v $(pwd)/streamlit_app/cache:/app/streamlit_app/cache \
  -v $(pwd)/streamlit_app/config:/app/streamlit_app/config \
  -v $(pwd)/streamlit_app/datasets:/app/streamlit_app/datasets \
  -v $(pwd)/nz-legislation.txt:/app/nz-legislation.txt:ro \
  nz-legislation-analyzer
```

## Directory Structure

When using Docker, the following directories are created and can be persisted:

```
streamlit_app/
├── cache/      # Persistent cache for processed chunks
├── config/     # Application configuration files
├── datasets/   # Generated datasets and results
├── logs/       # Application logs
└── uploads/    # Uploaded files (if any)
```

## Configuration

### Environment Variables

| Variable | Default | Description |
|----------|---------|-------------|
| `STREAMLIT_SERVER_HEADLESS` | `true` | Run in headless mode |
| `STREAMLIT_SERVER_PORT` | `8501` | Streamlit server port |
| `STREAMLIT_SERVER_ADDRESS` | `0.0.0.0` | Server bind address |

### Volume Mounts

The Docker setup includes the following volume mounts for data persistence:

- `./streamlit_app/cache:/app/streamlit_app/cache` - Cache persistence
- `./streamlit_app/config:/app/streamlit_app/config` - Configuration files
- `./streamlit_app/datasets:/app/streamlit_app/datasets` - Generated datasets
- `./streamlit_app/logs:/app/streamlit_app/logs` - Application logs
- `./nz-legislation.txt:/app/nz-legislation.txt:ro` - Input data (read-only)

## Docker Commands

### Building the Image

```bash
# Build with no cache
docker build --no-cache -t nz-legislation-analyzer .

# Build with specific Dockerfile
docker build -f Dockerfile -t nz-legislation-analyzer .
```

### Running the Container

```bash
# Interactive mode
docker run -it --rm -p 8501:8501 nz-legislation-analyzer

# Background mode
docker run -d -p 8501:8501 nz-legislation-analyzer

# With custom environment variables
docker run -p 8501:8501 \
  -e STREAMLIT_SERVER_PORT=8502 \
  nz-legislation-analyzer
```

### Docker Compose Commands

```bash
# Start services
docker-compose up

# Start in background
docker-compose up -d

# Stop services
docker-compose down

# Rebuild and start
docker-compose up --build

# View logs
docker-compose logs -f

# Scale services (if needed)
docker-compose up -d --scale nz-legislation-analyzer=2
```

## Monitoring and Logs

### Viewing Logs

```bash
# Docker Compose logs
docker-compose logs -f nz-legislation-analyzer

# Docker logs
docker logs -f <container_id>

# Follow logs in real-time
docker-compose logs -f --tail=100
```

### Health Checks

The Docker Compose setup includes health checks that monitor the Streamlit application:

```yaml
healthcheck:
  test: ["CMD", "curl", "-f", "http://localhost:8501/healthz"]
  interval: 30s
  timeout: 10s
  retries: 3
  start_period: 40s
```
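
Once the stack is running, the status reported by this health check can be read back with standard Docker commands (the container name below depends on your project directory name):

```bash
# The status column shows "(healthy)" / "(unhealthy)" once the check has run
docker ps --format '{{.Names}}\t{{.Status}}'

# Or query the health state directly for a specific container
docker inspect --format '{{.State.Health.Status}}' <container_id_or_name>
```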

## Troubleshooting

### Common Issues

1. **Port Already in Use**
   ```bash
   # Change the port mapping
   docker run -p 8502:8501 nz-legislation-analyzer
   # Or with docker-compose, modify the ports section (see the override sketch after this list)
   ```

2. **Memory Issues**
   ```bash
   # Increase Docker memory allocation
   # Docker Desktop: Settings > Resources > Memory
   # Or add memory limits to docker-compose.yml (see the override sketch after this list)
   ```

3. **Model Loading Errors**
   - Ensure sufficient RAM (8GB+ recommended)
   - Check that model files are accessible
   - Verify model path in configuration

4. **Permission Issues**
   ```bash
   # Fix directory permissions
   sudo chown -R $USER:$USER streamlit_app/
   ```

5. **Cache Issues**
   ```bash
   # Clear persistent cache
   sudo rm -rf streamlit_app/cache/*
   docker-compose restart
   ```
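
For issues 1 and 2 above, one minimal approach is a `docker-compose.override.yml`, which Docker Compose merges automatically. The values below are examples only, and whether `mem_limit` or `deploy.resources.limits` is honoured depends on your Compose version:

```yaml
# docker-compose.override.yml (example values, adjust to your system)
services:
  nz-legislation-analyzer:
    ports:
      - "8502:8501"   # publish on a different host port
    mem_limit: 8g     # cap container memory (Compose v2, non-swarm)
```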

### Debug Mode

Enable debug logging by modifying the environment:

```bash
# Add to docker-compose.yml environment section
- PYTHONPATH=/app
- LOG_LEVEL=DEBUG
```

## Updates and Maintenance

### Updating the Application

```bash
# Pull latest changes
git pull

# Rebuild the image
docker-compose build --no-cache

# Restart services
docker-compose up -d
```

### Backup Important Data

```bash
# Backup cache and configuration
tar -czf backup.tar.gz streamlit_app/cache/ streamlit_app/config/

# Backup datasets
tar -czf datasets_backup.tar.gz streamlit_app/datasets/
```

### Cleaning Up

```bash
# Remove containers and volumes
docker-compose down -v

# Remove images
docker rmi nz-legislation-analyzer

# Clean up unused Docker resources
docker system prune -a
```

## Advanced Configuration

### Custom Model Files

To use custom model files:

1. **Mount model directory:**
   ```yaml
   volumes:
     - ./models:/app/models:ro
   ```

2. **Update configuration** in the Streamlit app to point to `/app/models/your-model.gguf`

### GPU Support (Optional)

For GPU acceleration with CUDA:

```dockerfile
# Use CUDA-enabled base image
FROM nvidia/cuda:11.8-devel-ubuntu22.04

# Install Python and dependencies
# ... (additional setup for CUDA)
```

Note: GPU support requires additional configuration and CUDA-compatible hardware.
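
A fuller sketch of that idea follows. Treat it as an untested assumption rather than a supported configuration: the CUDA image tag and the `CMAKE_ARGS` flag used to build `llama-cpp-python` with GPU support vary between versions (older releases use `-DLLAMA_CUBLAS=on`), and the container must also be started with GPU access (for example `docker run --gpus all ...`).

```dockerfile
# Hypothetical CUDA variant (verify the image tag and build flags for your versions)
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

RUN apt-get update && apt-get install -y python3 python3-pip build-essential cmake git \
    && rm -rf /var/lib/apt/lists/*

# Build llama-cpp-python against CUDA instead of the CPU-only wheel
ENV CMAKE_ARGS="-DGGML_CUDA=on"
RUN pip3 install --no-cache-dir llama-cpp-python

# The remaining steps mirror the CPU Dockerfile above (COPY, ENV, EXPOSE, CMD)
```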

## Security Considerations

- The application runs in headless mode by default
- All data is stored locally in mounted volumes
- No external network access is required for basic functionality
- Consider implementing authentication for production deployments

## Performance Optimization

### Memory Management

- Default cache size: 1024MB (configurable in app settings)
- Adjust based on available system memory
- Monitor memory usage through the app's Performance dashboard

### Disk I/O

- Use SSD storage for better performance
- Ensure adequate disk space for cache and datasets
- Consider using tmpfs for temporary processing

### Network

- The application binds to all interfaces (`0.0.0.0`)
- Access via `localhost` or container IP
- No external dependencies required

## Support

For Docker-specific issues:

1. Check Docker logs: `docker-compose logs`
2. Verify Docker installation and version
3. Ensure adequate system resources
4. Review the main application logs in `streamlit_app/logs/`

For application-specific issues, refer to the main documentation in `README_Streamlit_App.md`.

---

**Happy analyzing with your containerized NZ Legislation Loophole Analysis Streamlit App!**
README_Streamlit_App.md
ADDED
@@ -0,0 +1,300 @@

# NZ Legislation Loophole Analysis Streamlit App

A modern, AI-powered web application for analyzing New Zealand legislation to identify potential loopholes, ambiguities, and unintended consequences.

## Features

### AI-Powered Analysis
- **Legal Expertise**: Specialized analysis for NZ legislation with Treaty of Waitangi references
- **Multiple Analysis Types**: Standard, Detailed, and Comprehensive analysis modes
- **Intelligent Chunking**: Sentence-aware text splitting with overlap for context preservation

### Context Memory Cache System
- **Smart Caching**: Hash-based chunk identification prevents re-processing identical content
- **Multi-level Storage**: In-memory LRU cache with optional SQLite persistence
- **Performance Boost**: Significant speed improvements for large documents and batch processing
- **Cache Management**: View statistics, export/import cache, and set TTL limits

### Modern Web Interface
- **Multi-page Layout**: Organized navigation with Home, Upload, Analysis, Settings, and Performance pages
- **Real-time Progress**: Live progress bars and processing status updates
- **Interactive Dashboards**: Performance metrics, cache statistics, and analysis results
- **Responsive Design**: Works on desktop and mobile devices

### Advanced Analytics
- **Quality Metrics**: Confidence scoring and analysis quality assessment
- **Performance Monitoring**: Memory usage, CPU utilization, and processing times
- **Batch Processing**: Handle multiple legislation files simultaneously
- **Export Options**: Multiple formats (JSON, CSV, Excel) with metadata

## Quick Start

### Prerequisites
```bash
# Python 3.8 or higher
python --version

# Install dependencies
pip install -r requirements.txt
```

### Running the Application
```bash
# Method 1: Use the run script (recommended)
python run_streamlit_app.py

# Method 2: Direct Streamlit command
cd streamlit_app
streamlit run app.py
```

The app will be available at: **http://localhost:8501**

## Project Structure

```
streamlit_app/
├── app.py                    # Main Streamlit application
├── core/
│   ├── cache_manager.py      # Context memory cache system
│   ├── text_processor.py     # Text cleaning and chunking
│   ├── llm_analyzer.py       # LLM integration and analysis
│   └── dataset_builder.py    # Dataset creation and export
├── utils/
│   ├── config.py             # Configuration management
│   ├── performance.py        # Performance monitoring
│   └── ui_helpers.py         # UI components and formatting
├── pages/                    # Multi-page navigation
├── assets/                   # Custom styling and assets
└── cache/                    # Cache storage directory
```

## Configuration

### Model Configuration
The app supports both local GGUF models and HuggingFace models:

```python
# Local model
model_path = "path/to/your/model.gguf"

# HuggingFace model
repo_id = "DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF"
filename = "model-file-name.gguf"
```
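
As a rough illustration of how these two settings map onto llama-cpp-python (not taken from this repo's code; check parameter names against your installed version), loading either kind of model might look like this:

```python
from llama_cpp import Llama

# Local GGUF file
llm = Llama(model_path="path/to/your/model.gguf", n_ctx=4096)

# Hugging Face repo + filename (downloads the GGUF via huggingface_hub)
llm = Llama.from_pretrained(
    repo_id="DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF",
    filename="model-file-name.gguf",  # placeholder name kept from the example above
    n_ctx=4096,
)
```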

### Cache Configuration
```python
cache_config = {
    'enabled': True,        # Enable/disable caching
    'max_size_mb': 1024,    # Maximum memory for cache
    'ttl_hours': 24,        # Time-to-live for cached entries
    'persistent': True      # Use disk persistence
}
```

### Processing Configuration
```python
processing_config = {
    'chunk_size': 4096,     # Size of text chunks
    'chunk_overlap': 256,   # Overlap between chunks
    'batch_size': 16,       # Number of chunks to process at once
    'clean_text': True      # Apply text cleaning
}
```
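
To make `chunk_size` and `chunk_overlap` concrete, here is a minimal character-based sketch of overlapping chunking; the app's `TextProcessor.chunk_text` is sentence-aware, so this is a simplification, not its implementation:

```python
from typing import List

def chunk_with_overlap(text: str, chunk_size: int = 4096, overlap: int = 256) -> List[str]:
    """Naive overlapping chunker: each chunk starts chunk_size - overlap
    characters after the previous one, so neighbouring chunks share context."""
    step = chunk_size - overlap
    return [text[i:i + chunk_size] for i in range(0, max(len(text), 1), step)]

# 10,000 characters -> chunks starting at 0, 3840, 7680 (3 chunks, first is 4096 chars)
chunks = chunk_with_overlap("x" * 10_000)
print(len(chunks), len(chunks[0]))  # 3 4096
```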

## Usage Guide

### 1. Home Page
- Overview of the application capabilities
- Current configuration status
- Quick start guide

### 2. Upload & Process Page
- **File Upload**: Support for JSON lines, JSON arrays, and raw text files
- **Configuration**: Adjust model, processing, and analysis parameters
- **Batch Processing**: Upload multiple files for simultaneous analysis
- **Real-time Progress**: Monitor processing status and performance

### 3. Analysis Results Page
- **Results Overview**: Summary metrics and statistics
- **Detailed Analysis**: Expandable results with confidence scores
- **Export Options**: Download results in multiple formats
- **Quality Metrics**: Analysis quality assessment and recommendations

### 4. Settings Page
- **Model Settings**: Configure LLM parameters and model paths
- **Processing Settings**: Adjust text processing parameters
- **Cache Settings**: Manage cache behavior and persistence
- **UI Settings**: Customize interface appearance

### 5. Performance Dashboard
- **Real-time Metrics**: Memory usage, CPU utilization, processing speed
- **Performance History**: Charts showing performance over time
- **Cache Statistics**: Hit rates, evictions, and cache efficiency
- **System Information**: Hardware and software details
- **Performance Recommendations**: Automated suggestions for optimization

## Advanced Features

### Cache Management
```python
from core.cache_manager import get_cache_manager

# Get cache instance
cache = get_cache_manager()

# View statistics
stats = cache.get_stats()
print(f"Hit Rate: {stats['hit_rate']:.1f}%")

# Clear cache
cache.clear_cache()

# Export cache
cache.export_cache('cache_backup.json')
```

### Custom Analysis Templates
The app supports custom analysis templates for different legal domains:

```python
# Define custom template
custom_template = {
    'name': 'Commercial Law Analysis',
    'depth': 'Detailed',
    'focus_areas': [
        'contractual loopholes',
        'commercial implications',
        'regulatory compliance',
        'enforcement mechanisms'
    ]
}
```

### Performance Optimization
- **Memory Management**: Automatic cache eviction based on memory limits
- **Batch Processing**: Optimized for large document collections
- **Concurrent Processing**: Thread-safe operations for multi-user scenarios
- **Progress Callbacks**: Real-time progress updates during long operations

## API Reference

### Core Classes

#### CacheManager
```python
class CacheManager:
    def get(self, content, model_config, processing_config) -> Optional[Dict]
    def put(self, content, analysis_result, model_config, processing_config)
    def get_stats(self) -> Dict[str, Any]
    def clear_cache(self)
    def export_cache(self, filepath: str) -> bool
    def import_cache(self, filepath: str) -> int
```

#### TextProcessor
```python
class TextProcessor:
    def clean_text(self, text: str, preserve_structure: bool = True) -> str
    def chunk_text(self, text: str, chunk_size: int = 4096, overlap: int = 256) -> List[str]
    def extract_metadata(self, text: str) -> Dict[str, Any]
    def preprocess_legislation_json(self, json_data: Dict) -> Dict
```

#### LLMAnalyzer
```python
class LLMAnalyzer:
    def analyze_chunk(self, chunk: str, analysis_type: str = 'standard') -> Dict[str, Any]
    def batch_analyze_chunks(self, chunks: List[str], analysis_type: str = 'standard') -> List[Dict]
    def load_model(self) -> bool
    def unload_model(self)
```
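
A hypothetical sketch of how these classes fit together, based only on the signatures above (the real call sites live in `streamlit_app/app.py` and `core/`; constructors are shown without arguments, which may not match the actual classes):

```python
from core.cache_manager import CacheManager      # run from inside streamlit_app/
from core.text_processor import TextProcessor
from core.llm_analyzer import LLMAnalyzer

processor = TextProcessor()
analyzer = LLMAnalyzer()
cache = CacheManager()

model_config = {'path': 'path/to/your/model.gguf'}              # illustrative configs
processing_config = {'chunk_size': 4096, 'chunk_overlap': 256}

with open('../nz-legislation.txt', encoding='utf-8') as f:      # any legislation text
    clean = processor.clean_text(f.read())

analyzer.load_model()
for chunk in processor.chunk_text(clean, chunk_size=4096, overlap=256):
    cached = cache.get(chunk, model_config, processing_config)  # reuse prior analysis if present
    if cached is None:
        cached = analyzer.analyze_chunk(chunk, analysis_type='standard')
        cache.put(chunk, cached, model_config, processing_config)
analyzer.unload_model()
```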

## Analysis Output Format

Each analysis result contains:

```json
{
  "chunk": "original text chunk",
  "analysis_type": "standard|detailed|comprehensive",
  "model_config": {...},
  "structured_analysis": {
    "text_meaning": "explanation of text purpose",
    "key_assumptions": ["list of assumptions"],
    "exploitable_interpretations": ["potential interpretations"],
    "critical_loopholes": ["identified loopholes"],
    "circumvention_strategies": ["exploitation methods"],
    "recommendations": ["suggested fixes"],
    "confidence_score": 85,
    "analysis_quality": "high|medium|low"
  },
  "processing_time": 2.34,
  "chunk_size": 4096,
  "word_count": 512
}
```
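
Because results follow this shape, exported JSON can be post-processed with pandas (already in `requirements.txt`); the file name below is just a placeholder for whatever you export from the Results page:

```python
import json
import pandas as pd

with open("analysis_results.json", encoding="utf-8") as f:  # placeholder export name
    results = json.load(f)

# Flatten nested structured_analysis.* fields into columns
df = pd.json_normalize(results)
print(df[["structured_analysis.confidence_score", "processing_time"]].describe())
```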

## Troubleshooting

### Common Issues

1. **Model Loading Errors**
   - Ensure model file exists and is accessible
   - Check model format (GGUF required)
   - Verify sufficient RAM for model loading

2. **Cache Performance Issues**
   - Clear cache if memory usage is high
   - Adjust cache size limits in settings
   - Check persistent cache database integrity

3. **Processing Slowdowns**
   - Reduce batch size for large documents
   - Increase chunk overlap for better context
   - Consider using a more powerful model

4. **Memory Errors**
   - Reduce cache size in settings
   - Process files individually instead of batch
   - Monitor memory usage in performance dashboard

### Debug Mode
Enable debug mode in settings for detailed logging:
```python
# In settings, enable debug mode
debug_mode = True
log_level = "DEBUG"
```

## Contributing

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests if applicable
5. Submit a pull request

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Support

For support and questions:
- Check the troubleshooting section above
- Review the performance recommendations in the app
- Examine the logs in the `streamlit_app/logs/` directory

## Migration from Original Script

If you're migrating from the original `trl.py` script:

1. **Configuration**: Settings are now managed through the UI
2. **Output**: Results are displayed in the web interface
3. **Caching**: Automatic caching with no manual intervention needed
4. **Batch Processing**: Multiple files can be uploaded simultaneously
5. **Progress Tracking**: Real-time progress bars and status updates

The new app maintains all functionality of the original script while providing a modern, user-friendly interface and significant performance improvements through intelligent caching.
docker-compose.yml
ADDED
@@ -0,0 +1,28 @@

version: '3.8'

services:
  nz-legislation-analyzer:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "8501:8501"
    volumes:
      # Mount directories for persistent data
      - ./streamlit_app/cache:/app/streamlit_app/cache
      - ./streamlit_app/config:/app/streamlit_app/config
      - ./streamlit_app/datasets:/app/streamlit_app/datasets
      - ./streamlit_app/logs:/app/streamlit_app/logs
      - ./nz-legislation.txt:/app/nz-legislation.txt:ro
    environment:
      - STREAMLIT_SERVER_HEADLESS=true
      - STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
      - STREAMLIT_SERVER_PORT=8501
      - STREAMLIT_SERVER_ADDRESS=0.0.0.0
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8501/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
nz-legislation.txt
ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:e671ba88cfc0d52bf03dcc089e67c6f73fa1ce7680cef6cf860bab1b5809e8e1
size 112806614
requirements.txt
CHANGED
@@ -1,3 +1,9 @@

llama-cpp-python
psutil
numpy
streamlit>=1.28.0
streamlit-extras>=0.3.0
plotly>=5.15.0
pandas>=2.0.0
streamlit-aggrid>=0.3.0
streamlit-ace>=0.1.1
run_streamlit_app.py
ADDED
@@ -0,0 +1,176 @@

#!/usr/bin/env python3
"""
NZ Legislation Loophole Analysis Streamlit App Runner

This script runs the modern Streamlit application for analyzing New Zealand legislation
to identify potential loopholes, ambiguities, and unintended consequences using AI.

Features:
- Advanced UI with multi-page layout
- Context memory cache system for improved performance
- Real-time progress monitoring
- Interactive results visualization
- Batch processing capabilities
- Comprehensive configuration management

Usage:
    python run_streamlit_app.py

Requirements:
- All dependencies from requirements.txt must be installed
- Run from the project root directory
"""

import os
import sys
import subprocess
from pathlib import Path

def check_requirements():
    """Check if all required packages are installed"""
    required_packages = [
        'streamlit',
        'pandas',
        'plotly',
        'llama-cpp-python',
        'psutil',
        'numpy'
    ]

    missing_packages = []

    for package in required_packages:
        try:
            __import__(package.replace('-', '_'))
        except ImportError:
            missing_packages.append(package)

    if missing_packages:
        print("Missing required packages:")
        for package in missing_packages:
            print(f"  - {package}")

        print("\nInstalling missing packages...")
        try:
            subprocess.check_call([
                sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'
            ])
            print("All packages installed successfully!")
        except subprocess.CalledProcessError:
            print("Failed to install packages. Please install manually:")
            print("  pip install -r requirements.txt")
            return False

    return True

def check_app_structure():
    """Check if the app structure is correct"""
    app_dir = Path('streamlit_app')
    required_files = [
        'app.py',
        'core/cache_manager.py',
        'core/text_processor.py',
        'core/llm_analyzer.py',
        'core/dataset_builder.py',
        'utils/config.py',
        'utils/performance.py',
        'utils/ui_helpers.py'
    ]

    missing_files = []

    for file_path in required_files:
        full_path = app_dir / file_path
        if not full_path.exists():
            missing_files.append(str(full_path))

    if missing_files:
        print("Missing app files:")
        for file_path in missing_files:
            print(f"  - {file_path}")
        return False

    print("App structure is complete!")
    return True

def create_directories():
    """Create necessary directories"""
    directories = [
        'streamlit_app/cache',
        'streamlit_app/config',
        'streamlit_app/datasets',
        'streamlit_app/logs'
    ]

    for dir_path in directories:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {dir_path}")

def setup_environment():
    """Setup environment variables and configuration"""
    # Add current directory to Python path for imports
    current_dir = os.path.dirname(os.path.abspath(__file__))
    if current_dir not in sys.path:
        sys.path.insert(0, current_dir)

    # Set environment variables
    os.environ.setdefault('STREAMLIT_SERVER_HEADLESS', 'true')
    os.environ.setdefault('STREAMLIT_BROWSER_GATHER_USAGE_STATS', 'false')

    print("Environment setup complete!")

def run_app():
    """Run the Streamlit application"""
    print("\nStarting NZ Legislation Loophole Analyzer...")
    print("=" * 60)
    print("Access the app at: http://localhost:8501")
    print("Press Ctrl+C to stop the application")
    print("=" * 60)

    try:
        # Change to app directory
        os.chdir('streamlit_app')

        # Run Streamlit
        subprocess.run([
            sys.executable, '-m', 'streamlit', 'run', 'app.py',
            '--server.port', '8501',
            '--server.address', '0.0.0.0',
            '--theme.base', 'light'
        ])

    except KeyboardInterrupt:
        print("\n\nApplication stopped by user")
    except Exception as e:
        print(f"\nError running application: {e}")
        return False

    return True

def main():
    """Main function"""
    print("NZ Legislation Loophole Analysis Streamlit App")
    print("=" * 60)

    # Check requirements
    if not check_requirements():
        return 1

    # Check app structure
    if not check_app_structure():
        return 1

    # Create directories
    create_directories()

    # Setup environment
    setup_environment()

    # Run the app
    if not run_app():
        return 1

    return 0

if __name__ == "__main__":
    sys.exit(main())
streamlit_app/app.py
ADDED
@@ -0,0 +1,732 @@

#!/usr/bin/env python3
"""
NZ Legislation Loophole Analysis Streamlit App

A modern web interface for analyzing New Zealand legislation text to identify
potential loopholes, ambiguities, and unintended consequences using AI.

Features:
- Advanced UI with multi-page layout
- Context memory cache system for improved performance
- Real-time progress monitoring
- Interactive results visualization
- Batch processing capabilities
- Comprehensive configuration management
"""

import streamlit as st
import sys
import os
from pathlib import Path

# Add the current directory to Python path for imports
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Import core modules
from core.cache_manager import CacheManager
from core.text_processor import TextProcessor
from core.llm_analyzer import LLMAnalyzer
from core.dataset_builder import DatasetBuilder
from utils.config import ConfigManager
from utils.ui_helpers import UIHelpers
from utils.performance import PerformanceMonitor

# Configure page settings
st.set_page_config(
    page_title="NZ Legislation Loophole Analyzer",
    page_icon="⚖️",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://github.com/your-repo',
        'Report a bug': 'https://github.com/your-repo/issues',
        'About': '''
        ## NZ Legislation Loophole Analyzer
        A powerful AI tool for analyzing New Zealand legislation to identify
        potential loopholes, ambiguities, and unintended consequences.

        **Version:** 1.0.0
        **Built with:** Streamlit, Llama.cpp, and advanced caching
        '''
    }
)

# Initialize session state
def initialize_session_state():
    """Initialize all session state variables"""
    if 'cache_manager' not in st.session_state:
        st.session_state.cache_manager = CacheManager()

    if 'config_manager' not in st.session_state:
        st.session_state.config_manager = ConfigManager()

    if 'performance_monitor' not in st.session_state:
        st.session_state.performance_monitor = PerformanceMonitor()

    if 'current_analysis' not in st.session_state:
        st.session_state.current_analysis = None

    if 'analysis_results' not in st.session_state:
        st.session_state.analysis_results = []

    if 'processing_status' not in st.session_state:
        st.session_state.processing_status = {
            'is_running': False,
            'progress': 0,
            'current_task': '',
            'total_chunks': 0,
            'processed_chunks': 0
        }

def main():
    """Main application function"""
    # Initialize session state
    initialize_session_state()

    # Create sidebar with navigation and status
    with st.sidebar:
        st.title("NZ Legislation Analyzer")
        st.markdown("---")

        # Navigation
        pages = {
            "Home": "home",
            "Upload & Process": "upload",
            "Analysis Results": "results",
            "Settings": "settings",
            "Performance": "performance"
        }

        selected_page = st.selectbox(
            "Navigate to:",
            list(pages.keys()),
            key="nav_select"
        )

        st.markdown("---")

        # Cache status
        with st.expander("Cache Status", expanded=True):
            cache_stats = st.session_state.cache_manager.get_stats()
            st.metric("Cache Hits", cache_stats['hits'])
            st.metric("Cache Misses", cache_stats['misses'])
            st.metric("Hit Rate", ".1f")
            st.metric("Cached Chunks", cache_stats['entries'])

            if st.button("Clear Cache", type="secondary"):
                st.session_state.cache_manager.clear_cache()
                st.rerun()

        # Performance metrics
        with st.expander("Performance", expanded=True):
            perf_stats = st.session_state.performance_monitor.get_stats()
            st.metric("Memory Usage", ".1f")
            st.metric("Avg Processing Time", ".2f")

        # Processing status
        if st.session_state.processing_status['is_running']:
            with st.expander("Processing Status", expanded=True):
                st.progress(st.session_state.processing_status['progress'])
                st.text(st.session_state.processing_status['current_task'])
                st.text(f"Chunk {st.session_state.processing_status['processed_chunks']}/"
                        f"{st.session_state.processing_status['total_chunks']}")

    # Main content area
    page = pages[selected_page]

    if page == "home":
        show_home_page()
    elif page == "upload":
        show_upload_page()
    elif page == "results":
        show_results_page()
    elif page == "settings":
        show_settings_page()
    elif page == "performance":
        show_performance_page()

    # Footer
    st.markdown("---")
    st.markdown(
        """
        <div style='text-align: center; color: #666; font-size: 12px;'>
        NZ Legislation Loophole Analyzer v1.0.0 | Built with Streamlit & Llama.cpp
        </div>
        """,
        unsafe_allow_html=True
    )

def show_home_page():
    """Display the home page with overview and quick start"""
    st.title("NZ Legislation Loophole Analyzer")
    st.markdown("### AI-Powered Legal Analysis Tool")

    col1, col2 = st.columns([2, 1])

    with col1:
        st.markdown("""
        This advanced tool analyzes New Zealand legislation to identify:

        **Potential Loopholes** - Legal ambiguities that could be exploited
        **Unintended Consequences** - Hidden implications in legislative language
        **Ambiguities** - Vague or unclear legal provisions
        **Circumvention Strategies** - Ways legislation might be bypassed

        **Key Features:**
        - **Smart Caching**: Avoid re-processing identical content
        - **Advanced UI**: Modern interface with real-time progress
        - **Batch Processing**: Handle multiple legislation files
        - **Performance Monitoring**: Track memory usage and processing speed
        - **Export Options**: Multiple formats for analysis results
        """)

        st.markdown("### Quick Start")
        st.markdown("""
        1. **Upload** your NZ legislation files (JSON lines or raw text)
        2. **Configure** analysis parameters and model settings
        3. **Process** the legislation with AI-powered analysis
        4. **Review** results with interactive visualizations
        5. **Export** findings in multiple formats
        """)

    with col2:
        st.markdown("### Current Configuration")

        config = st.session_state.config_manager.get_config()

        # Model settings
        st.subheader("Model Settings")
        st.info(f"**Model:** {config['model']['path']}")
        st.info(f"**Context Length:** {config['model']['context_length']}")
        st.info(f"**Max Tokens:** {config['model']['max_tokens']}")

        # Processing settings
        st.subheader("Processing")
        st.info(f"**Chunk Size:** {config['processing']['chunk_size']}")
        st.info(f"**Overlap:** {config['processing']['chunk_overlap']}")
        st.info(f"**Batch Size:** {config['processing']['batch_size']}")

        # Cache settings
        st.subheader("Cache")
        cache_stats = st.session_state.cache_manager.get_stats()
        st.info(f"**Status:** {'Active' if cache_stats['enabled'] else 'Disabled'}")
        st.info(f"**Hit Rate:** {cache_stats['hit_rate']:.1f}%")

        if st.button("Start Analysis", type="primary", use_container_width=True):
            st.switch_page("pages/1_upload.py")

def show_upload_page():
    """Display the upload and processing page"""
    st.title("Upload & Process Legislation")

    # File upload section
    st.subheader("Upload Legislation Files")

    col1, col2 = st.columns([1, 1])

    with col1:
        uploaded_files = st.file_uploader(
            "Select NZ legislation files",
            accept_multiple_files=True,
            type=['json', 'txt', 'jsonl'],
            help="Upload JSON lines format (.jsonl), JSON arrays (.json), or raw text (.txt) files"
        )

        if uploaded_files:
            st.success(f"{len(uploaded_files)} file(s) selected")

            # Show file details
            for file in uploaded_files:
                with st.expander(f"{file.name}"):
                    st.write(f"**Size:** {file.size:,} bytes")
                    st.write(f"**Type:** {file.type}")

                    # Preview content
                    if file.type in ['text/plain', 'application/json']:
                        content = file.read().decode('utf-8')
                        st.text_area("Preview", content[:500] + "..." if len(content) > 500 else content,
                                     height=100, disabled=True)
                        file.seek(0)  # Reset file pointer

    with col2:
        # Processing configuration
        st.subheader("Processing Configuration")

        config = st.session_state.config_manager.get_config()

        # Model settings
        with st.expander("Model Configuration", expanded=True):
            model_path = st.text_input(
                "Model Path",
                value=config['model']['path'],
                help="Path to your GGUF model file"
            )

            context_length = st.slider(
                "Context Length",
                min_value=1024,
                max_value=65536,
                value=config['model']['context_length'],
                step=1024,
                help="Maximum context length for the model"
            )

            max_tokens = st.slider(
                "Max Response Tokens",
                min_value=256,
                max_value=4096,
                value=config['model']['max_tokens'],
                step=64,
                help="Maximum tokens in model response"
            )

        # Text processing settings
        with st.expander("Text Processing", expanded=True):
            chunk_size = st.slider(
                "Chunk Size",
                min_value=512,
                max_value=8192,
                value=config['processing']['chunk_size'],
                step=256,
                help="Size of text chunks for processing"
            )

            chunk_overlap = st.slider(
                "Chunk Overlap",
                min_value=64,
                max_value=1024,
                value=config['processing']['chunk_overlap'],
                step=32,
                help="Overlap between chunks for context preservation"
            )

        # Analysis settings
        with st.expander("Analysis Settings", expanded=True):
            analysis_depth = st.select_slider(
                "Analysis Depth",
                options=["Basic", "Standard", "Detailed", "Comprehensive"],
                value=config['analysis']['depth'],
                help="Level of detail in legal analysis"
            )

            include_recommendations = st.checkbox(
                "Include Recommendations",
                value=config['analysis']['include_recommendations'],
                help="Generate specific recommendations for addressing identified issues"
            )

    # Process button and status
    col1, col2, col3 = st.columns([1, 1, 1])

    with col1:
        if st.button("Start Processing", type="primary", use_container_width=True):
            if not uploaded_files:
                st.error("Please upload at least one legislation file")
            else:
                start_processing(uploaded_files, {
                    'model': {
                        'path': model_path,
                        'context_length': context_length,
                        'max_tokens': max_tokens
                    },
                    'processing': {
                        'chunk_size': chunk_size,
                        'chunk_overlap': chunk_overlap
                    },
                    'analysis': {
                        'depth': analysis_depth,
                        'include_recommendations': include_recommendations
                    }
                })

    with col2:
        if st.button("Stop Processing", use_container_width=True):
            stop_processing()

    with col3:
        if st.button("View Results", use_container_width=True):
            st.switch_page("pages/2_analysis.py")

def start_processing(files, config):
    """Start the processing workflow"""
    st.session_state.processing_status = {
        'is_running': True,
        'progress': 0,
        'current_task': 'Initializing...',
        'total_chunks': 0,
        'processed_chunks': 0
    }

    # Update configuration
    st.session_state.config_manager.update_config(config)

    # TODO: Implement actual processing logic
    st.rerun()

def stop_processing():
    """Stop the current processing"""
+
st.session_state.processing_status['is_running'] = False
|
| 369 |
+
st.session_state.processing_status['current_task'] = 'Stopped by user'
|
| 370 |
+
|
| 371 |
+
def show_results_page():
|
| 372 |
+
"""Display analysis results page"""
|
| 373 |
+
st.title("π Analysis Results")
|
| 374 |
+
|
| 375 |
+
if not st.session_state.analysis_results:
|
| 376 |
+
st.info("No analysis results available. Please upload and process legislation files first.")
|
| 377 |
+
return
|
| 378 |
+
|
| 379 |
+
# Results overview
|
| 380 |
+
st.subheader("π Results Overview")
|
| 381 |
+
|
| 382 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 383 |
+
|
| 384 |
+
total_results = len(st.session_state.analysis_results)
|
| 385 |
+
total_loopholes = sum(len(result.get('loopholes', [])) for result in st.session_state.analysis_results)
|
| 386 |
+
avg_confidence = sum(result.get('confidence', 0) for result in st.session_state.analysis_results) / max(total_results, 1)
|
| 387 |
+
|
| 388 |
+
with col1:
|
| 389 |
+
st.metric("Total Analyses", total_results)
|
| 390 |
+
|
| 391 |
+
with col2:
|
| 392 |
+
st.metric("Loopholes Found", total_loopholes)
|
| 393 |
+
|
| 394 |
+
with col3:
|
| 395 |
+
st.metric("Avg Confidence", ".2f")
|
| 396 |
+
|
| 397 |
+
with col4:
|
| 398 |
+
cache_stats = st.session_state.cache_manager.get_stats()
|
| 399 |
+
st.metric("Cache Hit Rate", ".1f")
|
| 400 |
+
|
| 401 |
+
# Results display
|
| 402 |
+
st.subheader("π Detailed Results")
|
| 403 |
+
|
| 404 |
+
for i, result in enumerate(st.session_state.analysis_results):
|
| 405 |
+
with st.expander(f"π Analysis {i+1}: {result.get('title', 'Unknown Title')}", expanded=i==0):
|
| 406 |
+
col1, col2 = st.columns([2, 1])
|
| 407 |
+
|
| 408 |
+
with col1:
|
| 409 |
+
st.markdown("**Summary:**")
|
| 410 |
+
st.write(result.get('summary', 'No summary available'))
|
| 411 |
+
|
| 412 |
+
st.markdown("**Key Findings:**")
|
| 413 |
+
for finding in result.get('loopholes', []):
|
| 414 |
+
st.markdown(f"- {finding}")
|
| 415 |
+
|
| 416 |
+
with col2:
|
| 417 |
+
st.metric("Confidence", ".2f")
|
| 418 |
+
st.metric("Processing Time", ".2f")
|
| 419 |
+
st.metric("Chunks Processed", result.get('chunks_processed', 0))
|
| 420 |
+
|
| 421 |
+
# Export options
|
| 422 |
+
st.subheader("πΎ Export Results")
|
| 423 |
+
|
| 424 |
+
col1, col2, col3 = st.columns(3)
|
| 425 |
+
|
| 426 |
+
with col1:
|
| 427 |
+
if st.button("π Export as JSON", use_container_width=True):
|
| 428 |
+
export_results('json')
|
| 429 |
+
|
| 430 |
+
with col2:
|
| 431 |
+
if st.button("π Export as CSV", use_container_width=True):
|
| 432 |
+
export_results('csv')
|
| 433 |
+
|
| 434 |
+
with col3:
|
| 435 |
+
if st.button("π Export as Excel", use_container_width=True):
|
| 436 |
+
export_results('excel')
|
| 437 |
+
|
| 438 |
+
def export_results(format_type):
|
| 439 |
+
"""Export analysis results in specified format"""
|
| 440 |
+
# TODO: Implement export functionality
|
| 441 |
+
st.success(f"Results exported as {format_type.upper()}")
|
| 442 |
+
|
| 443 |
+
def show_settings_page():
|
| 444 |
+
"""Display settings page"""
|
| 445 |
+
st.title("βοΈ Settings & Configuration")
|
| 446 |
+
|
| 447 |
+
tabs = st.tabs(["π€ Model Settings", "π Processing", "π§ Cache", "π¨ UI", "π§ Advanced"])
|
| 448 |
+
|
| 449 |
+
with tabs[0]:
|
| 450 |
+
st.subheader("π€ Model Configuration")
|
| 451 |
+
|
| 452 |
+
config = st.session_state.config_manager.get_config()
|
| 453 |
+
|
| 454 |
+
model_path = st.text_input(
|
| 455 |
+
"Model Path",
|
| 456 |
+
value=config['model']['path'],
|
| 457 |
+
help="Path to your GGUF model file"
|
| 458 |
+
)
|
| 459 |
+
|
| 460 |
+
repo_id = st.text_input(
|
| 461 |
+
"HuggingFace Repo ID",
|
| 462 |
+
value=config['model']['repo_id'],
|
| 463 |
+
help="HuggingFace repository ID for model download"
|
| 464 |
+
)
|
| 465 |
+
|
| 466 |
+
filename = st.text_input(
|
| 467 |
+
"Model Filename",
|
| 468 |
+
value=config['model']['filename'],
|
| 469 |
+
help="Specific model filename in the repository"
|
| 470 |
+
)
|
| 471 |
+
|
| 472 |
+
context_length = st.slider(
|
| 473 |
+
"Context Length",
|
| 474 |
+
min_value=1024,
|
| 475 |
+
max_value=131072,
|
| 476 |
+
value=config['model']['context_length'],
|
| 477 |
+
step=1024
|
| 478 |
+
)
|
| 479 |
+
|
| 480 |
+
max_tokens = st.slider(
|
| 481 |
+
"Max Response Tokens",
|
| 482 |
+
min_value=256,
|
| 483 |
+
max_value=8192,
|
| 484 |
+
value=config['model']['max_tokens'],
|
| 485 |
+
step=64
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
+
temperature = st.slider(
|
| 489 |
+
"Temperature",
|
| 490 |
+
min_value=0.0,
|
| 491 |
+
max_value=2.0,
|
| 492 |
+
value=config['model']['temperature'],
|
| 493 |
+
step=0.1,
|
| 494 |
+
help="Controls randomness in model output"
|
| 495 |
+
)
|
| 496 |
+
|
| 497 |
+
with tabs[1]:
|
| 498 |
+
st.subheader("π Text Processing")
|
| 499 |
+
|
| 500 |
+
chunk_size = st.slider(
|
| 501 |
+
"Chunk Size",
|
| 502 |
+
min_value=256,
|
| 503 |
+
max_value=16384,
|
| 504 |
+
value=config['processing']['chunk_size'],
|
| 505 |
+
step=256
|
| 506 |
+
)
|
| 507 |
+
|
| 508 |
+
chunk_overlap = st.slider(
|
| 509 |
+
"Chunk Overlap",
|
| 510 |
+
min_value=32,
|
| 511 |
+
max_value=2048,
|
| 512 |
+
value=config['processing']['chunk_overlap'],
|
| 513 |
+
step=32
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
batch_size = st.slider(
|
| 517 |
+
"Batch Size",
|
| 518 |
+
min_value=1,
|
| 519 |
+
max_value=32,
|
| 520 |
+
value=config['processing']['batch_size'],
|
| 521 |
+
step=1
|
| 522 |
+
)
|
| 523 |
+
|
| 524 |
+
clean_text = st.checkbox(
|
| 525 |
+
"Clean Text",
|
| 526 |
+
value=config['processing']['clean_text'],
|
| 527 |
+
help="Apply text cleaning and normalization"
|
| 528 |
+
)
|
| 529 |
+
|
| 530 |
+
with tabs[2]:
|
| 531 |
+
st.subheader("π§ Cache Configuration")
|
| 532 |
+
|
| 533 |
+
enable_cache = st.checkbox(
|
| 534 |
+
"Enable Caching",
|
| 535 |
+
value=config['cache']['enabled'],
|
| 536 |
+
help="Use cache to avoid re-processing identical chunks"
|
| 537 |
+
)
|
| 538 |
+
|
| 539 |
+
max_cache_size = st.slider(
|
| 540 |
+
"Max Cache Size (MB)",
|
| 541 |
+
min_value=100,
|
| 542 |
+
max_value=8192,
|
| 543 |
+
value=config['cache']['max_size_mb'],
|
| 544 |
+
step=100
|
| 545 |
+
)
|
| 546 |
+
|
| 547 |
+
cache_ttl = st.slider(
|
| 548 |
+
"Cache TTL (hours)",
|
| 549 |
+
min_value=1,
|
| 550 |
+
max_value=168,
|
| 551 |
+
value=config['cache']['ttl_hours'],
|
| 552 |
+
step=1,
|
| 553 |
+
help="Time-to-live for cached entries"
|
| 554 |
+
)
|
| 555 |
+
|
| 556 |
+
persistent_cache = st.checkbox(
|
| 557 |
+
"Persistent Cache",
|
| 558 |
+
value=config['cache']['persistent'],
|
| 559 |
+
help="Save cache to disk for persistence across sessions"
|
| 560 |
+
)
|
| 561 |
+
|
| 562 |
+
with tabs[3]:
|
| 563 |
+
st.subheader("π¨ UI Configuration")
|
| 564 |
+
|
| 565 |
+
theme = st.selectbox(
|
| 566 |
+
"Theme",
|
| 567 |
+
options=["Auto", "Light", "Dark"],
|
| 568 |
+
index=["Auto", "Light", "Dark"].index(config['ui']['theme'])
|
| 569 |
+
)
|
| 570 |
+
|
| 571 |
+
show_progress = st.checkbox(
|
| 572 |
+
"Show Progress Bars",
|
| 573 |
+
value=config['ui']['show_progress'],
|
| 574 |
+
help="Display progress indicators during processing"
|
| 575 |
+
)
|
| 576 |
+
|
| 577 |
+
auto_refresh = st.checkbox(
|
| 578 |
+
"Auto-refresh Results",
|
| 579 |
+
value=config['ui']['auto_refresh'],
|
| 580 |
+
help="Automatically refresh results view"
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
with tabs[4]:
|
| 584 |
+
st.subheader("π§ Advanced Settings")
|
| 585 |
+
|
| 586 |
+
debug_mode = st.checkbox(
|
| 587 |
+
"Debug Mode",
|
| 588 |
+
value=config['advanced']['debug_mode'],
|
| 589 |
+
help="Enable detailed logging and debugging information"
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
+
log_level = st.selectbox(
|
| 593 |
+
"Log Level",
|
| 594 |
+
options=["DEBUG", "INFO", "WARNING", "ERROR"],
|
| 595 |
+
index=["DEBUG", "INFO", "WARNING", "ERROR"].index(config['advanced']['log_level'])
|
| 596 |
+
)
|
| 597 |
+
|
| 598 |
+
memory_limit = st.slider(
|
| 599 |
+
"Memory Limit (MB)",
|
| 600 |
+
min_value=512,
|
| 601 |
+
max_value=32768,
|
| 602 |
+
value=config['advanced']['memory_limit_mb'],
|
| 603 |
+
step=512
|
| 604 |
+
)
|
| 605 |
+
|
| 606 |
+
# Save settings
|
| 607 |
+
col1, col2 = st.columns([1, 1])
|
| 608 |
+
|
| 609 |
+
with col1:
|
| 610 |
+
if st.button("πΎ Save Settings", type="primary", use_container_width=True):
|
| 611 |
+
new_config = {
|
| 612 |
+
'model': {
|
| 613 |
+
'path': model_path,
|
| 614 |
+
'repo_id': repo_id,
|
| 615 |
+
'filename': filename,
|
| 616 |
+
'context_length': context_length,
|
| 617 |
+
'max_tokens': max_tokens,
|
| 618 |
+
'temperature': temperature
|
| 619 |
+
},
|
| 620 |
+
'processing': {
|
| 621 |
+
'chunk_size': chunk_size,
|
| 622 |
+
'chunk_overlap': chunk_overlap,
|
| 623 |
+
'batch_size': batch_size,
|
| 624 |
+
'clean_text': clean_text
|
| 625 |
+
},
|
| 626 |
+
'cache': {
|
| 627 |
+
'enabled': enable_cache,
|
| 628 |
+
'max_size_mb': max_cache_size,
|
| 629 |
+
'ttl_hours': cache_ttl,
|
| 630 |
+
'persistent': persistent_cache
|
| 631 |
+
},
|
| 632 |
+
'ui': {
|
| 633 |
+
'theme': theme,
|
| 634 |
+
'show_progress': show_progress,
|
| 635 |
+
'auto_refresh': auto_refresh
|
| 636 |
+
},
|
| 637 |
+
'advanced': {
|
| 638 |
+
'debug_mode': debug_mode,
|
| 639 |
+
'log_level': log_level,
|
| 640 |
+
'memory_limit_mb': memory_limit
|
| 641 |
+
}
|
| 642 |
+
}
|
| 643 |
+
|
| 644 |
+
st.session_state.config_manager.update_config(new_config)
|
| 645 |
+
st.success("Settings saved successfully!")
|
| 646 |
+
|
| 647 |
+
with col2:
|
| 648 |
+
if st.button("π Reset to Defaults", use_container_width=True):
|
| 649 |
+
st.session_state.config_manager.reset_to_defaults()
|
| 650 |
+
st.success("Settings reset to defaults!")
|
| 651 |
+
st.rerun()
|
| 652 |
+
|
| 653 |
+
def show_performance_page():
|
| 654 |
+
"""Display performance monitoring page"""
|
| 655 |
+
st.title("π Performance Dashboard")
|
| 656 |
+
|
| 657 |
+
# Real-time metrics
|
| 658 |
+
st.subheader("π Real-time Metrics")
|
| 659 |
+
|
| 660 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 661 |
+
|
| 662 |
+
perf_stats = st.session_state.performance_monitor.get_stats()
|
| 663 |
+
|
| 664 |
+
with col1:
|
| 665 |
+
st.metric("Memory Usage", ".1f", "MB")
|
| 666 |
+
|
| 667 |
+
with col2:
|
| 668 |
+
st.metric("CPU Usage", ".1f", "%")
|
| 669 |
+
|
| 670 |
+
with col3:
|
| 671 |
+
st.metric("Active Threads", perf_stats.get('active_threads', 0))
|
| 672 |
+
|
| 673 |
+
with col4:
|
| 674 |
+
cache_stats = st.session_state.cache_manager.get_stats()
|
| 675 |
+
st.metric("Cache Hit Rate", ".1f", "%")
|
| 676 |
+
|
| 677 |
+
# Performance charts
|
| 678 |
+
st.subheader("π Performance History")
|
| 679 |
+
|
| 680 |
+
# TODO: Add interactive charts for performance metrics
|
| 681 |
+
|
| 682 |
+
# System information
|
| 683 |
+
st.subheader("π» System Information")
|
| 684 |
+
|
| 685 |
+
col1, col2 = st.columns(2)
|
| 686 |
+
|
| 687 |
+
with col1:
|
| 688 |
+
st.markdown("**Hardware:**")
|
| 689 |
+
# TODO: Add system information display
|
| 690 |
+
|
| 691 |
+
with col2:
|
| 692 |
+
st.markdown("**Software:**")
|
| 693 |
+
# TODO: Add software information display
|
| 694 |
+
|
| 695 |
+
# Cache performance
|
| 696 |
+
st.subheader("π§ Cache Performance")
|
| 697 |
+
|
| 698 |
+
cache_stats = st.session_state.cache_manager.get_stats()
|
| 699 |
+
|
| 700 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 701 |
+
|
| 702 |
+
with col1:
|
| 703 |
+
st.metric("Total Requests", cache_stats['hits'] + cache_stats['misses'])
|
| 704 |
+
|
| 705 |
+
with col2:
|
| 706 |
+
st.metric("Cache Hits", cache_stats['hits'])
|
| 707 |
+
|
| 708 |
+
with col3:
|
| 709 |
+
st.metric("Cache Misses", cache_stats['misses'])
|
| 710 |
+
|
| 711 |
+
with col4:
|
| 712 |
+
st.metric("Hit Rate", ".1f")
|
| 713 |
+
|
| 714 |
+
# Performance recommendations
|
| 715 |
+
st.subheader("π‘ Performance Recommendations")
|
| 716 |
+
|
| 717 |
+
recommendations = []
|
| 718 |
+
|
| 719 |
+
if cache_stats['hit_rate'] < 50:
|
| 720 |
+
recommendations.append("Consider increasing cache size or adjusting chunk sizes to improve hit rate")
|
| 721 |
+
|
| 722 |
+
if perf_stats.get('memory_usage_mb', 0) > 8000:
|
| 723 |
+
recommendations.append("High memory usage detected. Consider reducing batch size or chunk size")
|
| 724 |
+
|
| 725 |
+
if not recommendations:
|
| 726 |
+
recommendations.append("Performance is optimal!")
|
| 727 |
+
|
| 728 |
+
for rec in recommendations:
|
| 729 |
+
st.info(rec)
|
| 730 |
+
|
| 731 |
+
if __name__ == "__main__":
|
| 732 |
+
main()
|
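For reference, show_results_page() above assumes each item in st.session_state.analysis_results is a dictionary with a handful of fields. A minimal, purely illustrative entry that would render correctly in the results view looks like this (the field names come from the code above; the values and the 'processing_time' key are assumptions, not part of the commit):

# Illustrative sketch only: the minimal shape show_results_page() reads per result.
example_result = {
    'title': 'Example Act 2000, s 5',             # shown in the expander header
    'summary': 'Plain-language summary of the analysed chunk.',
    'loopholes': ['Ambiguous definition of "reasonable steps"'],
    'confidence': 0.82,                           # Confidence / Avg Confidence metrics
    'chunks_processed': 3,                        # Chunks Processed metric
    'processing_time': 4.7,                       # key name assumed from the metric label
}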
streamlit_app/core/__pycache__/cache_manager.cpython-312.pyc
ADDED
Binary file (24.5 kB)

streamlit_app/core/__pycache__/dataset_builder.cpython-312.pyc
ADDED
Binary file (25.8 kB)

streamlit_app/core/__pycache__/llm_analyzer.cpython-312.pyc
ADDED
Binary file (17.2 kB)

streamlit_app/core/__pycache__/text_processor.cpython-312.pyc
ADDED
Binary file (15.9 kB)
streamlit_app/core/cache_manager.py
ADDED
|
@@ -0,0 +1,505 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Context Memory Cache Manager
|
| 4 |
+
|
| 5 |
+
A sophisticated caching system for NZ Legislation Loophole Analysis that provides:
|
| 6 |
+
- Hash-based chunk identification for unique content tracking
|
| 7 |
+
- Multi-level caching (memory + optional disk persistence)
|
| 8 |
+
- Intelligent cache invalidation based on memory limits
|
| 9 |
+
- Performance metrics and cache statistics
|
| 10 |
+
- Thread-safe operations for concurrent processing
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import hashlib
|
| 14 |
+
import json
|
| 15 |
+
import os
|
| 16 |
+
import time
|
| 17 |
+
import threading
|
| 18 |
+
from typing import Dict, Any, Optional, Tuple
|
| 19 |
+
from functools import lru_cache
|
| 20 |
+
import sqlite3
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
import psutil
|
| 23 |
+
import streamlit as st
|
| 24 |
+
|
| 25 |
+
class CacheEntry:
|
| 26 |
+
"""Represents a single cache entry with metadata"""
|
| 27 |
+
|
| 28 |
+
def __init__(self, key: str, content: str, analysis_result: Dict[str, Any],
|
| 29 |
+
model_config: Dict[str, Any], processing_config: Dict[str, Any]):
|
| 30 |
+
self.key = key
|
| 31 |
+
self.content = content
|
| 32 |
+
self.analysis_result = analysis_result
|
| 33 |
+
self.model_config = model_config
|
| 34 |
+
self.processing_config = processing_config
|
| 35 |
+
self.created_at = time.time()
|
| 36 |
+
self.last_accessed = time.time()
|
| 37 |
+
self.access_count = 0
|
| 38 |
+
self.size_bytes = len(content.encode('utf-8')) + len(str(analysis_result).encode('utf-8'))
|
| 39 |
+
|
| 40 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 41 |
+
"""Convert cache entry to dictionary for serialization"""
|
| 42 |
+
return {
|
| 43 |
+
'key': self.key,
|
| 44 |
+
'content': self.content,
|
| 45 |
+
'analysis_result': self.analysis_result,
|
| 46 |
+
'model_config': self.model_config,
|
| 47 |
+
'processing_config': self.processing_config,
|
| 48 |
+
'created_at': self.created_at,
|
| 49 |
+
'last_accessed': self.last_accessed,
|
| 50 |
+
'access_count': self.access_count,
|
| 51 |
+
'size_bytes': self.size_bytes
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
@classmethod
|
| 55 |
+
def from_dict(cls, data: Dict[str, Any]) -> 'CacheEntry':
|
| 56 |
+
"""Create cache entry from dictionary"""
|
| 57 |
+
entry = cls(
|
| 58 |
+
key=data['key'],
|
| 59 |
+
content=data['content'],
|
| 60 |
+
analysis_result=data['analysis_result'],
|
| 61 |
+
model_config=data['model_config'],
|
| 62 |
+
processing_config=data['processing_config']
|
| 63 |
+
)
|
| 64 |
+
entry.created_at = data.get('created_at', time.time())
|
| 65 |
+
entry.last_accessed = data.get('last_accessed', time.time())
|
| 66 |
+
entry.access_count = data.get('access_count', 0)
|
| 67 |
+
entry.size_bytes = data.get('size_bytes', entry.size_bytes)
|
| 68 |
+
return entry
|
| 69 |
+
|
| 70 |
+
def update_access(self):
|
| 71 |
+
"""Update access statistics"""
|
| 72 |
+
self.last_accessed = time.time()
|
| 73 |
+
self.access_count += 1
|
| 74 |
+
|
| 75 |
+
class CacheManager:
|
| 76 |
+
"""Advanced cache manager for legislation analysis"""
|
| 77 |
+
|
| 78 |
+
def __init__(self, max_memory_mb: int = 1024, persistent: bool = True,
|
| 79 |
+
cache_dir: str = None, ttl_hours: int = 24):
|
| 80 |
+
"""
|
| 81 |
+
Initialize the cache manager
|
| 82 |
+
|
| 83 |
+
Args:
|
| 84 |
+
max_memory_mb: Maximum memory to use for caching (MB)
|
| 85 |
+
persistent: Whether to use persistent disk cache
|
| 86 |
+
cache_dir: Directory for persistent cache storage
|
| 87 |
+
ttl_hours: Time-to-live for cache entries (hours)
|
| 88 |
+
"""
|
| 89 |
+
self.max_memory_mb = max_memory_mb
|
| 90 |
+
self.persistent = persistent
|
| 91 |
+
self.ttl_hours = ttl_hours
|
| 92 |
+
self.ttl_seconds = ttl_hours * 3600
|
| 93 |
+
|
| 94 |
+
# Set up cache directory
|
| 95 |
+
if cache_dir is None:
|
| 96 |
+
cache_dir = os.path.join(os.path.dirname(__file__), '..', 'cache')
|
| 97 |
+
self.cache_dir = Path(cache_dir)
|
| 98 |
+
self.cache_dir.mkdir(exist_ok=True)
|
| 99 |
+
self.db_path = self.cache_dir / 'cache.db'
|
| 100 |
+
|
| 101 |
+
# Thread synchronization
|
| 102 |
+
self.lock = threading.RLock()
|
| 103 |
+
|
| 104 |
+
# In-memory cache with LRU eviction
|
| 105 |
+
self.memory_cache: Dict[str, CacheEntry] = {}
|
| 106 |
+
self.memory_size = 0 # Current memory usage in bytes
|
| 107 |
+
|
| 108 |
+
# Statistics
|
| 109 |
+
self.stats = {
|
| 110 |
+
'hits': 0,
|
| 111 |
+
'misses': 0,
|
| 112 |
+
'entries': 0,
|
| 113 |
+
'memory_usage_mb': 0,
|
| 114 |
+
'evictions': 0,
|
| 115 |
+
'enabled': True
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
# Initialize database if persistent
|
| 119 |
+
if self.persistent:
|
| 120 |
+
self._init_database()
|
| 121 |
+
|
| 122 |
+
# Load existing cache entries if persistent
|
| 123 |
+
if self.persistent:
|
| 124 |
+
self._load_persistent_cache()
|
| 125 |
+
|
| 126 |
+
def _init_database(self):
|
| 127 |
+
"""Initialize SQLite database for persistent cache"""
|
| 128 |
+
try:
|
| 129 |
+
with sqlite3.connect(str(self.db_path)) as conn:
|
| 130 |
+
conn.execute('''
|
| 131 |
+
CREATE TABLE IF NOT EXISTS cache_entries (
|
| 132 |
+
key TEXT PRIMARY KEY,
|
| 133 |
+
data TEXT NOT NULL,
|
| 134 |
+
created_at REAL NOT NULL,
|
| 135 |
+
last_accessed REAL NOT NULL,
|
| 136 |
+
access_count INTEGER DEFAULT 0,
|
| 137 |
+
size_bytes INTEGER DEFAULT 0
|
| 138 |
+
)
|
| 139 |
+
''')
|
| 140 |
+
conn.execute('CREATE INDEX IF NOT EXISTS idx_created_at ON cache_entries(created_at)')
|
| 141 |
+
conn.execute('CREATE INDEX IF NOT EXISTS idx_last_accessed ON cache_entries(last_accessed)')
|
| 142 |
+
except Exception as e:
|
| 143 |
+
print(f"Warning: Could not initialize persistent cache: {e}")
|
| 144 |
+
self.persistent = False
|
| 145 |
+
|
| 146 |
+
def _load_persistent_cache(self):
|
| 147 |
+
"""Load existing cache entries from database"""
|
| 148 |
+
if not self.persistent:
|
| 149 |
+
return
|
| 150 |
+
|
| 151 |
+
try:
|
| 152 |
+
with sqlite3.connect(str(self.db_path)) as conn:
|
| 153 |
+
cursor = conn.execute('SELECT data FROM cache_entries')
|
| 154 |
+
for row in cursor:
|
| 155 |
+
try:
|
| 156 |
+
entry_data = json.loads(row[0])
|
| 157 |
+
entry = CacheEntry.from_dict(entry_data)
|
| 158 |
+
|
| 159 |
+
# Check if entry is still valid
|
| 160 |
+
if self._is_entry_valid(entry):
|
| 161 |
+
self._add_to_memory_cache(entry)
|
| 162 |
+
else:
|
| 163 |
+
# Remove expired entry from database
|
| 164 |
+
conn.execute('DELETE FROM cache_entries WHERE key = ?', (entry.key,))
|
| 165 |
+
except (json.JSONDecodeError, KeyError):
|
| 166 |
+
continue
|
| 167 |
+
except Exception as e:
|
| 168 |
+
print(f"Warning: Could not load persistent cache: {e}")
|
| 169 |
+
|
| 170 |
+
def _generate_cache_key(self, content: str, model_config: Dict[str, Any],
|
| 171 |
+
processing_config: Dict[str, Any]) -> str:
|
| 172 |
+
"""
|
| 173 |
+
Generate a unique cache key based on content and configuration
|
| 174 |
+
|
| 175 |
+
Args:
|
| 176 |
+
content: The text content to be analyzed
|
| 177 |
+
model_config: Model configuration used for analysis
|
| 178 |
+
processing_config: Processing configuration used
|
| 179 |
+
|
| 180 |
+
Returns:
|
| 181 |
+
SHA-256 hash string as cache key
|
| 182 |
+
"""
|
| 183 |
+
# Create a deterministic string from all parameters
|
| 184 |
+
key_data = {
|
| 185 |
+
'content': content,
|
| 186 |
+
'model_config': model_config,
|
| 187 |
+
'processing_config': processing_config
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
# Convert to JSON string with sorted keys for consistency
|
| 191 |
+
key_string = json.dumps(key_data, sort_keys=True)
|
| 192 |
+
|
| 193 |
+
# Generate SHA-256 hash
|
| 194 |
+
return hashlib.sha256(key_string.encode('utf-8')).hexdigest()
|
| 195 |
+
|
| 196 |
+
def _is_entry_valid(self, entry: CacheEntry) -> bool:
|
| 197 |
+
"""Check if a cache entry is still valid"""
|
| 198 |
+
# Check TTL
|
| 199 |
+
if time.time() - entry.created_at > self.ttl_seconds:
|
| 200 |
+
return False
|
| 201 |
+
|
| 202 |
+
# Check if configurations match (for future-proofing)
|
| 203 |
+
# This could be enhanced to handle configuration changes
|
| 204 |
+
|
| 205 |
+
return True
|
| 206 |
+
|
| 207 |
+
def _add_to_memory_cache(self, entry: CacheEntry):
|
| 208 |
+
"""Add entry to memory cache with size management"""
|
| 209 |
+
with self.lock:
|
| 210 |
+
# Check if we need to evict entries
|
| 211 |
+
while self.memory_size + entry.size_bytes > self.max_memory_mb * 1024 * 1024:
|
| 212 |
+
if not self.memory_cache:
|
| 213 |
+
break
|
| 214 |
+
self._evict_lru_entry()
|
| 215 |
+
|
| 216 |
+
self.memory_cache[entry.key] = entry
|
| 217 |
+
self.memory_size += entry.size_bytes
|
| 218 |
+
self.stats['entries'] = len(self.memory_cache)
|
| 219 |
+
self.stats['memory_usage_mb'] = self.memory_size / (1024 * 1024)
|
| 220 |
+
|
| 221 |
+
def _evict_lru_entry(self):
|
| 222 |
+
"""Evict the least recently used entry from memory cache"""
|
| 223 |
+
if not self.memory_cache:
|
| 224 |
+
return
|
| 225 |
+
|
| 226 |
+
# Find entry with oldest last_accessed time
|
| 227 |
+
lru_key = min(self.memory_cache.keys(),
|
| 228 |
+
key=lambda k: self.memory_cache[k].last_accessed)
|
| 229 |
+
|
| 230 |
+
evicted_entry = self.memory_cache.pop(lru_key)
|
| 231 |
+
self.memory_size -= evicted_entry.size_bytes
|
| 232 |
+
self.stats['evictions'] += 1
|
| 233 |
+
|
| 234 |
+
# If persistent, we could keep it in database but remove from memory
|
| 235 |
+
# For now, we'll just remove it completely
|
| 236 |
+
|
| 237 |
+
def _save_to_persistent_cache(self, entry: CacheEntry):
|
| 238 |
+
"""Save entry to persistent cache"""
|
| 239 |
+
if not self.persistent:
|
| 240 |
+
return
|
| 241 |
+
|
| 242 |
+
try:
|
| 243 |
+
with sqlite3.connect(str(self.db_path)) as conn:
|
| 244 |
+
conn.execute('''
|
| 245 |
+
INSERT OR REPLACE INTO cache_entries
|
| 246 |
+
(key, data, created_at, last_accessed, access_count, size_bytes)
|
| 247 |
+
VALUES (?, ?, ?, ?, ?, ?)
|
| 248 |
+
''', (
|
| 249 |
+
entry.key,
|
| 250 |
+
json.dumps(entry.to_dict()),
|
| 251 |
+
entry.created_at,
|
| 252 |
+
entry.last_accessed,
|
| 253 |
+
entry.access_count,
|
| 254 |
+
entry.size_bytes
|
| 255 |
+
))
|
| 256 |
+
except Exception as e:
|
| 257 |
+
print(f"Warning: Could not save to persistent cache: {e}")
|
| 258 |
+
|
| 259 |
+
def get(self, content: str, model_config: Dict[str, Any],
|
| 260 |
+
processing_config: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
| 261 |
+
"""
|
| 262 |
+
Get cached analysis result for given content and configuration
|
| 263 |
+
|
| 264 |
+
Args:
|
| 265 |
+
content: Text content to look up
|
| 266 |
+
model_config: Model configuration used for analysis
|
| 267 |
+
processing_config: Processing configuration used
|
| 268 |
+
|
| 269 |
+
Returns:
|
| 270 |
+
Cached analysis result or None if not found
|
| 271 |
+
"""
|
| 272 |
+
if not self.stats['enabled']:
|
| 273 |
+
self.stats['misses'] += 1
|
| 274 |
+
return None
|
| 275 |
+
|
| 276 |
+
cache_key = self._generate_cache_key(content, model_config, processing_config)
|
| 277 |
+
|
| 278 |
+
with self.lock:
|
| 279 |
+
# Check memory cache first
|
| 280 |
+
if cache_key in self.memory_cache:
|
| 281 |
+
entry = self.memory_cache[cache_key]
|
| 282 |
+
|
| 283 |
+
if self._is_entry_valid(entry):
|
| 284 |
+
entry.update_access()
|
| 285 |
+
self.stats['hits'] += 1
|
| 286 |
+
return entry.analysis_result
|
| 287 |
+
else:
|
| 288 |
+
# Remove invalid entry
|
| 289 |
+
self.memory_cache.pop(cache_key)
|
| 290 |
+
self.memory_size -= entry.size_bytes
|
| 291 |
+
self.stats['entries'] = len(self.memory_cache)
|
| 292 |
+
|
| 293 |
+
# Check persistent cache if not in memory
|
| 294 |
+
if self.persistent:
|
| 295 |
+
try:
|
| 296 |
+
with sqlite3.connect(str(self.db_path)) as conn:
|
| 297 |
+
cursor = conn.execute('SELECT data FROM cache_entries WHERE key = ?', (cache_key,))
|
| 298 |
+
row = cursor.fetchone()
|
| 299 |
+
|
| 300 |
+
if row:
|
| 301 |
+
entry_data = json.loads(row[0])
|
| 302 |
+
entry = CacheEntry.from_dict(entry_data)
|
| 303 |
+
|
| 304 |
+
if self._is_entry_valid(entry):
|
| 305 |
+
entry.update_access()
|
| 306 |
+
self.stats['hits'] += 1
|
| 307 |
+
|
| 308 |
+
# Move to memory cache for faster future access
|
| 309 |
+
self._add_to_memory_cache(entry)
|
| 310 |
+
|
| 311 |
+
# Update persistent cache with new access stats
|
| 312 |
+
self._save_to_persistent_cache(entry)
|
| 313 |
+
|
| 314 |
+
return entry.analysis_result
|
| 315 |
+
except Exception as e:
|
| 316 |
+
print(f"Warning: Error accessing persistent cache: {e}")
|
| 317 |
+
|
| 318 |
+
self.stats['misses'] += 1
|
| 319 |
+
return None
|
| 320 |
+
|
| 321 |
+
def put(self, content: str, analysis_result: Dict[str, Any],
|
| 322 |
+
model_config: Dict[str, Any], processing_config: Dict[str, Any]):
|
| 323 |
+
"""
|
| 324 |
+
Store analysis result in cache
|
| 325 |
+
|
| 326 |
+
Args:
|
| 327 |
+
content: Text content that was analyzed
|
| 328 |
+
analysis_result: Analysis result to cache
|
| 329 |
+
model_config: Model configuration used for analysis
|
| 330 |
+
processing_config: Processing configuration used
|
| 331 |
+
"""
|
| 332 |
+
if not self.stats['enabled']:
|
| 333 |
+
return
|
| 334 |
+
|
| 335 |
+
cache_key = self._generate_cache_key(content, model_config, processing_config)
|
| 336 |
+
|
| 337 |
+
with self.lock:
|
| 338 |
+
entry = CacheEntry(cache_key, content, analysis_result,
|
| 339 |
+
model_config, processing_config)
|
| 340 |
+
|
| 341 |
+
# Add to memory cache
|
| 342 |
+
self._add_to_memory_cache(entry)
|
| 343 |
+
|
| 344 |
+
# Save to persistent cache
|
| 345 |
+
self._save_to_persistent_cache(entry)
|
| 346 |
+
|
| 347 |
+
def get_stats(self) -> Dict[str, Any]:
|
| 348 |
+
"""Get cache statistics"""
|
| 349 |
+
with self.lock:
|
| 350 |
+
total_requests = self.stats['hits'] + self.stats['misses']
|
| 351 |
+
hit_rate = (self.stats['hits'] / total_requests * 100) if total_requests > 0 else 0
|
| 352 |
+
|
| 353 |
+
return {
|
| 354 |
+
**self.stats,
|
| 355 |
+
'hit_rate': hit_rate,
|
| 356 |
+
'total_requests': total_requests,
|
| 357 |
+
'persistent_enabled': self.persistent,
|
| 358 |
+
'memory_limit_mb': self.max_memory_mb,
|
| 359 |
+
'ttl_hours': self.ttl_hours
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
def clear_cache(self):
|
| 363 |
+
"""Clear all cache entries"""
|
| 364 |
+
with self.lock:
|
| 365 |
+
self.memory_cache.clear()
|
| 366 |
+
self.memory_size = 0
|
| 367 |
+
self.stats['entries'] = 0
|
| 368 |
+
self.stats['hits'] = 0
|
| 369 |
+
self.stats['misses'] = 0
|
| 370 |
+
self.stats['evictions'] = 0
|
| 371 |
+
self.stats['memory_usage_mb'] = 0
|
| 372 |
+
|
| 373 |
+
# Clear persistent cache
|
| 374 |
+
if self.persistent:
|
| 375 |
+
try:
|
| 376 |
+
with sqlite3.connect(str(self.db_path)) as conn:
|
| 377 |
+
conn.execute('DELETE FROM cache_entries')
|
| 378 |
+
except Exception as e:
|
| 379 |
+
print(f"Warning: Could not clear persistent cache: {e}")
|
| 380 |
+
|
| 381 |
+
def cleanup_expired_entries(self):
|
| 382 |
+
"""Remove expired entries from cache"""
|
| 383 |
+
current_time = time.time()
|
| 384 |
+
expired_keys = []
|
| 385 |
+
|
| 386 |
+
with self.lock:
|
| 387 |
+
# Find expired entries in memory
|
| 388 |
+
for key, entry in self.memory_cache.items():
|
| 389 |
+
if current_time - entry.created_at > self.ttl_seconds:
|
| 390 |
+
expired_keys.append(key)
|
| 391 |
+
self.memory_size -= entry.size_bytes
|
| 392 |
+
|
| 393 |
+
# Remove expired entries from memory
|
| 394 |
+
for key in expired_keys:
|
| 395 |
+
del self.memory_cache[key]
|
| 396 |
+
|
| 397 |
+
self.stats['entries'] = len(self.memory_cache)
|
| 398 |
+
self.stats['memory_usage_mb'] = self.memory_size / (1024 * 1024)
|
| 399 |
+
|
| 400 |
+
# Clean up persistent cache
|
| 401 |
+
if self.persistent:
|
| 402 |
+
try:
|
| 403 |
+
with sqlite3.connect(str(self.db_path)) as conn:
|
| 404 |
+
conn.execute('DELETE FROM cache_entries WHERE ? - created_at > ?',
|
| 405 |
+
(current_time, self.ttl_seconds))
|
| 406 |
+
except Exception as e:
|
| 407 |
+
print(f"Warning: Could not cleanup persistent cache: {e}")
|
| 408 |
+
|
| 409 |
+
def enable(self):
|
| 410 |
+
"""Enable caching"""
|
| 411 |
+
self.stats['enabled'] = True
|
| 412 |
+
|
| 413 |
+
def disable(self):
|
| 414 |
+
"""Disable caching"""
|
| 415 |
+
self.stats['enabled'] = False
|
| 416 |
+
|
| 417 |
+
def export_cache(self, filepath: str):
|
| 418 |
+
"""Export cache contents to JSON file"""
|
| 419 |
+
cache_data = {
|
| 420 |
+
'metadata': {
|
| 421 |
+
'exported_at': time.time(),
|
| 422 |
+
'version': '1.0',
|
| 423 |
+
'total_entries': len(self.memory_cache)
|
| 424 |
+
},
|
| 425 |
+
'entries': []
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
with self.lock:
|
| 429 |
+
for entry in self.memory_cache.values():
|
| 430 |
+
cache_data['entries'].append(entry.to_dict())
|
| 431 |
+
|
| 432 |
+
# Also export persistent cache entries
|
| 433 |
+
if self.persistent:
|
| 434 |
+
try:
|
| 435 |
+
with sqlite3.connect(str(self.db_path)) as conn:
|
| 436 |
+
cursor = conn.execute('SELECT data FROM cache_entries')
|
| 437 |
+
for row in cursor:
|
| 438 |
+
try:
|
| 439 |
+
entry_data = json.loads(row[0])
|
| 440 |
+
cache_data['entries'].append(entry_data)
|
| 441 |
+
except json.JSONDecodeError:
|
| 442 |
+
continue
|
| 443 |
+
except Exception as e:
|
| 444 |
+
print(f"Warning: Could not export persistent cache: {e}")
|
| 445 |
+
|
| 446 |
+
try:
|
| 447 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 448 |
+
json.dump(cache_data, f, indent=2, ensure_ascii=False)
|
| 449 |
+
return True
|
| 450 |
+
except Exception as e:
|
| 451 |
+
print(f"Error exporting cache: {e}")
|
| 452 |
+
return False
|
| 453 |
+
|
| 454 |
+
def import_cache(self, filepath: str):
|
| 455 |
+
"""Import cache contents from JSON file"""
|
| 456 |
+
try:
|
| 457 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 458 |
+
cache_data = json.load(f)
|
| 459 |
+
|
| 460 |
+
imported_count = 0
|
| 461 |
+
for entry_data in cache_data.get('entries', []):
|
| 462 |
+
try:
|
| 463 |
+
entry = CacheEntry.from_dict(entry_data)
|
| 464 |
+
if self._is_entry_valid(entry):
|
| 465 |
+
self._add_to_memory_cache(entry)
|
| 466 |
+
if self.persistent:
|
| 467 |
+
self._save_to_persistent_cache(entry)
|
| 468 |
+
imported_count += 1
|
| 469 |
+
except Exception as e:
|
| 470 |
+
print(f"Warning: Could not import cache entry: {e}")
|
| 471 |
+
continue
|
| 472 |
+
|
| 473 |
+
return imported_count
|
| 474 |
+
except Exception as e:
|
| 475 |
+
print(f"Error importing cache: {e}")
|
| 476 |
+
return 0
|
| 477 |
+
|
| 478 |
+
# Global cache instance for use across the application
|
| 479 |
+
_cache_instance = None
|
| 480 |
+
_cache_lock = threading.Lock()
|
| 481 |
+
|
| 482 |
+
def get_cache_manager(max_memory_mb: int = 1024, persistent: bool = True,
|
| 483 |
+
cache_dir: str = None, ttl_hours: int = 24) -> CacheManager:
|
| 484 |
+
"""
|
| 485 |
+
Get or create global cache manager instance
|
| 486 |
+
|
| 487 |
+
This ensures we have a single cache instance across the application
|
| 488 |
+
while allowing configuration updates.
|
| 489 |
+
"""
|
| 490 |
+
global _cache_instance
|
| 491 |
+
|
| 492 |
+
with _cache_lock:
|
| 493 |
+
if _cache_instance is None:
|
| 494 |
+
_cache_instance = CacheManager(max_memory_mb, persistent, cache_dir, ttl_hours)
|
| 495 |
+
else:
|
| 496 |
+
# Update configuration if different
|
| 497 |
+
if (_cache_instance.max_memory_mb != max_memory_mb or
|
| 498 |
+
_cache_instance.persistent != persistent or
|
| 499 |
+
_cache_instance.ttl_hours != ttl_hours):
|
| 500 |
+
_cache_instance.max_memory_mb = max_memory_mb
|
| 501 |
+
_cache_instance.persistent = persistent
|
| 502 |
+
_cache_instance.ttl_hours = ttl_hours
|
| 503 |
+
_cache_instance.ttl_seconds = ttl_hours * 3600
|
| 504 |
+
|
| 505 |
+
return _cache_instance
|
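As a quick, self-contained sketch of how the CacheManager above is meant to be driven (not part of the commit; it assumes streamlit_app is importable as a package and that streamlit and psutil are installed, since the module imports them at the top):

from streamlit_app.core.cache_manager import get_cache_manager

# Placeholder configurations; only their contents feed the SHA-256 cache key.
model_cfg = {'path': 'model.gguf', 'context_length': 4096}
proc_cfg = {'chunk_size': 2048, 'chunk_overlap': 128}

cache = get_cache_manager(max_memory_mb=256, persistent=False, ttl_hours=1)

text = "Example section of legislation."
assert cache.get(text, model_cfg, proc_cfg) is None            # first lookup misses
cache.put(text, {'summary': 'example'}, model_cfg, proc_cfg)   # store an analysis result
assert cache.get(text, model_cfg, proc_cfg) == {'summary': 'example'}  # second lookup hits
print(cache.get_stats()['hit_rate'])                           # 50.0 after one hit and one miss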
streamlit_app/core/dataset_builder.py
ADDED
|
@@ -0,0 +1,649 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Dataset Builder
|
| 4 |
+
|
| 5 |
+
Creates and manages finetuning datasets from legislation analysis results.
|
| 6 |
+
Handles data formatting, validation, and export in multiple formats.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import time
|
| 12 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import pandas as pd
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import uuid
|
| 17 |
+
|
| 18 |
+
class DatasetBuilder:
|
| 19 |
+
"""Builder for creating finetuning datasets from legislation analysis"""
|
| 20 |
+
|
| 21 |
+
def __init__(self, output_dir: str = "datasets"):
|
| 22 |
+
"""
|
| 23 |
+
Initialize the dataset builder
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
output_dir: Directory to save datasets
|
| 27 |
+
"""
|
| 28 |
+
self.output_dir = Path(output_dir)
|
| 29 |
+
self.output_dir.mkdir(exist_ok=True)
|
| 30 |
+
|
| 31 |
+
# Dataset metadata
|
| 32 |
+
self.metadata = {
|
| 33 |
+
'version': '1.0',
|
| 34 |
+
'created_at': datetime.now().isoformat(),
|
| 35 |
+
'total_entries': 0,
|
| 36 |
+
'analysis_types': set(),
|
| 37 |
+
'legislation_sources': set(),
|
| 38 |
+
'quality_metrics': {}
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
def create_finetuning_dataset(self, analysis_results: List[Dict[str, Any]],
|
| 42 |
+
dataset_name: str = None,
|
| 43 |
+
include_metadata: bool = True) -> Dict[str, Any]:
|
| 44 |
+
"""
|
| 45 |
+
Create a finetuning dataset from analysis results
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
analysis_results: List of analysis results from LLM analyzer
|
| 49 |
+
dataset_name: Name for the dataset (optional)
|
| 50 |
+
include_metadata: Whether to include metadata in the dataset
|
| 51 |
+
|
| 52 |
+
Returns:
|
| 53 |
+
Dataset information and statistics
|
| 54 |
+
"""
|
| 55 |
+
if not dataset_name:
|
| 56 |
+
timestamp = int(time.time())
|
| 57 |
+
dataset_name = f"nz_legislation_dataset_{timestamp}"
|
| 58 |
+
|
| 59 |
+
dataset_entries = []
|
| 60 |
+
successful_entries = 0
|
| 61 |
+
|
| 62 |
+
for result in analysis_results:
|
| 63 |
+
if 'error' in result:
|
| 64 |
+
continue
|
| 65 |
+
|
| 66 |
+
# Create finetuning entry
|
| 67 |
+
entry = self._create_finetuning_entry(result)
|
| 68 |
+
if entry:
|
| 69 |
+
dataset_entries.append(entry)
|
| 70 |
+
successful_entries += 1
|
| 71 |
+
|
| 72 |
+
# Update metadata
|
| 73 |
+
if 'analysis_type' in result:
|
| 74 |
+
self.metadata['analysis_types'].add(result['analysis_type'])
|
| 75 |
+
|
| 76 |
+
# Finalize dataset-level metadata
|
| 77 |
+
self.metadata['total_entries'] = len(dataset_entries)
|
| 78 |
+
self.metadata['created_at'] = datetime.now().isoformat()
|
| 79 |
+
|
| 80 |
+
# Calculate quality metrics
|
| 81 |
+
self._calculate_quality_metrics(dataset_entries)
|
| 82 |
+
|
| 83 |
+
# Create dataset structure
|
| 84 |
+
dataset = {
|
| 85 |
+
'metadata': {**self.metadata, 'analysis_types': sorted(self.metadata['analysis_types']), 'legislation_sources': sorted(self.metadata['legislation_sources'])},  # sets converted to lists so the dataset stays JSON-serializable
|
| 86 |
+
'entries': dataset_entries
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
if include_metadata:
|
| 90 |
+
dataset['metadata'].update({
|
| 91 |
+
'dataset_name': dataset_name,
|
| 92 |
+
'successful_entries': successful_entries,
|
| 93 |
+
'total_input_results': len(analysis_results),
|
| 94 |
+
'success_rate': successful_entries / len(analysis_results) if analysis_results else 0
|
| 95 |
+
})
|
| 96 |
+
|
| 97 |
+
return dataset
|
| 98 |
+
|
| 99 |
+
def _create_finetuning_entry(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
| 100 |
+
"""
|
| 101 |
+
Create a single finetuning dataset entry
|
| 102 |
+
|
| 103 |
+
Args:
|
| 104 |
+
result: Analysis result from LLM analyzer
|
| 105 |
+
|
| 106 |
+
Returns:
|
| 107 |
+
Finetuning entry or None if invalid
|
| 108 |
+
"""
|
| 109 |
+
try:
|
| 110 |
+
# Extract key components
|
| 111 |
+
chunk = result.get('chunk', '')
|
| 112 |
+
structured_analysis = result.get('structured_analysis', {})
|
| 113 |
+
response = result.get('response', '')
|
| 114 |
+
|
| 115 |
+
# Create the prompt (input)
|
| 116 |
+
prompt = self._create_prompt(chunk, result.get('analysis_type', 'standard'))
|
| 117 |
+
|
| 118 |
+
# Create the response (output) - structured format
|
| 119 |
+
response_text = self._create_response(structured_analysis, response)
|
| 120 |
+
|
| 121 |
+
if not prompt or not response_text:
|
| 122 |
+
return None
|
| 123 |
+
|
| 124 |
+
# Create entry
|
| 125 |
+
entry = {
|
| 126 |
+
'id': str(uuid.uuid4()),
|
| 127 |
+
'prompt': prompt,
|
| 128 |
+
'response': response_text,
|
| 129 |
+
'metadata': {
|
| 130 |
+
'chunk_size': len(chunk),
|
| 131 |
+
'word_count': len(chunk.split()),
|
| 132 |
+
'analysis_type': result.get('analysis_type', 'standard'),
|
| 133 |
+
'model_config': result.get('model_config', {}),
|
| 134 |
+
'confidence_score': structured_analysis.get('confidence_score', 0),
|
| 135 |
+
'analysis_quality': structured_analysis.get('analysis_quality', 'unknown'),
|
| 136 |
+
'created_at': datetime.now().isoformat()
|
| 137 |
+
},
|
| 138 |
+
'raw_data': {
|
| 139 |
+
'original_chunk': chunk,
|
| 140 |
+
'structured_analysis': structured_analysis,
|
| 141 |
+
'raw_response': response
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
return entry
|
| 146 |
+
|
| 147 |
+
except Exception as e:
|
| 148 |
+
print(f"Error creating finetuning entry: {e}")
|
| 149 |
+
return None
|
| 150 |
+
|
| 151 |
+
def _create_prompt(self, chunk: str, analysis_type: str) -> str:
|
| 152 |
+
"""
|
| 153 |
+
Create a standardized prompt for the finetuning dataset
|
| 154 |
+
|
| 155 |
+
Args:
|
| 156 |
+
chunk: Text chunk to analyze
|
| 157 |
+
analysis_type: Type of analysis
|
| 158 |
+
|
| 159 |
+
Returns:
|
| 160 |
+
Formatted prompt
|
| 161 |
+
"""
|
| 162 |
+
analysis_configs = {
|
| 163 |
+
'standard': {
|
| 164 |
+
'depth': 'Standard',
|
| 165 |
+
'focus': 'loopholes, ambiguities, and unintended consequences'
|
| 166 |
+
},
|
| 167 |
+
'detailed': {
|
| 168 |
+
'depth': 'Detailed',
|
| 169 |
+
'focus': 'loopholes, ambiguities, unintended consequences, and implementation issues'
|
| 170 |
+
},
|
| 171 |
+
'comprehensive': {
|
| 172 |
+
'depth': 'Comprehensive',
|
| 173 |
+
'focus': 'all aspects including policy conflicts and enforcement challenges'
|
| 174 |
+
}
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
config = analysis_configs.get(analysis_type, analysis_configs['standard'])
|
| 178 |
+
|
| 179 |
+
prompt = f"""You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.
|
| 180 |
+
|
| 181 |
+
LEGISLATION TEXT:
|
| 182 |
+
{chunk}
|
| 183 |
+
|
| 184 |
+
TASK: Analyze this legislative text for potential loopholes, ambiguities, or unintended consequences.
|
| 185 |
+
|
| 186 |
+
ANALYSIS DEPTH: {config['depth']}
|
| 187 |
+
FOCUS AREAS: {config['focus']}
|
| 188 |
+
|
| 189 |
+
Provide a structured analysis covering:
|
| 190 |
+
1. Text Meaning - Explain what the text means and its intended purpose
|
| 191 |
+
2. Key Assumptions - Identify any assumptions that could be exploited
|
| 192 |
+
3. Exploitable Interpretations - Discuss how the text could be interpreted in unintended ways
|
| 193 |
+
4. Critical Loopholes - Identify specific loopholes or ambiguities
|
| 194 |
+
5. Circumvention Strategies - Suggest practical methods for exploiting these loopholes
|
| 195 |
+
|
| 196 |
+
Format your response clearly with section headers."""
|
| 197 |
+
|
| 198 |
+
return prompt
|
| 199 |
+
|
| 200 |
+
def _create_response(self, structured_analysis: Dict[str, Any], raw_response: str) -> str:
|
| 201 |
+
"""
|
| 202 |
+
Create a standardized response format for the finetuning dataset
|
| 203 |
+
|
| 204 |
+
Args:
|
| 205 |
+
structured_analysis: Structured analysis data
|
| 206 |
+
raw_response: Raw LLM response
|
| 207 |
+
|
| 208 |
+
Returns:
|
| 209 |
+
Formatted response
|
| 210 |
+
"""
|
| 211 |
+
sections = []
|
| 212 |
+
|
| 213 |
+
# Text Meaning
|
| 214 |
+
if structured_analysis.get('text_meaning'):
|
| 215 |
+
sections.append(f"**Text Meaning:** {structured_analysis['text_meaning']}")
|
| 216 |
+
|
| 217 |
+
# Key Assumptions
|
| 218 |
+
if structured_analysis.get('key_assumptions'):
|
| 219 |
+
assumptions = structured_analysis['key_assumptions']
|
| 220 |
+
if assumptions:
|
| 221 |
+
sections.append("**Key Assumptions:**")
|
| 222 |
+
for i, assumption in enumerate(assumptions, 1):
|
| 223 |
+
sections.append(f"{i}. {assumption}")
|
| 224 |
+
|
| 225 |
+
# Exploitable Interpretations
|
| 226 |
+
if structured_analysis.get('exploitable_interpretations'):
|
| 227 |
+
interpretations = structured_analysis['exploitable_interpretations']
|
| 228 |
+
if interpretations:
|
| 229 |
+
sections.append("**Exploitable Interpretations:**")
|
| 230 |
+
for i, interpretation in enumerate(interpretations, 1):
|
| 231 |
+
sections.append(f"{i}. {interpretation}")
|
| 232 |
+
|
| 233 |
+
# Critical Loopholes
|
| 234 |
+
if structured_analysis.get('critical_loopholes'):
|
| 235 |
+
loopholes = structured_analysis['critical_loopholes']
|
| 236 |
+
if loopholes:
|
| 237 |
+
sections.append("**Critical Loopholes:**")
|
| 238 |
+
for i, loophole in enumerate(loopholes, 1):
|
| 239 |
+
sections.append(f"{i}. {loophole}")
|
| 240 |
+
|
| 241 |
+
# Circumvention Strategies
|
| 242 |
+
if structured_analysis.get('circumvention_strategies'):
|
| 243 |
+
strategies = structured_analysis['circumvention_strategies']
|
| 244 |
+
if strategies:
|
| 245 |
+
sections.append("**Circumvention Strategies:**")
|
| 246 |
+
for i, strategy in enumerate(strategies, 1):
|
| 247 |
+
sections.append(f"{i}. {strategy}")
|
| 248 |
+
|
| 249 |
+
# Recommendations
|
| 250 |
+
if structured_analysis.get('recommendations'):
|
| 251 |
+
recommendations = structured_analysis['recommendations']
|
| 252 |
+
if recommendations:
|
| 253 |
+
sections.append("**Recommendations:**")
|
| 254 |
+
for i, rec in enumerate(recommendations, 1):
|
| 255 |
+
sections.append(f"{i}. {rec}")
|
| 256 |
+
|
| 257 |
+
return "\n\n".join(sections) if sections else raw_response
|
| 258 |
+
|
| 259 |
+
def _calculate_quality_metrics(self, entries: List[Dict[str, Any]]):
|
| 260 |
+
"""Calculate quality metrics for the dataset"""
|
| 261 |
+
if not entries:
|
| 262 |
+
return
|
| 263 |
+
|
| 264 |
+
confidence_scores = []
|
| 265 |
+
analysis_qualities = {'high': 0, 'medium': 0, 'low': 0, 'unknown': 0}
|
| 266 |
+
|
| 267 |
+
for entry in entries:
|
| 268 |
+
metadata = entry.get('metadata', {})
|
| 269 |
+
confidence = metadata.get('confidence_score', 0)
|
| 270 |
+
quality = metadata.get('analysis_quality', 'unknown')
|
| 271 |
+
|
| 272 |
+
confidence_scores.append(confidence)
|
| 273 |
+
analysis_qualities[quality] = analysis_qualities.get(quality, 0) + 1
|
| 274 |
+
|
| 275 |
+
self.metadata['quality_metrics'] = {
|
| 276 |
+
'average_confidence': sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0,
|
| 277 |
+
'max_confidence': max(confidence_scores) if confidence_scores else 0,
|
| 278 |
+
'min_confidence': min(confidence_scores) if confidence_scores else 0,
|
| 279 |
+
'quality_distribution': analysis_qualities,
|
| 280 |
+
'total_entries': len(entries)
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
def save_dataset(self, dataset: Dict[str, Any], format_type: str = 'json',
|
| 284 |
+
filename: str = None) -> str:
|
| 285 |
+
"""
|
| 286 |
+
Save dataset in specified format
|
| 287 |
+
|
| 288 |
+
Args:
|
| 289 |
+
dataset: Dataset to save
|
| 290 |
+
format_type: Format ('json', 'jsonl', 'csv', 'excel')
|
| 291 |
+
filename: Output filename (optional)
|
| 292 |
+
|
| 293 |
+
Returns:
|
| 294 |
+
Path to saved file
|
| 295 |
+
"""
|
| 296 |
+
if not filename:
|
| 297 |
+
timestamp = int(time.time())
|
| 298 |
+
filename = f"nz_legislation_dataset_{timestamp}"
|
| 299 |
+
|
| 300 |
+
# Ensure filename has correct extension
|
| 301 |
+
if not filename.endswith(f'.{format_type}'):
|
| 302 |
+
filename += f'.{format_type}'
|
| 303 |
+
|
| 304 |
+
filepath = self.output_dir / filename
|
| 305 |
+
|
| 306 |
+
try:
|
| 307 |
+
if format_type == 'json':
|
| 308 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 309 |
+
json.dump(dataset, f, indent=2, ensure_ascii=False)
|
| 310 |
+
|
| 311 |
+
elif format_type == 'jsonl':
|
| 312 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 313 |
+
for entry in dataset.get('entries', []):
|
| 314 |
+
json.dump(entry, f, ensure_ascii=False)
|
| 315 |
+
f.write('\n')
|
| 316 |
+
|
| 317 |
+
elif format_type == 'csv':
|
| 318 |
+
self._save_as_csv(dataset, filepath)
|
| 319 |
+
|
| 320 |
+
elif format_type == 'excel':
|
| 321 |
+
self._save_as_excel(dataset, filepath)
|
| 322 |
+
|
| 323 |
+
else:
|
| 324 |
+
raise ValueError(f"Unsupported format: {format_type}")
|
| 325 |
+
|
| 326 |
+
return str(filepath)
|
| 327 |
+
|
| 328 |
+
except Exception as e:
|
| 329 |
+
raise Exception(f"Error saving dataset: {e}")
|
| 330 |
+
|
| 331 |
+
def _save_as_csv(self, dataset: Dict[str, Any], filepath: Path):
|
| 332 |
+
"""Save dataset as CSV"""
|
| 333 |
+
entries = dataset.get('entries', [])
|
| 334 |
+
|
| 335 |
+
if not entries:
|
| 336 |
+
# Create empty CSV with headers
|
| 337 |
+
df = pd.DataFrame(columns=['id', 'prompt', 'response', 'metadata'])
|
| 338 |
+
df.to_csv(filepath, index=False)
|
| 339 |
+
return
|
| 340 |
+
|
| 341 |
+
# Flatten the data for CSV
|
| 342 |
+
csv_data = []
|
| 343 |
+
for entry in entries:
|
| 344 |
+
csv_row = {
|
| 345 |
+
'id': entry.get('id', ''),
|
| 346 |
+
'prompt': entry.get('prompt', ''),
|
| 347 |
+
'response': entry.get('response', ''),
|
| 348 |
+
'confidence_score': entry.get('metadata', {}).get('confidence_score', 0),
|
| 349 |
+
'analysis_type': entry.get('metadata', {}).get('analysis_type', ''),
|
| 350 |
+
'chunk_size': entry.get('metadata', {}).get('chunk_size', 0),
|
| 351 |
+
'word_count': entry.get('metadata', {}).get('word_count', 0),
|
| 352 |
+
'analysis_quality': entry.get('metadata', {}).get('analysis_quality', ''),
|
| 353 |
+
'created_at': entry.get('metadata', {}).get('created_at', '')
|
| 354 |
+
}
|
| 355 |
+
csv_data.append(csv_row)
|
| 356 |
+
|
| 357 |
+
df = pd.DataFrame(csv_data)
|
| 358 |
+
df.to_csv(filepath, index=False, encoding='utf-8')
|
| 359 |
+
|
| 360 |
+
def _save_as_excel(self, dataset: Dict[str, Any], filepath: Path):
|
| 361 |
+
"""Save dataset as Excel with multiple sheets"""
|
| 362 |
+
entries = dataset.get('entries', [])
|
| 363 |
+
|
| 364 |
+
with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
|
| 365 |
+
# Main dataset sheet
|
| 366 |
+
if entries:
|
| 367 |
+
csv_data = []
|
| 368 |
+
for entry in entries:
|
| 369 |
+
csv_row = {
|
| 370 |
+
'id': entry.get('id', ''),
|
| 371 |
+
'prompt': entry.get('prompt', ''),
|
| 372 |
+
'response': entry.get('response', ''),
|
| 373 |
+
'confidence_score': entry.get('metadata', {}).get('confidence_score', 0),
|
| 374 |
+
'analysis_type': entry.get('metadata', {}).get('analysis_type', ''),
|
| 375 |
+
'chunk_size': entry.get('metadata', {}).get('chunk_size', 0),
|
| 376 |
+
'word_count': entry.get('metadata', {}).get('word_count', 0),
|
| 377 |
+
'analysis_quality': entry.get('metadata', {}).get('analysis_quality', ''),
|
| 378 |
+
'created_at': entry.get('metadata', {}).get('created_at', '')
|
| 379 |
+
}
|
| 380 |
+
csv_data.append(csv_row)
|
| 381 |
+
|
| 382 |
+
df_main = pd.DataFrame(csv_data)
|
| 383 |
+
df_main.to_excel(writer, sheet_name='Dataset', index=False)
|
| 384 |
+
|
| 385 |
+
# Metadata sheet
|
| 386 |
+
metadata_df = pd.DataFrame([dataset.get('metadata', {})])
|
| 387 |
+
metadata_df.to_excel(writer, sheet_name='Metadata', index=False)
|
| 388 |
+
|
| 389 |
+
# Quality metrics sheet
|
| 390 |
+
quality_data = dataset.get('metadata', {}).get('quality_metrics', {})
|
| 391 |
+
if quality_data:
|
| 392 |
+
quality_df = pd.DataFrame([quality_data])
|
| 393 |
+
quality_df.to_excel(writer, sheet_name='Quality_Metrics', index=False)
|
| 394 |
+
|
| 395 |
+
def load_dataset(self, filepath: str) -> Dict[str, Any]:
|
| 396 |
+
"""
|
| 397 |
+
Load a dataset from file
|
| 398 |
+
|
| 399 |
+
Args:
|
| 400 |
+
filepath: Path to dataset file
|
| 401 |
+
|
| 402 |
+
Returns:
|
| 403 |
+
Loaded dataset
|
| 404 |
+
"""
|
| 405 |
+
filepath = Path(filepath)
|
| 406 |
+
|
| 407 |
+
if not filepath.exists():
|
| 408 |
+
raise FileNotFoundError(f"Dataset file not found: {filepath}")
|
| 409 |
+
|
| 410 |
+
try:
|
| 411 |
+
if filepath.suffix == '.json':
|
| 412 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 413 |
+
return json.load(f)
|
| 414 |
+
|
| 415 |
+
elif filepath.suffix == '.jsonl':
|
| 416 |
+
entries = []
|
| 417 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 418 |
+
for line in f:
|
| 419 |
+
if line.strip():
|
| 420 |
+
entries.append(json.loads(line))
|
| 421 |
+
|
| 422 |
+
return {
|
| 423 |
+
'metadata': {
|
| 424 |
+
'loaded_from': str(filepath),
|
| 425 |
+
'total_entries': len(entries)
|
| 426 |
+
},
|
| 427 |
+
'entries': entries
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
elif filepath.suffix in ['.csv', '.xlsx', '.xls']:
|
| 431 |
+
return self._load_from_spreadsheet(filepath)
|
| 432 |
+
|
| 433 |
+
else:
|
| 434 |
+
raise ValueError(f"Unsupported file format: {filepath.suffix}")
|
| 435 |
+
|
| 436 |
+
except Exception as e:
|
| 437 |
+
raise Exception(f"Error loading dataset: {e}")
|
| 438 |
+
|
| 439 |
+
def _load_from_spreadsheet(self, filepath: Path) -> Dict[str, Any]:
|
| 440 |
+
"""Load dataset from spreadsheet format"""
|
| 441 |
+
try:
|
| 442 |
+
if filepath.suffix == '.csv':
|
| 443 |
+
df = pd.read_csv(filepath)
|
| 444 |
+
else:
|
| 445 |
+
df = pd.read_excel(filepath)
|
| 446 |
+
|
| 447 |
+
# Convert back to dataset format
|
| 448 |
+
entries = []
|
| 449 |
+
for _, row in df.iterrows():
|
| 450 |
+
entry = {
|
| 451 |
+
'id': row.get('id', str(uuid.uuid4())),
|
| 452 |
+
'prompt': row.get('prompt', ''),
|
| 453 |
+
'response': row.get('response', ''),
|
| 454 |
+
'metadata': {
|
| 455 |
+
'confidence_score': row.get('confidence_score', 0),
|
| 456 |
+
'analysis_type': row.get('analysis_type', 'standard'),
|
| 457 |
+
'chunk_size': row.get('chunk_size', 0),
|
| 458 |
+
'word_count': row.get('word_count', 0),
|
| 459 |
+
'analysis_quality': row.get('analysis_quality', 'unknown'),
|
| 460 |
+
'created_at': row.get('created_at', datetime.now().isoformat())
|
| 461 |
+
}
|
| 462 |
+
}
|
| 463 |
+
entries.append(entry)
|
| 464 |
+
|
| 465 |
+
return {
|
| 466 |
+
'metadata': {
|
| 467 |
+
'loaded_from': str(filepath),
|
| 468 |
+
'total_entries': len(entries),
|
| 469 |
+
'original_format': filepath.suffix[1:]
|
| 470 |
+
},
|
| 471 |
+
'entries': entries
|
| 472 |
+
}
|
| 473 |
+
|
| 474 |
+
except Exception as e:
|
| 475 |
+
raise Exception(f"Error loading spreadsheet: {e}")
|
| 476 |
+
|
| 477 |
+
def merge_datasets(self, datasets: List[Dict[str, Any]],
|
| 478 |
+
output_name: str = None) -> Dict[str, Any]:
|
| 479 |
+
"""
|
| 480 |
+
Merge multiple datasets into one
|
| 481 |
+
|
| 482 |
+
Args:
|
| 483 |
+
datasets: List of datasets to merge
|
| 484 |
+
output_name: Name for merged dataset
|
| 485 |
+
|
| 486 |
+
Returns:
|
| 487 |
+
Merged dataset
|
| 488 |
+
"""
|
| 489 |
+
if not datasets:
|
| 490 |
+
return self.create_finetuning_dataset([])
|
| 491 |
+
|
| 492 |
+
merged_entries = []
|
| 493 |
+
all_analysis_types = set()
|
| 494 |
+
all_sources = set()
|
| 495 |
+
|
| 496 |
+
for dataset in datasets:
|
| 497 |
+
entries = dataset.get('entries', [])
|
| 498 |
+
merged_entries.extend(entries)
|
| 499 |
+
|
| 500 |
+
metadata = dataset.get('metadata', {})
|
| 501 |
+
all_analysis_types.update(metadata.get('analysis_types', []))
|
| 502 |
+
all_sources.update(metadata.get('legislation_sources', []))
|
| 503 |
+
|
| 504 |
+
# Create merged dataset
|
| 505 |
+
merged_dataset = {
|
| 506 |
+
'metadata': {
|
| 507 |
+
'version': '1.0',
|
| 508 |
+
'created_at': datetime.now().isoformat(),
|
| 509 |
+
'dataset_name': output_name or f"merged_dataset_{int(time.time())}",
|
| 510 |
+
'total_entries': len(merged_entries),
|
| 511 |
+
'analysis_types': list(all_analysis_types),
|
| 512 |
+
'legislation_sources': list(all_sources),
|
| 513 |
+
'merged_from': len(datasets),
|
| 514 |
+
'success_rate': 1.0 # Assuming all entries are valid
|
| 515 |
+
},
|
| 516 |
+
'entries': merged_entries
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
# Recalculate quality metrics
|
| 520 |
+
self._calculate_quality_metrics(merged_entries)
|
| 521 |
+
merged_dataset['metadata']['quality_metrics'] = self.metadata['quality_metrics']
|
| 522 |
+
|
| 523 |
+
return merged_dataset
|
| 524 |
+
|
| 525 |
+
def validate_dataset(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
|
| 526 |
+
"""
|
| 527 |
+
Validate dataset quality and completeness
|
| 528 |
+
|
| 529 |
+
Args:
|
| 530 |
+
dataset: Dataset to validate
|
| 531 |
+
|
| 532 |
+
Returns:
|
| 533 |
+
Validation results
|
| 534 |
+
"""
|
| 535 |
+
validation = {
|
| 536 |
+
'is_valid': True,
|
| 537 |
+
'issues': [],
|
| 538 |
+
'warnings': [],
|
| 539 |
+
'statistics': {}
|
| 540 |
+
}
|
| 541 |
+
|
| 542 |
+
entries = dataset.get('entries', [])
|
| 543 |
+
metadata = dataset.get('metadata', {})
|
| 544 |
+
|
| 545 |
+
# Check basic structure
|
| 546 |
+
if not isinstance(entries, list):
|
| 547 |
+
validation['issues'].append("Entries must be a list")
|
| 548 |
+
validation['is_valid'] = False
|
| 549 |
+
return validation
|
| 550 |
+
|
| 551 |
+
if not entries:
|
| 552 |
+
validation['warnings'].append("Dataset is empty")
|
| 553 |
+
return validation
|
| 554 |
+
|
| 555 |
+
# Validate entries
|
| 556 |
+
valid_entries = 0
|
| 557 |
+
total_confidence = 0
|
| 558 |
+
|
| 559 |
+
for i, entry in enumerate(entries):
|
| 560 |
+
if not isinstance(entry, dict):
|
| 561 |
+
validation['issues'].append(f"Entry {i} is not a dictionary")
|
| 562 |
+
continue
|
| 563 |
+
|
| 564 |
+
# Check required fields
|
| 565 |
+
required_fields = ['id', 'prompt', 'response']
|
| 566 |
+
for field in required_fields:
|
| 567 |
+
if field not in entry:
|
| 568 |
+
validation['issues'].append(f"Entry {i} missing required field: {field}")
|
| 569 |
+
|
| 570 |
+
# Check prompt and response quality
|
| 571 |
+
prompt = entry.get('prompt', '')
|
| 572 |
+
response = entry.get('response', '')
|
| 573 |
+
|
| 574 |
+
if len(prompt.strip()) < 10:
|
| 575 |
+
validation['warnings'].append(f"Entry {i} has very short prompt")
|
| 576 |
+
|
| 577 |
+
if len(response.strip()) < 10:
|
| 578 |
+
validation['warnings'].append(f"Entry {i} has very short response")
|
| 579 |
+
|
| 580 |
+
# Check confidence score
|
| 581 |
+
confidence = entry.get('metadata', {}).get('confidence_score', 0)
|
| 582 |
+
total_confidence += confidence
|
| 583 |
+
|
| 584 |
+
valid_entries += 1
|
| 585 |
+
|
| 586 |
+
# Calculate statistics
|
| 587 |
+
validation['statistics'] = {
|
| 588 |
+
'total_entries': len(entries),
|
| 589 |
+
'valid_entries': valid_entries,
|
| 590 |
+
'average_confidence': total_confidence / valid_entries if valid_entries > 0 else 0,
|
| 591 |
+
'validation_rate': valid_entries / len(entries) if entries else 0
|
| 592 |
+
}
|
| 593 |
+
|
| 594 |
+
return validation
|
| 595 |
+
|
| 596 |
+
def get_dataset_statistics(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
|
| 597 |
+
"""
|
| 598 |
+
Get comprehensive statistics about the dataset
|
| 599 |
+
|
| 600 |
+
Args:
|
| 601 |
+
dataset: Dataset to analyze
|
| 602 |
+
|
| 603 |
+
Returns:
|
| 604 |
+
Dataset statistics
|
| 605 |
+
"""
|
| 606 |
+
entries = dataset.get('entries', [])
|
| 607 |
+
|
| 608 |
+
if not entries:
|
| 609 |
+
return {'total_entries': 0}
|
| 610 |
+
|
| 611 |
+
# Basic statistics
|
| 612 |
+
stats = {
|
| 613 |
+
'total_entries': len(entries),
|
| 614 |
+
'total_prompts': len([e for e in entries if e.get('prompt')]),
|
| 615 |
+
'total_responses': len([e for e in entries if e.get('response')]),
|
| 616 |
+
'average_prompt_length': 0,
|
| 617 |
+
'average_response_length': 0,
|
| 618 |
+
'confidence_distribution': {},
|
| 619 |
+
'analysis_type_distribution': {},
|
| 620 |
+
'quality_distribution': {}
|
| 621 |
+
}
|
| 622 |
+
|
| 623 |
+
# Calculate averages
|
| 624 |
+
prompt_lengths = [len(e.get('prompt', '')) for e in entries if e.get('prompt')]
|
| 625 |
+
response_lengths = [len(e.get('response', '')) for e in entries if e.get('response')]
|
| 626 |
+
|
| 627 |
+
if prompt_lengths:
|
| 628 |
+
stats['average_prompt_length'] = sum(prompt_lengths) / len(prompt_lengths)
|
| 629 |
+
if response_lengths:
|
| 630 |
+
stats['average_response_length'] = sum(response_lengths) / len(response_lengths)
|
| 631 |
+
|
| 632 |
+
# Distribution analysis
|
| 633 |
+
for entry in entries:
|
| 634 |
+
metadata = entry.get('metadata', {})
|
| 635 |
+
|
| 636 |
+
# Confidence distribution
|
| 637 |
+
confidence = metadata.get('confidence_score', 0)
|
| 638 |
+
conf_range = f"{(confidence // 20) * 20}-{(confidence // 20) * 20 + 19}"
|
| 639 |
+
stats['confidence_distribution'][conf_range] = stats['confidence_distribution'].get(conf_range, 0) + 1
|
| 640 |
+
|
| 641 |
+
# Analysis type distribution
|
| 642 |
+
analysis_type = metadata.get('analysis_type', 'unknown')
|
| 643 |
+
stats['analysis_type_distribution'][analysis_type] = stats['analysis_type_distribution'].get(analysis_type, 0) + 1
|
| 644 |
+
|
| 645 |
+
# Quality distribution
|
| 646 |
+
quality = metadata.get('analysis_quality', 'unknown')
|
| 647 |
+
stats['quality_distribution'][quality] = stats['quality_distribution'].get(quality, 0) + 1
|
| 648 |
+
|
| 649 |
+
return stats
|
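A minimal usage sketch of the dataset-handling methods added above. The class name DatasetBuilder, its constructor, and the import path are assumptions based on this commit's layout (the class is defined earlier in the file, outside this excerpt); the sample entry simply mirrors the id/prompt/response/metadata shape this file saves and loads.

from streamlit_app.core.dataset_builder import DatasetBuilder  # assumed import path

builder = DatasetBuilder()  # assumed constructor; defined earlier in the module
dataset = {
    'metadata': {'dataset_name': 'demo', 'total_entries': 1},
    'entries': [{
        'id': 'example-1',
        'prompt': 'Analyse this section for loopholes...',
        'response': 'The section assumes...',
        'metadata': {'confidence_score': 80, 'analysis_quality': 'high'}
    }]
}
report = builder.validate_dataset(dataset)                  # is_valid, issues, warnings, statistics
path = builder.save_dataset(dataset, format_type='jsonl')   # also 'json', 'csv', 'excel'
stats = builder.get_dataset_statistics(builder.load_dataset(path))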
streamlit_app/core/llm_analyzer.py
ADDED
|
@@ -0,0 +1,469 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
LLM Analyzer
|
| 4 |
+
|
| 5 |
+
Handles LLM model loading, inference, and analysis for the NZ Legislation Loophole Analysis.
|
| 6 |
+
Provides optimized prompts and response parsing for legal text analysis.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import time
|
| 11 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 12 |
+
import json
|
| 13 |
+
from llama_cpp import Llama
|
| 14 |
+
import re
|
| 15 |
+
|
| 16 |
+
class LLMAnalyzer:
|
| 17 |
+
"""LLM-based analyzer for legislation loophole detection"""
|
| 18 |
+
|
| 19 |
+
def __init__(self, model_config: Dict[str, Any]):
|
| 20 |
+
"""
|
| 21 |
+
Initialize the LLM analyzer
|
| 22 |
+
|
| 23 |
+
Args:
|
| 24 |
+
model_config: Configuration for the LLM model
|
| 25 |
+
"""
|
| 26 |
+
self.model_config = model_config
|
| 27 |
+
self.model = None
|
| 28 |
+
self.is_loaded = False
|
| 29 |
+
|
| 30 |
+
# Analysis templates
|
| 31 |
+
self.analysis_templates = {
|
| 32 |
+
'standard': {
|
| 33 |
+
'depth': 'Standard',
|
| 34 |
+
'include_recommendations': True,
|
| 35 |
+
'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences']
|
| 36 |
+
},
|
| 37 |
+
'detailed': {
|
| 38 |
+
'depth': 'Detailed',
|
| 39 |
+
'include_recommendations': True,
|
| 40 |
+
'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences', 'implementation_issues']
|
| 41 |
+
},
|
| 42 |
+
'comprehensive': {
|
| 43 |
+
'depth': 'Comprehensive',
|
| 44 |
+
'include_recommendations': True,
|
| 45 |
+
'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences',
|
| 46 |
+
'implementation_issues', 'policy_conflicts', 'enforcement_challenges']
|
| 47 |
+
}
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
# Prompt templates
|
| 51 |
+
self.prompt_templates = {
|
| 52 |
+
'loophole_analysis': self._get_loophole_analysis_template(),
|
| 53 |
+
'ambiguity_detection': self._get_ambiguity_detection_template(),
|
| 54 |
+
'recommendations': self._get_recommendations_template()
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
def _get_loophole_analysis_template(self) -> str:
|
| 58 |
+
"""Get the main loophole analysis prompt template"""
|
| 59 |
+
return """You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.
|
| 60 |
+
|
| 61 |
+
LEGISLATION TEXT:
|
| 62 |
+
{text}
|
| 63 |
+
|
| 64 |
+
TASK: Analyze this legislative text for potential loopholes, ambiguities, or unintended consequences.
|
| 65 |
+
|
| 66 |
+
INSTRUCTIONS:
|
| 67 |
+
Provide a structured analysis following this format:
|
| 68 |
+
|
| 69 |
+
1. **Text Meaning**: Explain what the text means and its intended purpose
|
| 70 |
+
2. **Key Assumptions**: Identify any assumptions the text makes that could be exploited
|
| 71 |
+
3. **Exploitable Interpretations**: Discuss how the text could be interpreted or applied in ways that circumvent its intended purpose
|
| 72 |
+
4. **Critical Loopholes**: Identify specific loopholes, ambiguities, or unintended consequences that could be used to bypass the legislation
|
| 73 |
+
5. **Circumvention Strategies**: Suggest practical methods or scenarios for exploiting these loopholes to achieve objectives contrary to the legislation's intent
|
| 74 |
+
|
| 75 |
+
{reasoning_format}
|
| 76 |
+
{recommendations_format}
|
| 77 |
+
|
| 78 |
+
ANALYSIS DEPTH: {depth}
|
| 79 |
+
FOCUS AREAS: {focus_areas}
|
| 80 |
+
"""
|
| 81 |
+
|
| 82 |
+
def _get_ambiguity_detection_template(self) -> str:
|
| 83 |
+
"""Get the ambiguity detection prompt template"""
|
| 84 |
+
return """Analyze the following legal text for ambiguities and unclear provisions:
|
| 85 |
+
|
| 86 |
+
TEXT: {text}
|
| 87 |
+
|
| 88 |
+
Identify:
|
| 89 |
+
1. Vague terms or phrases
|
| 90 |
+
2. Ambiguous references
|
| 91 |
+
3. Unclear conditions or requirements
|
| 92 |
+
4. Missing definitions
|
| 93 |
+
5. Conflicting provisions
|
| 94 |
+
|
| 95 |
+
Provide specific examples and suggest clarifications.
|
| 96 |
+
"""
|
| 97 |
+
|
| 98 |
+
def _get_recommendations_template(self) -> str:
|
| 99 |
+
"""Get the recommendations prompt template"""
|
| 100 |
+
return """Based on the loopholes and ambiguities identified, provide specific recommendations for:
|
| 101 |
+
|
| 102 |
+
1. Legislative amendments to close identified loopholes
|
| 103 |
+
2. Additional definitions or clarifications needed
|
| 104 |
+
3. Implementation guidelines or regulations
|
| 105 |
+
4. Monitoring and enforcement mechanisms
|
| 106 |
+
|
| 107 |
+
Prioritize recommendations by impact and feasibility.
|
| 108 |
+
"""
|
| 109 |
+
|
| 110 |
+
def load_model(self) -> bool:
|
| 111 |
+
"""
|
| 112 |
+
Load the LLM model
|
| 113 |
+
|
| 114 |
+
Returns:
|
| 115 |
+
True if model loaded successfully, False otherwise
|
| 116 |
+
"""
|
| 117 |
+
if self.is_loaded:
|
| 118 |
+
return True
|
| 119 |
+
|
| 120 |
+
try:
|
| 121 |
+
print("Loading LLM model...")
|
| 122 |
+
|
| 123 |
+
# Try to load from HuggingFace
|
| 124 |
+
if self.model_config.get('repo_id'):
|
| 125 |
+
self.model = Llama.from_pretrained(
|
| 126 |
+
repo_id=self.model_config['repo_id'],
|
| 127 |
+
filename=self.model_config.get('filename', ''),
|
| 128 |
+
n_ctx=self.model_config.get('context_length', 40960),
|
| 129 |
+
n_threads=min(os.cpu_count(), 8),
|
| 130 |
+
verbose=False,
|
| 131 |
+
n_gpu_layers=-1,
|
| 132 |
+
n_batch=4096,
|
| 133 |
+
logits_all=False,
|
| 134 |
+
use_mlock=True,
|
| 135 |
+
use_mmap=True,
|
| 136 |
+
)
|
| 137 |
+
else:
|
| 138 |
+
# Load from local path
|
| 139 |
+
model_path = self.model_config.get('path', '')
|
| 140 |
+
if not model_path or not os.path.exists(model_path):
|
| 141 |
+
print(f"Model path not found: {model_path}")
|
| 142 |
+
return False
|
| 143 |
+
|
| 144 |
+
self.model = Llama(
|
| 145 |
+
model_path=model_path,
|
| 146 |
+
n_ctx=self.model_config.get('context_length', 40960),
|
| 147 |
+
n_threads=min(os.cpu_count(), 8),
|
| 148 |
+
verbose=False,
|
| 149 |
+
n_gpu_layers=-1,
|
| 150 |
+
n_batch=4096,
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
self.is_loaded = True
|
| 154 |
+
print("Model loaded successfully")
|
| 155 |
+
return True
|
| 156 |
+
|
| 157 |
+
except Exception as e:
|
| 158 |
+
print(f"Error loading model: {e}")
|
| 159 |
+
return False
|
| 160 |
+
|
| 161 |
+
def unload_model(self):
|
| 162 |
+
"""Unload the model to free memory"""
|
| 163 |
+
if self.model:
|
| 164 |
+
del self.model
|
| 165 |
+
self.model = None
|
| 166 |
+
self.is_loaded = False
|
| 167 |
+
|
| 168 |
+
def generate_chat_template(self, system_prompt: str, user_message: str = "") -> str:
|
| 169 |
+
"""
|
| 170 |
+
Generate a chat template for the model
|
| 171 |
+
|
| 172 |
+
Args:
|
| 173 |
+
system_prompt: The system prompt
|
| 174 |
+
user_message: The user message (optional)
|
| 175 |
+
|
| 176 |
+
Returns:
|
| 177 |
+
Formatted chat template
|
| 178 |
+
"""
|
| 179 |
+
chat_messages = []
|
| 180 |
+
|
| 181 |
+
# System message
|
| 182 |
+
if system_prompt:
|
| 183 |
+
chat_messages.append("<|im_start|>system")
|
| 184 |
+
chat_messages.append(system_prompt)
|
| 185 |
+
chat_messages.append("<|im_end|>")
|
| 186 |
+
|
| 187 |
+
# User message
|
| 188 |
+
if user_message:
|
| 189 |
+
chat_messages.append("<|im_start|>user")
|
| 190 |
+
chat_messages.append(user_message)
|
| 191 |
+
chat_messages.append("<|im_end|>")
|
| 192 |
+
|
| 193 |
+
# Assistant message with generation prompt
|
| 194 |
+
chat_messages.append("<|im_start|>assistant")
|
| 195 |
+
chat_messages.append("") # Empty for generation
|
| 196 |
+
|
| 197 |
+
return "\n".join(chat_messages)
|
| 198 |
+
|
| 199 |
+
def analyze_chunk(self, chunk: str, analysis_type: str = 'standard',
|
| 200 |
+
cache_manager = None) -> Dict[str, Any]:
|
| 201 |
+
"""
|
| 202 |
+
Analyze a single text chunk for loopholes and ambiguities
|
| 203 |
+
|
| 204 |
+
Args:
|
| 205 |
+
chunk: Text chunk to analyze
|
| 206 |
+
analysis_type: Type of analysis to perform
|
| 207 |
+
cache_manager: Cache manager instance for caching results
|
| 208 |
+
|
| 209 |
+
Returns:
|
| 210 |
+
Analysis results
|
| 211 |
+
"""
|
| 212 |
+
if not self.is_loaded and not self.load_model():
|
| 213 |
+
return {
|
| 214 |
+
'error': 'Model not loaded',
|
| 215 |
+
'chunk': chunk[:100] + "..." if len(chunk) > 100 else chunk
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
# Check cache first
|
| 219 |
+
if cache_manager:
|
| 220 |
+
cached_result = cache_manager.get(
|
| 221 |
+
chunk,
|
| 222 |
+
self.model_config,
|
| 223 |
+
{'analysis_type': analysis_type}
|
| 224 |
+
)
|
| 225 |
+
if cached_result:
|
| 226 |
+
return cached_result
|
| 227 |
+
|
| 228 |
+
try:
|
| 229 |
+
# Prepare analysis template
|
| 230 |
+
template_config = self.analysis_templates.get(analysis_type, self.analysis_templates['standard'])
|
| 231 |
+
|
| 232 |
+
# Build the full prompt
|
| 233 |
+
reasoning_format = """
|
| 234 |
+
Write your complete analysis between <start_working_out> and <end_working_out>.
|
| 235 |
+
|
| 236 |
+
Then provide your overall conclusion between <SOLUTION> and </SOLUTION>.
|
| 237 |
+
"""
|
| 238 |
+
|
| 239 |
+
recommendations_format = """
|
| 240 |
+
**Recommendations**: Provide specific recommendations for addressing identified issues.
|
| 241 |
+
""" if template_config['include_recommendations'] else ""
|
| 242 |
+
|
| 243 |
+
full_prompt = self.prompt_templates['loophole_analysis'].format(
|
| 244 |
+
text=chunk,
|
| 245 |
+
reasoning_format=reasoning_format,
|
| 246 |
+
recommendations_format=recommendations_format,
|
| 247 |
+
depth=template_config['depth'],
|
| 248 |
+
focus_areas=', '.join(template_config['focus_areas'])
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
# Generate chat template
|
| 252 |
+
chat_template = self.generate_chat_template(full_prompt)
|
| 253 |
+
|
| 254 |
+
# Generate response
|
| 255 |
+
response = self._generate_response(chat_template)
|
| 256 |
+
|
| 257 |
+
# Parse and structure the response
|
| 258 |
+
structured_response = self._parse_response(response)
|
| 259 |
+
|
| 260 |
+
# Add metadata
|
| 261 |
+
result = {
|
| 262 |
+
'chunk': chunk,
|
| 263 |
+
'analysis_type': analysis_type,
|
| 264 |
+
'model_config': self.model_config,
|
| 265 |
+
'response': response,
|
| 266 |
+
'structured_analysis': structured_response,
|
| 267 |
+
'processing_time': time.time(),
|
| 268 |
+
'chunk_size': len(chunk),
|
| 269 |
+
'word_count': len(chunk.split())
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
# Cache the result
|
| 273 |
+
if cache_manager:
|
| 274 |
+
cache_manager.put(chunk, result, self.model_config, {'analysis_type': analysis_type})
|
| 275 |
+
|
| 276 |
+
return result
|
| 277 |
+
|
| 278 |
+
except Exception as e:
|
| 279 |
+
return {
|
| 280 |
+
'error': str(e),
|
| 281 |
+
'chunk': chunk[:100] + "..." if len(chunk) > 100 else chunk
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
def _generate_response(self, prompt: str, max_tokens: int = None) -> str:
|
| 285 |
+
"""
|
| 286 |
+
Generate a response from the model
|
| 287 |
+
|
| 288 |
+
Args:
|
| 289 |
+
prompt: Input prompt
|
| 290 |
+
max_tokens: Maximum tokens to generate
|
| 291 |
+
|
| 292 |
+
Returns:
|
| 293 |
+
Generated response
|
| 294 |
+
"""
|
| 295 |
+
if max_tokens is None:
|
| 296 |
+
max_tokens = self.model_config.get('max_tokens', 4096)
|
| 297 |
+
|
| 298 |
+
try:
|
| 299 |
+
response = self.model(
|
| 300 |
+
prompt,
|
| 301 |
+
max_tokens=max_tokens,
|
| 302 |
+
temperature=self.model_config.get('temperature', 0.3),
|
| 303 |
+
top_p=self.model_config.get('top_p', 0.85),
|
| 304 |
+
top_k=self.model_config.get('top_k', 50),
|
| 305 |
+
repeat_penalty=self.model_config.get('repeat_penalty', 1.15),
|
| 306 |
+
stop=["<end_working_out>", "</SOLUTION>", "<|im_end|>"],
|
| 307 |
+
echo=False
|
| 308 |
+
)
|
| 309 |
+
|
| 310 |
+
return response['choices'][0]['text'].strip()
|
| 311 |
+
|
| 312 |
+
except Exception as e:
|
| 313 |
+
print(f"Error generating response: {e}")
|
| 314 |
+
return ""
|
| 315 |
+
|
| 316 |
+
def _parse_response(self, response: str) -> Dict[str, Any]:
|
| 317 |
+
"""
|
| 318 |
+
Parse the LLM response into structured data
|
| 319 |
+
|
| 320 |
+
Args:
|
| 321 |
+
response: Raw LLM response
|
| 322 |
+
|
| 323 |
+
Returns:
|
| 324 |
+
Structured analysis data
|
| 325 |
+
"""
|
| 326 |
+
structured = {
|
| 327 |
+
'text_meaning': '',
|
| 328 |
+
'key_assumptions': [],
|
| 329 |
+
'exploitable_interpretations': [],
|
| 330 |
+
'critical_loopholes': [],
|
| 331 |
+
'circumvention_strategies': [],
|
| 332 |
+
'recommendations': [],
|
| 333 |
+
'confidence_score': 0,
|
| 334 |
+
'analysis_quality': 'unknown'
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
try:
|
| 338 |
+
# Extract sections using regex patterns
|
| 339 |
+
patterns = {
|
| 340 |
+
'text_meaning': r'\*\*Text Meaning\*\*:\s*(.*?)(?=\*\*|$)',
|
| 341 |
+
'key_assumptions': r'\*\*Key Assumptions\*\*:\s*(.*?)(?=\*\*|$)',
|
| 342 |
+
'exploitable_interpretations': r'\*\*Exploitable Interpretations\*\*:\s*(.*?)(?=\*\*|$)',
|
| 343 |
+
'critical_loopholes': r'\*\*Critical Loopholes\*\*:\s*(.*?)(?=\*\*|$)',
|
| 344 |
+
'circumvention_strategies': r'\*\*Circumvention Strategies\*\*:\s*(.*?)(?=\*\*|$)',
|
| 345 |
+
'recommendations': r'\*\*Recommendations\*\*:\s*(.*?)(?=\*\*|$)',
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
for key, pattern in patterns.items():
|
| 349 |
+
matches = re.findall(pattern, response, re.DOTALL | re.IGNORECASE)
|
| 350 |
+
if matches:
|
| 351 |
+
content = matches[0].strip()
|
| 352 |
+
if key in ['key_assumptions', 'exploitable_interpretations',
|
| 353 |
+
'critical_loopholes', 'circumvention_strategies', 'recommendations']:
|
| 354 |
+
# Split into list items
|
| 355 |
+
items = re.findall(r'(?:\d+\.|-|\•)\s*(.*?)(?=(?:\d+\.|-|\•)|$)',
|
| 356 |
+
content, re.DOTALL)
|
| 357 |
+
structured[key] = [item.strip() for item in items if item.strip()]
|
| 358 |
+
else:
|
| 359 |
+
structured[key] = content
|
| 360 |
+
|
| 361 |
+
# Calculate confidence score based on analysis completeness
|
| 362 |
+
completeness_score = 0
|
| 363 |
+
if structured['text_meaning']:
|
| 364 |
+
completeness_score += 20
|
| 365 |
+
for key in ['key_assumptions', 'exploitable_interpretations',
|
| 366 |
+
'critical_loopholes', 'circumvention_strategies']:
|
| 367 |
+
if structured[key]:
|
| 368 |
+
completeness_score += 20
|
| 369 |
+
|
| 370 |
+
structured['confidence_score'] = min(100, completeness_score)
|
| 371 |
+
|
| 372 |
+
# Determine analysis quality
|
| 373 |
+
if structured['confidence_score'] >= 80:
|
| 374 |
+
structured['analysis_quality'] = 'high'
|
| 375 |
+
elif structured['confidence_score'] >= 60:
|
| 376 |
+
structured['analysis_quality'] = 'medium'
|
| 377 |
+
else:
|
| 378 |
+
structured['analysis_quality'] = 'low'
|
| 379 |
+
|
| 380 |
+
except Exception as e:
|
| 381 |
+
print(f"Error parsing response: {e}")
|
| 382 |
+
structured['error'] = str(e)
|
| 383 |
+
|
| 384 |
+
return structured
|
| 385 |
+
|
| 386 |
+
def batch_analyze_chunks(self, chunks: List[str], analysis_type: str = 'standard',
|
| 387 |
+
cache_manager = None, progress_callback = None) -> List[Dict[str, Any]]:
|
| 388 |
+
"""
|
| 389 |
+
Analyze multiple chunks in batch
|
| 390 |
+
|
| 391 |
+
Args:
|
| 392 |
+
chunks: List of text chunks to analyze
|
| 393 |
+
analysis_type: Type of analysis to perform
|
| 394 |
+
cache_manager: Cache manager instance
|
| 395 |
+
progress_callback: Callback function for progress updates
|
| 396 |
+
|
| 397 |
+
Returns:
|
| 398 |
+
List of analysis results
|
| 399 |
+
"""
|
| 400 |
+
results = []
|
| 401 |
+
total_chunks = len(chunks)
|
| 402 |
+
|
| 403 |
+
for i, chunk in enumerate(chunks):
|
| 404 |
+
if progress_callback:
|
| 405 |
+
progress = (i + 1) / total_chunks
|
| 406 |
+
progress_callback(progress, f"Analyzing chunk {i + 1}/{total_chunks}")
|
| 407 |
+
|
| 408 |
+
result = self.analyze_chunk(chunk, analysis_type, cache_manager)
|
| 409 |
+
results.append(result)
|
| 410 |
+
|
| 411 |
+
return results
|
| 412 |
+
|
| 413 |
+
def get_model_info(self) -> Dict[str, Any]:
|
| 414 |
+
"""Get information about the loaded model"""
|
| 415 |
+
if not self.is_loaded:
|
| 416 |
+
return {'status': 'not_loaded'}
|
| 417 |
+
|
| 418 |
+
try:
|
| 419 |
+
return {
|
| 420 |
+
'status': 'loaded',
|
| 421 |
+
'config': self.model_config,
|
| 422 |
+
'model_type': type(self.model).__name__,
|
| 423 |
+
'context_length': self.model_config.get('context_length', 'unknown'),
|
| 424 |
+
'vocab_size': getattr(self.model, 'vocab_size', 'unknown')
|
| 425 |
+
}
|
| 426 |
+
except Exception as e:
|
| 427 |
+
return {
|
| 428 |
+
'status': 'error',
|
| 429 |
+
'error': str(e)
|
| 430 |
+
}
|
| 431 |
+
|
| 432 |
+
def validate_model_config(self) -> Dict[str, Any]:
|
| 433 |
+
"""Validate the current model configuration"""
|
| 434 |
+
validation = {
|
| 435 |
+
'is_valid': True,
|
| 436 |
+
'issues': [],
|
| 437 |
+
'warnings': []
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
# Check required parameters
|
| 441 |
+
required_params = ['context_length', 'max_tokens']
|
| 442 |
+
for param in required_params:
|
| 443 |
+
if param not in self.model_config:
|
| 444 |
+
validation['issues'].append(f"Missing required parameter: {param}")
|
| 445 |
+
validation['is_valid'] = False
|
| 446 |
+
|
| 447 |
+
# Check parameter ranges
|
| 448 |
+
if 'context_length' in self.model_config:
|
| 449 |
+
if self.model_config['context_length'] < 1024:
|
| 450 |
+
validation['issues'].append("Context length too small (minimum: 1024)")
|
| 451 |
+
validation['is_valid'] = False
|
| 452 |
+
|
| 453 |
+
if 'max_tokens' in self.model_config:
|
| 454 |
+
if self.model_config['max_tokens'] < 64:
|
| 455 |
+
validation['issues'].append("Max tokens too small (minimum: 64)")
|
| 456 |
+
validation['is_valid'] = False
|
| 457 |
+
|
| 458 |
+
if 'temperature' in self.model_config:
|
| 459 |
+
temp = self.model_config['temperature']
|
| 460 |
+
if not (0 <= temp <= 2):
|
| 461 |
+
validation['issues'].append("Temperature out of valid range (0-2)")
|
| 462 |
+
validation['is_valid'] = False
|
| 463 |
+
|
| 464 |
+
# Check model path/file
|
| 465 |
+
if 'path' in self.model_config and self.model_config['path']:
|
| 466 |
+
if not os.path.exists(self.model_config['path']):
|
| 467 |
+
validation['warnings'].append(f"Model file not found: {self.model_config['path']}")
|
| 468 |
+
|
| 469 |
+
return validation
|
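A minimal sketch of driving the LLMAnalyzer class defined above. The import path, model path, and chunk text are placeholders, not values from this commit; the config keys mirror those read by load_model, _generate_response, and validate_model_config. Internally, generate_chat_template wraps the prompt in ChatML-style <|im_start|>/<|im_end|> markers before generation.

from streamlit_app.core.llm_analyzer import LLMAnalyzer  # assumed import path

config = {
    'path': 'models/example.gguf',   # placeholder; alternatively set 'repo_id' and 'filename'
    'context_length': 8192,
    'max_tokens': 1024,
    'temperature': 0.3,
}
analyzer = LLMAnalyzer(config)
if analyzer.validate_model_config()['is_valid'] and analyzer.load_model():
    result = analyzer.analyze_chunk('5: No person may...', analysis_type='standard')
    print(result['structured_analysis']['analysis_quality'])
    analyzer.unload_model()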
streamlit_app/core/text_processor.py
ADDED
|
@@ -0,0 +1,377 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Text Processor
|
| 4 |
+
|
| 5 |
+
Handles text cleaning, chunking, and preprocessing for the NZ Legislation Loophole Analysis.
|
| 6 |
+
Optimized for legal/legislative content with specialized cleaning and structuring.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 11 |
+
import hashlib
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
class TextProcessor:
|
| 15 |
+
"""Advanced text processing for legislation analysis"""
|
| 16 |
+
|
| 17 |
+
def __init__(self):
|
| 18 |
+
"""Initialize the text processor with legal-specific patterns"""
|
| 19 |
+
# Legal-specific patterns
|
| 20 |
+
self.section_pattern = re.compile(r'^(\d+):', re.MULTILINE)
|
| 21 |
+
self.act_name_pattern = re.compile(r'(\b\w+(?:\s+\w+)*)\s+Act\s+(\d{4})', re.IGNORECASE)
|
| 22 |
+
self.date_patterns = [
|
| 23 |
+
(r'(\d{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})',
|
| 24 |
+
lambda m: f"{m.group(1)} {m.group(2)}"),
|
| 25 |
+
(r'(\d{1,2})/(\d{1,2})/(\d{4})', r'\1/\2/\3'),
|
| 26 |
+
(r'(\d{4})-(\d{2})-(\d{2})', r'\1-\2-\3')
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
# NZ-specific legal terms
|
| 30 |
+
self.nz_terms = {
|
| 31 |
+
'New Zealand': 'New Zealand',
|
| 32 |
+
'Parliament': 'Parliament',
|
| 33 |
+
'Crown': 'Crown',
|
| 34 |
+
'Government': 'Government',
|
| 35 |
+
'Treaty of Waitangi': 'Treaty of Waitangi',
|
| 36 |
+
'NZB': 'NZB',
|
| 37 |
+
'Her Majesty': 'Her Majesty',
|
| 38 |
+
'Governor-General': 'Governor-General'
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
def clean_text(self, text: str, preserve_structure: bool = True) -> str:
|
| 42 |
+
"""
|
| 43 |
+
Clean and normalize text for better processing, optimized for legal content
|
| 44 |
+
|
| 45 |
+
Args:
|
| 46 |
+
text: Raw text to clean
|
| 47 |
+
preserve_structure: Whether to preserve legal document structure
|
| 48 |
+
|
| 49 |
+
Returns:
|
| 50 |
+
Cleaned text
|
| 51 |
+
"""
|
| 52 |
+
if not text:
|
| 53 |
+
return ""
|
| 54 |
+
|
| 55 |
+
# Preserve section numbers and legal structure if requested
|
| 56 |
+
if preserve_structure:
|
| 57 |
+
# Keep section numbers like "1:", "2:", etc.
|
| 58 |
+
text = self.section_pattern.sub(r'\1', text)
|
| 59 |
+
|
| 60 |
+
# Remove excessive whitespace but preserve paragraph structure
|
| 61 |
+
text = re.sub(r'[ \t]+', ' ', text) # Replace multiple spaces/tabs with single space
|
| 62 |
+
text = re.sub(r'\n\s*\n', '\n\n', text) # Preserve paragraph breaks but clean up
|
| 63 |
+
text = re.sub(r'\n{3,}', '\n\n', text) # Reduce excessive newlines to double
|
| 64 |
+
|
| 65 |
+
# Remove control characters but preserve legal formatting
|
| 66 |
+
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)
|
| 67 |
+
|
| 68 |
+
# Handle legal-specific characters and formatting
|
| 69 |
+
allowed_chars = r'\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/\@\#\$\%\^\&\*\+\=\~\`°§'
|
| 70 |
+
text = re.sub(r'[^' + allowed_chars + ']', '', text)
|
| 71 |
+
|
| 72 |
+
# Normalize quotes and apostrophes for legal text
|
| 73 |
+
text = re.sub(r'[“”]', '"', text)
|
| 74 |
+
text = re.sub(r"['']", "'", text)
|
| 75 |
+
text = re.sub(r'`', "'", text)
|
| 76 |
+
|
| 77 |
+
# Clean up legal numbering and references
|
| 78 |
+
text = re.sub(r'section\s+(\d+)', r'section \1', text, flags=re.IGNORECASE)
|
| 79 |
+
|
| 80 |
+
# Normalize date formats
|
| 81 |
+
for pattern, replacement in self.date_patterns:
|
| 82 |
+
if callable(replacement):
|
| 83 |
+
text = re.compile(pattern, re.IGNORECASE).sub(replacement, text)
|
| 84 |
+
else:
|
| 85 |
+
text = re.compile(pattern, re.IGNORECASE).sub(replacement, text)
|
| 86 |
+
|
| 87 |
+
# Normalize act names with years
|
| 88 |
+
text = self.act_name_pattern.sub(r'\1 Act', text)
|
| 89 |
+
|
| 90 |
+
# Clean up amendment references
|
| 91 |
+
text = re.sub(r'[Aa]mendment\(s\)\s+incorporated\s+in\s+the\s+[Aa]ct\(s\)', 'Amendments incorporated', text)
|
| 92 |
+
|
| 93 |
+
# Normalize section references
|
| 94 |
+
text = re.sub(r'section\s+\d+\(\d+\)\([a-zA-Z]\)', lambda m: m.group(0).lower(), text)
|
| 95 |
+
|
| 96 |
+
# Generic pattern for legal document sections
|
| 97 |
+
text = re.sub(r'(\b(?:section|part|chapter|article|clause|subsection|paragraph))\s+(\d+[a-zA-Z]*)',
|
| 98 |
+
lambda m: f"{m.group(1)} {m.group(2)}", text, flags=re.IGNORECASE)
|
| 99 |
+
|
| 100 |
+
# NZ-specific legal enhancements
|
| 101 |
+
for term, normalized in self.nz_terms.items():
|
| 102 |
+
text = re.sub(re.escape(term), normalized, text, flags=re.IGNORECASE)
|
| 103 |
+
|
| 104 |
+
# Handle Maori-specific characters if present
|
| 105 |
+
maori_chars = 'āēīōūwhĀĒĪŌŪWH'
|
| 106 |
+
allowed_chars += maori_chars
|
| 107 |
+
text = re.sub(r'[^' + allowed_chars + ']', '', text)
|
| 108 |
+
|
| 109 |
+
# Remove empty lines and trim while preserving legal structure
|
| 110 |
+
lines = []
|
| 111 |
+
for line in text.split('\n'):
|
| 112 |
+
stripped = line.strip()
|
| 113 |
+
if stripped: # Keep non-empty lines
|
| 114 |
+
if preserve_structure and re.match(r'^\d+:', stripped):
|
| 115 |
+
lines.append(stripped) # Preserve section headers
|
| 116 |
+
else:
|
| 117 |
+
lines.append(stripped)
|
| 118 |
+
|
| 119 |
+
text = '\n'.join(lines)
|
| 120 |
+
|
| 121 |
+
return text.strip()
|
| 122 |
+
|
| 123 |
+
def chunk_text(self, text: str, chunk_size: int = 4096, overlap: int = 256,
|
| 124 |
+
method: str = "sentence") -> List[str]:
|
| 125 |
+
"""
|
| 126 |
+
Split text into overlapping chunks for processing
|
| 127 |
+
|
| 128 |
+
Args:
|
| 129 |
+
text: Text to chunk
|
| 130 |
+
chunk_size: Size of each chunk
|
| 131 |
+
overlap: Overlap between chunks
|
| 132 |
+
method: Chunking method ('sentence', 'word', 'character')
|
| 133 |
+
|
| 134 |
+
Returns:
|
| 135 |
+
List of text chunks
|
| 136 |
+
"""
|
| 137 |
+
if not text or len(text) <= chunk_size:
|
| 138 |
+
return [text] if text else []
|
| 139 |
+
|
| 140 |
+
chunks = []
|
| 141 |
+
|
| 142 |
+
if method == "sentence":
|
| 143 |
+
chunks = self._chunk_by_sentence(text, chunk_size, overlap)
|
| 144 |
+
elif method == "word":
|
| 145 |
+
chunks = self._chunk_by_word(text, chunk_size, overlap)
|
| 146 |
+
else: # character
|
| 147 |
+
chunks = self._chunk_by_character(text, chunk_size, overlap)
|
| 148 |
+
|
| 149 |
+
return chunks
|
| 150 |
+
|
| 151 |
+
def _chunk_by_sentence(self, text: str, chunk_size: int, overlap: int) -> List[str]:
|
| 152 |
+
"""Chunk text by sentence boundaries"""
|
| 153 |
+
# Split into sentences (rough approximation)
|
| 154 |
+
sentence_pattern = r'(?<=[.!?])\s+'
|
| 155 |
+
sentences = re.split(sentence_pattern, text)
|
| 156 |
+
|
| 157 |
+
chunks = []
|
| 158 |
+
current_chunk = ""
|
| 159 |
+
overlap_text = ""
|
| 160 |
+
|
| 161 |
+
for sentence in sentences:
|
| 162 |
+
if not sentence.strip():
|
| 163 |
+
continue
|
| 164 |
+
|
| 165 |
+
# Check if adding this sentence would exceed chunk size
|
| 166 |
+
potential_chunk = current_chunk + sentence + " "
|
| 167 |
+
|
| 168 |
+
if len(potential_chunk) > chunk_size and current_chunk:
|
| 169 |
+
# Save current chunk
|
| 170 |
+
chunks.append(current_chunk.strip())
|
| 171 |
+
|
| 172 |
+
# Start new chunk with overlap
|
| 173 |
+
if overlap > 0 and len(current_chunk) > overlap:
|
| 174 |
+
overlap_text = current_chunk[-overlap:].strip()
|
| 175 |
+
current_chunk = overlap_text + " " + sentence + " "
|
| 176 |
+
else:
|
| 177 |
+
current_chunk = sentence + " "
|
| 178 |
+
else:
|
| 179 |
+
current_chunk = potential_chunk
|
| 180 |
+
|
| 181 |
+
# Add the last chunk
|
| 182 |
+
if current_chunk.strip():
|
| 183 |
+
chunks.append(current_chunk.strip())
|
| 184 |
+
|
| 185 |
+
return chunks
|
| 186 |
+
|
| 187 |
+
def _chunk_by_word(self, text: str, chunk_size: int, overlap: int) -> List[str]:
|
| 188 |
+
"""Chunk text by word boundaries"""
|
| 189 |
+
words = text.split()
|
| 190 |
+
chunks = []
|
| 191 |
+
|
| 192 |
+
if not words:
|
| 193 |
+
return []
|
| 194 |
+
|
| 195 |
+
start = 0
|
| 196 |
+
while start < len(words):
|
| 197 |
+
end = start + 1
|
| 198 |
+
chunk_words = []
|
| 199 |
+
|
| 200 |
+
# Build chunk up to chunk_size
|
| 201 |
+
while end <= len(words):
|
| 202 |
+
potential_chunk = " ".join(words[start:end])
|
| 203 |
+
if len(potential_chunk) > chunk_size:
|
| 204 |
+
break
|
| 205 |
+
chunk_words = words[start:end]
|
| 206 |
+
end += 1
|
| 207 |
+
|
| 208 |
+
if chunk_words:
|
| 209 |
+
chunk = " ".join(chunk_words)
|
| 210 |
+
chunks.append(chunk)
|
| 211 |
+
|
| 212 |
+
# Move start position with overlap
|
| 213 |
+
overlap_words = max(0, min(overlap // 5, len(chunk_words))) # Rough word overlap
|
| 214 |
+
start = max(start + 1, end - overlap_words)
|
| 215 |
+
else:
|
| 216 |
+
break
|
| 217 |
+
|
| 218 |
+
return chunks
|
| 219 |
+
|
| 220 |
+
def _chunk_by_character(self, text: str, chunk_size: int, overlap: int) -> List[str]:
|
| 221 |
+
"""Chunk text by character count (simple fallback)"""
|
| 222 |
+
chunks = []
|
| 223 |
+
start = 0
|
| 224 |
+
|
| 225 |
+
while start < len(text):
|
| 226 |
+
end = min(start + chunk_size, len(text))
|
| 227 |
+
chunk = text[start:end]
|
| 228 |
+
chunks.append(chunk)
|
| 229 |
+
|
| 230 |
+
# Move start with overlap
|
| 231 |
+
start = end - overlap if end < len(text) else len(text)
|
| 232 |
+
|
| 233 |
+
return chunks
|
| 234 |
+
|
| 235 |
+
def extract_metadata(self, text: str) -> Dict[str, Any]:
|
| 236 |
+
"""Extract metadata from legislation text"""
|
| 237 |
+
metadata = {
|
| 238 |
+
'sections': [],
|
| 239 |
+
'acts_referenced': [],
|
| 240 |
+
'dates': [],
|
| 241 |
+
'word_count': len(text.split()),
|
| 242 |
+
'character_count': len(text),
|
| 243 |
+
'has_nz_references': False,
|
| 244 |
+
'has_maori_terms': False
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
# Extract section numbers
|
| 248 |
+
sections = self.section_pattern.findall(text)
|
| 249 |
+
metadata['sections'] = [int(s) for s in sections]
|
| 250 |
+
|
| 251 |
+
# Extract referenced acts
|
| 252 |
+
acts = self.act_name_pattern.findall(text)
|
| 253 |
+
metadata['acts_referenced'] = [f"{act[0]} Act" for act in acts]
|
| 254 |
+
|
| 255 |
+
# Check for NZ-specific references
|
| 256 |
+
nz_indicators = ['New Zealand', 'Parliament', 'Crown', 'Government', 'Treaty of Waitangi']
|
| 257 |
+
metadata['has_nz_references'] = any(term in text for term in nz_indicators)
|
| 258 |
+
|
| 259 |
+
# Check for Maori terms
|
| 260 |
+
maori_indicators = ['ā', 'ē', 'ī', 'ō', 'ū', 'whakapapa', 'tangata whenua', 'mana']
|
| 261 |
+
metadata['has_maori_terms'] = any(term in text.lower() for term in maori_indicators)
|
| 262 |
+
|
| 263 |
+
# Extract dates (basic)
|
| 264 |
+
date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b'
|
| 265 |
+
dates = re.findall(date_pattern, text)
|
| 266 |
+
metadata['dates'] = dates
|
| 267 |
+
|
| 268 |
+
return metadata
|
| 269 |
+
|
| 270 |
+
def calculate_text_hash(self, text: str) -> str:
|
| 271 |
+
"""Calculate SHA-256 hash of text for caching"""
|
| 272 |
+
return hashlib.sha256(text.encode('utf-8')).hexdigest()
|
| 273 |
+
|
| 274 |
+
def get_chunk_statistics(self, chunks: List[str]) -> Dict[str, Any]:
|
| 275 |
+
"""Get statistics about text chunks"""
|
| 276 |
+
if not chunks:
|
| 277 |
+
return {
|
| 278 |
+
'total_chunks': 0,
|
| 279 |
+
'avg_chunk_size': 0,
|
| 280 |
+
'min_chunk_size': 0,
|
| 281 |
+
'max_chunk_size': 0,
|
| 282 |
+
'total_characters': 0
|
| 283 |
+
}
|
| 284 |
+
|
| 285 |
+
chunk_sizes = [len(chunk) for chunk in chunks]
|
| 286 |
+
|
| 287 |
+
return {
|
| 288 |
+
'total_chunks': len(chunks),
|
| 289 |
+
'avg_chunk_size': sum(chunk_sizes) / len(chunks),
|
| 290 |
+
'min_chunk_size': min(chunk_sizes),
|
| 291 |
+
'max_chunk_size': max(chunk_sizes),
|
| 292 |
+
'total_characters': sum(chunk_sizes)
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
def preprocess_legislation_json(self, json_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 296 |
+
"""Preprocess legislation data from JSON format"""
|
| 297 |
+
processed = {
|
| 298 |
+
'id': json_data.get('id', ''),
|
| 299 |
+
'title': json_data.get('title', ''),
|
| 300 |
+
'year': json_data.get('year', ''),
|
| 301 |
+
'source': json_data.get('source', ''),
|
| 302 |
+
'original_text': json_data.get('text', ''),
|
| 303 |
+
'cleaned_text': '',
|
| 304 |
+
'chunks': [],
|
| 305 |
+
'metadata': {},
|
| 306 |
+
'processing_stats': {}
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
# Clean the text
|
| 310 |
+
raw_text = json_data.get('text', '')
|
| 311 |
+
processed['cleaned_text'] = self.clean_text(raw_text)
|
| 312 |
+
|
| 313 |
+
# Extract metadata
|
| 314 |
+
processed['metadata'] = self.extract_metadata(processed['cleaned_text'])
|
| 315 |
+
|
| 316 |
+
return processed
|
| 317 |
+
|
| 318 |
+
def batch_process_texts(self, texts: List[str], chunk_size: int = 4096,
|
| 319 |
+
overlap: int = 256) -> List[Dict[str, Any]]:
|
| 320 |
+
"""Process multiple texts in batch"""
|
| 321 |
+
results = []
|
| 322 |
+
|
| 323 |
+
for text in texts:
|
| 324 |
+
cleaned = self.clean_text(text)
|
| 325 |
+
chunks = self.chunk_text(cleaned, chunk_size, overlap)
|
| 326 |
+
metadata = self.extract_metadata(cleaned)
|
| 327 |
+
stats = self.get_chunk_statistics(chunks)
|
| 328 |
+
|
| 329 |
+
result = {
|
| 330 |
+
'original_text': text,
|
| 331 |
+
'cleaned_text': cleaned,
|
| 332 |
+
'chunks': chunks,
|
| 333 |
+
'metadata': metadata,
|
| 334 |
+
'processing_stats': stats
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
results.append(result)
|
| 338 |
+
|
| 339 |
+
return results
|
| 340 |
+
|
| 341 |
+
def validate_text_quality(self, text: str) -> Dict[str, Any]:
|
| 342 |
+
"""Validate and assess text quality for processing"""
|
| 343 |
+
quality = {
|
| 344 |
+
'is_valid': True,
|
| 345 |
+
'issues': [],
|
| 346 |
+
'score': 100,
|
| 347 |
+
'metrics': {}
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
# Check minimum length
|
| 351 |
+
if len(text.strip()) < 10:
|
| 352 |
+
quality['issues'].append("Text too short")
|
| 353 |
+
quality['score'] -= 50
|
| 354 |
+
|
| 355 |
+
# Check for excessive special characters
|
| 356 |
+
special_chars = len(re.findall(r'[^\w\s]', text))
|
| 357 |
+
special_ratio = special_chars / len(text) if text else 0
|
| 358 |
+
if special_ratio > 0.3:
|
| 359 |
+
quality['issues'].append("High special character ratio")
|
| 360 |
+
quality['score'] -= 20
|
| 361 |
+
|
| 362 |
+
# Check for legal content indicators
|
| 363 |
+
legal_indicators = ['section', 'act', 'law', 'regulation', 'clause', 'subsection']
|
| 364 |
+
has_legal_content = any(indicator in text.lower() for indicator in legal_indicators)
|
| 365 |
+
if not has_legal_content:
|
| 366 |
+
quality['issues'].append("May not be legal content")
|
| 367 |
+
quality['score'] -= 30
|
| 368 |
+
|
| 369 |
+
quality['is_valid'] = len(quality['issues']) == 0
|
| 370 |
+
quality['metrics'] = {
|
| 371 |
+
'length': len(text),
|
| 372 |
+
'word_count': len(text.split()),
|
| 373 |
+
'special_char_ratio': special_ratio,
|
| 374 |
+
'has_legal_content': has_legal_content
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
return quality
|
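A short sketch of the text-processing pipeline this class provides; the import path is assumed from this commit's layout and the sample string is invented, standing in for a legislation record.

from streamlit_app.core.text_processor import TextProcessor  # assumed import path

processor = TextProcessor()
raw = '1: This Act binds the Crown. 2: In this Act, unless the context otherwise requires, section 3 applies.'
quality = processor.validate_text_quality(raw)            # length and legal-content heuristics
cleaned = processor.clean_text(raw, preserve_structure=True)
chunks = processor.chunk_text(cleaned, chunk_size=4096, overlap=256, method='sentence')
stats = processor.get_chunk_statistics(chunks)
cache_key = processor.calculate_text_hash(cleaned)        # SHA-256 hex digest, used for caching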
streamlit_app/utils/__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (11.8 kB). View file
|
|
|
streamlit_app/utils/__pycache__/performance.cpython-312.pyc
ADDED
|
Binary file (13.7 kB). View file
|
|
|
streamlit_app/utils/__pycache__/ui_helpers.cpython-312.pyc
ADDED
|
Binary file (21.1 kB). View file
|
|
|
streamlit_app/utils/config.py
ADDED
|
@@ -0,0 +1,241 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Configuration Manager
|
| 4 |
+
|
| 5 |
+
Handles all configuration settings for the NZ Legislation Loophole Analysis application.
|
| 6 |
+
Provides default configurations, persistent storage, and validation.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Dict, Any, Optional
|
| 13 |
+
import streamlit as st
|
| 14 |
+
|
| 15 |
+
class ConfigManager:
|
| 16 |
+
"""Configuration manager for the application"""
|
| 17 |
+
|
| 18 |
+
def __init__(self, config_file: str = None):
|
| 19 |
+
"""
|
| 20 |
+
Initialize configuration manager
|
| 21 |
+
|
| 22 |
+
Args:
|
| 23 |
+
config_file: Path to configuration file (optional)
|
| 24 |
+
"""
|
| 25 |
+
if config_file is None:
|
| 26 |
+
config_dir = Path(__file__).parent.parent / 'config'
|
| 27 |
+
config_dir.mkdir(exist_ok=True)
|
| 28 |
+
config_file = config_dir / 'app_config.json'
|
| 29 |
+
|
| 30 |
+
self.config_file = Path(config_file)
|
| 31 |
+
self._config = {}
|
| 32 |
+
self._load_config()
|
| 33 |
+
|
| 34 |
+
def _load_config(self):
|
| 35 |
+
"""Load configuration from file or use defaults"""
|
| 36 |
+
if self.config_file.exists():
|
| 37 |
+
try:
|
| 38 |
+
with open(self.config_file, 'r', encoding='utf-8') as f:
|
| 39 |
+
self._config = json.load(f)
|
| 40 |
+
# Validate and merge with defaults
|
| 41 |
+
self._config = self._merge_with_defaults(self._config)
|
| 42 |
+
except (json.JSONDecodeError, IOError) as e:
|
| 43 |
+
print(f"Warning: Could not load config file: {e}")
|
| 44 |
+
self._config = self._get_default_config()
|
| 45 |
+
else:
|
| 46 |
+
self._config = self._get_default_config()
|
| 47 |
+
|
| 48 |
+
def _get_default_config(self) -> Dict[str, Any]:
|
| 49 |
+
"""Get default configuration"""
|
| 50 |
+
return {
|
| 51 |
+
'model': {
|
| 52 |
+
'path': 'qwen3.gguf',
|
| 53 |
+
'repo_id': 'DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF',
|
| 54 |
+
'filename': 'Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-D_AU-IQ4_XS-imat.gguf',
|
| 55 |
+
'context_length': 40960,
|
| 56 |
+
'max_tokens': 4096,
|
| 57 |
+
'temperature': 0.3,
|
| 58 |
+
'top_p': 0.85,
|
| 59 |
+
'top_k': 50,
|
| 60 |
+
'repeat_penalty': 1.15
|
| 61 |
+
},
|
| 62 |
+
'processing': {
|
| 63 |
+
'chunk_size': 4096,
|
| 64 |
+
'chunk_overlap': 256,
|
| 65 |
+
'batch_size': 16,
|
| 66 |
+
'clean_text': True,
|
| 67 |
+
'preserve_structure': True
|
| 68 |
+
},
|
| 69 |
+
'cache': {
|
| 70 |
+
'enabled': True,
|
| 71 |
+
'max_size_mb': 1024,
|
| 72 |
+
'ttl_hours': 24,
|
| 73 |
+
'persistent': True
|
| 74 |
+
},
|
| 75 |
+
'analysis': {
|
| 76 |
+
'depth': 'Standard',
|
| 77 |
+
'include_recommendations': True,
|
| 78 |
+
'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences'],
|
| 79 |
+
'legal_domains': ['constitutional', 'administrative', 'criminal', 'civil']
|
| 80 |
+
},
|
| 81 |
+
'ui': {
|
| 82 |
+
'theme': 'Auto',
|
| 83 |
+
'show_progress': True,
|
| 84 |
+
'auto_refresh': False,
|
| 85 |
+
'max_display_items': 50
|
| 86 |
+
},
|
| 87 |
+
'advanced': {
|
| 88 |
+
'debug_mode': False,
|
| 89 |
+
'log_level': 'INFO',
|
| 90 |
+
'memory_limit_mb': 8192,
|
| 91 |
+
'thread_pool_size': 4,
|
| 92 |
+
'save_intermediate_results': True
|
| 93 |
+
}
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
def _merge_with_defaults(self, user_config: Dict[str, Any]) -> Dict[str, Any]:
|
| 97 |
+
"""Merge user configuration with defaults"""
|
| 98 |
+
default_config = self._get_default_config()
|
| 99 |
+
|
| 100 |
+
def merge_dicts(default: Dict[str, Any], user: Dict[str, Any]) -> Dict[str, Any]:
|
| 101 |
+
merged = default.copy()
|
| 102 |
+
for key, value in user.items():
|
| 103 |
+
if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
|
| 104 |
+
merged[key] = merge_dicts(merged[key], value)
|
| 105 |
+
else:
|
| 106 |
+
merged[key] = value
|
| 107 |
+
return merged
|
| 108 |
+
|
| 109 |
+
return merge_dicts(default_config, user_config)
|
| 110 |
+
|
| 111 |
+
def get_config(self) -> Dict[str, Any]:
|
| 112 |
+
"""Get current configuration"""
|
| 113 |
+
return self._config.copy()
|
| 114 |
+
|
| 115 |
+
def update_config(self, new_config: Dict[str, Any]):
|
| 116 |
+
"""Update configuration with validation"""
|
| 117 |
+
# Validate configuration
|
| 118 |
+
if self._validate_config(new_config):
|
| 119 |
+
self._config = self._merge_with_defaults(new_config)
|
| 120 |
+
self._save_config()
|
| 121 |
+
else:
|
| 122 |
+
raise ValueError("Invalid configuration provided")
|
| 123 |
+
|
| 124 |
+
def _validate_config(self, config: Dict[str, Any]) -> bool:
|
| 125 |
+
"""Validate configuration values"""
|
| 126 |
+
try:
|
| 127 |
+
# Model validation
|
| 128 |
+
model_config = config.get('model', {})
|
| 129 |
+
if model_config.get('context_length', 0) < 1024:
|
| 130 |
+
return False
|
| 131 |
+
if model_config.get('max_tokens', 0) < 64:
|
| 132 |
+
return False
|
| 133 |
+
if not (0 <= model_config.get('temperature', 0) <= 2):
|
| 134 |
+
return False
|
| 135 |
+
|
| 136 |
+
# Processing validation
|
| 137 |
+
proc_config = config.get('processing', {})
|
| 138 |
+
if proc_config.get('chunk_size', 0) < 256:
|
| 139 |
+
return False
|
| 140 |
+
if proc_config.get('chunk_overlap', 0) >= proc_config.get('chunk_size', 1):
|
| 141 |
+
return False
|
| 142 |
+
if proc_config.get('batch_size', 0) < 1:
|
| 143 |
+
return False
|
| 144 |
+
|
| 145 |
+
# Cache validation
|
| 146 |
+
cache_config = config.get('cache', {})
|
| 147 |
+
if cache_config.get('max_size_mb', 0) < 100:
|
| 148 |
+
return False
|
| 149 |
+
if cache_config.get('ttl_hours', 0) < 1:
|
| 150 |
+
return False
|
| 151 |
+
|
| 152 |
+
return True
|
| 153 |
+
except Exception:
|
| 154 |
+
return False
|
| 155 |
+
|
| 156 |
+
def _save_config(self):
|
| 157 |
+
"""Save configuration to file"""
|
| 158 |
+
try:
|
| 159 |
+
self.config_file.parent.mkdir(exist_ok=True)
|
| 160 |
+
with open(self.config_file, 'w', encoding='utf-8') as f:
|
| 161 |
+
json.dump(self._config, f, indent=2, ensure_ascii=False)
|
| 162 |
+
except IOError as e:
|
| 163 |
+
print(f"Warning: Could not save config file: {e}")
|
| 164 |
+
|
| 165 |
+
def reset_to_defaults(self):
|
| 166 |
+
"""Reset configuration to defaults"""
|
| 167 |
+
self._config = self._get_default_config()
|
| 168 |
+
self._save_config()
|
| 169 |
+
|
| 170 |
+
def get_section(self, section: str) -> Dict[str, Any]:
|
| 171 |
+
"""Get a specific configuration section"""
|
| 172 |
+
return self._config.get(section, {})
|
| 173 |
+
|
| 174 |
+
def update_section(self, section: str, values: Dict[str, Any]):
|
| 175 |
+
"""Update a specific configuration section"""
|
| 176 |
+
if section not in self._config:
|
| 177 |
+
self._config[section] = {}
|
| 178 |
+
|
| 179 |
+
self._config[section].update(values)
|
| 180 |
+
|
| 181 |
+
# Validate the updated config
|
| 182 |
+
if self._validate_config(self._config):
|
| 183 |
+
self._save_config()
|
| 184 |
+
else:
|
| 185 |
+
raise ValueError(f"Invalid configuration for section: {section}")
|
| 186 |
+
|
| 187 |
+
def export_config(self, filepath: str) -> bool:
|
| 188 |
+
"""Export configuration to file"""
|
| 189 |
+
try:
|
| 190 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 191 |
+
json.dump(self._config, f, indent=2, ensure_ascii=False)
|
| 192 |
+
return True
|
| 193 |
+
except IOError:
|
| 194 |
+
return False
|
| 195 |
+
|
| 196 |
+
def import_config(self, filepath: str) -> bool:
|
| 197 |
+
"""Import configuration from file"""
|
| 198 |
+
try:
|
| 199 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 200 |
+
imported_config = json.load(f)
|
| 201 |
+
|
| 202 |
+
if self._validate_config(imported_config):
|
| 203 |
+
self._config = self._merge_with_defaults(imported_config)
|
| 204 |
+
self._save_config()
|
| 205 |
+
return True
|
| 206 |
+
else:
|
| 207 |
+
return False
|
| 208 |
+
except (IOError, json.JSONDecodeError):
|
| 209 |
+
return False
|
| 210 |
+
|
| 211 |
+
def get_model_config(self) -> Dict[str, Any]:
|
| 212 |
+
"""Get model-specific configuration"""
|
| 213 |
+
return self._config.get('model', {})
|
| 214 |
+
|
| 215 |
+
def get_processing_config(self) -> Dict[str, Any]:
|
| 216 |
+
"""Get processing-specific configuration"""
|
| 217 |
+
return self._config.get('processing', {})
|
| 218 |
+
|
| 219 |
+
def get_cache_config(self) -> Dict[str, Any]:
|
| 220 |
+
"""Get cache-specific configuration"""
|
| 221 |
+
return self._config.get('cache', {})
|
| 222 |
+
|
| 223 |
+
def get_ui_config(self) -> Dict[str, Any]:
|
| 224 |
+
"""Get UI-specific configuration"""
|
| 225 |
+
return self._config.get('ui', {})
|
| 226 |
+
|
| 227 |
+
def get_advanced_config(self) -> Dict[str, Any]:
|
| 228 |
+
"""Get advanced configuration"""
|
| 229 |
+
return self._config.get('advanced', {})
|
| 230 |
+
|
| 231 |
+
# Global configuration instance
|
| 232 |
+
_config_instance = None
|
| 233 |
+
|
| 234 |
+
def get_config_manager(config_file: str = None) -> ConfigManager:
|
| 235 |
+
"""Get or create global configuration manager instance"""
|
| 236 |
+
global _config_instance
|
| 237 |
+
|
| 238 |
+
if _config_instance is None:
|
| 239 |
+
_config_instance = ConfigManager(config_file)
|
| 240 |
+
|
| 241 |
+
return _config_instance
|
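A minimal usage sketch for the ConfigManager added above (editor's example, not part of the upload; it assumes the package is importable as streamlit_app.utils.config and uses only methods defined in this module):

from streamlit_app.utils.config import get_config_manager

# Obtain the shared ConfigManager instance (created on first call).
config = get_config_manager()

# Read a section; values fall back to _get_default_config() when no config file exists.
model_cfg = config.get_model_config()
print(model_cfg['context_length'])  # 40960 by default

# Update a single section; _validate_config() rejects invalid values with ValueError.
config.update_section('processing', {'chunk_size': 2048, 'chunk_overlap': 128})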
streamlit_app/utils/performance.py
ADDED
|
@@ -0,0 +1,271 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Performance Monitor
|
| 4 |
+
|
| 5 |
+
Monitors system performance metrics for the NZ Legislation Loophole Analysis application.
|
| 6 |
+
Tracks memory usage, CPU utilization, processing times, and other performance indicators.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import time
|
| 10 |
+
import threading
|
| 11 |
+
import platform
import psutil
|
| 12 |
+
from typing import Dict, Any, Optional, List
|
| 13 |
+
from collections import deque
|
| 14 |
+
import streamlit as st
|
| 15 |
+
|
| 16 |
+
class PerformanceMonitor:
|
| 17 |
+
"""Performance monitoring system"""
|
| 18 |
+
|
| 19 |
+
def __init__(self, max_history: int = 1000):
|
| 20 |
+
"""
|
| 21 |
+
Initialize performance monitor
|
| 22 |
+
|
| 23 |
+
Args:
|
| 24 |
+
max_history: Maximum number of historical data points to keep
|
| 25 |
+
"""
|
| 26 |
+
self.max_history = max_history
|
| 27 |
+
self.lock = threading.RLock()
|
| 28 |
+
|
| 29 |
+
# Historical data storage
|
| 30 |
+
self.memory_history = deque(maxlen=max_history)
|
| 31 |
+
self.cpu_history = deque(maxlen=max_history)
|
| 32 |
+
self.processing_times = deque(maxlen=max_history)
|
| 33 |
+
|
| 34 |
+
# Current metrics
|
| 35 |
+
self.current_metrics = {
|
| 36 |
+
'memory_usage_mb': 0,
|
| 37 |
+
'memory_percent': 0,
|
| 38 |
+
'cpu_percent': 0,
|
| 39 |
+
'active_threads': 0,
|
| 40 |
+
'processing_time_avg': 0,
|
| 41 |
+
'processing_time_max': 0,
|
| 42 |
+
'processing_time_min': 0,
|
| 43 |
+
'total_processed_chunks': 0,
|
| 44 |
+
'chunks_per_second': 0
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
# Processing timing
|
| 48 |
+
self.processing_start_time = None
|
| 49 |
+
self.last_chunk_time = time.time()
|
| 50 |
+
|
| 51 |
+
# Start monitoring thread
|
| 52 |
+
self.monitoring = True
|
| 53 |
+
self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
|
| 54 |
+
self.monitor_thread.start()
|
| 55 |
+
|
| 56 |
+
def _monitor_loop(self):
|
| 57 |
+
"""Background monitoring loop"""
|
| 58 |
+
while self.monitoring:
|
| 59 |
+
try:
|
| 60 |
+
self._update_metrics()
|
| 61 |
+
time.sleep(1) # Update every second
|
| 62 |
+
except Exception as e:
|
| 63 |
+
print(f"Performance monitoring error: {e}")
|
| 64 |
+
time.sleep(5) # Wait longer on error
|
| 65 |
+
|
| 66 |
+
def _update_metrics(self):
|
| 67 |
+
"""Update current performance metrics"""
|
| 68 |
+
process = psutil.Process()
|
| 69 |
+
|
| 70 |
+
with self.lock:
|
| 71 |
+
# Memory metrics
|
| 72 |
+
memory_info = process.memory_info()
|
| 73 |
+
memory_usage_mb = memory_info.rss / 1024 / 1024
|
| 74 |
+
memory_percent = process.memory_percent()
|
| 75 |
+
|
| 76 |
+
# CPU metrics
|
| 77 |
+
cpu_percent = process.cpu_percent(interval=0.1)
|
| 78 |
+
|
| 79 |
+
# Thread count
|
| 80 |
+
active_threads = len(process.threads())
|
| 81 |
+
|
| 82 |
+
# Update current metrics
|
| 83 |
+
self.current_metrics.update({
|
| 84 |
+
'memory_usage_mb': memory_usage_mb,
|
| 85 |
+
'memory_percent': memory_percent,
|
| 86 |
+
'cpu_percent': cpu_percent,
|
| 87 |
+
'active_threads': active_threads
|
| 88 |
+
})
|
| 89 |
+
|
| 90 |
+
# Store historical data
|
| 91 |
+
current_time = time.time()
|
| 92 |
+
self.memory_history.append((current_time, memory_usage_mb))
|
| 93 |
+
self.cpu_history.append((current_time, cpu_percent))
|
| 94 |
+
|
| 95 |
+
def start_processing_timer(self):
|
| 96 |
+
"""Start timing a processing operation"""
|
| 97 |
+
self.processing_start_time = time.time()
|
| 98 |
+
|
| 99 |
+
def end_processing_timer(self) -> float:
|
| 100 |
+
"""End timing and return elapsed time"""
|
| 101 |
+
if self.processing_start_time is None:
|
| 102 |
+
return 0
|
| 103 |
+
|
| 104 |
+
elapsed = time.time() - self.processing_start_time
|
| 105 |
+
self.processing_start_time = None
|
| 106 |
+
|
| 107 |
+
with self.lock:
|
| 108 |
+
self.processing_times.append(elapsed)
|
| 109 |
+
|
| 110 |
+
# Update processing time statistics
|
| 111 |
+
if self.processing_times:
|
| 112 |
+
self.current_metrics['processing_time_avg'] = sum(self.processing_times) / len(self.processing_times)
|
| 113 |
+
self.current_metrics['processing_time_max'] = max(self.processing_times)
|
| 114 |
+
self.current_metrics['processing_time_min'] = min(self.processing_times)
|
| 115 |
+
|
| 116 |
+
return elapsed
|
| 117 |
+
|
| 118 |
+
def record_chunk_processing(self):
|
| 119 |
+
"""Record that a chunk has been processed"""
|
| 120 |
+
current_time = time.time()
|
| 121 |
+
|
| 122 |
+
with self.lock:
|
| 123 |
+
self.current_metrics['total_processed_chunks'] += 1
|
| 124 |
+
|
| 125 |
+
# Calculate chunks per second
|
| 126 |
+
time_diff = current_time - self.last_chunk_time
|
| 127 |
+
if time_diff > 0:
|
| 128 |
+
current_cps = 1.0 / time_diff
|
| 129 |
+
# Smooth the chunks per second calculation
|
| 130 |
+
self.current_metrics['chunks_per_second'] = (
|
| 131 |
+
0.9 * self.current_metrics['chunks_per_second'] + 0.1 * current_cps
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
self.last_chunk_time = current_time
|
| 135 |
+
|
| 136 |
+
def get_stats(self) -> Dict[str, Any]:
|
| 137 |
+
"""Get current performance statistics"""
|
| 138 |
+
with self.lock:
|
| 139 |
+
return self.current_metrics.copy()
|
| 140 |
+
|
| 141 |
+
def get_memory_history(self, time_window_seconds: int = 300) -> List[tuple]:
|
| 142 |
+
"""Get memory usage history within time window"""
|
| 143 |
+
current_time = time.time()
|
| 144 |
+
cutoff_time = current_time - time_window_seconds
|
| 145 |
+
|
| 146 |
+
with self.lock:
|
| 147 |
+
return [(t, v) for t, v in self.memory_history if t >= cutoff_time]
|
| 148 |
+
|
| 149 |
+
def get_cpu_history(self, time_window_seconds: int = 300) -> List[tuple]:
|
| 150 |
+
"""Get CPU usage history within time window"""
|
| 151 |
+
current_time = time.time()
|
| 152 |
+
cutoff_time = current_time - time_window_seconds
|
| 153 |
+
|
| 154 |
+
with self.lock:
|
| 155 |
+
return [(t, v) for t, v in self.cpu_history if t >= cutoff_time]
|
| 156 |
+
|
| 157 |
+
def get_processing_time_stats(self) -> Dict[str, Any]:
|
| 158 |
+
"""Get processing time statistics"""
|
| 159 |
+
with self.lock:
|
| 160 |
+
if not self.processing_times:
|
| 161 |
+
return {
|
| 162 |
+
'count': 0,
|
| 163 |
+
'average': 0,
|
| 164 |
+
'maximum': 0,
|
| 165 |
+
'minimum': 0,
|
| 166 |
+
'median': 0
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
sorted_times = sorted(self.processing_times)
|
| 170 |
+
|
| 171 |
+
return {
|
| 172 |
+
'count': len(self.processing_times),
|
| 173 |
+
'average': sum(self.processing_times) / len(self.processing_times),
|
| 174 |
+
'maximum': max(self.processing_times),
|
| 175 |
+
'minimum': min(self.processing_times),
|
| 176 |
+
'median': sorted_times[len(sorted_times) // 2]
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
def get_system_info(self) -> Dict[str, Any]:
|
| 180 |
+
"""Get system information"""
|
| 181 |
+
return {
|
| 182 |
+
'cpu_count': psutil.cpu_count(logical=False),
|
| 183 |
+
'cpu_count_logical': psutil.cpu_count(logical=True),
|
| 184 |
+
'total_memory_gb': psutil.virtual_memory().total / (1024**3),
|
| 185 |
+
'available_memory_gb': psutil.virtual_memory().available / (1024**3),
|
| 186 |
+
'python_version': f"{psutil.python_implementation()} {psutil.python_version()}",
|
| 187 |
+
'platform': platform.platform()
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
def reset_stats(self):
|
| 191 |
+
"""Reset performance statistics"""
|
| 192 |
+
with self.lock:
|
| 193 |
+
self.processing_times.clear()
|
| 194 |
+
self.current_metrics['total_processed_chunks'] = 0
|
| 195 |
+
self.current_metrics['chunks_per_second'] = 0
|
| 196 |
+
self.current_metrics['processing_time_avg'] = 0
|
| 197 |
+
self.current_metrics['processing_time_max'] = 0
|
| 198 |
+
self.current_metrics['processing_time_min'] = 0
|
| 199 |
+
|
| 200 |
+
def cleanup(self):
|
| 201 |
+
"""Cleanup resources"""
|
| 202 |
+
self.monitoring = False
|
| 203 |
+
if self.monitor_thread.is_alive():
|
| 204 |
+
self.monitor_thread.join(timeout=2)
|
| 205 |
+
|
| 206 |
+
def get_performance_report(self) -> Dict[str, Any]:
|
| 207 |
+
"""Generate a comprehensive performance report"""
|
| 208 |
+
return {
|
| 209 |
+
'current_metrics': self.get_stats(),
|
| 210 |
+
'processing_stats': self.get_processing_time_stats(),
|
| 211 |
+
'system_info': self.get_system_info(),
|
| 212 |
+
'memory_history_count': len(self.memory_history),
|
| 213 |
+
'cpu_history_count': len(self.cpu_history),
|
| 214 |
+
'processing_times_count': len(self.processing_times)
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
def check_memory_threshold(self, threshold_mb: int) -> bool:
|
| 218 |
+
"""Check if memory usage is above threshold"""
|
| 219 |
+
return self.current_metrics['memory_usage_mb'] > threshold_mb
|
| 220 |
+
|
| 221 |
+
def check_cpu_threshold(self, threshold_percent: float) -> bool:
|
| 222 |
+
"""Check if CPU usage is above threshold"""
|
| 223 |
+
return self.current_metrics['cpu_percent'] > threshold_percent
|
| 224 |
+
|
| 225 |
+
def get_recommendations(self) -> List[str]:
|
| 226 |
+
"""Get performance recommendations based on current metrics"""
|
| 227 |
+
recommendations = []
|
| 228 |
+
|
| 229 |
+
# Memory recommendations
|
| 230 |
+
if self.current_metrics['memory_usage_mb'] > 7000:
|
| 231 |
+
recommendations.append("High memory usage detected. Consider reducing batch size or chunk size.")
|
| 232 |
+
elif self.current_metrics['memory_usage_mb'] > 5000:
|
| 233 |
+
recommendations.append("Moderate memory usage. Monitor closely during processing.")
|
| 234 |
+
|
| 235 |
+
# CPU recommendations
|
| 236 |
+
if self.current_metrics['cpu_percent'] > 90:
|
| 237 |
+
recommendations.append("High CPU usage. Consider reducing processing intensity.")
|
| 238 |
+
elif self.current_metrics['cpu_percent'] > 70:
|
| 239 |
+
recommendations.append("Moderate CPU usage. Processing is running optimally.")
|
| 240 |
+
|
| 241 |
+
# Processing speed recommendations
|
| 242 |
+
avg_time = self.current_metrics.get('processing_time_avg', 0)
|
| 243 |
+
if avg_time > 10:
|
| 244 |
+
recommendations.append("Slow processing detected. Consider using a more powerful model or optimizing settings.")
|
| 245 |
+
elif avg_time > 5:
|
| 246 |
+
recommendations.append("Moderate processing speed. Consider increasing batch size if memory allows.")
|
| 247 |
+
|
| 248 |
+
# Cache recommendations
|
| 249 |
+
# This would be integrated with cache manager stats
|
| 250 |
+
chunks_per_second = self.current_metrics.get('chunks_per_second', 0)
|
| 251 |
+
if chunks_per_second < 1:
|
| 252 |
+
recommendations.append("Low processing throughput. Consider optimizing chunk size or model parameters.")
|
| 253 |
+
|
| 254 |
+
if not recommendations:
|
| 255 |
+
recommendations.append("Performance is optimal. All metrics are within normal ranges.")
|
| 256 |
+
|
| 257 |
+
return recommendations
|
| 258 |
+
|
| 259 |
+
# Global performance monitor instance
|
| 260 |
+
_performance_instance = None
|
| 261 |
+
_performance_lock = threading.Lock()
|
| 262 |
+
|
| 263 |
+
def get_performance_monitor(max_history: int = 1000) -> PerformanceMonitor:
|
| 264 |
+
"""Get or create global performance monitor instance"""
|
| 265 |
+
global _performance_instance
|
| 266 |
+
|
| 267 |
+
with _performance_lock:
|
| 268 |
+
if _performance_instance is None:
|
| 269 |
+
_performance_instance = PerformanceMonitor(max_history)
|
| 270 |
+
|
| 271 |
+
return _performance_instance
|
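A minimal usage sketch for the PerformanceMonitor added above (editor's example, not part of the upload; it assumes the module is importable as streamlit_app.utils.performance):

from streamlit_app.utils.performance import get_performance_monitor

monitor = get_performance_monitor(max_history=100)

# Time one unit of work and feed the throughput counters.
monitor.start_processing_timer()
# ... analyze one chunk here ...
elapsed = monitor.end_processing_timer()
monitor.record_chunk_processing()

print(f"last chunk took {elapsed:.2f}s")
print(monitor.get_stats()['chunks_per_second'])
for tip in monitor.get_recommendations():
    print(tip)

monitor.cleanup()  # stop the background monitoring thread when finished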
streamlit_app/utils/ui_helpers.py
ADDED
|
@@ -0,0 +1,415 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
UI Helpers
|
| 4 |
+
|
| 5 |
+
Utility functions and components for the Streamlit application UI.
|
| 6 |
+
Provides reusable UI elements, formatting functions, and visual components.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import streamlit as st
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import plotly.graph_objects as go
|
| 12 |
+
import plotly.express as px
|
| 13 |
+
from typing import Dict, Any, List, Optional, Tuple
|
| 14 |
+
import time
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import json
|
| 17 |
+
|
| 18 |
+
class UIHelpers:
|
| 19 |
+
"""UI helper functions and components"""
|
| 20 |
+
|
| 21 |
+
@staticmethod
|
| 22 |
+
def create_metric_card(title: str, value: Any, delta: Optional[Any] = None,
|
| 23 |
+
delta_color: str = "normal", help_text: Optional[str] = None):
|
| 24 |
+
"""Create a styled metric card"""
|
| 25 |
+
if isinstance(value, float):
|
| 26 |
+
if title.lower().endswith(('rate', 'ratio', 'percentage', 'percent')):
|
| 27 |
+
formatted_value = ".1f"
|
| 28 |
+
else:
|
| 29 |
+
formatted_value = ".2f"
|
| 30 |
+
else:
|
| 31 |
+
formatted_value = str(value)
|
| 32 |
+
|
| 33 |
+
return st.metric(
|
| 34 |
+
label=title,
|
| 35 |
+
value=formatted_value,
|
| 36 |
+
delta=delta,
|
| 37 |
+
delta_color=delta_color,
|
| 38 |
+
help=help_text
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
@staticmethod
|
| 42 |
+
def create_progress_bar(progress: float, text: str = "", color: str = "primary"):
|
| 43 |
+
"""Create a styled progress bar with text"""
|
| 44 |
+
if text:
|
| 45 |
+
st.write(f"**{text}**")
|
| 46 |
+
|
| 47 |
+
if color == "success":
|
| 48 |
+
bar_color = "#28a745"
|
| 49 |
+
elif color == "warning":
|
| 50 |
+
bar_color = "#ffc107"
|
| 51 |
+
elif color == "danger":
|
| 52 |
+
bar_color = "#dc3545"
|
| 53 |
+
else:
|
| 54 |
+
bar_color = None
|
| 55 |
+
|
| 56 |
+
st.progress(progress, text=f"{progress:.1%} Complete")
|
| 57 |
+
|
| 58 |
+
@staticmethod
|
| 59 |
+
def create_info_box(message: str, type: str = "info"):
|
| 60 |
+
"""Create a styled info/warning/success box"""
|
| 61 |
+
if type == "success":
|
| 62 |
+
st.success(message)
|
| 63 |
+
elif type == "warning":
|
| 64 |
+
st.warning(message)
|
| 65 |
+
elif type == "error":
|
| 66 |
+
st.error(message)
|
| 67 |
+
else:
|
| 68 |
+
st.info(message)
|
| 69 |
+
|
| 70 |
+
@staticmethod
|
| 71 |
+
def format_file_size(size_bytes: int) -> str:
|
| 72 |
+
"""Format file size in human-readable format"""
|
| 73 |
+
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
|
| 74 |
+
if size_bytes < 1024.0:
|
| 75 |
+
return ".1f"
|
| 76 |
+
size_bytes /= 1024.0
|
| 77 |
+
return ".1f"
|
| 78 |
+
|
| 79 |
+
@staticmethod
|
| 80 |
+
def format_time_duration(seconds: float) -> str:
|
| 81 |
+
"""Format time duration in human-readable format"""
|
| 82 |
+
if seconds < 60:
|
| 83 |
+
return ".1f"
|
| 84 |
+
elif seconds < 3600:
|
| 85 |
+
minutes = int(seconds // 60)
|
| 86 |
+
remaining_seconds = seconds % 60
|
| 87 |
+
return ".1f"
|
| 88 |
+
else:
|
| 89 |
+
hours = int(seconds // 3600)
|
| 90 |
+
minutes = int((seconds % 3600) // 60)
|
| 91 |
+
return f"{hours}h {minutes}m"
|
| 92 |
+
|
| 93 |
+
@staticmethod
|
| 94 |
+
def create_performance_chart(data: List[Tuple[float, float]],
|
| 95 |
+
title: str, y_label: str, color: str = "#1f77b4"):
|
| 96 |
+
"""Create a performance chart using Plotly"""
|
| 97 |
+
if not data:
|
| 98 |
+
return None
|
| 99 |
+
|
| 100 |
+
times, values = zip(*data)
|
| 101 |
+
|
| 102 |
+
# Convert timestamps to relative time
|
| 103 |
+
start_time = min(times)
|
| 104 |
+
relative_times = [t - start_time for t in times]
|
| 105 |
+
|
| 106 |
+
fig = go.Figure()
|
| 107 |
+
fig.add_trace(go.Scatter(
|
| 108 |
+
x=relative_times,
|
| 109 |
+
y=values,
|
| 110 |
+
mode='lines+markers',
|
| 111 |
+
line=dict(color=color, width=2),
|
| 112 |
+
marker=dict(size=4),
|
| 113 |
+
name=y_label
|
| 114 |
+
))
|
| 115 |
+
|
| 116 |
+
fig.update_layout(
|
| 117 |
+
title=title,
|
| 118 |
+
xaxis_title="Time (seconds)",
|
| 119 |
+
yaxis_title=y_label,
|
| 120 |
+
template="plotly_white",
|
| 121 |
+
height=300,
|
| 122 |
+
margin=dict(l=20, r=20, t=40, b=20)
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
return fig
|
| 126 |
+
|
| 127 |
+
@staticmethod
|
| 128 |
+
def create_comparison_chart(data_dict: Dict[str, List[float]],
|
| 129 |
+
title: str, x_label: str, y_label: str):
|
| 130 |
+
"""Create a comparison bar chart"""
|
| 131 |
+
fig = go.Figure()
|
| 132 |
+
|
| 133 |
+
for label, values in data_dict.items():
|
| 134 |
+
fig.add_trace(go.Bar(
|
| 135 |
+
name=label,
|
| 136 |
+
x=list(range(len(values))),
|
| 137 |
+
y=values,
|
| 138 |
+
text=[f"{v:.2f}" for v in values],
|
| 139 |
+
textposition='auto',
|
| 140 |
+
))
|
| 141 |
+
|
| 142 |
+
fig.update_layout(
|
| 143 |
+
title=title,
|
| 144 |
+
xaxis_title=x_label,
|
| 145 |
+
yaxis_title=y_label,
|
| 146 |
+
template="plotly_white",
|
| 147 |
+
height=400,
|
| 148 |
+
margin=dict(l=20, r=20, t=40, b=20)
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
return fig
|
| 152 |
+
|
| 153 |
+
@staticmethod
|
| 154 |
+
def create_analysis_summary(results: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 155 |
+
"""Create a summary of analysis results"""
|
| 156 |
+
if not results:
|
| 157 |
+
return {
|
| 158 |
+
'total_analyses': 0,
|
| 159 |
+
'total_loopholes': 0,
|
| 160 |
+
'avg_confidence': 0,
|
| 161 |
+
'total_chunks': 0,
|
| 162 |
+
'analysis_types': {}
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
total_loopholes = sum(len(result.get('loopholes', [])) for result in results)
|
| 166 |
+
total_confidence = sum(result.get('confidence', 0) for result in results)
|
| 167 |
+
total_chunks = sum(result.get('chunks_processed', 0) for result in results)
|
| 168 |
+
|
| 169 |
+
# Count analysis types
|
| 170 |
+
analysis_types = {}
|
| 171 |
+
for result in results:
|
| 172 |
+
analysis_type = result.get('analysis_type', 'Unknown')
|
| 173 |
+
analysis_types[analysis_type] = analysis_types.get(analysis_type, 0) + 1
|
| 174 |
+
|
| 175 |
+
return {
|
| 176 |
+
'total_analyses': len(results),
|
| 177 |
+
'total_loopholes': total_loopholes,
|
| 178 |
+
'avg_confidence': total_confidence / len(results) if results else 0,
|
| 179 |
+
'total_chunks': total_chunks,
|
| 180 |
+
'analysis_types': analysis_types
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
@staticmethod
|
| 184 |
+
def display_analysis_result(result: Dict[str, Any], index: int = 0):
|
| 185 |
+
"""Display a single analysis result in a formatted way"""
|
| 186 |
+
with st.expander(f"π Analysis {index + 1}: {result.get('title', 'Unknown Title')}", expanded=index == 0):
|
| 187 |
+
col1, col2 = st.columns([2, 1])
|
| 188 |
+
|
| 189 |
+
with col1:
|
| 190 |
+
st.markdown("**Summary:**")
|
| 191 |
+
st.write(result.get('summary', 'No summary available'))
|
| 192 |
+
|
| 193 |
+
st.markdown("**Key Findings:**")
|
| 194 |
+
loopholes = result.get('loopholes', [])
|
| 195 |
+
if loopholes:
|
| 196 |
+
for i, loophole in enumerate(loopholes, 1):
|
| 197 |
+
st.markdown(f"{i}. {loophole}")
|
| 198 |
+
else:
|
| 199 |
+
st.write("No significant loopholes identified.")
|
| 200 |
+
|
| 201 |
+
if result.get('recommendations'):
|
| 202 |
+
st.markdown("**Recommendations:**")
|
| 203 |
+
for rec in result.get('recommendations', []):
|
| 204 |
+
st.markdown(f"β’ {rec}")
|
| 205 |
+
|
| 206 |
+
with col2:
|
| 207 |
+
UIHelpers.create_metric_card(
|
| 208 |
+
"Confidence",
|
| 209 |
+
".2f",
|
| 210 |
+
help_text="Model confidence in analysis"
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
UIHelpers.create_metric_card(
|
| 214 |
+
"Processing Time",
|
| 215 |
+
".2f",
|
| 216 |
+
help_text="Time taken to analyze this content"
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
UIHelpers.create_metric_card(
|
| 220 |
+
"Chunks Processed",
|
| 221 |
+
result.get('chunks_processed', 0),
|
| 222 |
+
help_text="Number of text chunks analyzed"
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
st.markdown("**Metadata:**")
|
| 226 |
+
st.write(f"**Source:** {result.get('source', 'Unknown')}")
|
| 227 |
+
st.write(f"**Date:** {result.get('date', 'Unknown')}")
|
| 228 |
+
st.write(f"**Analysis Type:** {result.get('analysis_type', 'Standard')}")
|
| 229 |
+
|
| 230 |
+
@staticmethod
|
| 231 |
+
def create_export_section(results: List[Dict[str, Any]]):
|
| 232 |
+
"""Create the export section for results"""
|
| 233 |
+
st.subheader("πΎ Export Results")
|
| 234 |
+
|
| 235 |
+
if not results:
|
| 236 |
+
st.info("No results to export")
|
| 237 |
+
return
|
| 238 |
+
|
| 239 |
+
col1, col2, col3 = st.columns(3)
|
| 240 |
+
|
| 241 |
+
with col1:
|
| 242 |
+
if st.button("π Export as JSON", use_container_width=True):
|
| 243 |
+
json_data = json.dumps(results, indent=2, ensure_ascii=False)
|
| 244 |
+
st.download_button(
|
| 245 |
+
label="Download JSON",
|
| 246 |
+
data=json_data,
|
| 247 |
+
file_name=f"nz_legislation_analysis_{int(time.time())}.json",
|
| 248 |
+
mime="application/json",
|
| 249 |
+
use_container_width=True
|
| 250 |
+
)
|
| 251 |
+
|
| 252 |
+
with col2:
|
| 253 |
+
if st.button("π Export as CSV", use_container_width=True):
|
| 254 |
+
df = pd.DataFrame(results)
|
| 255 |
+
csv_data = df.to_csv(index=False)
|
| 256 |
+
st.download_button(
|
| 257 |
+
label="Download CSV",
|
| 258 |
+
data=csv_data,
|
| 259 |
+
file_name=f"nz_legislation_analysis_{int(time.time())}.csv",
|
| 260 |
+
mime="text/csv",
|
| 261 |
+
use_container_width=True
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
with col3:
|
| 265 |
+
if st.button("π Export as Excel", use_container_width=True):
|
| 266 |
+
df = pd.DataFrame(results)
|
| 267 |
+
import io
excel_buffer = io.BytesIO()
df.to_excel(excel_buffer, index=False, engine='openpyxl')
excel_data = excel_buffer.getvalue()
|
| 268 |
+
st.download_button(
|
| 269 |
+
label="Download Excel",
|
| 270 |
+
data=excel_data,
|
| 271 |
+
file_name=f"nz_legislation_analysis_{int(time.time())}.xlsx",
|
| 272 |
+
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
| 273 |
+
use_container_width=True
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
@staticmethod
|
| 277 |
+
def create_cache_management_section(cache_manager):
|
| 278 |
+
"""Create cache management section"""
|
| 279 |
+
st.subheader("π§ Cache Management")
|
| 280 |
+
|
| 281 |
+
cache_stats = cache_manager.get_stats()
|
| 282 |
+
|
| 283 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 284 |
+
|
| 285 |
+
with col1:
|
| 286 |
+
UIHelpers.create_metric_card("Cache Hits", cache_stats['hits'])
|
| 287 |
+
|
| 288 |
+
with col2:
|
| 289 |
+
UIHelpers.create_metric_card("Cache Misses", cache_stats['misses'])
|
| 290 |
+
|
| 291 |
+
with col3:
|
| 292 |
+
UIHelpers.create_metric_card("Hit Rate", ".1f")
|
| 293 |
+
|
| 294 |
+
with col4:
|
| 295 |
+
UIHelpers.create_metric_card("Cached Entries", cache_stats['entries'])
|
| 296 |
+
|
| 297 |
+
col1, col2, col3 = st.columns(3)
|
| 298 |
+
|
| 299 |
+
with col1:
|
| 300 |
+
if st.button("π Clear Cache", type="secondary", use_container_width=True):
|
| 301 |
+
cache_manager.clear_cache()
|
| 302 |
+
st.rerun()
|
| 303 |
+
|
| 304 |
+
with col2:
|
| 305 |
+
if st.button("π€ Export Cache", use_container_width=True):
|
| 306 |
+
import tempfile
|
| 307 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
|
| 308 |
+
success = cache_manager.export_cache(f.name)
|
| 309 |
+
if success:
|
| 310 |
+
st.success("Cache exported successfully")
|
| 311 |
+
else:
|
| 312 |
+
st.error("Failed to export cache")
|
| 313 |
+
|
| 314 |
+
with col3:
|
| 315 |
+
uploaded_cache = st.file_uploader("π₯ Import Cache", type=['json'])
|
| 316 |
+
if uploaded_cache:
|
| 317 |
+
import tempfile
|
| 318 |
+
with tempfile.NamedTemporaryFile(mode='wb', delete=False) as f:
|
| 319 |
+
f.write(uploaded_cache.read())
|
| 320 |
+
imported_count = cache_manager.import_cache(f.name)
|
| 321 |
+
st.success(f"Imported {imported_count} cache entries")
|
| 322 |
+
|
| 323 |
+
@staticmethod
|
| 324 |
+
def create_system_info_section(perf_monitor):
|
| 325 |
+
"""Create system information section"""
|
| 326 |
+
st.subheader("π» System Information")
|
| 327 |
+
|
| 328 |
+
sys_info = perf_monitor.get_system_info()
|
| 329 |
+
|
| 330 |
+
col1, col2 = st.columns(2)
|
| 331 |
+
|
| 332 |
+
with col1:
|
| 333 |
+
st.markdown("**Hardware:**")
|
| 334 |
+
st.write(f"**CPU Cores:** {sys_info['cpu_count']} physical, {sys_info['cpu_count_logical']} logical")
|
| 335 |
+
st.write(f"**Total Memory:** {sys_info['total_memory_gb']:.1f} GB")
|
| 336 |
+
st.write(f"**Available Memory:** {sys_info['available_memory_gb']:.1f} GB")
|
| 337 |
+
|
| 338 |
+
with col2:
|
| 339 |
+
st.markdown("**Software:**")
|
| 340 |
+
st.write(f"**Python:** {sys_info['python_version']}")
|
| 341 |
+
st.write(f"**Platform:** {sys_info['platform']}")
|
| 342 |
+
st.write(f"**Active Threads:** {st.session_state.performance_monitor.get_stats()['active_threads']}")
|
| 343 |
+
|
| 344 |
+
@staticmethod
|
| 345 |
+
def create_performance_recommendations(perf_monitor):
|
| 346 |
+
"""Create performance recommendations section"""
|
| 347 |
+
st.subheader("π‘ Performance Recommendations")
|
| 348 |
+
|
| 349 |
+
recommendations = perf_monitor.get_recommendations()
|
| 350 |
+
|
| 351 |
+
if recommendations:
|
| 352 |
+
for rec in recommendations:
|
| 353 |
+
if "High" in rec or "Slow" in rec:
|
| 354 |
+
st.error(rec)
|
| 355 |
+
elif "Moderate" in rec or "Consider" in rec:
|
| 356 |
+
st.warning(rec)
|
| 357 |
+
else:
|
| 358 |
+
st.info(rec)
|
| 359 |
+
else:
|
| 360 |
+
st.success("All performance metrics are within optimal ranges!")
|
| 361 |
+
|
| 362 |
+
@staticmethod
|
| 363 |
+
def create_loading_spinner(text: str = "Processing..."):
|
| 364 |
+
"""Create a loading spinner"""
|
| 365 |
+
return st.spinner(text)
|
| 366 |
+
|
| 367 |
+
@staticmethod
|
| 368 |
+
def create_success_message(message: str):
|
| 369 |
+
"""Create a success message"""
|
| 370 |
+
st.success(message)
|
| 371 |
+
|
| 372 |
+
@staticmethod
|
| 373 |
+
def create_error_message(message: str):
|
| 374 |
+
"""Create an error message"""
|
| 375 |
+
st.error(message)
|
| 376 |
+
|
| 377 |
+
@staticmethod
|
| 378 |
+
def create_warning_message(message: str):
|
| 379 |
+
"""Create a warning message"""
|
| 380 |
+
st.warning(message)
|
| 381 |
+
|
| 382 |
+
@staticmethod
|
| 383 |
+
def create_data_table(data: List[Dict[str, Any]], columns: Optional[List[str]] = None):
|
| 384 |
+
"""Create a formatted data table"""
|
| 385 |
+
if not data:
|
| 386 |
+
st.info("No data to display")
|
| 387 |
+
return
|
| 388 |
+
|
| 389 |
+
df = pd.DataFrame(data)
|
| 390 |
+
|
| 391 |
+
if columns:
|
| 392 |
+
available_columns = [col for col in columns if col in df.columns]
|
| 393 |
+
if available_columns:
|
| 394 |
+
df = df[available_columns]
|
| 395 |
+
|
| 396 |
+
st.dataframe(df, use_container_width=True)
|
| 397 |
+
|
| 398 |
+
@staticmethod
|
| 399 |
+
def create_json_viewer(data: Dict[str, Any], title: str = "JSON Data"):
|
| 400 |
+
"""Create a JSON viewer"""
|
| 401 |
+
st.subheader(title)
|
| 402 |
+
|
| 403 |
+
with st.expander("View JSON", expanded=False):
|
| 404 |
+
st.json(data)
|
| 405 |
+
|
| 406 |
+
@staticmethod
|
| 407 |
+
def create_file_preview(file_content: str, max_lines: int = 20):
|
| 408 |
+
"""Create a file content preview"""
|
| 409 |
+
lines = file_content.split('\n')
|
| 410 |
+
preview_content = '\n'.join(lines[:max_lines])
|
| 411 |
+
|
| 412 |
+
if len(lines) > max_lines:
|
| 413 |
+
preview_content += f"\n\n... ({len(lines) - max_lines} more lines)"
|
| 414 |
+
|
| 415 |
+
st.text_area("File Preview", preview_content, height=200, disabled=True)
|
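A minimal usage sketch for the UIHelpers added above (editor's example, not part of the upload; the session key 'analysis_results' is hypothetical and used here only for illustration):

import streamlit as st
from streamlit_app.utils.ui_helpers import UIHelpers

results = st.session_state.get('analysis_results', [])  # hypothetical session key
summary = UIHelpers.create_analysis_summary(results)

UIHelpers.create_metric_card("Total Analyses", summary['total_analyses'])
UIHelpers.create_metric_card("Avg Confidence", summary['avg_confidence'])

for i, result in enumerate(results):
    UIHelpers.display_analysis_result(result, index=i)

UIHelpers.create_export_section(results)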
test_app_imports.py
ADDED
|
@@ -0,0 +1,178 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to validate Streamlit app imports and basic functionality
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
def test_imports():
|
| 11 |
+
"""Test that all required modules can be imported"""
|
| 12 |
+
print("π Testing Streamlit app imports...")
|
| 13 |
+
|
| 14 |
+
# Add current directory to Python path
|
| 15 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 16 |
+
if current_dir not in sys.path:
|
| 17 |
+
sys.path.insert(0, current_dir)
|
| 18 |
+
|
| 19 |
+
# Test core modules
|
| 20 |
+
modules_to_test = [
|
| 21 |
+
'streamlit',
|
| 22 |
+
'pandas',
|
| 23 |
+
'plotly',
|
| 24 |
+
'psutil',
|
| 25 |
+
'numpy',
|
| 26 |
+
'streamlit_app.core.cache_manager',
|
| 27 |
+
'streamlit_app.core.text_processor',
|
| 28 |
+
'streamlit_app.core.llm_analyzer',
|
| 29 |
+
'streamlit_app.core.dataset_builder',
|
| 30 |
+
'streamlit_app.utils.config',
|
| 31 |
+
'streamlit_app.utils.performance',
|
| 32 |
+
'streamlit_app.utils.ui_helpers'
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
failed_imports = []
|
| 36 |
+
|
| 37 |
+
for module in modules_to_test:
|
| 38 |
+
try:
|
| 39 |
+
__import__(module)
|
| 40 |
+
print(f"β
{module}")
|
| 41 |
+
except ImportError as e:
|
| 42 |
+
print(f"β {module}: {e}")
|
| 43 |
+
failed_imports.append(module)
|
| 44 |
+
except Exception as e:
|
| 45 |
+
print(f"β οΈ {module}: Unexpected error - {e}")
|
| 46 |
+
|
| 47 |
+
if failed_imports:
|
| 48 |
+
print(f"\nβ Failed to import {len(failed_imports)} modules:")
|
| 49 |
+
for module in failed_imports:
|
| 50 |
+
print(f" - {module}")
|
| 51 |
+
return False
|
| 52 |
+
|
| 53 |
+
print(f"\nβ
All {len(modules_to_test)} modules imported successfully!")
|
| 54 |
+
return True
|
| 55 |
+
|
| 56 |
+
def test_core_functionality():
|
| 57 |
+
"""Test basic functionality of core modules"""
|
| 58 |
+
print("\nπ§ Testing core functionality...")
|
| 59 |
+
|
| 60 |
+
try:
|
| 61 |
+
# Test cache manager
|
| 62 |
+
from streamlit_app.core.cache_manager import CacheManager, get_cache_manager
|
| 63 |
+
|
| 64 |
+
cache = get_cache_manager(max_memory_mb=10, persistent=False) # Small cache for testing
|
| 65 |
+
cache_stats = cache.get_stats()
|
| 66 |
+
print(f"β
Cache Manager: {cache_stats}")
|
| 67 |
+
|
| 68 |
+
# Test text processor
|
| 69 |
+
from streamlit_app.core.text_processor import TextProcessor
|
| 70 |
+
|
| 71 |
+
processor = TextProcessor()
|
| 72 |
+
test_text = "This is a test of the New Zealand legislation analysis system."
|
| 73 |
+
cleaned = processor.clean_text(test_text)
|
| 74 |
+
chunks = processor.chunk_text(cleaned, chunk_size=50, overlap=10)
|
| 75 |
+
print(f"β
Text Processor: {len(chunks)} chunks created")
|
| 76 |
+
|
| 77 |
+
# Test configuration manager
|
| 78 |
+
from streamlit_app.utils.config import ConfigManager
|
| 79 |
+
|
| 80 |
+
config = ConfigManager()
|
| 81 |
+
config_dict = config.get_config()
|
| 82 |
+
print(f"β
Config Manager: {len(config_dict)} configuration sections")
|
| 83 |
+
|
| 84 |
+
# Test performance monitor
|
| 85 |
+
from streamlit_app.utils.performance import PerformanceMonitor
|
| 86 |
+
|
| 87 |
+
perf = PerformanceMonitor(max_history=10)
|
| 88 |
+
stats = perf.get_stats()
|
| 89 |
+
print(f"β
Performance Monitor: Memory usage {stats['memory_usage_mb']:.1f} MB")
|
| 90 |
+
|
| 91 |
+
# Test UI helpers (basic instantiation)
|
| 92 |
+
from streamlit_app.utils.ui_helpers import UIHelpers
|
| 93 |
+
|
| 94 |
+
helper = UIHelpers()
|
| 95 |
+
print("β
UI Helpers: Module loaded")
|
| 96 |
+
|
| 97 |
+
print("\nπ All core functionality tests passed!")
|
| 98 |
+
return True
|
| 99 |
+
|
| 100 |
+
except Exception as e:
|
| 101 |
+
print(f"\nβ Core functionality test failed: {e}")
|
| 102 |
+
import traceback
|
| 103 |
+
traceback.print_exc()
|
| 104 |
+
return False
|
| 105 |
+
|
| 106 |
+
def test_file_structure():
|
| 107 |
+
"""Test that all required files exist"""
|
| 108 |
+
print("\nπ Testing file structure...")
|
| 109 |
+
|
| 110 |
+
required_files = [
|
| 111 |
+
'streamlit_app/app.py',
|
| 112 |
+
'streamlit_app/core/cache_manager.py',
|
| 113 |
+
'streamlit_app/core/text_processor.py',
|
| 114 |
+
'streamlit_app/core/llm_analyzer.py',
|
| 115 |
+
'streamlit_app/core/dataset_builder.py',
|
| 116 |
+
'streamlit_app/utils/config.py',
|
| 117 |
+
'streamlit_app/utils/performance.py',
|
| 118 |
+
'streamlit_app/utils/ui_helpers.py',
|
| 119 |
+
'requirements.txt',
|
| 120 |
+
'run_streamlit_app.py',
|
| 121 |
+
'README_Streamlit_App.md'
|
| 122 |
+
]
|
| 123 |
+
|
| 124 |
+
missing_files = []
|
| 125 |
+
|
| 126 |
+
for file_path in required_files:
|
| 127 |
+
if not Path(file_path).exists():
|
| 128 |
+
missing_files.append(file_path)
|
| 129 |
+
print(f"β Missing: {file_path}")
|
| 130 |
+
else:
|
| 131 |
+
print(f"β
Found: {file_path}")
|
| 132 |
+
|
| 133 |
+
if missing_files:
|
| 134 |
+
print(f"\nβ Missing {len(missing_files)} files:")
|
| 135 |
+
for file_path in missing_files:
|
| 136 |
+
print(f" - {file_path}")
|
| 137 |
+
return False
|
| 138 |
+
|
| 139 |
+
print(f"\nβ
All {len(required_files)} files present!")
|
| 140 |
+
return True
|
| 141 |
+
|
| 142 |
+
def main():
|
| 143 |
+
"""Main test function"""
|
| 144 |
+
print("ποΈ NZ Legislation Loophole Analysis - App Validation")
|
| 145 |
+
print("=" * 60)
|
| 146 |
+
|
| 147 |
+
all_passed = True
|
| 148 |
+
|
| 149 |
+
# Test file structure
|
| 150 |
+
if not test_file_structure():
|
| 151 |
+
all_passed = False
|
| 152 |
+
|
| 153 |
+
# Test imports
|
| 154 |
+
if not test_imports():
|
| 155 |
+
all_passed = False
|
| 156 |
+
|
| 157 |
+
# Test core functionality
|
| 158 |
+
if not test_core_functionality():
|
| 159 |
+
all_passed = False
|
| 160 |
+
|
| 161 |
+
print("\n" + "=" * 60)
|
| 162 |
+
if all_passed:
|
| 163 |
+
print("π VALIDATION COMPLETE - App is ready to run!")
|
| 164 |
+
print("\nπ To start the application:")
|
| 165 |
+
print(" python run_streamlit_app.py")
|
| 166 |
+
print("\nπ± Then visit: http://localhost:8501")
|
| 167 |
+
else:
|
| 168 |
+
print("β VALIDATION FAILED - Please check the errors above")
|
| 169 |
+
print("\nπ§ Troubleshooting:")
|
| 170 |
+
print(" - Ensure all dependencies are installed: pip install -r requirements.txt")
|
| 171 |
+
print(" - Check Python version (3.8+ required)")
|
| 172 |
+
print(" - Verify file permissions")
|
| 173 |
+
|
| 174 |
+
return all_passed
|
| 175 |
+
|
| 176 |
+
if __name__ == "__main__":
|
| 177 |
+
success = main()
|
| 178 |
+
sys.exit(0 if success else 1)
|
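The validation script exits with code 0 on success and 1 on failure, so it can gate a build step. A small sketch (editor's example, not part of the upload):

import subprocess
import sys

# Run the validation script in a child interpreter and propagate its result.
result = subprocess.run([sys.executable, "test_app_imports.py"])
if result.returncode != 0:
    sys.exit("App validation failed; see the output above.")
print("Validation passed; safe to build/deploy the Streamlit app.")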
trl copy.py
ADDED
|
@@ -0,0 +1,532 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
NZ Legislation Loophole Analysis Dataset Creation Tool
|
| 4 |
+
|
| 5 |
+
This script processes New Zealand legislation text to create a finetuning dataset for AI models
|
| 6 |
+
that can identify potential loopholes, ambiguities, and unintended consequences in legal text.
|
| 7 |
+
|
| 8 |
+
The script:
|
| 9 |
+
1. Loads and cleans NZ legislation text, preserving legal structure and terminology
|
| 10 |
+
2. Chunks the text into manageable sections with overlap for context
|
| 11 |
+
3. Uses an LLM to analyze each chunk for legal issues
|
| 12 |
+
4. Generates a structured dataset for training AI models on legal loophole detection
|
| 13 |
+
|
| 14 |
+
Usage:
|
| 15 |
+
python trl.py
|
| 16 |
+
|
| 17 |
+
Requirements:
|
| 18 |
+
- llama-cpp-python with GGUF model support
|
| 19 |
+
- psutil for memory monitoring
|
| 20 |
+
- Input file: nz-legislation.txt containing NZ legislation in JSON lines format
|
| 21 |
+
|
| 22 |
+
Output:
|
| 23 |
+
- JSON dataset saved to nz_legislation_dataset/nz_legislation_loophole_dataset.json
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
import os
|
| 27 |
+
import json
|
| 28 |
+
import time
|
| 29 |
+
import psutil
|
| 30 |
+
from typing import List, Dict, Any
|
| 31 |
+
import numpy as np
|
| 32 |
+
from llama_cpp import Llama
|
| 33 |
+
import re
|
| 34 |
+
|
| 35 |
+
# Placeholder classes and functions for missing dependencies
|
| 36 |
+
class ProgressManager:
|
| 37 |
+
"""Simple placeholder for progress tracking"""
|
| 38 |
+
def __init__(self):
|
| 39 |
+
pass
|
| 40 |
+
|
| 41 |
+
def show_memory_usage(label: str):
|
| 42 |
+
"""Simple memory usage display"""
|
| 43 |
+
process = psutil.Process(os.getpid())
|
| 44 |
+
memory_mb = process.memory_info().rss / 1024 / 1024
|
| 45 |
+
print(f"{label}: {memory_mb:.2f} MB")
|
| 46 |
+
|
| 47 |
+
# Configuration for NZ Legislation Loophole Analysis Dataset Creation
|
| 48 |
+
INPUT_FILE = "nz-legislation.txt" # Path to New Zealand legislation JSON dataset
|
| 49 |
+
OUTPUT_DIR = "nz_legislation_dataset" # Directory to save the dataset
|
| 50 |
+
CHUNK_SIZE = 4096 # Size of text chunks for processing legislation sections
|
| 51 |
+
CHUNK_OVERLAP = 256 # Overlap between chunks to maintain context
|
| 52 |
+
BATCH_SIZE = 16 # Number of chunks to process at once
|
| 53 |
+
MODEL_PATH = "qwen3.gguf" # Path to your Qwen3 GGUF model
|
| 54 |
+
MAX_TOKENS = 4096 # Maximum tokens for model response
|
| 55 |
+
|
| 56 |
+
# Ensure output directory exists
|
| 57 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 58 |
+
|
| 59 |
+
def load_model(progress_manager: ProgressManager = None):
|
| 60 |
+
"""Load the LLM model for text generation with progress tracking"""
|
| 61 |
+
if progress_manager is None:
|
| 62 |
+
progress_manager = ProgressManager()
|
| 63 |
+
|
| 64 |
+
print("Loading LLM model...")
|
| 65 |
+
show_memory_usage("Initial memory usage")
|
| 66 |
+
|
| 67 |
+
start_time = time.time()
|
| 68 |
+
try:
|
| 69 |
+
model = Llama.from_pretrained(
|
| 70 |
+
repo_id="DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF",
|
| 71 |
+
filename="Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-D_AU-IQ4_XS-imat.gguf",
|
| 72 |
+
n_ctx=40960, # Context length
|
| 73 |
+
n_threads=8, # Adjust based on your CPU
|
| 74 |
+
verbose=False,
|
| 75 |
+
n_gpu_layers=-1, # Use all available GPU layers
|
| 76 |
+
n_batch=4096, # Batch size for processing
|
| 77 |
+
logits_all=False, # Optimize for text generation
|
| 78 |
+
use_mlock=True, # Lock model in memory if possible
|
| 79 |
+
use_mmap=True, # Use memory mapping for better performance
|
| 80 |
+
)
|
| 81 |
+
except Exception as e:
|
| 82 |
+
print(f"Error loading model: {e}")
|
| 83 |
+
print("Trying with basic configuration...")
|
| 84 |
+
# Fallback to basic configuration
|
| 85 |
+
model = Llama(
|
| 86 |
+
model_path=MODEL_PATH,
|
| 87 |
+
n_ctx=40960,
|
| 88 |
+
n_threads=8,
|
| 89 |
+
verbose=False,
|
| 90 |
+
n_gpu_layers=-1,
|
| 91 |
+
n_batch=4096
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
load_time = time.time() - start_time
|
| 95 |
+
print(f"LLM model loaded in {load_time:.2f}s")
|
| 96 |
+
show_memory_usage("Memory after model load")
|
| 97 |
+
|
| 98 |
+
return model
|
| 99 |
+
|
| 100 |
+
def clean_text(text: str) -> str:
|
| 101 |
+
"""Clean and normalize text for better embedding quality, optimized for legal/legislative content"""
|
| 102 |
+
import re
|
| 103 |
+
|
| 104 |
+
# Preserve section numbers and legal structure while cleaning
|
| 105 |
+
# Keep section numbers like "1:", "2:", etc.
|
| 106 |
+
text = re.sub(r'^(\d+:)', r'\1', text, flags=re.MULTILINE)
|
| 107 |
+
|
| 108 |
+
# Remove excessive whitespace but preserve paragraph structure
|
| 109 |
+
text = re.sub(r'[ \t]+', ' ', text) # Replace multiple spaces/tabs with single space
|
| 110 |
+
text = re.sub(r'\n\s*\n', '\n\n', text) # Preserve paragraph breaks but clean up
|
| 111 |
+
text = re.sub(r'\n{3,}', '\n\n', text) # Reduce excessive newlines to double
|
| 112 |
+
|
| 113 |
+
# Remove control characters but preserve legal formatting
|
| 114 |
+
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text) # Remove control chars except tab and newline
|
| 115 |
+
|
| 116 |
+
# Handle legal-specific characters and formatting
|
| 117 |
+
# Keep legal punctuation and symbols
|
| 118 |
+
allowed_chars = r'\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/\@\#\$\%\^\&\*\+\=\~\`°§'
|
| 119 |
+
text = re.sub(r'[^' + allowed_chars + ']', '', text)
|
| 120 |
+
|
| 121 |
+
# Normalize quotes and apostrophes for legal text
|
| 122 |
+
text = re.sub(r'[“”]', '"', text)  # Normalize curly double quotes
|
| 123 |
+
text = re.sub(r"['']", "'", text) # Normalize single quotes
|
| 124 |
+
text = re.sub(r'`', "'", text) # Replace backticks with apostrophes
|
| 125 |
+
|
| 126 |
+
# Clean up legal numbering and references
|
| 127 |
+
# Normalize section references
|
| 128 |
+
text = re.sub(r'section\s+(\d+)', r'section \1', text, flags=re.IGNORECASE)
|
| 129 |
+
text = re.sub(r'(\d+)\s*[Jj]anuary', r'\1 January', text) # Clean date formatting
|
| 130 |
+
text = re.sub(r'(\d+)\s*[Jj]uly', r'\1 July', text) # Clean date formatting
|
| 131 |
+
text = re.sub(r'(\d+)\s*[Aa]pril', r'\1 April', text) # Clean date formatting
|
| 132 |
+
text = re.sub(r'(\d+)\s*[Ff]ebruary', r'\1 February', text) # Clean date formatting
|
| 133 |
+
text = re.sub(r'(\d+)\s*[Dd]ecember', r'\1 December', text) # Clean date formatting
|
| 134 |
+
text = re.sub(r'(\d+)\s*[Aa]ugust', r'\1 August', text) # Clean date formatting
|
| 135 |
+
text = re.sub(r'(\d+)\s*[Mm]arch', r'\1 March', text) # Clean date formatting
|
| 136 |
+
text = re.sub(r'(\d+)\s*[Mm]ay', r'\1 May', text) # Clean date formatting
|
| 137 |
+
text = re.sub(r'(\d+)\s*[Jj]une', r'\1 June', text) # Clean date formatting
|
| 138 |
+
text = re.sub(r'(\d+)\s*[Ss]eptember', r'\1 September', text) # Clean date formatting
|
| 139 |
+
text = re.sub(r'(\d+)\s*[Oo]ctober', r'\1 October', text) # Clean date formatting
|
| 140 |
+
text = re.sub(r'(\d+)\s*[Nn]ovember', r'\1 November', text) # Clean date formatting
|
| 141 |
+
|
| 142 |
+
# Clean up punctuation spacing in legal text
|
| 143 |
+
text = re.sub(r'\s+([\.!\?\,\;\:])', r'\1', text) # Remove space before punctuation
|
| 144 |
+
text = re.sub(r'([\.!\?\,\;\:])\s*', r'\1 ', text) # Ensure space after punctuation
|
| 145 |
+
|
| 146 |
+
# Handle legal citations and references (generic patterns)
|
| 147 |
+
# Normalize act names with years - generic pattern for "Act ####" format
|
| 148 |
+
text = re.sub(r'(\b\w+(?:\s+\w+)*)\s+Act\s+(\d{4})', r'\1 Act', text) # Normalize act names
|
| 149 |
+
|
| 150 |
+
# Clean up amendment references (generic patterns)
|
| 151 |
+
text = re.sub(r'[Aa]mendment\(s\)\s+incorporated\s+in\s+the\s+[Aa]ct\(s\)', 'Amendments incorporated', text)
|
| 152 |
+
text = re.sub(r'section\s+\d+\(\d+\)\([a-zA-Z]\)', lambda m: m.group(0).lower(), text) # Normalize section references
|
| 153 |
+
|
| 154 |
+
# Generic pattern for legal document sections
|
| 155 |
+
text = re.sub(r'(\b(?:section|part|chapter|article|clause|subsection|paragraph))\s+(\d+[a-zA-Z]*)',
|
| 156 |
+
lambda m: f"{m.group(1)} {m.group(2)}", text, flags=re.IGNORECASE)
|
| 157 |
+
|
| 158 |
+
# NZ-specific legal enhancements
|
| 159 |
+
# Handle New Zealand specific terms and references
|
| 160 |
+
text = re.sub(r'\b[Nn]ew\s+[Zz]ealand\b', 'New Zealand', text) # Normalize "New Zealand"
|
| 161 |
+
text = re.sub(r'\b[Pp]arliament\b', 'Parliament', text) # Normalize "Parliament"
|
| 162 |
+
text = re.sub(r'\b[Cc]rown\b', 'Crown', text) # Normalize "Crown"
|
| 163 |
+
text = re.sub(r'\b[Gg]overnment\b', 'Government', text) # Normalize "Government"
|
| 164 |
+
|
| 165 |
+
# Handle NZ-specific legal citations (e.g., "NZB" references, Treaty of Waitangi)
|
| 166 |
+
text = re.sub(r'\b[Nn][Zz][Bb]\s+(\d+)', r'NZB \1', text) # Normalize NZB references
|
| 167 |
+
text = re.sub(r'[Tt]reaty\s+[Oo]f\s+[Ww]aitangi', 'Treaty of Waitangi', text, flags=re.IGNORECASE)
|
| 168 |
+
|
| 169 |
+
# Handle Maori-specific characters if present (basic support)
|
| 170 |
+
# Keep common Māori characters: ā, ē, ī, ō, ū, wh
|
| 171 |
+
maori_chars = 'āēīōūwhĀĒĪŌŪWH'
|
| 172 |
+
allowed_chars += maori_chars
|
| 173 |
+
text = re.sub(r'[^' + allowed_chars + ']', '', text)
|
| 174 |
+
|
| 175 |
+
# Remove empty lines and trim while preserving legal structure
|
| 176 |
+
lines = []
|
| 177 |
+
for line in text.split('\n'):
|
| 178 |
+
stripped = line.strip()
|
| 179 |
+
if stripped: # Keep non-empty lines
|
| 180 |
+
# Preserve section headers
|
| 181 |
+
if re.match(r'^\d+:', stripped):
|
| 182 |
+
lines.append(stripped)
|
| 183 |
+
else:
|
| 184 |
+
lines.append(stripped)
|
| 185 |
+
|
| 186 |
+
text = '\n'.join(lines)
|
| 187 |
+
|
| 188 |
+
# Final cleanup
|
| 189 |
+
text = text.strip()
|
| 190 |
+
|
| 191 |
+
return text
|
| 192 |
+
|
| 193 |
+
# Constants for prompt formatting
|
| 194 |
+
REASONING_START = "<start_working_out>"
|
| 195 |
+
REASONING_END = "<end_working_out>"
|
| 196 |
+
SOLUTION_START = "<SOLUTION>"
|
| 197 |
+
SOLUTION_END = "</SOLUTION>"
|
| 198 |
+
|
| 199 |
+
def create_system_prompt(text: str) -> str:
    """Create a system prompt for analyzing legislative text for loopholes and ambiguities"""
    return f"""You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.

LEGISLATION TEXT:
{text}

TASK: Analyze this legislative text and identify potential loopholes, ambiguities, or unintended consequences.

REASONING: Provide a structured analysis in the following format:

1. **Text Meaning**: Explain what the text means and its intended purpose
2. **Key Assumptions**: Identify any assumptions the text makes that could be exploited
3. **Exploitable Interpretations**: Discuss how the text could be interpreted or applied in ways that circumvent its intended purpose
4. **Critical Loopholes**: Identify specific loopholes, ambiguities, or unintended consequences that could be used to bypass the legislation
5. **Circumvention Strategies**: Suggest practical methods or scenarios for exploiting these loopholes to achieve objectives contrary to the legislation's intent

Write your complete analysis between {REASONING_START} and {REASONING_END}.

Then provide your overall conclusion between {SOLUTION_START} and {SOLUTION_END}.
"""

def generate_chat_template(system_prompt: str) -> str:
    """
    Generate a chat template using the GGUF model's native chat format.
    This uses the proper message structure with <|im_start|>/<|im_end|> role markers for better model compatibility.
    """
    # Build the chat using the GGUF template structure
    chat_messages = []

    # System message
    if system_prompt:
        chat_messages.append("<|im_start|>system")
        chat_messages.append(system_prompt)
        chat_messages.append("<|im_end|>")

    # User message with the analysis request
    chat_messages.append("<|im_start|>user")
    chat_messages.append("Analyze the given legislative text for loopholes, ambiguities, and unintended consequences. Provide a structured legal analysis following the specified format.")
    chat_messages.append("<|im_end|>")

    # Assistant message with generation prompt
    chat_messages.append("<|im_start|>assistant")
    chat_messages.append("")  # Empty for generation

    return "\n".join(chat_messages)

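# Illustrative sketch: the template above follows a ChatML-style layout
# (<|im_start|>role ... <|im_end|>) and ends with an open assistant turn, so the
# model continues from there. The sample system prompt is made up.
def _demo_chat_template() -> None:
    rendered = generate_chat_template("You are a legal expert.")
    print(rendered.splitlines()[0])   # -> "<|im_start|>system"
    print(rendered.splitlines()[-1])  # -> "<|im_start|>assistant"
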
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks for processing"""
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]

        # Try to end chunk at a sentence boundary if possible
        if end < len(text):
            # Look for sentence endings in the last 100 characters
            sentence_end = max(
                chunk.rfind('. ', max(0, len(chunk) - 100)),
                chunk.rfind('! ', max(0, len(chunk) - 100)),
                chunk.rfind('? ', max(0, len(chunk) - 100))
            )
            if sentence_end != -1:
                chunk = chunk[:sentence_end + 2]  # Include the sentence ending

        chunks.append(chunk)
        start = end - overlap if end < len(text) else len(text)

    return chunks

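# Illustrative sketch: chunking a synthetic document. The 200/50 values are made
# up for the demo; the real defaults come from CHUNK_SIZE and CHUNK_OVERLAP.
def _demo_chunking() -> None:
    sample = ("This is sentence one. " * 30).strip()
    pieces = chunk_text(sample, chunk_size=200, overlap=50)
    print(f"{len(pieces)} chunks, first chunk ends with: {pieces[0][-20:]!r}")
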
def generate_response(model, prompt: str, max_tokens: int = MAX_TOKENS) -> str:
    """
    Generate a response from the model for a given prompt with optimized parameters for legal analysis.

    Parameter Explanations:
    - temperature=0.3: Balanced creativity for legal analysis (not too random, not too deterministic)
    - top_p=0.85: Nucleus sampling - considers top 85% probability mass for coherent legal text
    - top_k=50: Top-k sampling - considers top 50 tokens for better legal terminology selection
    - min_p=0.05: Minimum probability threshold to avoid low-quality tokens

    Anti-Repetition Parameters:
    - repeat_penalty=1.15: Penalizes repetition of phrases (15% penalty)
    - presence_penalty=0.1: Encourages topic diversity across the response
    - frequency_penalty=0.1: Reduces overuse of frequent tokens

    Advanced Sampling:
    - typical_p=0.95: Focuses on typical token probabilities for legal text patterns
    - tfs_z=0.95: Tail-free sampling for more natural legal reasoning
    - mirostat_mode=2: Mirostat v2 for perplexity-controlled generation
    - mirostat_tau=4.0: Target entropy level for legal analysis
    - mirostat_eta=0.15: Learning rate for perplexity adaptation
    """
    try:
        response = model(
            prompt,
            max_tokens=max_tokens,
            # Core generation parameters
            temperature=0.3,        # Balanced temperature for legal analysis
            top_p=0.85,             # Nucleus sampling for coherent legal text
            top_k=50,               # Top-k sampling for better token selection
            min_p=0.05,             # Minimum probability threshold to avoid low-quality tokens

            # Anti-repetition parameters
            repeat_penalty=1.15,    # Reduce repetition of phrases
            presence_penalty=0.1,   # Encourage topic diversity
            frequency_penalty=0.1,  # Reduce frequent token usage

            # Advanced sampling parameters
            typical_p=0.95,         # Typical token probability for legal text patterns
            tfs_z=0.95,             # Tail-free sampling for better reasoning
            mirostat_mode=2,        # Mirostat v2 for perplexity control
            mirostat_tau=4.0,       # Mirostat target entropy
            mirostat_eta=0.15,      # Mirostat learning rate

            # Stopping conditions
            stop=[SOLUTION_END, "</SOLUTION>", "<end_working_out>"]  # Multiple stop tokens
        )
        return response['choices'][0]['text'].strip()
    except Exception as e:
        print(f"Error generating response: {e}")
        # Try with fallback parameters if advanced ones fail
        try:
            response = model(
                prompt,
                max_tokens=max_tokens,
                temperature=0.3,
                top_p=0.85,
                top_k=50,
                repeat_penalty=1.15,
                stop=[SOLUTION_END, "</SOLUTION>"]
            )
            return response['choices'][0]['text'].strip()
        except Exception as e2:
            print(f"Fallback also failed: {e2}")
            return ""

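# Illustrative sketch of driving generate_response with a llama-cpp-python model.
# The model path and context size are placeholders; load_model() earlier in this
# file is the actual loader used by main().
def _demo_generate_response() -> None:
    from llama_cpp import Llama  # assumes llama-cpp-python is installed
    llm = Llama(model_path="path/to/model.gguf", n_ctx=4096)  # placeholder path
    prompt = generate_chat_template(create_system_prompt("Section 1: Example provision."))
    print(generate_response(llm, prompt, max_tokens=256))
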
def parse_legislation_json(file_path: str) -> List[Dict[str, Any]]:
    """Parse the JSON Lines format of the NZ legislation dataset"""
    legislation_entries = []

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if line:
                    try:
                        entry = json.loads(line)
                        if 'id' in entry and 'text' in entry:
                            legislation_entries.append(entry)
                        else:
                            print(f"Warning: Line {line_num} missing required fields, skipping")
                    except json.JSONDecodeError as e:
                        print(f"Warning: Could not parse line {line_num}: {e}")
                        continue
    except Exception as e:
        print(f"Error reading legislation file: {e}")
        return []

    print(f"Successfully parsed {len(legislation_entries)} legislation entries")
    return legislation_entries

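# Illustrative sketch: the parser above expects one JSON object per line with at
# least "id" and "text" keys. The file name and field values below are made up.
def _demo_parse_legislation() -> None:
    import json
    with open("sample_legislation.jsonl", "w", encoding="utf-8") as f:
        f.write(json.dumps({"id": "demo-1", "title": "Demo Act", "year": "2024",
                            "text": "Section 1: This is a demo provision."}) + "\n")
    print(parse_legislation_json("sample_legislation.jsonl"))
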
def create_finetuning_dataset(input_file: str, model, output_file: str = None) -> List[Dict[str, Any]]:
    """Create a finetuning dataset by processing the NZ legislation JSON dataset with incremental saving"""
    if output_file is None:
        output_file = os.path.join(OUTPUT_DIR, "nz_legislation_loophole_dataset.json")

    # Create temporary file paths
    temp_file = output_file.replace('.json', '_temp.jsonl')
    backup_file = output_file.replace('.json', '_backup.json')

    print(f"Parsing legislation dataset from {input_file}")
    legislation_entries = parse_legislation_json(input_file)

    if not legislation_entries:
        print("No legislation entries found to process")
        return []

    dataset = []
    total_entries = len(legislation_entries)
    saved_count = 0

    print(f"Processing {total_entries} legislation entries...")
    print(f"Dataset will be saved incrementally to: {temp_file}")

    try:
        # Open temporary file for incremental saving
        with open(temp_file, 'w', encoding='utf-8') as temp_f:
            for entry_num, entry in enumerate(legislation_entries, 1):
                legislation_id = entry.get('id', f'entry_{entry_num}')
                title = entry.get('title', 'Unknown Title')
                year = entry.get('year', 'Unknown Year')
                raw_text = entry.get('text', '')

                print(f"\nProcessing entry {entry_num}/{total_entries}: {title} ({year}) - ID: {legislation_id}")

                # Clean the legislation text
                cleaned_text = clean_text(raw_text)

                # Chunk the text if it's too long
                chunks = chunk_text(cleaned_text)

                print(f" - Text length: {len(raw_text)} characters")
                print(f" - Number of chunks: {len(chunks)}")

                # Process each chunk
                for chunk_id, chunk in enumerate(chunks):
                    # Create prompt for this chunk
                    system_prompt = create_system_prompt(chunk)
                    full_prompt = generate_chat_template(system_prompt)

                    # Generate response
                    response = generate_response(model, full_prompt)

                    # Print response for monitoring
                    print(f"\n**Generated Analysis for {title} (Chunk {chunk_id + 1}/{len(chunks)})**:")
                    print(f" Response length: {len(response)} characters")

                    # Show preview of the analysis
                    preview = response.replace('\n', ' ').strip()
                    print(f" Preview: {preview}")

                    # Check for key analysis elements
                    has_reasoning = '<start_working_out>' in response or 'reasoning' in response.lower()
                    has_loopholes = 'loophole' in response.lower() or 'ambiguity' in response.lower() or 'issue' in response.lower()
                    has_recommendations = 'recommend' in response.lower() or 'suggest' in response.lower()

                    print(f" Analysis quality: {'✅' if has_reasoning else '❌'} Reasoning | {'✅' if has_loopholes else '❌'} Loopholes | {'✅' if has_recommendations else '❌'} Recommendations")

                    # Add to dataset with metadata
                    dataset_entry = {
                        "prompt": full_prompt,
                        "response": response,
                        "legislation_id": legislation_id,
                        "title": title,
                        "year": year,
                        "chunk_id": chunk_id,
                        "total_chunks": len(chunks),
                        "text_length": len(chunk),
                        "original_text_length": len(raw_text)
                    }

                    # Save entry immediately to temporary file (JSON Lines format)
                    json.dump(dataset_entry, temp_f, ensure_ascii=False)
                    temp_f.write('\n')
                    temp_f.flush()  # Force write to disk

                    dataset.append(dataset_entry)
                    saved_count += 1

                    # Progress update every 10 entries
                    if saved_count % 10 == 0:
                        print(f" ✓ Saved {saved_count} entries so far...")

        print(f"\n✓ All entries processed and saved to temporary file")
        print(f"✓ Total entries saved: {saved_count}")

        # Create backup of existing file if it exists
        if os.path.exists(output_file):
            print(f"Creating backup of existing dataset...")
            os.rename(output_file, backup_file)

        # Convert JSON Lines to final JSON format
        print(f"Converting to final JSON format...")
        with open(temp_file, 'r', encoding='utf-8') as temp_f:
            lines = temp_f.readlines()

        final_dataset = []
        for line in lines:
            if line.strip():
                final_dataset.append(json.loads(line))

        # Save final consolidated JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(final_dataset, f, indent=2, ensure_ascii=False)

        print(f"✓ Final dataset saved to: {output_file}")

        # Clean up temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)
            print(f"✓ Temporary file cleaned up")

        # Clean up backup file if everything succeeded
        if os.path.exists(backup_file):
            os.remove(backup_file)
            print(f"✓ Backup file cleaned up")

        print(f"\nDataset creation complete!")
        print(f" • Processed {total_entries} legislation documents")
        print(f" • Generated {len(final_dataset)} analysis entries")
        print(f" • Total chunks processed: {sum(entry.get('total_chunks', 1) for entry in final_dataset[:total_entries])}")

        return final_dataset

    except KeyboardInterrupt:
        print(f"\n⚠️ Process interrupted by user")
        print(f" • Partial dataset saved to: {temp_file}")
        print(f" • {saved_count} entries saved so far")
        print(f" • You can resume processing or use the temporary file")
        raise

    except Exception as e:
        print(f"\n❌ Error during processing: {e}")
        print(f" • Partial dataset saved to: {temp_file}")
        print(f" • {saved_count} entries saved so far")
        if os.path.exists(backup_file):
            print(f" • Original dataset restored from backup")
            os.rename(backup_file, output_file)
        raise

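# Illustrative sketch: inspecting the consolidated output file produced above.
# The default path mirrors the file name used in create_finetuning_dataset, but
# the real location also depends on OUTPUT_DIR.
def _inspect_dataset(path: str = "nz_legislation_loophole_dataset.json") -> None:
    import json
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(f"{len(data)} entries; keys of first entry: {sorted(data[0]) if data else []}")
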
def main():
    """Main execution function"""
    print("Starting NZ Legislation Loophole Analysis Dataset Creation")
    print("=" * 60)

    # Load the model
    model = load_model()

    # Create the dataset
    dataset = create_finetuning_dataset(INPUT_FILE, model)

    # Cleanup
    if hasattr(model, 'close'):
        model.close()

    print("\nDataset creation completed successfully!")
    print(f"Output saved to: {os.path.join(OUTPUT_DIR, 'nz_legislation_loophole_dataset.json')}")

if __name__ == "__main__":
    main()