Princess3 committed on
Commit c089ca4 · verified · 1 Parent(s): 99253d3

Upload 25 files
.dockerignore ADDED
@@ -0,0 +1,78 @@
1
+ # Version control
2
+ .git
3
+ .gitignore
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ *.so
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # Virtual environments
28
+ .env
29
+ .venv
30
+ env/
31
+ venv/
32
+ ENV/
33
+ env.bak/
34
+ venv.bak/
35
+
36
+ # IDEs
37
+ .vscode/
38
+ .idea/
39
+ *.swp
40
+ *.swo
41
+ *~
42
+
43
+ # OS
44
+ .DS_Store
45
+ .DS_Store?
46
+ ._*
47
+ .Spotlight-V100
48
+ .Trashes
49
+ ehthumbs.db
50
+ Thumbs.db
51
+
52
+ # Documentation (will be copied if needed)
53
+ *.md
54
+ !README_Streamlit_App.md
55
+
56
+ # Test files (will be copied if needed)
57
+ test_app_imports.py
58
+
59
+ # Original CLI script (replaced by Streamlit app)
60
+ trl.py
61
+ trl copy.py
62
+
63
+ # Cache and temporary files
64
+ *.log
65
+ .cache
66
+ .temp
67
+
68
+ # Model files (will be mounted or downloaded at runtime)
69
+ *.gguf
70
+ *.bin
71
+
72
+ # Node modules (if any)
73
+ node_modules/
74
+
75
+ # Docker files
76
+ Dockerfile
77
+ docker-compose.yml
78
+ .dockerignore
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ nz-legislation.txt filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,20 +1,46 @@
- FROM python:3.13.5-slim
-
- WORKDIR /app
+ # Use Python 3.11 slim as base image for the NZ Legislation Loophole Analysis Streamlit App
+ FROM python:3.11-slim

+ # Install system dependencies required for llama-cpp-python compilation and general app functionality
+ # (curl is kept so the container healthcheck in docker-compose.yml can run)
  RUN apt-get update && apt-get install -y \
      build-essential \
+     cmake \
      curl \
      git \
      && rm -rf /var/lib/apt/lists/*

- COPY requirements.txt ./
- COPY src/ ./src/
-
- RUN pip3 install -r requirements.txt
-
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements file and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the entire Streamlit application
+ COPY streamlit_app/ ./streamlit_app/
+
+ # Copy data files (if needed for testing or default data)
+ COPY nz-legislation.txt ./
+
+ # Create necessary directories for the Streamlit app
+ RUN mkdir -p \
+     streamlit_app/cache \
+     streamlit_app/config \
+     streamlit_app/datasets \
+     streamlit_app/logs \
+     streamlit_app/uploads \
+     nz_legislation_dataset

+ # Set environment variables for Streamlit
+ ENV STREAMLIT_SERVER_HEADLESS=true
+ ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
+ ENV STREAMLIT_SERVER_PORT=8501
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0

+ # Expose the Streamlit port
  EXPOSE 8501

- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+
+ # Set working directory to the Streamlit app
+ WORKDIR /app/streamlit_app
+
+ # Set the default command to run the Streamlit application
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README_Docker.md ADDED
@@ -0,0 +1,305 @@
1
+ # Docker Setup for NZ Legislation Loophole Analysis Streamlit App
2
+
3
+ This guide explains how to run the NZ Legislation Loophole Analysis Streamlit App using Docker.
4
+
5
+ ## πŸ“‹ Prerequisites
6
+
7
+ - Docker installed on your system
8
+ - Docker Compose (recommended for easier management)
9
+ - At least 4GB of available RAM (8GB recommended for optimal performance)
10
+
11
+ ## πŸš€ Quick Start
12
+
13
+ ### Method 1: Using Docker Compose (Recommended)
14
+
15
+ ```bash
16
+ # Clone or navigate to the project directory
17
+ cd /path/to/nz-legislation-analyzer
18
+
19
+ # Build and run the application
20
+ docker-compose up --build
21
+
22
+ # Or run in detached mode
23
+ docker-compose up -d --build
24
+ ```
25
+
26
+ The application will be available at: **http://localhost:8501**
27
+
28
+ ### Method 2: Using Docker Directly
29
+
30
+ ```bash
31
+ # Build the Docker image
32
+ docker build -t nz-legislation-analyzer .
33
+
34
+ # Run the container
35
+ docker run -p 8501:8501 \
36
+ -v $(pwd)/streamlit_app/cache:/app/streamlit_app/cache \
37
+ -v $(pwd)/streamlit_app/config:/app/streamlit_app/config \
38
+ -v $(pwd)/streamlit_app/datasets:/app/streamlit_app/datasets \
39
+ -v $(pwd)/nz-legislation.txt:/app/nz-legislation.txt:ro \
40
+ nz-legislation-analyzer
41
+ ```
42
+
43
+ ## πŸ“ Directory Structure
44
+
45
+ When using Docker, the following directories are created and can be persisted:
46
+
47
+ ```
+ 📁 streamlit_app/
+ ├── 🧠 cache/       # Persistent cache for processed chunks
+ ├── ⚙️ config/      # Application configuration files
+ ├── 📊 datasets/    # Generated datasets and results
+ ├── 📝 logs/        # Application logs
+ └── 📤 uploads/     # Uploaded files (if any)
+ ```
55
+
56
+ ## πŸ› οΈ Configuration
57
+
58
+ ### Environment Variables
59
+
60
+ | Variable | Default | Description |
61
+ |----------|---------|-------------|
62
+ | `STREAMLIT_SERVER_HEADLESS` | `true` | Run in headless mode |
63
+ | `STREAMLIT_SERVER_PORT` | `8501` | Streamlit server port |
64
+ | `STREAMLIT_SERVER_ADDRESS` | `0.0.0.0` | Server bind address |
65
+
66
+ ### Volume Mounts
67
+
68
+ The Docker setup includes the following volume mounts for data persistence:
69
+
70
+ - `./streamlit_app/cache:/app/streamlit_app/cache` - Cache persistence
71
+ - `./streamlit_app/config:/app/streamlit_app/config` - Configuration files
72
+ - `./streamlit_app/datasets:/app/streamlit_app/datasets` - Generated datasets
73
+ - `./streamlit_app/logs:/app/streamlit_app/logs` - Application logs
74
+ - `./nz-legislation.txt:/app/nz-legislation.txt:ro` - Input data (read-only)
75
+
76
+ ## πŸ”§ Docker Commands
77
+
78
+ ### Building the Image
79
+
80
+ ```bash
81
+ # Build with no cache
82
+ docker build --no-cache -t nz-legislation-analyzer .
83
+
84
+ # Build with specific Dockerfile
85
+ docker build -f Dockerfile -t nz-legislation-analyzer .
86
+ ```
87
+
88
+ ### Running the Container
89
+
90
+ ```bash
91
+ # Interactive mode
92
+ docker run -it --rm -p 8501:8501 nz-legislation-analyzer
93
+
94
+ # Background mode
95
+ docker run -d -p 8501:8501 nz-legislation-analyzer
96
+
97
+ # With custom environment variables
98
+ docker run -p 8501:8501 \
99
+ -e STREAMLIT_SERVER_PORT=8502 \
100
+ nz-legislation-analyzer
101
+ ```
102
+
103
+ ### Docker Compose Commands
104
+
105
+ ```bash
106
+ # Start services
107
+ docker-compose up
108
+
109
+ # Start in background
110
+ docker-compose up -d
111
+
112
+ # Stop services
113
+ docker-compose down
114
+
115
+ # Rebuild and start
116
+ docker-compose up --build
117
+
118
+ # View logs
119
+ docker-compose logs -f
120
+
121
+ # Scale services (requires removing the fixed host-port mapping from docker-compose.yml first)
122
+ docker-compose up -d --scale nz-legislation-analyzer=2
123
+ ```
124
+
125
+ ## πŸ“Š Monitoring and Logs
126
+
127
+ ### Viewing Logs
128
+
129
+ ```bash
130
+ # Docker Compose logs
131
+ docker-compose logs -f nz-legislation-analyzer
132
+
133
+ # Docker logs
134
+ docker logs -f <container_id>
135
+
136
+ # Follow logs in real-time
137
+ docker-compose logs -f --tail=100
138
+ ```
139
+
140
+ ### Health Checks
141
+
142
+ The Docker Compose setup includes health checks that monitor the Streamlit application:
143
+
144
+ ```yaml
145
+ healthcheck:
146
+ test: ["CMD", "curl", "-f", "http://localhost:8501/healthz"]
147
+ interval: 30s
148
+ timeout: 10s
149
+ retries: 3
150
+ start_period: 40s
151
+ ```
152
+
153
+ ## πŸ” Troubleshooting
154
+
155
+ ### Common Issues
156
+
157
+ 1. **Port Already in Use**
158
+ ```bash
159
+ # Change the port mapping
160
+ docker run -p 8502:8501 nz-legislation-analyzer
161
+ # Or with docker-compose, modify the ports section
162
+ ```
163
+
164
+ 2. **Memory Issues**
165
+ ```bash
166
+ # Increase Docker memory allocation
167
+ # Docker Desktop: Settings > Resources > Memory
168
+ # Or add memory limits to docker-compose.yml
169
+ ```
170
+
171
+ 3. **Model Loading Errors**
172
+ - Ensure sufficient RAM (8GB+ recommended)
173
+ - Check that model files are accessible
174
+ - Verify model path in configuration
175
+
176
+ 4. **Permission Issues**
177
+ ```bash
178
+ # Fix directory permissions
179
+ sudo chown -R $USER:$USER streamlit_app/
180
+ ```
181
+
182
+ 5. **Cache Issues**
183
+ ```bash
184
+ # Clear persistent cache
185
+ sudo rm -rf streamlit_app/cache/*
186
+ docker-compose restart
187
+ ```
188
+
189
+ ### Debug Mode
190
+
191
+ Enable debug logging by modifying the environment:
192
+
193
+ ```bash
194
+ # Add to docker-compose.yml environment section
195
+ - PYTHONPATH=/app
196
+ - LOG_LEVEL=DEBUG
197
+ ```
198
+
199
+ ## πŸ”„ Updates and Maintenance
200
+
201
+ ### Updating the Application
202
+
203
+ ```bash
204
+ # Pull latest changes
205
+ git pull
206
+
207
+ # Rebuild the image
208
+ docker-compose build --no-cache
209
+
210
+ # Restart services
211
+ docker-compose up -d
212
+ ```
213
+
214
+ ### Backup Important Data
215
+
216
+ ```bash
217
+ # Backup cache and configuration
218
+ tar -czf backup.tar.gz streamlit_app/cache/ streamlit_app/config/
219
+
220
+ # Backup datasets
221
+ tar -czf datasets_backup.tar.gz streamlit_app/datasets/
222
+ ```
223
+
224
+ ### Cleaning Up
225
+
226
+ ```bash
227
+ # Remove containers and volumes
228
+ docker-compose down -v
229
+
230
+ # Remove images
231
+ docker rmi nz-legislation-analyzer
232
+
233
+ # Clean up unused Docker resources
234
+ docker system prune -a
235
+ ```
236
+
237
+ ## πŸ—οΈ Advanced Configuration
238
+
239
+ ### Custom Model Files
240
+
241
+ To use custom model files:
242
+
243
+ 1. **Mount model directory:**
244
+ ```yaml
245
+ volumes:
246
+ - ./models:/app/models:ro
247
+ ```
248
+
249
+ 2. **Update configuration** in the Streamlit app to point to `/app/models/your-model.gguf`
250
+
251
+ ### GPU Support (Optional)
252
+
253
+ For GPU acceleration with CUDA:
254
+
255
+ ```dockerfile
256
+ # Use CUDA-enabled base image
257
+ FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
258
+
259
+ # Install Python and dependencies
260
+ # ... (additional setup for CUDA)
261
+ ```
262
+
263
+ Note: GPU support requires additional configuration and CUDA-compatible hardware.
264
+
265
+ ## πŸ” Security Considerations
266
+
267
+ - The application runs in headless mode by default
268
+ - All data is stored locally in mounted volumes
269
+ - No external network access is required for basic functionality
270
+ - Consider implementing authentication for production deployments
271
+
272
+ ## πŸ“ˆ Performance Optimization
273
+
274
+ ### Memory Management
275
+
276
+ - Default cache size: 1024MB (configurable in app settings)
277
+ - Adjust based on available system memory
278
+ - Monitor memory usage through the app's Performance dashboard
279
+
280
+ ### Disk I/O
281
+
282
+ - Use SSD storage for better performance
283
+ - Ensure adequate disk space for cache and datasets
284
+ - Consider using tmpfs for temporary processing
285
+
286
+ ### Network
287
+
288
+ - The application binds to all interfaces (`0.0.0.0`)
289
+ - Access via `localhost` or container IP
290
+ - No external dependencies required
291
+
292
+ ## πŸ†˜ Support
293
+
294
+ For Docker-specific issues:
295
+
296
+ 1. Check Docker logs: `docker-compose logs`
297
+ 2. Verify Docker installation and version
298
+ 3. Ensure adequate system resources
299
+ 4. Review the main application logs in `streamlit_app/logs/`
300
+
301
+ For application-specific issues, refer to the main documentation in `README_Streamlit_App.md`.
302
+
303
+ ---
304
+
305
+ **πŸŽ‰ Happy analyzing with your containerized NZ Legislation Loophole Analysis Streamlit App!**
README_Streamlit_App.md ADDED
@@ -0,0 +1,300 @@
1
+ # NZ Legislation Loophole Analysis Streamlit App
2
+
3
+ A modern, AI-powered web application for analyzing New Zealand legislation to identify potential loopholes, ambiguities, and unintended consequences.
4
+
5
+ ## 🌟 Features
6
+
7
+ ### πŸ€– AI-Powered Analysis
8
+ - **Legal Expertise**: Specialized analysis for NZ legislation with Treaty of Waitangi references
9
+ - **Multiple Analysis Types**: Standard, Detailed, and Comprehensive analysis modes
10
+ - **Intelligent Chunking**: Sentence-aware text splitting with overlap for context preservation
11
+
12
+ ### 🧠 Context Memory Cache System
13
+ - **Smart Caching**: Hash-based chunk identification prevents re-processing identical content (see the sketch after this list)
14
+ - **Multi-level Storage**: In-memory LRU cache with optional SQLite persistence
15
+ - **Performance Boost**: Significant speed improvements for large documents and batch processing
16
+ - **Cache Management**: View statistics, export/import cache, and set TTL limits
17
+
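+ The hash-based keys behind *Smart Caching* can be pictured roughly as follows. This is a minimal sketch, not the actual `CacheManager` implementation; it assumes the key is derived from the chunk text plus the model and processing settings, so changing either produces a new key and a fresh analysis:
+
+ ```python
+ import hashlib
+ import json
+
+ def make_cache_key(chunk: str, model_config: dict, processing_config: dict) -> str:
+     """Derive a stable identifier for a chunk/configuration pair (illustrative only)."""
+     payload = json.dumps(
+         {"chunk": chunk, "model": model_config, "processing": processing_config},
+         sort_keys=True,
+     )
+     return hashlib.sha256(payload.encode("utf-8")).hexdigest()
+
+ # Identical text analysed with identical settings maps to the same key: a cache hit.
+ key = make_cache_key("5 Interpretation ...", {"path": "model.gguf"}, {"chunk_size": 4096})
+ ```
+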
18
+ ### 🎨 Modern Web Interface
19
+ - **Multi-page Layout**: Organized navigation with Home, Upload, Analysis, Settings, and Performance pages
20
+ - **Real-time Progress**: Live progress bars and processing status updates
21
+ - **Interactive Dashboards**: Performance metrics, cache statistics, and analysis results
22
+ - **Responsive Design**: Works on desktop and mobile devices
23
+
24
+ ### πŸ“Š Advanced Analytics
25
+ - **Quality Metrics**: Confidence scoring and analysis quality assessment
26
+ - **Performance Monitoring**: Memory usage, CPU utilization, and processing times
27
+ - **Batch Processing**: Handle multiple legislation files simultaneously
28
+ - **Export Options**: Multiple formats (JSON, CSV, Excel) with metadata
29
+
30
+ ## πŸš€ Quick Start
31
+
32
+ ### Prerequisites
33
+ ```bash
34
+ # Python 3.8 or higher
35
+ python --version
36
+
37
+ # Install dependencies
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ ### Running the Application
42
+ ```bash
43
+ # Method 1: Use the run script (recommended)
44
+ python run_streamlit_app.py
45
+
46
+ # Method 2: Direct Streamlit command
47
+ cd streamlit_app
48
+ streamlit run app.py
49
+ ```
50
+
51
+ The app will be available at: **http://localhost:8501**
52
+
53
+ ## πŸ“ Project Structure
54
+
55
+ ```
56
+ streamlit_app/
57
+ β”œβ”€β”€ app.py # Main Streamlit application
58
+ β”œβ”€β”€ core/
59
+ β”‚ β”œβ”€β”€ cache_manager.py # Context memory cache system
60
+ β”‚ β”œβ”€β”€ text_processor.py # Text cleaning and chunking
61
+ β”‚ β”œβ”€β”€ llm_analyzer.py # LLM integration and analysis
62
+ β”‚ └── dataset_builder.py # Dataset creation and export
63
+ β”œβ”€β”€ utils/
64
+ β”‚ β”œβ”€β”€ config.py # Configuration management
65
+ β”‚ β”œβ”€β”€ performance.py # Performance monitoring
66
+ β”‚ └── ui_helpers.py # UI components and formatting
67
+ β”œβ”€β”€ pages/ # Multi-page navigation
68
+ β”œβ”€β”€ assets/ # Custom styling and assets
69
+ └── cache/ # Cache storage directory
70
+ ```
71
+
72
+ ## πŸ› οΈ Configuration
73
+
74
+ ### Model Configuration
75
+ The app supports both local GGUF models and HuggingFace models:
76
+
77
+ ```python
78
+ # Local model
79
+ model_path = "path/to/your/model.gguf"
80
+
81
+ # HuggingFace model
82
+ repo_id = "DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF"
83
+ filename = "model-file-name.gguf"
84
+ ```
85
+
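+ Under the hood the app depends on `llama-cpp-python` (see `requirements.txt`). As a rough sketch, and not necessarily how `LLMAnalyzer` loads models internally, either variant above could be opened like this; `n_ctx` and the filename are placeholders, and `Llama.from_pretrained` needs a recent `llama-cpp-python` plus `huggingface_hub`:
+
+ ```python
+ from llama_cpp import Llama
+
+ # Local GGUF file
+ llm = Llama(model_path="path/to/your/model.gguf", n_ctx=8192)
+
+ # Or pull a GGUF file from a HuggingFace repository
+ llm = Llama.from_pretrained(
+     repo_id="DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF",
+     filename="model-file-name.gguf",  # placeholder filename from the example above
+     n_ctx=8192,
+ )
+ ```
+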
86
+ ### Cache Configuration
87
+ ```python
88
+ cache_config = {
89
+ 'enabled': True, # Enable/disable caching
90
+ 'max_size_mb': 1024, # Maximum memory for cache
91
+ 'ttl_hours': 24, # Time-to-live for cached entries
92
+ 'persistent': True # Use disk persistence
93
+ }
94
+ ```
95
+
96
+ ### Processing Configuration
97
+ ```python
98
+ processing_config = {
99
+ 'chunk_size': 4096, # Size of text chunks
100
+ 'chunk_overlap': 256, # Overlap between chunks
101
+ 'batch_size': 16, # Number of chunks to process at once
102
+ 'clean_text': True # Apply text cleaning
103
+ }
104
+ ```
105
+
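+ To make the `chunk_size` / `chunk_overlap` settings concrete, here is a minimal character-based sketch of overlapping chunking. The app's `TextProcessor` is sentence-aware, so treat this as an approximation of the idea rather than its implementation:
+
+ ```python
+ from typing import List
+
+ def chunk_with_overlap(text: str, chunk_size: int = 4096, overlap: int = 256) -> List[str]:
+     """Split text into fixed-size windows that share `overlap` characters of context."""
+     if overlap >= chunk_size:
+         raise ValueError("overlap must be smaller than chunk_size")
+     step = chunk_size - overlap
+     return [text[i:i + chunk_size] for i in range(0, max(len(text) - overlap, 1), step)]
+
+ chunks = chunk_with_overlap("...long legislation text...", chunk_size=4096, overlap=256)
+ ```
+
+ Each chunk repeats the last 256 characters of the previous one, which is what preserves context across chunk boundaries during analysis.
+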
106
+ ## πŸ“– Usage Guide
107
+
108
+ ### 1. Home Page
109
+ - Overview of the application capabilities
110
+ - Current configuration status
111
+ - Quick start guide
112
+
113
+ ### 2. Upload & Process Page
114
+ - **File Upload**: Support for JSON lines, JSON arrays, and raw text files
115
+ - **Configuration**: Adjust model, processing, and analysis parameters
116
+ - **Batch Processing**: Upload multiple files for simultaneous analysis
117
+ - **Real-time Progress**: Monitor processing status and performance
118
+
119
+ ### 3. Analysis Results Page
120
+ - **Results Overview**: Summary metrics and statistics
121
+ - **Detailed Analysis**: Expandable results with confidence scores
122
+ - **Export Options**: Download results in multiple formats
123
+ - **Quality Metrics**: Analysis quality assessment and recommendations
124
+
125
+ ### 4. Settings Page
126
+ - **Model Settings**: Configure LLM parameters and model paths
127
+ - **Processing Settings**: Adjust text processing parameters
128
+ - **Cache Settings**: Manage cache behavior and persistence
129
+ - **UI Settings**: Customize interface appearance
130
+
131
+ ### 5. Performance Dashboard
132
+ - **Real-time Metrics**: Memory usage, CPU utilization, processing speed
133
+ - **Performance History**: Charts showing performance over time
134
+ - **Cache Statistics**: Hit rates, evictions, and cache efficiency
135
+ - **System Information**: Hardware and software details
136
+ - **Performance Recommendations**: Automated suggestions for optimization
137
+
138
+ ## πŸ”§ Advanced Features
139
+
140
+ ### Cache Management
141
+ ```python
142
+ from core.cache_manager import get_cache_manager
143
+
144
+ # Get cache instance
145
+ cache = get_cache_manager()
146
+
147
+ # View statistics
148
+ stats = cache.get_stats()
149
+ print(f"Hit Rate: {stats['hit_rate']:.1f}%")
150
+
151
+ # Clear cache
152
+ cache.clear_cache()
153
+
154
+ # Export cache
155
+ cache.export_cache('cache_backup.json')
156
+ ```
157
+
158
+ ### Custom Analysis Templates
159
+ The app supports custom analysis templates for different legal domains:
160
+
161
+ ```python
162
+ # Define custom template
163
+ custom_template = {
164
+ 'name': 'Commercial Law Analysis',
165
+ 'depth': 'Detailed',
166
+ 'focus_areas': [
167
+ 'contractual loopholes',
168
+ 'commercial implications',
169
+ 'regulatory compliance',
170
+ 'enforcement mechanisms'
171
+ ]
172
+ }
173
+ ```
174
+
175
+ ### Performance Optimization
176
+ - **Memory Management**: Automatic cache eviction based on memory limits
177
+ - **Batch Processing**: Optimized for large document collections
178
+ - **Concurrent Processing**: Thread-safe operations for multi-user scenarios
179
+ - **Progress Callbacks**: Real-time progress updates during long operations
180
+
181
+ ## πŸ“Š API Reference
182
+
183
+ ### Core Classes
184
+
185
+ #### CacheManager
186
+ ```python
187
+ class CacheManager:
188
+ def get(self, content, model_config, processing_config) -> Optional[Dict]
189
+ def put(self, content, analysis_result, model_config, processing_config)
190
+ def get_stats(self) -> Dict[str, Any]
191
+ def clear_cache(self)
192
+ def export_cache(self, filepath: str) -> bool
193
+ def import_cache(self, filepath: str) -> int
194
+ ```
195
+
196
+ #### TextProcessor
197
+ ```python
198
+ class TextProcessor:
199
+ def clean_text(self, text: str, preserve_structure: bool = True) -> str
200
+ def chunk_text(self, text: str, chunk_size: int = 4096, overlap: int = 256) -> List[str]
201
+ def extract_metadata(self, text: str) -> Dict[str, Any]
202
+ def preprocess_legislation_json(self, json_data: Dict) -> Dict
203
+ ```
204
+
205
+ #### LLMAnalyzer
206
+ ```python
207
+ class LLMAnalyzer:
208
+ def analyze_chunk(self, chunk: str, analysis_type: str = 'standard') -> Dict[str, Any]
209
+ def batch_analyze_chunks(self, chunks: List[str], analysis_type: str = 'standard') -> List[Dict]
210
+ def load_model(self) -> bool
211
+ def unload_model(self)
212
+ ```
213
+
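+ A rough end-to-end sketch of how these classes could be combined, based only on the signatures above. Constructor arguments, configuration shapes, the input file name, and the result keys (taken from the output format below) are assumptions:
+
+ ```python
+ from core.cache_manager import CacheManager
+ from core.text_processor import TextProcessor
+ from core.llm_analyzer import LLMAnalyzer
+
+ cache = CacheManager()
+ processor = TextProcessor()
+ analyzer = LLMAnalyzer()
+ analyzer.load_model()
+
+ model_config = {"path": "model.gguf"}                           # assumed shape
+ processing_config = {"chunk_size": 4096, "chunk_overlap": 256}  # assumed shape
+
+ text = processor.clean_text(open("act.txt", encoding="utf-8").read())  # hypothetical input file
+ for chunk in processor.chunk_text(text, chunk_size=4096, overlap=256):
+     result = cache.get(chunk, model_config, processing_config)
+     if result is None:                                           # cache miss: run the model
+         result = analyzer.analyze_chunk(chunk, analysis_type="standard")
+         cache.put(chunk, result, model_config, processing_config)
+     print(result.get("structured_analysis", {}).get("critical_loopholes", []))
+ ```
+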
214
+ ## πŸ” Analysis Output Format
215
+
216
+ Each analysis result contains:
217
+
218
+ ```json
219
+ {
220
+ "chunk": "original text chunk",
221
+ "analysis_type": "standard|detailed|comprehensive",
222
+ "model_config": {...},
223
+ "structured_analysis": {
224
+ "text_meaning": "explanation of text purpose",
225
+ "key_assumptions": ["list of assumptions"],
226
+ "exploitable_interpretations": ["potential interpretations"],
227
+ "critical_loopholes": ["identified loopholes"],
228
+ "circumvention_strategies": ["exploitation methods"],
229
+ "recommendations": ["suggested fixes"],
230
+ "confidence_score": 85,
231
+ "analysis_quality": "high|medium|low"
232
+ },
233
+ "processing_time": 2.34,
234
+ "chunk_size": 4096,
235
+ "word_count": 512
236
+ }
237
+ ```
238
+
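+ As an example of working with exported results, a small post-processing script over a JSON export might look like the following. The file name, and the assumption that the export is a list of result objects in the shape shown above, are hypothetical:
+
+ ```python
+ import json
+
+ with open("analysis_results.json", encoding="utf-8") as fh:  # hypothetical export path
+     results = json.load(fh)                                   # assumed: a list of result objects
+
+ flagged = [
+     r for r in results
+     if r["structured_analysis"]["confidence_score"] >= 80
+     and r["structured_analysis"]["critical_loopholes"]
+ ]
+ for r in flagged:
+     print(r["structured_analysis"]["critical_loopholes"])
+ ```
+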
239
+ ## πŸ› Troubleshooting
240
+
241
+ ### Common Issues
242
+
243
+ 1. **Model Loading Errors**
244
+ - Ensure model file exists and is accessible
245
+ - Check model format (GGUF required)
246
+ - Verify sufficient RAM for model loading
247
+
248
+ 2. **Cache Performance Issues**
249
+ - Clear cache if memory usage is high
250
+ - Adjust cache size limits in settings
251
+ - Check persistent cache database integrity
252
+
253
+ 3. **Processing Slowdowns**
254
+ - Reduce batch size for large documents
255
+ - Increase chunk overlap for better context
256
+ - Consider using a more powerful model
257
+
258
+ 4. **Memory Errors**
259
+ - Reduce cache size in settings
260
+ - Process files individually instead of batch
261
+ - Monitor memory usage in performance dashboard
262
+
263
+ ### Debug Mode
264
+ Enable debug mode in settings for detailed logging:
265
+ ```python
266
+ # In settings, enable debug mode
267
+ debug_mode = True
268
+ log_level = "DEBUG"
269
+ ```
270
+
271
+ ## 🀝 Contributing
272
+
273
+ 1. Fork the repository
274
+ 2. Create a feature branch
275
+ 3. Make your changes
276
+ 4. Add tests if applicable
277
+ 5. Submit a pull request
278
+
279
+ ## πŸ“„ License
280
+
281
+ This project is licensed under the MIT License - see the LICENSE file for details.
282
+
283
+ ## πŸ†˜ Support
284
+
285
+ For support and questions:
286
+ - Check the troubleshooting section above
287
+ - Review the performance recommendations in the app
288
+ - Examine the logs in the `streamlit_app/logs/` directory
289
+
290
+ ## πŸ”„ Migration from Original Script
291
+
292
+ If you're migrating from the original `trl.py` script:
293
+
294
+ 1. **Configuration**: Settings are now managed through the UI
295
+ 2. **Output**: Results are displayed in the web interface
296
+ 3. **Caching**: Automatic caching with no manual intervention needed
297
+ 4. **Batch Processing**: Multiple files can be uploaded simultaneously
298
+ 5. **Progress Tracking**: Real-time progress bars and status updates
299
+
300
+ The new app maintains all functionality of the original script while providing a modern, user-friendly interface and significant performance improvements through intelligent caching.
docker-compose.yml ADDED
@@ -0,0 +1,28 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ nz-legislation-analyzer:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ ports:
9
+ - "8501:8501"
10
+ volumes:
11
+ # Mount directories for persistent data
12
+ - ./streamlit_app/cache:/app/streamlit_app/cache
13
+ - ./streamlit_app/config:/app/streamlit_app/config
14
+ - ./streamlit_app/datasets:/app/streamlit_app/datasets
15
+ - ./streamlit_app/logs:/app/streamlit_app/logs
16
+ - ./nz-legislation.txt:/app/nz-legislation.txt:ro
17
+ environment:
18
+ - STREAMLIT_SERVER_HEADLESS=true
19
+ - STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
20
+ - STREAMLIT_SERVER_PORT=8501
21
+ - STREAMLIT_SERVER_ADDRESS=0.0.0.0
22
+ restart: unless-stopped
23
+ healthcheck:
24
+ test: ["CMD", "curl", "-f", "http://localhost:8501/healthz"]
25
+ interval: 30s
26
+ timeout: 10s
27
+ retries: 3
28
+ start_period: 40s
nz-legislation.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e671ba88cfc0d52bf03dcc089e67c6f73fa1ce7680cef6cf860bab1b5809e8e1
3
+ size 112806614
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
- pandas
3
- streamlit
1
+ llama-cpp-python
2
+ psutil
3
+ numpy
4
+ streamlit>=1.28.0
5
+ streamlit-extras>=0.3.0
6
+ plotly>=5.15.0
7
+ pandas>=2.0.0
8
+ streamlit-aggrid>=0.3.0
9
+ streamlit-ace>=0.1.1
run_streamlit_app.py ADDED
@@ -0,0 +1,176 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ NZ Legislation Loophole Analysis Streamlit App Runner
4
+
5
+ This script runs the modern Streamlit application for analyzing New Zealand legislation
6
+ to identify potential loopholes, ambiguities, and unintended consequences using AI.
7
+
8
+ Features:
9
+ - Advanced UI with multi-page layout
10
+ - Context memory cache system for improved performance
11
+ - Real-time progress monitoring
12
+ - Interactive results visualization
13
+ - Batch processing capabilities
14
+ - Comprehensive configuration management
15
+
16
+ Usage:
17
+ python run_streamlit_app.py
18
+
19
+ Requirements:
20
+ - All dependencies from requirements.txt must be installed
21
+ - Run from the project root directory
22
+ """
23
+
24
+ import os
25
+ import sys
26
+ import subprocess
27
+ from pathlib import Path
28
+
29
+ def check_requirements():
30
+ """Check if all required packages are installed"""
31
+ required_packages = [
32
+ 'streamlit',
33
+ 'pandas',
34
+ 'plotly',
35
+ 'llama-cpp-python',
36
+ 'psutil',
37
+ 'numpy'
38
+ ]
39
+
40
+ missing_packages = []
41
+
42
+ for package in required_packages:
43
+ try:
44
+ __import__('llama_cpp' if package == 'llama-cpp-python' else package.replace('-', '_'))  # llama-cpp-python installs the 'llama_cpp' module
45
+ except ImportError:
46
+ missing_packages.append(package)
47
+
48
+ if missing_packages:
49
+ print("❌ Missing required packages:")
50
+ for package in missing_packages:
51
+ print(f" - {package}")
52
+
53
+ print("\nπŸ“¦ Installing missing packages...")
54
+ try:
55
+ subprocess.check_call([
56
+ sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'
57
+ ])
58
+ print("βœ… All packages installed successfully!")
59
+ except subprocess.CalledProcessError:
60
+ print("❌ Failed to install packages. Please install manually:")
61
+ print(" pip install -r requirements.txt")
62
+ return False
63
+
64
+ return True
65
+
66
+ def check_app_structure():
67
+ """Check if the app structure is correct"""
68
+ app_dir = Path('streamlit_app')
69
+ required_files = [
70
+ 'app.py',
71
+ 'core/cache_manager.py',
72
+ 'core/text_processor.py',
73
+ 'core/llm_analyzer.py',
74
+ 'core/dataset_builder.py',
75
+ 'utils/config.py',
76
+ 'utils/performance.py',
77
+ 'utils/ui_helpers.py'
78
+ ]
79
+
80
+ missing_files = []
81
+
82
+ for file_path in required_files:
83
+ full_path = app_dir / file_path
84
+ if not full_path.exists():
85
+ missing_files.append(str(full_path))
86
+
87
+ if missing_files:
88
+ print("❌ Missing app files:")
89
+ for file_path in missing_files:
90
+ print(f" - {file_path}")
91
+ return False
92
+
93
+ print("βœ… App structure is complete!")
94
+ return True
95
+
96
+ def create_directories():
97
+ """Create necessary directories"""
98
+ directories = [
99
+ 'streamlit_app/cache',
100
+ 'streamlit_app/config',
101
+ 'streamlit_app/datasets',
102
+ 'streamlit_app/logs'
103
+ ]
104
+
105
+ for dir_path in directories:
106
+ Path(dir_path).mkdir(parents=True, exist_ok=True)
107
+ print(f"πŸ“ Created directory: {dir_path}")
108
+
109
+ def setup_environment():
110
+ """Setup environment variables and configuration"""
111
+ # Add current directory to Python path for imports
112
+ current_dir = os.path.dirname(os.path.abspath(__file__))
113
+ if current_dir not in sys.path:
114
+ sys.path.insert(0, current_dir)
115
+
116
+ # Set environment variables
117
+ os.environ.setdefault('STREAMLIT_SERVER_HEADLESS', 'true')
118
+ os.environ.setdefault('STREAMLIT_BROWSER_GATHER_USAGE_STATS', 'false')
119
+
120
+ print("πŸ”§ Environment setup complete!")
121
+
122
+ def run_app():
123
+ """Run the Streamlit application"""
124
+ print("\nπŸš€ Starting NZ Legislation Loophole Analyzer...")
125
+ print("=" * 60)
126
+ print("πŸ“± Access the app at: http://localhost:8501")
127
+ print("πŸ›‘ Press Ctrl+C to stop the application")
128
+ print("=" * 60)
129
+
130
+ try:
131
+ # Change to app directory
132
+ os.chdir('streamlit_app')
133
+
134
+ # Run Streamlit
135
+ subprocess.run([
136
+ sys.executable, '-m', 'streamlit', 'run', 'app.py',
137
+ '--server.port', '8501',
138
+ '--server.address', '0.0.0.0',
139
+ '--theme.base', 'light'
140
+ ])
141
+
142
+ except KeyboardInterrupt:
143
+ print("\n\nπŸ‘‹ Application stopped by user")
144
+ except Exception as e:
145
+ print(f"\n❌ Error running application: {e}")
146
+ return False
147
+
148
+ return True
149
+
150
+ def main():
151
+ """Main function"""
152
+ print("πŸ›οΈ NZ Legislation Loophole Analysis Streamlit App")
153
+ print("=" * 60)
154
+
155
+ # Check requirements
156
+ if not check_requirements():
157
+ return 1
158
+
159
+ # Check app structure
160
+ if not check_app_structure():
161
+ return 1
162
+
163
+ # Create directories
164
+ create_directories()
165
+
166
+ # Setup environment
167
+ setup_environment()
168
+
169
+ # Run the app
170
+ if not run_app():
171
+ return 1
172
+
173
+ return 0
174
+
175
+ if __name__ == "__main__":
176
+ sys.exit(main())
streamlit_app/app.py ADDED
@@ -0,0 +1,732 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ NZ Legislation Loophole Analysis Streamlit App
4
+
5
+ A modern web interface for analyzing New Zealand legislation text to identify
6
+ potential loopholes, ambiguities, and unintended consequences using AI.
7
+
8
+ Features:
9
+ - Advanced UI with multi-page layout
10
+ - Context memory cache system for improved performance
11
+ - Real-time progress monitoring
12
+ - Interactive results visualization
13
+ - Batch processing capabilities
14
+ - Comprehensive configuration management
15
+ """
16
+
17
+ import streamlit as st
18
+ import sys
19
+ import os
20
+ from pathlib import Path
21
+
22
+ # Add the current directory to Python path for imports
23
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
24
+
25
+ # Import core modules
26
+ from core.cache_manager import CacheManager
27
+ from core.text_processor import TextProcessor
28
+ from core.llm_analyzer import LLMAnalyzer
29
+ from core.dataset_builder import DatasetBuilder
30
+ from utils.config import ConfigManager
31
+ from utils.ui_helpers import UIHelpers
32
+ from utils.performance import PerformanceMonitor
33
+
34
+ # Configure page settings
35
+ st.set_page_config(
36
+ page_title="NZ Legislation Loophole Analyzer",
37
+ page_icon="βš–οΈ",
38
+ layout="wide",
39
+ initial_sidebar_state="expanded",
40
+ menu_items={
41
+ 'Get Help': 'https://github.com/your-repo',
42
+ 'Report a bug': 'https://github.com/your-repo/issues',
43
+ 'About': '''
44
+ ## NZ Legislation Loophole Analyzer
45
+ A powerful AI tool for analyzing New Zealand legislation to identify
46
+ potential loopholes, ambiguities, and unintended consequences.
47
+
48
+ **Version:** 1.0.0
49
+ **Built with:** Streamlit, Llama.cpp, and advanced caching
50
+ '''
51
+ }
52
+ )
53
+
54
+ # Initialize session state
55
+ def initialize_session_state():
56
+ """Initialize all session state variables"""
57
+ if 'cache_manager' not in st.session_state:
58
+ st.session_state.cache_manager = CacheManager()
59
+
60
+ if 'config_manager' not in st.session_state:
61
+ st.session_state.config_manager = ConfigManager()
62
+
63
+ if 'performance_monitor' not in st.session_state:
64
+ st.session_state.performance_monitor = PerformanceMonitor()
65
+
66
+ if 'current_analysis' not in st.session_state:
67
+ st.session_state.current_analysis = None
68
+
69
+ if 'analysis_results' not in st.session_state:
70
+ st.session_state.analysis_results = []
71
+
72
+ if 'processing_status' not in st.session_state:
73
+ st.session_state.processing_status = {
74
+ 'is_running': False,
75
+ 'progress': 0,
76
+ 'current_task': '',
77
+ 'total_chunks': 0,
78
+ 'processed_chunks': 0
79
+ }
80
+
81
+ def main():
82
+ """Main application function"""
83
+ # Initialize session state
84
+ initialize_session_state()
85
+
86
+ # Create sidebar with navigation and status
87
+ with st.sidebar:
88
+ st.title("βš–οΈ NZ Legislation Analyzer")
89
+ st.markdown("---")
90
+
91
+ # Navigation
92
+ pages = {
93
+ "🏠 Home": "home",
94
+ "πŸ“€ Upload & Process": "upload",
95
+ "πŸ“Š Analysis Results": "results",
96
+ "βš™οΈ Settings": "settings",
97
+ "πŸ“ˆ Performance": "performance"
98
+ }
99
+
100
+ selected_page = st.selectbox(
101
+ "Navigate to:",
102
+ list(pages.keys()),
103
+ key="nav_select"
104
+ )
105
+
106
+ st.markdown("---")
107
+
108
+ # Cache status
109
+ with st.expander("🧠 Cache Status", expanded=True):
110
+ cache_stats = st.session_state.cache_manager.get_stats()
111
+ st.metric("Cache Hits", cache_stats['hits'])
112
+ st.metric("Cache Misses", cache_stats['misses'])
113
+ st.metric("Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
114
+ st.metric("Cached Chunks", cache_stats['entries'])
115
+
116
+ if st.button("Clear Cache", type="secondary"):
117
+ st.session_state.cache_manager.clear_cache()
118
+ st.rerun()
119
+
120
+ # Performance metrics
121
+ with st.expander("πŸ“Š Performance", expanded=True):
122
+ perf_stats = st.session_state.performance_monitor.get_stats()
123
+ st.metric("Memory Usage", f"{perf_stats.get('memory_usage_mb', 0):.1f} MB")
124
+ st.metric("Avg Processing Time", f"{perf_stats.get('avg_processing_time', 0):.2f} s")  # metric key assumed from PerformanceMonitor stats
125
+
126
+ # Processing status
127
+ if st.session_state.processing_status['is_running']:
128
+ with st.expander("πŸ”„ Processing Status", expanded=True):
129
+ st.progress(st.session_state.processing_status['progress'])
130
+ st.text(st.session_state.processing_status['current_task'])
131
+ st.text(f"Chunk {st.session_state.processing_status['processed_chunks']}/"
132
+ f"{st.session_state.processing_status['total_chunks']}")
133
+
134
+ # Main content area
135
+ page = pages[selected_page]
136
+
137
+ if page == "home":
138
+ show_home_page()
139
+ elif page == "upload":
140
+ show_upload_page()
141
+ elif page == "results":
142
+ show_results_page()
143
+ elif page == "settings":
144
+ show_settings_page()
145
+ elif page == "performance":
146
+ show_performance_page()
147
+
148
+ # Footer
149
+ st.markdown("---")
150
+ st.markdown(
151
+ """
152
+ <div style='text-align: center; color: #666; font-size: 12px;'>
153
+ NZ Legislation Loophole Analyzer v1.0.0 | Built with Streamlit & Llama.cpp
154
+ </div>
155
+ """,
156
+ unsafe_allow_html=True
157
+ )
158
+
159
+ def show_home_page():
160
+ """Display the home page with overview and quick start"""
161
+ st.title("🏠 NZ Legislation Loophole Analyzer")
162
+ st.markdown("### AI-Powered Legal Analysis Tool")
163
+
164
+ col1, col2 = st.columns([2, 1])
165
+
166
+ with col1:
167
+ st.markdown("""
168
+ This advanced tool analyzes New Zealand legislation to identify:
169
+
170
+ πŸ” **Potential Loopholes** - Legal ambiguities that could be exploited
171
+ πŸ“‹ **Unintended Consequences** - Hidden implications in legislative language
172
+ βš–οΈ **Ambiguities** - Vague or unclear legal provisions
173
+ 🎯 **Circumvention Strategies** - Ways legislation might be bypassed
174
+
175
+ **Key Features:**
176
+ - **Smart Caching**: Avoid re-processing identical content
177
+ - **Advanced UI**: Modern interface with real-time progress
178
+ - **Batch Processing**: Handle multiple legislation files
179
+ - **Performance Monitoring**: Track memory usage and processing speed
180
+ - **Export Options**: Multiple formats for analysis results
181
+ """)
182
+
183
+ st.markdown("### Quick Start")
184
+ st.markdown("""
185
+ 1. **Upload** your NZ legislation files (JSON lines or raw text)
186
+ 2. **Configure** analysis parameters and model settings
187
+ 3. **Process** the legislation with AI-powered analysis
188
+ 4. **Review** results with interactive visualizations
189
+ 5. **Export** findings in multiple formats
190
+ """)
191
+
192
+ with col2:
193
+ st.markdown("### Current Configuration")
194
+
195
+ config = st.session_state.config_manager.get_config()
196
+
197
+ # Model settings
198
+ st.subheader("πŸ€– Model Settings")
199
+ st.info(f"**Model:** {config['model']['path']}")
200
+ st.info(f"**Context Length:** {config['model']['context_length']}")
201
+ st.info(f"**Max Tokens:** {config['model']['max_tokens']}")
202
+
203
+ # Processing settings
204
+ st.subheader("βš™οΈ Processing")
205
+ st.info(f"**Chunk Size:** {config['processing']['chunk_size']}")
206
+ st.info(f"**Overlap:** {config['processing']['chunk_overlap']}")
207
+ st.info(f"**Batch Size:** {config['processing']['batch_size']}")
208
+
209
+ # Cache settings
210
+ st.subheader("🧠 Cache")
211
+ cache_stats = st.session_state.cache_manager.get_stats()
212
+ st.info(f"**Status:** {'Active' if cache_stats['enabled'] else 'Disabled'}")
213
+ st.info(f"**Hit Rate:** {cache_stats['hit_rate']:.1f}%")
214
+
215
+ if st.button("πŸš€ Start Analysis", type="primary", use_container_width=True):
216
+ st.switch_page("pages/1_upload.py")
217
+
218
+ def show_upload_page():
219
+ """Display the upload and processing page"""
220
+ st.title("πŸ“€ Upload & Process Legislation")
221
+
222
+ # File upload section
223
+ st.subheader("πŸ“ Upload Legislation Files")
224
+
225
+ col1, col2 = st.columns([1, 1])
226
+
227
+ with col1:
228
+ uploaded_files = st.file_uploader(
229
+ "Select NZ legislation files",
230
+ accept_multiple_files=True,
231
+ type=['json', 'txt', 'jsonl'],
232
+ help="Upload JSON lines format (.jsonl), JSON arrays (.json), or raw text (.txt) files"
233
+ )
234
+
235
+ if uploaded_files:
236
+ st.success(f"πŸ“„ {len(uploaded_files)} file(s) selected")
237
+
238
+ # Show file details
239
+ for file in uploaded_files:
240
+ with st.expander(f"πŸ“‹ {file.name}"):
241
+ st.write(f"**Size:** {file.size:,} bytes")
242
+ st.write(f"**Type:** {file.type}")
243
+
244
+ # Preview content
245
+ if file.type in ['text/plain', 'application/json']:
246
+ content = file.read().decode('utf-8')
247
+ st.text_area("Preview", content[:500] + "..." if len(content) > 500 else content,
248
+ height=100, disabled=True)
249
+ file.seek(0) # Reset file pointer
250
+
251
+ with col2:
252
+ # Processing configuration
253
+ st.subheader("βš™οΈ Processing Configuration")
254
+
255
+ config = st.session_state.config_manager.get_config()
256
+
257
+ # Model settings
258
+ with st.expander("πŸ€– Model Configuration", expanded=True):
259
+ model_path = st.text_input(
260
+ "Model Path",
261
+ value=config['model']['path'],
262
+ help="Path to your GGUF model file"
263
+ )
264
+
265
+ context_length = st.slider(
266
+ "Context Length",
267
+ min_value=1024,
268
+ max_value=65536,
269
+ value=config['model']['context_length'],
270
+ step=1024,
271
+ help="Maximum context length for the model"
272
+ )
273
+
274
+ max_tokens = st.slider(
275
+ "Max Response Tokens",
276
+ min_value=256,
277
+ max_value=4096,
278
+ value=config['model']['max_tokens'],
279
+ step=64,
280
+ help="Maximum tokens in model response"
281
+ )
282
+
283
+ # Text processing settings
284
+ with st.expander("πŸ“ Text Processing", expanded=True):
285
+ chunk_size = st.slider(
286
+ "Chunk Size",
287
+ min_value=512,
288
+ max_value=8192,
289
+ value=config['processing']['chunk_size'],
290
+ step=256,
291
+ help="Size of text chunks for processing"
292
+ )
293
+
294
+ chunk_overlap = st.slider(
295
+ "Chunk Overlap",
296
+ min_value=64,
297
+ max_value=1024,
298
+ value=config['processing']['chunk_overlap'],
299
+ step=32,
300
+ help="Overlap between chunks for context preservation"
301
+ )
302
+
303
+ # Analysis settings
304
+ with st.expander("πŸ” Analysis Settings", expanded=True):
305
+ analysis_depth = st.select_slider(
306
+ "Analysis Depth",
307
+ options=["Basic", "Standard", "Detailed", "Comprehensive"],
308
+ value=config['analysis']['depth'],
309
+ help="Level of detail in legal analysis"
310
+ )
311
+
312
+ include_recommendations = st.checkbox(
313
+ "Include Recommendations",
314
+ value=config['analysis']['include_recommendations'],
315
+ help="Generate specific recommendations for addressing identified issues"
316
+ )
317
+
318
+ # Process button and status
319
+ col1, col2, col3 = st.columns([1, 1, 1])
320
+
321
+ with col1:
322
+ if st.button("πŸ”„ Start Processing", type="primary", use_container_width=True):
323
+ if not uploaded_files:
324
+ st.error("Please upload at least one legislation file")
325
+ else:
326
+ start_processing(uploaded_files, {
327
+ 'model': {
328
+ 'path': model_path,
329
+ 'context_length': context_length,
330
+ 'max_tokens': max_tokens
331
+ },
332
+ 'processing': {
333
+ 'chunk_size': chunk_size,
334
+ 'chunk_overlap': chunk_overlap
335
+ },
336
+ 'analysis': {
337
+ 'depth': analysis_depth,
338
+ 'include_recommendations': include_recommendations
339
+ }
340
+ })
341
+
342
+ with col2:
343
+ if st.button("⏹️ Stop Processing", use_container_width=True):
344
+ stop_processing()
345
+
346
+ with col3:
347
+ if st.button("πŸ“Š View Results", use_container_width=True):
348
+ st.switch_page("pages/2_analysis.py")
349
+
350
+ def start_processing(files, config):
351
+ """Start the processing workflow"""
352
+ st.session_state.processing_status = {
353
+ 'is_running': True,
354
+ 'progress': 0,
355
+ 'current_task': 'Initializing...',
356
+ 'total_chunks': 0,
357
+ 'processed_chunks': 0
358
+ }
359
+
360
+ # Update configuration
361
+ st.session_state.config_manager.update_config(config)
362
+
363
+ # TODO: Implement actual processing logic
364
+ st.rerun()
365
+
366
+ def stop_processing():
367
+ """Stop the current processing"""
368
+ st.session_state.processing_status['is_running'] = False
369
+ st.session_state.processing_status['current_task'] = 'Stopped by user'
370
+
371
+ def show_results_page():
372
+ """Display analysis results page"""
373
+ st.title("πŸ“Š Analysis Results")
374
+
375
+ if not st.session_state.analysis_results:
376
+ st.info("No analysis results available. Please upload and process legislation files first.")
377
+ return
378
+
379
+ # Results overview
380
+ st.subheader("πŸ“ˆ Results Overview")
381
+
382
+ col1, col2, col3, col4 = st.columns(4)
383
+
384
+ total_results = len(st.session_state.analysis_results)
385
+ total_loopholes = sum(len(result.get('loopholes', [])) for result in st.session_state.analysis_results)
386
+ avg_confidence = sum(result.get('confidence', 0) for result in st.session_state.analysis_results) / max(total_results, 1)
387
+
388
+ with col1:
389
+ st.metric("Total Analyses", total_results)
390
+
391
+ with col2:
392
+ st.metric("Loopholes Found", total_loopholes)
393
+
394
+ with col3:
395
+ st.metric("Avg Confidence", f"{avg_confidence:.2f}")
396
+
397
+ with col4:
398
+ cache_stats = st.session_state.cache_manager.get_stats()
399
+ st.metric("Cache Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
400
+
401
+ # Results display
402
+ st.subheader("πŸ” Detailed Results")
403
+
404
+ for i, result in enumerate(st.session_state.analysis_results):
405
+ with st.expander(f"πŸ“‹ Analysis {i+1}: {result.get('title', 'Unknown Title')}", expanded=i==0):
406
+ col1, col2 = st.columns([2, 1])
407
+
408
+ with col1:
409
+ st.markdown("**Summary:**")
410
+ st.write(result.get('summary', 'No summary available'))
411
+
412
+ st.markdown("**Key Findings:**")
413
+ for finding in result.get('loopholes', []):
414
+ st.markdown(f"- {finding}")
415
+
416
+ with col2:
417
+ st.metric("Confidence", f"{result.get('confidence', 0):.2f}")
418
+ st.metric("Processing Time", f"{result.get('processing_time', 0):.2f} s")
419
+ st.metric("Chunks Processed", result.get('chunks_processed', 0))
420
+
421
+ # Export options
422
+ st.subheader("πŸ’Ύ Export Results")
423
+
424
+ col1, col2, col3 = st.columns(3)
425
+
426
+ with col1:
427
+ if st.button("πŸ“„ Export as JSON", use_container_width=True):
428
+ export_results('json')
429
+
430
+ with col2:
431
+ if st.button("πŸ“Š Export as CSV", use_container_width=True):
432
+ export_results('csv')
433
+
434
+ with col3:
435
+ if st.button("πŸ“‹ Export as Excel", use_container_width=True):
436
+ export_results('excel')
437
+
438
+ def export_results(format_type):
439
+ """Export analysis results in specified format"""
440
+ # TODO: Implement export functionality
441
+ st.success(f"Results exported as {format_type.upper()}")
442
+
443
+ def show_settings_page():
444
+ """Display settings page"""
445
+ st.title("βš™οΈ Settings & Configuration")
446
+
447
+ tabs = st.tabs(["πŸ€– Model Settings", "πŸ“ Processing", "🧠 Cache", "🎨 UI", "πŸ”§ Advanced"])
448
+
449
+ with tabs[0]:
450
+ st.subheader("πŸ€– Model Configuration")
451
+
452
+ config = st.session_state.config_manager.get_config()
453
+
454
+ model_path = st.text_input(
455
+ "Model Path",
456
+ value=config['model']['path'],
457
+ help="Path to your GGUF model file"
458
+ )
459
+
460
+ repo_id = st.text_input(
461
+ "HuggingFace Repo ID",
462
+ value=config['model']['repo_id'],
463
+ help="HuggingFace repository ID for model download"
464
+ )
465
+
466
+ filename = st.text_input(
467
+ "Model Filename",
468
+ value=config['model']['filename'],
469
+ help="Specific model filename in the repository"
470
+ )
471
+
472
+ context_length = st.slider(
473
+ "Context Length",
474
+ min_value=1024,
475
+ max_value=131072,
476
+ value=config['model']['context_length'],
477
+ step=1024
478
+ )
479
+
480
+ max_tokens = st.slider(
481
+ "Max Response Tokens",
482
+ min_value=256,
483
+ max_value=8192,
484
+ value=config['model']['max_tokens'],
485
+ step=64
486
+ )
487
+
488
+ temperature = st.slider(
489
+ "Temperature",
490
+ min_value=0.0,
491
+ max_value=2.0,
492
+ value=config['model']['temperature'],
493
+ step=0.1,
494
+ help="Controls randomness in model output"
495
+ )
496
+
497
+ with tabs[1]:
498
+ st.subheader("πŸ“ Text Processing")
499
+
500
+ chunk_size = st.slider(
501
+ "Chunk Size",
502
+ min_value=256,
503
+ max_value=16384,
504
+ value=config['processing']['chunk_size'],
505
+ step=256
506
+ )
507
+
508
+ chunk_overlap = st.slider(
509
+ "Chunk Overlap",
510
+ min_value=32,
511
+ max_value=2048,
512
+ value=config['processing']['chunk_overlap'],
513
+ step=32
514
+ )
515
+
516
+ batch_size = st.slider(
517
+ "Batch Size",
518
+ min_value=1,
519
+ max_value=32,
520
+ value=config['processing']['batch_size'],
521
+ step=1
522
+ )
523
+
524
+ clean_text = st.checkbox(
525
+ "Clean Text",
526
+ value=config['processing']['clean_text'],
527
+ help="Apply text cleaning and normalization"
528
+ )
529
+
530
+ with tabs[2]:
531
+ st.subheader("🧠 Cache Configuration")
532
+
533
+ enable_cache = st.checkbox(
534
+ "Enable Caching",
535
+ value=config['cache']['enabled'],
536
+ help="Use cache to avoid re-processing identical chunks"
537
+ )
538
+
539
+ max_cache_size = st.slider(
540
+ "Max Cache Size (MB)",
541
+ min_value=100,
542
+ max_value=8192,
543
+ value=config['cache']['max_size_mb'],
544
+ step=100
545
+ )
546
+
547
+ cache_ttl = st.slider(
548
+ "Cache TTL (hours)",
549
+ min_value=1,
550
+ max_value=168,
551
+ value=config['cache']['ttl_hours'],
552
+ step=1,
553
+ help="Time-to-live for cached entries"
554
+ )
555
+
556
+ persistent_cache = st.checkbox(
557
+ "Persistent Cache",
558
+ value=config['cache']['persistent'],
559
+ help="Save cache to disk for persistence across sessions"
560
+ )
561
+
562
+ with tabs[3]:
563
+ st.subheader("🎨 UI Configuration")
564
+
565
+ theme = st.selectbox(
566
+ "Theme",
567
+ options=["Auto", "Light", "Dark"],
568
+ index=["Auto", "Light", "Dark"].index(config['ui']['theme'])
569
+ )
570
+
571
+ show_progress = st.checkbox(
572
+ "Show Progress Bars",
573
+ value=config['ui']['show_progress'],
574
+ help="Display progress indicators during processing"
575
+ )
576
+
577
+ auto_refresh = st.checkbox(
578
+ "Auto-refresh Results",
579
+ value=config['ui']['auto_refresh'],
580
+ help="Automatically refresh results view"
581
+ )
582
+
583
+ with tabs[4]:
584
+ st.subheader("πŸ”§ Advanced Settings")
585
+
586
+ debug_mode = st.checkbox(
587
+ "Debug Mode",
588
+ value=config['advanced']['debug_mode'],
589
+ help="Enable detailed logging and debugging information"
590
+ )
591
+
592
+ log_level = st.selectbox(
593
+ "Log Level",
594
+ options=["DEBUG", "INFO", "WARNING", "ERROR"],
595
+ index=["DEBUG", "INFO", "WARNING", "ERROR"].index(config['advanced']['log_level'])
596
+ )
597
+
598
+ memory_limit = st.slider(
599
+ "Memory Limit (MB)",
600
+ min_value=512,
601
+ max_value=32768,
602
+ value=config['advanced']['memory_limit_mb'],
603
+ step=512
604
+ )
605
+
606
+ # Save settings
607
+ col1, col2 = st.columns([1, 1])
608
+
609
+ with col1:
610
+ if st.button("πŸ’Ύ Save Settings", type="primary", use_container_width=True):
611
+ new_config = {
612
+ 'model': {
613
+ 'path': model_path,
614
+ 'repo_id': repo_id,
615
+ 'filename': filename,
616
+ 'context_length': context_length,
617
+ 'max_tokens': max_tokens,
618
+ 'temperature': temperature
619
+ },
620
+ 'processing': {
621
+ 'chunk_size': chunk_size,
622
+ 'chunk_overlap': chunk_overlap,
623
+ 'batch_size': batch_size,
624
+ 'clean_text': clean_text
625
+ },
626
+ 'cache': {
627
+ 'enabled': enable_cache,
628
+ 'max_size_mb': max_cache_size,
629
+ 'ttl_hours': cache_ttl,
630
+ 'persistent': persistent_cache
631
+ },
632
+ 'ui': {
633
+ 'theme': theme,
634
+ 'show_progress': show_progress,
635
+ 'auto_refresh': auto_refresh
636
+ },
637
+ 'advanced': {
638
+ 'debug_mode': debug_mode,
639
+ 'log_level': log_level,
640
+ 'memory_limit_mb': memory_limit
641
+ }
642
+ }
643
+
644
+ st.session_state.config_manager.update_config(new_config)
645
+ st.success("Settings saved successfully!")
646
+
647
+ with col2:
648
+ if st.button("πŸ”„ Reset to Defaults", use_container_width=True):
649
+ st.session_state.config_manager.reset_to_defaults()
650
+ st.success("Settings reset to defaults!")
651
+ st.rerun()
652
+
653
+ def show_performance_page():
654
+ """Display performance monitoring page"""
655
+ st.title("πŸ“ˆ Performance Dashboard")
656
+
657
+ # Real-time metrics
658
+ st.subheader("πŸ“Š Real-time Metrics")
659
+
660
+ col1, col2, col3, col4 = st.columns(4)
661
+
662
+ perf_stats = st.session_state.performance_monitor.get_stats()
663
+
664
+ with col1:
665
+ st.metric("Memory Usage", f"{perf_stats.get('memory_usage_mb', 0):.1f} MB")
666
+
667
+ with col2:
668
+ st.metric("CPU Usage", f"{perf_stats.get('cpu_percent', 0):.1f}%")  # metric key assumed from PerformanceMonitor stats
669
+
670
+ with col3:
671
+ st.metric("Active Threads", perf_stats.get('active_threads', 0))
672
+
673
+ with col4:
674
+ cache_stats = st.session_state.cache_manager.get_stats()
675
+ st.metric("Cache Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
676
+
677
+ # Performance charts
678
+ st.subheader("πŸ“ˆ Performance History")
679
+
680
+ # TODO: Add interactive charts for performance metrics
681
+
682
+ # System information
683
+ st.subheader("πŸ’» System Information")
684
+
685
+ col1, col2 = st.columns(2)
686
+
687
+ with col1:
688
+ st.markdown("**Hardware:**")
689
+ # TODO: Add system information display
690
+
691
+ with col2:
692
+ st.markdown("**Software:**")
693
+ # TODO: Add software information display
694
+
695
+ # Cache performance
696
+ st.subheader("🧠 Cache Performance")
697
+
698
+ cache_stats = st.session_state.cache_manager.get_stats()
699
+
700
+ col1, col2, col3, col4 = st.columns(4)
701
+
702
+ with col1:
703
+ st.metric("Total Requests", cache_stats['hits'] + cache_stats['misses'])
704
+
705
+ with col2:
706
+ st.metric("Cache Hits", cache_stats['hits'])
707
+
708
+ with col3:
709
+ st.metric("Cache Misses", cache_stats['misses'])
710
+
711
+ with col4:
712
+ st.metric("Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
713
+
714
+ # Performance recommendations
715
+ st.subheader("πŸ’‘ Performance Recommendations")
716
+
717
+ recommendations = []
718
+
719
+ if cache_stats['hit_rate'] < 50:
720
+ recommendations.append("Consider increasing cache size or adjusting chunk sizes to improve hit rate")
721
+
722
+ if perf_stats.get('memory_usage_mb', 0) > 8000:
723
+ recommendations.append("High memory usage detected. Consider reducing batch size or chunk size")
724
+
725
+ if not recommendations:
726
+ recommendations.append("Performance is optimal!")
727
+
728
+ for rec in recommendations:
729
+ st.info(rec)
730
+
731
+ if __name__ == "__main__":
732
+ main()
streamlit_app/core/__pycache__/cache_manager.cpython-312.pyc ADDED
Binary file (24.5 kB).
 
streamlit_app/core/__pycache__/dataset_builder.cpython-312.pyc ADDED
Binary file (25.8 kB).
 
streamlit_app/core/__pycache__/llm_analyzer.cpython-312.pyc ADDED
Binary file (17.2 kB).
 
streamlit_app/core/__pycache__/text_processor.cpython-312.pyc ADDED
Binary file (15.9 kB).
 
streamlit_app/core/cache_manager.py ADDED
@@ -0,0 +1,505 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Context Memory Cache Manager
4
+
5
+ A sophisticated caching system for NZ Legislation Loophole Analysis that provides:
6
+ - Hash-based chunk identification for unique content tracking
7
+ - Multi-level caching (memory + optional disk persistence)
8
+ - Intelligent cache invalidation based on memory limits
9
+ - Performance metrics and cache statistics
10
+ - Thread-safe operations for concurrent processing
11
+ """
12
+
13
+ import hashlib
14
+ import json
15
+ import os
16
+ import time
17
+ import threading
18
+ from typing import Dict, Any, Optional, Tuple
19
+ from functools import lru_cache
20
+ import sqlite3
21
+ from pathlib import Path
22
+ import psutil
23
+ import streamlit as st
24
+
25
+ class CacheEntry:
26
+ """Represents a single cache entry with metadata"""
27
+
28
+ def __init__(self, key: str, content: str, analysis_result: Dict[str, Any],
29
+ model_config: Dict[str, Any], processing_config: Dict[str, Any]):
30
+ self.key = key
31
+ self.content = content
32
+ self.analysis_result = analysis_result
33
+ self.model_config = model_config
34
+ self.processing_config = processing_config
35
+ self.created_at = time.time()
36
+ self.last_accessed = time.time()
37
+ self.access_count = 0
38
+ self.size_bytes = len(content.encode('utf-8')) + len(str(analysis_result).encode('utf-8'))
39
+
40
+ def to_dict(self) -> Dict[str, Any]:
41
+ """Convert cache entry to dictionary for serialization"""
42
+ return {
43
+ 'key': self.key,
44
+ 'content': self.content,
45
+ 'analysis_result': self.analysis_result,
46
+ 'model_config': self.model_config,
47
+ 'processing_config': self.processing_config,
48
+ 'created_at': self.created_at,
49
+ 'last_accessed': self.last_accessed,
50
+ 'access_count': self.access_count,
51
+ 'size_bytes': self.size_bytes
52
+ }
53
+
54
+ @classmethod
55
+ def from_dict(cls, data: Dict[str, Any]) -> 'CacheEntry':
56
+ """Create cache entry from dictionary"""
57
+ entry = cls(
58
+ key=data['key'],
59
+ content=data['content'],
60
+ analysis_result=data['analysis_result'],
61
+ model_config=data['model_config'],
62
+ processing_config=data['processing_config']
63
+ )
64
+ entry.created_at = data.get('created_at', time.time())
65
+ entry.last_accessed = data.get('last_accessed', time.time())
66
+ entry.access_count = data.get('access_count', 0)
67
+ entry.size_bytes = data.get('size_bytes', entry.size_bytes)
68
+ return entry
69
+
70
+ def update_access(self):
71
+ """Update access statistics"""
72
+ self.last_accessed = time.time()
73
+ self.access_count += 1
74
+
75
+ class CacheManager:
76
+ """Advanced cache manager for legislation analysis"""
77
+
78
+ def __init__(self, max_memory_mb: int = 1024, persistent: bool = True,
79
+ cache_dir: str = None, ttl_hours: int = 24):
80
+ """
81
+ Initialize the cache manager
82
+
83
+ Args:
84
+ max_memory_mb: Maximum memory to use for caching (MB)
85
+ persistent: Whether to use persistent disk cache
86
+ cache_dir: Directory for persistent cache storage
87
+ ttl_hours: Time-to-live for cache entries (hours)
88
+ """
89
+ self.max_memory_mb = max_memory_mb
90
+ self.persistent = persistent
91
+ self.ttl_hours = ttl_hours
92
+ self.ttl_seconds = ttl_hours * 3600
93
+
94
+ # Set up cache directory
95
+ if cache_dir is None:
96
+ cache_dir = os.path.join(os.path.dirname(__file__), '..', 'cache')
97
+ self.cache_dir = Path(cache_dir)
98
+ self.cache_dir.mkdir(exist_ok=True)
99
+ self.db_path = self.cache_dir / 'cache.db'
100
+
101
+ # Thread synchronization
102
+ self.lock = threading.RLock()
103
+
104
+ # In-memory cache with LRU eviction
105
+ self.memory_cache: Dict[str, CacheEntry] = {}
106
+ self.memory_size = 0 # Current memory usage in bytes
107
+
108
+ # Statistics
109
+ self.stats = {
110
+ 'hits': 0,
111
+ 'misses': 0,
112
+ 'entries': 0,
113
+ 'memory_usage_mb': 0,
114
+ 'evictions': 0,
115
+ 'enabled': True
116
+ }
117
+
118
+ # Initialize database if persistent
119
+ if self.persistent:
120
+ self._init_database()
121
+
122
+ # Load existing cache entries if persistent
123
+ if self.persistent:
124
+ self._load_persistent_cache()
125
+
126
+ def _init_database(self):
127
+ """Initialize SQLite database for persistent cache"""
128
+ try:
129
+ with sqlite3.connect(str(self.db_path)) as conn:
130
+ conn.execute('''
131
+ CREATE TABLE IF NOT EXISTS cache_entries (
132
+ key TEXT PRIMARY KEY,
133
+ data TEXT NOT NULL,
134
+ created_at REAL NOT NULL,
135
+ last_accessed REAL NOT NULL,
136
+ access_count INTEGER DEFAULT 0,
137
+ size_bytes INTEGER DEFAULT 0
138
+ )
139
+ ''')
140
+ conn.execute('CREATE INDEX IF NOT EXISTS idx_created_at ON cache_entries(created_at)')
141
+ conn.execute('CREATE INDEX IF NOT EXISTS idx_last_accessed ON cache_entries(last_accessed)')
142
+ except Exception as e:
143
+ print(f"Warning: Could not initialize persistent cache: {e}")
144
+ self.persistent = False
145
+
146
+ def _load_persistent_cache(self):
147
+ """Load existing cache entries from database"""
148
+ if not self.persistent:
149
+ return
150
+
151
+ try:
152
+ with sqlite3.connect(str(self.db_path)) as conn:
153
+ cursor = conn.execute('SELECT data FROM cache_entries')
154
+ for row in cursor:
155
+ try:
156
+ entry_data = json.loads(row[0])
157
+ entry = CacheEntry.from_dict(entry_data)
158
+
159
+ # Check if entry is still valid
160
+ if self._is_entry_valid(entry):
161
+ self._add_to_memory_cache(entry)
162
+ else:
163
+ # Remove expired entry from database
164
+ conn.execute('DELETE FROM cache_entries WHERE key = ?', (entry.key,))
165
+ except (json.JSONDecodeError, KeyError):
166
+ continue
167
+ except Exception as e:
168
+ print(f"Warning: Could not load persistent cache: {e}")
169
+
170
+ def _generate_cache_key(self, content: str, model_config: Dict[str, Any],
171
+ processing_config: Dict[str, Any]) -> str:
172
+ """
173
+ Generate a unique cache key based on content and configuration
174
+
175
+ Args:
176
+ content: The text content to be analyzed
177
+ model_config: Model configuration used for analysis
178
+ processing_config: Processing configuration used
179
+
180
+ Returns:
181
+ SHA-256 hash string as cache key
182
+ """
183
+ # Create a deterministic string from all parameters
184
+ key_data = {
185
+ 'content': content,
186
+ 'model_config': model_config,
187
+ 'processing_config': processing_config
188
+ }
189
+
190
+ # Convert to JSON string with sorted keys for consistency
191
+ key_string = json.dumps(key_data, sort_keys=True)
192
+
193
+ # Generate SHA-256 hash
194
+ return hashlib.sha256(key_string.encode('utf-8')).hexdigest()
195
+
196
+ def _is_entry_valid(self, entry: CacheEntry) -> bool:
197
+ """Check if a cache entry is still valid"""
198
+ # Check TTL
199
+ if time.time() - entry.created_at > self.ttl_seconds:
200
+ return False
201
+
202
+ # Check if configurations match (for future-proofing)
203
+ # This could be enhanced to handle configuration changes
204
+
205
+ return True
206
+
207
+ def _add_to_memory_cache(self, entry: CacheEntry):
208
+ """Add entry to memory cache with size management"""
209
+ with self.lock:
210
+ # Check if we need to evict entries
211
+ while self.memory_size + entry.size_bytes > self.max_memory_mb * 1024 * 1024:
212
+ if not self.memory_cache:
213
+ break
214
+ self._evict_lru_entry()
215
+
216
+ self.memory_cache[entry.key] = entry
217
+ self.memory_size += entry.size_bytes
218
+ self.stats['entries'] = len(self.memory_cache)
219
+ self.stats['memory_usage_mb'] = self.memory_size / (1024 * 1024)
220
+
221
+ def _evict_lru_entry(self):
222
+ """Evict the least recently used entry from memory cache"""
223
+ if not self.memory_cache:
224
+ return
225
+
226
+ # Find entry with oldest last_accessed time
227
+ lru_key = min(self.memory_cache.keys(),
228
+ key=lambda k: self.memory_cache[k].last_accessed)
229
+
230
+ evicted_entry = self.memory_cache.pop(lru_key)
231
+ self.memory_size -= evicted_entry.size_bytes
232
+ self.stats['evictions'] += 1
233
+
234
+ # If persistent, we could keep it in database but remove from memory
235
+ # For now, we'll just remove it completely
236
+
237
+ def _save_to_persistent_cache(self, entry: CacheEntry):
238
+ """Save entry to persistent cache"""
239
+ if not self.persistent:
240
+ return
241
+
242
+ try:
243
+ with sqlite3.connect(str(self.db_path)) as conn:
244
+ conn.execute('''
245
+ INSERT OR REPLACE INTO cache_entries
246
+ (key, data, created_at, last_accessed, access_count, size_bytes)
247
+ VALUES (?, ?, ?, ?, ?, ?)
248
+ ''', (
249
+ entry.key,
250
+ json.dumps(entry.to_dict()),
251
+ entry.created_at,
252
+ entry.last_accessed,
253
+ entry.access_count,
254
+ entry.size_bytes
255
+ ))
256
+ except Exception as e:
257
+ print(f"Warning: Could not save to persistent cache: {e}")
258
+
259
+ def get(self, content: str, model_config: Dict[str, Any],
260
+ processing_config: Dict[str, Any]) -> Optional[Dict[str, Any]]:
261
+ """
262
+ Get cached analysis result for given content and configuration
263
+
264
+ Args:
265
+ content: Text content to look up
266
+ model_config: Model configuration used for analysis
267
+ processing_config: Processing configuration used
268
+
269
+ Returns:
270
+ Cached analysis result or None if not found
271
+ """
272
+ if not self.stats['enabled']:
273
+ self.stats['misses'] += 1
274
+ return None
275
+
276
+ cache_key = self._generate_cache_key(content, model_config, processing_config)
277
+
278
+ with self.lock:
279
+ # Check memory cache first
280
+ if cache_key in self.memory_cache:
281
+ entry = self.memory_cache[cache_key]
282
+
283
+ if self._is_entry_valid(entry):
284
+ entry.update_access()
285
+ self.stats['hits'] += 1
286
+ return entry.analysis_result
287
+ else:
288
+ # Remove invalid entry
289
+ self.memory_cache.pop(cache_key)
290
+ self.memory_size -= entry.size_bytes
291
+ self.stats['entries'] = len(self.memory_cache)
292
+
293
+ # Check persistent cache if not in memory
294
+ if self.persistent:
295
+ try:
296
+ with sqlite3.connect(str(self.db_path)) as conn:
297
+ cursor = conn.execute('SELECT data FROM cache_entries WHERE key = ?', (cache_key,))
298
+ row = cursor.fetchone()
299
+
300
+ if row:
301
+ entry_data = json.loads(row[0])
302
+ entry = CacheEntry.from_dict(entry_data)
303
+
304
+ if self._is_entry_valid(entry):
305
+ entry.update_access()
306
+ self.stats['hits'] += 1
307
+
308
+ # Move to memory cache for faster future access
309
+ self._add_to_memory_cache(entry)
310
+
311
+ # Update persistent cache with new access stats
312
+ self._save_to_persistent_cache(entry)
313
+
314
+ return entry.analysis_result
315
+ except Exception as e:
316
+ print(f"Warning: Error accessing persistent cache: {e}")
317
+
318
+ self.stats['misses'] += 1
319
+ return None
320
+
321
+ def put(self, content: str, analysis_result: Dict[str, Any],
322
+ model_config: Dict[str, Any], processing_config: Dict[str, Any]):
323
+ """
324
+ Store analysis result in cache
325
+
326
+ Args:
327
+ content: Text content that was analyzed
328
+ analysis_result: Analysis result to cache
329
+ model_config: Model configuration used for analysis
330
+ processing_config: Processing configuration used
331
+ """
332
+ if not self.stats['enabled']:
333
+ return
334
+
335
+ cache_key = self._generate_cache_key(content, model_config, processing_config)
336
+
337
+ with self.lock:
338
+ entry = CacheEntry(cache_key, content, analysis_result,
339
+ model_config, processing_config)
340
+
341
+ # Add to memory cache
342
+ self._add_to_memory_cache(entry)
343
+
344
+ # Save to persistent cache
345
+ self._save_to_persistent_cache(entry)
346
+
347
+ def get_stats(self) -> Dict[str, Any]:
348
+ """Get cache statistics"""
349
+ with self.lock:
350
+ total_requests = self.stats['hits'] + self.stats['misses']
351
+ hit_rate = (self.stats['hits'] / total_requests * 100) if total_requests > 0 else 0
352
+
353
+ return {
354
+ **self.stats,
355
+ 'hit_rate': hit_rate,
356
+ 'total_requests': total_requests,
357
+ 'persistent_enabled': self.persistent,
358
+ 'memory_limit_mb': self.max_memory_mb,
359
+ 'ttl_hours': self.ttl_hours
360
+ }
361
+
362
+ def clear_cache(self):
363
+ """Clear all cache entries"""
364
+ with self.lock:
365
+ self.memory_cache.clear()
366
+ self.memory_size = 0
367
+ self.stats['entries'] = 0
368
+ self.stats['hits'] = 0
369
+ self.stats['misses'] = 0
370
+ self.stats['evictions'] = 0
371
+ self.stats['memory_usage_mb'] = 0
372
+
373
+ # Clear persistent cache
374
+ if self.persistent:
375
+ try:
376
+ with sqlite3.connect(str(self.db_path)) as conn:
377
+ conn.execute('DELETE FROM cache_entries')
378
+ except Exception as e:
379
+ print(f"Warning: Could not clear persistent cache: {e}")
380
+
381
+ def cleanup_expired_entries(self):
382
+ """Remove expired entries from cache"""
383
+ current_time = time.time()
384
+ expired_keys = []
385
+
386
+ with self.lock:
387
+ # Find expired entries in memory
388
+ for key, entry in self.memory_cache.items():
389
+ if current_time - entry.created_at > self.ttl_seconds:
390
+ expired_keys.append(key)
391
+ self.memory_size -= entry.size_bytes
392
+
393
+ # Remove expired entries from memory
394
+ for key in expired_keys:
395
+ del self.memory_cache[key]
396
+
397
+ self.stats['entries'] = len(self.memory_cache)
398
+ self.stats['memory_usage_mb'] = self.memory_size / (1024 * 1024)
399
+
400
+ # Clean up persistent cache
401
+ if self.persistent:
402
+ try:
403
+ with sqlite3.connect(str(self.db_path)) as conn:
404
+ conn.execute('DELETE FROM cache_entries WHERE ? - created_at > ?',
405
+ (current_time, self.ttl_seconds))
406
+ except Exception as e:
407
+ print(f"Warning: Could not cleanup persistent cache: {e}")
408
+
409
+ def enable(self):
410
+ """Enable caching"""
411
+ self.stats['enabled'] = True
412
+
413
+ def disable(self):
414
+ """Disable caching"""
415
+ self.stats['enabled'] = False
416
+
417
+ def export_cache(self, filepath: str):
418
+ """Export cache contents to JSON file"""
419
+ cache_data = {
420
+ 'metadata': {
421
+ 'exported_at': time.time(),
422
+ 'version': '1.0',
423
+ 'total_entries': len(self.memory_cache)
424
+ },
425
+ 'entries': []
426
+ }
427
+
428
+ with self.lock:
429
+ for entry in self.memory_cache.values():
430
+ cache_data['entries'].append(entry.to_dict())
431
+
432
+ # Also export persistent cache entries
433
+ if self.persistent:
434
+ try:
435
+ with sqlite3.connect(str(self.db_path)) as conn:
436
+ cursor = conn.execute('SELECT data FROM cache_entries')
437
+ for row in cursor:
438
+ try:
439
+ entry_data = json.loads(row[0])
440
+ cache_data['entries'].append(entry_data)
441
+ except json.JSONDecodeError:
442
+ continue
443
+ except Exception as e:
444
+ print(f"Warning: Could not export persistent cache: {e}")
445
+
446
+ try:
447
+ with open(filepath, 'w', encoding='utf-8') as f:
448
+ json.dump(cache_data, f, indent=2, ensure_ascii=False)
449
+ return True
450
+ except Exception as e:
451
+ print(f"Error exporting cache: {e}")
452
+ return False
453
+
454
+ def import_cache(self, filepath: str):
455
+ """Import cache contents from JSON file"""
456
+ try:
457
+ with open(filepath, 'r', encoding='utf-8') as f:
458
+ cache_data = json.load(f)
459
+
460
+ imported_count = 0
461
+ for entry_data in cache_data.get('entries', []):
462
+ try:
463
+ entry = CacheEntry.from_dict(entry_data)
464
+ if self._is_entry_valid(entry):
465
+ self._add_to_memory_cache(entry)
466
+ if self.persistent:
467
+ self._save_to_persistent_cache(entry)
468
+ imported_count += 1
469
+ except Exception as e:
470
+ print(f"Warning: Could not import cache entry: {e}")
471
+ continue
472
+
473
+ return imported_count
474
+ except Exception as e:
475
+ print(f"Error importing cache: {e}")
476
+ return 0
477
+
478
+ # Global cache instance for use across the application
479
+ _cache_instance = None
480
+ _cache_lock = threading.Lock()
481
+
482
+ def get_cache_manager(max_memory_mb: int = 1024, persistent: bool = True,
483
+ cache_dir: str = None, ttl_hours: int = 24) -> CacheManager:
484
+ """
485
+ Get or create global cache manager instance
486
+
487
+ This ensures we have a single cache instance across the application
488
+ while allowing configuration updates.
489
+ """
490
+ global _cache_instance
491
+
492
+ with _cache_lock:
493
+ if _cache_instance is None:
494
+ _cache_instance = CacheManager(max_memory_mb, persistent, cache_dir, ttl_hours)
495
+ else:
496
+ # Update configuration if different
497
+ if (_cache_instance.max_memory_mb != max_memory_mb or
498
+ _cache_instance.persistent != persistent or
499
+ _cache_instance.ttl_hours != ttl_hours):
500
+ _cache_instance.max_memory_mb = max_memory_mb
501
+ _cache_instance.persistent = persistent
502
+ _cache_instance.ttl_hours = ttl_hours
503
+ _cache_instance.ttl_seconds = ttl_hours * 3600
504
+
505
+ return _cache_instance
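
A minimal usage sketch for the cache manager above (the import path, config values, and cached payload are illustrative assumptions, not part of this upload):

# Illustrative only: round-trips one analysis result through the CacheManager defined above.
from streamlit_app.core.cache_manager import get_cache_manager  # assumed package layout

cache = get_cache_manager(max_memory_mb=256, persistent=False)

model_config = {"context_length": 4096, "max_tokens": 1024}    # placeholder configuration
processing_config = {"analysis_type": "standard"}
chunk = "Example legislative text."

if cache.get(chunk, model_config, processing_config) is None:  # first lookup misses
    cache.put(chunk, {"summary": "example result"}, model_config, processing_config)

print(cache.get(chunk, model_config, processing_config))       # second lookup hits
print(f"hit rate: {cache.get_stats()['hit_rate']:.1f}%")
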
streamlit_app/core/dataset_builder.py ADDED
@@ -0,0 +1,649 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dataset Builder
4
+
5
+ Creates and manages finetuning datasets from legislation analysis results.
6
+ Handles data formatting, validation, and export in multiple formats.
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import time
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+ from pathlib import Path
14
+ import pandas as pd
15
+ from datetime import datetime
16
+ import uuid
17
+
18
+ class DatasetBuilder:
19
+ """Builder for creating finetuning datasets from legislation analysis"""
20
+
21
+ def __init__(self, output_dir: str = "datasets"):
22
+ """
23
+ Initialize the dataset builder
24
+
25
+ Args:
26
+ output_dir: Directory to save datasets
27
+ """
28
+ self.output_dir = Path(output_dir)
29
+ self.output_dir.mkdir(exist_ok=True)
30
+
31
+ # Dataset metadata
32
+ self.metadata = {
33
+ 'version': '1.0',
34
+ 'created_at': datetime.now().isoformat(),
35
+ 'total_entries': 0,
36
+ 'analysis_types': set(),
37
+ 'legislation_sources': set(),
38
+ 'quality_metrics': {}
39
+ }
40
+
41
+ def create_finetuning_dataset(self, analysis_results: List[Dict[str, Any]],
42
+ dataset_name: str = None,
43
+ include_metadata: bool = True) -> Dict[str, Any]:
44
+ """
45
+ Create a finetuning dataset from analysis results
46
+
47
+ Args:
48
+ analysis_results: List of analysis results from LLM analyzer
49
+ dataset_name: Name for the dataset (optional)
50
+ include_metadata: Whether to include metadata in the dataset
51
+
52
+ Returns:
53
+ Dataset information and statistics
54
+ """
55
+ if not dataset_name:
56
+ timestamp = int(time.time())
57
+ dataset_name = f"nz_legislation_dataset_{timestamp}"
58
+
59
+ dataset_entries = []
60
+ successful_entries = 0
61
+
62
+ for result in analysis_results:
63
+ if 'error' in result:
64
+ continue
65
+
66
+ # Create finetuning entry
67
+ entry = self._create_finetuning_entry(result)
68
+ if entry:
69
+ dataset_entries.append(entry)
70
+ successful_entries += 1
71
+
72
+ # Update metadata
73
+ if 'analysis_type' in result:
74
+ self.metadata['analysis_types'].add(result['analysis_type'])
75
+
76
+ # Update metadata
77
+ self.metadata['total_entries'] = len(dataset_entries)
78
+ self.metadata['created_at'] = datetime.now().isoformat()
79
+
80
+ # Calculate quality metrics
81
+ self._calculate_quality_metrics(dataset_entries)
82
+
83
+ # Create dataset structure
84
+ dataset = {
85
+ 'metadata': {**self.metadata, 'analysis_types': sorted(self.metadata['analysis_types']), 'legislation_sources': sorted(self.metadata['legislation_sources'])},  # convert sets so the dataset stays JSON-serializable
86
+ 'entries': dataset_entries
87
+ }
88
+
89
+ if include_metadata:
90
+ dataset['metadata'].update({
91
+ 'dataset_name': dataset_name,
92
+ 'successful_entries': successful_entries,
93
+ 'total_input_results': len(analysis_results),
94
+ 'success_rate': successful_entries / len(analysis_results) if analysis_results else 0
95
+ })
96
+
97
+ return dataset
98
+
99
+ def _create_finetuning_entry(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
100
+ """
101
+ Create a single finetuning dataset entry
102
+
103
+ Args:
104
+ result: Analysis result from LLM analyzer
105
+
106
+ Returns:
107
+ Finetuning entry or None if invalid
108
+ """
109
+ try:
110
+ # Extract key components
111
+ chunk = result.get('chunk', '')
112
+ structured_analysis = result.get('structured_analysis', {})
113
+ response = result.get('response', '')
114
+
115
+ # Create the prompt (input)
116
+ prompt = self._create_prompt(chunk, result.get('analysis_type', 'standard'))
117
+
118
+ # Create the response (output) - structured format
119
+ response_text = self._create_response(structured_analysis, response)
120
+
121
+ if not prompt or not response_text:
122
+ return None
123
+
124
+ # Create entry
125
+ entry = {
126
+ 'id': str(uuid.uuid4()),
127
+ 'prompt': prompt,
128
+ 'response': response_text,
129
+ 'metadata': {
130
+ 'chunk_size': len(chunk),
131
+ 'word_count': len(chunk.split()),
132
+ 'analysis_type': result.get('analysis_type', 'standard'),
133
+ 'model_config': result.get('model_config', {}),
134
+ 'confidence_score': structured_analysis.get('confidence_score', 0),
135
+ 'analysis_quality': structured_analysis.get('analysis_quality', 'unknown'),
136
+ 'created_at': datetime.now().isoformat()
137
+ },
138
+ 'raw_data': {
139
+ 'original_chunk': chunk,
140
+ 'structured_analysis': structured_analysis,
141
+ 'raw_response': response
142
+ }
143
+ }
144
+
145
+ return entry
146
+
147
+ except Exception as e:
148
+ print(f"Error creating finetuning entry: {e}")
149
+ return None
150
+
151
+ def _create_prompt(self, chunk: str, analysis_type: str) -> str:
152
+ """
153
+ Create a standardized prompt for the finetuning dataset
154
+
155
+ Args:
156
+ chunk: Text chunk to analyze
157
+ analysis_type: Type of analysis
158
+
159
+ Returns:
160
+ Formatted prompt
161
+ """
162
+ analysis_configs = {
163
+ 'standard': {
164
+ 'depth': 'Standard',
165
+ 'focus': 'loopholes, ambiguities, and unintended consequences'
166
+ },
167
+ 'detailed': {
168
+ 'depth': 'Detailed',
169
+ 'focus': 'loopholes, ambiguities, unintended consequences, and implementation issues'
170
+ },
171
+ 'comprehensive': {
172
+ 'depth': 'Comprehensive',
173
+ 'focus': 'all aspects including policy conflicts and enforcement challenges'
174
+ }
175
+ }
176
+
177
+ config = analysis_configs.get(analysis_type, analysis_configs['standard'])
178
+
179
+ prompt = f"""You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.
180
+
181
+ LEGISLATION TEXT:
182
+ {chunk}
183
+
184
+ TASK: Analyze this legislative text for potential loopholes, ambiguities, or unintended consequences.
185
+
186
+ ANALYSIS DEPTH: {config['depth']}
187
+ FOCUS AREAS: {config['focus']}
188
+
189
+ Provide a structured analysis covering:
190
+ 1. Text Meaning - Explain what the text means and its intended purpose
191
+ 2. Key Assumptions - Identify any assumptions that could be exploited
192
+ 3. Exploitable Interpretations - Discuss how the text could be interpreted in unintended ways
193
+ 4. Critical Loopholes - Identify specific loopholes or ambiguities
194
+ 5. Circumvention Strategies - Suggest practical methods for exploiting these loopholes
195
+
196
+ Format your response clearly with section headers."""
197
+
198
+ return prompt
199
+
200
+ def _create_response(self, structured_analysis: Dict[str, Any], raw_response: str) -> str:
201
+ """
202
+ Create a standardized response format for the finetuning dataset
203
+
204
+ Args:
205
+ structured_analysis: Structured analysis data
206
+ raw_response: Raw LLM response
207
+
208
+ Returns:
209
+ Formatted response
210
+ """
211
+ sections = []
212
+
213
+ # Text Meaning
214
+ if structured_analysis.get('text_meaning'):
215
+ sections.append(f"**Text Meaning:** {structured_analysis['text_meaning']}")
216
+
217
+ # Key Assumptions
218
+ if structured_analysis.get('key_assumptions'):
219
+ assumptions = structured_analysis['key_assumptions']
220
+ if assumptions:
221
+ sections.append("**Key Assumptions:**")
222
+ for i, assumption in enumerate(assumptions, 1):
223
+ sections.append(f"{i}. {assumption}")
224
+
225
+ # Exploitable Interpretations
226
+ if structured_analysis.get('exploitable_interpretations'):
227
+ interpretations = structured_analysis['exploitable_interpretations']
228
+ if interpretations:
229
+ sections.append("**Exploitable Interpretations:**")
230
+ for i, interpretation in enumerate(interpretations, 1):
231
+ sections.append(f"{i}. {interpretation}")
232
+
233
+ # Critical Loopholes
234
+ if structured_analysis.get('critical_loopholes'):
235
+ loopholes = structured_analysis['critical_loopholes']
236
+ if loopholes:
237
+ sections.append("**Critical Loopholes:**")
238
+ for i, loophole in enumerate(loopholes, 1):
239
+ sections.append(f"{i}. {loophole}")
240
+
241
+ # Circumvention Strategies
242
+ if structured_analysis.get('circumvention_strategies'):
243
+ strategies = structured_analysis['circumvention_strategies']
244
+ if strategies:
245
+ sections.append("**Circumvention Strategies:**")
246
+ for i, strategy in enumerate(strategies, 1):
247
+ sections.append(f"{i}. {strategy}")
248
+
249
+ # Recommendations
250
+ if structured_analysis.get('recommendations'):
251
+ recommendations = structured_analysis['recommendations']
252
+ if recommendations:
253
+ sections.append("**Recommendations:**")
254
+ for i, rec in enumerate(recommendations, 1):
255
+ sections.append(f"{i}. {rec}")
256
+
257
+ return "\n\n".join(sections) if sections else raw_response
258
+
259
+ def _calculate_quality_metrics(self, entries: List[Dict[str, Any]]):
260
+ """Calculate quality metrics for the dataset"""
261
+ if not entries:
262
+ return
263
+
264
+ confidence_scores = []
265
+ analysis_qualities = {'high': 0, 'medium': 0, 'low': 0, 'unknown': 0}
266
+
267
+ for entry in entries:
268
+ metadata = entry.get('metadata', {})
269
+ confidence = metadata.get('confidence_score', 0)
270
+ quality = metadata.get('analysis_quality', 'unknown')
271
+
272
+ confidence_scores.append(confidence)
273
+ analysis_qualities[quality] = analysis_qualities.get(quality, 0) + 1
274
+
275
+ self.metadata['quality_metrics'] = {
276
+ 'average_confidence': sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0,
277
+ 'max_confidence': max(confidence_scores) if confidence_scores else 0,
278
+ 'min_confidence': min(confidence_scores) if confidence_scores else 0,
279
+ 'quality_distribution': analysis_qualities,
280
+ 'total_entries': len(entries)
281
+ }
282
+
283
+ def save_dataset(self, dataset: Dict[str, Any], format_type: str = 'json',
284
+ filename: str = None) -> str:
285
+ """
286
+ Save dataset in specified format
287
+
288
+ Args:
289
+ dataset: Dataset to save
290
+ format_type: Format ('json', 'jsonl', 'csv', 'excel')
291
+ filename: Output filename (optional)
292
+
293
+ Returns:
294
+ Path to saved file
295
+ """
296
+ if not filename:
297
+ timestamp = int(time.time())
298
+ filename = f"nz_legislation_dataset_{timestamp}"
299
+
300
+ # Ensure filename has the correct extension ('excel' datasets are written as .xlsx)
301
+ extension = 'xlsx' if format_type == 'excel' else format_type
+ if not filename.endswith(f'.{extension}'):
302
+ filename += f'.{extension}'
303
+
304
+ filepath = self.output_dir / filename
305
+
306
+ try:
307
+ if format_type == 'json':
308
+ with open(filepath, 'w', encoding='utf-8') as f:
309
+ json.dump(dataset, f, indent=2, ensure_ascii=False)
310
+
311
+ elif format_type == 'jsonl':
312
+ with open(filepath, 'w', encoding='utf-8') as f:
313
+ for entry in dataset.get('entries', []):
314
+ json.dump(entry, f, ensure_ascii=False)
315
+ f.write('\n')
316
+
317
+ elif format_type == 'csv':
318
+ self._save_as_csv(dataset, filepath)
319
+
320
+ elif format_type == 'excel':
321
+ self._save_as_excel(dataset, filepath)
322
+
323
+ else:
324
+ raise ValueError(f"Unsupported format: {format_type}")
325
+
326
+ return str(filepath)
327
+
328
+ except Exception as e:
329
+ raise Exception(f"Error saving dataset: {e}")
330
+
331
+ def _save_as_csv(self, dataset: Dict[str, Any], filepath: Path):
332
+ """Save dataset as CSV"""
333
+ entries = dataset.get('entries', [])
334
+
335
+ if not entries:
336
+ # Create empty CSV with headers
337
+ df = pd.DataFrame(columns=['id', 'prompt', 'response', 'metadata'])
338
+ df.to_csv(filepath, index=False)
339
+ return
340
+
341
+ # Flatten the data for CSV
342
+ csv_data = []
343
+ for entry in entries:
344
+ csv_row = {
345
+ 'id': entry.get('id', ''),
346
+ 'prompt': entry.get('prompt', ''),
347
+ 'response': entry.get('response', ''),
348
+ 'confidence_score': entry.get('metadata', {}).get('confidence_score', 0),
349
+ 'analysis_type': entry.get('metadata', {}).get('analysis_type', ''),
350
+ 'chunk_size': entry.get('metadata', {}).get('chunk_size', 0),
351
+ 'word_count': entry.get('metadata', {}).get('word_count', 0),
352
+ 'analysis_quality': entry.get('metadata', {}).get('analysis_quality', ''),
353
+ 'created_at': entry.get('metadata', {}).get('created_at', '')
354
+ }
355
+ csv_data.append(csv_row)
356
+
357
+ df = pd.DataFrame(csv_data)
358
+ df.to_csv(filepath, index=False, encoding='utf-8')
359
+
360
+ def _save_as_excel(self, dataset: Dict[str, Any], filepath: Path):
361
+ """Save dataset as Excel with multiple sheets"""
362
+ entries = dataset.get('entries', [])
363
+
364
+ with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
365
+ # Main dataset sheet
366
+ if entries:
367
+ csv_data = []
368
+ for entry in entries:
369
+ csv_row = {
370
+ 'id': entry.get('id', ''),
371
+ 'prompt': entry.get('prompt', ''),
372
+ 'response': entry.get('response', ''),
373
+ 'confidence_score': entry.get('metadata', {}).get('confidence_score', 0),
374
+ 'analysis_type': entry.get('metadata', {}).get('analysis_type', ''),
375
+ 'chunk_size': entry.get('metadata', {}).get('chunk_size', 0),
376
+ 'word_count': entry.get('metadata', {}).get('word_count', 0),
377
+ 'analysis_quality': entry.get('metadata', {}).get('analysis_quality', ''),
378
+ 'created_at': entry.get('metadata', {}).get('created_at', '')
379
+ }
380
+ csv_data.append(csv_row)
381
+
382
+ df_main = pd.DataFrame(csv_data)
383
+ df_main.to_excel(writer, sheet_name='Dataset', index=False)
384
+
385
+ # Metadata sheet
386
+ metadata_df = pd.DataFrame([dataset.get('metadata', {})])
387
+ metadata_df.to_excel(writer, sheet_name='Metadata', index=False)
388
+
389
+ # Quality metrics sheet
390
+ quality_data = dataset.get('metadata', {}).get('quality_metrics', {})
391
+ if quality_data:
392
+ quality_df = pd.DataFrame([quality_data])
393
+ quality_df.to_excel(writer, sheet_name='Quality_Metrics', index=False)
394
+
395
+ def load_dataset(self, filepath: str) -> Dict[str, Any]:
396
+ """
397
+ Load a dataset from file
398
+
399
+ Args:
400
+ filepath: Path to dataset file
401
+
402
+ Returns:
403
+ Loaded dataset
404
+ """
405
+ filepath = Path(filepath)
406
+
407
+ if not filepath.exists():
408
+ raise FileNotFoundError(f"Dataset file not found: {filepath}")
409
+
410
+ try:
411
+ if filepath.suffix == '.json':
412
+ with open(filepath, 'r', encoding='utf-8') as f:
413
+ return json.load(f)
414
+
415
+ elif filepath.suffix == '.jsonl':
416
+ entries = []
417
+ with open(filepath, 'r', encoding='utf-8') as f:
418
+ for line in f:
419
+ if line.strip():
420
+ entries.append(json.loads(line))
421
+
422
+ return {
423
+ 'metadata': {
424
+ 'loaded_from': str(filepath),
425
+ 'total_entries': len(entries)
426
+ },
427
+ 'entries': entries
428
+ }
429
+
430
+ elif filepath.suffix in ['.csv', '.xlsx', '.xls']:
431
+ return self._load_from_spreadsheet(filepath)
432
+
433
+ else:
434
+ raise ValueError(f"Unsupported file format: {filepath.suffix}")
435
+
436
+ except Exception as e:
437
+ raise Exception(f"Error loading dataset: {e}")
438
+
439
+ def _load_from_spreadsheet(self, filepath: Path) -> Dict[str, Any]:
440
+ """Load dataset from spreadsheet format"""
441
+ try:
442
+ if filepath.suffix == '.csv':
443
+ df = pd.read_csv(filepath)
444
+ else:
445
+ df = pd.read_excel(filepath)
446
+
447
+ # Convert back to dataset format
448
+ entries = []
449
+ for _, row in df.iterrows():
450
+ entry = {
451
+ 'id': row.get('id', str(uuid.uuid4())),
452
+ 'prompt': row.get('prompt', ''),
453
+ 'response': row.get('response', ''),
454
+ 'metadata': {
455
+ 'confidence_score': row.get('confidence_score', 0),
456
+ 'analysis_type': row.get('analysis_type', 'standard'),
457
+ 'chunk_size': row.get('chunk_size', 0),
458
+ 'word_count': row.get('word_count', 0),
459
+ 'analysis_quality': row.get('analysis_quality', 'unknown'),
460
+ 'created_at': row.get('created_at', datetime.now().isoformat())
461
+ }
462
+ }
463
+ entries.append(entry)
464
+
465
+ return {
466
+ 'metadata': {
467
+ 'loaded_from': str(filepath),
468
+ 'total_entries': len(entries),
469
+ 'original_format': filepath.suffix[1:]
470
+ },
471
+ 'entries': entries
472
+ }
473
+
474
+ except Exception as e:
475
+ raise Exception(f"Error loading spreadsheet: {e}")
476
+
477
+ def merge_datasets(self, datasets: List[Dict[str, Any]],
478
+ output_name: str = None) -> Dict[str, Any]:
479
+ """
480
+ Merge multiple datasets into one
481
+
482
+ Args:
483
+ datasets: List of datasets to merge
484
+ output_name: Name for merged dataset
485
+
486
+ Returns:
487
+ Merged dataset
488
+ """
489
+ if not datasets:
490
+ return self.create_finetuning_dataset([])
491
+
492
+ merged_entries = []
493
+ all_analysis_types = set()
494
+ all_sources = set()
495
+
496
+ for dataset in datasets:
497
+ entries = dataset.get('entries', [])
498
+ merged_entries.extend(entries)
499
+
500
+ metadata = dataset.get('metadata', {})
501
+ all_analysis_types.update(metadata.get('analysis_types', []))
502
+ all_sources.update(metadata.get('legislation_sources', []))
503
+
504
+ # Create merged dataset
505
+ merged_dataset = {
506
+ 'metadata': {
507
+ 'version': '1.0',
508
+ 'created_at': datetime.now().isoformat(),
509
+ 'dataset_name': output_name or f"merged_dataset_{int(time.time())}",
510
+ 'total_entries': len(merged_entries),
511
+ 'analysis_types': list(all_analysis_types),
512
+ 'legislation_sources': list(all_sources),
513
+ 'merged_from': len(datasets),
514
+ 'success_rate': 1.0 # Assuming all entries are valid
515
+ },
516
+ 'entries': merged_entries
517
+ }
518
+
519
+ # Recalculate quality metrics
520
+ self._calculate_quality_metrics(merged_entries)
521
+ merged_dataset['metadata']['quality_metrics'] = self.metadata['quality_metrics']
522
+
523
+ return merged_dataset
524
+
525
+ def validate_dataset(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
526
+ """
527
+ Validate dataset quality and completeness
528
+
529
+ Args:
530
+ dataset: Dataset to validate
531
+
532
+ Returns:
533
+ Validation results
534
+ """
535
+ validation = {
536
+ 'is_valid': True,
537
+ 'issues': [],
538
+ 'warnings': [],
539
+ 'statistics': {}
540
+ }
541
+
542
+ entries = dataset.get('entries', [])
543
+ metadata = dataset.get('metadata', {})
544
+
545
+ # Check basic structure
546
+ if not isinstance(entries, list):
547
+ validation['issues'].append("Entries must be a list")
548
+ validation['is_valid'] = False
549
+ return validation
550
+
551
+ if not entries:
552
+ validation['warnings'].append("Dataset is empty")
553
+ return validation
554
+
555
+ # Validate entries
556
+ valid_entries = 0
557
+ total_confidence = 0
558
+
559
+ for i, entry in enumerate(entries):
560
+ if not isinstance(entry, dict):
561
+ validation['issues'].append(f"Entry {i} is not a dictionary")
562
+ continue
563
+
564
+ # Check required fields
565
+ required_fields = ['id', 'prompt', 'response']
566
+ for field in required_fields:
567
+ if field not in entry:
568
+ validation['issues'].append(f"Entry {i} missing required field: {field}")
569
+
570
+ # Check prompt and response quality
571
+ prompt = entry.get('prompt', '')
572
+ response = entry.get('response', '')
573
+
574
+ if len(prompt.strip()) < 10:
575
+ validation['warnings'].append(f"Entry {i} has very short prompt")
576
+
577
+ if len(response.strip()) < 10:
578
+ validation['warnings'].append(f"Entry {i} has very short response")
579
+
580
+ # Check confidence score
581
+ confidence = entry.get('metadata', {}).get('confidence_score', 0)
582
+ total_confidence += confidence
583
+
584
+ valid_entries += 1
585
+
586
+ # Calculate statistics
587
+ validation['statistics'] = {
588
+ 'total_entries': len(entries),
589
+ 'valid_entries': valid_entries,
590
+ 'average_confidence': total_confidence / valid_entries if valid_entries > 0 else 0,
591
+ 'validation_rate': valid_entries / len(entries) if entries else 0
592
+ }
593
+
594
+ return validation
595
+
596
+ def get_dataset_statistics(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
597
+ """
598
+ Get comprehensive statistics about the dataset
599
+
600
+ Args:
601
+ dataset: Dataset to analyze
602
+
603
+ Returns:
604
+ Dataset statistics
605
+ """
606
+ entries = dataset.get('entries', [])
607
+
608
+ if not entries:
609
+ return {'total_entries': 0}
610
+
611
+ # Basic statistics
612
+ stats = {
613
+ 'total_entries': len(entries),
614
+ 'total_prompts': len([e for e in entries if e.get('prompt')]),
615
+ 'total_responses': len([e for e in entries if e.get('response')]),
616
+ 'average_prompt_length': 0,
617
+ 'average_response_length': 0,
618
+ 'confidence_distribution': {},
619
+ 'analysis_type_distribution': {},
620
+ 'quality_distribution': {}
621
+ }
622
+
623
+ # Calculate averages
624
+ prompt_lengths = [len(e.get('prompt', '')) for e in entries if e.get('prompt')]
625
+ response_lengths = [len(e.get('response', '')) for e in entries if e.get('response')]
626
+
627
+ if prompt_lengths:
628
+ stats['average_prompt_length'] = sum(prompt_lengths) / len(prompt_lengths)
629
+ if response_lengths:
630
+ stats['average_response_length'] = sum(response_lengths) / len(response_lengths)
631
+
632
+ # Distribution analysis
633
+ for entry in entries:
634
+ metadata = entry.get('metadata', {})
635
+
636
+ # Confidence distribution
637
+ confidence = metadata.get('confidence_score', 0)
638
+ conf_range = f"{(confidence // 20) * 20}-{(confidence // 20) * 20 + 19}"
639
+ stats['confidence_distribution'][conf_range] = stats['confidence_distribution'].get(conf_range, 0) + 1
640
+
641
+ # Analysis type distribution
642
+ analysis_type = metadata.get('analysis_type', 'unknown')
643
+ stats['analysis_type_distribution'][analysis_type] = stats['analysis_type_distribution'].get(analysis_type, 0) + 1
644
+
645
+ # Quality distribution
646
+ quality = metadata.get('analysis_quality', 'unknown')
647
+ stats['quality_distribution'][quality] = stats['quality_distribution'].get(quality, 0) + 1
648
+
649
+ return stats
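
A minimal usage sketch for the dataset builder above (the import path and the hand-made analysis result are illustrative assumptions):

# Illustrative only: builds, saves, and validates a tiny dataset from one fake analysis result.
from streamlit_app.core.dataset_builder import DatasetBuilder  # assumed package layout

builder = DatasetBuilder(output_dir="datasets")

fake_result = {                                    # stand-in for an LLMAnalyzer output
    "chunk": "Example section text.",
    "analysis_type": "standard",
    "response": "raw model output",
    "structured_analysis": {"text_meaning": "Example meaning.", "confidence_score": 20},
}

dataset = builder.create_finetuning_dataset([fake_result], dataset_name="demo_dataset")
path = builder.save_dataset(dataset, format_type="jsonl")      # one JSON object per line
print(path)
print(builder.validate_dataset(dataset)["statistics"])
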
streamlit_app/core/llm_analyzer.py ADDED
@@ -0,0 +1,469 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LLM Analyzer
4
+
5
+ Handles LLM model loading, inference, and analysis for the NZ Legislation Loophole Analysis.
6
+ Provides optimized prompts and response parsing for legal text analysis.
7
+ """
8
+
9
+ import os
10
+ import time
11
+ from typing import List, Dict, Any, Optional, Tuple
12
+ import json
13
+ from llama_cpp import Llama
14
+ import re
15
+
16
+ class LLMAnalyzer:
17
+ """LLM-based analyzer for legislation loophole detection"""
18
+
19
+ def __init__(self, model_config: Dict[str, Any]):
20
+ """
21
+ Initialize the LLM analyzer
22
+
23
+ Args:
24
+ model_config: Configuration for the LLM model
25
+ """
26
+ self.model_config = model_config
27
+ self.model = None
28
+ self.is_loaded = False
29
+
30
+ # Analysis templates
31
+ self.analysis_templates = {
32
+ 'standard': {
33
+ 'depth': 'Standard',
34
+ 'include_recommendations': True,
35
+ 'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences']
36
+ },
37
+ 'detailed': {
38
+ 'depth': 'Detailed',
39
+ 'include_recommendations': True,
40
+ 'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences', 'implementation_issues']
41
+ },
42
+ 'comprehensive': {
43
+ 'depth': 'Comprehensive',
44
+ 'include_recommendations': True,
45
+ 'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences',
46
+ 'implementation_issues', 'policy_conflicts', 'enforcement_challenges']
47
+ }
48
+ }
49
+
50
+ # Prompt templates
51
+ self.prompt_templates = {
52
+ 'loophole_analysis': self._get_loophole_analysis_template(),
53
+ 'ambiguity_detection': self._get_ambiguity_detection_template(),
54
+ 'recommendations': self._get_recommendations_template()
55
+ }
56
+
57
+ def _get_loophole_analysis_template(self) -> str:
58
+ """Get the main loophole analysis prompt template"""
59
+ return """You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.
60
+
61
+ LEGISLATION TEXT:
62
+ {text}
63
+
64
+ TASK: Analyze this legislative text for potential loopholes, ambiguities, or unintended consequences.
65
+
66
+ INSTRUCTIONS:
67
+ Provide a structured analysis following this format:
68
+
69
+ 1. **Text Meaning**: Explain what the text means and its intended purpose
70
+ 2. **Key Assumptions**: Identify any assumptions the text makes that could be exploited
71
+ 3. **Exploitable Interpretations**: Discuss how the text could be interpreted or applied in ways that circumvent its intended purpose
72
+ 4. **Critical Loopholes**: Identify specific loopholes, ambiguities, or unintended consequences that could be used to bypass the legislation
73
+ 5. **Circumvention Strategies**: Suggest practical methods or scenarios for exploiting these loopholes to achieve objectives contrary to the legislation's intent
74
+
75
+ {reasoning_format}
76
+ {recommendations_format}
77
+
78
+ ANALYSIS DEPTH: {depth}
79
+ FOCUS AREAS: {focus_areas}
80
+ """
81
+
82
+ def _get_ambiguity_detection_template(self) -> str:
83
+ """Get the ambiguity detection prompt template"""
84
+ return """Analyze the following legal text for ambiguities and unclear provisions:
85
+
86
+ TEXT: {text}
87
+
88
+ Identify:
89
+ 1. Vague terms or phrases
90
+ 2. Ambiguous references
91
+ 3. Unclear conditions or requirements
92
+ 4. Missing definitions
93
+ 5. Conflicting provisions
94
+
95
+ Provide specific examples and suggest clarifications.
96
+ """
97
+
98
+ def _get_recommendations_template(self) -> str:
99
+ """Get the recommendations prompt template"""
100
+ return """Based on the loopholes and ambiguities identified, provide specific recommendations for:
101
+
102
+ 1. Legislative amendments to close identified loopholes
103
+ 2. Additional definitions or clarifications needed
104
+ 3. Implementation guidelines or regulations
105
+ 4. Monitoring and enforcement mechanisms
106
+
107
+ Prioritize recommendations by impact and feasibility.
108
+ """
109
+
110
+ def load_model(self) -> bool:
111
+ """
112
+ Load the LLM model
113
+
114
+ Returns:
115
+ True if model loaded successfully, False otherwise
116
+ """
117
+ if self.is_loaded:
118
+ return True
119
+
120
+ try:
121
+ print("Loading LLM model...")
122
+
123
+ # Try to load from HuggingFace
124
+ if self.model_config.get('repo_id'):
125
+ self.model = Llama.from_pretrained(
126
+ repo_id=self.model_config['repo_id'],
127
+ filename=self.model_config.get('filename', ''),
128
+ n_ctx=self.model_config.get('context_length', 40960),
129
+ n_threads=min(os.cpu_count(), 8),
130
+ verbose=False,
131
+ n_gpu_layers=-1,
132
+ n_batch=4096,
133
+ logits_all=False,
134
+ use_mlock=True,
135
+ use_mmap=True,
136
+ )
137
+ else:
138
+ # Load from local path
139
+ model_path = self.model_config.get('path', '')
140
+ if not model_path or not os.path.exists(model_path):
141
+ print(f"Model path not found: {model_path}")
142
+ return False
143
+
144
+ self.model = Llama(
145
+ model_path=model_path,
146
+ n_ctx=self.model_config.get('context_length', 40960),
147
+ n_threads=min(os.cpu_count(), 8),
148
+ verbose=False,
149
+ n_gpu_layers=-1,
150
+ n_batch=4096,
151
+ )
152
+
153
+ self.is_loaded = True
154
+ print("Model loaded successfully")
155
+ return True
156
+
157
+ except Exception as e:
158
+ print(f"Error loading model: {e}")
159
+ return False
160
+
161
+ def unload_model(self):
162
+ """Unload the model to free memory"""
163
+ if self.model:
164
+ del self.model
165
+ self.model = None
166
+ self.is_loaded = False
167
+
168
+ def generate_chat_template(self, system_prompt: str, user_message: str = "") -> str:
169
+ """
170
+ Generate a chat template for the model
171
+
172
+ Args:
173
+ system_prompt: The system prompt
174
+ user_message: The user message (optional)
175
+
176
+ Returns:
177
+ Formatted chat template
178
+ """
179
+ chat_messages = []
180
+
181
+ # System message
182
+ if system_prompt:
183
+ chat_messages.append("<|im_start|>system")
184
+ chat_messages.append(system_prompt)
185
+ chat_messages.append("<|im_end|>")
186
+
187
+ # User message
188
+ if user_message:
189
+ chat_messages.append("<|im_start|>user")
190
+ chat_messages.append(user_message)
191
+ chat_messages.append("<|im_end|>")
192
+
193
+ # Assistant message with generation prompt
194
+ chat_messages.append("<|im_start|>assistant")
195
+ chat_messages.append("") # Empty for generation
196
+
197
+ return "\n".join(chat_messages)
198
+
199
+ def analyze_chunk(self, chunk: str, analysis_type: str = 'standard',
200
+ cache_manager = None) -> Dict[str, Any]:
201
+ """
202
+ Analyze a single text chunk for loopholes and ambiguities
203
+
204
+ Args:
205
+ chunk: Text chunk to analyze
206
+ analysis_type: Type of analysis to perform
207
+ cache_manager: Cache manager instance for caching results
208
+
209
+ Returns:
210
+ Analysis results
211
+ """
212
+ if not self.is_loaded and not self.load_model():
213
+ return {
214
+ 'error': 'Model not loaded',
215
+ 'chunk': chunk[:100] + "..." if len(chunk) > 100 else chunk
216
+ }
217
+
218
+ # Check cache first
219
+ if cache_manager:
220
+ cached_result = cache_manager.get(
221
+ chunk,
222
+ self.model_config,
223
+ {'analysis_type': analysis_type}
224
+ )
225
+ if cached_result:
226
+ return cached_result
227
+
228
+ try:
229
+ # Prepare analysis template
230
+ template_config = self.analysis_templates.get(analysis_type, self.analysis_templates['standard'])
231
+
232
+ # Build the full prompt
233
+ reasoning_format = """
234
+ Write your complete analysis between <start_working_out> and <end_working_out>.
235
+
236
+ Then provide your overall conclusion between <SOLUTION> and </SOLUTION>.
237
+ """
238
+
239
+ recommendations_format = """
240
+ **Recommendations**: Provide specific recommendations for addressing identified issues.
241
+ """ if template_config['include_recommendations'] else ""
242
+
243
+ full_prompt = self.prompt_templates['loophole_analysis'].format(
244
+ text=chunk,
245
+ reasoning_format=reasoning_format,
246
+ recommendations_format=recommendations_format,
247
+ depth=template_config['depth'],
248
+ focus_areas=', '.join(template_config['focus_areas'])
249
+ )
250
+
251
+ # Generate chat template
252
+ chat_template = self.generate_chat_template(full_prompt)
253
+
254
+ # Generate response
255
+ response = self._generate_response(chat_template)
256
+
257
+ # Parse and structure the response
258
+ structured_response = self._parse_response(response)
259
+
260
+ # Add metadata
261
+ result = {
262
+ 'chunk': chunk,
263
+ 'analysis_type': analysis_type,
264
+ 'model_config': self.model_config,
265
+ 'response': response,
266
+ 'structured_analysis': structured_response,
267
+ 'processing_time': time.time(),
268
+ 'chunk_size': len(chunk),
269
+ 'word_count': len(chunk.split())
270
+ }
271
+
272
+ # Cache the result
273
+ if cache_manager:
274
+ cache_manager.put(chunk, result, self.model_config, {'analysis_type': analysis_type})
275
+
276
+ return result
277
+
278
+ except Exception as e:
279
+ return {
280
+ 'error': str(e),
281
+ 'chunk': chunk[:100] + "..." if len(chunk) > 100 else chunk
282
+ }
283
+
284
+ def _generate_response(self, prompt: str, max_tokens: int = None) -> str:
285
+ """
286
+ Generate a response from the model
287
+
288
+ Args:
289
+ prompt: Input prompt
290
+ max_tokens: Maximum tokens to generate
291
+
292
+ Returns:
293
+ Generated response
294
+ """
295
+ if max_tokens is None:
296
+ max_tokens = self.model_config.get('max_tokens', 4096)
297
+
298
+ try:
299
+ response = self.model(
300
+ prompt,
301
+ max_tokens=max_tokens,
302
+ temperature=self.model_config.get('temperature', 0.3),
303
+ top_p=self.model_config.get('top_p', 0.85),
304
+ top_k=self.model_config.get('top_k', 50),
305
+ repeat_penalty=self.model_config.get('repeat_penalty', 1.15),
306
+ stop=["<end_working_out>", "</SOLUTION>", "<|im_end|>"],
307
+ echo=False
308
+ )
309
+
310
+ return response['choices'][0]['text'].strip()
311
+
312
+ except Exception as e:
313
+ print(f"Error generating response: {e}")
314
+ return ""
315
+
316
+ def _parse_response(self, response: str) -> Dict[str, Any]:
317
+ """
318
+ Parse the LLM response into structured data
319
+
320
+ Args:
321
+ response: Raw LLM response
322
+
323
+ Returns:
324
+ Structured analysis data
325
+ """
326
+ structured = {
327
+ 'text_meaning': '',
328
+ 'key_assumptions': [],
329
+ 'exploitable_interpretations': [],
330
+ 'critical_loopholes': [],
331
+ 'circumvention_strategies': [],
332
+ 'recommendations': [],
333
+ 'confidence_score': 0,
334
+ 'analysis_quality': 'unknown'
335
+ }
336
+
337
+ try:
338
+ # Extract sections using regex patterns
339
+ patterns = {
340
+ 'text_meaning': r'\*\*Text Meaning\*\*:\s*(.*?)(?=\*\*|$)',
341
+ 'key_assumptions': r'\*\*Key Assumptions\*\*:\s*(.*?)(?=\*\*|$)',
342
+ 'exploitable_interpretations': r'\*\*Exploitable Interpretations\*\*:\s*(.*?)(?=\*\*|$)',
343
+ 'critical_loopholes': r'\*\*Critical Loopholes\*\*:\s*(.*?)(?=\*\*|$)',
344
+ 'circumvention_strategies': r'\*\*Circumvention Strategies\*\*:\s*(.*?)(?=\*\*|$)',
345
+ 'recommendations': r'\*\*Recommendations\*\*:\s*(.*?)(?=\*\*|$)',
346
+ }
347
+
348
+ for key, pattern in patterns.items():
349
+ matches = re.findall(pattern, response, re.DOTALL | re.IGNORECASE)
350
+ if matches:
351
+ content = matches[0].strip()
352
+ if key in ['key_assumptions', 'exploitable_interpretations',
353
+ 'critical_loopholes', 'circumvention_strategies', 'recommendations']:
354
+ # Split into list items
355
+ items = re.findall(r'(?:\d+\.|-|•)\s*(.*?)(?=(?:\d+\.|-|•)|$)',
356
+ content, re.DOTALL)
357
+ structured[key] = [item.strip() for item in items if item.strip()]
358
+ else:
359
+ structured[key] = content
360
+
361
+ # Calculate confidence score based on analysis completeness
362
+ completeness_score = 0
363
+ if structured['text_meaning']:
364
+ completeness_score += 20
365
+ for key in ['key_assumptions', 'exploitable_interpretations',
366
+ 'critical_loopholes', 'circumvention_strategies']:
367
+ if structured[key]:
368
+ completeness_score += 20
369
+
370
+ structured['confidence_score'] = min(100, completeness_score)
371
+
372
+ # Determine analysis quality
373
+ if structured['confidence_score'] >= 80:
374
+ structured['analysis_quality'] = 'high'
375
+ elif structured['confidence_score'] >= 60:
376
+ structured['analysis_quality'] = 'medium'
377
+ else:
378
+ structured['analysis_quality'] = 'low'
379
+
380
+ except Exception as e:
381
+ print(f"Error parsing response: {e}")
382
+ structured['error'] = str(e)
383
+
384
+ return structured
385
+
386
+ def batch_analyze_chunks(self, chunks: List[str], analysis_type: str = 'standard',
387
+ cache_manager = None, progress_callback = None) -> List[Dict[str, Any]]:
388
+ """
389
+ Analyze multiple chunks in batch
390
+
391
+ Args:
392
+ chunks: List of text chunks to analyze
393
+ analysis_type: Type of analysis to perform
394
+ cache_manager: Cache manager instance
395
+ progress_callback: Callback function for progress updates
396
+
397
+ Returns:
398
+ List of analysis results
399
+ """
400
+ results = []
401
+ total_chunks = len(chunks)
402
+
403
+ for i, chunk in enumerate(chunks):
404
+ if progress_callback:
405
+ progress = (i + 1) / total_chunks
406
+ progress_callback(progress, f"Analyzing chunk {i + 1}/{total_chunks}")
407
+
408
+ result = self.analyze_chunk(chunk, analysis_type, cache_manager)
409
+ results.append(result)
410
+
411
+ return results
412
+
413
+ def get_model_info(self) -> Dict[str, Any]:
414
+ """Get information about the loaded model"""
415
+ if not self.is_loaded:
416
+ return {'status': 'not_loaded'}
417
+
418
+ try:
419
+ return {
420
+ 'status': 'loaded',
421
+ 'config': self.model_config,
422
+ 'model_type': type(self.model).__name__,
423
+ 'context_length': self.model_config.get('context_length', 'unknown'),
424
+ 'vocab_size': getattr(self.model, 'vocab_size', 'unknown')
425
+ }
426
+ except Exception as e:
427
+ return {
428
+ 'status': 'error',
429
+ 'error': str(e)
430
+ }
431
+
432
+ def validate_model_config(self) -> Dict[str, Any]:
433
+ """Validate the current model configuration"""
434
+ validation = {
435
+ 'is_valid': True,
436
+ 'issues': [],
437
+ 'warnings': []
438
+ }
439
+
440
+ # Check required parameters
441
+ required_params = ['context_length', 'max_tokens']
442
+ for param in required_params:
443
+ if param not in self.model_config:
444
+ validation['issues'].append(f"Missing required parameter: {param}")
445
+ validation['is_valid'] = False
446
+
447
+ # Check parameter ranges
448
+ if 'context_length' in self.model_config:
449
+ if self.model_config['context_length'] < 1024:
450
+ validation['issues'].append("Context length too small (minimum: 1024)")
451
+ validation['is_valid'] = False
452
+
453
+ if 'max_tokens' in self.model_config:
454
+ if self.model_config['max_tokens'] < 64:
455
+ validation['issues'].append("Max tokens too small (minimum: 64)")
456
+ validation['is_valid'] = False
457
+
458
+ if 'temperature' in self.model_config:
459
+ temp = self.model_config['temperature']
460
+ if not (0 <= temp <= 2):
461
+ validation['issues'].append("Temperature out of valid range (0-2)")
462
+ validation['is_valid'] = False
463
+
464
+ # Check model path/file
465
+ if 'path' in self.model_config and self.model_config['path']:
466
+ if not os.path.exists(self.model_config['path']):
467
+ validation['warnings'].append(f"Model file not found: {self.model_config['path']}")
468
+
469
+ return validation
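
A minimal usage sketch for the analyzer above (repo_id and filename are placeholders, a real GGUF model must be reachable for load_model() to succeed, and the import paths assume this upload's package layout):

# Illustrative only: wires the LLMAnalyzer to the cache manager and analyzes one chunk.
from streamlit_app.core.llm_analyzer import LLMAnalyzer
from streamlit_app.core.cache_manager import get_cache_manager

model_config = {
    "repo_id": "your-org/your-gguf-model",  # placeholder HuggingFace repo
    "filename": "model-q4_k_m.gguf",        # placeholder GGUF filename
    "context_length": 4096,
    "max_tokens": 1024,
    "temperature": 0.3,
}

analyzer = LLMAnalyzer(model_config)
print(analyzer.validate_model_config())     # sanity-check the config before loading

if analyzer.load_model():
    cache = get_cache_manager(persistent=False)
    result = analyzer.analyze_chunk("Example section text.", "standard", cache)
    print(result.get("structured_analysis", result))
    analyzer.unload_model()
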
streamlit_app/core/text_processor.py ADDED
@@ -0,0 +1,377 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Text Processor
4
+
5
+ Handles text cleaning, chunking, and preprocessing for the NZ Legislation Loophole Analysis.
6
+ Optimized for legal/legislative content with specialized cleaning and structuring.
7
+ """
8
+
9
+ import re
10
+ from typing import List, Dict, Any, Optional, Tuple
11
+ import hashlib
12
+ import json
13
+
14
+ class TextProcessor:
15
+ """Advanced text processing for legislation analysis"""
16
+
17
+ def __init__(self):
18
+ """Initialize the text processor with legal-specific patterns"""
19
+ # Legal-specific patterns
20
+ self.section_pattern = re.compile(r'^(\d+):', re.MULTILINE)
21
+ self.act_name_pattern = re.compile(r'(\b\w+(?:\s+\w+)*)\s+Act\s+(\d{4})', re.IGNORECASE)
22
+ self.date_patterns = [
23
+ (r'(\d{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})',
24
+ lambda m: f"{m.group(1)} {m.group(2)}"),
25
+ (r'(\d{1,2})/(\d{1,2})/(\d{4})', r'\1/\2/\3'),
26
+ (r'(\d{4})-(\d{2})-(\d{2})', r'\1-\2-\3')
27
+ ]
28
+
29
+ # NZ-specific legal terms
30
+ self.nz_terms = {
31
+ 'New Zealand': 'New Zealand',
32
+ 'Parliament': 'Parliament',
33
+ 'Crown': 'Crown',
34
+ 'Government': 'Government',
35
+ 'Treaty of Waitangi': 'Treaty of Waitangi',
36
+ 'NZB': 'NZB',
37
+ 'Her Majesty': 'Her Majesty',
38
+ 'Governor-General': 'Governor-General'
39
+ }
40
+
41
+ def clean_text(self, text: str, preserve_structure: bool = True) -> str:
42
+ """
43
+ Clean and normalize text for better processing, optimized for legal content
44
+
45
+ Args:
46
+ text: Raw text to clean
47
+ preserve_structure: Whether to preserve legal document structure
48
+
49
+ Returns:
50
+ Cleaned text
51
+ """
52
+ if not text:
53
+ return ""
54
+
55
+ # Preserve section numbers and legal structure if requested
56
+ if preserve_structure:
57
+ # Keep section numbers like "1:", "2:", etc.
58
+ text = self.section_pattern.sub(r'\1:', text)  # keep the trailing colon so section headers stay detectable downstream
59
+
60
+ # Remove excessive whitespace but preserve paragraph structure
61
+ text = re.sub(r'[ \t]+', ' ', text) # Replace multiple spaces/tabs with single space
62
+ text = re.sub(r'\n\s*\n', '\n\n', text) # Preserve paragraph breaks but clean up
63
+ text = re.sub(r'\n{3,}', '\n\n', text) # Reduce excessive newlines to double
64
+
65
+ # Remove control characters but preserve legal formatting
66
+ text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)
67
+
68
+ # Handle legal-specific characters and formatting
69
+ allowed_chars = r'\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/\@\#\$\%\^\&\*\+\=\~\`°§'
70
+ text = re.sub(r'[^' + allowed_chars + ']', '', text)
71
+
72
+ # Normalize quotes and apostrophes for legal text
73
+ text = re.sub(r'[“”]', '"', text)
74
+ text = re.sub(r"[‘’]", "'", text)
75
+ text = re.sub(r'`', "'", text)
76
+
77
+ # Clean up legal numbering and references
78
+ text = re.sub(r'section\s+(\d+)', r'section \1', text, flags=re.IGNORECASE)
79
+
80
+ # Normalize date formats
81
+ for pattern, replacement in self.date_patterns:
82
+ if callable(replacement):
83
+ text = re.compile(pattern, re.IGNORECASE).sub(replacement, text)
84
+ else:
85
+ text = re.compile(pattern, re.IGNORECASE).sub(replacement, text)
86
+
87
+ # Normalize act names with years
88
+ text = self.act_name_pattern.sub(r'\1 Act', text)
89
+
90
+ # Clean up amendment references
91
+ text = re.sub(r'[Aa]mendment\(s\)\s+incorporated\s+in\s+the\s+[Aa]ct\(s\)', 'Amendments incorporated', text)
92
+
93
+ # Normalize section references
94
+ text = re.sub(r'section\s+\d+\(\d+\)\([a-zA-Z]\)', lambda m: m.group(0).lower(), text)
95
+
96
+ # Generic pattern for legal document sections
97
+ text = re.sub(r'(\b(?:section|part|chapter|article|clause|subsection|paragraph))\s+(\d+[a-zA-Z]*)',
98
+ lambda m: f"{m.group(1)} {m.group(2)}", text, flags=re.IGNORECASE)
99
+
100
+ # NZ-specific legal enhancements
101
+ for term, normalized in self.nz_terms.items():
102
+ text = re.sub(re.escape(term), normalized, text, flags=re.IGNORECASE)
103
+
104
+ # Handle Maori-specific characters if present
105
+ maori_chars = 'āēīōūĀĒĪŌŪwhWH'
106
+ allowed_chars += maori_chars
107
+ text = re.sub(r'[^' + allowed_chars + ']', '', text)
108
+
109
+ # Remove empty lines and trim while preserving legal structure
110
+ lines = []
111
+ for line in text.split('\n'):
112
+ stripped = line.strip()
113
+ if stripped: # Keep non-empty lines
114
+ if preserve_structure and re.match(r'^\d+:', stripped):
115
+ lines.append(stripped) # Preserve section headers
116
+ else:
117
+ lines.append(stripped)
118
+
119
+ text = '\n'.join(lines)
120
+
121
+ return text.strip()
122
+
123
+ def chunk_text(self, text: str, chunk_size: int = 4096, overlap: int = 256,
124
+ method: str = "sentence") -> List[str]:
125
+ """
126
+ Split text into overlapping chunks for processing
127
+
128
+ Args:
129
+ text: Text to chunk
130
+ chunk_size: Size of each chunk
131
+ overlap: Overlap between chunks
132
+ method: Chunking method ('sentence', 'word', 'character')
133
+
134
+ Returns:
135
+ List of text chunks
136
+ """
137
+ if not text or len(text) <= chunk_size:
138
+ return [text] if text else []
139
+
140
+ chunks = []
141
+
142
+ if method == "sentence":
143
+ chunks = self._chunk_by_sentence(text, chunk_size, overlap)
144
+ elif method == "word":
145
+ chunks = self._chunk_by_word(text, chunk_size, overlap)
146
+ else: # character
147
+ chunks = self._chunk_by_character(text, chunk_size, overlap)
148
+
149
+ return chunks
150
+
151
+ def _chunk_by_sentence(self, text: str, chunk_size: int, overlap: int) -> List[str]:
152
+ """Chunk text by sentence boundaries"""
153
+ # Split into sentences (rough approximation)
154
+ sentence_pattern = r'(?<=[.!?])\s+'
155
+ sentences = re.split(sentence_pattern, text)
156
+
157
+ chunks = []
158
+ current_chunk = ""
159
+ overlap_text = ""
160
+
161
+ for sentence in sentences:
162
+ if not sentence.strip():
163
+ continue
164
+
165
+ # Check if adding this sentence would exceed chunk size
166
+ potential_chunk = current_chunk + sentence + " "
167
+
168
+ if len(potential_chunk) > chunk_size and current_chunk:
169
+ # Save current chunk
170
+ chunks.append(current_chunk.strip())
171
+
172
+ # Start new chunk with overlap
173
+ if overlap > 0 and len(current_chunk) > overlap:
174
+ overlap_text = current_chunk[-overlap:].strip()
175
+ current_chunk = overlap_text + " " + sentence + " "
176
+ else:
177
+ current_chunk = sentence + " "
178
+ else:
179
+ current_chunk = potential_chunk
180
+
181
+ # Add the last chunk
182
+ if current_chunk.strip():
183
+ chunks.append(current_chunk.strip())
184
+
185
+ return chunks
186
+
187
+ def _chunk_by_word(self, text: str, chunk_size: int, overlap: int) -> List[str]:
188
+ """Chunk text by word boundaries"""
189
+ words = text.split()
190
+ chunks = []
191
+
192
+ if not words:
193
+ return []
194
+
195
+ start = 0
196
+ while start < len(words):
197
+ end = start + 1
198
+ chunk_words = []
199
+
200
+ # Build chunk up to chunk_size
201
+ while end <= len(words):
202
+ potential_chunk = " ".join(words[start:end])
203
+ if len(potential_chunk) > chunk_size:
204
+ break
205
+ chunk_words = words[start:end]
206
+ end += 1
207
+
208
+ if chunk_words:
209
+ chunk = " ".join(chunk_words)
210
+ chunks.append(chunk)
211
+
212
+ # Move start position with overlap
213
+ overlap_words = max(0, min(overlap // 5, len(chunk_words))) # Rough word overlap
214
+ start = max(start + 1, end - overlap_words)
215
+ else:
216
+ break
217
+
218
+ return chunks
219
+
220
+ def _chunk_by_character(self, text: str, chunk_size: int, overlap: int) -> List[str]:
221
+ """Chunk text by character count (simple fallback)"""
222
+ chunks = []
223
+ start = 0
224
+
225
+ while start < len(text):
226
+ end = min(start + chunk_size, len(text))
227
+ chunk = text[start:end]
228
+ chunks.append(chunk)
229
+
230
+ # Move start with overlap
231
+ start = end - overlap if end < len(text) else len(text)
232
+
233
+ return chunks
234
+
235
+ def extract_metadata(self, text: str) -> Dict[str, Any]:
236
+ """Extract metadata from legislation text"""
237
+ metadata = {
238
+ 'sections': [],
239
+ 'acts_referenced': [],
240
+ 'dates': [],
241
+ 'word_count': len(text.split()),
242
+ 'character_count': len(text),
243
+ 'has_nz_references': False,
244
+ 'has_maori_terms': False
245
+ }
246
+
247
+ # Extract section numbers
248
+ sections = self.section_pattern.findall(text)
249
+ metadata['sections'] = [int(s) for s in sections]
250
+
251
+ # Extract referenced acts
252
+ acts = self.act_name_pattern.findall(text)
253
+ metadata['acts_referenced'] = [f"{act[0]} Act" for act in acts]
254
+
255
+ # Check for NZ-specific references
256
+ nz_indicators = ['New Zealand', 'Parliament', 'Crown', 'Government', 'Treaty of Waitangi']
257
+ metadata['has_nz_references'] = any(term in text for term in nz_indicators)
258
+
259
+ # Check for Maori terms
260
+ maori_indicators = ['ā', 'ē', 'ī', 'ō', 'ū', 'whakapapa', 'tangata whenua', 'mana']
261
+ metadata['has_maori_terms'] = any(term in text.lower() for term in maori_indicators)
262
+
263
+ # Extract dates (basic)
264
+ date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b'
265
+ dates = re.findall(date_pattern, text)
266
+ metadata['dates'] = dates
267
+
268
+ return metadata
269
+
270
+ def calculate_text_hash(self, text: str) -> str:
271
+ """Calculate SHA-256 hash of text for caching"""
272
+ return hashlib.sha256(text.encode('utf-8')).hexdigest()
273
+
274
+ def get_chunk_statistics(self, chunks: List[str]) -> Dict[str, Any]:
275
+ """Get statistics about text chunks"""
276
+ if not chunks:
277
+ return {
278
+ 'total_chunks': 0,
279
+ 'avg_chunk_size': 0,
280
+ 'min_chunk_size': 0,
281
+ 'max_chunk_size': 0,
282
+ 'total_characters': 0
283
+ }
284
+
285
+ chunk_sizes = [len(chunk) for chunk in chunks]
286
+
287
+ return {
288
+ 'total_chunks': len(chunks),
289
+ 'avg_chunk_size': sum(chunk_sizes) / len(chunks),
290
+ 'min_chunk_size': min(chunk_sizes),
291
+ 'max_chunk_size': max(chunk_sizes),
292
+ 'total_characters': sum(chunk_sizes)
293
+ }
294
+
295
+ def preprocess_legislation_json(self, json_data: Dict[str, Any]) -> Dict[str, Any]:
296
+ """Preprocess legislation data from JSON format"""
297
+ processed = {
298
+ 'id': json_data.get('id', ''),
299
+ 'title': json_data.get('title', ''),
300
+ 'year': json_data.get('year', ''),
301
+ 'source': json_data.get('source', ''),
302
+ 'original_text': json_data.get('text', ''),
303
+ 'cleaned_text': '',
304
+ 'chunks': [],
305
+ 'metadata': {},
306
+ 'processing_stats': {}
307
+ }
308
+
309
+ # Clean the text
310
+ raw_text = json_data.get('text', '')
311
+ processed['cleaned_text'] = self.clean_text(raw_text)
312
+
313
+ # Extract metadata
314
+ processed['metadata'] = self.extract_metadata(processed['cleaned_text'])
315
+
316
+ return processed
317
+
318
+ def batch_process_texts(self, texts: List[str], chunk_size: int = 4096,
319
+ overlap: int = 256) -> List[Dict[str, Any]]:
320
+ """Process multiple texts in batch"""
321
+ results = []
322
+
323
+ for text in texts:
324
+ cleaned = self.clean_text(text)
325
+ chunks = self.chunk_text(cleaned, chunk_size, overlap)
326
+ metadata = self.extract_metadata(cleaned)
327
+ stats = self.get_chunk_statistics(chunks)
328
+
329
+ result = {
330
+ 'original_text': text,
331
+ 'cleaned_text': cleaned,
332
+ 'chunks': chunks,
333
+ 'metadata': metadata,
334
+ 'processing_stats': stats
335
+ }
336
+
337
+ results.append(result)
338
+
339
+ return results
340
+
341
+ def validate_text_quality(self, text: str) -> Dict[str, Any]:
342
+ """Validate and assess text quality for processing"""
343
+ quality = {
344
+ 'is_valid': True,
345
+ 'issues': [],
346
+ 'score': 100,
347
+ 'metrics': {}
348
+ }
349
+
350
+ # Check minimum length
351
+ if len(text.strip()) < 10:
352
+ quality['issues'].append("Text too short")
353
+ quality['score'] -= 50
354
+
355
+ # Check for excessive special characters
356
+ special_chars = len(re.findall(r'[^\w\s]', text))
357
+ special_ratio = special_chars / len(text) if text else 0
358
+ if special_ratio > 0.3:
359
+ quality['issues'].append("High special character ratio")
360
+ quality['score'] -= 20
361
+
362
+ # Check for legal content indicators
363
+ legal_indicators = ['section', 'act', 'law', 'regulation', 'clause', 'subsection']
364
+ has_legal_content = any(indicator in text.lower() for indicator in legal_indicators)
365
+ if not has_legal_content:
366
+ quality['issues'].append("May not be legal content")
367
+ quality['score'] -= 30
368
+
369
+ quality['is_valid'] = len(quality['issues']) == 0
370
+ quality['metrics'] = {
371
+ 'length': len(text),
372
+ 'word_count': len(text.split()),
373
+ 'special_char_ratio': special_ratio,
374
+ 'has_legal_content': has_legal_content
375
+ }
376
+
377
+ return quality
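
A minimal usage sketch of the TextProcessor defined above; clean_text, chunk_text, extract_metadata, get_chunk_statistics and validate_text_quality are as implemented in this file, and the sample string is illustrative only.

    from streamlit_app.core.text_processor import TextProcessor

    processor = TextProcessor()
    sample = "1: This section applies to employers in New Zealand under the Employment Relations Act 2000."
    cleaned = processor.clean_text(sample, preserve_structure=True)
    chunks = processor.chunk_text(cleaned, chunk_size=4096, overlap=256, method="sentence")

    print(processor.get_chunk_statistics(chunks))    # chunk counts and size statistics
    print(processor.extract_metadata(cleaned))       # sections, acts referenced, NZ/Maori indicators
    print(processor.validate_text_quality(cleaned))  # heuristic quality score and any issues
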
streamlit_app/utils/__pycache__/config.cpython-312.pyc ADDED
Binary file (11.8 kB).
 
streamlit_app/utils/__pycache__/performance.cpython-312.pyc ADDED
Binary file (13.7 kB).
 
streamlit_app/utils/__pycache__/ui_helpers.cpython-312.pyc ADDED
Binary file (21.1 kB).
 
streamlit_app/utils/config.py ADDED
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Configuration Manager
4
+
5
+ Handles all configuration settings for the NZ Legislation Loophole Analysis application.
6
+ Provides default configurations, persistent storage, and validation.
7
+ """
8
+
9
+ import json
10
+ import os
11
+ from pathlib import Path
12
+ from typing import Dict, Any, Optional
13
+ import streamlit as st
14
+
15
+ class ConfigManager:
16
+ """Configuration manager for the application"""
17
+
18
+ def __init__(self, config_file: str = None):
19
+ """
20
+ Initialize configuration manager
21
+
22
+ Args:
23
+ config_file: Path to configuration file (optional)
24
+ """
25
+ if config_file is None:
26
+ config_dir = Path(__file__).parent.parent / 'config'
27
+ config_dir.mkdir(exist_ok=True)
28
+ config_file = config_dir / 'app_config.json'
29
+
30
+ self.config_file = Path(config_file)
31
+ self._config = {}
32
+ self._load_config()
33
+
34
+ def _load_config(self):
35
+ """Load configuration from file or use defaults"""
36
+ if self.config_file.exists():
37
+ try:
38
+ with open(self.config_file, 'r', encoding='utf-8') as f:
39
+ self._config = json.load(f)
40
+ # Validate and merge with defaults
41
+ self._config = self._merge_with_defaults(self._config)
42
+ except (json.JSONDecodeError, IOError) as e:
43
+ print(f"Warning: Could not load config file: {e}")
44
+ self._config = self._get_default_config()
45
+ else:
46
+ self._config = self._get_default_config()
47
+
48
+ def _get_default_config(self) -> Dict[str, Any]:
49
+ """Get default configuration"""
50
+ return {
51
+ 'model': {
52
+ 'path': 'qwen3.gguf',
53
+ 'repo_id': 'DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF',
54
+ 'filename': 'Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-D_AU-IQ4_XS-imat.gguf',
55
+ 'context_length': 40960,
56
+ 'max_tokens': 4096,
57
+ 'temperature': 0.3,
58
+ 'top_p': 0.85,
59
+ 'top_k': 50,
60
+ 'repeat_penalty': 1.15
61
+ },
62
+ 'processing': {
63
+ 'chunk_size': 4096,
64
+ 'chunk_overlap': 256,
65
+ 'batch_size': 16,
66
+ 'clean_text': True,
67
+ 'preserve_structure': True
68
+ },
69
+ 'cache': {
70
+ 'enabled': True,
71
+ 'max_size_mb': 1024,
72
+ 'ttl_hours': 24,
73
+ 'persistent': True
74
+ },
75
+ 'analysis': {
76
+ 'depth': 'Standard',
77
+ 'include_recommendations': True,
78
+ 'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences'],
79
+ 'legal_domains': ['constitutional', 'administrative', 'criminal', 'civil']
80
+ },
81
+ 'ui': {
82
+ 'theme': 'Auto',
83
+ 'show_progress': True,
84
+ 'auto_refresh': False,
85
+ 'max_display_items': 50
86
+ },
87
+ 'advanced': {
88
+ 'debug_mode': False,
89
+ 'log_level': 'INFO',
90
+ 'memory_limit_mb': 8192,
91
+ 'thread_pool_size': 4,
92
+ 'save_intermediate_results': True
93
+ }
94
+ }
95
+
96
+ def _merge_with_defaults(self, user_config: Dict[str, Any]) -> Dict[str, Any]:
97
+ """Merge user configuration with defaults"""
98
+ default_config = self._get_default_config()
99
+
100
+ def merge_dicts(default: Dict[str, Any], user: Dict[str, Any]) -> Dict[str, Any]:
101
+ merged = default.copy()
102
+ for key, value in user.items():
103
+ if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
104
+ merged[key] = merge_dicts(merged[key], value)
105
+ else:
106
+ merged[key] = value
107
+ return merged
108
+
109
+ return merge_dicts(default_config, user_config)
110
+
111
+ def get_config(self) -> Dict[str, Any]:
112
+ """Get current configuration"""
113
+ return self._config.copy()
114
+
115
+ def update_config(self, new_config: Dict[str, Any]):
116
+ """Update configuration with validation"""
117
+ # Validate configuration
118
+ if self._validate_config(new_config):
119
+ self._config = self._merge_with_defaults(new_config)
120
+ self._save_config()
121
+ else:
122
+ raise ValueError("Invalid configuration provided")
123
+
124
+ def _validate_config(self, config: Dict[str, Any]) -> bool:
125
+ """Validate configuration values"""
126
+ try:
127
+ # Model validation
128
+ model_config = config.get('model', {})
129
+ if model_config.get('context_length', 0) < 1024:
130
+ return False
131
+ if model_config.get('max_tokens', 0) < 64:
132
+ return False
133
+ if not (0 <= model_config.get('temperature', 0) <= 2):
134
+ return False
135
+
136
+ # Processing validation
137
+ proc_config = config.get('processing', {})
138
+ if proc_config.get('chunk_size', 0) < 256:
139
+ return False
140
+ if proc_config.get('chunk_overlap', 0) >= proc_config.get('chunk_size', 1):
141
+ return False
142
+ if proc_config.get('batch_size', 0) < 1:
143
+ return False
144
+
145
+ # Cache validation
146
+ cache_config = config.get('cache', {})
147
+ if cache_config.get('max_size_mb', 0) < 100:
148
+ return False
149
+ if cache_config.get('ttl_hours', 0) < 1:
150
+ return False
151
+
152
+ return True
153
+ except Exception:
154
+ return False
155
+
156
+ def _save_config(self):
157
+ """Save configuration to file"""
158
+ try:
159
+ self.config_file.parent.mkdir(exist_ok=True)
160
+ with open(self.config_file, 'w', encoding='utf-8') as f:
161
+ json.dump(self._config, f, indent=2, ensure_ascii=False)
162
+ except IOError as e:
163
+ print(f"Warning: Could not save config file: {e}")
164
+
165
+ def reset_to_defaults(self):
166
+ """Reset configuration to defaults"""
167
+ self._config = self._get_default_config()
168
+ self._save_config()
169
+
170
+ def get_section(self, section: str) -> Dict[str, Any]:
171
+ """Get a specific configuration section"""
172
+ return self._config.get(section, {})
173
+
174
+ def update_section(self, section: str, values: Dict[str, Any]):
175
+ """Update a specific configuration section"""
176
+ if section not in self._config:
177
+ self._config[section] = {}
178
+
179
+ self._config[section].update(values)
180
+
181
+ # Validate the updated config
182
+ if self._validate_config(self._config):
183
+ self._save_config()
184
+ else:
185
+ raise ValueError(f"Invalid configuration for section: {section}")
186
+
187
+ def export_config(self, filepath: str) -> bool:
188
+ """Export configuration to file"""
189
+ try:
190
+ with open(filepath, 'w', encoding='utf-8') as f:
191
+ json.dump(self._config, f, indent=2, ensure_ascii=False)
192
+ return True
193
+ except IOError:
194
+ return False
195
+
196
+ def import_config(self, filepath: str) -> bool:
197
+ """Import configuration from file"""
198
+ try:
199
+ with open(filepath, 'r', encoding='utf-8') as f:
200
+ imported_config = json.load(f)
201
+
202
+ if self._validate_config(imported_config):
203
+ self._config = self._merge_with_defaults(imported_config)
204
+ self._save_config()
205
+ return True
206
+ else:
207
+ return False
208
+ except (IOError, json.JSONDecodeError):
209
+ return False
210
+
211
+ def get_model_config(self) -> Dict[str, Any]:
212
+ """Get model-specific configuration"""
213
+ return self._config.get('model', {})
214
+
215
+ def get_processing_config(self) -> Dict[str, Any]:
216
+ """Get processing-specific configuration"""
217
+ return self._config.get('processing', {})
218
+
219
+ def get_cache_config(self) -> Dict[str, Any]:
220
+ """Get cache-specific configuration"""
221
+ return self._config.get('cache', {})
222
+
223
+ def get_ui_config(self) -> Dict[str, Any]:
224
+ """Get UI-specific configuration"""
225
+ return self._config.get('ui', {})
226
+
227
+ def get_advanced_config(self) -> Dict[str, Any]:
228
+ """Get advanced configuration"""
229
+ return self._config.get('advanced', {})
230
+
231
+ # Global configuration instance
232
+ _config_instance = None
233
+
234
+ def get_config_manager(config_file: str = None) -> ConfigManager:
235
+ """Get or create global configuration manager instance"""
236
+ global _config_instance
237
+
238
+ if _config_instance is None:
239
+ _config_instance = ConfigManager(config_file)
240
+
241
+ return _config_instance
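
A minimal usage sketch for the configuration API above; the printed values are the defaults from _get_default_config(), and invalid updates are rejected by _validate_config.

    from streamlit_app.utils.config import get_config_manager

    config = get_config_manager()            # creates streamlit_app/config/app_config.json on first use
    model_cfg = config.get_model_config()
    print(model_cfg['context_length'], model_cfg['temperature'])   # 40960, 0.3 by default

    # Tighten chunking; out-of-range values raise ValueError via update_section()
    config.update_section('processing', {'chunk_size': 2048, 'chunk_overlap': 128})
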
streamlit_app/utils/performance.py ADDED
@@ -0,0 +1,271 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Performance Monitor
4
+
5
+ Monitors system performance metrics for the NZ Legislation Loophole Analysis application.
6
+ Tracks memory usage, CPU utilization, processing times, and other performance indicators.
7
+ """
8
+
9
+ import time
10
+ import threading
11
+ import platform
+ import psutil
12
+ from typing import Dict, Any, Optional, List
13
+ from collections import deque
14
+ import streamlit as st
15
+
16
+ class PerformanceMonitor:
17
+ """Performance monitoring system"""
18
+
19
+ def __init__(self, max_history: int = 1000):
20
+ """
21
+ Initialize performance monitor
22
+
23
+ Args:
24
+ max_history: Maximum number of historical data points to keep
25
+ """
26
+ self.max_history = max_history
27
+ self.lock = threading.RLock()
28
+
29
+ # Historical data storage
30
+ self.memory_history = deque(maxlen=max_history)
31
+ self.cpu_history = deque(maxlen=max_history)
32
+ self.processing_times = deque(maxlen=max_history)
33
+
34
+ # Current metrics
35
+ self.current_metrics = {
36
+ 'memory_usage_mb': 0,
37
+ 'memory_percent': 0,
38
+ 'cpu_percent': 0,
39
+ 'active_threads': 0,
40
+ 'processing_time_avg': 0,
41
+ 'processing_time_max': 0,
42
+ 'processing_time_min': 0,
43
+ 'total_processed_chunks': 0,
44
+ 'chunks_per_second': 0
45
+ }
46
+
47
+ # Processing timing
48
+ self.processing_start_time = None
49
+ self.last_chunk_time = time.time()
50
+
51
+ # Start monitoring thread
52
+ self.monitoring = True
53
+ self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
54
+ self.monitor_thread.start()
55
+
56
+ def _monitor_loop(self):
57
+ """Background monitoring loop"""
58
+ while self.monitoring:
59
+ try:
60
+ self._update_metrics()
61
+ time.sleep(1) # Update every second
62
+ except Exception as e:
63
+ print(f"Performance monitoring error: {e}")
64
+ time.sleep(5) # Wait longer on error
65
+
66
+ def _update_metrics(self):
67
+ """Update current performance metrics"""
68
+ process = psutil.Process()
69
+
70
+ with self.lock:
71
+ # Memory metrics
72
+ memory_info = process.memory_info()
73
+ memory_usage_mb = memory_info.rss / 1024 / 1024
74
+ memory_percent = process.memory_percent()
75
+
76
+ # CPU metrics
77
+ cpu_percent = process.cpu_percent(interval=0.1)
78
+
79
+ # Thread count
80
+ active_threads = len(process.threads())
81
+
82
+ # Update current metrics
83
+ self.current_metrics.update({
84
+ 'memory_usage_mb': memory_usage_mb,
85
+ 'memory_percent': memory_percent,
86
+ 'cpu_percent': cpu_percent,
87
+ 'active_threads': active_threads
88
+ })
89
+
90
+ # Store historical data
91
+ current_time = time.time()
92
+ self.memory_history.append((current_time, memory_usage_mb))
93
+ self.cpu_history.append((current_time, cpu_percent))
94
+
95
+ def start_processing_timer(self):
96
+ """Start timing a processing operation"""
97
+ self.processing_start_time = time.time()
98
+
99
+ def end_processing_timer(self) -> float:
100
+ """End timing and return elapsed time"""
101
+ if self.processing_start_time is None:
102
+ return 0
103
+
104
+ elapsed = time.time() - self.processing_start_time
105
+ self.processing_start_time = None
106
+
107
+ with self.lock:
108
+ self.processing_times.append(elapsed)
109
+
110
+ # Update processing time statistics
111
+ if self.processing_times:
112
+ self.current_metrics['processing_time_avg'] = sum(self.processing_times) / len(self.processing_times)
113
+ self.current_metrics['processing_time_max'] = max(self.processing_times)
114
+ self.current_metrics['processing_time_min'] = min(self.processing_times)
115
+
116
+ return elapsed
117
+
118
+ def record_chunk_processing(self):
119
+ """Record that a chunk has been processed"""
120
+ current_time = time.time()
121
+
122
+ with self.lock:
123
+ self.current_metrics['total_processed_chunks'] += 1
124
+
125
+ # Calculate chunks per second
126
+ time_diff = current_time - self.last_chunk_time
127
+ if time_diff > 0:
128
+ current_cps = 1.0 / time_diff
129
+ # Smooth the chunks per second calculation
130
+ self.current_metrics['chunks_per_second'] = (
131
+ 0.9 * self.current_metrics['chunks_per_second'] + 0.1 * current_cps
132
+ )
133
+
134
+ self.last_chunk_time = current_time
135
+
136
+ def get_stats(self) -> Dict[str, Any]:
137
+ """Get current performance statistics"""
138
+ with self.lock:
139
+ return self.current_metrics.copy()
140
+
141
+ def get_memory_history(self, time_window_seconds: int = 300) -> List[tuple]:
142
+ """Get memory usage history within time window"""
143
+ current_time = time.time()
144
+ cutoff_time = current_time - time_window_seconds
145
+
146
+ with self.lock:
147
+ return [(t, v) for t, v in self.memory_history if t >= cutoff_time]
148
+
149
+ def get_cpu_history(self, time_window_seconds: int = 300) -> List[tuple]:
150
+ """Get CPU usage history within time window"""
151
+ current_time = time.time()
152
+ cutoff_time = current_time - time_window_seconds
153
+
154
+ with self.lock:
155
+ return [(t, v) for t, v in self.cpu_history if t >= cutoff_time]
156
+
157
+ def get_processing_time_stats(self) -> Dict[str, Any]:
158
+ """Get processing time statistics"""
159
+ with self.lock:
160
+ if not self.processing_times:
161
+ return {
162
+ 'count': 0,
163
+ 'average': 0,
164
+ 'maximum': 0,
165
+ 'minimum': 0,
166
+ 'median': 0
167
+ }
168
+
169
+ sorted_times = sorted(self.processing_times)
170
+
171
+ return {
172
+ 'count': len(self.processing_times),
173
+ 'average': sum(self.processing_times) / len(self.processing_times),
174
+ 'maximum': max(self.processing_times),
175
+ 'minimum': min(self.processing_times),
176
+ 'median': sorted_times[len(sorted_times) // 2]
177
+ }
178
+
179
+ def get_system_info(self) -> Dict[str, Any]:
180
+ """Get system information"""
181
+ return {
182
+ 'cpu_count': psutil.cpu_count(logical=False),
183
+ 'cpu_count_logical': psutil.cpu_count(logical=True),
184
+ 'total_memory_gb': psutil.virtual_memory().total / (1024**3),
185
+ 'available_memory_gb': psutil.virtual_memory().available / (1024**3),
186
+ 'python_version': f"{platform.python_implementation()} {platform.python_version()}",
187
+ 'platform': platform.platform()
188
+ }
189
+
190
+ def reset_stats(self):
191
+ """Reset performance statistics"""
192
+ with self.lock:
193
+ self.processing_times.clear()
194
+ self.current_metrics['total_processed_chunks'] = 0
195
+ self.current_metrics['chunks_per_second'] = 0
196
+ self.current_metrics['processing_time_avg'] = 0
197
+ self.current_metrics['processing_time_max'] = 0
198
+ self.current_metrics['processing_time_min'] = 0
199
+
200
+ def cleanup(self):
201
+ """Cleanup resources"""
202
+ self.monitoring = False
203
+ if self.monitor_thread.is_alive():
204
+ self.monitor_thread.join(timeout=2)
205
+
206
+ def get_performance_report(self) -> Dict[str, Any]:
207
+ """Generate a comprehensive performance report"""
208
+ return {
209
+ 'current_metrics': self.get_stats(),
210
+ 'processing_stats': self.get_processing_time_stats(),
211
+ 'system_info': self.get_system_info(),
212
+ 'memory_history_count': len(self.memory_history),
213
+ 'cpu_history_count': len(self.cpu_history),
214
+ 'processing_times_count': len(self.processing_times)
215
+ }
216
+
217
+ def check_memory_threshold(self, threshold_mb: int) -> bool:
218
+ """Check if memory usage is above threshold"""
219
+ return self.current_metrics['memory_usage_mb'] > threshold_mb
220
+
221
+ def check_cpu_threshold(self, threshold_percent: float) -> bool:
222
+ """Check if CPU usage is above threshold"""
223
+ return self.current_metrics['cpu_percent'] > threshold_percent
224
+
225
+ def get_recommendations(self) -> List[str]:
226
+ """Get performance recommendations based on current metrics"""
227
+ recommendations = []
228
+
229
+ # Memory recommendations
230
+ if self.current_metrics['memory_usage_mb'] > 7000:
231
+ recommendations.append("High memory usage detected. Consider reducing batch size or chunk size.")
232
+ elif self.current_metrics['memory_usage_mb'] > 5000:
233
+ recommendations.append("Moderate memory usage. Monitor closely during processing.")
234
+
235
+ # CPU recommendations
236
+ if self.current_metrics['cpu_percent'] > 90:
237
+ recommendations.append("High CPU usage. Consider reducing processing intensity.")
238
+ elif self.current_metrics['cpu_percent'] > 70:
239
+ recommendations.append("Moderate CPU usage. Processing is running optimally.")
240
+
241
+ # Processing speed recommendations
242
+ avg_time = self.current_metrics.get('processing_time_avg', 0)
243
+ if avg_time > 10:
244
+ recommendations.append("Slow processing detected. Consider using a more powerful model or optimizing settings.")
245
+ elif avg_time > 5:
246
+ recommendations.append("Moderate processing speed. Consider increasing batch size if memory allows.")
247
+
248
+ # Cache recommendations
249
+ # This would be integrated with cache manager stats
250
+ chunks_per_second = self.current_metrics.get('chunks_per_second', 0)
251
+ if chunks_per_second < 1:
252
+ recommendations.append("Low processing throughput. Consider optimizing chunk size or model parameters.")
253
+
254
+ if not recommendations:
255
+ recommendations.append("Performance is optimal. All metrics are within normal ranges.")
256
+
257
+ return recommendations
258
+
259
+ # Global performance monitor instance
260
+ _performance_instance = None
261
+ _performance_lock = threading.Lock()
262
+
263
+ def get_performance_monitor(max_history: int = 1000) -> PerformanceMonitor:
264
+ """Get or create global performance monitor instance"""
265
+ global _performance_instance
266
+
267
+ with _performance_lock:
268
+ if _performance_instance is None:
269
+ _performance_instance = PerformanceMonitor(max_history)
270
+
271
+ return _performance_instance
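
A minimal usage sketch for the monitor above; the sleep stands in for real chunk analysis work.

    import time
    from streamlit_app.utils.performance import get_performance_monitor

    monitor = get_performance_monitor()      # starts the background sampling thread

    monitor.start_processing_timer()
    time.sleep(0.5)                          # placeholder for analysing one chunk
    elapsed = monitor.end_processing_timer()
    monitor.record_chunk_processing()

    print(f"last chunk took {elapsed:.2f}s")
    print(monitor.get_stats()['memory_usage_mb'])
    for tip in monitor.get_recommendations():
        print("-", tip)

    monitor.cleanup()                        # stop the monitoring thread when finished
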
streamlit_app/utils/ui_helpers.py ADDED
@@ -0,0 +1,415 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ UI Helpers
4
+
5
+ Utility functions and components for the Streamlit application UI.
6
+ Provides reusable UI elements, formatting functions, and visual components.
7
+ """
8
+
9
+ import streamlit as st
10
+ import pandas as pd
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ from typing import Dict, Any, List, Optional, Tuple
14
+ import time
15
+ from datetime import datetime
16
+ import io
+ import json
17
+
18
+ class UIHelpers:
19
+ """UI helper functions and components"""
20
+
21
+ @staticmethod
22
+ def create_metric_card(title: str, value: Any, delta: Optional[Any] = None,
23
+ delta_color: str = "normal", help_text: Optional[str] = None):
24
+ """Create a styled metric card"""
25
+ if isinstance(value, float):
26
+ if title.lower().endswith(('rate', 'ratio', 'percentage', 'percent')):
27
+ formatted_value = f"{value:.1f}"
28
+ else:
29
+ formatted_value = f"{value:.2f}"
30
+ else:
31
+ formatted_value = str(value)
32
+
33
+ return st.metric(
34
+ label=title,
35
+ value=formatted_value,
36
+ delta=delta,
37
+ delta_color=delta_color,
38
+ help=help_text
39
+ )
40
+
41
+ @staticmethod
42
+ def create_progress_bar(progress: float, text: str = "", color: str = "primary"):
43
+ """Create a styled progress bar with text"""
44
+ if text:
45
+ st.write(f"**{text}**")
46
+
47
+ if color == "success":
48
+ bar_color = "#28a745"
49
+ elif color == "warning":
50
+ bar_color = "#ffc107"
51
+ elif color == "danger":
52
+ bar_color = "#dc3545"
53
+ else:
54
+ bar_color = None
55
+
56
+ st.progress(progress, text=f"{progress:.1%} Complete")
57
+
58
+ @staticmethod
59
+ def create_info_box(message: str, type: str = "info"):
60
+ """Create a styled info/warning/success box"""
61
+ if type == "success":
62
+ st.success(message)
63
+ elif type == "warning":
64
+ st.warning(message)
65
+ elif type == "error":
66
+ st.error(message)
67
+ else:
68
+ st.info(message)
69
+
70
+ @staticmethod
71
+ def format_file_size(size_bytes: int) -> str:
72
+ """Format file size in human-readable format"""
73
+ for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
74
+ if size_bytes < 1024.0:
75
+ return f"{size_bytes:.1f} {unit}"
76
+ size_bytes /= 1024.0
77
+ return f"{size_bytes:.1f} PB"
78
+
79
+ @staticmethod
80
+ def format_time_duration(seconds: float) -> str:
81
+ """Format time duration in human-readable format"""
82
+ if seconds < 60:
83
+ return f"{seconds:.1f}s"
84
+ elif seconds < 3600:
85
+ minutes = int(seconds // 60)
86
+ remaining_seconds = seconds % 60
87
+ return f"{minutes}m {remaining_seconds:.1f}s"
88
+ else:
89
+ hours = int(seconds // 3600)
90
+ minutes = int((seconds % 3600) // 60)
91
+ return f"{hours}h {minutes}m"
92
+
93
+ @staticmethod
94
+ def create_performance_chart(data: List[Tuple[float, float]],
95
+ title: str, y_label: str, color: str = "#1f77b4"):
96
+ """Create a performance chart using Plotly"""
97
+ if not data:
98
+ return None
99
+
100
+ times, values = zip(*data)
101
+
102
+ # Convert timestamps to relative time
103
+ start_time = min(times)
104
+ relative_times = [t - start_time for t in times]
105
+
106
+ fig = go.Figure()
107
+ fig.add_trace(go.Scatter(
108
+ x=relative_times,
109
+ y=values,
110
+ mode='lines+markers',
111
+ line=dict(color=color, width=2),
112
+ marker=dict(size=4),
113
+ name=y_label
114
+ ))
115
+
116
+ fig.update_layout(
117
+ title=title,
118
+ xaxis_title="Time (seconds)",
119
+ yaxis_title=y_label,
120
+ template="plotly_white",
121
+ height=300,
122
+ margin=dict(l=20, r=20, t=40, b=20)
123
+ )
124
+
125
+ return fig
126
+
127
+ @staticmethod
128
+ def create_comparison_chart(data_dict: Dict[str, List[float]],
129
+ title: str, x_label: str, y_label: str):
130
+ """Create a comparison bar chart"""
131
+ fig = go.Figure()
132
+
133
+ for label, values in data_dict.items():
134
+ fig.add_trace(go.Bar(
135
+ name=label,
136
+ x=list(range(len(values))),
137
+ y=values,
138
+ text=[f"{v:.2f}" for v in values],
139
+ textposition='auto',
140
+ ))
141
+
142
+ fig.update_layout(
143
+ title=title,
144
+ xaxis_title=x_label,
145
+ yaxis_title=y_label,
146
+ template="plotly_white",
147
+ height=400,
148
+ margin=dict(l=20, r=20, t=40, b=20)
149
+ )
150
+
151
+ return fig
152
+
153
+ @staticmethod
154
+ def create_analysis_summary(results: List[Dict[str, Any]]) -> Dict[str, Any]:
155
+ """Create a summary of analysis results"""
156
+ if not results:
157
+ return {
158
+ 'total_analyses': 0,
159
+ 'total_loopholes': 0,
160
+ 'avg_confidence': 0,
161
+ 'total_chunks': 0,
162
+ 'analysis_types': {}
163
+ }
164
+
165
+ total_loopholes = sum(len(result.get('loopholes', [])) for result in results)
166
+ total_confidence = sum(result.get('confidence', 0) for result in results)
167
+ total_chunks = sum(result.get('chunks_processed', 0) for result in results)
168
+
169
+ # Count analysis types
170
+ analysis_types = {}
171
+ for result in results:
172
+ analysis_type = result.get('analysis_type', 'Unknown')
173
+ analysis_types[analysis_type] = analysis_types.get(analysis_type, 0) + 1
174
+
175
+ return {
176
+ 'total_analyses': len(results),
177
+ 'total_loopholes': total_loopholes,
178
+ 'avg_confidence': total_confidence / len(results) if results else 0,
179
+ 'total_chunks': total_chunks,
180
+ 'analysis_types': analysis_types
181
+ }
182
+
183
+ @staticmethod
184
+ def display_analysis_result(result: Dict[str, Any], index: int = 0):
185
+ """Display a single analysis result in a formatted way"""
186
+ with st.expander(f"πŸ“‹ Analysis {index + 1}: {result.get('title', 'Unknown Title')}", expanded=index == 0):
187
+ col1, col2 = st.columns([2, 1])
188
+
189
+ with col1:
190
+ st.markdown("**Summary:**")
191
+ st.write(result.get('summary', 'No summary available'))
192
+
193
+ st.markdown("**Key Findings:**")
194
+ loopholes = result.get('loopholes', [])
195
+ if loopholes:
196
+ for i, loophole in enumerate(loopholes, 1):
197
+ st.markdown(f"{i}. {loophole}")
198
+ else:
199
+ st.write("No significant loopholes identified.")
200
+
201
+ if result.get('recommendations'):
202
+ st.markdown("**Recommendations:**")
203
+ for rec in result.get('recommendations', []):
204
+ st.markdown(f"β€’ {rec}")
205
+
206
+ with col2:
207
+ UIHelpers.create_metric_card(
208
+ "Confidence",
209
+ result.get('confidence', 0),
210
+ help_text="Model confidence in analysis"
211
+ )
212
+
213
+ UIHelpers.create_metric_card(
214
+ "Processing Time",
215
+ result.get('processing_time', 0),  # field name assumed; populate it in the analysis result if needed
216
+ help_text="Time taken to analyze this content"
217
+ )
218
+
219
+ UIHelpers.create_metric_card(
220
+ "Chunks Processed",
221
+ result.get('chunks_processed', 0),
222
+ help_text="Number of text chunks analyzed"
223
+ )
224
+
225
+ st.markdown("**Metadata:**")
226
+ st.write(f"**Source:** {result.get('source', 'Unknown')}")
227
+ st.write(f"**Date:** {result.get('date', 'Unknown')}")
228
+ st.write(f"**Analysis Type:** {result.get('analysis_type', 'Standard')}")
229
+
230
+ @staticmethod
231
+ def create_export_section(results: List[Dict[str, Any]]):
232
+ """Create the export section for results"""
233
+ st.subheader("πŸ’Ύ Export Results")
234
+
235
+ if not results:
236
+ st.info("No results to export")
237
+ return
238
+
239
+ col1, col2, col3 = st.columns(3)
240
+
241
+ with col1:
242
+ if st.button("πŸ“„ Export as JSON", use_container_width=True):
243
+ json_data = json.dumps(results, indent=2, ensure_ascii=False)
244
+ st.download_button(
245
+ label="Download JSON",
246
+ data=json_data,
247
+ file_name=f"nz_legislation_analysis_{int(time.time())}.json",
248
+ mime="application/json",
249
+ use_container_width=True
250
+ )
251
+
252
+ with col2:
253
+ if st.button("πŸ“Š Export as CSV", use_container_width=True):
254
+ df = pd.DataFrame(results)
255
+ csv_data = df.to_csv(index=False)
256
+ st.download_button(
257
+ label="Download CSV",
258
+ data=csv_data,
259
+ file_name=f"nz_legislation_analysis_{int(time.time())}.csv",
260
+ mime="text/csv",
261
+ use_container_width=True
262
+ )
263
+
264
+ with col3:
265
+ if st.button("πŸ“‹ Export as Excel", use_container_width=True):
266
+ df = pd.DataFrame(results)
267
+ buffer = io.BytesIO()
+ df.to_excel(buffer, index=False, engine='openpyxl')
+ excel_data = buffer.getvalue()
268
+ st.download_button(
269
+ label="Download Excel",
270
+ data=excel_data,
271
+ file_name=f"nz_legislation_analysis_{int(time.time())}.xlsx",
272
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
273
+ use_container_width=True
274
+ )
275
+
276
+ @staticmethod
277
+ def create_cache_management_section(cache_manager):
278
+ """Create cache management section"""
279
+ st.subheader("🧠 Cache Management")
280
+
281
+ cache_stats = cache_manager.get_stats()
282
+
283
+ col1, col2, col3, col4 = st.columns(4)
284
+
285
+ with col1:
286
+ UIHelpers.create_metric_card("Cache Hits", cache_stats['hits'])
287
+
288
+ with col2:
289
+ UIHelpers.create_metric_card("Cache Misses", cache_stats['misses'])
290
+
291
+ with col3:
292
+ UIHelpers.create_metric_card("Hit Rate", f"{cache_stats['hits'] / max(1, cache_stats['hits'] + cache_stats['misses']):.1%}")
293
+
294
+ with col4:
295
+ UIHelpers.create_metric_card("Cached Entries", cache_stats['entries'])
296
+
297
+ col1, col2, col3 = st.columns(3)
298
+
299
+ with col1:
300
+ if st.button("πŸ”„ Clear Cache", type="secondary", use_container_width=True):
301
+ cache_manager.clear_cache()
302
+ st.rerun()
303
+
304
+ with col2:
305
+ if st.button("πŸ“€ Export Cache", use_container_width=True):
306
+ import tempfile
307
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
308
+ success = cache_manager.export_cache(f.name)
309
+ if success:
310
+ st.success("Cache exported successfully")
311
+ else:
312
+ st.error("Failed to export cache")
313
+
314
+ with col3:
315
+ uploaded_cache = st.file_uploader("πŸ“₯ Import Cache", type=['json'])
316
+ if uploaded_cache:
317
+ import tempfile
318
+ with tempfile.NamedTemporaryFile(mode='wb', delete=False) as f:
319
+ f.write(uploaded_cache.read())
320
+ imported_count = cache_manager.import_cache(f.name)
321
+ st.success(f"Imported {imported_count} cache entries")
322
+
323
+ @staticmethod
324
+ def create_system_info_section(perf_monitor):
325
+ """Create system information section"""
326
+ st.subheader("πŸ’» System Information")
327
+
328
+ sys_info = perf_monitor.get_system_info()
329
+
330
+ col1, col2 = st.columns(2)
331
+
332
+ with col1:
333
+ st.markdown("**Hardware:**")
334
+ st.write(f"**CPU Cores:** {sys_info['cpu_count']} physical, {sys_info['cpu_count_logical']} logical")
335
+ st.write(f"**Total Memory:** {sys_info['total_memory_gb']:.1f} GB")
336
+ st.write(f"**Available Memory:** {sys_info['available_memory_gb']:.1f} GB")
337
+
338
+ with col2:
339
+ st.markdown("**Software:**")
340
+ st.write(f"**Python:** {sys_info['python_version']}")
341
+ st.write(f"**Platform:** {sys_info['platform']}")
342
+ st.write(f"**Active Threads:** {perf_monitor.get_stats()['active_threads']}")
343
+
344
+ @staticmethod
345
+ def create_performance_recommendations(perf_monitor):
346
+ """Create performance recommendations section"""
347
+ st.subheader("πŸ’‘ Performance Recommendations")
348
+
349
+ recommendations = perf_monitor.get_recommendations()
350
+
351
+ if recommendations:
352
+ for rec in recommendations:
353
+ if "High" in rec or "Slow" in rec:
354
+ st.error(rec)
355
+ elif "Moderate" in rec or "Consider" in rec:
356
+ st.warning(rec)
357
+ else:
358
+ st.info(rec)
359
+ else:
360
+ st.success("All performance metrics are within optimal ranges!")
361
+
362
+ @staticmethod
363
+ def create_loading_spinner(text: str = "Processing..."):
364
+ """Create a loading spinner"""
365
+ return st.spinner(text)
366
+
367
+ @staticmethod
368
+ def create_success_message(message: str):
369
+ """Create a success message"""
370
+ st.success(message)
371
+
372
+ @staticmethod
373
+ def create_error_message(message: str):
374
+ """Create an error message"""
375
+ st.error(message)
376
+
377
+ @staticmethod
378
+ def create_warning_message(message: str):
379
+ """Create a warning message"""
380
+ st.warning(message)
381
+
382
+ @staticmethod
383
+ def create_data_table(data: List[Dict[str, Any]], columns: Optional[List[str]] = None):
384
+ """Create a formatted data table"""
385
+ if not data:
386
+ st.info("No data to display")
387
+ return
388
+
389
+ df = pd.DataFrame(data)
390
+
391
+ if columns:
392
+ available_columns = [col for col in columns if col in df.columns]
393
+ if available_columns:
394
+ df = df[available_columns]
395
+
396
+ st.dataframe(df, use_container_width=True)
397
+
398
+ @staticmethod
399
+ def create_json_viewer(data: Dict[str, Any], title: str = "JSON Data"):
400
+ """Create a JSON viewer"""
401
+ st.subheader(title)
402
+
403
+ with st.expander("View JSON", expanded=False):
404
+ st.json(data)
405
+
406
+ @staticmethod
407
+ def create_file_preview(file_content: str, max_lines: int = 20):
408
+ """Create a file content preview"""
409
+ lines = file_content.split('\n')
410
+ preview_content = '\n'.join(lines[:max_lines])
411
+
412
+ if len(lines) > max_lines:
413
+ preview_content += f"\n\n... ({len(lines) - max_lines} more lines)"
414
+
415
+ st.text_area("File Preview", preview_content, height=200, disabled=True)
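
A minimal sketch of how these helpers compose inside a Streamlit page; the page module name and the analysis_results session key are assumptions, and the script is run with streamlit run.

    # hypothetical page module, e.g. streamlit_app/pages/summary.py
    import streamlit as st
    from streamlit_app.utils.ui_helpers import UIHelpers

    results = st.session_state.get('analysis_results', [])   # session key assumed
    summary = UIHelpers.create_analysis_summary(results)

    UIHelpers.create_metric_card("Total Analyses", summary['total_analyses'])
    UIHelpers.create_metric_card("Loopholes Found", summary['total_loopholes'])
    st.write("Dataset size:", UIHelpers.format_file_size(1_234_567))

    for i, result in enumerate(results):
        UIHelpers.display_analysis_result(result, index=i)

    UIHelpers.create_export_section(results)
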
test_app_imports.py ADDED
@@ -0,0 +1,178 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to validate Streamlit app imports and basic functionality
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ from pathlib import Path
9
+
10
+ def test_imports():
11
+ """Test that all required modules can be imported"""
12
+ print("πŸ” Testing Streamlit app imports...")
13
+
14
+ # Add current directory to Python path
15
+ current_dir = os.path.dirname(os.path.abspath(__file__))
16
+ if current_dir not in sys.path:
17
+ sys.path.insert(0, current_dir)
18
+
19
+ # Test core modules
20
+ modules_to_test = [
21
+ 'streamlit',
22
+ 'pandas',
23
+ 'plotly',
24
+ 'psutil',
25
+ 'numpy',
26
+ 'streamlit_app.core.cache_manager',
27
+ 'streamlit_app.core.text_processor',
28
+ 'streamlit_app.core.llm_analyzer',
29
+ 'streamlit_app.core.dataset_builder',
30
+ 'streamlit_app.utils.config',
31
+ 'streamlit_app.utils.performance',
32
+ 'streamlit_app.utils.ui_helpers'
33
+ ]
34
+
35
+ failed_imports = []
36
+
37
+ for module in modules_to_test:
38
+ try:
39
+ __import__(module)
40
+ print(f"βœ… {module}")
41
+ except ImportError as e:
42
+ print(f"❌ {module}: {e}")
43
+ failed_imports.append(module)
44
+ except Exception as e:
45
+ print(f"⚠️ {module}: Unexpected error - {e}")
46
+
47
+ if failed_imports:
48
+ print(f"\n❌ Failed to import {len(failed_imports)} modules:")
49
+ for module in failed_imports:
50
+ print(f" - {module}")
51
+ return False
52
+
53
+ print(f"\nβœ… All {len(modules_to_test)} modules imported successfully!")
54
+ return True
55
+
56
+ def test_core_functionality():
57
+ """Test basic functionality of core modules"""
58
+ print("\nπŸ”§ Testing core functionality...")
59
+
60
+ try:
61
+ # Test cache manager
62
+ from streamlit_app.core.cache_manager import CacheManager, get_cache_manager
63
+
64
+ cache = get_cache_manager(max_memory_mb=10, persistent=False) # Small cache for testing
65
+ cache_stats = cache.get_stats()
66
+ print(f"βœ… Cache Manager: {cache_stats}")
67
+
68
+ # Test text processor
69
+ from streamlit_app.core.text_processor import TextProcessor
70
+
71
+ processor = TextProcessor()
72
+ test_text = "This is a test of the New Zealand legislation analysis system."
73
+ cleaned = processor.clean_text(test_text)
74
+ chunks = processor.chunk_text(cleaned, chunk_size=50, overlap=10)
75
+ print(f"βœ… Text Processor: {len(chunks)} chunks created")
76
+
77
+ # Test configuration manager
78
+ from streamlit_app.utils.config import ConfigManager
79
+
80
+ config = ConfigManager()
81
+ config_dict = config.get_config()
82
+ print(f"βœ… Config Manager: {len(config_dict)} configuration sections")
83
+
84
+ # Test performance monitor
85
+ from streamlit_app.utils.performance import PerformanceMonitor
86
+
87
+ perf = PerformanceMonitor(max_history=10)
88
+ stats = perf.get_stats()
89
+ print(f"βœ… Performance Monitor: Memory usage {stats['memory_usage_mb']:.1f} MB")
90
+
91
+ # Test UI helpers (basic instantiation)
92
+ from streamlit_app.utils.ui_helpers import UIHelpers
93
+
94
+ helper = UIHelpers()
95
+ print("βœ… UI Helpers: Module loaded")
96
+
97
+ print("\nπŸŽ‰ All core functionality tests passed!")
98
+ return True
99
+
100
+ except Exception as e:
101
+ print(f"\n❌ Core functionality test failed: {e}")
102
+ import traceback
103
+ traceback.print_exc()
104
+ return False
105
+
106
+ def test_file_structure():
107
+ """Test that all required files exist"""
108
+ print("\nπŸ“ Testing file structure...")
109
+
110
+ required_files = [
111
+ 'streamlit_app/app.py',
112
+ 'streamlit_app/core/cache_manager.py',
113
+ 'streamlit_app/core/text_processor.py',
114
+ 'streamlit_app/core/llm_analyzer.py',
115
+ 'streamlit_app/core/dataset_builder.py',
116
+ 'streamlit_app/utils/config.py',
117
+ 'streamlit_app/utils/performance.py',
118
+ 'streamlit_app/utils/ui_helpers.py',
119
+ 'requirements.txt',
120
+ 'run_streamlit_app.py',
121
+ 'README_Streamlit_App.md'
122
+ ]
123
+
124
+ missing_files = []
125
+
126
+ for file_path in required_files:
127
+ if not Path(file_path).exists():
128
+ missing_files.append(file_path)
129
+ print(f"❌ Missing: {file_path}")
130
+ else:
131
+ print(f"βœ… Found: {file_path}")
132
+
133
+ if missing_files:
134
+ print(f"\n❌ Missing {len(missing_files)} files:")
135
+ for file_path in missing_files:
136
+ print(f" - {file_path}")
137
+ return False
138
+
139
+ print(f"\nβœ… All {len(required_files)} files present!")
140
+ return True
141
+
142
+ def main():
143
+ """Main test function"""
144
+ print("πŸ›οΈ NZ Legislation Loophole Analysis - App Validation")
145
+ print("=" * 60)
146
+
147
+ all_passed = True
148
+
149
+ # Test file structure
150
+ if not test_file_structure():
151
+ all_passed = False
152
+
153
+ # Test imports
154
+ if not test_imports():
155
+ all_passed = False
156
+
157
+ # Test core functionality
158
+ if not test_core_functionality():
159
+ all_passed = False
160
+
161
+ print("\n" + "=" * 60)
162
+ if all_passed:
163
+ print("πŸŽ‰ VALIDATION COMPLETE - App is ready to run!")
164
+ print("\nπŸš€ To start the application:")
165
+ print(" python run_streamlit_app.py")
166
+ print("\nπŸ“± Then visit: http://localhost:8501")
167
+ else:
168
+ print("❌ VALIDATION FAILED - Please check the errors above")
169
+ print("\nπŸ”§ Troubleshooting:")
170
+ print(" - Ensure all dependencies are installed: pip install -r requirements.txt")
171
+ print(" - Check Python version (3.8+ required)")
172
+ print(" - Verify file permissions")
173
+
174
+ return all_passed
175
+
176
+ if __name__ == "__main__":
177
+ success = main()
178
+ sys.exit(0 if success else 1)
trl copy.py ADDED
@@ -0,0 +1,532 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ NZ Legislation Loophole Analysis Dataset Creation Tool
4
+
5
+ This script processes New Zealand legislation text to create a finetuning dataset for AI models
6
+ that can identify potential loopholes, ambiguities, and unintended consequences in legal text.
7
+
8
+ The script:
9
+ 1. Loads and cleans NZ legislation text, preserving legal structure and terminology
10
+ 2. Chunks the text into manageable sections with overlap for context
11
+ 3. Uses an LLM to analyze each chunk for legal issues
12
+ 4. Generates a structured dataset for training AI models on legal loophole detection
13
+
14
+ Usage:
15
+ python trl.py
16
+
17
+ Requirements:
18
+ - llama-cpp-python with GGUF model support
19
+ - psutil for memory monitoring
20
+ - Input file: nz-legislation.txt containing NZ legislation in JSON lines format
21
+
22
+ Output:
23
+ - JSON dataset saved to nz_legislation_dataset/nz_legislation_loophole_dataset.json
24
+ """
25
+
26
+ import os
27
+ import json
28
+ import time
29
+ import psutil
30
+ from typing import List, Dict, Any
31
+ import numpy as np
32
+ from llama_cpp import Llama
33
+ import re
34
+
35
+ # Placeholder classes and functions for missing dependencies
36
+ class ProgressManager:
37
+ """Simple placeholder for progress tracking"""
38
+ def __init__(self):
39
+ pass
40
+
41
+ def show_memory_usage(label: str):
42
+ """Simple memory usage display"""
43
+ process = psutil.Process(os.getpid())
44
+ memory_mb = process.memory_info().rss / 1024 / 1024
45
+ print(f"{label}: {memory_mb:.2f} MB")
46
+
47
+ # Configuration for NZ Legislation Loophole Analysis Dataset Creation
48
+ INPUT_FILE = "nz-legislation.txt" # Path to New Zealand legislation JSON dataset
49
+ OUTPUT_DIR = "nz_legislation_dataset" # Directory to save the dataset
50
+ CHUNK_SIZE = 4096 # Size of text chunks for processing legislation sections
51
+ CHUNK_OVERLAP = 256 # Overlap between chunks to maintain context
52
+ BATCH_SIZE = 16 # Number of chunks to process at once
53
+ MODEL_PATH = "qwen3.gguf" # Path to your Qwen3 GGUF model
54
+ MAX_TOKENS = 4096 # Maximum tokens for model response
55
+
56
+ # Ensure output directory exists
57
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
58
+
59
+ def load_model(progress_manager: ProgressManager = None):
60
+ """Load the LLM model for text generation with progress tracking"""
61
+ if progress_manager is None:
62
+ progress_manager = ProgressManager()
63
+
64
+ print("Loading LLM model...")
65
+ show_memory_usage("Initial memory usage")
66
+
67
+ start_time = time.time()
68
+ try:
69
+ model = Llama.from_pretrained(
70
+ repo_id="DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF",
71
+ filename="Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-D_AU-IQ4_XS-imat.gguf",
72
+ n_ctx=40960, # Context length
73
+ n_threads=8, # Adjust based on your CPU
74
+ verbose=False,
75
+ n_gpu_layers=-1, # Use all available GPU layers
76
+ n_batch=4096, # Batch size for processing
77
+ logits_all=False, # Optimize for text generation
78
+ use_mlock=True, # Lock model in memory if possible
79
+ use_mmap=True, # Use memory mapping for better performance
80
+ )
81
+ except Exception as e:
82
+ print(f"Error loading model: {e}")
83
+ print("Trying with basic configuration...")
84
+ # Fallback to basic configuration
85
+ model = Llama(
86
+ model_path=MODEL_PATH,
87
+ n_ctx=40960,
88
+ n_threads=8,
89
+ verbose=False,
90
+ n_gpu_layers=-1,
91
+ n_batch=4096
92
+ )
93
+
94
+ load_time = time.time() - start_time
95
+ print(f"LLM model loaded in {load_time:.2f}s")
96
+ show_memory_usage("Memory after model load")
97
+
98
+ return model
99
+
100
+ def clean_text(text: str) -> str:
101
+ """Clean and normalize text for better embedding quality, optimized for legal/legislative content"""
102
+ import re
103
+
104
+ # Preserve section numbers and legal structure while cleaning:
105
+ # markers like "1:", "2:" at the start of a line are deliberately left untouched,
106
+ # so no substitution is applied to them here.
107
+
108
+ # Remove excessive whitespace but preserve paragraph structure
109
+ text = re.sub(r'[ \t]+', ' ', text) # Replace multiple spaces/tabs with single space
110
+ text = re.sub(r'\n\s*\n', '\n\n', text) # Preserve paragraph breaks but clean up
111
+ text = re.sub(r'\n{3,}', '\n\n', text) # Reduce excessive newlines to double
112
+
113
+ # Remove control characters but preserve legal formatting
114
+ text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text) # Remove control chars except tab and newline
115
+
116
+ # Handle legal-specific characters and formatting
117
+ # Keep legal punctuation and symbols
118
+ # Curly quotes must survive this filter so they can be normalized to straight quotes below
+ allowed_chars = r'\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/\@\#\$\%\^\&\*\+\=\~\`°§' + '\u201c\u201d\u2018\u2019'
119
+ text = re.sub(r'[^' + allowed_chars + ']', '', text)
120
+
121
+ # Normalize quotes and apostrophes for legal text
122
+ text = re.sub('[\u201c\u201d]', '"', text) # Normalize curly double quotes to straight quotes
124
+ text = re.sub('[\u2018\u2019]', "'", text) # Normalize curly single quotes/apostrophes
124
+ text = re.sub(r'`', "'", text) # Replace backticks with apostrophes
125
+
126
+ # Clean up legal numbering and references
127
+ # Normalize section references
128
+ text = re.sub(r'section\s+(\d+)', r'section \1', text, flags=re.IGNORECASE)
129
+ text = re.sub(r'(\d+)\s*[Jj]anuary', r'\1 January', text) # Clean date formatting
130
+ text = re.sub(r'(\d+)\s*[Jj]uly', r'\1 July', text) # Clean date formatting
131
+ text = re.sub(r'(\d+)\s*[Aa]pril', r'\1 April', text) # Clean date formatting
132
+ text = re.sub(r'(\d+)\s*[Ff]ebruary', r'\1 February', text) # Clean date formatting
133
+ text = re.sub(r'(\d+)\s*[Dd]ecember', r'\1 December', text) # Clean date formatting
134
+ text = re.sub(r'(\d+)\s*[Aa]ugust', r'\1 August', text) # Clean date formatting
135
+ text = re.sub(r'(\d+)\s*[Mm]arch', r'\1 March', text) # Clean date formatting
136
+ text = re.sub(r'(\d+)\s*[Mm]ay', r'\1 May', text) # Clean date formatting
137
+ text = re.sub(r'(\d+)\s*[Jj]une', r'\1 June', text) # Clean date formatting
138
+ text = re.sub(r'(\d+)\s*[Ss]eptember', r'\1 September', text) # Clean date formatting
139
+ text = re.sub(r'(\d+)\s*[Oo]ctober', r'\1 October', text) # Clean date formatting
140
+ text = re.sub(r'(\d+)\s*[Nn]ovember', r'\1 November', text) # Clean date formatting
141
+
142
+ # Clean up punctuation spacing in legal text
143
+ text = re.sub(r'\s+([\.!\?\,\;\:])', r'\1', text) # Remove space before punctuation
144
+ text = re.sub(r'([\.!\?\,\;\:])\s*', r'\1 ', text) # Ensure space after punctuation
145
+
146
+ # Handle legal citations and references (generic patterns)
147
+ # Normalize spacing in act citations - generic pattern for "<Name> Act <year>"
148
+ text = re.sub(r'(\b\w+(?:\s+\w+)*)\s+Act\s+(\d{4})', r'\1 Act \2', text) # Normalize act citations, keeping the year
149
+
150
+ # Clean up amendment references (generic patterns)
151
+ text = re.sub(r'[Aa]mendment\(s\)\s+incorporated\s+in\s+the\s+[Aa]ct\(s\)', 'Amendments incorporated', text)
152
+ text = re.sub(r'section\s+\d+\(\d+\)\([a-zA-Z]\)', lambda m: m.group(0).lower(), text) # Normalize section references
153
+
154
+ # Generic pattern for legal document sections
155
+ text = re.sub(r'(\b(?:section|part|chapter|article|clause|subsection|paragraph))\s+(\d+[a-zA-Z]*)',
156
+ lambda m: f"{m.group(1)} {m.group(2)}", text, flags=re.IGNORECASE)
157
+
158
+ # NZ-specific legal enhancements
159
+ # Handle New Zealand specific terms and references
160
+ text = re.sub(r'\b[Nn]ew\s+[Zz]ealand\b', 'New Zealand', text) # Normalize "New Zealand"
161
+ text = re.sub(r'\b[Pp]arliament\b', 'Parliament', text) # Normalize "Parliament"
162
+ text = re.sub(r'\b[Cc]rown\b', 'Crown', text) # Normalize "Crown"
163
+ text = re.sub(r'\b[Gg]overnment\b', 'Government', text) # Normalize "Government"
164
+
165
+ # Handle NZ-specific legal citations (e.g., "NZB" references, Treaty of Waitangi)
166
+ text = re.sub(r'\b[Nn][Zz][Bb]\s+(\d+)', r'NZB \1', text) # Normalize NZB references
167
+ text = re.sub(r'[Tt]reaty\s+[Oo]f\s+[Ww]aitangi', 'Treaty of Waitangi', text, flags=re.IGNORECASE)
168
+
169
+ # Handle Maori-specific characters (basic support): the macron vowels ā, Δ“, Δ«, ō, Ε«
170
+ # are Unicode word characters, so the \w in allowed_chars above already preserves
171
+ # them (and "wh" is plain ASCII), meaning no extra filtering pass is needed here.
174
+
175
+ # Remove empty lines and trim while preserving legal structure
176
+ lines = []
177
+ for line in text.split('\n'):
178
+ stripped = line.strip()
179
+ if stripped: # Keep non-empty lines
180
+ # Section headers (e.g. "12:") and ordinary lines are both kept as-is
181
+ lines.append(stripped)
185
+
186
+ text = '\n'.join(lines)
187
+
188
+ # Final cleanup
189
+ text = text.strip()
190
+
191
+ return text
192
+
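+ # Rough before/after sketch of clean_text() on a hypothetical fragment:
+ #   clean_text("5:   The  Minister may ,by notice ,exempt any person .")
+ #   -> "5: The Minister may, by notice, exempt any person."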
193
+ # Constants for prompt formatting
194
+ REASONING_START = "<start_working_out>"
195
+ REASONING_END = "<end_working_out>"
196
+ SOLUTION_START = "<SOLUTION>"
197
+ SOLUTION_END = "</SOLUTION>"
198
+
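+ # These markers frame the two parts the model is asked to produce, e.g. (illustrative):
+ #   <start_working_out> ...structured analysis of the chunk... <end_working_out>
+ #   <SOLUTION> ...overall conclusion... </SOLUTION>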
199
+ def create_system_prompt(text: str) -> str:
200
+ """Create a system prompt for analyzing legislative text for loopholes and ambiguities"""
201
+ return f"""You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.
202
+
203
+ LEGISLATION TEXT:
204
+ {text}
205
+
206
+ TASK: Analyze this legislative text and identify potential loopholes, ambiguities, or unintended consequences.
207
+
208
+ REASONING: Provide a structured analysis in the following format:
209
+
210
+ 1. **Text Meaning**: Explain what the text means and its intended purpose
211
+ 2. **Key Assumptions**: Identify any assumptions the text makes that could be exploited
212
+ 3. **Exploitable Interpretations**: Discuss how the text could be interpreted or applied in ways that circumvent its intended purpose
213
+ 4. **Critical Loopholes**: Identify specific loopholes, ambiguities, or unintended consequences that could be used to bypass the legislation
214
+ 5. **Circumvention Strategies**: Suggest practical methods or scenarios for exploiting these loopholes to achieve objectives contrary to the legislation's intent
215
+
216
+ Write your complete analysis between {REASONING_START} and {REASONING_END}.
217
+
218
+ Then provide your overall conclusion between {SOLUTION_START} and {SOLUTION_END}.
219
+ """
220
+
221
+ def generate_chat_template(system_prompt: str) -> str:
222
+ """
223
+ Generate a chat template using the GGUF model's native chat format.
224
+ This builds the ChatML-style structure (<|im_start|> / <|im_end|> markers) used by Qwen GGUF chat models, for better model compatibility.
225
+ """
226
+ # Build the chat using the GGUF template structure
227
+ chat_messages = []
228
+
229
+ # System message
230
+ if system_prompt:
231
+ chat_messages.append("<|im_start|>system")
232
+ chat_messages.append(system_prompt)
233
+ chat_messages.append("<|im_end|>")
234
+
235
+ # User message with the analysis request
236
+ chat_messages.append("<|im_start|>user")
237
+ chat_messages.append("Analyze the given legislative text for loopholes, ambiguities, and unintended consequences. Provide a structured legal analysis following the specified format.")
238
+ chat_messages.append("<|im_end|>")
239
+
240
+ # Assistant message with generation prompt
241
+ chat_messages.append("<|im_start|>assistant")
242
+ chat_messages.append("") # Empty for generation
243
+
244
+ return "\n".join(chat_messages)
245
+
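+ # For reference, the string returned above has this ChatML-style shape
+ # (system prompt elided for brevity):
+ #   <|im_start|>system
+ #   ...analysis instructions and the legislation chunk...
+ #   <|im_end|>
+ #   <|im_start|>user
+ #   Analyze the given legislative text for loopholes, ambiguities, ...
+ #   <|im_end|>
+ #   <|im_start|>assistant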
246
+ def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
247
+ """Split text into overlapping chunks for processing"""
248
+ if len(text) <= chunk_size:
249
+ return [text]
250
+
251
+ chunks = []
252
+ start = 0
253
+ while start < len(text):
254
+ end = start + chunk_size
255
+ chunk = text[start:end]
256
+
257
+ # Try to end chunk at a sentence boundary if possible
258
+ if end < len(text):
259
+ # Look for sentence endings in the last 100 characters
260
+ sentence_end = max(
261
+ chunk.rfind('. ', max(0, len(chunk) - 100)),
262
+ chunk.rfind('! ', max(0, len(chunk) - 100)),
263
+ chunk.rfind('? ', max(0, len(chunk) - 100))
264
+ )
265
+ if sentence_end != -1:
266
+ chunk = chunk[:sentence_end + 2] # Include the sentence ending
267
+
268
+ chunks.append(chunk)
269
+ start = end - overlap if end < len(text) else len(text)
270
+
271
+ return chunks
272
+
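+ # Rough sketch of the chunking arithmetic (hypothetical 10,000-character section with
+ # the default CHUNK_SIZE=4096 and CHUNK_OVERLAP=256): chunks start near offsets 0,
+ # 3840 and 7680, so neighbouring chunks share about 256 characters of context,
+ # slightly less whenever a chunk is trimmed back to a sentence boundary.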
273
+ def generate_response(model, prompt: str, max_tokens: int = MAX_TOKENS) -> str:
274
+ """
275
+ Generate a response from the model for a given prompt with optimized parameters for legal analysis.
276
+
277
+ Parameter Explanations:
278
+ - temperature=0.3: Balanced creativity for legal analysis (not too random, not too deterministic)
279
+ - top_p=0.85: Nucleus sampling - considers top 85% probability mass for coherent legal text
280
+ - top_k=50: Top-k sampling - considers top 50 tokens for better legal terminology selection
281
+ - min_p=0.05: Minimum probability threshold to avoid low-quality tokens
282
+
283
+ Anti-Repetition Parameters:
284
+ - repeat_penalty=1.15: Penalizes repetition of phrases (15% penalty)
285
+ - presence_penalty=0.1: Encourages topic diversity across the response
286
+ - frequency_penalty=0.1: Reduces overuse of frequent tokens
287
+
288
+ Advanced Sampling:
289
+ - typical_p=0.95: Focuses on typical token probabilities for legal text patterns
290
+ - tfs_z=0.95: Tail-free sampling for more natural legal reasoning
291
+ - mirostat_mode=2: Mirostat v2 for perplexity-controlled generation
292
+ - mirostat_tau=4.0: Target entropy level for legal analysis
293
+ - mirostat_eta=0.15: Learning rate for perplexity adaptation
294
+ """
295
+ try:
296
+ response = model(
297
+ prompt,
298
+ max_tokens=max_tokens,
299
+ # Core generation parameters
300
+ temperature=0.3, # Balanced temperature for legal analysis
301
+ top_p=0.85, # Nucleus sampling for coherent legal text
302
+ top_k=50, # Top-k sampling for better token selection
303
+ min_p=0.05, # Minimum probability threshold to avoid low-quality tokens
304
+
305
+ # Anti-repetition parameters
306
+ repeat_penalty=1.15, # Reduce repetition of phrases
307
+ presence_penalty=0.1, # Encourage topic diversity
308
+ frequency_penalty=0.1, # Reduce frequent token usage
309
+
310
+ # Advanced sampling parameters
311
+ typical_p=0.95, # Typical token probability for legal text patterns
312
+ tfs_z=0.95, # Tail-free sampling for better reasoning
313
+ mirostat_mode=2, # Mirostat v2 for perplexity control
314
+ mirostat_tau=4.0, # Mirostat target entropy
315
+ mirostat_eta=0.15, # Mirostat learning rate
316
+
317
+ # Stopping conditions
318
+ stop=[SOLUTION_END] # Stop once the conclusion is closed; stopping at the reasoning end marker would cut off the SOLUTION section
319
+ )
320
+ return response['choices'][0]['text'].strip()
321
+ except Exception as e:
322
+ print(f"Error generating response: {e}")
323
+ # Try with fallback parameters if advanced ones fail
324
+ try:
325
+ response = model(
326
+ prompt,
327
+ max_tokens=max_tokens,
328
+ temperature=0.3,
329
+ top_p=0.85,
330
+ top_k=50,
331
+ repeat_penalty=1.15,
332
+ stop=[SOLUTION_END, "</SOLUTION>"]
333
+ )
334
+ return response['choices'][0]['text'].strip()
335
+ except Exception as e2:
336
+ print(f"Fallback also failed: {e2}")
337
+ return ""
338
+
339
+ def parse_legislation_json(file_path: str) -> List[Dict[str, Any]]:
340
+ """Parse the JSON lines format of NZ legislation dataset"""
341
+ legislation_entries = []
342
+
343
+ try:
344
+ with open(file_path, 'r', encoding='utf-8') as f:
345
+ for line_num, line in enumerate(f, 1):
346
+ line = line.strip()
347
+ if line:
348
+ try:
349
+ entry = json.loads(line)
350
+ if 'id' in entry and 'text' in entry:
351
+ legislation_entries.append(entry)
352
+ else:
353
+ print(f"Warning: Line {line_num} missing required fields, skipping")
354
+ except json.JSONDecodeError as e:
355
+ print(f"Warning: Could not parse line {line_num}: {e}")
356
+ continue
357
+ except Exception as e:
358
+ print(f"Error reading legislation file: {e}")
359
+ return []
360
+
361
+ print(f"Successfully parsed {len(legislation_entries)} legislation entries")
362
+ return legislation_entries
363
+
364
+ def create_finetuning_dataset(input_file: str, model, output_file: str = None) -> List[Dict[str, Any]]:
365
+ """Create a finetuning dataset by processing NZ legislation JSON dataset with incremental saving"""
366
+ if output_file is None:
367
+ output_file = os.path.join(OUTPUT_DIR, "nz_legislation_loophole_dataset.json")
368
+
369
+ # Create temporary file paths
370
+ temp_file = output_file.replace('.json', '_temp.jsonl')
371
+ backup_file = output_file.replace('.json', '_backup.json')
372
+
373
+ print(f"Parsing legislation dataset from {input_file}")
374
+ legislation_entries = parse_legislation_json(input_file)
375
+
376
+ if not legislation_entries:
377
+ print("No legislation entries found to process")
378
+ return []
379
+
380
+ dataset = []
381
+ total_entries = len(legislation_entries)
382
+ saved_count = 0
383
+
384
+ print(f"Processing {total_entries} legislation entries...")
385
+ print(f"Dataset will be saved incrementally to: {temp_file}")
386
+
387
+ try:
388
+ # Open temporary file for incremental saving
389
+ with open(temp_file, 'w', encoding='utf-8') as temp_f:
390
+ for entry_num, entry in enumerate(legislation_entries, 1):
391
+ legislation_id = entry.get('id', f'entry_{entry_num}')
392
+ title = entry.get('title', 'Unknown Title')
393
+ year = entry.get('year', 'Unknown Year')
394
+ raw_text = entry.get('text', '')
395
+
396
+ print(f"\nProcessing entry {entry_num}/{total_entries}: {title} ({year}) - ID: {legislation_id}")
397
+
398
+ # Clean the legislation text
399
+ cleaned_text = clean_text(raw_text)
400
+
401
+ # Chunk the text if it's too long
402
+ chunks = chunk_text(cleaned_text)
403
+
404
+ print(f" - Text length: {len(raw_text)} characters")
405
+ print(f" - Number of chunks: {len(chunks)}")
406
+
407
+ # Process each chunk
408
+ for chunk_id, chunk in enumerate(chunks):
409
+ # Create prompt for this chunk
410
+ system_prompt = create_system_prompt(chunk)
411
+ full_prompt = generate_chat_template(system_prompt)
412
+
413
+ # Generate response
414
+ response = generate_response(model, full_prompt)
415
+
416
+ # Print response for monitoring
417
+ print(f"\nπŸ“ **Generated Analysis for {title} (Chunk {chunk_id + 1}/{len(chunks)})**:")
418
+ print(f" Response length: {len(response)} characters")
419
+
420
+ # Show preview of the analysis
421
+ preview = response.replace('\n', ' ').strip()[:300] # Truncate the preview so the console output stays readable
422
+ print(f" Preview: {preview}")
423
+
424
+ # Check for key analysis elements
425
+ has_reasoning = '<start_working_out>' in response or 'reasoning' in response.lower()
426
+ has_loopholes = 'loophole' in response.lower() or 'ambiguity' in response.lower() or 'issue' in response.lower()
427
+ has_recommendations = 'recommend' in response.lower() or 'suggest' in response.lower()
428
+
429
+ print(f" Analysis quality: {'βœ…' if has_reasoning else '❌'} Reasoning | {'βœ…' if has_loopholes else '❌'} Loopholes | {'βœ…' if has_recommendations else '❌'} Recommendations")
430
+
431
+ # Add to dataset with metadata
432
+ dataset_entry = {
433
+ "prompt": full_prompt,
434
+ "response": response,
435
+ "legislation_id": legislation_id,
436
+ "title": title,
437
+ "year": year,
438
+ "chunk_id": chunk_id,
439
+ "total_chunks": len(chunks),
440
+ "text_length": len(chunk),
441
+ "original_text_length": len(raw_text)
442
+ }
443
+
444
+ # Save entry immediately to temporary file (JSON Lines format)
445
+ json.dump(dataset_entry, temp_f, ensure_ascii=False)
446
+ temp_f.write('\n')
447
+ temp_f.flush() # Force write to disk
448
+
449
+ dataset.append(dataset_entry)
450
+ saved_count += 1
451
+
452
+ # Progress update every 10 entries
453
+ if saved_count % 10 == 0:
454
+ print(f" βœ“ Saved {saved_count} entries so far...")
455
+
456
+ print(f"\nβœ“ All entries processed and saved to temporary file")
457
+ print(f"βœ“ Total entries saved: {saved_count}")
458
+
459
+ # Create backup of existing file if it exists
460
+ if os.path.exists(output_file):
461
+ print(f"Creating backup of existing dataset...")
462
+ os.rename(output_file, backup_file)
463
+
464
+ # Convert JSON Lines to final JSON format
465
+ print(f"Converting to final JSON format...")
466
+ with open(temp_file, 'r', encoding='utf-8') as temp_f:
467
+ lines = temp_f.readlines()
468
+
469
+ final_dataset = []
470
+ for line in lines:
471
+ if line.strip():
472
+ final_dataset.append(json.loads(line))
473
+
474
+ # Save final consolidated JSON file
475
+ with open(output_file, 'w', encoding='utf-8') as f:
476
+ json.dump(final_dataset, f, indent=2, ensure_ascii=False)
477
+
478
+ print(f"βœ“ Final dataset saved to: {output_file}")
479
+
480
+ # Clean up temporary file
481
+ if os.path.exists(temp_file):
482
+ os.remove(temp_file)
483
+ print(f"βœ“ Temporary file cleaned up")
484
+
485
+ # Clean up backup file if everything succeeded
486
+ if os.path.exists(backup_file):
487
+ os.remove(backup_file)
488
+ print(f"βœ“ Backup file cleaned up")
489
+
490
+ print(f"\nπŸŽ‰ Dataset creation complete!")
491
+ print(f" β€’ Processed {total_entries} legislation documents")
492
+ print(f" β€’ Generated {len(final_dataset)} analysis entries")
493
+ print(f" β€’ Total chunks processed: {sum(entry.get('total_chunks', 1) for entry in final_dataset[:total_entries])}")
494
+
495
+ return final_dataset
496
+
497
+ except KeyboardInterrupt:
498
+ print(f"\n⚠️ Process interrupted by user")
499
+ print(f" β€’ Partial dataset saved to: {temp_file}")
500
+ print(f" β€’ {saved_count} entries saved so far")
501
+ print(f" β€’ You can resume processing or use the temporary file")
502
+ raise
503
+
504
+ except Exception as e:
505
+ print(f"\n❌ Error during processing: {e}")
506
+ print(f" β€’ Partial dataset saved to: {temp_file}")
507
+ print(f" β€’ {saved_count} entries saved so far")
508
+ if os.path.exists(backup_file):
509
+ print(f" β€’ Original dataset restored from backup")
510
+ os.rename(backup_file, output_file)
511
+ raise
512
+
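+ # Shape of one saved dataset entry (values below are illustrative placeholders):
+ # {
+ #   "prompt": "<|im_start|>system ...",      # full chat-formatted prompt for the chunk
+ #   "response": "<start_working_out> ...",   # model-generated analysis
+ #   "legislation_id": "act-0000-001",
+ #   "title": "Example Act",
+ #   "year": "1999",
+ #   "chunk_id": 0,
+ #   "total_chunks": 3,
+ #   "text_length": 4096,
+ #   "original_text_length": 11500
+ # }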
513
+ def main():
514
+ """Main execution function"""
515
+ print("Starting NZ Legislation Loophole Analysis Dataset Creation")
516
+ print("=" * 60)
517
+
518
+ # Load the model
519
+ model = load_model()
520
+
521
+ # Create the dataset
522
+ dataset = create_finetuning_dataset(INPUT_FILE, model)
523
+
524
+ # Cleanup
525
+ if hasattr(model, 'close'):
526
+ model.close()
527
+
528
+ print("\nDataset creation completed successfully!")
529
+ print(f"Output saved to: {os.path.join(OUTPUT_DIR, 'nz_legislation_loophole_dataset.json')}")
530
+
531
+ if __name__ == "__main__":
532
+ main()