Princess3 committed on
Commit c089ca4 · verified · 1 Parent(s): 99253d3

Upload 25 files
.dockerignore ADDED
@@ -0,0 +1,78 @@
1
+ # Version control
2
+ .git
3
+ .gitignore
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ *.so
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # Virtual environments
28
+ .env
29
+ .venv
30
+ env/
31
+ venv/
32
+ ENV/
33
+ env.bak/
34
+ venv.bak/
35
+
36
+ # IDEs
37
+ .vscode/
38
+ .idea/
39
+ *.swp
40
+ *.swo
41
+ *~
42
+
43
+ # OS
44
+ .DS_Store
45
+ .DS_Store?
46
+ ._*
47
+ .Spotlight-V100
48
+ .Trashes
49
+ ehthumbs.db
50
+ Thumbs.db
51
+
52
+ # Documentation (will be copied if needed)
53
+ *.md
54
+ !README_Streamlit_App.md
55
+
56
+ # Test files (will be copied if needed)
57
+ test_app_imports.py
58
+
59
+ # Original CLI script (replaced by Streamlit app)
60
+ trl.py
61
+ trl copy.py
62
+
63
+ # Cache and temporary files
64
+ *.log
65
+ .cache
66
+ .temp
67
+
68
+ # Model files (will be mounted or downloaded at runtime)
69
+ *.gguf
70
+ *.bin
71
+
72
+ # Node modules (if any)
73
+ node_modules/
74
+
75
+ # Docker files
76
+ Dockerfile
77
+ docker-compose.yml
78
+ .dockerignore
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ nz-legislation.txt filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,20 +1,46 @@
- FROM python:3.13.5-slim
-
- WORKDIR /app
+ # Use Python 3.11 slim as base image for the NZ Legislation Loophole Analysis Streamlit App
+ FROM python:3.11-slim

+ # Install system dependencies required for llama-cpp-python compilation and general app functionality
+ # (curl is kept so the container healthcheck in docker-compose.yml can run)
  RUN apt-get update && apt-get install -y \
      build-essential \
+     cmake \
      curl \
      git \
      && rm -rf /var/lib/apt/lists/*

- COPY requirements.txt ./
- COPY src/ ./src/
-
- RUN pip3 install -r requirements.txt
-
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements file and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the entire Streamlit application
+ COPY streamlit_app/ ./streamlit_app/
+
+ # Copy data files (if needed for testing or default data)
+ COPY nz-legislation.txt ./
+
+ # Create necessary directories for the Streamlit app
+ RUN mkdir -p \
+     streamlit_app/cache \
+     streamlit_app/config \
+     streamlit_app/datasets \
+     streamlit_app/logs \
+     streamlit_app/uploads \
+     nz_legislation_dataset

+ # Set environment variables for Streamlit
+ ENV STREAMLIT_SERVER_HEADLESS=true
+ ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
+ ENV STREAMLIT_SERVER_PORT=8501
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0

+ # Expose the Streamlit port
  EXPOSE 8501

- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+
+ # Set working directory to the Streamlit app
+ WORKDIR /app/streamlit_app
+
+ # Set the default command to run the Streamlit application
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README_Docker.md ADDED
@@ -0,0 +1,305 @@
1
+ # Docker Setup for NZ Legislation Loophole Analysis Streamlit App
2
+
3
+ This guide explains how to run the NZ Legislation Loophole Analysis Streamlit App using Docker.
4
+
5
+ ## πŸ“‹ Prerequisites
6
+
7
+ - Docker installed on your system
8
+ - Docker Compose (recommended for easier management)
9
+ - At least 4GB of available RAM (8GB recommended for optimal performance)
10
+
11
+ ## πŸš€ Quick Start
12
+
13
+ ### Method 1: Using Docker Compose (Recommended)
14
+
15
+ ```bash
16
+ # Clone or navigate to the project directory
17
+ cd /path/to/nz-legislation-analyzer
18
+
19
+ # Build and run the application
20
+ docker-compose up --build
21
+
22
+ # Or run in detached mode
23
+ docker-compose up -d --build
24
+ ```
25
+
26
+ The application will be available at: **http://localhost:8501**
27
+
28
+ ### Method 2: Using Docker Directly
29
+
30
+ ```bash
31
+ # Build the Docker image
32
+ docker build -t nz-legislation-analyzer .
33
+
34
+ # Run the container
35
+ docker run -p 8501:8501 \
36
+ -v $(pwd)/streamlit_app/cache:/app/streamlit_app/cache \
37
+ -v $(pwd)/streamlit_app/config:/app/streamlit_app/config \
38
+ -v $(pwd)/streamlit_app/datasets:/app/streamlit_app/datasets \
39
+ -v $(pwd)/nz-legislation.txt:/app/nz-legislation.txt:ro \
40
+ nz-legislation-analyzer
41
+ ```
42
+
43
+ ## πŸ“ Directory Structure
44
+
45
+ When using Docker, the following directories are created and can be persisted:
46
+
47
+ ```
+ 📁 streamlit_app/
+ ├── 🧠 cache/       # Persistent cache for processed chunks
+ ├── ⚙️ config/      # Application configuration files
+ ├── 📊 datasets/    # Generated datasets and results
+ ├── 📝 logs/        # Application logs
+ └── 📤 uploads/     # Uploaded files (if any)
+ ```
55
+
56
+ ## πŸ› οΈ Configuration
57
+
58
+ ### Environment Variables
59
+
60
+ | Variable | Default | Description |
61
+ |----------|---------|-------------|
62
+ | `STREAMLIT_SERVER_HEADLESS` | `true` | Run in headless mode |
63
+ | `STREAMLIT_SERVER_PORT` | `8501` | Streamlit server port |
64
+ | `STREAMLIT_SERVER_ADDRESS` | `0.0.0.0` | Server bind address |
65
+
66
+ ### Volume Mounts
67
+
68
+ The Docker setup includes the following volume mounts for data persistence:
69
+
70
+ - `./streamlit_app/cache:/app/streamlit_app/cache` - Cache persistence
71
+ - `./streamlit_app/config:/app/streamlit_app/config` - Configuration files
72
+ - `./streamlit_app/datasets:/app/streamlit_app/datasets` - Generated datasets
73
+ - `./streamlit_app/logs:/app/streamlit_app/logs` - Application logs
74
+ - `./nz-legislation.txt:/app/nz-legislation.txt:ro` - Input data (read-only)
75
+
76
+ ## πŸ”§ Docker Commands
77
+
78
+ ### Building the Image
79
+
80
+ ```bash
81
+ # Build with no cache
82
+ docker build --no-cache -t nz-legislation-analyzer .
83
+
84
+ # Build with specific Dockerfile
85
+ docker build -f Dockerfile -t nz-legislation-analyzer .
86
+ ```
87
+
88
+ ### Running the Container
89
+
90
+ ```bash
91
+ # Interactive mode
92
+ docker run -it --rm -p 8501:8501 nz-legislation-analyzer
93
+
94
+ # Background mode
95
+ docker run -d -p 8501:8501 nz-legislation-analyzer
96
+
97
+ # With custom environment variables
98
+ docker run -p 8501:8501 \
99
+ -e STREAMLIT_SERVER_PORT=8502 \
100
+ nz-legislation-analyzer
101
+ ```
102
+
103
+ ### Docker Compose Commands
104
+
105
+ ```bash
106
+ # Start services
107
+ docker-compose up
108
+
109
+ # Start in background
110
+ docker-compose up -d
111
+
112
+ # Stop services
113
+ docker-compose down
114
+
115
+ # Rebuild and start
116
+ docker-compose up --build
117
+
118
+ # View logs
119
+ docker-compose logs -f
120
+
121
+ # Scale services (requires removing the fixed host-port mapping from docker-compose.yml first)
122
+ docker-compose up -d --scale nz-legislation-analyzer=2
123
+ ```
124
+
125
+ ## πŸ“Š Monitoring and Logs
126
+
127
+ ### Viewing Logs
128
+
129
+ ```bash
130
+ # Docker Compose logs
131
+ docker-compose logs -f nz-legislation-analyzer
132
+
133
+ # Docker logs
134
+ docker logs -f <container_id>
135
+
136
+ # Follow logs in real-time
137
+ docker-compose logs -f --tail=100
138
+ ```
139
+
140
+ ### Health Checks
141
+
142
+ The Docker Compose setup includes health checks that monitor the Streamlit application:
143
+
144
+ ```yaml
145
+ healthcheck:
146
+ test: ["CMD", "curl", "-f", "http://localhost:8501/healthz"]
147
+ interval: 30s
148
+ timeout: 10s
149
+ retries: 3
150
+ start_period: 40s
151
+ ```
152
+
153
+ ## πŸ” Troubleshooting
154
+
155
+ ### Common Issues
156
+
157
+ 1. **Port Already in Use**
158
+ ```bash
159
+ # Change the port mapping
160
+ docker run -p 8502:8501 nz-legislation-analyzer
161
+ # Or with docker-compose, modify the ports section
162
+ ```
163
+
164
+ 2. **Memory Issues**
165
+ ```bash
166
+ # Increase Docker memory allocation
167
+ # Docker Desktop: Settings > Resources > Memory
168
+ # Or add memory limits to docker-compose.yml
169
+ ```
170
+
171
+ 3. **Model Loading Errors**
172
+ - Ensure sufficient RAM (8GB+ recommended)
173
+ - Check that model files are accessible
174
+ - Verify model path in configuration
175
+
176
+ 4. **Permission Issues**
177
+ ```bash
178
+ # Fix directory permissions
179
+ sudo chown -R $USER:$USER streamlit_app/
180
+ ```
181
+
182
+ 5. **Cache Issues**
183
+ ```bash
184
+ # Clear persistent cache
185
+ sudo rm -rf streamlit_app/cache/*
186
+ docker-compose restart
187
+ ```
188
+
189
+ ### Debug Mode
190
+
191
+ Enable debug logging by modifying the environment:
192
+
193
+ ```bash
194
+ # Add to docker-compose.yml environment section
195
+ - PYTHONPATH=/app
196
+ - LOG_LEVEL=DEBUG
197
+ ```
198
+
199
+ ## πŸ”„ Updates and Maintenance
200
+
201
+ ### Updating the Application
202
+
203
+ ```bash
204
+ # Pull latest changes
205
+ git pull
206
+
207
+ # Rebuild the image
208
+ docker-compose build --no-cache
209
+
210
+ # Restart services
211
+ docker-compose up -d
212
+ ```
213
+
214
+ ### Backup Important Data
215
+
216
+ ```bash
217
+ # Backup cache and configuration
218
+ tar -czf backup.tar.gz streamlit_app/cache/ streamlit_app/config/
219
+
220
+ # Backup datasets
221
+ tar -czf datasets_backup.tar.gz streamlit_app/datasets/
222
+ ```
223
+
224
+ ### Cleaning Up
225
+
226
+ ```bash
227
+ # Remove containers and volumes
228
+ docker-compose down -v
229
+
230
+ # Remove images
231
+ docker rmi nz-legislation-analyzer
232
+
233
+ # Clean up unused Docker resources
234
+ docker system prune -a
235
+ ```
236
+
237
+ ## πŸ—οΈ Advanced Configuration
238
+
239
+ ### Custom Model Files
240
+
241
+ To use custom model files:
242
+
243
+ 1. **Mount model directory:**
244
+ ```yaml
245
+ volumes:
246
+ - ./models:/app/models:ro
247
+ ```
248
+
249
+ 2. **Update configuration** in the Streamlit app to point to `/app/models/your-model.gguf`
250
+
251
+ ### GPU Support (Optional)
252
+
253
+ For GPU acceleration with CUDA:
254
+
255
+ ```dockerfile
256
+ # Use CUDA-enabled base image
257
+ FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
258
+
259
+ # Install Python and dependencies
260
+ # ... (additional setup for CUDA)
261
+ ```
262
+
263
+ Note: GPU support requires additional configuration and CUDA-compatible hardware.
264
+
265
+ ## πŸ” Security Considerations
266
+
267
+ - The application runs in headless mode by default
268
+ - All data is stored locally in mounted volumes
269
+ - No external network access is required for basic functionality
270
+ - Consider implementing authentication for production deployments
271
+
272
+ ## πŸ“ˆ Performance Optimization
273
+
274
+ ### Memory Management
275
+
276
+ - Default cache size: 1024MB (configurable in app settings)
277
+ - Adjust based on available system memory
278
+ - Monitor memory usage through the app's Performance dashboard
279
+
280
+ ### Disk I/O
281
+
282
+ - Use SSD storage for better performance
283
+ - Ensure adequate disk space for cache and datasets
284
+ - Consider using tmpfs for temporary processing
285
+
286
+ ### Network
287
+
288
+ - The application binds to all interfaces (`0.0.0.0`)
289
+ - Access via `localhost` or container IP
290
+ - No external dependencies required
291
+
292
+ ## πŸ†˜ Support
293
+
294
+ For Docker-specific issues:
295
+
296
+ 1. Check Docker logs: `docker-compose logs`
297
+ 2. Verify Docker installation and version
298
+ 3. Ensure adequate system resources
299
+ 4. Review the main application logs in `streamlit_app/logs/`
300
+
301
+ For application-specific issues, refer to the main documentation in `README_Streamlit_App.md`.
302
+
303
+ ---
304
+
305
+ **πŸŽ‰ Happy analyzing with your containerized NZ Legislation Loophole Analysis Streamlit App!**
README_Streamlit_App.md ADDED
@@ -0,0 +1,300 @@
1
+ # NZ Legislation Loophole Analysis Streamlit App
2
+
3
+ A modern, AI-powered web application for analyzing New Zealand legislation to identify potential loopholes, ambiguities, and unintended consequences.
4
+
5
+ ## 🌟 Features
6
+
7
+ ### πŸ€– AI-Powered Analysis
8
+ - **Legal Expertise**: Specialized analysis for NZ legislation with Treaty of Waitangi references
9
+ - **Multiple Analysis Types**: Standard, Detailed, and Comprehensive analysis modes
10
+ - **Intelligent Chunking**: Sentence-aware text splitting with overlap for context preservation
11
+
12
+ ### 🧠 Context Memory Cache System
13
+ - **Smart Caching**: Hash-based chunk identification prevents re-processing identical content (see the sketch after this list)
14
+ - **Multi-level Storage**: In-memory LRU cache with optional SQLite persistence
15
+ - **Performance Boost**: Significant speed improvements for large documents and batch processing
16
+ - **Cache Management**: View statistics, export/import cache, and set TTL limits
17
+
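+ The hash-based keys behind *Smart Caching* can be pictured roughly as follows. This is a minimal sketch, not the actual `CacheManager` implementation; it assumes the key is derived from the chunk text plus the model and processing settings, so changing either produces a new key and a fresh analysis:
+
+ ```python
+ import hashlib
+ import json
+
+ def make_cache_key(chunk: str, model_config: dict, processing_config: dict) -> str:
+     """Derive a stable identifier for a chunk/configuration pair (illustrative only)."""
+     payload = json.dumps(
+         {"chunk": chunk, "model": model_config, "processing": processing_config},
+         sort_keys=True,
+     )
+     return hashlib.sha256(payload.encode("utf-8")).hexdigest()
+
+ # Identical text analysed with identical settings maps to the same key: a cache hit.
+ key = make_cache_key("5 Interpretation ...", {"path": "model.gguf"}, {"chunk_size": 4096})
+ ```
+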
18
+ ### 🎨 Modern Web Interface
19
+ - **Multi-page Layout**: Organized navigation with Home, Upload, Analysis, Settings, and Performance pages
20
+ - **Real-time Progress**: Live progress bars and processing status updates
21
+ - **Interactive Dashboards**: Performance metrics, cache statistics, and analysis results
22
+ - **Responsive Design**: Works on desktop and mobile devices
23
+
24
+ ### πŸ“Š Advanced Analytics
25
+ - **Quality Metrics**: Confidence scoring and analysis quality assessment
26
+ - **Performance Monitoring**: Memory usage, CPU utilization, and processing times
27
+ - **Batch Processing**: Handle multiple legislation files simultaneously
28
+ - **Export Options**: Multiple formats (JSON, CSV, Excel) with metadata
29
+
30
+ ## πŸš€ Quick Start
31
+
32
+ ### Prerequisites
33
+ ```bash
34
+ # Python 3.8 or higher
35
+ python --version
36
+
37
+ # Install dependencies
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ ### Running the Application
42
+ ```bash
43
+ # Method 1: Use the run script (recommended)
44
+ python run_streamlit_app.py
45
+
46
+ # Method 2: Direct Streamlit command
47
+ cd streamlit_app
48
+ streamlit run app.py
49
+ ```
50
+
51
+ The app will be available at: **http://localhost:8501**
52
+
53
+ ## πŸ“ Project Structure
54
+
55
+ ```
56
+ streamlit_app/
57
+ β”œβ”€β”€ app.py # Main Streamlit application
58
+ β”œβ”€β”€ core/
59
+ β”‚ β”œβ”€β”€ cache_manager.py # Context memory cache system
60
+ β”‚ β”œβ”€β”€ text_processor.py # Text cleaning and chunking
61
+ β”‚ β”œβ”€β”€ llm_analyzer.py # LLM integration and analysis
62
+ β”‚ └── dataset_builder.py # Dataset creation and export
63
+ β”œβ”€β”€ utils/
64
+ β”‚ β”œβ”€β”€ config.py # Configuration management
65
+ β”‚ β”œβ”€β”€ performance.py # Performance monitoring
66
+ β”‚ └── ui_helpers.py # UI components and formatting
67
+ β”œβ”€β”€ pages/ # Multi-page navigation
68
+ β”œβ”€β”€ assets/ # Custom styling and assets
69
+ └── cache/ # Cache storage directory
70
+ ```
71
+
72
+ ## πŸ› οΈ Configuration
73
+
74
+ ### Model Configuration
75
+ The app supports both local GGUF models and HuggingFace models:
76
+
77
+ ```python
78
+ # Local model
79
+ model_path = "path/to/your/model.gguf"
80
+
81
+ # HuggingFace model
82
+ repo_id = "DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF"
83
+ filename = "model-file-name.gguf"
84
+ ```
85
+
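+ Under the hood the app depends on `llama-cpp-python` (see `requirements.txt`). As a rough sketch, and not necessarily how `LLMAnalyzer` loads models internally, either variant above could be opened like this; `n_ctx` and the filename are placeholders, and `Llama.from_pretrained` needs a recent `llama-cpp-python` plus `huggingface_hub`:
+
+ ```python
+ from llama_cpp import Llama
+
+ # Local GGUF file
+ llm = Llama(model_path="path/to/your/model.gguf", n_ctx=8192)
+
+ # Or pull a GGUF file from a HuggingFace repository
+ llm = Llama.from_pretrained(
+     repo_id="DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF",
+     filename="model-file-name.gguf",  # placeholder filename from the example above
+     n_ctx=8192,
+ )
+ ```
+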
86
+ ### Cache Configuration
87
+ ```python
88
+ cache_config = {
89
+ 'enabled': True, # Enable/disable caching
90
+ 'max_size_mb': 1024, # Maximum memory for cache
91
+ 'ttl_hours': 24, # Time-to-live for cached entries
92
+ 'persistent': True # Use disk persistence
93
+ }
94
+ ```
95
+
96
+ ### Processing Configuration
97
+ ```python
98
+ processing_config = {
99
+ 'chunk_size': 4096, # Size of text chunks
100
+ 'chunk_overlap': 256, # Overlap between chunks
101
+ 'batch_size': 16, # Number of chunks to process at once
102
+ 'clean_text': True # Apply text cleaning
103
+ }
104
+ ```
105
+
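+ To make the `chunk_size` / `chunk_overlap` settings concrete, here is a minimal character-based sketch of overlapping chunking. The app's `TextProcessor` is sentence-aware, so treat this as an approximation of the idea rather than its implementation:
+
+ ```python
+ from typing import List
+
+ def chunk_with_overlap(text: str, chunk_size: int = 4096, overlap: int = 256) -> List[str]:
+     """Split text into fixed-size windows that share `overlap` characters of context."""
+     if overlap >= chunk_size:
+         raise ValueError("overlap must be smaller than chunk_size")
+     step = chunk_size - overlap
+     return [text[i:i + chunk_size] for i in range(0, max(len(text) - overlap, 1), step)]
+
+ chunks = chunk_with_overlap("...long legislation text...", chunk_size=4096, overlap=256)
+ ```
+
+ Each chunk repeats the last 256 characters of the previous one, which is what preserves context across chunk boundaries during analysis.
+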
106
+ ## πŸ“– Usage Guide
107
+
108
+ ### 1. Home Page
109
+ - Overview of the application capabilities
110
+ - Current configuration status
111
+ - Quick start guide
112
+
113
+ ### 2. Upload & Process Page
114
+ - **File Upload**: Support for JSON lines, JSON arrays, and raw text files
115
+ - **Configuration**: Adjust model, processing, and analysis parameters
116
+ - **Batch Processing**: Upload multiple files for simultaneous analysis
117
+ - **Real-time Progress**: Monitor processing status and performance
118
+
119
+ ### 3. Analysis Results Page
120
+ - **Results Overview**: Summary metrics and statistics
121
+ - **Detailed Analysis**: Expandable results with confidence scores
122
+ - **Export Options**: Download results in multiple formats
123
+ - **Quality Metrics**: Analysis quality assessment and recommendations
124
+
125
+ ### 4. Settings Page
126
+ - **Model Settings**: Configure LLM parameters and model paths
127
+ - **Processing Settings**: Adjust text processing parameters
128
+ - **Cache Settings**: Manage cache behavior and persistence
129
+ - **UI Settings**: Customize interface appearance
130
+
131
+ ### 5. Performance Dashboard
132
+ - **Real-time Metrics**: Memory usage, CPU utilization, processing speed
133
+ - **Performance History**: Charts showing performance over time
134
+ - **Cache Statistics**: Hit rates, evictions, and cache efficiency
135
+ - **System Information**: Hardware and software details
136
+ - **Performance Recommendations**: Automated suggestions for optimization
137
+
138
+ ## πŸ”§ Advanced Features
139
+
140
+ ### Cache Management
141
+ ```python
142
+ from core.cache_manager import get_cache_manager
143
+
144
+ # Get cache instance
145
+ cache = get_cache_manager()
146
+
147
+ # View statistics
148
+ stats = cache.get_stats()
149
+ print(f"Hit Rate: {stats['hit_rate']:.1f}%")
150
+
151
+ # Clear cache
152
+ cache.clear_cache()
153
+
154
+ # Export cache
155
+ cache.export_cache('cache_backup.json')
156
+ ```
157
+
158
+ ### Custom Analysis Templates
159
+ The app supports custom analysis templates for different legal domains:
160
+
161
+ ```python
162
+ # Define custom template
163
+ custom_template = {
164
+ 'name': 'Commercial Law Analysis',
165
+ 'depth': 'Detailed',
166
+ 'focus_areas': [
167
+ 'contractual loopholes',
168
+ 'commercial implications',
169
+ 'regulatory compliance',
170
+ 'enforcement mechanisms'
171
+ ]
172
+ }
173
+ ```
174
+
175
+ ### Performance Optimization
176
+ - **Memory Management**: Automatic cache eviction based on memory limits
177
+ - **Batch Processing**: Optimized for large document collections
178
+ - **Concurrent Processing**: Thread-safe operations for multi-user scenarios
179
+ - **Progress Callbacks**: Real-time progress updates during long operations
180
+
181
+ ## πŸ“Š API Reference
182
+
183
+ ### Core Classes
184
+
185
+ #### CacheManager
186
+ ```python
187
+ class CacheManager:
188
+ def get(self, content, model_config, processing_config) -> Optional[Dict]
189
+ def put(self, content, analysis_result, model_config, processing_config)
190
+ def get_stats(self) -> Dict[str, Any]
191
+ def clear_cache(self)
192
+ def export_cache(self, filepath: str) -> bool
193
+ def import_cache(self, filepath: str) -> int
194
+ ```
195
+
196
+ #### TextProcessor
197
+ ```python
198
+ class TextProcessor:
199
+ def clean_text(self, text: str, preserve_structure: bool = True) -> str
200
+ def chunk_text(self, text: str, chunk_size: int = 4096, overlap: int = 256) -> List[str]
201
+ def extract_metadata(self, text: str) -> Dict[str, Any]
202
+ def preprocess_legislation_json(self, json_data: Dict) -> Dict
203
+ ```
204
+
205
+ #### LLMAnalyzer
206
+ ```python
207
+ class LLMAnalyzer:
208
+ def analyze_chunk(self, chunk: str, analysis_type: str = 'standard') -> Dict[str, Any]
209
+ def batch_analyze_chunks(self, chunks: List[str], analysis_type: str = 'standard') -> List[Dict]
210
+ def load_model(self) -> bool
211
+ def unload_model(self)
212
+ ```
213
+
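+ A rough end-to-end sketch of how these classes could be combined, based only on the signatures above. Constructor arguments, configuration shapes, the input file name, and the result keys (taken from the output format below) are assumptions:
+
+ ```python
+ from core.cache_manager import CacheManager
+ from core.text_processor import TextProcessor
+ from core.llm_analyzer import LLMAnalyzer
+
+ cache = CacheManager()
+ processor = TextProcessor()
+ analyzer = LLMAnalyzer()
+ analyzer.load_model()
+
+ model_config = {"path": "model.gguf"}                           # assumed shape
+ processing_config = {"chunk_size": 4096, "chunk_overlap": 256}  # assumed shape
+
+ text = processor.clean_text(open("act.txt", encoding="utf-8").read())  # hypothetical input file
+ for chunk in processor.chunk_text(text, chunk_size=4096, overlap=256):
+     result = cache.get(chunk, model_config, processing_config)
+     if result is None:                                           # cache miss: run the model
+         result = analyzer.analyze_chunk(chunk, analysis_type="standard")
+         cache.put(chunk, result, model_config, processing_config)
+     print(result.get("structured_analysis", {}).get("critical_loopholes", []))
+ ```
+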
214
+ ## πŸ” Analysis Output Format
215
+
216
+ Each analysis result contains:
217
+
218
+ ```json
219
+ {
220
+ "chunk": "original text chunk",
221
+ "analysis_type": "standard|detailed|comprehensive",
222
+ "model_config": {...},
223
+ "structured_analysis": {
224
+ "text_meaning": "explanation of text purpose",
225
+ "key_assumptions": ["list of assumptions"],
226
+ "exploitable_interpretations": ["potential interpretations"],
227
+ "critical_loopholes": ["identified loopholes"],
228
+ "circumvention_strategies": ["exploitation methods"],
229
+ "recommendations": ["suggested fixes"],
230
+ "confidence_score": 85,
231
+ "analysis_quality": "high|medium|low"
232
+ },
233
+ "processing_time": 2.34,
234
+ "chunk_size": 4096,
235
+ "word_count": 512
236
+ }
237
+ ```
238
+
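+ As an example of working with exported results, a small post-processing script over a JSON export might look like the following. The file name, and the assumption that the export is a list of result objects in the shape shown above, are hypothetical:
+
+ ```python
+ import json
+
+ with open("analysis_results.json", encoding="utf-8") as fh:  # hypothetical export path
+     results = json.load(fh)                                   # assumed: a list of result objects
+
+ flagged = [
+     r for r in results
+     if r["structured_analysis"]["confidence_score"] >= 80
+     and r["structured_analysis"]["critical_loopholes"]
+ ]
+ for r in flagged:
+     print(r["structured_analysis"]["critical_loopholes"])
+ ```
+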
239
+ ## πŸ› Troubleshooting
240
+
241
+ ### Common Issues
242
+
243
+ 1. **Model Loading Errors**
244
+ - Ensure model file exists and is accessible
245
+ - Check model format (GGUF required)
246
+ - Verify sufficient RAM for model loading
247
+
248
+ 2. **Cache Performance Issues**
249
+ - Clear cache if memory usage is high
250
+ - Adjust cache size limits in settings
251
+ - Check persistent cache database integrity
252
+
253
+ 3. **Processing Slowdowns**
254
+ - Reduce batch size for large documents
255
+ - Increase chunk overlap for better context
256
+ - Consider using a more powerful model
257
+
258
+ 4. **Memory Errors**
259
+ - Reduce cache size in settings
260
+ - Process files individually instead of batch
261
+ - Monitor memory usage in performance dashboard
262
+
263
+ ### Debug Mode
264
+ Enable debug mode in settings for detailed logging:
265
+ ```python
266
+ # In settings, enable debug mode
267
+ debug_mode = True
268
+ log_level = "DEBUG"
269
+ ```
270
+
271
+ ## 🀝 Contributing
272
+
273
+ 1. Fork the repository
274
+ 2. Create a feature branch
275
+ 3. Make your changes
276
+ 4. Add tests if applicable
277
+ 5. Submit a pull request
278
+
279
+ ## πŸ“„ License
280
+
281
+ This project is licensed under the MIT License - see the LICENSE file for details.
282
+
283
+ ## πŸ†˜ Support
284
+
285
+ For support and questions:
286
+ - Check the troubleshooting section above
287
+ - Review the performance recommendations in the app
288
+ - Examine the logs in the `streamlit_app/logs/` directory
289
+
290
+ ## πŸ”„ Migration from Original Script
291
+
292
+ If you're migrating from the original `trl.py` script:
293
+
294
+ 1. **Configuration**: Settings are now managed through the UI
295
+ 2. **Output**: Results are displayed in the web interface
296
+ 3. **Caching**: Automatic caching with no manual intervention needed
297
+ 4. **Batch Processing**: Multiple files can be uploaded simultaneously
298
+ 5. **Progress Tracking**: Real-time progress bars and status updates
299
+
300
+ The new app maintains all functionality of the original script while providing a modern, user-friendly interface and significant performance improvements through intelligent caching.
docker-compose.yml ADDED
@@ -0,0 +1,28 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ nz-legislation-analyzer:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ ports:
9
+ - "8501:8501"
10
+ volumes:
11
+ # Mount directories for persistent data
12
+ - ./streamlit_app/cache:/app/streamlit_app/cache
13
+ - ./streamlit_app/config:/app/streamlit_app/config
14
+ - ./streamlit_app/datasets:/app/streamlit_app/datasets
15
+ - ./streamlit_app/logs:/app/streamlit_app/logs
16
+ - ./nz-legislation.txt:/app/nz-legislation.txt:ro
17
+ environment:
18
+ - STREAMLIT_SERVER_HEADLESS=true
19
+ - STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
20
+ - STREAMLIT_SERVER_PORT=8501
21
+ - STREAMLIT_SERVER_ADDRESS=0.0.0.0
22
+ restart: unless-stopped
23
+ healthcheck:
24
+ test: ["CMD", "curl", "-f", "http://localhost:8501/healthz"]
25
+ interval: 30s
26
+ timeout: 10s
27
+ retries: 3
28
+ start_period: 40s
nz-legislation.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e671ba88cfc0d52bf03dcc089e67c6f73fa1ce7680cef6cf860bab1b5809e8e1
3
+ size 112806614
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
- pandas
3
- streamlit
1
+ llama-cpp-python
2
+ psutil
3
+ numpy
4
+ streamlit>=1.28.0
5
+ streamlit-extras>=0.3.0
6
+ plotly>=5.15.0
7
+ pandas>=2.0.0
8
+ streamlit-aggrid>=0.3.0
9
+ streamlit-ace>=0.1.1
run_streamlit_app.py ADDED
@@ -0,0 +1,176 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ NZ Legislation Loophole Analysis Streamlit App Runner
4
+
5
+ This script runs the modern Streamlit application for analyzing New Zealand legislation
6
+ to identify potential loopholes, ambiguities, and unintended consequences using AI.
7
+
8
+ Features:
9
+ - Advanced UI with multi-page layout
10
+ - Context memory cache system for improved performance
11
+ - Real-time progress monitoring
12
+ - Interactive results visualization
13
+ - Batch processing capabilities
14
+ - Comprehensive configuration management
15
+
16
+ Usage:
17
+ python run_streamlit_app.py
18
+
19
+ Requirements:
20
+ - All dependencies from requirements.txt must be installed
21
+ - Run from the project root directory
22
+ """
23
+
24
+ import os
25
+ import sys
26
+ import subprocess
27
+ from pathlib import Path
28
+
29
+ def check_requirements():
30
+ """Check if all required packages are installed"""
31
+ required_packages = [
32
+ 'streamlit',
33
+ 'pandas',
34
+ 'plotly',
35
+ 'llama-cpp-python',
36
+ 'psutil',
37
+ 'numpy'
38
+ ]
39
+
40
+ missing_packages = []
41
+
42
+ for package in required_packages:
43
+ try:
44
+ __import__('llama_cpp' if package == 'llama-cpp-python' else package.replace('-', '_'))  # llama-cpp-python installs the 'llama_cpp' module
45
+ except ImportError:
46
+ missing_packages.append(package)
47
+
48
+ if missing_packages:
49
+ print("❌ Missing required packages:")
50
+ for package in missing_packages:
51
+ print(f" - {package}")
52
+
53
+ print("\nπŸ“¦ Installing missing packages...")
54
+ try:
55
+ subprocess.check_call([
56
+ sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'
57
+ ])
58
+ print("βœ… All packages installed successfully!")
59
+ except subprocess.CalledProcessError:
60
+ print("❌ Failed to install packages. Please install manually:")
61
+ print(" pip install -r requirements.txt")
62
+ return False
63
+
64
+ return True
65
+
66
+ def check_app_structure():
67
+ """Check if the app structure is correct"""
68
+ app_dir = Path('streamlit_app')
69
+ required_files = [
70
+ 'app.py',
71
+ 'core/cache_manager.py',
72
+ 'core/text_processor.py',
73
+ 'core/llm_analyzer.py',
74
+ 'core/dataset_builder.py',
75
+ 'utils/config.py',
76
+ 'utils/performance.py',
77
+ 'utils/ui_helpers.py'
78
+ ]
79
+
80
+ missing_files = []
81
+
82
+ for file_path in required_files:
83
+ full_path = app_dir / file_path
84
+ if not full_path.exists():
85
+ missing_files.append(str(full_path))
86
+
87
+ if missing_files:
88
+ print("❌ Missing app files:")
89
+ for file_path in missing_files:
90
+ print(f" - {file_path}")
91
+ return False
92
+
93
+ print("βœ… App structure is complete!")
94
+ return True
95
+
96
+ def create_directories():
97
+ """Create necessary directories"""
98
+ directories = [
99
+ 'streamlit_app/cache',
100
+ 'streamlit_app/config',
101
+ 'streamlit_app/datasets',
102
+ 'streamlit_app/logs'
103
+ ]
104
+
105
+ for dir_path in directories:
106
+ Path(dir_path).mkdir(parents=True, exist_ok=True)
107
+ print(f"πŸ“ Created directory: {dir_path}")
108
+
109
+ def setup_environment():
110
+ """Setup environment variables and configuration"""
111
+ # Add current directory to Python path for imports
112
+ current_dir = os.path.dirname(os.path.abspath(__file__))
113
+ if current_dir not in sys.path:
114
+ sys.path.insert(0, current_dir)
115
+
116
+ # Set environment variables
117
+ os.environ.setdefault('STREAMLIT_SERVER_HEADLESS', 'true')
118
+ os.environ.setdefault('STREAMLIT_BROWSER_GATHER_USAGE_STATS', 'false')
119
+
120
+ print("πŸ”§ Environment setup complete!")
121
+
122
+ def run_app():
123
+ """Run the Streamlit application"""
124
+ print("\nπŸš€ Starting NZ Legislation Loophole Analyzer...")
125
+ print("=" * 60)
126
+ print("πŸ“± Access the app at: http://localhost:8501")
127
+ print("πŸ›‘ Press Ctrl+C to stop the application")
128
+ print("=" * 60)
129
+
130
+ try:
131
+ # Change to app directory
132
+ os.chdir('streamlit_app')
133
+
134
+ # Run Streamlit
135
+ subprocess.run([
136
+ sys.executable, '-m', 'streamlit', 'run', 'app.py',
137
+ '--server.port', '8501',
138
+ '--server.address', '0.0.0.0',
139
+ '--theme.base', 'light'
140
+ ])
141
+
142
+ except KeyboardInterrupt:
143
+ print("\n\nπŸ‘‹ Application stopped by user")
144
+ except Exception as e:
145
+ print(f"\n❌ Error running application: {e}")
146
+ return False
147
+
148
+ return True
149
+
150
+ def main():
151
+ """Main function"""
152
+ print("πŸ›οΈ NZ Legislation Loophole Analysis Streamlit App")
153
+ print("=" * 60)
154
+
155
+ # Check requirements
156
+ if not check_requirements():
157
+ return 1
158
+
159
+ # Check app structure
160
+ if not check_app_structure():
161
+ return 1
162
+
163
+ # Create directories
164
+ create_directories()
165
+
166
+ # Setup environment
167
+ setup_environment()
168
+
169
+ # Run the app
170
+ if not run_app():
171
+ return 1
172
+
173
+ return 0
174
+
175
+ if __name__ == "__main__":
176
+ sys.exit(main())
streamlit_app/app.py ADDED
@@ -0,0 +1,732 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ NZ Legislation Loophole Analysis Streamlit App
4
+
5
+ A modern web interface for analyzing New Zealand legislation text to identify
6
+ potential loopholes, ambiguities, and unintended consequences using AI.
7
+
8
+ Features:
9
+ - Advanced UI with multi-page layout
10
+ - Context memory cache system for improved performance
11
+ - Real-time progress monitoring
12
+ - Interactive results visualization
13
+ - Batch processing capabilities
14
+ - Comprehensive configuration management
15
+ """
16
+
17
+ import streamlit as st
18
+ import sys
19
+ import os
20
+ from pathlib import Path
21
+
22
+ # Add the current directory to Python path for imports
23
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
24
+
25
+ # Import core modules
26
+ from core.cache_manager import CacheManager
27
+ from core.text_processor import TextProcessor
28
+ from core.llm_analyzer import LLMAnalyzer
29
+ from core.dataset_builder import DatasetBuilder
30
+ from utils.config import ConfigManager
31
+ from utils.ui_helpers import UIHelpers
32
+ from utils.performance import PerformanceMonitor
33
+
34
+ # Configure page settings
35
+ st.set_page_config(
36
+ page_title="NZ Legislation Loophole Analyzer",
37
+ page_icon="βš–οΈ",
38
+ layout="wide",
39
+ initial_sidebar_state="expanded",
40
+ menu_items={
41
+ 'Get Help': 'https://github.com/your-repo',
42
+ 'Report a bug': 'https://github.com/your-repo/issues',
43
+ 'About': '''
44
+ ## NZ Legislation Loophole Analyzer
45
+ A powerful AI tool for analyzing New Zealand legislation to identify
46
+ potential loopholes, ambiguities, and unintended consequences.
47
+
48
+ **Version:** 1.0.0
49
+ **Built with:** Streamlit, Llama.cpp, and advanced caching
50
+ '''
51
+ }
52
+ )
53
+
54
+ # Initialize session state
55
+ def initialize_session_state():
56
+ """Initialize all session state variables"""
57
+ if 'cache_manager' not in st.session_state:
58
+ st.session_state.cache_manager = CacheManager()
59
+
60
+ if 'config_manager' not in st.session_state:
61
+ st.session_state.config_manager = ConfigManager()
62
+
63
+ if 'performance_monitor' not in st.session_state:
64
+ st.session_state.performance_monitor = PerformanceMonitor()
65
+
66
+ if 'current_analysis' not in st.session_state:
67
+ st.session_state.current_analysis = None
68
+
69
+ if 'analysis_results' not in st.session_state:
70
+ st.session_state.analysis_results = []
71
+
72
+ if 'processing_status' not in st.session_state:
73
+ st.session_state.processing_status = {
74
+ 'is_running': False,
75
+ 'progress': 0,
76
+ 'current_task': '',
77
+ 'total_chunks': 0,
78
+ 'processed_chunks': 0
79
+ }
80
+
81
+ def main():
82
+ """Main application function"""
83
+ # Initialize session state
84
+ initialize_session_state()
85
+
86
+ # Create sidebar with navigation and status
87
+ with st.sidebar:
88
+ st.title("βš–οΈ NZ Legislation Analyzer")
89
+ st.markdown("---")
90
+
91
+ # Navigation
92
+ pages = {
93
+ "🏠 Home": "home",
94
+ "πŸ“€ Upload & Process": "upload",
95
+ "πŸ“Š Analysis Results": "results",
96
+ "βš™οΈ Settings": "settings",
97
+ "πŸ“ˆ Performance": "performance"
98
+ }
99
+
100
+ selected_page = st.selectbox(
101
+ "Navigate to:",
102
+ list(pages.keys()),
103
+ key="nav_select"
104
+ )
105
+
106
+ st.markdown("---")
107
+
108
+ # Cache status
109
+ with st.expander("🧠 Cache Status", expanded=True):
110
+ cache_stats = st.session_state.cache_manager.get_stats()
111
+ st.metric("Cache Hits", cache_stats['hits'])
112
+ st.metric("Cache Misses", cache_stats['misses'])
113
+ st.metric("Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
114
+ st.metric("Cached Chunks", cache_stats['entries'])
115
+
116
+ if st.button("Clear Cache", type="secondary"):
117
+ st.session_state.cache_manager.clear_cache()
118
+ st.rerun()
119
+
120
+ # Performance metrics
121
+ with st.expander("πŸ“Š Performance", expanded=True):
122
+ perf_stats = st.session_state.performance_monitor.get_stats()
123
+ st.metric("Memory Usage", f"{perf_stats.get('memory_usage_mb', 0):.1f} MB")
124
+ st.metric("Avg Processing Time", f"{perf_stats.get('avg_processing_time', 0):.2f} s")  # metric key assumed from PerformanceMonitor stats
125
+
126
+ # Processing status
127
+ if st.session_state.processing_status['is_running']:
128
+ with st.expander("πŸ”„ Processing Status", expanded=True):
129
+ st.progress(st.session_state.processing_status['progress'])
130
+ st.text(st.session_state.processing_status['current_task'])
131
+ st.text(f"Chunk {st.session_state.processing_status['processed_chunks']}/"
132
+ f"{st.session_state.processing_status['total_chunks']}")
133
+
134
+ # Main content area
135
+ page = pages[selected_page]
136
+
137
+ if page == "home":
138
+ show_home_page()
139
+ elif page == "upload":
140
+ show_upload_page()
141
+ elif page == "results":
142
+ show_results_page()
143
+ elif page == "settings":
144
+ show_settings_page()
145
+ elif page == "performance":
146
+ show_performance_page()
147
+
148
+ # Footer
149
+ st.markdown("---")
150
+ st.markdown(
151
+ """
152
+ <div style='text-align: center; color: #666; font-size: 12px;'>
153
+ NZ Legislation Loophole Analyzer v1.0.0 | Built with Streamlit & Llama.cpp
154
+ </div>
155
+ """,
156
+ unsafe_allow_html=True
157
+ )
158
+
159
+ def show_home_page():
160
+ """Display the home page with overview and quick start"""
161
+ st.title("🏠 NZ Legislation Loophole Analyzer")
162
+ st.markdown("### AI-Powered Legal Analysis Tool")
163
+
164
+ col1, col2 = st.columns([2, 1])
165
+
166
+ with col1:
167
+ st.markdown("""
168
+ This advanced tool analyzes New Zealand legislation to identify:
169
+
170
+ πŸ” **Potential Loopholes** - Legal ambiguities that could be exploited
171
+ πŸ“‹ **Unintended Consequences** - Hidden implications in legislative language
172
+ βš–οΈ **Ambiguities** - Vague or unclear legal provisions
173
+ 🎯 **Circumvention Strategies** - Ways legislation might be bypassed
174
+
175
+ **Key Features:**
176
+ - **Smart Caching**: Avoid re-processing identical content
177
+ - **Advanced UI**: Modern interface with real-time progress
178
+ - **Batch Processing**: Handle multiple legislation files
179
+ - **Performance Monitoring**: Track memory usage and processing speed
180
+ - **Export Options**: Multiple formats for analysis results
181
+ """)
182
+
183
+ st.markdown("### Quick Start")
184
+ st.markdown("""
185
+ 1. **Upload** your NZ legislation files (JSON lines or raw text)
186
+ 2. **Configure** analysis parameters and model settings
187
+ 3. **Process** the legislation with AI-powered analysis
188
+ 4. **Review** results with interactive visualizations
189
+ 5. **Export** findings in multiple formats
190
+ """)
191
+
192
+ with col2:
193
+ st.markdown("### Current Configuration")
194
+
195
+ config = st.session_state.config_manager.get_config()
196
+
197
+ # Model settings
198
+ st.subheader("πŸ€– Model Settings")
199
+ st.info(f"**Model:** {config['model']['path']}")
200
+ st.info(f"**Context Length:** {config['model']['context_length']}")
201
+ st.info(f"**Max Tokens:** {config['model']['max_tokens']}")
202
+
203
+ # Processing settings
204
+ st.subheader("βš™οΈ Processing")
205
+ st.info(f"**Chunk Size:** {config['processing']['chunk_size']}")
206
+ st.info(f"**Overlap:** {config['processing']['chunk_overlap']}")
207
+ st.info(f"**Batch Size:** {config['processing']['batch_size']}")
208
+
209
+ # Cache settings
210
+ st.subheader("🧠 Cache")
211
+ cache_stats = st.session_state.cache_manager.get_stats()
212
+ st.info(f"**Status:** {'Active' if cache_stats['enabled'] else 'Disabled'}")
213
+ st.info(f"**Hit Rate:** {cache_stats['hit_rate']:.1f}%")
214
+
215
+ if st.button("πŸš€ Start Analysis", type="primary", use_container_width=True):
216
+ st.switch_page("pages/1_upload.py")
217
+
218
+ def show_upload_page():
219
+ """Display the upload and processing page"""
220
+ st.title("πŸ“€ Upload & Process Legislation")
221
+
222
+ # File upload section
223
+ st.subheader("πŸ“ Upload Legislation Files")
224
+
225
+ col1, col2 = st.columns([1, 1])
226
+
227
+ with col1:
228
+ uploaded_files = st.file_uploader(
229
+ "Select NZ legislation files",
230
+ accept_multiple_files=True,
231
+ type=['json', 'txt', 'jsonl'],
232
+ help="Upload JSON lines format (.jsonl), JSON arrays (.json), or raw text (.txt) files"
233
+ )
234
+
235
+ if uploaded_files:
236
+ st.success(f"πŸ“„ {len(uploaded_files)} file(s) selected")
237
+
238
+ # Show file details
239
+ for file in uploaded_files:
240
+ with st.expander(f"πŸ“‹ {file.name}"):
241
+ st.write(f"**Size:** {file.size:,} bytes")
242
+ st.write(f"**Type:** {file.type}")
243
+
244
+ # Preview content
245
+ if file.type in ['text/plain', 'application/json']:
246
+ content = file.read().decode('utf-8')
247
+ st.text_area("Preview", content[:500] + "..." if len(content) > 500 else content,
248
+ height=100, disabled=True)
249
+ file.seek(0) # Reset file pointer
250
+
251
+ with col2:
252
+ # Processing configuration
253
+ st.subheader("βš™οΈ Processing Configuration")
254
+
255
+ config = st.session_state.config_manager.get_config()
256
+
257
+ # Model settings
258
+ with st.expander("πŸ€– Model Configuration", expanded=True):
259
+ model_path = st.text_input(
260
+ "Model Path",
261
+ value=config['model']['path'],
262
+ help="Path to your GGUF model file"
263
+ )
264
+
265
+ context_length = st.slider(
266
+ "Context Length",
267
+ min_value=1024,
268
+ max_value=65536,
269
+ value=config['model']['context_length'],
270
+ step=1024,
271
+ help="Maximum context length for the model"
272
+ )
273
+
274
+ max_tokens = st.slider(
275
+ "Max Response Tokens",
276
+ min_value=256,
277
+ max_value=4096,
278
+ value=config['model']['max_tokens'],
279
+ step=64,
280
+ help="Maximum tokens in model response"
281
+ )
282
+
283
+ # Text processing settings
284
+ with st.expander("πŸ“ Text Processing", expanded=True):
285
+ chunk_size = st.slider(
286
+ "Chunk Size",
287
+ min_value=512,
288
+ max_value=8192,
289
+ value=config['processing']['chunk_size'],
290
+ step=256,
291
+ help="Size of text chunks for processing"
292
+ )
293
+
294
+ chunk_overlap = st.slider(
295
+ "Chunk Overlap",
296
+ min_value=64,
297
+ max_value=1024,
298
+ value=config['processing']['chunk_overlap'],
299
+ step=32,
300
+ help="Overlap between chunks for context preservation"
301
+ )
302
+
303
+ # Analysis settings
304
+ with st.expander("πŸ” Analysis Settings", expanded=True):
305
+ analysis_depth = st.select_slider(
306
+ "Analysis Depth",
307
+ options=["Basic", "Standard", "Detailed", "Comprehensive"],
308
+ value=config['analysis']['depth'],
309
+ help="Level of detail in legal analysis"
310
+ )
311
+
312
+ include_recommendations = st.checkbox(
313
+ "Include Recommendations",
314
+ value=config['analysis']['include_recommendations'],
315
+ help="Generate specific recommendations for addressing identified issues"
316
+ )
317
+
318
+ # Process button and status
319
+ col1, col2, col3 = st.columns([1, 1, 1])
320
+
321
+ with col1:
322
+ if st.button("πŸ”„ Start Processing", type="primary", use_container_width=True):
323
+ if not uploaded_files:
324
+ st.error("Please upload at least one legislation file")
325
+ else:
326
+ start_processing(uploaded_files, {
327
+ 'model': {
328
+ 'path': model_path,
329
+ 'context_length': context_length,
330
+ 'max_tokens': max_tokens
331
+ },
332
+ 'processing': {
333
+ 'chunk_size': chunk_size,
334
+ 'chunk_overlap': chunk_overlap
335
+ },
336
+ 'analysis': {
337
+ 'depth': analysis_depth,
338
+ 'include_recommendations': include_recommendations
339
+ }
340
+ })
341
+
342
+ with col2:
343
+ if st.button("⏹️ Stop Processing", use_container_width=True):
344
+ stop_processing()
345
+
346
+ with col3:
347
+ if st.button("πŸ“Š View Results", use_container_width=True):
348
+ st.switch_page("pages/2_analysis.py")
349
+
350
+ def start_processing(files, config):
351
+ """Start the processing workflow"""
352
+ st.session_state.processing_status = {
353
+ 'is_running': True,
354
+ 'progress': 0,
355
+ 'current_task': 'Initializing...',
356
+ 'total_chunks': 0,
357
+ 'processed_chunks': 0
358
+ }
359
+
360
+ # Update configuration
361
+ st.session_state.config_manager.update_config(config)
362
+
363
+ # TODO: Implement actual processing logic
364
+ st.rerun()
365
+
366
+ def stop_processing():
367
+ """Stop the current processing"""
368
+ st.session_state.processing_status['is_running'] = False
369
+ st.session_state.processing_status['current_task'] = 'Stopped by user'
370
+
371
+ def show_results_page():
372
+ """Display analysis results page"""
373
+ st.title("πŸ“Š Analysis Results")
374
+
375
+ if not st.session_state.analysis_results:
376
+ st.info("No analysis results available. Please upload and process legislation files first.")
377
+ return
378
+
379
+ # Results overview
380
+ st.subheader("πŸ“ˆ Results Overview")
381
+
382
+ col1, col2, col3, col4 = st.columns(4)
383
+
384
+ total_results = len(st.session_state.analysis_results)
385
+ total_loopholes = sum(len(result.get('loopholes', [])) for result in st.session_state.analysis_results)
386
+ avg_confidence = sum(result.get('confidence', 0) for result in st.session_state.analysis_results) / max(total_results, 1)
387
+
388
+ with col1:
389
+ st.metric("Total Analyses", total_results)
390
+
391
+ with col2:
392
+ st.metric("Loopholes Found", total_loopholes)
393
+
394
+ with col3:
395
+ st.metric("Avg Confidence", f"{avg_confidence:.2f}")
396
+
397
+ with col4:
398
+ cache_stats = st.session_state.cache_manager.get_stats()
399
+ st.metric("Cache Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
400
+
401
+ # Results display
402
+ st.subheader("πŸ” Detailed Results")
403
+
404
+ for i, result in enumerate(st.session_state.analysis_results):
405
+ with st.expander(f"πŸ“‹ Analysis {i+1}: {result.get('title', 'Unknown Title')}", expanded=i==0):
406
+ col1, col2 = st.columns([2, 1])
407
+
408
+ with col1:
409
+ st.markdown("**Summary:**")
410
+ st.write(result.get('summary', 'No summary available'))
411
+
412
+ st.markdown("**Key Findings:**")
413
+ for finding in result.get('loopholes', []):
414
+ st.markdown(f"- {finding}")
415
+
416
+ with col2:
417
+ st.metric("Confidence", f"{result.get('confidence', 0):.2f}")
418
+ st.metric("Processing Time", f"{result.get('processing_time', 0):.2f} s")
419
+ st.metric("Chunks Processed", result.get('chunks_processed', 0))
420
+
421
+ # Export options
422
+ st.subheader("πŸ’Ύ Export Results")
423
+
424
+ col1, col2, col3 = st.columns(3)
425
+
426
+ with col1:
427
+ if st.button("πŸ“„ Export as JSON", use_container_width=True):
428
+ export_results('json')
429
+
430
+ with col2:
431
+ if st.button("πŸ“Š Export as CSV", use_container_width=True):
432
+ export_results('csv')
433
+
434
+ with col3:
435
+ if st.button("πŸ“‹ Export as Excel", use_container_width=True):
436
+ export_results('excel')
437
+
438
+ def export_results(format_type):
439
+ """Export analysis results in specified format"""
440
+ # TODO: Implement export functionality
441
+ st.success(f"Results exported as {format_type.upper()}")
442
+
443
+ def show_settings_page():
444
+ """Display settings page"""
445
+ st.title("βš™οΈ Settings & Configuration")
446
+
447
+ tabs = st.tabs(["πŸ€– Model Settings", "πŸ“ Processing", "🧠 Cache", "🎨 UI", "πŸ”§ Advanced"])
448
+
449
+ with tabs[0]:
450
+ st.subheader("πŸ€– Model Configuration")
451
+
452
+ config = st.session_state.config_manager.get_config()
453
+
454
+ model_path = st.text_input(
455
+ "Model Path",
456
+ value=config['model']['path'],
457
+ help="Path to your GGUF model file"
458
+ )
459
+
460
+ repo_id = st.text_input(
461
+ "HuggingFace Repo ID",
462
+ value=config['model']['repo_id'],
463
+ help="HuggingFace repository ID for model download"
464
+ )
465
+
466
+ filename = st.text_input(
467
+ "Model Filename",
468
+ value=config['model']['filename'],
469
+ help="Specific model filename in the repository"
470
+ )
471
+
472
+ context_length = st.slider(
473
+ "Context Length",
474
+ min_value=1024,
475
+ max_value=131072,
476
+ value=config['model']['context_length'],
477
+ step=1024
478
+ )
479
+
480
+ max_tokens = st.slider(
481
+ "Max Response Tokens",
482
+ min_value=256,
483
+ max_value=8192,
484
+ value=config['model']['max_tokens'],
485
+ step=64
486
+ )
487
+
488
+ temperature = st.slider(
489
+ "Temperature",
490
+ min_value=0.0,
491
+ max_value=2.0,
492
+ value=config['model']['temperature'],
493
+ step=0.1,
494
+ help="Controls randomness in model output"
495
+ )
496
+
497
+ with tabs[1]:
498
+ st.subheader("πŸ“ Text Processing")
499
+
500
+ chunk_size = st.slider(
501
+ "Chunk Size",
502
+ min_value=256,
503
+ max_value=16384,
504
+ value=config['processing']['chunk_size'],
505
+ step=256
506
+ )
507
+
508
+ chunk_overlap = st.slider(
509
+ "Chunk Overlap",
510
+ min_value=32,
511
+ max_value=2048,
512
+ value=config['processing']['chunk_overlap'],
513
+ step=32
514
+ )
515
+
516
+ batch_size = st.slider(
517
+ "Batch Size",
518
+ min_value=1,
519
+ max_value=32,
520
+ value=config['processing']['batch_size'],
521
+ step=1
522
+ )
523
+
524
+ clean_text = st.checkbox(
525
+ "Clean Text",
526
+ value=config['processing']['clean_text'],
527
+ help="Apply text cleaning and normalization"
528
+ )
529
+
530
+ with tabs[2]:
531
+ st.subheader("🧠 Cache Configuration")
532
+
533
+ enable_cache = st.checkbox(
534
+ "Enable Caching",
535
+ value=config['cache']['enabled'],
536
+ help="Use cache to avoid re-processing identical chunks"
537
+ )
538
+
539
+ max_cache_size = st.slider(
540
+ "Max Cache Size (MB)",
541
+ min_value=100,
542
+ max_value=8192,
543
+ value=config['cache']['max_size_mb'],
544
+ step=100
545
+ )
546
+
547
+ cache_ttl = st.slider(
548
+ "Cache TTL (hours)",
549
+ min_value=1,
550
+ max_value=168,
551
+ value=config['cache']['ttl_hours'],
552
+ step=1,
553
+ help="Time-to-live for cached entries"
554
+ )
555
+
556
+ persistent_cache = st.checkbox(
557
+ "Persistent Cache",
558
+ value=config['cache']['persistent'],
559
+ help="Save cache to disk for persistence across sessions"
560
+ )
561
+
562
+ with tabs[3]:
563
+ st.subheader("🎨 UI Configuration")
564
+
565
+ theme = st.selectbox(
566
+ "Theme",
567
+ options=["Auto", "Light", "Dark"],
568
+ index=["Auto", "Light", "Dark"].index(config['ui']['theme'])
569
+ )
570
+
571
+ show_progress = st.checkbox(
572
+ "Show Progress Bars",
573
+ value=config['ui']['show_progress'],
574
+ help="Display progress indicators during processing"
575
+ )
576
+
577
+ auto_refresh = st.checkbox(
578
+ "Auto-refresh Results",
579
+ value=config['ui']['auto_refresh'],
580
+ help="Automatically refresh results view"
581
+ )
582
+
583
+ with tabs[4]:
584
+ st.subheader("πŸ”§ Advanced Settings")
585
+
586
+ debug_mode = st.checkbox(
587
+ "Debug Mode",
588
+ value=config['advanced']['debug_mode'],
589
+ help="Enable detailed logging and debugging information"
590
+ )
591
+
592
+ log_level = st.selectbox(
593
+ "Log Level",
594
+ options=["DEBUG", "INFO", "WARNING", "ERROR"],
595
+ index=["DEBUG", "INFO", "WARNING", "ERROR"].index(config['advanced']['log_level'])
596
+ )
597
+
598
+ memory_limit = st.slider(
599
+ "Memory Limit (MB)",
600
+ min_value=512,
601
+ max_value=32768,
602
+ value=config['advanced']['memory_limit_mb'],
603
+ step=512
604
+ )
605
+
606
+ # Save settings
607
+ col1, col2 = st.columns([1, 1])
608
+
609
+ with col1:
610
+ if st.button("πŸ’Ύ Save Settings", type="primary", use_container_width=True):
611
+ new_config = {
612
+ 'model': {
613
+ 'path': model_path,
614
+ 'repo_id': repo_id,
615
+ 'filename': filename,
616
+ 'context_length': context_length,
617
+ 'max_tokens': max_tokens,
618
+ 'temperature': temperature
619
+ },
620
+ 'processing': {
621
+ 'chunk_size': chunk_size,
622
+ 'chunk_overlap': chunk_overlap,
623
+ 'batch_size': batch_size,
624
+ 'clean_text': clean_text
625
+ },
626
+ 'cache': {
627
+ 'enabled': enable_cache,
628
+ 'max_size_mb': max_cache_size,
629
+ 'ttl_hours': cache_ttl,
630
+ 'persistent': persistent_cache
631
+ },
632
+ 'ui': {
633
+ 'theme': theme,
634
+ 'show_progress': show_progress,
635
+ 'auto_refresh': auto_refresh
636
+ },
637
+ 'advanced': {
638
+ 'debug_mode': debug_mode,
639
+ 'log_level': log_level,
640
+ 'memory_limit_mb': memory_limit
641
+ }
642
+ }
643
+
644
+ st.session_state.config_manager.update_config(new_config)
645
+ st.success("Settings saved successfully!")
646
+
647
+ with col2:
648
+ if st.button("πŸ”„ Reset to Defaults", use_container_width=True):
649
+ st.session_state.config_manager.reset_to_defaults()
650
+ st.success("Settings reset to defaults!")
651
+ st.rerun()
652
+
653
+ def show_performance_page():
654
+ """Display performance monitoring page"""
655
+ st.title("πŸ“ˆ Performance Dashboard")
656
+
657
+ # Real-time metrics
658
+ st.subheader("πŸ“Š Real-time Metrics")
659
+
660
+ col1, col2, col3, col4 = st.columns(4)
661
+
662
+ perf_stats = st.session_state.performance_monitor.get_stats()
663
+
664
+ with col1:
665
+ st.metric("Memory Usage", f"{perf_stats.get('memory_usage_mb', 0):.1f} MB")
666
+
667
+ with col2:
668
+ st.metric("CPU Usage", f"{perf_stats.get('cpu_percent', 0):.1f}%")  # metric key assumed from PerformanceMonitor stats
669
+
670
+ with col3:
671
+ st.metric("Active Threads", perf_stats.get('active_threads', 0))
672
+
673
+ with col4:
674
+ cache_stats = st.session_state.cache_manager.get_stats()
675
+ st.metric("Cache Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
676
+
677
+ # Performance charts
678
+ st.subheader("πŸ“ˆ Performance History")
679
+
680
+ # TODO: Add interactive charts for performance metrics
681
+
682
+ # System information
683
+ st.subheader("πŸ’» System Information")
684
+
685
+ col1, col2 = st.columns(2)
686
+
687
+ with col1:
688
+ st.markdown("**Hardware:**")
689
+ # TODO: Add system information display
690
+
691
+ with col2:
692
+ st.markdown("**Software:**")
693
+ # TODO: Add software information display
694
+
695
+ # Cache performance
696
+ st.subheader("🧠 Cache Performance")
697
+
698
+ cache_stats = st.session_state.cache_manager.get_stats()
699
+
700
+ col1, col2, col3, col4 = st.columns(4)
701
+
702
+ with col1:
703
+ st.metric("Total Requests", cache_stats['hits'] + cache_stats['misses'])
704
+
705
+ with col2:
706
+ st.metric("Cache Hits", cache_stats['hits'])
707
+
708
+ with col3:
709
+ st.metric("Cache Misses", cache_stats['misses'])
710
+
711
+ with col4:
712
+ st.metric("Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
713
+
714
+ # Performance recommendations
715
+ st.subheader("πŸ’‘ Performance Recommendations")
716
+
717
+ recommendations = []
718
+
719
+ if cache_stats['hit_rate'] < 50:
720
+ recommendations.append("Consider increasing cache size or adjusting chunk sizes to improve hit rate")
721
+
722
+ if perf_stats.get('memory_usage_mb', 0) > 8000:
723
+ recommendations.append("High memory usage detected. Consider reducing batch size or chunk size")
724
+
725
+ if not recommendations:
726
+ recommendations.append("Performance is optimal!")
727
+
728
+ for rec in recommendations:
729
+ st.info(rec)
730
+
731
+ if __name__ == "__main__":
732
+ main()
streamlit_app/core/__pycache__/cache_manager.cpython-312.pyc ADDED
Binary file (24.5 kB).
 
streamlit_app/core/__pycache__/dataset_builder.cpython-312.pyc ADDED
Binary file (25.8 kB).
 
streamlit_app/core/__pycache__/llm_analyzer.cpython-312.pyc ADDED
Binary file (17.2 kB).
 
streamlit_app/core/__pycache__/text_processor.cpython-312.pyc ADDED
Binary file (15.9 kB).
 
streamlit_app/core/cache_manager.py ADDED
@@ -0,0 +1,505 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Context Memory Cache Manager
4
+
5
+ A sophisticated caching system for NZ Legislation Loophole Analysis that provides:
6
+ - Hash-based chunk identification for unique content tracking
7
+ - Multi-level caching (memory + optional disk persistence)
8
+ - Intelligent cache invalidation based on memory limits
9
+ - Performance metrics and cache statistics
10
+ - Thread-safe operations for concurrent processing
11
+ """
12
+
13
+ import hashlib
14
+ import json
15
+ import os
16
+ import time
17
+ import threading
18
+ from typing import Dict, Any, Optional, Tuple
19
+ from functools import lru_cache
20
+ import sqlite3
21
+ from pathlib import Path
22
+ import psutil
23
+ import streamlit as st
24
+
25
+ class CacheEntry:
26
+ """Represents a single cache entry with metadata"""
27
+
28
+ def __init__(self, key: str, content: str, analysis_result: Dict[str, Any],
29
+ model_config: Dict[str, Any], processing_config: Dict[str, Any]):
30
+ self.key = key
31
+ self.content = content
32
+ self.analysis_result = analysis_result
33
+ self.model_config = model_config
34
+ self.processing_config = processing_config
35
+ self.created_at = time.time()
36
+ self.last_accessed = time.time()
37
+ self.access_count = 0
38
+ self.size_bytes = len(content.encode('utf-8')) + len(str(analysis_result).encode('utf-8'))
39
+
40
+ def to_dict(self) -> Dict[str, Any]:
41
+ """Convert cache entry to dictionary for serialization"""
42
+ return {
43
+ 'key': self.key,
44
+ 'content': self.content,
45
+ 'analysis_result': self.analysis_result,
46
+ 'model_config': self.model_config,
47
+ 'processing_config': self.processing_config,
48
+ 'created_at': self.created_at,
49
+ 'last_accessed': self.last_accessed,
50
+ 'access_count': self.access_count,
51
+ 'size_bytes': self.size_bytes
52
+ }
53
+
54
+ @classmethod
55
+ def from_dict(cls, data: Dict[str, Any]) -> 'CacheEntry':
56
+ """Create cache entry from dictionary"""
57
+ entry = cls(
58
+ key=data['key'],
59
+ content=data['content'],
60
+ analysis_result=data['analysis_result'],
61
+ model_config=data['model_config'],
62
+ processing_config=data['processing_config']
63
+ )
64
+ entry.created_at = data.get('created_at', time.time())
65
+ entry.last_accessed = data.get('last_accessed', time.time())
66
+ entry.access_count = data.get('access_count', 0)
67
+ entry.size_bytes = data.get('size_bytes', entry.size_bytes)
68
+ return entry
69
+
70
+ def update_access(self):
71
+ """Update access statistics"""
72
+ self.last_accessed = time.time()
73
+ self.access_count += 1
74
+
75
+ class CacheManager:
76
+ """Advanced cache manager for legislation analysis"""
77
+
78
+ def __init__(self, max_memory_mb: int = 1024, persistent: bool = True,
79
+ cache_dir: str = None, ttl_hours: int = 24):
80
+ """
81
+ Initialize the cache manager
82
+
83
+ Args:
84
+ max_memory_mb: Maximum memory to use for caching (MB)
85
+ persistent: Whether to use persistent disk cache
86
+ cache_dir: Directory for persistent cache storage
87
+ ttl_hours: Time-to-live for cache entries (hours)
88
+ """
89
+ self.max_memory_mb = max_memory_mb
90
+ self.persistent = persistent
91
+ self.ttl_hours = ttl_hours
92
+ self.ttl_seconds = ttl_hours * 3600
93
+
94
+ # Set up cache directory
95
+ if cache_dir is None:
96
+ cache_dir = os.path.join(os.path.dirname(__file__), '..', 'cache')
97
+ self.cache_dir = Path(cache_dir)
98
+ self.cache_dir.mkdir(exist_ok=True)
99
+ self.db_path = self.cache_dir / 'cache.db'
100
+
101
+ # Thread synchronization
102
+ self.lock = threading.RLock()
103
+
104
+ # In-memory cache with LRU eviction
105
+ self.memory_cache: Dict[str, CacheEntry] = {}
106
+ self.memory_size = 0 # Current memory usage in bytes
107
+
108
+ # Statistics
109
+ self.stats = {
110
+ 'hits': 0,
111
+ 'misses': 0,
112
+ 'entries': 0,
113
+ 'memory_usage_mb': 0,
114
+ 'evictions': 0,
115
+ 'enabled': True
116
+ }
117
+
118
+ # Initialize database if persistent
119
+ if self.persistent:
120
+ self._init_database()
121
+
122
+ # Load existing cache entries if persistent
123
+ if self.persistent:
124
+ self._load_persistent_cache()
125
+
126
+ def _init_database(self):
127
+ """Initialize SQLite database for persistent cache"""
128
+ try:
129
+ with sqlite3.connect(str(self.db_path)) as conn:
130
+ conn.execute('''
131
+ CREATE TABLE IF NOT EXISTS cache_entries (
132
+ key TEXT PRIMARY KEY,
133
+ data TEXT NOT NULL,
134
+ created_at REAL NOT NULL,
135
+ last_accessed REAL NOT NULL,
136
+ access_count INTEGER DEFAULT 0,
137
+ size_bytes INTEGER DEFAULT 0
138
+ )
139
+ ''')
140
+ conn.execute('CREATE INDEX IF NOT EXISTS idx_created_at ON cache_entries(created_at)')
141
+ conn.execute('CREATE INDEX IF NOT EXISTS idx_last_accessed ON cache_entries(last_accessed)')
142
+ except Exception as e:
143
+ print(f"Warning: Could not initialize persistent cache: {e}")
144
+ self.persistent = False
145
+
146
+ def _load_persistent_cache(self):
147
+ """Load existing cache entries from database"""
148
+ if not self.persistent:
149
+ return
150
+
151
+ try:
152
+ with sqlite3.connect(str(self.db_path)) as conn:
153
+ cursor = conn.execute('SELECT data FROM cache_entries')
154
+ for row in cursor:
155
+ try:
156
+ entry_data = json.loads(row[0])
157
+ entry = CacheEntry.from_dict(entry_data)
158
+
159
+ # Check if entry is still valid
160
+ if self._is_entry_valid(entry):
161
+ self._add_to_memory_cache(entry)
162
+ else:
163
+ # Remove expired entry from database
164
+ conn.execute('DELETE FROM cache_entries WHERE key = ?', (entry.key,))
165
+ except (json.JSONDecodeError, KeyError):
166
+ continue
167
+ except Exception as e:
168
+ print(f"Warning: Could not load persistent cache: {e}")
169
+
170
+ def _generate_cache_key(self, content: str, model_config: Dict[str, Any],
171
+ processing_config: Dict[str, Any]) -> str:
172
+ """
173
+ Generate a unique cache key based on content and configuration
174
+
175
+ Args:
176
+ content: The text content to be analyzed
177
+ model_config: Model configuration used for analysis
178
+ processing_config: Processing configuration used
179
+
180
+ Returns:
181
+ SHA-256 hash string as cache key
182
+ """
183
+ # Create a deterministic string from all parameters
184
+ key_data = {
185
+ 'content': content,
186
+ 'model_config': model_config,
187
+ 'processing_config': processing_config
188
+ }
189
+
190
+ # Convert to JSON string with sorted keys for consistency
191
+ key_string = json.dumps(key_data, sort_keys=True)
192
+
193
+ # Generate SHA-256 hash
194
+ return hashlib.sha256(key_string.encode('utf-8')).hexdigest()
195
+
196
+ def _is_entry_valid(self, entry: CacheEntry) -> bool:
197
+ """Check if a cache entry is still valid"""
198
+ # Check TTL
199
+ if time.time() - entry.created_at > self.ttl_seconds:
200
+ return False
201
+
202
+ # Check if configurations match (for future-proofing)
203
+ # This could be enhanced to handle configuration changes
204
+
205
+ return True
206
+
207
+ def _add_to_memory_cache(self, entry: CacheEntry):
208
+ """Add entry to memory cache with size management"""
209
+ with self.lock:
210
+ # Check if we need to evict entries
211
+ while self.memory_size + entry.size_bytes > self.max_memory_mb * 1024 * 1024:
212
+ if not self.memory_cache:
213
+ break
214
+ self._evict_lru_entry()
215
+
216
+ self.memory_cache[entry.key] = entry
217
+ self.memory_size += entry.size_bytes
218
+ self.stats['entries'] = len(self.memory_cache)
219
+ self.stats['memory_usage_mb'] = self.memory_size / (1024 * 1024)
220
+
221
+ def _evict_lru_entry(self):
222
+ """Evict the least recently used entry from memory cache"""
223
+ if not self.memory_cache:
224
+ return
225
+
226
+ # Find entry with oldest last_accessed time
227
+ lru_key = min(self.memory_cache.keys(),
228
+ key=lambda k: self.memory_cache[k].last_accessed)
229
+
230
+ evicted_entry = self.memory_cache.pop(lru_key)
231
+ self.memory_size -= evicted_entry.size_bytes
232
+ self.stats['evictions'] += 1
233
+
234
+ # If persistent, we could keep it in database but remove from memory
235
+ # For now, we'll just remove it completely
236
+
237
+ def _save_to_persistent_cache(self, entry: CacheEntry):
238
+ """Save entry to persistent cache"""
239
+ if not self.persistent:
240
+ return
241
+
242
+ try:
243
+ with sqlite3.connect(str(self.db_path)) as conn:
244
+ conn.execute('''
245
+ INSERT OR REPLACE INTO cache_entries
246
+ (key, data, created_at, last_accessed, access_count, size_bytes)
247
+ VALUES (?, ?, ?, ?, ?, ?)
248
+ ''', (
249
+ entry.key,
250
+ json.dumps(entry.to_dict()),
251
+ entry.created_at,
252
+ entry.last_accessed,
253
+ entry.access_count,
254
+ entry.size_bytes
255
+ ))
256
+ except Exception as e:
257
+ print(f"Warning: Could not save to persistent cache: {e}")
258
+
259
+ def get(self, content: str, model_config: Dict[str, Any],
260
+ processing_config: Dict[str, Any]) -> Optional[Dict[str, Any]]:
261
+ """
262
+ Get cached analysis result for given content and configuration
263
+
264
+ Args:
265
+ content: Text content to look up
266
+ model_config: Model configuration used for analysis
267
+ processing_config: Processing configuration used
268
+
269
+ Returns:
270
+ Cached analysis result or None if not found
271
+ """
272
+ if not self.stats['enabled']:
273
+ self.stats['misses'] += 1
274
+ return None
275
+
276
+ cache_key = self._generate_cache_key(content, model_config, processing_config)
277
+
278
+ with self.lock:
279
+ # Check memory cache first
280
+ if cache_key in self.memory_cache:
281
+ entry = self.memory_cache[cache_key]
282
+
283
+ if self._is_entry_valid(entry):
284
+ entry.update_access()
285
+ self.stats['hits'] += 1
286
+ return entry.analysis_result
287
+ else:
288
+ # Remove invalid entry
289
+ self.memory_cache.pop(cache_key)
290
+ self.memory_size -= entry.size_bytes
291
+ self.stats['entries'] = len(self.memory_cache)
292
+
293
+ # Check persistent cache if not in memory
294
+ if self.persistent:
295
+ try:
296
+ with sqlite3.connect(str(self.db_path)) as conn:
297
+ cursor = conn.execute('SELECT data FROM cache_entries WHERE key = ?', (cache_key,))
298
+ row = cursor.fetchone()
299
+
300
+ if row:
301
+ entry_data = json.loads(row[0])
302
+ entry = CacheEntry.from_dict(entry_data)
303
+
304
+ if self._is_entry_valid(entry):
305
+ entry.update_access()
306
+ self.stats['hits'] += 1
307
+
308
+ # Move to memory cache for faster future access
309
+ self._add_to_memory_cache(entry)
310
+
311
+ # Update persistent cache with new access stats
312
+ self._save_to_persistent_cache(entry)
313
+
314
+ return entry.analysis_result
315
+ except Exception as e:
316
+ print(f"Warning: Error accessing persistent cache: {e}")
317
+
318
+ self.stats['misses'] += 1
319
+ return None
320
+
321
+ def put(self, content: str, analysis_result: Dict[str, Any],
322
+ model_config: Dict[str, Any], processing_config: Dict[str, Any]):
323
+ """
324
+ Store analysis result in cache
325
+
326
+ Args:
327
+ content: Text content that was analyzed
328
+ analysis_result: Analysis result to cache
329
+ model_config: Model configuration used for analysis
330
+ processing_config: Processing configuration used
331
+ """
332
+ if not self.stats['enabled']:
333
+ return
334
+
335
+ cache_key = self._generate_cache_key(content, model_config, processing_config)
336
+
337
+ with self.lock:
338
+ entry = CacheEntry(cache_key, content, analysis_result,
339
+ model_config, processing_config)
340
+
341
+ # Add to memory cache
342
+ self._add_to_memory_cache(entry)
343
+
344
+ # Save to persistent cache
345
+ self._save_to_persistent_cache(entry)
346
+
347
+ def get_stats(self) -> Dict[str, Any]:
348
+ """Get cache statistics"""
349
+ with self.lock:
350
+ total_requests = self.stats['hits'] + self.stats['misses']
351
+ hit_rate = (self.stats['hits'] / total_requests * 100) if total_requests > 0 else 0
352
+
353
+ return {
354
+ **self.stats,
355
+ 'hit_rate': hit_rate,
356
+ 'total_requests': total_requests,
357
+ 'persistent_enabled': self.persistent,
358
+ 'memory_limit_mb': self.max_memory_mb,
359
+ 'ttl_hours': self.ttl_hours
360
+ }
361
+
362
+ def clear_cache(self):
363
+ """Clear all cache entries"""
364
+ with self.lock:
365
+ self.memory_cache.clear()
366
+ self.memory_size = 0
367
+ self.stats['entries'] = 0
368
+ self.stats['hits'] = 0
369
+ self.stats['misses'] = 0
370
+ self.stats['evictions'] = 0
371
+ self.stats['memory_usage_mb'] = 0
372
+
373
+ # Clear persistent cache
374
+ if self.persistent:
375
+ try:
376
+ with sqlite3.connect(str(self.db_path)) as conn:
377
+ conn.execute('DELETE FROM cache_entries')
378
+ except Exception as e:
379
+ print(f"Warning: Could not clear persistent cache: {e}")
380
+
381
+ def cleanup_expired_entries(self):
382
+ """Remove expired entries from cache"""
383
+ current_time = time.time()
384
+ expired_keys = []
385
+
386
+ with self.lock:
387
+ # Find expired entries in memory
388
+ for key, entry in self.memory_cache.items():
389
+ if current_time - entry.created_at > self.ttl_seconds:
390
+ expired_keys.append(key)
391
+ self.memory_size -= entry.size_bytes
392
+
393
+ # Remove expired entries from memory
394
+ for key in expired_keys:
395
+ del self.memory_cache[key]
396
+
397
+ self.stats['entries'] = len(self.memory_cache)
398
+ self.stats['memory_usage_mb'] = self.memory_size / (1024 * 1024)
399
+
400
+ # Clean up persistent cache
401
+ if self.persistent:
402
+ try:
403
+ with sqlite3.connect(str(self.db_path)) as conn:
404
+ conn.execute('DELETE FROM cache_entries WHERE ? - created_at > ?',
405
+ (current_time, self.ttl_seconds))
406
+ except Exception as e:
407
+ print(f"Warning: Could not cleanup persistent cache: {e}")
408
+
409
+ def enable(self):
410
+ """Enable caching"""
411
+ self.stats['enabled'] = True
412
+
413
+ def disable(self):
414
+ """Disable caching"""
415
+ self.stats['enabled'] = False
416
+
417
+ def export_cache(self, filepath: str):
418
+ """Export cache contents to JSON file"""
419
+ cache_data = {
420
+ 'metadata': {
421
+ 'exported_at': time.time(),
422
+ 'version': '1.0',
423
+ 'total_entries': len(self.memory_cache)
424
+ },
425
+ 'entries': []
426
+ }
427
+
428
+ with self.lock:
429
+ for entry in self.memory_cache.values():
430
+ cache_data['entries'].append(entry.to_dict())
431
+
432
+ # Also export persistent cache entries
433
+ if self.persistent:
434
+ try:
435
+ with sqlite3.connect(str(self.db_path)) as conn:
436
+ cursor = conn.execute('SELECT data FROM cache_entries')
437
+ for row in cursor:
438
+ try:
439
+ entry_data = json.loads(row[0])
440
+ cache_data['entries'].append(entry_data)
441
+ except json.JSONDecodeError:
442
+ continue
443
+ except Exception as e:
444
+ print(f"Warning: Could not export persistent cache: {e}")
445
+
446
+ try:
447
+ with open(filepath, 'w', encoding='utf-8') as f:
448
+ json.dump(cache_data, f, indent=2, ensure_ascii=False)
449
+ return True
450
+ except Exception as e:
451
+ print(f"Error exporting cache: {e}")
452
+ return False
453
+
454
+ def import_cache(self, filepath: str):
455
+ """Import cache contents from JSON file"""
456
+ try:
457
+ with open(filepath, 'r', encoding='utf-8') as f:
458
+ cache_data = json.load(f)
459
+
460
+ imported_count = 0
461
+ for entry_data in cache_data.get('entries', []):
462
+ try:
463
+ entry = CacheEntry.from_dict(entry_data)
464
+ if self._is_entry_valid(entry):
465
+ self._add_to_memory_cache(entry)
466
+ if self.persistent:
467
+ self._save_to_persistent_cache(entry)
468
+ imported_count += 1
469
+ except Exception as e:
470
+ print(f"Warning: Could not import cache entry: {e}")
471
+ continue
472
+
473
+ return imported_count
474
+ except Exception as e:
475
+ print(f"Error importing cache: {e}")
476
+ return 0
477
+
478
+ # Global cache instance for use across the application
479
+ _cache_instance = None
480
+ _cache_lock = threading.Lock()
481
+
482
+ def get_cache_manager(max_memory_mb: int = 1024, persistent: bool = True,
483
+ cache_dir: str = None, ttl_hours: int = 24) -> CacheManager:
484
+ """
485
+ Get or create global cache manager instance
486
+
487
+ This ensures we have a single cache instance across the application
488
+ while allowing configuration updates.
489
+ """
490
+ global _cache_instance
491
+
492
+ with _cache_lock:
493
+ if _cache_instance is None:
494
+ _cache_instance = CacheManager(max_memory_mb, persistent, cache_dir, ttl_hours)
495
+ else:
496
+ # Update configuration if different
497
+ if (_cache_instance.max_memory_mb != max_memory_mb or
498
+ _cache_instance.persistent != persistent or
499
+ _cache_instance.ttl_hours != ttl_hours):
500
+ _cache_instance.max_memory_mb = max_memory_mb
501
+ _cache_instance.persistent = persistent
502
+ _cache_instance.ttl_hours = ttl_hours
503
+ _cache_instance.ttl_seconds = ttl_hours * 3600
504
+
505
+ return _cache_instance
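
A minimal usage sketch for the cache manager above (the import path, config values, and cached payload are illustrative assumptions, not part of this upload):

# Illustrative only: round-trips one analysis result through the CacheManager defined above.
from streamlit_app.core.cache_manager import get_cache_manager  # assumed package layout

cache = get_cache_manager(max_memory_mb=256, persistent=False)

model_config = {"context_length": 4096, "max_tokens": 1024}    # placeholder configuration
processing_config = {"analysis_type": "standard"}
chunk = "Example legislative text."

if cache.get(chunk, model_config, processing_config) is None:  # first lookup misses
    cache.put(chunk, {"summary": "example result"}, model_config, processing_config)

print(cache.get(chunk, model_config, processing_config))       # second lookup hits
print(f"hit rate: {cache.get_stats()['hit_rate']:.1f}%")
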
streamlit_app/core/dataset_builder.py ADDED
@@ -0,0 +1,649 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dataset Builder
4
+
5
+ Creates and manages finetuning datasets from legislation analysis results.
6
+ Handles data formatting, validation, and export in multiple formats.
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import time
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+ from pathlib import Path
14
+ import pandas as pd
15
+ from datetime import datetime
16
+ import uuid
17
+
18
+ class DatasetBuilder:
19
+ """Builder for creating finetuning datasets from legislation analysis"""
20
+
21
+ def __init__(self, output_dir: str = "datasets"):
22
+ """
23
+ Initialize the dataset builder
24
+
25
+ Args:
26
+ output_dir: Directory to save datasets
27
+ """
28
+ self.output_dir = Path(output_dir)
29
+ self.output_dir.mkdir(exist_ok=True)
30
+
31
+ # Dataset metadata
32
+ self.metadata = {
33
+ 'version': '1.0',
34
+ 'created_at': datetime.now().isoformat(),
35
+ 'total_entries': 0,
36
+ 'analysis_types': set(),
37
+ 'legislation_sources': set(),
38
+ 'quality_metrics': {}
39
+ }
40
+
41
+ def create_finetuning_dataset(self, analysis_results: List[Dict[str, Any]],
42
+ dataset_name: str = None,
43
+ include_metadata: bool = True) -> Dict[str, Any]:
44
+ """
45
+ Create a finetuning dataset from analysis results
46
+
47
+ Args:
48
+ analysis_results: List of analysis results from LLM analyzer
49
+ dataset_name: Name for the dataset (optional)
50
+ include_metadata: Whether to include metadata in the dataset
51
+
52
+ Returns:
53
+ Dataset information and statistics
54
+ """
55
+ if not dataset_name:
56
+ timestamp = int(time.time())
57
+ dataset_name = f"nz_legislation_dataset_{timestamp}"
58
+
59
+ dataset_entries = []
60
+ successful_entries = 0
61
+
62
+ for result in analysis_results:
63
+ if 'error' in result:
64
+ continue
65
+
66
+ # Create finetuning entry
67
+ entry = self._create_finetuning_entry(result)
68
+ if entry:
69
+ dataset_entries.append(entry)
70
+ successful_entries += 1
71
+
72
+ # Update metadata
73
+ if 'analysis_type' in result:
74
+ self.metadata['analysis_types'].add(result['analysis_type'])
75
+
76
+ # Update metadata
77
+ self.metadata['total_entries'] = len(dataset_entries)
78
+ self.metadata['created_at'] = datetime.now().isoformat()
79
+
80
+ # Calculate quality metrics
81
+ self._calculate_quality_metrics(dataset_entries)
82
+
83
+ # Create dataset structure
84
+ dataset = {
85
+ 'metadata': {**self.metadata, 'analysis_types': sorted(self.metadata['analysis_types']), 'legislation_sources': sorted(self.metadata['legislation_sources'])},  # convert sets so the dataset stays JSON-serializable
86
+ 'entries': dataset_entries
87
+ }
88
+
89
+ if include_metadata:
90
+ dataset['metadata'].update({
91
+ 'dataset_name': dataset_name,
92
+ 'successful_entries': successful_entries,
93
+ 'total_input_results': len(analysis_results),
94
+ 'success_rate': successful_entries / len(analysis_results) if analysis_results else 0
95
+ })
96
+
97
+ return dataset
98
+
99
+ def _create_finetuning_entry(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
100
+ """
101
+ Create a single finetuning dataset entry
102
+
103
+ Args:
104
+ result: Analysis result from LLM analyzer
105
+
106
+ Returns:
107
+ Finetuning entry or None if invalid
108
+ """
109
+ try:
110
+ # Extract key components
111
+ chunk = result.get('chunk', '')
112
+ structured_analysis = result.get('structured_analysis', {})
113
+ response = result.get('response', '')
114
+
115
+ # Create the prompt (input)
116
+ prompt = self._create_prompt(chunk, result.get('analysis_type', 'standard'))
117
+
118
+ # Create the response (output) - structured format
119
+ response_text = self._create_response(structured_analysis, response)
120
+
121
+ if not prompt or not response_text:
122
+ return None
123
+
124
+ # Create entry
125
+ entry = {
126
+ 'id': str(uuid.uuid4()),
127
+ 'prompt': prompt,
128
+ 'response': response_text,
129
+ 'metadata': {
130
+ 'chunk_size': len(chunk),
131
+ 'word_count': len(chunk.split()),
132
+ 'analysis_type': result.get('analysis_type', 'standard'),
133
+ 'model_config': result.get('model_config', {}),
134
+ 'confidence_score': structured_analysis.get('confidence_score', 0),
135
+ 'analysis_quality': structured_analysis.get('analysis_quality', 'unknown'),
136
+ 'created_at': datetime.now().isoformat()
137
+ },
138
+ 'raw_data': {
139
+ 'original_chunk': chunk,
140
+ 'structured_analysis': structured_analysis,
141
+ 'raw_response': response
142
+ }
143
+ }
144
+
145
+ return entry
146
+
147
+ except Exception as e:
148
+ print(f"Error creating finetuning entry: {e}")
149
+ return None
150
+
151
+ def _create_prompt(self, chunk: str, analysis_type: str) -> str:
152
+ """
153
+ Create a standardized prompt for the finetuning dataset
154
+
155
+ Args:
156
+ chunk: Text chunk to analyze
157
+ analysis_type: Type of analysis
158
+
159
+ Returns:
160
+ Formatted prompt
161
+ """
162
+ analysis_configs = {
163
+ 'standard': {
164
+ 'depth': 'Standard',
165
+ 'focus': 'loopholes, ambiguities, and unintended consequences'
166
+ },
167
+ 'detailed': {
168
+ 'depth': 'Detailed',
169
+ 'focus': 'loopholes, ambiguities, unintended consequences, and implementation issues'
170
+ },
171
+ 'comprehensive': {
172
+ 'depth': 'Comprehensive',
173
+ 'focus': 'all aspects including policy conflicts and enforcement challenges'
174
+ }
175
+ }
176
+
177
+ config = analysis_configs.get(analysis_type, analysis_configs['standard'])
178
+
179
+ prompt = f"""You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.
180
+
181
+ LEGISLATION TEXT:
182
+ {chunk}
183
+
184
+ TASK: Analyze this legislative text for potential loopholes, ambiguities, or unintended consequences.
185
+
186
+ ANALYSIS DEPTH: {config['depth']}
187
+ FOCUS AREAS: {config['focus']}
188
+
189
+ Provide a structured analysis covering:
190
+ 1. Text Meaning - Explain what the text means and its intended purpose
191
+ 2. Key Assumptions - Identify any assumptions that could be exploited
192
+ 3. Exploitable Interpretations - Discuss how the text could be interpreted in unintended ways
193
+ 4. Critical Loopholes - Identify specific loopholes or ambiguities
194
+ 5. Circumvention Strategies - Suggest practical methods for exploiting these loopholes
195
+
196
+ Format your response clearly with section headers."""
197
+
198
+ return prompt
199
+
200
+ def _create_response(self, structured_analysis: Dict[str, Any], raw_response: str) -> str:
201
+ """
202
+ Create a standardized response format for the finetuning dataset
203
+
204
+ Args:
205
+ structured_analysis: Structured analysis data
206
+ raw_response: Raw LLM response
207
+
208
+ Returns:
209
+ Formatted response
210
+ """
211
+ sections = []
212
+
213
+ # Text Meaning
214
+ if structured_analysis.get('text_meaning'):
215
+ sections.append(f"**Text Meaning:** {structured_analysis['text_meaning']}")
216
+
217
+ # Key Assumptions
218
+ if structured_analysis.get('key_assumptions'):
219
+ assumptions = structured_analysis['key_assumptions']
220
+ if assumptions:
221
+ sections.append("**Key Assumptions:**")
222
+ for i, assumption in enumerate(assumptions, 1):
223
+ sections.append(f"{i}. {assumption}")
224
+
225
+ # Exploitable Interpretations
226
+ if structured_analysis.get('exploitable_interpretations'):
227
+ interpretations = structured_analysis['exploitable_interpretations']
228
+ if interpretations:
229
+ sections.append("**Exploitable Interpretations:**")
230
+ for i, interpretation in enumerate(interpretations, 1):
231
+ sections.append(f"{i}. {interpretation}")
232
+
233
+ # Critical Loopholes
234
+ if structured_analysis.get('critical_loopholes'):
235
+ loopholes = structured_analysis['critical_loopholes']
236
+ if loopholes:
237
+ sections.append("**Critical Loopholes:**")
238
+ for i, loophole in enumerate(loopholes, 1):
239
+ sections.append(f"{i}. {loophole}")
240
+
241
+ # Circumvention Strategies
242
+ if structured_analysis.get('circumvention_strategies'):
243
+ strategies = structured_analysis['circumvention_strategies']
244
+ if strategies:
245
+ sections.append("**Circumvention Strategies:**")
246
+ for i, strategy in enumerate(strategies, 1):
247
+ sections.append(f"{i}. {strategy}")
248
+
249
+ # Recommendations
250
+ if structured_analysis.get('recommendations'):
251
+ recommendations = structured_analysis['recommendations']
252
+ if recommendations:
253
+ sections.append("**Recommendations:**")
254
+ for i, rec in enumerate(recommendations, 1):
255
+ sections.append(f"{i}. {rec}")
256
+
257
+ return "\n\n".join(sections) if sections else raw_response
258
+
259
+ def _calculate_quality_metrics(self, entries: List[Dict[str, Any]]):
260
+ """Calculate quality metrics for the dataset"""
261
+ if not entries:
262
+ return
263
+
264
+ confidence_scores = []
265
+ analysis_qualities = {'high': 0, 'medium': 0, 'low': 0, 'unknown': 0}
266
+
267
+ for entry in entries:
268
+ metadata = entry.get('metadata', {})
269
+ confidence = metadata.get('confidence_score', 0)
270
+ quality = metadata.get('analysis_quality', 'unknown')
271
+
272
+ confidence_scores.append(confidence)
273
+ analysis_qualities[quality] = analysis_qualities.get(quality, 0) + 1
274
+
275
+ self.metadata['quality_metrics'] = {
276
+ 'average_confidence': sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0,
277
+ 'max_confidence': max(confidence_scores) if confidence_scores else 0,
278
+ 'min_confidence': min(confidence_scores) if confidence_scores else 0,
279
+ 'quality_distribution': analysis_qualities,
280
+ 'total_entries': len(entries)
281
+ }
282
+
283
+ def save_dataset(self, dataset: Dict[str, Any], format_type: str = 'json',
284
+ filename: str = None) -> str:
285
+ """
286
+ Save dataset in specified format
287
+
288
+ Args:
289
+ dataset: Dataset to save
290
+ format_type: Format ('json', 'jsonl', 'csv', 'excel')
291
+ filename: Output filename (optional)
292
+
293
+ Returns:
294
+ Path to saved file
295
+ """
296
+ if not filename:
297
+ timestamp = int(time.time())
298
+ filename = f"nz_legislation_dataset_{timestamp}"
299
+
300
+ # Ensure filename has the correct extension ('excel' datasets are written as .xlsx)
301
+ extension = 'xlsx' if format_type == 'excel' else format_type
+ if not filename.endswith(f'.{extension}'):
302
+ filename += f'.{extension}'
303
+
304
+ filepath = self.output_dir / filename
305
+
306
+ try:
307
+ if format_type == 'json':
308
+ with open(filepath, 'w', encoding='utf-8') as f:
309
+ json.dump(dataset, f, indent=2, ensure_ascii=False)
310
+
311
+ elif format_type == 'jsonl':
312
+ with open(filepath, 'w', encoding='utf-8') as f:
313
+ for entry in dataset.get('entries', []):
314
+ json.dump(entry, f, ensure_ascii=False)
315
+ f.write('\n')
316
+
317
+ elif format_type == 'csv':
318
+ self._save_as_csv(dataset, filepath)
319
+
320
+ elif format_type == 'excel':
321
+ self._save_as_excel(dataset, filepath)
322
+
323
+ else:
324
+ raise ValueError(f"Unsupported format: {format_type}")
325
+
326
+ return str(filepath)
327
+
328
+ except Exception as e:
329
+ raise Exception(f"Error saving dataset: {e}")
330
+
331
+ def _save_as_csv(self, dataset: Dict[str, Any], filepath: Path):
332
+ """Save dataset as CSV"""
333
+ entries = dataset.get('entries', [])
334
+
335
+ if not entries:
336
+ # Create empty CSV with headers
337
+ df = pd.DataFrame(columns=['id', 'prompt', 'response', 'metadata'])
338
+ df.to_csv(filepath, index=False)
339
+ return
340
+
341
+ # Flatten the data for CSV
342
+ csv_data = []
343
+ for entry in entries:
344
+ csv_row = {
345
+ 'id': entry.get('id', ''),
346
+ 'prompt': entry.get('prompt', ''),
347
+ 'response': entry.get('response', ''),
348
+ 'confidence_score': entry.get('metadata', {}).get('confidence_score', 0),
349
+ 'analysis_type': entry.get('metadata', {}).get('analysis_type', ''),
350
+ 'chunk_size': entry.get('metadata', {}).get('chunk_size', 0),
351
+ 'word_count': entry.get('metadata', {}).get('word_count', 0),
352
+ 'analysis_quality': entry.get('metadata', {}).get('analysis_quality', ''),
353
+ 'created_at': entry.get('metadata', {}).get('created_at', '')
354
+ }
355
+ csv_data.append(csv_row)
356
+
357
+ df = pd.DataFrame(csv_data)
358
+ df.to_csv(filepath, index=False, encoding='utf-8')
359
+
360
+ def _save_as_excel(self, dataset: Dict[str, Any], filepath: Path):
361
+ """Save dataset as Excel with multiple sheets"""
362
+ entries = dataset.get('entries', [])
363
+
364
+ with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
365
+ # Main dataset sheet
366
+ if entries:
367
+ csv_data = []
368
+ for entry in entries:
369
+ csv_row = {
370
+ 'id': entry.get('id', ''),
371
+ 'prompt': entry.get('prompt', ''),
372
+ 'response': entry.get('response', ''),
373
+ 'confidence_score': entry.get('metadata', {}).get('confidence_score', 0),
374
+ 'analysis_type': entry.get('metadata', {}).get('analysis_type', ''),
375
+ 'chunk_size': entry.get('metadata', {}).get('chunk_size', 0),
376
+ 'word_count': entry.get('metadata', {}).get('word_count', 0),
377
+ 'analysis_quality': entry.get('metadata', {}).get('analysis_quality', ''),
378
+ 'created_at': entry.get('metadata', {}).get('created_at', '')
379
+ }
380
+ csv_data.append(csv_row)
381
+
382
+ df_main = pd.DataFrame(csv_data)
383
+ df_main.to_excel(writer, sheet_name='Dataset', index=False)
384
+
385
+ # Metadata sheet
386
+ metadata_df = pd.DataFrame([dataset.get('metadata', {})])
387
+ metadata_df.to_excel(writer, sheet_name='Metadata', index=False)
388
+
389
+ # Quality metrics sheet
390
+ quality_data = dataset.get('metadata', {}).get('quality_metrics', {})
391
+ if quality_data:
392
+ quality_df = pd.DataFrame([quality_data])
393
+ quality_df.to_excel(writer, sheet_name='Quality_Metrics', index=False)
394
+
395
+ def load_dataset(self, filepath: str) -> Dict[str, Any]:
396
+ """
397
+ Load a dataset from file
398
+
399
+ Args:
400
+ filepath: Path to dataset file
401
+
402
+ Returns:
403
+ Loaded dataset
404
+ """
405
+ filepath = Path(filepath)
406
+
407
+ if not filepath.exists():
408
+ raise FileNotFoundError(f"Dataset file not found: {filepath}")
409
+
410
+ try:
411
+ if filepath.suffix == '.json':
412
+ with open(filepath, 'r', encoding='utf-8') as f:
413
+ return json.load(f)
414
+
415
+ elif filepath.suffix == '.jsonl':
416
+ entries = []
417
+ with open(filepath, 'r', encoding='utf-8') as f:
418
+ for line in f:
419
+ if line.strip():
420
+ entries.append(json.loads(line))
421
+
422
+ return {
423
+ 'metadata': {
424
+ 'loaded_from': str(filepath),
425
+ 'total_entries': len(entries)
426
+ },
427
+ 'entries': entries
428
+ }
429
+
430
+ elif filepath.suffix in ['.csv', '.xlsx', '.xls']:
431
+ return self._load_from_spreadsheet(filepath)
432
+
433
+ else:
434
+ raise ValueError(f"Unsupported file format: {filepath.suffix}")
435
+
436
+ except Exception as e:
437
+ raise Exception(f"Error loading dataset: {e}")
438
+
439
+ def _load_from_spreadsheet(self, filepath: Path) -> Dict[str, Any]:
440
+ """Load dataset from spreadsheet format"""
441
+ try:
442
+ if filepath.suffix == '.csv':
443
+ df = pd.read_csv(filepath)
444
+ else:
445
+ df = pd.read_excel(filepath)
446
+
447
+ # Convert back to dataset format
448
+ entries = []
449
+ for _, row in df.iterrows():
450
+ entry = {
451
+ 'id': row.get('id', str(uuid.uuid4())),
452
+ 'prompt': row.get('prompt', ''),
453
+ 'response': row.get('response', ''),
454
+ 'metadata': {
455
+ 'confidence_score': row.get('confidence_score', 0),
456
+ 'analysis_type': row.get('analysis_type', 'standard'),
457
+ 'chunk_size': row.get('chunk_size', 0),
458
+ 'word_count': row.get('word_count', 0),
459
+ 'analysis_quality': row.get('analysis_quality', 'unknown'),
460
+ 'created_at': row.get('created_at', datetime.now().isoformat())
461
+ }
462
+ }
463
+ entries.append(entry)
464
+
465
+ return {
466
+ 'metadata': {
467
+ 'loaded_from': str(filepath),
468
+ 'total_entries': len(entries),
469
+ 'original_format': filepath.suffix[1:]
470
+ },
471
+ 'entries': entries
472
+ }
473
+
474
+ except Exception as e:
475
+ raise Exception(f"Error loading spreadsheet: {e}")
476
+
477
+ def merge_datasets(self, datasets: List[Dict[str, Any]],
478
+ output_name: str = None) -> Dict[str, Any]:
479
+ """
480
+ Merge multiple datasets into one
481
+
482
+ Args:
483
+ datasets: List of datasets to merge
484
+ output_name: Name for merged dataset
485
+
486
+ Returns:
487
+ Merged dataset
488
+ """
489
+ if not datasets:
490
+ return self.create_finetuning_dataset([])
491
+
492
+ merged_entries = []
493
+ all_analysis_types = set()
494
+ all_sources = set()
495
+
496
+ for dataset in datasets:
497
+ entries = dataset.get('entries', [])
498
+ merged_entries.extend(entries)
499
+
500
+ metadata = dataset.get('metadata', {})
501
+ all_analysis_types.update(metadata.get('analysis_types', []))
502
+ all_sources.update(metadata.get('legislation_sources', []))
503
+
504
+ # Create merged dataset
505
+ merged_dataset = {
506
+ 'metadata': {
507
+ 'version': '1.0',
508
+ 'created_at': datetime.now().isoformat(),
509
+ 'dataset_name': output_name or f"merged_dataset_{int(time.time())}",
510
+ 'total_entries': len(merged_entries),
511
+ 'analysis_types': list(all_analysis_types),
512
+ 'legislation_sources': list(all_sources),
513
+ 'merged_from': len(datasets),
514
+ 'success_rate': 1.0 # Assuming all entries are valid
515
+ },
516
+ 'entries': merged_entries
517
+ }
518
+
519
+ # Recalculate quality metrics
520
+ self._calculate_quality_metrics(merged_entries)
521
+ merged_dataset['metadata']['quality_metrics'] = self.metadata['quality_metrics']
522
+
523
+ return merged_dataset
524
+
525
+ def validate_dataset(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
526
+ """
527
+ Validate dataset quality and completeness
528
+
529
+ Args:
530
+ dataset: Dataset to validate
531
+
532
+ Returns:
533
+ Validation results
534
+ """
535
+ validation = {
536
+ 'is_valid': True,
537
+ 'issues': [],
538
+ 'warnings': [],
539
+ 'statistics': {}
540
+ }
541
+
542
+ entries = dataset.get('entries', [])
543
+ metadata = dataset.get('metadata', {})
544
+
545
+ # Check basic structure
546
+ if not isinstance(entries, list):
547
+ validation['issues'].append("Entries must be a list")
548
+ validation['is_valid'] = False
549
+ return validation
550
+
551
+ if not entries:
552
+ validation['warnings'].append("Dataset is empty")
553
+ return validation
554
+
555
+ # Validate entries
556
+ valid_entries = 0
557
+ total_confidence = 0
558
+
559
+ for i, entry in enumerate(entries):
560
+ if not isinstance(entry, dict):
561
+ validation['issues'].append(f"Entry {i} is not a dictionary")
562
+ continue
563
+
564
+ # Check required fields
565
+ required_fields = ['id', 'prompt', 'response']
566
+ for field in required_fields:
567
+ if field not in entry:
568
+ validation['issues'].append(f"Entry {i} missing required field: {field}")
569
+
570
+ # Check prompt and response quality
571
+ prompt = entry.get('prompt', '')
572
+ response = entry.get('response', '')
573
+
574
+ if len(prompt.strip()) < 10:
575
+ validation['warnings'].append(f"Entry {i} has very short prompt")
576
+
577
+ if len(response.strip()) < 10:
578
+ validation['warnings'].append(f"Entry {i} has very short response")
579
+
580
+ # Check confidence score
581
+ confidence = entry.get('metadata', {}).get('confidence_score', 0)
582
+ total_confidence += confidence
583
+
584
+ valid_entries += 1
585
+
586
+ # Calculate statistics
587
+ validation['statistics'] = {
588
+ 'total_entries': len(entries),
589
+ 'valid_entries': valid_entries,
590
+ 'average_confidence': total_confidence / valid_entries if valid_entries > 0 else 0,
591
+ 'validation_rate': valid_entries / len(entries) if entries else 0
592
+ }
593
+
594
+ return validation
595
+
596
+ def get_dataset_statistics(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
597
+ """
598
+ Get comprehensive statistics about the dataset
599
+
600
+ Args:
601
+ dataset: Dataset to analyze
602
+
603
+ Returns:
604
+ Dataset statistics
605
+ """
606
+ entries = dataset.get('entries', [])
607
+
608
+ if not entries:
609
+ return {'total_entries': 0}
610
+
611
+ # Basic statistics
612
+ stats = {
613
+ 'total_entries': len(entries),
614
+ 'total_prompts': len([e for e in entries if e.get('prompt')]),
615
+ 'total_responses': len([e for e in entries if e.get('response')]),
616
+ 'average_prompt_length': 0,
617
+ 'average_response_length': 0,
618
+ 'confidence_distribution': {},
619
+ 'analysis_type_distribution': {},
620
+ 'quality_distribution': {}
621
+ }
622
+
623
+ # Calculate averages
624
+ prompt_lengths = [len(e.get('prompt', '')) for e in entries if e.get('prompt')]
625
+ response_lengths = [len(e.get('response', '')) for e in entries if e.get('response')]
626
+
627
+ if prompt_lengths:
628
+ stats['average_prompt_length'] = sum(prompt_lengths) / len(prompt_lengths)
629
+ if response_lengths:
630
+ stats['average_response_length'] = sum(response_lengths) / len(response_lengths)
631
+
632
+ # Distribution analysis
633
+ for entry in entries:
634
+ metadata = entry.get('metadata', {})
635
+
636
+ # Confidence distribution
637
+ confidence = metadata.get('confidence_score', 0)
638
+ conf_range = f"{(confidence // 20) * 20}-{(confidence // 20) * 20 + 19}"
639
+ stats['confidence_distribution'][conf_range] = stats['confidence_distribution'].get(conf_range, 0) + 1
640
+
641
+ # Analysis type distribution
642
+ analysis_type = metadata.get('analysis_type', 'unknown')
643
+ stats['analysis_type_distribution'][analysis_type] = stats['analysis_type_distribution'].get(analysis_type, 0) + 1
644
+
645
+ # Quality distribution
646
+ quality = metadata.get('analysis_quality', 'unknown')
647
+ stats['quality_distribution'][quality] = stats['quality_distribution'].get(quality, 0) + 1
648
+
649
+ return stats
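
A minimal usage sketch for the dataset builder above (the import path and the hand-made analysis result are illustrative assumptions):

# Illustrative only: builds, saves, and validates a tiny dataset from one fake analysis result.
from streamlit_app.core.dataset_builder import DatasetBuilder  # assumed package layout

builder = DatasetBuilder(output_dir="datasets")

fake_result = {                                    # stand-in for an LLMAnalyzer output
    "chunk": "Example section text.",
    "analysis_type": "standard",
    "response": "raw model output",
    "structured_analysis": {"text_meaning": "Example meaning.", "confidence_score": 20},
}

dataset = builder.create_finetuning_dataset([fake_result], dataset_name="demo_dataset")
path = builder.save_dataset(dataset, format_type="jsonl")      # one JSON object per line
print(path)
print(builder.validate_dataset(dataset)["statistics"])
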
streamlit_app/core/llm_analyzer.py ADDED
@@ -0,0 +1,469 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LLM Analyzer
4
+
5
+ Handles LLM model loading, inference, and analysis for the NZ Legislation Loophole Analysis.
6
+ Provides optimized prompts and response parsing for legal text analysis.
7
+ """
8
+
9
+ import os
10
+ import time
11
+ from typing import List, Dict, Any, Optional, Tuple
12
+ import json
13
+ from llama_cpp import Llama
14
+ import re
15
+
16
+ class LLMAnalyzer:
17
+ """LLM-based analyzer for legislation loophole detection"""
18
+
19
+ def __init__(self, model_config: Dict[str, Any]):
20
+ """
21
+ Initialize the LLM analyzer
22
+
23
+ Args:
24
+ model_config: Configuration for the LLM model
25
+ """
26
+ self.model_config = model_config
27
+ self.model = None
28
+ self.is_loaded = False
29
+
30
+ # Analysis templates
31
+ self.analysis_templates = {
32
+ 'standard': {
33
+ 'depth': 'Standard',
34
+ 'include_recommendations': True,
35
+ 'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences']
36
+ },
37
+ 'detailed': {
38
+ 'depth': 'Detailed',
39
+ 'include_recommendations': True,
40
+ 'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences', 'implementation_issues']
41
+ },
42
+ 'comprehensive': {
43
+ 'depth': 'Comprehensive',
44
+ 'include_recommendations': True,
45
+ 'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences',
46
+ 'implementation_issues', 'policy_conflicts', 'enforcement_challenges']
47
+ }
48
+ }
49
+
50
+ # Prompt templates
51
+ self.prompt_templates = {
52
+ 'loophole_analysis': self._get_loophole_analysis_template(),
53
+ 'ambiguity_detection': self._get_ambiguity_detection_template(),
54
+ 'recommendations': self._get_recommendations_template()
55
+ }
56
+
57
+ def _get_loophole_analysis_template(self) -> str:
58
+ """Get the main loophole analysis prompt template"""
59
+ return """You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.
60
+
61
+ LEGISLATION TEXT:
62
+ {text}
63
+
64
+ TASK: Analyze this legislative text for potential loopholes, ambiguities, or unintended consequences.
65
+
66
+ INSTRUCTIONS:
67
+ Provide a structured analysis following this format:
68
+
69
+ 1. **Text Meaning**: Explain what the text means and its intended purpose
70
+ 2. **Key Assumptions**: Identify any assumptions the text makes that could be exploited
71
+ 3. **Exploitable Interpretations**: Discuss how the text could be interpreted or applied in ways that circumvent its intended purpose
72
+ 4. **Critical Loopholes**: Identify specific loopholes, ambiguities, or unintended consequences that could be used to bypass the legislation
73
+ 5. **Circumvention Strategies**: Suggest practical methods or scenarios for exploiting these loopholes to achieve objectives contrary to the legislation's intent
74
+
75
+ {reasoning_format}
76
+ {recommendations_format}
77
+
78
+ ANALYSIS DEPTH: {depth}
79
+ FOCUS AREAS: {focus_areas}
80
+ """
81
+
82
+ def _get_ambiguity_detection_template(self) -> str:
83
+ """Get the ambiguity detection prompt template"""
84
+ return """Analyze the following legal text for ambiguities and unclear provisions:
85
+
86
+ TEXT: {text}
87
+
88
+ Identify:
89
+ 1. Vague terms or phrases
90
+ 2. Ambiguous references
91
+ 3. Unclear conditions or requirements
92
+ 4. Missing definitions
93
+ 5. Conflicting provisions
94
+
95
+ Provide specific examples and suggest clarifications.
96
+ """
97
+
98
+ def _get_recommendations_template(self) -> str:
99
+ """Get the recommendations prompt template"""
100
+ return """Based on the loopholes and ambiguities identified, provide specific recommendations for:
101
+
102
+ 1. Legislative amendments to close identified loopholes
103
+ 2. Additional definitions or clarifications needed
104
+ 3. Implementation guidelines or regulations
105
+ 4. Monitoring and enforcement mechanisms
106
+
107
+ Prioritize recommendations by impact and feasibility.
108
+ """
109
+
110
+ def load_model(self) -> bool:
111
+ """
112
+ Load the LLM model
113
+
114
+ Returns:
115
+ True if model loaded successfully, False otherwise
116
+ """
117
+ if self.is_loaded:
118
+ return True
119
+
120
+ try:
121
+ print("Loading LLM model...")
122
+
123
+ # Try to load from HuggingFace
124
+ if self.model_config.get('repo_id'):
125
+ self.model = Llama.from_pretrained(
126
+ repo_id=self.model_config['repo_id'],
127
+ filename=self.model_config.get('filename', ''),
128
+ n_ctx=self.model_config.get('context_length', 40960),
129
+ n_threads=min(os.cpu_count(), 8),
130
+ verbose=False,
131
+ n_gpu_layers=-1,
132
+ n_batch=4096,
133
+ logits_all=False,
134
+ use_mlock=True,
135
+ use_mmap=True,
136
+ )
137
+ else:
138
+ # Load from local path
139
+ model_path = self.model_config.get('path', '')
140
+ if not model_path or not os.path.exists(model_path):
141
+ print(f"Model path not found: {model_path}")
142
+ return False
143
+
144
+ self.model = Llama(
145
+ model_path=model_path,
146
+ n_ctx=self.model_config.get('context_length', 40960),
147
+ n_threads=min(os.cpu_count(), 8),
148
+ verbose=False,
149
+ n_gpu_layers=-1,
150
+ n_batch=4096,
151
+ )
152
+
153
+ self.is_loaded = True
154
+ print("Model loaded successfully")
155
+ return True
156
+
157
+ except Exception as e:
158
+ print(f"Error loading model: {e}")
159
+ return False
160
+
161
+ def unload_model(self):
162
+ """Unload the model to free memory"""
163
+ if self.model:
164
+ del self.model
165
+ self.model = None
166
+ self.is_loaded = False
167
+
168
+ def generate_chat_template(self, system_prompt: str, user_message: str = "") -> str:
169
+ """
170
+ Generate a chat template for the model
171
+
172
+ Args:
173
+ system_prompt: The system prompt
174
+ user_message: The user message (optional)
175
+
176
+ Returns:
177
+ Formatted chat template
178
+ """
179
+ chat_messages = []
180
+
181
+ # System message
182
+ if system_prompt:
183
+ chat_messages.append("<|im_start|>system")
184
+ chat_messages.append(system_prompt)
185
+ chat_messages.append("<|im_end|>")
186
+
187
+ # User message
188
+ if user_message:
189
+ chat_messages.append("<|im_start|>user")
190
+ chat_messages.append(user_message)
191
+ chat_messages.append("<|im_end|>")
192
+
193
+ # Assistant message with generation prompt
194
+ chat_messages.append("<|im_start|>assistant")
195
+ chat_messages.append("") # Empty for generation
196
+
197
+ return "\n".join(chat_messages)
198
+
199
+ def analyze_chunk(self, chunk: str, analysis_type: str = 'standard',
200
+ cache_manager = None) -> Dict[str, Any]:
201
+ """
202
+ Analyze a single text chunk for loopholes and ambiguities
203
+
204
+ Args:
205
+ chunk: Text chunk to analyze
206
+ analysis_type: Type of analysis to perform
207
+ cache_manager: Cache manager instance for caching results
208
+
209
+ Returns:
210
+ Analysis results
211
+ """
212
+ if not self.is_loaded and not self.load_model():
213
+ return {
214
+ 'error': 'Model not loaded',
215
+ 'chunk': chunk[:100] + "..." if len(chunk) > 100 else chunk
216
+ }
217
+
218
+ # Check cache first
219
+ if cache_manager:
220
+ cached_result = cache_manager.get(
221
+ chunk,
222
+ self.model_config,
223
+ {'analysis_type': analysis_type}
224
+ )
225
+ if cached_result:
226
+ return cached_result
227
+
228
+ try:
229
+ # Prepare analysis template
230
+ template_config = self.analysis_templates.get(analysis_type, self.analysis_templates['standard'])
231
+
232
+ # Build the full prompt
233
+ reasoning_format = """
234
+ Write your complete analysis between <start_working_out> and <end_working_out>.
235
+
236
+ Then provide your overall conclusion between <SOLUTION> and </SOLUTION>.
237
+ """
238
+
239
+ recommendations_format = """
240
+ **Recommendations**: Provide specific recommendations for addressing identified issues.
241
+ """ if template_config['include_recommendations'] else ""
242
+
243
+ full_prompt = self.prompt_templates['loophole_analysis'].format(
244
+ text=chunk,
245
+ reasoning_format=reasoning_format,
246
+ recommendations_format=recommendations_format,
247
+ depth=template_config['depth'],
248
+ focus_areas=', '.join(template_config['focus_areas'])
249
+ )
250
+
251
+ # Generate chat template
252
+ chat_template = self.generate_chat_template(full_prompt)
253
+
254
+ # Generate response
255
+ response = self._generate_response(chat_template)
256
+
257
+ # Parse and structure the response
258
+ structured_response = self._parse_response(response)
259
+
260
+ # Add metadata
261
+ result = {
262
+ 'chunk': chunk,
263
+ 'analysis_type': analysis_type,
264
+ 'model_config': self.model_config,
265
+ 'response': response,
266
+ 'structured_analysis': structured_response,
267
+ 'processing_time': time.time(),
268
+ 'chunk_size': len(chunk),
269
+ 'word_count': len(chunk.split())
270
+ }
271
+
272
+ # Cache the result
273
+ if cache_manager:
274
+ cache_manager.put(chunk, result, self.model_config, {'analysis_type': analysis_type})
275
+
276
+ return result
277
+
278
+ except Exception as e:
279
+ return {
280
+ 'error': str(e),
281
+ 'chunk': chunk[:100] + "..." if len(chunk) > 100 else chunk
282
+ }
283
+
284
+ def _generate_response(self, prompt: str, max_tokens: int = None) -> str:
285
+ """
286
+ Generate a response from the model
287
+
288
+ Args:
289
+ prompt: Input prompt
290
+ max_tokens: Maximum tokens to generate
291
+
292
+ Returns:
293
+ Generated response
294
+ """
295
+ if max_tokens is None:
296
+ max_tokens = self.model_config.get('max_tokens', 4096)
297
+
298
+ try:
299
+ response = self.model(
300
+ prompt,
301
+ max_tokens=max_tokens,
302
+ temperature=self.model_config.get('temperature', 0.3),
303
+ top_p=self.model_config.get('top_p', 0.85),
304
+ top_k=self.model_config.get('top_k', 50),
305
+ repeat_penalty=self.model_config.get('repeat_penalty', 1.15),
306
+ stop=["<end_working_out>", "</SOLUTION>", "<|im_end|>"],
307
+ echo=False
308
+ )
309
+
310
+ return response['choices'][0]['text'].strip()
311
+
312
+ except Exception as e:
313
+ print(f"Error generating response: {e}")
314
+ return ""
315
+
316
+ def _parse_response(self, response: str) -> Dict[str, Any]:
317
+ """
318
+ Parse the LLM response into structured data
319
+
320
+ Args:
321
+ response: Raw LLM response
322
+
323
+ Returns:
324
+ Structured analysis data
325
+ """
326
+ structured = {
327
+ 'text_meaning': '',
328
+ 'key_assumptions': [],
329
+ 'exploitable_interpretations': [],
330
+ 'critical_loopholes': [],
331
+ 'circumvention_strategies': [],
332
+ 'recommendations': [],
333
+ 'confidence_score': 0,
334
+ 'analysis_quality': 'unknown'
335
+ }
336
+
337
+ try:
338
+ # Extract sections using regex patterns
339
+ patterns = {
340
+ 'text_meaning': r'\*\*Text Meaning\*\*:\s*(.*?)(?=\*\*|$)',
341
+ 'key_assumptions': r'\*\*Key Assumptions\*\*:\s*(.*?)(?=\*\*|$)',
342
+ 'exploitable_interpretations': r'\*\*Exploitable Interpretations\*\*:\s*(.*?)(?=\*\*|$)',
343
+ 'critical_loopholes': r'\*\*Critical Loopholes\*\*:\s*(.*?)(?=\*\*|$)',
344
+ 'circumvention_strategies': r'\*\*Circumvention Strategies\*\*:\s*(.*?)(?=\*\*|$)',
345
+ 'recommendations': r'\*\*Recommendations\*\*:\s*(.*?)(?=\*\*|$)',
346
+ }
347
+
348
+ for key, pattern in patterns.items():
349
+ matches = re.findall(pattern, response, re.DOTALL | re.IGNORECASE)
350
+ if matches:
351
+ content = matches[0].strip()
352
+ if key in ['key_assumptions', 'exploitable_interpretations',
353
+ 'critical_loopholes', 'circumvention_strategies', 'recommendations']:
354
+ # Split into list items
355
+ items = re.findall(r'(?:\d+\.|-|•)\s*(.*?)(?=(?:\d+\.|-|•)|$)',
356
+ content, re.DOTALL)
357
+ structured[key] = [item.strip() for item in items if item.strip()]
358
+ else:
359
+ structured[key] = content
360
+
361
+ # Calculate confidence score based on analysis completeness
362
+ completeness_score = 0
363
+ if structured['text_meaning']:
364
+ completeness_score += 20
365
+ for key in ['key_assumptions', 'exploitable_interpretations',
366
+ 'critical_loopholes', 'circumvention_strategies']:
367
+ if structured[key]:
368
+ completeness_score += 20
369
+
370
+ structured['confidence_score'] = min(100, completeness_score)
371
+
372
+ # Determine analysis quality
373
+ if structured['confidence_score'] >= 80:
374
+ structured['analysis_quality'] = 'high'
375
+ elif structured['confidence_score'] >= 60:
376
+ structured['analysis_quality'] = 'medium'
377
+ else:
378
+ structured['analysis_quality'] = 'low'
379
+
380
+ except Exception as e:
381
+ print(f"Error parsing response: {e}")
382
+ structured['error'] = str(e)
383
+
384
+ return structured
385
+
386
+ def batch_analyze_chunks(self, chunks: List[str], analysis_type: str = 'standard',
387
+ cache_manager = None, progress_callback = None) -> List[Dict[str, Any]]:
388
+ """
389
+ Analyze multiple chunks in batch
390
+
391
+ Args:
392
+ chunks: List of text chunks to analyze
393
+ analysis_type: Type of analysis to perform
394
+ cache_manager: Cache manager instance
395
+ progress_callback: Callback function for progress updates
396
+
397
+ Returns:
398
+ List of analysis results
399
+ """
400
+ results = []
401
+ total_chunks = len(chunks)
402
+
403
+ for i, chunk in enumerate(chunks):
404
+ if progress_callback:
405
+ progress = (i + 1) / total_chunks
406
+ progress_callback(progress, f"Analyzing chunk {i + 1}/{total_chunks}")
407
+
408
+ result = self.analyze_chunk(chunk, analysis_type, cache_manager)
409
+ results.append(result)
410
+
411
+ return results
412
+
413
+ def get_model_info(self) -> Dict[str, Any]:
414
+ """Get information about the loaded model"""
415
+ if not self.is_loaded:
416
+ return {'status': 'not_loaded'}
417
+
418
+ try:
419
+ return {
420
+ 'status': 'loaded',
421
+ 'config': self.model_config,
422
+ 'model_type': type(self.model).__name__,
423
+ 'context_length': self.model_config.get('context_length', 'unknown'),
424
+ 'vocab_size': getattr(self.model, 'vocab_size', 'unknown')
425
+ }
426
+ except Exception as e:
427
+ return {
428
+ 'status': 'error',
429
+ 'error': str(e)
430
+ }
431
+
432
+ def validate_model_config(self) -> Dict[str, Any]:
433
+ """Validate the current model configuration"""
434
+ validation = {
435
+ 'is_valid': True,
436
+ 'issues': [],
437
+ 'warnings': []
438
+ }
439
+
440
+ # Check required parameters
441
+ required_params = ['context_length', 'max_tokens']
442
+ for param in required_params:
443
+ if param not in self.model_config:
444
+ validation['issues'].append(f"Missing required parameter: {param}")
445
+ validation['is_valid'] = False
446
+
447
+ # Check parameter ranges
448
+ if 'context_length' in self.model_config:
449
+ if self.model_config['context_length'] < 1024:
450
+ validation['issues'].append("Context length too small (minimum: 1024)")
451
+ validation['is_valid'] = False
452
+
453
+ if 'max_tokens' in self.model_config:
454
+ if self.model_config['max_tokens'] < 64:
455
+ validation['issues'].append("Max tokens too small (minimum: 64)")
456
+ validation['is_valid'] = False
457
+
458
+ if 'temperature' in self.model_config:
459
+ temp = self.model_config['temperature']
460
+ if not (0 <= temp <= 2):
461
+ validation['issues'].append("Temperature out of valid range (0-2)")
462
+ validation['is_valid'] = False
463
+
464
+ # Check model path/file
465
+ if 'path' in self.model_config and self.model_config['path']:
466
+ if not os.path.exists(self.model_config['path']):
467
+ validation['warnings'].append(f"Model file not found: {self.model_config['path']}")
468
+
469
+ return validation
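
A minimal usage sketch for the analyzer above (repo_id and filename are placeholders, a real GGUF model must be reachable for load_model() to succeed, and the import paths assume this upload's package layout):

# Illustrative only: wires the LLMAnalyzer to the cache manager and analyzes one chunk.
from streamlit_app.core.llm_analyzer import LLMAnalyzer
from streamlit_app.core.cache_manager import get_cache_manager

model_config = {
    "repo_id": "your-org/your-gguf-model",  # placeholder HuggingFace repo
    "filename": "model-q4_k_m.gguf",        # placeholder GGUF filename
    "context_length": 4096,
    "max_tokens": 1024,
    "temperature": 0.3,
}

analyzer = LLMAnalyzer(model_config)
print(analyzer.validate_model_config())     # sanity-check the config before loading

if analyzer.load_model():
    cache = get_cache_manager(persistent=False)
    result = analyzer.analyze_chunk("Example section text.", "standard", cache)
    print(result.get("structured_analysis", result))
    analyzer.unload_model()
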
streamlit_app/core/text_processor.py ADDED
@@ -0,0 +1,377 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Text Processor
4
+
5
+ Handles text cleaning, chunking, and preprocessing for the NZ Legislation Loophole Analysis.
6
+ Optimized for legal/legislative content with specialized cleaning and structuring.
7
+ """
8
+
9
+ import re
10
+ from typing import List, Dict, Any, Optional, Tuple
11
+ import hashlib
12
+ import json
13
+
14
+ class TextProcessor:
15
+ """Advanced text processing for legislation analysis"""
16
+
17
+ def __init__(self):
18
+ """Initialize the text processor with legal-specific patterns"""
19
+ # Legal-specific patterns
20
+ self.section_pattern = re.compile(r'^(\d+):', re.MULTILINE)
21
+ self.act_name_pattern = re.compile(r'(\b\w+(?:\s+\w+)*)\s+Act\s+(\d{4})', re.IGNORECASE)
22
+ self.date_patterns = [
23
+ (r'(\d{1,2})\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})',
24
+ lambda m: f"{m.group(1)} {m.group(2)}"),
25
+ (r'(\d{1,2})/(\d{1,2})/(\d{4})', r'\1/\2/\3'),
26
+ (r'(\d{4})-(\d{2})-(\d{2})', r'\1-\2-\3')
27
+ ]
28
+
29
+ # NZ-specific legal terms
30
+ self.nz_terms = {
31
+ 'New Zealand': 'New Zealand',
32
+ 'Parliament': 'Parliament',
33
+ 'Crown': 'Crown',
34
+ 'Government': 'Government',
35
+ 'Treaty of Waitangi': 'Treaty of Waitangi',
36
+ 'NZB': 'NZB',
37
+ 'Her Majesty': 'Her Majesty',
38
+ 'Governor-General': 'Governor-General'
39
+ }
40
+
41
+ def clean_text(self, text: str, preserve_structure: bool = True) -> str:
42
+ """
43
+ Clean and normalize text for better processing, optimized for legal content
44
+
45
+ Args:
46
+ text: Raw text to clean
47
+ preserve_structure: Whether to preserve legal document structure
48
+
49
+ Returns:
50
+ Cleaned text
51
+ """
52
+ if not text:
53
+ return ""
54
+
55
+ # Preserve section numbers and legal structure if requested
56
+ if preserve_structure:
57
+ # Keep section numbers like "1:", "2:", etc.
58
+ text = self.section_pattern.sub(r'\1:', text)  # keep the trailing colon so section headers stay detectable downstream
59
+
60
+ # Remove excessive whitespace but preserve paragraph structure
61
+ text = re.sub(r'[ \t]+', ' ', text) # Replace multiple spaces/tabs with single space
62
+ text = re.sub(r'\n\s*\n', '\n\n', text) # Preserve paragraph breaks but clean up
63
+ text = re.sub(r'\n{3,}', '\n\n', text) # Reduce excessive newlines to double
64
+
65
+ # Remove control characters but preserve legal formatting
66
+ text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)
67
+
68
+ # Handle legal-specific characters and formatting
69
+ allowed_chars = r'\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/\@\#\$\%\^\&\*\+\=\~\`°§'
70
+ text = re.sub(r'[^' + allowed_chars + ']', '', text)
71
+
72
+ # Normalize quotes and apostrophes for legal text
73
+ text = re.sub(r'[“”]', '"', text)
74
+ text = re.sub(r"[‘’]", "'", text)
75
+ text = re.sub(r'`', "'", text)
76
+
77
+ # Clean up legal numbering and references
78
+ text = re.sub(r'section\s+(\d+)', r'section \1', text, flags=re.IGNORECASE)
79
+
80
+ # Normalize date formats
81
+ for pattern, replacement in self.date_patterns:
82
+ if callable(replacement):
83
+ text = re.compile(pattern, re.IGNORECASE).sub(replacement, text)
84
+ else:
85
+ text = re.compile(pattern, re.IGNORECASE).sub(replacement, text)
86
+
87
+ # Normalize act names with years
88
+ text = self.act_name_pattern.sub(r'\1 Act', text)
89
+
90
+ # Clean up amendment references
91
+ text = re.sub(r'[Aa]mendment\(s\)\s+incorporated\s+in\s+the\s+[Aa]ct\(s\)', 'Amendments incorporated', text)
92
+
93
+ # Normalize section references
94
+ text = re.sub(r'section\s+\d+\(\d+\)\([a-zA-Z]\)', lambda m: m.group(0).lower(), text)
95
+
96
+ # Generic pattern for legal document sections
97
+ text = re.sub(r'(\b(?:section|part|chapter|article|clause|subsection|paragraph))\s+(\d+[a-zA-Z]*)',
98
+ lambda m: f"{m.group(1)} {m.group(2)}", text, flags=re.IGNORECASE)
99
+
100
+ # NZ-specific legal enhancements
101
+ for term, normalized in self.nz_terms.items():
102
+ text = re.sub(re.escape(term), normalized, text, flags=re.IGNORECASE)
103
+
104
+ # Handle Maori-specific characters if present
105
+ maori_chars = 'āēīōūĀĒĪŌŪwhWH'
106
+ allowed_chars += maori_chars
107
+ text = re.sub(r'[^' + allowed_chars + ']', '', text)
108
+
109
+ # Remove empty lines and trim while preserving legal structure
110
+ lines = []
111
+ for line in text.split('\n'):
112
+ stripped = line.strip()
113
+ if stripped: # Keep non-empty lines
114
+ if preserve_structure and re.match(r'^\d+:', stripped):
115
+ lines.append(stripped) # Preserve section headers
116
+ else:
117
+ lines.append(stripped)
118
+
119
+ text = '\n'.join(lines)
120
+
121
+ return text.strip()
122
+
123
+ def chunk_text(self, text: str, chunk_size: int = 4096, overlap: int = 256,
124
+ method: str = "sentence") -> List[str]:
125
+ """
126
+ Split text into overlapping chunks for processing
127
+
128
+ Args:
129
+ text: Text to chunk
130
+ chunk_size: Size of each chunk
131
+ overlap: Overlap between chunks
132
+ method: Chunking method ('sentence', 'word', 'character')
133
+
134
+ Returns:
135
+ List of text chunks
136
+ """
137
+ if not text or len(text) <= chunk_size:
138
+ return [text] if text else []
139
+
140
+ chunks = []
141
+
142
+ if method == "sentence":
143
+ chunks = self._chunk_by_sentence(text, chunk_size, overlap)
144
+ elif method == "word":
145
+ chunks = self._chunk_by_word(text, chunk_size, overlap)
146
+ else: # character
147
+ chunks = self._chunk_by_character(text, chunk_size, overlap)
148
+
149
+ return chunks
150
+
151
+ def _chunk_by_sentence(self, text: str, chunk_size: int, overlap: int) -> List[str]:
152
+ """Chunk text by sentence boundaries"""
153
+ # Split into sentences (rough approximation)
154
+ sentence_pattern = r'(?<=[.!?])\s+'
155
+ sentences = re.split(sentence_pattern, text)
156
+
157
+ chunks = []
158
+ current_chunk = ""
159
+ overlap_text = ""
160
+
161
+ for sentence in sentences:
162
+ if not sentence.strip():
163
+ continue
164
+
165
+ # Check if adding this sentence would exceed chunk size
166
+ potential_chunk = current_chunk + sentence + " "
167
+
168
+ if len(potential_chunk) > chunk_size and current_chunk:
169
+ # Save current chunk
170
+ chunks.append(current_chunk.strip())
171
+
172
+ # Start new chunk with overlap
173
+ if overlap > 0 and len(current_chunk) > overlap:
174
+ overlap_text = current_chunk[-overlap:].strip()
175
+ current_chunk = overlap_text + " " + sentence + " "
176
+ else:
177
+ current_chunk = sentence + " "
178
+ else:
179
+ current_chunk = potential_chunk
180
+
181
+ # Add the last chunk
182
+ if current_chunk.strip():
183
+ chunks.append(current_chunk.strip())
184
+
185
+ return chunks
186
+
187
+ def _chunk_by_word(self, text: str, chunk_size: int, overlap: int) -> List[str]:
188
+ """Chunk text by word boundaries"""
189
+ words = text.split()
190
+ chunks = []
191
+
192
+ if not words:
193
+ return []
194
+
195
+ start = 0
196
+ while start < len(words):
197
+ end = start + 1
198
+ chunk_words = []
199
+
200
+ # Build chunk up to chunk_size
201
+ while end <= len(words):
202
+ potential_chunk = " ".join(words[start:end])
203
+ if len(potential_chunk) > chunk_size:
204
+ break
205
+ chunk_words = words[start:end]
206
+ end += 1
207
+
208
+ if chunk_words:
209
+ chunk = " ".join(chunk_words)
210
+ chunks.append(chunk)
211
+
212
+ # Move start position with overlap
213
+ overlap_words = max(0, min(overlap // 5, len(chunk_words))) # Rough word overlap
214
+ start = max(start + 1, end - overlap_words)
215
+ else:
216
+ break
217
+
218
+ return chunks
219
+
220
+ def _chunk_by_character(self, text: str, chunk_size: int, overlap: int) -> List[str]:
221
+ """Chunk text by character count (simple fallback)"""
222
+ chunks = []
223
+ start = 0
224
+
225
+ while start < len(text):
226
+ end = min(start + chunk_size, len(text))
227
+ chunk = text[start:end]
228
+ chunks.append(chunk)
229
+
230
+ # Move start with overlap
231
+ start = end - overlap if end < len(text) else len(text)
232
+
233
+ return chunks
234
+
235
+ def extract_metadata(self, text: str) -> Dict[str, Any]:
236
+ """Extract metadata from legislation text"""
237
+ metadata = {
238
+ 'sections': [],
239
+ 'acts_referenced': [],
240
+ 'dates': [],
241
+ 'word_count': len(text.split()),
242
+ 'character_count': len(text),
243
+ 'has_nz_references': False,
244
+ 'has_maori_terms': False
245
+ }
246
+
247
+ # Extract section numbers
248
+ sections = self.section_pattern.findall(text)
249
+ metadata['sections'] = [int(s) for s in sections]
250
+
251
+ # Extract referenced acts
252
+ acts = self.act_name_pattern.findall(text)
253
+ metadata['acts_referenced'] = [f"{act[0]} Act" for act in acts]
254
+
255
+ # Check for NZ-specific references
256
+ nz_indicators = ['New Zealand', 'Parliament', 'Crown', 'Government', 'Treaty of Waitangi']
257
+ metadata['has_nz_references'] = any(term in text for term in nz_indicators)
258
+
259
+ # Check for Maori terms
260
+ maori_indicators = ['ā', 'ē', 'ī', 'ō', 'ū', 'whakapapa', 'tangata whenua', 'mana']
261
+ metadata['has_maori_terms'] = any(term in text.lower() for term in maori_indicators)
262
+
263
+ # Extract dates (basic)
264
+ date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b'
265
+ dates = re.findall(date_pattern, text)
266
+ metadata['dates'] = dates
267
+
268
+ return metadata
269
+
270
+ def calculate_text_hash(self, text: str) -> str:
271
+ """Calculate SHA-256 hash of text for caching"""
272
+ return hashlib.sha256(text.encode('utf-8')).hexdigest()
273
+
274
+ def get_chunk_statistics(self, chunks: List[str]) -> Dict[str, Any]:
275
+ """Get statistics about text chunks"""
276
+ if not chunks:
277
+ return {
278
+ 'total_chunks': 0,
279
+ 'avg_chunk_size': 0,
280
+ 'min_chunk_size': 0,
281
+ 'max_chunk_size': 0,
282
+ 'total_characters': 0
283
+ }
284
+
285
+ chunk_sizes = [len(chunk) for chunk in chunks]
286
+
287
+ return {
288
+ 'total_chunks': len(chunks),
289
+ 'avg_chunk_size': sum(chunk_sizes) / len(chunks),
290
+ 'min_chunk_size': min(chunk_sizes),
291
+ 'max_chunk_size': max(chunk_sizes),
292
+ 'total_characters': sum(chunk_sizes)
293
+ }
294
+
295
+ def preprocess_legislation_json(self, json_data: Dict[str, Any]) -> Dict[str, Any]:
296
+ """Preprocess legislation data from JSON format"""
297
+ processed = {
298
+ 'id': json_data.get('id', ''),
299
+ 'title': json_data.get('title', ''),
300
+ 'year': json_data.get('year', ''),
301
+ 'source': json_data.get('source', ''),
302
+ 'original_text': json_data.get('text', ''),
303
+ 'cleaned_text': '',
304
+ 'chunks': [],
305
+ 'metadata': {},
306
+ 'processing_stats': {}
307
+ }
308
+
309
+ # Clean the text
310
+ raw_text = json_data.get('text', '')
311
+ processed['cleaned_text'] = self.clean_text(raw_text)
312
+
313
+ # Extract metadata
314
+ processed['metadata'] = self.extract_metadata(processed['cleaned_text'])
315
+
316
+ return processed
317
+
318
+ def batch_process_texts(self, texts: List[str], chunk_size: int = 4096,
319
+ overlap: int = 256) -> List[Dict[str, Any]]:
320
+ """Process multiple texts in batch"""
321
+ results = []
322
+
323
+ for text in texts:
324
+ cleaned = self.clean_text(text)
325
+ chunks = self.chunk_text(cleaned, chunk_size, overlap)
326
+ metadata = self.extract_metadata(cleaned)
327
+ stats = self.get_chunk_statistics(chunks)
328
+
329
+ result = {
330
+ 'original_text': text,
331
+ 'cleaned_text': cleaned,
332
+ 'chunks': chunks,
333
+ 'metadata': metadata,
334
+ 'processing_stats': stats
335
+ }
336
+
337
+ results.append(result)
338
+
339
+ return results
340
+
341
+ def validate_text_quality(self, text: str) -> Dict[str, Any]:
342
+ """Validate and assess text quality for processing"""
343
+ quality = {
344
+ 'is_valid': True,
345
+ 'issues': [],
346
+ 'score': 100,
347
+ 'metrics': {}
348
+ }
349
+
350
+ # Check minimum length
351
+ if len(text.strip()) < 10:
352
+ quality['issues'].append("Text too short")
353
+ quality['score'] -= 50
354
+
355
+ # Check for excessive special characters
356
+ special_chars = len(re.findall(r'[^\w\s]', text))
357
+ special_ratio = special_chars / len(text) if text else 0
358
+ if special_ratio > 0.3:
359
+ quality['issues'].append("High special character ratio")
360
+ quality['score'] -= 20
361
+
362
+ # Check for legal content indicators
363
+ legal_indicators = ['section', 'act', 'law', 'regulation', 'clause', 'subsection']
364
+ has_legal_content = any(indicator in text.lower() for indicator in legal_indicators)
365
+ if not has_legal_content:
366
+ quality['issues'].append("May not be legal content")
367
+ quality['score'] -= 30
368
+
369
+ quality['is_valid'] = len(quality['issues']) == 0
370
+ quality['metrics'] = {
371
+ 'length': len(text),
372
+ 'word_count': len(text.split()),
373
+ 'special_char_ratio': special_ratio,
374
+ 'has_legal_content': has_legal_content
375
+ }
376
+
377
+ return quality
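
A minimal usage sketch of the TextProcessor defined above; clean_text, chunk_text, extract_metadata, get_chunk_statistics and validate_text_quality are as implemented in this file, and the sample string is illustrative only.

    from streamlit_app.core.text_processor import TextProcessor

    processor = TextProcessor()
    sample = "1: This section applies to employers in New Zealand under the Employment Relations Act 2000."
    cleaned = processor.clean_text(sample, preserve_structure=True)
    chunks = processor.chunk_text(cleaned, chunk_size=4096, overlap=256, method="sentence")

    print(processor.get_chunk_statistics(chunks))    # chunk counts and size statistics
    print(processor.extract_metadata(cleaned))       # sections, acts referenced, NZ/Maori indicators
    print(processor.validate_text_quality(cleaned))  # heuristic quality score and any issues
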
streamlit_app/utils/__pycache__/config.cpython-312.pyc ADDED
Binary file (11.8 kB).
 
streamlit_app/utils/__pycache__/performance.cpython-312.pyc ADDED
Binary file (13.7 kB).
 
streamlit_app/utils/__pycache__/ui_helpers.cpython-312.pyc ADDED
Binary file (21.1 kB).
 
streamlit_app/utils/config.py ADDED
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Configuration Manager
4
+
5
+ Handles all configuration settings for the NZ Legislation Loophole Analysis application.
6
+ Provides default configurations, persistent storage, and validation.
7
+ """
8
+
9
+ import json
10
+ import os
11
+ from pathlib import Path
12
+ from typing import Dict, Any, Optional
13
+ import streamlit as st
14
+
15
+ class ConfigManager:
16
+ """Configuration manager for the application"""
17
+
18
+ def __init__(self, config_file: str = None):
19
+ """
20
+ Initialize configuration manager
21
+
22
+ Args:
23
+ config_file: Path to configuration file (optional)
24
+ """
25
+ if config_file is None:
26
+ config_dir = Path(__file__).parent.parent / 'config'
27
+ config_dir.mkdir(exist_ok=True)
28
+ config_file = config_dir / 'app_config.json'
29
+
30
+ self.config_file = Path(config_file)
31
+ self._config = {}
32
+ self._load_config()
33
+
34
+ def _load_config(self):
35
+ """Load configuration from file or use defaults"""
36
+ if self.config_file.exists():
37
+ try:
38
+ with open(self.config_file, 'r', encoding='utf-8') as f:
39
+ self._config = json.load(f)
40
+ # Validate and merge with defaults
41
+ self._config = self._merge_with_defaults(self._config)
42
+ except (json.JSONDecodeError, IOError) as e:
43
+ print(f"Warning: Could not load config file: {e}")
44
+ self._config = self._get_default_config()
45
+ else:
46
+ self._config = self._get_default_config()
47
+
48
+ def _get_default_config(self) -> Dict[str, Any]:
49
+ """Get default configuration"""
50
+ return {
51
+ 'model': {
52
+ 'path': 'qwen3.gguf',
53
+ 'repo_id': 'DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF',
54
+ 'filename': 'Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-D_AU-IQ4_XS-imat.gguf',
55
+ 'context_length': 40960,
56
+ 'max_tokens': 4096,
57
+ 'temperature': 0.3,
58
+ 'top_p': 0.85,
59
+ 'top_k': 50,
60
+ 'repeat_penalty': 1.15
61
+ },
62
+ 'processing': {
63
+ 'chunk_size': 4096,
64
+ 'chunk_overlap': 256,
65
+ 'batch_size': 16,
66
+ 'clean_text': True,
67
+ 'preserve_structure': True
68
+ },
69
+ 'cache': {
70
+ 'enabled': True,
71
+ 'max_size_mb': 1024,
72
+ 'ttl_hours': 24,
73
+ 'persistent': True
74
+ },
75
+ 'analysis': {
76
+ 'depth': 'Standard',
77
+ 'include_recommendations': True,
78
+ 'focus_areas': ['loopholes', 'ambiguities', 'unintended_consequences'],
79
+ 'legal_domains': ['constitutional', 'administrative', 'criminal', 'civil']
80
+ },
81
+ 'ui': {
82
+ 'theme': 'Auto',
83
+ 'show_progress': True,
84
+ 'auto_refresh': False,
85
+ 'max_display_items': 50
86
+ },
87
+ 'advanced': {
88
+ 'debug_mode': False,
89
+ 'log_level': 'INFO',
90
+ 'memory_limit_mb': 8192,
91
+ 'thread_pool_size': 4,
92
+ 'save_intermediate_results': True
93
+ }
94
+ }
95
+
96
+ def _merge_with_defaults(self, user_config: Dict[str, Any]) -> Dict[str, Any]:
97
+ """Merge user configuration with defaults"""
98
+ default_config = self._get_default_config()
99
+
100
+ def merge_dicts(default: Dict[str, Any], user: Dict[str, Any]) -> Dict[str, Any]:
101
+ merged = default.copy()
102
+ for key, value in user.items():
103
+ if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
104
+ merged[key] = merge_dicts(merged[key], value)
105
+ else:
106
+ merged[key] = value
107
+ return merged
108
+
109
+ return merge_dicts(default_config, user_config)
110
+
111
+ def get_config(self) -> Dict[str, Any]:
112
+ """Get current configuration"""
113
+ return self._config.copy()
114
+
115
+ def update_config(self, new_config: Dict[str, Any]):
116
+ """Update configuration with validation"""
117
+ # Validate configuration
118
+ if self._validate_config(new_config):
119
+ self._config = self._merge_with_defaults(new_config)
120
+ self._save_config()
121
+ else:
122
+ raise ValueError("Invalid configuration provided")
123
+
124
+ def _validate_config(self, config: Dict[str, Any]) -> bool:
125
+ """Validate configuration values"""
126
+ try:
127
+ # Model validation
128
+ model_config = config.get('model', {})
129
+ if model_config.get('context_length', 0) < 1024:
130
+ return False
131
+ if model_config.get('max_tokens', 0) < 64:
132
+ return False
133
+ if not (0 <= model_config.get('temperature', 0) <= 2):
134
+ return False
135
+
136
+ # Processing validation
137
+ proc_config = config.get('processing', {})
138
+ if proc_config.get('chunk_size', 0) < 256:
139
+ return False
140
+ if proc_config.get('chunk_overlap', 0) >= proc_config.get('chunk_size', 1):
141
+ return False
142
+ if proc_config.get('batch_size', 0) < 1:
143
+ return False
144
+
145
+ # Cache validation
146
+ cache_config = config.get('cache', {})
147
+ if cache_config.get('max_size_mb', 0) < 100:
148
+ return False
149
+ if cache_config.get('ttl_hours', 0) < 1:
150
+ return False
151
+
152
+ return True
153
+ except Exception:
154
+ return False
155
+
156
+ def _save_config(self):
157
+ """Save configuration to file"""
158
+ try:
159
+ self.config_file.parent.mkdir(exist_ok=True)
160
+ with open(self.config_file, 'w', encoding='utf-8') as f:
161
+ json.dump(self._config, f, indent=2, ensure_ascii=False)
162
+ except IOError as e:
163
+ print(f"Warning: Could not save config file: {e}")
164
+
165
+ def reset_to_defaults(self):
166
+ """Reset configuration to defaults"""
167
+ self._config = self._get_default_config()
168
+ self._save_config()
169
+
170
+ def get_section(self, section: str) -> Dict[str, Any]:
171
+ """Get a specific configuration section"""
172
+ return self._config.get(section, {})
173
+
174
+ def update_section(self, section: str, values: Dict[str, Any]):
175
+ """Update a specific configuration section"""
176
+ if section not in self._config:
177
+ self._config[section] = {}
178
+
179
+ self._config[section].update(values)
180
+
181
+ # Validate the updated config
182
+ if self._validate_config(self._config):
183
+ self._save_config()
184
+ else:
185
+ raise ValueError(f"Invalid configuration for section: {section}")
186
+
187
+ def export_config(self, filepath: str) -> bool:
188
+ """Export configuration to file"""
189
+ try:
190
+ with open(filepath, 'w', encoding='utf-8') as f:
191
+ json.dump(self._config, f, indent=2, ensure_ascii=False)
192
+ return True
193
+ except IOError:
194
+ return False
195
+
196
+ def import_config(self, filepath: str) -> bool:
197
+ """Import configuration from file"""
198
+ try:
199
+ with open(filepath, 'r', encoding='utf-8') as f:
200
+ imported_config = json.load(f)
201
+
202
+ if self._validate_config(imported_config):
203
+ self._config = self._merge_with_defaults(imported_config)
204
+ self._save_config()
205
+ return True
206
+ else:
207
+ return False
208
+ except (IOError, json.JSONDecodeError):
209
+ return False
210
+
211
+ def get_model_config(self) -> Dict[str, Any]:
212
+ """Get model-specific configuration"""
213
+ return self._config.get('model', {})
214
+
215
+ def get_processing_config(self) -> Dict[str, Any]:
216
+ """Get processing-specific configuration"""
217
+ return self._config.get('processing', {})
218
+
219
+ def get_cache_config(self) -> Dict[str, Any]:
220
+ """Get cache-specific configuration"""
221
+ return self._config.get('cache', {})
222
+
223
+ def get_ui_config(self) -> Dict[str, Any]:
224
+ """Get UI-specific configuration"""
225
+ return self._config.get('ui', {})
226
+
227
+ def get_advanced_config(self) -> Dict[str, Any]:
228
+ """Get advanced configuration"""
229
+ return self._config.get('advanced', {})
230
+
231
+ # Global configuration instance
232
+ _config_instance = None
233
+
234
+ def get_config_manager(config_file: str = None) -> ConfigManager:
235
+ """Get or create global configuration manager instance"""
236
+ global _config_instance
237
+
238
+ if _config_instance is None:
239
+ _config_instance = ConfigManager(config_file)
240
+
241
+ return _config_instance
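
A minimal usage sketch for the configuration API above; the printed values are the defaults from _get_default_config(), and invalid updates are rejected by _validate_config.

    from streamlit_app.utils.config import get_config_manager

    config = get_config_manager()            # creates streamlit_app/config/app_config.json on first use
    model_cfg = config.get_model_config()
    print(model_cfg['context_length'], model_cfg['temperature'])   # 40960, 0.3 by default

    # Tighten chunking; out-of-range values raise ValueError via update_section()
    config.update_section('processing', {'chunk_size': 2048, 'chunk_overlap': 128})
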
streamlit_app/utils/performance.py ADDED
@@ -0,0 +1,271 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Performance Monitor
4
+
5
+ Monitors system performance metrics for the NZ Legislation Loophole Analysis application.
6
+ Tracks memory usage, CPU utilization, processing times, and other performance indicators.
7
+ """
8
+
9
+ import time
10
+ import threading
11
+ import platform
+ import psutil
12
+ from typing import Dict, Any, Optional, List
13
+ from collections import deque
14
+ import streamlit as st
15
+
16
+ class PerformanceMonitor:
17
+ """Performance monitoring system"""
18
+
19
+ def __init__(self, max_history: int = 1000):
20
+ """
21
+ Initialize performance monitor
22
+
23
+ Args:
24
+ max_history: Maximum number of historical data points to keep
25
+ """
26
+ self.max_history = max_history
27
+ self.lock = threading.RLock()
28
+
29
+ # Historical data storage
30
+ self.memory_history = deque(maxlen=max_history)
31
+ self.cpu_history = deque(maxlen=max_history)
32
+ self.processing_times = deque(maxlen=max_history)
33
+
34
+ # Current metrics
35
+ self.current_metrics = {
36
+ 'memory_usage_mb': 0,
37
+ 'memory_percent': 0,
38
+ 'cpu_percent': 0,
39
+ 'active_threads': 0,
40
+ 'processing_time_avg': 0,
41
+ 'processing_time_max': 0,
42
+ 'processing_time_min': 0,
43
+ 'total_processed_chunks': 0,
44
+ 'chunks_per_second': 0
45
+ }
46
+
47
+ # Processing timing
48
+ self.processing_start_time = None
49
+ self.last_chunk_time = time.time()
50
+
51
+ # Start monitoring thread
52
+ self.monitoring = True
53
+ self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
54
+ self.monitor_thread.start()
55
+
56
+ def _monitor_loop(self):
57
+ """Background monitoring loop"""
58
+ while self.monitoring:
59
+ try:
60
+ self._update_metrics()
61
+ time.sleep(1) # Update every second
62
+ except Exception as e:
63
+ print(f"Performance monitoring error: {e}")
64
+ time.sleep(5) # Wait longer on error
65
+
66
+ def _update_metrics(self):
67
+ """Update current performance metrics"""
68
+ process = psutil.Process()
69
+
70
+ with self.lock:
71
+ # Memory metrics
72
+ memory_info = process.memory_info()
73
+ memory_usage_mb = memory_info.rss / 1024 / 1024
74
+ memory_percent = process.memory_percent()
75
+
76
+ # CPU metrics
77
+ cpu_percent = process.cpu_percent(interval=0.1)
78
+
79
+ # Thread count
80
+ active_threads = len(process.threads())
81
+
82
+ # Update current metrics
83
+ self.current_metrics.update({
84
+ 'memory_usage_mb': memory_usage_mb,
85
+ 'memory_percent': memory_percent,
86
+ 'cpu_percent': cpu_percent,
87
+ 'active_threads': active_threads
88
+ })
89
+
90
+ # Store historical data
91
+ current_time = time.time()
92
+ self.memory_history.append((current_time, memory_usage_mb))
93
+ self.cpu_history.append((current_time, cpu_percent))
94
+
95
+ def start_processing_timer(self):
96
+ """Start timing a processing operation"""
97
+ self.processing_start_time = time.time()
98
+
99
+ def end_processing_timer(self) -> float:
100
+ """End timing and return elapsed time"""
101
+ if self.processing_start_time is None:
102
+ return 0
103
+
104
+ elapsed = time.time() - self.processing_start_time
105
+ self.processing_start_time = None
106
+
107
+ with self.lock:
108
+ self.processing_times.append(elapsed)
109
+
110
+ # Update processing time statistics
111
+ if self.processing_times:
112
+ self.current_metrics['processing_time_avg'] = sum(self.processing_times) / len(self.processing_times)
113
+ self.current_metrics['processing_time_max'] = max(self.processing_times)
114
+ self.current_metrics['processing_time_min'] = min(self.processing_times)
115
+
116
+ return elapsed
117
+
118
+ def record_chunk_processing(self):
119
+ """Record that a chunk has been processed"""
120
+ current_time = time.time()
121
+
122
+ with self.lock:
123
+ self.current_metrics['total_processed_chunks'] += 1
124
+
125
+ # Calculate chunks per second
126
+ time_diff = current_time - self.last_chunk_time
127
+ if time_diff > 0:
128
+ current_cps = 1.0 / time_diff
129
+ # Smooth the chunks per second calculation
130
+ self.current_metrics['chunks_per_second'] = (
131
+ 0.9 * self.current_metrics['chunks_per_second'] + 0.1 * current_cps
132
+ )
133
+
134
+ self.last_chunk_time = current_time
135
+
136
+ def get_stats(self) -> Dict[str, Any]:
137
+ """Get current performance statistics"""
138
+ with self.lock:
139
+ return self.current_metrics.copy()
140
+
141
+ def get_memory_history(self, time_window_seconds: int = 300) -> List[tuple]:
142
+ """Get memory usage history within time window"""
143
+ current_time = time.time()
144
+ cutoff_time = current_time - time_window_seconds
145
+
146
+ with self.lock:
147
+ return [(t, v) for t, v in self.memory_history if t >= cutoff_time]
148
+
149
+ def get_cpu_history(self, time_window_seconds: int = 300) -> List[tuple]:
150
+ """Get CPU usage history within time window"""
151
+ current_time = time.time()
152
+ cutoff_time = current_time - time_window_seconds
153
+
154
+ with self.lock:
155
+ return [(t, v) for t, v in self.cpu_history if t >= cutoff_time]
156
+
157
+ def get_processing_time_stats(self) -> Dict[str, Any]:
158
+ """Get processing time statistics"""
159
+ with self.lock:
160
+ if not self.processing_times:
161
+ return {
162
+ 'count': 0,
163
+ 'average': 0,
164
+ 'maximum': 0,
165
+ 'minimum': 0,
166
+ 'median': 0
167
+ }
168
+
169
+ sorted_times = sorted(self.processing_times)
170
+
171
+ return {
172
+ 'count': len(self.processing_times),
173
+ 'average': sum(self.processing_times) / len(self.processing_times),
174
+ 'maximum': max(self.processing_times),
175
+ 'minimum': min(self.processing_times),
176
+ 'median': sorted_times[len(sorted_times) // 2]
177
+ }
178
+
179
+ def get_system_info(self) -> Dict[str, Any]:
180
+ """Get system information"""
181
+ return {
182
+ 'cpu_count': psutil.cpu_count(logical=False),
183
+ 'cpu_count_logical': psutil.cpu_count(logical=True),
184
+ 'total_memory_gb': psutil.virtual_memory().total / (1024**3),
185
+ 'available_memory_gb': psutil.virtual_memory().available / (1024**3),
186
+ 'python_version': f"{platform.python_implementation()} {platform.python_version()}",
187
+ 'platform': platform.platform()
188
+ }
189
+
190
+ def reset_stats(self):
191
+ """Reset performance statistics"""
192
+ with self.lock:
193
+ self.processing_times.clear()
194
+ self.current_metrics['total_processed_chunks'] = 0
195
+ self.current_metrics['chunks_per_second'] = 0
196
+ self.current_metrics['processing_time_avg'] = 0
197
+ self.current_metrics['processing_time_max'] = 0
198
+ self.current_metrics['processing_time_min'] = 0
199
+
200
+ def cleanup(self):
201
+ """Cleanup resources"""
202
+ self.monitoring = False
203
+ if self.monitor_thread.is_alive():
204
+ self.monitor_thread.join(timeout=2)
205
+
206
+ def get_performance_report(self) -> Dict[str, Any]:
207
+ """Generate a comprehensive performance report"""
208
+ return {
209
+ 'current_metrics': self.get_stats(),
210
+ 'processing_stats': self.get_processing_time_stats(),
211
+ 'system_info': self.get_system_info(),
212
+ 'memory_history_count': len(self.memory_history),
213
+ 'cpu_history_count': len(self.cpu_history),
214
+ 'processing_times_count': len(self.processing_times)
215
+ }
216
+
217
+ def check_memory_threshold(self, threshold_mb: int) -> bool:
218
+ """Check if memory usage is above threshold"""
219
+ return self.current_metrics['memory_usage_mb'] > threshold_mb
220
+
221
+ def check_cpu_threshold(self, threshold_percent: float) -> bool:
222
+ """Check if CPU usage is above threshold"""
223
+ return self.current_metrics['cpu_percent'] > threshold_percent
224
+
225
+ def get_recommendations(self) -> List[str]:
226
+ """Get performance recommendations based on current metrics"""
227
+ recommendations = []
228
+
229
+ # Memory recommendations
230
+ if self.current_metrics['memory_usage_mb'] > 7000:
231
+ recommendations.append("High memory usage detected. Consider reducing batch size or chunk size.")
232
+ elif self.current_metrics['memory_usage_mb'] > 5000:
233
+ recommendations.append("Moderate memory usage. Monitor closely during processing.")
234
+
235
+ # CPU recommendations
236
+ if self.current_metrics['cpu_percent'] > 90:
237
+ recommendations.append("High CPU usage. Consider reducing processing intensity.")
238
+ elif self.current_metrics['cpu_percent'] > 70:
239
+ recommendations.append("Moderate CPU usage. Processing is running optimally.")
240
+
241
+ # Processing speed recommendations
242
+ avg_time = self.current_metrics.get('processing_time_avg', 0)
243
+ if avg_time > 10:
244
+ recommendations.append("Slow processing detected. Consider using a more powerful model or optimizing settings.")
245
+ elif avg_time > 5:
246
+ recommendations.append("Moderate processing speed. Consider increasing batch size if memory allows.")
247
+
248
+ # Cache recommendations
249
+ # This would be integrated with cache manager stats
250
+ chunks_per_second = self.current_metrics.get('chunks_per_second', 0)
251
+ if chunks_per_second < 1:
252
+ recommendations.append("Low processing throughput. Consider optimizing chunk size or model parameters.")
253
+
254
+ if not recommendations:
255
+ recommendations.append("Performance is optimal. All metrics are within normal ranges.")
256
+
257
+ return recommendations
258
+
259
+ # Global performance monitor instance
260
+ _performance_instance = None
261
+ _performance_lock = threading.Lock()
262
+
263
+ def get_performance_monitor(max_history: int = 1000) -> PerformanceMonitor:
264
+ """Get or create global performance monitor instance"""
265
+ global _performance_instance
266
+
267
+ with _performance_lock:
268
+ if _performance_instance is None:
269
+ _performance_instance = PerformanceMonitor(max_history)
270
+
271
+ return _performance_instance
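
A minimal usage sketch for the monitor above; the sleep stands in for real chunk analysis work.

    import time
    from streamlit_app.utils.performance import get_performance_monitor

    monitor = get_performance_monitor()      # starts the background sampling thread

    monitor.start_processing_timer()
    time.sleep(0.5)                          # placeholder for analysing one chunk
    elapsed = monitor.end_processing_timer()
    monitor.record_chunk_processing()

    print(f"last chunk took {elapsed:.2f}s")
    print(monitor.get_stats()['memory_usage_mb'])
    for tip in monitor.get_recommendations():
        print("-", tip)

    monitor.cleanup()                        # stop the monitoring thread when finished
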
streamlit_app/utils/ui_helpers.py ADDED
@@ -0,0 +1,415 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ UI Helpers
4
+
5
+ Utility functions and components for the Streamlit application UI.
6
+ Provides reusable UI elements, formatting functions, and visual components.
7
+ """
8
+
9
+ import streamlit as st
10
+ import pandas as pd
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ from typing import Dict, Any, List, Optional, Tuple
14
+ import time
15
+ from datetime import datetime
16
+ import io
+ import json
17
+
18
+ class UIHelpers:
19
+ """UI helper functions and components"""
20
+
21
+ @staticmethod
22
+ def create_metric_card(title: str, value: Any, delta: Optional[Any] = None,
23
+ delta_color: str = "normal", help_text: Optional[str] = None):
24
+ """Create a styled metric card"""
25
+ if isinstance(value, float):
26
+ if title.lower().endswith(('rate', 'ratio', 'percentage', 'percent')):
27
+ formatted_value = f"{value:.1f}"
28
+ else:
29
+ formatted_value = f"{value:.2f}"
30
+ else:
31
+ formatted_value = str(value)
32
+
33
+ return st.metric(
34
+ label=title,
35
+ value=formatted_value,
36
+ delta=delta,
37
+ delta_color=delta_color,
38
+ help=help_text
39
+ )
40
+
41
+ @staticmethod
42
+ def create_progress_bar(progress: float, text: str = "", color: str = "primary"):
43
+ """Create a styled progress bar with text"""
44
+ if text:
45
+ st.write(f"**{text}**")
46
+
47
+ if color == "success":
48
+ bar_color = "#28a745"
49
+ elif color == "warning":
50
+ bar_color = "#ffc107"
51
+ elif color == "danger":
52
+ bar_color = "#dc3545"
53
+ else:
54
+ bar_color = None
55
+
56
+ st.progress(progress, text=f"{progress:.1%} Complete")
57
+
58
+ @staticmethod
59
+ def create_info_box(message: str, type: str = "info"):
60
+ """Create a styled info/warning/success box"""
61
+ if type == "success":
62
+ st.success(message)
63
+ elif type == "warning":
64
+ st.warning(message)
65
+ elif type == "error":
66
+ st.error(message)
67
+ else:
68
+ st.info(message)
69
+
70
+ @staticmethod
71
+ def format_file_size(size_bytes: int) -> str:
72
+ """Format file size in human-readable format"""
73
+ for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
74
+ if size_bytes < 1024.0:
75
+ return f"{size_bytes:.1f} {unit}"
76
+ size_bytes /= 1024.0
77
+ return f"{size_bytes:.1f} PB"
78
+
79
+ @staticmethod
80
+ def format_time_duration(seconds: float) -> str:
81
+ """Format time duration in human-readable format"""
82
+ if seconds < 60:
83
+ return f"{seconds:.1f}s"
84
+ elif seconds < 3600:
85
+ minutes = int(seconds // 60)
86
+ remaining_seconds = seconds % 60
87
+ return f"{minutes}m {remaining_seconds:.1f}s"
88
+ else:
89
+ hours = int(seconds // 3600)
90
+ minutes = int((seconds % 3600) // 60)
91
+ return f"{hours}h {minutes}m"
92
+
93
+ @staticmethod
94
+ def create_performance_chart(data: List[Tuple[float, float]],
95
+ title: str, y_label: str, color: str = "#1f77b4"):
96
+ """Create a performance chart using Plotly"""
97
+ if not data:
98
+ return None
99
+
100
+ times, values = zip(*data)
101
+
102
+ # Convert timestamps to relative time
103
+ start_time = min(times)
104
+ relative_times = [t - start_time for t in times]
105
+
106
+ fig = go.Figure()
107
+ fig.add_trace(go.Scatter(
108
+ x=relative_times,
109
+ y=values,
110
+ mode='lines+markers',
111
+ line=dict(color=color, width=2),
112
+ marker=dict(size=4),
113
+ name=y_label
114
+ ))
115
+
116
+ fig.update_layout(
117
+ title=title,
118
+ xaxis_title="Time (seconds)",
119
+ yaxis_title=y_label,
120
+ template="plotly_white",
121
+ height=300,
122
+ margin=dict(l=20, r=20, t=40, b=20)
123
+ )
124
+
125
+ return fig
126
+
127
+ @staticmethod
128
+ def create_comparison_chart(data_dict: Dict[str, List[float]],
129
+ title: str, x_label: str, y_label: str):
130
+ """Create a comparison bar chart"""
131
+ fig = go.Figure()
132
+
133
+ for label, values in data_dict.items():
134
+ fig.add_trace(go.Bar(
135
+ name=label,
136
+ x=list(range(len(values))),
137
+ y=values,
138
+ text=[f"{v:.2f}" for v in values],
139
+ textposition='auto',
140
+ ))
141
+
142
+ fig.update_layout(
143
+ title=title,
144
+ xaxis_title=x_label,
145
+ yaxis_title=y_label,
146
+ template="plotly_white",
147
+ height=400,
148
+ margin=dict(l=20, r=20, t=40, b=20)
149
+ )
150
+
151
+ return fig
152
+
153
+ @staticmethod
154
+ def create_analysis_summary(results: List[Dict[str, Any]]) -> Dict[str, Any]:
155
+ """Create a summary of analysis results"""
156
+ if not results:
157
+ return {
158
+ 'total_analyses': 0,
159
+ 'total_loopholes': 0,
160
+ 'avg_confidence': 0,
161
+ 'total_chunks': 0,
162
+ 'analysis_types': {}
163
+ }
164
+
165
+ total_loopholes = sum(len(result.get('loopholes', [])) for result in results)
166
+ total_confidence = sum(result.get('confidence', 0) for result in results)
167
+ total_chunks = sum(result.get('chunks_processed', 0) for result in results)
168
+
169
+ # Count analysis types
170
+ analysis_types = {}
171
+ for result in results:
172
+ analysis_type = result.get('analysis_type', 'Unknown')
173
+ analysis_types[analysis_type] = analysis_types.get(analysis_type, 0) + 1
174
+
175
+ return {
176
+ 'total_analyses': len(results),
177
+ 'total_loopholes': total_loopholes,
178
+ 'avg_confidence': total_confidence / len(results) if results else 0,
179
+ 'total_chunks': total_chunks,
180
+ 'analysis_types': analysis_types
181
+ }
182
+
183
+ @staticmethod
184
+ def display_analysis_result(result: Dict[str, Any], index: int = 0):
185
+ """Display a single analysis result in a formatted way"""
186
+ with st.expander(f"πŸ“‹ Analysis {index + 1}: {result.get('title', 'Unknown Title')}", expanded=index == 0):
187
+ col1, col2 = st.columns([2, 1])
188
+
189
+ with col1:
190
+ st.markdown("**Summary:**")
191
+ st.write(result.get('summary', 'No summary available'))
192
+
193
+ st.markdown("**Key Findings:**")
194
+ loopholes = result.get('loopholes', [])
195
+ if loopholes:
196
+ for i, loophole in enumerate(loopholes, 1):
197
+ st.markdown(f"{i}. {loophole}")
198
+ else:
199
+ st.write("No significant loopholes identified.")
200
+
201
+ if result.get('recommendations'):
202
+ st.markdown("**Recommendations:**")
203
+ for rec in result.get('recommendations', []):
204
+ st.markdown(f"β€’ {rec}")
205
+
206
+ with col2:
207
+ UIHelpers.create_metric_card(
208
+ "Confidence",
209
+ result.get('confidence', 0),
210
+ help_text="Model confidence in analysis"
211
+ )
212
+
213
+ UIHelpers.create_metric_card(
214
+ "Processing Time",
215
+ result.get('processing_time', 0),  # field name assumed; populate it in the analysis result if needed
216
+ help_text="Time taken to analyze this content"
217
+ )
218
+
219
+ UIHelpers.create_metric_card(
220
+ "Chunks Processed",
221
+ result.get('chunks_processed', 0),
222
+ help_text="Number of text chunks analyzed"
223
+ )
224
+
225
+ st.markdown("**Metadata:**")
226
+ st.write(f"**Source:** {result.get('source', 'Unknown')}")
227
+ st.write(f"**Date:** {result.get('date', 'Unknown')}")
228
+ st.write(f"**Analysis Type:** {result.get('analysis_type', 'Standard')}")
229
+
230
+ @staticmethod
231
+ def create_export_section(results: List[Dict[str, Any]]):
232
+ """Create the export section for results"""
233
+ st.subheader("πŸ’Ύ Export Results")
234
+
235
+ if not results:
236
+ st.info("No results to export")
237
+ return
238
+
239
+ col1, col2, col3 = st.columns(3)
240
+
241
+ with col1:
242
+ if st.button("πŸ“„ Export as JSON", use_container_width=True):
243
+ json_data = json.dumps(results, indent=2, ensure_ascii=False)
244
+ st.download_button(
245
+ label="Download JSON",
246
+ data=json_data,
247
+ file_name=f"nz_legislation_analysis_{int(time.time())}.json",
248
+ mime="application/json",
249
+ use_container_width=True
250
+ )
251
+
252
+ with col2:
253
+ if st.button("πŸ“Š Export as CSV", use_container_width=True):
254
+ df = pd.DataFrame(results)
255
+ csv_data = df.to_csv(index=False)
256
+ st.download_button(
257
+ label="Download CSV",
258
+ data=csv_data,
259
+ file_name=f"nz_legislation_analysis_{int(time.time())}.csv",
260
+ mime="text/csv",
261
+ use_container_width=True
262
+ )
263
+
264
+ with col3:
265
+ if st.button("πŸ“‹ Export as Excel", use_container_width=True):
266
+ df = pd.DataFrame(results)
267
+ buffer = io.BytesIO()
+ df.to_excel(buffer, index=False, engine='openpyxl')
+ excel_data = buffer.getvalue()
268
+ st.download_button(
269
+ label="Download Excel",
270
+ data=excel_data,
271
+ file_name=f"nz_legislation_analysis_{int(time.time())}.xlsx",
272
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
273
+ use_container_width=True
274
+ )
275
+
276
+ @staticmethod
277
+ def create_cache_management_section(cache_manager):
278
+ """Create cache management section"""
279
+ st.subheader("🧠 Cache Management")
280
+
281
+ cache_stats = cache_manager.get_stats()
282
+
283
+ col1, col2, col3, col4 = st.columns(4)
284
+
285
+ with col1:
286
+ UIHelpers.create_metric_card("Cache Hits", cache_stats['hits'])
287
+
288
+ with col2:
289
+ UIHelpers.create_metric_card("Cache Misses", cache_stats['misses'])
290
+
291
+ with col3:
292
+ UIHelpers.create_metric_card("Hit Rate", f"{cache_stats['hits'] / max(1, cache_stats['hits'] + cache_stats['misses']):.1%}")
293
+
294
+ with col4:
295
+ UIHelpers.create_metric_card("Cached Entries", cache_stats['entries'])
296
+
297
+ col1, col2, col3 = st.columns(3)
298
+
299
+ with col1:
300
+ if st.button("πŸ”„ Clear Cache", type="secondary", use_container_width=True):
301
+ cache_manager.clear_cache()
302
+ st.rerun()
303
+
304
+ with col2:
305
+ if st.button("πŸ“€ Export Cache", use_container_width=True):
306
+ import tempfile
307
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
308
+ success = cache_manager.export_cache(f.name)
309
+ if success:
310
+ st.success("Cache exported successfully")
311
+ else:
312
+ st.error("Failed to export cache")
313
+
314
+ with col3:
315
+ uploaded_cache = st.file_uploader("πŸ“₯ Import Cache", type=['json'])
316
+ if uploaded_cache:
317
+ import tempfile
318
+ with tempfile.NamedTemporaryFile(mode='wb', delete=False) as f:
319
+ f.write(uploaded_cache.read())
320
+ imported_count = cache_manager.import_cache(f.name)
321
+ st.success(f"Imported {imported_count} cache entries")
322
+
323
+ @staticmethod
324
+ def create_system_info_section(perf_monitor):
325
+ """Create system information section"""
326
+ st.subheader("πŸ’» System Information")
327
+
328
+ sys_info = perf_monitor.get_system_info()
329
+
330
+ col1, col2 = st.columns(2)
331
+
332
+ with col1:
333
+ st.markdown("**Hardware:**")
334
+ st.write(f"**CPU Cores:** {sys_info['cpu_count']} physical, {sys_info['cpu_count_logical']} logical")
335
+ st.write(f"**Total Memory:** {sys_info['total_memory_gb']:.1f} GB")
336
+ st.write(f"**Available Memory:** {sys_info['available_memory_gb']:.1f} GB")
337
+
338
+ with col2:
339
+ st.markdown("**Software:**")
340
+ st.write(f"**Python:** {sys_info['python_version']}")
341
+ st.write(f"**Platform:** {sys_info['platform']}")
342
+ st.write(f"**Active Threads:** {perf_monitor.get_stats()['active_threads']}")
343
+
344
+ @staticmethod
345
+ def create_performance_recommendations(perf_monitor):
346
+ """Create performance recommendations section"""
347
+ st.subheader("πŸ’‘ Performance Recommendations")
348
+
349
+ recommendations = perf_monitor.get_recommendations()
350
+
351
+ if recommendations:
352
+ for rec in recommendations:
353
+ if "High" in rec or "Slow" in rec:
354
+ st.error(rec)
355
+ elif "Moderate" in rec or "Consider" in rec:
356
+ st.warning(rec)
357
+ else:
358
+ st.info(rec)
359
+ else:
360
+ st.success("All performance metrics are within optimal ranges!")
361
+
362
+ @staticmethod
363
+ def create_loading_spinner(text: str = "Processing..."):
364
+ """Create a loading spinner"""
365
+ return st.spinner(text)
366
+
367
+ @staticmethod
368
+ def create_success_message(message: str):
369
+ """Create a success message"""
370
+ st.success(message)
371
+
372
+ @staticmethod
373
+ def create_error_message(message: str):
374
+ """Create an error message"""
375
+ st.error(message)
376
+
377
+ @staticmethod
378
+ def create_warning_message(message: str):
379
+ """Create a warning message"""
380
+ st.warning(message)
381
+
382
+ @staticmethod
383
+ def create_data_table(data: List[Dict[str, Any]], columns: Optional[List[str]] = None):
384
+ """Create a formatted data table"""
385
+ if not data:
386
+ st.info("No data to display")
387
+ return
388
+
389
+ df = pd.DataFrame(data)
390
+
391
+ if columns:
392
+ available_columns = [col for col in columns if col in df.columns]
393
+ if available_columns:
394
+ df = df[available_columns]
395
+
396
+ st.dataframe(df, use_container_width=True)
397
+
398
+ @staticmethod
399
+ def create_json_viewer(data: Dict[str, Any], title: str = "JSON Data"):
400
+ """Create a JSON viewer"""
401
+ st.subheader(title)
402
+
403
+ with st.expander("View JSON", expanded=False):
404
+ st.json(data)
405
+
406
+ @staticmethod
407
+ def create_file_preview(file_content: str, max_lines: int = 20):
408
+ """Create a file content preview"""
409
+ lines = file_content.split('\n')
410
+ preview_content = '\n'.join(lines[:max_lines])
411
+
412
+ if len(lines) > max_lines:
413
+ preview_content += f"\n\n... ({len(lines) - max_lines} more lines)"
414
+
415
+ st.text_area("File Preview", preview_content, height=200, disabled=True)
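
A minimal sketch of how these helpers compose inside a Streamlit page; the page module name and the analysis_results session key are assumptions, and the script is run with streamlit run.

    # hypothetical page module, e.g. streamlit_app/pages/summary.py
    import streamlit as st
    from streamlit_app.utils.ui_helpers import UIHelpers

    results = st.session_state.get('analysis_results', [])   # session key assumed
    summary = UIHelpers.create_analysis_summary(results)

    UIHelpers.create_metric_card("Total Analyses", summary['total_analyses'])
    UIHelpers.create_metric_card("Loopholes Found", summary['total_loopholes'])
    st.write("Dataset size:", UIHelpers.format_file_size(1_234_567))

    for i, result in enumerate(results):
        UIHelpers.display_analysis_result(result, index=i)

    UIHelpers.create_export_section(results)
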
test_app_imports.py ADDED
@@ -0,0 +1,178 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to validate Streamlit app imports and basic functionality
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ from pathlib import Path
9
+
10
+ def test_imports():
11
+ """Test that all required modules can be imported"""
12
+ print("πŸ” Testing Streamlit app imports...")
13
+
14
+ # Add current directory to Python path
15
+ current_dir = os.path.dirname(os.path.abspath(__file__))
16
+ if current_dir not in sys.path:
17
+ sys.path.insert(0, current_dir)
18
+
19
+ # Test core modules
20
+ modules_to_test = [
21
+ 'streamlit',
22
+ 'pandas',
23
+ 'plotly',
24
+ 'psutil',
25
+ 'numpy',
26
+ 'streamlit_app.core.cache_manager',
27
+ 'streamlit_app.core.text_processor',
28
+ 'streamlit_app.core.llm_analyzer',
29
+ 'streamlit_app.core.dataset_builder',
30
+ 'streamlit_app.utils.config',
31
+ 'streamlit_app.utils.performance',
32
+ 'streamlit_app.utils.ui_helpers'
33
+ ]
34
+
35
+ failed_imports = []
36
+
37
+ for module in modules_to_test:
38
+ try:
39
+ __import__(module)
40
+ print(f"βœ… {module}")
41
+ except ImportError as e:
42
+ print(f"❌ {module}: {e}")
43
+ failed_imports.append(module)
44
+ except Exception as e:
45
+ print(f"⚠️ {module}: Unexpected error - {e}")
46
+
47
+ if failed_imports:
48
+ print(f"\n❌ Failed to import {len(failed_imports)} modules:")
49
+ for module in failed_imports:
50
+ print(f" - {module}")
51
+ return False
52
+
53
+ print(f"\nβœ… All {len(modules_to_test)} modules imported successfully!")
54
+ return True
55
+
56
+ def test_core_functionality():
57
+ """Test basic functionality of core modules"""
58
+ print("\nπŸ”§ Testing core functionality...")
59
+
60
+ try:
61
+ # Test cache manager
62
+ from streamlit_app.core.cache_manager import CacheManager, get_cache_manager
63
+
64
+ cache = get_cache_manager(max_memory_mb=10, persistent=False) # Small cache for testing
65
+ cache_stats = cache.get_stats()
66
+ print(f"βœ… Cache Manager: {cache_stats}")
67
+
68
+ # Test text processor
69
+ from streamlit_app.core.text_processor import TextProcessor
70
+
71
+ processor = TextProcessor()
72
+ test_text = "This is a test of the New Zealand legislation analysis system."
73
+ cleaned = processor.clean_text(test_text)
74
+ chunks = processor.chunk_text(cleaned, chunk_size=50, overlap=10)
75
+ print(f"βœ… Text Processor: {len(chunks)} chunks created")
76
+
77
+ # Test configuration manager
78
+ from streamlit_app.utils.config import ConfigManager
79
+
80
+ config = ConfigManager()
81
+ config_dict = config.get_config()
82
+ print(f"βœ… Config Manager: {len(config_dict)} configuration sections")
83
+
84
+ # Test performance monitor
85
+ from streamlit_app.utils.performance import PerformanceMonitor
86
+
87
+ perf = PerformanceMonitor(max_history=10)
88
+ stats = perf.get_stats()
89
+ print(f"βœ… Performance Monitor: Memory usage {stats['memory_usage_mb']:.1f} MB")
90
+
91
+ # Test UI helpers (basic instantiation)
92
+ from streamlit_app.utils.ui_helpers import UIHelpers
93
+
94
+ helper = UIHelpers()
95
+ print("βœ… UI Helpers: Module loaded")
96
+
97
+ print("\nπŸŽ‰ All core functionality tests passed!")
98
+ return True
99
+
100
+ except Exception as e:
101
+ print(f"\n❌ Core functionality test failed: {e}")
102
+ import traceback
103
+ traceback.print_exc()
104
+ return False
105
+
106
+ def test_file_structure():
107
+ """Test that all required files exist"""
108
+ print("\nπŸ“ Testing file structure...")
109
+
110
+ required_files = [
111
+ 'streamlit_app/app.py',
112
+ 'streamlit_app/core/cache_manager.py',
113
+ 'streamlit_app/core/text_processor.py',
114
+ 'streamlit_app/core/llm_analyzer.py',
115
+ 'streamlit_app/core/dataset_builder.py',
116
+ 'streamlit_app/utils/config.py',
117
+ 'streamlit_app/utils/performance.py',
118
+ 'streamlit_app/utils/ui_helpers.py',
119
+ 'requirements.txt',
120
+ 'run_streamlit_app.py',
121
+ 'README_Streamlit_App.md'
122
+ ]
123
+
124
+ missing_files = []
125
+
126
+ for file_path in required_files:
127
+ if not Path(file_path).exists():
128
+ missing_files.append(file_path)
129
+ print(f"❌ Missing: {file_path}")
130
+ else:
131
+ print(f"βœ… Found: {file_path}")
132
+
133
+ if missing_files:
134
+ print(f"\n❌ Missing {len(missing_files)} files:")
135
+ for file_path in missing_files:
136
+ print(f" - {file_path}")
137
+ return False
138
+
139
+ print(f"\nβœ… All {len(required_files)} files present!")
140
+ return True
141
+
142
+ def main():
143
+ """Main test function"""
144
+ print("πŸ›οΈ NZ Legislation Loophole Analysis - App Validation")
145
+ print("=" * 60)
146
+
147
+ all_passed = True
148
+
149
+ # Test file structure
150
+ if not test_file_structure():
151
+ all_passed = False
152
+
153
+ # Test imports
154
+ if not test_imports():
155
+ all_passed = False
156
+
157
+ # Test core functionality
158
+ if not test_core_functionality():
159
+ all_passed = False
160
+
161
+ print("\n" + "=" * 60)
162
+ if all_passed:
163
+ print("πŸŽ‰ VALIDATION COMPLETE - App is ready to run!")
164
+ print("\nπŸš€ To start the application:")
165
+ print(" python run_streamlit_app.py")
166
+ print("\nπŸ“± Then visit: http://localhost:8501")
167
+ else:
168
+ print("❌ VALIDATION FAILED - Please check the errors above")
169
+ print("\nπŸ”§ Troubleshooting:")
170
+ print(" - Ensure all dependencies are installed: pip install -r requirements.txt")
171
+ print(" - Check Python version (3.8+ required)")
172
+ print(" - Verify file permissions")
173
+
174
+ return all_passed
175
+
176
+ if __name__ == "__main__":
177
+ success = main()
178
+ sys.exit(0 if success else 1)
trl copy.py ADDED
@@ -0,0 +1,532 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ NZ Legislation Loophole Analysis Dataset Creation Tool
4
+
5
+ This script processes New Zealand legislation text to create a finetuning dataset for AI models
6
+ that can identify potential loopholes, ambiguities, and unintended consequences in legal text.
7
+
8
+ The script:
9
+ 1. Loads and cleans NZ legislation text, preserving legal structure and terminology
10
+ 2. Chunks the text into manageable sections with overlap for context
11
+ 3. Uses an LLM to analyze each chunk for legal issues
12
+ 4. Generates a structured dataset for training AI models on legal loophole detection
13
+
14
+ Usage:
15
+ python trl.py
16
+
17
+ Requirements:
18
+ - llama-cpp-python with GGUF model support
19
+ - psutil for memory monitoring
20
+ - Input file: nz-legislation.txt containing NZ legislation in JSON lines format
21
+
22
+ Output:
23
+ - JSON dataset saved to nz_legislation_dataset/nz_legislation_loophole_dataset.json
24
+ """
25
+
26
+ import os
27
+ import json
28
+ import time
29
+ import psutil
30
+ from typing import List, Dict, Any
31
+ import numpy as np
32
+ from llama_cpp import Llama
33
+ import re
34
+
35
+ # Placeholder classes and functions for missing dependencies
36
+ class ProgressManager:
37
+ """Simple placeholder for progress tracking"""
38
+ def __init__(self):
39
+ pass
40
+
41
+ def show_memory_usage(label: str):
42
+ """Simple memory usage display"""
43
+ process = psutil.Process(os.getpid())
44
+ memory_mb = process.memory_info().rss / 1024 / 1024
45
+ print(f"{label}: {memory_mb:.2f} MB")
46
+
47
+ # Configuration for NZ Legislation Loophole Analysis Dataset Creation
48
+ INPUT_FILE = "nz-legislation.txt" # Path to New Zealand legislation JSON dataset
49
+ OUTPUT_DIR = "nz_legislation_dataset" # Directory to save the dataset
50
+ CHUNK_SIZE = 4096 # Size of text chunks for processing legislation sections
51
+ CHUNK_OVERLAP = 256 # Overlap between chunks to maintain context
52
+ BATCH_SIZE = 16 # Number of chunks to process at once
53
+ MODEL_PATH = "qwen3.gguf" # Path to your Qwen3 GGUF model
54
+ MAX_TOKENS = 4096 # Maximum tokens for model response
55
+
56
+ # Ensure output directory exists
57
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
58
+
59
+ def load_model(progress_manager: ProgressManager = None):
60
+ """Load the LLM model for text generation with progress tracking"""
61
+ if progress_manager is None:
62
+ progress_manager = ProgressManager()
63
+
64
+ print("Loading LLM model...")
65
+ show_memory_usage("Initial memory usage")
66
+
67
+ start_time = time.time()
68
+ try:
69
+ model = Llama.from_pretrained(
70
+ repo_id="DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF",
71
+ filename="Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-D_AU-IQ4_XS-imat.gguf",
72
+ n_ctx=40960, # Context length
73
+ n_threads=8, # Adjust based on your CPU
74
+ verbose=False,
75
+ n_gpu_layers=-1, # Use all available GPU layers
76
+ n_batch=4096, # Batch size for processing
77
+ logits_all=False, # Optimize for text generation
78
+ use_mlock=True, # Lock model in memory if possible
79
+ use_mmap=True, # Use memory mapping for better performance
80
+ )
81
+ except Exception as e:
82
+ print(f"Error loading model: {e}")
83
+ print("Trying with basic configuration...")
84
+ # Fallback to basic configuration
85
+ model = Llama(
86
+ model_path=MODEL_PATH,
87
+ n_ctx=40960,
88
+ n_threads=8,
89
+ verbose=False,
90
+ n_gpu_layers=-1,
91
+ n_batch=4096
92
+ )
93
+
94
+ load_time = time.time() - start_time
95
+ print(f"LLM model loaded in {load_time:.2f}s")
96
+ show_memory_usage("Memory after model load")
97
+
98
+ return model
99
+
100
+ def clean_text(text: str) -> str:
101
+ """Clean and normalize text for better embedding quality, optimized for legal/legislative content"""
102
+ import re
103
+
104
+ # Preserve section numbers and legal structure while cleaning:
105
+ # markers like "1:", "2:" at the start of a line are deliberately left untouched,
106
+ # so no substitution is applied to them here.
107
+
108
+ # Remove excessive whitespace but preserve paragraph structure
109
+ text = re.sub(r'[ \t]+', ' ', text) # Replace multiple spaces/tabs with single space
110
+ text = re.sub(r'\n\s*\n', '\n\n', text) # Preserve paragraph breaks but clean up
111
+ text = re.sub(r'\n{3,}', '\n\n', text) # Reduce excessive newlines to double
112
+
113
+ # Remove control characters but preserve legal formatting
114
+ text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text) # Remove control chars except tab and newline
115
+
116
+ # Handle legal-specific characters and formatting
117
+ # Keep legal punctuation and symbols
118
+ # Curly quotes must survive this filter so they can be normalized to straight quotes below
+ allowed_chars = r'\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/\@\#\$\%\^\&\*\+\=\~\`°§' + '\u201c\u201d\u2018\u2019'
119
+ text = re.sub(r'[^' + allowed_chars + ']', '', text)
120
+
121
+ # Normalize quotes and apostrophes for legal text
122
+ text = re.sub('[\u201c\u201d]', '"', text) # Normalize curly double quotes to straight quotes
124
+ text = re.sub('[\u2018\u2019]', "'", text) # Normalize curly single quotes/apostrophes
124
+ text = re.sub(r'`', "'", text) # Replace backticks with apostrophes
125
+
126
+ # Clean up legal numbering and references
127
+ # Normalize section references
128
+ text = re.sub(r'section\s+(\d+)', r'section \1', text, flags=re.IGNORECASE)
129
+ text = re.sub(r'(\d+)\s*[Jj]anuary', r'\1 January', text) # Clean date formatting
130
+ text = re.sub(r'(\d+)\s*[Jj]uly', r'\1 July', text) # Clean date formatting
131
+ text = re.sub(r'(\d+)\s*[Aa]pril', r'\1 April', text) # Clean date formatting
132
+ text = re.sub(r'(\d+)\s*[Ff]ebruary', r'\1 February', text) # Clean date formatting
133
+ text = re.sub(r'(\d+)\s*[Dd]ecember', r'\1 December', text) # Clean date formatting
134
+ text = re.sub(r'(\d+)\s*[Aa]ugust', r'\1 August', text) # Clean date formatting
135
+ text = re.sub(r'(\d+)\s*[Mm]arch', r'\1 March', text) # Clean date formatting
136
+ text = re.sub(r'(\d+)\s*[Mm]ay', r'\1 May', text) # Clean date formatting
137
+ text = re.sub(r'(\d+)\s*[Jj]une', r'\1 June', text) # Clean date formatting
138
+ text = re.sub(r'(\d+)\s*[Ss]eptember', r'\1 September', text) # Clean date formatting
139
+ text = re.sub(r'(\d+)\s*[Oo]ctober', r'\1 October', text) # Clean date formatting
140
+ text = re.sub(r'(\d+)\s*[Nn]ovember', r'\1 November', text) # Clean date formatting
141
+
142
+ # Clean up punctuation spacing in legal text
143
+ text = re.sub(r'\s+([\.!\?\,\;\:])', r'\1', text) # Remove space before punctuation
144
+ text = re.sub(r'([\.!\?\,\;\:])\s*', r'\1 ', text) # Ensure space after punctuation
145
+
146
+ # Handle legal citations and references (generic patterns)
147
+ # Normalize spacing in act citations - generic pattern for "<Name> Act <year>"
148
+ text = re.sub(r'(\b\w+(?:\s+\w+)*)\s+Act\s+(\d{4})', r'\1 Act \2', text) # Normalize act citations, keeping the year
149
+
150
+ # Clean up amendment references (generic patterns)
151
+ text = re.sub(r'[Aa]mendment\(s\)\s+incorporated\s+in\s+the\s+[Aa]ct\(s\)', 'Amendments incorporated', text)
152
+ text = re.sub(r'section\s+\d+\(\d+\)\([a-zA-Z]\)', lambda m: m.group(0).lower(), text) # Normalize section references
153
+
154
+ # Generic pattern for legal document sections
155
+ text = re.sub(r'(\b(?:section|part|chapter|article|clause|subsection|paragraph))\s+(\d+[a-zA-Z]*)',
156
+ lambda m: f"{m.group(1)} {m.group(2)}", text, flags=re.IGNORECASE)
157
+
158
+ # NZ-specific legal enhancements
159
+ # Handle New Zealand specific terms and references
160
+ text = re.sub(r'\b[Nn]ew\s+[Zz]ealand\b', 'New Zealand', text) # Normalize "New Zealand"
161
+ text = re.sub(r'\b[Pp]arliament\b', 'Parliament', text) # Normalize "Parliament"
162
+ text = re.sub(r'\b[Cc]rown\b', 'Crown', text) # Normalize "Crown"
163
+ text = re.sub(r'\b[Gg]overnment\b', 'Government', text) # Normalize "Government"
164
+
165
+ # Handle NZ-specific legal citations (e.g., "NZB" references, Treaty of Waitangi)
166
+ text = re.sub(r'\b[Nn][Zz][Bb]\s+(\d+)', r'NZB \1', text) # Normalize NZB references
167
+ text = re.sub(r'[Tt]reaty\s+[Oo]f\s+[Ww]aitangi', 'Treaty of Waitangi', text, flags=re.IGNORECASE)
168
+
169
+ # Handle Maori-specific characters (basic support): the macron vowels ā, Δ“, Δ«, ō, Ε«
170
+ # are Unicode word characters, so the \w in allowed_chars above already preserves
171
+ # them (and "wh" is plain ASCII), meaning no extra filtering pass is needed here.
174
+
175
+ # Remove empty lines and trim while preserving legal structure
176
+ lines = []
177
+ for line in text.split('\n'):
178
+ stripped = line.strip()
179
+ if stripped: # Keep non-empty lines
180
+ # Section headers (e.g. "12:") and ordinary lines are both kept as-is
181
+ lines.append(stripped)
185
+
186
+ text = '\n'.join(lines)
187
+
188
+ # Final cleanup
189
+ text = text.strip()
190
+
191
+ return text
192
+
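+ # Rough before/after sketch of clean_text() on a hypothetical fragment:
+ #   clean_text("5:   The  Minister may ,by notice ,exempt any person .")
+ #   -> "5: The Minister may, by notice, exempt any person."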
193
+ # Constants for prompt formatting
194
+ REASONING_START = "<start_working_out>"
195
+ REASONING_END = "<end_working_out>"
196
+ SOLUTION_START = "<SOLUTION>"
197
+ SOLUTION_END = "</SOLUTION>"
198
+
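+ # These markers frame the two parts the model is asked to produce, e.g. (illustrative):
+ #   <start_working_out> ...structured analysis of the chunk... <end_working_out>
+ #   <SOLUTION> ...overall conclusion... </SOLUTION>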
199
+ def create_system_prompt(text: str) -> str:
200
+ """Create a system prompt for analyzing legislative text for loopholes and ambiguities"""
201
+ return f"""You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.
202
+
203
+ LEGISLATION TEXT:
204
+ {text}
205
+
206
+ TASK: Analyze this legislative text and identify potential loopholes, ambiguities, or unintended consequences.
207
+
208
+ REASONING: Provide a structured analysis in the following format:
209
+
210
+ 1. **Text Meaning**: Explain what the text means and its intended purpose
211
+ 2. **Key Assumptions**: Identify any assumptions the text makes that could be exploited
212
+ 3. **Exploitable Interpretations**: Discuss how the text could be interpreted or applied in ways that circumvent its intended purpose
213
+ 4. **Critical Loopholes**: Identify specific loopholes, ambiguities, or unintended consequences that could be used to bypass the legislation
214
+ 5. **Circumvention Strategies**: Suggest practical methods or scenarios for exploiting these loopholes to achieve objectives contrary to the legislation's intent
215
+
216
+ Write your complete analysis between {REASONING_START} and {REASONING_END}.
217
+
218
+ Then provide your overall conclusion between {SOLUTION_START} and {SOLUTION_END}.
219
+ """
220
+
221
+ def generate_chat_template(system_prompt: str) -> str:
222
+ """
223
+ Generate a chat template using the GGUF model's native chat format.
224
+ This builds the ChatML-style structure (<|im_start|> / <|im_end|> markers) used by Qwen GGUF chat models, for better model compatibility.
225
+ """
226
+ # Build the chat using the GGUF template structure
227
+ chat_messages = []
228
+
229
+ # System message
230
+ if system_prompt:
231
+ chat_messages.append("<|im_start|>system")
232
+ chat_messages.append(system_prompt)
233
+ chat_messages.append("<|im_end|>")
234
+
235
+ # User message with the analysis request
236
+ chat_messages.append("<|im_start|>user")
237
+ chat_messages.append("Analyze the given legislative text for loopholes, ambiguities, and unintended consequences. Provide a structured legal analysis following the specified format.")
238
+ chat_messages.append("<|im_end|>")
239
+
240
+ # Assistant message with generation prompt
241
+ chat_messages.append("<|im_start|>assistant")
242
+ chat_messages.append("") # Empty for generation
243
+
244
+ return "\n".join(chat_messages)
245
+
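+ # For reference, the string returned above has this ChatML-style shape
+ # (system prompt elided for brevity):
+ #   <|im_start|>system
+ #   ...analysis instructions and the legislation chunk...
+ #   <|im_end|>
+ #   <|im_start|>user
+ #   Analyze the given legislative text for loopholes, ambiguities, ...
+ #   <|im_end|>
+ #   <|im_start|>assistant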
246
+ def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
247
+ """Split text into overlapping chunks for processing"""
248
+ if len(text) <= chunk_size:
249
+ return [text]
250
+
251
+ chunks = []
252
+ start = 0
253
+ while start < len(text):
254
+ end = start + chunk_size
255
+ chunk = text[start:end]
256
+
257
+ # Try to end chunk at a sentence boundary if possible
258
+ if end < len(text):
259
+ # Look for sentence endings in the last 100 characters
260
+ sentence_end = max(
261
+ chunk.rfind('. ', max(0, len(chunk) - 100)),
262
+ chunk.rfind('! ', max(0, len(chunk) - 100)),
263
+ chunk.rfind('? ', max(0, len(chunk) - 100))
264
+ )
265
+ if sentence_end != -1:
266
+ chunk = chunk[:sentence_end + 2] # Include the sentence ending
267
+
268
+ chunks.append(chunk)
269
+ start = end - overlap if end < len(text) else len(text)
270
+
271
+ return chunks
272
+
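+ # Rough sketch of the chunking arithmetic (hypothetical 10,000-character section with
+ # the default CHUNK_SIZE=4096 and CHUNK_OVERLAP=256): chunks start near offsets 0,
+ # 3840 and 7680, so neighbouring chunks share about 256 characters of context,
+ # slightly less whenever a chunk is trimmed back to a sentence boundary.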
273
+ def generate_response(model, prompt: str, max_tokens: int = MAX_TOKENS) -> str:
274
+ """
275
+ Generate a response from the model for a given prompt with optimized parameters for legal analysis.
276
+
277
+ Parameter Explanations:
278
+ - temperature=0.3: Balanced creativity for legal analysis (not too random, not too deterministic)
279
+ - top_p=0.85: Nucleus sampling - considers top 85% probability mass for coherent legal text
280
+ - top_k=50: Top-k sampling - considers top 50 tokens for better legal terminology selection
281
+ - min_p=0.05: Minimum probability threshold to avoid low-quality tokens
282
+
283
+ Anti-Repetition Parameters:
284
+ - repeat_penalty=1.15: Penalizes repetition of phrases (15% penalty)
285
+ - presence_penalty=0.1: Encourages topic diversity across the response
286
+ - frequency_penalty=0.1: Reduces overuse of frequent tokens
287
+
288
+ Advanced Sampling:
289
+ - typical_p=0.95: Focuses on typical token probabilities for legal text patterns
290
+ - tfs_z=0.95: Tail-free sampling for more natural legal reasoning
291
+ - mirostat_mode=2: Mirostat v2 for perplexity-controlled generation
292
+ - mirostat_tau=4.0: Target entropy level for legal analysis
293
+ - mirostat_eta=0.15: Learning rate for perplexity adaptation
294
+ """
295
+ try:
296
+ response = model(
297
+ prompt,
298
+ max_tokens=max_tokens,
299
+ # Core generation parameters
300
+ temperature=0.3, # Balanced temperature for legal analysis
301
+ top_p=0.85, # Nucleus sampling for coherent legal text
302
+ top_k=50, # Top-k sampling for better token selection
303
+ min_p=0.05, # Minimum probability threshold to avoid low-quality tokens
304
+
305
+ # Anti-repetition parameters
306
+ repeat_penalty=1.15, # Reduce repetition of phrases
307
+ presence_penalty=0.1, # Encourage topic diversity
308
+ frequency_penalty=0.1, # Reduce frequent token usage
309
+
310
+ # Advanced sampling parameters
311
+ typical_p=0.95, # Typical token probability for legal text patterns
312
+ tfs_z=0.95, # Tail-free sampling for better reasoning
313
+ mirostat_mode=2, # Mirostat v2 for perplexity control
314
+ mirostat_tau=4.0, # Mirostat target entropy
315
+ mirostat_eta=0.15, # Mirostat learning rate
316
+
317
+ # Stopping conditions
318
+ stop=[SOLUTION_END] # Stop once the conclusion is closed; stopping at the reasoning end marker would cut off the SOLUTION section
319
+ )
320
+ return response['choices'][0]['text'].strip()
321
+ except Exception as e:
322
+ print(f"Error generating response: {e}")
323
+ # Try with fallback parameters if advanced ones fail
324
+ try:
325
+ response = model(
326
+ prompt,
327
+ max_tokens=max_tokens,
328
+ temperature=0.3,
329
+ top_p=0.85,
330
+ top_k=50,
331
+ repeat_penalty=1.15,
332
+ stop=[SOLUTION_END, "</SOLUTION>"]
333
+ )
334
+ return response['choices'][0]['text'].strip()
335
+ except Exception as e2:
336
+ print(f"Fallback also failed: {e2}")
337
+ return ""
338
+
339
+ def parse_legislation_json(file_path: str) -> List[Dict[str, Any]]:
340
+ """Parse the JSON lines format of NZ legislation dataset"""
341
+ legislation_entries = []
342
+
343
+ try:
344
+ with open(file_path, 'r', encoding='utf-8') as f:
345
+ for line_num, line in enumerate(f, 1):
346
+ line = line.strip()
347
+ if line:
348
+ try:
349
+ entry = json.loads(line)
350
+ if 'id' in entry and 'text' in entry:
351
+ legislation_entries.append(entry)
352
+ else:
353
+ print(f"Warning: Line {line_num} missing required fields, skipping")
354
+ except json.JSONDecodeError as e:
355
+ print(f"Warning: Could not parse line {line_num}: {e}")
356
+ continue
357
+ except Exception as e:
358
+ print(f"Error reading legislation file: {e}")
359
+ return []
360
+
361
+ print(f"Successfully parsed {len(legislation_entries)} legislation entries")
362
+ return legislation_entries
363
+
364
+ def create_finetuning_dataset(input_file: str, model, output_file: str = None) -> List[Dict[str, Any]]:
365
+ """Create a finetuning dataset by processing NZ legislation JSON dataset with incremental saving"""
366
+ if output_file is None:
367
+ output_file = os.path.join(OUTPUT_DIR, "nz_legislation_loophole_dataset.json")
368
+
369
+ # Create temporary file paths
370
+ temp_file = output_file.replace('.json', '_temp.jsonl')
371
+ backup_file = output_file.replace('.json', '_backup.json')
372
+
373
+ print(f"Parsing legislation dataset from {input_file}")
374
+ legislation_entries = parse_legislation_json(input_file)
375
+
376
+ if not legislation_entries:
377
+ print("No legislation entries found to process")
378
+ return []
379
+
380
+ dataset = []
381
+ total_entries = len(legislation_entries)
382
+ saved_count = 0
383
+
384
+ print(f"Processing {total_entries} legislation entries...")
385
+ print(f"Dataset will be saved incrementally to: {temp_file}")
386
+
387
+ try:
388
+ # Open temporary file for incremental saving
389
+ with open(temp_file, 'w', encoding='utf-8') as temp_f:
390
+ for entry_num, entry in enumerate(legislation_entries, 1):
391
+ legislation_id = entry.get('id', f'entry_{entry_num}')
392
+ title = entry.get('title', 'Unknown Title')
393
+ year = entry.get('year', 'Unknown Year')
394
+ raw_text = entry.get('text', '')
395
+
396
+ print(f"\nProcessing entry {entry_num}/{total_entries}: {title} ({year}) - ID: {legislation_id}")
397
+
398
+ # Clean the legislation text
399
+ cleaned_text = clean_text(raw_text)
400
+
401
+ # Chunk the text if it's too long
402
+ chunks = chunk_text(cleaned_text)
403
+
404
+ print(f" - Text length: {len(raw_text)} characters")
405
+ print(f" - Number of chunks: {len(chunks)}")
406
+
407
+ # Process each chunk
408
+ for chunk_id, chunk in enumerate(chunks):
409
+ # Create prompt for this chunk
410
+ system_prompt = create_system_prompt(chunk)
411
+ full_prompt = generate_chat_template(system_prompt)
412
+
413
+ # Generate response
414
+ response = generate_response(model, full_prompt)
415
+
416
+ # Print response for monitoring
417
+ print(f"\nπŸ“ **Generated Analysis for {title} (Chunk {chunk_id + 1}/{len(chunks)})**:")
418
+ print(f" Response length: {len(response)} characters")
419
+
420
+ # Show preview of the analysis
421
+ preview = response.replace('\n', ' ').strip()[:300] # Truncate the preview so the console output stays readable
422
+ print(f" Preview: {preview}")
423
+
424
+ # Check for key analysis elements
425
+ has_reasoning = '<start_working_out>' in response or 'reasoning' in response.lower()
426
+ has_loopholes = 'loophole' in response.lower() or 'ambiguity' in response.lower() or 'issue' in response.lower()
427
+ has_recommendations = 'recommend' in response.lower() or 'suggest' in response.lower()
428
+
429
+ print(f" Analysis quality: {'βœ…' if has_reasoning else '❌'} Reasoning | {'βœ…' if has_loopholes else '❌'} Loopholes | {'βœ…' if has_recommendations else '❌'} Recommendations")
430
+
431
+ # Add to dataset with metadata
432
+ dataset_entry = {
433
+ "prompt": full_prompt,
434
+ "response": response,
435
+ "legislation_id": legislation_id,
436
+ "title": title,
437
+ "year": year,
438
+ "chunk_id": chunk_id,
439
+ "total_chunks": len(chunks),
440
+ "text_length": len(chunk),
441
+ "original_text_length": len(raw_text)
442
+ }
443
+
444
+ # Save entry immediately to temporary file (JSON Lines format)
445
+ json.dump(dataset_entry, temp_f, ensure_ascii=False)
446
+ temp_f.write('\n')
447
+ temp_f.flush() # Force write to disk
448
+
449
+ dataset.append(dataset_entry)
450
+ saved_count += 1
451
+
452
+ # Progress update every 10 entries
453
+ if saved_count % 10 == 0:
454
+ print(f" βœ“ Saved {saved_count} entries so far...")
455
+
456
+ print(f"\nβœ“ All entries processed and saved to temporary file")
457
+ print(f"βœ“ Total entries saved: {saved_count}")
458
+
459
+ # Create backup of existing file if it exists
460
+ if os.path.exists(output_file):
461
+ print(f"Creating backup of existing dataset...")
462
+ os.rename(output_file, backup_file)
463
+
464
+ # Convert JSON Lines to final JSON format
465
+ print(f"Converting to final JSON format...")
466
+ with open(temp_file, 'r', encoding='utf-8') as temp_f:
467
+ lines = temp_f.readlines()
468
+
469
+ final_dataset = []
470
+ for line in lines:
471
+ if line.strip():
472
+ final_dataset.append(json.loads(line))
473
+
474
+ # Save final consolidated JSON file
475
+ with open(output_file, 'w', encoding='utf-8') as f:
476
+ json.dump(final_dataset, f, indent=2, ensure_ascii=False)
477
+
478
+ print(f"βœ“ Final dataset saved to: {output_file}")
479
+
480
+ # Clean up temporary file
481
+ if os.path.exists(temp_file):
482
+ os.remove(temp_file)
483
+ print(f"βœ“ Temporary file cleaned up")
484
+
485
+ # Clean up backup file if everything succeeded
486
+ if os.path.exists(backup_file):
487
+ os.remove(backup_file)
488
+ print(f"βœ“ Backup file cleaned up")
489
+
490
+ print(f"\nπŸŽ‰ Dataset creation complete!")
491
+ print(f" β€’ Processed {total_entries} legislation documents")
492
+ print(f" β€’ Generated {len(final_dataset)} analysis entries")
493
+ print(f" β€’ Total chunks processed: {sum(entry.get('total_chunks', 1) for entry in final_dataset[:total_entries])}")
494
+
495
+ return final_dataset
496
+
497
+ except KeyboardInterrupt:
498
+ print(f"\n⚠️ Process interrupted by user")
499
+ print(f" β€’ Partial dataset saved to: {temp_file}")
500
+ print(f" β€’ {saved_count} entries saved so far")
501
+ print(f" β€’ You can resume processing or use the temporary file")
502
+ raise
503
+
504
+ except Exception as e:
505
+ print(f"\n❌ Error during processing: {e}")
506
+ print(f" β€’ Partial dataset saved to: {temp_file}")
507
+ print(f" β€’ {saved_count} entries saved so far")
508
+ if os.path.exists(backup_file):
509
+ print(f" β€’ Original dataset restored from backup")
510
+ os.rename(backup_file, output_file)
511
+ raise
512
+
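+ # Shape of one saved dataset entry (values below are illustrative placeholders):
+ # {
+ #   "prompt": "<|im_start|>system ...",      # full chat-formatted prompt for the chunk
+ #   "response": "<start_working_out> ...",   # model-generated analysis
+ #   "legislation_id": "act-0000-001",
+ #   "title": "Example Act",
+ #   "year": "1999",
+ #   "chunk_id": 0,
+ #   "total_chunks": 3,
+ #   "text_length": 4096,
+ #   "original_text_length": 11500
+ # }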
513
+ def main():
514
+ """Main execution function"""
515
+ print("Starting NZ Legislation Loophole Analysis Dataset Creation")
516
+ print("=" * 60)
517
+
518
+ # Load the model
519
+ model = load_model()
520
+
521
+ # Create the dataset
522
+ dataset = create_finetuning_dataset(INPUT_FILE, model)
523
+
524
+ # Cleanup
525
+ if hasattr(model, 'close'):
526
+ model.close()
527
+
528
+ print("\nDataset creation completed successfully!")
529
+ print(f"Output saved to: {os.path.join(OUTPUT_DIR, 'nz_legislation_loophole_dataset.json')}")
530
+
531
+ if __name__ == "__main__":
532
+ main()