#!/usr/bin/env python3
"""
NZ Legislation Loophole Analysis Dataset Creation Tool
This script processes New Zealand legislation text to create a finetuning dataset for AI models
that can identify potential loopholes, ambiguities, and unintended consequences in legal text.
The script:
1. Loads and cleans NZ legislation text, preserving legal structure and terminology
2. Chunks the text into manageable sections with overlap for context
3. Uses an LLM to analyze each chunk for legal issues
4. Generates a structured dataset for training AI models on legal loophole detection
Usage:
python trl.py
Requirements:
- llama-cpp-python with GGUF model support
- psutil for memory monitoring
- Input file: nz-legislation.txt containing NZ legislation in JSON lines format
Output:
- JSON dataset saved to nz_legislation_dataset/nz_legislation_loophole_dataset.json
"""
import os
import json
import re
import time
import psutil
from typing import List, Dict, Any
from llama_cpp import Llama
# Placeholder classes and functions for missing dependencies
class ProgressManager:
    """Simple placeholder for progress tracking"""
    def __init__(self):
        pass

def show_memory_usage(label: str):
    """Simple memory usage display"""
    process = psutil.Process(os.getpid())
    memory_mb = process.memory_info().rss / 1024 / 1024
    print(f"{label}: {memory_mb:.2f} MB")
# Configuration for NZ Legislation Loophole Analysis Dataset Creation
INPUT_FILE = "nz-legislation.txt" # Path to New Zealand legislation JSON dataset
OUTPUT_DIR = "nz_legislation_dataset" # Directory to save the dataset
CHUNK_SIZE = 4096 # Size of text chunks for processing legislation sections
CHUNK_OVERLAP = 256 # Overlap between chunks to maintain context
BATCH_SIZE = 16 # Number of chunks to process at once
MODEL_PATH = "qwen3.gguf" # Path to your Qwen3 GGUF model
MAX_TOKENS = 4096 # Maximum tokens for model response
# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
def load_model(progress_manager: ProgressManager = None):
    """Load the LLM model for text generation with progress tracking"""
    if progress_manager is None:
        progress_manager = ProgressManager()
    print("Loading LLM model...")
    show_memory_usage("Initial memory usage")
    start_time = time.time()
    try:
        # Assign to `model` so the return below works on the success path as well
        model = Llama.from_pretrained(
            repo_id="DavidAU/Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-GGUF",
            filename="Qwen3-Zero-Coder-Reasoning-0.8B-NEO-EX-D_AU-IQ4_XS-imat.gguf",
            n_ctx=40960,       # Context length
            n_threads=8,       # Adjust based on your CPU
            verbose=False,
            n_gpu_layers=-1,   # Use all available GPU layers
            n_batch=4096,      # Batch size for processing
            logits_all=False,  # Optimize for text generation
            use_mlock=True,    # Lock model in memory if possible
            use_mmap=True,     # Use memory mapping for better performance
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Trying with basic configuration...")
        # Fallback to basic configuration using the local GGUF file
        model = Llama(
            model_path=MODEL_PATH,
            n_ctx=40960,
            n_threads=8,
            verbose=False,
            n_gpu_layers=-1,
            n_batch=4096
        )
    load_time = time.time() - start_time
    print(f"LLM model loaded in {load_time:.2f}s")
    show_memory_usage("Memory after model load")
    return model
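# Note: Llama.from_pretrained fetches and caches the GGUF from the Hugging Face
# Hub (it relies on the huggingface_hub package being installed); the fallback
# path instead loads the local file named by MODEL_PATH, assumed here to be the
# same model saved locally as "qwen3.gguf".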
def clean_text(text: str) -> str:
    """Clean and normalize text for better embedding quality, optimized for legal/legislative content"""
    # Remove excessive whitespace but preserve paragraph structure
    text = re.sub(r'[ \t]+', ' ', text)      # Replace multiple spaces/tabs with a single space
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Preserve paragraph breaks but clean up
    text = re.sub(r'\n{3,}', '\n\n', text)   # Reduce excessive newlines to double
    # Remove control characters (tab and newline survive) but preserve legal formatting
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)
    # Normalize quotes and apostrophes for legal text. This must happen before the
    # character filter below, which would otherwise strip the curly variants outright.
    text = re.sub(r'[\u201c\u201d]', '"', text)  # Normalize double quotes
    text = re.sub(r'[\u2018\u2019]', "'", text)  # Normalize single quotes
    text = re.sub(r'`', "'", text)               # Replace backticks with apostrophes
    # Keep legal punctuation and symbols, plus the common Maori vowels (ā, ē, ī, ō, ū).
    # The Maori characters must be in the allowed set before filtering, or they would
    # be stripped here with no way to restore them later ("wh" is plain ASCII and is
    # already covered by \w).
    allowed_chars = r'\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/\@\#\$\%\^\&\*\+\=\~\`°§āēīōūĀĒĪŌŪ'
    text = re.sub(r'[^' + allowed_chars + ']', '', text)
    # Clean up legal numbering and references
    text = re.sub(r'section\s+(\d+)', r'section \1', text, flags=re.IGNORECASE)  # Normalize section references
    # Normalize "<day> <Month>" date formatting in a single pass over all twelve months
    months = ('January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December')
    text = re.sub(r'(\d+)\s*(' + '|'.join(months) + r')',
                  lambda m: f"{m.group(1)} {m.group(2).capitalize()}",
                  text, flags=re.IGNORECASE)
    # Clean up punctuation spacing in legal text
    text = re.sub(r'\s+([\.!\?\,\;\:])', r'\1', text)   # Remove space before punctuation
    text = re.sub(r'([\.!\?\,\;\:])\s*', r'\1 ', text)  # Ensure space after punctuation
    # Handle legal citations and references (generic patterns)
    # Normalize whitespace in "<Name> Act <year>" citations, keeping the year
    text = re.sub(r'(\b\w+(?:\s+\w+)*)\s+Act\s+(\d{4})', r'\1 Act \2', text)
    # Clean up amendment references (generic patterns)
    text = re.sub(r'[Aa]mendment\(s\)\s+incorporated\s+in\s+the\s+[Aa]ct\(s\)', 'Amendments incorporated', text)
    text = re.sub(r'section\s+\d+\(\d+\)\([a-zA-Z]\)', lambda m: m.group(0).lower(), text)  # Normalize subsection references
    # Generic pattern for legal document sections: normalize internal whitespace
    text = re.sub(r'(\b(?:section|part|chapter|article|clause|subsection|paragraph))\s+(\d+[a-zA-Z]*)',
                  lambda m: f"{m.group(1)} {m.group(2)}", text, flags=re.IGNORECASE)
    # NZ-specific legal enhancements: normalize casing of key terms and references
    text = re.sub(r'\b[Nn]ew\s+[Zz]ealand\b', 'New Zealand', text)
    text = re.sub(r'\b[Pp]arliament\b', 'Parliament', text)
    text = re.sub(r'\b[Cc]rown\b', 'Crown', text)
    text = re.sub(r'\b[Gg]overnment\b', 'Government', text)
    # Handle NZ-specific legal citations (e.g., "NZB" references, Treaty of Waitangi)
    text = re.sub(r'\b[Nn][Zz][Bb]\s+(\d+)', r'NZB \1', text)
    text = re.sub(r'[Tt]reaty\s+[Oo]f\s+[Ww]aitangi', 'Treaty of Waitangi', text, flags=re.IGNORECASE)
    # Remove empty lines and trim while preserving legal structure; section headers
    # such as "1:" keep their prefix because stripping only touches edge whitespace
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return '\n'.join(lines).strip()
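# Rough worked example (hypothetical input; output follows the rules above):
#   clean_text('1:   The  Crown\n\n\n\nmay act under the Crimes Act 1961 .')
#   -> '1: The Crown\nmay act under the Crimes Act 1961.'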
# Constants for prompt formatting
REASONING_START = "<start_working_out>"
REASONING_END = "<end_working_out>"
SOLUTION_START = "<SOLUTION>"
SOLUTION_END = "</SOLUTION>"
def create_system_prompt(text: str) -> str:
    """Create a system prompt for analyzing legislative text for loopholes and ambiguities"""
    return f"""You are a legal expert analyzing New Zealand legislation for loopholes and ambiguities.
LEGISLATION TEXT:
{text}
TASK: Analyze this legislative text and identify potential loopholes, ambiguities, or unintended consequences.
REASONING: Provide a structured analysis in the following format:
1. **Text Meaning**: Explain what the text means and its intended purpose
2. **Key Assumptions**: Identify any assumptions the text makes that could be exploited
3. **Exploitable Interpretations**: Discuss how the text could be interpreted or applied in ways that circumvent its intended purpose
4. **Critical Loopholes**: Identify specific loopholes, ambiguities, or unintended consequences that could be used to bypass the legislation
5. **Circumvention Strategies**: Suggest practical methods or scenarios for exploiting these loopholes to achieve objectives contrary to the legislation's intent
Write your complete analysis between {REASONING_START} and {REASONING_END}.
Then provide your overall conclusion between {SOLUTION_START} and {SOLUTION_END}.
"""
def generate_chat_template(system_prompt: str) -> str:
    """
    Generate a chat template using the GGUF model's native chat format.
    This uses the proper message structure with BOS/EOS tokens for better model compatibility.
    """
    # Build the chat using the ChatML structure the GGUF template expects
    chat_messages = []
    # System message
    if system_prompt:
        chat_messages.append("<|im_start|>system")
        chat_messages.append(system_prompt)
        chat_messages.append("<|im_end|>")
    # User message with the analysis request
    chat_messages.append("<|im_start|>user")
    chat_messages.append("Analyze the given legislative text for loopholes, ambiguities, and unintended consequences. Provide a structured legal analysis following the specified format.")
    chat_messages.append("<|im_end|>")
    # Assistant message with generation prompt
    chat_messages.append("<|im_start|>assistant")
    chat_messages.append("")  # Empty for generation
    return "\n".join(chat_messages)
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks for processing"""
    if len(text) <= chunk_size:
        return [text]
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        # Try to end the chunk at a sentence boundary if possible
        if end < len(text):
            # Look for sentence endings in the last 100 characters
            sentence_end = max(
                chunk.rfind('. ', max(0, len(chunk) - 100)),
                chunk.rfind('! ', max(0, len(chunk) - 100)),
                chunk.rfind('? ', max(0, len(chunk) - 100))
            )
            if sentence_end != -1:
                chunk = chunk[:sentence_end + 2]  # Include the sentence ending
        chunks.append(chunk)
        start = end - overlap if end < len(text) else len(text)
    return chunks
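# Illustration of the stride: with CHUNK_SIZE=4096 and CHUNK_OVERLAP=256, a
# 10,000-character document yields chunks starting at offsets 0, 3840, and 7680.
# Chunk ends may shift earlier when a sentence ending is found in the last 100
# characters, but the next start is always computed from the fixed end, so the
# overlap still covers any truncated tail.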
def generate_response(model, prompt: str, max_tokens: int = MAX_TOKENS) -> str:
    """
    Generate a response from the model for a given prompt, with parameters tuned for legal analysis.

    Parameter explanations:
    - temperature=0.3: balanced creativity for legal analysis (not too random, not too deterministic)
    - top_p=0.85: nucleus sampling; considers the top 85% probability mass for coherent legal text
    - top_k=50: top-k sampling; considers the top 50 tokens for better legal terminology selection
    - min_p=0.05: minimum probability threshold to avoid low-quality tokens

    Anti-repetition parameters:
    - repeat_penalty=1.15: penalizes repeated phrases (15% penalty)
    - presence_penalty=0.1: encourages topic diversity across the response
    - frequency_penalty=0.1: reduces overuse of frequent tokens

    Advanced sampling:
    - typical_p=0.95: focuses on typical token probabilities for legal text patterns
    - tfs_z=0.95: tail-free sampling for more natural legal reasoning
    - mirostat_mode=2: Mirostat v2 for perplexity-controlled generation (note that when
      Mirostat is active, llama.cpp generally bypasses the top-k/top-p/tfs/typical samplers)
    - mirostat_tau=4.0: target entropy level for legal analysis
    - mirostat_eta=0.15: learning rate for perplexity adaptation
    """
    try:
        response = model(
            prompt,
            max_tokens=max_tokens,
            # Core generation parameters
            temperature=0.3,
            top_p=0.85,
            top_k=50,
            min_p=0.05,
            # Anti-repetition parameters
            repeat_penalty=1.15,
            presence_penalty=0.1,
            frequency_penalty=0.1,
            # Advanced sampling parameters
            typical_p=0.95,
            tfs_z=0.95,
            mirostat_mode=2,
            mirostat_tau=4.0,
            mirostat_eta=0.15,
            # Stop at the closing solution tag. (Stopping at REASONING_END would cut
            # generation off before the conclusion the prompt asks for.)
            stop=[SOLUTION_END]
        )
        return response['choices'][0]['text'].strip()
    except Exception as e:
        print(f"Error generating response: {e}")
        # Retry with basic parameters if the advanced ones are unsupported
        try:
            response = model(
                prompt,
                max_tokens=max_tokens,
                temperature=0.3,
                top_p=0.85,
                top_k=50,
                repeat_penalty=1.15,
                stop=[SOLUTION_END]
            )
            return response['choices'][0]['text'].strip()
        except Exception as e2:
            print(f"Fallback also failed: {e2}")
            return ""
def parse_legislation_json(file_path: str) -> List[Dict[str, Any]]:
    """Parse the JSON lines format of NZ legislation dataset"""
    legislation_entries = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                    if 'id' in entry and 'text' in entry:
                        legislation_entries.append(entry)
                    else:
                        print(f"Warning: Line {line_num} missing required fields, skipping")
                except json.JSONDecodeError as e:
                    print(f"Warning: Could not parse line {line_num}: {e}")
                    continue
    except Exception as e:
        print(f"Error reading legislation file: {e}")
        return []
    print(f"Successfully parsed {len(legislation_entries)} legislation entries")
    return legislation_entries
def create_finetuning_dataset(input_file: str, model, output_file: str = None) -> List[Dict[str, Any]]:
    """Create a finetuning dataset by processing NZ legislation JSON dataset with incremental saving"""
    if output_file is None:
        output_file = os.path.join(OUTPUT_DIR, "nz_legislation_loophole_dataset.json")
    # Create temporary file paths
    temp_file = output_file.replace('.json', '_temp.jsonl')
    backup_file = output_file.replace('.json', '_backup.json')
    print(f"Parsing legislation dataset from {input_file}")
    legislation_entries = parse_legislation_json(input_file)
    if not legislation_entries:
        print("No legislation entries found to process")
        return []
    dataset = []
    total_entries = len(legislation_entries)
    saved_count = 0
    print(f"Processing {total_entries} legislation entries...")
    print(f"Dataset will be saved incrementally to: {temp_file}")
    try:
        # Open temporary file for incremental saving
        with open(temp_file, 'w', encoding='utf-8') as temp_f:
            for entry_num, entry in enumerate(legislation_entries, 1):
                legislation_id = entry.get('id', f'entry_{entry_num}')
                title = entry.get('title', 'Unknown Title')
                year = entry.get('year', 'Unknown Year')
                raw_text = entry.get('text', '')
                print(f"\nProcessing entry {entry_num}/{total_entries}: {title} ({year}) - ID: {legislation_id}")
                # Clean the legislation text
                cleaned_text = clean_text(raw_text)
                # Chunk the text if it's too long
                chunks = chunk_text(cleaned_text)
                print(f"  - Text length: {len(raw_text)} characters")
                print(f"  - Number of chunks: {len(chunks)}")
                # Process each chunk
                for chunk_id, chunk in enumerate(chunks):
                    # Create prompt for this chunk
                    system_prompt = create_system_prompt(chunk)
                    full_prompt = generate_chat_template(system_prompt)
                    # Generate response
                    response = generate_response(model, full_prompt)
                    # Print response for monitoring
                    print(f"\n📝 **Generated Analysis for {title} (Chunk {chunk_id + 1}/{len(chunks)})**:")
                    print(f"   Response length: {len(response)} characters")
                    # Show a short preview of the analysis
                    preview = response.replace('\n', ' ').strip()[:200]
                    print(f"   Preview: {preview}")
                    # Check for key analysis elements
                    has_reasoning = REASONING_START in response or 'reasoning' in response.lower()
                    has_loopholes = 'loophole' in response.lower() or 'ambiguity' in response.lower() or 'issue' in response.lower()
                    has_recommendations = 'recommend' in response.lower() or 'suggest' in response.lower()
                    print(f"   Analysis quality: {'✅' if has_reasoning else '❌'} Reasoning | {'✅' if has_loopholes else '❌'} Loopholes | {'✅' if has_recommendations else '❌'} Recommendations")
                    # Add to dataset with metadata
                    dataset_entry = {
                        "prompt": full_prompt,
                        "response": response,
                        "legislation_id": legislation_id,
                        "title": title,
                        "year": year,
                        "chunk_id": chunk_id,
                        "total_chunks": len(chunks),
                        "text_length": len(chunk),
                        "original_text_length": len(raw_text)
                    }
                    # Save entry immediately to temporary file (JSON Lines format)
                    json.dump(dataset_entry, temp_f, ensure_ascii=False)
                    temp_f.write('\n')
                    temp_f.flush()  # Force write to disk
                    dataset.append(dataset_entry)
                    saved_count += 1
                    # Progress update every 10 entries
                    if saved_count % 10 == 0:
                        print(f"  ✓ Saved {saved_count} entries so far...")
        print("\n✓ All entries processed and saved to temporary file")
        print(f"✓ Total entries saved: {saved_count}")
        # Create backup of existing file if it exists
        if os.path.exists(output_file):
            print("Creating backup of existing dataset...")
            os.rename(output_file, backup_file)
        # Convert JSON Lines to final JSON format
        print("Converting to final JSON format...")
        with open(temp_file, 'r', encoding='utf-8') as temp_f:
            final_dataset = [json.loads(line) for line in temp_f if line.strip()]
        # Save final consolidated JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(final_dataset, f, indent=2, ensure_ascii=False)
        print(f"✓ Final dataset saved to: {output_file}")
        # Clean up temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)
            print("✓ Temporary file cleaned up")
        # Clean up backup file if everything succeeded
        if os.path.exists(backup_file):
            os.remove(backup_file)
            print("✓ Backup file cleaned up")
        print("\n🎉 Dataset creation complete!")
        print(f"   • Processed {total_entries} legislation documents")
        print(f"   • Generated {len(final_dataset)} analysis entries")
        # One dataset entry is written per chunk, so the entry count is the chunk count
        print(f"   • Total chunks processed: {len(final_dataset)}")
        return final_dataset
    except KeyboardInterrupt:
        print("\n⚠️ Process interrupted by user")
        print(f"   • Partial dataset saved to: {temp_file}")
        print(f"   • {saved_count} entries saved so far")
        print("   • You can resume processing or use the temporary file")
        raise
    except Exception as e:
        print(f"\n❌ Error during processing: {e}")
        print(f"   • Partial dataset saved to: {temp_file}")
        print(f"   • {saved_count} entries saved so far")
        if os.path.exists(backup_file):
            print("   • Original dataset restored from backup")
            os.rename(backup_file, output_file)
        raise
def main():
    """Main execution function"""
    print("Starting NZ Legislation Loophole Analysis Dataset Creation")
    print("=" * 60)
    # Load the model
    model = load_model()
    # Create the dataset
    dataset = create_finetuning_dataset(INPUT_FILE, model)
    # Cleanup
    if hasattr(model, 'close'):
        model.close()
    print("\nDataset creation completed successfully!")
    print(f"Output saved to: {os.path.join(OUTPUT_DIR, 'nz_legislation_loophole_dataset.json')}")

if __name__ == "__main__":
    main()