Spaces:
Sleeping
Sleeping
| """ | |
| ViettelPay Knowledge Base Management Script | |
| This script uses the new ContextualWordProcessor with: | |
| - Automated processing of Word documents (.doc/.docx) from a folder | |
| - Contextual enhancement using OpenAI API (optional) | |
| - LangChain EnsembleRetriever for hybrid search | |
| - ChromaDB for semantic search and BM25 for keyword search | |
| Usage: | |
| python build_database_script.py ingest --documents-folder ./viettelpay_docs | |
| python build_database_script.py test --query "lỗi 606" | |
| python build_database_script.py test --interactive | |
| """ | |
| import argparse | |
| import os | |
| import sys | |
| from pathlib import Path | |
| from typing import Optional | |
| # Add the project root to Python path so we can import from src | |
| project_root = Path(__file__).parent.parent.parent | |
| sys.path.insert(0, str(project_root)) | |
| from src.knowledge_base.viettel_knowledge_base import ViettelKnowledgeBase | |
| from dotenv import load_dotenv | |
| # Load environment variables from .env file | |
| load_dotenv() | |
| def validate_documents_folder(documents_folder: str) -> bool: | |
| """Validate that documents folder exists and contains Word documents""" | |
| if not os.path.exists(documents_folder): | |
| print(f"[ERROR] Documents folder not found: {documents_folder}") | |
| return False | |
| # Check for Word documents | |
| folder = Path(documents_folder) | |
| word_files = [] | |
| for pattern in ["*.doc", "*.docx"]: | |
| word_files.extend(folder.glob(pattern)) | |
| if not word_files: | |
| print(f"[ERROR] No Word documents (.doc/.docx) found in: {documents_folder}") | |
| return False | |
| print(f"[SUCCESS] Found {len(word_files)} Word documents in {documents_folder}") | |
| for word_file in word_files: | |
| print(f" - {word_file.name}") | |
| return True | |
| def ingest_documents(args): | |
| """Ingest documents and build knowledge base""" | |
| print("=" * 60) | |
| print("[INFO] INGESTING DOCUMENTS AND BUILDING KNOWLEDGE BASE") | |
| print("=" * 60) | |
| # Validate documents folder exists and contains Word documents | |
| if not validate_documents_folder(args.documents_folder): | |
| sys.exit(1) | |
| # Get OpenAI API key for contextual enhancement | |
| openai_api_key = os.getenv("OPENAI_API_KEY") | |
| if openai_api_key: | |
| print("[INFO] Using OpenAI API key for contextual enhancement") | |
| else: | |
| print("[WARNING] No OpenAI API key found. Contextual enhancement disabled.") | |
| # Initialize knowledge base (without OpenAI API key) | |
| kb = ViettelKnowledgeBase(embedding_model=args.embedding_model) | |
| try: | |
| # Create persist directory from chroma_dir | |
| persist_dir = os.path.dirname(args.chroma_dir) or "./knowledge_base" | |
| # Build knowledge base using the new API (pass OpenAI API key here) | |
| kb.build_knowledge_base( | |
| documents_folder=args.documents_folder, | |
| persist_dir=persist_dir, | |
| reset=args.reset, | |
| openai_api_key=openai_api_key, | |
| ) | |
| # Show final statistics | |
| print("\n[INFO] Knowledge Base Statistics:") | |
| stats = kb.get_stats() | |
| for key, value in stats.items(): | |
| print(f" {key}: {value}") | |
| print(f"\n[SUCCESS] Knowledge base saved successfully to {persist_dir}!") | |
| return True | |
| except Exception as e: | |
| print(f"[ERROR] Error during ingestion: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| return False | |
| def test_retrieval(args): | |
| """Test retrieval on existing knowledge base""" | |
| print("=" * 60) | |
| print("[INFO] TESTING KNOWLEDGE BASE RETRIEVAL") | |
| print("=" * 60) | |
| # Load knowledge base | |
| kb = ViettelKnowledgeBase(embedding_model=args.embedding_model) | |
| # Create persist directory from chroma_dir | |
| persist_dir = os.path.dirname(args.chroma_dir) or "./knowledge_base" | |
| # Load knowledge base | |
| success = kb.load_knowledge_base(persist_dir=persist_dir) | |
| if not success: | |
| print("[ERROR] Failed to load knowledge base. Run 'ingest' first.") | |
| sys.exit(1) | |
| # Show knowledge base stats | |
| print("\n[INFO] Knowledge Base Statistics:") | |
| stats = kb.get_stats() | |
| for key, value in stats.items(): | |
| print(f" {key}: {value}") | |
| if args.interactive: | |
| # Interactive testing mode | |
| run_interactive_tests(kb) | |
| elif args.query: | |
| # Single query testing | |
| test_single_query(kb, args.query) | |
| else: | |
| # Run default test suite | |
| run_test_suite(kb) | |
| def test_single_query(kb, query: str): | |
| """Test a single query with simple output""" | |
| print(f"\n[INFO] Testing Query: '{query}'") | |
| print("-" * 40) | |
| try: | |
| # Test main search | |
| print("\n[INFO] Search Results:") | |
| results = kb.search(query, top_k=10) | |
| display_simple_results(results) | |
| except Exception as e: | |
| print(f"[ERROR] Error during search: {e}") | |
| def display_simple_results(results): | |
| """Display search results in a simple, clean format""" | |
| if results: | |
| for i, doc in enumerate(results, 1): | |
| content_preview = doc.page_content[:1000].replace("\n", " ") | |
| doc_type = doc.metadata.get("doc_type", "unknown") | |
| source = doc.metadata.get("source_file", "unknown") | |
| relevance_score = doc.metadata.get("relevance_score", "N/A") | |
| print( | |
| f" {i}. [{doc_type}] Score: {relevance_score} - {content_preview}..." | |
| ) | |
| print(f" Source: {source}") | |
| else: | |
| print(" No results found") | |
| def run_interactive_tests(kb): | |
| """Run interactive testing session""" | |
| print("\n[INFO] Interactive Testing Mode") | |
| print("Available commands:") | |
| print(" - Enter a query to search") | |
| print(" - 'stats' to view knowledge base statistics") | |
| print(" - 'quit' to exit") | |
| print("-" * 50) | |
| while True: | |
| try: | |
| user_input = input("\n[INPUT] Enter command: ").strip() | |
| if user_input.lower() in ["quit", "exit", "q"]: | |
| break | |
| if not user_input: | |
| continue | |
| # Handle 'stats' command | |
| if user_input.lower() == "stats": | |
| stats = kb.get_stats() | |
| print("\n[INFO] Knowledge Base Statistics:") | |
| for key, value in stats.items(): | |
| print(f" {key}: {value}") | |
| continue | |
| # Regular query | |
| print(f"\n[INFO] Search: '{user_input}'") | |
| results = kb.search(user_input, top_k=10) | |
| display_simple_results(results) | |
| except KeyboardInterrupt: | |
| print("\n[INFO] Exiting interactive mode...") | |
| break | |
| except Exception as e: | |
| print(f"[ERROR] Error: {e}") | |
| def run_test_suite(kb): | |
| """Run comprehensive test suite""" | |
| test_cases = [ | |
| # Error code queries (BM25 strength) | |
| {"query": "lỗi 606", "description": "Error code (lowercase)"}, | |
| {"query": "LỖI 606", "description": "Error code (uppercase)"}, | |
| {"query": "mã lỗi W02", "description": "Alphanumeric error code"}, | |
| # Semantic queries (ChromaDB strength) | |
| {"query": "không nạp được tiền", "description": "Semantic: cannot topup"}, | |
| {"query": "giao dịch bị treo", "description": "Semantic: transaction stuck"}, | |
| # Procedure queries | |
| { | |
| "query": "hướng dẫn nạp cước trả trước", | |
| "description": "Procedure: prepaid topup", | |
| }, | |
| { | |
| "query": "cách kiểm tra phí chiết khấu", | |
| "description": "Procedure: check discount", | |
| }, | |
| # Reference queries | |
| { | |
| "query": "thẻ 30k có nhà mạng nào", | |
| "description": "Reference: denomination availability", | |
| }, | |
| # Policy queries | |
| { | |
| "query": "quy định hủy giao dịch", | |
| "description": "Policy: cancellation rules", | |
| }, | |
| ] | |
| print("\n[INFO] Running Test Suite:") | |
| print("=" * 50) | |
| for i, test_case in enumerate(test_cases, 1): | |
| print(f"\n#{i} {test_case['description']}") | |
| print(f"Query: '{test_case['query']}'") | |
| print("-" * 30) | |
| try: | |
| results = kb.search(test_case["query"], top_k=3) | |
| display_simple_results(results) | |
| except Exception as e: | |
| print(f"[ERROR] Error: {e}") | |
| def main(): | |
| """Main entry point with argument parsing""" | |
| parser = argparse.ArgumentParser( | |
| description="ViettelPay Knowledge Base Management", | |
| formatter_class=argparse.RawDescriptionHelpFormatter, | |
| epilog=""" | |
| Examples: | |
| python build_database_script.py ingest --documents-folder ./viettelpay_docs | |
| python build_database_script.py test --query "lỗi 606" | |
| python build_database_script.py test --interactive | |
| Environment Variables: | |
| OPENAI_API_KEY: Optional API key for contextual enhancement | |
| """, | |
| ) | |
| # Subcommands | |
| subparsers = parser.add_subparsers(dest="command", help="Available commands") | |
| # Ingest command | |
| ingest_parser = subparsers.add_parser( | |
| "ingest", help="Ingest documents and build knowledge base" | |
| ) | |
| ingest_parser.add_argument( | |
| "--documents-folder", | |
| default="./viettelpay_docs/raw", | |
| help="Directory containing Word documents (.doc/.docx files)", | |
| ) | |
| ingest_parser.add_argument( | |
| "--chroma-dir", | |
| default="./knowledge_base/chroma_db", | |
| help="ChromaDB storage directory", | |
| ) | |
| ingest_parser.add_argument( | |
| "--bm25-dir", | |
| default="./knowledge_base/bm25_index", | |
| help="BM25 storage directory", | |
| ) | |
| ingest_parser.add_argument( | |
| "--embedding-model", | |
| default="dangvantuan/vietnamese-document-embedding", | |
| help="Embedding model name", | |
| ) | |
| ingest_parser.add_argument( | |
| "--reset", | |
| action="store_true", | |
| default=True, | |
| help="Reset knowledge base before ingestion (default: True)", | |
| ) | |
| ingest_parser.add_argument( | |
| "--no-reset", | |
| dest="reset", | |
| action="store_false", | |
| help="Do not reset existing knowledge base", | |
| ) | |
| # Test command | |
| test_parser = subparsers.add_parser( | |
| "test", help="Test retrieval on existing knowledge base" | |
| ) | |
| test_parser.add_argument("--query", help="Single query to test") | |
| test_parser.add_argument( | |
| "--interactive", action="store_true", help="Interactive testing mode" | |
| ) | |
| test_parser.add_argument( | |
| "--chroma-dir", | |
| default="./knowledge_base/chroma_db", | |
| help="ChromaDB storage directory", | |
| ) | |
| test_parser.add_argument( | |
| "--bm25-dir", | |
| default="./knowledge_base/bm25_index", | |
| help="BM25 storage directory", | |
| ) | |
| test_parser.add_argument( | |
| "--embedding-model", | |
| default="dangvantuan/vietnamese-document-embedding", | |
| help="Embedding model name", | |
| ) | |
| args = parser.parse_args() | |
| if args.command == "ingest": | |
| success = ingest_documents(args) | |
| sys.exit(0 if success else 1) | |
| elif args.command == "test": | |
| test_retrieval(args) | |
| else: | |
| parser.print_help() | |
| sys.exit(1) | |
| if __name__ == "__main__": | |
| main() | |