viettelpay-chatbot / src /scripts /build_database_script.py
minhan6559's picture
Upload 73 files
60d1d13 verified
raw
history blame
11.2 kB
"""
ViettelPay Knowledge Base Management Script
This script uses the new ContextualWordProcessor with:
- Automated processing of Word documents (.doc/.docx) from a folder
- Contextual enhancement using OpenAI API (optional)
- LangChain EnsembleRetriever for hybrid search
- ChromaDB for semantic search and BM25 for keyword search
Usage:
python build_database_script.py ingest --documents-folder ./viettelpay_docs
python build_database_script.py test --query "lỗi 606"
python build_database_script.py test --interactive
"""
import argparse
import os
import sys
from pathlib import Path
from typing import Optional
# Add the project root to Python path so we can import from src
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from src.knowledge_base.viettel_knowledge_base import ViettelKnowledgeBase
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
def validate_documents_folder(documents_folder: str) -> bool:
    """Check that a folder exists and directly contains Word documents.

    Only entries in the top level of the folder whose names match ``*.doc``
    or ``*.docx`` are considered; subfolders are not searched. The outcome
    and the names of the documents found are printed to stdout.

    Args:
        documents_folder: Path of the folder to inspect.

    Returns:
        True if the path is an existing folder holding at least one Word
        document, False otherwise.
    """
    if not os.path.exists(documents_folder):
        print(f"[ERROR] Documents folder not found: {documents_folder}")
        return False

    # A regular file passes the existence check but can never hold documents;
    # say so explicitly instead of reporting "no Word documents found".
    if not os.path.isdir(documents_folder):
        print(f"[ERROR] Documents path is not a folder: {documents_folder}")
        return False

    folder = Path(documents_folder)
    # Sorted so the listing is stable regardless of filesystem ordering.
    word_files = sorted(
        path for pattern in ("*.doc", "*.docx") for path in folder.glob(pattern)
    )

    if not word_files:
        print(f"[ERROR] No Word documents (.doc/.docx) found in: {documents_folder}")
        return False

    print(f"[SUCCESS] Found {len(word_files)} Word documents in {documents_folder}")
    for word_file in word_files:
        print(f" - {word_file.name}")
    return True
def ingest_documents(args):
    """Build the knowledge base from the documents folder named in ``args``.

    Reads ``documents_folder``, ``embedding_model``, ``chroma_dir`` and
    ``reset`` from the parsed arguments and ``OPENAI_API_KEY`` from the
    environment. The process exits with status 1 if the documents folder
    fails validation.

    Args:
        args: Parsed command-line arguments of the ``ingest`` sub-command.

    Returns:
        True when the build completed, False when it raised an exception
        (the traceback is printed in that case).
    """
    banner = "=" * 60
    print(banner)
    print("[INFO] INGESTING DOCUMENTS AND BUILDING KNOWLEDGE BASE")
    print(banner)

    # Nothing to do without a usable documents folder.
    if not validate_documents_folder(args.documents_folder):
        sys.exit(1)

    # The API key is optional; it is only handed to the build step below.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("[WARNING] No OpenAI API key found. Contextual enhancement disabled.")
    else:
        print("[INFO] Using OpenAI API key for contextual enhancement")

    kb = ViettelKnowledgeBase(embedding_model=args.embedding_model)

    try:
        # The persist directory is the parent of the Chroma directory, with a
        # fixed fallback when that path has no parent component.
        parent_dir = os.path.dirname(args.chroma_dir)
        persist_dir = parent_dir if parent_dir else "./knowledge_base"

        kb.build_knowledge_base(
            documents_folder=args.documents_folder,
            persist_dir=persist_dir,
            reset=args.reset,
            openai_api_key=api_key,
        )

        print("\n[INFO] Knowledge Base Statistics:")
        for name, value in kb.get_stats().items():
            print(f" {name}: {value}")

        print(f"\n[SUCCESS] Knowledge base saved successfully to {persist_dir}!")
        return True
    except Exception as error:
        print(f"[ERROR] Error during ingestion: {error}")
        import traceback

        traceback.print_exc()
        return False
def test_retrieval(args):
    """Load an existing knowledge base and run retrieval checks against it.

    Reads ``embedding_model``, ``chroma_dir``, ``interactive`` and ``query``
    from the parsed arguments. The process exits with status 1 if the
    knowledge base cannot be loaded.

    Args:
        args: Parsed command-line arguments of the ``test`` sub-command.
    """
    banner = "=" * 60
    print(banner)
    print("[INFO] TESTING KNOWLEDGE BASE RETRIEVAL")
    print(banner)

    kb = ViettelKnowledgeBase(embedding_model=args.embedding_model)

    # The persist directory is the parent of the Chroma directory, with a
    # fixed fallback when that path has no parent component.
    parent_dir = os.path.dirname(args.chroma_dir)
    persist_dir = parent_dir if parent_dir else "./knowledge_base"

    if not kb.load_knowledge_base(persist_dir=persist_dir):
        print("[ERROR] Failed to load knowledge base. Run 'ingest' first.")
        sys.exit(1)

    print("\n[INFO] Knowledge Base Statistics:")
    for name, value in kb.get_stats().items():
        print(f" {name}: {value}")

    # Interactive mode wins over a single query; with neither, fall back to
    # the built-in suite.
    if args.interactive:
        run_interactive_tests(kb)
        return
    if args.query:
        test_single_query(kb, args.query)
        return
    run_test_suite(kb)
def test_single_query(kb, query: str):
    """Run one search against the knowledge base and print the results.

    Any exception raised while searching or displaying is reported on stdout
    rather than propagated.

    Args:
        kb: Knowledge base object exposing ``search(query, top_k=...)``.
        query: Text to search for.
    """
    print(f"\n[INFO] Testing Query: '{query}'")
    print("-" * 40)

    try:
        print("\n[INFO] Search Results:")
        hits = kb.search(query, top_k=10)
        display_simple_results(hits)
    except Exception as error:
        print(f"[ERROR] Error during search: {error}")
def display_simple_results(results):
    """Print search results as a numbered list, two lines per result.

    For each result the first line shows its ``doc_type``, ``relevance_score``
    and up to 1000 characters of ``page_content`` with newlines replaced by
    spaces; the second line shows its ``source_file``. Missing metadata keys
    fall back to ``"unknown"`` (or ``"N/A"`` for the score).

    Args:
        results: Iterable of objects with ``page_content`` and ``metadata``
            attributes; a falsy value prints a "no results" notice instead.
    """
    if not results:
        print(" No results found")
        return

    for rank, document in enumerate(results, start=1):
        preview = document.page_content[:1000].replace("\n", " ")
        meta = document.metadata
        kind = meta.get("doc_type", "unknown")
        origin = meta.get("source_file", "unknown")
        score = meta.get("relevance_score", "N/A")
        print(f" {rank}. [{kind}] Score: {score} - {preview}...")
        print(f" Source: {origin}")
def run_interactive_tests(kb):
    """Run a read-eval-print loop for querying the knowledge base.

    Each line read from stdin is handled as follows: ``quit``/``exit``/``q``
    ends the session, ``stats`` prints the knowledge base statistics, an
    empty line is ignored, and anything else is run as a search query whose
    results are printed. Ctrl-C or end of input also ends the session;
    any other exception is reported and the loop continues.

    Args:
        kb: Knowledge base object exposing ``search(query, top_k=...)`` and
            ``get_stats()``.
    """
    print("\n[INFO] Interactive Testing Mode")
    print("Available commands:")
    print(" - Enter a query to search")
    print(" - 'stats' to view knowledge base statistics")
    print(" - 'quit' to exit")
    print("-" * 50)

    while True:
        try:
            user_input = input("\n[INPUT] Enter command: ").strip()
            command = user_input.lower()

            if command in ["quit", "exit", "q"]:
                break
            if not user_input:
                continue

            if command == "stats":
                stats = kb.get_stats()
                print("\n[INFO] Knowledge Base Statistics:")
                for key, value in stats.items():
                    print(f" {key}: {value}")
                continue

            # Anything else is treated as a search query.
            print(f"\n[INFO] Search: '{user_input}'")
            results = kb.search(user_input, top_k=10)
            display_simple_results(results)

        except (KeyboardInterrupt, EOFError):
            # input() raises EOFError once stdin is closed or exhausted
            # (Ctrl-D, piped input). Left to the generic handler below it
            # would be raised again on every iteration and never terminate.
            print("\n[INFO] Exiting interactive mode...")
            break
        except Exception as e:
            print(f"[ERROR] Error: {e}")
def run_test_suite(kb):
    """Run a fixed list of sample queries and print the top results of each.

    Every query is searched with ``top_k=3``. A failure in one query is
    reported on stdout and does not stop the remaining queries.

    Args:
        kb: Knowledge base object exposing ``search(query, top_k=...)``.
    """
    # (query, description) pairs, grouped by the kind of lookup they probe.
    scenarios = (
        # Error code queries (BM25 strength)
        ("lỗi 606", "Error code (lowercase)"),
        ("LỖI 606", "Error code (uppercase)"),
        ("mã lỗi W02", "Alphanumeric error code"),
        # Semantic queries (ChromaDB strength)
        ("không nạp được tiền", "Semantic: cannot topup"),
        ("giao dịch bị treo", "Semantic: transaction stuck"),
        # Procedure queries
        ("hướng dẫn nạp cước trả trước", "Procedure: prepaid topup"),
        ("cách kiểm tra phí chiết khấu", "Procedure: check discount"),
        # Reference queries
        ("thẻ 30k có nhà mạng nào", "Reference: denomination availability"),
        # Policy queries
        ("quy định hủy giao dịch", "Policy: cancellation rules"),
    )

    print("\n[INFO] Running Test Suite:")
    print("=" * 50)

    for number, (query, description) in enumerate(scenarios, start=1):
        print(f"\n#{number} {description}")
        print(f"Query: '{query}'")
        print("-" * 30)
        try:
            hits = kb.search(query, top_k=3)
            display_simple_results(hits)
        except Exception as error:
            print(f"[ERROR] Error: {error}")
def main():
    """Parse the command line and dispatch to the chosen sub-command.

    ``ingest`` builds the knowledge base and exits with status 0 on success
    or 1 on failure; ``test`` runs the retrieval checks. When no sub-command
    is given the help text is printed and the process exits with status 1.
    """

    def add_storage_options(command_parser):
        # Storage and model options that both sub-commands accept.
        command_parser.add_argument(
            "--chroma-dir",
            default="./knowledge_base/chroma_db",
            help="ChromaDB storage directory",
        )
        command_parser.add_argument(
            "--bm25-dir",
            default="./knowledge_base/bm25_index",
            help="BM25 storage directory",
        )
        command_parser.add_argument(
            "--embedding-model",
            default="dangvantuan/vietnamese-document-embedding",
            help="Embedding model name",
        )

    parser = argparse.ArgumentParser(
        description="ViettelPay Knowledge Base Management",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python build_database_script.py ingest --documents-folder ./viettelpay_docs
python build_database_script.py test --query "lỗi 606"
python build_database_script.py test --interactive
Environment Variables:
OPENAI_API_KEY: Optional API key for contextual enhancement
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # --- ingest ---------------------------------------------------------
    ingest_parser = subparsers.add_parser(
        "ingest", help="Ingest documents and build knowledge base"
    )
    ingest_parser.add_argument(
        "--documents-folder",
        default="./viettelpay_docs/raw",
        help="Directory containing Word documents (.doc/.docx files)",
    )
    add_storage_options(ingest_parser)
    # Resetting is on by default; --no-reset is the switch that turns it off.
    ingest_parser.add_argument(
        "--reset",
        action="store_true",
        default=True,
        help="Reset knowledge base before ingestion (default: True)",
    )
    ingest_parser.add_argument(
        "--no-reset",
        dest="reset",
        action="store_false",
        help="Do not reset existing knowledge base",
    )

    # --- test -----------------------------------------------------------
    test_parser = subparsers.add_parser(
        "test", help="Test retrieval on existing knowledge base"
    )
    test_parser.add_argument("--query", help="Single query to test")
    test_parser.add_argument(
        "--interactive", action="store_true", help="Interactive testing mode"
    )
    add_storage_options(test_parser)

    args = parser.parse_args()

    if args.command == "ingest":
        succeeded = ingest_documents(args)
        sys.exit(0 if succeeded else 1)
    elif args.command == "test":
        test_retrieval(args)
    else:
        # No sub-command given: show usage and signal failure.
        parser.print_help()
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()