Spaces:

mafzaal
/

lets_talk

Runtime error

App Files Files Community

mafzaal committed on May 11

Commit

4e87dd5

1 Parent(s): d5d262c

feat: Add build vector store script and update pipeline for CI integration

Browse files

Files changed (3) hide show

.github/workflows/build-vector-store.yml +0 -0
py-src/pipeline.py +114 -37
scripts/build-vector-store.sh +0 -0

.github/workflows/build-vector-store.yml ADDED Viewed

File without changes

py-src/pipeline.py CHANGED Viewed

@@ -5,11 +5,13 @@ This script updates the blog data vector store when new posts are added.
 It can be scheduled to run periodically or manually executed.
 Usage:
-    python pipeline.py [--force-recreate] [--data-dir DATA_DIR]
 Options:
     --force-recreate   Force recreation of the vector store even if it exists
     --data-dir DIR     Directory containing the blog posts (default: data/)
 """
 import os
@@ -17,12 +19,21 @@ import sys
 import argparse
 from datetime import datetime
 import json
 from pathlib import Path
 from lets_talk.config import VECTOR_STORAGE_PATH, DATA_DIR
 # Import the blog utilities module
 import lets_talk.utils.blog as blog
 def parse_args():
     """Parse command-line arguments"""
     parser = argparse.ArgumentParser(description="Update blog data vector store")
@@ -30,16 +41,34 @@ def parse_args():
                         help="Force recreation of the vector store")
     parser.add_argument("--data-dir", default=DATA_DIR,
                         help=f"Directory containing blog posts (default: {DATA_DIR})")
     return parser.parse_args()
-def save_stats(stats, output_dir="./stats"):
-    """Save stats to a JSON file for tracking changes over time"""
     # Create directory if it doesn't exist
     Path(output_dir).mkdir(exist_ok=True, parents=True)
-    # Create filename with timestamp
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    filename = f"{output_dir}/blog_stats_{timestamp}.json"
     # Save only the basic stats, not the full document list
     basic_stats = {
@@ -54,27 +83,34 @@ def save_stats(stats, output_dir="./stats"):
     with open(filename, "w") as f:
         json.dump(basic_stats, f, indent=2)
-    print(f"Saved stats to {filename}")
-    return filename
-def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH, force_recreate=False):
     """
     Create or update the vector database with blog documents.
     Args:
-        documents: List of document objects to store in the vector database
-        data_dir: Directory containing the blog posts (for reporting)
         storage_path: Path where the vector database will be stored
         force_recreate: Whether to force recreation of the vector store
     Returns:
-        Tuple of (success status, message)
     """
     try:
         # Load and process documents
         documents = blog.load_blog_posts(data_dir)
         documents = blog.update_document_metadata(documents)
@@ -83,57 +119,98 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
         blog.display_document_stats(stats)
         # Save stats for tracking
-        stats_file = save_stats(stats)
         create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
         if create_vector_store:
-            print("\nAttempting to save vector store reference file...")
             vector_store = blog.create_vector_store(
                 documents,
                 storage_path=storage_path,
                 force_recreate=force_recreate
             )
             vector_store.client.close()
-            print("Vector store reference file saved.")
-            return True, f"Vector store successfully created at {storage_path}",stats, stats_file
         else:
-            return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)",stats, stats_file
     except Exception as e:
-        return False, f"Error creating vector store: {str(e)}"
 def main():
     """Main function to update blog data"""
     args = parse_args()
-    print("=== Blog Data Update ===")
-    print(f"Data directory: {args.data_dir}")
-    print(f"Force recreate: {args.force_recreate}")
-    print("========================")
     try:
         # Create or update vector database
-        success, message,stats,stats_file = create_vector_database(
-            args.data_dir,
             storage_path=VECTOR_STORAGE_PATH,
-            force_recreate=args.force_recreate
         )
-        print("\n=== Update Summary ===")
-        print(f"Processed {stats['total_documents']} documents")
-        print(f"Stats saved to: {stats_file}")
-        print(f"Vector DB status: {message}")
-        print("=====================")
         if not success:
             return 1
         return 0
     except Exception as e:
-        print(f"Error: {e}")
-        import traceback
-        traceback.print_exc()
         return 1
 if __name__ == "__main__":

 It can be scheduled to run periodically or manually executed.
 Usage:
+    python pipeline.py [--force-recreate] [--data-dir DATA_DIR] [--output-dir OUTPUT_DIR] [--ci]
 Options:
     --force-recreate   Force recreation of the vector store even if it exists
     --data-dir DIR     Directory containing the blog posts (default: data/)
+    --output-dir DIR   Directory to save stats and artifacts (default: ./stats)
+    --ci               Run in CI mode (no interactive prompts, exit codes for CI)
 """
 import os
 import argparse
 from datetime import datetime
 import json
+import logging
 from pathlib import Path
 from lets_talk.config import VECTOR_STORAGE_PATH, DATA_DIR
 # Import the blog utilities module
 import lets_talk.utils.blog as blog
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[logging.StreamHandler()]
+)
+logger = logging.getLogger("blog-pipeline")
 def parse_args():
     """Parse command-line arguments"""
     parser = argparse.ArgumentParser(description="Update blog data vector store")
                         help="Force recreation of the vector store")
     parser.add_argument("--data-dir", default=DATA_DIR,
                         help=f"Directory containing blog posts (default: {DATA_DIR})")
+    parser.add_argument("--output-dir", default="./stats",
+                        help="Directory to save stats and artifacts (default: ./stats)")
+    parser.add_argument("--ci", action="store_true",
+                        help="Run in CI mode (no interactive prompts, exit codes for CI)")
     return parser.parse_args()
+def save_stats(stats, output_dir="./stats", ci_mode=False):
+    """Save stats to a JSON file for tracking changes over time
+    Args:
+        stats: Dictionary containing statistics about the blog posts
+        output_dir: Directory to save the stats file
+        ci_mode: Whether to run in CI mode (use fixed filename)
+    Returns:
+        Tuple of (filename, stats_dict)
+    """
     # Create directory if it doesn't exist
     Path(output_dir).mkdir(exist_ok=True, parents=True)
+    # Create filename with timestamp or use fixed name for CI
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    if ci_mode:
+        filename = f"{output_dir}/blog_stats_latest.json"
+        # Also create a timestamped version for historical tracking
+        history_filename = f"{output_dir}/blog_stats_{timestamp}.json"
+    else:
+        filename = f"{output_dir}/blog_stats_{timestamp}.json"
     # Save only the basic stats, not the full document list
     basic_stats = {
     with open(filename, "w") as f:
         json.dump(basic_stats, f, indent=2)
+    # In CI mode, also save a timestamped version
+    if ci_mode:
+        with open(history_filename, "w") as f:
+            json.dump(basic_stats, f, indent=2)
+        logger.info(f"Saved stats to {filename} and {history_filename}")
+    else:
+        logger.info(f"Saved stats to {filename}")
+    return filename, basic_stats
+def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
+                      force_recreate=False, output_dir="./stats", ci_mode=False):
     """
     Create or update the vector database with blog documents.
     Args:
+        data_dir: Directory containing the blog posts
         storage_path: Path where the vector database will be stored
         force_recreate: Whether to force recreation of the vector store
+        output_dir: Directory to save stats and artifacts
+        ci_mode: Whether to run in CI mode
     Returns:
+        Tuple of (success status, message, stats, stats_file, stats_file_content)
     """
     try:
         # Load and process documents
+        logger.info(f"Loading blog posts from {data_dir}")
         documents = blog.load_blog_posts(data_dir)
         documents = blog.update_document_metadata(documents)
         blog.display_document_stats(stats)
         # Save stats for tracking
+        stats_file, stats_content = save_stats(stats, output_dir=output_dir, ci_mode=ci_mode)
         create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
         if create_vector_store:
+            logger.info("Creating vector store...")
             vector_store = blog.create_vector_store(
                 documents,
                 storage_path=storage_path,
                 force_recreate=force_recreate
             )
             vector_store.client.close()
+            logger.info(f"Vector store successfully created at {storage_path}")
+            # In CI mode, create a metadata file with the build info
+            if ci_mode:
+                build_info = {
+                    "build_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    "document_count": stats["total_documents"],
+                    "storage_path": str(storage_path),
+                    "vector_store_size_bytes": get_directory_size(storage_path),
+                }
+                build_info_path = Path(output_dir) / "vector_store_build_info.json"
+                with open(build_info_path, "w") as f:
+                    json.dump(build_info, f, indent=2)
+                logger.info(f"Build info saved to {build_info_path}")
+            return True, f"Vector store successfully created at {storage_path}", stats, stats_file, stats_content
         else:
+            logger.info(f"Vector store already exists at {storage_path}")
+            return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)", stats, stats_file, stats_content
     except Exception as e:
+        logger.error(f"Error creating vector store: {str(e)}", exc_info=True)
+        return False, f"Error creating vector store: {str(e)}", None, None, None
+def get_directory_size(path):
+    """Get the size of a directory in bytes"""
+    total_size = 0  # running sum of os.path.getsize() over the files found below
+    for dirpath, dirnames, filenames in os.walk(path):  # visits path and every directory os.walk yields beneath it
+        for filename in filenames:
+            filepath = os.path.join(dirpath, filename)
+            if not os.path.islink(filepath):  # entries that are symbolic links are left out of the total
+                total_size += os.path.getsize(filepath)
+    return total_size
 def main():
     """Main function to update blog data"""
     args = parse_args()
+    logger.info("=== Blog Data Update ===")
+    logger.info(f"Data directory: {args.data_dir}")
+    logger.info(f"Force recreate: {args.force_recreate}")
+    logger.info(f"Output directory: {args.output_dir}")
+    logger.info(f"CI mode: {args.ci}")
+    logger.info("========================")
     try:
         # Create or update vector database
+        success, message, stats, stats_file, stats_content = create_vector_database(
+            data_dir=args.data_dir,
             storage_path=VECTOR_STORAGE_PATH,
+            force_recreate=args.force_recreate,
+            output_dir=args.output_dir,
+            ci_mode=args.ci
         )
+        logger.info("\n=== Update Summary ===")
+        if stats:
+            logger.info(f"Processed {stats['total_documents']} documents")
+            logger.info(f"Stats saved to: {stats_file}")
+        logger.info(f"Vector DB status: {message}")
+        logger.info("=====================")
+        # In CI mode, create a summary file that GitHub Actions can use to set outputs
+        if args.ci and stats:
+            ci_summary_path = Path(args.output_dir) / "ci_summary.json"
+            ci_summary = {
+                "status": "success" if success else "failure",
+                "message": message,
+                "stats_file": stats_file,
+                "document_count": stats["total_documents"],
+                "vector_store_path": str(VECTOR_STORAGE_PATH)
+            }
+            with open(ci_summary_path, "w") as f:
+                json.dump(ci_summary, f, indent=2)
+            logger.info(f"CI summary saved to {ci_summary_path}")
         if not success:
             return 1
         return 0
     except Exception as e:
+        logger.error(f"Error: {e}", exc_info=True)
         return 1
 if __name__ == "__main__":

scripts/build-vector-store.sh ADDED Viewed

File without changes