Add document chunking configuration and update related utilities
Browse files
- .env.example +4 -0
- BLOG_DATA_UTILS.md +12 -0
- README.md +2 -0
- py-src/lets_talk/config.py +4 -0
- py-src/lets_talk/utils/blog.py +5 -3
- py-src/pipeline.py +30 -4
.env.example
CHANGED
|
@@ -23,3 +23,7 @@ BLOG_BASE_URL=https://thedataguy.pro/blog/
|
|
| 23 |
|
| 24 |
# Search Configuration
|
| 25 |
MAX_SEARCH_RESULTS=5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# Search Configuration
|
| 25 |
MAX_SEARCH_RESULTS=5
|
| 26 |
+
|
| 27 |
+
# Document Chunking Configuration
|
| 28 |
+
CHUNK_SIZE=1000
|
| 29 |
+
CHUNK_OVERLAP=200
|
BLOG_DATA_UTILS.md
CHANGED
|
@@ -46,6 +46,16 @@ When new blog posts are published, follow these steps:
|
|
| 46 |
uv run python update_blog_data.py --force-recreate
|
| 47 |
```
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
This will:
|
| 50 |
- Load all blog posts (including new ones)
|
| 51 |
- Update the vector embeddings
|
|
@@ -61,6 +71,8 @@ VECTOR_STORAGE_PATH=./db/vectorstore_v3 # Path to vector store
|
|
| 61 |
EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l # Embedding model
|
| 62 |
QDRANT_COLLECTION=thedataguy_documents # Collection name
|
| 63 |
BLOG_BASE_URL=https://thedataguy.pro/blog/ # Base URL for blog
|
|
|
|
|
|
|
| 64 |
```
|
| 65 |
|
| 66 |
### In the Chainlit App
|
|
|
|
| 46 |
uv run python update_blog_data.py --force-recreate
|
| 47 |
```
|
| 48 |
|
| 49 |
+
Or customize the chunking behavior:
|
| 50 |
+
```bash
|
| 51 |
+
uv run python update_blog_data.py --chunk-size 1500 --chunk-overlap 300
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
Or use whole documents without chunking:
|
| 55 |
+
```bash
|
| 56 |
+
uv run python update_blog_data.py --no-chunking
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
This will:
|
| 60 |
- Load all blog posts (including new ones)
|
| 61 |
- Update the vector embeddings
|
|
|
|
| 71 |
EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l # Embedding model
|
| 72 |
QDRANT_COLLECTION=thedataguy_documents # Collection name
|
| 73 |
BLOG_BASE_URL=https://thedataguy.pro/blog/ # Base URL for blog
|
| 74 |
+
CHUNK_SIZE=1000 # Size of each document chunk, in characters
|
| 75 |
+
CHUNK_OVERLAP=200 # Overlap between consecutive chunks, in characters
|
| 76 |
```
|
| 77 |
|
| 78 |
### In the Chainlit App
|
README.md
CHANGED
|
@@ -90,6 +90,8 @@ OPENAI_API_KEY=your_openai_api_key
|
|
| 90 |
VECTOR_STORAGE_PATH=./db/vector_store_tdg
|
| 91 |
LLM_MODEL=gpt-4o-mini
|
| 92 |
EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
|
|
|
|
|
|
|
| 93 |
```
|
| 94 |
|
| 95 |
## Running Locally
|
|
|
|
| 90 |
VECTOR_STORAGE_PATH=./db/vector_store_tdg
|
| 91 |
LLM_MODEL=gpt-4o-mini
|
| 92 |
EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
|
| 93 |
+
CHUNK_SIZE=1000
|
| 94 |
+
CHUNK_OVERLAP=200
|
| 95 |
```
|
| 96 |
|
| 97 |
## Running Locally
|
py-src/lets_talk/config.py
CHANGED
|
@@ -16,5 +16,9 @@ SDG_LLM_MODLEL = os.environ.get("SDG_LLM_MODEL", "gpt-4.1")
|
|
| 16 |
EVAL_LLM_MODEL = os.environ.get("EVAL_LLM_MODEL", "gpt-4.1")
|
| 17 |
MAX_SEARCH_RESULTS = int(os.environ.get("MAX_SEARCH_RESULTS", "5"))
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
|
|
|
| 16 |
EVAL_LLM_MODEL = os.environ.get("EVAL_LLM_MODEL", "gpt-4.1")
|
| 17 |
MAX_SEARCH_RESULTS = int(os.environ.get("MAX_SEARCH_RESULTS", "5"))
|
| 18 |
|
| 19 |
+
# Document chunking configuration
|
| 20 |
+
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
|
| 21 |
+
CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
|
| 22 |
+
|
| 23 |
|
| 24 |
|
py-src/lets_talk/utils/blog.py
CHANGED
|
@@ -26,7 +26,9 @@ from lets_talk.config import (
|
|
| 26 |
VECTOR_STORAGE_PATH,
|
| 27 |
EMBEDDING_MODEL,
|
| 28 |
QDRANT_COLLECTION,
|
| 29 |
-
BLOG_BASE_URL
|
|
|
|
|
|
|
| 30 |
)
|
| 31 |
|
| 32 |
def load_blog_posts(data_dir: str = DATA_DIR,
|
|
@@ -161,8 +163,8 @@ def display_document_stats(stats: Dict[str, Any]):
|
|
| 161 |
|
| 162 |
|
| 163 |
def split_documents(documents: List[Document],
|
| 164 |
-
chunk_size: int =
|
| 165 |
-
chunk_overlap: int =
|
| 166 |
"""
|
| 167 |
Split documents into chunks for better embedding and retrieval.
|
| 168 |
|
|
|
|
| 26 |
VECTOR_STORAGE_PATH,
|
| 27 |
EMBEDDING_MODEL,
|
| 28 |
QDRANT_COLLECTION,
|
| 29 |
+
BLOG_BASE_URL,
|
| 30 |
+
CHUNK_SIZE,
|
| 31 |
+
CHUNK_OVERLAP
|
| 32 |
)
|
| 33 |
|
| 34 |
def load_blog_posts(data_dir: str = DATA_DIR,
|
|
|
|
| 163 |
|
| 164 |
|
| 165 |
def split_documents(documents: List[Document],
|
| 166 |
+
chunk_size: int = CHUNK_SIZE,
|
| 167 |
+
chunk_overlap: int = CHUNK_OVERLAP) -> List[Document]:
|
| 168 |
"""
|
| 169 |
Split documents into chunks for better embedding and retrieval.
|
| 170 |
|
py-src/pipeline.py
CHANGED
|
@@ -45,6 +45,12 @@ def parse_args():
|
|
| 45 |
help="Directory to save stats and artifacts (default: ./stats)")
|
| 46 |
parser.add_argument("--ci", action="store_true",
|
| 47 |
help="Run in CI mode (no interactive prompts, exit codes for CI)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
return parser.parse_args()
|
| 49 |
|
| 50 |
def save_stats(stats, output_dir="./stats", ci_mode=False):
|
|
@@ -94,7 +100,8 @@ def save_stats(stats, output_dir="./stats", ci_mode=False):
|
|
| 94 |
return filename, basic_stats
|
| 95 |
|
| 96 |
def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
|
| 97 |
-
force_recreate=False, output_dir="./stats", ci_mode=False,
|
|
|
|
| 98 |
"""
|
| 99 |
Create or update the vector database with blog documents.
|
| 100 |
|
|
@@ -104,6 +111,10 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
|
|
| 104 |
force_recreate: Whether to force recreation of the vector store
|
| 105 |
output_dir: Directory to save stats and artifacts
|
| 106 |
ci_mode: Whether to run in CI mode
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
Returns:
|
| 109 |
Tuple of (success status, message, stats, stats_file, stats_file_content)
|
|
@@ -122,12 +133,20 @@ def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
|
|
| 122 |
# Save stats for tracking
|
| 123 |
stats_file = None
|
| 124 |
stats_content = None
|
| 125 |
-
if
|
| 126 |
stats_file, stats_content = save_stats(stats, output_dir=output_dir, ci_mode=ci_mode)
|
| 127 |
|
| 128 |
if use_chunking:
|
| 129 |
logger.info("Chunking documents...")
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
|
| 133 |
|
|
@@ -183,6 +202,10 @@ def main():
|
|
| 183 |
logger.info(f"Force recreate: {args.force_recreate}")
|
| 184 |
logger.info(f"Output directory: {args.output_dir}")
|
| 185 |
logger.info(f"CI mode: {args.ci}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
logger.info("========================")
|
| 187 |
|
| 188 |
try:
|
|
@@ -192,7 +215,10 @@ def main():
|
|
| 192 |
storage_path=VECTOR_STORAGE_PATH,
|
| 193 |
force_recreate=args.force_recreate,
|
| 194 |
output_dir=args.output_dir,
|
| 195 |
-
ci_mode=args.ci
|
|
|
|
|
|
|
|
|
|
| 196 |
)
|
| 197 |
|
| 198 |
logger.info("\n=== Update Summary ===")
|
|
|
|
| 45 |
help="Directory to save stats and artifacts (default: ./stats)")
|
| 46 |
parser.add_argument("--ci", action="store_true",
|
| 47 |
help="Run in CI mode (no interactive prompts, exit codes for CI)")
|
| 48 |
+
parser.add_argument("--chunk-size", type=int,
|
| 49 |
+
help=f"Size of each chunk in characters (default from config)")
|
| 50 |
+
parser.add_argument("--chunk-overlap", type=int,
|
| 51 |
+
help=f"Overlap between chunks in characters (default from config)")
|
| 52 |
+
parser.add_argument("--no-chunking", action="store_true",
|
| 53 |
+
help="Don't split documents into chunks (use whole documents)")
|
| 54 |
return parser.parse_args()
|
| 55 |
|
| 56 |
def save_stats(stats, output_dir="./stats", ci_mode=False):
|
|
|
|
| 100 |
return filename, basic_stats
|
| 101 |
|
| 102 |
def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
|
| 103 |
+
force_recreate=False, output_dir="./stats", ci_mode=False,
|
| 104 |
+
use_chunking=True, should_save_stats=True, chunk_size=None, chunk_overlap=None):
|
| 105 |
"""
|
| 106 |
Create or update the vector database with blog documents.
|
| 107 |
|
|
|
|
| 111 |
force_recreate: Whether to force recreation of the vector store
|
| 112 |
output_dir: Directory to save stats and artifacts
|
| 113 |
ci_mode: Whether to run in CI mode
|
| 114 |
+
use_chunking: Whether to split documents into chunks
|
| 115 |
+
should_save_stats: Whether to save statistics about the documents
|
| 116 |
+
chunk_size: Size of each chunk in characters (default from config)
|
| 117 |
+
chunk_overlap: Overlap between chunks in characters (default from config)
|
| 118 |
|
| 119 |
Returns:
|
| 120 |
Tuple of (success status, message, stats, stats_file, stats_file_content)
|
|
|
|
| 133 |
# Save stats for tracking
|
| 134 |
stats_file = None
|
| 135 |
stats_content = None
|
| 136 |
+
if should_save_stats:
|
| 137 |
stats_file, stats_content = save_stats(stats, output_dir=output_dir, ci_mode=ci_mode)
|
| 138 |
|
| 139 |
if use_chunking:
|
| 140 |
logger.info("Chunking documents...")
|
| 141 |
+
# Use provided chunk_size and chunk_overlap or default from config
|
| 142 |
+
chunking_params = {}
|
| 143 |
+
if chunk_size is not None:
|
| 144 |
+
chunking_params['chunk_size'] = chunk_size
|
| 145 |
+
if chunk_overlap is not None:
|
| 146 |
+
chunking_params['chunk_overlap'] = chunk_overlap
|
| 147 |
+
|
| 148 |
+
logger.info(f"Using chunk size: {chunking_params.get('chunk_size', 'default')} and overlap: {chunking_params.get('chunk_overlap', 'default')}")
|
| 149 |
+
documents = blog.split_documents(documents, **chunking_params)
|
| 150 |
|
| 151 |
|
| 152 |
|
|
|
|
| 202 |
logger.info(f"Force recreate: {args.force_recreate}")
|
| 203 |
logger.info(f"Output directory: {args.output_dir}")
|
| 204 |
logger.info(f"CI mode: {args.ci}")
|
| 205 |
+
logger.info(f"Chunking: {not args.no_chunking}")
|
| 206 |
+
if not args.no_chunking:
|
| 207 |
+
logger.info(f"Chunk size: {args.chunk_size if args.chunk_size else 'default from config'}")
|
| 208 |
+
logger.info(f"Chunk overlap: {args.chunk_overlap if args.chunk_overlap else 'default from config'}")
|
| 209 |
logger.info("========================")
|
| 210 |
|
| 211 |
try:
|
|
|
|
| 215 |
storage_path=VECTOR_STORAGE_PATH,
|
| 216 |
force_recreate=args.force_recreate,
|
| 217 |
output_dir=args.output_dir,
|
| 218 |
+
ci_mode=args.ci,
|
| 219 |
+
use_chunking=not args.no_chunking,
|
| 220 |
+
chunk_size=args.chunk_size,
|
| 221 |
+
chunk_overlap=args.chunk_overlap
|
| 222 |
)
|
| 223 |
|
| 224 |
logger.info("\n=== Update Summary ===")
|