Spaces:
Running
Running
Update KB service to use new ES indices
Browse files
ask_candid/services/knowledge_base.py
CHANGED
|
@@ -6,7 +6,7 @@ import logging
|
|
| 6 |
from langchain_core.documents import Document
|
| 7 |
|
| 8 |
from ask_candid.base.retrieval.elastic import (
|
| 9 |
-
build_sparse_vector_query,
|
| 10 |
build_sparse_vector_and_text_query,
|
| 11 |
news_query_builder,
|
| 12 |
issuelab_query_builder,
|
|
@@ -15,7 +15,7 @@ from ask_candid.base.retrieval.elastic import (
|
|
| 15 |
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
|
| 16 |
from ask_candid.base.retrieval.schemas import ElasticHitsResult
|
| 17 |
import ask_candid.base.retrieval.sources as S
|
| 18 |
-
from ask_candid.base.config.connections import
|
| 19 |
from ask_candid.services.small_lm import CandidSmallLanguageModel
|
| 20 |
|
| 21 |
SourceNames = Literal[
|
|
@@ -32,7 +32,6 @@ logger = logging.getLogger(__name__)
|
|
| 32 |
logger.setLevel(logging.INFO)
|
| 33 |
|
| 34 |
|
| 35 |
-
# TODO remove
|
| 36 |
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
|
| 37 |
"""Pads the relevant chunk of text with context before and after
|
| 38 |
|
|
@@ -123,18 +122,31 @@ def generate_queries(
|
|
| 123 |
semantic_fields=S.CandidBlogConfig.semantic_fields,
|
| 124 |
text_fields=S.CandidBlogConfig.text_fields,
|
| 125 |
highlight_fields=S.CandidBlogConfig.highlight_fields,
|
| 126 |
-
excluded_fields=S.CandidBlogConfig.excluded_fields
|
|
|
|
| 127 |
)
|
| 128 |
q["size"] = 5
|
| 129 |
vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
|
| 130 |
elif source_name == "Candid Help":
|
| 131 |
-
q =
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
q["size"] = 5
|
| 134 |
vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
|
| 135 |
elif source_name == "Candid Learning":
|
| 136 |
-
q =
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
q["size"] = 5
|
| 139 |
vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
|
| 140 |
elif source_name == "Candid News":
|
|
@@ -166,7 +178,8 @@ def generate_queries(
|
|
| 166 |
semantic_fields=S.YoutubeConfig.semantic_fields,
|
| 167 |
text_fields=S.YoutubeConfig.text_fields,
|
| 168 |
highlight_fields=S.YoutubeConfig.highlight_fields,
|
| 169 |
-
excluded_fields=S.YoutubeConfig.excluded_fields
|
|
|
|
| 170 |
)
|
| 171 |
q["size"] = 5
|
| 172 |
vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
|
|
@@ -215,7 +228,7 @@ def run_search(
|
|
| 215 |
|
| 216 |
results = []
|
| 217 |
if vector_searches is not None and len(vector_searches) > 0:
|
| 218 |
-
hits = multi_search_base(queries=vector_searches, credentials=
|
| 219 |
for hit in _msearch_response_generator(responses=hits):
|
| 220 |
results.append(hit)
|
| 221 |
if non_vector_searches is not None and len(non_vector_searches) > 0:
|
|
@@ -368,6 +381,7 @@ def process_hit(hit: ElasticHitsResult) -> Document:
|
|
| 368 |
)
|
| 369 |
elif "blog" in hit.index:
|
| 370 |
highlight = hit.highlight or {}
|
|
|
|
| 371 |
doc = Document(
|
| 372 |
page_content='\n\n'.join([
|
| 373 |
hit.source.get("title_summary_tags_text", ""),
|
|
@@ -378,35 +392,35 @@ def process_hit(hit: ElasticHitsResult) -> Document:
|
|
| 378 |
"title": hit.source.get("title", ""),
|
| 379 |
"source": "Candid Blog",
|
| 380 |
"source_id": hit.source["id"],
|
| 381 |
-
"url":
|
| 382 |
}
|
| 383 |
)
|
| 384 |
-
elif "
|
|
|
|
| 385 |
doc = Document(
|
| 386 |
page_content='\n\n'.join([
|
| 387 |
-
hit.source.get("
|
| 388 |
-
|
| 389 |
-
hit.source.get("training_topics", ""),
|
| 390 |
-
get_context("content", hit, context_length=12)
|
| 391 |
]),
|
| 392 |
metadata={
|
| 393 |
"title": hit.source["title"],
|
| 394 |
"source": "Candid Learning",
|
| 395 |
-
"source_id": hit.source["
|
| 396 |
-
"url": hit.source.get("
|
| 397 |
}
|
| 398 |
)
|
| 399 |
-
elif "
|
|
|
|
| 400 |
doc = Document(
|
| 401 |
page_content='\n\n'.join([
|
| 402 |
-
hit.source.get("
|
| 403 |
-
|
| 404 |
]),
|
| 405 |
metadata={
|
| 406 |
"title": hit.source.get("title", ""),
|
| 407 |
"source": "Candid Help",
|
| 408 |
-
"source_id": hit.source["
|
| 409 |
-
"url": hit.source.get("
|
| 410 |
}
|
| 411 |
)
|
| 412 |
elif "news" in hit.index:
|
|
|
|
| 6 |
from langchain_core.documents import Document
|
| 7 |
|
| 8 |
from ask_candid.base.retrieval.elastic import (
|
| 9 |
+
# build_sparse_vector_query,
|
| 10 |
build_sparse_vector_and_text_query,
|
| 11 |
news_query_builder,
|
| 12 |
issuelab_query_builder,
|
|
|
|
| 15 |
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
|
| 16 |
from ask_candid.base.retrieval.schemas import ElasticHitsResult
|
| 17 |
import ask_candid.base.retrieval.sources as S
|
| 18 |
+
from ask_candid.base.config.connections import SEMANTIC_ELASTIC, ELSER_INFERENCE_ID, NEWS_ELASTIC
|
| 19 |
from ask_candid.services.small_lm import CandidSmallLanguageModel
|
| 20 |
|
| 21 |
SourceNames = Literal[
|
|
|
|
| 32 |
logger.setLevel(logging.INFO)
|
| 33 |
|
| 34 |
|
|
|
|
| 35 |
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
|
| 36 |
"""Pads the relevant chunk of text with context before and after
|
| 37 |
|
|
|
|
| 122 |
semantic_fields=S.CandidBlogConfig.semantic_fields,
|
| 123 |
text_fields=S.CandidBlogConfig.text_fields,
|
| 124 |
highlight_fields=S.CandidBlogConfig.highlight_fields,
|
| 125 |
+
excluded_fields=S.CandidBlogConfig.excluded_fields,
|
| 126 |
+
inference_id=ELSER_INFERENCE_ID
|
| 127 |
)
|
| 128 |
q["size"] = 5
|
| 129 |
vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
|
| 130 |
elif source_name == "Candid Help":
|
| 131 |
+
q = build_sparse_vector_and_text_query(
|
| 132 |
+
query=query,
|
| 133 |
+
semantic_fields=S.CandidHelpConfig.semantic_fields,
|
| 134 |
+
text_fields=S.CandidHelpConfig.text_fields,
|
| 135 |
+
highlight_fields=S.CandidHelpConfig.highlight_fields,
|
| 136 |
+
excluded_fields=S.CandidHelpConfig.excluded_fields,
|
| 137 |
+
inference_id=ELSER_INFERENCE_ID
|
| 138 |
+
)
|
| 139 |
q["size"] = 5
|
| 140 |
vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
|
| 141 |
elif source_name == "Candid Learning":
|
| 142 |
+
q = build_sparse_vector_and_text_query(
|
| 143 |
+
query=query,
|
| 144 |
+
semantic_fields=S.CandidLearningConfig.semantic_fields,
|
| 145 |
+
text_fields=S.CandidLearningConfig.text_fields,
|
| 146 |
+
highlight_fields=S.CandidLearningConfig.highlight_fields,
|
| 147 |
+
excluded_fields=S.CandidLearningConfig.excluded_fields,
|
| 148 |
+
inference_id=ELSER_INFERENCE_ID
|
| 149 |
+
)
|
| 150 |
q["size"] = 5
|
| 151 |
vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
|
| 152 |
elif source_name == "Candid News":
|
|
|
|
| 178 |
semantic_fields=S.YoutubeConfig.semantic_fields,
|
| 179 |
text_fields=S.YoutubeConfig.text_fields,
|
| 180 |
highlight_fields=S.YoutubeConfig.highlight_fields,
|
| 181 |
+
excluded_fields=S.YoutubeConfig.excluded_fields,
|
| 182 |
+
inference_id=ELSER_INFERENCE_ID
|
| 183 |
)
|
| 184 |
q["size"] = 5
|
| 185 |
vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
|
|
|
|
| 228 |
|
| 229 |
results = []
|
| 230 |
if vector_searches is not None and len(vector_searches) > 0:
|
| 231 |
+
hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC)
|
| 232 |
for hit in _msearch_response_generator(responses=hits):
|
| 233 |
results.append(hit)
|
| 234 |
if non_vector_searches is not None and len(non_vector_searches) > 0:
|
|
|
|
| 381 |
)
|
| 382 |
elif "blog" in hit.index:
|
| 383 |
highlight = hit.highlight or {}
|
| 384 |
+
blog_url = hit.source.get("link", "")
|
| 385 |
doc = Document(
|
| 386 |
page_content='\n\n'.join([
|
| 387 |
hit.source.get("title_summary_tags_text", ""),
|
|
|
|
| 392 |
"title": hit.source.get("title", ""),
|
| 393 |
"source": "Candid Blog",
|
| 394 |
"source_id": hit.source["id"],
|
| 395 |
+
"url": blog_url
|
| 396 |
}
|
| 397 |
)
|
| 398 |
+
elif "learning" in hit.index:
|
| 399 |
+
highlight = hit.highlight or {}
|
| 400 |
doc = Document(
|
| 401 |
page_content='\n\n'.join([
|
| 402 |
+
hit.source.get("semantic_title_short_description", ""),
|
| 403 |
+
' '.join(highlight.get("semantic_lessons_content", []))
|
|
|
|
|
|
|
| 404 |
]),
|
| 405 |
metadata={
|
| 406 |
"title": hit.source["title"],
|
| 407 |
"source": "Candid Learning",
|
| 408 |
+
"source_id": hit.source["course_id"],
|
| 409 |
+
"url": hit.source.get("course_url", "")
|
| 410 |
}
|
| 411 |
)
|
| 412 |
+
elif "help" in hit.index:
|
| 413 |
+
highlight = hit.highlight or {}
|
| 414 |
doc = Document(
|
| 415 |
page_content='\n\n'.join([
|
| 416 |
+
hit.source.get("semantic_title_summary_question_category", ""),
|
| 417 |
+
' '.join(highlight.get("semantic_content", []))
|
| 418 |
]),
|
| 419 |
metadata={
|
| 420 |
"title": hit.source.get("title", ""),
|
| 421 |
"source": "Candid Help",
|
| 422 |
+
"source_id": hit.source["article_id"],
|
| 423 |
+
"url": f"""https://help.candid.org/s/article/{hit.source.get("url", "")}"""
|
| 424 |
}
|
| 425 |
)
|
| 426 |
elif "news" in hit.index:
|