brainsqueeze committed on
Commit
08718d5
·
verified ·
1 Parent(s): 64b9f1e

Update KB service to use new ES indices

Browse files
Files changed (1) hide show
  1. ask_candid/services/knowledge_base.py +37 -23
ask_candid/services/knowledge_base.py CHANGED
@@ -6,7 +6,7 @@ import logging
6
  from langchain_core.documents import Document
7
 
8
  from ask_candid.base.retrieval.elastic import (
9
- build_sparse_vector_query,
10
  build_sparse_vector_and_text_query,
11
  news_query_builder,
12
  issuelab_query_builder,
@@ -15,7 +15,7 @@ from ask_candid.base.retrieval.elastic import (
15
  from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
16
  from ask_candid.base.retrieval.schemas import ElasticHitsResult
17
  import ask_candid.base.retrieval.sources as S
18
- from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
19
  from ask_candid.services.small_lm import CandidSmallLanguageModel
20
 
21
  SourceNames = Literal[
@@ -32,7 +32,6 @@ logger = logging.getLogger(__name__)
32
  logger.setLevel(logging.INFO)
33
 
34
 
35
- # TODO remove
36
  def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
37
  """Pads the relevant chunk of text with context before and after
38
 
@@ -123,18 +122,31 @@ def generate_queries(
123
  semantic_fields=S.CandidBlogConfig.semantic_fields,
124
  text_fields=S.CandidBlogConfig.text_fields,
125
  highlight_fields=S.CandidBlogConfig.highlight_fields,
126
- excluded_fields=S.CandidBlogConfig.excluded_fields
 
127
  )
128
  q["size"] = 5
129
  vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
130
  elif source_name == "Candid Help":
131
- q = build_sparse_vector_query(query=query, fields=S.CandidHelpConfig.semantic_fields)
132
- q["_source"] = {"excludes": ["embeddings"]}
 
 
 
 
 
 
133
  q["size"] = 5
134
  vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
135
  elif source_name == "Candid Learning":
136
- q = build_sparse_vector_query(query=query, fields=S.CandidLearningConfig.semantic_fields)
137
- q["_source"] = {"excludes": ["embeddings"]}
 
 
 
 
 
 
138
  q["size"] = 5
139
  vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
140
  elif source_name == "Candid News":
@@ -166,7 +178,8 @@ def generate_queries(
166
  semantic_fields=S.YoutubeConfig.semantic_fields,
167
  text_fields=S.YoutubeConfig.text_fields,
168
  highlight_fields=S.YoutubeConfig.highlight_fields,
169
- excluded_fields=S.YoutubeConfig.excluded_fields
 
170
  )
171
  q["size"] = 5
172
  vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
@@ -215,7 +228,7 @@ def run_search(
215
 
216
  results = []
217
  if vector_searches is not None and len(vector_searches) > 0:
218
- hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC_QA)
219
  for hit in _msearch_response_generator(responses=hits):
220
  results.append(hit)
221
  if non_vector_searches is not None and len(non_vector_searches) > 0:
@@ -368,6 +381,7 @@ def process_hit(hit: ElasticHitsResult) -> Document:
368
  )
369
  elif "blog" in hit.index:
370
  highlight = hit.highlight or {}
 
371
  doc = Document(
372
  page_content='\n\n'.join([
373
  hit.source.get("title_summary_tags_text", ""),
@@ -378,35 +392,35 @@ def process_hit(hit: ElasticHitsResult) -> Document:
378
  "title": hit.source.get("title", ""),
379
  "source": "Candid Blog",
380
  "source_id": hit.source["id"],
381
- "url": hit.source["link"]
382
  }
383
  )
384
- elif "candid-learning" in hit.index:
 
385
  doc = Document(
386
  page_content='\n\n'.join([
387
- hit.source.get("title", ""),
388
- hit.source.get("staff_recommendations", ""),
389
- hit.source.get("training_topics", ""),
390
- get_context("content", hit, context_length=12)
391
  ]),
392
  metadata={
393
  "title": hit.source["title"],
394
  "source": "Candid Learning",
395
- "source_id": hit.source["post_id"],
396
- "url": hit.source.get("url", "")
397
  }
398
  )
399
- elif "candid-help" in hit.index:
 
400
  doc = Document(
401
  page_content='\n\n'.join([
402
- hit.source.get("combined_article_description", ""),
403
- get_context("content", hit, context_length=12)
404
  ]),
405
  metadata={
406
  "title": hit.source.get("title", ""),
407
  "source": "Candid Help",
408
- "source_id": hit.source["id"],
409
- "url": hit.source.get("link", "")
410
  }
411
  )
412
  elif "news" in hit.index:
 
6
  from langchain_core.documents import Document
7
 
8
  from ask_candid.base.retrieval.elastic import (
9
+ # build_sparse_vector_query,
10
  build_sparse_vector_and_text_query,
11
  news_query_builder,
12
  issuelab_query_builder,
 
15
  from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
16
  from ask_candid.base.retrieval.schemas import ElasticHitsResult
17
  import ask_candid.base.retrieval.sources as S
18
+ from ask_candid.base.config.connections import SEMANTIC_ELASTIC, ELSER_INFERENCE_ID, NEWS_ELASTIC
19
  from ask_candid.services.small_lm import CandidSmallLanguageModel
20
 
21
  SourceNames = Literal[
 
32
  logger.setLevel(logging.INFO)
33
 
34
 
 
35
  def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
36
  """Pads the relevant chunk of text with context before and after
37
 
 
122
  semantic_fields=S.CandidBlogConfig.semantic_fields,
123
  text_fields=S.CandidBlogConfig.text_fields,
124
  highlight_fields=S.CandidBlogConfig.highlight_fields,
125
+ excluded_fields=S.CandidBlogConfig.excluded_fields,
126
+ inference_id=ELSER_INFERENCE_ID
127
  )
128
  q["size"] = 5
129
  vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
130
  elif source_name == "Candid Help":
131
+ q = build_sparse_vector_and_text_query(
132
+ query=query,
133
+ semantic_fields=S.CandidHelpConfig.semantic_fields,
134
+ text_fields=S.CandidHelpConfig.text_fields,
135
+ highlight_fields=S.CandidHelpConfig.highlight_fields,
136
+ excluded_fields=S.CandidHelpConfig.excluded_fields,
137
+ inference_id=ELSER_INFERENCE_ID
138
+ )
139
  q["size"] = 5
140
  vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
141
  elif source_name == "Candid Learning":
142
+ q = build_sparse_vector_and_text_query(
143
+ query=query,
144
+ semantic_fields=S.CandidLearningConfig.semantic_fields,
145
+ text_fields=S.CandidLearningConfig.text_fields,
146
+ highlight_fields=S.CandidLearningConfig.highlight_fields,
147
+ excluded_fields=S.CandidLearningConfig.excluded_fields,
148
+ inference_id=ELSER_INFERENCE_ID
149
+ )
150
  q["size"] = 5
151
  vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
152
  elif source_name == "Candid News":
 
178
  semantic_fields=S.YoutubeConfig.semantic_fields,
179
  text_fields=S.YoutubeConfig.text_fields,
180
  highlight_fields=S.YoutubeConfig.highlight_fields,
181
+ excluded_fields=S.YoutubeConfig.excluded_fields,
182
+ inference_id=ELSER_INFERENCE_ID
183
  )
184
  q["size"] = 5
185
  vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
 
228
 
229
  results = []
230
  if vector_searches is not None and len(vector_searches) > 0:
231
+ hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC)
232
  for hit in _msearch_response_generator(responses=hits):
233
  results.append(hit)
234
  if non_vector_searches is not None and len(non_vector_searches) > 0:
 
381
  )
382
  elif "blog" in hit.index:
383
  highlight = hit.highlight or {}
384
+ blog_url = hit.source.get("link", "")
385
  doc = Document(
386
  page_content='\n\n'.join([
387
  hit.source.get("title_summary_tags_text", ""),
 
392
  "title": hit.source.get("title", ""),
393
  "source": "Candid Blog",
394
  "source_id": hit.source["id"],
395
+ "url": blog_url
396
  }
397
  )
398
+ elif "learning" in hit.index:
399
+ highlight = hit.highlight or {}
400
  doc = Document(
401
  page_content='\n\n'.join([
402
+ hit.source.get("semantic_title_short_description", ""),
403
+ ' '.join(highlight.get("semantic_lessons_content", []))
 
 
404
  ]),
405
  metadata={
406
  "title": hit.source["title"],
407
  "source": "Candid Learning",
408
+ "source_id": hit.source["course_id"],
409
+ "url": hit.source.get("course_url", "")
410
  }
411
  )
412
+ elif "help" in hit.index:
413
+ highlight = hit.highlight or {}
414
  doc = Document(
415
  page_content='\n\n'.join([
416
+ hit.source.get("semantic_title_summary_question_category", ""),
417
+ ' '.join(highlight.get("semantic_content", []))
418
  ]),
419
  metadata={
420
  "title": hit.source.get("title", ""),
421
  "source": "Candid Help",
422
+ "source_id": hit.source["article_id"],
423
+ "url": f"""https://help.candid.org/s/article/{hit.source.get("url", "")}"""
424
  }
425
  )
426
  elif "news" in hit.index: