Update issuelab query technique
* quasi-sparse vector approach for issuelab
* summarize issuelab articles prior to LLM handoff
* pin transformers version to avoid major bug
- ask_candid/base/retrieval/elastic.py +76 -8
- ask_candid/base/retrieval/sources.py +9 -2
- ask_candid/services/knowledge_base.py +420 -0
- ask_candid/services/small_lm.py +8 -2
- ask_candid/tools/grants.py +113 -0
- ask_candid/tools/letter_gen.py +230 -0
- ask_candid/tools/nlp.py +83 -0
- ask_candid/tools/recommendations.py +287 -0
- ask_candid/tools/search.py +1 -1
- ask_candid/utils.py +5 -37
- requirements.txt +1 -1
ask_candid/base/retrieval/elastic.py
CHANGED

@@ -21,14 +21,14 @@ def build_sparse_vector_query(
     ----------
     query : str
         Search context string
-    fields :
+    fields : tuple[str, ...]
         Semantic text field names
     inference_id : str, optional
         ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"

     Returns
     -------
-
+    dict[str, Any]
     """

     output = []
@@ -70,20 +70,20 @@ def build_sparse_vector_and_text_query(
     ----------
     query : str
         Search context string
-    semantic_fields :
+    semantic_fields : tuple[str]
         Semantic text field names
-    highlight_fields:
+    highlight_fields: tuple[str]
         Fields which relevant chunks will be helpful for the agent to read
-    text_fields :
+    text_fields : tuple[str]
         Regular text fields
-    excluded_fields :
+    excluded_fields : tuple[str]
         Fields to exclude from the source
     inference_id : str, optional
         ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"

     Returns
     -------
-
+    dict[str, Any]
     """

     output = []
@@ -149,7 +149,7 @@ def news_query_builder(

     Returns
     -------
-
+    dict[str, Any]
     """

     tokens = encoder.token_expand(query)
@@ -180,11 +180,79 @@
     return elastic_query


+def issuelab_query_builder(
+    query: str,
+    fields: tuple[str, ...],
+    highlight_fields: tuple[str, ...] | None,
+    encoder: SpladeEncoder,
+) -> dict[str, Any]:
+    """Builds a valid Elasticsearch query against Issuelab, simulating a token expansion.
+
+    Parameters
+    ----------
+    query : str
+        Search context string
+
+    Returns
+    -------
+    dict[str, Any]
+    """
+
+    tokens = encoder.token_expand(query)
+
+    elastic_query = {
+        "_source": ["issuelab_id", "issuelab_url", "title", "description", "content"],
+        "query": {
+            "bool": {
+                # "filter": [
+                #     # {"range": {"event_date": {"gte": f"now-{days_ago}d/d"}}},
+                #     # {"range": {"insert_date": {"gte": f"now-{days_ago}d/d"}}},
+                #     # {"range": {"article_trust_worthiness": {"gt": NEWS_TRUST_SCORE_THRESHOLD}}}
+                # ],
+                "should": []
+            }
+        },
+        "highlight": {
+            "fields": dict.fromkeys(highlight_fields or ("content", "description"), {})
+        }
+    }
+
+    for token, score in tokens.items():
+        if score > SPARSE_ENCODING_SCORE_THRESHOLD:
+            elastic_query["query"]["bool"]["should"].append({
+                "multi_match": {
+                    "query": token,
+                    "fields": fields,
+                    "boost": score
+                }
+            })
+    return elastic_query
+
+
 def multi_search_base(
     queries: list[dict[str, Any]],
     credentials: BaseElasticSearchConnection | BaseElasticAPIKeyCredential,
     timeout: int = 180
 ) -> Iterator[dict[str, Any]]:
+    """Handles multi-search queries on a single cluster given the relevant credentials object.
+
+    Parameters
+    ----------
+    queries : list[dict[str, Any]]
+        `msearch` query object (see: https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-msearch)
+    credentials : BaseElasticSearchConnection | BaseElasticAPIKeyCredential
+    timeout : int, optional, by default 180
+
+    Yields
+    ------
+    Iterator[dict[str, Any]]
+
+    Raises
+    ------
+    TypeError
+        Raised if invalid credentials are passed
+    """
+
     if isinstance(credentials, BaseElasticAPIKeyCredential):
         es = Elasticsearch(
             cloud_id=credentials.cloud_id,
ask_candid/base/retrieval/sources.py
CHANGED

@@ -25,9 +25,16 @@ CandidNewsConfig = ElasticSourceConfig(
 )


+# IssueLabConfig = ElasticSourceConfig(
+#     index_name="search-semantic-issuelab-elser_ve2",
+#     semantic_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
+# )
+
 IssueLabConfig = ElasticSourceConfig(
-    index_name="search-semantic-issuelab-elser_ve2",
-    semantic_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
+    index_name="issuelab_prod_data",
+    # semantic_fields=("title", "description", "content"),
+    semantic_fields=("title", "description", "content^0.3"),
+    highlight_fields=("description", "content")
 )

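
The `content^0.3` entry uses Elasticsearch's per-field boost syntax: in a `multi_match`, `field^weight` scales that field's score contribution, so long-body matches are down-weighted relative to `title` and `description`. For illustration, a single clause emitted by the builder above looks like:

query_clause = {
    "multi_match": {
        "query": "homeless",                                    # one expansion token
        "fields": ["title", "description", "content^0.3"],      # body matches count 0.3x
        "boost": 1.8                                            # token weight from the encoder
    }
}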
ask_candid/services/knowledge_base.py
ADDED

@@ -0,0 +1,420 @@
from typing import Literal, Any
from collections.abc import Iterator, Iterable
from itertools import groupby
import logging

from langchain_core.documents import Document

from ask_candid.base.retrieval.elastic import (
    build_sparse_vector_query,
    build_sparse_vector_and_text_query,
    news_query_builder,
    issuelab_query_builder,
    multi_search_base
)
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
from ask_candid.base.retrieval.schemas import ElasticHitsResult
import ask_candid.base.retrieval.sources as S
from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
from ask_candid.services.small_lm import CandidSmallLanguageModel

SourceNames = Literal[
    "Candid Blog",
    "Candid Help",
    "Candid Learning",
    "Candid News",
    "IssueLab Research Reports",
    "YouTube Training"
]
sparse_encoder = SpladeEncoder()
logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


# TODO remove
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
    """Pads the relevant chunk of text with context before and after

    Parameters
    ----------
    field_name : str
        a field with the long text that was chunked into pieces
    hit : ElasticHitsResult
    context_length : int, optional
        length of text to add before and after the chunk, by default 1024
    add_context : bool, optional
        Set to `False` to skip expanding the text context by searching for the Elastic inner hit inside the larger
        document, by default True

    Returns
    -------
    str
        longer chunks stuffed together
    """

    chunks = []
    # NOTE chunks have tokens, long text is a string, but may contain html which affects tokenization
    long_text = hit.source.get(field_name) or ""
    long_text = long_text.lower()

    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field, {}) if hit.inner_hits else None
    if found_chunks:
        for h in found_chunks.get("hits", {}).get("hits") or []:
            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]

            # cutting the middle because we may have tokenizing artifacts there
            chunk = chunk[3:-3]

            if add_context:
                # Find the start and end indices of the chunk in the large text
                start_index = long_text.find(chunk[:20])

                # Chunk is found
                if start_index != -1:
                    end_index = start_index + len(chunk)
                    pre_start_index = max(0, start_index - context_length)
                    post_end_index = min(len(long_text), end_index + context_length)
                    chunks.append(long_text[pre_start_index:post_end_index])
            else:
                chunks.append(chunk)
    return '\n\n'.join(chunks)


def generate_queries(
    query: str,
    sources: list[SourceNames],
    news_days_ago: int = 60
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Builds Elastic queries against indices which do or do not support sparse vector queries.

    Parameters
    ----------
    query : str
        Text describing a user's question or a description of investigative work which requires support from
        Candid's knowledge base
    sources : list[SourceNames]
        One or more sources of knowledge from different areas at Candid.
        * Candid Blog: Blog posts from Candid staff and trusted partners intended to help those in the sector or
          illuminate ongoing work
        * Candid Help: Candid FAQs to help users get started with Candid's product platform and learning resources
        * Candid Learning: Training documents from Candid's subject matter experts
        * Candid News: News articles and press releases about real-time activity in the philanthropic sector
        * IssueLab Research Reports: Academic research reports about the social/philanthropic sector
        * YouTube Training: Transcripts from video-based training seminars from Candid's subject matter experts
    news_days_ago : int, optional
        How many days in the past to search for news articles; if a user is asking for recent trends then this
        value should be set lower (~10), by default 60

    Returns
    -------
    tuple[list[dict[str, Any]], list[dict[str, Any]]]
        (sparse vector queries, queries for indices which do not support sparse vectors)
    """

    vector_queries = []
    quasi_vector_queries = []

    for source_name in sources:
        if source_name == "Candid Blog":
            q = build_sparse_vector_query(query=query, fields=S.CandidBlogConfig.semantic_fields)
            q["_source"] = {"excludes": ["embeddings"]}
            q["size"] = 5
            vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
        elif source_name == "Candid Help":
            q = build_sparse_vector_query(query=query, fields=S.CandidHelpConfig.semantic_fields)
            q["_source"] = {"excludes": ["embeddings"]}
            q["size"] = 5
            vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
        elif source_name == "Candid Learning":
            q = build_sparse_vector_query(query=query, fields=S.CandidLearningConfig.semantic_fields)
            q["_source"] = {"excludes": ["embeddings"]}
            q["size"] = 5
            vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
        elif source_name == "Candid News":
            q = news_query_builder(
                query=query,
                fields=S.CandidNewsConfig.semantic_fields,
                encoder=sparse_encoder,
                days_ago=news_days_ago
            )
            q["size"] = 5
            quasi_vector_queries.extend([{"index": S.CandidNewsConfig.index_name}, q])
        elif source_name == "IssueLab Research Reports":
            # q = build_sparse_vector_query(query=query, fields=S.IssueLabConfig.semantic_fields)
            # q["_source"] = {"excludes": ["embeddings"]}
            # q["size"] = 1
            # vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])

            q = issuelab_query_builder(
                query=query,
                fields=S.IssueLabConfig.semantic_fields,
                highlight_fields=S.IssueLabConfig.highlight_fields,
                encoder=sparse_encoder,
            )
            q["size"] = 1
            quasi_vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])
        elif source_name == "YouTube Training":
            q = build_sparse_vector_and_text_query(
                query=query,
                semantic_fields=S.YoutubeConfig.semantic_fields,
                text_fields=S.YoutubeConfig.text_fields,
                highlight_fields=S.YoutubeConfig.highlight_fields,
                excluded_fields=S.YoutubeConfig.excluded_fields
            )
            q["size"] = 5
            vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])

    return vector_queries, quasi_vector_queries


def run_search(
    vector_searches: list[dict[str, Any]] | None = None,
    non_vector_searches: list[dict[str, Any]] | None = None,
) -> list[ElasticHitsResult]:
    """Elastic query runner which executes both sparse vector and quasi-sparse vector queries and concatenates
    results. This does not include re-ranking.

    Parameters
    ----------
    vector_searches : list[dict[str, Any]] | None, optional
        Sparse vector multi-search queries, by default None
    non_vector_searches : list[dict[str, Any]] | None, optional
        Keyword-based multi-search queries, by default None

    Returns
    -------
    list[ElasticHitsResult]
        Concatenated results
    """

    def _msearch_response_generator(responses: Iterable[dict[str, Any]]) -> Iterator[ElasticHitsResult]:
        for query_group in responses:
            for h in query_group.get("hits", {}).get("hits", []):
                inner_hits = h.get("inner_hits", {})

                if not inner_hits and "news" in h.get("_index"):
                    inner_hits = {"text": h.get("_source", {}).get("content")}
                if not inner_hits and "issuelab" in h.get("_index"):
                    inner_hits = {"text": h.get("_source", {}).get("content")}

                yield ElasticHitsResult(
                    index=h["_index"],
                    id=h["_id"],
                    score=h["_score"],
                    source=h["_source"],
                    inner_hits=inner_hits,
                    highlight=h.get("highlight", {})
                )

    results = []
    if vector_searches is not None and len(vector_searches) > 0:
        hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC_QA)
        for hit in _msearch_response_generator(responses=hits):
            results.append(hit)
    if non_vector_searches is not None and len(non_vector_searches) > 0:
        hits = multi_search_base(queries=non_vector_searches, credentials=NEWS_ELASTIC)
        for hit in _msearch_response_generator(responses=hits):
            results.append(hit)
    return results


def retrieved_text(hits: dict[str, Any]) -> str:
    """Extracts retrieved sub-texts from documents which are strong hits from semantic queries for the purpose of
    re-scoring by a secondary language model.

    Parameters
    ----------
    hits : dict[str, Any]

    Returns
    -------
    str
    """

    nlp = CandidSmallLanguageModel()

    text = []
    for key, v in hits.items():
        if key == "text":
            s = nlp.summarize(v, top_k=3)
            text.append(s.summary)
            # text.append(v)
            continue

        for h in (v.get("hits", {}).get("hits") or []):
            for _, field in h.get("fields", {}).items():
                for chunk in field:
                    if chunk.get("chunk"):
                        text.extend(chunk["chunk"])
    return '\n'.join(text)


def reranker(
    query_results: Iterable[ElasticHitsResult],
    search_text: str | None = None,
    max_num_results: int = 5
) -> Iterator[ElasticHitsResult]:
    """Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.
    This will shuffle results.

    Parameters
    ----------
    query_results : Iterable[ElasticHitsResult]

    Yields
    ------
    Iterator[ElasticHitsResult]
    """

    results: list[ElasticHitsResult] = []
    texts: list[str] = []
    for _, data in groupby(query_results, key=lambda x: x.index):
        data = list(data)  # noqa: PLW2901
        max_score = max(data, key=lambda x: x.score).score
        min_score = min(data, key=lambda x: x.score).score

        for d in data:
            d.score = (d.score - min_score) / (max_score - min_score + 1e-9)
            results.append(d)

            if search_text:
                if d.inner_hits:
                    text = retrieved_text(d.inner_hits)
                if d.highlight:
                    highlight_texts = []
                    for k, v in d.highlight.items():
                        highlight_texts.append('\n'.join(v))
                    text = '\n'.join(highlight_texts)
                texts.append(text)

    if search_text and len(texts) == len(results) and len(texts) > 1:
        logger.info("Re-ranking %d retrieval results", len(results))
        scores = sparse_encoder.query_reranking(query=search_text, documents=texts)
        for r, s in zip(results, scores):
            r.score = s

    yield from sorted(results, key=lambda x: x.score, reverse=True)[:max_num_results]


def process_hit(hit: ElasticHitsResult) -> Document:
    """Process a raw Elasticsearch document into a structured langchain `Document` object.

    Parameters
    ----------
    hit : ElasticHitsResult

    Returns
    -------
    Document

    Raises
    ------
    ValueError
        Raised if a result from an unknown index is passed in
    """

    nlp = CandidSmallLanguageModel()

    if "issuelab-elser" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("combined_item_description", ""),
                hit.source.get("description", ""),
                hit.source.get("combined_issuelab_findings", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source["title"],
                "source": "IssueLab",
                "source_id": hit.source["resource_id"],
                "url": hit.source.get("permalink", "")
            }
        )
    elif "issuelab" in hit.index:
        content_summary = ""
        if hit.source.get("content", ""):
            content_summary = nlp.summarize(hit.source.get("content", ""), top_k=20).summary

        doc = Document(
            page_content='\n\n'.join([hit.source.get("description", ""), content_summary]),
            metadata={
                "title": hit.source["title"],
                "source": "IssueLab",
                "source_id": hit.source["issuelab_id"],
                "url": hit.source.get("issuelab_url", "")
            }
        )
    elif "youtube" in hit.index:
        highlight = hit.highlight or {}
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("semantic_description", ""),
                ' '.join(highlight.get("semantic_cc_text", []))
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid YouTube",
                "source_id": hit.source['video_id'],
                "url": f"https://www.youtube.com/watch?v={hit.source['video_id']}"
            }
        )
    elif "candid-blog" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("excerpt", ""),
                get_context("content", hit, context_length=12, add_context=False),
                get_context("authors_text", hit, context_length=12, add_context=False),
                hit.source.get("title_summary_tags", "")
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid Blog",
                "source_id": hit.source["id"],
                "url": hit.source["link"]
            }
        )
    elif "candid-learning" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("staff_recommendations", ""),
                hit.source.get("training_topics", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source["title"],
                "source": "Candid Learning",
                "source_id": hit.source["post_id"],
                "url": hit.source.get("url", "")
            }
        )
    elif "candid-help" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("combined_article_description", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid Help",
                "source_id": hit.source["id"],
                "url": hit.source.get("link", "")
            }
        )
    elif "news" in hit.index:
        doc = Document(
            page_content='\n\n'.join([hit.source.get("title", ""), hit.source.get("content", "")]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": hit.source.get("site_name") or "Candid News",
                "source_id": hit.source["id"],
                "url": hit.source.get("link", "")
            }
        )
    else:
        raise ValueError(f"Unknown source result from index {hit.index}")
    return doc
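
Taken together, this module is the retrieval pipeline. A usage sketch, assuming the `ask_candid` package is importable and live Elasticsearch credentials plus SPLADE weights are available (so this will not run offline):

from ask_candid.services.knowledge_base import generate_queries, run_search, reranker, process_hit

user_query = "recent trends in homelessness funding"

# 1. Build per-index queries, split by whether the index supports sparse vectors
vector_qs, quasi_qs = generate_queries(
    query=user_query,
    sources=["Candid News", "IssueLab Research Reports"],
    news_days_ago=10  # tighter window because the question asks about recent trends
)

# 2. Execute both msearch batches and concatenate the hits
hits = run_search(vector_searches=vector_qs, non_vector_searches=quasi_qs)

# 3. Min-max normalize per index, optionally re-score with the SPLADE encoder, keep the top 5
docs = [process_hit(h) for h in reranker(hits, search_text=user_query, max_num_results=5)]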
ask_candid/services/small_lm.py
CHANGED

@@ -8,12 +8,16 @@ from ask_candid.base.lambda_base import LambdaInvokeBase

 @dataclass(slots=True)
 class Encoding:
+    """Text encoding vector response
+    """
     inputs: list[str]
     vectors: torch.Tensor


 @dataclass(slots=True)
 class SummaryItem:
+    """A single summary object
+    """
     rank: int
     score: float
     text: str
@@ -21,6 +25,8 @@ class SummaryItem:

 @dataclass(slots=True)
 class TextSummary:
+    """Text summarization response
+    """
     snippets: list[SummaryItem]

     @property
@@ -28,7 +34,7 @@ class TextSummary:
         return ' '.join([_.text for _ in self.snippets])


-class CandidSLM(LambdaInvokeBase):
+class CandidSmallLanguageModel(LambdaInvokeBase):
     """Wrapper around Candid's custom small language model.
     For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
     This service includes:
@@ -44,7 +50,7 @@ class CandidSLM(LambdaInvokeBase):
         AWS secret key, by default None
     """

-    class Tasks(Enum): #
+    class Tasks(Enum):  # noqa: D106
         ENCODE = "/encode"
         DOCUMENT_SUMMARIZE = "/document/summarize"
         DOCUMENT_NER_SALIENCE = "/document/entitySalience"
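
A usage sketch of the renamed wrapper; the `summarize` signature is inferred from the call sites in knowledge_base.py (`nlp.summarize(text, top_k=...) -> TextSummary`), and the underlying Lambda invocation needs valid AWS credentials to run:

from ask_candid.services.small_lm import CandidSmallLanguageModel

nlp = CandidSmallLanguageModel()
summary = nlp.summarize("Long IssueLab article body ...", top_k=3)
print(summary.summary)             # top-3 snippets joined into one string
for item in summary.snippets:      # each snippet carries its rank and score
    print(item.rank, item.score, item.text)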
ask_candid/tools/grants.py
ADDED

@@ -0,0 +1,113 @@
import logging

from langchain_core.tools import tool
import httpx

from ask_candid.tools.utils import format_candid_profile_link
from ask_candid.base.config.rest import SEARCH

logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


@tool
def grants_search(
    query: str,
    subject_codes: str | None = None,
    populations_served_codes: str | None = None,
    geonameids_of_geographies_served: str | None = None
) -> list[dict[str, str | int | float | None]] | str:
    """Search for historical grants to find context about what is happening in the sector, and what organizations
    are involved. This is intended for historical research purposes and contextualization. If trying to recommend
    funders then please use the dedicated funder recommendation tool instead of this. Funder recommendations use
    grants and additional contexts, as well as a carefully trained graph neural network, to provide targeted
    recommendations.

    Another important note is that this tool only returns up to 25 top relevant grant results and should never be
    used to make broad generalizations.

    Queries are natural text, and the retrieval mechanism is a hybrid approach of keyword and sparse vector
    searches over fields which describe the activity and purpose of the grant.

    While extra subject codes, populations served codes, and geography IDs for where the grant is serving are not
    required, results may become more specific the more information is provided.

    Subjects and populations can be determined using the `autocode` tool if the requester can supply a description
    of the program they are seeking funding for.

    Geographies can be determined using the geo detection tool if the requester can supply a description of the
    program they are seeking funding for.

    Parameters
    ----------
    query : str
        Text describing a user's question or a description of investigative work which requires support from
        Candid's grants knowledge base
    subject_codes : str | None, optional
        Subject codes from Candid's PCS taxonomy, comma separated, by default None
    populations_served_codes : str | None, optional
        Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
    geonameids_of_geographies_served : str | None, optional
        Geonames ID values for geographies served by the requester's program, comma separated, by default None

    Examples
    --------
    >>> grants_search(query='homeless shelters in new york')
    >>> grants_search(
    ...     query='homeless shelters in new york',
    ...     subject_codes='SS050000,SS000000,SB050000',
    ...     populations_served_codes='PJ050100',
    ...     geonameids_of_geographies_served='4094212,4094212'
    ... )

    Returns
    -------
    list[dict[str, str | int | float | None]] | str
        Array of relevant grants and information about the organizations involved.
        If output is a string then that means there was some error, and retry should be considered
    """

    payload = {"query": query, "rowCount": 25}
    if subject_codes is not None:
        payload["SubjectArea"] = subject_codes.split(',')
    if populations_served_codes is not None:
        payload["PopulationServed"] = populations_served_codes.split(',')
    if geonameids_of_geographies_served:
        payload["GeographicArea"] = geonameids_of_geographies_served.split(',')

    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
        r = client.get(
            url=SEARCH.endpoint("v1/grants/discovery"),
            params=payload,
            headers={**SEARCH.header}  # type: ignore
        )

    if r.status_code != 200:
        logger.error("Error calling grants search API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling grants search. Error: {r.reason_phrase}"

    data: dict = r.json()

    output = []
    for grant in data.get("grants") or []:
        working_on, serving = [], []
        for facet, facet_data in grant["pcsV3"].items():
            if facet == "subject":
                working_on.extend([code["name"].lower() for code in facet_data["value"]])
            elif facet == "population":
                serving.extend([code["name"].lower() for code in facet_data["value"]])

        output.append({
            "funder_id": grant["grantmakerId"],
            "funder_profile_link": format_candid_profile_link(grant["grantmakerId"]),
            "funder_name": grant["grantmakerName"],
            "recipient_id": grant["recipientId"],
            "recipient_profile_link": format_candid_profile_link(grant["recipientId"]),
            "recipient_name": grant["recipientName"],
            "fiscal_year": grant["fiscalYear"],
            "amount_usd": grant["amountUsd"],
            "description": grant["text"],
            "working_on": f"Working on {', '.join(working_on)}",
            "serving": f"Serving population groups {', '.join(serving)}",
        })
    return output
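
Because `grants_search` is wrapped by LangChain's `@tool` decorator, it is a tool object rather than a plain function; with recent `langchain_core` versions, direct calls go through the `Runnable` interface:

from ask_candid.tools.grants import grants_search

results = grants_search.invoke({
    "query": "homeless shelters in new york",
    "subject_codes": "SS050000,SB050000"
})
# `results` is either the list of grant dicts or an error string signalling a retry.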
ask_candid/tools/letter_gen.py
ADDED

@@ -0,0 +1,230 @@
from dataclasses import dataclass, field

from langchain_core.tools import tool
import httpx

from ask_candid.base.config.rest import GOLDEN_ORG, LOI_WRITER


@dataclass
class LetterOfInterest:
    opening: str = field(default="")
    org_desc: str = field(default="")
    need: str = field(default="")
    project: str = field(default="")
    request: str = field(default="")
    conclusion: str = field(default="")

    @property
    def letter(self):
        return f"""{self.opening}

{self.org_desc}

{self.need}

{self.project}

{self.request}

{self.conclusion}
"""


@tool
def estimate_budget(
    nonprofit_id: int,
    funder_id: int,
    project_description: str,
    # ctx: Context
) -> str:
    """This is an optional tool for estimating project budgets. Some users will already know what their budget is,
    or know how much money they are seeking from a grant, in which case this tool should not be used.

    This tool also provides guidance on setting a budget, and ultimately the user needs to decide based on the
    output from this tool.

    Parameters
    ----------
    nonprofit_id : int
        The unique identifier of the requesting organization. This will need to be found from a search using inputs
        elicited from the requester
    funder_id : int
        The unique identifier of the funding organization which may be awarding a grant to the requester.
        This will need to be found from a search using inputs elicited from the requester, or from recommendations
    project_description : str
        Natural language text describing the project/program that the user is requesting funding for

    Returns
    -------
    str
        Budget guidance, including context on the funder's ability to provide the budget in question
    """

    recip_data = httpx.get(
        url=GOLDEN_ORG.endpoint("v1/organization"),
        params={"id": nonprofit_id},
        headers={**GOLDEN_ORG.header},  # type: ignore
        timeout=30
    ).json().get("document_data", {}).get("preferred_data", {}).get("data", {})
    funder_data = httpx.get(
        url=GOLDEN_ORG.endpoint("v1/organization"),
        params={"id": funder_id},
        headers={**GOLDEN_ORG.header},  # type: ignore
        timeout=30
    ).json().get("document_data", {}).get("preferred_data", {}).get("data", {})
    return httpx.post(
        url=LOI_WRITER.endpoint("budget"),
        json={
            "recipient_candid_entity_id": nonprofit_id,
            "program_description": project_description,
            "recipient_data": recip_data,
            "funder_data": funder_data
        },
        headers={**LOI_WRITER.header},  # type: ignore
        timeout=30
    ).json().get("response", "No budget could be estimated")


@tool
def draft_loi(
    nonprofit_id: int,
    funder_id: int,
    project_description: str,
    budget: int,
) -> str:
    """Generate a letter of interest/intent from a chain-of-thought prompt chain using Candid's golden data and any
    inputs specified by the user, and/or recommended content.

    The output of this tool is the actual letter draft, please do not make changes to it other than adding headers
    and/or footers.

    Parameters
    ----------
    nonprofit_id : int
        The unique identifier of the requesting organization. This will need to be found from a search using inputs
        elicited from the requester
    funder_id : int
        The unique identifier of the funding organization which may be awarding a grant to the requester.
        This will need to be found from a search using inputs elicited from the requester, or from recommendations
    project_description : str
        Natural language text describing the project/program that the user is requesting funding for
    budget : int
        The dollar amount (in USD) that the user is requesting for funding. This should be specified by the user,
        prompt for this if needed.

    Returns
    -------
    str
        Formatted letter of interest
    """

    client = httpx.Client(headers={**LOI_WRITER.header}, timeout=30, base_url=LOI_WRITER.url)  # type: ignore

    def _make_request(ept: str, payload: dict):
        # return httpx.get(
        #     url=LOI_WRITER.endpoint(ept),
        #     params=payload,
        #     headers={**LOI_WRITER.header},  # type: ignore
        #     timeout=30
        # ).json().get("response", "")

        return client.get(url=LOI_WRITER.endpoint(ept), params=payload).json().get("response", "")

    data = _make_request(
        ept="organization/autofill",
        payload={"recipient_candid_entity_id": nonprofit_id, "funder_candid_entity_id": funder_id}
    )

    recip: dict = data.get("recipient_data", {})
    funder: dict = data.get("funder_data", {})
    pair_history: str = data.get("funding_history_text", "")

    sections = (
        ("opening", "writer/opening"),
        ("organization description", "writer/org"),
        ("need statement", "writer/need"),
        ("project description", "writer/project"),
        ("funding request", "writer/fund"),
        ("conclusion", "writer/conclusion")
    )

    output = LetterOfInterest()
    for _, (section, endpoint) in enumerate(sections, start=1):
        if section == "opening":
            output.opening = _make_request(
                ept=endpoint,
                payload={
                    "funder_name": [
                        n["name"] for n in funder.get("org_data", {}).get("names", [])
                        if n["name_type"] == "main"
                    ][0],
                    "recipient_name": [
                        n["name"] for n in recip.get("org_data", {}).get("names", [])
                        if n["name_type"] == "main"
                    ][0],
                    "project_purpose": project_description,
                    "amount": budget,
                    "prior_contact": None,
                    "connection": None
                }
            )
        elif section == "organization description":
            output.org_desc = _make_request(
                ept=endpoint,
                payload={
                    "opening": output.opening,
                    "history": pair_history,
                    "recipient_mission_statement": recip.get("mission_statement_text", ""),
                    "capacity": recip.get("capacity_text", ""),
                    "path": None,
                    "accomplishment": recip.get("data_text", "")
                }
            )
        elif section == "need statement":
            output.need = httpx.get(
                url=GOLDEN_ORG.endpoint(endpoint),
                params={
                    "recipient_desc": output.org_desc,
                    "funder_mission_statement": funder.get("mission_statement_text", ""),
                    "target": None,
                    "data": None,
                },
                headers={**GOLDEN_ORG.header},  # type: ignore
                timeout=30
            ).json().get("response", "")
        elif section == "project description":
            output.project = _make_request(
                ept=endpoint,
                payload={
                    "need": output.need,
                    "projects": project_description,
                    "desired_objectives": None,
                    "major_activities": None,
                    "key_staff": None,
                    "stand_out": None,
                    "success": None
                }
            )
        elif section == "funding request":
            output.request = _make_request(
                ept=endpoint,
                payload={
                    "project_desc": output.project,
                    "amount": budget,
                    "funding_history": pair_history,
                    "other_funding": None,
                }
            )
        elif section == "conclusion":
            output.conclusion = _make_request(
                ept=endpoint,
                payload={
                    "funding_request": output.request,
                    "project_desc": output.project,
                    "follow_up": recip.get("contact_text", ""),
                }
            )

    client.close()
    return output.letter
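
`LetterOfInterest` is just an accumulator: each prompt-chain step fills one section, and the `letter` property joins them with blank lines. A small local example:

from ask_candid.tools.letter_gen import LetterOfInterest

loi = LetterOfInterest(
    opening="Dear Example Foundation,",
    org_desc="Our organization operates shelters across the city.",
    request="We respectfully request $50,000."
)
print(loi.letter)  # unfilled sections (need, project, conclusion) render as blank lines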
ask_candid/tools/nlp.py
ADDED

@@ -0,0 +1,83 @@
from typing import Any
import logging

from langchain_core.tools import tool
import httpx

from ask_candid.base.utils import retry_on_status
from ask_candid.base.config.rest import AUTOCODING, DOCUMENT

logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


@retry_on_status(num_retries=3)
def get_with_retries(url: str, payload: dict[str, Any] | None, headers: dict[str, str] | None):
    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
        return client.get(url=url, params=payload, headers=headers)


@tool
def autocode(text: str) -> dict[str, list] | str:
    """Uses natural language processing to align some input text to Candid's taxonomy: https://taxonomy.candid.org.
    The taxonomy describes activity in the social and philanthropic sectors.

    Parameters
    ----------
    text : str
        Text describing work in the social sector. This should be related to the social and/or philanthropic
        sector.

    Returns
    -------
    dict[str, list] | str
        Taxonomy responses. The keys of the dictionary are individual taxonomy facets, and the items in the
        dictionary are each term which the NLP model has determined is relevant given the input text. This also
        includes a confidence score.
    """

    r = httpx.get(
        url=AUTOCODING.endpoint("predict"),
        params={"text": text},
        headers={**AUTOCODING.header}  # type: ignore
    )

    if r.status_code != 200:
        logger.error("Error calling autocoding API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling autocoding. Error: {r.reason_phrase}"

    data: dict = r.json().get("data", {})
    return {k: v for k, v in data.items() if k in {"subject", "population"}}


@tool
def geo_detect(text: str) -> list[dict[str, Any]] | str:
    """Uses natural language processing to find and match named geographies found in the supplied text. The output
    will supply identified geographies from [Geonames](https://www.geonames.org/).

    Parameters
    ----------
    text : str
        Text describing work in the social sector. This should be related to the social and/or philanthropic
        sector.

    Returns
    -------
    list[dict[str, Any]] | str
        Matched geographies responses. This is an array of JSON objects which contain the `name` of the geography
        as it appeared in the supplied text, and the best match to a Geonames geography. For many Candid knowledge
        tools the `geonames_id` value will be most useful.
        If output is a string then that means there was some error, and retry should be considered
    """

    r = get_with_retries(
        url=DOCUMENT.endpoint("entities/geographies"),
        payload={"text": text, "only_best_match": True},
        headers={**DOCUMENT.header}
    )
    assert isinstance(r, httpx.Response)
    if r.status_code != 200:
        logger.error("Error calling geo detection API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling geo detection. Error: {r.reason_phrase}"

    data: list = r.json().get("entities", [])
    return [{"name": entity["name"], "match": entity["match"][:1]} for entity in data if entity.get("type") == "geo"]
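
`retry_on_status` is imported from `ask_candid.base.utils` and its implementation is not part of this commit. A hypothetical minimal sketch of such a decorator, for illustration only, would re-issue the wrapped call while the response carries a server-side error status:

import time
from functools import wraps

import httpx


def retry_on_status(num_retries: int = 3, backoff: float = 1.0):
    # Hypothetical sketch; the real decorator lives in ask_candid.base.utils.
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs) -> httpx.Response:
            r = fn(*args, **kwargs)
            for attempt in range(num_retries):
                if r.status_code < 500:  # retry only on server-side failures
                    return r
                time.sleep(backoff * (attempt + 1))
                r = fn(*args, **kwargs)
            return r
        return wrapper
    return decorator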
ask_candid/tools/recommendations.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+from typing import Literal, Any
+import logging
+
+from langchain_core.tools import tool
+import httpx
+
+from ask_candid.tools.utils import format_candid_profile_link
+from ask_candid.base.utils import retry_on_status
+from ask_candid.base.config.rest import FUNDER_RECOMMENDATION, SEARCH
+
+logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.ERROR)
+
+
+@retry_on_status(num_retries=3)
+def get_with_retries(url: str, payload: dict[str, Any] | None, headers: dict[str, str] | None) -> httpx.Response:
+    # httpx transport retries cover connection failures; retry_on_status re-issues on bad status codes
+    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
+        return client.get(url=url, params=payload, headers=headers)
+
+
+@tool
+def organization_search(
+    query: str,
+    located_postal_code: str | None = None,
+    located_admin1: str | None = None,
+    search_mode: Literal["organization_only", "organization_and_grants"] | None = "organization_only"
+) -> list[dict[str, str | None]] | str:
+    """Search for organizations by name, description of work, program descriptions and locations. Here are some
+    guidelines:
+    * `query` controls hybrid searching involving both vector search and keyword search
+    * `query` can be used to find organizations based on a description of work
+    * if the query is intended to be a lookup of an organization by name, then adding quotes around the `query`
+      string circumvents vector search and prioritizes keyword matching on names
+      (e.g. `query=Candid` --> `query='Candid'`)
+    * if the query is an EIN (e.g. 12-3456789) then keyword searching is prioritized to get exact matches
+    * adding location information such as postal codes and/or admin1 (state/province abbreviations) will filter
+      results
+
+    This tool should be used as a first step in any downstream task which requires identifying the nonprofit that
+    the user represents. Often the `nonprofit_id` is required, and it can be found via a search.
+
+    Parameters
+    ----------
+    query : str
+        Free-text query which drives the search functionality. This uses a hybrid approach of vector and keyword
+        searching, but under certain conditions expressed in the guidelines this may disable vector search.
+    located_postal_code : str | None, optional
+        Postal code of the organization to be searched, if provided, by default None
+    located_admin1 : str | None, optional
+        Admin1 code (state/province abbreviation) of the organization to be searched, if provided, by default None
+    search_mode : Literal["organization_only", "organization_and_grants"] | None, optional
+        Choose how to search for organizations. If `None` or "organization_and_grants" then this will examine
+        evidence at the organization level as well as at the historical grant transaction level, capturing activity
+        evidence. For name lookups it is best to use the "organization_only" default value, by default
+        "organization_only"
+
+    Returns
+    -------
+    list[dict[str, str | None]] | str
+        List of the top organization search results.
+        If the output is a string then there was some error, and a retry should be considered.
+    """
+
+    payload = {"query": query, "searchMode": search_mode, "rowCount": 5}
+    if located_postal_code is not None:
+        payload["postalCode"] = located_postal_code
+    if located_admin1 is not None:
+        payload["admin1"] = located_admin1
+
+    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
+        r = client.get(
+            url=SEARCH.endpoint("v1/search"),
+            params=payload,
+            headers={**SEARCH.header}  # type: ignore
+        )
+
+    if r.status_code != 200:
+        logger.error("Error calling organization search API %s. Error: %s", str(r.request.url), r.reason_phrase)
+        return f"Error calling organization search. Error: {r.reason_phrase}"
+
+    data: dict = r.json()
+
+    output = []
+    for org in data.get("returnedOrgs") or []:
+        working_on, serving = [], []
+        for code, description in org["taxonomy"].items():
+            code: str
+            description: str
+
+            # PCS taxonomy: 'P' codes describe populations served, 'S' codes describe subject areas
+            if code.startswith('P') and len(code) > 2:
+                serving.append(description.lower())
+            elif code.startswith('S'):
+                working_on.append(description.lower())
+
+        output.append({
+            "nonprofit_id": org["candidEntityID"],
+            "name": org["orgName"],
+            "aka_name": org["akaName"],
+            "acronym": org["acronymName"],
+            "city": org["city"],
+            "admin1": org["admin1"],
+            "country": org["countryName"],
+            "EIN": org["ein"],
+            "profile_link": format_candid_profile_link(org['candidEntityID']),
+            "working_on": f"Working on {', '.join(working_on)}",
+            "serving": f"Serving population groups {', '.join(serving)}",
+            "transparency_level": org["seal"].get("description"),
+            "organization_roles": ', '.join(org["roles"]),
+            "grants_awarded": ', '.join([f"{k}: {v}" for k, v in org["transactionsGiven"].items()]),
+            "grants_received": ', '.join([f"{k}: {v}" for k, v in org["transactionsReceived"].items()])
+        })
+    return output
+
+
+@tool
+def recommend_funders(
+    nonprofit_id: int,
+    subject_codes_of_program: str | None = None,
+    populations_served_codes_of_program: str | None = None,
+    geonameids_of_geographies_served: str | None = None,
+    include_past_funders: bool = False
+) -> tuple[dict[str, Any], list[dict[str, Any]]] | str:
+    """Recommend potential funding organizations to a nonprofit seeking a grant.
+
+    These recommendations are built using machine learning over a heterogeneous knowledge graph representing the
+    work of the requesting organization and the contextual recent activities of potential funders and their grant
+    recipients.
+
+    While extra subject codes, populations served codes, and geography IDs for where the program takes place are
+    not required, recommendations tend to improve and become more specific as more information is provided.
+
+    Subjects and populations can be determined using the `autocode` tool if the requester can supply a description
+    of the program they are seeking funding for.
+
+    Geographies can be determined using the geo detection tool if the requester can supply a description of the
+    program they are seeking funding for.
+
+    Key Usage Requirements:
+    - Always incorporate returned profile URLs directly into the response text
+    - Replace funding organization name mentions with hyperlinked Candid profile URLs
+    - Prioritize creating a seamless user experience by making URLs contextually relevant
+    - Use relevant recipient data as well as inferred metadata to provide explanations about recommendation
+      relevance
+
+    Parameters
+    ----------
+    nonprofit_id : int
+        The unique identifier of the requesting organization. This will need to be found from a search using inputs
+        elicited from the requester
+    subject_codes_of_program : str | None, optional
+        Subject codes from Candid's PCS taxonomy, comma separated, by default None
+    populations_served_codes_of_program : str | None, optional
+        Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
+    geonameids_of_geographies_served : str | None, optional
+        Geonames ID values for geographies served by the requester's program, comma separated, by default None
+    include_past_funders : bool, optional
+        Boolean flag indicating whether previous funders of the organization identified by `nonprofit_id` should be
+        included. If the requester would like to reconsider previous funding organizations then set this to `True`,
+        but the requester MUST be prompted to indicate this preference. Using the default value will help the
+        requester discover new, potentially relevant funders, by default False
+
+    Examples
+    --------
+    >>> recommend_funders(nonprofit_id=9981881)
+    >>> recommend_funders(
+    ...     nonprofit_id=9173173,
+    ...     subject_codes_of_program='SS050000,SS000000,SB050000',
+    ...     populations_served_codes_of_program='PJ050100',
+    ...     geonameids_of_geographies_served='4094212,4094212'
+    ... )
+
+    Returns
+    -------
+    tuple[dict[str, Any], list[dict[str, Any]]] | str
+        (Inferred data used to generate recommendations, array of funders being recommended)
+        If the output is a string then there was some error, and a retry should be considered.
+    """
+
+    payload = {
+        "candid_entity_id": nonprofit_id,
+        "use_programs": True,
+        "top_k": 5,
+        "include_past_funders": include_past_funders
+    }
+
+    if subject_codes_of_program is not None:
+        payload["subjects"] = subject_codes_of_program
+    if populations_served_codes_of_program is not None:
+        payload["populations"] = populations_served_codes_of_program
+    if geonameids_of_geographies_served:
+        payload["geos"] = geonameids_of_geographies_served
+
+    r = get_with_retries(
+        url=FUNDER_RECOMMENDATION.endpoint("funder/pcs-v3"),
+        payload=payload,
+        headers={**FUNDER_RECOMMENDATION.header}
+    )
+    assert isinstance(r, httpx.Response)
+    if r.status_code != 200:
+        logger.error("Error calling funder recommendations API %s. Error: %s", str(r.request.url), r.reason_phrase)
+        return f"Error calling funder recommendations. Error: {r.reason_phrase}"
+
+    data: dict = r.json()
+    return (
+        data.get("meta") or {},
+        [{
+            **rec,
+            "candid_profile_url": format_candid_profile_link(rec['funder_id'])
+        } for rec in (data.get("recommendations") or [])]
+    )
+
+
+@tool
+def recommend_funding_opportunities(
+    nonprofit_id: int,
+    subject_codes_of_program: str | None = None,
+    populations_served_codes_of_program: str | None = None,
+    geonameids_of_geographies_served: str | None = None
+) -> tuple[dict[str, Any], list[dict[str, Any]]] | str:
+    """Recommend active funding opportunities (RFPs) to a nonprofit seeking a grant.
+
+    These recommendations are built using machine learning over a heterogeneous knowledge graph representing the
+    work of the requesting organization and the contextual recent activities of potential funders and their grant
+    recipients.
+
+    While extra subject codes, populations served codes, and geography IDs for where the program takes place are
+    not required, recommendations tend to improve and become more specific as more information is provided.
+
+    Subjects and populations can be determined using the `autocode` tool if the requester can supply a description
+    of the program they are seeking funding for.
+
+    Key Usage Requirements:
+    - Always incorporate returned profile URLs directly into the response text
+    - Replace funding organization name mentions with hyperlinked Candid profile URLs
+    - Prioritize creating a seamless user experience by making URLs contextually relevant
+    - Use inferred metadata to provide explanations about recommendation relevance
+
+    Parameters
+    ----------
+    nonprofit_id : int
+        The unique identifier of the requesting organization. This will need to be found from a search using inputs
+        elicited from the requester
+    subject_codes_of_program : str | None, optional
+        Subject codes from Candid's PCS taxonomy, comma separated, by default None
+    populations_served_codes_of_program : str | None, optional
+        Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
+    geonameids_of_geographies_served : str | None, optional
+        Geonames ID values for geographies served by the requester's program, comma separated, by default None
+
+    Examples
+    --------
+    >>> recommend_funding_opportunities(nonprofit_id=9981881)
+    >>> recommend_funding_opportunities(
+    ...     nonprofit_id=9173173,
+    ...     subject_codes_of_program='SS050000,SS000000,SB050000',
+    ...     populations_served_codes_of_program='PJ050100',
+    ...     geonameids_of_geographies_served='4094212,4094212'
+    ... )
+
+    Returns
+    -------
+    tuple[dict[str, Any], list[dict[str, Any]]] | str
+        (Inferred data used to generate recommendations, array of active funding opportunities being recommended)
+        If the output is a string then there was some error, and a retry should be considered.
+    """
+
+    payload = {"candid_entity_id": nonprofit_id, "use_programs": True, "top_k": 5}
+    if subject_codes_of_program is not None:
+        payload["subjects"] = subject_codes_of_program
+    if populations_served_codes_of_program is not None:
+        payload["populations"] = populations_served_codes_of_program
+    if geonameids_of_geographies_served:
+        payload["geos"] = geonameids_of_geographies_served
+
+    r = get_with_retries(
+        url=FUNDER_RECOMMENDATION.endpoint("rfp/pcs-v3"),
+        payload=payload,
+        headers={**FUNDER_RECOMMENDATION.header}
+    )
+    assert isinstance(r, httpx.Response)
+    if r.status_code != 200:
+        logger.error("Error calling RFP recommendation API %s. Error: %s", str(r.request.url), r.reason_phrase)
+        return f"Error calling RFP recommendations. Error: {r.reason_phrase}"
+
+    data: dict = r.json()
+    return (
+        data.get("meta") or {},
+        [{
+            **rec,
+            "candid_profile_url": format_candid_profile_link(rec['funder_id'])
+        } for rec in (data.get("recommendations") or [])]
+    )
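
For orientation, here is a minimal usage sketch of the new tools: resolve a `nonprofit_id` via `organization_search`, then feed it to `recommend_funders`. This is illustrative only — the query string, the PCS subject code, and the error handling below are assumptions, and both tools require the `SEARCH` and `FUNDER_RECOMMENDATION` API credentials to be configured.

# Minimal sketch (not part of the commit); assumes valid API credentials are configured.
from ask_candid.tools.recommendations import organization_search, recommend_funders

# Step 1: name lookup; quoting the query prioritizes keyword matching over vector search.
orgs = organization_search.invoke({"query": "'Candid'", "search_mode": "organization_only"})
if isinstance(orgs, str):  # a string return signals an API error; consider retrying
    raise RuntimeError(orgs)

# Step 2: request funder recommendations for the top search hit.
result = recommend_funders.invoke({
    "nonprofit_id": int(orgs[0]["nonprofit_id"]),
    "subject_codes_of_program": "SS050000"  # hypothetical PCS subject code
})
if isinstance(result, str):
    raise RuntimeError(result)
meta, funders = result
for funder in funders:
    print(funder["candid_profile_url"])
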
ask_candid/tools/search.py
CHANGED
@@ -1,7 +1,7 @@
 from langchain_core.documents import Document
 from langchain_core.tools import tool
 
-from ask_candid.
+from ask_candid.services.knowledge_base import (
     SourceNames,
     generate_queries,
     run_search,
ask_candid/utils.py
CHANGED
@@ -1,47 +1,15 @@
-from typing import
+from typing import Any
 from uuid import uuid4
 
 from langchain_core.documents import Document
 
-from ask_candid.retrieval.sources import (
-    candid_blog,
-    candid_help,
-    candid_learning,
-    issuelab,
-    youtube
-)
 
-
-def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
-    height_px = 200
-    html = ""
-
-    if source == "news":
-        # html = news.article_card_html(doc, height_px, show_chunks)
-        pass
-    elif source == "transactions":
-        pass
-    elif source == "organizations":
-        pass
-    elif source == "issuelab":
-        html = issuelab.issuelab_card_html(doc, height_px, show_chunks)
-    elif source == "youtube":
-        html = youtube.build_card_html(doc, 400, show_chunks)
-    elif source == "candid_blog":
-        html = candid_blog.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_learning":
-        html = candid_learning.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_help":
-        html = candid_help.build_card_html(doc, height_px, show_chunks)
-    return html
-
-
-def html_format_docs_chat(docs: List[Document]) -> str:
+def html_format_docs_chat(docs: list[Document]) -> str:
     """Formats Candid sources
 
     Parameters
     ----------
-    docs :
+    docs : list[Document]
         Retrieved documents for context
 
     Returns
@@ -69,7 +37,7 @@ def html_format_docs_chat(docs: List[Document]) -> str:
     return html
 
 
-def format_chat_ag_response(chatbot:
+def format_chat_ag_response(chatbot: list[Any]) -> list[Any]:
     """If we called the retriever, we appended sources as one more message. Here we concatenate the HTML of sources
     with the AI response
     Returns:
@@ -89,7 +57,7 @@ def valid_inputs(*args) -> bool:
     return any(a is not None or (isinstance(a, str) and a.strip() != '') for a in args)
 
 
-def get_session_id(thread_id:
+def get_session_id(thread_id: str | None) -> str:
    if not thread_id:
        thread_id = uuid4().hex
    return thread_id
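
The slimmed-down module leaves `get_session_id` as a simple pass-through that only mints an ID when none exists; a quick illustration of that contract (an editor's sketch, not part of the commit):

from ask_candid.utils import get_session_id

assert get_session_id("abc123") == "abc123"  # existing thread IDs pass through unchanged
assert len(get_session_id(None)) == 32       # falsy input yields a fresh 32-character hex UUID4
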
requirements.txt
CHANGED
@@ -9,7 +9,7 @@ langgraph-prebuilt==0.6.4
 pydantic==2.10.6
 pyopenssl>22.0.0
 python-dotenv
-transformers
+transformers>=4.56.1
 
 --find-links https://download.pytorch.org/whl/cpu
 torch
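
To confirm that a deployed environment actually satisfies the new version floor, a quick check along these lines can help (assuming the `packaging` library is available, as it typically is in pip-managed environments):

from importlib.metadata import version
from packaging.version import Version

assert Version(version("transformers")) >= Version("4.56.1")
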