Update issuelab query technique
* quasi-sparse vector approach for issuelab
* summarize issuelab articles prior to LLM handoff
* pin transformers version to avoid major bug
- ask_candid/base/retrieval/elastic.py +76 -8
- ask_candid/base/retrieval/sources.py +9 -2
- ask_candid/services/knowledge_base.py +420 -0
- ask_candid/services/small_lm.py +8 -2
- ask_candid/tools/grants.py +113 -0
- ask_candid/tools/letter_gen.py +230 -0
- ask_candid/tools/nlp.py +83 -0
- ask_candid/tools/recommendations.py +287 -0
- ask_candid/tools/search.py +1 -1
- ask_candid/utils.py +5 -37
- requirements.txt +1 -1
ask_candid/base/retrieval/elastic.py
CHANGED

@@ -21,14 +21,14 @@ def build_sparse_vector_query(
     ----------
     query : str
         Search context string
-    fields :
+    fields : tuple[str, ...]
         Semantic text field names
     inference_id : str, optional
         ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"

     Returns
     -------
-
+    dict[str, Any]
     """

     output = []
@@ -70,20 +70,20 @@ def build_sparse_vector_and_text_query(
     ----------
     query : str
         Search context string
-    semantic_fields :
+    semantic_fields : tuple[str]
         Semantic text field names
-    highlight_fields:
+    highlight_fields: tuple[str]
         Fields which relevant chunks will be helpful for the agent to read
-    text_fields :
+    text_fields : tuple[str]
         Regular text fields
-    excluded_fields :
+    excluded_fields : tuple[str]
         Fields to exclude from the source
     inference_id : str, optional
         ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"

     Returns
     -------
-
+    dict[str, Any]
     """

     output = []
@@ -149,7 +149,7 @@ def news_query_builder(

     Returns
     -------
-
+    dict[str, Any]
     """

     tokens = encoder.token_expand(query)
@@ -180,11 +180,79 @@
     return elastic_query


+def issuelab_query_builder(
+    query: str,
+    fields: tuple[str, ...],
+    highlight_fields: tuple[str, ...] | None,
+    encoder: SpladeEncoder,
+) -> dict[str, Any]:
+    """Builds a valid Elasticsearch query against Issuelab, simulating a token expansion.
+
+    Parameters
+    ----------
+    query : str
+        Search context string
+
+    Returns
+    -------
+    dict[str, Any]
+    """
+
+    tokens = encoder.token_expand(query)
+
+    elastic_query = {
+        "_source": ["issuelab_id", "issuelab_url", "title", "description", "content"],
+        "query": {
+            "bool": {
+                # "filter": [
+                #     # {"range": {"event_date": {"gte": f"now-{days_ago}d/d"}}},
+                #     # {"range": {"insert_date": {"gte": f"now-{days_ago}d/d"}}},
+                #     # {"range": {"article_trust_worthiness": {"gt": NEWS_TRUST_SCORE_THRESHOLD}}}
+                # ],
+                "should": []
+            }
+        },
+        "highlight": {
+            "fields": dict.fromkeys(highlight_fields or ("content", "description"), {})
+        }
+    }
+
+    for token, score in tokens.items():
+        if score > SPARSE_ENCODING_SCORE_THRESHOLD:
+            elastic_query["query"]["bool"]["should"].append({
+                "multi_match": {
+                    "query": token,
+                    "fields": fields,
+                    "boost": score
+                }
+            })
+    return elastic_query
+
+
 def multi_search_base(
     queries: list[dict[str, Any]],
     credentials: BaseElasticSearchConnection | BaseElasticAPIKeyCredential,
     timeout: int = 180
 ) -> Iterator[dict[str, Any]]:
+    """Handles multi-search queries on a single cluster given the relevant credentials object.
+
+    Parameters
+    ----------
+    queries : list[dict[str, Any]]
+        `msearch` query object (see: https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-msearch)
+    credentials : BaseElasticSearchConnection | BaseElasticAPIKeyCredential
+    timeout : int, optional, by default 180
+
+    Yields
+    ------
+    Iterator[dict[str, Any]]
+
+    Raises
+    ------
+    TypeError
+        Raised if invalid credentials are passed
+    """
+
     if isinstance(credentials, BaseElasticAPIKeyCredential):
         es = Elasticsearch(
             cloud_id=credentials.cloud_id,
ask_candid/base/retrieval/sources.py
CHANGED

@@ -25,9 +25,16 @@ CandidNewsConfig = ElasticSourceConfig(
 )


+# IssueLabConfig = ElasticSourceConfig(
+#     index_name="search-semantic-issuelab-elser_ve2",
+#     semantic_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
+# )
+
 IssueLabConfig = ElasticSourceConfig(
-    index_name="search-semantic-issuelab-elser_ve2",
-    semantic_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
+    index_name="issuelab_prod_data",
+    # semantic_fields=("title", "description", "content"),
+    semantic_fields=("title", "description", "content^0.3"),
+    highlight_fields=("description", "content")
 )

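
The `content^0.3` entry uses Elasticsearch's per-field boost syntax: in a `multi_match`, `field^weight` scales that field's score contribution, so long-body matches are down-weighted relative to `title` and `description`. For illustration, a single clause emitted by the builder above looks like:

query_clause = {
    "multi_match": {
        "query": "homeless",                                    # one expansion token
        "fields": ["title", "description", "content^0.3"],      # body matches count 0.3x
        "boost": 1.8                                            # token weight from the encoder
    }
}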
ask_candid/services/knowledge_base.py
ADDED

@@ -0,0 +1,420 @@
from typing import Literal, Any
from collections.abc import Iterator, Iterable
from itertools import groupby
import logging

from langchain_core.documents import Document

from ask_candid.base.retrieval.elastic import (
    build_sparse_vector_query,
    build_sparse_vector_and_text_query,
    news_query_builder,
    issuelab_query_builder,
    multi_search_base
)
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
from ask_candid.base.retrieval.schemas import ElasticHitsResult
import ask_candid.base.retrieval.sources as S
from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
from ask_candid.services.small_lm import CandidSmallLanguageModel

SourceNames = Literal[
    "Candid Blog",
    "Candid Help",
    "Candid Learning",
    "Candid News",
    "IssueLab Research Reports",
    "YouTube Training"
]
sparse_encoder = SpladeEncoder()
logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


# TODO remove
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
    """Pads the relevant chunk of text with context before and after

    Parameters
    ----------
    field_name : str
        a field with the long text that was chunked into pieces
    hit : ElasticHitsResult
    context_length : int, optional
        length of text to add before and after the chunk, by default 1024
    add_context : bool, optional
        Set to `False` to skip expanding the text context by searching for the Elastic inner hit inside the larger
        document, by default True

    Returns
    -------
    str
        longer chunks stuffed together
    """

    chunks = []
    # NOTE chunks have tokens, long text is a string, but may contain html which affects tokenization
    long_text = hit.source.get(field_name) or ""
    long_text = long_text.lower()

    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field, {}) if hit.inner_hits else None
    if found_chunks:
        for h in found_chunks.get("hits", {}).get("hits") or []:
            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]

            # cutting the middle because we may have tokenizing artifacts there
            chunk = chunk[3:-3]

            if add_context:
                # Find the start and end indices of the chunk in the large text
                start_index = long_text.find(chunk[:20])

                # Chunk is found
                if start_index != -1:
                    end_index = start_index + len(chunk)
                    pre_start_index = max(0, start_index - context_length)
                    post_end_index = min(len(long_text), end_index + context_length)
                    chunks.append(long_text[pre_start_index:post_end_index])
            else:
                chunks.append(chunk)
    return '\n\n'.join(chunks)


def generate_queries(
    query: str,
    sources: list[SourceNames],
    news_days_ago: int = 60
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Builds Elastic queries against indices which do or do not support sparse vector queries.

    Parameters
    ----------
    query : str
        Text describing a user's question or a description of investigative work which requires support from
        Candid's knowledge base
    sources : list[SourceNames]
        One or more sources of knowledge from different areas at Candid.
        * Candid Blog: Blog posts from Candid staff and trusted partners intended to help those in the sector or
          illuminate ongoing work
        * Candid Help: Candid FAQs to help users get started with Candid's product platform and learning resources
        * Candid Learning: Training documents from Candid's subject matter experts
        * Candid News: News articles and press releases about real-time activity in the philanthropic sector
        * IssueLab Research Reports: Academic research reports about the social/philanthropic sector
        * YouTube Training: Transcripts from video-based training seminars from Candid's subject matter experts
    news_days_ago : int, optional
        How many days in the past to search for news articles; if a user is asking for recent trends then this
        value should be set lower (~10), by default 60

    Returns
    -------
    tuple[list[dict[str, Any]], list[dict[str, Any]]]
        (sparse vector queries, queries for indices which do not support sparse vectors)
    """

    vector_queries = []
    quasi_vector_queries = []

    for source_name in sources:
        if source_name == "Candid Blog":
            q = build_sparse_vector_query(query=query, fields=S.CandidBlogConfig.semantic_fields)
            q["_source"] = {"excludes": ["embeddings"]}
            q["size"] = 5
            vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
        elif source_name == "Candid Help":
            q = build_sparse_vector_query(query=query, fields=S.CandidHelpConfig.semantic_fields)
            q["_source"] = {"excludes": ["embeddings"]}
            q["size"] = 5
            vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
        elif source_name == "Candid Learning":
            q = build_sparse_vector_query(query=query, fields=S.CandidLearningConfig.semantic_fields)
            q["_source"] = {"excludes": ["embeddings"]}
            q["size"] = 5
            vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
        elif source_name == "Candid News":
            q = news_query_builder(
                query=query,
                fields=S.CandidNewsConfig.semantic_fields,
                encoder=sparse_encoder,
                days_ago=news_days_ago
            )
            q["size"] = 5
            quasi_vector_queries.extend([{"index": S.CandidNewsConfig.index_name}, q])
        elif source_name == "IssueLab Research Reports":
            # q = build_sparse_vector_query(query=query, fields=S.IssueLabConfig.semantic_fields)
            # q["_source"] = {"excludes": ["embeddings"]}
            # q["size"] = 1
            # vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])

            q = issuelab_query_builder(
                query=query,
                fields=S.IssueLabConfig.semantic_fields,
                highlight_fields=S.IssueLabConfig.highlight_fields,
                encoder=sparse_encoder,
            )
            q["size"] = 1
            quasi_vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])
        elif source_name == "YouTube Training":
            q = build_sparse_vector_and_text_query(
                query=query,
                semantic_fields=S.YoutubeConfig.semantic_fields,
                text_fields=S.YoutubeConfig.text_fields,
                highlight_fields=S.YoutubeConfig.highlight_fields,
                excluded_fields=S.YoutubeConfig.excluded_fields
            )
            q["size"] = 5
            vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])

    return vector_queries, quasi_vector_queries


def run_search(
    vector_searches: list[dict[str, Any]] | None = None,
    non_vector_searches: list[dict[str, Any]] | None = None,
) -> list[ElasticHitsResult]:
    """Elastic query runner which executes both sparse vector and quasi-sparse vector queries and concatenates
    results. This does not include re-ranking.

    Parameters
    ----------
    vector_searches : list[dict[str, Any]] | None, optional
        Sparse vector multi-search queries, by default None
    non_vector_searches : list[dict[str, Any]] | None, optional
        Keyword-based multi-search queries, by default None

    Returns
    -------
    list[ElasticHitsResult]
        Concatenated results
    """

    def _msearch_response_generator(responses: Iterable[dict[str, Any]]) -> Iterator[ElasticHitsResult]:
        for query_group in responses:
            for h in query_group.get("hits", {}).get("hits", []):
                inner_hits = h.get("inner_hits", {})

                if not inner_hits and "news" in h.get("_index"):
                    inner_hits = {"text": h.get("_source", {}).get("content")}
                if not inner_hits and "issuelab" in h.get("_index"):
                    inner_hits = {"text": h.get("_source", {}).get("content")}

                yield ElasticHitsResult(
                    index=h["_index"],
                    id=h["_id"],
                    score=h["_score"],
                    source=h["_source"],
                    inner_hits=inner_hits,
                    highlight=h.get("highlight", {})
                )

    results = []
    if vector_searches is not None and len(vector_searches) > 0:
        hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC_QA)
        for hit in _msearch_response_generator(responses=hits):
            results.append(hit)
    if non_vector_searches is not None and len(non_vector_searches) > 0:
        hits = multi_search_base(queries=non_vector_searches, credentials=NEWS_ELASTIC)
        for hit in _msearch_response_generator(responses=hits):
            results.append(hit)
    return results


def retrieved_text(hits: dict[str, Any]) -> str:
    """Extracts retrieved sub-texts from documents which are strong hits from semantic queries for the purpose of
    re-scoring by a secondary language model.

    Parameters
    ----------
    hits : dict[str, Any]

    Returns
    -------
    str
    """

    nlp = CandidSmallLanguageModel()

    text = []
    for key, v in hits.items():
        if key == "text":
            s = nlp.summarize(v, top_k=3)
            text.append(s.summary)
            # text.append(v)
            continue

        for h in (v.get("hits", {}).get("hits") or []):
            for _, field in h.get("fields", {}).items():
                for chunk in field:
                    if chunk.get("chunk"):
                        text.extend(chunk["chunk"])
    return '\n'.join(text)


def reranker(
    query_results: Iterable[ElasticHitsResult],
    search_text: str | None = None,
    max_num_results: int = 5
) -> Iterator[ElasticHitsResult]:
    """Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.
    This will shuffle results.

    Parameters
    ----------
    query_results : Iterable[ElasticHitsResult]

    Yields
    ------
    Iterator[ElasticHitsResult]
    """

    results: list[ElasticHitsResult] = []
    texts: list[str] = []
    for _, data in groupby(query_results, key=lambda x: x.index):
        data = list(data)  # noqa: PLW2901
        max_score = max(data, key=lambda x: x.score).score
        min_score = min(data, key=lambda x: x.score).score

        for d in data:
            d.score = (d.score - min_score) / (max_score - min_score + 1e-9)
            results.append(d)

            if search_text:
                if d.inner_hits:
                    text = retrieved_text(d.inner_hits)
                if d.highlight:
                    highlight_texts = []
                    for k, v in d.highlight.items():
                        highlight_texts.append('\n'.join(v))
                    text = '\n'.join(highlight_texts)
                texts.append(text)

    if search_text and len(texts) == len(results) and len(texts) > 1:
        logger.info("Re-ranking %d retrieval results", len(results))
        scores = sparse_encoder.query_reranking(query=search_text, documents=texts)
        for r, s in zip(results, scores):
            r.score = s

    yield from sorted(results, key=lambda x: x.score, reverse=True)[:max_num_results]


def process_hit(hit: ElasticHitsResult) -> Document:
    """Process a raw Elasticsearch document into a structured langchain `Document` object.

    Parameters
    ----------
    hit : ElasticHitsResult

    Returns
    -------
    Document

    Raises
    ------
    ValueError
        Raised if a result from an unknown index is passed in
    """

    nlp = CandidSmallLanguageModel()

    if "issuelab-elser" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("combined_item_description", ""),
                hit.source.get("description", ""),
                hit.source.get("combined_issuelab_findings", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source["title"],
                "source": "IssueLab",
                "source_id": hit.source["resource_id"],
                "url": hit.source.get("permalink", "")
            }
        )
    elif "issuelab" in hit.index:
        content_summary = ""
        if hit.source.get("content", ""):
            content_summary = nlp.summarize(hit.source.get("content", ""), top_k=20).summary

        doc = Document(
            page_content='\n\n'.join([hit.source.get("description", ""), content_summary]),
            metadata={
                "title": hit.source["title"],
                "source": "IssueLab",
                "source_id": hit.source["issuelab_id"],
                "url": hit.source.get("issuelab_url", "")
            }
        )
    elif "youtube" in hit.index:
        highlight = hit.highlight or {}
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("semantic_description", ""),
                ' '.join(highlight.get("semantic_cc_text", []))
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid YouTube",
                "source_id": hit.source['video_id'],
                "url": f"https://www.youtube.com/watch?v={hit.source['video_id']}"
            }
        )
    elif "candid-blog" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("excerpt", ""),
                get_context("content", hit, context_length=12, add_context=False),
                get_context("authors_text", hit, context_length=12, add_context=False),
                hit.source.get("title_summary_tags", "")
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid Blog",
                "source_id": hit.source["id"],
                "url": hit.source["link"]
            }
        )
    elif "candid-learning" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("staff_recommendations", ""),
                hit.source.get("training_topics", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source["title"],
                "source": "Candid Learning",
                "source_id": hit.source["post_id"],
                "url": hit.source.get("url", "")
            }
        )
    elif "candid-help" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("combined_article_description", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid Help",
                "source_id": hit.source["id"],
                "url": hit.source.get("link", "")
            }
        )
    elif "news" in hit.index:
        doc = Document(
            page_content='\n\n'.join([hit.source.get("title", ""), hit.source.get("content", "")]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": hit.source.get("site_name") or "Candid News",
                "source_id": hit.source["id"],
                "url": hit.source.get("link", "")
            }
        )
    else:
        raise ValueError(f"Unknown source result from index {hit.index}")
    return doc
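
Taken together, this module is the retrieval pipeline. A usage sketch, assuming the `ask_candid` package is importable and live Elasticsearch credentials plus SPLADE weights are available (so this will not run offline):

from ask_candid.services.knowledge_base import generate_queries, run_search, reranker, process_hit

user_query = "recent trends in homelessness funding"

# 1. Build per-index queries, split by whether the index supports sparse vectors
vector_qs, quasi_qs = generate_queries(
    query=user_query,
    sources=["Candid News", "IssueLab Research Reports"],
    news_days_ago=10  # tighter window because the question asks about recent trends
)

# 2. Execute both msearch batches and concatenate the hits
hits = run_search(vector_searches=vector_qs, non_vector_searches=quasi_qs)

# 3. Min-max normalize per index, optionally re-score with the SPLADE encoder, keep the top 5
docs = [process_hit(h) for h in reranker(hits, search_text=user_query, max_num_results=5)]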
ask_candid/services/small_lm.py
CHANGED

@@ -8,12 +8,16 @@ from ask_candid.base.lambda_base import LambdaInvokeBase

 @dataclass(slots=True)
 class Encoding:
+    """Text encoding vector response
+    """
     inputs: list[str]
     vectors: torch.Tensor


 @dataclass(slots=True)
 class SummaryItem:
+    """A single summary object
+    """
     rank: int
     score: float
     text: str
@@ -21,6 +25,8 @@ class SummaryItem:

 @dataclass(slots=True)
 class TextSummary:
+    """Text summarization response
+    """
     snippets: list[SummaryItem]

     @property
@@ -28,7 +34,7 @@ class TextSummary:
         return ' '.join([_.text for _ in self.snippets])


-class CandidSLM(LambdaInvokeBase):
+class CandidSmallLanguageModel(LambdaInvokeBase):
     """Wrapper around Candid's custom small language model.
     For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
     This service includes:
@@ -44,7 +50,7 @@ class CandidSLM(LambdaInvokeBase):
         AWS secret key, by default None
     """

-    class Tasks(Enum): #
+    class Tasks(Enum):  # noqa: D106
         ENCODE = "/encode"
         DOCUMENT_SUMMARIZE = "/document/summarize"
         DOCUMENT_NER_SALIENCE = "/document/entitySalience"
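
A usage sketch of the renamed wrapper; the `summarize` signature is inferred from the call sites in knowledge_base.py (`nlp.summarize(text, top_k=...) -> TextSummary`), and the underlying Lambda invocation needs valid AWS credentials to run:

from ask_candid.services.small_lm import CandidSmallLanguageModel

nlp = CandidSmallLanguageModel()
summary = nlp.summarize("Long IssueLab article body ...", top_k=3)
print(summary.summary)             # top-3 snippets joined into one string
for item in summary.snippets:      # each snippet carries its rank and score
    print(item.rank, item.score, item.text)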
ask_candid/tools/grants.py
ADDED

@@ -0,0 +1,113 @@
import logging

from langchain_core.tools import tool
import httpx

from ask_candid.tools.utils import format_candid_profile_link
from ask_candid.base.config.rest import SEARCH

logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


@tool
def grants_search(
    query: str,
    subject_codes: str | None = None,
    populations_served_codes: str | None = None,
    geonameids_of_geographies_served: str | None = None
) -> list[dict[str, str | int | float | None]] | str:
    """Search for historical grants to find context about what is happening in the sector, and what organizations
    are involved. This is intended for historical research purposes and contextualization. If trying to recommend
    funders then please use the dedicated funder recommendation tool instead of this. Funder recommendations use
    grants and additional contexts, as well as a carefully trained graph neural network, to provide targeted
    recommendations.

    Another important note is that this tool only returns up to 25 top relevant grant results and should never be
    used to make broad generalizations.

    Queries are natural text, and the retrieval mechanism is a hybrid approach of keyword and sparse vector
    searches over fields which describe the activity and purpose of the grant.

    While extra subject codes, populations served codes, and geography IDs for where the grant is serving are not
    required, results may become more specific the more information is provided.

    Subjects and populations can be determined using the `autocode` tool if the requester can supply a description
    of the program they are seeking funding for.

    Geographies can be determined using the geo detection tool if the requester can supply a description of the
    program they are seeking funding for.

    Parameters
    ----------
    query : str
        Text describing a user's question or a description of investigative work which requires support from
        Candid's grants knowledge base
    subject_codes : str | None, optional
        Subject codes from Candid's PCS taxonomy, comma separated, by default None
    populations_served_codes : str | None, optional
        Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
    geonameids_of_geographies_served : str | None, optional
        Geonames ID values for geographies served by the requester's program, comma separated, by default None

    Examples
    --------
    >>> grants_search(query='homeless shelters in new york')
    >>> grants_search(
    ...     query='homeless shelters in new york',
    ...     subject_codes='SS050000,SS000000,SB050000',
    ...     populations_served_codes='PJ050100',
    ...     geonameids_of_geographies_served='4094212,4094212'
    ... )

    Returns
    -------
    list[dict[str, str | int | float | None]] | str
        Array of relevant grants and information about the organizations involved.
        If output is a string then that means there was some error, and retry should be considered
    """

    payload = {"query": query, "rowCount": 25}
    if subject_codes is not None:
        payload["SubjectArea"] = subject_codes.split(',')
    if populations_served_codes is not None:
        payload["PopulationServed"] = populations_served_codes.split(',')
    if geonameids_of_geographies_served:
        payload["GeographicArea"] = geonameids_of_geographies_served.split(',')

    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
        r = client.get(
            url=SEARCH.endpoint("v1/grants/discovery"),
            params=payload,
            headers={**SEARCH.header}  # type: ignore
        )

    if r.status_code != 200:
        logger.error("Error calling grants search API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling grants search. Error: {r.reason_phrase}"

    data: dict = r.json()

    output = []
    for grant in data.get("grants") or []:
        working_on, serving = [], []
        for facet, facet_data in grant["pcsV3"].items():
            if facet == "subject":
                working_on.extend([code["name"].lower() for code in facet_data["value"]])
            elif facet == "population":
                serving.extend([code["name"].lower() for code in facet_data["value"]])

        output.append({
            "funder_id": grant["grantmakerId"],
            "funder_profile_link": format_candid_profile_link(grant["grantmakerId"]),
            "funder_name": grant["grantmakerName"],
            "recipient_id": grant["recipientId"],
            "recipient_profile_link": format_candid_profile_link(grant["recipientId"]),
            "recipient_name": grant["recipientName"],
            "fiscal_year": grant["fiscalYear"],
            "amount_usd": grant["amountUsd"],
            "description": grant["text"],
            "working_on": f"Working on {', '.join(working_on)}",
            "serving": f"Serving population groups {', '.join(serving)}",
        })
    return output
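
Because `grants_search` is wrapped by LangChain's `@tool` decorator, it is a tool object rather than a plain function; with recent `langchain_core` versions, direct calls go through the `Runnable` interface:

from ask_candid.tools.grants import grants_search

results = grants_search.invoke({
    "query": "homeless shelters in new york",
    "subject_codes": "SS050000,SB050000"
})
# `results` is either the list of grant dicts or an error string signalling a retry.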
ask_candid/tools/letter_gen.py
ADDED

@@ -0,0 +1,230 @@
from dataclasses import dataclass, field

from langchain_core.tools import tool
import httpx

from ask_candid.base.config.rest import GOLDEN_ORG, LOI_WRITER


@dataclass
class LetterOfInterest:
    opening: str = field(default="")
    org_desc: str = field(default="")
    need: str = field(default="")
    project: str = field(default="")
    request: str = field(default="")
    conclusion: str = field(default="")

    @property
    def letter(self):
        return f"""{self.opening}

{self.org_desc}

{self.need}

{self.project}

{self.request}

{self.conclusion}
"""


@tool
def estimate_budget(
    nonprofit_id: int,
    funder_id: int,
    project_description: str,
    # ctx: Context
) -> str:
    """This is an optional tool for estimating project budgets. Some users will already know what their budget is,
    or know how much money they are seeking from a grant, in which case this tool should not be used.

    This tool also provides guidance on setting a budget, and ultimately the user needs to decide based on the
    output from this tool.

    Parameters
    ----------
    nonprofit_id : int
        The unique identifier of the requesting organization. This will need to be found from a search using inputs
        elicited from the requester
    funder_id : int
        The unique identifier of the funding organization which may be awarding a grant to the requester.
        This will need to be found from a search using inputs elicited from the requester, or from recommendations
    project_description : str
        Natural language text describing the project/program that the user is requesting funding for

    Returns
    -------
    str
        Budget guidance, including context on the funder's ability to provide the budget in question
    """

    recip_data = httpx.get(
        url=GOLDEN_ORG.endpoint("v1/organization"),
        params={"id": nonprofit_id},
        headers={**GOLDEN_ORG.header},  # type: ignore
        timeout=30
    ).json().get("document_data", {}).get("preferred_data", {}).get("data", {})
    funder_data = httpx.get(
        url=GOLDEN_ORG.endpoint("v1/organization"),
        params={"id": funder_id},
        headers={**GOLDEN_ORG.header},  # type: ignore
        timeout=30
    ).json().get("document_data", {}).get("preferred_data", {}).get("data", {})
    return httpx.post(
        url=LOI_WRITER.endpoint("budget"),
        json={
            "recipient_candid_entity_id": nonprofit_id,
            "program_description": project_description,
            "recipient_data": recip_data,
            "funder_data": funder_data
        },
        headers={**LOI_WRITER.header},  # type: ignore
        timeout=30
    ).json().get("response", "No budget could be estimated")


@tool
def draft_loi(
    nonprofit_id: int,
    funder_id: int,
    project_description: str,
    budget: int,
) -> str:
    """Generate a letter of interest/intent from a chain-of-thought prompt chain using Candid's golden data and any
    inputs specified by the user, and/or recommended content.

    The output of this tool is the actual letter draft, please do not make changes to it other than adding headers
    and/or footers.

    Parameters
    ----------
    nonprofit_id : int
        The unique identifier of the requesting organization. This will need to be found from a search using inputs
        elicited from the requester
    funder_id : int
        The unique identifier of the funding organization which may be awarding a grant to the requester.
        This will need to be found from a search using inputs elicited from the requester, or from recommendations
    project_description : str
        Natural language text describing the project/program that the user is requesting funding for
    budget : int
        The dollar amount (in USD) that the user is requesting for funding. This should be specified by the user,
        prompt for this if needed.

    Returns
    -------
    str
        Formatted letter of interest
    """

    client = httpx.Client(headers={**LOI_WRITER.header}, timeout=30, base_url=LOI_WRITER.url)  # type: ignore

    def _make_request(ept: str, payload: dict):
        # return httpx.get(
        #     url=LOI_WRITER.endpoint(ept),
        #     params=payload,
        #     headers={**LOI_WRITER.header},  # type: ignore
        #     timeout=30
        # ).json().get("response", "")

        return client.get(url=LOI_WRITER.endpoint(ept), params=payload).json().get("response", "")

    data = _make_request(
        ept="organization/autofill",
        payload={"recipient_candid_entity_id": nonprofit_id, "funder_candid_entity_id": funder_id}
    )

    recip: dict = data.get("recipient_data", {})
    funder: dict = data.get("funder_data", {})
    pair_history: str = data.get("funding_history_text", "")

    sections = (
        ("opening", "writer/opening"),
        ("organization description", "writer/org"),
        ("need statement", "writer/need"),
        ("project description", "writer/project"),
        ("funding request", "writer/fund"),
        ("conclusion", "writer/conclusion")
    )

    output = LetterOfInterest()
    for _, (section, endpoint) in enumerate(sections, start=1):
        if section == "opening":
            output.opening = _make_request(
                ept=endpoint,
                payload={
                    "funder_name": [
                        n["name"] for n in funder.get("org_data", {}).get("names", [])
                        if n["name_type"] == "main"
                    ][0],
                    "recipient_name": [
                        n["name"] for n in recip.get("org_data", {}).get("names", [])
                        if n["name_type"] == "main"
                    ][0],
                    "project_purpose": project_description,
                    "amount": budget,
                    "prior_contact": None,
                    "connection": None
                }
            )
        elif section == "organization description":
            output.org_desc = _make_request(
                ept=endpoint,
                payload={
                    "opening": output.opening,
                    "history": pair_history,
                    "recipient_mission_statement": recip.get("mission_statement_text", ""),
                    "capacity": recip.get("capacity_text", ""),
                    "path": None,
                    "accomplishment": recip.get("data_text", "")
                }
            )
        elif section == "need statement":
            output.need = httpx.get(
                url=GOLDEN_ORG.endpoint(endpoint),
                params={
                    "recipient_desc": output.org_desc,
                    "funder_mission_statement": funder.get("mission_statement_text", ""),
                    "target": None,
                    "data": None,
                },
                headers={**GOLDEN_ORG.header},  # type: ignore
                timeout=30
            ).json().get("response", "")
        elif section == "project description":
            output.project = _make_request(
                ept=endpoint,
                payload={
                    "need": output.need,
                    "projects": project_description,
                    "desired_objectives": None,
                    "major_activities": None,
                    "key_staff": None,
                    "stand_out": None,
                    "success": None
                }
            )
        elif section == "funding request":
            output.request = _make_request(
                ept=endpoint,
                payload={
                    "project_desc": output.project,
                    "amount": budget,
                    "funding_history": pair_history,
                    "other_funding": None,
                }
            )
        elif section == "conclusion":
            output.conclusion = _make_request(
                ept=endpoint,
                payload={
                    "funding_request": output.request,
                    "project_desc": output.project,
                    "follow_up": recip.get("contact_text", ""),
                }
            )

    client.close()
    return output.letter
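
`LetterOfInterest` is just an accumulator: each prompt-chain step fills one section, and the `letter` property joins them with blank lines. A small local example:

from ask_candid.tools.letter_gen import LetterOfInterest

loi = LetterOfInterest(
    opening="Dear Example Foundation,",
    org_desc="Our organization operates shelters across the city.",
    request="We respectfully request $50,000."
)
print(loi.letter)  # unfilled sections (need, project, conclusion) render as blank lines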
ask_candid/tools/nlp.py
ADDED

@@ -0,0 +1,83 @@
from typing import Any
import logging

from langchain_core.tools import tool
import httpx

from ask_candid.base.utils import retry_on_status
from ask_candid.base.config.rest import AUTOCODING, DOCUMENT

logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


@retry_on_status(num_retries=3)
def get_with_retries(url: str, payload: dict[str, Any] | None, headers: dict[str, str] | None):
    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
        return client.get(url=url, params=payload, headers=headers)


@tool
def autocode(text: str) -> dict[str, list] | str:
    """Uses natural language processing to align some input text to Candid's taxonomy: https://taxonomy.candid.org.
    The taxonomy describes activity in the social and philanthropic sectors.

    Parameters
    ----------
    text : str
        Text describing work in the social sector. This should be related to the social and/or philanthropic
        sector.

    Returns
    -------
    dict[str, list] | str
        Taxonomy responses. The keys of the dictionary are individual taxonomy facets, and the items in the
        dictionary are each term which the NLP model has determined is relevant given the input text. This also
        includes a confidence score.
    """

    r = httpx.get(
        url=AUTOCODING.endpoint("predict"),
        params={"text": text},
        headers={**AUTOCODING.header}  # type: ignore
    )

    if r.status_code != 200:
        logger.error("Error calling autocoding API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling autocoding. Error: {r.reason_phrase}"

    data: dict = r.json().get("data", {})
    return {k: v for k, v in data.items() if k in {"subject", "population"}}


@tool
def geo_detect(text: str) -> list[dict[str, Any]] | str:
    """Uses natural language processing to find and match named geographies found in the supplied text. The output
    will supply identified geographies from [Geonames](https://www.geonames.org/).

    Parameters
    ----------
    text : str
        Text describing work in the social sector. This should be related to the social and/or philanthropic
        sector.

    Returns
    -------
    list[dict[str, Any]] | str
        Matched geographies responses. This is an array of JSON objects which contain the `name` of the geography
        as it appeared in the supplied text, and the best match to a Geonames geography. For many Candid knowledge
        tools the `geonames_id` value will be most useful.
        If output is a string then that means there was some error, and retry should be considered
    """

    r = get_with_retries(
        url=DOCUMENT.endpoint("entities/geographies"),
        payload={"text": text, "only_best_match": True},
        headers={**DOCUMENT.header}
    )
    assert isinstance(r, httpx.Response)
    if r.status_code != 200:
        logger.error("Error calling geo detection API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling geo detection. Error: {r.reason_phrase}"

    data: list = r.json().get("entities", [])
    return [{"name": entity["name"], "match": entity["match"][:1]} for entity in data if entity.get("type") == "geo"]
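
`retry_on_status` is imported from `ask_candid.base.utils` and its implementation is not part of this commit. A hypothetical minimal sketch of such a decorator, for illustration only, would re-issue the wrapped call while the response carries a server-side error status:

import time
from functools import wraps

import httpx


def retry_on_status(num_retries: int = 3, backoff: float = 1.0):
    # Hypothetical sketch; the real decorator lives in ask_candid.base.utils.
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs) -> httpx.Response:
            r = fn(*args, **kwargs)
            for attempt in range(num_retries):
                if r.status_code < 500:  # retry only on server-side failures
                    return r
                time.sleep(backoff * (attempt + 1))
                r = fn(*args, **kwargs)
            return r
        return wrapper
    return decorator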
ask_candid/tools/recommendations.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+from typing import Literal, Any
+import logging
+
+from langchain_core.tools import tool
+import httpx
+
+from ask_candid.tools.utils import format_candid_profile_link
+from ask_candid.base.utils import retry_on_status
+from ask_candid.base.config.rest import FUNDER_RECOMMENDATION, SEARCH
+
+logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.ERROR)
+
+
+@retry_on_status(num_retries=3)
+def get_with_retries(url: str, payload: dict[str, Any] | None, headers: dict[str, str] | None) -> httpx.Response:
+    # httpx transport retries cover connection failures; retry_on_status re-issues on bad status codes
+    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
+        return client.get(url=url, params=payload, headers=headers)
+
+
+@tool
+def organization_search(
+    query: str,
+    located_postal_code: str | None = None,
+    located_admin1: str | None = None,
+    search_mode: Literal["organization_only", "organization_and_grants"] | None = "organization_only"
+) -> list[dict[str, str | None]] | str:
+    """Search for organizations by name, description of work, program descriptions and locations. Here are some
+    guidelines:
+    * `query` controls hybrid searching involving both vector search and keyword search
+    * `query` can be used to find organizations based on a description of work
+    * if the query is intended to be a lookup of an organization by name, then adding quotes around the `query`
+      string circumvents vector search and prioritizes keyword matching on names
+      (e.g. `query=Candid` --> `query='Candid'`)
+    * if the query is an EIN (e.g. 12-3456789) then keyword searching is prioritized to get exact matches
+    * adding location information such as postal codes and/or admin1 (state/province abbreviations) will filter
+      results
+
+    This tool should be used as a first step in any downstream task which requires identifying the nonprofit that
+    the user represents. Often the `nonprofit_id` is required, and it can be found via a search.
+
+    Parameters
+    ----------
+    query : str
+        Free-text query which drives the search functionality. This uses a hybrid approach of vector and keyword
+        searching, but under certain conditions expressed in the guidelines this may disable vector search.
+    located_postal_code : str | None, optional
+        Postal code of the organization to be searched, if provided, by default None
+    located_admin1 : str | None, optional
+        Admin1 code (state/province abbreviation) of the organization to be searched, if provided, by default None
+    search_mode : Literal["organization_only", "organization_and_grants"] | None, optional
+        Choose how to search for organizations. If `None` or "organization_and_grants" then this will examine
+        evidence at the organization level as well as at the historical grant transaction level, capturing activity
+        evidence. For name lookups it is best to use the "organization_only" default value, by default
+        "organization_only"
+
+    Returns
+    -------
+    list[dict[str, str | None]] | str
+        List of the top organization search results.
+        If the output is a string then there was some error, and a retry should be considered.
+    """
+
+    payload = {"query": query, "searchMode": search_mode, "rowCount": 5}
+    if located_postal_code is not None:
+        payload["postalCode"] = located_postal_code
+    if located_admin1 is not None:
+        payload["admin1"] = located_admin1
+
+    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
+        r = client.get(
+            url=SEARCH.endpoint("v1/search"),
+            params=payload,
+            headers={**SEARCH.header}  # type: ignore
+        )
+
+    if r.status_code != 200:
+        logger.error("Error calling organization search API %s. Error: %s", str(r.request.url), r.reason_phrase)
+        return f"Error calling organization search. Error: {r.reason_phrase}"
+
+    data: dict = r.json()
+
+    output = []
+    for org in data.get("returnedOrgs") or []:
+        working_on, serving = [], []
+        for code, description in org["taxonomy"].items():
+            code: str
+            description: str
+
+            # PCS taxonomy: 'P' codes describe populations served, 'S' codes describe subject areas
+            if code.startswith('P') and len(code) > 2:
+                serving.append(description.lower())
+            elif code.startswith('S'):
+                working_on.append(description.lower())
+
+        output.append({
+            "nonprofit_id": org["candidEntityID"],
+            "name": org["orgName"],
+            "aka_name": org["akaName"],
+            "acronym": org["acronymName"],
+            "city": org["city"],
+            "admin1": org["admin1"],
+            "country": org["countryName"],
+            "EIN": org["ein"],
+            "profile_link": format_candid_profile_link(org['candidEntityID']),
+            "working_on": f"Working on {', '.join(working_on)}",
+            "serving": f"Serving population groups {', '.join(serving)}",
+            "transparency_level": org["seal"].get("description"),
+            "organization_roles": ', '.join(org["roles"]),
+            "grants_awarded": ', '.join([f"{k}: {v}" for k, v in org["transactionsGiven"].items()]),
+            "grants_received": ', '.join([f"{k}: {v}" for k, v in org["transactionsReceived"].items()])
+        })
+    return output
+
+
+@tool
+def recommend_funders(
+    nonprofit_id: int,
+    subject_codes_of_program: str | None = None,
+    populations_served_codes_of_program: str | None = None,
+    geonameids_of_geographies_served: str | None = None,
+    include_past_funders: bool = False
+) -> tuple[dict[str, Any], list[dict[str, Any]]] | str:
+    """Recommend potential funding organizations to a nonprofit seeking a grant.
+
+    These recommendations are built using machine learning over a heterogeneous knowledge graph representing the
+    work of the requesting organization and the contextual recent activities of potential funders and their grant
+    recipients.
+
+    While extra subject codes, populations served codes, and geography IDs for where the program takes place are
+    not required, recommendations tend to improve and become more specific as more information is provided.
+
+    Subjects and populations can be determined using the `autocode` tool if the requester can supply a description
+    of the program they are seeking funding for.
+
+    Geographies can be determined using the geo detection tool if the requester can supply a description of the
+    program they are seeking funding for.
+
+    Key Usage Requirements:
+    - Always incorporate returned profile URLs directly into the response text
+    - Replace funding organization name mentions with hyperlinked Candid profile URLs
+    - Prioritize creating a seamless user experience by making URLs contextually relevant
+    - Use relevant recipient data as well as inferred metadata to provide explanations about recommendation
+      relevance
+
+    Parameters
+    ----------
+    nonprofit_id : int
+        The unique identifier of the requesting organization. This will need to be found from a search using inputs
+        elicited from the requester
+    subject_codes_of_program : str | None, optional
+        Subject codes from Candid's PCS taxonomy, comma separated, by default None
+    populations_served_codes_of_program : str | None, optional
+        Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
+    geonameids_of_geographies_served : str | None, optional
+        Geonames ID values for geographies served by the requester's program, comma separated, by default None
+    include_past_funders : bool, optional
+        Boolean flag indicating whether previous funders of the organization identified by `nonprofit_id` should be
+        included. If the requester would like to reconsider previous funding organizations then set this to `True`,
+        but the requester MUST be prompted to indicate this preference. Using the default value will help the
+        requester discover new, potentially relevant funders, by default False
+
+    Examples
+    --------
+    >>> recommend_funders(nonprofit_id=9981881)
+    >>> recommend_funders(
+    ...     nonprofit_id=9173173,
+    ...     subject_codes_of_program='SS050000,SS000000,SB050000',
+    ...     populations_served_codes_of_program='PJ050100',
+    ...     geonameids_of_geographies_served='4094212,4094212'
+    ... )
+
+    Returns
+    -------
+    tuple[dict[str, Any], list[dict[str, Any]]] | str
+        (Inferred data used to generate recommendations, array of funders being recommended)
+        If the output is a string then there was some error, and a retry should be considered.
+    """
+
+    payload = {
+        "candid_entity_id": nonprofit_id,
+        "use_programs": True,
+        "top_k": 5,
+        "include_past_funders": include_past_funders
+    }
+
+    if subject_codes_of_program is not None:
+        payload["subjects"] = subject_codes_of_program
+    if populations_served_codes_of_program is not None:
+        payload["populations"] = populations_served_codes_of_program
+    if geonameids_of_geographies_served:
+        payload["geos"] = geonameids_of_geographies_served
+
+    r = get_with_retries(
+        url=FUNDER_RECOMMENDATION.endpoint("funder/pcs-v3"),
+        payload=payload,
+        headers={**FUNDER_RECOMMENDATION.header}
+    )
+    assert isinstance(r, httpx.Response)
+    if r.status_code != 200:
+        logger.error("Error calling funder recommendations API %s. Error: %s", str(r.request.url), r.reason_phrase)
+        return f"Error calling funder recommendations. Error: {r.reason_phrase}"
+
+    data: dict = r.json()
+    return (
+        data.get("meta") or {},
+        [{
+            **rec,
+            "candid_profile_url": format_candid_profile_link(rec['funder_id'])
+        } for rec in (data.get("recommendations") or [])]
+    )
+
+
+@tool
+def recommend_funding_opportunities(
+    nonprofit_id: int,
+    subject_codes_of_program: str | None = None,
+    populations_served_codes_of_program: str | None = None,
+    geonameids_of_geographies_served: str | None = None
+) -> tuple[dict[str, Any], list[dict[str, Any]]] | str:
+    """Recommend active funding opportunities (RFPs) to a nonprofit seeking a grant.
+
+    These recommendations are built using machine learning over a heterogeneous knowledge graph representing the
+    work of the requesting organization and the contextual recent activities of potential funders and their grant
+    recipients.
+
+    While extra subject codes, populations served codes, and geography IDs for where the program takes place are
+    not required, recommendations tend to improve and become more specific as more information is provided.
+
+    Subjects and populations can be determined using the `autocode` tool if the requester can supply a description
+    of the program they are seeking funding for.
+
+    Key Usage Requirements:
+    - Always incorporate returned profile URLs directly into the response text
+    - Replace funding organization name mentions with hyperlinked Candid profile URLs
+    - Prioritize creating a seamless user experience by making URLs contextually relevant
+    - Use inferred metadata to provide explanations about recommendation relevance
+
+    Parameters
+    ----------
+    nonprofit_id : int
+        The unique identifier of the requesting organization. This will need to be found from a search using inputs
+        elicited from the requester
+    subject_codes_of_program : str | None, optional
+        Subject codes from Candid's PCS taxonomy, comma separated, by default None
+    populations_served_codes_of_program : str | None, optional
+        Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
+    geonameids_of_geographies_served : str | None, optional
+        Geonames ID values for geographies served by the requester's program, comma separated, by default None
+
+    Examples
+    --------
+    >>> recommend_funding_opportunities(nonprofit_id=9981881)
+    >>> recommend_funding_opportunities(
+    ...     nonprofit_id=9173173,
+    ...     subject_codes_of_program='SS050000,SS000000,SB050000',
+    ...     populations_served_codes_of_program='PJ050100',
+    ...     geonameids_of_geographies_served='4094212,4094212'
+    ... )
+
+    Returns
+    -------
+    tuple[dict[str, Any], list[dict[str, Any]]] | str
+        (Inferred data used to generate recommendations, array of active funding opportunities being recommended)
+        If the output is a string then there was some error, and a retry should be considered.
+    """
+
+    payload = {"candid_entity_id": nonprofit_id, "use_programs": True, "top_k": 5}
+    if subject_codes_of_program is not None:
+        payload["subjects"] = subject_codes_of_program
+    if populations_served_codes_of_program is not None:
+        payload["populations"] = populations_served_codes_of_program
+    if geonameids_of_geographies_served:
+        payload["geos"] = geonameids_of_geographies_served
+
+    r = get_with_retries(
+        url=FUNDER_RECOMMENDATION.endpoint("rfp/pcs-v3"),
+        payload=payload,
+        headers={**FUNDER_RECOMMENDATION.header}
+    )
+    assert isinstance(r, httpx.Response)
+    if r.status_code != 200:
+        logger.error("Error calling RFP recommendation API %s. Error: %s", str(r.request.url), r.reason_phrase)
+        return f"Error calling RFP recommendations. Error: {r.reason_phrase}"
+
+    data: dict = r.json()
+    return (
+        data.get("meta") or {},
+        [{
+            **rec,
+            "candid_profile_url": format_candid_profile_link(rec['funder_id'])
+        } for rec in (data.get("recommendations") or [])]
+    )
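
For orientation, here is a minimal usage sketch of the new tools: resolve a `nonprofit_id` via `organization_search`, then feed it to `recommend_funders`. This is illustrative only — the query string, the PCS subject code, and the error handling below are assumptions, and both tools require the `SEARCH` and `FUNDER_RECOMMENDATION` API credentials to be configured.

# Minimal sketch (not part of the commit); assumes valid API credentials are configured.
from ask_candid.tools.recommendations import organization_search, recommend_funders

# Step 1: name lookup; quoting the query prioritizes keyword matching over vector search.
orgs = organization_search.invoke({"query": "'Candid'", "search_mode": "organization_only"})
if isinstance(orgs, str):  # a string return signals an API error; consider retrying
    raise RuntimeError(orgs)

# Step 2: request funder recommendations for the top search hit.
result = recommend_funders.invoke({
    "nonprofit_id": int(orgs[0]["nonprofit_id"]),
    "subject_codes_of_program": "SS050000"  # hypothetical PCS subject code
})
if isinstance(result, str):
    raise RuntimeError(result)
meta, funders = result
for funder in funders:
    print(funder["candid_profile_url"])
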
ask_candid/tools/search.py
CHANGED
@@ -1,7 +1,7 @@
 from langchain_core.documents import Document
 from langchain_core.tools import tool
 
-from ask_candid.
+from ask_candid.services.knowledge_base import (
     SourceNames,
     generate_queries,
     run_search,
ask_candid/utils.py
CHANGED
@@ -1,47 +1,15 @@
-from typing import
+from typing import Any
 from uuid import uuid4
 
 from langchain_core.documents import Document
 
-from ask_candid.retrieval.sources import (
-    candid_blog,
-    candid_help,
-    candid_learning,
-    issuelab,
-    youtube
-)
 
-
-def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
-    height_px = 200
-    html = ""
-
-    if source == "news":
-        # html = news.article_card_html(doc, height_px, show_chunks)
-        pass
-    elif source == "transactions":
-        pass
-    elif source == "organizations":
-        pass
-    elif source == "issuelab":
-        html = issuelab.issuelab_card_html(doc, height_px, show_chunks)
-    elif source == "youtube":
-        html = youtube.build_card_html(doc, 400, show_chunks)
-    elif source == "candid_blog":
-        html = candid_blog.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_learning":
-        html = candid_learning.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_help":
-        html = candid_help.build_card_html(doc, height_px, show_chunks)
-    return html
-
-
-def html_format_docs_chat(docs: List[Document]) -> str:
+def html_format_docs_chat(docs: list[Document]) -> str:
     """Formats Candid sources
 
     Parameters
     ----------
-    docs :
+    docs : list[Document]
         Retrieved documents for context
 
     Returns
@@ -69,7 +37,7 @@ def html_format_docs_chat(docs: List[Document]) -> str:
     return html
 
 
-def format_chat_ag_response(chatbot:
+def format_chat_ag_response(chatbot: list[Any]) -> list[Any]:
     """If we called the retriever, we appended sources as one more message. Here we concatenate the HTML of sources
     with the AI response
     Returns:
@@ -89,7 +57,7 @@ def valid_inputs(*args) -> bool:
     return any(a is not None or (isinstance(a, str) and a.strip() != '') for a in args)
 
 
-def get_session_id(thread_id:
+def get_session_id(thread_id: str | None) -> str:
    if not thread_id:
        thread_id = uuid4().hex
    return thread_id
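
The slimmed-down module leaves `get_session_id` as a simple pass-through that only mints an ID when none exists; a quick illustration of that contract (an editor's sketch, not part of the commit):

from ask_candid.utils import get_session_id

assert get_session_id("abc123") == "abc123"  # existing thread IDs pass through unchanged
assert len(get_session_id(None)) == 32       # falsy input yields a fresh 32-character hex UUID4
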
requirements.txt
CHANGED
@@ -9,7 +9,7 @@ langgraph-prebuilt==0.6.4
 pydantic==2.10.6
 pyopenssl>22.0.0
 python-dotenv
-transformers
+transformers>=4.56.1
 
 --find-links https://download.pytorch.org/whl/cpu
 torch
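
To confirm that a deployed environment actually satisfies the new version floor, a quick check along these lines can help (assuming the `packaging` library is available, as it typically is in pip-managed environments):

from importlib.metadata import version
from packaging.version import Version

assert Version(version("transformers")) >= Version("4.56.1")
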