kaburia committed
Commit ef26a79 · 1 Parent(s): 1208e23

redesigned modules

utils/coherence_bbscore.py ADDED
@@ -0,0 +1,260 @@
+ # pip install sentence-transformers (if not already)
+ import math, os, re, unicodedata
+ import numpy as np
+ from typing import List, Dict, Any, Optional, Tuple
+
+ # get the reranked results with no scores
+ from retrieve_n_rerank import retrieve_and_rerank
+
+ try:
+     from sentence_transformers import SentenceTransformer
+ except Exception:
+     SentenceTransformer = None
+
+ # -----------------------------
+ # Text utilities
+ # -----------------------------
+ def _norm(t: str) -> str:
+     if t is None:
+         return ""
+     t = unicodedata.normalize("NFKC", str(t))
+     t = re.sub(r"\s*\n\s*", " ", t)
+     t = re.sub(r"\s{2,}", " ", t)
+     return t.strip()
+
+ def split_sentences(text: str) -> List[str]:
+     t = _norm(text)
+     parts = re.split(r"(?<=[\.\?\!])\s+(?=[A-Z“\"'])", t)
+     return [p.strip() for p in parts if p.strip()]
+
+ # -----------------------------
+ # Embeddings wrapper
+ # -----------------------------
+ class Embedder:
+     def __init__(self, model_name: str = "BAAI/bge-m3", device: str = "cpu"):
+         if SentenceTransformer is None:
+             raise RuntimeError("Install sentence-transformers to enable coherence scoring.")
+         self.model = SentenceTransformer(model_name, device=device)
+
+     def encode(self, sentences: List[str]) -> np.ndarray:
+         if not sentences:
+             return np.zeros((0, 768), dtype=np.float32)
+         X = self.model.encode(sentences, normalize_embeddings=True, batch_size=32, show_progress_bar=False)
+         return np.asarray(X, dtype=np.float32)
+
+ def _cos(a: np.ndarray, b: np.ndarray) -> float:
+     return float(np.dot(a, b))
+
+ def _normalize(v: np.ndarray) -> np.ndarray:
+     v = np.asarray(v, dtype=np.float32)
+     n = np.linalg.norm(v) + 1e-8
+     return v / n
+
+ # -----------------------------
+ # Brownian-bridge style metric
+ # -----------------------------
+ def bb_coherence(sentences: List[str], E: np.ndarray) -> Dict[str, Any]:
+     """
+     Brownian-bridge–inspired coherence:
+       - Build a main-idea vector (intro + outro + centroid)
+       - Compare per-sentence similarity to a target curve that is high at the ends, lower mid-document
+       - Map the maximum bridge deviation -> (0, 1] score (higher = more coherent)
+     """
+     n = len(sentences)
+     if n == 0:
+         return {"bbscore": 0.0, "sims": [], "off_idx": [], "rep_pairs": [], "sim_matrix": None}
+
+     k = max(1, min(3, n // 5))
+     v_first = E[:k].mean(axis=0)
+     v_last = E[-k:].mean(axis=0)
+     v_all = E.mean(axis=0)
+     v_main = _normalize(0.4 * v_first + 0.4 * v_last + 0.2 * v_all)
+
+     sims = np.array([_cos(v_main, E[i]) for i in range(n)], dtype=np.float32)
+     t = np.linspace(0.0, 1.0, num=n, dtype=np.float32)
+     q = 1.0 - 4.0 * t * (1.0 - t)  # peaks at ends
+     q = q / (q.mean() + 1e-8) * (sims.mean() if sims.size else 0.0)
+
+     r = sims - q
+     r_centered = r - r.mean()
+     cumsum = np.cumsum(r_centered)
+     B = cumsum - t * (cumsum[-1] if n > 1 else 0.0)
+     denom = (np.std(r_centered) * math.sqrt(n)) + 1e-8
+     ks = float(np.max(np.abs(B)) / denom)
+     bbscore = float(1.0 / (1.0 + ks))
+
+     # Off-topic: sims < mean - 1σ
+     off_thr = float(sims.mean() - sims.std())
+     off_idx = [i for i, s in enumerate(sims) if s < off_thr]
+
+     # Repetition: very high pairwise similarity, skipping adjacent sentences
+     S = E @ E.T if n > 1 else np.zeros((1, 1), dtype=np.float32)  # cosine due to normalization
+     rep_pairs = []
+     if n > 1:
+         for i in range(n):
+             for j in range(i + 2, n):  # skip adjacent
+                 if S[i, j] >= 0.92:  # threshold tunable
+                     rep_pairs.append((i, j, float(S[i, j])))
+
+     return {"bbscore": round(bbscore, 3), "sims": sims, "off_idx": off_idx, "rep_pairs": rep_pairs, "sim_matrix": S}
+
+ # -----------------------------
+ # Zero-shot labeler (optional)
+ # -----------------------------
+ def zshot_label(text: str, topic: str = "the main topic") -> Dict[str, float]:
+     """
+     Optional: zero-shot verdict to complement the rule-based label.
+     Labels: Coherent, Off topic, Repeated
+     """
+     try:
+         from transformers import pipeline
+     except Exception:
+         return {}
+     clf = pipeline("zero-shot-classification",
+                    model="MoritzLaurer/deberta-v3-base-zeroshot-v2.0",
+                    multi_label=True)
+     labels = ["Coherent", "Off topic", "Repeated"]
+     res = clf(_norm(text), labels, hypothesis_template=f"This passage is {{}} with respect to {topic}.")
+     return {lbl: float(score) for lbl, score in zip(res["labels"], res["scores"])}
+
+ # -----------------------------
+ # Decision logic + reasons
+ # -----------------------------
+ def decide_label_with_reasons(
+     text: str,
+     topic_hint: Optional[str],
+     bb: Dict[str, Any],
+     sentences: List[str],
+     zshot_scores: Optional[Dict[str, float]] = None,
+     thresholds: Optional[Dict[str, float]] = None
+ ) -> Dict[str, Any]:
+     """
+     Returns:
+         {
+             "label": "Coherent" | "Off topic" | "Repeated",
+             "reasons": ["...", "..."],
+             "evidence": {"off_topic_examples": [...], "repeated_examples": [...]},
+             "bbscore": 0.74
+         }
+     """
+     thr = thresholds or {
+         "bb_coherent_min": 0.65,      # >= coherent
+         "off_topic_ratio_max": 0.20,  # <= coherent
+         "repeat_pairs_min": 1         # >= repeated (if any)
+     }
+     n = max(1, len(sentences))
+     off_ratio = len(bb["off_idx"]) / n
+     has_repeat = len(bb["rep_pairs"]) >= thr["repeat_pairs_min"]
+     bbscore = bb["bbscore"]
+
+     # Rule-based primary decision
+     if off_ratio > thr["off_topic_ratio_max"] and bbscore < thr["bb_coherent_min"]:
+         label = "Off topic"
+     elif has_repeat and bbscore >= 0.5:
+         label = "Repeated"
+     elif bbscore >= thr["bb_coherent_min"] and off_ratio <= thr["off_topic_ratio_max"] and not has_repeat:
+         label = "Coherent"
+     else:
+         # Tie-breaker using zero-shot if provided
+         if zshot_scores:
+             label = max(zshot_scores.items(), key=lambda kv: kv[1])[0]
+         else:
+             # fallback: prefer coherence if bbscore is okay, else off-topic
+             label = "Coherent" if bbscore >= 0.6 else "Off topic"
+
+     # Reasons
+     reasons = [f"BBScore={bbscore:.3f}."]
+     if bb["off_idx"]:
+         reasons.append(f"Off-topic fraction={off_ratio:.2f} ({len(bb['off_idx'])}/{n} sentences below main-idea similarity).")
+     if has_repeat:
+         top_rep = sorted(bb["rep_pairs"], key=lambda x: x[2], reverse=True)[:2]
+         reasons.append(f"Repeated content detected (top sim={top_rep[0][2]:.2f}).")
+
+     if zshot_scores:
+         top = sorted(zshot_scores.items(), key=lambda kv: kv[1], reverse=True)[:2]
+         reasons.append("Zero-shot support: " + ", ".join([f"{k}={v:.2f}" for k, v in top]))
+
+     # Evidence snippets
+     ev_off = [f'{i}: "{sentences[i]}"' for i in bb["off_idx"][:2]]
+     ev_rep = []
+     for (i, j, sim) in sorted(bb["rep_pairs"], key=lambda x: x[2], reverse=True)[:2]:
+         ev_rep.append(f'({i},{j}) sim={sim:.2f}: "{sentences[i]}", "{sentences[j]}"')
+
+     return {
+         "label": label,
+         "reasons": reasons,
+         "evidence": {"off_topic_examples": ev_off, "repeated_examples": ev_rep},
+         "bbscore": bbscore
+     }
+
+ def _display_title(meta: Dict[str, Any], fallback: str) -> str:
+     if meta.get("title"):
+         return str(meta["title"]).strip()
+     src = meta.get("source") or meta.get("path")
+     if src:
+         base = os.path.basename(str(src))
+         return re.sub(r"\.pdf$", "", base, flags=re.I)
+     return meta.get("doc_id") or fallback
+
+ def _page_label(meta: Dict[str, Any]) -> str:
+     return str(meta.get("page_label") or meta.get("page") or "?")
+
+ def to_std_doc(item: Any, idx: int = 0) -> Dict[str, Any]:
+     """
+     Accepts a LangChain Document or dict; returns a standard dict:
+     {title, page_label, text}
+     """
+     if hasattr(item, "page_content"):  # LangChain Document
+         meta = getattr(item, "metadata", {}) or {}
+         return {
+             "title": _display_title(meta, f"doc{idx+1}"),
+             "page_label": _page_label(meta),
+             "text": _norm(item.page_content),
+         }
+     elif isinstance(item, dict):
+         meta = item.get("metadata", {}) or {}
+         title = item.get("title") or _display_title(meta, item.get("doc_id", f"doc{idx+1}"))
+         page = item.get("page_label") or _page_label(meta)
+         text = _norm(item.get("text") or item.get("page_content", ""))
+         return {"title": title, "page_label": page, "text": text}
+     else:
+         raise TypeError(f"Unsupported doc type at index {idx}: {type(item)}")
+
+ def coherence_assessment_std(
+     std_doc: Dict[str, Any],
+     embedder,
+     topic_hint: Optional[str] = None,
+     run_zero_shot: bool = False,
+     thresholds: Optional[Dict[str, float]] = None
+ ) -> Dict[str, Any]:
+     """Same as coherence_assessment, but expects a standardized dict."""
+     text = std_doc.get("text", "")
+     sents = split_sentences(text)
+     if not sents:
+         return {"title": std_doc.get("title", "Document"), "label": "Off topic", "bbscore": 0.0,
+                 "reasons": ["Empty text."], "evidence": {}}
+     E = embedder.encode(sents)
+     bb = bb_coherence(sents, E)
+     zshot_scores = zshot_label(text, topic_hint or "the main topic") if run_zero_shot else None
+     decision = decide_label_with_reasons(text, topic_hint, bb, sents, zshot_scores, thresholds)
+     return {
+         "title": std_doc.get("title", "Document"),
+         "page_label": std_doc.get("page_label", "?"),
+         "label": decision["label"],
+         "bbscore": decision["bbscore"],
+         "reasons": decision["reasons"],
+         "evidence": decision["evidence"],
+     }
+
+ # Get the coherence report
+ def coherence_report(embedder="BAAI/bge-m3",  # sentence-embedding model; matches the Embedder default
+                      input_text=None,
+                      reranked_results=None,
+                      run_zero_shot=True):
+     embedder = Embedder(embedder) if isinstance(embedder, str) else embedder
+     if reranked_results is None:
+         reranked_results = retrieve_and_rerank(input_text)
+     if not reranked_results:
+         return []
+     # Convert reranked_results to standardized documents
+     std_results = [to_std_doc(doc, i) for i, doc in enumerate(reranked_results)]
+     reports = [coherence_assessment_std(d, embedder, topic_hint=input_text, run_zero_shot=run_zero_shot)
+                for d in std_results]
+     return reports
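
A minimal usage sketch for the coherence module above (not part of the commit; it assumes utils/ is on the import path and that top_docs is an existing list of reranked LangChain Documents, e.g. from retrieve_and_rerank):

    from coherence_bbscore import coherence_report, Embedder

    embedder = Embedder("BAAI/bge-m3")  # same model as the Embedder default
    reports = coherence_report(
        embedder=embedder,
        input_text="What does the policy say about water permits?",  # illustrative query
        reranked_results=top_docs,  # assumed to exist; documents only, no scores
        run_zero_shot=False,
    )
    for r in reports:
        print(r["title"], r["page_label"], r["label"], r["bbscore"])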
utils/encoding_input.py ADDED
@@ -0,0 +1,12 @@
+ # Methods to encode text
+ import numpy as np
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+
+ def encode_text(text, embedding_model='sentence-transformers/all-MiniLM-L6-v2', as_array=True):
+     """Encodes the input text using the provided embedding model."""
+     embedding_model = HuggingFaceEmbeddings(model_name=embedding_model)
+     encoded_input = embedding_model.embed_query(text)
+     if as_array:
+         return np.array(encoded_input)
+     else:
+         return encoded_input
utils/generation_streaming.py ADDED
@@ -0,0 +1,99 @@
+ # from langchain_community.embeddings import HuggingFaceEmbeddings
+ # from langchain_community.embeddings import CrossEncoder
+ import requests
+ import numpy as np
+ import time
+ import json
+
+ # encode the text
+ from encoding_input import encode_text
+
+ # retrieve and rerank the documents
+ from retrieve_n_rerank import retrieve_and_rerank
+
+ # sentiment analysis on reranked documents
+ from sentiment_analysis import get_sentiment
+
+ # coherence assessment reports
+ from coherence_bbscore import coherence_report
+
+ # Get the vectorstore
+ from loading_embeddings import get_vectorstore
+ vectorstore = get_vectorstore()
+
+ # build messages for model generation
+ from model_generation import build_messages
+
+ API_KEY = "sk-do-"
+ MODEL = "llama3.3-70b-instruct"
+
+ def generate_response_stream(query: str, enable_sentiment: bool, enable_coherence: bool):
+
+     # encoded_input = encode_text(query)
+
+     reranked_results = retrieve_and_rerank(
+         query_text=query,
+         vectorstore=vectorstore,
+         k=50,  # number of initial documents to retrieve
+         rerank_model="cross-encoder/ms-marco-MiniLM-L-6-v2",
+         top_m=20,  # number of documents to return after reranking
+         min_score=0.5,  # minimum score for reranked documents
+         only_docs=False  # return both documents and scores
+     )
+     top_docs = [doc for doc, score in reranked_results]
+
+     if not top_docs:
+         yield "No relevant documents found."
+         return
+
+     sentiment_rollup = get_sentiment(top_docs) if enable_sentiment else {}
+     coherence_report_ = coherence_report(reranked_results=top_docs, input_text=query) if enable_coherence else ""
+
+     messages = build_messages(
+         query=query,
+         top_docs=top_docs,
+         task_mode="verbatim_sentiment",
+         sentiment_rollup=sentiment_rollup,
+         coherence_report=coherence_report_,
+     )
+
+     headers = {
+         "Authorization": f"Bearer {API_KEY}",
+         "Content-Type": "application/json"
+     }
+
+     data = {
+         "model": MODEL,
+         "messages": messages,
+         "temperature": 0.2,
+         "stream": True,
+         "max_tokens": 2000
+     }
+
+     collected = ""  # Accumulate content to show
+
+     with requests.post("https://inference.do-ai.run/v1/chat/completions", headers=headers, json=data, stream=True) as r:
+         if r.status_code != 200:
+             yield f"[ERROR] API returned status {r.status_code}: {r.text}"
+             return
+
+         for line in r.iter_lines(decode_unicode=True):
+             if not line or line.strip() == "data: [DONE]":
+                 continue
+             if line.startswith("data: "):
+                 line = line[len("data: "):]
+
+             try:
+                 chunk = json.loads(line)
+                 delta = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
+                 if delta:
+                     collected += delta
+                     yield collected  # yield the accumulated text progressively
+                     time.sleep(0.01)  # slight throttle to improve smoothness
+             except Exception as e:
+                 print("Streaming decode error:", e)
+                 continue
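
A minimal driver sketch for the streaming generator above (not part of the commit; note that importing the module loads the vectorstore, the query string is illustrative, and each yielded value is the full text accumulated so far):

    from generation_streaming import generate_response_stream

    stream = generate_response_stream(
        "Summarize the enforcement provisions on sanitation.",  # illustrative query
        enable_sentiment=True,
        enable_coherence=True,
    )
    final_text = ""
    for partial in stream:
        final_text = partial  # each yield replaces the previous partial answer
    print(final_text)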
utils/loading_embeddings.py ADDED
@@ -0,0 +1,55 @@
+ # Loading embeddings from storage
+ import os
+ from pathlib import Path
+ from huggingface_hub import hf_hub_download
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+
+ # download into the data directory
+ data_path = os.path.join(Path(os.getcwd()).parent, "data")
+ # make the faiss local folder
+ local_folder = os.path.join(data_path, 'faiss_index')
+
+ def download_faiss_index(repo_id="kaburia/epic-a-embeddings", local_folder="faiss_index"):
+     os.makedirs(local_folder, exist_ok=True)
+
+     index_faiss_path = os.path.join(local_folder, "index.faiss")
+     index_pkl_path = os.path.join(local_folder, "index.pkl")
+
+     if not os.path.exists(index_faiss_path):
+         print("Downloading index.faiss from Hugging Face Dataset...")
+         hf_hub_download(
+             repo_id=repo_id,
+             filename="index.faiss",
+             repo_type="dataset",
+             local_dir=local_folder,
+             local_dir_use_symlinks=False,
+         )
+
+     if not os.path.exists(index_pkl_path):
+         print("Downloading index.pkl from Hugging Face Dataset...")
+         hf_hub_download(
+             repo_id=repo_id,
+             filename="index.pkl",
+             repo_type="dataset",
+             local_dir=local_folder,
+             local_dir_use_symlinks=False,
+         )
+
+ def load_vectorstore(index_path="faiss_index"):
+     embedding_model = HuggingFaceEmbeddings(
+         model_name="sentence-transformers/all-MiniLM-L6-v2"
+     )
+     db = FAISS.load_local(
+         index_path,
+         embeddings=embedding_model,
+         allow_dangerous_deserialization=True
+     )
+     return db
+
+ # download and load the vectorstore
+ def get_vectorstore(repo_id="kaburia/epic-a-embeddings", local_folder="faiss_index"):
+     download_faiss_index(repo_id=repo_id, local_folder=local_folder)
+     return load_vectorstore(index_path=local_folder)
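
A minimal usage sketch for the loader above (not part of the commit; it downloads the index into ./faiss_index on first use):

    from loading_embeddings import get_vectorstore

    vs = get_vectorstore()  # fetches index.faiss / index.pkl from the HF dataset if missing
    hits = vs.similarity_search("county water levies", k=5)  # standard LangChain FAISS query
    for doc in hits:
        print(doc.metadata.get("source"), doc.metadata.get("page"))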
utils/model_generation.py ADDED
@@ -0,0 +1,211 @@
+ import json
+ import requests
+ from typing import List, Dict, Any, Union
+ import time
+ import numpy as np
+ import os
+
+ PROMPT_TEMPLATES = {
+     "verbatim_sentiment": {
+         "system": (
+             "You are a compliance-grade policy analyst assistant. "
+             "Your job is to return a precise, fact-grounded response. "
+             "Avoid hallucinations. Base everything strictly on the content provided. "
+             "If coherence and/or sentiment analysis is not enabled, do not mention it in the response."
+         ),
+         "user_template": """
+ Query: {query}
+
+ Deliverables:
+ 1) **Quoted Policy Excerpts**: Quote key policy content directly. Cite the source using filename and page.
+ 2) **Sentiment Summary**: Use the sentiment JSON to explain tone, gaps, penalties, or enforcement clarity in plain English.
+ 3) **Coherence Assessment**: Summarize the coherence report below. Highlight:
+ - Whether the answer was mostly on-topic or off-topic
+ - Point out the sections that were coherent, off-topic, or repeated
+
+ Topic hint: {topic_hint}
+
+ Sentiment JSON (rolled-up across top docs):
+ {sentiment_json}
+
+ Coherence report:
+ {coherence_report}
+
+ Context Sources:
+ {context_block}
+ """
+     },
+
+     "abstractive_summary": {
+         "system": (
+             "You are a policy analyst summarizing government documents for a general audience. "
+             "Your response should paraphrase clearly, avoiding quotes unless absolutely necessary. "
+             "Highlight high-level goals, enforcement strategies, and important deadlines or penalties."
+         ),
+         "user_template": """Query: {query}
+
+ Summarize the answer in natural, non-technical language. Emphasize clarity and coverage. Avoid quoting unless the phrase is legally binding.
+
+ Topic hint: {topic_hint}
+
+ Context DOCS:
+ {context_block}
+ """
+     },
+
+     "followup_reasoning": {
+         "system": (
+             "You are an assistant that explains policy documents interactively, reasoning step-by-step. "
+             "Always cite document IDs and indicate if certain info is absent."
+         ),
+         "user_template": """User query: {query}
+
+ Explain the answer step-by-step. Add follow-up questions that a reader might ask, and try to answer them using the documents below.
+
+ Topic: {topic_hint}
+
+ DOCS:
+ {context_block}
+ """
+     },
+
+     # Add more templates as needed
+ }
+
+ # --- LLM client ---
+ def get_do_completion(api_key, model_name, messages, temperature=0.2, max_tokens=800):
+     url = "https://inference.do-ai.run/v1/chat/completions"
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json"
+     }
+     data = {
+         "model": model_name,
+         "messages": messages,
+         "temperature": temperature,
+         "max_tokens": max_tokens
+     }
+     try:
+         resp = requests.post(url, headers=headers, json=data, timeout=90)
+         resp.raise_for_status()
+         return resp.json()
+     except requests.exceptions.HTTPError as e:
+         print(f"HTTP error occurred: {e}")
+         print(f"Response body: {e.response.text if e.response is not None else ''}")
+         return None
+     except requests.exceptions.RequestException as e:
+         print(f"Request error: {e}")
+         return None
+     except json.JSONDecodeError as e:
+         print(f"Failed to decode JSON: {e}")
+         print(f"Response text: {resp.text if 'resp' in locals() else ''}")
+         return None
+
+ # --- Prompt context builder ---
+ def _clip(text: str, max_chars: int = 1400) -> str:
+     """Trim content to limit prompt size."""
+     if not text:
+         return ""
+     text = str(text).strip()
+     return text[:max_chars] + ("..." if len(text) > max_chars else "")
+
+ def build_context_block(top_docs: List[Dict[str, Any]]) -> str:
+     """
+     Formats each document with a real citation:
+       - Extracts the file name from the 'source' path
+       - Uses 'page_label' or falls back to 'page'
+       - Returns: <<<SOURCE: {filename}, p. {page_label}>>>
+     """
+     blocks = []
+     for i, item in enumerate(top_docs):
+         if hasattr(item, "page_content"):
+             text = item.page_content
+             meta = getattr(item, "metadata", {})
+         else:
+             text = item.get("text") or item.get("page_content", "")
+             meta = item.get("metadata", {})
+
+         # Get the file name from the path
+         full_path = meta.get("source", "")
+         filename = os.path.basename(full_path) if full_path else f"Document_{i+1}"
+
+         # Prefer page_label if available, else fall back to the raw page
+         page_label = meta.get("page_label") or meta.get("page") or "unknown"
+
+         citation = f"{filename}, p. {page_label}"
+
+         blocks.append(f"<<<SOURCE: {citation}>>>\n{_clip(text)}\n</SOURCE>")
+
+     return "\n".join(blocks)
+
+ # --- Message builder ---
+ def build_messages(
+     query: str,
+     top_docs: List[Dict[str, Any]],
+     task_mode: str,
+     sentiment_rollup: Dict[str, List[str]],
+     coherence_report: str = "",
+     topic_hint: str = "energy policy"
+ ) -> List[Dict[str, str]]:
+     template = PROMPT_TEMPLATES.get(task_mode)
+     if not template:
+         raise ValueError(f"Unknown task mode: {task_mode}")
+
+     context_block = build_context_block(top_docs)
+     sentiment_json = json.dumps(sentiment_rollup or {}, ensure_ascii=False)
+
+     user_prompt = template["user_template"].format(
+         query=query,
+         topic_hint=topic_hint,
+         sentiment_json=sentiment_json,
+         context_block=context_block,
+         coherence_report=coherence_report
+     )
+
+     return [
+         {"role": "system", "content": template["system"]},
+         {"role": "user", "content": user_prompt}
+     ]
+
+ # --- Generation orchestrator ---
+ def generate_policy_answer(
+     api_key: str,
+     model_name: str,
+     query: str,
+     top_docs: List[Union[Dict[str, Any], Any]],
+     sentiment_rollup: Dict[str, List[str]],
+     coherence_report: str = "",
+     task_mode: str = "verbatim_sentiment",
+     temperature: float = 0.2,
+     max_tokens: int = 2000
+ ) -> str:
+     if not top_docs:
+         return "No documents available to answer."
+
+     messages = build_messages(
+         query=query,
+         top_docs=top_docs,
+         task_mode=task_mode,
+         sentiment_rollup=sentiment_rollup,
+         coherence_report=coherence_report
+     )
+     resp = get_do_completion(api_key, model_name, messages, temperature=temperature, max_tokens=max_tokens)
+     if resp is None:
+         return "Upstream model error. No response."
+     try:
+         return resp["choices"][0]["message"]["content"].strip()
+     except Exception:
+         return json.dumps(resp, indent=2)
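
A minimal end-to-end sketch for the builders above (not part of the commit; the API key is a placeholder and top_docs is assumed to come from retrieve_and_rerank):

    from model_generation import generate_policy_answer

    answer = generate_policy_answer(
        api_key="sk-do-...",  # placeholder, not a real key
        model_name="llama3.3-70b-instruct",
        query="What penalties apply to unlicensed water abstraction?",  # illustrative query
        top_docs=top_docs,  # assumed to exist
        sentiment_rollup={},  # or the output of get_sentiment(top_docs)
        task_mode="verbatim_sentiment",
    )
    print(answer)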
utils/retrieve_n_rerank.py ADDED
@@ -0,0 +1,79 @@
+ # load the encoded text and vectorstore
+ from encoding_input import encode_text
+ from loading_embeddings import get_vectorstore
+ from sentence_transformers import CrossEncoder
+ import numpy as np
+ import faiss
+
+ def search_vectorstore(encoded_text, vectorstore, k=5, with_score=False):
+     """
+     Vector similarity search with optional distance/score return.
+
+     Args:
+         encoded_text (np.ndarray | list): 1-D query vector.
+         vectorstore (langchain.vectorstores.faiss.FAISS): the store to search.
+         k (int): number of neighbors.
+         with_score (bool): toggle score output.
+
+     Returns:
+         list: docs or (doc, score) tuples.
+     """
+     q = np.asarray(encoded_text, dtype="float32").reshape(1, -1)
+
+     # ---- Use raw FAISS for full control and consistent behavior ----
+     index = vectorstore.index  # faiss.Index
+     distances, idxs = index.search(q, k)  # (1, k) each
+     docs = [vectorstore.docstore.search(
+         vectorstore.index_to_docstore_id[i]) for i in idxs[0]]
+
+     # Return with or without scores
+     return list(zip(docs, distances[0])) if with_score else docs
+
+ def rerank_cross_encoder(query_text, docs, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", top_m=20, min_score=None):
+     """
+     Returns top_m (doc, score) sorted by score desc. If min_score is set, filters below it.
+     docs: a list of Document objects.
+     """
+     ce = CrossEncoder(model_name)
+     # Create pairs of (query_text, document_content)
+     pairs = [(query_text, doc.page_content) for doc in docs]  # use doc.page_content for the text
+     scores = ce.predict(pairs)  # higher is better
+
+     # Pair original documents with their scores and sort
+     scored_documents = sorted(zip(docs, scores.tolist()), key=lambda x: x[1], reverse=True)
+
+     # Apply minimum score filter if specified
+     if min_score is not None:
+         scored_documents = [r for r in scored_documents if r[1] >= min_score]
+
+     # Return the top_m reranked (Document, score) tuples
+     return scored_documents[:top_m]
+
+ # retrieval and reranking function
+ def retrieve_and_rerank(query_text, vectorstore, k=50,
+                         rerank_model="cross-encoder/ms-marco-MiniLM-L-6-v2",
+                         top_m=20, min_score=None,
+                         only_docs=True):
+     # Step 1: Encode the query text
+     encoded_query = encode_text(query_text)
+
+     # Step 2: Retrieve relevant documents from the vectorstore
+     retrieved_docs = search_vectorstore(encoded_query, vectorstore, k=k)
+
+     # If no documents are retrieved, return an empty list
+     if not retrieved_docs:
+         return []
+
+     # Keep only the documents (search may return (doc, score) tuples)
+     retrieved_docs = [doc for doc, _ in retrieved_docs] if isinstance(retrieved_docs[0], tuple) else retrieved_docs
+
+     # Step 3: Rerank the retrieved documents
+     reranked_docs = rerank_cross_encoder(query_text, retrieved_docs, model_name=rerank_model, top_m=top_m, min_score=min_score)
+
+     # If only_docs is True, return just the documents
+     if only_docs:
+         return [doc for doc, _ in reranked_docs]
+     # Otherwise, return the reranked documents with their scores
+     return reranked_docs
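
A minimal retrieval round trip for the functions above (not part of the commit; it reuses the loader from loading_embeddings and an illustrative query):

    from loading_embeddings import get_vectorstore
    from retrieve_n_rerank import retrieve_and_rerank

    vs = get_vectorstore()
    results = retrieve_and_rerank("county water levies", vs, k=50, top_m=10,
                                  min_score=0.3, only_docs=False)
    for doc, score in results:
        print(round(score, 3), doc.metadata.get("source"))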
utils/sentiment_analysis.py ADDED
@@ -0,0 +1,218 @@
+ import re, math, torch
+ from transformers import pipeline
+
+ # ------------- Model (CPU-friendly); use device=0 + fp16 on GPU -------------
+ ZSHOT = pipeline(
+     "zero-shot-classification",
+     model="MoritzLaurer/deberta-v3-base-zeroshot-v2.0",
+     multi_label=True,
+     device=-1,
+     model_kwargs={"torch_dtype": torch.float32}
+ )
+
+ # ------------------ Taxonomy with descriptions (helps NLI) -------------------
+ TAXO = {
+     "intent_type": [
+         "objective: declares goals or aims",
+         "principle: states guiding values",
+         "strategy: outlines measures or actions",
+         "obligation: mandates an action (shall/must)",
+         "prohibition: forbids an action",
+         "permission: allows an action (may)",
+         "exception: states conditions where rules change",
+         "definition: defines a term",
+         "scope: states applicability or coverage"
+     ],
+     "disposition": [
+         "restrictive: limits or constrains the topic",
+         "cautionary: warns or urges care",
+         "neutral: descriptive with no clear stance",
+         "enabling: allows or facilitates the topic",
+         "supportive: promotes or expands the topic"
+     ],
+     "rigidity": [
+         "must: mandatory (shall/must)",
+         "should: advisory (should)",
+         "may: permissive (may/can)"
+     ],
+     "temporal": [
+         "deadline: requires completion by a date or period",
+         "schedule: sets a cadence (e.g., annually, quarterly)",
+         "ongoing: continuing requirement without end date",
+         "effective_date: specifies when rules start/apply"
+     ],
+     "scope": [
+         "actor_specific: targets a group or entity (e.g., county governments, permit holders)",
+         "geography_specific: targets a location or region",
+         "subject_specific: targets a topic (e.g., permits, sanitation)",
+         "nationwide: applies across the country"
+     ],
+     "enforcement": [
+         "penalty: fines or sanctions for non-compliance",
+         "remedy: corrective actions required",
+         "monitoring: oversight or audits",
+         "reporting: reports/returns required",
+         "none_detected: no enforcement mechanisms present"
+     ],
+     "resourcing": [
+         "funding: funds or budget allocations",
+         "fees_levies: charges or levies",
+         "capacity_hr: staffing or training",
+         "infrastructure: capital works or equipment",
+         "none_detected: no resourcing present"
+     ],
+     "impact": [
+         "low: limited effect on regulated parties",
+         "medium: moderate practical effect",
+         "high: significant obligations or restrictions"
+     ]
+ }
+
+ # ---------------- Axis-specific thresholds (calibrate later) -----------------
+ TAU = {
+     "intent_type": 0.55, "disposition": 0.55, "rigidity": 0.60,
+     "temporal": 0.62, "scope": 0.55,
+     "enforcement": 0.50, "resourcing": 0.50, "impact": 0.60
+ }
+ TAU_LOW = 0.40  # only for deciding if we can safely emit "none_detected"
+
+ # ------------------------- Cleaning & evidence rules -------------------------
+ def _clean(t: str) -> str:
+     t = re.sub(r"[ \t]*\n[ \t]*", " ", str(t))
+     t = re.sub(r"\s{2,}", " ", t).strip()
+     return t
+
+ PAT = {
+     "actor": r"\bCounty Government(?:s)?\b|\bAuthority\b|\bMinistry\b|\bAgency\b|\bBoard\b|\bCommission\b",
+     "nationwide": r"\bKenya\b|\bnational\b|\bnationwide\b|\bacross the country\b|\bthe country\b",
+     "objective": r"\b(Objective[s]?|Purpose)\b|(?:^|\.\s+)To [A-Za-z]",
+     "imperative": r"(?:^|\.\s+)(Promote|Ensure|Encourage|Strengthen|Adopt)\b.*?(?:\.|;)",
+     "modal_must": r"\bshall\b|\bmust\b",
+     "modal_should": r"\bshould\b",
+     "modal_may": r"\bmay\b|\bcan\b",
+     "temporal": r"\bwithin \d+\s+(day|days|month|months|year|years)\b|\bby \d{4}\b|\beffective\b",
+     "enforcement": r"\bpenalt(y|ies)\b|\bfine(s)?\b|\brevocation\b|\bsuspension\b|\breport(ing)?\b|\bmonitor(ing)?\b",
+     "resourcing": r"\bfund(?:ing)?\b|\blevy|levies|fee(s)?\b|\bbudget\b|\binfrastructure\b|\bcapacity\b|\btraining\b"
+ }
+
+ def _spans(text, pattern, max_spans=2):
+     spans = []
+     for m in re.finditer(pattern, text, flags=re.I):
+         # sentence-level extraction
+         start = text.rfind('.', 0, m.start()) + 1
+         end = text.find('.', m.end())
+         if end == -1:
+             end = len(text)
+         snippet = text[start:end].strip()
+         if snippet and snippet not in spans:
+             spans.append(snippet)
+         if len(spans) >= max_spans:
+             break
+     return spans
+
+ def _softmax(d):
+     vals = list(d.values())
+     if not vals:
+         return {k: 0.0 for k in d}
+     m = max(vals)
+     exps = [math.exp(v - m) for v in vals]
+     Z = sum(exps)
+     return {k: (e / Z) for k, e in zip(d.keys(), exps)}
+
+ # -------------------- Main: classify + explanations + % ----------------------
+ def classify_and_explain(text: str, topic: str = "water and sanitation", per_axis_top_k=2):
+     text = _clean(text)
+     if not text:
+         return {"decision_summary": "No operative decision; empty passage.",
+                 "labels": {ax: [] for ax in TAXO},
+                 "percents_raw": {ax: {} for ax in TAXO},
+                 "percents_norm": {ax: {} for ax in TAXO},
+                 "why": [], "text_preview": ""}
+
+     # Topic-aware hypotheses (improves stance/intent)
+     def hyp(axis):
+         base = "This passage {} regarding " + topic + "."
+         return {
+             "intent_type": base.format("states a {}"),
+             "disposition": base.format("is {}"),
+             "rigidity": "Compliance in this passage is {}.",
+             "temporal": base.format("specifies a {} aspect"),
+             "scope": base.format("is {} in applicability"),
+             "enforcement": base.format("includes {} for compliance"),
+             "resourcing": base.format("provides {}"),
+             "impact": base.format("has {} impact")
+         }[axis]
+
+     # Single call if supported; else per-axis fallback
+     tasks = [{"sequences": text, "candidate_labels": labels, "hypothesis_template": hyp(axis)}
+              for axis, labels in TAXO.items()]
+     try:
+         results = ZSHOT(tasks)
+     except TypeError:
+         results = [ZSHOT(text, labels, hypothesis_template=hyp(axis))
+                    for axis, labels in TAXO.items()]
+
+     labels_out, perc_raw, perc_norm, why = {}, {}, {}, []
+
+     for (axis, labels), r in zip(TAXO.items(), results):
+         # raw scores
+         raw = {lbl.split(":")[0].strip(): float(s) for lbl, s in zip(r["labels"], r["scores"])}
+         perc_raw[axis] = {k: round(raw[k] * 100, 1) for k in raw}  # independent sigmoid
+         norm = _softmax(raw)
+         perc_norm[axis] = {k: round(norm[k] * 100, 1) for k in norm}  # sums ~100%
+
+         # select labels by threshold
+         keep = [k for k, s in raw.items() if s >= TAU[axis]]
+         keep = sorted(keep, key=lambda k: raw[k], reverse=True)[:per_axis_top_k]
+         # only emit none_detected when everything else is weak and no heuristic evidence
+         if not keep and "none_detected" in raw:
+             if max([v for k, v in raw.items() if k != "none_detected"] or [0.0]) < TAU_LOW:
+                 keep = ["none_detected"]
+
+         labels_out[axis] = keep
+
+         # compact "why" with evidence for the top choice
+         if keep and keep[0] != "none_detected":
+             if axis == "intent_type":
+                 ev = _spans(text, PAT["objective"]) or _spans(text, PAT["imperative"])
+                 why.append({"axis": axis, "label": keep[0], "reason": "functional cues", "evidence": ev[:2]})
+             elif axis == "disposition":
+                 ev = _spans(text, PAT["imperative"])
+                 why.append({"axis": axis, "label": keep[0], "reason": "promotional/allowing framing", "evidence": ev[:2]})
+             elif axis == "rigidity":
+                 pat = {"must": PAT["modal_must"], "should": PAT["modal_should"], "may": PAT["modal_may"]}[keep[0]]
+                 why.append({"axis": axis, "label": keep[0], "reason": "modal verb", "evidence": _spans(text, pat)[:2]})
+             elif axis == "temporal":
+                 why.append({"axis": axis, "label": keep[0], "reason": "time expressions", "evidence": _spans(text, PAT["temporal"])[:2]})
+             elif axis == "scope":
+                 ev = _spans(text, PAT["nationwide"]) or _spans(text, PAT["actor"])
+                 why.append({"axis": axis, "label": keep[0], "reason": "applicability cues", "evidence": ev[:2]})
+             elif axis == "enforcement":
+                 why.append({"axis": axis, "label": keep[0], "reason": "compliance hooks", "evidence": _spans(text, PAT["enforcement"])[:2]})
+             elif axis == "resourcing":
+                 why.append({"axis": axis, "label": keep[0], "reason": "resourcing hooks", "evidence": _spans(text, PAT["resourcing"])[:2]})
+
+     # Decision summary: imperative lines + problem statements; never fabricate
+     summary_bits = []
+     imperatives = re.findall(PAT["imperative"], text, flags=re.I)
+     # pull full imperative sentences
+     imp_sents = _spans(text, PAT["imperative"], max_spans=3)
+     if imp_sents:
+         summary_bits.append("Strategies: " + " ".join(imp_sents))
+     if "nationwide" in labels_out.get("scope", []):
+         summary_bits.append("Applies nationwide.")
+     if labels_out.get("enforcement") == ["none_detected"]:
+         summary_bits.append("Enforcement: none detected in this passage.")
+     if labels_out.get("resourcing") == ["none_detected"]:
+         summary_bits.append("Resourcing: none detected in this passage.")
+     decision_summary = " ".join(summary_bits) if summary_bits else "No operative decision beyond high-level description detected."
+
+     return {
+         "decision_summary": decision_summary,
+         "labels": labels_out,
+         "percents_raw": perc_raw,    # model confidences per label (0–100, do NOT sum to 100)
+         "percents_norm": perc_norm,  # normalized per axis (sums to ~100)
+         "why": why,
+         "text_preview": text[:300] + ("..." if len(text) > 300 else "")
+     }
+
+ # Get the sentiment for all the docs
+ def get_sentiment(texts):
+     return [classify_and_explain(doc.page_content) for doc in texts]
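
A minimal single-passage sketch for the classifier above (not part of the commit; the passage text is illustrative only):

    from sentiment_analysis import classify_and_explain

    out = classify_and_explain(
        "County Governments shall ensure access to safe water and may levy fees to fund infrastructure.",
        topic="water and sanitation",
    )
    print(out["decision_summary"])
    print(out["labels"]["rigidity"], out["percents_norm"].get("rigidity"))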