""" Verification agent module for answer validation against source documents. """ from typing import Dict, List, Optional, Literal from langchain_core.documents import Document from langchain_google_genai import ChatGoogleGenerativeAI from pydantic import BaseModel, Field import logging from configuration.parameters import parameters logger = logging.getLogger(__name__) class VerificationResult(BaseModel): """Structured output model for verification results.""" supported: Literal["YES", "NO", "PARTIAL"] = Field( description="Whether the answer is supported by the context" ) confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field( default="MEDIUM", description="Confidence level in the verification result" ) unsupported_claims: List[str] = Field( default_factory=list, description="Claims not supported by context" ) contradictions: List[str] = Field( default_factory=list, description="Contradictions between answer and context" ) relevant: Literal["YES", "NO"] = Field( description="Whether the answer is relevant to the question" ) completeness: Literal["COMPLETE", "PARTIAL", "INCOMPLETE"] = Field( default="PARTIAL", description="How completely the answer addresses the question" ) additional_details: str = Field( default="", description="Additional explanations and reasoning" ) class BestAnswerSelection(BaseModel): """Structured output model for selecting the best answer from candidates.""" selected_index: int = Field( description="The index (0-based) of the best answer from the candidates list" ) reasoning: str = Field( description="Explanation of why this answer was selected as the best" ) confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field( default="MEDIUM", description="Confidence level in the selection" ) comparison_summary: str = Field( default="", description="Brief comparison of the candidate answers" ) class VerificationAgent: """Agent for verifying answers against source documents.""" def __init__( self, llm: Optional[ChatGoogleGenerativeAI] = None, max_context_chars: int = None, max_output_tokens: int = None, ) -> None: """Initialize the verification agent.""" logger.info("Initializing VerificationAgent...") self.max_context_chars = max_context_chars or parameters.VERIFICATION_MAX_CONTEXT_CHARS self.max_output_tokens = max_output_tokens or parameters.VERIFICATION_MAX_OUTPUT_TOKENS base_llm = llm or ChatGoogleGenerativeAI( model=parameters.VERIFICATION_AGENT_MODEL, google_api_key=parameters.GOOGLE_API_KEY, temperature=0, max_output_tokens=self.max_output_tokens, ) self.llm = base_llm self.structured_llm = base_llm.with_structured_output(VerificationResult) self.selection_llm = base_llm.with_structured_output(BestAnswerSelection) logger.info(f"VerificationAgent initialized (model={parameters.VERIFICATION_AGENT_MODEL})") def generate_prompt(self, answer: str, context: str, question: Optional[str] = None) -> str: """Generate verification prompt.""" question_section = f"\n**Original Question:** {question}\n" if question else "" return f"""Verify the following answer against the provided context. **Check for:** 1. Factual support (YES/NO/PARTIAL) 2. Confidence level (HIGH/MEDIUM/LOW) 3. Unsupported claims 4. Contradictions 5. Relevance to question 6. Completeness (COMPLETE/PARTIAL/INCOMPLETE) **Scoring:** - HIGH: All claims directly stated, no ambiguity - MEDIUM: Most claims supported, some inferred - LOW: Significant claims unsupported {question_section} **Answer to Verify:** {answer} **Context:** {context} Provide your verification analysis.""" def format_verification_report(self, verification: VerificationResult) -> str: """Format verification result into readable report.""" report = f"**Supported:** {verification.supported}\n" report += f"**Confidence:** {verification.confidence}\n" report += f"**Unsupported Claims:** {', '.join(verification.unsupported_claims) or 'None'}\n" report += f"**Contradictions:** {', '.join(verification.contradictions) or 'None'}\n" report += f"**Relevant:** {verification.relevant}\n" report += f"**Completeness:** {verification.completeness}\n" report += f"**Additional Details:** {verification.additional_details or 'None'}\n" return report def generate_feedback_for_research(self, verification: VerificationResult) -> Optional[str]: """Generate feedback for research agent if improvements needed.""" feedback_parts = [] if verification.supported == "NO": feedback_parts.append("Answer lacks sufficient support from documents.") elif verification.supported == "PARTIAL": feedback_parts.append("Some parts are not well supported.") if verification.unsupported_claims: claims_str = "; ".join(verification.unsupported_claims[:3]) feedback_parts.append(f"Unsupported: {claims_str}") if verification.contradictions: contradictions_str = "; ".join(verification.contradictions[:3]) feedback_parts.append(f"Contradictions: {contradictions_str}") if verification.completeness == "INCOMPLETE": feedback_parts.append("Answer is incomplete.") if verification.confidence == "LOW": feedback_parts.append("Focus on directly verifiable claims.") # Always add additional_details if present, even if other feedback exists if verification.additional_details: feedback_parts.append(f"Additional Details: {verification.additional_details}") return " | ".join(feedback_parts) if feedback_parts else None def should_retry_research(self, verification: VerificationResult, verification_report: str = None, feedback: Optional[str] = None) -> bool: """Determine if research should be retried.""" # Use structured fields first if verification.supported == "NO" or verification.relevant == "NO": return True if verification.confidence == "LOW" and ( verification.unsupported_claims or verification.contradictions ): return True if verification.supported == "PARTIAL" and verification.contradictions: return True # Also check verification_report string for extra signals (legacy/fallback) if verification_report: if "Supported: NO" in verification_report: logger.warning("[Re-Research] Answer not supported; triggering re-research.") return True elif "Relevant: NO" in verification_report: logger.warning("[Re-Research] Answer not relevant; triggering re-research.") return True elif "Confidence: LOW" in verification_report and "Supported: PARTIAL" in verification_report: logger.warning("[Re-Research] Low confidence with partial support; triggering re-research.") return True elif "Completeness: INCOMPLETE" in verification_report: logger.warning("[Re-Research] Answer is incomplete; triggering re-research.") return True elif "Completeness: PARTIAL" in verification_report: logger.warning("[Re-Research] Answer is partially complete; triggering re-research.") return True # Check feedback for contradiction/unsupported if feedback and ("contradiction" in feedback.lower() or "unsupported" in feedback.lower()): logger.warning("[Re-Research] Feedback indicates contradiction/unsupported; triggering re-research.") return True return False def check(self, answer: str, documents: List[Document], question: Optional[str] = None) -> Dict: """ Verify answer against provided documents. Args: answer: The answer to verify documents: Source documents for verification question: Optional original question Returns: Dict with verification report, context, and metadata """ logger.info(f"Verifying answer ({len(answer)} chars) against {len(documents)} documents") context = "\n\n".join([doc.page_content for doc in documents]) if len(context) > self.max_context_chars: logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}") context = context[:self.max_context_chars] prompt = self.generate_prompt(answer, context, question) try: logger.debug("Calling LLM for verification...") verification_result: VerificationResult = self.structured_llm.invoke(prompt) logger.info(f"Verification: {verification_result.supported} ({verification_result.confidence})") except Exception as e: logger.error(f"Structured output failed: {e}") try: response = self.llm.invoke(prompt) report = response.content if hasattr(response, "content") else str(response) verification_result = self._parse_unstructured_response(report.strip()) except Exception as fallback_error: logger.error(f"Fallback failed: {fallback_error}") verification_result = VerificationResult( supported="NO", confidence="LOW", relevant="NO", completeness="INCOMPLETE", additional_details=f"Verification failed: {str(e)}" ) verification_report = self.format_verification_report(verification_result) feedback = self.generate_feedback_for_research(verification_result) if feedback: logger.debug(f"Generated feedback: {feedback[:80]}...") return { "verification_report": verification_report, "context_used": context, "structured_result": verification_result.model_dump(), "should_retry": self.should_retry_research(verification_result, verification_report, feedback), "feedback": feedback } def select_best_answer( self, candidate_answers: List[str], documents: List[Document], question: str ) -> Dict: """ Select the best answer from multiple candidates based on verification criteria. Args: candidate_answers: List of candidate answers to evaluate documents: Source documents for verification question: The original question Returns: Dict with selected answer, index, reasoning, and verification details """ logger.info(f"Selecting best answer from {len(candidate_answers)} candidates") if len(candidate_answers) == 0: logger.warning("No candidate answers provided") return { "selected_answer": "No answers were generated.", "selected_index": -1, "reasoning": "No candidates available", "confidence": "LOW" } if len(candidate_answers) == 1: logger.info("Only one candidate, returning it directly") return { "selected_answer": candidate_answers[0], "selected_index": 0, "reasoning": "Only one candidate answer was provided", "confidence": "MEDIUM" } context = "\n\n".join([doc.page_content for doc in documents]) if len(context) > self.max_context_chars: logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}") context = context[:self.max_context_chars] candidates_text = "" for i, answer in enumerate(candidate_answers): candidates_text += f"\n**Candidate {i + 1}:**\n{answer}\n" prompt = f"""You are evaluating multiple candidate answers to select the best one. **Original Question:** {question} **Candidate Answers:** {candidates_text} **Source Context:** {context} **Evaluation Criteria:** 1. **Factual Accuracy**: Which answer is most accurately supported by the context? 2. **Completeness**: Which answer most thoroughly addresses the question? 3. **Relevance**: Which answer stays most focused on what was asked? 4. **No Contradictions**: Which answer has the fewest contradictions with the source? 5. **Clarity**: Which answer is clearest and most well-structured? Select the best answer by providing its index (0-based) and explain your reasoning.""" try: logger.debug("Calling LLM for best answer selection...") selection_result: BestAnswerSelection = self.selection_llm.invoke(prompt) selected_index = selection_result.selected_index if selected_index < 0 or selected_index >= len(candidate_answers): logger.warning(f"Invalid selection index {selected_index}, defaulting to 0") selected_index = 0 logger.info(f"Selected candidate {selected_index + 1} with {selection_result.confidence} confidence") return { "selected_answer": candidate_answers[selected_index], "selected_index": selected_index, "reasoning": selection_result.reasoning, "confidence": selection_result.confidence, "comparison_summary": selection_result.comparison_summary } except Exception as e: logger.error(f"Best answer selection failed: {e}") # Fallback: return the first candidate return { "selected_answer": candidate_answers[0], "selected_index": 0, "reasoning": f"Selection failed, using first candidate: {str(e)}", "confidence": "LOW" } def _parse_unstructured_response(self, response_text: str) -> VerificationResult: """Parse unstructured response into VerificationResult (fallback).""" try: data = { "supported": "NO", "confidence": "LOW", "unsupported_claims": [], "contradictions": [], "relevant": "NO", "completeness": "INCOMPLETE", "additional_details": "" } for line in response_text.split('\n'): if ':' not in line: continue key, value = line.split(':', 1) key = key.strip().lower().replace(' ', '_') value = value.strip().upper() if key == "supported": data["supported"] = "YES" if "YES" in value else ("PARTIAL" if "PARTIAL" in value else "NO") elif key == "confidence": data["confidence"] = "HIGH" if "HIGH" in value else ("MEDIUM" if "MEDIUM" in value else "LOW") elif key == "relevant": data["relevant"] = "YES" if "YES" in value else "NO" elif key == "completeness": if "COMPLETE" in value and "INCOMPLETE" not in value: data["completeness"] = "COMPLETE" elif "PARTIAL" in value: data["completeness"] = "PARTIAL" return VerificationResult(**data) except Exception as e: logger.error(f"Failed to parse response: {e}") return VerificationResult( supported="NO", confidence="LOW", relevant="NO", completeness="INCOMPLETE", additional_details="Failed to parse verification response" )