"""
Verification agent module for answer validation against source documents.
"""
from typing import Dict, List, Optional, Literal
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel, Field
import logging
from configuration.parameters import parameters
logger = logging.getLogger(__name__)
class VerificationResult(BaseModel):
"""Structured output model for verification results."""
supported: Literal["YES", "NO", "PARTIAL"] = Field(
description="Whether the answer is supported by the context"
)
confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
default="MEDIUM",
description="Confidence level in the verification result"
)
unsupported_claims: List[str] = Field(
default_factory=list,
description="Claims not supported by context"
)
contradictions: List[str] = Field(
default_factory=list,
description="Contradictions between answer and context"
)
relevant: Literal["YES", "NO"] = Field(
description="Whether the answer is relevant to the question"
)
completeness: Literal["COMPLETE", "PARTIAL", "INCOMPLETE"] = Field(
default="PARTIAL",
description="How completely the answer addresses the question"
)
additional_details: str = Field(
default="",
description="Additional explanations and reasoning"
)
class BestAnswerSelection(BaseModel):
"""Structured output model for selecting the best answer from candidates."""
selected_index: int = Field(
description="The index (0-based) of the best answer from the candidates list"
)
reasoning: str = Field(
description="Explanation of why this answer was selected as the best"
)
confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
default="MEDIUM",
description="Confidence level in the selection"
)
comparison_summary: str = Field(
default="",
description="Brief comparison of the candidate answers"
)
class VerificationAgent:
"""Agent for verifying answers against source documents."""
def __init__(
self,
llm: Optional[ChatGoogleGenerativeAI] = None,
        max_context_chars: Optional[int] = None,
        max_output_tokens: Optional[int] = None,
) -> None:
"""Initialize the verification agent."""
logger.info("Initializing VerificationAgent...")
self.max_context_chars = max_context_chars or parameters.VERIFICATION_MAX_CONTEXT_CHARS
self.max_output_tokens = max_output_tokens or parameters.VERIFICATION_MAX_OUTPUT_TOKENS
base_llm = llm or ChatGoogleGenerativeAI(
model=parameters.VERIFICATION_AGENT_MODEL,
google_api_key=parameters.GOOGLE_API_KEY,
temperature=0,
max_output_tokens=self.max_output_tokens,
)
self.llm = base_llm
self.structured_llm = base_llm.with_structured_output(VerificationResult)
self.selection_llm = base_llm.with_structured_output(BestAnswerSelection)
logger.info(f"VerificationAgent initialized (model={parameters.VERIFICATION_AGENT_MODEL})")
def generate_prompt(self, answer: str, context: str, question: Optional[str] = None) -> str:
"""Generate verification prompt."""
question_section = f"\n**Original Question:** {question}\n" if question else ""
return f"""Verify the following answer against the provided context.
**Check for:**
1. Factual support (YES/NO/PARTIAL)
2. Confidence level (HIGH/MEDIUM/LOW)
3. Unsupported claims
4. Contradictions
5. Relevance to question
6. Completeness (COMPLETE/PARTIAL/INCOMPLETE)
**Scoring:**
- HIGH: All claims directly stated, no ambiguity
- MEDIUM: Most claims supported, some inferred
- LOW: Significant claims unsupported
{question_section}
**Answer to Verify:**
{answer}
**Context:**
{context}
Provide your verification analysis."""
def format_verification_report(self, verification: VerificationResult) -> str:
"""Format verification result into readable report."""
report = f"**Supported:** {verification.supported}\n"
report += f"**Confidence:** {verification.confidence}\n"
report += f"**Unsupported Claims:** {', '.join(verification.unsupported_claims) or 'None'}\n"
report += f"**Contradictions:** {', '.join(verification.contradictions) or 'None'}\n"
report += f"**Relevant:** {verification.relevant}\n"
report += f"**Completeness:** {verification.completeness}\n"
report += f"**Additional Details:** {verification.additional_details or 'None'}\n"
return report
def generate_feedback_for_research(self, verification: VerificationResult) -> Optional[str]:
"""Generate feedback for research agent if improvements needed."""
feedback_parts = []
if verification.supported == "NO":
feedback_parts.append("Answer lacks sufficient support from documents.")
elif verification.supported == "PARTIAL":
feedback_parts.append("Some parts are not well supported.")
if verification.unsupported_claims:
claims_str = "; ".join(verification.unsupported_claims[:3])
feedback_parts.append(f"Unsupported: {claims_str}")
if verification.contradictions:
contradictions_str = "; ".join(verification.contradictions[:3])
feedback_parts.append(f"Contradictions: {contradictions_str}")
if verification.completeness == "INCOMPLETE":
feedback_parts.append("Answer is incomplete.")
if verification.confidence == "LOW":
feedback_parts.append("Focus on directly verifiable claims.")
# Always add additional_details if present, even if other feedback exists
if verification.additional_details:
feedback_parts.append(f"Additional Details: {verification.additional_details}")
return " | ".join(feedback_parts) if feedback_parts else None
    def should_retry_research(
        self,
        verification: VerificationResult,
        verification_report: Optional[str] = None,
        feedback: Optional[str] = None,
    ) -> bool:
"""Determine if research should be retried."""
# Use structured fields first
if verification.supported == "NO" or verification.relevant == "NO":
return True
if verification.confidence == "LOW" and (
verification.unsupported_claims or verification.contradictions
):
return True
if verification.supported == "PARTIAL" and verification.contradictions:
return True
        # Also check the verification report string for extra signals (legacy/fallback).
        # The formatted report uses markdown bold markers, so strip them before matching.
        if verification_report:
            normalized_report = verification_report.replace("**", "")
            if "Supported: NO" in normalized_report:
                logger.warning("[Re-Research] Answer not supported; triggering re-research.")
                return True
            elif "Relevant: NO" in normalized_report:
                logger.warning("[Re-Research] Answer not relevant; triggering re-research.")
                return True
            elif "Confidence: LOW" in normalized_report and "Supported: PARTIAL" in normalized_report:
                logger.warning("[Re-Research] Low confidence with partial support; triggering re-research.")
                return True
            elif "Completeness: INCOMPLETE" in normalized_report:
                logger.warning("[Re-Research] Answer is incomplete; triggering re-research.")
                return True
            elif "Completeness: PARTIAL" in normalized_report:
                logger.warning("[Re-Research] Answer is partially complete; triggering re-research.")
                return True
# Check feedback for contradiction/unsupported
if feedback and ("contradiction" in feedback.lower() or "unsupported" in feedback.lower()):
logger.warning("[Re-Research] Feedback indicates contradiction/unsupported; triggering re-research.")
return True
return False
def check(self, answer: str, documents: List[Document], question: Optional[str] = None) -> Dict:
"""
Verify answer against provided documents.
Args:
answer: The answer to verify
documents: Source documents for verification
question: Optional original question
Returns:
Dict with verification report, context, and metadata
"""
logger.info(f"Verifying answer ({len(answer)} chars) against {len(documents)} documents")
context = "\n\n".join([doc.page_content for doc in documents])
if len(context) > self.max_context_chars:
logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
context = context[:self.max_context_chars]
prompt = self.generate_prompt(answer, context, question)
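        # Verification strategy: try structured output first, fall back to a plain
        # LLM call parsed heuristically, and finally return a conservative NO/LOW
        # result if both attempts fail.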
try:
logger.debug("Calling LLM for verification...")
verification_result: VerificationResult = self.structured_llm.invoke(prompt)
logger.info(f"Verification: {verification_result.supported} ({verification_result.confidence})")
except Exception as e:
logger.error(f"Structured output failed: {e}")
try:
response = self.llm.invoke(prompt)
report = response.content if hasattr(response, "content") else str(response)
verification_result = self._parse_unstructured_response(report.strip())
except Exception as fallback_error:
logger.error(f"Fallback failed: {fallback_error}")
verification_result = VerificationResult(
supported="NO",
confidence="LOW",
relevant="NO",
completeness="INCOMPLETE",
additional_details=f"Verification failed: {str(e)}"
)
verification_report = self.format_verification_report(verification_result)
feedback = self.generate_feedback_for_research(verification_result)
if feedback:
logger.debug(f"Generated feedback: {feedback[:80]}...")
return {
"verification_report": verification_report,
"context_used": context,
"structured_result": verification_result.model_dump(),
"should_retry": self.should_retry_research(verification_result, verification_report, feedback),
"feedback": feedback
}
def select_best_answer(
self,
candidate_answers: List[str],
documents: List[Document],
question: str
) -> Dict:
"""
Select the best answer from multiple candidates based on verification criteria.
Args:
candidate_answers: List of candidate answers to evaluate
documents: Source documents for verification
question: The original question
Returns:
Dict with selected answer, index, reasoning, and verification details
"""
logger.info(f"Selecting best answer from {len(candidate_answers)} candidates")
if len(candidate_answers) == 0:
logger.warning("No candidate answers provided")
return {
"selected_answer": "No answers were generated.",
"selected_index": -1,
"reasoning": "No candidates available",
"confidence": "LOW"
}
if len(candidate_answers) == 1:
logger.info("Only one candidate, returning it directly")
return {
"selected_answer": candidate_answers[0],
"selected_index": 0,
"reasoning": "Only one candidate answer was provided",
"confidence": "MEDIUM"
}
context = "\n\n".join([doc.page_content for doc in documents])
if len(context) > self.max_context_chars:
logger.debug(f"Context truncated: {len(context)} -> {self.max_context_chars}")
context = context[:self.max_context_chars]
candidates_text = ""
for i, answer in enumerate(candidate_answers):
            candidates_text += f"\n**Candidate {i + 1} (index {i}):**\n{answer}\n"
prompt = f"""You are evaluating multiple candidate answers to select the best one.
**Original Question:** {question}
**Candidate Answers:**
{candidates_text}
**Source Context:**
{context}
**Evaluation Criteria:**
1. **Factual Accuracy**: Which answer is most accurately supported by the context?
2. **Completeness**: Which answer most thoroughly addresses the question?
3. **Relevance**: Which answer stays most focused on what was asked?
4. **No Contradictions**: Which answer has the fewest contradictions with the source?
5. **Clarity**: Which answer is clearest and most well-structured?
Select the best answer by providing its 0-based index (shown next to each candidate) and explain your reasoning."""
try:
logger.debug("Calling LLM for best answer selection...")
selection_result: BestAnswerSelection = self.selection_llm.invoke(prompt)
selected_index = selection_result.selected_index
if selected_index < 0 or selected_index >= len(candidate_answers):
logger.warning(f"Invalid selection index {selected_index}, defaulting to 0")
selected_index = 0
logger.info(f"Selected candidate {selected_index + 1} with {selection_result.confidence} confidence")
return {
"selected_answer": candidate_answers[selected_index],
"selected_index": selected_index,
"reasoning": selection_result.reasoning,
"confidence": selection_result.confidence,
"comparison_summary": selection_result.comparison_summary
}
except Exception as e:
logger.error(f"Best answer selection failed: {e}")
# Fallback: return the first candidate
return {
"selected_answer": candidate_answers[0],
"selected_index": 0,
"reasoning": f"Selection failed, using first candidate: {str(e)}",
"confidence": "LOW"
}
def _parse_unstructured_response(self, response_text: str) -> VerificationResult:
"""Parse unstructured response into VerificationResult (fallback)."""
try:
data = {
"supported": "NO",
"confidence": "LOW",
"unsupported_claims": [],
"contradictions": [],
"relevant": "NO",
"completeness": "INCOMPLETE",
"additional_details": ""
}
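            # Only the categorical fields are recovered below; unsupported_claims and
            # contradictions stay empty because free-form lists are unreliable to parse
            # from unstructured text.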
for line in response_text.split('\n'):
if ':' not in line:
continue
                key, value = line.split(':', 1)
                # Strip markdown decoration (bold markers, bullets) before normalizing the key.
                key = key.strip().strip("*-# ").lower().replace(' ', '_')
                value = value.strip().upper()
if key == "supported":
data["supported"] = "YES" if "YES" in value else ("PARTIAL" if "PARTIAL" in value else "NO")
elif key == "confidence":
data["confidence"] = "HIGH" if "HIGH" in value else ("MEDIUM" if "MEDIUM" in value else "LOW")
elif key == "relevant":
data["relevant"] = "YES" if "YES" in value else "NO"
elif key == "completeness":
if "COMPLETE" in value and "INCOMPLETE" not in value:
data["completeness"] = "COMPLETE"
elif "PARTIAL" in value:
data["completeness"] = "PARTIAL"
return VerificationResult(**data)
except Exception as e:
logger.error(f"Failed to parse response: {e}")
return VerificationResult(
supported="NO",
confidence="LOW",
relevant="NO",
completeness="INCOMPLETE",
additional_details="Failed to parse verification response"
)
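

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's logic: it assumes that
    # configuration.parameters supplies a valid GOOGLE_API_KEY and verification
    # model, and the document/answer text below is purely illustrative.
    sample_docs = [
        Document(page_content="The Eiffel Tower is 330 metres tall and located in Paris."),
    ]
    agent = VerificationAgent()

    # Verify a single answer against the source documents.
    result = agent.check(
        answer="The Eiffel Tower is 330 metres tall.",
        documents=sample_docs,
        question="How tall is the Eiffel Tower?",
    )
    print(result["verification_report"])
    print("Retry research:", result["should_retry"])

    # Pick the best of several candidate answers using the same sources.
    selection = agent.select_best_answer(
        candidate_answers=[
            "It is 330 metres tall.",
            "It stands in Paris and is roughly 300 metres tall.",
        ],
        documents=sample_docs,
        question="How tall is the Eiffel Tower?",
    )
    print("Selected:", selection["selected_answer"])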