Maharshi Gor committed
Commit: d0ae1a9
Parent(s): eaa5563

Bugfix `logprob` in workflows.
Files changed:
- src/workflows/configs.py   +5 -0
- src/workflows/executors.py +1 -1
- src/workflows/llms.py      +8 -4
- src/workflows/qb_agents.py +10 -4
- src/workflows/structs.py   +34 -2
- src/workflows/utils.py     +1 -1
src/workflows/configs.py
CHANGED
@@ -8,9 +8,11 @@ including model configurations, workflow settings, and other package-wide constants
 AVAILABLE_MODELS = {
     "OpenAI/gpt-4o": {
         "model": "gpt-4o-2024-11-20",
+        "logprobs": True,
     },
     "OpenAI/gpt-4o-mini": {
         "model": "gpt-4o-mini-2024-07-18",
+        "logprobs": True,
     },
     "OpenAI/gpt-3.5-turbo": {
         "model": "gpt-3.5-turbo-0125",
@@ -26,12 +28,15 @@ AVAILABLE_MODELS = {
     },
     "Cohere/command-r": {
         "model": "command-r-08-2024",
+        "logprobs": True,
     },
     "Cohere/command-r-plus": {
         "model": "command-r-plus-08-2024",
+        "logprobs": True,
     },
     "Cohere/command-r7b": {
         "model": "command-r7b-12-2024",
+        "logprobs": False,
     },
 }
 
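Note: a minimal sketch of how the new per-model "logprobs" flag can be read downstream, assuming only the AVAILABLE_MODELS dict above; the supports_logprobs helper is hypothetical and not part of this commit.

    def supports_logprobs(model: str) -> bool:
        # Hypothetical helper: look up the "logprobs" flag added in this commit.
        # Missing flag or unknown model -> False.
        return AVAILABLE_MODELS.get(model, {}).get("logprobs", False)

    assert supports_logprobs("OpenAI/gpt-4o") is True        # flag set above
    assert supports_logprobs("Cohere/command-r7b") is False   # explicitly disabled above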
src/workflows/executors.py
CHANGED
@@ -231,7 +231,7 @@ def execute_model_step(
     if return_full_content:
         result["content"] = api_response["content"]
     if logprobs:
-        result["logprob"] = api_response
+        result["logprob"] = api_response.get("logprob")
     return result
 
 
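Note: the one-line fix above stops assigning the whole response object and uses .get("logprob"), which also degrades gracefully to None when the provider returned no logprobs. A tiny illustration with a made-up response dict:

    api_response = {"content": "Paris", "output": {"answer": "Paris"}}  # no "logprob" key

    result = {}
    result["logprob"] = api_response.get("logprob")  # None, rather than the entire response dict
    print(result)  # {'logprob': None}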
src/workflows/llms.py
CHANGED
@@ -129,8 +129,6 @@ def _llm_completion(
     Raises:
         ValueError: If logprobs=True with Anthropic models
     """
-    if model not in AVAILABLE_MODELS:
-        raise ValueError(f"Model {model} not supported")
     model_name = AVAILABLE_MODELS[model]["model"]
     provider = model.split("/")[0]
     if provider == "Cohere":
@@ -173,13 +171,19 @@ def completion(
     Raises:
         ValueError: If logprobs=True with Anthropic models
     """
+    if model not in AVAILABLE_MODELS:
+        raise ValueError(f"Model {model} not supported")
+    if logprobs and not AVAILABLE_MODELS[model].get("logprobs", False):
+        logger.warning(f"{model} does not support logprobs feature, setting logprobs to False")
+        logprobs = False
+
     # Check cache first
     cached_response = llm_cache.get(model, system, prompt, response_format, temperature)
-    if cached_response:
+    if cached_response and (not logprobs or cached_response.get("logprob")):
         logger.info(f"Cache hit for model {model}")
         return cached_response
 
-    logger.info(f"Cache miss for model {model}, calling API")
+    logger.info(f"Cache miss for model {model}, calling API. Logprobs: {logprobs}")
 
     # Continue with the original implementation for cache miss
     response = _llm_completion(model, system, prompt, response_format, temperature, logprobs)
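Note: with this change a cached entry is only reused when it can satisfy the request; if logprobs are wanted but the cached response carries none, the call falls through to the API. A small sketch of the gating condition, using plain dicts rather than the real llm_cache:

    def is_cache_hit(cached_response: dict | None, logprobs: bool) -> bool:
        # Reuse the cache unless logprobs are requested and the entry lacks one.
        return bool(cached_response) and (not logprobs or bool(cached_response.get("logprob")))

    assert is_cache_hit({"content": "..."}, logprobs=False) is True
    assert is_cache_hit({"content": "..."}, logprobs=True) is False     # refetch to obtain a logprob
    assert is_cache_hit({"content": "...", "logprob": -0.12}, logprobs=True) is True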
src/workflows/qb_agents.py
CHANGED
@@ -1,14 +1,18 @@
 import time
 from typing import Any, Iterable, TypedDict
 
+from loguru import logger
+
 from .executors import WorkflowOutput, execute_workflow
 from .structs import TossupWorkflow, Workflow
 
 
-def _get_workflow_response(workflow: Workflow, available_vars: dict[str, Any]) -> tuple[WorkflowOutput, float]:
+def _get_workflow_response(
+    workflow: Workflow, available_vars: dict[str, Any], logprob_step: bool | str = False
+) -> tuple[WorkflowOutput, float]:
     """Get response from executing a complete workflow."""
     start_time = time.time()
-    workflow_output = execute_workflow(workflow, available_vars, return_full_content=True)
+    workflow_output = execute_workflow(workflow, available_vars, return_full_content=True, logprob_step=logprob_step)
     response_time = time.time() - start_time
     return workflow_output, response_time
 
@@ -78,15 +82,17 @@ class QuizBowlTossupAgent:
         """
         for i, question_text in enumerate(question_runs):
             # Execute the complete workflow
+            answer_var_step = self.workflow.outputs["answer"].split(".")[0]
             workflow_output, response_time = _get_workflow_response(
-                self.workflow, {self.external_input_variable: question_text}
+                self.workflow, {self.external_input_variable: question_text}, logprob_step=answer_var_step
             )
             final_outputs = workflow_output["final_outputs"]
-            buzz = self.workflow.buzzer.run(final_outputs["confidence"], logprob=
+            buzz = self.workflow.buzzer.run(final_outputs["confidence"], logprob=workflow_output["logprob"])
             result: TossupResult = {
                 "position": i + 1,
                 "answer": final_outputs["answer"],
                 "confidence": final_outputs["confidence"],
+                "logprob": workflow_output["logprob"],
                 "buzz": buzz,
                 "question_fragment": question_text,
                 "step_contents": workflow_output["step_contents"],
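Note: the agent now derives the step whose logprob should be surfaced from the workflow's "answer" output mapping. Assuming outputs map output names to "step_id.field" targets (the values below are illustrative only):

    outputs = {"answer": "A.answer", "confidence": "A.confidence"}  # hypothetical mapping
    answer_var_step = outputs["answer"].split(".")[0]
    print(answer_var_step)  # "A" -- forwarded to execute_workflow as logprob_step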
src/workflows/structs.py
CHANGED
@@ -6,6 +6,8 @@ from typing import Any, Literal, Optional
 import numpy as np
 from pydantic import BaseModel, Field, model_validator
 
+from .configs import AVAILABLE_MODELS
+
 """
 Core data structures for defining workflows and their components.
 
@@ -259,6 +261,13 @@ class Workflow(BaseModel):
         """Get all model selections for all steps."""
         return {step_id: step.get_full_model_name() for step_id, step in self.steps.items()}
 
+    def get_output_model_selections(self) -> dict[str, str]:
+        """Get all output model selections for all steps."""
+        return {
+            output_var: target_var.split(".")[0] if target_var else None
+            for output_var, target_var in self.outputs.items()
+        }
+
     # Step update method
 
     def add_step(self, step: ModelStep) -> "Workflow":
@@ -305,14 +314,19 @@ class Buzzer(BaseModel):
         use_enum_values = True
         frozen = True
 
+    def update(self, **kwargs) -> "Buzzer":
+        """Update the buzzer with the given kwargs."""
+        return self.model_copy(update=kwargs)
+
     def run(self, confidence: float, prob: float | None = None, logprob: float | None = None) -> bool:
         """Run the buzzer logic."""
         if logprob is not None and prob is not None:
             raise ValueError("Cannot provide both logprob and prob")
-        if logprob is not None:
-            prob = np.exp(logprob)
         if self.prob_threshold is None:
             return confidence >= self.confidence_threshold
+        if logprob is None and prob is None:
+            raise ValueError("Must provide either logprob or prob if prob_threshold is not None")
+        prob = prob or float(np.exp(logprob))
         if self.method == BuzzerMethod.AND:
             return confidence >= self.confidence_threshold and prob >= self.prob_threshold
         elif self.method == BuzzerMethod.OR:
@@ -333,6 +347,24 @@ class TossupWorkflow(Workflow):
 
     buzzer: Buzzer
 
+    def get_answer_model(self, answer_var: str | None = None) -> str | None:
+        answer_var = answer_var or self.outputs["answer"]
+        if answer_var is None:
+            return None
+        step_id = answer_var.split(".")[0]
+        return self.steps[step_id].get_full_model_name()
+
+    def is_token_probs_supported(self, answer_var: str | None = None) -> bool:
+        model_name = self.get_answer_model(answer_var)
+        if model_name is None:
+            return True
+        return AVAILABLE_MODELS[model_name].get("logprobs", False)
+
     def update_buzzer(self, buzzer: Buzzer) -> "TossupWorkflow":
         """Update the buzzer."""
        return self.model_copy(update={"buzzer": buzzer})
+
+    def refresh_buzzer(self) -> "TossupWorkflow":
+        if not self.is_token_probs_supported():
+            return self.update_buzzer(self.buzzer.update(prob_threshold=None, method="AND"))
+        return self
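Note: Buzzer.run now requires a prob or logprob whenever prob_threshold is set, converting the logprob with exp. A worked example with made-up numbers (not tied to any real model output):

    import numpy as np

    confidence, logprob = 0.91, -0.105
    confidence_threshold, prob_threshold = 0.85, 0.80

    prob = float(np.exp(logprob))  # exp(-0.105) ~= 0.90
    buzz = confidence >= confidence_threshold and prob >= prob_threshold  # AND method
    print(round(prob, 3), buzz)    # 0.9 True

refresh_buzzer then keeps a workflow consistent with its model: if the answer step's model does not advertise logprob support in AVAILABLE_MODELS, the probability threshold is dropped and the method reset to "AND".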
src/workflows/utils.py
CHANGED
@@ -168,7 +168,7 @@ def topological_sort(dependencies: dict[str, set[str]]) -> list[str]:
 
     nodes = list(dependencies.keys())
     dependents: dict[str, list[str]] = {node: [] for node in nodes}
-    in_degree: dict[str, int] =
+    in_degree: dict[str, int] = dict.fromkeys(nodes, 0)
 
     # Calculate in-degrees and build dependents list
    for node, deps in dependencies.items():
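Note: dict.fromkeys(nodes, 0) completes the in-degree initialization so every node starts at zero before the counting loop. A quick check on a toy dependency graph:

    dependencies = {"a": set(), "b": {"a"}, "c": {"a", "b"}}
    nodes = list(dependencies.keys())
    in_degree = dict.fromkeys(nodes, 0)
    print(in_degree)  # {'a': 0, 'b': 0, 'c': 0}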