import logging
import random
from typing import Callable, List, Optional

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from genai import Client, Credentials
from genai.text.generation import TextGenerationParameters, TextGenerationReturnOptions
from langchain.prompts import PromptTemplate
from tqdm import tqdm

from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks

logger = logging.getLogger(__name__)

load_dotenv()
credentials = Credentials.from_env()
client = Client(credentials=credentials)
_INSTRUCTION = "Compare the two responses."
_RUBRIC = "Which is the better response?"

_PROMETHEUS_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, choose a better response between Response 1 and Response 2. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (1 or 2)"
4. Please do not generate any other opening, closing, and explanations.
###Instruction:
{instruction}
###Response 1:
{response_1}
###Response 2:
{response_2}
###Score Rubric:
{rubric}
###Feedback:
"""

template = PromptTemplate.from_template(_PROMETHEUS_PROMPT)

class LLMJudge:
    """
    Competing method based on an LLM-Judge (Prometheus).
    """

    def __init__(
        self,
        MODELS: List,
        true_ranking: Optional[List] = None,
        show_progress: Optional[bool] = True,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = prometheus
        self.true_ranking = true_ranking
        self.show_progress = show_progress

    def fit(self, df: pd.DataFrame):
        """
        df: DataFrame where each row is a benchmark instance,
        and there is one column with the output of each model.
        """
        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
        self.N = len(self.MODELS)
        y = np.empty((self.N, self.N))
        if self.show_progress:
            pbar = tqdm(total=self.N**2, position=0, leave=False, desc="Evaluations")
        for i, a in enumerate(self.MODELS):
            for j, b in enumerate(self.MODELS):
                if a == b:
                    y[i, j] = 0
                else:
                    y[i, j] = self.evaluate(client, format_instruction, a=a, b=b, df=df)
                if self.show_progress:
                    pbar.update(1)
        if self.show_progress:
            pbar.close()
        logger.debug(f"Win matrix:\n{y}")
        # Aggregate by total wins: row i sums the pairwise wins of model i.
        wins = pd.DataFrame({'wins': y.sum(axis=1)}, index=self.MODELS)
        wins = wins.sort_values(by='wins', ascending=False)
        self.ranking = wins.index.to_list()
        return self.ranking

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Score the estimated ranking against the true ranking, using
        rank-biased overlap ('rbo') or mean average precision at k ('mapk').
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")
        if not hasattr(self, 'ranking'):
            raise ValueError("Ranking not estimated. Run 'fit' first.")
        if self.true_ranking is None:
            raise ValueError("True ranking not available for metric calculation.")
        if metric == 'mapk':
            if k > len(self.true_ranking):
                logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
            actual = [self.true_ranking[:k]]
            pred = [self.ranking[:k]]
            return mapk(actual, pred, k=k)
        return rank_biased_overlap(self.true_ranking, self.ranking, p=p)

    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') and (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)

def format_instruction(x, a, b):
    """Build the Prometheus pairwise-comparison prompt for one benchmark row."""
    response1 = f"{x[a]}"
    response2 = f"{x[b]}"
    return template.format(
        instruction=_INSTRUCTION, response_1=response1, response_2=response2, rubric=_RUBRIC
    )
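
# Illustrative example (hypothetical row and column names, not taken from any benchmark):
# for x = {"model_a": "Paris is the capital of France.", "model_b": "It might be Lyon."},
# format_instruction(x, "model_a", "model_b") fills the Prometheus template with
# Response 1 = x["model_a"] and Response 2 = x["model_b"] under the fixed _INSTRUCTION and _RUBRIC.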

def prometheus(client: Client, formatter: Callable, a: str, b: str, df: pd.DataFrame) -> int:
    """
    Query the LLM-as-a-judge model Prometheus to compare the responses of model `a` and model `b`.
    client: the `genai` client (using BAM).
    formatter: function that takes a row of model outputs and builds the Prometheus instruction.
    a: name of model `a` to be evaluated (column in `df` with its responses).
    b: name of model `b` to be evaluated.
    df: DataFrame with the responses.
    Returns 1 if model `a` wins at least as many comparisons as model `b`, 0 otherwise.
    """
    parameters = TextGenerationParameters(
        max_new_tokens=500, return_options=TextGenerationReturnOptions(), random_seed=42
    )
    # Build one Prometheus prompt per benchmark row
    inst = df.apply(formatter, axis=1, args=(a, b))
    adf = df.copy(deep=True)
    results = []
    for response in client.text.generation.create(
        model_id="kaist-ai/prometheus-8x7b-v2",
        inputs=inst.values.tolist(),
        execution_options={"ordered": True, "concurrency_limit": 10},
        parameters=parameters,
    ):
        results.append(response.results[0])
    adf["generated_text"] = [r.generated_text for r in results]
    def _helper(x):
        # Prometheus ends its feedback with "[RESULT] 1" or "[RESULT] 2";
        # map 1 (Response 1, i.e. model `a`) to a win and 2 to a loss.
        try:
            result = int(x.split("[RESULT]")[1].strip().split()[0])
            return 1 if result == 1 else 0
        except (IndexError, ValueError):
            # Unparseable judgement: break the tie at random.
            return random.choice([0, 1])

    adf['A'] = adf["generated_text"].apply(_helper)
    n = adf.shape[0]
    a_wins = sum(adf['A'])
    b_wins = n - a_wins
    if a_wins >= b_wins:
        return 1
    else:
        return 0
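

# --- Usage sketch (illustrative only) ---
# A minimal example of how LLMJudge might be driven, assuming a benchmark DataFrame whose
# columns are model names and whose cells hold each model's response per instance.
# The model names, responses, and "true" ranking below are hypothetical placeholders,
# and running this requires valid genai (BAM) credentials in the environment.
if __name__ == "__main__":
    models = ["model_a", "model_b", "model_c"]  # hypothetical model names
    bench = pd.DataFrame(
        {
            "model_a": ["Paris", "4", "H2O"],  # hypothetical responses
            "model_b": ["The capital is Paris.", "four", "water (H2O)"],
            "model_c": ["London", "5", "CO2"],
        }
    )
    judge = LLMJudge(MODELS=models, true_ranking=["model_b", "model_a", "model_c"])
    ranking = judge.fit(bench)  # queries Prometheus for every ordered model pair
    print("Estimated ranking:", ranking)
    print("RBO vs. true ranking:", judge.measure(metric="rbo"))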