import logging
import random
from typing import Callable, List, Optional

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from genai import Client, Credentials
from genai.text.generation import TextGenerationParameters, TextGenerationReturnOptions
from langchain.prompts import PromptTemplate
from tqdm import tqdm

from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks

logger = logging.getLogger(__name__)

load_dotenv()
credentials = Credentials.from_env()
client = Client(credentials=credentials)
_INSTRUCTION = "Compare the two responses."
_RUBRIC = "Which is the better response?"

_PROMETHEUS_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, choose a better response between Response 1 and Response 2. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (1 or 2)"
4. Please do not generate any other opening, closing, and explanations.
###Instruction:
{instruction}
###Response 1:
{response_1}
###Response 2:
{response_2}
###Score Rubric:
{rubric}
###Feedback:
"""

template = PromptTemplate.from_template(_PROMETHEUS_PROMPT)

class LLMJudge:
    """
    Competing method based on an LLM-Judge (Prometheus).
    """

    def __init__(
        self,
        MODELS: List,
        true_ranking: Optional[List] = None,
        show_progress: Optional[bool] = True,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = prometheus
        self.true_ranking = true_ranking
        self.show_progress = show_progress

    def fit(self, df: pd.DataFrame):
        """
        df: DataFrame where each row is a benchmark instance,
        and there is one column with the output of each model.
        """
        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
        self.N = len(self.MODELS)
        y = np.empty((self.N, self.N))
        if self.show_progress:
            pbar = tqdm(total=self.N**2, position=0, leave=False, desc="Evaluations")
        for i, a in enumerate(self.MODELS):
            for j, b in enumerate(self.MODELS):
                if a == b:
                    y[i, j] = 0
                else:
                    y[i, j] = self.evaluate(client, format_instruction, a=a, b=b, df=df)
                if self.show_progress:
                    pbar.update(1)
        if self.show_progress:
            pbar.close()
        logger.debug(f"Win matrix:\n{y}")
        # Aggregate by total wins: row i sums the pairwise wins of model i.
        wins = pd.DataFrame({'wins': y.sum(axis=1)}, index=self.MODELS)
        wins = wins.sort_values(by='wins', ascending=False)
        self.ranking = wins.index.to_list()
        return self.ranking

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Score the estimated ranking against the true ranking, using
        rank-biased overlap ('rbo') or mean average precision at k ('mapk').
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")
        if not hasattr(self, 'ranking'):
            raise ValueError("Ranking not estimated. Run 'fit' first.")
        if self.true_ranking is None:
            raise ValueError("True ranking not available for metric calculation.")
        if metric == 'mapk':
            if k > len(self.true_ranking):
                logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
            actual = [self.true_ranking[:k]]
            pred = [self.ranking[:k]]
            return mapk(actual, pred, k=k)
        return rank_biased_overlap(self.true_ranking, self.ranking, p=p)

    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') and (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)

def format_instruction(x, a, b):
    """Build the Prometheus pairwise-comparison prompt for one benchmark row."""
    response1 = f"{x[a]}"
    response2 = f"{x[b]}"
    return template.format(
        instruction=_INSTRUCTION, response_1=response1, response_2=response2, rubric=_RUBRIC
    )
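
# Illustrative example (hypothetical row and column names, not taken from any benchmark):
# for x = {"model_a": "Paris is the capital of France.", "model_b": "It might be Lyon."},
# format_instruction(x, "model_a", "model_b") fills the Prometheus template with
# Response 1 = x["model_a"] and Response 2 = x["model_b"] under the fixed _INSTRUCTION and _RUBRIC.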

def prometheus(client: Client, formatter: Callable, a: str, b: str, df: pd.DataFrame) -> int:
    """
    Query the LLM-as-a-judge model Prometheus to compare the responses of model `a` and model `b`.
    client: the `genai` client (using BAM).
    formatter: function that takes a row of model outputs and builds the Prometheus instruction.
    a: name of model `a` to be evaluated (column in `df` with its responses).
    b: name of model `b` to be evaluated.
    df: DataFrame with the responses.
    Returns 1 if model `a` wins at least as many comparisons as model `b`, 0 otherwise.
    """
    parameters = TextGenerationParameters(
        max_new_tokens=500, return_options=TextGenerationReturnOptions(), random_seed=42
    )
    # Build one Prometheus prompt per benchmark row
    inst = df.apply(formatter, axis=1, args=(a, b))
    adf = df.copy(deep=True)
    results = []
    for response in client.text.generation.create(
        model_id="kaist-ai/prometheus-8x7b-v2",
        inputs=inst.values.tolist(),
        execution_options={"ordered": True, "concurrency_limit": 10},
        parameters=parameters,
    ):
        results.append(response.results[0])
    adf["generated_text"] = [r.generated_text for r in results]
    def _helper(x):
        # Prometheus ends its feedback with "[RESULT] 1" or "[RESULT] 2";
        # map 1 (Response 1, i.e. model `a`) to a win and 2 to a loss.
        try:
            result = int(x.split("[RESULT]")[1].strip().split()[0])
            return 1 if result == 1 else 0
        except (IndexError, ValueError):
            # Unparseable judgement: break the tie at random.
            return random.choice([0, 1])

    adf['A'] = adf["generated_text"].apply(_helper)
    n = adf.shape[0]
    a_wins = sum(adf['A'])
    b_wins = n - a_wins
    if a_wins >= b_wins:
        return 1
    else:
        return 0
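

# --- Usage sketch (illustrative only) ---
# A minimal example of how LLMJudge might be driven, assuming a benchmark DataFrame whose
# columns are model names and whose cells hold each model's response per instance.
# The model names, responses, and "true" ranking below are hypothetical placeholders,
# and running this requires valid genai (BAM) credentials in the environment.
if __name__ == "__main__":
    models = ["model_a", "model_b", "model_c"]  # hypothetical model names
    bench = pd.DataFrame(
        {
            "model_a": ["Paris", "4", "H2O"],  # hypothetical responses
            "model_b": ["The capital is Paris.", "four", "water (H2O)"],
            "model_c": ["London", "5", "CO2"],
        }
    )
    judge = LLMJudge(MODELS=models, true_ranking=["model_b", "model_a", "model_c"])
    ranking = judge.fit(bench)  # queries Prometheus for every ordered model pair
    print("Estimated ranking:", ranking)
    print("RBO vs. true ranking:", judge.measure(metric="rbo"))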