import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
from joblib import Parallel, delayed
from selfrank.algos.greedy import SelfRankGreedy
from selfrank.algos.iterative import SelfRank
from selfrank.algos.baseline import MCARank
from selfrank.algos.triplet import equality, rouge, noisy_equality
import matplotlib.pyplot as plt
from itertools import zip_longest
from uuid import uuid4
import csv
import os
from functools import partial
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def generate_data(max_acc, min_acc, nmodels, nanswers, nquestions) -> tuple[pd.DataFrame, list]:
    """Generate synthetic multiple-choice answers for `nmodels` models whose
    accuracies (in percent) are spread evenly between `min_acc` and `max_acc`."""
    np.random.seed(42)

    # Spread model accuracies between min and max (decreasing, so M1 is best)
    model_acc = np.linspace(max_acc, min_acc, nmodels)

    # Array to store the ground truth (column 0) and each model's answers
    gt_and_model_ans = np.zeros((nquestions, nmodels + 1), dtype=int)

    # Create ground-truth answers, i.e. the first column
    for i in range(nquestions):
        gt_and_model_ans[i][0] = np.random.randint(nanswers)

    # For each model, copy the ground truth on a fraction of questions matching
    # its target accuracy, and pick a random wrong answer elsewhere
    for i in range(nmodels):
        no_of_entries_frm_gt = np.ceil(model_acc[i] / 100 * nquestions).astype(int)
        offsets_to_match = np.random.permutation(nquestions)[:no_of_entries_frm_gt]
        for j in range(nquestions):
            if j in offsets_to_match:
                gt_and_model_ans[j][i + 1] = gt_and_model_ans[j][0]
            else:
                lst_wo_gt = list(range(nanswers))
                lst_wo_gt.remove(gt_and_model_ans[j][0])
                gt_and_model_ans[j][i + 1] = lst_wo_gt[np.random.randint(nanswers - 1)]

    # Round-trip through a temporary CSV file to obtain a typed DataFrame
    filename = str(uuid4())
    fields = ["GT"] + [f"M{i + 1}" for i in range(nmodels)]
    with open(filename, "w", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)              # header
        csvwriter.writerows(gt_and_model_ans)   # data rows
    df = pd.read_csv(filename)
    os.remove(filename)

    # Models are generated in decreasing order of accuracy, so the true ranking is M1, M2, ...
    true_ranking = [f"M{i}" for i in range(1, nmodels + 1)]
    return df, true_ranking
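

# --- Illustration (not part of the original app) -----------------------------
# A hypothetical sanity check, added as a hedged sketch: it calls generate_data()
# above and verifies that each model's empirical accuracy against the "GT" column
# roughly matches its linspace target. The app never invokes this helper.
def _demo_generate_data():
    df, true_ranking = generate_data(max_acc=90, min_acc=60, nmodels=4, nanswers=4, nquestions=200)
    print(df.columns.tolist())   # ['GT', 'M1', 'M2', 'M3', 'M4']
    print(true_ranking)          # ['M1', 'M2', 'M3', 'M4'] (M1 is most accurate)
    for m in true_ranking:
        acc = (df[m] == df["GT"]).mean() * 100
        print(f"{m}: {acc:.1f}% correct")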
def synth_executor(acc_range: tuple[float, float], nmodels, nanswers, nquestions, noise, method) -> tuple[str, str]:
    """Main execution flow for the synthetic-data experiments."""
    min_acc, max_acc = acc_range
    logger.info(
        f"Synth experiment: min_acc: {min_acc}, max_acc: {max_acc}, nmodels: {nmodels}, "
        f"nanswers: {nanswers}, nquestions: {nquestions}, noise: {noise}, method: {method}."
    )
    df, true_ranking = generate_data(max_acc, min_acc, nmodels, nanswers, nquestions)

    # Pairwise comparison function: exact equality, optionally corrupted with noise
    if noise == 0.0:
        comp = equality
    else:
        comp = partial(noisy_equality, p=noise)

    # Drop the ground truth; the ranker only sees the model answers
    df = df.drop(columns=["GT"])
    MODELS = df.columns.tolist()

    if method == "Full":
        ranker = SelfRank(MODELS, comp, true_ranking)
        ranker.fit(df)
    elif method == "Greedy":
        ranker = SelfRankGreedy(MODELS, comp, true_ranking)
        ranker.fit(df)
    elif method == "MCA":
        ranker = MCARank(MODELS, comp, true_ranking)
        ranker.fit(df, measure="noisy_equality", p=noise)
    else:
        raise ValueError(f"{method} not understood.")

    # Outputs of interest
    out = {
        "true_ranking": true_ranking,
        "estimated_ranking": ranker.ranking,
        "rbo": ranker.measure(metric="rbo"),
        "map-1": ranker.measure(metric="mapk", k=1),
        "map-3": ranker.measure(metric="mapk", k=3),
        "map-5": ranker.measure(metric="mapk", k=5),
        "map-10": ranker.measure(metric="mapk", k=10),
    }

    eval_metrics = (
        "<h2 style='color: purple;'> Evaluation measures </h2>"
        f"Rank-Biased Overlap: {out['rbo']:0.3f}<br>"
        f"MAP-3 : {out['map-3']:0.3f}<br>"
        f"MAP-5 : {out['map-5']:0.3f}<br>"
        f"MAP-10 : {out['map-10']:0.3f}."
    )

    out_plot = ranker.plot("synth")  # writes the figure to synth.png
    plt.close(out_plot)
    return "synth.png", eval_metrics
def benchmark_executor(data, mmlu_subject, evaluation, nmodels, nrows, method) -> tuple[pd.DataFrame, str, str]:
    """Main execution flow for benchmarks."""
    logger.info(
        f"Benchmark experiment: benchmark: {data}, mmlu subject: {mmlu_subject}, "
        f"evaluation: {evaluation}, nmodels: {nmodels}, nquestions: {nrows}, method: {method}."
    )
    seed = 40
    np.random.seed(seed)

    match data:
        case "MMLU":
            adf = pd.read_pickle(f"data/mmlu_subject_{mmlu_subject}.pkl")
        case "CNN/DM":
            adf = pd.read_pickle("data/cnndm.pkl")
        case "XSUM":
            adf = pd.read_pickle("data/xsum.pkl")
        case _:
            raise ValueError(f"'{data}' not understood.")

    MODELS = adf.model.unique().tolist()

    # Sample fewer models if so needed
    if nmodels != "All":
        if nmodels < len(MODELS):
            MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
            adf = adf[adf.model.isin(MODELS)]

    match data:
        case "MMLU":
            keys = ["id", "trial_id", "perturbation"]  # MMLU has this extra parameter
        case "CNN/DM" | "XSUM":
            keys = ["id", "trial_id"]
        case _:
            pass  # unreachable: invalid benchmarks were rejected above

    # One row per question, one column per model
    df = adf.pivot_table(
        columns="model",
        index=keys,
        values="output",
        aggfunc="first",
    )

    # Filter by number of rows
    df.dropna(inplace=True)
    if nrows != "All":
        if nrows < df.shape[0]:
            df = df.sample(nrows, random_state=seed)

    # Align the long-format frame with the sampled questions, then compute the true ranking
    adf = adf.set_index(keys).loc[df.index].reset_index()

    if evaluation == "Rouge":

        def __true_rouge(x, scorer):
            return scorer.score(x["reference"], x["output"])["rouge2"].fmeasure

        scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
        adf["rouge"] = Parallel(n_jobs=-1, batch_size=128)(
            delayed(__true_rouge)(i, scorer) for _, i in adf.iterrows()
        )

        # Rank by "win rates": for each question, the model with the best
        # ROUGE-2 score wins; models are ordered by their number of wins
        idx = adf.groupby(["id", "trial_id"])["rouge"].idxmax()
        win_rates = adf.loc[idx].model.value_counts()
        win_rate_rank = win_rates.index.tolist()

        # Models with no wins go at the bottom
        no_wins = list(set(MODELS) - set(win_rate_rank))
        true_ranking = win_rate_rank + no_wins
        evaluator = rouge

    elif evaluation == "Equality":
        # Multiple choice: rank models by exact-match accuracy between the
        # LLM response and the reference value
        adf["C"] = (adf.output == adf.reference).astype(int)
        true_ranking = (
            adf.groupby("model")["C"]
            .apply(lambda x: sum(x) / len(x))
            .sort_values(ascending=False)
            .index.tolist()
        )
        evaluator = equality

    else:
        raise ValueError(f"'{evaluation}' not understood.")

    match method:
        case "Full":
            ranker = SelfRank(MODELS, evaluator, true_ranking)
        case "Greedy":
            ranker = SelfRankGreedy(MODELS, evaluator, true_ranking)
        case "MCA":
            raise NotImplementedError
        case _:
            raise ValueError(f"'{method}' not understood.")

    # Generate outputs
    ranker.fit(df)
    ranks = ranker.ranking

    # Decorate the top three models with medals
    ranks = [
        j + i for i, j in zip_longest(ranks, ["🥇 ", "🥈 ", "🥉 "], fillvalue="")
    ]
    out_df = pd.DataFrame({"rank": range(1, len(true_ranking) + 1), "model": ranks})

    out_metrics = {
        "rbo": ranker.measure(metric="rbo"),
        "map-1": ranker.measure(metric="mapk", k=1),
        "map-3": ranker.measure(metric="mapk", k=3),
        "map-5": ranker.measure(metric="mapk", k=5),
        "map-10": ranker.measure(metric="mapk", k=10),
        "evaluations": evaluator.calls,
    }
    eval_metrics = (
        "<h2 style='color: purple;'> Evaluation measures </h2>"
        f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
        f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
        f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
        f"MAP-10 : {out_metrics['map-10']:0.3f}."
    )

    out_plot = ranker.plot()  # writes the figure to output.png
    plt.close(out_plot)
    return out_df, "output.png", eval_metrics
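

# --- Usage sketch (not part of the original app) ------------------------------
# A minimal, hedged example of driving the executors directly; the Space
# presumably calls them from its UI instead. The argument values below are
# illustrative only, and benchmark_executor additionally expects the pickled
# benchmark files (e.g. data/cnndm.pkl) to be present.
if __name__ == "__main__":
    # Synthetic experiment: 5 models with accuracies spread over 60-90%,
    # noiseless comparisons, full tournament ranking.
    plot_path, metrics_html = synth_executor(
        acc_range=(60.0, 90.0),
        nmodels=5,
        nanswers=4,
        nquestions=100,
        noise=0.0,
        method="Full",
    )
    print(plot_path)
    print(metrics_html)

    # Benchmark experiment (requires data/cnndm.pkl):
    # ranking_df, plot_path, metrics_html = benchmark_executor(
    #     data="CNN/DM", mmlu_subject=None, evaluation="Rouge",
    #     nmodels="All", nrows=100, method="Greedy",
    # )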