Spaces:
Runtime error
Runtime error
| """ | |
| Compute agreement among judges. | |
| Usage: | |
| python compute_agreement.py --judges gpt4-pair human --votefiles human_judgments.json gpt4_pair_judgments.json | |
| python compute_agreement.py --judges human human --votefiles human_judgments.json | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import numpy as np | |
def get_judge_name(judge):
    """Map the raw ``judge`` field of a vote record to a canonical judge name.

    Args:
        judge: Either a list whose first two items are a model name and a
            prompt name (e.g. ``["gpt-4", "pair-v2"]``), or a string
            identifier (e.g. one starting with ``"expert"`` or ``"author"``).

    Returns:
        ``"gpt4-pair"`` for a list judge whose model is ``"gpt-4"`` and whose
        prompt name starts with ``"pair"``; ``"human"`` for a string starting
        with ``"expert"``; ``"author"`` for a string starting with
        ``"author"``; ``None`` for anything else.
    """
    if isinstance(judge, list):
        # Lists have no ``startswith``; an unrecognised list judge must not
        # fall through to the string checks below (it used to crash there).
        if (
            len(judge) >= 2
            and judge[0] == "gpt-4"
            and isinstance(judge[1], str)
            and judge[1].startswith("pair")
        ):
            return "gpt4-pair"
        return None
    if not isinstance(judge, str):
        return None
    if judge.startswith("expert"):
        return "human"
    if judge.startswith("author"):
        return "author"
    # Unknown string judges are reported as None, same as before.
    return None
def revert(vote):
    """Swap ``"model_a"`` and ``"model_b"``; return any other vote unchanged."""
    for side, opposite in (("model_a", "model_b"), ("model_b", "model_a")):
        if vote == side:
            return opposite
    return vote
def get_mt_bench_votes_data(raw_votes):
    """Group votes by turn, by (question, model pair), and by judge.

    Args:
        raw_votes: Iterable of vote collections. Each vote is a mapping with
            the keys ``"turn"``, ``"question_id"``, ``"model_a"``,
            ``"model_b"``, ``"winner"`` and ``"judge"``.

    Returns:
        A two-element list indexed by ``turn - 1``. Each element maps
        ``(question_id, model_x, model_y)`` to a dict from judge name to the
        list of winners recorded for that comparison. When ``model_a`` is not
        strictly less than ``model_b`` the pair is stored swapped and the
        winner is reverted, so both orderings share one key.
    """
    # NOTE(review): only two turn slots exist, so "turn" is presumably
    # always 1 or 2 -- confirm against the vote files.
    per_turn = [{}, {}]
    for judge_votes in raw_votes:
        for vote in judge_votes:
            turn_idx = vote["turn"] - 1
            first, second = vote["model_a"], vote["model_b"]
            if first < second:
                pair_key = (vote["question_id"], first, second)
                outcome = vote["winner"]
            else:
                # Canonicalise the pair order and flip the winner to match.
                pair_key = (vote["question_id"], second, first)
                outcome = revert(vote["winner"])
            judge_name = get_judge_name(vote["judge"])
            by_judge = per_turn[turn_idx].setdefault(pair_key, {})
            by_judge.setdefault(judge_name, []).append(outcome)
    return per_turn
def convertvote(vote):
    """Collapse any vote containing ``"tie"`` to ``"tie"``; pass others through."""
    return "tie" if "tie" in vote else vote
def equalvote(vote1, vote2):
    """Return whether two votes agree, treating every kind of tie as equal."""
    both_ties = "tie" in vote1 and "tie" in vote2
    return True if both_ties else vote1 == vote2
# data: Dict[comparison key -> Dict[judge name -> List[vote]]]
def get_mt_bench_agreement(data, judge1, judge2, ban):
    """Count agreeing vote pairs between two judges.

    Args:
        data: Mapping whose values are dicts from judge name to a list of
            votes for one comparison.
        judge1: First judge name; either one starting with ``"gpt4"`` or
            ``"human"``.
        judge2: Second judge name; must be ``"human"``.
        ban: Container of converted votes to exclude (e.g. ``["tie"]``). A
            pair is skipped when either vote, after ``convertvote``, is in it.

    Returns:
        ``(agree, total)``: the number of agreeing pairs and the number of
        pairs compared.

    Raises:
        Exception: If the judge combination is not supported.
    """
    agreements = 0
    comparisons = 0

    if judge1.startswith("gpt4") and judge2 == "human":
        for per_judge in data.values():
            if not (judge1 in per_judge and judge2 in per_judge):
                continue
            model_votes = per_judge[judge1]
            assert len(model_votes) == 1
            model_vote = model_votes[0]
            if convertvote(model_vote) in ban:
                continue
            # Compare the single model vote against every human vote.
            for human_vote in per_judge[judge2]:
                if convertvote(human_vote) not in ban:
                    comparisons += 1
                    agreements += equalvote(model_vote, human_vote)
        return agreements, comparisons

    if judge1 == "human" and judge2 == "human":
        for per_judge in data.values():
            if "human" not in per_judge:
                continue
            human_votes = per_judge["human"]
            # Visit every unordered pair of human votes exactly once.
            for idx, first in enumerate(human_votes):
                for second in human_votes[idx + 1:]:
                    if (
                        convertvote(first) not in ban
                        and convertvote(second) not in ban
                    ):
                        comparisons += 1
                        agreements += equalvote(first, second)
        return agreements, comparisons

    raise Exception("Unsupported judges.")
def run_mt_bench_agreement(judges, votefiles):
    """Load vote files and print judge agreement for both turns.

    For each of turn 1 and turn 2, prints one line with ties included and one
    line with ties excluded, giving the number of compared pairs, the number
    of agreeing pairs, and their ratio.

    Args:
        judges: Sequence whose first two items are the judge names passed to
            ``get_mt_bench_agreement``.
        votefiles: Paths of JSON files; each file's parsed content is one
            entry of the list handed to ``get_mt_bench_votes_data``.
    """
    # raw_votes[i]: parsed content of votefiles[i]
    raw_votes = []
    for filename in votefiles:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(filename, "r", encoding="utf-8") as f:
            raw_votes.append(json.load(f))

    data = get_mt_bench_votes_data(raw_votes)

    for turn_idx in (0, 1):
        for label, ban in (("with tie", []), ("without tie", ["tie"])):
            agree, total = get_mt_bench_agreement(
                data[turn_idx], judges[0], judges[1], ban=ban
            )
            # With no comparable votes the ratio is undefined; report nan
            # instead of crashing with ZeroDivisionError.
            ratio = agree / total if total else float("nan")
            print(
                f"turn {turn_idx + 1} {label}. "
                f"#total: {total}, #agree: {agree}, ratio: {ratio:.2f}"
            )
if __name__ == "__main__":
    # Command-line entry point: two judge names plus one or more vote files.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--judges",
        type=str,
        nargs=2,
        default=["gpt4-pair", "human"],
    )
    cli.add_argument(
        "--votefiles",
        type=str,
        nargs="+",
        default=["gpt4_judgments.json", "human_judgments.json"],
    )
    options = cli.parse_args()
    run_mt_bench_agreement(options.judges, options.votefiles)