import json
import gradio as gr
import pandas as pd
from statistics import median

print("Loading datasets...")


# =============================================================================
def add_rank(df, compute_average=True):
    # Sort by the average of the metric columns when compute_average is set
    # (and there is more than one metric column), otherwise by the first
    # metric column, then prepend a 1-based "Rank" column.
    cols_to_rank = [
        col for col in df.columns
        if col not in ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]
    ]
    if len(cols_to_rank) == 1:
        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    else:
        if compute_average:
            df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
            df.sort_values("Average", ascending=False, inplace=True)
        else:
            df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))
    df = df.round(2)
    # Fill NaN after averaging
    df.fillna("", inplace=True)
    return df


def make_clickable_model(model_name, link=None):
    if link is None:
        link = "https://huggingface.co/" + model_name
    # Link text shows only the model name, without the user/organization prefix
    return (
        f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
    )
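# Illustrative example (hypothetical model id, not taken from the results file):
#   make_clickable_model("org/model-name")
#   -> '<a target="_blank" style="text-decoration: underline"
#       href="https://huggingface.co/org/model-name">model-name</a>'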
# =============================================================================
with open('all_results.json', 'r') as f:
    ALL_RESULTS = json.load(f)

MODEL_LIST = list(ALL_RESULTS.keys())
NUM_MODELS = len(set(MODEL_LIST))
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
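# The lookups below assume `all_results.json` is laid out roughly as sketched
# here. This is inferred from the keys accessed in this file, not an
# authoritative schema:
#
#   {
#     "<model id>": {
#       "model_size": "...",
#       "model_link": "https://huggingface.co/...",
#       "zero_shot": {
#         "<task key, e.g. 'cross_xquad', 'sst2'>": {
#           "<run id>": {
#             # task-dependent metrics, for example:
#             "accuracy": ...,                               # classification tasks
#             "bleu_score": ...,                             # translation tasks
#             "rouge1": ..., "rouge2": ..., "rougeL": ...,   # summarization tasks
#             "overall_acc": ..., "consistency_score_3": ..., "AC3_3": ...,
#             "language_acc": {"English": ..., ...}          # cross_* tasks
#           }
#         }
#       },
#       "five_shot": {...}
#     }
#   }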
# =============================================================================
# Every per-task function below builds its leaderboard the same way, so the
# shared logic lives in these helpers and each task function is a thin wrapper.
def _build_leaderboard(task, metric_fn, eval_mode='zero_shot', fillna=True, rank=True,
                       compute_average=True, label=None):
    # One row per model: model size, clickable model link, and the task metrics
    # (medians across runs) returned by `metric_fn`.
    label = label or task
    df_list = []
    for model in MODEL_LIST:
        try:
            task_results = ALL_RESULTS[model][eval_mode][task]
            results_list = [task_results[run] for run in task_results]
            row = {
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            }
            row.update(metric_fn(results_list))
            df_list.append(row)
        except Exception:
            print('Not found in model: {} for {}'.format(model, label))
    df = pd.DataFrame(df_list)
    # If the same model appears more than once, merge the rows and keep the
    # first non-NaN value in each column
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]
    if rank:
        df = add_rank(df, compute_average=compute_average)
    if fillna:
        df.fillna("", inplace=True)
    return df


def _accuracy_metrics(results_list):
    return {"Accuracy": median([results['accuracy'] for results in results_list])}


def _bleu_metrics(results_list):
    return {"BLEU": median([results['bleu_score'] for results in results_list])}


def _rouge_metrics(results_list):
    return {
        "ROUGE-1": median([results['rouge1'] for results in results_list]),
        "ROUGE-2": median([results['rouge2'] for results in results_list]),
        "ROUGE-L": median([results['rougeL'] for results in results_list]),
    }


def _consistency_metrics(results_list):
    return {
        "Accuracy": median([results['overall_acc'] for results in results_list]),
        "Cross-Lingual Consistency": median([results['consistency_score_3'] for results in results_list]),
        "AC3": median([results['AC3_3'] for results in results_list]),
    }


def _language_metrics(languages):
    # Per-language accuracy columns (median across runs), in the given order.
    def metric_fn(results_list):
        return {
            lang: median([results['language_acc'][lang] for results in results_list])
            for lang in languages
        }
    return metric_fn


# =============================================================================
def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('cross_xquad', _consistency_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank, compute_average=False,
                              label='cross_xquad_overall')


CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")
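# For reference: with compute_average=False the overall cross-lingual tables end
# up with the columns Rank, Model, Model Size (Params), Accuracy,
# Cross-Lingual Consistency, AC3, ranked by Accuracy (no "Average" column).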
def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('cross_xquad',
                              _language_metrics(["English", "Vietnamese", "Chinese", "Spanish"]),
                              eval_mode=eval_mode, fillna=fillna, rank=rank,
                              compute_average=False, label='cross_xquad_lang')


CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")


# =============================================================================
def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('cross_mmlu', _consistency_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank, compute_average=False,
                              label='cross_mmlu_overall')


CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")


def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('cross_mmlu',
                              _language_metrics(["English", "Vietnamese", "Chinese", "Indonesian",
                                                 "Filipino", "Spanish", "Malay"]),
                              eval_mode=eval_mode, fillna=fillna, rank=rank,
                              compute_average=False, label='cross_mmlu_lang')


CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")


# =============================================================================
def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('cross_logiqa', _consistency_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank, compute_average=False,
                              label='cross_logiqa_overall')


CROSS_LOGIQA_ZERO_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="five_shot")


def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('cross_logiqa',
                              _language_metrics(["English", "Vietnamese", "Chinese", "Indonesian",
                                                 "Filipino", "Spanish", "Malay"]),
                              eval_mode=eval_mode, fillna=fillna, rank=rank,
                              compute_average=False, label='cross_logiqa_language')


CROSS_LOGIQA_ZERO_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="five_shot")


# =============================================================================
def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('sg_eval', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


SG_EVAL_ZERO_SHOT = get_data_sg_eval(eval_mode="zero_shot")
SG_EVAL_FIVE_SHOT = get_data_sg_eval(eval_mode="five_shot")


# =============================================================================
def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('us_eval', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


US_EVAL_ZERO_SHOT = get_data_us_eval(eval_mode="zero_shot")
US_EVAL_FIVE_SHOT = get_data_us_eval(eval_mode="five_shot")


# =============================================================================
def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('cn_eval', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")


# =============================================================================
def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('ph_eval', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot")
PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot")


# =============================================================================
def get_data_sing2eng(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('sing2eng', _bleu_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


SING2ENG_ZERO_SHOT = get_data_sing2eng(eval_mode="zero_shot")
SING2ENG_FIVE_SHOT = get_data_sing2eng(eval_mode="five_shot")


# =============================================================================
def get_data_flores_ind2eng(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('flores_ind2eng', _bleu_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


FLORES_IND2ENG_ZERO_SHOT = get_data_flores_ind2eng(eval_mode="zero_shot")
FLORES_IND2ENG_FIVE_SHOT = get_data_flores_ind2eng(eval_mode="five_shot")


# =============================================================================
def get_data_flores_vie2eng(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('flores_vie2eng', _bleu_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


FLORES_VIE2ENG_ZERO_SHOT = get_data_flores_vie2eng(eval_mode="zero_shot")
FLORES_VIE2ENG_FIVE_SHOT = get_data_flores_vie2eng(eval_mode="five_shot")


# =============================================================================
def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('flores_zho2eng', _bleu_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


FLORES_ZHO2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
FLORES_ZHO2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")


# =============================================================================
def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('flores_zsm2eng', _bleu_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)

FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zsm2eng(eval_mode="zero_shot")
FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zsm2eng(eval_mode="five_shot")
# =============================================================================
def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('mmlu', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


MMLU_ZERO_SHOT = get_data_mmlu(eval_mode="zero_shot")
MMLU_FIVE_SHOT = get_data_mmlu(eval_mode="five_shot")


# =============================================================================
def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('mmlu_full', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


MMLU_FULL_ZERO_SHOT = get_data_mmlu_full(eval_mode="zero_shot")
MMLU_FULL_FIVE_SHOT = get_data_mmlu_full(eval_mode="five_shot")


# =============================================================================
def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('c_eval', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


C_EVAL_ZERO_SHOT = get_data_c_eval(eval_mode="zero_shot")
C_EVAL_FIVE_SHOT = get_data_c_eval(eval_mode="five_shot")


# =============================================================================
def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('c_eval_full', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


C_EVAL_FULL_ZERO_SHOT = get_data_c_eval_full(eval_mode="zero_shot")
C_EVAL_FULL_FIVE_SHOT = get_data_c_eval_full(eval_mode="five_shot")


# =============================================================================
def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('cmmlu', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


CMMLU_ZERO_SHOT = get_data_cmmlu(eval_mode="zero_shot")
CMMLU_FIVE_SHOT = get_data_cmmlu(eval_mode="five_shot")


# =============================================================================
def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('cmmlu_full', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


CMMLU_FULL_ZERO_SHOT = get_data_cmmlu_full(eval_mode="zero_shot")
CMMLU_FULL_FIVE_SHOT = get_data_cmmlu_full(eval_mode="five_shot")


# =============================================================================
def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('zbench', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


ZBENCH_ZERO_SHOT = get_data_zbench(eval_mode="zero_shot")
ZBENCH_FIVE_SHOT = get_data_zbench(eval_mode="five_shot")


# =============================================================================
def get_data_indommlu(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('indommlu', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


INDOMMLU_ZERO_SHOT = get_data_indommlu(eval_mode="zero_shot")
INDOMMLU_FIVE_SHOT = get_data_indommlu(eval_mode="five_shot")


# =============================================================================
def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('ind_emotion', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


IND_EMOTION_ZERO_SHOT = get_data_ind_emotion(eval_mode="zero_shot")
IND_EMOTION_FIVE_SHOT = get_data_ind_emotion(eval_mode="five_shot")


# =============================================================================
def get_data_ocnli(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('ocnli', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


OCNLI_ZERO_SHOT = get_data_ocnli(eval_mode="zero_shot")
OCNLI_FIVE_SHOT = get_data_ocnli(eval_mode="five_shot")


# =============================================================================
def get_data_c3(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('c3', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


C3_ZERO_SHOT = get_data_c3(eval_mode="zero_shot")
C3_FIVE_SHOT = get_data_c3(eval_mode="five_shot")


# =============================================================================
def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('dream', _accuracy_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


DREAM_ZERO_SHOT = get_data_dream(eval_mode="zero_shot")
DREAM_FIVE_SHOT = get_data_dream(eval_mode="five_shot")


# =============================================================================
def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('samsum', _rouge_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


SAMSUM_ZERO_SHOT = get_data_samsum(eval_mode="zero_shot")
SAMSUM_FIVE_SHOT = get_data_samsum(eval_mode="five_shot")
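# Note: with three ROUGE columns and compute_average=True, add_rank inserts an
# "Average" column, so the summarization tables are ranked by the average ROUGE.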
# =============================================================================
def get_data_dialogsum(eval_mode='zero_shot', fillna=True, rank=True):
    return _build_leaderboard('dialogsum', _rouge_metrics, eval_mode=eval_mode,
                              fillna=fillna, rank=rank)


DIALOGSUM_ZERO_SHOT = get_data_dialogsum(eval_mode="zero_shot")
DIALOGSUM_FIVE_SHOT = get_data_dialogsum(eval_mode="five_shot")


# =============================================================================
| def get_data_sst2(eval_mode='zero_shot', fillna=True, rank=True): | |
| df_list = [] | |
| for model in MODEL_LIST: | |
| try: | |
| results_list = [ALL_RESULTS[model][eval_mode]['sst2'][res] for res in ALL_RESULTS[model][eval_mode]['sst2']] | |
| accuracy = median([results['accuracy'] for results in results_list]) | |
| res = { | |
| "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), | |
| "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), | |
| "Accuracy": accuracy, | |
| } | |
| df_list.append(res) | |
| except: | |
| print('Not found in model: {} for {}'.format(model, "sst2")) | |
| df = pd.DataFrame(df_list) | |
| # If there are any models that are the same, merge them | |
| # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one | |
| df = df.groupby("Model", as_index=False).first() | |
| # Put 'Model' column first | |
| #cols = sorted(list(df.columns)) | |
| cols = list(df.columns) | |
| cols.insert(0, cols.pop(cols.index("Model"))) | |
| df = df[cols] | |
| if rank: | |
| df = add_rank(df, compute_average=True) | |
| if fillna: | |
| df.fillna("", inplace=True) | |
| return df | |
| SST2_ZERO_SHOT = get_data_sst2(eval_mode="zero_shot") | |
| SST2_FIVE_SHOT = get_data_sst2(eval_mode="five_shot") | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| def get_data_cola(eval_mode='zero_shot', fillna=True, rank=True): | |
| df_list = [] | |
| for model in MODEL_LIST: | |
| try: | |
| results_list = [ALL_RESULTS[model][eval_mode]['cola'][res] for res in ALL_RESULTS[model][eval_mode]['cola']] | |
| accuracy = median([results['accuracy'] for results in results_list]) | |
| res = { | |
| "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), | |
| "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), | |
| "Accuracy": accuracy, | |
| } | |
| df_list.append(res) | |
| except: | |
| print('Not found in model: {} for {}'.format(model, "cola")) | |
| df = pd.DataFrame(df_list) | |
| # If there are any models that are the same, merge them | |
| # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one | |
| df = df.groupby("Model", as_index=False).first() | |
| # Put 'Model' column first | |
| #cols = sorted(list(df.columns)) | |
| cols = list(df.columns) | |
| cols.insert(0, cols.pop(cols.index("Model"))) | |
| df = df[cols] | |
| if rank: | |
| df = add_rank(df, compute_average=True) | |
| if fillna: | |
| df.fillna("", inplace=True) | |
| return df | |
| COLA_ZERO_SHOT = get_data_cola(eval_mode="zero_shot") | |
| COLA_FIVE_SHOT = get_data_cola(eval_mode="five_shot") | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| def get_data_qqp(eval_mode='zero_shot', fillna=True, rank=True): | |
| df_list = [] | |
| for model in MODEL_LIST: | |
| try: | |
| results_list = [ALL_RESULTS[model][eval_mode]['qqp'][res] for res in ALL_RESULTS[model][eval_mode]['qqp']] | |
| accuracy = median([results['accuracy'] for results in results_list]) | |
| res = { | |
| "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), | |
| "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), | |
| "Accuracy": accuracy, | |
| } | |
| df_list.append(res) | |
| except: | |
| print('Not found in model: {} for {}'.format(model, "qqp")) | |
| df = pd.DataFrame(df_list) | |
| # If there are any models that are the same, merge them | |
| # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one | |
| df = df.groupby("Model", as_index=False).first() | |
| # Put 'Model' column first | |
| #cols = sorted(list(df.columns)) | |
| cols = list(df.columns) | |
| cols.insert(0, cols.pop(cols.index("Model"))) | |
| df = df[cols] | |
| if rank: | |
| df = add_rank(df, compute_average=True) | |
| if fillna: | |
| df.fillna("", inplace=True) | |
| return df | |
| QQP_ZERO_SHOT = get_data_qqp(eval_mode="zero_shot") | |
| QQP_FIVE_SHOT = get_data_qqp(eval_mode="five_shot") | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| def get_data_mnli(eval_mode='zero_shot', fillna=True, rank=True): | |
| df_list = [] | |
| for model in MODEL_LIST: | |
| try: | |
| results_list = [ALL_RESULTS[model][eval_mode]['mnli'][res] for res in ALL_RESULTS[model][eval_mode]['mnli']] | |
| accuracy = median([results['accuracy'] for results in results_list]) | |
| res = { | |
| "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), | |
| "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), | |
| "Accuracy": accuracy, | |
| } | |
| df_list.append(res) | |
| except Exception: | |
| print('Results not found for model {} on task {}'.format(model, "mnli")) | |
| df = pd.DataFrame(df_list) | |
| # If any model appears more than once, merge the duplicate rows: | |
| # i.e. if df["Model"] has the same value in two rows, keep the first non-NaN value in each column | |
| df = df.groupby("Model", as_index=False).first() | |
| # Put 'Model' column first | |
| #cols = sorted(list(df.columns)) | |
| cols = list(df.columns) | |
| cols.insert(0, cols.pop(cols.index("Model"))) | |
| df = df[cols] | |
| if rank: | |
| df = add_rank(df, compute_average=True) | |
| if fillna: | |
| df.fillna("", inplace=True) | |
| return df | |
| MNLI_ZERO_SHOT = get_data_mnli(eval_mode="zero_shot") | |
| MNLI_FIVE_SHOT = get_data_mnli(eval_mode="five_shot") | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| def get_data_qnli(eval_mode='zero_shot', fillna=True, rank=True): | |
| df_list = [] | |
| for model in MODEL_LIST: | |
| try: | |
| results_list = [ALL_RESULTS[model][eval_mode]['qnli'][res] for res in ALL_RESULTS[model][eval_mode]['qnli']] | |
| accuracy = median([results['accuracy'] for results in results_list]) | |
| res = { | |
| "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), | |
| "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), | |
| "Accuracy": accuracy, | |
| } | |
| df_list.append(res) | |
| except Exception: | |
| print('Results not found for model {} on task {}'.format(model, "qnli")) | |
| df = pd.DataFrame(df_list) | |
| # If any model appears more than once, merge the duplicate rows: | |
| # i.e. if df["Model"] has the same value in two rows, keep the first non-NaN value in each column | |
| df = df.groupby("Model", as_index=False).first() | |
| # Put 'Model' column first | |
| #cols = sorted(list(df.columns)) | |
| cols = list(df.columns) | |
| cols.insert(0, cols.pop(cols.index("Model"))) | |
| df = df[cols] | |
| if rank: | |
| df = add_rank(df, compute_average=True) | |
| if fillna: | |
| df.fillna("", inplace=True) | |
| return df | |
| QNLI_ZERO_SHOT = get_data_qnli(eval_mode="zero_shot") | |
| QNLI_FIVE_SHOT = get_data_qnli(eval_mode="five_shot") | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True): | |
| df_list = [] | |
| for model in MODEL_LIST: | |
| try: | |
| results_list = [ALL_RESULTS[model][eval_mode]['wnli'][res] for res in ALL_RESULTS[model][eval_mode]['wnli']] | |
| accuracy = median([results['accuracy'] for results in results_list]) | |
| res = { | |
| "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), | |
| "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), | |
| "Accuracy": accuracy, | |
| } | |
| df_list.append(res) | |
| except Exception: | |
| print('Results not found for model {} on task {}'.format(model, "wnli")) | |
| df = pd.DataFrame(df_list) | |
| # If any model appears more than once, merge the duplicate rows: | |
| # i.e. if df["Model"] has the same value in two rows, keep the first non-NaN value in each column | |
| df = df.groupby("Model", as_index=False).first() | |
| # Put 'Model' column first | |
| #cols = sorted(list(df.columns)) | |
| cols = list(df.columns) | |
| cols.insert(0, cols.pop(cols.index("Model"))) | |
| df = df[cols] | |
| if rank: | |
| df = add_rank(df, compute_average=True) | |
| if fillna: | |
| df.fillna("", inplace=True) | |
| return df | |
| WNLI_ZERO_SHOT = get_data_wnli(eval_mode="zero_shot") | |
| WNLI_FIVE_SHOT = get_data_wnli(eval_mode="five_shot") | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| def get_data_rte(eval_mode='zero_shot', fillna=True, rank=True): | |
| df_list = [] | |
| for model in MODEL_LIST: | |
| try: | |
| results_list = [ALL_RESULTS[model][eval_mode]['rte'][res] for res in ALL_RESULTS[model][eval_mode]['rte']] | |
| accuracy = median([results['accuracy'] for results in results_list]) | |
| res = { | |
| "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), | |
| "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), | |
| "Accuracy": accuracy, | |
| } | |
| df_list.append(res) | |
| except Exception: | |
| print('Results not found for model {} on task {}'.format(model, "rte")) | |
| df = pd.DataFrame(df_list) | |
| # If any model appears more than once, merge the duplicate rows: | |
| # i.e. if df["Model"] has the same value in two rows, keep the first non-NaN value in each column | |
| df = df.groupby("Model", as_index=False).first() | |
| # Put 'Model' column first | |
| #cols = sorted(list(df.columns)) | |
| cols = list(df.columns) | |
| cols.insert(0, cols.pop(cols.index("Model"))) | |
| df = df[cols] | |
| if rank: | |
| df = add_rank(df, compute_average=True) | |
| if fillna: | |
| df.fillna("", inplace=True) | |
| return df | |
| RTE_ZERO_SHOT = get_data_rte(eval_mode="zero_shot") | |
| RTE_FIVE_SHOT = get_data_rte(eval_mode="five_shot") | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True): | |
| df_list = [] | |
| for model in MODEL_LIST: | |
| try: | |
| results_list = [ALL_RESULTS[model][eval_mode]['mrpc'][res] for res in ALL_RESULTS[model][eval_mode]['mrpc']] | |
| accuracy = median([results['accuracy'] for results in results_list]) | |
| res = { | |
| "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), | |
| "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), | |
| "Accuracy": accuracy, | |
| } | |
| df_list.append(res) | |
| except Exception: | |
| print('Results not found for model {} on task {}'.format(model, "mrpc")) | |
| df = pd.DataFrame(df_list) | |
| # If any model appears more than once, merge the duplicate rows: | |
| # i.e. if df["Model"] has the same value in two rows, keep the first non-NaN value in each column | |
| df = df.groupby("Model", as_index=False).first() | |
| # Put 'Model' column first | |
| #cols = sorted(list(df.columns)) | |
| cols = list(df.columns) | |
| cols.insert(0, cols.pop(cols.index("Model"))) | |
| df = df[cols] | |
| if rank: | |
| df = add_rank(df, compute_average=True) | |
| if fillna: | |
| df.fillna("", inplace=True) | |
| return df | |
| MRPC_ZERO_SHOT = get_data_mrpc(eval_mode="zero_shot") | |
| MRPC_FIVE_SHOT = get_data_mrpc(eval_mode="five_shot") | |
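| # Sketch (not used below): the seven accuracy-only loaders above are structurally identical, so a single | |
| # generic helper along the following lines could replace them. This is only a sketch under the same | |
| # assumptions as the functions above (ALL_RESULTS layout, MODEL_TO_SIZE, make_clickable_model, add_rank); | |
| # the per-task functions remain the ones actually wired into the app. | |
| def get_data_single_accuracy(task_key, eval_mode='zero_shot', fillna=True, rank=True): | |
| df_list = [] | |
| for model in MODEL_LIST: | |
| try: | |
| runs = ALL_RESULTS[model][eval_mode][task_key].values()  # one entry per prompt | |
| accuracy = median([run['accuracy'] for run in runs])  # median over prompts | |
| df_list.append({ | |
| "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), | |
| "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), | |
| "Accuracy": accuracy, | |
| }) | |
| except Exception: | |
| print('Results not found for model {} on task {}'.format(model, task_key)) | |
| df = pd.DataFrame(df_list).groupby("Model", as_index=False).first() | |
| cols = list(df.columns) | |
| cols.insert(0, cols.pop(cols.index("Model"))) | |
| df = df[cols] | |
| if rank: | |
| df = add_rank(df, compute_average=True) | |
| if fillna: | |
| df.fillna("", inplace=True) | |
| return df | |
| # e.g. COLA_ZERO_SHOT could then be built as get_data_single_accuracy("cola", eval_mode="zero_shot") | |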
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
| # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
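| # The Gradio interface below nests tabs as: task category -> dataset -> shot mode (Zero Shot / Five Shot) | |
| # -> view ("Overall", plus "Language Performance" for the cross-lingual datasets); each leaf renders one | |
| # of the precomputed dataframes above with gr.components.Dataframe. | |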
| theme = gr.themes.Soft().set( | |
| background_fill_primary='*secondary_50' | |
| ) | |
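| # Note: this Soft theme object is never passed to gr.Blocks, so the Hub theme string below is what gets applied. | |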
| block = gr.Blocks(theme='rottenlittlecreature/Moon_Goblin') | |
| with block: | |
| gr.Markdown(f""" | |
| ### SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>. Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models. | |
| - **Number of Datasets**: > 30 | |
| - **Number of Languages**: > 8 | |
| - **Number of Models**: {NUM_MODELS} | |
| - **Mode of Evaluation**: Zero-Shot, Five-Shot | |
| ### The following tables show the performance of the models on the SeaEval benchmark. | |
| - **Zero-Shot** scores are reported as the median over 5 distinct prompts to mitigate random variation induced by prompt wording. | |
| - Where possible, base models are evaluated with five-shot prompting and instruction-tuned models with zero-shot prompting. | |
| - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | |
| """) | |
| with gr.Tabs(): | |
| with gr.TabItem("Cross-Lingual Consistency"): | |
| # dataset 1: cross-mmlu | |
| with gr.TabItem("Cross-MMLU"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
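| # datatype marks the Rank column as a number, the Model column as markdown (so the HTML link renders as clickable), | |
| # and the remaining columns as numbers; the same pattern is repeated for every table below. | |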
| cross_mmlu_zero_shot_overall = gr.components.Dataframe( | |
| CROSS_MMLU_ZERO_SHOT_OVERALL, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Language Performance"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_MMLU_ZERO_SHOT_LANGUAGE, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_MMLU_FIVE_SHOT_OVERALL, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Language Performance"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_MMLU_FIVE_SHOT_LANGUAGE, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **Cross-MMLU Leaderboard** 🔮 | |
| - **Metric:** Cross-Lingual Consistency, Accuracy, AC3 | |
| - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino | |
| """) | |
| with gr.TabItem("Cross-XQUAD"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| cross_xquad_zero_shot_overall = gr.components.Dataframe( | |
| CROSS_XQUAD_ZERO_SHOT_OVERALL, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_OVERALL.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Language Performance"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_XQUAD_ZERO_SHOT_LANGUAGE, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_LANGUAGE.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_XQUAD_FIVE_SHOT_OVERALL, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_OVERALL.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Language Performance"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_XQUAD_FIVE_SHOT_LANGUAGE, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_LANGUAGE.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **Cross-XQUAD Leaderboard** 🔮 | |
| - **Metric:** Cross-Lingual Consistency, Accuracy, AC3 | |
| - **Languages:** English, Chinese, Spanish, Vietnamese | |
| """) | |
| # dataset 2: cross-logiqa | |
| with gr.TabItem("Cross-LogiQA"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_LOGIQA_ZERO_SHOT_OVERALL, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Language Performance"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_LOGIQA_ZERO_SHOT_LANGUAGE, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_LOGIQA_FIVE_SHOT_OVERALL, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Language Performance"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CROSS_LOGIQA_FIVE_SHOT_LANGUAGE, | |
| datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **Cross-LogiQA Leaderboard** 🔮 | |
| - **Metric:** Cross-Lingual Consistency, Accuracy, AC3 | |
| - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino | |
| """) | |
| with gr.TabItem("Cultural Reasoning"): | |
| # dataset 3: SG_EVAL | |
| with gr.TabItem("SG_EVAL"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| SG_EVAL_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| SG_EVAL_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **SG_EVAL Leaderboard** 🔮 | |
| - **Metric:** Accuracy | |
| - **Languages:** English | |
| """) | |
| # dataset 4: | |
| with gr.TabItem("US_EVAL"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| US_EVAL_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| US_EVAL_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **US_EVAL Leaderboard** 🔮 | |
| - **Metric:** Accuracy | |
| - **Languages:** English | |
| """) | |
| # dataset 5: | |
| with gr.TabItem("CN_EVAL"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CN_EVAL_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CN_EVAL_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **CN_EVAL Leaderboard** 🔮 | |
| - **Metric:** Accuracy | |
| - **Languages:** Chinese | |
| """) | |
| # dataset 6: | |
| with gr.TabItem("PH_EVAL"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| PH_EVAL_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| PH_EVAL_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **PH_EVAL Leaderboard** 🔮 | |
| - **Metric:** Accuracy | |
| - **Languages:** English | |
| """) | |
| # dataset 7: | |
| with gr.TabItem("Singlish to English Translation"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| SING2ENG_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(SING2ENG_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| SING2ENG_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(SING2ENG_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **Singlish-to-English Translation Leaderboard** 🔮 | |
| - **Metric:** BLEU Avg. | |
| - **Languages:** Singlish → English | |
| """) | |
| with gr.TabItem("General Reasoning"): | |
| # dataset 12: | |
| with gr.TabItem("MMLU Subset"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| MMLU_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(MMLU_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| MMLU_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(MMLU_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **MMLU Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| # dataset 13: | |
| with gr.TabItem("MMLU Full"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| MMLU_FULL_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| MMLU_FULL_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **MMLU Full Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| # dataset 14: | |
| with gr.TabItem("C_EVAL Subset"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| C_EVAL_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(C_EVAL_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| C_EVAL_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **C_EVAL Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** Chinese | |
| """) | |
| # dataset 15: | |
| with gr.TabItem("C_EVAL Full"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| C_EVAL_FULL_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| C_EVAL_FULL_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **C_EVAL Full Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** Chinese | |
| """) | |
| # dataset 16: | |
| with gr.TabItem("CMMLU Subset"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CMMLU_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(CMMLU_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CMMLU_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(CMMLU_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **CMMLU Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** Chinese | |
| """) | |
| # dataset 17: | |
| with gr.TabItem("CMMLU Full"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CMMLU_FULL_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| CMMLU_FULL_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **CMMLU Full Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** Chinese | |
| """) | |
| # dataset 18: | |
| with gr.TabItem("ZBench"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| ZBENCH_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(ZBENCH_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| ZBENCH_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(ZBENCH_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **ZBench Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** Chinese | |
| """) | |
| # dataset 18: | |
| with gr.TabItem("IndoMMLU"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| INDOMMLU_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(INDOMMLU_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| INDOMMLU_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(INDOMMLU_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **IndoMMLU Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** Indonesian (Bahasa Indonesia) | |
| """) | |
| with gr.TabItem("FLORES-Translation"): | |
| # dataset 8: | |
| with gr.TabItem("FLORES Indonesian to English Translation"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| FLORES_IND2ENG_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| FLORES_IND2ENG_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **FLORES Indonesian-to-English Leaderboard** 🔮 | |
| - **Metric:** BLEU Avg. | |
| - **Languages:** Indonesian → English | |
| """) | |
| # dataset 9: | |
| with gr.TabItem("FLORES Vitenamese to English Translation"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| FLORES_VIE2ENG_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| FLORES_VIE2ENG_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **FLORES Vietnamese-to-English Leaderboard** 🔮 | |
| - **Metric:** BLEU Avg. | |
| - **Languages:** Vietnamese → English | |
| """) | |
| # dataset 10: | |
| with gr.TabItem("FLORES Chinese to English Translation"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| FLORES_ZHO2ENG_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| FLORES_ZHO2ENG_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **FLORES Chinese-to-English Leaderboard** 🔮 | |
| - **Metric:** BLEU Avg. | |
| - **Languages:** Chinese → English | |
| """) | |
| # dataset 11: | |
| with gr.TabItem("FLORES Malay to English Translation"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| FLORES_ZSM2ENG_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| FLORES_ZSM2ENG_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **FLORES Malay-to-English Leaderboard** 🔮 | |
| - **Metric:** BLEU Avg. | |
| - **Languages:** Malay → English | |
| """) | |
| with gr.TabItem("Emotion"): | |
| # dataset 18: | |
| with gr.TabItem("Indonesian Emotion Classification"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| IND_EMOTION_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| IND_EMOTION_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **Indonesian Emotion Classification Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** Indonesian | |
| """) | |
| # dataset | |
| with gr.TabItem("SST2"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| SST2_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(SST2_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| SST2_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(SST2_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **SST2 Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| with gr.TabItem("Dialogue"): | |
| # dataset | |
| with gr.TabItem("DREAM"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| DREAM_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(DREAM_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| DREAM_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(DREAM_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **DREAM Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| # dataset | |
| with gr.TabItem("SAMSum"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| SAMSUM_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(SAMSUM_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| SAMSUM_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(SAMSUM_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **SAMSum Leaderboard** 🔮 | |
| - **Metric:** ROUGE. | |
| - **Languages:** English | |
| """) | |
| # dataset | |
| with gr.TabItem("DialogSum"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| DIALOGSUM_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| DIALOGSUM_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **DialogSum Leaderboard** 🔮 | |
| - **Metric:** ROUGE. | |
| - **Languages:** English | |
| """) | |
| with gr.TabItem("Fundamental NLP Tasks"): | |
| # dataset | |
| with gr.TabItem("OCNLI"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| OCNLI_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(OCNLI_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| OCNLI_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(OCNLI_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **OCNLI Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** Chinese | |
| """) | |
| # dataset | |
| with gr.TabItem("C3"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| C3_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(C3_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| C3_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(C3_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **C3 Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** Chinese | |
| """) | |
| # dataset | |
| with gr.TabItem("COLA"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| COLA_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(COLA_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| COLA_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(COLA_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **COLA Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| # dataset | |
| with gr.TabItem("QQP"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| QQP_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(QQP_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| QQP_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(QQP_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **QQP Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| # dataset | |
| with gr.TabItem("MNLI"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| MNLI_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(MNLI_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| MNLI_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(MNLI_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **MNLI Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| # dataset | |
| with gr.TabItem("QNLI"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| QNLI_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(QNLI_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| QNLI_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(QNLI_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **QNLI Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| # dataset | |
| with gr.TabItem("WNLI"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| WNLI_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(WNLI_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| WNLI_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(WNLI_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **WNLI Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| # dataset | |
| with gr.TabItem("RTE"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| RTE_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(RTE_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| RTE_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(RTE_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **RTE Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| # dataset | |
| with gr.TabItem("MRPC"): | |
| with gr.TabItem("Zero Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| MRPC_ZERO_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(MRPC_ZERO_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.TabItem("Five Shot"): | |
| with gr.TabItem("Overall"): | |
| with gr.Row(): | |
| gr.components.Dataframe( | |
| MRPC_FIVE_SHOT, | |
| datatype=["number", "markdown"] + ["number"] * len(MRPC_FIVE_SHOT.columns), | |
| type="pandas", | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(""" | |
| **MRPC Leaderboard** 🔮 | |
| - **Metric:** Accuracy. | |
| - **Languages:** English | |
| """) | |
| gr.Markdown(r""" | |
| ### If our datasets and leaderboard are useful, please consider citing: | |
| ```bibtex | |
| @article{SeaEval, | |
| title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning}, | |
| author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.}, | |
| journal={NAACL}, | |
| year={2024}} | |
| ``` | |
| """) | |
| block.queue(max_size=10) | |
| # block.launch(server_name="0.0.0.0", share=False) | |
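| # share=True requests a public Gradio share link; on Hugging Face Spaces the app is already served publicly, so the flag is typically redundant there. | |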
| block.launch(server_name="0.0.0.0", share=True) | |