Spaces:

MERaLiON
/

SeaEval_Leaderboard

Running

App Files Files Community

SeaEval_Leaderboard / app.py

binwang

new

ecb9582 over 1 year ago

raw

history blame

126 kB


	import json

	import gradio as gr
	import pandas as pd

	from statistics import median


	print("Loading datasets...")

	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

	def add_rank(df, compute_average=True):
	cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]]
	if len(cols_to_rank) == 1:
	df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
	else:
	if compute_average:
	df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
	df.sort_values("Average", ascending=False, inplace=True)
	else:
	df.sort_values(cols_to_rank[0], ascending=False, inplace=True)

	df.insert(0, "Rank", list(range(1, len(df) + 1)))
	df = df.round(2)
	# Fill NaN after averaging
	df.fillna("", inplace=True)
	return df

	def make_clickable_model(model_name, link=None):
	if link is None:
	link = "https://huggingface.co/" + model_name
	# Remove user from model name
	return (
	f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
	)


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	with open('all_results.json', 'r') as f:
	ALL_RESULTS = json.load(f)

	MODEL_LIST = list(ALL_RESULTS.keys())
	NUM_MODELS = len(set(MODEL_LIST))
	MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

	def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]

	overall_acc = [results['overall_acc'] for results in results_list]
	overall_acc = median(overall_acc)

	consistency_score_3 = [results['consistency_score_3'] for results in results_list]
	consistency_score_3 = median(consistency_score_3)

	AC3_3 = [results['AC3_3'] for results in results_list]
	AC3_3 = median(AC3_3)

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": overall_acc,
	"Cross-Lingual Consistency": consistency_score_3,
	"AC3": AC3_3,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cross_xquad_overall"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=False)

	if fillna:
	df.fillna("", inplace=True)

	return df

	CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
	CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")


	def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]

	English = [results['language_acc']['English'] for results in results_list]
	Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
	Chinese = [results['language_acc']['Chinese'] for results in results_list]
	Spanish = [results['language_acc']['Spanish'] for results in results_list]

	English = median(English)
	Vietnamese = median(Vietnamese)
	Chinese = median(Chinese)
	Spanish = median(Spanish)

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"English": English,
	"Vietnamese": Vietnamese,
	"Chinese": Chinese,
	"Spanish": Spanish,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cross_xquad_lang"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=False)

	if fillna:
	df.fillna("", inplace=True)

	return df

	CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
	CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")

	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =






	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

	def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:

	results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]

	overall_acc = [results['overall_acc'] for results in results_list]
	overall_acc = median(overall_acc)

	consistency_score_3 = [results['consistency_score_3'] for results in results_list]
	consistency_score_3 = median(consistency_score_3)

	AC3_3 = [results['AC3_3'] for results in results_list]
	AC3_3 = median(AC3_3)

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": overall_acc,
	"Cross-Lingual Consistency": consistency_score_3,
	"AC3": AC3_3,
	}
	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cross_mmlu_overall"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=False)

	if fillna:
	df.fillna("", inplace=True)

	return df

	CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
	CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")


	def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:

	results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]

	English = [results['language_acc']['English'] for results in results_list]
	Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
	Chinese = [results['language_acc']['Chinese'] for results in results_list]
	Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
	Filipino = [results['language_acc']['Filipino'] for results in results_list]
	Spanish = [results['language_acc']['Spanish'] for results in results_list]
	Malay = [results['language_acc']['Malay'] for results in results_list]

	English = median(English)
	Vietnamese = median(Vietnamese)
	Chinese = median(Chinese)
	Indonesian = median(Indonesian)
	Filipino = median(Filipino)
	Spanish = median(Spanish)
	Malay = median(Malay)

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"English": English,
	"Vietnamese": Vietnamese,
	"Chinese": Chinese,
	"Indonesian": Indonesian,
	"Filipino": Filipino,
	"Spanish": Spanish,
	"Malay": Malay,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cross_mmlu_lang"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=False)

	if fillna:
	df.fillna("", inplace=True)

	return df

	CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
	CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")

	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =




	def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:

	results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]

	overall_acc = [results['overall_acc'] for results in results_list]
	overall_acc = median(overall_acc)

	consistency_score_3 = [results['consistency_score_3'] for results in results_list]
	consistency_score_3 = median(consistency_score_3)

	AC3_3 = [results['AC3_3'] for results in results_list]
	AC3_3 = median(AC3_3)

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": overall_acc,
	"Cross-Lingual Consistency": consistency_score_3,
	"AC3": AC3_3,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cross_logiqa_overall"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=False)

	if fillna:
	df.fillna("", inplace=True)

	return df


	CROSS_LOGIQA_ZERO_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="zero_shot")
	CROSS_LOGIQA_FIVE_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="five_shot")


	def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:

	results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]

	English = [results['language_acc']['English'] for results in results_list]
	Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
	Chinese = [results['language_acc']['Chinese'] for results in results_list]
	Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
	Filipino = [results['language_acc']['Filipino'] for results in results_list]
	Spanish = [results['language_acc']['Spanish'] for results in results_list]
	Malay = [results['language_acc']['Malay'] for results in results_list]

	English = median(English)
	Vietnamese = median(Vietnamese)
	Chinese = median(Chinese)
	Indonesian = median(Indonesian)
	Filipino = median(Filipino)
	Spanish = median(Spanish)
	Malay = median(Malay)

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"English": English,
	"Vietnamese": Vietnamese,
	"Chinese": Chinese,
	"Indonesian": Indonesian,
	"Filipino": Filipino,
	"Spanish": Spanish,
	"Malay": Malay,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cross_logiqa_language"))



	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=False)

	if fillna:
	df.fillna("", inplace=True)

	return df


	CROSS_LOGIQA_ZERO_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="zero_shot")
	CROSS_LOGIQA_FIVE_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="five_shot")

	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:

	results_list = [ALL_RESULTS[model][eval_mode]['sg_eval'][res] for res in ALL_RESULTS[model][eval_mode]['sg_eval']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "sg_eval"))



	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	SG_EVAL_ZERO_SHOT = get_data_sg_eval(eval_mode="zero_shot")
	SG_EVAL_FIVE_SHOT = get_data_sg_eval(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['us_eval'][res] for res in ALL_RESULTS[model][eval_mode]['us_eval']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "us_eval"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	US_EVAL_ZERO_SHOT = get_data_us_eval(eval_mode="zero_shot")
	US_EVAL_FIVE_SHOT = get_data_us_eval(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['cn_eval'][res] for res in ALL_RESULTS[model][eval_mode]['cn_eval']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cn_eval"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df

	CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
	CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

	def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:




	try:
	results_list = [ALL_RESULTS[model][eval_mode]['ph_eval'][res] for res in ALL_RESULTS[model][eval_mode]['ph_eval']]
	accuracy = median([results['accuracy'] for results in results_list])
	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "ph_eval"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot")
	PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_sing2eng(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['sing2eng'][res] for res in ALL_RESULTS[model][eval_mode]['sing2eng']]
	bleu_score = median([results['bleu_score'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"BLEU": bleu_score,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "sing2eng"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	SING2ENG_ZERO_SHOT = get_data_sing2eng(eval_mode="zero_shot")
	SING2ENG_FIVE_SHOT = get_data_sing2eng(eval_mode="five_shot")

	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_flores_ind2eng(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['flores_ind2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_ind2eng']]
	bleu_score = median([results['bleu_score'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"BLEU": bleu_score,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "flores_ind2eng"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	FLORES_IND2ENG_ZERO_SHOT = get_data_flores_ind2eng(eval_mode="zero_shot")
	FLORES_IND2ENG_FIVE_SHOT = get_data_flores_ind2eng(eval_mode="five_shot")



	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_flores_vie2eng(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['flores_vie2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_vie2eng']]
	bleu_score = median([results['bleu_score'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"BLEU": bleu_score,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "flores_vie2eng"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	FLORES_VIE2ENG_ZERO_SHOT = get_data_flores_vie2eng(eval_mode="zero_shot")
	FLORES_VIE2ENG_FIVE_SHOT = get_data_flores_vie2eng(eval_mode="five_shot")

	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['flores_zho2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zho2eng']]
	bleu_score = median([results['bleu_score'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"BLEU": bleu_score,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "flores_zho2eng"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df

	FLORES_ZHO2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
	FLORES_ZHO2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['flores_zsm2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zsm2eng']]
	bleu_score = median([results['bleu_score'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"BLEU": bleu_score,
	}
	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "flores_zsm2eng"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df

	FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
	FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}
	df_list.append(res)

	except:
	accuracy = -1

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	MMLU_ZERO_SHOT = get_data_mmlu(eval_mode="zero_shot")
	MMLU_FIVE_SHOT = get_data_mmlu(eval_mode="five_shot")



	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['mmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu_full']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "mmlu_full"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df

	MMLU_FULL_ZERO_SHOT = get_data_mmlu_full(eval_mode="zero_shot")
	MMLU_FULL_FIVE_SHOT = get_data_mmlu_full(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:
	try:
	results_list = [ALL_RESULTS[model][eval_mode]['c_eval'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "c_eval"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df

	C_EVAL_ZERO_SHOT = get_data_c_eval(eval_mode="zero_shot")
	C_EVAL_FIVE_SHOT = get_data_c_eval(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['c_eval_full'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval_full']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "c_eval_full"))




	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	C_EVAL_FULL_ZERO_SHOT = get_data_c_eval_full(eval_mode="zero_shot")
	C_EVAL_FULL_FIVE_SHOT = get_data_c_eval_full(eval_mode="five_shot")




	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['cmmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cmmlu"))





	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	CMMLU_ZERO_SHOT = get_data_cmmlu(eval_mode="zero_shot")
	CMMLU_FIVE_SHOT = get_data_cmmlu(eval_mode="five_shot")



	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['cmmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu_full']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cmmlu_full"))





	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	CMMLU_FULL_ZERO_SHOT = get_data_cmmlu_full(eval_mode="zero_shot")
	CMMLU_FULL_FIVE_SHOT = get_data_cmmlu_full(eval_mode="five_shot")



	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:
	try:
	results_list = [ALL_RESULTS[model][eval_mode]['zbench'][res] for res in ALL_RESULTS[model][eval_mode]['zbench']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "zbench"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	ZBENCH_ZERO_SHOT = get_data_zbench(eval_mode="zero_shot")
	ZBENCH_FIVE_SHOT = get_data_zbench(eval_mode="five_shot")

	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_indommlu(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:


	try:
	results_list = [ALL_RESULTS[model][eval_mode]['indommlu'][res] for res in ALL_RESULTS[model][eval_mode]['indommlu']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "indommlu"))




	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	INDOMMLU_ZERO_SHOT = get_data_indommlu(eval_mode="zero_shot")
	INDOMMLU_FIVE_SHOT = get_data_indommlu(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:
	try:
	results_list = [ALL_RESULTS[model][eval_mode]['ind_emotion'][res] for res in ALL_RESULTS[model][eval_mode]['ind_emotion']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "ind_emotion"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df

	IND_EMOTION_ZERO_SHOT = get_data_ind_emotion(eval_mode="zero_shot")
	IND_EMOTION_FIVE_SHOT = get_data_ind_emotion(eval_mode="five_shot")




	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_ocnli(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['ocnli'][res] for res in ALL_RESULTS[model][eval_mode]['ocnli']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "ocnli"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	OCNLI_ZERO_SHOT = get_data_ocnli(eval_mode="zero_shot")
	OCNLI_FIVE_SHOT = get_data_ocnli(eval_mode="five_shot")



	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_c3(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['c3'][res] for res in ALL_RESULTS[model][eval_mode]['c3']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "c3"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	C3_ZERO_SHOT = get_data_c3(eval_mode="zero_shot")
	C3_FIVE_SHOT = get_data_c3(eval_mode="five_shot")



	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['dream'][res] for res in ALL_RESULTS[model][eval_mode]['dream']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "dream"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df

	DREAM_ZERO_SHOT = get_data_dream(eval_mode="zero_shot")
	DREAM_FIVE_SHOT = get_data_dream(eval_mode="five_shot")

	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['samsum'][res] for res in ALL_RESULTS[model][eval_mode]['samsum']]

	rouge1 = median([results['rouge1'] for results in results_list])
	rouge2 = median([results['rouge2'] for results in results_list])
	rougeL = median([results['rougeL'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"ROUGE-1": rouge1,
	"ROUGE-2": rouge2,
	"ROUGE-L": rougeL,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "samsum"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	SAMSUM_ZERO_SHOT = get_data_samsum(eval_mode="zero_shot")
	SAMSUM_FIVE_SHOT = get_data_samsum(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_dialogsum(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['dialogsum'][res] for res in ALL_RESULTS[model][eval_mode]['dialogsum']]

	rouge1 = median([results['rouge1'] for results in results_list])
	rouge2 = median([results['rouge2'] for results in results_list])
	rougeL = median([results['rougeL'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"ROUGE-1": rouge1,
	"ROUGE-2": rouge2,
	"ROUGE-L": rougeL,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "dialogsum"))





	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	DIALOGSUM_ZERO_SHOT = get_data_dialogsum(eval_mode="zero_shot")
	DIALOGSUM_FIVE_SHOT = get_data_dialogsum(eval_mode="five_shot")




	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_sst2(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['sst2'][res] for res in ALL_RESULTS[model][eval_mode]['sst2']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "sst2"))





	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	SST2_ZERO_SHOT = get_data_sst2(eval_mode="zero_shot")
	SST2_FIVE_SHOT = get_data_sst2(eval_mode="five_shot")




	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_cola(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['cola'][res] for res in ALL_RESULTS[model][eval_mode]['cola']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "cola"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	COLA_ZERO_SHOT = get_data_cola(eval_mode="zero_shot")
	COLA_FIVE_SHOT = get_data_cola(eval_mode="five_shot")





	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_qqp(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['qqp'][res] for res in ALL_RESULTS[model][eval_mode]['qqp']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "qqp"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	QQP_ZERO_SHOT = get_data_qqp(eval_mode="zero_shot")
	QQP_FIVE_SHOT = get_data_qqp(eval_mode="five_shot")





	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_mnli(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['mnli'][res] for res in ALL_RESULTS[model][eval_mode]['mnli']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "mnli"))


	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	MNLI_ZERO_SHOT = get_data_mnli(eval_mode="zero_shot")
	MNLI_FIVE_SHOT = get_data_mnli(eval_mode="five_shot")





	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_qnli(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['qnli'][res] for res in ALL_RESULTS[model][eval_mode]['qnli']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "qnli"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	QNLI_ZERO_SHOT = get_data_qnli(eval_mode="zero_shot")
	QNLI_FIVE_SHOT = get_data_qnli(eval_mode="five_shot")





	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['wnli'][res] for res in ALL_RESULTS[model][eval_mode]['wnli']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "wnli"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df

	WNLI_ZERO_SHOT = get_data_wnli(eval_mode="zero_shot")
	WNLI_FIVE_SHOT = get_data_wnli(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	def get_data_rte(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:
	try:
	results_list = [ALL_RESULTS[model][eval_mode]['rte'][res] for res in ALL_RESULTS[model][eval_mode]['rte']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "rte"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	RTE_ZERO_SHOT = get_data_rte(eval_mode="zero_shot")
	RTE_FIVE_SHOT = get_data_rte(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True):

	df_list = []

	for model in MODEL_LIST:

	try:
	results_list = [ALL_RESULTS[model][eval_mode]['mrpc'][res] for res in ALL_RESULTS[model][eval_mode]['mrpc']]
	accuracy = median([results['accuracy'] for results in results_list])

	res = {
	"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
	"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
	"Accuracy": accuracy,
	}

	df_list.append(res)

	except:
	print('Not found in model: {} for {}'.format(model, "mrpc"))

	df = pd.DataFrame(df_list)
	# If there are any models that are the same, merge them
	# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
	df = df.groupby("Model", as_index=False).first()
	# Put 'Model' column first
	#cols = sorted(list(df.columns))
	cols = list(df.columns)
	cols.insert(0, cols.pop(cols.index("Model")))
	df = df[cols]

	if rank:
	df = add_rank(df, compute_average=True)

	if fillna:
	df.fillna("", inplace=True)

	return df


	MRPC_ZERO_SHOT = get_data_mrpc(eval_mode="zero_shot")
	MRPC_FIVE_SHOT = get_data_mrpc(eval_mode="five_shot")


	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
	# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =


	theme = gr.themes.Soft().set(
	background_fill_primary='*secondary_50'
	)

	block = gr.Blocks(theme='rottenlittlecreature/Moon_Goblin')


	with block:
	gr.Markdown(f"""
	### SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>. Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.
	- Number of Datasets: > 30
	- Number of Languages: > 8
	- Number of Models: {NUM_MODELS}
	- Mode of Evaluation: Zero-Shot, Five-Shot

	### The following table shows the performance of the models on the SeaEval benchmark.
	- For Zero-Shot performance, it is the median value from 5 distinct prompts shown on the above leaderboard to mitigate the influence of random variations induced by prompts.
	- I am trying to evaluate the base models for five-shot performance and instruction-tuned models for zero-shot.
	- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

	""")

	with gr.Tabs():
	with gr.TabItem("Cross-Lingual Consistency"):

	# dataset 1: cross-mmlu


	# dataset 1: cross-mmlu
	with gr.TabItem("Cross-MMLU"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	cross_mmlu_zero_shot_overall = gr.components.Dataframe(
	CROSS_MMLU_ZERO_SHOT_OVERALL,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
	type="pandas",
	)
	with gr.TabItem("Language Performance"):

	with gr.Row():
	cross_mmlu_zero_shot_overall = gr.components.Dataframe(
	CROSS_MMLU_ZERO_SHOT_LANGUAGE,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):

	with gr.Row():
	cross_mmlu_zero_shot_overall = gr.components.Dataframe(
	CROSS_MMLU_FIVE_SHOT_OVERALL,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
	type="pandas",
	)
	with gr.TabItem("Language Performance"):

	with gr.Row():
	gr.components.Dataframe(
	CROSS_MMLU_FIVE_SHOT_LANGUAGE,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
	type="pandas",
	)

	with gr.Row():
	gr.Markdown("""
	Cross-MMLU Leaderboard 🔮
	- Metric: Cross-Lingual Consistency, Accuracy, AC3
	- Languages: English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
	""")


	with gr.TabItem("Cross-XQUAD"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	cross_xquad_zero_shot_overall = gr.components.Dataframe(
	CROSS_XQUAD_ZERO_SHOT_OVERALL,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_OVERALL.columns),
	type="pandas",
	)
	with gr.TabItem("Language Performance"):

	with gr.Row():
	cross_xquad_zero_shot_overall = gr.components.Dataframe(
	CROSS_XQUAD_ZERO_SHOT_LANGUAGE,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_LANGUAGE.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):

	with gr.Row():
	cross_xquad_zero_shot_overall = gr.components.Dataframe(
	CROSS_XQUAD_FIVE_SHOT_OVERALL,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_OVERALL.columns),
	type="pandas",
	)
	with gr.TabItem("Language Performance"):

	with gr.Row():
	gr.components.Dataframe(
	CROSS_XQUAD_FIVE_SHOT_LANGUAGE,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_LANGUAGE.columns),
	type="pandas",
	)

	with gr.Row():
	gr.Markdown("""
	Cross-XQUAD Leaderboard 🔮
	- Metric: Cross-Lingual Consistency, Accuracy, AC3
	- Languages: English, Chinese, Spanish, Vietnamese
	""")


	# dataset 2: cross-logiqa
	with gr.TabItem("Cross-LogiQA"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	CROSS_LOGIQA_ZERO_SHOT_OVERALL,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns),
	type="pandas",
	)
	with gr.TabItem("Language Performance"):

	with gr.Row():
	gr.components.Dataframe(
	CROSS_LOGIQA_ZERO_SHOT_LANGUAGE,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	CROSS_LOGIQA_FIVE_SHOT_OVERALL,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns),
	type="pandas",
	)
	with gr.TabItem("Language Performance"):
	with gr.Row():
	gr.components.Dataframe(
	CROSS_LOGIQA_FIVE_SHOT_LANGUAGE,
	datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	Cross-LogiQA Leaderboard 🔮
	- Metric: Cross-Lingual Consistency, Accuracy, AC3
	- Languages: English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
	""")



	with gr.TabItem("Cultural Reasoning"):

	# dataset 3: SG_EVAL
	with gr.TabItem("SG_EVAL"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	SG_EVAL_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	SG_EVAL_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	SG_EVAL Leaderboard 🔮
	- Metric: Accuracy
	- Languages: English
	""")




	# dataset 4:
	with gr.TabItem("US_EVAL"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	US_EVAL_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	US_EVAL_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	US_EVAL Leaderboard 🔮
	- Metric: Accuracy
	- Languages: English
	""")



	# dataset 5:
	with gr.TabItem("CN_EVAL"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	CN_EVAL_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	CN_EVAL_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	CN_EVAL Leaderboard 🔮
	- Metric: Accuracy
	- Languages: Chinese
	""")


	# dataset 6:
	with gr.TabItem("PH_EVAL"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	PH_EVAL_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	PH_EVAL_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	PH_EVAL Leaderboard 🔮
	- Metric: Accuracy
	- Languages: English
	""")


	# dataset 7:
	with gr.TabItem("Singlish to English Translation"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	SING2ENG_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(SING2ENG_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	SING2ENG_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(SING2ENG_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	SING2ENG Leaderboard 🔮
	- Metric: BLEU Avg.
	- Languages: English
	""")


	with gr.TabItem("General Reasoning"):


	# dataset 12:
	with gr.TabItem("MMLU Subset"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	MMLU_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(MMLU_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	MMLU_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(MMLU_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	MMLU Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")



	# dataset 13:
	with gr.TabItem("MMLU Full"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	MMLU_FULL_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	MMLU_FULL_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	MMLU Full Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")



	# dataset 14:
	with gr.TabItem("C_EVAL Subset"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	C_EVAL_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(C_EVAL_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	C_EVAL_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	C_EVAL Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: Chinese
	""")



	# dataset 15:
	with gr.TabItem("C_EVAL Full"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	C_EVAL_FULL_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	C_EVAL_FULL_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	C_EVAL Full Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: Chinese
	""")


	# dataset 16:
	with gr.TabItem("CMMLU Subset"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	CMMLU_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(CMMLU_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	CMMLU_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(CMMLU_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	CMMLU Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: Chinese
	""")



	# dataset 17:
	with gr.TabItem("CMMLU Full"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	CMMLU_FULL_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	CMMLU_FULL_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	CMMLU Full Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: Chinese
	""")


	# dataset 18:
	with gr.TabItem("ZBench"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	ZBENCH_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(ZBENCH_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	ZBENCH_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(ZBENCH_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	ZBench Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: Chinese
	""")

	# dataset 18:
	with gr.TabItem("IndoMMLU"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	INDOMMLU_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(INDOMMLU_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	INDOMMLU_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(INDOMMLU_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	IndoMMLU Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: Bahasa Indonesian
	""")


	with gr.TabItem("FLORES-Translation"):


	# dataset 8:
	with gr.TabItem("FLORES Indonesian to English Translation"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	FLORES_IND2ENG_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	FLORES_IND2ENG_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	flores_ind2eng Leaderboard 🔮
	- Metric: BLEU Avg.
	- Languages: English
	""")


	# dataset 9:
	with gr.TabItem("FLORES Vitenamese to English Translation"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	FLORES_VIE2ENG_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	FLORES_VIE2ENG_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	flores_vie2eng Leaderboard 🔮
	- Metric: BLEU Avg.
	- Languages: English
	""")



	# dataset 10:
	with gr.TabItem("FLORES Chinese to English Translation"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	FLORES_ZHO2ENG_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	FLORES_ZHO2ENG_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	flores_zho2eng Leaderboard 🔮
	- Metric: BLEU Avg.
	- Languages: English
	""")


	# dataset 11:
	with gr.TabItem("FLORES Malay to English Translation"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	FLORES_ZSM2ENG_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	FLORES_ZSM2ENG_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	flores_zsm2eng Leaderboard 🔮
	- Metric: BLEU Avg.
	- Languages: English
	""")


	with gr.TabItem("Emotion"):

	# dataset 18:
	with gr.TabItem("Indonesian Emotion Classification"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	IND_EMOTION_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	IND_EMOTION_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	Ind_emotion Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: Indonesian
	""")


	# dataset
	with gr.TabItem("SST2"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	SST2_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(SST2_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	SST2_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(SST2_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	SST2 Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")



	with gr.TabItem("Dialogue"):


	# dataset
	with gr.TabItem("DREAM"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	DREAM_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(DREAM_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	DREAM_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(DREAM_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	DREAM Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")

	# dataset
	with gr.TabItem("SAMSum"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	SAMSUM_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(SAMSUM_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	SAMSUM_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(SAMSUM_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	SAMSum Leaderboard 🔮
	- Metric: ROUGE.
	- Languages: English
	""")


	# dataset
	with gr.TabItem("DialogSum"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	DIALOGSUM_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	DIALOGSUM_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	DialogSum Leaderboard 🔮
	- Metric: ROUGE.
	- Languages: English
	""")



	with gr.TabItem("Fundamental NLP Tasks"):


	# dataset
	with gr.TabItem("OCNLI"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	OCNLI_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(OCNLI_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	OCNLI_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(OCNLI_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	OCNLI Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: Chinese
	""")


	# dataset
	with gr.TabItem("C3"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	C3_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(C3_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	C3_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(C3_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	C3 Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: Chinese
	""")




	# dataset
	with gr.TabItem("COLA"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	COLA_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(COLA_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	COLA_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(COLA_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	COLA Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")


	# dataset
	with gr.TabItem("QQP"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	QQP_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(QQP_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	QQP_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(QQP_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	QQP Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")


	# dataset
	with gr.TabItem("MNLI"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	MNLI_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(MNLI_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	MNLI_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(MNLI_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	MNLI Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")


	# dataset
	with gr.TabItem("QNLI"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	QNLI_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(QNLI_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	QNLI_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(QNLI_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	QNLI Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")



	# dataset
	with gr.TabItem("WNLI"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	WNLI_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(WNLI_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	WNLI_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(WNLI_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	WNLI Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")



	# dataset
	with gr.TabItem("RTE"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	RTE_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(RTE_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	RTE_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(RTE_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	RTE Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")

	# dataset
	with gr.TabItem("MRPC"):
	with gr.TabItem("Zero Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	MRPC_ZERO_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(MRPC_ZERO_SHOT.columns),
	type="pandas",
	)
	with gr.TabItem("Five Shot"):
	with gr.TabItem("Overall"):
	with gr.Row():
	gr.components.Dataframe(
	MRPC_FIVE_SHOT,
	datatype=["number", "markdown"] + ["number"] * len(MRPC_FIVE_SHOT.columns),
	type="pandas",
	)
	with gr.Row():
	gr.Markdown("""
	MRPC Leaderboard 🔮
	- Metric: Accuracy.
	- Languages: English
	""")

	gr.Markdown(r"""
	### If our datasets and leaderboard are useful, please consider cite:
	```bibtex
	@article{SeaEval,
	title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
	author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
	journal={NAACL},
	year={2024}}
	```
	""")


	block.queue(max_size=10)
	# block.launch(server_name="0.0.0.0", share=False)
	block.launch(server_name="0.0.0.0", share=True)