|  | """A gradio app that renders a static leaderboard. This is used for Hugging Face Space.""" | 
					
						
						|  |  | 
					
						
						|  | import ast | 
					
						
						|  | import argparse | 
					
						
						|  | import glob | 
					
						
						|  | import pickle | 
					
						
						|  |  | 
					
						
						|  | import gradio as gr | 
					
						
						|  | import numpy as np | 
					
						
						|  | import pandas as pd | 
					
						
						|  | import plotly.graph_objects as go | 
					
						
						|  | import pandas as pd | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
# Completion price per model, keyed by the "vendor/model" identifier that is
# also used to build the OpenRouter model link.
# get_full_table() looks every model up here and renders the value as
# "<value>$" in the column headed "💰 Cost (1M-Tokens)"; a model missing from
# this mapping raises KeyError there.
# NOTE(review): values are presumably USD per 1M completion tokens — confirm.
MODEL_NAME_COST = {
    "anthropic/claude-2.1": 8,
    "anthropic/claude-3-haiku": 0.25,
    "anthropic/claude-3-opus": 15,
    "anthropic/claude-3-sonnet": 3,
    "cohere/command-r": 0.5,
    "google/gemini-pro": 0.12,
    "google/gemma-7b-it": 0.1,
    "mistralai/mistral-large": 8,
    "mistralai/mistral-medium": 2.7,
    "mistralai/mixtral-8x7b-instruct": 0.7,
    "openai/gpt-3.5-turbo": 0.5,
    "openai/gpt-4-1106-preview": 10,
}
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def make_default_md(): | 
					
						
						|  |  | 
					
						
						|  | leaderboard_md = f""" | 
					
						
						|  | # 🏆 CZ-EVAL Leaderboard | 
					
						
						|  | [Developer](https://me.hynky.name/) | [Twitter](https://twitter.com/HKydlicek) | 
					
						
						|  |  | 
					
						
						|  | CZ-EVAL is a evaluation leadboard of Tasks in Czech for LLMs. | 
					
						
						|  |  | 
					
						
						|  | It's evaluated on following datasets: | 
					
						
						|  |  | 
					
						
						|  | - Math Problems Understanding [Klokan-QA](https://huggingface.co/datasets/hynky/klokan-qa) | 
					
						
						|  | - Reasoning and General Knowledge [TSP-QA](https://huggingface.co/datasets/hynky/tsp-qa) | 
					
						
						|  |  | 
					
						
						|  | 💻 Code: The evaluation code can be found at [hynky1999/LLM-Eval](https://github.com/hynky1999/LLM-Eval). Model inference is done using [Open-Router](https://openrouter.ai/) or on cloud using [Modal Labs](https://modal.com/). | 
					
						
						|  | """ | 
					
						
						|  | return leaderboard_md | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def make_arena_leaderboard_md(arena_df): | 
					
						
						|  | total_models = len(arena_df) | 
					
						
						|  |  | 
					
						
						|  | leaderboard_md = f""" | 
					
						
						|  | Total #models: **{total_models}**. Last updated: Mar 17, 2024. | 
					
						
						|  | """ | 
					
						
						|  | return leaderboard_md | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def make_full_leaderboard_md(elo_results): | 
					
						
						|  | leaderboard_md = f""" | 
					
						
						|  | Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**. | 
					
						
						|  | - [Klokan-QA](https://huggingface.co/datasets/hynky/klokan-qa) - Mathematical competitions dataset | 
					
						
						|  | - [TSP](https://huggingface.co/datasets/hynky/TSP) - Comprehensive dataset of | 
					
						
						|  |  | 
					
						
						|  | """ | 
					
						
						|  | return leaderboard_md | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def plot_spider(df, title): | 
					
						
						|  | categories = df.columns.tolist()[1:] | 
					
						
						|  | categories = [ | 
					
						
						|  | *categories, | 
					
						
						|  | categories[0], | 
					
						
						|  | ] | 
					
						
						|  | colors = [ | 
					
						
						|  | '#1f77b4', | 
					
						
						|  | '#ff7f0e', | 
					
						
						|  | '#2ca02c', | 
					
						
						|  | '#d62728', | 
					
						
						|  | '#9467bd', | 
					
						
						|  | '#8c564b', | 
					
						
						|  | '#e377c2', | 
					
						
						|  | '#7f7f7f', | 
					
						
						|  | '#bcbd22', | 
					
						
						|  | '#17becf', | 
					
						
						|  | '#f7b6d2', | 
					
						
						|  | '#bcbd22', | 
					
						
						|  | '#dbdb8d', | 
					
						
						|  | '#17becf', | 
					
						
						|  | '#9edae5', | 
					
						
						|  | '#c5b0d5', | 
					
						
						|  | '#c49c94', | 
					
						
						|  | '#f7b6d2', | 
					
						
						|  | '#bcbd22', | 
					
						
						|  | '#dbdb8d', | 
					
						
						|  | '#17becf', | 
					
						
						|  | '#9edae5', | 
					
						
						|  | '#c5b0d5', | 
					
						
						|  | '#c49c94', | 
					
						
						|  | ] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | fig_1000 = go.Figure() | 
					
						
						|  |  | 
					
						
						|  | for i, (idx, row) in enumerate(df.iterrows()): | 
					
						
						|  | name = row[0] | 
					
						
						|  | row = row.tolist()[1:] | 
					
						
						|  | row = row + [ | 
					
						
						|  | row[0] | 
					
						
						|  | ] | 
					
						
						|  | color = colors[i] | 
					
						
						|  | fig_1000.add_trace( | 
					
						
						|  | go.Scatterpolar( | 
					
						
						|  | r=row, | 
					
						
						|  | theta=categories, | 
					
						
						|  | opacity=0.4, | 
					
						
						|  | name=name, | 
					
						
						|  | line=dict( | 
					
						
						|  | color=color, width=4 | 
					
						
						|  | ), | 
					
						
						|  | ) | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | fig_1000.update_layout( | 
					
						
						|  | width=600, | 
					
						
						|  | height=950, | 
					
						
						|  | polar=dict( | 
					
						
						|  | angularaxis=dict( | 
					
						
						|  | gridwidth=2, | 
					
						
						|  | rotation=90, | 
					
						
						|  | direction="clockwise", | 
					
						
						|  | ), | 
					
						
						|  | radialaxis=dict( | 
					
						
						|  | visible=True, | 
					
						
						|  | range=[0, 100], | 
					
						
						|  | angle=45, | 
					
						
						|  | tickangle=45, | 
					
						
						|  | tickvals=[0, 25, 50, 75, 100], | 
					
						
						|  | ticktext=["0%", "25%", "50%", "75%", "100%"], | 
					
						
						|  | ), | 
					
						
						|  | ), | 
					
						
						|  | title_text=title, | 
					
						
						|  | title_x=0.5, | 
					
						
						|  | title_y=0.97, | 
					
						
						|  | title_xanchor="center", | 
					
						
						|  | title_yanchor="top", | 
					
						
						|  | title_font_size=24, | 
					
						
						|  | title_font_color="#333333", | 
					
						
						|  | font=dict(family="Arial", size=16, color="#333333"), | 
					
						
						|  | legend=dict( | 
					
						
						|  | orientation="h", yanchor="bottom", y=-0.45, xanchor="center", x=0.5 | 
					
						
						|  | ), | 
					
						
						|  | ) | 
					
						
						|  | return fig_1000 | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def openrouter_hyperlink(model_name): | 
					
						
						|  | return f'<a target="_blank" href="https://openrouter.ai/models/{model_name}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>' | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_full_table(model_table_df): | 
					
						
						|  | num_cols = ["klokan", "culture", "analytical", "critical", "verbal"] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | model_table_df["average"] = model_table_df[num_cols].mean(axis=1) | 
					
						
						|  | model_table_df[num_cols + ["average"]] = model_table_df[ | 
					
						
						|  | num_cols + ["average"] | 
					
						
						|  | ].apply(lambda x: round(x * 100, 2)) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | model_table_df.sort_values(by="average", ascending=False, inplace=True) | 
					
						
						|  | model_table_df.insert(0, "rank", np.arange(1, len(model_table_df) + 1)) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | model_table_df["completion_price"] = model_table_df["model_name"].apply( | 
					
						
						|  | lambda x: f"{MODEL_NAME_COST[x]}$" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | model_table_df["model_name"] = model_table_df["model_name"].apply( | 
					
						
						|  | lambda x: openrouter_hyperlink(x) | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | model_table_df = model_table_df[["rank", "model_name", "completion_price", "klokan", "culture", "analytical", "critical", "verbal", "average"]] | 
					
						
						|  |  | 
					
						
						|  | model_table_df.rename( | 
					
						
						|  | columns={ | 
					
						
						|  | "model_name": "🤖 Model", | 
					
						
						|  | "completion_price": "💰 Cost (1M-Tokens)", | 
					
						
						|  | "klokan": "🧮 Klokan-QA", | 
					
						
						|  | "culture": "🌍 TSP-Culture", | 
					
						
						|  | "analytical": "🔍 TSP-Analytical", | 
					
						
						|  | "critical": "💡 TSP-Critical", | 
					
						
						|  | "verbal": "📖 TSP-Verbal", | 
					
						
						|  | "average": "📊 Average", | 
					
						
						|  | }, | 
					
						
						|  | inplace=True, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | return model_table_df | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def build_leaderboard_tab(leaderboard_table_file, klokan_table_file, tsp_table_file): | 
					
						
						|  |  | 
					
						
						|  | results = pd.read_csv(leaderboard_table_file) | 
					
						
						|  | results = get_full_table(results) | 
					
						
						|  |  | 
					
						
						|  | default_md = make_default_md() | 
					
						
						|  |  | 
					
						
						|  | md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown") | 
					
						
						|  | with gr.Tabs() as tabs: | 
					
						
						|  |  | 
					
						
						|  | with gr.Tab("CZ-EVAL Leaderboard", id=0): | 
					
						
						|  | md = make_arena_leaderboard_md(results) | 
					
						
						|  | gr.Markdown(md, elem_id="leaderboard_markdown") | 
					
						
						|  | gr.Dataframe( | 
					
						
						|  | datatype=[ | 
					
						
						|  | "str", | 
					
						
						|  | "markdown", | 
					
						
						|  | "number", | 
					
						
						|  | "number", | 
					
						
						|  | "number", | 
					
						
						|  | "number", | 
					
						
						|  | "number", | 
					
						
						|  | "number", | 
					
						
						|  | "str", | 
					
						
						|  | "str", | 
					
						
						|  | "str", | 
					
						
						|  | ], | 
					
						
						|  | value=results, | 
					
						
						|  | elem_id="arena_leaderboard_dataframe", | 
					
						
						|  | height=700, | 
					
						
						|  | column_widths=[ | 
					
						
						|  | 70, | 
					
						
						|  | 200, | 
					
						
						|  | 110, | 
					
						
						|  | 120, | 
					
						
						|  | 120, | 
					
						
						|  | 120, | 
					
						
						|  | 120, | 
					
						
						|  | 100, | 
					
						
						|  | 100, | 
					
						
						|  | ], | 
					
						
						|  | wrap=True, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | p1 = plot_spider(pd.read_csv(klokan_table_file), "Klokan-QA - Acurracy") | 
					
						
						|  | p2 = plot_spider(pd.read_csv(tsp_table_file), "TSP - Accuracy") | 
					
						
						|  |  | 
					
						
						|  | gr.Markdown( | 
					
						
						|  | f"""## More Statistics for CZ-EVAL\n | 
					
						
						|  | Below are figures for more statistics. | 
					
						
						|  | """, | 
					
						
						|  | elem_id="leaderboard_markdown", | 
					
						
						|  | ) | 
					
						
						|  | with gr.Row(): | 
					
						
						|  | with gr.Column(): | 
					
						
						|  | gr.Markdown( | 
					
						
						|  | "#### Figure 1: Performance of models on Klokan-QA per difficulty" | 
					
						
						|  | ) | 
					
						
						|  | plot_1 = gr.Plot(p1, show_label=False) | 
					
						
						|  | with gr.Column(): | 
					
						
						|  | gr.Markdown("#### Figure 2: Performance of models on TSP dataset") | 
					
						
						|  | plot_2 = gr.Plot(p2, show_label=False) | 
					
						
						|  |  | 
					
						
						|  | return [md_1, plot_1, plot_2] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
# Custom CSS handed to gr.Blocks(css=...) in build_demo().
# It enlarges and pads the Markdown sections, hides the page footer and sizes
# images inside ".image-container".
# NOTE(review): in the visible part of this file only the elem_ids
# "leaderboard_markdown" and "arena_leaderboard_dataframe" are assigned, so the
# "#notice_markdown" and "#leaderboard_dataframe" rules appear to match
# nothing — confirm before relying on or removing them.
block_css = """
#notice_markdown {
font-size: 104%
}
#notice_markdown th {
display: none;
}
#notice_markdown td {
padding-top: 6px;
padding-bottom: 6px;
}
#leaderboard_markdown {
font-size: 104%
}
#leaderboard_markdown td {
padding-top: 6px;
padding-bottom: 6px;
}
#leaderboard_dataframe td {
line-height: 0.1em;
}
footer {
display:none !important
}
.image-container {
display: flex;
align-items: center;
padding: 1px;
}
.image-container img {
margin: 0 30px;
height: 20px;
max-height: 100%;
width: auto;
max-width: 20%;
}
"""
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def build_demo(leadboard_table, klokan_table, tsp_table): | 
					
						
						|  | text_size = gr.themes.sizes.text_lg | 
					
						
						|  |  | 
					
						
						|  | with gr.Blocks( | 
					
						
						|  | title="CZ-EVAL Leaderboard", | 
					
						
						|  | theme=gr.themes.Base(text_size=text_size), | 
					
						
						|  | css=block_css, | 
					
						
						|  | ) as demo: | 
					
						
						|  | leader_components = build_leaderboard_tab( | 
					
						
						|  | leadboard_table, klokan_table, tsp_table | 
					
						
						|  | ) | 
					
						
						|  | return demo | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
# Build the UI at import time so that a module-level `demo` object exists even
# when this file is imported rather than executed as a script.
# NOTE(review): presumably what the Hugging Face Space runtime expects — confirm.
# The CSV paths are relative to the current working directory.
demo = build_demo(
    leadboard_table="./leaderboard/table.csv",
    klokan_table="./leaderboard/klokan.csv",
    tsp_table="./leaderboard/tsp.csv",
)

if __name__ == "__main__":
    # Start the Gradio server only when the file is run directly.
    demo.launch()
					
						
						|  |  |