update
app.py CHANGED
@@ -25,9 +25,7 @@ def make_default_md(arena_df, elo_results):
 | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
-
-Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}) and more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
 """
 return leaderboard_md
 
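For context on the Elo mention in the hunk above: the ratings are fit from pairwise human preference votes. Below is a minimal sketch of a classic online Elo update over battle records, in the app's language (Python). It is an illustration only, not the leaderboard's exact pipeline (that lives in the linked notebook); the constants and the battles record format are assumptions.

from collections import defaultdict

def compute_elo_sketch(battles, k=4, base=10, scale=400, init_rating=1000):
    # battles: iterable of (model_a, model_b, winner) where winner is
    # "model_a", "model_b", or "tie" -- a hypothetical record format.
    rating = defaultdict(lambda: init_rating)
    for model_a, model_b, winner in battles:
        ra, rb = rating[model_a], rating[model_b]
        # Expected score of model_a under the Elo logistic model.
        ea = 1 / (1 + base ** ((rb - ra) / scale))
        sa = {"model_a": 1.0, "model_b": 0.0, "tie": 0.5}[winner]
        rating[model_a] = ra + k * (sa - ea)  # winner moves up ...
        rating[model_b] = rb + k * (ea - sa)  # ... loser down, symmetrically
    return dict(rating)

votes = [("gpt-4", "llama-2-70b", "model_a"), ("gpt-4", "claude-2", "tie")]
print(compute_elo_sketch(votes))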
@@ -37,9 +35,10 @@ def make_arena_leaderboard_md(arena_df):
 total_models = len(arena_df)
 space = " "
 leaderboard_md = f"""
-Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: April
+Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: April 11, 2024.
 
 📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)!
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). Cast your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
 """
 return leaderboard_md
 
@@ -405,7 +404,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 gr.Markdown(
 f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
 A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
-See Figure 3 below for visualization of the confidence intervals.
+See Figure 3 below for visualization of the confidence intervals. More details in [notebook]({notebook_url}).
 """,
 elem_id="leaderboard_markdown"
 )
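The note in this last hunk encodes a concrete ranking rule: model A outranks model B only when A's 95% confidence-interval lower bound exceeds B's upper bound, so models with overlapping intervals share a rank. A minimal sketch of that rule, assuming a hypothetical per-model dict of (lower, upper) bounds (the real app reads its scores from elo_results_file):

def rank_with_ci(intervals):
    # intervals: dict model -> (lower, upper) bounds of the 95% CI.
    ranks = {}
    for m, (lo_m, hi_m) in intervals.items():
        # Rank = 1 + number of models whose lower bound clears m's upper bound.
        ranks[m] = 1 + sum(
            1 for o, (lo_o, _) in intervals.items() if o != m and lo_o > hi_m
        )
    return ranks

cis = {"model-x": (1180, 1210), "model-y": (1195, 1225), "model-z": (1100, 1130)}
print(rank_with_ci(cis))  # x and y overlap and share rank 1; z ranks 3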