Spaces:
Runtime error
Runtime error
bulk 1
Browse files
app.py
CHANGED
|
@@ -100,7 +100,7 @@ def main():
|
|
| 100 |
with tab1:
|
| 101 |
if content:
|
| 102 |
try:
|
| 103 |
-
score_columns = ['
|
| 104 |
|
| 105 |
# Display dataframe
|
| 106 |
full_df = convert_markdown_table_to_dataframe(content)
|
|
@@ -111,26 +111,9 @@ def main():
|
|
| 111 |
full_df['Tags'] = full_df['Tags'].fillna('')
|
| 112 |
df = pd.DataFrame(columns=full_df.columns)
|
| 113 |
|
| 114 |
-
# Toggles
|
| 115 |
-
col1, col2, col3 = st.columns(3)
|
| 116 |
-
with col1:
|
| 117 |
-
show_phi = st.checkbox("Phi (2.8B)", value=True)
|
| 118 |
-
with col2:
|
| 119 |
-
show_mistral = st.checkbox("Mistral (7B)", value=True)
|
| 120 |
-
with col3:
|
| 121 |
-
show_other = st.checkbox("Other", value=True)
|
| 122 |
-
|
| 123 |
# Create a DataFrame based on selected filters
|
| 124 |
dfs_to_concat = []
|
| 125 |
|
| 126 |
-
if show_phi:
|
| 127 |
-
dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
|
| 128 |
-
if show_mistral:
|
| 129 |
-
dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
|
| 130 |
-
if show_other:
|
| 131 |
-
other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
|
| 132 |
-
dfs_to_concat.append(other_df)
|
| 133 |
-
|
| 134 |
# Concatenate the DataFrames
|
| 135 |
if dfs_to_concat:
|
| 136 |
df = pd.concat(dfs_to_concat, ignore_index=True)
|
|
@@ -219,27 +202,15 @@ def main():
|
|
| 219 |
# About tab
|
| 220 |
with tab2:
|
| 221 |
st.markdown('''
|
| 222 |
-
###
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
### Reproducibility
|
| 232 |
-
|
| 233 |
-
You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
|
| 234 |
-
|
| 235 |
-
### Clone this space
|
| 236 |
-
|
| 237 |
-
You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
|
| 238 |
-
|
| 239 |
-
* Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
|
| 240 |
-
* Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
|
| 241 |
-
|
| 242 |
-
A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations and [CultriX](https://huggingface.co/CultriX) for the CSV export and search bar.
|
| 243 |
''')
|
| 244 |
|
| 245 |
if __name__ == "__main__":
|
|
|
|
| 100 |
with tab1:
|
| 101 |
if content:
|
| 102 |
try:
|
| 103 |
+
score_columns = ['Elo']
|
| 104 |
|
| 105 |
# Display dataframe
|
| 106 |
full_df = convert_markdown_table_to_dataframe(content)
|
|
|
|
| 111 |
full_df['Tags'] = full_df['Tags'].fillna('')
|
| 112 |
df = pd.DataFrame(columns=full_df.columns)
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
# Create a DataFrame based on selected filters
|
| 115 |
dfs_to_concat = []
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
# Concatenate the DataFrames
|
| 118 |
if dfs_to_concat:
|
| 119 |
df = pd.concat(dfs_to_concat, ignore_index=True)
|
|
|
|
| 202 |
# About tab
|
| 203 |
with tab2:
|
| 204 |
st.markdown('''
|
| 205 |
+
### Roleplay Leaderboard
|
| 206 |
+
|
| 207 |
+
This space presents the results from the Matou-Garou space, where humans and AIs play a game of Werewolf.
|
| 208 |
+
|
| 209 |
+
It is meant as a social experience to see whether you would be able to detect that you are talking to an AI.
|
| 210 |
+
We also hope that video game creators can use this leaderboard in the future to choose which model to use for LLM-based NPCs.
|
| 211 |
+
|
| 212 |
+
Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks
|
| 213 |
+
Leaderboard copied from [Maxime Labonne](https://huggingface.co/mlabonne)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
''')
|
| 215 |
|
| 216 |
if __name__ == "__main__":
|
yall.py
CHANGED
|
@@ -12,6 +12,7 @@ class GistInfo:
|
|
| 12 |
model_name: str
|
| 13 |
model_id: str
|
| 14 |
model: str
|
|
|
|
| 15 |
agieval: float
|
| 16 |
gpt4all: float
|
| 17 |
truthfulqa: float
|
|
@@ -59,11 +60,7 @@ def create_yall():
|
|
| 59 |
model_name="Model 1",
|
| 60 |
model_id="model-1",
|
| 61 |
model="Model 1",
|
| 62 |
-
|
| 63 |
-
gpt4all=88.7,
|
| 64 |
-
truthfulqa=90.3,
|
| 65 |
-
bigbench=85.6,
|
| 66 |
-
average=90.0
|
| 67 |
),
|
| 68 |
GistInfo(
|
| 69 |
gist_id="dummy_gist_id_2",
|
|
@@ -72,11 +69,7 @@ def create_yall():
|
|
| 72 |
model_name="Model 2",
|
| 73 |
model_id="model-2",
|
| 74 |
model="Model 2",
|
| 75 |
-
|
| 76 |
-
gpt4all=85.0,
|
| 77 |
-
truthfulqa=87.5,
|
| 78 |
-
bigbench=83.0,
|
| 79 |
-
average=86.2
|
| 80 |
),
|
| 81 |
GistInfo(
|
| 82 |
gist_id="dummy_gist_id_3",
|
|
@@ -85,11 +78,7 @@ def create_yall():
|
|
| 85 |
model_name="Model 3",
|
| 86 |
model_id="model-3",
|
| 87 |
model="Model 3",
|
| 88 |
-
|
| 89 |
-
gpt4all=81.4,
|
| 90 |
-
truthfulqa=79.5,
|
| 91 |
-
bigbench=77.0,
|
| 92 |
-
average=79.0
|
| 93 |
)
|
| 94 |
]
|
| 95 |
|
|
@@ -97,12 +86,12 @@ def create_yall():
|
|
| 97 |
gist_infos = sorted(gist_infos, key=lambda x: x.average, reverse=True)
|
| 98 |
|
| 99 |
# Create markdown table
|
| 100 |
-
markdown_table = "| Model | Average |
|
| 101 |
-
markdown_table += "
|
| 102 |
|
| 103 |
for gist in gist_infos:
|
| 104 |
model_link = f"[{gist.model_id}](https://huggingface.co/{gist.model_id})"
|
| 105 |
-
markdown_table += f"| {model_link} [📄]({gist.url}) | {gist.average} | {gist.
|
| 106 |
|
| 107 |
# Update YALL's gist with dummy gist ID and token
|
| 108 |
update_gist(content=markdown_table, gist_id="dummy_gist_id_yall", access_token="dummy_access_token")
|
|
|
|
| 12 |
model_name: str
|
| 13 |
model_id: str
|
| 14 |
model: str
|
| 15 |
+
elo:float
|
| 16 |
agieval: float
|
| 17 |
gpt4all: float
|
| 18 |
truthfulqa: float
|
|
|
|
| 60 |
model_name="Model 1",
|
| 61 |
model_id="model-1",
|
| 62 |
model="Model 1",
|
| 63 |
+
elo=1900
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
),
|
| 65 |
GistInfo(
|
| 66 |
gist_id="dummy_gist_id_2",
|
|
|
|
| 69 |
model_name="Model 2",
|
| 70 |
model_id="model-2",
|
| 71 |
model="Model 2",
|
| 72 |
+
elo=2000
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
),
|
| 74 |
GistInfo(
|
| 75 |
gist_id="dummy_gist_id_3",
|
|
|
|
| 78 |
model_name="Model 3",
|
| 79 |
model_id="model-3",
|
| 80 |
model="Model 3",
|
| 81 |
+
elo=2200
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
)
|
| 83 |
]
|
| 84 |
|
|
|
|
| 86 |
gist_infos = sorted(gist_infos, key=lambda x: x.average, reverse=True)
|
| 87 |
|
| 88 |
# Create markdown table
|
| 89 |
+
markdown_table = "| Model | Average | Elo |\n"
|
| 90 |
+
markdown_table += "|---|---:|---:|\n"
|
| 91 |
|
| 92 |
for gist in gist_infos:
|
| 93 |
model_link = f"[{gist.model_id}](https://huggingface.co/{gist.model_id})"
|
| 94 |
+
markdown_table += f"| {model_link} [📄]({gist.url}) | {gist.average} | {gist.elo}\n"
|
| 95 |
|
| 96 |
# Update YALL's gist with dummy gist ID and token
|
| 97 |
update_gist(content=markdown_table, gist_id="dummy_gist_id_yall", access_token="dummy_access_token")
|