Roleplay_leaderboard

Runtime error

App Files Community

Jofthomas committed on May 25, 2024

Commit

e8631c9

1 Parent(s): 33e4196

bulk 1

Browse files

Files changed (2) hide show

app.py +10 -39
yall.py +7 -18

app.py CHANGED Viewed

@@ -100,7 +100,7 @@ def main():
     with tab1:
         if content:
             try:
-                score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
                 # Display dataframe
                 full_df = convert_markdown_table_to_dataframe(content)
@@ -111,26 +111,9 @@ def main():
                 full_df['Tags'] = full_df['Tags'].fillna('')
                 df = pd.DataFrame(columns=full_df.columns)
-                # Toggles
-                col1, col2, col3 = st.columns(3)
-                with col1:
-                    show_phi = st.checkbox("Phi (2.8B)", value=True)
-                with col2:
-                    show_mistral = st.checkbox("Mistral (7B)", value=True)
-                with col3:
-                    show_other = st.checkbox("Other", value=True)
                 # Create a DataFrame based on selected filters
                 dfs_to_concat = []
-                if show_phi:
-                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
-                if show_mistral:
-                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
-                if show_other:
-                    other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
-                    dfs_to_concat.append(other_df)
                 # Concatenate the DataFrames
                 if dfs_to_concat:
                     df = pd.concat(dfs_to_concat, ignore_index=True)
@@ -219,27 +202,15 @@ def main():
      # About tab
     with tab2:
         st.markdown('''
-            ### Nous benchmark suite
-            Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
-            * [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
-            * **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
-            * [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
-            * [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
-            ### Reproducibility
-            You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
-            ### Clone this space
-            You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
-            * Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
-            * Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
-            A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations and [CultriX](https://huggingface.co/CultriX) for the CSV export and search bar.
         ''')
 if __name__ == "__main__":

     with tab1:
         if content:
             try:
+                score_columns = ['Elo']
                 # Display dataframe
                 full_df = convert_markdown_table_to_dataframe(content)
                 full_df['Tags'] = full_df['Tags'].fillna('')
                 df = pd.DataFrame(columns=full_df.columns)
                 # Create a DataFrame based on selected filters
                 dfs_to_concat = []
                 # Concatenate the DataFrames
                 if dfs_to_concat:
                     df = pd.concat(dfs_to_concat, ignore_index=True)
      # About tab
     with tab2:
         st.markdown('''
+            ### Roleplay Leaderboard
+        This space presents the results from the Matou-Garou space, where humans and AIs play a game of Werewolf together.
+        It is meant as a social experiment to see whether you can tell when you are talking to an AI.
+        We also hope that video game creators can use this leaderboard in the future to choose which model to use for LLM-based NPCs.
+           Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks
+           Leaderboard copied from [Maxime Labonne](https://huggingface.co/mlabonne)
         ''')
 if __name__ == "__main__":

yall.py CHANGED Viewed

@@ -12,6 +12,7 @@ class GistInfo:
     model_name: str
     model_id: str
     model: str
     agieval: float
     gpt4all: float
     truthfulqa: float
@@ -59,11 +60,7 @@ def create_yall():
             model_name="Model 1",
             model_id="model-1",
             model="Model 1",
-            agieval=95.4,
-            gpt4all=88.7,
-            truthfulqa=90.3,
-            bigbench=85.6,
-            average=90.0
         ),
         GistInfo(
             gist_id="dummy_gist_id_2",
@@ -72,11 +69,7 @@ def create_yall():
             model_name="Model 2",
             model_id="model-2",
             model="Model 2",
-            agieval=89.1,
-            gpt4all=85.0,
-            truthfulqa=87.5,
-            bigbench=83.0,
-            average=86.2
         ),
         GistInfo(
             gist_id="dummy_gist_id_3",
@@ -85,11 +78,7 @@ def create_yall():
             model_name="Model 3",
             model_id="model-3",
             model="Model 3",
-            agieval=78.2,
-            gpt4all=81.4,
-            truthfulqa=79.5,
-            bigbench=77.0,
-            average=79.0
         )
     ]
@@ -97,12 +86,12 @@ def create_yall():
     gist_infos = sorted(gist_infos, key=lambda x: x.average, reverse=True)
     # Create markdown table
-    markdown_table = "| Model | Average | AGIEval | GPT4All | TruthfulQA | Bigbench |\n"
-    markdown_table += "|---|---:|---:|---:|---:|---:|\n"
     for gist in gist_infos:
         model_link = f"[{gist.model_id}](https://huggingface.co/{gist.model_id})"
-        markdown_table += f"| {model_link} [📄]({gist.url}) | {gist.average} | {gist.agieval} | {gist.gpt4all} | {gist.truthfulqa} | {gist.bigbench} |\n"
     # Update YALL's gist with dummy gist ID and token
     update_gist(content=markdown_table, gist_id="dummy_gist_id_yall", access_token="dummy_access_token")

     model_name: str
     model_id: str
     model: str
+    elo:float
     agieval: float
     gpt4all: float
     truthfulqa: float
             model_name="Model 1",
             model_id="model-1",
             model="Model 1",
+            elo=1900
         ),
         GistInfo(
             gist_id="dummy_gist_id_2",
             model_name="Model 2",
             model_id="model-2",
             model="Model 2",
+            elo=2000
         ),
         GistInfo(
             gist_id="dummy_gist_id_3",
             model_name="Model 3",
             model_id="model-3",
             model="Model 3",
+            elo=2200
         )
     ]
     gist_infos = sorted(gist_infos, key=lambda x: x.average, reverse=True)
     # Create markdown table
+    markdown_table = "| Model | Average | Elo |\n"
+    markdown_table += "|---|---:|---:|\n"
     for gist in gist_infos:
         model_link = f"[{gist.model_id}](https://huggingface.co/{gist.model_id})"
+        markdown_table += f"| {model_link} [📄]({gist.url}) | {gist.average} | {gist.elo}\n"
     # Update YALL's gist with dummy gist ID and token
     update_gist(content=markdown_table, gist_id="dummy_gist_id_yall", access_token="dummy_access_token")