Commit 094d4db · Parent(s): 20dad4a
[FIX] Read evals
Browse files

- app.py +113 -112
- src/envs.py +1 -1
- src/leaderboard/read_evals.py +27 -15
app.py CHANGED

@@ -262,118 +262,6 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    # with gr.Row():
-                    #     deleted_models_visibility = gr.Checkbox(
-                    #         value=False, label="Show gated/private/deleted models", interactive=True
-                    #     )
-                with gr.Column(min_width=320):
-                    # with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model Types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    # filter_columns_architecture = gr.CheckboxGroup(
-                    #     label="Architecture Types",
-                    #     choices=[i.value.name for i in ModelArch],
-                    #     value=[i.value.name for i in ModelArch],
-                    #     interactive=True,
-                    #     elem_id="filter-columns-architecture",
-                    # )
-                    filter_domain_specific = gr.CheckboxGroup(
-                        label="Domain Specificity",
-                        choices=["🏥 Clinical models", "Generic models"],
-                        value=["🏥 Clinical models", "Generic models"],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-
-            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
-
-            leaderboard_table = gr.components.Dataframe(
-                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=datasets_original_df[DATASET_COLS],
-                headers=DATASET_COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-
-
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    search_bar,
-                    filter_columns_type,
-                    filter_domain_specific,
-                    filter_columns_size
-                    # filter_columns_architecture
-                ],
-                leaderboard_table,
-            )
-            for selector in [
-                shown_columns,
-                filter_columns_type,
-                filter_domain_specific,
-                # filter_columns_architecture,
-                filter_columns_size,
-                # deleted_models_visibility,
-            ]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        search_bar,
-                        filter_columns_type,
-                        filter_domain_specific,
-                        filter_columns_size
-                        # filter_columns_architecture,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
         with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
             with gr.Row():
                 with gr.Column():
@@ -1065,6 +953,119 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
+        with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    # with gr.Row():
+                    #     deleted_models_visibility = gr.Checkbox(
+                    #         value=False, label="Show gated/private/deleted models", interactive=True
+                    #     )
+                with gr.Column(min_width=320):
+                    # with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model Types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    # filter_columns_architecture = gr.CheckboxGroup(
+                    #     label="Architecture Types",
+                    #     choices=[i.value.name for i in ModelArch],
+                    #     value=[i.value.name for i in ModelArch],
+                    #     interactive=True,
+                    #     elem_id="filter-columns-architecture",
+                    # )
+                    filter_domain_specific = gr.CheckboxGroup(
+                        label="Domain Specificity",
+                        choices=["🏥 Clinical models", "Generic models"],
+                        value=["🏥 Clinical models", "Generic models"],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+
+            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
+
+            leaderboard_table = gr.components.Dataframe(
+                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=datasets_original_df[DATASET_COLS],
+                headers=DATASET_COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+
+
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    search_bar,
+                    filter_columns_type,
+                    filter_domain_specific,
+                    filter_columns_size
+                    # filter_columns_architecture
+                ],
+                leaderboard_table,
+            )
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                filter_domain_specific,
+                # filter_columns_architecture,
+                filter_columns_size,
+                # deleted_models_visibility,
+            ]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                        filter_columns_type,
+                        filter_domain_specific,
+                        filter_columns_size
+                        # filter_columns_architecture,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+
+
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
             gr.HTML(FIVE_PILLAR_DIAGRAM)
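For orientation, the relocated app.py block follows the stock Hugging Face leaderboard wiring: every control feeds the same update_table callback, and a hidden, unfiltered Dataframe acts as the source of truth so that clearing the search box restores rows (the "dummy leaderboard" comment above). A minimal runnable sketch of that pattern, with a hypothetical three-input update_table and toy data standing in for the real column metadata:

import gradio as gr
import pandas as pd

DF = pd.DataFrame({"Model": ["clinical-llm-a", "generic-llm-b"], "Average": [71.2, 64.8]})

def update_table(full_df: pd.DataFrame, columns: list[str], query: str) -> pd.DataFrame:
    # Always filter from the full frame, never from the visible (already filtered) one.
    out = full_df[full_df["Model"].str.contains(query, case=False)] if query else full_df
    return out[["Model"] + [c for c in columns if c != "Model"]]

with gr.Blocks() as demo:
    search_bar = gr.Textbox(show_label=False, placeholder="Search for your model...")
    shown_columns = gr.CheckboxGroup(choices=["Average"], value=["Average"], label="Select columns to show")
    hidden_table = gr.Dataframe(value=DF, visible=False)  # unfiltered source of truth
    leaderboard_table = gr.Dataframe(value=DF, interactive=False)
    inputs = [hidden_table, shown_columns, search_bar]
    search_bar.submit(update_table, inputs, leaderboard_table)
    shown_columns.change(update_table, inputs, leaderboard_table, queue=True)

if __name__ == "__main__":
    demo.launch()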
    	
src/envs.py CHANGED

@@ -8,7 +8,7 @@ TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
 
 OWNER = "m42-health"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
-PRIVATE_REPO = 
+PRIVATE_REPO = False
 
 
 if PRIVATE_REPO:
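The flipped flag is consumed by the `if PRIVATE_REPO:` branch visible in the context lines below it. As a hedged sketch of the usual pattern, not the actual envs.py contents, such a flag typically switches which Hub repo ids the app reads from; the repo names here are hypothetical:

OWNER = "m42-health"   # real value, from the context lines above
PRIVATE_REPO = False   # the value this commit sets

# Hypothetical repo ids for illustration; the real ones live outside this hunk.
if PRIVATE_REPO:
    QUEUE_REPO = f"{OWNER}/requests-private"
    RESULTS_REPO = f"{OWNER}/results-private"
else:
    QUEUE_REPO = f"{OWNER}/requests"
    RESULTS_REPO = f"{OWNER}/results"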
    	
src/leaderboard/read_evals.py CHANGED

@@ -54,7 +54,8 @@ class EvalResult:
             except:
                 breakpoint()
 
-
+        # if "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" in json_filepath:
+        #     breakpoint()
         config = data.get("config")
 
         # Precision
@@ -113,7 +114,8 @@ class EvalResult:
            if open_ended_results["ELO_intervals"] is not None and open_ended_results["Score_intervals"] is not None:
                open_ended_results["ELO_intervals"] = "+" + str(open_ended_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_results["ELO_intervals"][0]))
                open_ended_results["Score_intervals"] = "+" + str(open_ended_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_results["Score_intervals"][0]))
-        # 
+        # if "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" in json_filepath:
+        #     breakpoint()
         # changes to be made here
         med_safety_results = {}
         if "med-safety" in data["results"]:
@@ -178,12 +180,12 @@ class EvalResult:
                    continue
                mean_acc = np.mean(accs)  # * 100.0
                closed_ended_arabic_results[task.benchmark] = mean_acc
-        if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
-            open_ended_results = {}
-            med_safety_results = {}
-            medical_summarization_results = {}
-            aci_results = {}
-            soap_results = {}
+        # if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
+        #     open_ended_results = {}
+        #     med_safety_results = {}
+        #     medical_summarization_results = {}
+        #     aci_results = {}
+        #     soap_results = {}
         # types_results = {}
         # for clinical_type in ClinicalTypes:
         #     clinical_type = clinical_type.value
@@ -195,7 +197,8 @@ class EvalResult:
 
         #     mean_acc = np.mean(accs)  # * 100.0
         #     types_results[clinical_type.benchmark] = mean_acc
-
+        # if "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" in json_filepath:
+        #     breakpoint()
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -337,6 +340,14 @@ def get_request_file_for_model(requests_path, model_name, precision):
                request_file = tmp_request_file
     return request_file
 
+def update_results(result1, result2):
+    # breakpoint()
+    for key in dir(result1):
+        if key.endswith("_results"):
+            if getattr(result1, key) == {}:
+                setattr(result1, key, getattr(result2, key))
+    # breakpoint()
+    return result1
 
 def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metric: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
@@ -355,7 +366,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metric: str) -> list[EvalResult]:
 
         for file in files:
            model_result_filepaths.append(os.path.join(root, file))
-
+    # breakpoint()
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # Creation of result
@@ -364,11 +375,12 @@ def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metric: str) -> list[EvalResult]:
 
        # Store results of same eval together
        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
+        if eval_name in eval_results.keys():
+            eval_results[eval_name] = update_results(eval_results[eval_name], eval_result)
+            # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result
+    # breakpoint()
    results = []
    # clinical_type_results = []
    for v in eval_results.values():
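The core of the fix is the new update_results helper together with the merge branch in get_raw_eval_results: when two result files share an eval_name, any *_results bucket that is still empty on the stored object is filled in from the newcomer, instead of the later file silently replacing the earlier one. A self-contained sketch of that merge, using a pared-down hypothetical EvalResult (the real class carries many more buckets):

from dataclasses import dataclass, field

@dataclass
class EvalResult:
    # Pared-down, hypothetical stand-in for the real EvalResult: just an
    # eval name and two of the *_results buckets the real class carries.
    eval_name: str
    open_ended_results: dict = field(default_factory=dict)
    med_safety_results: dict = field(default_factory=dict)

def update_results(result1, result2):
    # Same logic as the diff: dir() walks every attribute (methods and
    # dunders included), the endswith guard selects only *_results buckets,
    # and a bucket is copied over only while it is still an empty dict.
    for key in dir(result1):
        if key.endswith("_results"):
            if getattr(result1, key) == {}:
                setattr(result1, key, getattr(result2, key))
    return result1

# Two partial result files for the same eval merge into one complete row.
a = EvalResult("model_float16", open_ended_results={"ELO": 1050})
b = EvalResult("model_float16", med_safety_results={"score": 0.91})
merged = update_results(a, b)
assert merged.open_ended_results == {"ELO": 1050}
assert merged.med_safety_results == {"score": 0.91}

Note the bucket-level granularity: if both files populate the same bucket, the stored one wins and the newcomer's values are ignored, unlike the key-by-key `.results.update(...)` merge this commit leaves commented out.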