update results & separate results organization
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +2 -1
- constants.py +1 -1
- static/eval_results/Default/Aquila_VL_2B/summary_results.json +251 -0
- static/eval_results/Default/Aquila_VL_2B/task_results.json +0 -0
- static/eval_results/Default/Aria/summary_results.json +251 -0
- static/eval_results/Default/Aria/task_results.json +0 -0
- static/eval_results/Default/Claude_3.5/summary_results.json +251 -0
- static/eval_results/Default/Claude_3.5/task_results.json +0 -0
- static/eval_results/Default/Claude_3.5_new/summary_results.json +251 -0
- static/eval_results/Default/Claude_3.5_new/task_results.json +0 -0
- static/eval_results/Default/GPT_4o/summary_results.json +251 -0
- static/eval_results/Default/GPT_4o/task_results.json +0 -0
- static/eval_results/Default/GPT_4o_mini/summary_results.json +251 -0
- static/eval_results/Default/GPT_4o_mini/task_results.json +0 -0
- static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json +251 -0
- static/eval_results/Default/Gemini_1.5_flash_002/task_results.json +0 -0
- static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json +251 -0
- static/eval_results/Default/Gemini_1.5_pro_002/task_results.json +0 -0
- static/eval_results/Default/Idefics3/summary_results.json +251 -0
- static/eval_results/Default/Idefics3/task_results.json +0 -0
- static/eval_results/Default/InternVL2_2B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_2B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_5_2B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_5_2B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_5_78B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_5_78B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_76B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_76B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_8B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_8B/task_results.json +0 -0
- static/eval_results/Default/Llama_3_2_11B/summary_results.json +251 -0
- static/eval_results/Default/Llama_3_2_11B/task_results.json +0 -0
- static/eval_results/Default/Mammoth_VL/summary_results.json +251 -0
- static/eval_results/Default/Mammoth_VL/task_results.json +0 -0
- static/eval_results/Default/MiniCPM_v2.6/summary_results.json +251 -0
- static/eval_results/Default/MiniCPM_v2.6/task_results.json +0 -0
- static/eval_results/Default/NVLM/summary_results.json +251 -0
- static/eval_results/Default/NVLM/task_results.json +0 -0
- static/eval_results/Default/Phi-3.5-vision/summary_results.json +251 -0
- static/eval_results/Default/Phi-3.5-vision/task_results.json +0 -0
- static/eval_results/Default/Pixtral_12B/summary_results.json +251 -0
- static/eval_results/Default/Pixtral_12B/task_results.json +0 -0
- static/eval_results/Default/Qwen2_VL_2B/summary_results.json +251 -0
- static/eval_results/Default/Qwen2_VL_2B/task_results.json +0 -0
- static/eval_results/Default/Qwen2_VL_72B/summary_results.json +251 -0
- static/eval_results/Default/Qwen2_VL_72B/task_results.json +0 -0
- static/eval_results/Default/Qwen2_VL_7B/summary_results.json +251 -0
- static/eval_results/Default/Qwen2_VL_7B/task_results.json +0 -0
- static/eval_results/Default/all_model_keywords_stats.json +0 -0
- static/eval_results/Default/all_summary.json +0 -525
    	
        app.py
    CHANGED
    
    | @@ -55,7 +55,8 @@ with gr.Blocks() as block: | |
| 55 | 
             
                            )
         | 
| 56 |  | 
| 57 | 
             
                        # Define different captions for each table
         | 
| 58 | 
            -
                        default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\ | 
|  | |
| 59 | 
             
                        single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
         | 
| 60 |  | 
| 61 | 
             
                        caption_component = gr.Markdown(
         | 
|  | |
| 55 | 
             
                            )
         | 
| 56 |  | 
| 57 | 
             
                        # Define different captions for each table
         | 
| 58 | 
            +
                        default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> Different from the results in our paper, we only use the Core results with CoT prompting here for clarity and compatibility with the released data. <br> $\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ " 
         | 
| 59 | 
            +
             | 
| 60 | 
             
                        single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
         | 
| 61 |  | 
| 62 | 
             
                        caption_component = gr.Markdown(
         | 
    	
        constants.py
    CHANGED
    
    | @@ -28,7 +28,7 @@ We aim to provide cost-effective and accurate evaluation for multimodal models, | |
| 28 |  | 
| 29 | 
             
            ## 📊🔍 Results & Takeaways from Evaluating Top Models
         | 
| 30 |  | 
| 31 | 
            -
            - GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet ( | 
| 32 | 
             
            - Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models
         | 
| 33 | 
             
            - Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
         | 
| 34 | 
             
            - Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks
         | 
|  | |
| 28 |  | 
| 29 | 
             
            ## 📊🔍 Results & Takeaways from Evaluating Top Models
         | 
| 30 |  | 
| 31 | 
            +
            - GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) clearly improves over Claude 3.5 Sonnet (0620) on planning tasks (application dimension) and on UI/Infographics inputs (input format dimension).
         | 
| 32 | 
             
            - Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models
         | 
| 33 | 
             
            - Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
         | 
| 34 | 
             
            - Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks
         | 
    	
        static/eval_results/Default/Aquila_VL_2B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.159970161379836,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.15844711671722148
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.24567572098570653,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.2704213241616509
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.17100157004197775
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.1796551584774396
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.1263506560912463
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.1775085349123463
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.2114933522881099
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.16251700109869488
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.26453155444796583
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.3729498746867168
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.19090788408036002
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.16500679466160564
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.03972686819521137
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.07035116566014021
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.11915109312705179
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.18915652635850314
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.21939978337316163
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.17643260913333875
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.2438396314831894
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.08989401697906672
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.12241197113963243
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.10758402844431432
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.19372082302321905
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.19201243810115767
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.23278612647548963
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.21664527852608348
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.12138133030990172
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.01221681479628382
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.17994400163273605
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.21939978337316163
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.18212149746318507
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.21563163558700174
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.0981320856519089
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.0557399538308785
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.1351126472094214
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.2025034827431662
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.29326275059361956
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.22529225586731416
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.23810497886903373
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.17867138975396438
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Aquila_VL_2B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Aria/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.289073788209904,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.2859007507765791
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.5103725263180767,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.5349957007738607
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.31755778420402525
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.3153649050553317
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.34425736922415495
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.3921740378709932
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.37623282710622424
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.271674311347156
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.46313777834281344
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.5692180451127821
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.3152064038837139
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.23851147782276536
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.11246568298589892
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.28561724084490353
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.2505346698796475
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.3040414715952029
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.41865640360591405
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.3622713579911698
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.35872259826035346
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.1509096092007215
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.2846987779732631
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.2899384042262363
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.27412885527802433
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.3117275816801635
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.4523860109667709
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.310055869988487
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.18301681783824644
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.26651659725352617
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.34236220565522313
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.41865640360591405
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.19142683154129833
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.2596336265133595
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.3929243812973524
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.1403503245041943
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.25367910605102256
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.3494812758481046
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.3662927672998609
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.28616079233761366
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.3953949223279651
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.26097385403450996
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Aria/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Claude_3.5/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.5040975742801586,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.5002259116666758
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.6373907158949892,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.6569647463456579
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.5212541172602853
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.5405089647404562
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.6082834220752651
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.5745077617490254
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.5450038475783499
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.4767692987630454
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.5756126284078804
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.6969774436090224
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.5278843049497918
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.4082144793870471
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.23803578664609892
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.5691641481808987
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.4795267886975966
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.525848282456283
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.508735695828719
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.5699094130430454
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.5096772701625744
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.4429640420975014
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.5066797418318023
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.4971460788134188
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.5278127103234661
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.4490020843308984
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.5838224169821388
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.5456152399978661
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.46300075585789874
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.5414381873407914
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.5373019912310933
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.508735695828719
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.4422556748863689
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.49311554035078103
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.6663170946790707
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.3382015835012861
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.5194010220575684
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.532329797132399
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.5808831682303479
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.513474611293123
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.5507075880782885
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.47461998432626556
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Claude_3.5/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Claude_3.5_new/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.5259191914020757,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.5230785894131227
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.6563419761104125,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.6724419604471196
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.5427062825031487
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.5690045172520449
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.6220681231036606
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.6077980666415158
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.5511440615639541
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.4885536652013625
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.5908204006544897
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.6569473684210526
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.5486763511384175
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.4315385951907387
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.2909419331017877
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.6048192628845258
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.48924295292319175
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.556418710368288
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.4946691340754988
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.5558756390298104
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.5425198547046186
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.44210335381541843
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.5187252051932875
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.5071121107460066
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.5387340524651681
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.4824302644151348
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.6242798397166945
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.5782691045270721
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.4630277507828528
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.5914338446093256
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.5636254729390459
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.4946691340754988
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.4828123870640382
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.48756636014597515
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.6590137441693218
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.39901670035164916
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.5166853031535193
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.5561634744977417
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.6123769274172342
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.5512015158810595
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.565796566886933
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.4763267502912362
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Claude_3.5_new/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/GPT_4o/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.5265030595065238,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.5236338521693411
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.6478225794744895,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.665391229578676
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.5421184432647768
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.5630758211022604
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.6216411634729735
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.616018277142757
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.5823101249498799
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.44177544539510955
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.6345458069232931
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.6795263157894738
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.5514924675940659
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.39435038953269674
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.22934807257231926
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.608083455060831
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.491325251564869
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.4999089647103332
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.5315979872161023
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.5641404607063637
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.5613545677222056
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.47760591698367955
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.5388690453811203
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.48037685656449847
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.5994159671881645
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.44606605087301393
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.6274371950293718
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.5448877153826162
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.4751133786848073
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.5343350103400748
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.5672657028463585
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.5315979872161023
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.4500928191484624
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.4908653289106883
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.7056027785545881
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.33202130899313653
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.5032849161169843
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.5510350848991218
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.6095778863474799
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.5283797185155754
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.6135723164021851
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.44047720383044436
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/GPT_4o/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/GPT_4o_mini/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.40767494558789397,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.40431644154143376
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.586537827213665,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.6133276010318144
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.43069690064863675
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.4492982787524939
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.49026056071002017
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.5168957112681365
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.46731791428406805
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.3406008235342885
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.5572925295284307
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.6902380952380953
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.4189154010048976
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.2943206715105082
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.19422793560945503
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.47202628409684394
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.3624496929166193
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.38946844562183286
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.45508480503584553
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.47569921440672464
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.465175334092545
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.29410984789062117
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.41242028190533997
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.3906415365938764
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.44244772638735347
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.3629944944697668
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.5713834131825314
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.39874839531459466
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.3359977324263039
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.4305788513381019
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.46343334374251277
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.45508480503584553
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.24651576711552803
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.36981497185070983
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.5666618234843734
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.2420320329702607
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.3458483931206892
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.43590838051817093
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.5176671720617656
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.3554299482098288
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.5399167524341886
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.32918280841495845
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/GPT_4o_mini/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.4189319021967416,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.41567515414375245
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.5691365176285039,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.5987532244196045
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.4382651695295427
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.46355333176347063
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.4431807648811706
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.4975887290434539
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.49409642663278297
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.38033540105052427
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.5621166766717235
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.6570726817042606
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.4480877005302385
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.3338006749329557
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.16197013296986068
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.3971534837718938
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.3448204918940882
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.43525833484767545
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.4837362543956792
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.5111257660425502
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.49366013155105076
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.4001983820478609
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.386988040250785
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.3884226428206387
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.4425893080900246
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.42223626366392253
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.5390305634303021
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.472066557554629
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.3666950113378685
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.44571360028283974
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.45400479933257654
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.4837362543956792
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.35161402777057993
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.3839609821519984
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.4822341581959653
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.26434115361219657
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.3677547363031234
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.4640301382180305
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.5348199655361041
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.4890240042560499
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.5126038207415967
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.384818434165593
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Gemini_1.5_flash_002/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.4822473962867704,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.4764805563057179
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.5858190649927173,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.6104901117798793
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.4955784031499121
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.5202055934299538
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.5017043129027509
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.5532599716027446
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.546753787203128
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.425969084163906
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.5751012914154264
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.6982330827067671
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.513647745999633
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.3845337030093212
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.23899503258223884
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.4625032188638111
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.4292353723689881
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.4869625906903554
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.5028718355967439
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.5584779204331461
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.55005349042813
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.4292127751495457
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.44896309957892694
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.44418591808616864
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.5146447350354234
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.4688623462674191
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.5580414823700747
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.5538255562099124
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.39066515495086923
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.5370278962809547
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.5034399620483027
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.5028718355967439
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.4885398161821004
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.45544217378728585
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.5421439953094952
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.3335324339429373
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.43465181771633377
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.5250631828331306
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.5821004797173627
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.5124355410095621
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.5722329455291694
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.41210885517904977
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Gemini_1.5_pro_002/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Idefics3/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.08956972487602757,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.08982225274252693
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.3210866162255635,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.35649183147033553
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.11936892871309657
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.123378776179585
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.09602065544451607
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.1661543932339007
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.13018902877020821
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.11200133210641629
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.1837120314657304
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.2364085213032582
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.15239546294916975
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.08255834173646705
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.03149369112824262
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.06151607584357764
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.10124344675801887
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.14147248511867794
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.15942387460900312
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.17458268378399872
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.13442937440893113
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.02766884416043467
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.15513016850044997
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.03757596375966502
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.05386631116442094
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.0760949224506388
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.2987797010800956
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.10403841600436024
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.0661753590325019
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.09190674791720088
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.12345439179884048
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.15942387460900312
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.11382786944230487
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.10803808254834846
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.11450308988278819
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.04671278220005028
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.0978814644137225
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.13283830731528018
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.09697463995668018
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.1840497279921703
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.1605667124060194
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.09835465288235297
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Idefics3/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/InternVL2_2B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.13141974398938763,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.13063500716262516
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.23864417043743646,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.24901117798796224
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.14522090778963154
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.14491178903291552
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.12126906675624163
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.16912754929321935
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.18542274192083463
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.13923308734553164
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.23992252224543772
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.3420927318295739
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.14807577209152425
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.13036555933925006
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.01727799227799228
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.057021136657850864
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.10504085961245285
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.1625198552182714
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.18999779001767986
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.1487677475708977
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.2011727338536935
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.11886936592818943
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.1131404778887607
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.05739750616837997
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.15465451663650032
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.16044698450090833
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.21429521387724249
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.2128614316540013
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.03658352229780801
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.05757839721254354
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.15225683687839608
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.18999779001767986
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.17677460549936644
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.158165588340436
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.08722661966805
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.04102853815875594
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.11264043251709285
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.17001758160301803
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.3332891958712894
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.1686125516807394
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.21169137106199268
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.10975764217070672
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/InternVL2_2B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/InternVL2_5_2B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.17806821966478364,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.17708809739236367
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.2738430375585404,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.2905417024935512
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.19039567147289096
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.19614682488147464
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.18910947570579717
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.20543964378430513
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.23636598588530347
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.15691382827270517
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.28604169870255614
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.4248446115288219
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.18745928331343714
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.15097551654513372
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.030568378443583684
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.13898447520398388
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.13154711942685113
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.18343540213068474
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.20755556526976354
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.15983467048343838
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.26888883087046195
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.12906517409932386
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.14702422379343882
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.15324148486802894
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.19977956414542175
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.1665590610582109
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.2529339759528222
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.23420071687554841
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.09651832955404382
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.0784280378818194
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.21260786581183966
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.20755556526976354
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.138285387531761
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.20214332169825855
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.18128339685489062
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.053153113565753
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.12416116984428181
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.22449772657901465
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.3762336977650326
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.19222024833691936
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.25056132494721467
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.15596334442569906
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/InternVL2_5_2B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/InternVL2_5_78B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.44132952988532753,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.4397079059379812
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.5538024772749066,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.5776870163370592
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.4558062458859664
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.46893853078050696
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.5220829627238773
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.4933134095077618
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.477971701185214
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.3936387335462224
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.5610278744213835
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.6072907268170428
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.44533550848682696
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.3548055654857457
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.22852234519925363
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.4910486370158392
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.39410061025954557
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.43424133240430957
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.5300255483670417
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.4793195260560365
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.4622918421665308
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.3729954065847296
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.4226567593431527
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.4149806887502539
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.4904285184890861
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.4348674018783908
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.5124942746906233
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.4717682857925982
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.20496909081092754
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.4184724897299287
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.4951997132559491
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.5300255483670417
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.286105084660728
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.39635000103107665
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.5401547630322637
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.26403470419652064
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.3933356676003734
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.5168098196770042
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.47731479110938463
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.4388571290145052
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.5034762755043025
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.37742798395328586
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/InternVL2_5_78B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/InternVL2_76B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.3562710424410931,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.35129859801162616
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.5192997443033639,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.5421324161650903
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.3772549347599992
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.38193012983650343
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.41315219763443384
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.43665980552577693
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.4265623936500962
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.2975890791763991
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.5257990949897898
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.5779473684210527
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.33287081421166276
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.2949505390920417
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.17036496432397477
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.3634339625985008
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.31396468806559114
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.3473756113126343
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.395893002855977
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.44982107744035305
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.42875248733027654
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.2868239162778749
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.3630499545707523
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.3476691827105281
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.3943337471922549
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.29244088978470345
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.45822072478616577
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.3879326330400817
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.20309901738473166
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.34771123515123364
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.4145693044465943
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.395893002855977
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.24403942809507134
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.3153417935059416
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.4306947454508794
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.2132321995754061
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.2953329718984368
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.42202934355552685
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.47409276729986083
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.30014798153766264
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.4625649385962016
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.2868813944130515
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/InternVL2_76B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/InternVL2_8B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.25956581776451815,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.2546984460483302
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1165,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.3978571701460552,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.4108583690987125
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.2773656948037259
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.2817247716997634
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.280559214034858
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2511,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.32020728060179815
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2469,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.325593535916075
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.24118253695139918
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.39684007367798446
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.4700852130325815
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.27052668526005397
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2439,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.23189345356483618
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.08260405712900723
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.22800928556370195
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.2013779290163996
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.2804429603269583
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 700,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.34791358240562653
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.2942163420306113
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.3388056726588417
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.10933317885944857
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.250804626773504
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.2522493284864019
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.27414636444623874
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.22381302045502052
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1456,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.3537549824897016
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.30261189962428353
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.15434618291761149
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.19872104324302098
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.30088711082969344
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 700,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.34791358240562653
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.17725087609332119
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.2532272454839157
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.29129840423784176
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.12166926715781588
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.24700310231619527
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2315,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.3214666523378005
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.3995660275981844
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.24614711281861912
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.3393895915929317
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.22078333222564453
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/InternVL2_8B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Llama_3_2_11B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.15999641916771298,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.15809331016967038
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.3173342406187366,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.3487962166809973
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.1802478219287358
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.1907604552173455
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.14328677752263275
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.19646404502647707
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.22399113135844315
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.13303760019716085
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.323153603297999
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.4260501253132832
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.1770852858056774
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.15366454315378308
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.06563884729522687
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.11886347847341794
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.11489351406848371
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.1693681214060816
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.2123769209846321
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.2520175802062012
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.2485354956932213
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.06418655520777307
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.12417283740525839
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.16374180545556977
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.1576236804437753
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.15014439824913947
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.3003142292328822
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.19270157739425633
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.1463246409674981
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.0732004839476103
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.1960107191983825
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.2123769209846321
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.1351857051327849
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.18586695387250338
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.17288724679416761
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.08100042975820579
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.0575426944971537
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.19899465185565898
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.254316961351997
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.162801811963855
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.28055776664538923
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.13937853323074623
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Llama_3_2_11B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Mammoth_VL/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.264052880412689,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.2626894374387823
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.37992668750165337,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.40120378331900275
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.27896733083008046
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.30194776127683565
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.2365295791606494
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.2993927028494267
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.3366347826116991
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.2408454736444444
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.37895522991264047
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.48003508771929826
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.27232427744946475
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.24522937191710698
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.11457024299726488
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.18941525254390731
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.1718334741390191
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.28108187023954245
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.3391119999611432
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.36434285930327387
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.36915384448504296
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.15940750469262005
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.2456942956200745
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.21586513216389874
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.29359048024032264
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.2646677074112521
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.34733130661096645
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.3286125236284589
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.16358654572940287
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.25463059203015115
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.2919119209789575
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.3391119999611432
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.20016011839130254
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.2679179451692527
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.23600902063965679
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.15326915093278803
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.20668466311255687
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.33348955971237954
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.3759170425350556
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.23894961766260706
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.351703435685048
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.26074348700688493
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Mammoth_VL/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/MiniCPM_v2.6/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.22955895202146906,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.22560399396899078
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.41728623355613875,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.43452278589853827
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.2537218694467236
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.2604967101191775
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.2500331562865158
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.3003169369011028
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.31808748114668184
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.18281637763548025
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.40732197204308807
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.48798245614035085
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.23723675736151562
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.1968926733821904
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.08735883237069725
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.21195711598986072
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.18639148159043903
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.21578309681746147
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.3527537836840162
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.3096882575625531
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.3176880312524649
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.0755920550038197
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.23506388020592064
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.1781127776443048
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.2551275278138797
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.20833171754655547
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.36473950920880716
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.293386806641223
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.13955971277399848
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.23596215721092323
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.26319603880798287
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.3527537836840162
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.17888270664238365
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.22288558250834017
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.2666989364424082
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.11693267119342445
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.15342045420318667
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.29243044121840894
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.3777897246686755
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.25714862989687987
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.33187729423141027
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.16493399805627715
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/MiniCPM_v2.6/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/NVLM/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.21589726765847422,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.21406043849932396
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.3478114310231307,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.3947549441100602
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.23287631838857856
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.21591473223174515
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.27426258729618225
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.284874072963892
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.2134087963800149
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.2525993645909815
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.4029543142569604
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.4317142857142857
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.2442484196551863
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.1424318574406695
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.046798309600525674
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.19655048708297065
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.18621338396242557
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.2922667531642391
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.0
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.3447361496776569
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.29674507895195534
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.09716389574493003
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.19684666506287793
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.2199792859352912
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.25164831125437204
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.2396831363622878
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.3215948035793096
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.1853526865291571
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.0
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.0
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.3352056263801705
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.0
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.038244047619047615
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.2100484481849172
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.15704252277801936
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.06688589450465973
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.2292747206409446
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.2689383226748064
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.18857142857142856
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.23682040748983965
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.3656649917873737
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.26866914106442213
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/NVLM/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Phi-3.5-vision/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.22995297916629392,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.22708502951025372
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.3947914647737769,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.42459157351676696
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.2511698139474551
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.2550326045763433
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.24395249720074527
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.2858236369733704
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.29876274710122536
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.21972896566746963
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.37513466171380355
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.4713934837092732
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.25475240046465697
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.20386233377001492
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.06657701969095552
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.16556787388989183
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.17989790940001513
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.2671646581690049
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.24920333780186898
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.3057560384411286
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.3341992361416253
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.12884156381685322
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.20494682188374266
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.21180084406324556
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.2609992615064841
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.2149689274645855
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.365192668303297
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.2593652357274648
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.10107709750566891
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.11861055655587921
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.2824151476986241
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.24920333780186898
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.1980440594073205
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.2636292373854696
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.20747122167273002
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.08602953103518936
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.20136893467064246
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.30979039348232706
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.3495072422622861
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.25858403958844717
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.3357218088688187
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.21140555087788399
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Phi-3.5-vision/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Pixtral_12B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.31362045151669854,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.3100986209078182
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.4566234428542061,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.4870593293207223
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.33202677713439754
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.34184129499032456
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.37667712211439836
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.37896441862738645
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.37077191302051077
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.2843861774995234
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.4098150360139686
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.533077694235589
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.3372902862054838
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.25372282838901716
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.09524894246403817
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.2972619996610934
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.28304049684103855
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.33523333364720703
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.3988260865341648
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.39117521970978353
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.35583482417594536
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.21897822147396953
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.3436473210057542
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.28979044279399635
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.33530850344530555
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.30160980000905374
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.4166613092238044
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.30796171250186904
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.22871315192743763
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.21669652626580332
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.36087312117067055
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.3988260865341648
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.24616927284658197
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.2900329121369093
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.42652313209316933
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.1209559708312353
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.25678368121442124
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.37605128363484847
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.4576088857728113
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.3464929909487855
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.3858431845580602
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.2549787156825223
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Pixtral_12B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Qwen2_VL_2B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.20877163406364055,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.20561526268932287
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.3154302566225611,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.33856405846947557
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.22249997162072932
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.22236161923122505
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.23701014663017753
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.25669221785292334
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.26526414975225454
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.17623548305581763
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.31250702198481506
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.4140676691729323
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.20802820480076603
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.17320633068307653
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.06209506566980099
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.190837839372028
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.16287824421269087
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.19640906475019812
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.2520741776922928
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.24883076673424442
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.2877316297453947
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.13398525561847363
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.1624451002757208
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.20960092816529263
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.19986806708136184
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.2201024015934558
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.30248748033122763
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.256631742010999
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.07681405895691609
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.10526691703628158
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.25018977062352593
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.2520741776922928
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.17435940889565366
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.21286783416184518
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.2521972668785968
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.06967138760493456
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.16996250112948405
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.27603334911345223
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.31002436092347696
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.21061929716065056
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.2656728023444808
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.16356158787929762
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Qwen2_VL_2B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Qwen2_VL_72B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.4542376574527161,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.4501201906164793
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1163,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.5639771804231668,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.5835339638865004
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.4683625465479226
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.48669152179713876
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.5291932917937967
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2509,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.53654503409075
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2467,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.4931554892760308
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.3908023665629473
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.5668846347262286
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.6121127819548872
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.4493794346300551
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2437,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.33622171962424363
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.21642754068858566
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.5263730250833892
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.42759570727857965
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.4228561177227288
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 698,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.4780253686541936
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.5070774860945021
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.4807292191169126
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.38847545874852984
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.4359156358804688
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.43781407268698613
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.49080138099759946
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.42481004254128113
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1454,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.5132810622684265
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.5062248706593999
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.3063303099017385
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.523959576707116
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.4879791577413812
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 698,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.4780253686541936
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.34846161336322395
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.44101149919132854
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.5663587858366833
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.3067825586087303
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.4121566368482877
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2313,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.5176521211872086
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.5030444649397028
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.45616267568458396
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.5047683071464567
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.3553838743540432
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Qwen2_VL_72B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/Qwen2_VL_7B/summary_results.json
    ADDED
    
    | @@ -0,0 +1,251 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "model_summary": {
         | 
| 3 | 
            +
                    "core": {
         | 
| 4 | 
            +
                        "num_eval_tasks": 440,
         | 
| 5 | 
            +
                        "num_eval_samples": 6539,
         | 
| 6 | 
            +
                        "macro_mean_score": 0.3293449599230247,
         | 
| 7 | 
            +
                        "micro_mean_score": 0.325331493515679
         | 
| 8 | 
            +
                    },
         | 
| 9 | 
            +
                    "open": {
         | 
| 10 | 
            +
                        "num_eval_tasks": 65,
         | 
| 11 | 
            +
                        "num_eval_samples": 1170,
         | 
| 12 | 
            +
                        "macro_mean_score": 0.43955105763038577,
         | 
| 13 | 
            +
                        "micro_mean_score": 0.45508547008546996
         | 
| 14 | 
            +
                    },
         | 
| 15 | 
            +
                    "overall_score": 0.34352990319228904
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                "keyword_stats": {
         | 
| 18 | 
            +
                    "skills": {
         | 
| 19 | 
            +
                        "Object Recognition and Classification": {
         | 
| 20 | 
            +
                            "count": 303,
         | 
| 21 | 
            +
                            "num_samples": 4755,
         | 
| 22 | 
            +
                            "tasks": [],
         | 
| 23 | 
            +
                            "average_score": 0.3506773570484231
         | 
| 24 | 
            +
                        },
         | 
| 25 | 
            +
                        "Text Recognition (OCR)": {
         | 
| 26 | 
            +
                            "count": 137,
         | 
| 27 | 
            +
                            "num_samples": 2239,
         | 
| 28 | 
            +
                            "tasks": [],
         | 
| 29 | 
            +
                            "average_score": 0.38363163370919123
         | 
| 30 | 
            +
                        },
         | 
| 31 | 
            +
                        "Language Understanding and Generation": {
         | 
| 32 | 
            +
                            "count": 154,
         | 
| 33 | 
            +
                            "num_samples": 2511,
         | 
| 34 | 
            +
                            "tasks": [],
         | 
| 35 | 
            +
                            "average_score": 0.3882785389756705
         | 
| 36 | 
            +
                        },
         | 
| 37 | 
            +
                        "Scene and Event Understanding": {
         | 
| 38 | 
            +
                            "count": 154,
         | 
| 39 | 
            +
                            "num_samples": 2469,
         | 
| 40 | 
            +
                            "tasks": [],
         | 
| 41 | 
            +
                            "average_score": 0.38292659892379843
         | 
| 42 | 
            +
                        },
         | 
| 43 | 
            +
                        "Mathematical and Logical Reasoning": {
         | 
| 44 | 
            +
                            "count": 109,
         | 
| 45 | 
            +
                            "num_samples": 1910,
         | 
| 46 | 
            +
                            "tasks": [],
         | 
| 47 | 
            +
                            "average_score": 0.2730765188348748
         | 
| 48 | 
            +
                        },
         | 
| 49 | 
            +
                        "Commonsense and Social Reasoning": {
         | 
| 50 | 
            +
                            "count": 51,
         | 
| 51 | 
            +
                            "num_samples": 855,
         | 
| 52 | 
            +
                            "tasks": [],
         | 
| 53 | 
            +
                            "average_score": 0.4625711182912848
         | 
| 54 | 
            +
                        },
         | 
| 55 | 
            +
                        "Ethical and Safety Reasoning": {
         | 
| 56 | 
            +
                            "count": 15,
         | 
| 57 | 
            +
                            "num_samples": 245,
         | 
| 58 | 
            +
                            "tasks": [],
         | 
| 59 | 
            +
                            "average_score": 0.5287318295739348
         | 
| 60 | 
            +
                        },
         | 
| 61 | 
            +
                        "Domain-Specific Knowledge and Skills": {
         | 
| 62 | 
            +
                            "count": 77,
         | 
| 63 | 
            +
                            "num_samples": 1386,
         | 
| 64 | 
            +
                            "tasks": [],
         | 
| 65 | 
            +
                            "average_score": 0.32297080808954215
         | 
| 66 | 
            +
                        },
         | 
| 67 | 
            +
                        "Spatial and Temporal Reasoning": {
         | 
| 68 | 
            +
                            "count": 152,
         | 
| 69 | 
            +
                            "num_samples": 2439,
         | 
| 70 | 
            +
                            "tasks": [],
         | 
| 71 | 
            +
                            "average_score": 0.2561357336105554
         | 
| 72 | 
            +
                        },
         | 
| 73 | 
            +
                        "Planning and Decision Making": {
         | 
| 74 | 
            +
                            "count": 37,
         | 
| 75 | 
            +
                            "num_samples": 577,
         | 
| 76 | 
            +
                            "tasks": [],
         | 
| 77 | 
            +
                            "average_score": 0.12651411144309255
         | 
| 78 | 
            +
                        }
         | 
| 79 | 
            +
                    },
         | 
| 80 | 
            +
                    "input_format": {
         | 
| 81 | 
            +
                        "User Interface Screenshots": {
         | 
| 82 | 
            +
                            "count": 93,
         | 
| 83 | 
            +
                            "num_samples": 1517,
         | 
| 84 | 
            +
                            "tasks": [],
         | 
| 85 | 
            +
                            "average_score": 0.35229497847636093
         | 
| 86 | 
            +
                        },
         | 
| 87 | 
            +
                        "Text-Based Images and Documents": {
         | 
| 88 | 
            +
                            "count": 82,
         | 
| 89 | 
            +
                            "num_samples": 1294,
         | 
| 90 | 
            +
                            "tasks": [],
         | 
| 91 | 
            +
                            "average_score": 0.2881996369284258
         | 
| 92 | 
            +
                        },
         | 
| 93 | 
            +
                        "Diagrams and Data Visualizations": {
         | 
| 94 | 
            +
                            "count": 101,
         | 
| 95 | 
            +
                            "num_samples": 1718,
         | 
| 96 | 
            +
                            "tasks": [],
         | 
| 97 | 
            +
                            "average_score": 0.3162917354476226
         | 
| 98 | 
            +
                        },
         | 
| 99 | 
            +
                        "Videos": {
         | 
| 100 | 
            +
                            "count": 43,
         | 
| 101 | 
            +
                            "num_samples": 700,
         | 
| 102 | 
            +
                            "tasks": [],
         | 
| 103 | 
            +
                            "average_score": 0.3555910609857979
         | 
| 104 | 
            +
                        },
         | 
| 105 | 
            +
                        "Artistic and Creative Content": {
         | 
| 106 | 
            +
                            "count": 32,
         | 
| 107 | 
            +
                            "num_samples": 541,
         | 
| 108 | 
            +
                            "tasks": [],
         | 
| 109 | 
            +
                            "average_score": 0.3513518594470202
         | 
| 110 | 
            +
                        },
         | 
| 111 | 
            +
                        "Photographs": {
         | 
| 112 | 
            +
                            "count": 143,
         | 
| 113 | 
            +
                            "num_samples": 2248,
         | 
| 114 | 
            +
                            "tasks": [],
         | 
| 115 | 
            +
                            "average_score": 0.39509504888372243
         | 
| 116 | 
            +
                        },
         | 
| 117 | 
            +
                        "3D Models and Aerial Imagery": {
         | 
| 118 | 
            +
                            "count": 11,
         | 
| 119 | 
            +
                            "num_samples": 169,
         | 
| 120 | 
            +
                            "tasks": [],
         | 
| 121 | 
            +
                            "average_score": 0.19173322639974366
         | 
| 122 | 
            +
                        }
         | 
| 123 | 
            +
                    },
         | 
| 124 | 
            +
                    "output_format": {
         | 
| 125 | 
            +
                        "contextual_formatted_text": {
         | 
| 126 | 
            +
                            "count": 98,
         | 
| 127 | 
            +
                            "num_samples": 1514,
         | 
| 128 | 
            +
                            "tasks": [],
         | 
| 129 | 
            +
                            "average_score": 0.3118818521697947
         | 
| 130 | 
            +
                        },
         | 
| 131 | 
            +
                        "structured_output": {
         | 
| 132 | 
            +
                            "count": 110,
         | 
| 133 | 
            +
                            "num_samples": 1714,
         | 
| 134 | 
            +
                            "tasks": [],
         | 
| 135 | 
            +
                            "average_score": 0.3323478338046426
         | 
| 136 | 
            +
                        },
         | 
| 137 | 
            +
                        "exact_text": {
         | 
| 138 | 
            +
                            "count": 83,
         | 
| 139 | 
            +
                            "num_samples": 1278,
         | 
| 140 | 
            +
                            "tasks": [],
         | 
| 141 | 
            +
                            "average_score": 0.31975345327634014
         | 
| 142 | 
            +
                        },
         | 
| 143 | 
            +
                        "numerical_data": {
         | 
| 144 | 
            +
                            "count": 49,
         | 
| 145 | 
            +
                            "num_samples": 862,
         | 
| 146 | 
            +
                            "tasks": [],
         | 
| 147 | 
            +
                            "average_score": 0.3207400992620562
         | 
| 148 | 
            +
                        },
         | 
| 149 | 
            +
                        "open_ended_output": {
         | 
| 150 | 
            +
                            "count": 80,
         | 
| 151 | 
            +
                            "num_samples": 1456,
         | 
| 152 | 
            +
                            "tasks": [],
         | 
| 153 | 
            +
                            "average_score": 0.39680785337230745
         | 
| 154 | 
            +
                        },
         | 
| 155 | 
            +
                        "multiple_choice": {
         | 
| 156 | 
            +
                            "count": 85,
         | 
| 157 | 
            +
                            "num_samples": 1363,
         | 
| 158 | 
            +
                            "tasks": [],
         | 
| 159 | 
            +
                            "average_score": 0.38069986029874947
         | 
| 160 | 
            +
                        }
         | 
| 161 | 
            +
                    },
         | 
| 162 | 
            +
                    "input_num": {
         | 
| 163 | 
            +
                        "6-8 images": {
         | 
| 164 | 
            +
                            "count": 21,
         | 
| 165 | 
            +
                            "num_samples": 314,
         | 
| 166 | 
            +
                            "tasks": [],
         | 
| 167 | 
            +
                            "average_score": 0.21448412698412703
         | 
| 168 | 
            +
                        },
         | 
| 169 | 
            +
                        "9-image or more": {
         | 
| 170 | 
            +
                            "count": 41,
         | 
| 171 | 
            +
                            "num_samples": 623,
         | 
| 172 | 
            +
                            "tasks": [],
         | 
| 173 | 
            +
                            "average_score": 0.34991843422677277
         | 
| 174 | 
            +
                        },
         | 
| 175 | 
            +
                        "1-image": {
         | 
| 176 | 
            +
                            "count": 315,
         | 
| 177 | 
            +
                            "num_samples": 5228,
         | 
| 178 | 
            +
                            "tasks": [],
         | 
| 179 | 
            +
                            "average_score": 0.36487656334089386
         | 
| 180 | 
            +
                        },
         | 
| 181 | 
            +
                        "video": {
         | 
| 182 | 
            +
                            "count": 43,
         | 
| 183 | 
            +
                            "num_samples": 700,
         | 
| 184 | 
            +
                            "tasks": [],
         | 
| 185 | 
            +
                            "average_score": 0.3555910609857979
         | 
| 186 | 
            +
                        },
         | 
| 187 | 
            +
                        "4-5 images": {
         | 
| 188 | 
            +
                            "count": 34,
         | 
| 189 | 
            +
                            "num_samples": 520,
         | 
| 190 | 
            +
                            "tasks": [],
         | 
| 191 | 
            +
                            "average_score": 0.23950364354876252
         | 
| 192 | 
            +
                        },
         | 
| 193 | 
            +
                        "2-3 images": {
         | 
| 194 | 
            +
                            "count": 51,
         | 
| 195 | 
            +
                            "num_samples": 802,
         | 
| 196 | 
            +
                            "tasks": [],
         | 
| 197 | 
            +
                            "average_score": 0.31886513111201115
         | 
| 198 | 
            +
                        }
         | 
| 199 | 
            +
                    },
         | 
| 200 | 
            +
                    "app": {
         | 
| 201 | 
            +
                        "Information_Extraction": {
         | 
| 202 | 
            +
                            "count": 72,
         | 
| 203 | 
            +
                            "num_samples": 1124,
         | 
| 204 | 
            +
                            "tasks": [],
         | 
| 205 | 
            +
                            "average_score": 0.3972495309304478
         | 
| 206 | 
            +
                        },
         | 
| 207 | 
            +
                        "Planning": {
         | 
| 208 | 
            +
                            "count": 78,
         | 
| 209 | 
            +
                            "num_samples": 1239,
         | 
| 210 | 
            +
                            "tasks": [],
         | 
| 211 | 
            +
                            "average_score": 0.18098305857595157
         | 
| 212 | 
            +
                        },
         | 
| 213 | 
            +
                        "Coding": {
         | 
| 214 | 
            +
                            "count": 31,
         | 
| 215 | 
            +
                            "num_samples": 474,
         | 
| 216 | 
            +
                            "tasks": [],
         | 
| 217 | 
            +
                            "average_score": 0.30887234822244314
         | 
| 218 | 
            +
                        },
         | 
| 219 | 
            +
                        "Perception": {
         | 
| 220 | 
            +
                            "count": 145,
         | 
| 221 | 
            +
                            "num_samples": 2315,
         | 
| 222 | 
            +
                            "tasks": [],
         | 
| 223 | 
            +
                            "average_score": 0.39256038521661607
         | 
| 224 | 
            +
                        },
         | 
| 225 | 
            +
                        "Metrics": {
         | 
| 226 | 
            +
                            "count": 20,
         | 
| 227 | 
            +
                            "num_samples": 309,
         | 
| 228 | 
            +
                            "tasks": [],
         | 
| 229 | 
            +
                            "average_score": 0.44924313486983725
         | 
| 230 | 
            +
                        },
         | 
| 231 | 
            +
                        "Science": {
         | 
| 232 | 
            +
                            "count": 29,
         | 
| 233 | 
            +
                            "num_samples": 574,
         | 
| 234 | 
            +
                            "tasks": [],
         | 
| 235 | 
            +
                            "average_score": 0.2880278656037017
         | 
| 236 | 
            +
                        },
         | 
| 237 | 
            +
                        "Knowledge": {
         | 
| 238 | 
            +
                            "count": 97,
         | 
| 239 | 
            +
                            "num_samples": 1605,
         | 
| 240 | 
            +
                            "tasks": [],
         | 
| 241 | 
            +
                            "average_score": 0.4015531477048036
         | 
| 242 | 
            +
                        },
         | 
| 243 | 
            +
                        "Mathematics": {
         | 
| 244 | 
            +
                            "count": 33,
         | 
| 245 | 
            +
                            "num_samples": 547,
         | 
| 246 | 
            +
                            "tasks": [],
         | 
| 247 | 
            +
                            "average_score": 0.24179792538224956
         | 
| 248 | 
            +
                        }
         | 
| 249 | 
            +
                    }
         | 
| 250 | 
            +
                }
         | 
| 251 | 
            +
            }
         | 
    	
        static/eval_results/Default/Qwen2_VL_7B/task_results.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/all_model_keywords_stats.json
    DELETED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        static/eval_results/Default/all_summary.json
    DELETED
    
    | @@ -1,525 +0,0 @@ | |
| 1 | 
            -
            {
         | 
| 2 | 
            -
                "GPT_4o": {
         | 
| 3 | 
            -
                    "core_noncot": {
         | 
| 4 | 
            -
                        "num_eval_tasks": 440,
         | 
| 5 | 
            -
                        "num_eval_samples": 6539,
         | 
| 6 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 7 | 
            -
                        "macro_mean_score": 0.5203440930873326,
         | 
| 8 | 
            -
                        "micro_mean_score": 0.514302640282204
         | 
| 9 | 
            -
                    },
         | 
| 10 | 
            -
                    "core_cot": {
         | 
| 11 | 
            -
                        "num_eval_tasks": 440,
         | 
| 12 | 
            -
                        "num_eval_samples": 6539,
         | 
| 13 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 14 | 
            -
                        "macro_mean_score": 0.5265030595065238,
         | 
| 15 | 
            -
                        "micro_mean_score": 0.5236338521693411
         | 
| 16 | 
            -
                    },
         | 
| 17 | 
            -
                    "open": {
         | 
| 18 | 
            -
                        "num_eval_tasks": 65,
         | 
| 19 | 
            -
                        "num_eval_samples": 1163,
         | 
| 20 | 
            -
                        "macro_mean_score": 0.6478225794744895,
         | 
| 21 | 
            -
                        "micro_mean_score": 0.665391229578676
         | 
| 22 | 
            -
                    },
         | 
| 23 | 
            -
                    "overall_score": 0.5421184432647768
         | 
| 24 | 
            -
                },
         | 
| 25 | 
            -
                "Gemini_1.5_pro_002": {
         | 
| 26 | 
            -
                    "core_noncot": {
         | 
| 27 | 
            -
                        "num_eval_tasks": 440,
         | 
| 28 | 
            -
                        "num_eval_samples": 6539,
         | 
| 29 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 30 | 
            -
                        "macro_mean_score": 0.4699992918320008,
         | 
| 31 | 
            -
                        "micro_mean_score": 0.4651116133689296
         | 
| 32 | 
            -
                    },
         | 
| 33 | 
            -
                    "core_cot": {
         | 
| 34 | 
            -
                        "num_eval_tasks": 440,
         | 
| 35 | 
            -
                        "num_eval_samples": 6539,
         | 
| 36 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 37 | 
            -
                        "macro_mean_score": 0.4822473962867704,
         | 
| 38 | 
            -
                        "micro_mean_score": 0.4764805563057179
         | 
| 39 | 
            -
                    },
         | 
| 40 | 
            -
                    "open": {
         | 
| 41 | 
            -
                        "num_eval_tasks": 65,
         | 
| 42 | 
            -
                        "num_eval_samples": 1163,
         | 
| 43 | 
            -
                        "macro_mean_score": 0.5858190649927173,
         | 
| 44 | 
            -
                        "micro_mean_score": 0.6104901117798793
         | 
| 45 | 
            -
                    },
         | 
| 46 | 
            -
                    "overall_score": 0.4955784031499121
         | 
| 47 | 
            -
                },
         | 
| 48 | 
            -
                "Gemini_1.5_flash_002": {
         | 
| 49 | 
            -
                    "core_noncot": {
         | 
| 50 | 
            -
                        "num_eval_tasks": 440,
         | 
| 51 | 
            -
                        "num_eval_samples": 6539,
         | 
| 52 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 53 | 
            -
                        "macro_mean_score": 0.41898948981774853,
         | 
| 54 | 
            -
                        "micro_mean_score": 0.4127376993779598
         | 
| 55 | 
            -
                    },
         | 
| 56 | 
            -
                    "core_cot": {
         | 
| 57 | 
            -
                        "num_eval_tasks": 440,
         | 
| 58 | 
            -
                        "num_eval_samples": 6539,
         | 
| 59 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 60 | 
            -
                        "macro_mean_score": 0.4189319021967416,
         | 
| 61 | 
            -
                        "micro_mean_score": 0.41567515414375245
         | 
| 62 | 
            -
                    },
         | 
| 63 | 
            -
                    "open": {
         | 
| 64 | 
            -
                        "num_eval_tasks": 65,
         | 
| 65 | 
            -
                        "num_eval_samples": 1163,
         | 
| 66 | 
            -
                        "macro_mean_score": 0.5691365176285039,
         | 
| 67 | 
            -
                        "micro_mean_score": 0.5987532244196045
         | 
| 68 | 
            -
                    },
         | 
| 69 | 
            -
                    "overall_score": 0.43831534488249924
         | 
| 70 | 
            -
                },
         | 
| 71 | 
            -
                "Claude_3.5": {
         | 
| 72 | 
            -
                    "core_noncot": {
         | 
| 73 | 
            -
                        "num_eval_tasks": 440,
         | 
| 74 | 
            -
                        "num_eval_samples": 6539,
         | 
| 75 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 76 | 
            -
                        "macro_mean_score": 0.48800427486796155,
         | 
| 77 | 
            -
                        "micro_mean_score": 0.4814327812005499
         | 
| 78 | 
            -
                    },
         | 
| 79 | 
            -
                    "core_cot": {
         | 
| 80 | 
            -
                        "num_eval_tasks": 440,
         | 
| 81 | 
            -
                        "num_eval_samples": 6539,
         | 
| 82 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 83 | 
            -
                        "macro_mean_score": 0.5040975742801586,
         | 
| 84 | 
            -
                        "micro_mean_score": 0.5002259116666758
         | 
| 85 | 
            -
                    },
         | 
| 86 | 
            -
                    "open": {
         | 
| 87 | 
            -
                        "num_eval_tasks": 65,
         | 
| 88 | 
            -
                        "num_eval_samples": 1163,
         | 
| 89 | 
            -
                        "macro_mean_score": 0.6373907158949892,
         | 
| 90 | 
            -
                        "micro_mean_score": 0.6569647463456579
         | 
| 91 | 
            -
                    },
         | 
| 92 | 
            -
                    "overall_score": 0.5212541172602853
         | 
| 93 | 
            -
                },
         | 
| 94 | 
            -
                "Claude_3.5_new": {
         | 
| 95 | 
            -
                    "core_noncot": {
         | 
| 96 | 
            -
                        "num_eval_tasks": 440,
         | 
| 97 | 
            -
                        "num_eval_samples": 6539,
         | 
| 98 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 99 | 
            -
                        "macro_mean_score": 0.4919657684484185,
         | 
| 100 | 
            -
                        "micro_mean_score": 0.4874520567007144
         | 
| 101 | 
            -
                    },
         | 
| 102 | 
            -
                    "core_cot": {
         | 
| 103 | 
            -
                        "num_eval_tasks": 440,
         | 
| 104 | 
            -
                        "num_eval_samples": 6539,
         | 
| 105 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 106 | 
            -
                        "macro_mean_score": 0.5259191914020757,
         | 
| 107 | 
            -
                        "micro_mean_score": 0.5230785894131227
         | 
| 108 | 
            -
                    },
         | 
| 109 | 
            -
                    "open": {
         | 
| 110 | 
            -
                        "num_eval_tasks": 65,
         | 
| 111 | 
            -
                        "num_eval_samples": 1163,
         | 
| 112 | 
            -
                        "macro_mean_score": 0.6563419761104125,
         | 
| 113 | 
            -
                        "micro_mean_score": 0.6724419604471196
         | 
| 114 | 
            -
                    },
         | 
| 115 | 
            -
                    "overall_score": 0.5427062825031487
         | 
| 116 | 
            -
                },
         | 
| 117 | 
            -
                "GPT_4o_mini": {
         | 
| 118 | 
            -
                    "core_noncot": {
         | 
| 119 | 
            -
                        "num_eval_tasks": 440,
         | 
| 120 | 
            -
                        "num_eval_samples": 6539,
         | 
| 121 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 122 | 
            -
                        "macro_mean_score": 0.39854757130003565,
         | 
| 123 | 
            -
                        "micro_mean_score": 0.3936551517403452
         | 
| 124 | 
            -
                    },
         | 
| 125 | 
            -
                    "core_cot": {
         | 
| 126 | 
            -
                        "num_eval_tasks": 440,
         | 
| 127 | 
            -
                        "num_eval_samples": 6539,
         | 
| 128 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 129 | 
            -
                        "macro_mean_score": 0.40767494558789397,
         | 
| 130 | 
            -
                        "micro_mean_score": 0.40431644154143376
         | 
| 131 | 
            -
                    },
         | 
| 132 | 
            -
                    "open": {
         | 
| 133 | 
            -
                        "num_eval_tasks": 65,
         | 
| 134 | 
            -
                        "num_eval_samples": 1163,
         | 
| 135 | 
            -
                        "macro_mean_score": 0.586537827213665,
         | 
| 136 | 
            -
                        "micro_mean_score": 0.6133276010318144
         | 
| 137 | 
            -
                    },
         | 
| 138 | 
            -
                    "overall_score": 0.43069690064863675
         | 
| 139 | 
            -
                },
         | 
| 140 | 
            -
                "Qwen2_VL_72B": {
         | 
| 141 | 
            -
                    "core_noncot": {
         | 
| 142 | 
            -
                        "num_eval_tasks": 440,
         | 
| 143 | 
            -
                        "num_eval_samples": 6539,
         | 
| 144 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 145 | 
            -
                        "macro_mean_score": 0.46406654108789214,
         | 
| 146 | 
            -
                        "micro_mean_score": 0.4584702152011697
         | 
| 147 | 
            -
                    },
         | 
| 148 | 
            -
                    "core_cot": {
         | 
| 149 | 
            -
                        "num_eval_tasks": 440,
         | 
| 150 | 
            -
                        "num_eval_samples": 6539,
         | 
| 151 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 152 | 
            -
                        "macro_mean_score": 0.4542376574527161,
         | 
| 153 | 
            -
                        "micro_mean_score": 0.4501201906164793
         | 
| 154 | 
            -
                    },
         | 
| 155 | 
            -
                    "open": {
         | 
| 156 | 
            -
                        "num_eval_tasks": 65,
         | 
| 157 | 
            -
                        "num_eval_samples": 1163,
         | 
| 158 | 
            -
                        "macro_mean_score": 0.5639771804231668,
         | 
| 159 | 
            -
                        "micro_mean_score": 0.5835339638865004
         | 
| 160 | 
            -
                    },
         | 
| 161 | 
            -
                    "overall_score": 0.4769263263488681
         | 
| 162 | 
            -
                },
         | 
| 163 | 
            -
                "Qwen2_VL_7B": {
         | 
| 164 | 
            -
                    "core_noncot": {
         | 
| 165 | 
            -
                        "num_eval_tasks": 440,
         | 
| 166 | 
            -
                        "num_eval_samples": 6539,
         | 
| 167 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 168 | 
            -
                        "macro_mean_score": 0.3480020832611913,
         | 
| 169 | 
            -
                        "micro_mean_score": 0.3441858958345098
         | 
| 170 | 
            -
                    },
         | 
| 171 | 
            -
                    "core_cot": {
         | 
| 172 | 
            -
                        "num_eval_tasks": 440,
         | 
| 173 | 
            -
                        "num_eval_samples": 6539,
         | 
| 174 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 175 | 
            -
                        "macro_mean_score": 0.3293449599230247,
         | 
| 176 | 
            -
                        "micro_mean_score": 0.325331493515679
         | 
| 177 | 
            -
                    },
         | 
| 178 | 
            -
                    "open": {
         | 
| 179 | 
            -
                        "num_eval_tasks": 65,
         | 
| 180 | 
            -
                        "num_eval_samples": 1170,
         | 
| 181 | 
            -
                        "macro_mean_score": 0.43955105763038577,
         | 
| 182 | 
            -
                        "micro_mean_score": 0.45508547008546996
         | 
| 183 | 
            -
                    },
         | 
| 184 | 
            -
                    "overall_score": 0.3597856146156421
         | 
| 185 | 
            -
                },
         | 
| 186 | 
            -
                "llava_onevision_72B": {
         | 
| 187 | 
            -
                    "core_noncot": {
         | 
| 188 | 
            -
                        "num_eval_tasks": 440,
         | 
| 189 | 
            -
                        "num_eval_samples": 6539,
         | 
| 190 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 191 | 
            -
                        "macro_mean_score": 0.3199332158220174,
         | 
| 192 | 
            -
                        "micro_mean_score": 0.31770770553892647
         | 
| 193 | 
            -
                    },
         | 
| 194 | 
            -
                    "core_cot": {
         | 
| 195 | 
            -
                        "num_eval_tasks": 440,
         | 
| 196 | 
            -
                        "num_eval_samples": 6539,
         | 
| 197 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 198 | 
            -
                        "macro_mean_score": 0.2974368415462532,
         | 
| 199 | 
            -
                        "micro_mean_score": 0.2956217833156672
         | 
| 200 | 
            -
                    },
         | 
| 201 | 
            -
                    "open": {
         | 
| 202 | 
            -
                        "num_eval_tasks": 65,
         | 
| 203 | 
            -
                        "num_eval_samples": 1163,
         | 
| 204 | 
            -
                        "macro_mean_score": 0.4599484231632498,
         | 
| 205 | 
            -
                        "micro_mean_score": 0.4850386930352536
         | 
| 206 | 
            -
                    },
         | 
| 207 | 
            -
                    "overall_score": 0.33795497518277007
         | 
| 208 | 
            -
                },
         | 
| 209 | 
            -
                "llava_onevision_7B": {
         | 
| 210 | 
            -
                    "core_noncot": {
         | 
| 211 | 
            -
                        "num_eval_tasks": 440,
         | 
| 212 | 
            -
                        "num_eval_samples": 6539,
         | 
| 213 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 214 | 
            -
                        "macro_mean_score": 0.22409531510496777,
         | 
| 215 | 
            -
                        "micro_mean_score": 0.22238854298563537
         | 
| 216 | 
            -
                    },
         | 
| 217 | 
            -
                    "core_cot": {
         | 
| 218 | 
            -
                        "num_eval_tasks": 440,
         | 
| 219 | 
            -
                        "num_eval_samples": 6539,
         | 
| 220 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 221 | 
            -
                        "macro_mean_score": 0.21362697219149712,
         | 
| 222 | 
            -
                        "micro_mean_score": 0.21073910058505504
         | 
| 223 | 
            -
                    },
         | 
| 224 | 
            -
                    "open": {
         | 
| 225 | 
            -
                        "num_eval_tasks": 65,
         | 
| 226 | 
            -
                        "num_eval_samples": 1163,
         | 
| 227 | 
            -
                        "macro_mean_score": 0.33979975321921935,
         | 
| 228 | 
            -
                        "micro_mean_score": 0.36474634565778147
         | 
| 229 | 
            -
                    },
         | 
| 230 | 
            -
                    "overall_score": 0.23898796555531696
         | 
| 231 | 
            -
                },
         | 
| 232 | 
            -
                "InternVL2_76B": {
         | 
| 233 | 
            -
                    "core_noncot": {
         | 
| 234 | 
            -
                        "num_eval_tasks": 440,
         | 
| 235 | 
            -
                        "num_eval_samples": 6539,
         | 
| 236 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 237 | 
            -
                        "macro_mean_score": 0.3502244283768534,
         | 
| 238 | 
            -
                        "micro_mean_score": 0.3456783051732046
         | 
| 239 | 
            -
                    },
         | 
| 240 | 
            -
                    "core_cot": {
         | 
| 241 | 
            -
                        "num_eval_tasks": 440,
         | 
| 242 | 
            -
                        "num_eval_samples": 6539,
         | 
| 243 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 244 | 
            -
                        "macro_mean_score": 0.3562710424410931,
         | 
| 245 | 
            -
                        "micro_mean_score": 0.35129859801162616
         | 
| 246 | 
            -
                    },
         | 
| 247 | 
            -
                    "open": {
         | 
| 248 | 
            -
                        "num_eval_tasks": 65,
         | 
| 249 | 
            -
                        "num_eval_samples": 1163,
         | 
| 250 | 
            -
                        "macro_mean_score": 0.5192997443033639,
         | 
| 251 | 
            -
                        "micro_mean_score": 0.5421324161650903
         | 
| 252 | 
            -
                    },
         | 
| 253 | 
            -
                    "overall_score": 0.3772549347599992
         | 
| 254 | 
            -
                },
         | 
| 255 | 
            -
                "InternVL2_8B": {
         | 
| 256 | 
            -
                    "core_noncot": {
         | 
| 257 | 
            -
                        "num_eval_tasks": 440,
         | 
| 258 | 
            -
                        "num_eval_samples": 6539,
         | 
| 259 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 260 | 
            -
                        "macro_mean_score": 0.25956581776451815,
         | 
| 261 | 
            -
                        "micro_mean_score": 0.2546984460483302
         | 
| 262 | 
            -
                    },
         | 
| 263 | 
            -
                    "core_cot": {
         | 
| 264 | 
            -
                        "num_eval_tasks": 440,
         | 
| 265 | 
            -
                        "num_eval_samples": 6539,
         | 
| 266 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 267 | 
            -
                        "macro_mean_score": 0.24090301358258295,
         | 
| 268 | 
            -
                        "micro_mean_score": 0.23819084111520938
         | 
| 269 | 
            -
                    },
         | 
| 270 | 
            -
                    "open": {
         | 
| 271 | 
            -
                        "num_eval_tasks": 65,
         | 
| 272 | 
            -
                        "num_eval_samples": 1165,
         | 
| 273 | 
            -
                        "macro_mean_score": 0.3978571701460552,
         | 
| 274 | 
            -
                        "micro_mean_score": 0.4108583690987125
         | 
| 275 | 
            -
                    },
         | 
| 276 | 
            -
                    "overall_score": 0.2773656948037259
         | 
| 277 | 
            -
                },
         | 
| 278 | 
            -
                "MiniCPM_v2.6": {
         | 
| 279 | 
            -
                    "core_noncot": {
         | 
| 280 | 
            -
                        "num_eval_tasks": 440,
         | 
| 281 | 
            -
                        "num_eval_samples": 6539,
         | 
| 282 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 283 | 
            -
                        "macro_mean_score": 0.2287645706203155,
         | 
| 284 | 
            -
                        "micro_mean_score": 0.2249087742955901
         | 
| 285 | 
            -
                    },
         | 
| 286 | 
            -
                    "core_cot": {
         | 
| 287 | 
            -
                        "num_eval_tasks": 440,
         | 
| 288 | 
            -
                        "num_eval_samples": 6539,
         | 
| 289 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 290 | 
            -
                        "macro_mean_score": 0.22955895202146906,
         | 
| 291 | 
            -
                        "micro_mean_score": 0.22560399396899078
         | 
| 292 | 
            -
                    },
         | 
| 293 | 
            -
                    "open": {
         | 
| 294 | 
            -
                        "num_eval_tasks": 65,
         | 
| 295 | 
            -
                        "num_eval_samples": 1163,
         | 
| 296 | 
            -
                        "macro_mean_score": 0.41728623355613875,
         | 
| 297 | 
            -
                        "micro_mean_score": 0.43452278589853827
         | 
| 298 | 
            -
                    },
         | 
| 299 | 
            -
                    "overall_score": 0.2537218694467236
         | 
| 300 | 
            -
                },
         | 
| 301 | 
            -
                "Phi-3.5-vision": {
         | 
| 302 | 
            -
                    "core_noncot": {
         | 
| 303 | 
            -
                        "num_eval_tasks": 440,
         | 
| 304 | 
            -
                        "num_eval_samples": 6539,
         | 
| 305 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 306 | 
            -
                        "macro_mean_score": 0.23271251159409778,
         | 
| 307 | 
            -
                        "micro_mean_score": 0.2296262323791101
         | 
| 308 | 
            -
                    },
         | 
| 309 | 
            -
                    "core_cot": {
         | 
| 310 | 
            -
                        "num_eval_tasks": 440,
         | 
| 311 | 
            -
                        "num_eval_samples": 6539,
         | 
| 312 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 313 | 
            -
                        "macro_mean_score": 0.22995297916629392,
         | 
| 314 | 
            -
                        "micro_mean_score": 0.22708502951025372
         | 
| 315 | 
            -
                    },
         | 
| 316 | 
            -
                    "open": {
         | 
| 317 | 
            -
                        "num_eval_tasks": 65,
         | 
| 318 | 
            -
                        "num_eval_samples": 1163,
         | 
| 319 | 
            -
                        "macro_mean_score": 0.3947914647737769,
         | 
| 320 | 
            -
                        "micro_mean_score": 0.42459157351676696
         | 
| 321 | 
            -
                    },
         | 
| 322 | 
            -
                    "overall_score": 0.25357415903306635
         | 
| 323 | 
            -
                },
         | 
| 324 | 
            -
                "Pixtral_12B": {
         | 
| 325 | 
            -
                    "core_noncot": {
         | 
| 326 | 
            -
                        "num_eval_tasks": 440,
         | 
| 327 | 
            -
                        "num_eval_samples": 6539,
         | 
| 328 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 329 | 
            -
                        "macro_mean_score": 0.31905695620134694,
         | 
| 330 | 
            -
                        "micro_mean_score": 0.31556607913724777
         | 
| 331 | 
            -
                    },
         | 
| 332 | 
            -
                    "core_cot": {
         | 
| 333 | 
            -
                        "num_eval_tasks": 440,
         | 
| 334 | 
            -
                        "num_eval_samples": 6539,
         | 
| 335 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 336 | 
            -
                        "macro_mean_score": 0.31362045151669854,
         | 
| 337 | 
            -
                        "micro_mean_score": 0.3100986209078182
         | 
| 338 | 
            -
                    },
         | 
| 339 | 
            -
                    "open": {
         | 
| 340 | 
            -
                        "num_eval_tasks": 65,
         | 
| 341 | 
            -
                        "num_eval_samples": 1163,
         | 
| 342 | 
            -
                        "macro_mean_score": 0.4566234428542061,
         | 
| 343 | 
            -
                        "micro_mean_score": 0.4870593293207223
         | 
| 344 | 
            -
                    },
         | 
| 345 | 
            -
                    "overall_score": 0.33676353369131895
         | 
| 346 | 
            -
                },
         | 
| 347 | 
            -
                "Llama_3_2_11B": {
         | 
| 348 | 
            -
                    "core_noncot": {
         | 
| 349 | 
            -
                        "num_eval_tasks": 440,
         | 
| 350 | 
            -
                        "num_eval_samples": 6539,
         | 
| 351 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 352 | 
            -
                        "macro_mean_score": 0.10044261716549671,
         | 
| 353 | 
            -
                        "micro_mean_score": 0.09980638766828835
         | 
| 354 | 
            -
                    },
         | 
| 355 | 
            -
                    "core_cot": {
         | 
| 356 | 
            -
                        "num_eval_tasks": 440,
         | 
| 357 | 
            -
                        "num_eval_samples": 6539,
         | 
| 358 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 359 | 
            -
                        "macro_mean_score": 0.15999641916771298,
         | 
| 360 | 
            -
                        "micro_mean_score": 0.15809331016967038
         | 
| 361 | 
            -
                    },
         | 
| 362 | 
            -
                    "open": {
         | 
| 363 | 
            -
                        "num_eval_tasks": 65,
         | 
| 364 | 
            -
                        "num_eval_samples": 1163,
         | 
| 365 | 
            -
                        "macro_mean_score": 0.3173342406187366,
         | 
| 366 | 
            -
                        "micro_mean_score": 0.3487962166809973
         | 
| 367 | 
            -
                    },
         | 
| 368 | 
            -
                    "overall_score": 0.1802478219287358
         | 
| 369 | 
            -
                },
         | 
| 370 | 
            -
                "Idefics3": {
         | 
| 371 | 
            -
                    "core_noncot": {
         | 
| 372 | 
            -
                        "num_eval_tasks": 440,
         | 
| 373 | 
            -
                        "num_eval_samples": 6539,
         | 
| 374 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 375 | 
            -
                        "macro_mean_score": 0.11118980301103833,
         | 
| 376 | 
            -
                        "micro_mean_score": 0.11201785633274061
         | 
| 377 | 
            -
                    },
         | 
| 378 | 
            -
                    "core_cot": {
         | 
| 379 | 
            -
                        "num_eval_tasks": 440,
         | 
| 380 | 
            -
                        "num_eval_samples": 6539,
         | 
| 381 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 382 | 
            -
                        "macro_mean_score": 0.08956972487602757,
         | 
| 383 | 
            -
                        "micro_mean_score": 0.08982225274252693
         | 
| 384 | 
            -
                    },
         | 
| 385 | 
            -
                    "open": {
         | 
| 386 | 
            -
                        "num_eval_tasks": 65,
         | 
| 387 | 
            -
                        "num_eval_samples": 1163,
         | 
| 388 | 
            -
                        "macro_mean_score": 0.3210866162255635,
         | 
| 389 | 
            -
                        "micro_mean_score": 0.35649183147033553
         | 
| 390 | 
            -
                    },
         | 
| 391 | 
            -
                    "overall_score": 0.138206224513898
         | 
| 392 | 
            -
                },
         | 
| 393 | 
            -
                "Aria": {
         | 
| 394 | 
            -
                    "core_noncot": {
         | 
| 395 | 
            -
                        "num_eval_tasks": 440,
         | 
| 396 | 
            -
                        "num_eval_samples": 6539,
         | 
| 397 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 398 | 
            -
                        "macro_mean_score": 0.30485930718699694,
         | 
| 399 | 
            -
                        "micro_mean_score": 0.3016713629035311
         | 
| 400 | 
            -
                    },
         | 
| 401 | 
            -
                    "core_cot": {
         | 
| 402 | 
            -
                        "num_eval_tasks": 440,
         | 
| 403 | 
            -
                        "num_eval_samples": 6539,
         | 
| 404 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 405 | 
            -
                        "macro_mean_score": 0.289073788209904,
         | 
| 406 | 
            -
                        "micro_mean_score": 0.2859007507765791
         | 
| 407 | 
            -
                    },
         | 
| 408 | 
            -
                    "open": {
         | 
| 409 | 
            -
                        "num_eval_tasks": 65,
         | 
| 410 | 
            -
                        "num_eval_samples": 1163,
         | 
| 411 | 
            -
                        "macro_mean_score": 0.5103725263180767,
         | 
| 412 | 
            -
                        "micro_mean_score": 0.5349957007738607
         | 
| 413 | 
            -
                    },
         | 
| 414 | 
            -
                    "overall_score": 0.3313115037088191
         | 
| 415 | 
            -
                },
         | 
| 416 | 
            -
                "NVLM": {
         | 
| 417 | 
            -
                    "core_noncot": {
         | 
| 418 | 
            -
                        "num_eval_tasks": 440,
         | 
| 419 | 
            -
                        "num_eval_samples": 6539,
         | 
| 420 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 421 | 
            -
                        "macro_mean_score": 0.2420528895703979,
         | 
| 422 | 
            -
                        "micro_mean_score": 0.23838419989257642
         | 
| 423 | 
            -
                    },
         | 
| 424 | 
            -
                    "core_cot": {
         | 
| 425 | 
            -
                        "num_eval_tasks": 440,
         | 
| 426 | 
            -
                        "num_eval_samples": 6539,
         | 
| 427 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 428 | 
            -
                        "macro_mean_score": 0.21589726765847422,
         | 
| 429 | 
            -
                        "micro_mean_score": 0.21406043849932396
         | 
| 430 | 
            -
                    },
         | 
| 431 | 
            -
                    "open": {
         | 
| 432 | 
            -
                        "num_eval_tasks": 65,
         | 
| 433 | 
            -
                        "num_eval_samples": 1163,
         | 
| 434 | 
            -
                        "macro_mean_score": 0.3478114310231307,
         | 
| 435 | 
            -
                        "micro_mean_score": 0.3947549441100602
         | 
| 436 | 
            -
                    },
         | 
| 437 | 
            -
                    "overall_score": 0.25566537510391796
         | 
| 438 | 
            -
                },
         | 
| 439 | 
            -
                "InternVL2_2B": {
         | 
| 440 | 
            -
                    "core_noncot": {
         | 
| 441 | 
            -
                        "num_eval_tasks": 440,
         | 
| 442 | 
            -
                        "num_eval_samples": 6539,
         | 
| 443 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 444 | 
            -
                        "macro_mean_score": 0.09089701489596874,
         | 
| 445 | 
            -
                        "micro_mean_score": 0.09036328295381871
         | 
| 446 | 
            -
                    },
         | 
| 447 | 
            -
                    "core_cot": {
         | 
| 448 | 
            -
                        "num_eval_tasks": 440,
         | 
| 449 | 
            -
                        "num_eval_samples": 6539,
         | 
| 450 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 451 | 
            -
                        "macro_mean_score": 0.13141974398938763,
         | 
| 452 | 
            -
                        "micro_mean_score": 0.13063500716262516
         | 
| 453 | 
            -
                    },
         | 
| 454 | 
            -
                    "open": {
         | 
| 455 | 
            -
                        "num_eval_tasks": 65,
         | 
| 456 | 
            -
                        "num_eval_samples": 1163,
         | 
| 457 | 
            -
                        "macro_mean_score": 0.23864417043743646,
         | 
| 458 | 
            -
                        "micro_mean_score": 0.24901117798796224
         | 
| 459 | 
            -
                    },
         | 
| 460 | 
            -
                    "overall_score": 0.14522090778963154
         | 
| 461 | 
            -
                },
         | 
| 462 | 
            -
                "Qwen2_VL_2B": {
         | 
| 463 | 
            -
                    "core_noncot": {
         | 
| 464 | 
            -
                        "num_eval_tasks": 440,
         | 
| 465 | 
            -
                        "num_eval_samples": 6539,
         | 
| 466 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 467 | 
            -
                        "macro_mean_score": 0.16448220309703876,
         | 
| 468 | 
            -
                        "micro_mean_score": 0.1610710186451323
         | 
| 469 | 
            -
                    },
         | 
| 470 | 
            -
                    "core_cot": {
         | 
| 471 | 
            -
                        "num_eval_tasks": 440,
         | 
| 472 | 
            -
                        "num_eval_samples": 6539,
         | 
| 473 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 474 | 
            -
                        "macro_mean_score": 0.20877163406364055,
         | 
| 475 | 
            -
                        "micro_mean_score": 0.20561526268932287
         | 
| 476 | 
            -
                    },
         | 
| 477 | 
            -
                    "open": {
         | 
| 478 | 
            -
                        "num_eval_tasks": 65,
         | 
| 479 | 
            -
                        "num_eval_samples": 1163,
         | 
| 480 | 
            -
                        "macro_mean_score": 0.3154302566225611,
         | 
| 481 | 
            -
                        "micro_mean_score": 0.33856405846947557
         | 
| 482 | 
            -
                    },
         | 
| 483 | 
            -
                    "overall_score": 0.22249997162072932
         | 
| 484 | 
            -
                },
         | 
| 485 | 
            -
                "Aquila_VL_2B": {
         | 
| 486 | 
            -
                    "core_noncot": {
         | 
| 487 | 
            -
                        "num_eval_tasks": 440,
         | 
| 488 | 
            -
                        "num_eval_samples": 6539,
         | 
| 489 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 490 | 
            -
                        "macro_mean_score": 0.16317824309838627,
         | 
| 491 | 
            -
                        "micro_mean_score": 0.16198837245148487
         | 
| 492 | 
            -
                    },
         | 
| 493 | 
            -
                    "core_cot": {
         | 
| 494 | 
            -
                        "num_eval_tasks": 440,
         | 
| 495 | 
            -
                        "num_eval_samples": 6539,
         | 
| 496 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 497 | 
            -
                        "macro_mean_score": 0.159970161379836,
         | 
| 498 | 
            -
                        "micro_mean_score": 0.15844711671722148
         | 
| 499 | 
            -
                    },
         | 
| 500 | 
            -
                    "open": {
         | 
| 501 | 
            -
                        "num_eval_tasks": 65,
         | 
| 502 | 
            -
                        "num_eval_samples": 1163,
         | 
| 503 | 
            -
                        "macro_mean_score": 0.24567572098570653,
         | 
| 504 | 
            -
                        "micro_mean_score": 0.2704213241616509
         | 
| 505 | 
            -
                    },
         | 
| 506 | 
            -
                    "overall_score": 0.17379673035120966
         | 
| 507 | 
            -
                },
         | 
| 508 | 
            -
                "Mammoth_VL": {
         | 
| 509 | 
            -
                    "core_noncot": {
         | 
| 510 | 
            -
                        "num_eval_tasks": 440,
         | 
| 511 | 
            -
                        "num_eval_samples": 6539,
         | 
| 512 | 
            -
                        "num_not_eval_samples": 0,
         | 
| 513 | 
            -
                        "macro_mean_score": 0.264052880412689,
         | 
| 514 | 
            -
                        "micro_mean_score": 0.2626894374387823
         | 
| 515 | 
            -
                    },
         | 
| 516 | 
            -
                    "core_cot": null,
         | 
| 517 | 
            -
                    "open": {
         | 
| 518 | 
            -
                        "num_eval_tasks": 65,
         | 
| 519 | 
            -
                        "num_eval_samples": 1163,
         | 
| 520 | 
            -
                        "macro_mean_score": 0.37992668750165337,
         | 
| 521 | 
            -
                        "micro_mean_score": 0.40120378331900275
         | 
| 522 | 
            -
                    },
         | 
| 523 | 
            -
                    "overall_score": 0.27896733083008046
         | 
| 524 | 
            -
                }
         | 
| 525 | 
            -
            }
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
 
			
