Spaces:
Running
Running
David Pomerenke
commited on
Commit
·
c5278dd
1
Parent(s):
d1a7111
More models
Browse files- evals/models.py +9 -2
- frontend/public/results.json +125 -20
- results.json +668 -24
evals/models.py
CHANGED
|
@@ -15,12 +15,19 @@ models = [
|
|
| 15 |
"openai/gpt-4o-mini", # 0.6$/M tokens
|
| 16 |
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
|
| 17 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
|
|
|
|
|
|
| 18 |
"mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
|
|
|
|
| 19 |
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
|
|
|
|
|
|
| 20 |
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
| 21 |
-
|
|
|
|
| 22 |
# "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
|
| 23 |
-
"
|
|
|
|
| 24 |
]
|
| 25 |
model_fast = "meta-llama/llama-3.3-70b-instruct"
|
| 26 |
|
|
|
|
| 15 |
"openai/gpt-4o-mini", # 0.6$/M tokens
|
| 16 |
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
|
| 17 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
| 18 |
+
"meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
|
| 19 |
+
"meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
|
| 20 |
"mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
|
| 21 |
+
"mistralai/mistral-nemo",
|
| 22 |
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
| 23 |
+
"google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
|
| 24 |
+
"google/gemma-3-27b-it", # 0.2$/M tokens
|
| 25 |
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
| 26 |
+
"qwen/qwq-32b",
|
| 27 |
+
# "deepseek/deepseek-chat", # 1.3$/M tokens
|
| 28 |
# "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
|
| 29 |
+
"microsoft/phi-4-multimodal-instruct",
|
| 30 |
+
"amazon/nova-micro-v1", # 0.09$/M tokens
|
| 31 |
]
|
| 32 |
model_fast = "meta-llama/llama-3.3-70b-instruct"
|
| 33 |
|
frontend/public/results.json
CHANGED
|
@@ -18,6 +18,21 @@
|
|
| 18 |
{
|
| 19 |
"rank": 2,
|
| 20 |
"provider": "Google",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
"model": "Gemma 3 27b It",
|
| 22 |
"hf_id": "google/gemma-3-27b-it",
|
| 23 |
"creation_date": "2025-03-01",
|
|
@@ -31,7 +46,52 @@
|
|
| 31 |
"translation_chrf": 0.54
|
| 32 |
},
|
| 33 |
{
|
| 34 |
-
"rank":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"provider": "OpenAI",
|
| 36 |
"model": "GPT 4o Mini",
|
| 37 |
"hf_id": null,
|
|
@@ -46,7 +106,7 @@
|
|
| 46 |
"translation_chrf": 0.55
|
| 47 |
},
|
| 48 |
{
|
| 49 |
-
"rank":
|
| 50 |
"provider": "MistralAI",
|
| 51 |
"model": "Mistral Small 24b Instruct 2501",
|
| 52 |
"hf_id": "mistralai/Mistral-Small-24B-Instruct-2501",
|
|
@@ -61,7 +121,7 @@
|
|
| 61 |
"translation_chrf": 0.52
|
| 62 |
},
|
| 63 |
{
|
| 64 |
-
"rank":
|
| 65 |
"provider": "Meta Llama",
|
| 66 |
"model": "Llama 3.3 70b Instruct",
|
| 67 |
"hf_id": "meta-llama/Llama-3.3-70B-Instruct",
|
|
@@ -74,6 +134,51 @@
|
|
| 74 |
"language_modeling_chrf": 0.94,
|
| 75 |
"translation_bleu": 0.31,
|
| 76 |
"translation_chrf": 0.48
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
}
|
| 78 |
],
|
| 79 |
"language_table": [
|
|
@@ -81,25 +186,13 @@
|
|
| 81 |
"language_name": "English",
|
| 82 |
"speakers": 1636485840,
|
| 83 |
"family": "Indo-European",
|
| 84 |
-
"average": 0.
|
| 85 |
"in_benchmark": true,
|
| 86 |
"NaN": 0.0,
|
| 87 |
-
"classification_accuracy": 0.
|
| 88 |
-
"language_modeling_chrf": 0.
|
| 89 |
-
"translation_bleu": 0.
|
| 90 |
-
"translation_chrf": 0.
|
| 91 |
-
},
|
| 92 |
-
{
|
| 93 |
-
"language_name": "Chinese",
|
| 94 |
-
"speakers": 1304678914,
|
| 95 |
-
"family": "Sino-Tibetan",
|
| 96 |
-
"average": 0.5,
|
| 97 |
-
"in_benchmark": true,
|
| 98 |
-
"NaN": 0.0,
|
| 99 |
-
"classification_accuracy": 0.65,
|
| 100 |
-
"language_modeling_chrf": 0.93,
|
| 101 |
-
"translation_bleu": 0.38,
|
| 102 |
-
"translation_chrf": 0.55
|
| 103 |
},
|
| 104 |
{
|
| 105 |
"language_name": "French",
|
|
@@ -113,6 +206,18 @@
|
|
| 113 |
"translation_bleu": 0.32,
|
| 114 |
"translation_chrf": 0.49
|
| 115 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
{
|
| 117 |
"language_name": "Hindi",
|
| 118 |
"speakers": 546882144,
|
|
|
|
| 18 |
{
|
| 19 |
"rank": 2,
|
| 20 |
"provider": "Google",
|
| 21 |
+
"model": "Gemini 2.0 Flash Lite 001",
|
| 22 |
+
"hf_id": null,
|
| 23 |
+
"creation_date": null,
|
| 24 |
+
"size": null,
|
| 25 |
+
"type": "Commercial",
|
| 26 |
+
"license": null,
|
| 27 |
+
"average": 0.66,
|
| 28 |
+
"classification_accuracy": 0.73,
|
| 29 |
+
"language_modeling_chrf": 0.97,
|
| 30 |
+
"translation_bleu": 0.4,
|
| 31 |
+
"translation_chrf": 0.54
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"rank": 3,
|
| 35 |
+
"provider": "Google",
|
| 36 |
"model": "Gemma 3 27b It",
|
| 37 |
"hf_id": "google/gemma-3-27b-it",
|
| 38 |
"creation_date": "2025-03-01",
|
|
|
|
| 46 |
"translation_chrf": 0.54
|
| 47 |
},
|
| 48 |
{
|
| 49 |
+
"rank": 4,
|
| 50 |
+
"provider": "Meta Llama",
|
| 51 |
+
"model": "Llama 3.1 70b Instruct",
|
| 52 |
+
"hf_id": "meta-llama/Llama-3.1-70B-Instruct",
|
| 53 |
+
"creation_date": "2024-07-16",
|
| 54 |
+
"size": 70553706496.0,
|
| 55 |
+
"type": "Open",
|
| 56 |
+
"license": "Llama3.1",
|
| 57 |
+
"average": 0.62,
|
| 58 |
+
"classification_accuracy": 0.57,
|
| 59 |
+
"language_modeling_chrf": 0.92,
|
| 60 |
+
"translation_bleu": 0.43,
|
| 61 |
+
"translation_chrf": 0.57
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"rank": 5,
|
| 65 |
+
"provider": "Amazon",
|
| 66 |
+
"model": "Nova Micro V1",
|
| 67 |
+
"hf_id": null,
|
| 68 |
+
"creation_date": null,
|
| 69 |
+
"size": null,
|
| 70 |
+
"type": "Commercial",
|
| 71 |
+
"license": null,
|
| 72 |
+
"average": 0.61,
|
| 73 |
+
"classification_accuracy": 0.52,
|
| 74 |
+
"language_modeling_chrf": 0.94,
|
| 75 |
+
"translation_bleu": 0.4,
|
| 76 |
+
"translation_chrf": 0.56
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"rank": 6,
|
| 80 |
+
"provider": "Meta Llama",
|
| 81 |
+
"model": "Llama 3 70b Instruct",
|
| 82 |
+
"hf_id": null,
|
| 83 |
+
"creation_date": null,
|
| 84 |
+
"size": null,
|
| 85 |
+
"type": "Commercial",
|
| 86 |
+
"license": null,
|
| 87 |
+
"average": 0.61,
|
| 88 |
+
"classification_accuracy": 0.8,
|
| 89 |
+
"language_modeling_chrf": 0.95,
|
| 90 |
+
"translation_bleu": 0.25,
|
| 91 |
+
"translation_chrf": 0.43
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"rank": 7,
|
| 95 |
"provider": "OpenAI",
|
| 96 |
"model": "GPT 4o Mini",
|
| 97 |
"hf_id": null,
|
|
|
|
| 106 |
"translation_chrf": 0.55
|
| 107 |
},
|
| 108 |
{
|
| 109 |
+
"rank": 8,
|
| 110 |
"provider": "MistralAI",
|
| 111 |
"model": "Mistral Small 24b Instruct 2501",
|
| 112 |
"hf_id": "mistralai/Mistral-Small-24B-Instruct-2501",
|
|
|
|
| 121 |
"translation_chrf": 0.52
|
| 122 |
},
|
| 123 |
{
|
| 124 |
+
"rank": 9,
|
| 125 |
"provider": "Meta Llama",
|
| 126 |
"model": "Llama 3.3 70b Instruct",
|
| 127 |
"hf_id": "meta-llama/Llama-3.3-70B-Instruct",
|
|
|
|
| 134 |
"language_modeling_chrf": 0.94,
|
| 135 |
"translation_bleu": 0.31,
|
| 136 |
"translation_chrf": 0.48
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"rank": 10,
|
| 140 |
+
"provider": "MistralAI",
|
| 141 |
+
"model": "Mistral Nemo",
|
| 142 |
+
"hf_id": null,
|
| 143 |
+
"creation_date": null,
|
| 144 |
+
"size": null,
|
| 145 |
+
"type": "Commercial",
|
| 146 |
+
"license": null,
|
| 147 |
+
"average": 0.55,
|
| 148 |
+
"classification_accuracy": 0.5,
|
| 149 |
+
"language_modeling_chrf": 0.88,
|
| 150 |
+
"translation_bleu": 0.32,
|
| 151 |
+
"translation_chrf": 0.49
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"rank": 11,
|
| 155 |
+
"provider": "Microsoft",
|
| 156 |
+
"model": "Phi 4 Multimodal Instruct",
|
| 157 |
+
"hf_id": "microsoft/Phi-4-multimodal-instruct",
|
| 158 |
+
"creation_date": "2025-02-24",
|
| 159 |
+
"size": 5574460384.0,
|
| 160 |
+
"type": "Open",
|
| 161 |
+
"license": "Mit",
|
| 162 |
+
"average": 0.52,
|
| 163 |
+
"classification_accuracy": 0.42,
|
| 164 |
+
"language_modeling_chrf": 0.87,
|
| 165 |
+
"translation_bleu": 0.32,
|
| 166 |
+
"translation_chrf": 0.46
|
| 167 |
+
},
|
| 168 |
+
{
|
| 169 |
+
"rank": 12,
|
| 170 |
+
"provider": "Qwen",
|
| 171 |
+
"model": "Qwq 32b",
|
| 172 |
+
"hf_id": "Qwen/QwQ-32B",
|
| 173 |
+
"creation_date": "2025-03-05",
|
| 174 |
+
"size": 32763876352.0,
|
| 175 |
+
"type": "Open",
|
| 176 |
+
"license": "Apache 2.0",
|
| 177 |
+
"average": 0.25,
|
| 178 |
+
"classification_accuracy": 0.0,
|
| 179 |
+
"language_modeling_chrf": 0.48,
|
| 180 |
+
"translation_bleu": 0.21,
|
| 181 |
+
"translation_chrf": 0.3
|
| 182 |
}
|
| 183 |
],
|
| 184 |
"language_table": [
|
|
|
|
| 186 |
"language_name": "English",
|
| 187 |
"speakers": 1636485840,
|
| 188 |
"family": "Indo-European",
|
| 189 |
+
"average": 0.47,
|
| 190 |
"in_benchmark": true,
|
| 191 |
"NaN": 0.0,
|
| 192 |
+
"classification_accuracy": 0.58,
|
| 193 |
+
"language_modeling_chrf": 0.92,
|
| 194 |
+
"translation_bleu": 0.37,
|
| 195 |
+
"translation_chrf": 0.49
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
},
|
| 197 |
{
|
| 198 |
"language_name": "French",
|
|
|
|
| 206 |
"translation_bleu": 0.32,
|
| 207 |
"translation_chrf": 0.49
|
| 208 |
},
|
| 209 |
+
{
|
| 210 |
+
"language_name": "Chinese",
|
| 211 |
+
"speakers": 1304678914,
|
| 212 |
+
"family": "Sino-Tibetan",
|
| 213 |
+
"average": 0.46,
|
| 214 |
+
"in_benchmark": true,
|
| 215 |
+
"NaN": 0.0,
|
| 216 |
+
"classification_accuracy": 0.55,
|
| 217 |
+
"language_modeling_chrf": 0.86,
|
| 218 |
+
"translation_bleu": 0.35,
|
| 219 |
+
"translation_chrf": 0.53
|
| 220 |
+
},
|
| 221 |
{
|
| 222 |
"language_name": "Hindi",
|
| 223 |
"speakers": 546882144,
|
results.json
CHANGED
|
@@ -3,33 +3,61 @@
|
|
| 3 |
{
|
| 4 |
"task": "classification",
|
| 5 |
"metric": "accuracy",
|
| 6 |
-
"score": 0.
|
| 7 |
"bcp_47": 10,
|
| 8 |
-
"model":
|
| 9 |
},
|
| 10 |
{
|
| 11 |
"task": "language_modeling",
|
| 12 |
"metric": "chrf",
|
| 13 |
-
"score": 0.
|
| 14 |
"bcp_47": 10,
|
| 15 |
-
"model":
|
| 16 |
},
|
| 17 |
{
|
| 18 |
"task": "translation",
|
| 19 |
"metric": "bleu",
|
| 20 |
-
"score": 0.
|
| 21 |
"bcp_47": 10,
|
| 22 |
-
"model":
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"task": "translation",
|
| 26 |
"metric": "chrf",
|
| 27 |
-
"score": 0.
|
| 28 |
"bcp_47": 10,
|
| 29 |
-
"model":
|
| 30 |
}
|
| 31 |
],
|
| 32 |
"models": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
{
|
| 34 |
"model": "google/gemini-2.0-flash-001",
|
| 35 |
"task": "classification",
|
|
@@ -58,6 +86,34 @@
|
|
| 58 |
"score": 0.5828490054615683,
|
| 59 |
"bcp_47": 2
|
| 60 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
{
|
| 62 |
"model": "google/gemma-3-27b-it",
|
| 63 |
"task": "classification",
|
|
@@ -86,6 +142,62 @@
|
|
| 86 |
"score": 0.5376336154503363,
|
| 87 |
"bcp_47": 2
|
| 88 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
{
|
| 90 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
| 91 |
"task": "classification",
|
|
@@ -114,6 +226,62 @@
|
|
| 114 |
"score": 0.4836914110309717,
|
| 115 |
"bcp_47": 10
|
| 116 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
{
|
| 118 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
| 119 |
"task": "classification",
|
|
@@ -169,6 +337,34 @@
|
|
| 169 |
"metric": "chrf",
|
| 170 |
"score": 0.5452510379336759,
|
| 171 |
"bcp_47": 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
}
|
| 173 |
],
|
| 174 |
"languages": [
|
|
@@ -2554,8 +2750,8 @@
|
|
| 2554 |
"in_benchmark": true,
|
| 2555 |
"task": "classification",
|
| 2556 |
"metric": "accuracy",
|
| 2557 |
-
"score": 0.
|
| 2558 |
-
"model":
|
| 2559 |
},
|
| 2560 |
{
|
| 2561 |
"bcp_47": "en",
|
|
@@ -2569,8 +2765,8 @@
|
|
| 2569 |
"in_benchmark": true,
|
| 2570 |
"task": "language_modeling",
|
| 2571 |
"metric": "chrf",
|
| 2572 |
-
"score": 0.
|
| 2573 |
-
"model":
|
| 2574 |
},
|
| 2575 |
{
|
| 2576 |
"bcp_47": "en",
|
|
@@ -2584,8 +2780,8 @@
|
|
| 2584 |
"in_benchmark": true,
|
| 2585 |
"task": "translation",
|
| 2586 |
"metric": "bleu",
|
| 2587 |
-
"score": 0.
|
| 2588 |
-
"model":
|
| 2589 |
},
|
| 2590 |
{
|
| 2591 |
"bcp_47": "en",
|
|
@@ -2599,8 +2795,8 @@
|
|
| 2599 |
"in_benchmark": true,
|
| 2600 |
"task": "translation",
|
| 2601 |
"metric": "chrf",
|
| 2602 |
-
"score": 0.
|
| 2603 |
-
"model":
|
| 2604 |
},
|
| 2605 |
{
|
| 2606 |
"bcp_47": "eo",
|
|
@@ -10699,8 +10895,8 @@
|
|
| 10699 |
"in_benchmark": true,
|
| 10700 |
"task": "classification",
|
| 10701 |
"metric": "accuracy",
|
| 10702 |
-
"score": 0.
|
| 10703 |
-
"model":
|
| 10704 |
},
|
| 10705 |
{
|
| 10706 |
"bcp_47": "zh",
|
|
@@ -10714,8 +10910,8 @@
|
|
| 10714 |
"in_benchmark": true,
|
| 10715 |
"task": "language_modeling",
|
| 10716 |
"metric": "chrf",
|
| 10717 |
-
"score": 0.
|
| 10718 |
-
"model":
|
| 10719 |
},
|
| 10720 |
{
|
| 10721 |
"bcp_47": "zh",
|
|
@@ -10729,8 +10925,8 @@
|
|
| 10729 |
"in_benchmark": true,
|
| 10730 |
"task": "translation",
|
| 10731 |
"metric": "bleu",
|
| 10732 |
-
"score": 0.
|
| 10733 |
-
"model":
|
| 10734 |
},
|
| 10735 |
{
|
| 10736 |
"bcp_47": "zh",
|
|
@@ -10744,8 +10940,8 @@
|
|
| 10744 |
"in_benchmark": true,
|
| 10745 |
"task": "translation",
|
| 10746 |
"metric": "chrf",
|
| 10747 |
-
"score": 0.
|
| 10748 |
-
"model":
|
| 10749 |
},
|
| 10750 |
{
|
| 10751 |
"bcp_47": "zmi",
|
|
@@ -10794,6 +10990,70 @@
|
|
| 10794 |
}
|
| 10795 |
],
|
| 10796 |
"scores": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10797 |
{
|
| 10798 |
"model": "google/gemini-2.0-flash-001",
|
| 10799 |
"bcp_47": "en",
|
|
@@ -10858,6 +11118,70 @@
|
|
| 10858 |
"score": 0.5606266861920302,
|
| 10859 |
"sentence_nr": 14.5
|
| 10860 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10861 |
{
|
| 10862 |
"model": "google/gemma-3-27b-it",
|
| 10863 |
"bcp_47": "en",
|
|
@@ -10922,6 +11246,134 @@
|
|
| 10922 |
"score": 0.520771580386218,
|
| 10923 |
"sentence_nr": 14.5
|
| 10924 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10925 |
{
|
| 10926 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
| 10927 |
"bcp_47": "ar",
|
|
@@ -11242,6 +11694,134 @@
|
|
| 11242 |
"score": 0.5862284100611604,
|
| 11243 |
"sentence_nr": 14.5
|
| 11244 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11245 |
{
|
| 11246 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
| 11247 |
"bcp_47": "en",
|
|
@@ -11369,6 +11949,70 @@
|
|
| 11369 |
"metric": "chrf",
|
| 11370 |
"score": 0.559410465345808,
|
| 11371 |
"sentence_nr": 14.5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11372 |
}
|
| 11373 |
]
|
| 11374 |
}
|
|
|
|
| 3 |
{
|
| 4 |
"task": "classification",
|
| 5 |
"metric": "accuracy",
|
| 6 |
+
"score": 0.5427083333333333,
|
| 7 |
"bcp_47": 10,
|
| 8 |
+
"model": 12
|
| 9 |
},
|
| 10 |
{
|
| 11 |
"task": "language_modeling",
|
| 12 |
"metric": "chrf",
|
| 13 |
+
"score": 0.9024222998985517,
|
| 14 |
"bcp_47": 10,
|
| 15 |
+
"model": 12
|
| 16 |
},
|
| 17 |
{
|
| 18 |
"task": "translation",
|
| 19 |
"metric": "bleu",
|
| 20 |
+
"score": 0.34325329881872996,
|
| 21 |
"bcp_47": 10,
|
| 22 |
+
"model": 12
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"task": "translation",
|
| 26 |
"metric": "chrf",
|
| 27 |
+
"score": 0.49764810942023735,
|
| 28 |
"bcp_47": 10,
|
| 29 |
+
"model": 12
|
| 30 |
}
|
| 31 |
],
|
| 32 |
"models": [
|
| 33 |
+
{
|
| 34 |
+
"model": "amazon/nova-micro-v1",
|
| 35 |
+
"task": "classification",
|
| 36 |
+
"metric": "accuracy",
|
| 37 |
+
"score": 0.5166666666666666,
|
| 38 |
+
"bcp_47": 2
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"model": "amazon/nova-micro-v1",
|
| 42 |
+
"task": "language_modeling",
|
| 43 |
+
"metric": "chrf",
|
| 44 |
+
"score": 0.9446198732700857,
|
| 45 |
+
"bcp_47": 2
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"model": "amazon/nova-micro-v1",
|
| 49 |
+
"task": "translation",
|
| 50 |
+
"metric": "bleu",
|
| 51 |
+
"score": 0.40042093531509637,
|
| 52 |
+
"bcp_47": 2
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"model": "amazon/nova-micro-v1",
|
| 56 |
+
"task": "translation",
|
| 57 |
+
"metric": "chrf",
|
| 58 |
+
"score": 0.5642142196700637,
|
| 59 |
+
"bcp_47": 2
|
| 60 |
+
},
|
| 61 |
{
|
| 62 |
"model": "google/gemini-2.0-flash-001",
|
| 63 |
"task": "classification",
|
|
|
|
| 86 |
"score": 0.5828490054615683,
|
| 87 |
"bcp_47": 2
|
| 88 |
},
|
| 89 |
+
{
|
| 90 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 91 |
+
"task": "classification",
|
| 92 |
+
"metric": "accuracy",
|
| 93 |
+
"score": 0.7333333333333333,
|
| 94 |
+
"bcp_47": 2
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 98 |
+
"task": "language_modeling",
|
| 99 |
+
"metric": "chrf",
|
| 100 |
+
"score": 0.9710194350890375,
|
| 101 |
+
"bcp_47": 2
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 105 |
+
"task": "translation",
|
| 106 |
+
"metric": "bleu",
|
| 107 |
+
"score": 0.40085159165111883,
|
| 108 |
+
"bcp_47": 2
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 112 |
+
"task": "translation",
|
| 113 |
+
"metric": "chrf",
|
| 114 |
+
"score": 0.5422821788946908,
|
| 115 |
+
"bcp_47": 2
|
| 116 |
+
},
|
| 117 |
{
|
| 118 |
"model": "google/gemma-3-27b-it",
|
| 119 |
"task": "classification",
|
|
|
|
| 142 |
"score": 0.5376336154503363,
|
| 143 |
"bcp_47": 2
|
| 144 |
},
|
| 145 |
+
{
|
| 146 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 147 |
+
"task": "classification",
|
| 148 |
+
"metric": "accuracy",
|
| 149 |
+
"score": 0.8,
|
| 150 |
+
"bcp_47": 2
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 154 |
+
"task": "language_modeling",
|
| 155 |
+
"metric": "chrf",
|
| 156 |
+
"score": 0.9452435586756014,
|
| 157 |
+
"bcp_47": 2
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 161 |
+
"task": "translation",
|
| 162 |
+
"metric": "bleu",
|
| 163 |
+
"score": 0.25148401884229143,
|
| 164 |
+
"bcp_47": 2
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 168 |
+
"task": "translation",
|
| 169 |
+
"metric": "chrf",
|
| 170 |
+
"score": 0.4285750600098188,
|
| 171 |
+
"bcp_47": 2
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 175 |
+
"task": "classification",
|
| 176 |
+
"metric": "accuracy",
|
| 177 |
+
"score": 0.5666666666666667,
|
| 178 |
+
"bcp_47": 2
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 182 |
+
"task": "language_modeling",
|
| 183 |
+
"metric": "chrf",
|
| 184 |
+
"score": 0.9203465184571391,
|
| 185 |
+
"bcp_47": 2
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 189 |
+
"task": "translation",
|
| 190 |
+
"metric": "bleu",
|
| 191 |
+
"score": 0.43182300663190504,
|
| 192 |
+
"bcp_47": 2
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 196 |
+
"task": "translation",
|
| 197 |
+
"metric": "chrf",
|
| 198 |
+
"score": 0.5679592059634284,
|
| 199 |
+
"bcp_47": 2
|
| 200 |
+
},
|
| 201 |
{
|
| 202 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
| 203 |
"task": "classification",
|
|
|
|
| 226 |
"score": 0.4836914110309717,
|
| 227 |
"bcp_47": 10
|
| 228 |
},
|
| 229 |
+
{
|
| 230 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 231 |
+
"task": "classification",
|
| 232 |
+
"metric": "accuracy",
|
| 233 |
+
"score": 0.4166666666666667,
|
| 234 |
+
"bcp_47": 2
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 238 |
+
"task": "language_modeling",
|
| 239 |
+
"metric": "chrf",
|
| 240 |
+
"score": 0.8700000415175042,
|
| 241 |
+
"bcp_47": 2
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 245 |
+
"task": "translation",
|
| 246 |
+
"metric": "bleu",
|
| 247 |
+
"score": 0.31733056990581465,
|
| 248 |
+
"bcp_47": 2
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 252 |
+
"task": "translation",
|
| 253 |
+
"metric": "chrf",
|
| 254 |
+
"score": 0.45631576469060464,
|
| 255 |
+
"bcp_47": 2
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"model": "mistralai/mistral-nemo",
|
| 259 |
+
"task": "classification",
|
| 260 |
+
"metric": "accuracy",
|
| 261 |
+
"score": 0.5,
|
| 262 |
+
"bcp_47": 2
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"model": "mistralai/mistral-nemo",
|
| 266 |
+
"task": "language_modeling",
|
| 267 |
+
"metric": "chrf",
|
| 268 |
+
"score": 0.8815544644693022,
|
| 269 |
+
"bcp_47": 2
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"model": "mistralai/mistral-nemo",
|
| 273 |
+
"task": "translation",
|
| 274 |
+
"metric": "bleu",
|
| 275 |
+
"score": 0.3177444138044378,
|
| 276 |
+
"bcp_47": 2
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"model": "mistralai/mistral-nemo",
|
| 280 |
+
"task": "translation",
|
| 281 |
+
"metric": "chrf",
|
| 282 |
+
"score": 0.49319228717306784,
|
| 283 |
+
"bcp_47": 2
|
| 284 |
+
},
|
| 285 |
{
|
| 286 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
| 287 |
"task": "classification",
|
|
|
|
| 337 |
"metric": "chrf",
|
| 338 |
"score": 0.5452510379336759,
|
| 339 |
"bcp_47": 2
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"model": "qwen/qwq-32b",
|
| 343 |
+
"task": "classification",
|
| 344 |
+
"metric": "accuracy",
|
| 345 |
+
"score": 0.0,
|
| 346 |
+
"bcp_47": 2
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"model": "qwen/qwq-32b",
|
| 350 |
+
"task": "language_modeling",
|
| 351 |
+
"metric": "chrf",
|
| 352 |
+
"score": 0.4813150156594517,
|
| 353 |
+
"bcp_47": 2
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"model": "qwen/qwq-32b",
|
| 357 |
+
"task": "translation",
|
| 358 |
+
"metric": "bleu",
|
| 359 |
+
"score": 0.2144844735779058,
|
| 360 |
+
"bcp_47": 2
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"model": "qwen/qwq-32b",
|
| 364 |
+
"task": "translation",
|
| 365 |
+
"metric": "chrf",
|
| 366 |
+
"score": 0.30433786997302065,
|
| 367 |
+
"bcp_47": 2
|
| 368 |
}
|
| 369 |
],
|
| 370 |
"languages": [
|
|
|
|
| 2750 |
"in_benchmark": true,
|
| 2751 |
"task": "classification",
|
| 2752 |
"metric": "accuracy",
|
| 2753 |
+
"score": 0.5777777777777778,
|
| 2754 |
+
"model": 12.0
|
| 2755 |
},
|
| 2756 |
{
|
| 2757 |
"bcp_47": "en",
|
|
|
|
| 2765 |
"in_benchmark": true,
|
| 2766 |
"task": "language_modeling",
|
| 2767 |
"metric": "chrf",
|
| 2768 |
+
"score": 0.9222343234934963,
|
| 2769 |
+
"model": 12.0
|
| 2770 |
},
|
| 2771 |
{
|
| 2772 |
"bcp_47": "en",
|
|
|
|
| 2780 |
"in_benchmark": true,
|
| 2781 |
"task": "translation",
|
| 2782 |
"metric": "bleu",
|
| 2783 |
+
"score": 0.37035746903842287,
|
| 2784 |
+
"model": 12.0
|
| 2785 |
},
|
| 2786 |
{
|
| 2787 |
"bcp_47": "en",
|
|
|
|
| 2795 |
"in_benchmark": true,
|
| 2796 |
"task": "translation",
|
| 2797 |
"metric": "chrf",
|
| 2798 |
+
"score": 0.4880916692700535,
|
| 2799 |
+
"model": 12.0
|
| 2800 |
},
|
| 2801 |
{
|
| 2802 |
"bcp_47": "eo",
|
|
|
|
| 10895 |
"in_benchmark": true,
|
| 10896 |
"task": "classification",
|
| 10897 |
"metric": "accuracy",
|
| 10898 |
+
"score": 0.5499999999999999,
|
| 10899 |
+
"model": 12.0
|
| 10900 |
},
|
| 10901 |
{
|
| 10902 |
"bcp_47": "zh",
|
|
|
|
| 10910 |
"in_benchmark": true,
|
| 10911 |
"task": "language_modeling",
|
| 10912 |
"metric": "chrf",
|
| 10913 |
+
"score": 0.8599948525016986,
|
| 10914 |
+
"model": 12.0
|
| 10915 |
},
|
| 10916 |
{
|
| 10917 |
"bcp_47": "zh",
|
|
|
|
| 10925 |
"in_benchmark": true,
|
| 10926 |
"task": "translation",
|
| 10927 |
"metric": "bleu",
|
| 10928 |
+
"score": 0.3532292543512247,
|
| 10929 |
+
"model": 12.0
|
| 10930 |
},
|
| 10931 |
{
|
| 10932 |
"bcp_47": "zh",
|
|
|
|
| 10940 |
"in_benchmark": true,
|
| 10941 |
"task": "translation",
|
| 10942 |
"metric": "chrf",
|
| 10943 |
+
"score": 0.529398790799104,
|
| 10944 |
+
"model": 12.0
|
| 10945 |
},
|
| 10946 |
{
|
| 10947 |
"bcp_47": "zmi",
|
|
|
|
| 10990 |
}
|
| 10991 |
],
|
| 10992 |
"scores": [
|
| 10993 |
+
{
|
| 10994 |
+
"model": "amazon/nova-micro-v1",
|
| 10995 |
+
"bcp_47": "en",
|
| 10996 |
+
"task": "classification",
|
| 10997 |
+
"metric": "accuracy",
|
| 10998 |
+
"score": 0.5333333333333333,
|
| 10999 |
+
"sentence_nr": 14.5
|
| 11000 |
+
},
|
| 11001 |
+
{
|
| 11002 |
+
"model": "amazon/nova-micro-v1",
|
| 11003 |
+
"bcp_47": "en",
|
| 11004 |
+
"task": "language_modeling",
|
| 11005 |
+
"metric": "chrf",
|
| 11006 |
+
"score": 0.9725001956658679,
|
| 11007 |
+
"sentence_nr": 14.5
|
| 11008 |
+
},
|
| 11009 |
+
{
|
| 11010 |
+
"model": "amazon/nova-micro-v1",
|
| 11011 |
+
"bcp_47": "en",
|
| 11012 |
+
"task": "translation",
|
| 11013 |
+
"metric": "bleu",
|
| 11014 |
+
"score": 0.4491277841667736,
|
| 11015 |
+
"sentence_nr": 14.5
|
| 11016 |
+
},
|
| 11017 |
+
{
|
| 11018 |
+
"model": "amazon/nova-micro-v1",
|
| 11019 |
+
"bcp_47": "en",
|
| 11020 |
+
"task": "translation",
|
| 11021 |
+
"metric": "chrf",
|
| 11022 |
+
"score": 0.5740458676508566,
|
| 11023 |
+
"sentence_nr": 14.5
|
| 11024 |
+
},
|
| 11025 |
+
{
|
| 11026 |
+
"model": "amazon/nova-micro-v1",
|
| 11027 |
+
"bcp_47": "zh",
|
| 11028 |
+
"task": "classification",
|
| 11029 |
+
"metric": "accuracy",
|
| 11030 |
+
"score": 0.5,
|
| 11031 |
+
"sentence_nr": 14.5
|
| 11032 |
+
},
|
| 11033 |
+
{
|
| 11034 |
+
"model": "amazon/nova-micro-v1",
|
| 11035 |
+
"bcp_47": "zh",
|
| 11036 |
+
"task": "language_modeling",
|
| 11037 |
+
"metric": "chrf",
|
| 11038 |
+
"score": 0.9167395508743035,
|
| 11039 |
+
"sentence_nr": 14.5
|
| 11040 |
+
},
|
| 11041 |
+
{
|
| 11042 |
+
"model": "amazon/nova-micro-v1",
|
| 11043 |
+
"bcp_47": "zh",
|
| 11044 |
+
"task": "translation",
|
| 11045 |
+
"metric": "bleu",
|
| 11046 |
+
"score": 0.3517140864634192,
|
| 11047 |
+
"sentence_nr": 14.5
|
| 11048 |
+
},
|
| 11049 |
+
{
|
| 11050 |
+
"model": "amazon/nova-micro-v1",
|
| 11051 |
+
"bcp_47": "zh",
|
| 11052 |
+
"task": "translation",
|
| 11053 |
+
"metric": "chrf",
|
| 11054 |
+
"score": 0.5543825716892707,
|
| 11055 |
+
"sentence_nr": 14.5
|
| 11056 |
+
},
|
| 11057 |
{
|
| 11058 |
"model": "google/gemini-2.0-flash-001",
|
| 11059 |
"bcp_47": "en",
|
|
|
|
| 11118 |
"score": 0.5606266861920302,
|
| 11119 |
"sentence_nr": 14.5
|
| 11120 |
},
|
| 11121 |
+
{
|
| 11122 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 11123 |
+
"bcp_47": "en",
|
| 11124 |
+
"task": "classification",
|
| 11125 |
+
"metric": "accuracy",
|
| 11126 |
+
"score": 0.7333333333333333,
|
| 11127 |
+
"sentence_nr": 14.5
|
| 11128 |
+
},
|
| 11129 |
+
{
|
| 11130 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 11131 |
+
"bcp_47": "en",
|
| 11132 |
+
"task": "language_modeling",
|
| 11133 |
+
"metric": "chrf",
|
| 11134 |
+
"score": 0.990925430282282,
|
| 11135 |
+
"sentence_nr": 14.5
|
| 11136 |
+
},
|
| 11137 |
+
{
|
| 11138 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 11139 |
+
"bcp_47": "en",
|
| 11140 |
+
"task": "translation",
|
| 11141 |
+
"metric": "bleu",
|
| 11142 |
+
"score": 0.37911136698810943,
|
| 11143 |
+
"sentence_nr": 14.5
|
| 11144 |
+
},
|
| 11145 |
+
{
|
| 11146 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 11147 |
+
"bcp_47": "en",
|
| 11148 |
+
"task": "translation",
|
| 11149 |
+
"metric": "chrf",
|
| 11150 |
+
"score": 0.5094402087357145,
|
| 11151 |
+
"sentence_nr": 14.5
|
| 11152 |
+
},
|
| 11153 |
+
{
|
| 11154 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 11155 |
+
"bcp_47": "zh",
|
| 11156 |
+
"task": "classification",
|
| 11157 |
+
"metric": "accuracy",
|
| 11158 |
+
"score": 0.7333333333333333,
|
| 11159 |
+
"sentence_nr": 14.5
|
| 11160 |
+
},
|
| 11161 |
+
{
|
| 11162 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 11163 |
+
"bcp_47": "zh",
|
| 11164 |
+
"task": "language_modeling",
|
| 11165 |
+
"metric": "chrf",
|
| 11166 |
+
"score": 0.9511134398957932,
|
| 11167 |
+
"sentence_nr": 14.5
|
| 11168 |
+
},
|
| 11169 |
+
{
|
| 11170 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 11171 |
+
"bcp_47": "zh",
|
| 11172 |
+
"task": "translation",
|
| 11173 |
+
"metric": "bleu",
|
| 11174 |
+
"score": 0.4225918163141283,
|
| 11175 |
+
"sentence_nr": 14.5
|
| 11176 |
+
},
|
| 11177 |
+
{
|
| 11178 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
| 11179 |
+
"bcp_47": "zh",
|
| 11180 |
+
"task": "translation",
|
| 11181 |
+
"metric": "chrf",
|
| 11182 |
+
"score": 0.5751241490536672,
|
| 11183 |
+
"sentence_nr": 14.5
|
| 11184 |
+
},
|
| 11185 |
{
|
| 11186 |
"model": "google/gemma-3-27b-it",
|
| 11187 |
"bcp_47": "en",
|
|
|
|
| 11246 |
"score": 0.520771580386218,
|
| 11247 |
"sentence_nr": 14.5
|
| 11248 |
},
|
| 11249 |
+
{
|
| 11250 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 11251 |
+
"bcp_47": "en",
|
| 11252 |
+
"task": "classification",
|
| 11253 |
+
"metric": "accuracy",
|
| 11254 |
+
"score": 0.8333333333333334,
|
| 11255 |
+
"sentence_nr": 14.5
|
| 11256 |
+
},
|
| 11257 |
+
{
|
| 11258 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 11259 |
+
"bcp_47": "en",
|
| 11260 |
+
"task": "language_modeling",
|
| 11261 |
+
"metric": "chrf",
|
| 11262 |
+
"score": 0.9674315682816375,
|
| 11263 |
+
"sentence_nr": 14.5
|
| 11264 |
+
},
|
| 11265 |
+
{
|
| 11266 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 11267 |
+
"bcp_47": "en",
|
| 11268 |
+
"task": "translation",
|
| 11269 |
+
"metric": "bleu",
|
| 11270 |
+
"score": 0.18722412351358647,
|
| 11271 |
+
"sentence_nr": 14.5
|
| 11272 |
+
},
|
| 11273 |
+
{
|
| 11274 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 11275 |
+
"bcp_47": "en",
|
| 11276 |
+
"task": "translation",
|
| 11277 |
+
"metric": "chrf",
|
| 11278 |
+
"score": 0.34151371128305424,
|
| 11279 |
+
"sentence_nr": 14.5
|
| 11280 |
+
},
|
| 11281 |
+
{
|
| 11282 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 11283 |
+
"bcp_47": "zh",
|
| 11284 |
+
"task": "classification",
|
| 11285 |
+
"metric": "accuracy",
|
| 11286 |
+
"score": 0.7666666666666667,
|
| 11287 |
+
"sentence_nr": 14.5
|
| 11288 |
+
},
|
| 11289 |
+
{
|
| 11290 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 11291 |
+
"bcp_47": "zh",
|
| 11292 |
+
"task": "language_modeling",
|
| 11293 |
+
"metric": "chrf",
|
| 11294 |
+
"score": 0.9230555490695652,
|
| 11295 |
+
"sentence_nr": 14.5
|
| 11296 |
+
},
|
| 11297 |
+
{
|
| 11298 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 11299 |
+
"bcp_47": "zh",
|
| 11300 |
+
"task": "translation",
|
| 11301 |
+
"metric": "bleu",
|
| 11302 |
+
"score": 0.3157439141709964,
|
| 11303 |
+
"sentence_nr": 14.5
|
| 11304 |
+
},
|
| 11305 |
+
{
|
| 11306 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
| 11307 |
+
"bcp_47": "zh",
|
| 11308 |
+
"task": "translation",
|
| 11309 |
+
"metric": "chrf",
|
| 11310 |
+
"score": 0.5156364087365835,
|
| 11311 |
+
"sentence_nr": 14.5
|
| 11312 |
+
},
|
| 11313 |
+
{
|
| 11314 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 11315 |
+
"bcp_47": "en",
|
| 11316 |
+
"task": "classification",
|
| 11317 |
+
"metric": "accuracy",
|
| 11318 |
+
"score": 0.7,
|
| 11319 |
+
"sentence_nr": 14.5
|
| 11320 |
+
},
|
| 11321 |
+
{
|
| 11322 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 11323 |
+
"bcp_47": "en",
|
| 11324 |
+
"task": "language_modeling",
|
| 11325 |
+
"metric": "chrf",
|
| 11326 |
+
"score": 0.9701295103188484,
|
| 11327 |
+
"sentence_nr": 14.5
|
| 11328 |
+
},
|
| 11329 |
+
{
|
| 11330 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 11331 |
+
"bcp_47": "en",
|
| 11332 |
+
"task": "translation",
|
| 11333 |
+
"metric": "bleu",
|
| 11334 |
+
"score": 0.44443705644214526,
|
| 11335 |
+
"sentence_nr": 14.5
|
| 11336 |
+
},
|
| 11337 |
+
{
|
| 11338 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 11339 |
+
"bcp_47": "en",
|
| 11340 |
+
"task": "translation",
|
| 11341 |
+
"metric": "chrf",
|
| 11342 |
+
"score": 0.5485685299214524,
|
| 11343 |
+
"sentence_nr": 14.5
|
| 11344 |
+
},
|
| 11345 |
+
{
|
| 11346 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 11347 |
+
"bcp_47": "zh",
|
| 11348 |
+
"task": "classification",
|
| 11349 |
+
"metric": "accuracy",
|
| 11350 |
+
"score": 0.43333333333333335,
|
| 11351 |
+
"sentence_nr": 14.5
|
| 11352 |
+
},
|
| 11353 |
+
{
|
| 11354 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 11355 |
+
"bcp_47": "zh",
|
| 11356 |
+
"task": "language_modeling",
|
| 11357 |
+
"metric": "chrf",
|
| 11358 |
+
"score": 0.8705635265954298,
|
| 11359 |
+
"sentence_nr": 14.5
|
| 11360 |
+
},
|
| 11361 |
+
{
|
| 11362 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 11363 |
+
"bcp_47": "zh",
|
| 11364 |
+
"task": "translation",
|
| 11365 |
+
"metric": "bleu",
|
| 11366 |
+
"score": 0.4192089568216648,
|
| 11367 |
+
"sentence_nr": 14.5
|
| 11368 |
+
},
|
| 11369 |
+
{
|
| 11370 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
| 11371 |
+
"bcp_47": "zh",
|
| 11372 |
+
"task": "translation",
|
| 11373 |
+
"metric": "chrf",
|
| 11374 |
+
"score": 0.5873498820054043,
|
| 11375 |
+
"sentence_nr": 14.5
|
| 11376 |
+
},
|
| 11377 |
{
|
| 11378 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
| 11379 |
"bcp_47": "ar",
|
|
|
|
| 11694 |
"score": 0.5862284100611604,
|
| 11695 |
"sentence_nr": 14.5
|
| 11696 |
},
|
| 11697 |
+
{
|
| 11698 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 11699 |
+
"bcp_47": "en",
|
| 11700 |
+
"task": "classification",
|
| 11701 |
+
"metric": "accuracy",
|
| 11702 |
+
"score": 0.43333333333333335,
|
| 11703 |
+
"sentence_nr": 14.5
|
| 11704 |
+
},
|
| 11705 |
+
{
|
| 11706 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 11707 |
+
"bcp_47": "en",
|
| 11708 |
+
"task": "language_modeling",
|
| 11709 |
+
"metric": "chrf",
|
| 11710 |
+
"score": 0.9268050965065061,
|
| 11711 |
+
"sentence_nr": 14.5
|
| 11712 |
+
},
|
| 11713 |
+
{
|
| 11714 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 11715 |
+
"bcp_47": "en",
|
| 11716 |
+
"task": "translation",
|
| 11717 |
+
"metric": "bleu",
|
| 11718 |
+
"score": 0.34049537977839345,
|
| 11719 |
+
"sentence_nr": 14.5
|
| 11720 |
+
},
|
| 11721 |
+
{
|
| 11722 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 11723 |
+
"bcp_47": "en",
|
| 11724 |
+
"task": "translation",
|
| 11725 |
+
"metric": "chrf",
|
| 11726 |
+
"score": 0.4566714452688056,
|
| 11727 |
+
"sentence_nr": 14.5
|
| 11728 |
+
},
|
| 11729 |
+
{
|
| 11730 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 11731 |
+
"bcp_47": "zh",
|
| 11732 |
+
"task": "classification",
|
| 11733 |
+
"metric": "accuracy",
|
| 11734 |
+
"score": 0.4,
|
| 11735 |
+
"sentence_nr": 14.5
|
| 11736 |
+
},
|
| 11737 |
+
{
|
| 11738 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 11739 |
+
"bcp_47": "zh",
|
| 11740 |
+
"task": "language_modeling",
|
| 11741 |
+
"metric": "chrf",
|
| 11742 |
+
"score": 0.8131949865285024,
|
| 11743 |
+
"sentence_nr": 14.5
|
| 11744 |
+
},
|
| 11745 |
+
{
|
| 11746 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 11747 |
+
"bcp_47": "zh",
|
| 11748 |
+
"task": "translation",
|
| 11749 |
+
"metric": "bleu",
|
| 11750 |
+
"score": 0.2941657600332359,
|
| 11751 |
+
"sentence_nr": 14.5
|
| 11752 |
+
},
|
| 11753 |
+
{
|
| 11754 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
| 11755 |
+
"bcp_47": "zh",
|
| 11756 |
+
"task": "translation",
|
| 11757 |
+
"metric": "chrf",
|
| 11758 |
+
"score": 0.4559600841124037,
|
| 11759 |
+
"sentence_nr": 14.5
|
| 11760 |
+
},
|
| 11761 |
+
{
|
| 11762 |
+
"model": "mistralai/mistral-nemo",
|
| 11763 |
+
"bcp_47": "en",
|
| 11764 |
+
"task": "classification",
|
| 11765 |
+
"metric": "accuracy",
|
| 11766 |
+
"score": 0.4666666666666667,
|
| 11767 |
+
"sentence_nr": 14.5
|
| 11768 |
+
},
|
| 11769 |
+
{
|
| 11770 |
+
"model": "mistralai/mistral-nemo",
|
| 11771 |
+
"bcp_47": "en",
|
| 11772 |
+
"task": "language_modeling",
|
| 11773 |
+
"metric": "chrf",
|
| 11774 |
+
"score": 0.9383955895073849,
|
| 11775 |
+
"sentence_nr": 14.5
|
| 11776 |
+
},
|
| 11777 |
+
{
|
| 11778 |
+
"model": "mistralai/mistral-nemo",
|
| 11779 |
+
"bcp_47": "en",
|
| 11780 |
+
"task": "translation",
|
| 11781 |
+
"metric": "bleu",
|
| 11782 |
+
"score": 0.3057719571177098,
|
| 11783 |
+
"sentence_nr": 14.5
|
| 11784 |
+
},
|
| 11785 |
+
{
|
| 11786 |
+
"model": "mistralai/mistral-nemo",
|
| 11787 |
+
"bcp_47": "en",
|
| 11788 |
+
"task": "translation",
|
| 11789 |
+
"metric": "chrf",
|
| 11790 |
+
"score": 0.45969934521843914,
|
| 11791 |
+
"sentence_nr": 14.5
|
| 11792 |
+
},
|
| 11793 |
+
{
|
| 11794 |
+
"model": "mistralai/mistral-nemo",
|
| 11795 |
+
"bcp_47": "zh",
|
| 11796 |
+
"task": "classification",
|
| 11797 |
+
"metric": "accuracy",
|
| 11798 |
+
"score": 0.5333333333333333,
|
| 11799 |
+
"sentence_nr": 14.5
|
| 11800 |
+
},
|
| 11801 |
+
{
|
| 11802 |
+
"model": "mistralai/mistral-nemo",
|
| 11803 |
+
"bcp_47": "zh",
|
| 11804 |
+
"task": "language_modeling",
|
| 11805 |
+
"metric": "chrf",
|
| 11806 |
+
"score": 0.8247133394312195,
|
| 11807 |
+
"sentence_nr": 14.5
|
| 11808 |
+
},
|
| 11809 |
+
{
|
| 11810 |
+
"model": "mistralai/mistral-nemo",
|
| 11811 |
+
"bcp_47": "zh",
|
| 11812 |
+
"task": "translation",
|
| 11813 |
+
"metric": "bleu",
|
| 11814 |
+
"score": 0.32971687049116577,
|
| 11815 |
+
"sentence_nr": 14.5
|
| 11816 |
+
},
|
| 11817 |
+
{
|
| 11818 |
+
"model": "mistralai/mistral-nemo",
|
| 11819 |
+
"bcp_47": "zh",
|
| 11820 |
+
"task": "translation",
|
| 11821 |
+
"metric": "chrf",
|
| 11822 |
+
"score": 0.5266852291276966,
|
| 11823 |
+
"sentence_nr": 14.5
|
| 11824 |
+
},
|
| 11825 |
{
|
| 11826 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
| 11827 |
"bcp_47": "en",
|
|
|
|
| 11949 |
"metric": "chrf",
|
| 11950 |
"score": 0.559410465345808,
|
| 11951 |
"sentence_nr": 14.5
|
| 11952 |
+
},
|
| 11953 |
+
{
|
| 11954 |
+
"model": "qwen/qwq-32b",
|
| 11955 |
+
"bcp_47": "en",
|
| 11956 |
+
"task": "classification",
|
| 11957 |
+
"metric": "accuracy",
|
| 11958 |
+
"score": 0.0,
|
| 11959 |
+
"sentence_nr": 14.5
|
| 11960 |
+
},
|
| 11961 |
+
{
|
| 11962 |
+
"model": "qwen/qwq-32b",
|
| 11963 |
+
"bcp_47": "en",
|
| 11964 |
+
"task": "language_modeling",
|
| 11965 |
+
"metric": "chrf",
|
| 11966 |
+
"score": 0.6047457400839834,
|
| 11967 |
+
"sentence_nr": 14.5
|
| 11968 |
+
},
|
| 11969 |
+
{
|
| 11970 |
+
"model": "qwen/qwq-32b",
|
| 11971 |
+
"bcp_47": "en",
|
| 11972 |
+
"task": "translation",
|
| 11973 |
+
"metric": "bleu",
|
| 11974 |
+
"score": 0.20068036705764214,
|
| 11975 |
+
"sentence_nr": 14.5
|
| 11976 |
+
},
|
| 11977 |
+
{
|
| 11978 |
+
"model": "qwen/qwq-32b",
|
| 11979 |
+
"bcp_47": "en",
|
| 11980 |
+
"task": "translation",
|
| 11981 |
+
"metric": "chrf",
|
| 11982 |
+
"score": 0.23884729813422853,
|
| 11983 |
+
"sentence_nr": 14.5
|
| 11984 |
+
},
|
| 11985 |
+
{
|
| 11986 |
+
"model": "qwen/qwq-32b",
|
| 11987 |
+
"bcp_47": "zh",
|
| 11988 |
+
"task": "classification",
|
| 11989 |
+
"metric": "accuracy",
|
| 11990 |
+
"score": 0.0,
|
| 11991 |
+
"sentence_nr": 14.5
|
| 11992 |
+
},
|
| 11993 |
+
{
|
| 11994 |
+
"model": "qwen/qwq-32b",
|
| 11995 |
+
"bcp_47": "zh",
|
| 11996 |
+
"task": "language_modeling",
|
| 11997 |
+
"metric": "chrf",
|
| 11998 |
+
"score": 0.35788429123492,
|
| 11999 |
+
"sentence_nr": 14.5
|
| 12000 |
+
},
|
| 12001 |
+
{
|
| 12002 |
+
"model": "qwen/qwq-32b",
|
| 12003 |
+
"bcp_47": "zh",
|
| 12004 |
+
"task": "translation",
|
| 12005 |
+
"metric": "bleu",
|
| 12006 |
+
"score": 0.22828858009816946,
|
| 12007 |
+
"sentence_nr": 14.5
|
| 12008 |
+
},
|
| 12009 |
+
{
|
| 12010 |
+
"model": "qwen/qwq-32b",
|
| 12011 |
+
"bcp_47": "zh",
|
| 12012 |
+
"task": "translation",
|
| 12013 |
+
"metric": "chrf",
|
| 12014 |
+
"score": 0.3698284418118128,
|
| 12015 |
+
"sentence_nr": 14.5
|
| 12016 |
}
|
| 12017 |
]
|
| 12018 |
}
|