{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"id": "15c618d3-e5a2-4ae8-8e2e-df916cc7d465",
"metadata": {},
"outputs": [],
"source": [
"import json, pathlib, pandas as pd\n",
"from pprint import pprint\n",
"import os\n",
"from pathlib import Path\n",
"from collections import Counter\n",
"from io import StringIO\n",
"import numpy as np\n",
"\n",
"\n",
"pd.set_option(\"display.max_rows\", None) # show ALL rows\n",
"pd.set_option(\"display.max_columns\", None) # show ALL columns\n",
"\n",
"\n",
"# Time Conversion function\n",
"def format_time(seconds: float) -> str:\n",
" seconds = int(seconds)\n",
" hours, remainder = divmod(seconds, 3600)\n",
" minutes = remainder // 60 # drop leftover seconds\n",
"\n",
" parts = []\n",
" if hours > 0:\n",
" parts.append(f\"{hours}h\")\n",
" if minutes > 0 or not parts: # if no hours and no minutes, show 0m\n",
" parts.append(f\"{minutes}m\")\n",
"\n",
" return \" \".join(parts)\n",
"\n",
"\n",
"def list_json_files(directory: str):\n",
" \"\"\"\n",
" Reads all .json files in a given directory and returns \n",
" their full paths as a list.\n",
" \"\"\"\n",
" json_files = []\n",
" for file in os.listdir(directory):\n",
" if file.endswith(\".json\"):\n",
" full_path = os.path.join(directory, file)\n",
" json_files.append(full_path)\n",
" return json_files\n",
"\n",
"\n",
"def format_params(n: int) -> str:\n",
" \"\"\"\n",
" Convert raw parameter count (int) into human-friendly string.\n",
" Examples:\n",
" 6851947264 -> \"7B\"\n",
" 12500000000 -> \"12.5B\"\n",
" 560000000 -> \"560M\"\n",
" \"\"\"\n",
" if n >= 1_000_000_000: # billions\n",
" val = n / 1_000_000_000\n",
" if val.is_integer():\n",
" return f\"{int(val)}B\"\n",
" else:\n",
" return f\"{val:.1f}B\"\n",
" elif n >= 1_000_000: # millions\n",
" val = n / 1_000_000\n",
" if val.is_integer():\n",
" return f\"{int(val)}M\"\n",
" else:\n",
" return f\"{val:.1f}M\"\n",
" elif n >= 1_000: # thousands (rare for params, but included)\n",
" val = n / 1_000\n",
" if val.is_integer():\n",
" return f\"{int(val)}K\"\n",
" else:\n",
" return f\"{val:.1f}K\"\n",
" else:\n",
" return str(n)\n",
"\n",
"\n",
"metric_map = {\n",
" \"mmlu\":\"acc,none\" ,\n",
" \"hellaswag\": \"acc_norm,none\",\n",
" \"arc_challenge\": \"acc_norm,none\", # prefer normalized accuracy\n",
" \"bbh\": \"exact_match,get-answer\",\n",
" \"gsm8k\":\"exact_match,strict-match\" ,\n",
" \"gpqa_main_zeroshot\":\"acc_norm,none\",\n",
" \"anli_r1\": \"acc,none\",\n",
" \"anli_r2\": \"acc,none\",\n",
" \"anli_r3\": \"acc,none\",\n",
" \"piqa\":\"acc_norm,none\" ,\n",
" \"winogrande\": \"acc,none\",\n",
" \"boolq\": \"acc,none\",\n",
" \"truthfulqa_mc1\":\"acc,none\" ,\n",
" \"truthfulqa_mc2\":\"acc,none\" ,\n",
" \"drop\": \"f1,none\",\n",
" \"nq_open\":\"exact_match,remove_whitespace\" ,\n",
" \"openbookqa\":\"acc_norm,none\" ,\n",
" \"sciq\": \"acc_norm,none\",\n",
" \"triviaqa\":\"exact_match,remove_whitespace\" ,\n",
" \"qnli\":\"acc,none\" ,\n",
"}\n",
"\n",
"# Tasks from most important to least important\n",
"# tasks = [mmlu, hellaswag, arc_challenge, bbh, gsm8k, gpqa_main_zeroshot, ANLI (r1, r2, r3), piqa, winogrande, boolq, TruthfulQA (mc1, mc2), drop, nq_open, openbookqa, sciq, triviaqa, qnli]\n",
"\n",
"# Path list \n",
"directory = \"/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/results\"\n",
"all_json_paths = list_json_files(directory)\n",
"\n",
"def parse_results(json_path: str, metric_map: dict) -> pd.DataFrame:\n",
"\n",
" with open(json_path,'r') as f:\n",
" data = json.load(f)\n",
"\n",
" # Extract core info\n",
" model_name = data.get(\"model_name\")\n",
" model_name = model_name.split(\"/\")[-1]\n",
" total_time_raw = float(data.get(\"total_evaluation_time_seconds\", 0))\n",
" total_time = format_time(float(data.get(\"total_evaluation_time_seconds\", 0)))\n",
" batch_size = data[\"config\"].get(\"batch_size\")\n",
" batch_sizes = data[\"config\"].get(\"batch_sizes\")\n",
" parameters = format_params(data[\"config\"].get(\"model_num_parameters\"))\n",
" parameters_raw = data[\"config\"].get(\"model_num_parameters\")\n",
" \n",
"\n",
" rows = []\n",
" for task, metric_key in metric_map.items():\n",
" # Skip tasks not present in the results\n",
" if task not in data[\"results\"]:\n",
" raise ValueError(f\"'{task}' not in results! \") \n",
" \n",
" metrics = data[\"results\"][task]\n",
" \n",
" # If the metric_key isn't in this task's results, raise error\n",
" if metric_key not in metrics:\n",
" raise ValueError(\n",
" f\"Expected metric '{metric_key}' not found for task '{task}'. \"\n",
" f\"Available keys: {list(metrics.keys())}\"\n",
" )\n",
" \n",
" acc = metrics[metric_key]\n",
" \n",
" row = {\n",
" \"model_name\": model_name,\n",
" # \"task\": task,\n",
" \"task\": task + \"(\" + metric_key + \")\",\n",
" \"score\": acc,\n",
" \"total_time\": total_time,\n",
" \"total_time_raw\" : total_time_raw,\n",
" \"batch_size\": batch_size,\n",
" \"batch_sizes\": batch_sizes,\n",
" \"parameters\": parameters,\n",
" \"parameters_raw\": parameters_raw,\n",
" }\n",
" rows.append(row)\n",
"\n",
" # Convert to tidy dataframe\n",
" return pd.DataFrame(rows)\n",
"\n",
"\n",
"dfs = [parse_results(path, metric_map) for path in all_json_paths]\n",
"master_df = pd.concat(dfs, ignore_index=True)\n",
"\n",
"\n",
"# display(master_df)\n",
"\n",
"\n",
"# Wide format: one row per model, columns = tasks\n",
"#Check for duplicate rows \n",
"key_cols = [\"task\", 'score', 'model_name']\n",
"dups_mask = master_df.duplicated(key_cols, keep=False)\n",
"# dups = master_df.loc[dups_mask]\n",
"# display(dups)\n",
"\n",
"if dups_mask.any():\n",
" dups = master_df.loc[dups_mask, key_cols]\n",
" raise ValueError(f\"Duplicate rows found for keys:\\n{dups}\")\n",
"\n",
"wide_df = master_df.pivot_table(\n",
" index=[\"model_name\", \"parameters\"],\n",
" columns=[\"task\"],\n",
" values=\"score\",\n",
" aggfunc=\"mean\"\n",
").reset_index()\n",
"\n",
"# select the metadata columns you want from the long df\n",
"meta_cols = [\n",
" \"model_name\", \n",
" \"parameters\", \n",
" \"parameters_raw\",\n",
" \"total_time\", \n",
" \"total_time_raw\", \n",
" \"batch_size\", \n",
" \"batch_sizes\", \n",
"]\n",
"\n",
"# drop duplicate rows by model_name + parameters\n",
"df_meta = master_df[meta_cols].drop_duplicates(subset=[\"model_name\", \"parameters\"])\n",
"\n",
"# merge the metadata back into your wide dataframe\n",
"df_wide_merged = df_meta.merge(wide_df, on=[\"model_name\", \"parameters\"], how=\"left\")\n",
"\n",
"\n",
"# display(df_wide_merged.drop(columns=[\"parameters_raw\", \"total_time_raw\", \"batch_sizes\"]))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "324364b8-b59a-4450-8723-0c4057488513",
"metadata": {},
"outputs": [],
"source": [
"gpu_dir = Path(\"/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/gpu_usage\")\n",
"gpu_files = list(gpu_dir.glob(\"*_gpu_usage.csv\"))\n",
"\n",
"def model_from_filename(p: Path) -> str:\n",
" return p.stem.replace(\"_gpu_usage\", \"\").strip()\n",
"\n",
"model_names_gpu = [model_from_filename(x) for x in gpu_files]\n",
"\n",
"# Check if match with result\n",
"set_gpu = set(model_names_gpu)\n",
"set_results = set(wide_df['model_name'])\n",
"extra_in_gpu = set_gpu - set_results\n",
"missing_in_gpu = set_results - set_gpu\n",
"# print(\"Extra models in GPU logs:\", extra_in_gpu)\n",
"# print(\"Models in results with no GPU log:\", missing_in_gpu)\n",
"\n",
"\n",
"# Check for Dulicates\n",
"def print_duplicates(name_list, label=\"\"):\n",
" counts = Counter(name_list)\n",
" dups = [name for name, cnt in counts.items() if cnt > 1]\n",
" if dups:\n",
" print(f\"Duplicates in {label}:\")\n",
" for name in dups:\n",
" print(f\" {name} (count = {counts[name]})\")\n",
" else:\n",
" print(f\"No duplicates found in {label}.\")\n",
"# print_duplicates(model_names_gpu, \"GPU logs\")\n",
"\n",
"\n",
"def read_last_run_csv(path: Path) -> pd.DataFrame:\n",
" \"\"\"\n",
" Return a DataFrame for only the *last* '==== New Run ... ====' block.\n",
" Assumes next line after the marker is the CSV header.\n",
" \"\"\"\n",
" lines = path.read_text(encoding=\"utf-8\").splitlines()\n",
" # locate all run markers\n",
" run_idx = [i for i, line in enumerate(lines) if line.startswith(\"==== New Run:\")]\n",
" if not run_idx:\n",
" raise ValueError(f\"No '==== New Run' marker found in {path}\")\n",
" start = run_idx[-1] + 1 # header line index\n",
"\n",
" # slice from header to end and parse CSV\n",
" block = \"\\n\".join(lines[start:])\n",
" df = pd.read_csv(StringIO(block))\n",
"\n",
" # optional cleanup: strip units and cast to numbers if these columns exist\n",
" if \" utilization.gpu [%]\" in df.columns:\n",
" df[\" utilization.gpu [%]\"] = (\n",
" df[\" utilization.gpu [%]\"].astype(str).str.replace(\"%\", \"\", regex=False).str.strip().astype(\"float\")\n",
" )\n",
" if \" memory.used [MiB]\" in df.columns:\n",
" df[\" memory.used [MiB]\"] = (\n",
" df[\" memory.used [MiB]\"].astype(str).str.replace(\"MiB\", \"\", regex=False).str.strip().astype(\"float\")\n",
" )\n",
" # parse timestamp if desired\n",
" if \"timestamp\" in df.columns:\n",
" df[\"timestamp\"] = pd.to_datetime(df[\"timestamp\"], errors=\"coerce\")\n",
"\n",
" return df\n",
"\n",
"\n",
"def eq_full_util_time(df, util_col=\" utilization.gpu [%]\", interval_sec=60):\n",
" # clip just in case and cast to float\n",
" u = pd.to_numeric(df[util_col], errors=\"coerce\")\n",
" # u = pd.to_numeric(df[util_col], errors=\"coerce\").fillna(0).clip(0, 100)\n",
" eq_full_sec = float((u / 100 * interval_sec).sum())\n",
" full_sec = float(len(u)*interval_sec)\n",
"\n",
" # pretty formatter\n",
" h, rem = divmod(int(round(full_sec)), 3600)\n",
" m, s = divmod(rem, 60)\n",
" pretty_full = f\"{h}h {m}m\"\n",
" h, rem = divmod(int(round(eq_full_sec)), 3600)\n",
" m, s = divmod(rem, 60)\n",
" pretty = f\"{h}h {m}m\"\n",
" return pretty, pretty_full, eq_full_sec\n",
"\n",
"\n",
"gpu_df = [read_last_run_csv(df) for df in gpu_files]\n",
"\n",
"\n",
"results = []\n",
"for name, df in zip(model_names_gpu, gpu_df):\n",
" pretty, pretty_full, full_sec_raw = eq_full_util_time(df) # unpack values\n",
" results.append((name, pretty, full_sec_raw, pretty_full)) # collect tuple\n",
"\n",
"# Turn into DataFrame\n",
"gpu_util_df = pd.DataFrame(results, columns=[\"model_name\", \"gpu_util_time\", \"gpu_util_time_raw\", 'full_time_from_gpu_log'])\n",
"\n",
"result_gpu_merged = gpu_util_df.merge(df_wide_merged, on=[\"model_name\"], how=\"left\")\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "2fa54bc3-81f2-492c-832c-26e4f9a7cff3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Overall Rank | \n",
" Model Name | \n",
" GPU Util Time | \n",
" gpu_util_time_raw | \n",
" full_time_from_gpu_log | \n",
" Parameters | \n",
" parameters_raw | \n",
" Total Time | \n",
" total_time_raw | \n",
" batch_size | \n",
" batch_sizes | \n",
" anli_r1(acc,none) | \n",
" anli_r2(acc,none) | \n",
" anli_r3(acc,none) | \n",
" arc_challenge(acc_norm,none) | \n",
" bbh(exact_match,get-answer) | \n",
" boolq(acc,none) | \n",
" drop(f1,none) | \n",
" gpqa_main_zeroshot(acc_norm,none) | \n",
" gsm8k(exact_match,strict-match) | \n",
" hellaswag(acc_norm,none) | \n",
" mmlu(acc,none) | \n",
" nq_open(exact_match,remove_whitespace) | \n",
" openbookqa(acc_norm,none) | \n",
" piqa(acc_norm,none) | \n",
" qnli(acc,none) | \n",
" sciq(acc_norm,none) | \n",
" triviaqa(exact_match,remove_whitespace) | \n",
" truthfulqa_mc1(acc,none) | \n",
" truthfulqa_mc2(acc,none) | \n",
" winogrande(acc,none) | \n",
" gsm8k(exact_match,strict-match)_rank | \n",
" bbh(exact_match,get-answer)_rank | \n",
" arc_challenge(acc_norm,none)_rank | \n",
" anli_r1(acc,none)_rank | \n",
" anli_r2(acc,none)_rank | \n",
" anli_r3(acc,none)_rank | \n",
" gpqa_main_zeroshot(acc_norm,none)_rank | \n",
" hellaswag(acc_norm,none)_rank | \n",
" piqa(acc_norm,none)_rank | \n",
" winogrande(acc,none)_rank | \n",
" boolq(acc,none)_rank | \n",
" openbookqa(acc_norm,none)_rank | \n",
" sciq(acc_norm,none)_rank | \n",
" qnli(acc,none)_rank | \n",
" mmlu(acc,none)_rank | \n",
" nq_open(exact_match,remove_whitespace)_rank | \n",
" drop(f1,none)_rank | \n",
" truthfulqa_mc1(acc,none)_rank | \n",
" truthfulqa_mc2(acc,none)_rank | \n",
" triviaqa(exact_match,remove_whitespace)_rank | \n",
" Reasoning & Math Mean Score | \n",
" Reasoning & Math Avg. Rank | \n",
" Commonsense & NLI Mean Score | \n",
" Commonsense & NLI Avg. Rank | \n",
" Knowledge & Reading Mean Score | \n",
" Knowledge & Reading Avg. Rank | \n",
" Mean Score | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" google_gemma-3-12b-it | \n",
" 14h 8m | \n",
" 50906.4 | \n",
" 15h 47m | \n",
" 12.2B | \n",
" 12187325040 | \n",
" 15h 45m | \n",
" 56750.865892 | \n",
" auto | \n",
" [2] | \n",
" 0.603 | \n",
" 0.560 | \n",
" 0.595833 | \n",
" 0.610922 | \n",
" 0.801874 | \n",
" 0.874618 | \n",
" 0.139566 | \n",
" 0.337054 | \n",
" 0.877180 | \n",
" 0.818761 | \n",
" 0.716137 | \n",
" 0.157064 | \n",
" 0.498 | \n",
" 0.780740 | \n",
" 0.745744 | \n",
" 0.954 | \n",
" 0.275245 | \n",
" 0.405141 | \n",
" 0.581183 | \n",
" 0.744278 | \n",
" 3.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 5.0 | \n",
" 3.0 | \n",
" 2.0 | \n",
" 9.0 | \n",
" 3.0 | \n",
" 19.0 | \n",
" 4.0 | \n",
" 3.0 | \n",
" 1.0 | \n",
" 6.0 | \n",
" 8.0 | \n",
" 6.0 | \n",
" 8.0 | \n",
" 8.0 | \n",
" 8.0 | \n",
" 8.0 | \n",
" 23.0 | \n",
" 0.6266 | \n",
" 1 | \n",
" 0.7737 | \n",
" 3 | \n",
" 0.3791 | \n",
" 10 | \n",
" 0.6038 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" Qwen_Qwen3-14B (8bit) | \n",
" 17h 29m | \n",
" 62956.2 | \n",
" 29h 46m | \n",
" 14.8B | \n",
" 14768307200 | \n",
" 29h 45m | \n",
" 107151.802065 | \n",
" 1 | \n",
" [] | \n",
" 0.646 | \n",
" 0.570 | \n",
" 0.556667 | \n",
" 0.600683 | \n",
" 0.432960 | \n",
" 0.891743 | \n",
" 0.090410 | \n",
" 0.397321 | \n",
" 0.898408 | \n",
" 0.787692 | \n",
" 0.769477 | \n",
" 0.092244 | \n",
" 0.460 | \n",
" 0.794886 | \n",
" 0.844225 | \n",
" 0.966 | \n",
" 0.407490 | \n",
" 0.406365 | \n",
" 0.589404 | \n",
" 0.720600 | \n",
" 1.0 | \n",
" 29.0 | \n",
" 4.0 | \n",
" 4.0 | \n",
" 2.0 | \n",
" 3.0 | \n",
" 1.0 | \n",
" 12.0 | \n",
" 14.0 | \n",
" 11.0 | \n",
" 1.0 | \n",
" 8.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 2.0 | \n",
" 15.0 | \n",
" 18.0 | \n",
" 7.0 | \n",
" 6.0 | \n",
" 13.0 | \n",
" 0.5860 | \n",
" 3 | \n",
" 0.7807 | \n",
" 2 | \n",
" 0.3926 | \n",
" 7 | \n",
" 0.5961 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" openchat_openchat-3.6-8b-20240522 | \n",
" 6h 59m | \n",
" 25150.8 | \n",
" 7h 52m | \n",
" 8.0B | \n",
" 8030261248 | \n",
" 7h 51m | \n",
" 28278.859470 | \n",
" 3 | \n",
" [] | \n",
" 0.556 | \n",
" 0.513 | \n",
" 0.480000 | \n",
" 0.603242 | \n",
" 0.617877 | \n",
" 0.872783 | \n",
" 0.251569 | \n",
" 0.332589 | \n",
" 0.750569 | \n",
" 0.797849 | \n",
" 0.643071 | \n",
" 0.170637 | \n",
" 0.462 | \n",
" 0.818281 | \n",
" 0.730002 | \n",
" 0.964 | \n",
" 0.565927 | \n",
" 0.352509 | \n",
" 0.497601 | \n",
" 0.763220 | \n",
" 16.0 | \n",
" 10.0 | \n",
" 3.0 | \n",
" 9.0 | \n",
" 8.0 | \n",
" 11.0 | \n",
" 11.0 | \n",
" 6.0 | \n",
" 3.0 | \n",
" 1.0 | \n",
" 4.0 | \n",
" 7.0 | \n",
" 2.0 | \n",
" 9.0 | \n",
" 14.0 | \n",
" 5.0 | \n",
" 3.0 | \n",
" 15.0 | \n",
" 19.0 | \n",
" 3.0 | \n",
" 0.5505 | \n",
" 6 | \n",
" 0.7726 | \n",
" 5 | \n",
" 0.4136 | \n",
" 2 | \n",
" 0.5871 | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" Qwen_Qwen3-8B | \n",
" 13h 44m | \n",
" 49497.0 | \n",
" 15h 33m | \n",
" 8.2B | \n",
" 8190735360 | \n",
" 15h 31m | \n",
" 55918.467860 | \n",
" auto | \n",
" [1] | \n",
" 0.669 | \n",
" 0.542 | \n",
" 0.555833 | \n",
" 0.562287 | \n",
" 0.797573 | \n",
" 0.865749 | \n",
" 0.109877 | \n",
" 0.350446 | \n",
" 0.872631 | \n",
" 0.748656 | \n",
" 0.728956 | \n",
" 0.073684 | \n",
" 0.418 | \n",
" 0.775299 | \n",
" 0.781805 | \n",
" 0.958 | \n",
" 0.320609 | \n",
" 0.363525 | \n",
" 0.543140 | \n",
" 0.680347 | \n",
" 4.0 | \n",
" 2.0 | \n",
" 11.0 | \n",
" 3.0 | \n",
" 5.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 24.0 | \n",
" 21.0 | \n",
" 22.0 | \n",
" 6.0 | \n",
" 20.0 | \n",
" 4.0 | \n",
" 7.0 | \n",
" 3.0 | \n",
" 16.0 | \n",
" 13.0 | \n",
" 13.0 | \n",
" 13.0 | \n",
" 18.0 | \n",
" 0.6214 | \n",
" 2 | \n",
" 0.7468 | \n",
" 8 | \n",
" 0.3566 | \n",
" 14 | \n",
" 0.5859 | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" Qwen_Qwen2.5-7B-Instruct | \n",
" 8h 33m | \n",
" 30831.6 | \n",
" 9h 38m | \n",
" 7.6B | \n",
" 7615616512 | \n",
" 9h 36m | \n",
" 34616.604248 | \n",
" 3 | \n",
" [] | \n",
" 0.685 | \n",
" 0.549 | \n",
" 0.552500 | \n",
" 0.552901 | \n",
" 0.448779 | \n",
" 0.863303 | \n",
" 0.071089 | \n",
" 0.328125 | \n",
" 0.762699 | \n",
" 0.804919 | \n",
" 0.718060 | \n",
" 0.045706 | \n",
" 0.486 | \n",
" 0.803047 | \n",
" 0.804503 | \n",
" 0.937 | \n",
" 0.325401 | \n",
" 0.477356 | \n",
" 0.648483 | \n",
" 0.711918 | \n",
" 12.0 | \n",
" 27.0 | \n",
" 12.0 | \n",
" 2.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 12.0 | \n",
" 5.0 | \n",
" 10.0 | \n",
" 14.0 | \n",
" 7.0 | \n",
" 2.0 | \n",
" 12.0 | \n",
" 4.0 | \n",
" 4.0 | \n",
" 23.0 | \n",
" 27.0 | \n",
" 2.0 | \n",
" 2.0 | \n",
" 17.0 | \n",
" 0.5541 | \n",
" 5 | \n",
" 0.7730 | \n",
" 4 | \n",
" 0.3810 | \n",
" 9 | \n",
" 0.5788 | \n",
"
\n",
" \n",
" | 5 | \n",
" 6 | \n",
" Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
" 29h 32m | \n",
" 106374.6 | \n",
" 52h 45m | \n",
" 14.8B | \n",
" 14770033664 | \n",
" 52h 44m | \n",
" 189869.409404 | \n",
" 1 | \n",
" [] | \n",
" 0.721 | \n",
" 0.634 | \n",
" 0.617500 | \n",
" 0.615188 | \n",
" 0.106896 | \n",
" 0.886239 | \n",
" 0.071276 | \n",
" 0.354911 | \n",
" 0.792267 | \n",
" 0.841964 | \n",
" 0.783079 | \n",
" 0.061496 | \n",
" 0.476 | \n",
" 0.817193 | \n",
" 0.853926 | \n",
" 0.929 | \n",
" 0.039289 | \n",
" 0.510404 | \n",
" 0.683015 | \n",
" 0.754538 | \n",
" 9.0 | \n",
" 41.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 4.0 | \n",
" 1.0 | \n",
" 4.0 | \n",
" 2.0 | \n",
" 2.0 | \n",
" 4.0 | \n",
" 17.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 19.0 | \n",
" 25.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 35.0 | \n",
" 0.5488 | \n",
" 7 | \n",
" 0.7941 | \n",
" 1 | \n",
" 0.3581 | \n",
" 13 | \n",
" 0.5775 | \n",
"
\n",
" \n",
" | 6 | \n",
" 7 | \n",
" 01-ai_Yi-1.5-9B | \n",
" 10h 26m | \n",
" 37569.6 | \n",
" 11h 44m | \n",
" 8.8B | \n",
" 8829407232 | \n",
" 11h 43m | \n",
" 42212.112622 | \n",
" 2 | \n",
" [] | \n",
" 0.532 | \n",
" 0.480 | \n",
" 0.439167 | \n",
" 0.546928 | \n",
" 0.712026 | \n",
" 0.858104 | \n",
" 0.445686 | \n",
" 0.294643 | \n",
" 0.639121 | \n",
" 0.778929 | \n",
" 0.689289 | \n",
" 0.153186 | \n",
" 0.456 | \n",
" 0.806311 | \n",
" 0.508695 | \n",
" 0.952 | \n",
" 0.543803 | \n",
" 0.321909 | \n",
" 0.467572 | \n",
" 0.726125 | \n",
" 25.0 | \n",
" 5.0 | \n",
" 15.0 | \n",
" 12.0 | \n",
" 12.0 | \n",
" 18.0 | \n",
" 20.0 | \n",
" 14.0 | \n",
" 6.0 | \n",
" 9.0 | \n",
" 10.0 | \n",
" 9.0 | \n",
" 7.0 | \n",
" 28.0 | \n",
" 8.0 | \n",
" 10.0 | \n",
" 1.0 | \n",
" 19.0 | \n",
" 26.0 | \n",
" 4.0 | \n",
" 0.5206 | \n",
" 16 | \n",
" 0.7266 | \n",
" 15 | \n",
" 0.4369 | \n",
" 1 | \n",
" 0.5676 | \n",
"
\n",
" \n",
" | 7 | \n",
" 8 | \n",
" Qwen_Qwen2.5-7B-Instruct-1M | \n",
" 10h 10m | \n",
" 36621.0 | \n",
" 11h 18m | \n",
" 7.6B | \n",
" 7615616512 | \n",
" 11h 17m | \n",
" 40632.813397 | \n",
" auto | \n",
" [1] | \n",
" 0.585 | \n",
" 0.533 | \n",
" 0.556667 | \n",
" 0.585324 | \n",
" 0.277223 | \n",
" 0.852599 | \n",
" 0.057047 | \n",
" 0.339286 | \n",
" 0.795299 | \n",
" 0.789982 | \n",
" 0.716636 | \n",
" 0.157618 | \n",
" 0.480 | \n",
" 0.816104 | \n",
" 0.678199 | \n",
" 0.950 | \n",
" 0.420531 | \n",
" 0.425949 | \n",
" 0.600072 | \n",
" 0.727703 | \n",
" 8.0 | \n",
" 38.0 | \n",
" 7.0 | \n",
" 6.0 | \n",
" 6.0 | \n",
" 3.0 | \n",
" 8.0 | \n",
" 11.0 | \n",
" 5.0 | \n",
" 8.0 | \n",
" 12.0 | \n",
" 3.0 | \n",
" 8.0 | \n",
" 11.0 | \n",
" 5.0 | \n",
" 7.0 | \n",
" 29.0 | \n",
" 3.0 | \n",
" 3.0 | \n",
" 12.0 | \n",
" 0.5245 | \n",
" 15 | \n",
" 0.7564 | \n",
" 7 | \n",
" 0.3963 | \n",
" 6 | \n",
" 0.5672 | \n",
"
\n",
" \n",
" | 8 | \n",
" 9 | \n",
" meta-llama_Llama-3.1-8B-Instruct | \n",
" 10h 52m | \n",
" 39147.6 | \n",
" 12h 20m | \n",
" 8.0B | \n",
" 8030261248 | \n",
" 12h 19m | \n",
" 44363.249360 | \n",
" auto | \n",
" [1] | \n",
" 0.482 | \n",
" 0.467 | \n",
" 0.443333 | \n",
" 0.550341 | \n",
" 0.715558 | \n",
" 0.841590 | \n",
" 0.193729 | \n",
" 0.343750 | \n",
" 0.754359 | \n",
" 0.792073 | \n",
" 0.679319 | \n",
" 0.177562 | \n",
" 0.432 | \n",
" 0.806311 | \n",
" 0.501373 | \n",
" 0.962 | \n",
" 0.518168 | \n",
" 0.365973 | \n",
" 0.541154 | \n",
" 0.738753 | \n",
" 15.0 | \n",
" 4.0 | \n",
" 13.0 | \n",
" 18.0 | \n",
" 14.0 | \n",
" 17.0 | \n",
" 6.0 | \n",
" 9.0 | \n",
" 6.0 | \n",
" 6.0 | \n",
" 16.0 | \n",
" 16.0 | \n",
" 3.0 | \n",
" 30.0 | \n",
" 12.0 | \n",
" 4.0 | \n",
" 4.0 | \n",
" 12.0 | \n",
" 14.0 | \n",
" 7.0 | \n",
" 0.5366 | \n",
" 12 | \n",
" 0.7249 | \n",
" 17 | \n",
" 0.4127 | \n",
" 3 | \n",
" 0.5653 | \n",
"
\n",
" \n",
" | 9 | \n",
" 10 | \n",
" 01-ai_Yi-1.5-9B-Chat | \n",
" 12h 15m | \n",
" 44120.4 | \n",
" 13h 55m | \n",
" 8.8B | \n",
" 8829407232 | \n",
" 13h 54m | \n",
" 50056.331345 | \n",
" 2 | \n",
" [] | \n",
" 0.535 | \n",
" 0.509 | \n",
" 0.525833 | \n",
" 0.587031 | \n",
" 0.610659 | \n",
" 0.868196 | \n",
" 0.125326 | \n",
" 0.303571 | \n",
" 0.708112 | \n",
" 0.787293 | \n",
" 0.684091 | \n",
" 0.009418 | \n",
" 0.436 | \n",
" 0.803591 | \n",
" 0.787662 | \n",
" 0.954 | \n",
" 0.338665 | \n",
" 0.374541 | \n",
" 0.547934 | \n",
" 0.746646 | \n",
" 18.0 | \n",
" 12.0 | \n",
" 6.0 | \n",
" 11.0 | \n",
" 9.0 | \n",
" 6.0 | \n",
" 18.0 | \n",
" 13.0 | \n",
" 9.0 | \n",
" 3.0 | \n",
" 5.0 | \n",
" 14.0 | \n",
" 6.0 | \n",
" 6.0 | \n",
" 9.0 | \n",
" 36.0 | \n",
" 9.0 | \n",
" 10.0 | \n",
" 11.0 | \n",
" 15.0 | \n",
" 0.5399 | \n",
" 9 | \n",
" 0.7691 | \n",
" 6 | \n",
" 0.3467 | \n",
" 15 | \n",
" 0.5621 | \n",
"
\n",
" \n",
" | 10 | \n",
" 11 | \n",
" mistralai_Ministral-8B-Instruct-2410 | \n",
" 9h 27m | \n",
" 34053.6 | \n",
" 10h 47m | \n",
" 8.0B | \n",
" 8019808256 | \n",
" 10h 46m | \n",
" 38770.339256 | \n",
" auto | \n",
" [1] | \n",
" 0.488 | \n",
" 0.487 | \n",
" 0.465833 | \n",
" 0.562287 | \n",
" 0.692520 | \n",
" 0.860245 | \n",
" 0.071413 | \n",
" 0.341518 | \n",
" 0.774829 | \n",
" 0.791077 | \n",
" 0.640721 | \n",
" 0.157618 | \n",
" 0.466 | \n",
" 0.823177 | \n",
" 0.494966 | \n",
" 0.956 | \n",
" 0.527809 | \n",
" 0.325581 | \n",
" 0.486670 | \n",
" 0.737964 | \n",
" 11.0 | \n",
" 7.0 | \n",
" 11.0 | \n",
" 16.0 | \n",
" 11.0 | \n",
" 14.0 | \n",
" 7.0 | \n",
" 10.0 | \n",
" 2.0 | \n",
" 7.0 | \n",
" 8.0 | \n",
" 6.0 | \n",
" 5.0 | \n",
" 39.0 | \n",
" 15.0 | \n",
" 7.0 | \n",
" 24.0 | \n",
" 18.0 | \n",
" 22.0 | \n",
" 5.0 | \n",
" 0.5446 | \n",
" 8 | \n",
" 0.7328 | \n",
" 12 | \n",
" 0.3683 | \n",
" 12 | \n",
" 0.5576 | \n",
"
\n",
" \n",
" | 11 | \n",
" 12 | \n",
" meta-llama_Meta-Llama-3-8B-Instruct | \n",
" 5h 46m | \n",
" 20809.8 | \n",
" 6h 31m | \n",
" 8.0B | \n",
" 8030261248 | \n",
" 6h 30m | \n",
" 23440.234421 | \n",
" 3 | \n",
" [] | \n",
" 0.484 | \n",
" 0.458 | \n",
" 0.448333 | \n",
" 0.563993 | \n",
" 0.679005 | \n",
" 0.831193 | \n",
" 0.163977 | \n",
" 0.310268 | \n",
" 0.756634 | \n",
" 0.759211 | \n",
" 0.638727 | \n",
" 0.159003 | \n",
" 0.430 | \n",
" 0.787269 | \n",
" 0.546403 | \n",
" 0.932 | \n",
" 0.511202 | \n",
" 0.363525 | \n",
" 0.517142 | \n",
" 0.716654 | \n",
" 14.0 | \n",
" 8.0 | \n",
" 10.0 | \n",
" 17.0 | \n",
" 18.0 | \n",
" 16.0 | \n",
" 16.0 | \n",
" 19.0 | \n",
" 18.0 | \n",
" 13.0 | \n",
" 19.0 | \n",
" 17.0 | \n",
" 15.0 | \n",
" 20.0 | \n",
" 16.0 | \n",
" 6.0 | \n",
" 5.0 | \n",
" 13.0 | \n",
" 17.0 | \n",
" 8.0 | \n",
" 0.5286 | \n",
" 13 | \n",
" 0.7147 | \n",
" 22 | \n",
" 0.3923 | \n",
" 8 | \n",
" 0.5528 | \n",
"
\n",
" \n",
" | 12 | \n",
" 13 | \n",
" Qwen_Qwen3-4B | \n",
" 5h 3m | \n",
" 18234.6 | \n",
" 5h 52m | \n",
" 4.0B | \n",
" 4022468096 | \n",
" 5h 51m | \n",
" 21077.943646 | \n",
" 6 | \n",
" [] | \n",
" 0.550 | \n",
" 0.461 | \n",
" 0.513333 | \n",
" 0.539249 | \n",
" 0.752265 | \n",
" 0.850459 | \n",
" 0.097707 | \n",
" 0.325893 | \n",
" 0.856710 | \n",
" 0.683330 | \n",
" 0.683592 | \n",
" 0.014681 | \n",
" 0.402 | \n",
" 0.751360 | \n",
" 0.808713 | \n",
" 0.932 | \n",
" 0.225033 | \n",
" 0.367197 | \n",
" 0.547575 | \n",
" 0.658248 | \n",
" 5.0 | \n",
" 3.0 | \n",
" 17.0 | \n",
" 10.0 | \n",
" 17.0 | \n",
" 8.0 | \n",
" 13.0 | \n",
" 29.0 | \n",
" 27.0 | \n",
" 27.0 | \n",
" 13.0 | \n",
" 23.0 | \n",
" 15.0 | \n",
" 3.0 | \n",
" 10.0 | \n",
" 34.0 | \n",
" 16.0 | \n",
" 11.0 | \n",
" 12.0 | \n",
" 26.0 | \n",
" 0.5712 | \n",
" 4 | \n",
" 0.7266 | \n",
" 16 | \n",
" 0.3226 | \n",
" 21 | \n",
" 0.5510 | \n",
"
\n",
" \n",
" | 13 | \n",
" 14 | \n",
" NousResearch_Hermes-2-Pro-Mistral-7B | \n",
" 7h 28m | \n",
" 26916.0 | \n",
" 8h 28m | \n",
" 7.2B | \n",
" 7241994240 | \n",
" 8h 27m | \n",
" 30434.329021 | \n",
" 3 | \n",
" [] | \n",
" 0.531 | \n",
" 0.496 | \n",
" 0.500000 | \n",
" 0.565700 | \n",
" 0.573798 | \n",
" 0.868196 | \n",
" 0.109754 | \n",
" 0.276786 | \n",
" 0.685368 | \n",
" 0.804919 | \n",
" 0.605113 | \n",
" 0.040443 | \n",
" 0.434 | \n",
" 0.798694 | \n",
" 0.556471 | \n",
" 0.917 | \n",
" 0.471132 | \n",
" 0.413709 | \n",
" 0.591156 | \n",
" 0.719811 | \n",
" 21.0 | \n",
" 17.0 | \n",
" 9.0 | \n",
" 13.0 | \n",
" 10.0 | \n",
" 9.0 | \n",
" 25.0 | \n",
" 5.0 | \n",
" 12.0 | \n",
" 12.0 | \n",
" 5.0 | \n",
" 15.0 | \n",
" 20.0 | \n",
" 18.0 | \n",
" 20.0 | \n",
" 25.0 | \n",
" 14.0 | \n",
" 6.0 | \n",
" 5.0 | \n",
" 11.0 | \n",
" 0.5184 | \n",
" 17 | \n",
" 0.7284 | \n",
" 13 | \n",
" 0.3719 | \n",
" 11 | \n",
" 0.5480 | \n",
"
\n",
" \n",
" | 14 | \n",
" 15 | \n",
" mistralai_Mistral-7B-Instruct-v0.3 | \n",
" 7h 41m | \n",
" 27676.8 | \n",
" 8h 39m | \n",
" 7.2B | \n",
" 7248023552 | \n",
" 8h 38m | \n",
" 31084.838324 | \n",
" 3 | \n",
" [] | \n",
" 0.476 | \n",
" 0.443 | \n",
" 0.448333 | \n",
" 0.589590 | \n",
" 0.562586 | \n",
" 0.858410 | \n",
" 0.089972 | \n",
" 0.283482 | \n",
" 0.489765 | \n",
" 0.828919 | \n",
" 0.597137 | \n",
" 0.153740 | \n",
" 0.470 | \n",
" 0.826986 | \n",
" 0.514552 | \n",
" 0.943 | \n",
" 0.568324 | \n",
" 0.421053 | \n",
" 0.596813 | \n",
" 0.740331 | \n",
" 28.0 | \n",
" 18.0 | \n",
" 5.0 | \n",
" 20.0 | \n",
" 20.0 | \n",
" 16.0 | \n",
" 24.0 | \n",
" 2.0 | \n",
" 1.0 | \n",
" 5.0 | \n",
" 9.0 | \n",
" 5.0 | \n",
" 9.0 | \n",
" 26.0 | \n",
" 22.0 | \n",
" 9.0 | \n",
" 19.0 | \n",
" 4.0 | \n",
" 4.0 | \n",
" 2.0 | \n",
" 0.4704 | \n",
" 22 | \n",
" 0.7403 | \n",
" 9 | \n",
" 0.4045 | \n",
" 5 | \n",
" 0.5451 | \n",
"
\n",
" \n",
" | 15 | \n",
" 16 | \n",
" google_gemma-3-4b-it | \n",
" 3h 50m | \n",
" 13811.4 | \n",
" 4h 52m | \n",
" 4.3B | \n",
" 4300079472 | \n",
" 4h 51m | \n",
" 17460.233507 | \n",
" auto | \n",
" [4] | \n",
" 0.492 | \n",
" 0.471 | \n",
" 0.468333 | \n",
" 0.570819 | \n",
" 0.709415 | \n",
" 0.839755 | \n",
" 0.089284 | \n",
" 0.287946 | \n",
" 0.761941 | \n",
" 0.741386 | \n",
" 0.575559 | \n",
" 0.109418 | \n",
" 0.466 | \n",
" 0.772035 | \n",
" 0.565989 | \n",
" 0.931 | \n",
" 0.314813 | \n",
" 0.348837 | \n",
" 0.518821 | \n",
" 0.700868 | \n",
" 13.0 | \n",
" 6.0 | \n",
" 8.0 | \n",
" 15.0 | \n",
" 13.0 | \n",
" 13.0 | \n",
" 23.0 | \n",
" 26.0 | \n",
" 22.0 | \n",
" 17.0 | \n",
" 17.0 | \n",
" 6.0 | \n",
" 16.0 | \n",
" 16.0 | \n",
" 24.0 | \n",
" 13.0 | \n",
" 20.0 | \n",
" 16.0 | \n",
" 16.0 | \n",
" 19.0 | \n",
" 0.5374 | \n",
" 11 | \n",
" 0.7167 | \n",
" 19 | \n",
" 0.3261 | \n",
" 20 | \n",
" 0.5368 | \n",
"
\n",
" \n",
" | 16 | \n",
" 17 | \n",
" 01-ai_Yi-1.5-6B-Chat | \n",
" 7h 1m | \n",
" 25318.8 | \n",
" 8h 5m | \n",
" 6.1B | \n",
" 6061035520 | \n",
" 8h 4m | \n",
" 29040.429802 | \n",
" 2 | \n",
" [] | \n",
" 0.477 | \n",
" 0.453 | \n",
" 0.460000 | \n",
" 0.539249 | \n",
" 0.547842 | \n",
" 0.847401 | \n",
" 0.116081 | \n",
" 0.357143 | \n",
" 0.670205 | \n",
" 0.767477 | \n",
" 0.617861 | \n",
" 0.027147 | \n",
" 0.436 | \n",
" 0.787813 | \n",
" 0.679480 | \n",
" 0.934 | \n",
" 0.330974 | \n",
" 0.376989 | \n",
" 0.534371 | \n",
" 0.709550 | \n",
" 22.0 | \n",
" 21.0 | \n",
" 17.0 | \n",
" 19.0 | \n",
" 19.0 | \n",
" 15.0 | \n",
" 3.0 | \n",
" 16.0 | \n",
" 17.0 | \n",
" 15.0 | \n",
" 15.0 | \n",
" 14.0 | \n",
" 14.0 | \n",
" 10.0 | \n",
" 18.0 | \n",
" 29.0 | \n",
" 12.0 | \n",
" 9.0 | \n",
" 15.0 | \n",
" 16.0 | \n",
" 0.5006 | \n",
" 19 | \n",
" 0.7374 | \n",
" 10 | \n",
" 0.3339 | \n",
" 19 | \n",
" 0.5335 | \n",
"
\n",
" \n",
" | 17 | \n",
" 18 | \n",
" 01-ai_Yi-1.5-6B | \n",
" 3h 54m | \n",
" 14091.6 | \n",
" 4h 29m | \n",
" 6.1B | \n",
" 6061035520 | \n",
" 4h 28m | \n",
" 16094.199661 | \n",
" auto | \n",
" [8] | \n",
" 0.448 | \n",
" 0.407 | \n",
" 0.406667 | \n",
" 0.496587 | \n",
" 0.575488 | \n",
" 0.801529 | \n",
" 0.399462 | \n",
" 0.290179 | \n",
" 0.522365 | \n",
" 0.754133 | \n",
" 0.624270 | \n",
" 0.178116 | \n",
" 0.422 | \n",
" 0.801415 | \n",
" 0.598572 | \n",
" 0.941 | \n",
" 0.495207 | \n",
" 0.299878 | \n",
" 0.440750 | \n",
" 0.720600 | \n",
" 27.0 | \n",
" 16.0 | \n",
" 20.0 | \n",
" 21.0 | \n",
" 26.0 | \n",
" 27.0 | \n",
" 22.0 | \n",
" 22.0 | \n",
" 11.0 | \n",
" 11.0 | \n",
" 23.0 | \n",
" 19.0 | \n",
" 10.0 | \n",
" 13.0 | \n",
" 17.0 | \n",
" 3.0 | \n",
" 2.0 | \n",
" 23.0 | \n",
" 32.0 | \n",
" 10.0 | \n",
" 0.4495 | \n",
" 24 | \n",
" 0.7199 | \n",
" 18 | \n",
" 0.4063 | \n",
" 4 | \n",
" 0.5312 | \n",
"
\n",
" \n",
" | 18 | \n",
" 19 | \n",
" Qwen_Qwen2-7B-Instruct | \n",
" 10h 11m | \n",
" 36684.6 | \n",
" 11h 31m | \n",
" 7.6B | \n",
" 7615616512 | \n",
" 11h 30m | \n",
" 41431.857967 | \n",
" auto | \n",
" [1] | \n",
" 0.573 | \n",
" 0.525 | \n",
" 0.522500 | \n",
" 0.540102 | \n",
" 0.577484 | \n",
" 0.856269 | \n",
" 0.052028 | \n",
" 0.314732 | \n",
" 0.646702 | \n",
" 0.806015 | \n",
" 0.699402 | \n",
" 0.013296 | \n",
" 0.462 | \n",
" 0.805767 | \n",
" 0.547135 | \n",
" 0.916 | \n",
" 0.008136 | \n",
" 0.405141 | \n",
" 0.573437 | \n",
" 0.698500 | \n",
" 23.0 | \n",
" 15.0 | \n",
" 16.0 | \n",
" 7.0 | \n",
" 7.0 | \n",
" 7.0 | \n",
" 15.0 | \n",
" 4.0 | \n",
" 7.0 | \n",
" 18.0 | \n",
" 11.0 | \n",
" 7.0 | \n",
" 21.0 | \n",
" 19.0 | \n",
" 7.0 | \n",
" 35.0 | \n",
" 31.0 | \n",
" 8.0 | \n",
" 9.0 | \n",
" 39.0 | \n",
" 0.5285 | \n",
" 14 | \n",
" 0.7274 | \n",
" 14 | \n",
" 0.2919 | \n",
" 24 | \n",
" 0.5271 | \n",
"
\n",
" \n",
" | 19 | \n",
" 20 | \n",
" deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
" 15h 30m | \n",
" 55855.2 | \n",
" 17h 59m | \n",
" 8.2B | \n",
" 8190735360 | \n",
" 17h 57m | \n",
" 64675.539163 | \n",
" auto | \n",
" [1] | \n",
" 0.511 | \n",
" 0.464 | \n",
" 0.476667 | \n",
" 0.549488 | \n",
" 0.584088 | \n",
" 0.848318 | \n",
" 0.053279 | \n",
" 0.372768 | \n",
" 0.812737 | \n",
" 0.756423 | \n",
" 0.682951 | \n",
" 0.018283 | \n",
" 0.430 | \n",
" 0.756801 | \n",
" 0.557752 | \n",
" 0.941 | \n",
" 0.029481 | \n",
" 0.357405 | \n",
" 0.559013 | \n",
" 0.675612 | \n",
" 7.0 | \n",
" 14.0 | \n",
" 14.0 | \n",
" 14.0 | \n",
" 16.0 | \n",
" 12.0 | \n",
" 2.0 | \n",
" 20.0 | \n",
" 25.0 | \n",
" 24.0 | \n",
" 14.0 | \n",
" 17.0 | \n",
" 10.0 | \n",
" 17.0 | \n",
" 11.0 | \n",
" 33.0 | \n",
" 30.0 | \n",
" 14.0 | \n",
" 10.0 | \n",
" 36.0 | \n",
" 0.5387 | \n",
" 10 | \n",
" 0.7094 | \n",
" 23 | \n",
" 0.2834 | \n",
" 28 | \n",
" 0.5219 | \n",
"
\n",
" \n",
" | 20 | \n",
" 21 | \n",
" meta-llama_Llama-3.2-3B-Instruct | \n",
" 5h 57m | \n",
" 21477.0 | \n",
" 7h 13m | \n",
" 3.2B | \n",
" 3212749824 | \n",
" 7h 12m | \n",
" 25939.885959 | \n",
" auto | \n",
" [2] | \n",
" 0.447 | \n",
" 0.418 | \n",
" 0.430833 | \n",
" 0.459044 | \n",
" 0.556443 | \n",
" 0.784709 | \n",
" 0.155394 | \n",
" 0.328125 | \n",
" 0.642153 | \n",
" 0.705437 | \n",
" 0.605184 | \n",
" 0.139058 | \n",
" 0.358 | \n",
" 0.755169 | \n",
" 0.545122 | \n",
" 0.932 | \n",
" 0.338943 | \n",
" 0.326805 | \n",
" 0.497579 | \n",
" 0.670876 | \n",
" 24.0 | \n",
" 20.0 | \n",
" 25.0 | \n",
" 22.0 | \n",
" 23.0 | \n",
" 21.0 | \n",
" 12.0 | \n",
" 27.0 | \n",
" 26.0 | \n",
" 25.0 | \n",
" 26.0 | \n",
" 28.0 | \n",
" 15.0 | \n",
" 21.0 | \n",
" 19.0 | \n",
" 12.0 | \n",
" 7.0 | \n",
" 17.0 | \n",
" 20.0 | \n",
" 14.0 | \n",
" 0.4688 | \n",
" 23 | \n",
" 0.6788 | \n",
" 30 | \n",
" 0.3438 | \n",
" 16 | \n",
" 0.5048 | \n",
"
\n",
" \n",
" | 21 | \n",
" 22 | \n",
" Qwen_Qwen2.5-3B-Instruct | \n",
" 6h 30m | \n",
" 23452.2 | \n",
" 7h 49m | \n",
" 3.1B | \n",
" 3085938688 | \n",
" 7h 48m | \n",
" 28089.516568 | \n",
" auto:4 | \n",
" [2, 64, 64, 64, 64] | \n",
" 0.562 | \n",
" 0.466 | \n",
" 0.494167 | \n",
" 0.482082 | \n",
" 0.249117 | \n",
" 0.801223 | \n",
" 0.077333 | \n",
" 0.321429 | \n",
" 0.101592 | \n",
" 0.749054 | \n",
" 0.654964 | \n",
" 0.008310 | \n",
" 0.422 | \n",
" 0.780740 | \n",
" 0.797913 | \n",
" 0.913 | \n",
" 0.300992 | \n",
" 0.416157 | \n",
" 0.586055 | \n",
" 0.692976 | \n",
" 41.0 | \n",
" 39.0 | \n",
" 22.0 | \n",
" 8.0 | \n",
" 15.0 | \n",
" 10.0 | \n",
" 14.0 | \n",
" 23.0 | \n",
" 19.0 | \n",
" 20.0 | \n",
" 24.0 | \n",
" 19.0 | \n",
" 24.0 | \n",
" 5.0 | \n",
" 13.0 | \n",
" 37.0 | \n",
" 21.0 | \n",
" 5.0 | \n",
" 7.0 | \n",
" 21.0 | \n",
" 0.3823 | \n",
" 32 | \n",
" 0.7367 | \n",
" 11 | \n",
" 0.3406 | \n",
" 17 | \n",
" 0.4939 | \n",
"
\n",
" \n",
" | 22 | \n",
" 23 | \n",
" Qwen_Qwen2.5-Math-7B | \n",
" 24h 38m | \n",
" 88696.2 | \n",
" 27h 23m | \n",
" 7.6B | \n",
" 7615616512 | \n",
" 27h 21m | \n",
" 98517.403245 | \n",
" auto | \n",
" [4] | \n",
" 0.387 | \n",
" 0.407 | \n",
" 0.382500 | \n",
" 0.502560 | \n",
" 0.672401 | \n",
" 0.745566 | \n",
" 0.043235 | \n",
" 0.308036 | \n",
" 0.847612 | \n",
" 0.652858 | \n",
" 0.579903 | \n",
" 0.050970 | \n",
" 0.392 | \n",
" 0.745375 | \n",
" 0.498078 | \n",
" 0.929 | \n",
" 0.218346 | \n",
" 0.320685 | \n",
" 0.483219 | \n",
" 0.647987 | \n",
" 6.0 | \n",
" 9.0 | \n",
" 18.0 | \n",
" 30.0 | \n",
" 26.0 | \n",
" 31.0 | \n",
" 17.0 | \n",
" 31.0 | \n",
" 29.0 | \n",
" 29.0 | \n",
" 33.0 | \n",
" 24.0 | \n",
" 17.0 | \n",
" 33.0 | \n",
" 23.0 | \n",
" 22.0 | \n",
" 33.0 | \n",
" 20.0 | \n",
" 23.0 | \n",
" 27.0 | \n",
" 0.5010 | \n",
" 18 | \n",
" 0.6587 | \n",
" 32 | \n",
" 0.2827 | \n",
" 29 | \n",
" 0.4907 | \n",
"
\n",
" \n",
" | 23 | \n",
" 24 | \n",
" deepseek-ai_deepseek-llm-7b-chat | \n",
" 9h 8m | \n",
" 32906.4 | \n",
" 10h 8m | \n",
" 6.9B | \n",
" 6910365696 | \n",
" 10h 6m | \n",
" 36412.969244 | \n",
" 3 | \n",
" [] | \n",
" 0.423 | \n",
" 0.419 | \n",
" 0.420833 | \n",
" 0.496587 | \n",
" 0.454769 | \n",
" 0.833028 | \n",
" 0.103048 | \n",
" 0.292411 | \n",
" 0.463988 | \n",
" 0.777236 | \n",
" 0.498789 | \n",
" 0.063435 | \n",
" 0.460 | \n",
" 0.801415 | \n",
" 0.496980 | \n",
" 0.893 | \n",
" 0.311190 | \n",
" 0.348837 | \n",
" 0.478933 | \n",
" 0.701657 | \n",
" 29.0 | \n",
" 26.0 | \n",
" 20.0 | \n",
" 26.0 | \n",
" 22.0 | \n",
" 23.0 | \n",
" 21.0 | \n",
" 15.0 | \n",
" 11.0 | \n",
" 16.0 | \n",
" 18.0 | \n",
" 8.0 | \n",
" 29.0 | \n",
" 35.0 | \n",
" 32.0 | \n",
" 18.0 | \n",
" 15.0 | \n",
" 16.0 | \n",
" 24.0 | \n",
" 20.0 | \n",
" 0.4244 | \n",
" 27 | \n",
" 0.7090 | \n",
" 24 | \n",
" 0.3007 | \n",
" 23 | \n",
" 0.4869 | \n",
"
\n",
" \n",
" | 24 | \n",
" 25 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
" 10h 36m | \n",
" 38179.2 | \n",
" 11h 47m | \n",
" 8.0B | \n",
" 8030261248 | \n",
" 11h 46m | \n",
" 42405.489811 | \n",
" auto:5 | \n",
" [1, 64, 64, 64, 64, 64] | \n",
" 0.404 | \n",
" 0.410 | \n",
" 0.388333 | \n",
" 0.423208 | \n",
" 0.603748 | \n",
" 0.828746 | \n",
" 0.071225 | \n",
" 0.274554 | \n",
" 0.624716 | \n",
" 0.742979 | \n",
" 0.532688 | \n",
" 0.058449 | \n",
" 0.410 | \n",
" 0.775843 | \n",
" 0.514735 | \n",
" 0.899 | \n",
" 0.194048 | \n",
" 0.321909 | \n",
" 0.504460 | \n",
" 0.677979 | \n",
" 26.0 | \n",
" 13.0 | \n",
" 31.0 | \n",
" 29.0 | \n",
" 25.0 | \n",
" 29.0 | \n",
" 26.0 | \n",
" 25.0 | \n",
" 20.0 | \n",
" 23.0 | \n",
" 20.0 | \n",
" 21.0 | \n",
" 27.0 | \n",
" 25.0 | \n",
" 27.0 | \n",
" 20.0 | \n",
" 26.0 | \n",
" 19.0 | \n",
" 18.0 | \n",
" 28.0 | \n",
" 0.4469 | \n",
" 26 | \n",
" 0.6928 | \n",
" 27 | \n",
" 0.2805 | \n",
" 30 | \n",
" 0.4830 | \n",
"
\n",
" \n",
" | 25 | \n",
" 26 | \n",
" meta-llama_Llama-2-13b-hf | \n",
" 17h 38m | \n",
" 63506.4 | \n",
" 19h 22m | \n",
" 13.0B | \n",
" 13015864320 | \n",
" 19h 21m | \n",
" 69687.765642 | \n",
" auto | \n",
" [1] | \n",
" 0.377 | \n",
" 0.390 | \n",
" 0.385000 | \n",
" 0.489761 | \n",
" 0.477653 | \n",
" 0.806422 | \n",
" 0.030132 | \n",
" 0.254464 | \n",
" 0.229719 | \n",
" 0.793866 | \n",
" 0.520937 | \n",
" 0.236288 | \n",
" 0.452 | \n",
" 0.805223 | \n",
" 0.495332 | \n",
" 0.935 | \n",
" 0.608839 | \n",
" 0.259486 | \n",
" 0.368992 | \n",
" 0.722178 | \n",
" 36.0 | \n",
" 25.0 | \n",
" 21.0 | \n",
" 31.0 | \n",
" 29.0 | \n",
" 30.0 | \n",
" 32.0 | \n",
" 8.0 | \n",
" 8.0 | \n",
" 10.0 | \n",
" 22.0 | \n",
" 10.0 | \n",
" 13.0 | \n",
" 38.0 | \n",
" 31.0 | \n",
" 1.0 | \n",
" 38.0 | \n",
" 33.0 | \n",
" 40.0 | \n",
" 1.0 | \n",
" 0.3719 | \n",
" 33 | \n",
" 0.7157 | \n",
" 20 | \n",
" 0.3374 | \n",
" 18 | \n",
" 0.4819 | \n",
"
\n",
" \n",
" | 26 | \n",
" 27 | \n",
" meta-llama_Llama-2-13b-chat-hf | \n",
" 15h 37m | \n",
" 56271.6 | \n",
" 17h 9m | \n",
" 13.0B | \n",
" 13015864320 | \n",
" 17h 8m | \n",
" 61732.053618 | \n",
" auto | \n",
" [1] | \n",
" 0.430 | \n",
" 0.430 | \n",
" 0.414167 | \n",
" 0.501706 | \n",
" 0.477960 | \n",
" 0.816514 | \n",
" 0.091509 | \n",
" 0.299107 | \n",
" 0.347233 | \n",
" 0.796654 | \n",
" 0.531263 | \n",
" 0.103047 | \n",
" 0.440 | \n",
" 0.793254 | \n",
" 0.543840 | \n",
" 0.905 | \n",
" 0.272459 | \n",
" 0.280294 | \n",
" 0.439624 | \n",
" 0.711918 | \n",
" 31.0 | \n",
" 24.0 | \n",
" 19.0 | \n",
" 25.0 | \n",
" 21.0 | \n",
" 24.0 | \n",
" 19.0 | \n",
" 7.0 | \n",
" 15.0 | \n",
" 14.0 | \n",
" 21.0 | \n",
" 12.0 | \n",
" 26.0 | \n",
" 22.0 | \n",
" 28.0 | \n",
" 14.0 | \n",
" 17.0 | \n",
" 30.0 | \n",
" 33.0 | \n",
" 24.0 | \n",
" 0.4143 | \n",
" 28 | \n",
" 0.7153 | \n",
" 21 | \n",
" 0.2864 | \n",
" 26 | \n",
" 0.4813 | \n",
"
\n",
" \n",
" | 27 | \n",
" 28 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
" 5h 43m | \n",
" 20637.0 | \n",
" 6h 29m | \n",
" 7.6B | \n",
" 7615616512 | \n",
" 6h 28m | \n",
" 23311.022941 | \n",
" 3 | \n",
" [] | \n",
" 0.445 | \n",
" 0.418 | \n",
" 0.410000 | \n",
" 0.437713 | \n",
" 0.556904 | \n",
" 0.778287 | \n",
" 0.041198 | \n",
" 0.334821 | \n",
" 0.786202 | \n",
" 0.602569 | \n",
" 0.526350 | \n",
" 0.032133 | \n",
" 0.360 | \n",
" 0.716540 | \n",
" 0.520959 | \n",
" 0.918 | \n",
" 0.059240 | \n",
" 0.288862 | \n",
" 0.456319 | \n",
" 0.599053 | \n",
" 10.0 | \n",
" 19.0 | \n",
" 28.0 | \n",
" 23.0 | \n",
" 23.0 | \n",
" 25.0 | \n",
" 10.0 | \n",
" 34.0 | \n",
" 33.0 | \n",
" 33.0 | \n",
" 28.0 | \n",
" 27.0 | \n",
" 19.0 | \n",
" 24.0 | \n",
" 29.0 | \n",
" 28.0 | \n",
" 35.0 | \n",
" 28.0 | \n",
" 29.0 | \n",
" 34.0 | \n",
" 0.4841 | \n",
" 21 | \n",
" 0.6422 | \n",
" 34 | \n",
" 0.2340 | \n",
" 35 | \n",
" 0.4644 | \n",
"
\n",
" \n",
" | 28 | \n",
" 29 | \n",
" Qwen_Qwen2.5-1.5B-Instruct | \n",
" 2h 36m | \n",
" 9398.4 | \n",
" 3h 21m | \n",
" 1.5B | \n",
" 1543714304 | \n",
" 3h 20m | \n",
" 12036.565195 | \n",
" 6 | \n",
" [] | \n",
" 0.448 | \n",
" 0.392 | \n",
" 0.431667 | \n",
" 0.468430 | \n",
" 0.369221 | \n",
" 0.781346 | \n",
" 0.039052 | \n",
" 0.283482 | \n",
" 0.319181 | \n",
" 0.682932 | \n",
" 0.600555 | \n",
" 0.041551 | \n",
" 0.406 | \n",
" 0.758433 | \n",
" 0.566722 | \n",
" 0.939 | \n",
" 0.282601 | \n",
" 0.312118 | \n",
" 0.465748 | \n",
" 0.627466 | \n",
" 33.0 | \n",
" 37.0 | \n",
" 23.0 | \n",
" 21.0 | \n",
" 28.0 | \n",
" 20.0 | \n",
" 24.0 | \n",
" 30.0 | \n",
" 24.0 | \n",
" 30.0 | \n",
" 27.0 | \n",
" 22.0 | \n",
" 11.0 | \n",
" 15.0 | \n",
" 21.0 | \n",
" 24.0 | \n",
" 36.0 | \n",
" 21.0 | \n",
" 27.0 | \n",
" 22.0 | \n",
" 0.3874 | \n",
" 31 | \n",
" 0.6803 | \n",
" 29 | \n",
" 0.2903 | \n",
" 25 | \n",
" 0.4608 | \n",
"
\n",
" \n",
" | 29 | \n",
" 30 | \n",
" Qwen_Qwen3-1.7B | \n",
" 3h 36m | \n",
" 13010.4 | \n",
" 4h 26m | \n",
" 1.7B | \n",
" 1720574976 | \n",
" 4h 25m | \n",
" 15915.268575 | \n",
" 6 | \n",
" [] | \n",
" 0.410 | \n",
" 0.404 | \n",
" 0.434167 | \n",
" 0.434300 | \n",
" 0.482568 | \n",
" 0.776453 | \n",
" 0.075260 | \n",
" 0.290179 | \n",
" 0.689917 | \n",
" 0.603764 | \n",
" 0.553767 | \n",
" 0.022161 | \n",
" 0.376 | \n",
" 0.720348 | \n",
" 0.510525 | \n",
" 0.914 | \n",
" 0.134975 | \n",
" 0.294982 | \n",
" 0.458812 | \n",
" 0.608524 | \n",
" 20.0 | \n",
" 23.0 | \n",
" 29.0 | \n",
" 28.0 | \n",
" 27.0 | \n",
" 19.0 | \n",
" 22.0 | \n",
" 33.0 | \n",
" 32.0 | \n",
" 31.0 | \n",
" 30.0 | \n",
" 26.0 | \n",
" 23.0 | \n",
" 27.0 | \n",
" 25.0 | \n",
" 30.0 | \n",
" 23.0 | \n",
" 25.0 | \n",
" 28.0 | \n",
" 32.0 | \n",
" 0.4493 | \n",
" 25 | \n",
" 0.6442 | \n",
" 33 | \n",
" 0.2567 | \n",
" 34 | \n",
" 0.4597 | \n",
"
\n",
" \n",
" | 30 | \n",
" 31 | \n",
" Qwen_Qwen2.5-Math-7B-Instruct | \n",
" 4h 57m | \n",
" 17861.4 | \n",
" 5h 38m | \n",
" 7.6B | \n",
" 7615616512 | \n",
" 5h 37m | \n",
" 20230.489569 | \n",
" auto | \n",
" [4] | \n",
" 0.431 | \n",
" 0.415 | \n",
" 0.429167 | \n",
" 0.430887 | \n",
" 0.614038 | \n",
" 0.606116 | \n",
" 0.027299 | \n",
" 0.287946 | \n",
" 0.890068 | \n",
" 0.588130 | \n",
" 0.537245 | \n",
" 0.019945 | \n",
" 0.334 | \n",
" 0.685528 | \n",
" 0.677467 | \n",
" 0.858 | \n",
" 0.007468 | \n",
" 0.298654 | \n",
" 0.475035 | \n",
" 0.579321 | \n",
" 2.0 | \n",
" 11.0 | \n",
" 30.0 | \n",
" 24.0 | \n",
" 24.0 | \n",
" 22.0 | \n",
" 23.0 | \n",
" 35.0 | \n",
" 35.0 | \n",
" 35.0 | \n",
" 39.0 | \n",
" 30.0 | \n",
" 32.0 | \n",
" 12.0 | \n",
" 26.0 | \n",
" 32.0 | \n",
" 40.0 | \n",
" 24.0 | \n",
" 25.0 | \n",
" 40.0 | \n",
" 0.4997 | \n",
" 20 | \n",
" 0.6184 | \n",
" 37 | \n",
" 0.2276 | \n",
" 36 | \n",
" 0.4596 | \n",
"
\n",
" \n",
" | 31 | \n",
" 32 | \n",
" meta-llama_Llama-2-7b-chat-hf | \n",
" 6h 7m | \n",
" 22072.8 | \n",
" 6h 59m | \n",
" 6.7B | \n",
" 6738415616 | \n",
" 6h 57m | \n",
" 25079.294749 | \n",
" auto | \n",
" [4] | \n",
" 0.417 | \n",
" 0.410 | \n",
" 0.407500 | \n",
" 0.442833 | \n",
" 0.401321 | \n",
" 0.797859 | \n",
" 0.117497 | \n",
" 0.261161 | \n",
" 0.231994 | \n",
" 0.754830 | \n",
" 0.463609 | \n",
" 0.066759 | \n",
" 0.438 | \n",
" 0.771491 | \n",
" 0.580084 | \n",
" 0.878 | \n",
" 0.190370 | \n",
" 0.302326 | \n",
" 0.453217 | \n",
" 0.664562 | \n",
" 35.0 | \n",
" 33.0 | \n",
" 27.0 | \n",
" 27.0 | \n",
" 25.0 | \n",
" 26.0 | \n",
" 31.0 | \n",
" 21.0 | \n",
" 23.0 | \n",
" 26.0 | \n",
" 25.0 | \n",
" 13.0 | \n",
" 31.0 | \n",
" 14.0 | \n",
" 33.0 | \n",
" 17.0 | \n",
" 11.0 | \n",
" 22.0 | \n",
" 30.0 | \n",
" 29.0 | \n",
" 0.3674 | \n",
" 35 | \n",
" 0.6978 | \n",
" 25 | \n",
" 0.2656 | \n",
" 32 | \n",
" 0.4525 | \n",
"
\n",
" \n",
" | 32 | \n",
" 33 | \n",
" meta-llama_Llama-2-7b-hf | \n",
" 4h 59m | \n",
" 17980.2 | \n",
" 5h 43m | \n",
" 6.7B | \n",
" 6738415616 | \n",
" 5h 42m | \n",
" 20539.258032 | \n",
" auto | \n",
" [4] | \n",
" 0.364 | \n",
" 0.372 | \n",
" 0.375833 | \n",
" 0.462457 | \n",
" 0.399017 | \n",
" 0.777370 | \n",
" 0.036335 | \n",
" 0.241071 | \n",
" 0.137983 | \n",
" 0.760008 | \n",
" 0.418530 | \n",
" 0.188920 | \n",
" 0.442 | \n",
" 0.790533 | \n",
" 0.499176 | \n",
" 0.910 | \n",
" 0.525078 | \n",
" 0.252142 | \n",
" 0.389716 | \n",
" 0.689818 | \n",
" 40.0 | \n",
" 34.0 | \n",
" 24.0 | \n",
" 33.0 | \n",
" 31.0 | \n",
" 33.0 | \n",
" 34.0 | \n",
" 18.0 | \n",
" 16.0 | \n",
" 21.0 | \n",
" 29.0 | \n",
" 11.0 | \n",
" 25.0 | \n",
" 31.0 | \n",
" 37.0 | \n",
" 2.0 | \n",
" 37.0 | \n",
" 34.0 | \n",
" 38.0 | \n",
" 6.0 | \n",
" 0.3361 | \n",
" 39 | \n",
" 0.6956 | \n",
" 26 | \n",
" 0.3018 | \n",
" 22 | \n",
" 0.4516 | \n",
"
\n",
" \n",
" | 33 | \n",
" 34 | \n",
" deepseek-ai_deepseek-llm-7b-base | \n",
" 6h 26m | \n",
" 23180.4 | \n",
" 7h 12m | \n",
" 6.9B | \n",
" 6910365696 | \n",
" 7h 11m | \n",
" 25877.186720 | \n",
" 3 | \n",
" [] | \n",
" 0.340 | \n",
" 0.363 | \n",
" 0.377500 | \n",
" 0.445392 | \n",
" 0.423744 | \n",
" 0.723547 | \n",
" 0.042181 | \n",
" 0.252232 | \n",
" 0.162244 | \n",
" 0.760605 | \n",
" 0.442814 | \n",
" 0.150970 | \n",
" 0.434 | \n",
" 0.797606 | \n",
" 0.495881 | \n",
" 0.915 | \n",
" 0.500390 | \n",
" 0.232558 | \n",
" 0.349214 | \n",
" 0.693765 | \n",
" 38.0 | \n",
" 30.0 | \n",
" 26.0 | \n",
" 37.0 | \n",
" 32.0 | \n",
" 32.0 | \n",
" 33.0 | \n",
" 17.0 | \n",
" 13.0 | \n",
" 19.0 | \n",
" 34.0 | \n",
" 15.0 | \n",
" 22.0 | \n",
" 37.0 | \n",
" 36.0 | \n",
" 11.0 | \n",
" 34.0 | \n",
" 36.0 | \n",
" 41.0 | \n",
" 9.0 | \n",
" 0.3377 | \n",
" 38 | \n",
" 0.6886 | \n",
" 28 | \n",
" 0.2864 | \n",
" 27 | \n",
" 0.4451 | \n",
"
\n",
" \n",
" | 34 | \n",
" 35 | \n",
" deepseek-ai_deepseek-math-7b-rl | \n",
" 7h 12m | \n",
" 25973.4 | \n",
" 8h 3m | \n",
" 6.9B | \n",
" 6910365696 | \n",
" 8h 2m | \n",
" 28925.110783 | \n",
" 3 | \n",
" [] | \n",
" 0.368 | \n",
" 0.389 | \n",
" 0.405000 | \n",
" 0.489761 | \n",
" 0.524651 | \n",
" 0.755963 | \n",
" 0.119027 | \n",
" 0.272321 | \n",
" 0.142532 | \n",
" 0.689604 | \n",
" 0.524996 | \n",
" 0.039335 | \n",
" 0.424 | \n",
" 0.750272 | \n",
" 0.498993 | \n",
" 0.928 | \n",
" 0.174654 | \n",
" 0.287638 | \n",
" 0.402884 | \n",
" 0.651144 | \n",
" 39.0 | \n",
" 22.0 | \n",
" 21.0 | \n",
" 32.0 | \n",
" 30.0 | \n",
" 28.0 | \n",
" 27.0 | \n",
" 28.0 | \n",
" 28.0 | \n",
" 28.0 | \n",
" 32.0 | \n",
" 18.0 | \n",
" 18.0 | \n",
" 32.0 | \n",
" 30.0 | \n",
" 26.0 | \n",
" 10.0 | \n",
" 29.0 | \n",
" 37.0 | \n",
" 31.0 | \n",
" 0.3702 | \n",
" 34 | \n",
" 0.6711 | \n",
" 31 | \n",
" 0.2581 | \n",
" 33 | \n",
" 0.4419 | \n",
"
\n",
" \n",
" | 35 | \n",
" 36 | \n",
" meta-llama_Llama-3.2-1B-Instruct | \n",
" 2h 35m | \n",
" 9307.8 | \n",
" 3h 32m | \n",
" 1.2B | \n",
" 1235814400 | \n",
" 3h 30m | \n",
" 12653.736082 | \n",
" auto | \n",
" [2] | \n",
" 0.338 | \n",
" 0.334 | \n",
" 0.372500 | \n",
" 0.380546 | \n",
" 0.378129 | \n",
" 0.694801 | \n",
" 0.163484 | \n",
" 0.274554 | \n",
" 0.337377 | \n",
" 0.608843 | \n",
" 0.458909 | \n",
" 0.056510 | \n",
" 0.346 | \n",
" 0.742111 | \n",
" 0.494600 | \n",
" 0.897 | \n",
" 0.249944 | \n",
" 0.271726 | \n",
" 0.438300 | \n",
" 0.601421 | \n",
" 32.0 | \n",
" 36.0 | \n",
" 32.0 | \n",
" 38.0 | \n",
" 37.0 | \n",
" 34.0 | \n",
" 26.0 | \n",
" 32.0 | \n",
" 30.0 | \n",
" 32.0 | \n",
" 35.0 | \n",
" 29.0 | \n",
" 28.0 | \n",
" 40.0 | \n",
" 34.0 | \n",
" 21.0 | \n",
" 6.0 | \n",
" 31.0 | \n",
" 34.0 | \n",
" 25.0 | \n",
" 0.3450 | \n",
" 37 | \n",
" 0.6264 | \n",
" 36 | \n",
" 0.2731 | \n",
" 31 | \n",
" 0.4219 | \n",
"
\n",
" \n",
" | 36 | \n",
" 37 | \n",
" google_gemma-3-1b-it | \n",
" 4h 52m | \n",
" 17533.8 | \n",
" 6h 51m | \n",
" 999.9M | \n",
" 999885952 | \n",
" 6h 50m | \n",
" 24641.929494 | \n",
" auto | \n",
" [1] | \n",
" 0.332 | \n",
" 0.354 | \n",
" 0.356667 | \n",
" 0.380546 | \n",
" 0.382276 | \n",
" 0.758104 | \n",
" 0.076157 | \n",
" 0.265625 | \n",
" 0.247157 | \n",
" 0.578271 | \n",
" 0.385914 | \n",
" 0.035734 | \n",
" 0.388 | \n",
" 0.720892 | \n",
" 0.494051 | \n",
" 0.858 | \n",
" 0.189701 | \n",
" 0.246022 | \n",
" 0.387463 | \n",
" 0.589582 | \n",
" 34.0 | \n",
" 35.0 | \n",
" 32.0 | \n",
" 39.0 | \n",
" 34.0 | \n",
" 36.0 | \n",
" 30.0 | \n",
" 36.0 | \n",
" 31.0 | \n",
" 34.0 | \n",
" 31.0 | \n",
" 25.0 | \n",
" 32.0 | \n",
" 41.0 | \n",
" 39.0 | \n",
" 27.0 | \n",
" 22.0 | \n",
" 35.0 | \n",
" 39.0 | \n",
" 30.0 | \n",
" 0.3312 | \n",
" 40 | \n",
" 0.6267 | \n",
" 35 | \n",
" 0.2202 | \n",
" 38 | \n",
" 0.4013 | \n",
"
\n",
" \n",
" | 37 | \n",
" 38 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
" 2h 52m | \n",
" 10353.6 | \n",
" 3h 42m | \n",
" 1.8B | \n",
" 1777088000 | \n",
" 3h 40m | \n",
" 13254.913052 | \n",
" 6 | \n",
" [] | \n",
" 0.356 | \n",
" 0.362 | \n",
" 0.362500 | \n",
" 0.346416 | \n",
" 0.405928 | \n",
" 0.680122 | \n",
" 0.050686 | \n",
" 0.272321 | \n",
" 0.701289 | \n",
" 0.446724 | \n",
" 0.360632 | \n",
" 0.006371 | \n",
" 0.308 | \n",
" 0.657780 | \n",
" 0.505400 | \n",
" 0.845 | \n",
" 0.009028 | \n",
" 0.293758 | \n",
" 0.451742 | \n",
" 0.549329 | \n",
" 19.0 | \n",
" 32.0 | \n",
" 34.0 | \n",
" 34.0 | \n",
" 33.0 | \n",
" 35.0 | \n",
" 27.0 | \n",
" 39.0 | \n",
" 37.0 | \n",
" 38.0 | \n",
" 36.0 | \n",
" 32.0 | \n",
" 33.0 | \n",
" 29.0 | \n",
" 41.0 | \n",
" 38.0 | \n",
" 32.0 | \n",
" 26.0 | \n",
" 31.0 | \n",
" 38.0 | \n",
" 0.4009 | \n",
" 30 | \n",
" 0.5703 | \n",
" 39 | \n",
" 0.1954 | \n",
" 41 | \n",
" 0.3986 | \n",
"
\n",
" \n",
" | 38 | \n",
" 39 | \n",
" Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
" 2h 39m | \n",
" 9542.4 | \n",
" 3h 26m | \n",
" 1.5B | \n",
" 1543714304 | \n",
" 3h 25m | \n",
" 12324.098490 | \n",
" auto:4 | \n",
" [6, 64, 64, 64, 64] | \n",
" 0.342 | \n",
" 0.341 | \n",
" 0.353333 | \n",
" 0.365188 | \n",
" 0.437260 | \n",
" 0.569419 | \n",
" 0.023086 | \n",
" 0.283482 | \n",
" 0.736922 | \n",
" 0.416550 | \n",
" 0.378792 | \n",
" 0.003878 | \n",
" 0.286 | \n",
" 0.613711 | \n",
" 0.497346 | \n",
" 0.718 | \n",
" 0.004291 | \n",
" 0.290086 | \n",
" 0.489501 | \n",
" 0.525651 | \n",
" 17.0 | \n",
" 28.0 | \n",
" 33.0 | \n",
" 36.0 | \n",
" 36.0 | \n",
" 37.0 | \n",
" 24.0 | \n",
" 40.0 | \n",
" 38.0 | \n",
" 39.0 | \n",
" 40.0 | \n",
" 33.0 | \n",
" 35.0 | \n",
" 34.0 | \n",
" 40.0 | \n",
" 39.0 | \n",
" 41.0 | \n",
" 27.0 | \n",
" 21.0 | \n",
" 41.0 | \n",
" 0.4085 | \n",
" 29 | \n",
" 0.5181 | \n",
" 41 | \n",
" 0.1983 | \n",
" 40 | \n",
" 0.3838 | \n",
"
\n",
" \n",
" | 39 | \n",
" 40 | \n",
" Qwen_Qwen3-0.6B | \n",
" 2h 53m | \n",
" 10404.6 | \n",
" 3h 46m | \n",
" 596.0M | \n",
" 596049920 | \n",
" 3h 45m | \n",
" 13547.446141 | \n",
" 6 | \n",
" [] | \n",
" 0.343 | \n",
" 0.319 | \n",
" 0.344167 | \n",
" 0.342150 | \n",
" 0.414836 | \n",
" 0.639144 | \n",
" 0.060544 | \n",
" 0.270089 | \n",
" 0.412434 | \n",
" 0.471918 | \n",
" 0.401296 | \n",
" 0.020499 | \n",
" 0.320 | \n",
" 0.675190 | \n",
" 0.496064 | \n",
" 0.833 | \n",
" 0.019282 | \n",
" 0.270502 | \n",
" 0.427742 | \n",
" 0.551697 | \n",
" 30.0 | \n",
" 31.0 | \n",
" 35.0 | \n",
" 35.0 | \n",
" 38.0 | \n",
" 39.0 | \n",
" 28.0 | \n",
" 38.0 | \n",
" 36.0 | \n",
" 37.0 | \n",
" 38.0 | \n",
" 31.0 | \n",
" 34.0 | \n",
" 36.0 | \n",
" 38.0 | \n",
" 31.0 | \n",
" 28.0 | \n",
" 32.0 | \n",
" 35.0 | \n",
" 37.0 | \n",
" 0.3494 | \n",
" 36 | \n",
" 0.5696 | \n",
" 40 | \n",
" 0.2000 | \n",
" 39 | \n",
" 0.3816 | \n",
"
\n",
" \n",
" | 40 | \n",
" 41 | \n",
" Qwen_Qwen2.5-0.5B-Instruct | \n",
" 1h 48m | \n",
" 6532.8 | \n",
" 2h 35m | \n",
" 494.0M | \n",
" 494032768 | \n",
" 2h 34m | \n",
" 9253.074769 | \n",
" 6 | \n",
" [] | \n",
" 0.324 | \n",
" 0.342 | \n",
" 0.347500 | \n",
" 0.337031 | \n",
" 0.213792 | \n",
" 0.676758 | \n",
" 0.028644 | \n",
" 0.267857 | \n",
" 0.207733 | \n",
" 0.524099 | \n",
" 0.457556 | \n",
" 0.020499 | \n",
" 0.346 | \n",
" 0.704026 | \n",
" 0.536884 | \n",
" 0.883 | \n",
" 0.134195 | \n",
" 0.271726 | \n",
" 0.418387 | \n",
" 0.556433 | \n",
" 37.0 | \n",
" 40.0 | \n",
" 36.0 | \n",
" 40.0 | \n",
" 35.0 | \n",
" 38.0 | \n",
" 29.0 | \n",
" 37.0 | \n",
" 34.0 | \n",
" 36.0 | \n",
" 37.0 | \n",
" 29.0 | \n",
" 30.0 | \n",
" 23.0 | \n",
" 35.0 | \n",
" 31.0 | \n",
" 39.0 | \n",
" 31.0 | \n",
" 36.0 | \n",
" 33.0 | \n",
" 0.2914 | \n",
" 41 | \n",
" 0.6039 | \n",
" 38 | \n",
" 0.2218 | \n",
" 37 | \n",
" 0.3799 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Overall Rank Model Name GPU Util Time \\\n",
"0 1 google_gemma-3-12b-it 14h 8m \n",
"1 2 Qwen_Qwen3-14B (8bit) 17h 29m \n",
"2 3 openchat_openchat-3.6-8b-20240522 6h 59m \n",
"3 4 Qwen_Qwen3-8B 13h 44m \n",
"4 5 Qwen_Qwen2.5-7B-Instruct 8h 33m \n",
"5 6 Qwen_Qwen2.5-14B-Instruct (8bit) 29h 32m \n",
"6 7 01-ai_Yi-1.5-9B 10h 26m \n",
"7 8 Qwen_Qwen2.5-7B-Instruct-1M 10h 10m \n",
"8 9 meta-llama_Llama-3.1-8B-Instruct 10h 52m \n",
"9 10 01-ai_Yi-1.5-9B-Chat 12h 15m \n",
"10 11 mistralai_Ministral-8B-Instruct-2410 9h 27m \n",
"11 12 meta-llama_Meta-Llama-3-8B-Instruct 5h 46m \n",
"12 13 Qwen_Qwen3-4B 5h 3m \n",
"13 14 NousResearch_Hermes-2-Pro-Mistral-7B 7h 28m \n",
"14 15 mistralai_Mistral-7B-Instruct-v0.3 7h 41m \n",
"15 16 google_gemma-3-4b-it 3h 50m \n",
"16 17 01-ai_Yi-1.5-6B-Chat 7h 1m \n",
"17 18 01-ai_Yi-1.5-6B 3h 54m \n",
"18 19 Qwen_Qwen2-7B-Instruct 10h 11m \n",
"19 20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 15h 30m \n",
"20 21 meta-llama_Llama-3.2-3B-Instruct 5h 57m \n",
"21 22 Qwen_Qwen2.5-3B-Instruct 6h 30m \n",
"22 23 Qwen_Qwen2.5-Math-7B 24h 38m \n",
"23 24 deepseek-ai_deepseek-llm-7b-chat 9h 8m \n",
"24 25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 10h 36m \n",
"25 26 meta-llama_Llama-2-13b-hf 17h 38m \n",
"26 27 meta-llama_Llama-2-13b-chat-hf 15h 37m \n",
"27 28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 5h 43m \n",
"28 29 Qwen_Qwen2.5-1.5B-Instruct 2h 36m \n",
"29 30 Qwen_Qwen3-1.7B 3h 36m \n",
"30 31 Qwen_Qwen2.5-Math-7B-Instruct 4h 57m \n",
"31 32 meta-llama_Llama-2-7b-chat-hf 6h 7m \n",
"32 33 meta-llama_Llama-2-7b-hf 4h 59m \n",
"33 34 deepseek-ai_deepseek-llm-7b-base 6h 26m \n",
"34 35 deepseek-ai_deepseek-math-7b-rl 7h 12m \n",
"35 36 meta-llama_Llama-3.2-1B-Instruct 2h 35m \n",
"36 37 google_gemma-3-1b-it 4h 52m \n",
"37 38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 2h 52m \n",
"38 39 Qwen_Qwen2.5-Math-1.5B-Instruct 2h 39m \n",
"39 40 Qwen_Qwen3-0.6B 2h 53m \n",
"40 41 Qwen_Qwen2.5-0.5B-Instruct 1h 48m \n",
"\n",
" gpu_util_time_raw full_time_from_gpu_log Parameters parameters_raw \\\n",
"0 50906.4 15h 47m 12.2B 12187325040 \n",
"1 62956.2 29h 46m 14.8B 14768307200 \n",
"2 25150.8 7h 52m 8.0B 8030261248 \n",
"3 49497.0 15h 33m 8.2B 8190735360 \n",
"4 30831.6 9h 38m 7.6B 7615616512 \n",
"5 106374.6 52h 45m 14.8B 14770033664 \n",
"6 37569.6 11h 44m 8.8B 8829407232 \n",
"7 36621.0 11h 18m 7.6B 7615616512 \n",
"8 39147.6 12h 20m 8.0B 8030261248 \n",
"9 44120.4 13h 55m 8.8B 8829407232 \n",
"10 34053.6 10h 47m 8.0B 8019808256 \n",
"11 20809.8 6h 31m 8.0B 8030261248 \n",
"12 18234.6 5h 52m 4.0B 4022468096 \n",
"13 26916.0 8h 28m 7.2B 7241994240 \n",
"14 27676.8 8h 39m 7.2B 7248023552 \n",
"15 13811.4 4h 52m 4.3B 4300079472 \n",
"16 25318.8 8h 5m 6.1B 6061035520 \n",
"17 14091.6 4h 29m 6.1B 6061035520 \n",
"18 36684.6 11h 31m 7.6B 7615616512 \n",
"19 55855.2 17h 59m 8.2B 8190735360 \n",
"20 21477.0 7h 13m 3.2B 3212749824 \n",
"21 23452.2 7h 49m 3.1B 3085938688 \n",
"22 88696.2 27h 23m 7.6B 7615616512 \n",
"23 32906.4 10h 8m 6.9B 6910365696 \n",
"24 38179.2 11h 47m 8.0B 8030261248 \n",
"25 63506.4 19h 22m 13.0B 13015864320 \n",
"26 56271.6 17h 9m 13.0B 13015864320 \n",
"27 20637.0 6h 29m 7.6B 7615616512 \n",
"28 9398.4 3h 21m 1.5B 1543714304 \n",
"29 13010.4 4h 26m 1.7B 1720574976 \n",
"30 17861.4 5h 38m 7.6B 7615616512 \n",
"31 22072.8 6h 59m 6.7B 6738415616 \n",
"32 17980.2 5h 43m 6.7B 6738415616 \n",
"33 23180.4 7h 12m 6.9B 6910365696 \n",
"34 25973.4 8h 3m 6.9B 6910365696 \n",
"35 9307.8 3h 32m 1.2B 1235814400 \n",
"36 17533.8 6h 51m 999.9M 999885952 \n",
"37 10353.6 3h 42m 1.8B 1777088000 \n",
"38 9542.4 3h 26m 1.5B 1543714304 \n",
"39 10404.6 3h 46m 596.0M 596049920 \n",
"40 6532.8 2h 35m 494.0M 494032768 \n",
"\n",
" Total Time total_time_raw batch_size batch_sizes \\\n",
"0 15h 45m 56750.865892 auto [2] \n",
"1 29h 45m 107151.802065 1 [] \n",
"2 7h 51m 28278.859470 3 [] \n",
"3 15h 31m 55918.467860 auto [1] \n",
"4 9h 36m 34616.604248 3 [] \n",
"5 52h 44m 189869.409404 1 [] \n",
"6 11h 43m 42212.112622 2 [] \n",
"7 11h 17m 40632.813397 auto [1] \n",
"8 12h 19m 44363.249360 auto [1] \n",
"9 13h 54m 50056.331345 2 [] \n",
"10 10h 46m 38770.339256 auto [1] \n",
"11 6h 30m 23440.234421 3 [] \n",
"12 5h 51m 21077.943646 6 [] \n",
"13 8h 27m 30434.329021 3 [] \n",
"14 8h 38m 31084.838324 3 [] \n",
"15 4h 51m 17460.233507 auto [4] \n",
"16 8h 4m 29040.429802 2 [] \n",
"17 4h 28m 16094.199661 auto [8] \n",
"18 11h 30m 41431.857967 auto [1] \n",
"19 17h 57m 64675.539163 auto [1] \n",
"20 7h 12m 25939.885959 auto [2] \n",
"21 7h 48m 28089.516568 auto:4 [2, 64, 64, 64, 64] \n",
"22 27h 21m 98517.403245 auto [4] \n",
"23 10h 6m 36412.969244 3 [] \n",
"24 11h 46m 42405.489811 auto:5 [1, 64, 64, 64, 64, 64] \n",
"25 19h 21m 69687.765642 auto [1] \n",
"26 17h 8m 61732.053618 auto [1] \n",
"27 6h 28m 23311.022941 3 [] \n",
"28 3h 20m 12036.565195 6 [] \n",
"29 4h 25m 15915.268575 6 [] \n",
"30 5h 37m 20230.489569 auto [4] \n",
"31 6h 57m 25079.294749 auto [4] \n",
"32 5h 42m 20539.258032 auto [4] \n",
"33 7h 11m 25877.186720 3 [] \n",
"34 8h 2m 28925.110783 3 [] \n",
"35 3h 30m 12653.736082 auto [2] \n",
"36 6h 50m 24641.929494 auto [1] \n",
"37 3h 40m 13254.913052 6 [] \n",
"38 3h 25m 12324.098490 auto:4 [6, 64, 64, 64, 64] \n",
"39 3h 45m 13547.446141 6 [] \n",
"40 2h 34m 9253.074769 6 [] \n",
"\n",
" anli_r1(acc,none) anli_r2(acc,none) anli_r3(acc,none) \\\n",
"0 0.603 0.560 0.595833 \n",
"1 0.646 0.570 0.556667 \n",
"2 0.556 0.513 0.480000 \n",
"3 0.669 0.542 0.555833 \n",
"4 0.685 0.549 0.552500 \n",
"5 0.721 0.634 0.617500 \n",
"6 0.532 0.480 0.439167 \n",
"7 0.585 0.533 0.556667 \n",
"8 0.482 0.467 0.443333 \n",
"9 0.535 0.509 0.525833 \n",
"10 0.488 0.487 0.465833 \n",
"11 0.484 0.458 0.448333 \n",
"12 0.550 0.461 0.513333 \n",
"13 0.531 0.496 0.500000 \n",
"14 0.476 0.443 0.448333 \n",
"15 0.492 0.471 0.468333 \n",
"16 0.477 0.453 0.460000 \n",
"17 0.448 0.407 0.406667 \n",
"18 0.573 0.525 0.522500 \n",
"19 0.511 0.464 0.476667 \n",
"20 0.447 0.418 0.430833 \n",
"21 0.562 0.466 0.494167 \n",
"22 0.387 0.407 0.382500 \n",
"23 0.423 0.419 0.420833 \n",
"24 0.404 0.410 0.388333 \n",
"25 0.377 0.390 0.385000 \n",
"26 0.430 0.430 0.414167 \n",
"27 0.445 0.418 0.410000 \n",
"28 0.448 0.392 0.431667 \n",
"29 0.410 0.404 0.434167 \n",
"30 0.431 0.415 0.429167 \n",
"31 0.417 0.410 0.407500 \n",
"32 0.364 0.372 0.375833 \n",
"33 0.340 0.363 0.377500 \n",
"34 0.368 0.389 0.405000 \n",
"35 0.338 0.334 0.372500 \n",
"36 0.332 0.354 0.356667 \n",
"37 0.356 0.362 0.362500 \n",
"38 0.342 0.341 0.353333 \n",
"39 0.343 0.319 0.344167 \n",
"40 0.324 0.342 0.347500 \n",
"\n",
" arc_challenge(acc_norm,none) bbh(exact_match,get-answer) \\\n",
"0 0.610922 0.801874 \n",
"1 0.600683 0.432960 \n",
"2 0.603242 0.617877 \n",
"3 0.562287 0.797573 \n",
"4 0.552901 0.448779 \n",
"5 0.615188 0.106896 \n",
"6 0.546928 0.712026 \n",
"7 0.585324 0.277223 \n",
"8 0.550341 0.715558 \n",
"9 0.587031 0.610659 \n",
"10 0.562287 0.692520 \n",
"11 0.563993 0.679005 \n",
"12 0.539249 0.752265 \n",
"13 0.565700 0.573798 \n",
"14 0.589590 0.562586 \n",
"15 0.570819 0.709415 \n",
"16 0.539249 0.547842 \n",
"17 0.496587 0.575488 \n",
"18 0.540102 0.577484 \n",
"19 0.549488 0.584088 \n",
"20 0.459044 0.556443 \n",
"21 0.482082 0.249117 \n",
"22 0.502560 0.672401 \n",
"23 0.496587 0.454769 \n",
"24 0.423208 0.603748 \n",
"25 0.489761 0.477653 \n",
"26 0.501706 0.477960 \n",
"27 0.437713 0.556904 \n",
"28 0.468430 0.369221 \n",
"29 0.434300 0.482568 \n",
"30 0.430887 0.614038 \n",
"31 0.442833 0.401321 \n",
"32 0.462457 0.399017 \n",
"33 0.445392 0.423744 \n",
"34 0.489761 0.524651 \n",
"35 0.380546 0.378129 \n",
"36 0.380546 0.382276 \n",
"37 0.346416 0.405928 \n",
"38 0.365188 0.437260 \n",
"39 0.342150 0.414836 \n",
"40 0.337031 0.213792 \n",
"\n",
" boolq(acc,none) drop(f1,none) gpqa_main_zeroshot(acc_norm,none) \\\n",
"0 0.874618 0.139566 0.337054 \n",
"1 0.891743 0.090410 0.397321 \n",
"2 0.872783 0.251569 0.332589 \n",
"3 0.865749 0.109877 0.350446 \n",
"4 0.863303 0.071089 0.328125 \n",
"5 0.886239 0.071276 0.354911 \n",
"6 0.858104 0.445686 0.294643 \n",
"7 0.852599 0.057047 0.339286 \n",
"8 0.841590 0.193729 0.343750 \n",
"9 0.868196 0.125326 0.303571 \n",
"10 0.860245 0.071413 0.341518 \n",
"11 0.831193 0.163977 0.310268 \n",
"12 0.850459 0.097707 0.325893 \n",
"13 0.868196 0.109754 0.276786 \n",
"14 0.858410 0.089972 0.283482 \n",
"15 0.839755 0.089284 0.287946 \n",
"16 0.847401 0.116081 0.357143 \n",
"17 0.801529 0.399462 0.290179 \n",
"18 0.856269 0.052028 0.314732 \n",
"19 0.848318 0.053279 0.372768 \n",
"20 0.784709 0.155394 0.328125 \n",
"21 0.801223 0.077333 0.321429 \n",
"22 0.745566 0.043235 0.308036 \n",
"23 0.833028 0.103048 0.292411 \n",
"24 0.828746 0.071225 0.274554 \n",
"25 0.806422 0.030132 0.254464 \n",
"26 0.816514 0.091509 0.299107 \n",
"27 0.778287 0.041198 0.334821 \n",
"28 0.781346 0.039052 0.283482 \n",
"29 0.776453 0.075260 0.290179 \n",
"30 0.606116 0.027299 0.287946 \n",
"31 0.797859 0.117497 0.261161 \n",
"32 0.777370 0.036335 0.241071 \n",
"33 0.723547 0.042181 0.252232 \n",
"34 0.755963 0.119027 0.272321 \n",
"35 0.694801 0.163484 0.274554 \n",
"36 0.758104 0.076157 0.265625 \n",
"37 0.680122 0.050686 0.272321 \n",
"38 0.569419 0.023086 0.283482 \n",
"39 0.639144 0.060544 0.270089 \n",
"40 0.676758 0.028644 0.267857 \n",
"\n",
" gsm8k(exact_match,strict-match) hellaswag(acc_norm,none) mmlu(acc,none) \\\n",
"0 0.877180 0.818761 0.716137 \n",
"1 0.898408 0.787692 0.769477 \n",
"2 0.750569 0.797849 0.643071 \n",
"3 0.872631 0.748656 0.728956 \n",
"4 0.762699 0.804919 0.718060 \n",
"5 0.792267 0.841964 0.783079 \n",
"6 0.639121 0.778929 0.689289 \n",
"7 0.795299 0.789982 0.716636 \n",
"8 0.754359 0.792073 0.679319 \n",
"9 0.708112 0.787293 0.684091 \n",
"10 0.774829 0.791077 0.640721 \n",
"11 0.756634 0.759211 0.638727 \n",
"12 0.856710 0.683330 0.683592 \n",
"13 0.685368 0.804919 0.605113 \n",
"14 0.489765 0.828919 0.597137 \n",
"15 0.761941 0.741386 0.575559 \n",
"16 0.670205 0.767477 0.617861 \n",
"17 0.522365 0.754133 0.624270 \n",
"18 0.646702 0.806015 0.699402 \n",
"19 0.812737 0.756423 0.682951 \n",
"20 0.642153 0.705437 0.605184 \n",
"21 0.101592 0.749054 0.654964 \n",
"22 0.847612 0.652858 0.579903 \n",
"23 0.463988 0.777236 0.498789 \n",
"24 0.624716 0.742979 0.532688 \n",
"25 0.229719 0.793866 0.520937 \n",
"26 0.347233 0.796654 0.531263 \n",
"27 0.786202 0.602569 0.526350 \n",
"28 0.319181 0.682932 0.600555 \n",
"29 0.689917 0.603764 0.553767 \n",
"30 0.890068 0.588130 0.537245 \n",
"31 0.231994 0.754830 0.463609 \n",
"32 0.137983 0.760008 0.418530 \n",
"33 0.162244 0.760605 0.442814 \n",
"34 0.142532 0.689604 0.524996 \n",
"35 0.337377 0.608843 0.458909 \n",
"36 0.247157 0.578271 0.385914 \n",
"37 0.701289 0.446724 0.360632 \n",
"38 0.736922 0.416550 0.378792 \n",
"39 0.412434 0.471918 0.401296 \n",
"40 0.207733 0.524099 0.457556 \n",
"\n",
" nq_open(exact_match,remove_whitespace) openbookqa(acc_norm,none) \\\n",
"0 0.157064 0.498 \n",
"1 0.092244 0.460 \n",
"2 0.170637 0.462 \n",
"3 0.073684 0.418 \n",
"4 0.045706 0.486 \n",
"5 0.061496 0.476 \n",
"6 0.153186 0.456 \n",
"7 0.157618 0.480 \n",
"8 0.177562 0.432 \n",
"9 0.009418 0.436 \n",
"10 0.157618 0.466 \n",
"11 0.159003 0.430 \n",
"12 0.014681 0.402 \n",
"13 0.040443 0.434 \n",
"14 0.153740 0.470 \n",
"15 0.109418 0.466 \n",
"16 0.027147 0.436 \n",
"17 0.178116 0.422 \n",
"18 0.013296 0.462 \n",
"19 0.018283 0.430 \n",
"20 0.139058 0.358 \n",
"21 0.008310 0.422 \n",
"22 0.050970 0.392 \n",
"23 0.063435 0.460 \n",
"24 0.058449 0.410 \n",
"25 0.236288 0.452 \n",
"26 0.103047 0.440 \n",
"27 0.032133 0.360 \n",
"28 0.041551 0.406 \n",
"29 0.022161 0.376 \n",
"30 0.019945 0.334 \n",
"31 0.066759 0.438 \n",
"32 0.188920 0.442 \n",
"33 0.150970 0.434 \n",
"34 0.039335 0.424 \n",
"35 0.056510 0.346 \n",
"36 0.035734 0.388 \n",
"37 0.006371 0.308 \n",
"38 0.003878 0.286 \n",
"39 0.020499 0.320 \n",
"40 0.020499 0.346 \n",
"\n",
" piqa(acc_norm,none) qnli(acc,none) sciq(acc_norm,none) \\\n",
"0 0.780740 0.745744 0.954 \n",
"1 0.794886 0.844225 0.966 \n",
"2 0.818281 0.730002 0.964 \n",
"3 0.775299 0.781805 0.958 \n",
"4 0.803047 0.804503 0.937 \n",
"5 0.817193 0.853926 0.929 \n",
"6 0.806311 0.508695 0.952 \n",
"7 0.816104 0.678199 0.950 \n",
"8 0.806311 0.501373 0.962 \n",
"9 0.803591 0.787662 0.954 \n",
"10 0.823177 0.494966 0.956 \n",
"11 0.787269 0.546403 0.932 \n",
"12 0.751360 0.808713 0.932 \n",
"13 0.798694 0.556471 0.917 \n",
"14 0.826986 0.514552 0.943 \n",
"15 0.772035 0.565989 0.931 \n",
"16 0.787813 0.679480 0.934 \n",
"17 0.801415 0.598572 0.941 \n",
"18 0.805767 0.547135 0.916 \n",
"19 0.756801 0.557752 0.941 \n",
"20 0.755169 0.545122 0.932 \n",
"21 0.780740 0.797913 0.913 \n",
"22 0.745375 0.498078 0.929 \n",
"23 0.801415 0.496980 0.893 \n",
"24 0.775843 0.514735 0.899 \n",
"25 0.805223 0.495332 0.935 \n",
"26 0.793254 0.543840 0.905 \n",
"27 0.716540 0.520959 0.918 \n",
"28 0.758433 0.566722 0.939 \n",
"29 0.720348 0.510525 0.914 \n",
"30 0.685528 0.677467 0.858 \n",
"31 0.771491 0.580084 0.878 \n",
"32 0.790533 0.499176 0.910 \n",
"33 0.797606 0.495881 0.915 \n",
"34 0.750272 0.498993 0.928 \n",
"35 0.742111 0.494600 0.897 \n",
"36 0.720892 0.494051 0.858 \n",
"37 0.657780 0.505400 0.845 \n",
"38 0.613711 0.497346 0.718 \n",
"39 0.675190 0.496064 0.833 \n",
"40 0.704026 0.536884 0.883 \n",
"\n",
" triviaqa(exact_match,remove_whitespace) truthfulqa_mc1(acc,none) \\\n",
"0 0.275245 0.405141 \n",
"1 0.407490 0.406365 \n",
"2 0.565927 0.352509 \n",
"3 0.320609 0.363525 \n",
"4 0.325401 0.477356 \n",
"5 0.039289 0.510404 \n",
"6 0.543803 0.321909 \n",
"7 0.420531 0.425949 \n",
"8 0.518168 0.365973 \n",
"9 0.338665 0.374541 \n",
"10 0.527809 0.325581 \n",
"11 0.511202 0.363525 \n",
"12 0.225033 0.367197 \n",
"13 0.471132 0.413709 \n",
"14 0.568324 0.421053 \n",
"15 0.314813 0.348837 \n",
"16 0.330974 0.376989 \n",
"17 0.495207 0.299878 \n",
"18 0.008136 0.405141 \n",
"19 0.029481 0.357405 \n",
"20 0.338943 0.326805 \n",
"21 0.300992 0.416157 \n",
"22 0.218346 0.320685 \n",
"23 0.311190 0.348837 \n",
"24 0.194048 0.321909 \n",
"25 0.608839 0.259486 \n",
"26 0.272459 0.280294 \n",
"27 0.059240 0.288862 \n",
"28 0.282601 0.312118 \n",
"29 0.134975 0.294982 \n",
"30 0.007468 0.298654 \n",
"31 0.190370 0.302326 \n",
"32 0.525078 0.252142 \n",
"33 0.500390 0.232558 \n",
"34 0.174654 0.287638 \n",
"35 0.249944 0.271726 \n",
"36 0.189701 0.246022 \n",
"37 0.009028 0.293758 \n",
"38 0.004291 0.290086 \n",
"39 0.019282 0.270502 \n",
"40 0.134195 0.271726 \n",
"\n",
" truthfulqa_mc2(acc,none) winogrande(acc,none) \\\n",
"0 0.581183 0.744278 \n",
"1 0.589404 0.720600 \n",
"2 0.497601 0.763220 \n",
"3 0.543140 0.680347 \n",
"4 0.648483 0.711918 \n",
"5 0.683015 0.754538 \n",
"6 0.467572 0.726125 \n",
"7 0.600072 0.727703 \n",
"8 0.541154 0.738753 \n",
"9 0.547934 0.746646 \n",
"10 0.486670 0.737964 \n",
"11 0.517142 0.716654 \n",
"12 0.547575 0.658248 \n",
"13 0.591156 0.719811 \n",
"14 0.596813 0.740331 \n",
"15 0.518821 0.700868 \n",
"16 0.534371 0.709550 \n",
"17 0.440750 0.720600 \n",
"18 0.573437 0.698500 \n",
"19 0.559013 0.675612 \n",
"20 0.497579 0.670876 \n",
"21 0.586055 0.692976 \n",
"22 0.483219 0.647987 \n",
"23 0.478933 0.701657 \n",
"24 0.504460 0.677979 \n",
"25 0.368992 0.722178 \n",
"26 0.439624 0.711918 \n",
"27 0.456319 0.599053 \n",
"28 0.465748 0.627466 \n",
"29 0.458812 0.608524 \n",
"30 0.475035 0.579321 \n",
"31 0.453217 0.664562 \n",
"32 0.389716 0.689818 \n",
"33 0.349214 0.693765 \n",
"34 0.402884 0.651144 \n",
"35 0.438300 0.601421 \n",
"36 0.387463 0.589582 \n",
"37 0.451742 0.549329 \n",
"38 0.489501 0.525651 \n",
"39 0.427742 0.551697 \n",
"40 0.418387 0.556433 \n",
"\n",
" gsm8k(exact_match,strict-match)_rank bbh(exact_match,get-answer)_rank \\\n",
"0 3.0 1.0 \n",
"1 1.0 29.0 \n",
"2 16.0 10.0 \n",
"3 4.0 2.0 \n",
"4 12.0 27.0 \n",
"5 9.0 41.0 \n",
"6 25.0 5.0 \n",
"7 8.0 38.0 \n",
"8 15.0 4.0 \n",
"9 18.0 12.0 \n",
"10 11.0 7.0 \n",
"11 14.0 8.0 \n",
"12 5.0 3.0 \n",
"13 21.0 17.0 \n",
"14 28.0 18.0 \n",
"15 13.0 6.0 \n",
"16 22.0 21.0 \n",
"17 27.0 16.0 \n",
"18 23.0 15.0 \n",
"19 7.0 14.0 \n",
"20 24.0 20.0 \n",
"21 41.0 39.0 \n",
"22 6.0 9.0 \n",
"23 29.0 26.0 \n",
"24 26.0 13.0 \n",
"25 36.0 25.0 \n",
"26 31.0 24.0 \n",
"27 10.0 19.0 \n",
"28 33.0 37.0 \n",
"29 20.0 23.0 \n",
"30 2.0 11.0 \n",
"31 35.0 33.0 \n",
"32 40.0 34.0 \n",
"33 38.0 30.0 \n",
"34 39.0 22.0 \n",
"35 32.0 36.0 \n",
"36 34.0 35.0 \n",
"37 19.0 32.0 \n",
"38 17.0 28.0 \n",
"39 30.0 31.0 \n",
"40 37.0 40.0 \n",
"\n",
" arc_challenge(acc_norm,none)_rank anli_r1(acc,none)_rank \\\n",
"0 2.0 5.0 \n",
"1 4.0 4.0 \n",
"2 3.0 9.0 \n",
"3 11.0 3.0 \n",
"4 12.0 2.0 \n",
"5 1.0 1.0 \n",
"6 15.0 12.0 \n",
"7 7.0 6.0 \n",
"8 13.0 18.0 \n",
"9 6.0 11.0 \n",
"10 11.0 16.0 \n",
"11 10.0 17.0 \n",
"12 17.0 10.0 \n",
"13 9.0 13.0 \n",
"14 5.0 20.0 \n",
"15 8.0 15.0 \n",
"16 17.0 19.0 \n",
"17 20.0 21.0 \n",
"18 16.0 7.0 \n",
"19 14.0 14.0 \n",
"20 25.0 22.0 \n",
"21 22.0 8.0 \n",
"22 18.0 30.0 \n",
"23 20.0 26.0 \n",
"24 31.0 29.0 \n",
"25 21.0 31.0 \n",
"26 19.0 25.0 \n",
"27 28.0 23.0 \n",
"28 23.0 21.0 \n",
"29 29.0 28.0 \n",
"30 30.0 24.0 \n",
"31 27.0 27.0 \n",
"32 24.0 33.0 \n",
"33 26.0 37.0 \n",
"34 21.0 32.0 \n",
"35 32.0 38.0 \n",
"36 32.0 39.0 \n",
"37 34.0 34.0 \n",
"38 33.0 36.0 \n",
"39 35.0 35.0 \n",
"40 36.0 40.0 \n",
"\n",
" anli_r2(acc,none)_rank anli_r3(acc,none)_rank \\\n",
"0 3.0 2.0 \n",
"1 2.0 3.0 \n",
"2 8.0 11.0 \n",
"3 5.0 4.0 \n",
"4 4.0 5.0 \n",
"5 1.0 1.0 \n",
"6 12.0 18.0 \n",
"7 6.0 3.0 \n",
"8 14.0 17.0 \n",
"9 9.0 6.0 \n",
"10 11.0 14.0 \n",
"11 18.0 16.0 \n",
"12 17.0 8.0 \n",
"13 10.0 9.0 \n",
"14 20.0 16.0 \n",
"15 13.0 13.0 \n",
"16 19.0 15.0 \n",
"17 26.0 27.0 \n",
"18 7.0 7.0 \n",
"19 16.0 12.0 \n",
"20 23.0 21.0 \n",
"21 15.0 10.0 \n",
"22 26.0 31.0 \n",
"23 22.0 23.0 \n",
"24 25.0 29.0 \n",
"25 29.0 30.0 \n",
"26 21.0 24.0 \n",
"27 23.0 25.0 \n",
"28 28.0 20.0 \n",
"29 27.0 19.0 \n",
"30 24.0 22.0 \n",
"31 25.0 26.0 \n",
"32 31.0 33.0 \n",
"33 32.0 32.0 \n",
"34 30.0 28.0 \n",
"35 37.0 34.0 \n",
"36 34.0 36.0 \n",
"37 33.0 35.0 \n",
"38 36.0 37.0 \n",
"39 38.0 39.0 \n",
"40 35.0 38.0 \n",
"\n",
" gpqa_main_zeroshot(acc_norm,none)_rank hellaswag(acc_norm,none)_rank \\\n",
"0 9.0 3.0 \n",
"1 1.0 12.0 \n",
"2 11.0 6.0 \n",
"3 5.0 24.0 \n",
"4 12.0 5.0 \n",
"5 4.0 1.0 \n",
"6 20.0 14.0 \n",
"7 8.0 11.0 \n",
"8 6.0 9.0 \n",
"9 18.0 13.0 \n",
"10 7.0 10.0 \n",
"11 16.0 19.0 \n",
"12 13.0 29.0 \n",
"13 25.0 5.0 \n",
"14 24.0 2.0 \n",
"15 23.0 26.0 \n",
"16 3.0 16.0 \n",
"17 22.0 22.0 \n",
"18 15.0 4.0 \n",
"19 2.0 20.0 \n",
"20 12.0 27.0 \n",
"21 14.0 23.0 \n",
"22 17.0 31.0 \n",
"23 21.0 15.0 \n",
"24 26.0 25.0 \n",
"25 32.0 8.0 \n",
"26 19.0 7.0 \n",
"27 10.0 34.0 \n",
"28 24.0 30.0 \n",
"29 22.0 33.0 \n",
"30 23.0 35.0 \n",
"31 31.0 21.0 \n",
"32 34.0 18.0 \n",
"33 33.0 17.0 \n",
"34 27.0 28.0 \n",
"35 26.0 32.0 \n",
"36 30.0 36.0 \n",
"37 27.0 39.0 \n",
"38 24.0 40.0 \n",
"39 28.0 38.0 \n",
"40 29.0 37.0 \n",
"\n",
" piqa(acc_norm,none)_rank winogrande(acc,none)_rank boolq(acc,none)_rank \\\n",
"0 19.0 4.0 3.0 \n",
"1 14.0 11.0 1.0 \n",
"2 3.0 1.0 4.0 \n",
"3 21.0 22.0 6.0 \n",
"4 10.0 14.0 7.0 \n",
"5 4.0 2.0 2.0 \n",
"6 6.0 9.0 10.0 \n",
"7 5.0 8.0 12.0 \n",
"8 6.0 6.0 16.0 \n",
"9 9.0 3.0 5.0 \n",
"10 2.0 7.0 8.0 \n",
"11 18.0 13.0 19.0 \n",
"12 27.0 27.0 13.0 \n",
"13 12.0 12.0 5.0 \n",
"14 1.0 5.0 9.0 \n",
"15 22.0 17.0 17.0 \n",
"16 17.0 15.0 15.0 \n",
"17 11.0 11.0 23.0 \n",
"18 7.0 18.0 11.0 \n",
"19 25.0 24.0 14.0 \n",
"20 26.0 25.0 26.0 \n",
"21 19.0 20.0 24.0 \n",
"22 29.0 29.0 33.0 \n",
"23 11.0 16.0 18.0 \n",
"24 20.0 23.0 20.0 \n",
"25 8.0 10.0 22.0 \n",
"26 15.0 14.0 21.0 \n",
"27 33.0 33.0 28.0 \n",
"28 24.0 30.0 27.0 \n",
"29 32.0 31.0 30.0 \n",
"30 35.0 35.0 39.0 \n",
"31 23.0 26.0 25.0 \n",
"32 16.0 21.0 29.0 \n",
"33 13.0 19.0 34.0 \n",
"34 28.0 28.0 32.0 \n",
"35 30.0 32.0 35.0 \n",
"36 31.0 34.0 31.0 \n",
"37 37.0 38.0 36.0 \n",
"38 38.0 39.0 40.0 \n",
"39 36.0 37.0 38.0 \n",
"40 34.0 36.0 37.0 \n",
"\n",
" openbookqa(acc_norm,none)_rank sciq(acc_norm,none)_rank \\\n",
"0 1.0 6.0 \n",
"1 8.0 1.0 \n",
"2 7.0 2.0 \n",
"3 20.0 4.0 \n",
"4 2.0 12.0 \n",
"5 4.0 17.0 \n",
"6 9.0 7.0 \n",
"7 3.0 8.0 \n",
"8 16.0 3.0 \n",
"9 14.0 6.0 \n",
"10 6.0 5.0 \n",
"11 17.0 15.0 \n",
"12 23.0 15.0 \n",
"13 15.0 20.0 \n",
"14 5.0 9.0 \n",
"15 6.0 16.0 \n",
"16 14.0 14.0 \n",
"17 19.0 10.0 \n",
"18 7.0 21.0 \n",
"19 17.0 10.0 \n",
"20 28.0 15.0 \n",
"21 19.0 24.0 \n",
"22 24.0 17.0 \n",
"23 8.0 29.0 \n",
"24 21.0 27.0 \n",
"25 10.0 13.0 \n",
"26 12.0 26.0 \n",
"27 27.0 19.0 \n",
"28 22.0 11.0 \n",
"29 26.0 23.0 \n",
"30 30.0 32.0 \n",
"31 13.0 31.0 \n",
"32 11.0 25.0 \n",
"33 15.0 22.0 \n",
"34 18.0 18.0 \n",
"35 29.0 28.0 \n",
"36 25.0 32.0 \n",
"37 32.0 33.0 \n",
"38 33.0 35.0 \n",
"39 31.0 34.0 \n",
"40 29.0 30.0 \n",
"\n",
" qnli(acc,none)_rank mmlu(acc,none)_rank \\\n",
"0 8.0 6.0 \n",
"1 2.0 2.0 \n",
"2 9.0 14.0 \n",
"3 7.0 3.0 \n",
"4 4.0 4.0 \n",
"5 1.0 1.0 \n",
"6 28.0 8.0 \n",
"7 11.0 5.0 \n",
"8 30.0 12.0 \n",
"9 6.0 9.0 \n",
"10 39.0 15.0 \n",
"11 20.0 16.0 \n",
"12 3.0 10.0 \n",
"13 18.0 20.0 \n",
"14 26.0 22.0 \n",
"15 16.0 24.0 \n",
"16 10.0 18.0 \n",
"17 13.0 17.0 \n",
"18 19.0 7.0 \n",
"19 17.0 11.0 \n",
"20 21.0 19.0 \n",
"21 5.0 13.0 \n",
"22 33.0 23.0 \n",
"23 35.0 32.0 \n",
"24 25.0 27.0 \n",
"25 38.0 31.0 \n",
"26 22.0 28.0 \n",
"27 24.0 29.0 \n",
"28 15.0 21.0 \n",
"29 27.0 25.0 \n",
"30 12.0 26.0 \n",
"31 14.0 33.0 \n",
"32 31.0 37.0 \n",
"33 37.0 36.0 \n",
"34 32.0 30.0 \n",
"35 40.0 34.0 \n",
"36 41.0 39.0 \n",
"37 29.0 41.0 \n",
"38 34.0 40.0 \n",
"39 36.0 38.0 \n",
"40 23.0 35.0 \n",
"\n",
" nq_open(exact_match,remove_whitespace)_rank drop(f1,none)_rank \\\n",
"0 8.0 8.0 \n",
"1 15.0 18.0 \n",
"2 5.0 3.0 \n",
"3 16.0 13.0 \n",
"4 23.0 27.0 \n",
"5 19.0 25.0 \n",
"6 10.0 1.0 \n",
"7 7.0 29.0 \n",
"8 4.0 4.0 \n",
"9 36.0 9.0 \n",
"10 7.0 24.0 \n",
"11 6.0 5.0 \n",
"12 34.0 16.0 \n",
"13 25.0 14.0 \n",
"14 9.0 19.0 \n",
"15 13.0 20.0 \n",
"16 29.0 12.0 \n",
"17 3.0 2.0 \n",
"18 35.0 31.0 \n",
"19 33.0 30.0 \n",
"20 12.0 7.0 \n",
"21 37.0 21.0 \n",
"22 22.0 33.0 \n",
"23 18.0 15.0 \n",
"24 20.0 26.0 \n",
"25 1.0 38.0 \n",
"26 14.0 17.0 \n",
"27 28.0 35.0 \n",
"28 24.0 36.0 \n",
"29 30.0 23.0 \n",
"30 32.0 40.0 \n",
"31 17.0 11.0 \n",
"32 2.0 37.0 \n",
"33 11.0 34.0 \n",
"34 26.0 10.0 \n",
"35 21.0 6.0 \n",
"36 27.0 22.0 \n",
"37 38.0 32.0 \n",
"38 39.0 41.0 \n",
"39 31.0 28.0 \n",
"40 31.0 39.0 \n",
"\n",
" truthfulqa_mc1(acc,none)_rank truthfulqa_mc2(acc,none)_rank \\\n",
"0 8.0 8.0 \n",
"1 7.0 6.0 \n",
"2 15.0 19.0 \n",
"3 13.0 13.0 \n",
"4 2.0 2.0 \n",
"5 1.0 1.0 \n",
"6 19.0 26.0 \n",
"7 3.0 3.0 \n",
"8 12.0 14.0 \n",
"9 10.0 11.0 \n",
"10 18.0 22.0 \n",
"11 13.0 17.0 \n",
"12 11.0 12.0 \n",
"13 6.0 5.0 \n",
"14 4.0 4.0 \n",
"15 16.0 16.0 \n",
"16 9.0 15.0 \n",
"17 23.0 32.0 \n",
"18 8.0 9.0 \n",
"19 14.0 10.0 \n",
"20 17.0 20.0 \n",
"21 5.0 7.0 \n",
"22 20.0 23.0 \n",
"23 16.0 24.0 \n",
"24 19.0 18.0 \n",
"25 33.0 40.0 \n",
"26 30.0 33.0 \n",
"27 28.0 29.0 \n",
"28 21.0 27.0 \n",
"29 25.0 28.0 \n",
"30 24.0 25.0 \n",
"31 22.0 30.0 \n",
"32 34.0 38.0 \n",
"33 36.0 41.0 \n",
"34 29.0 37.0 \n",
"35 31.0 34.0 \n",
"36 35.0 39.0 \n",
"37 26.0 31.0 \n",
"38 27.0 21.0 \n",
"39 32.0 35.0 \n",
"40 31.0 36.0 \n",
"\n",
" triviaqa(exact_match,remove_whitespace)_rank Reasoning & Math Mean Score \\\n",
"0 23.0 0.6266 \n",
"1 13.0 0.5860 \n",
"2 3.0 0.5505 \n",
"3 18.0 0.6214 \n",
"4 17.0 0.5541 \n",
"5 35.0 0.5488 \n",
"6 4.0 0.5206 \n",
"7 12.0 0.5245 \n",
"8 7.0 0.5366 \n",
"9 15.0 0.5399 \n",
"10 5.0 0.5446 \n",
"11 8.0 0.5286 \n",
"12 26.0 0.5712 \n",
"13 11.0 0.5184 \n",
"14 2.0 0.4704 \n",
"15 19.0 0.5374 \n",
"16 16.0 0.5006 \n",
"17 10.0 0.4495 \n",
"18 39.0 0.5285 \n",
"19 36.0 0.5387 \n",
"20 14.0 0.4688 \n",
"21 21.0 0.3823 \n",
"22 27.0 0.5010 \n",
"23 20.0 0.4244 \n",
"24 28.0 0.4469 \n",
"25 1.0 0.3719 \n",
"26 24.0 0.4143 \n",
"27 34.0 0.4841 \n",
"28 22.0 0.3874 \n",
"29 32.0 0.4493 \n",
"30 40.0 0.4997 \n",
"31 29.0 0.3674 \n",
"32 6.0 0.3361 \n",
"33 9.0 0.3377 \n",
"34 31.0 0.3702 \n",
"35 25.0 0.3450 \n",
"36 30.0 0.3312 \n",
"37 38.0 0.4009 \n",
"38 41.0 0.4085 \n",
"39 37.0 0.3494 \n",
"40 33.0 0.2914 \n",
"\n",
" Reasoning & Math Avg. Rank Commonsense & NLI Mean Score \\\n",
"0 1 0.7737 \n",
"1 3 0.7807 \n",
"2 6 0.7726 \n",
"3 2 0.7468 \n",
"4 5 0.7730 \n",
"5 7 0.7941 \n",
"6 16 0.7266 \n",
"7 15 0.7564 \n",
"8 12 0.7249 \n",
"9 9 0.7691 \n",
"10 8 0.7328 \n",
"11 13 0.7147 \n",
"12 4 0.7266 \n",
"13 17 0.7284 \n",
"14 22 0.7403 \n",
"15 11 0.7167 \n",
"16 19 0.7374 \n",
"17 24 0.7199 \n",
"18 14 0.7274 \n",
"19 10 0.7094 \n",
"20 23 0.6788 \n",
"21 32 0.7367 \n",
"22 18 0.6587 \n",
"23 27 0.7090 \n",
"24 26 0.6928 \n",
"25 33 0.7157 \n",
"26 28 0.7153 \n",
"27 21 0.6422 \n",
"28 31 0.6803 \n",
"29 25 0.6442 \n",
"30 20 0.6184 \n",
"31 35 0.6978 \n",
"32 39 0.6956 \n",
"33 38 0.6886 \n",
"34 34 0.6711 \n",
"35 37 0.6264 \n",
"36 40 0.6267 \n",
"37 30 0.5703 \n",
"38 29 0.5181 \n",
"39 36 0.5696 \n",
"40 41 0.6039 \n",
"\n",
" Commonsense & NLI Avg. Rank Knowledge & Reading Mean Score \\\n",
"0 3 0.3791 \n",
"1 2 0.3926 \n",
"2 5 0.4136 \n",
"3 8 0.3566 \n",
"4 4 0.3810 \n",
"5 1 0.3581 \n",
"6 15 0.4369 \n",
"7 7 0.3963 \n",
"8 17 0.4127 \n",
"9 6 0.3467 \n",
"10 12 0.3683 \n",
"11 22 0.3923 \n",
"12 16 0.3226 \n",
"13 13 0.3719 \n",
"14 9 0.4045 \n",
"15 19 0.3261 \n",
"16 10 0.3339 \n",
"17 18 0.4063 \n",
"18 14 0.2919 \n",
"19 23 0.2834 \n",
"20 30 0.3438 \n",
"21 11 0.3406 \n",
"22 32 0.2827 \n",
"23 24 0.3007 \n",
"24 27 0.2805 \n",
"25 20 0.3374 \n",
"26 21 0.2864 \n",
"27 34 0.2340 \n",
"28 29 0.2903 \n",
"29 33 0.2567 \n",
"30 37 0.2276 \n",
"31 25 0.2656 \n",
"32 26 0.3018 \n",
"33 28 0.2864 \n",
"34 31 0.2581 \n",
"35 36 0.2731 \n",
"36 35 0.2202 \n",
"37 39 0.1954 \n",
"38 41 0.1983 \n",
"39 40 0.2000 \n",
"40 38 0.2218 \n",
"\n",
" Knowledge & Reading Avg. Rank Mean Score \n",
"0 10 0.6038 \n",
"1 7 0.5961 \n",
"2 2 0.5871 \n",
"3 14 0.5859 \n",
"4 9 0.5788 \n",
"5 13 0.5775 \n",
"6 1 0.5676 \n",
"7 6 0.5672 \n",
"8 3 0.5653 \n",
"9 15 0.5621 \n",
"10 12 0.5576 \n",
"11 8 0.5528 \n",
"12 21 0.5510 \n",
"13 11 0.5480 \n",
"14 5 0.5451 \n",
"15 20 0.5368 \n",
"16 19 0.5335 \n",
"17 4 0.5312 \n",
"18 24 0.5271 \n",
"19 28 0.5219 \n",
"20 16 0.5048 \n",
"21 17 0.4939 \n",
"22 29 0.4907 \n",
"23 23 0.4869 \n",
"24 30 0.4830 \n",
"25 18 0.4819 \n",
"26 26 0.4813 \n",
"27 35 0.4644 \n",
"28 25 0.4608 \n",
"29 34 0.4597 \n",
"30 36 0.4596 \n",
"31 32 0.4525 \n",
"32 22 0.4516 \n",
"33 27 0.4451 \n",
"34 33 0.4419 \n",
"35 31 0.4219 \n",
"36 38 0.4013 \n",
"37 41 0.3986 \n",
"38 40 0.3838 \n",
"39 39 0.3816 \n",
"40 37 0.3799 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'total_time_raw': '18d 7h 55m', 'gpu_util_time_raw': '14d 23h 41m'}\n"
]
}
],
"source": [
"\n",
"GROUPS = {\n",
" \"Reasoning & Math\": [\n",
" \"gsm8k(exact_match,strict-match)\", \n",
" \"bbh(exact_match,get-answer)\", \n",
" \"arc_challenge(acc_norm,none)\", 'anli_r1(acc,none)',\n",
" 'anli_r2(acc,none)', 'anli_r3(acc,none)',\n",
" \"gpqa_main_zeroshot(acc_norm,none)\",\n",
" ],\n",
" \"Commonsense & NLI\": [\n",
" \"hellaswag(acc_norm,none)\",\n",
" \"piqa(acc_norm,none)\", \"winogrande(acc,none)\", \"boolq(acc,none)\",\n",
" \"openbookqa(acc_norm,none)\", \"sciq(acc_norm,none)\", \"qnli(acc,none)\",\n",
" ],\n",
" \"Knowledge & Reading\": [\n",
" \"mmlu(acc,none)\", \"nq_open(exact_match,remove_whitespace)\", \"drop(f1,none)\",\n",
" \"truthfulqa_mc1(acc,none)\", 'truthfulqa_mc2(acc,none)','triviaqa(exact_match,remove_whitespace)',\n",
" ],\n",
"}\n",
"\n",
"\n",
"\n",
"def add_task_ranks(df, task_cols):\n",
" df = df.copy()\n",
" for col in task_cols:\n",
" if col not in df.columns: \n",
" raise ValueError(f\"No task: {col}\")\n",
" # rank: 1 = best; NaN scores get ranked at the bottom\n",
" df[f\"{col}_rank\"] = df[col].rank(ascending=False, method=\"dense\", na_option=\"bottom\")\n",
" return df\n",
"\n",
"def add_group_ranks(df, groups):\n",
" df = df.copy()\n",
" for gname, cols in groups.items():\n",
" # strip task name before \"(\" if any\n",
" tasks = [c for c in cols]\n",
" mean_col = f\"{gname}_mean\"\n",
" rank_col = f\"{gname}_rank\"\n",
" df[mean_col] = df[tasks].mean(axis=1)\n",
" df[rank_col] = df[mean_col].rank(ascending=False, method=\"dense\", na_option=\"bottom\").astype(int)\n",
" return df\n",
"\n",
"\n",
"def add_overall_rank(df, groups):\n",
" df = df.copy()\n",
" all_tasks = [c for cols in groups.values() for c in cols]\n",
"\n",
" # overall mean score across all tasks\n",
" df[\"overall_mean\"] = df[all_tasks].mean(axis=1, skipna=True)\n",
"\n",
" # higher = better → rank descending\n",
" df[\"overall_rank\"] = df[\"overall_mean\"].rank(\n",
" ascending=False, method=\"dense\", na_option=\"bottom\"\n",
" ).astype(int)\n",
" return df\n",
"\n",
"\n",
"all_task_cols = [c for cols in GROUPS.values() for c in cols]\n",
"\n",
"df_task_ranked = add_task_ranks(result_gpu_merged, all_task_cols)\n",
"df_group_ranked = add_group_ranks(df_task_ranked, GROUPS)\n",
"leaderboard = add_overall_rank(df_group_ranked, GROUPS)\n",
"\n",
"\n",
"col = \"overall_rank\" # the one you want first\n",
"cols = [col] + [c for c in leaderboard.columns if c != col]\n",
"df = leaderboard[cols]\n",
"df = df.sort_values(by=col, ascending=True).reset_index(drop=True)\n",
"\n",
"# Add quantization marker\n",
"targets = ['Qwen_Qwen3-14B', 'Qwen_Qwen2.5-14B-Instruct'] # use hyphen\n",
"mask = df['model_name'].isin(targets)\n",
"df.loc[mask, 'model_name'] = df.loc[mask, 'model_name'] + ' (8bit)'\n",
"\n",
"# display(df)\n",
"\n",
"df_display = df.rename(columns={\n",
" \"overall_rank\": \"Overall Rank\",\n",
" \"model_name\": \"Model Name\",\n",
" \"gpu_util_time\": \"GPU Util Time\",\n",
" \"total_time\": \"Total Time\",\n",
" \"parameters\": \"Parameters\",\n",
" 'Reasoning & Math_rank': 'Reasoning & Math Avg. Rank',\n",
" 'Commonsense & NLI_rank': 'Commonsense & NLI Avg. Rank',\n",
" 'Knowledge & Reading_rank': 'Knowledge & Reading Avg. Rank',\n",
" 'overall_mean': 'Mean Score',\n",
" 'Reasoning & Math_mean': 'Reasoning & Math Mean Score',\n",
" 'Commonsense & NLI_mean': 'Commonsense & NLI Mean Score',\n",
" 'Knowledge & Reading_mean': 'Knowledge & Reading Mean Score',\n",
"})\n",
"\n",
"cols_to_round = [\"Mean Score\", \"Reasoning & Math Mean Score\", \"Commonsense & NLI Mean Score\", \"Knowledge & Reading Mean Score\"] \n",
"df_display[cols_to_round] = df_display[cols_to_round].round(4)\n",
"\n",
"display(df_display)\n",
"df.to_csv(\"/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_master.csv\")\n",
"\n",
"\n",
" \n",
"# Total time calculation\n",
"def format_seconds(secs: int) -> str:\n",
" days, rem = divmod(int(secs), 86400) # 86400 sec = 1 day\n",
" hours, rem = divmod(rem, 3600) # 3600 sec = 1 hour\n",
" minutes, _ = divmod(rem, 60)\n",
" return f\"{days}d {hours}h {minutes}m\"\n",
"\n",
"# Example usage with df_display\n",
"totals = {}\n",
"for col in [\"total_time_raw\", \"gpu_util_time_raw\"]:\n",
" total_secs = df_display[col].sum()\n",
" totals[col] = format_seconds(total_secs)\n",
"\n",
"print(totals)"
]
},
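{
"cell_type": "code",
"execution_count": null,
"id": "rank-behavior-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (added for clarity, not part of the benchmark run):\n",
"# how the dense ranking and group means above behave on made-up toy scores.\n",
"import pandas as pd\n",
"\n",
"toy = pd.DataFrame({\n",
"    \"task_a\": [0.90, 0.70, 0.90, None],  # a tie plus a missing score\n",
"    \"task_b\": [0.50, 0.80, 0.60, 0.40],\n",
"})\n",
"\n",
"# method=\"dense\": tied scores share a rank and the next distinct score gets\n",
"# the following integer; na_option=\"bottom\" pushes missing scores to the worst rank.\n",
"print(toy[\"task_a\"].rank(ascending=False, method=\"dense\", na_option=\"bottom\"))\n",
"# -> 1.0, 2.0, 1.0, 3.0\n",
"\n",
"# Group means use pandas' default skipna=True, so a model missing one task is\n",
"# averaged over the tasks it does have (the last row falls back to task_b alone).\n",
"print(toy[[\"task_a\", \"task_b\"]].mean(axis=1))\n",
"# -> 0.70, 0.75, 0.75, 0.40"
]
},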
{
"cell_type": "code",
"execution_count": 48,
"id": "b3ce5953-3a36-436a-ba4c-46bedd2b4c56",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"overall\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Model Name | \n",
" Total Time | \n",
" GPU Util Time | \n",
" Mean Score | \n",
" Overall Rank | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" google_gemma-3-12b-it | \n",
" 15h 45m | \n",
" 14h 8m | \n",
" 0.6038 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" Qwen_Qwen3-14B (8bit) | \n",
" 29h 45m | \n",
" 17h 29m | \n",
" 0.5961 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" openchat_openchat-3.6-8b-20240522 | \n",
" 7h 51m | \n",
" 6h 59m | \n",
" 0.5871 | \n",
" 3 | \n",
"
\n",
" \n",
" | 4 | \n",
" Qwen_Qwen3-8B | \n",
" 15h 31m | \n",
" 13h 44m | \n",
" 0.5859 | \n",
" 4 | \n",
"
\n",
" \n",
" | 5 | \n",
" Qwen_Qwen2.5-7B-Instruct | \n",
" 9h 36m | \n",
" 8h 33m | \n",
" 0.5788 | \n",
" 5 | \n",
"
\n",
" \n",
" | 6 | \n",
" Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
" 52h 44m | \n",
" 29h 32m | \n",
" 0.5775 | \n",
" 6 | \n",
"
\n",
" \n",
" | 7 | \n",
" 01-ai_Yi-1.5-9B | \n",
" 11h 43m | \n",
" 10h 26m | \n",
" 0.5676 | \n",
" 7 | \n",
"
\n",
" \n",
" | 8 | \n",
" Qwen_Qwen2.5-7B-Instruct-1M | \n",
" 11h 17m | \n",
" 10h 10m | \n",
" 0.5672 | \n",
" 8 | \n",
"
\n",
" \n",
" | 9 | \n",
" meta-llama_Llama-3.1-8B-Instruct | \n",
" 12h 19m | \n",
" 10h 52m | \n",
" 0.5653 | \n",
" 9 | \n",
"
\n",
" \n",
" | 10 | \n",
" 01-ai_Yi-1.5-9B-Chat | \n",
" 13h 54m | \n",
" 12h 15m | \n",
" 0.5621 | \n",
" 10 | \n",
"
\n",
" \n",
" | 11 | \n",
" mistralai_Ministral-8B-Instruct-2410 | \n",
" 10h 46m | \n",
" 9h 27m | \n",
" 0.5576 | \n",
" 11 | \n",
"
\n",
" \n",
" | 12 | \n",
" meta-llama_Meta-Llama-3-8B-Instruct | \n",
" 6h 30m | \n",
" 5h 46m | \n",
" 0.5528 | \n",
" 12 | \n",
"
\n",
" \n",
" | 13 | \n",
" Qwen_Qwen3-4B | \n",
" 5h 51m | \n",
" 5h 3m | \n",
" 0.5510 | \n",
" 13 | \n",
"
\n",
" \n",
" | 14 | \n",
" NousResearch_Hermes-2-Pro-Mistral-7B | \n",
" 8h 27m | \n",
" 7h 28m | \n",
" 0.5480 | \n",
" 14 | \n",
"
\n",
" \n",
" | 15 | \n",
" mistralai_Mistral-7B-Instruct-v0.3 | \n",
" 8h 38m | \n",
" 7h 41m | \n",
" 0.5451 | \n",
" 15 | \n",
"
\n",
" \n",
" | 16 | \n",
" google_gemma-3-4b-it | \n",
" 4h 51m | \n",
" 3h 50m | \n",
" 0.5368 | \n",
" 16 | \n",
"
\n",
" \n",
" | 17 | \n",
" 01-ai_Yi-1.5-6B-Chat | \n",
" 8h 4m | \n",
" 7h 1m | \n",
" 0.5335 | \n",
" 17 | \n",
"
\n",
" \n",
" | 18 | \n",
" 01-ai_Yi-1.5-6B | \n",
" 4h 28m | \n",
" 3h 54m | \n",
" 0.5312 | \n",
" 18 | \n",
"
\n",
" \n",
" | 19 | \n",
" Qwen_Qwen2-7B-Instruct | \n",
" 11h 30m | \n",
" 10h 11m | \n",
" 0.5271 | \n",
" 19 | \n",
"
\n",
" \n",
" | 20 | \n",
" deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
" 17h 57m | \n",
" 15h 30m | \n",
" 0.5219 | \n",
" 20 | \n",
"
\n",
" \n",
" | 21 | \n",
" meta-llama_Llama-3.2-3B-Instruct | \n",
" 7h 12m | \n",
" 5h 57m | \n",
" 0.5048 | \n",
" 21 | \n",
"
\n",
" \n",
" | 22 | \n",
" Qwen_Qwen2.5-3B-Instruct | \n",
" 7h 48m | \n",
" 6h 30m | \n",
" 0.4939 | \n",
" 22 | \n",
"
\n",
" \n",
" | 23 | \n",
" Qwen_Qwen2.5-Math-7B | \n",
" 27h 21m | \n",
" 24h 38m | \n",
" 0.4907 | \n",
" 23 | \n",
"
\n",
" \n",
" | 24 | \n",
" deepseek-ai_deepseek-llm-7b-chat | \n",
" 10h 6m | \n",
" 9h 8m | \n",
" 0.4869 | \n",
" 24 | \n",
"
\n",
" \n",
" | 25 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
" 11h 46m | \n",
" 10h 36m | \n",
" 0.4830 | \n",
" 25 | \n",
"
\n",
" \n",
" | 26 | \n",
" meta-llama_Llama-2-13b-hf | \n",
" 19h 21m | \n",
" 17h 38m | \n",
" 0.4819 | \n",
" 26 | \n",
"
\n",
" \n",
" | 27 | \n",
" meta-llama_Llama-2-13b-chat-hf | \n",
" 17h 8m | \n",
" 15h 37m | \n",
" 0.4813 | \n",
" 27 | \n",
"
\n",
" \n",
" | 28 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
" 6h 28m | \n",
" 5h 43m | \n",
" 0.4644 | \n",
" 28 | \n",
"
\n",
" \n",
" | 29 | \n",
" Qwen_Qwen2.5-1.5B-Instruct | \n",
" 3h 20m | \n",
" 2h 36m | \n",
" 0.4608 | \n",
" 29 | \n",
"
\n",
" \n",
" | 30 | \n",
" Qwen_Qwen3-1.7B | \n",
" 4h 25m | \n",
" 3h 36m | \n",
" 0.4597 | \n",
" 30 | \n",
"
\n",
" \n",
" | 31 | \n",
" Qwen_Qwen2.5-Math-7B-Instruct | \n",
" 5h 37m | \n",
" 4h 57m | \n",
" 0.4596 | \n",
" 31 | \n",
"
\n",
" \n",
" | 32 | \n",
" meta-llama_Llama-2-7b-chat-hf | \n",
" 6h 57m | \n",
" 6h 7m | \n",
" 0.4525 | \n",
" 32 | \n",
"
\n",
" \n",
" | 33 | \n",
" meta-llama_Llama-2-7b-hf | \n",
" 5h 42m | \n",
" 4h 59m | \n",
" 0.4516 | \n",
" 33 | \n",
"
\n",
" \n",
" | 34 | \n",
" deepseek-ai_deepseek-llm-7b-base | \n",
" 7h 11m | \n",
" 6h 26m | \n",
" 0.4451 | \n",
" 34 | \n",
"
\n",
" \n",
" | 35 | \n",
" deepseek-ai_deepseek-math-7b-rl | \n",
" 8h 2m | \n",
" 7h 12m | \n",
" 0.4419 | \n",
" 35 | \n",
"
\n",
" \n",
" | 36 | \n",
" meta-llama_Llama-3.2-1B-Instruct | \n",
" 3h 30m | \n",
" 2h 35m | \n",
" 0.4219 | \n",
" 36 | \n",
"
\n",
" \n",
" | 37 | \n",
" google_gemma-3-1b-it | \n",
" 6h 50m | \n",
" 4h 52m | \n",
" 0.4013 | \n",
" 37 | \n",
"
\n",
" \n",
" | 38 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
" 3h 40m | \n",
" 2h 52m | \n",
" 0.3986 | \n",
" 38 | \n",
"
\n",
" \n",
" | 39 | \n",
" Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
" 3h 25m | \n",
" 2h 39m | \n",
" 0.3838 | \n",
" 39 | \n",
"
\n",
" \n",
" | 40 | \n",
" Qwen_Qwen3-0.6B | \n",
" 3h 45m | \n",
" 2h 53m | \n",
" 0.3816 | \n",
" 40 | \n",
"
\n",
" \n",
" | 41 | \n",
" Qwen_Qwen2.5-0.5B-Instruct | \n",
" 2h 34m | \n",
" 1h 48m | \n",
" 0.3799 | \n",
" 41 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Model Name Total Time GPU Util Time \\\n",
"1 google_gemma-3-12b-it 15h 45m 14h 8m \n",
"2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
"3 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
"4 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
"5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
"6 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
"7 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
"8 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
"9 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
"10 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
"11 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
"12 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
"13 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
"14 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
"15 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
"16 google_gemma-3-4b-it 4h 51m 3h 50m \n",
"17 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
"18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
"19 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
"20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
"21 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
"22 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
"23 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
"24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
"25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
"26 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
"27 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
"28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
"29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
"30 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
"31 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
"32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
"33 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
"34 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
"35 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
"36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
"37 google_gemma-3-1b-it 6h 50m 4h 52m \n",
"38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
"39 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
"40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
"41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
"\n",
" Mean Score Overall Rank \n",
"1 0.6038 1 \n",
"2 0.5961 2 \n",
"3 0.5871 3 \n",
"4 0.5859 4 \n",
"5 0.5788 5 \n",
"6 0.5775 6 \n",
"7 0.5676 7 \n",
"8 0.5672 8 \n",
"9 0.5653 9 \n",
"10 0.5621 10 \n",
"11 0.5576 11 \n",
"12 0.5528 12 \n",
"13 0.5510 13 \n",
"14 0.5480 14 \n",
"15 0.5451 15 \n",
"16 0.5368 16 \n",
"17 0.5335 17 \n",
"18 0.5312 18 \n",
"19 0.5271 19 \n",
"20 0.5219 20 \n",
"21 0.5048 21 \n",
"22 0.4939 22 \n",
"23 0.4907 23 \n",
"24 0.4869 24 \n",
"25 0.4830 25 \n",
"26 0.4819 26 \n",
"27 0.4813 27 \n",
"28 0.4644 28 \n",
"29 0.4608 29 \n",
"30 0.4597 30 \n",
"31 0.4596 31 \n",
"32 0.4525 32 \n",
"33 0.4516 33 \n",
"34 0.4451 34 \n",
"35 0.4419 35 \n",
"36 0.4219 36 \n",
"37 0.4013 37 \n",
"38 0.3986 38 \n",
"39 0.3838 39 \n",
"40 0.3816 40 \n",
"41 0.3799 41 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| Model Name | Total Time | GPU Util Time | Mean Score | Overall Rank |\n",
"|:------------------------------------------|:-------------|:----------------|-------------:|---------------:|\n",
"| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.6038 | 1 |\n",
"| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.5961 | 2 |\n",
"| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.5871 | 3 |\n",
"| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.5859 | 4 |\n",
"| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.5788 | 5 |\n",
"| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.5775 | 6 |\n",
"| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.5676 | 7 |\n",
"| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.5672 | 8 |\n",
"| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.5653 | 9 |\n",
"| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.5621 | 10 |\n",
"| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.5576 | 11 |\n",
"| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.5528 | 12 |\n",
"| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.551 | 13 |\n",
"| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.548 | 14 |\n",
"| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.5451 | 15 |\n",
"| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.5368 | 16 |\n",
"| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.5335 | 17 |\n",
"| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.5312 | 18 |\n",
"| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.5271 | 19 |\n",
"| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.5219 | 20 |\n",
"| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.5048 | 21 |\n",
"| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.4939 | 22 |\n",
"| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.4907 | 23 |\n",
"| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.4869 | 24 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.483 | 25 |\n",
"| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.4819 | 26 |\n",
"| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.4813 | 27 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.4644 | 28 |\n",
"| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.4608 | 29 |\n",
"| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.4597 | 30 |\n",
"| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.4596 | 31 |\n",
"| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.4525 | 32 |\n",
"| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.4516 | 33 |\n",
"| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.4451 | 34 |\n",
"| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.4419 | 35 |\n",
"| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.4219 | 36 |\n",
"| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.4013 | 37 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.3986 | 38 |\n",
"| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.3838 | 39 |\n",
"| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.3816 | 40 |\n",
"| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.3799 | 41 |\n",
"\n",
"\n",
"reasoning_and_math\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Model Name | \n",
" Total Time | \n",
" GPU Util Time | \n",
" Reasoning & Math Mean Score | \n",
" Reasoning & Math Avg. Rank | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" google_gemma-3-12b-it | \n",
" 15h 45m | \n",
" 14h 8m | \n",
" 0.6266 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" Qwen_Qwen3-8B | \n",
" 15h 31m | \n",
" 13h 44m | \n",
" 0.6214 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" Qwen_Qwen3-14B (8bit) | \n",
" 29h 45m | \n",
" 17h 29m | \n",
" 0.5860 | \n",
" 3 | \n",
"
\n",
" \n",
" | 4 | \n",
" Qwen_Qwen3-4B | \n",
" 5h 51m | \n",
" 5h 3m | \n",
" 0.5712 | \n",
" 4 | \n",
"
\n",
" \n",
" | 5 | \n",
" Qwen_Qwen2.5-7B-Instruct | \n",
" 9h 36m | \n",
" 8h 33m | \n",
" 0.5541 | \n",
" 5 | \n",
"
\n",
" \n",
" | 6 | \n",
" openchat_openchat-3.6-8b-20240522 | \n",
" 7h 51m | \n",
" 6h 59m | \n",
" 0.5505 | \n",
" 6 | \n",
"
\n",
" \n",
" | 7 | \n",
" Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
" 52h 44m | \n",
" 29h 32m | \n",
" 0.5488 | \n",
" 7 | \n",
"
\n",
" \n",
" | 8 | \n",
" mistralai_Ministral-8B-Instruct-2410 | \n",
" 10h 46m | \n",
" 9h 27m | \n",
" 0.5446 | \n",
" 8 | \n",
"
\n",
" \n",
" | 9 | \n",
" 01-ai_Yi-1.5-9B-Chat | \n",
" 13h 54m | \n",
" 12h 15m | \n",
" 0.5399 | \n",
" 9 | \n",
"
\n",
" \n",
" | 10 | \n",
" deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
" 17h 57m | \n",
" 15h 30m | \n",
" 0.5387 | \n",
" 10 | \n",
"
\n",
" \n",
" | 11 | \n",
" google_gemma-3-4b-it | \n",
" 4h 51m | \n",
" 3h 50m | \n",
" 0.5374 | \n",
" 11 | \n",
"
\n",
" \n",
" | 12 | \n",
" meta-llama_Llama-3.1-8B-Instruct | \n",
" 12h 19m | \n",
" 10h 52m | \n",
" 0.5366 | \n",
" 12 | \n",
"
\n",
" \n",
" | 13 | \n",
" meta-llama_Meta-Llama-3-8B-Instruct | \n",
" 6h 30m | \n",
" 5h 46m | \n",
" 0.5286 | \n",
" 13 | \n",
"
\n",
" \n",
" | 14 | \n",
" Qwen_Qwen2-7B-Instruct | \n",
" 11h 30m | \n",
" 10h 11m | \n",
" 0.5285 | \n",
" 14 | \n",
"
\n",
" \n",
" | 15 | \n",
" Qwen_Qwen2.5-7B-Instruct-1M | \n",
" 11h 17m | \n",
" 10h 10m | \n",
" 0.5245 | \n",
" 15 | \n",
"
\n",
" \n",
" | 16 | \n",
" 01-ai_Yi-1.5-9B | \n",
" 11h 43m | \n",
" 10h 26m | \n",
" 0.5206 | \n",
" 16 | \n",
"
\n",
" \n",
" | 17 | \n",
" NousResearch_Hermes-2-Pro-Mistral-7B | \n",
" 8h 27m | \n",
" 7h 28m | \n",
" 0.5184 | \n",
" 17 | \n",
"
\n",
" \n",
" | 18 | \n",
" Qwen_Qwen2.5-Math-7B | \n",
" 27h 21m | \n",
" 24h 38m | \n",
" 0.5010 | \n",
" 18 | \n",
"
\n",
" \n",
" | 19 | \n",
" 01-ai_Yi-1.5-6B-Chat | \n",
" 8h 4m | \n",
" 7h 1m | \n",
" 0.5006 | \n",
" 19 | \n",
"
\n",
" \n",
" | 20 | \n",
" Qwen_Qwen2.5-Math-7B-Instruct | \n",
" 5h 37m | \n",
" 4h 57m | \n",
" 0.4997 | \n",
" 20 | \n",
"
\n",
" \n",
" | 21 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
" 6h 28m | \n",
" 5h 43m | \n",
" 0.4841 | \n",
" 21 | \n",
"
\n",
" \n",
" | 22 | \n",
" mistralai_Mistral-7B-Instruct-v0.3 | \n",
" 8h 38m | \n",
" 7h 41m | \n",
" 0.4704 | \n",
" 22 | \n",
"
\n",
" \n",
" | 23 | \n",
" meta-llama_Llama-3.2-3B-Instruct | \n",
" 7h 12m | \n",
" 5h 57m | \n",
" 0.4688 | \n",
" 23 | \n",
"
\n",
" \n",
" | 24 | \n",
" 01-ai_Yi-1.5-6B | \n",
" 4h 28m | \n",
" 3h 54m | \n",
" 0.4495 | \n",
" 24 | \n",
"
\n",
" \n",
" | 25 | \n",
" Qwen_Qwen3-1.7B | \n",
" 4h 25m | \n",
" 3h 36m | \n",
" 0.4493 | \n",
" 25 | \n",
"
\n",
" \n",
" | 26 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
" 11h 46m | \n",
" 10h 36m | \n",
" 0.4469 | \n",
" 26 | \n",
"
\n",
" \n",
" | 27 | \n",
" deepseek-ai_deepseek-llm-7b-chat | \n",
" 10h 6m | \n",
" 9h 8m | \n",
" 0.4244 | \n",
" 27 | \n",
"
\n",
" \n",
" | 28 | \n",
" meta-llama_Llama-2-13b-chat-hf | \n",
" 17h 8m | \n",
" 15h 37m | \n",
" 0.4143 | \n",
" 28 | \n",
"
\n",
" \n",
" | 29 | \n",
" Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
" 3h 25m | \n",
" 2h 39m | \n",
" 0.4085 | \n",
" 29 | \n",
"
\n",
" \n",
" | 30 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
" 3h 40m | \n",
" 2h 52m | \n",
" 0.4009 | \n",
" 30 | \n",
"
\n",
" \n",
" | 31 | \n",
" Qwen_Qwen2.5-1.5B-Instruct | \n",
" 3h 20m | \n",
" 2h 36m | \n",
" 0.3874 | \n",
" 31 | \n",
"
\n",
" \n",
" | 32 | \n",
" Qwen_Qwen2.5-3B-Instruct | \n",
" 7h 48m | \n",
" 6h 30m | \n",
" 0.3823 | \n",
" 32 | \n",
"
\n",
" \n",
" | 33 | \n",
" meta-llama_Llama-2-13b-hf | \n",
" 19h 21m | \n",
" 17h 38m | \n",
" 0.3719 | \n",
" 33 | \n",
"
\n",
" \n",
" | 34 | \n",
" deepseek-ai_deepseek-math-7b-rl | \n",
" 8h 2m | \n",
" 7h 12m | \n",
" 0.3702 | \n",
" 34 | \n",
"
\n",
" \n",
" | 35 | \n",
" meta-llama_Llama-2-7b-chat-hf | \n",
" 6h 57m | \n",
" 6h 7m | \n",
" 0.3674 | \n",
" 35 | \n",
"
\n",
" \n",
" | 36 | \n",
" Qwen_Qwen3-0.6B | \n",
" 3h 45m | \n",
" 2h 53m | \n",
" 0.3494 | \n",
" 36 | \n",
"
\n",
" \n",
" | 37 | \n",
" meta-llama_Llama-3.2-1B-Instruct | \n",
" 3h 30m | \n",
" 2h 35m | \n",
" 0.3450 | \n",
" 37 | \n",
"
\n",
" \n",
" | 38 | \n",
" deepseek-ai_deepseek-llm-7b-base | \n",
" 7h 11m | \n",
" 6h 26m | \n",
" 0.3377 | \n",
" 38 | \n",
"
\n",
" \n",
" | 39 | \n",
" meta-llama_Llama-2-7b-hf | \n",
" 5h 42m | \n",
" 4h 59m | \n",
" 0.3361 | \n",
" 39 | \n",
"
\n",
" \n",
" | 40 | \n",
" google_gemma-3-1b-it | \n",
" 6h 50m | \n",
" 4h 52m | \n",
" 0.3312 | \n",
" 40 | \n",
"
\n",
" \n",
" | 41 | \n",
" Qwen_Qwen2.5-0.5B-Instruct | \n",
" 2h 34m | \n",
" 1h 48m | \n",
" 0.2914 | \n",
" 41 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Model Name Total Time GPU Util Time \\\n",
"1 google_gemma-3-12b-it 15h 45m 14h 8m \n",
"2 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
"3 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
"4 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
"5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
"6 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
"7 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
"8 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
"9 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
"10 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
"11 google_gemma-3-4b-it 4h 51m 3h 50m \n",
"12 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
"13 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
"14 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
"15 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
"16 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
"17 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
"18 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
"19 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
"20 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
"21 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
"22 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
"23 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
"24 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
"25 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
"26 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
"27 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
"28 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
"29 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
"30 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
"31 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
"32 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
"33 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
"34 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
"35 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
"36 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
"37 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
"38 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
"39 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
"40 google_gemma-3-1b-it 6h 50m 4h 52m \n",
"41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
"\n",
" Reasoning & Math Mean Score Reasoning & Math Avg. Rank \n",
"1 0.6266 1 \n",
"2 0.6214 2 \n",
"3 0.5860 3 \n",
"4 0.5712 4 \n",
"5 0.5541 5 \n",
"6 0.5505 6 \n",
"7 0.5488 7 \n",
"8 0.5446 8 \n",
"9 0.5399 9 \n",
"10 0.5387 10 \n",
"11 0.5374 11 \n",
"12 0.5366 12 \n",
"13 0.5286 13 \n",
"14 0.5285 14 \n",
"15 0.5245 15 \n",
"16 0.5206 16 \n",
"17 0.5184 17 \n",
"18 0.5010 18 \n",
"19 0.5006 19 \n",
"20 0.4997 20 \n",
"21 0.4841 21 \n",
"22 0.4704 22 \n",
"23 0.4688 23 \n",
"24 0.4495 24 \n",
"25 0.4493 25 \n",
"26 0.4469 26 \n",
"27 0.4244 27 \n",
"28 0.4143 28 \n",
"29 0.4085 29 \n",
"30 0.4009 30 \n",
"31 0.3874 31 \n",
"32 0.3823 32 \n",
"33 0.3719 33 \n",
"34 0.3702 34 \n",
"35 0.3674 35 \n",
"36 0.3494 36 \n",
"37 0.3450 37 \n",
"38 0.3377 38 \n",
"39 0.3361 39 \n",
"40 0.3312 40 \n",
"41 0.2914 41 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| Model Name | Total Time | GPU Util Time | Reasoning & Math Mean Score | Reasoning & Math Avg. Rank |\n",
"|:------------------------------------------|:-------------|:----------------|------------------------------:|-----------------------------:|\n",
"| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.6266 | 1 |\n",
"| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.6214 | 2 |\n",
"| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.586 | 3 |\n",
"| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.5712 | 4 |\n",
"| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.5541 | 5 |\n",
"| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.5505 | 6 |\n",
"| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.5488 | 7 |\n",
"| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.5446 | 8 |\n",
"| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.5399 | 9 |\n",
"| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.5387 | 10 |\n",
"| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.5374 | 11 |\n",
"| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.5366 | 12 |\n",
"| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.5286 | 13 |\n",
"| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.5285 | 14 |\n",
"| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.5245 | 15 |\n",
"| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.5206 | 16 |\n",
"| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.5184 | 17 |\n",
"| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.501 | 18 |\n",
"| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.5006 | 19 |\n",
"| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.4997 | 20 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.4841 | 21 |\n",
"| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.4704 | 22 |\n",
"| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.4688 | 23 |\n",
"| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.4495 | 24 |\n",
"| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.4493 | 25 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.4469 | 26 |\n",
"| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.4244 | 27 |\n",
"| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.4143 | 28 |\n",
"| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.4085 | 29 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.4009 | 30 |\n",
"| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.3874 | 31 |\n",
"| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.3823 | 32 |\n",
"| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.3719 | 33 |\n",
"| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.3702 | 34 |\n",
"| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.3674 | 35 |\n",
"| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.3494 | 36 |\n",
"| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.345 | 37 |\n",
"| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.3377 | 38 |\n",
"| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.3361 | 39 |\n",
"| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.3312 | 40 |\n",
"| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.2914 | 41 |\n",
"\n",
"\n",
"commonsense_and_nli\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Model Name | \n",
" Total Time | \n",
" GPU Util Time | \n",
" Commonsense & NLI Mean Score | \n",
" Commonsense & NLI Avg. Rank | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
" 52h 44m | \n",
" 29h 32m | \n",
" 0.7941 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" Qwen_Qwen3-14B (8bit) | \n",
" 29h 45m | \n",
" 17h 29m | \n",
" 0.7807 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" google_gemma-3-12b-it | \n",
" 15h 45m | \n",
" 14h 8m | \n",
" 0.7737 | \n",
" 3 | \n",
"
\n",
" \n",
" | 4 | \n",
" Qwen_Qwen2.5-7B-Instruct | \n",
" 9h 36m | \n",
" 8h 33m | \n",
" 0.7730 | \n",
" 4 | \n",
"
\n",
" \n",
" | 5 | \n",
" openchat_openchat-3.6-8b-20240522 | \n",
" 7h 51m | \n",
" 6h 59m | \n",
" 0.7726 | \n",
" 5 | \n",
"
\n",
" \n",
" | 6 | \n",
" 01-ai_Yi-1.5-9B-Chat | \n",
" 13h 54m | \n",
" 12h 15m | \n",
" 0.7691 | \n",
" 6 | \n",
"
\n",
" \n",
" | 7 | \n",
" Qwen_Qwen2.5-7B-Instruct-1M | \n",
" 11h 17m | \n",
" 10h 10m | \n",
" 0.7564 | \n",
" 7 | \n",
"
\n",
" \n",
" | 8 | \n",
" Qwen_Qwen3-8B | \n",
" 15h 31m | \n",
" 13h 44m | \n",
" 0.7468 | \n",
" 8 | \n",
"
\n",
" \n",
" | 9 | \n",
" mistralai_Mistral-7B-Instruct-v0.3 | \n",
" 8h 38m | \n",
" 7h 41m | \n",
" 0.7403 | \n",
" 9 | \n",
"
\n",
" \n",
" | 10 | \n",
" 01-ai_Yi-1.5-6B-Chat | \n",
" 8h 4m | \n",
" 7h 1m | \n",
" 0.7374 | \n",
" 10 | \n",
"
\n",
" \n",
" | 11 | \n",
" Qwen_Qwen2.5-3B-Instruct | \n",
" 7h 48m | \n",
" 6h 30m | \n",
" 0.7367 | \n",
" 11 | \n",
"
\n",
" \n",
" | 12 | \n",
" mistralai_Ministral-8B-Instruct-2410 | \n",
" 10h 46m | \n",
" 9h 27m | \n",
" 0.7328 | \n",
" 12 | \n",
"
\n",
" \n",
" | 13 | \n",
" NousResearch_Hermes-2-Pro-Mistral-7B | \n",
" 8h 27m | \n",
" 7h 28m | \n",
" 0.7284 | \n",
" 13 | \n",
"
\n",
" \n",
" | 14 | \n",
" Qwen_Qwen2-7B-Instruct | \n",
" 11h 30m | \n",
" 10h 11m | \n",
" 0.7274 | \n",
" 14 | \n",
"
\n",
" \n",
" | 15 | \n",
" 01-ai_Yi-1.5-9B | \n",
" 11h 43m | \n",
" 10h 26m | \n",
" 0.7266 | \n",
" 15 | \n",
"
\n",
" \n",
" | 16 | \n",
" Qwen_Qwen3-4B | \n",
" 5h 51m | \n",
" 5h 3m | \n",
" 0.7266 | \n",
" 16 | \n",
"
\n",
" \n",
" | 17 | \n",
" meta-llama_Llama-3.1-8B-Instruct | \n",
" 12h 19m | \n",
" 10h 52m | \n",
" 0.7249 | \n",
" 17 | \n",
"
\n",
" \n",
" | 18 | \n",
" 01-ai_Yi-1.5-6B | \n",
" 4h 28m | \n",
" 3h 54m | \n",
" 0.7199 | \n",
" 18 | \n",
"
\n",
" \n",
" | 19 | \n",
" google_gemma-3-4b-it | \n",
" 4h 51m | \n",
" 3h 50m | \n",
" 0.7167 | \n",
" 19 | \n",
"
\n",
" \n",
" | 20 | \n",
" meta-llama_Llama-2-13b-hf | \n",
" 19h 21m | \n",
" 17h 38m | \n",
" 0.7157 | \n",
" 20 | \n",
"
\n",
" \n",
" | 21 | \n",
" meta-llama_Llama-2-13b-chat-hf | \n",
" 17h 8m | \n",
" 15h 37m | \n",
" 0.7153 | \n",
" 21 | \n",
"
\n",
" \n",
" | 22 | \n",
" meta-llama_Meta-Llama-3-8B-Instruct | \n",
" 6h 30m | \n",
" 5h 46m | \n",
" 0.7147 | \n",
" 22 | \n",
"
\n",
" \n",
" | 23 | \n",
" deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
" 17h 57m | \n",
" 15h 30m | \n",
" 0.7094 | \n",
" 23 | \n",
"
\n",
" \n",
" | 24 | \n",
" deepseek-ai_deepseek-llm-7b-chat | \n",
" 10h 6m | \n",
" 9h 8m | \n",
" 0.7090 | \n",
" 24 | \n",
"
\n",
" \n",
" | 25 | \n",
" meta-llama_Llama-2-7b-chat-hf | \n",
" 6h 57m | \n",
" 6h 7m | \n",
" 0.6978 | \n",
" 25 | \n",
"
\n",
" \n",
" | 26 | \n",
" meta-llama_Llama-2-7b-hf | \n",
" 5h 42m | \n",
" 4h 59m | \n",
" 0.6956 | \n",
" 26 | \n",
"
\n",
" \n",
" | 27 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
" 11h 46m | \n",
" 10h 36m | \n",
" 0.6928 | \n",
" 27 | \n",
"
\n",
" \n",
" | 28 | \n",
" deepseek-ai_deepseek-llm-7b-base | \n",
" 7h 11m | \n",
" 6h 26m | \n",
" 0.6886 | \n",
" 28 | \n",
"
\n",
" \n",
" | 29 | \n",
" Qwen_Qwen2.5-1.5B-Instruct | \n",
" 3h 20m | \n",
" 2h 36m | \n",
" 0.6803 | \n",
" 29 | \n",
"
\n",
" \n",
" | 30 | \n",
" meta-llama_Llama-3.2-3B-Instruct | \n",
" 7h 12m | \n",
" 5h 57m | \n",
" 0.6788 | \n",
" 30 | \n",
"
\n",
" \n",
" | 31 | \n",
" deepseek-ai_deepseek-math-7b-rl | \n",
" 8h 2m | \n",
" 7h 12m | \n",
" 0.6711 | \n",
" 31 | \n",
"
\n",
" \n",
" | 32 | \n",
" Qwen_Qwen2.5-Math-7B | \n",
" 27h 21m | \n",
" 24h 38m | \n",
" 0.6587 | \n",
" 32 | \n",
"
\n",
" \n",
" | 33 | \n",
" Qwen_Qwen3-1.7B | \n",
" 4h 25m | \n",
" 3h 36m | \n",
" 0.6442 | \n",
" 33 | \n",
"
\n",
" \n",
" | 34 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
" 6h 28m | \n",
" 5h 43m | \n",
" 0.6422 | \n",
" 34 | \n",
"
\n",
" \n",
" | 35 | \n",
" google_gemma-3-1b-it | \n",
" 6h 50m | \n",
" 4h 52m | \n",
" 0.6267 | \n",
" 35 | \n",
"
\n",
" \n",
" | 36 | \n",
" meta-llama_Llama-3.2-1B-Instruct | \n",
" 3h 30m | \n",
" 2h 35m | \n",
" 0.6264 | \n",
" 36 | \n",
"
\n",
" \n",
" | 37 | \n",
" Qwen_Qwen2.5-Math-7B-Instruct | \n",
" 5h 37m | \n",
" 4h 57m | \n",
" 0.6184 | \n",
" 37 | \n",
"
\n",
" \n",
" | 38 | \n",
" Qwen_Qwen2.5-0.5B-Instruct | \n",
" 2h 34m | \n",
" 1h 48m | \n",
" 0.6039 | \n",
" 38 | \n",
"
\n",
" \n",
" | 39 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
" 3h 40m | \n",
" 2h 52m | \n",
" 0.5703 | \n",
" 39 | \n",
"
\n",
" \n",
" | 40 | \n",
" Qwen_Qwen3-0.6B | \n",
" 3h 45m | \n",
" 2h 53m | \n",
" 0.5696 | \n",
" 40 | \n",
"
\n",
" \n",
" | 41 | \n",
" Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
" 3h 25m | \n",
" 2h 39m | \n",
" 0.5181 | \n",
" 41 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Model Name Total Time GPU Util Time \\\n",
"1 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
"2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
"3 google_gemma-3-12b-it 15h 45m 14h 8m \n",
"4 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
"5 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
"6 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
"7 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
"8 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
"9 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
"10 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
"11 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
"12 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
"13 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
"14 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
"15 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
"16 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
"17 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
"18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
"19 google_gemma-3-4b-it 4h 51m 3h 50m \n",
"20 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
"21 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
"22 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
"23 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
"24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
"25 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
"26 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
"27 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
"28 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
"29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
"30 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
"31 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
"32 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
"33 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
"34 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
"35 google_gemma-3-1b-it 6h 50m 4h 52m \n",
"36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
"37 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
"38 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
"39 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
"40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
"41 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
"\n",
" Commonsense & NLI Mean Score Commonsense & NLI Avg. Rank \n",
"1 0.7941 1 \n",
"2 0.7807 2 \n",
"3 0.7737 3 \n",
"4 0.7730 4 \n",
"5 0.7726 5 \n",
"6 0.7691 6 \n",
"7 0.7564 7 \n",
"8 0.7468 8 \n",
"9 0.7403 9 \n",
"10 0.7374 10 \n",
"11 0.7367 11 \n",
"12 0.7328 12 \n",
"13 0.7284 13 \n",
"14 0.7274 14 \n",
"15 0.7266 15 \n",
"16 0.7266 16 \n",
"17 0.7249 17 \n",
"18 0.7199 18 \n",
"19 0.7167 19 \n",
"20 0.7157 20 \n",
"21 0.7153 21 \n",
"22 0.7147 22 \n",
"23 0.7094 23 \n",
"24 0.7090 24 \n",
"25 0.6978 25 \n",
"26 0.6956 26 \n",
"27 0.6928 27 \n",
"28 0.6886 28 \n",
"29 0.6803 29 \n",
"30 0.6788 30 \n",
"31 0.6711 31 \n",
"32 0.6587 32 \n",
"33 0.6442 33 \n",
"34 0.6422 34 \n",
"35 0.6267 35 \n",
"36 0.6264 36 \n",
"37 0.6184 37 \n",
"38 0.6039 38 \n",
"39 0.5703 39 \n",
"40 0.5696 40 \n",
"41 0.5181 41 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| Model Name | Total Time | GPU Util Time | Commonsense & NLI Mean Score | Commonsense & NLI Avg. Rank |\n",
"|:------------------------------------------|:-------------|:----------------|-------------------------------:|------------------------------:|\n",
"| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.7941 | 1 |\n",
"| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.7807 | 2 |\n",
"| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.7737 | 3 |\n",
"| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.773 | 4 |\n",
"| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.7726 | 5 |\n",
"| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.7691 | 6 |\n",
"| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.7564 | 7 |\n",
"| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.7468 | 8 |\n",
"| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.7403 | 9 |\n",
"| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.7374 | 10 |\n",
"| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.7367 | 11 |\n",
"| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.7328 | 12 |\n",
"| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.7284 | 13 |\n",
"| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.7274 | 14 |\n",
"| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.7266 | 15 |\n",
"| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.7266 | 16 |\n",
"| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.7249 | 17 |\n",
"| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.7199 | 18 |\n",
"| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.7167 | 19 |\n",
"| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.7157 | 20 |\n",
"| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.7153 | 21 |\n",
"| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.7147 | 22 |\n",
"| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.7094 | 23 |\n",
"| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.709 | 24 |\n",
"| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.6978 | 25 |\n",
"| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.6956 | 26 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.6928 | 27 |\n",
"| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.6886 | 28 |\n",
"| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.6803 | 29 |\n",
"| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.6788 | 30 |\n",
"| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.6711 | 31 |\n",
"| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.6587 | 32 |\n",
"| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.6442 | 33 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.6422 | 34 |\n",
"| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.6267 | 35 |\n",
"| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.6264 | 36 |\n",
"| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.6184 | 37 |\n",
"| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.6039 | 38 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.5703 | 39 |\n",
"| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.5696 | 40 |\n",
"| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.5181 | 41 |\n",
"\n",
"\n",
"knowledge_and_reading\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Model Name | \n",
" Total Time | \n",
" GPU Util Time | \n",
" Knowledge & Reading Mean Score | \n",
" Knowledge & Reading Avg. Rank | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" 01-ai_Yi-1.5-9B | \n",
" 11h 43m | \n",
" 10h 26m | \n",
" 0.4369 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" openchat_openchat-3.6-8b-20240522 | \n",
" 7h 51m | \n",
" 6h 59m | \n",
" 0.4136 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" meta-llama_Llama-3.1-8B-Instruct | \n",
" 12h 19m | \n",
" 10h 52m | \n",
" 0.4127 | \n",
" 3 | \n",
"
\n",
" \n",
" | 4 | \n",
" 01-ai_Yi-1.5-6B | \n",
" 4h 28m | \n",
" 3h 54m | \n",
" 0.4063 | \n",
" 4 | \n",
"
\n",
" \n",
" | 5 | \n",
" mistralai_Mistral-7B-Instruct-v0.3 | \n",
" 8h 38m | \n",
" 7h 41m | \n",
" 0.4045 | \n",
" 5 | \n",
"
\n",
" \n",
" | 6 | \n",
" Qwen_Qwen2.5-7B-Instruct-1M | \n",
" 11h 17m | \n",
" 10h 10m | \n",
" 0.3963 | \n",
" 6 | \n",
"
\n",
" \n",
" | 7 | \n",
" Qwen_Qwen3-14B (8bit) | \n",
" 29h 45m | \n",
" 17h 29m | \n",
" 0.3926 | \n",
" 7 | \n",
"
\n",
" \n",
" | 8 | \n",
" meta-llama_Meta-Llama-3-8B-Instruct | \n",
" 6h 30m | \n",
" 5h 46m | \n",
" 0.3923 | \n",
" 8 | \n",
"
\n",
" \n",
" | 9 | \n",
" Qwen_Qwen2.5-7B-Instruct | \n",
" 9h 36m | \n",
" 8h 33m | \n",
" 0.3810 | \n",
" 9 | \n",
"
\n",
" \n",
" | 10 | \n",
" google_gemma-3-12b-it | \n",
" 15h 45m | \n",
" 14h 8m | \n",
" 0.3791 | \n",
" 10 | \n",
"
\n",
" \n",
" | 11 | \n",
" NousResearch_Hermes-2-Pro-Mistral-7B | \n",
" 8h 27m | \n",
" 7h 28m | \n",
" 0.3719 | \n",
" 11 | \n",
"
\n",
" \n",
" | 12 | \n",
" mistralai_Ministral-8B-Instruct-2410 | \n",
" 10h 46m | \n",
" 9h 27m | \n",
" 0.3683 | \n",
" 12 | \n",
"
\n",
" \n",
" | 13 | \n",
" Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
" 52h 44m | \n",
" 29h 32m | \n",
" 0.3581 | \n",
" 13 | \n",
"
\n",
" \n",
" | 14 | \n",
" Qwen_Qwen3-8B | \n",
" 15h 31m | \n",
" 13h 44m | \n",
" 0.3566 | \n",
" 14 | \n",
"
\n",
" \n",
" | 15 | \n",
" 01-ai_Yi-1.5-9B-Chat | \n",
" 13h 54m | \n",
" 12h 15m | \n",
" 0.3467 | \n",
" 15 | \n",
"
\n",
" \n",
" | 16 | \n",
" meta-llama_Llama-3.2-3B-Instruct | \n",
" 7h 12m | \n",
" 5h 57m | \n",
" 0.3438 | \n",
" 16 | \n",
"
\n",
" \n",
" | 17 | \n",
" Qwen_Qwen2.5-3B-Instruct | \n",
" 7h 48m | \n",
" 6h 30m | \n",
" 0.3406 | \n",
" 17 | \n",
"
\n",
" \n",
" | 18 | \n",
" meta-llama_Llama-2-13b-hf | \n",
" 19h 21m | \n",
" 17h 38m | \n",
" 0.3374 | \n",
" 18 | \n",
"
\n",
" \n",
" | 19 | \n",
" 01-ai_Yi-1.5-6B-Chat | \n",
" 8h 4m | \n",
" 7h 1m | \n",
" 0.3339 | \n",
" 19 | \n",
"
\n",
" \n",
" | 20 | \n",
" google_gemma-3-4b-it | \n",
" 4h 51m | \n",
" 3h 50m | \n",
" 0.3261 | \n",
" 20 | \n",
"
\n",
" \n",
" | 21 | \n",
" Qwen_Qwen3-4B | \n",
" 5h 51m | \n",
" 5h 3m | \n",
" 0.3226 | \n",
" 21 | \n",
"
\n",
" \n",
" | 22 | \n",
" meta-llama_Llama-2-7b-hf | \n",
" 5h 42m | \n",
" 4h 59m | \n",
" 0.3018 | \n",
" 22 | \n",
"
\n",
" \n",
" | 23 | \n",
" deepseek-ai_deepseek-llm-7b-chat | \n",
" 10h 6m | \n",
" 9h 8m | \n",
" 0.3007 | \n",
" 23 | \n",
"
\n",
" \n",
" | 24 | \n",
" Qwen_Qwen2-7B-Instruct | \n",
" 11h 30m | \n",
" 10h 11m | \n",
" 0.2919 | \n",
" 24 | \n",
"
\n",
" \n",
" | 25 | \n",
" Qwen_Qwen2.5-1.5B-Instruct | \n",
" 3h 20m | \n",
" 2h 36m | \n",
" 0.2903 | \n",
" 25 | \n",
"
\n",
" \n",
" | 26 | \n",
" meta-llama_Llama-2-13b-chat-hf | \n",
" 17h 8m | \n",
" 15h 37m | \n",
" 0.2864 | \n",
" 26 | \n",
"
\n",
" \n",
" | 27 | \n",
" deepseek-ai_deepseek-llm-7b-base | \n",
" 7h 11m | \n",
" 6h 26m | \n",
" 0.2864 | \n",
" 27 | \n",
"
\n",
" \n",
" | 28 | \n",
" deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
" 17h 57m | \n",
" 15h 30m | \n",
" 0.2834 | \n",
" 28 | \n",
"
\n",
" \n",
" | 29 | \n",
" Qwen_Qwen2.5-Math-7B | \n",
" 27h 21m | \n",
" 24h 38m | \n",
" 0.2827 | \n",
" 29 | \n",
"
\n",
" \n",
" | 30 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
" 11h 46m | \n",
" 10h 36m | \n",
" 0.2805 | \n",
" 30 | \n",
"
\n",
" \n",
" | 31 | \n",
" meta-llama_Llama-3.2-1B-Instruct | \n",
" 3h 30m | \n",
" 2h 35m | \n",
" 0.2731 | \n",
" 31 | \n",
"
\n",
" \n",
" | 32 | \n",
" meta-llama_Llama-2-7b-chat-hf | \n",
" 6h 57m | \n",
" 6h 7m | \n",
" 0.2656 | \n",
" 32 | \n",
"
\n",
" \n",
" | 33 | \n",
" deepseek-ai_deepseek-math-7b-rl | \n",
" 8h 2m | \n",
" 7h 12m | \n",
" 0.2581 | \n",
" 33 | \n",
"
\n",
" \n",
" | 34 | \n",
" Qwen_Qwen3-1.7B | \n",
" 4h 25m | \n",
" 3h 36m | \n",
" 0.2567 | \n",
" 34 | \n",
"
\n",
" \n",
" | 35 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
" 6h 28m | \n",
" 5h 43m | \n",
" 0.2340 | \n",
" 35 | \n",
"
\n",
" \n",
" | 36 | \n",
" Qwen_Qwen2.5-Math-7B-Instruct | \n",
" 5h 37m | \n",
" 4h 57m | \n",
" 0.2276 | \n",
" 36 | \n",
"
\n",
" \n",
" | 37 | \n",
" Qwen_Qwen2.5-0.5B-Instruct | \n",
" 2h 34m | \n",
" 1h 48m | \n",
" 0.2218 | \n",
" 37 | \n",
"
\n",
" \n",
" | 38 | \n",
" google_gemma-3-1b-it | \n",
" 6h 50m | \n",
" 4h 52m | \n",
" 0.2202 | \n",
" 38 | \n",
"
\n",
" \n",
" | 39 | \n",
" Qwen_Qwen3-0.6B | \n",
" 3h 45m | \n",
" 2h 53m | \n",
" 0.2000 | \n",
" 39 | \n",
"
\n",
" \n",
" | 40 | \n",
" Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
" 3h 25m | \n",
" 2h 39m | \n",
" 0.1983 | \n",
" 40 | \n",
"
\n",
" \n",
" | 41 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
" 3h 40m | \n",
" 2h 52m | \n",
" 0.1954 | \n",
" 41 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Model Name Total Time GPU Util Time \\\n",
"1 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
"2 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
"3 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
"4 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
"5 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
"6 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
"7 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
"8 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
"9 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
"10 google_gemma-3-12b-it 15h 45m 14h 8m \n",
"11 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
"12 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
"13 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
"14 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
"15 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
"16 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
"17 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
"18 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
"19 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
"20 google_gemma-3-4b-it 4h 51m 3h 50m \n",
"21 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
"22 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
"23 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
"24 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
"25 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
"26 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
"27 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
"28 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
"29 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
"30 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
"31 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
"32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
"33 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
"34 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
"35 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
"36 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
"37 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
"38 google_gemma-3-1b-it 6h 50m 4h 52m \n",
"39 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
"40 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
"41 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
"\n",
" Knowledge & Reading Mean Score Knowledge & Reading Avg. Rank \n",
"1 0.4369 1 \n",
"2 0.4136 2 \n",
"3 0.4127 3 \n",
"4 0.4063 4 \n",
"5 0.4045 5 \n",
"6 0.3963 6 \n",
"7 0.3926 7 \n",
"8 0.3923 8 \n",
"9 0.3810 9 \n",
"10 0.3791 10 \n",
"11 0.3719 11 \n",
"12 0.3683 12 \n",
"13 0.3581 13 \n",
"14 0.3566 14 \n",
"15 0.3467 15 \n",
"16 0.3438 16 \n",
"17 0.3406 17 \n",
"18 0.3374 18 \n",
"19 0.3339 19 \n",
"20 0.3261 20 \n",
"21 0.3226 21 \n",
"22 0.3018 22 \n",
"23 0.3007 23 \n",
"24 0.2919 24 \n",
"25 0.2903 25 \n",
"26 0.2864 26 \n",
"27 0.2864 27 \n",
"28 0.2834 28 \n",
"29 0.2827 29 \n",
"30 0.2805 30 \n",
"31 0.2731 31 \n",
"32 0.2656 32 \n",
"33 0.2581 33 \n",
"34 0.2567 34 \n",
"35 0.2340 35 \n",
"36 0.2276 36 \n",
"37 0.2218 37 \n",
"38 0.2202 38 \n",
"39 0.2000 39 \n",
"40 0.1983 40 \n",
"41 0.1954 41 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| Model Name | Total Time | GPU Util Time | Knowledge & Reading Mean Score | Knowledge & Reading Avg. Rank |\n",
"|:------------------------------------------|:-------------|:----------------|---------------------------------:|--------------------------------:|\n",
"| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.4369 | 1 |\n",
"| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.4136 | 2 |\n",
"| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.4127 | 3 |\n",
"| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.4063 | 4 |\n",
"| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.4045 | 5 |\n",
"| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.3963 | 6 |\n",
"| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.3926 | 7 |\n",
"| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.3923 | 8 |\n",
"| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.381 | 9 |\n",
"| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.3791 | 10 |\n",
"| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.3719 | 11 |\n",
"| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.3683 | 12 |\n",
"| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.3581 | 13 |\n",
"| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.3566 | 14 |\n",
"| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.3467 | 15 |\n",
"| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.3438 | 16 |\n",
"| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.3406 | 17 |\n",
"| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.3374 | 18 |\n",
"| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.3339 | 19 |\n",
"| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.3261 | 20 |\n",
"| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.3226 | 21 |\n",
"| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.3018 | 22 |\n",
"| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.3007 | 23 |\n",
"| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.2919 | 24 |\n",
"| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.2903 | 25 |\n",
"| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.2864 | 26 |\n",
"| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.2864 | 27 |\n",
"| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.2834 | 28 |\n",
"| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.2827 | 29 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.2805 | 30 |\n",
"| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.2731 | 31 |\n",
"| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.2656 | 32 |\n",
"| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.2581 | 33 |\n",
"| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.2567 | 34 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.234 | 35 |\n",
"| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.2276 | 36 |\n",
"| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.2218 | 37 |\n",
"| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.2202 | 38 |\n",
"| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.2 | 39 |\n",
"| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.1983 | 40 |\n",
"| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.1954 | 41 |\n",
"\n",
"\n"
]
}
],
"source": [
"column_map = {\n",
" \"overall\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Mean Score', \"Overall Rank\"],\n",
" \"reasoning_and_math\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Reasoning & Math Mean Score', \"Reasoning & Math Avg. Rank\"],\n",
" \"commonsense_and_nli\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Commonsense & NLI Mean Score', \"Commonsense & NLI Avg. Rank\"],\n",
" \"knowledge_and_reading\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Knowledge & Reading Mean Score', \"Knowledge & Reading Avg. Rank\"]\n",
"}\n",
"\n",
"\n",
"\n",
"# Produce sub-dataframes and export them to csv and excel file.\n",
"with pd.ExcelWriter(\"/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_all_results.xlsx\") as writer:\n",
" df_display.to_excel(writer, sheet_name=\"Master\", index=False)\n",
" \n",
" for name, cols in column_map.items():\n",
" sub_df = df_display[cols].copy()\n",
" rank_col = [c for c in sub_df.columns if 'Rank' in c][0]\n",
" sub_df = sub_df.sort_values(by=rank_col, ascending=True).reset_index(drop=True)\n",
" sub_df.index = sub_df.index + 1\n",
" print(name)\n",
" if name == 'overall':\n",
" overall_df = sub_df\n",
" display(sub_df)\n",
" \n",
" # sub_df.to_csv(f\"/mnt/data8tb/Documents/project/benchmark_project/{name}_rank.csv\")\n",
" # sub_df.to_excel(writer, sheet_name=name, index=False)\n",
"\n",
" table_md = sub_df.to_markdown(index=False)\n",
" print(table_md)\n",
"\n",
" sub_df.to_html(f\"{name}.html\", index=False)\n",
" print()\n",
" print()\n",
"\n"
]
},
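  {
   "cell_type": "markdown",
   "id": "b7e1c2aa-rank-sketch-note",
   "metadata": {},
   "source": [
    "The exported views above rely on mean-score and average-rank columns computed earlier in the notebook. As a hedged illustration of that aggregation step only, the next cell builds a category mean and rank on a toy DataFrame; the model names and the `boolq`/`piqa` columns are hypothetical stand-ins, not benchmark results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7e1c2ab-rank-sketch-code",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal, self-contained sketch of the category ranking scheme (toy data only;\n",
    "# the real scores come from the harness JSON files parsed earlier).\n",
    "import pandas as pd\n",
    "\n",
    "toy = pd.DataFrame({\n",
    "    'Model Name': ['model_a', 'model_b', 'model_c'],  # hypothetical models\n",
    "    'boolq': [0.82, 0.74, 0.69],                      # hypothetical scores\n",
    "    'piqa': [0.79, 0.77, 0.70],\n",
    "})\n",
    "\n",
    "category_cols = ['boolq', 'piqa']  # assumed members of one category\n",
    "toy['Category Mean Score'] = toy[category_cols].mean(axis=1).round(4)\n",
    "# rank(ascending=False): the highest mean score gets rank 1\n",
    "toy['Category Avg. Rank'] = toy['Category Mean Score'].rank(ascending=False).astype(int)\n",
    "toy.sort_values('Category Avg. Rank').reset_index(drop=True)"
   ]
  },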
{
"cell_type": "code",
"execution_count": 47,
"id": "5642b72a-e416-482b-b45b-8376fd2571b7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Model Name | \n",
" Total Time | \n",
" GPU Util Time | \n",
" Mean Score | \n",
" Overall Rank | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" google_gemma-3-12b-it | \n",
" 15h 45m | \n",
" 14h 8m | \n",
" 0.6038 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" Qwen_Qwen3-14B (8bit) | \n",
" 29h 45m | \n",
" 17h 29m | \n",
" 0.5961 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" openchat_openchat-3.6-8b-20240522 | \n",
" 7h 51m | \n",
" 6h 59m | \n",
" 0.5871 | \n",
" 3 | \n",
"
\n",
" \n",
" | 4 | \n",
" Qwen_Qwen3-8B | \n",
" 15h 31m | \n",
" 13h 44m | \n",
" 0.5859 | \n",
" 4 | \n",
"
\n",
" \n",
" | 5 | \n",
" Qwen_Qwen2.5-7B-Instruct | \n",
" 9h 36m | \n",
" 8h 33m | \n",
" 0.5788 | \n",
" 5 | \n",
"
\n",
" \n",
" | 6 | \n",
" Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
" 52h 44m | \n",
" 29h 32m | \n",
" 0.5775 | \n",
" 6 | \n",
"
\n",
" \n",
" | 7 | \n",
" 01-ai_Yi-1.5-9B | \n",
" 11h 43m | \n",
" 10h 26m | \n",
" 0.5676 | \n",
" 7 | \n",
"
\n",
" \n",
" | 8 | \n",
" Qwen_Qwen2.5-7B-Instruct-1M | \n",
" 11h 17m | \n",
" 10h 10m | \n",
" 0.5672 | \n",
" 8 | \n",
"
\n",
" \n",
" | 9 | \n",
" meta-llama_Llama-3.1-8B-Instruct | \n",
" 12h 19m | \n",
" 10h 52m | \n",
" 0.5653 | \n",
" 9 | \n",
"
\n",
" \n",
" | 10 | \n",
" 01-ai_Yi-1.5-9B-Chat | \n",
" 13h 54m | \n",
" 12h 15m | \n",
" 0.5621 | \n",
" 10 | \n",
"
\n",
" \n",
" | 11 | \n",
" mistralai_Ministral-8B-Instruct-2410 | \n",
" 10h 46m | \n",
" 9h 27m | \n",
" 0.5576 | \n",
" 11 | \n",
"
\n",
" \n",
" | 12 | \n",
" meta-llama_Meta-Llama-3-8B-Instruct | \n",
" 6h 30m | \n",
" 5h 46m | \n",
" 0.5528 | \n",
" 12 | \n",
"
\n",
" \n",
" | 13 | \n",
" Qwen_Qwen3-4B | \n",
" 5h 51m | \n",
" 5h 3m | \n",
" 0.5510 | \n",
" 13 | \n",
"
\n",
" \n",
" | 14 | \n",
" NousResearch_Hermes-2-Pro-Mistral-7B | \n",
" 8h 27m | \n",
" 7h 28m | \n",
" 0.5480 | \n",
" 14 | \n",
"
\n",
" \n",
" | 15 | \n",
" mistralai_Mistral-7B-Instruct-v0.3 | \n",
" 8h 38m | \n",
" 7h 41m | \n",
" 0.5451 | \n",
" 15 | \n",
"
\n",
" \n",
" | 16 | \n",
" google_gemma-3-4b-it | \n",
" 4h 51m | \n",
" 3h 50m | \n",
" 0.5368 | \n",
" 16 | \n",
"
\n",
" \n",
" | 17 | \n",
" 01-ai_Yi-1.5-6B-Chat | \n",
" 8h 4m | \n",
" 7h 1m | \n",
" 0.5335 | \n",
" 17 | \n",
"
\n",
" \n",
" | 18 | \n",
" 01-ai_Yi-1.5-6B | \n",
" 4h 28m | \n",
" 3h 54m | \n",
" 0.5312 | \n",
" 18 | \n",
"
\n",
" \n",
" | 19 | \n",
" Qwen_Qwen2-7B-Instruct | \n",
" 11h 30m | \n",
" 10h 11m | \n",
" 0.5271 | \n",
" 19 | \n",
"
\n",
" \n",
" | 20 | \n",
" deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
" 17h 57m | \n",
" 15h 30m | \n",
" 0.5219 | \n",
" 20 | \n",
"
\n",
" \n",
" | 21 | \n",
" meta-llama_Llama-3.2-3B-Instruct | \n",
" 7h 12m | \n",
" 5h 57m | \n",
" 0.5048 | \n",
" 21 | \n",
"
\n",
" \n",
" | 22 | \n",
" Qwen_Qwen2.5-3B-Instruct | \n",
" 7h 48m | \n",
" 6h 30m | \n",
" 0.4939 | \n",
" 22 | \n",
"
\n",
" \n",
" | 23 | \n",
" Qwen_Qwen2.5-Math-7B | \n",
" 27h 21m | \n",
" 24h 38m | \n",
" 0.4907 | \n",
" 23 | \n",
"
\n",
" \n",
" | 24 | \n",
" deepseek-ai_deepseek-llm-7b-chat | \n",
" 10h 6m | \n",
" 9h 8m | \n",
" 0.4869 | \n",
" 24 | \n",
"
\n",
" \n",
" | 25 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
" 11h 46m | \n",
" 10h 36m | \n",
" 0.4830 | \n",
" 25 | \n",
"
\n",
" \n",
" | 26 | \n",
" meta-llama_Llama-2-13b-hf | \n",
" 19h 21m | \n",
" 17h 38m | \n",
" 0.4819 | \n",
" 26 | \n",
"
\n",
" \n",
" | 27 | \n",
" meta-llama_Llama-2-13b-chat-hf | \n",
" 17h 8m | \n",
" 15h 37m | \n",
" 0.4813 | \n",
" 27 | \n",
"
\n",
" \n",
" | 28 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
" 6h 28m | \n",
" 5h 43m | \n",
" 0.4644 | \n",
" 28 | \n",
"
\n",
" \n",
" | 29 | \n",
" Qwen_Qwen2.5-1.5B-Instruct | \n",
" 3h 20m | \n",
" 2h 36m | \n",
" 0.4608 | \n",
" 29 | \n",
"
\n",
" \n",
" | 30 | \n",
" Qwen_Qwen3-1.7B | \n",
" 4h 25m | \n",
" 3h 36m | \n",
" 0.4597 | \n",
" 30 | \n",
"
\n",
" \n",
" | 31 | \n",
" Qwen_Qwen2.5-Math-7B-Instruct | \n",
" 5h 37m | \n",
" 4h 57m | \n",
" 0.4596 | \n",
" 31 | \n",
"
\n",
" \n",
" | 32 | \n",
" meta-llama_Llama-2-7b-chat-hf | \n",
" 6h 57m | \n",
" 6h 7m | \n",
" 0.4525 | \n",
" 32 | \n",
"
\n",
" \n",
" | 33 | \n",
" meta-llama_Llama-2-7b-hf | \n",
" 5h 42m | \n",
" 4h 59m | \n",
" 0.4516 | \n",
" 33 | \n",
"
\n",
" \n",
" | 34 | \n",
" deepseek-ai_deepseek-llm-7b-base | \n",
" 7h 11m | \n",
" 6h 26m | \n",
" 0.4451 | \n",
" 34 | \n",
"
\n",
" \n",
" | 35 | \n",
" deepseek-ai_deepseek-math-7b-rl | \n",
" 8h 2m | \n",
" 7h 12m | \n",
" 0.4419 | \n",
" 35 | \n",
"
\n",
" \n",
" | 36 | \n",
" meta-llama_Llama-3.2-1B-Instruct | \n",
" 3h 30m | \n",
" 2h 35m | \n",
" 0.4219 | \n",
" 36 | \n",
"
\n",
" \n",
" | 37 | \n",
" google_gemma-3-1b-it | \n",
" 6h 50m | \n",
" 4h 52m | \n",
" 0.4013 | \n",
" 37 | \n",
"
\n",
" \n",
" | 38 | \n",
" deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
" 3h 40m | \n",
" 2h 52m | \n",
" 0.3986 | \n",
" 38 | \n",
"
\n",
" \n",
" | 39 | \n",
" Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
" 3h 25m | \n",
" 2h 39m | \n",
" 0.3838 | \n",
" 39 | \n",
"
\n",
" \n",
" | 40 | \n",
" Qwen_Qwen3-0.6B | \n",
" 3h 45m | \n",
" 2h 53m | \n",
" 0.3816 | \n",
" 40 | \n",
"
\n",
" \n",
" | 41 | \n",
" Qwen_Qwen2.5-0.5B-Instruct | \n",
" 2h 34m | \n",
" 1h 48m | \n",
" 0.3799 | \n",
" 41 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Model Name Total Time GPU Util Time \\\n",
"1 google_gemma-3-12b-it 15h 45m 14h 8m \n",
"2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
"3 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
"4 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
"5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
"6 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
"7 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
"8 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
"9 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
"10 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
"11 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
"12 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
"13 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
"14 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
"15 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
"16 google_gemma-3-4b-it 4h 51m 3h 50m \n",
"17 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
"18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
"19 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
"20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
"21 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
"22 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
"23 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
"24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
"25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
"26 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
"27 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
"28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
"29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
"30 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
"31 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
"32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
"33 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
"34 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
"35 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
"36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
"37 google_gemma-3-1b-it 6h 50m 4h 52m \n",
"38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
"39 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
"40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
"41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
"\n",
" Mean Score Overall Rank \n",
"1 0.6038 1 \n",
"2 0.5961 2 \n",
"3 0.5871 3 \n",
"4 0.5859 4 \n",
"5 0.5788 5 \n",
"6 0.5775 6 \n",
"7 0.5676 7 \n",
"8 0.5672 8 \n",
"9 0.5653 9 \n",
"10 0.5621 10 \n",
"11 0.5576 11 \n",
"12 0.5528 12 \n",
"13 0.5510 13 \n",
"14 0.5480 14 \n",
"15 0.5451 15 \n",
"16 0.5368 16 \n",
"17 0.5335 17 \n",
"18 0.5312 18 \n",
"19 0.5271 19 \n",
"20 0.5219 20 \n",
"21 0.5048 21 \n",
"22 0.4939 22 \n",
"23 0.4907 23 \n",
"24 0.4869 24 \n",
"25 0.4830 25 \n",
"26 0.4819 26 \n",
"27 0.4813 27 \n",
"28 0.4644 28 \n",
"29 0.4608 29 \n",
"30 0.4597 30 \n",
"31 0.4596 31 \n",
"32 0.4525 32 \n",
"33 0.4516 33 \n",
"34 0.4451 34 \n",
"35 0.4419 35 \n",
"36 0.4219 36 \n",
"37 0.4013 37 \n",
"38 0.3986 38 \n",
"39 0.3838 39 \n",
"40 0.3816 40 \n",
"41 0.3799 41 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(overall_df)\n",
"overall_df.to_html(\"overall.html\", index=False)"
]
},
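  {
   "cell_type": "markdown",
   "id": "b7e1c2ac-html-roundtrip-note",
   "metadata": {},
   "source": [
    "Optional sanity check, assuming `overall.html` was just written by the cell above: `pd.read_html` parses the exported file back into a DataFrame so it can be compared against `overall_df`. This is a verification sketch only, not part of the benchmark pipeline (it needs an HTML parser such as `lxml` installed)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7e1c2ad-html-roundtrip-code",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-read the exported table; read_html returns one DataFrame per <table>\n",
    "# element in the file, and overall.html contains exactly one.\n",
    "roundtrip = pd.read_html('overall.html')[0]\n",
    "\n",
    "# Column names and row count should survive the HTML round trip.\n",
    "assert list(roundtrip.columns) == list(overall_df.columns)\n",
    "assert len(roundtrip) == len(overall_df)\n",
    "roundtrip.head()"
   ]
  }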
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}