|
|
|
|
|
|
|
|
<!doctype html> |
|
|
<html lang="en"> |
|
|
<head> |
|
|
<meta charset="utf-8" /> |
|
|
<meta name="viewport" content="width=device-width, initial-scale=1" /> |
|
|
<title>LLM Benchmarks — Overall Rank</title> |
|
|
|
|
|
<style> |
|
|
|
|
|
:root { |
|
|
--bg: #0b1220; |
|
|
--panel: #0e1626; |
|
|
--panel-2: #0b1220; |
|
|
--border: #1f2937; |
|
|
--text: #e5e7eb; |
|
|
--muted: #a3aab8; |
|
|
--head-bg: #111827; |
|
|
--head-fg: #ffffff; |
|
|
--accent: #60a5fa; |
|
|
--shadow: 0 8px 24px rgba(0,0,0,.35); |
|
|
} |
|
|
html, body { height: 100%; } |
|
|
body { |
|
|
margin: 0; |
|
|
background: var(--bg); |
|
|
color: var(--text); |
|
|
font: 14px/1.45 Inter, Roboto, "Helvetica Neue", Arial, system-ui, -apple-system, Segoe UI, Noto Sans, sans-serif; |
|
|
-webkit-font-smoothing: antialiased; |
|
|
font-feature-settings: "tnum" 1, "lnum" 1; |
|
|
} |
|
|
|
|
|
|
|
|
.wrap { |
|
|
max-width: 1200px; |
|
|
margin: 15px auto 48px; |
|
|
padding: 0 16px; |
|
|
} |
|
|
|
|
|
.title { |
|
|
margin: 0 0 25px; |
|
|
font-size: 22px; |
|
|
font-weight: 800; |
|
|
letter-spacing: .2px; |
|
|
} |
|
|
.subtitle { |
|
|
margin: 0 0 20px; |
|
|
color: var(--muted); |
|
|
font-size: 13px; |
|
|
} |
|
|
|
|
|
|
|
|
.table-card { |
|
|
background: var(--panel); |
|
|
border: 1px solid var(--border); |
|
|
border-radius: 14px; |
|
|
box-shadow: var(--shadow); |
|
|
overflow: auto; |
|
|
} |
|
|
|
|
|
|
|
|
table { |
|
|
width: 100%; |
|
|
border-collapse: separate; |
|
|
border-spacing: 0; |
|
|
min-width: 720px; |
|
|
} |
|
|
thead th { |
|
|
position: sticky; |
|
|
top: 0; |
|
|
z-index: 2; |
|
|
background: var(--head-bg); |
|
|
color: var(--head-fg); |
|
|
text-align: left; |
|
|
font-weight: 700; |
|
|
padding: 12px 14px; |
|
|
border-bottom: 1px solid var(--border); |
|
|
} |
|
|
tbody td { |
|
|
padding: 10px 14px; |
|
|
border-bottom: 1px solid var(--border); |
|
|
vertical-align: middle; |
|
|
} |
|
|
tbody tr:nth-child(odd) { background: var(--panel-2); } |
|
|
tbody tr:nth-child(even) { background: var(--panel); } |
|
|
tbody tr:hover { outline: 2px solid rgba(96,165,250,.35); outline-offset: -2px; } |
|
|
|
|
|
article.card:hover, |
|
|
section.card:hover { |
|
|
outline: 2px solid rgba(96,165,250,.35); |
|
|
outline-offset: -2px; |
|
|
border-radius: 10px; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tbody td:first-child, thead th:first-child { text-align: left; } |
|
|
tbody td:not(:first-child), thead th:not(:first-child) { text-align: right; } |
|
|
|
|
|
|
|
|
thead th:first-child { border-top-left-radius: 14px; } |
|
|
thead th:last-child { border-top-right-radius: 14px; } |
|
|
tbody tr:last-child td:first-child { border-bottom-left-radius: 14px; } |
|
|
tbody tr:last-child td:last-child { border-bottom-right-radius: 14px; } |
|
|
|
|
|
|
|
|
.top1 td, .top2 td, .top3 td { font-weight: 800; } |
|
|
.medal { margin-left: 6px; font-size: 13px; opacity: .95; } |
|
|
|
|
|
|
|
|
caption { |
|
|
caption-side: top; |
|
|
padding: 14px 14px 6px; |
|
|
text-align: left; |
|
|
color: var(--muted); |
|
|
font-weight: 600; |
|
|
letter-spacing: .2px; |
|
|
} |
|
|
|
|
|
|
|
|
a { color: var(--accent); text-decoration: none; } |
|
|
a:hover { text-decoration: underline; } |
|
|
|
|
|
|
|
|
|
|
|
.lb-container, .lb-divider { |
|
|
max-width: 1200px; |
|
|
margin: 0 auto; |
|
|
padding: 0 12px; |
|
|
} |
|
|
|
|
|
|
|
|
.lb-hero { margin: 28px 0 10px; } |
|
|
.lb-hero h1 { |
|
|
margin: 0 0 6px; |
|
|
font-size: 30px; |
|
|
font-weight: 800; |
|
|
letter-spacing: .2px; |
|
|
} |
|
|
|
|
|
.lb-tagline { |
|
|
margin: 10px 0 2px; |
|
|
max-width: 860px; |
|
|
text-align: left; |
|
|
color: rgba(255, 255, 255, 0.85); |
|
|
font-size: 15px; |
|
|
line-height: 1; |
|
|
} |
|
|
|
|
|
.lb-tagline p { |
|
|
margin: 4px 0; |
|
|
} |
|
|
|
|
|
|
|
|
.lb-meta { |
|
|
margin-top: 10px; |
|
|
display: flex; align-items: center; gap: 8px; flex-wrap: wrap; |
|
|
} |
|
|
.badge { |
|
|
font-size: 12px; |
|
|
padding: 4px 8px; |
|
|
border-radius: 999px; |
|
|
border: 1px solid var(--border, rgba(255,255,255,.08)); |
|
|
background: var(--panel, rgba(255,255,255,.03)); |
|
|
} |
|
|
.meta-spacer { flex: 1 1 auto; } |
|
|
.updated { font-size: 12px; opacity: .75; } |
|
|
|
|
|
|
|
|
.card-grid { |
|
|
display: grid; |
|
|
gap: 12px; |
|
|
grid-template-columns: repeat(3, 1fr); |
|
|
margin: 16px 0 12px; |
|
|
} |
|
|
@media (max-width: 980px) { .card-grid { grid-template-columns: 1fr; } } |
|
|
|
|
|
.card { |
|
|
background: var(--panel, rgba(255,255,255,.03)); |
|
|
border: 1px solid var(--border, rgba(255,255,255,.08)); |
|
|
border-radius: 14px; |
|
|
padding: 14px; |
|
|
box-shadow: var(--shadow, 0 2px 12px rgba(0,0,0,.25)); |
|
|
} |
|
|
.card h3 { margin: 0 0 2px; font-size: 16px; } |
|
|
.muted { opacity: .75; font-size: 13px; margin: 0 0 8px; } |
|
|
|
|
|
|
|
|
.chips { display: flex; flex-wrap: wrap; gap: 8px; } |
|
|
.chip { |
|
|
font-size: 12px; |
|
|
padding: 4px 8px; |
|
|
border-radius: 8px; |
|
|
background: rgba(255,255,255,.06); |
|
|
border: 1px solid rgba(255,255,255,.08); |
|
|
} |
|
|
|
|
|
|
|
|
.card-defs h3 { margin: 2px 0 8px; } |
|
|
.defs { display: grid; grid-template-columns: 220px 1fr; gap: 8px 16px; } |
|
|
@media (max-width: 800px) { .defs { grid-template-columns: 1fr; } } |
|
|
.defs dt { font-weight: 700; } |
|
|
.defs dd { margin: 0; opacity: .92; } |
|
|
.defs code { |
|
|
background: rgba(255,255,255,.06); |
|
|
padding: 1px 4px; border-radius: 4px; |
|
|
} |
|
|
|
|
|
|
|
|
.link-row { |
|
|
display: flex; flex-wrap: wrap; gap: 10px; |
|
|
margin: 12px 0 18px; |
|
|
} |
|
|
.btn { |
|
|
display: inline-block; |
|
|
padding: 8px 12px; |
|
|
border-radius: 10px; |
|
|
border: 1px solid var(--border, rgba(255,255,255,.10)); |
|
|
background: linear-gradient(180deg, rgba(255,255,255,.06), rgba(255,255,255,.02)); |
|
|
color: var(--accent, #8ab4ff); text-decoration: none; |
|
|
font-weight: 600; font-size: 14px; |
|
|
} |
|
|
.btn:hover { text-decoration: underline; } |
|
|
.btn.ghost { background: transparent; } |
|
|
|
|
|
|
|
|
.lb-divider { |
|
|
height: 1px; border: none; |
|
|
background: linear-gradient(to right, transparent, rgba(255,255,255,.12), transparent); |
|
|
margin: 18px 0 14px; |
|
|
} |
|
|
|
|
|
.card h3 { |
|
|
margin-bottom: 10px; |
|
|
} |
|
|
|
|
|
.equation-svg { |
|
|
display: flex; |
|
|
align-items: center; |
|
|
margin: 1.5em 2em 1em; |
|
|
} |
|
|
|
|
|
.equation-svg img { |
|
|
max-width: 100%; |
|
|
height: auto; |
|
|
display: block; |
|
|
} |
|
|
|
|
|
|
|
|
.card-doc { |
|
|
margin-top: 16px; |
|
|
} |
|
|
|
|
|
.card-doc h3 { |
|
|
margin: 4px 0 10px; |
|
|
font-size: 18px; |
|
|
font-weight: 700; |
|
|
} |
|
|
|
|
|
.card-doc .lead { |
|
|
margin: 6px 0 12px; |
|
|
opacity: .92; |
|
|
line-height: 1.6; |
|
|
} |
|
|
|
|
|
.contents-list { |
|
|
margin: 0 0 12px 0; |
|
|
padding-left: 22px; |
|
|
} |
|
|
|
|
|
.contents-list li { |
|
|
margin: 6px 0; |
|
|
line-height: 1.6; |
|
|
} |
|
|
|
|
|
.ref-note { |
|
|
margin-top: 12px; |
|
|
padding-top: 10px; |
|
|
border-top: 1px solid rgba(255,255,255,.08); |
|
|
opacity: .9; |
|
|
} |
|
|
|
|
|
.ref-note a { |
|
|
color: var(--accent, #8ab4ff); |
|
|
text-decoration: underline; |
|
|
} |
|
|
.ref-note a:hover { text-decoration: none; } |
|
|
|
|
|
|
|
|
.env-note { |
|
|
margin-top: 1rem; |
|
|
font-style: italic; |
|
|
color: #8dd98d; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
</style> |
|
|
|
|
|
<!-- polyfill.io was compromised in 2024; use Cloudflare's mirror of the same service -->
<script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>
|
|
<script id="MathJax-script" async |
|
|
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</head> |
|
|
<body> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<section class="lb-container"> |
|
|
|
|
|
|
|
|
<header class="lb-hero"> |
|
|
<h1>41 Open-Source LLMs Evaluated Locally on 19 Benchmarks ⚡</h1> |
|
|
<p class="lb-tagline"> |
|
|
Evaluations run with the lm-evaluation-harness library on a local workstation. |
|
|
</p> |
|
|
<p class="lb-tagline">Benchmarks are grouped into three categories, with the corresponding tasks and metrics listed below. |
|
|
</p> |
|
|
</header> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<div class="card-grid"> |
|
|
<article class="card"> |
|
|
<h3>Reasoning & Math</h3> |
|
|
|
|
|
<div class="chips"> |
|
|
<span class="chip">gsm8k(exact_match,strict-match)</span> |
|
|
<span class="chip">bbh(exact_match,get-answer)</span> |
|
|
<span class="chip">arc_challenge(acc_norm,none)</span> |
|
|
<span class="chip">anli_r1(acc,none)</span> |
|
|
<span class="chip">anli_r2(acc,none)</span> |
|
|
<span class="chip">anli_r3(acc,none)</span> |
|
|
<span class="chip">gpqa_main_zeroshot(acc_norm,none)</span>
|
|
</div> |
|
|
</article> |
|
|
|
|
|
<article class="card"> |
|
|
<h3>Commonsense & Natural Language Inference</h3> |
|
|
|
|
|
<div class="chips"> |
|
|
<span class="chip">hellaswag(acc_norm,none)</span> |
|
|
<span class="chip">piqa(acc_norm,none)</span> |
|
|
<span class="chip">winogrande(acc,none)</span> |
|
|
<span class="chip">boolq(acc,none)</span> |
|
|
<span class="chip">openbookqa(acc_norm,none)</span> |
|
|
<span class="chip">sciq(acc_norm,none)</span> |
|
|
<span class="chip">qnli(acc,none)</span> |
|
|
</div> |
|
|
</article> |
|
|
|
|
|
<article class="card"> |
|
|
<h3>Knowledge & Reading</h3> |
|
|
|
|
|
<div class="chips"> |
|
|
<span class="chip">mmlu(acc,none)</span> |
|
|
<span class="chip">nq_open(exact_match,remove_whitespace)</span> |
|
|
<span class="chip">drop(f1,none)</span> |
|
|
<span class="chip">truthfulqa_mc1(acc,none)</span> |
|
|
<span class="chip">truthfulqa_mc2(acc,none)</span> |
|
|
<span class="chip">triviaqa(exact_match,remove_whitespace)</span> |
|
|
</div> |
|
|
</article> |
|
|
</div> |
|
|
|
|
|
|
|
|
<section class="card card-defs"> |
|
|
<h3>Table Fields</h3> |
|
|
<dl class="defs"> |
|
|
<dt>Model Name</dt> |
|
|
<dd>Listed as <code>Company_ModelName</code>. If quantized, <code>(8bit)</code> is appended.</dd> |
|
|
|
|
|
<dt>Total Time</dt> |
|
|
<dd>Wall-clock time for the full evaluation run per model.</dd> |
|
|
|
|
|
<dt>GPU Util Time</dt> |
|
|
<dd> |
|
|
Equivalent RTX 5090 GPU time at 100% utilization. GPU utilization was logged once per minute, |
|
|
and the effective utilization time was calculated using the following equation: |
|
|
|
|
|
<div class="equation-svg"> |
|
|
<img src="results_json/equation.svg" alt="T_util-100% = Σ (u_i / 100) Δt" /> |
|
|
</div> |
|
|
<ul> |
|
|
<li><span class="muted">interval:</span> 1 minute</li> |
|
|
<li><span class="muted">u<sub>i</sub>:</span> GPU utilization (%) during interval i</li> |
|
|
<li><span class="muted">Δt:</span> duration of interval i (s)</li> |
|
|
</ul> |
|
|
</dd> |
|
|
|
|
|
<dt>Mean Score</dt> |
|
|
<dd>Arithmetic mean across the 19 tasks (0–1; higher is better). Unweighted.</dd> |
|
|
|
|
|
<dt>Overall Rank</dt> |
|
|
<dd>Rank by Mean Score (1 = best).</dd> |
|
|
</dl> |
|
|
</section> |
|
|
|
|
|
<section class="card card-doc"> |
|
|
<h3>Repository Contents</h3> |
|
|
|
|
|
|
|
|
<ul class="contents-list"> |
|
|
<li> |
|
|
<strong>Scripts</strong> — scripts for running benchmarks and collecting metrics/logs. |
|
|
</li> |
|
|
<li> |
|
|
<strong>Notebooks</strong> — Jupyter notebook for table generation and post-processing. |
|
|
</li> |
|
|
<li> |
|
|
<strong>Results (CSV / Excel)</strong> — tables for each ranking category, overall rankings, and a master table with all scores and fields. |
|
|
</li> |
|
|
<li> |
|
|
<strong>Raw JSON</strong> — per-run outputs from <code>lm-evaluation-harness</code>. |
|
|
</li> |
|
|
<li> |
|
|
<strong>Stdout logs</strong> — per-run console logs. |
|
|
</li> |
|
|
<li> |
|
|
<strong>GPU utilization logs</strong> — minute-level logs of GPU utilization and memory usage, used to compute <em>GPU Util Time</em>. |
|
|
</li> |
|
|
</ul> |
|
|
|
|
|
<div class="ref-note"> |
|
|
Benchmarks were referenced from |
|
|
<a href="https://github.com/leobeeson/llm_benchmarks" target="_blank" rel="noopener"> |
|
|
leobeeson/llm_benchmarks |
|
|
</a>. |
|
|
Detailed descriptions of each benchmark can be found in this repository. |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
<hr class="lb-divider"> |
|
|
|
|
|
</section> |
|
|
|
|
|
<script>
  // Stamp today's date (YYYY-MM-DD, UTC) into the "last updated" badge, if one exists.
  // NOTE(review): no element with id="last-updated" is visible in this document —
  // confirm the badge markup exists (e.g. inside .lb-meta) or this is a no-op.
  (() => {
    const badge = document.getElementById('last-updated');
    if (badge === null) return;
    badge.textContent = new Date().toISOString().slice(0, 10);
  })();
</script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<div class="wrap"> |
|
|
<h1 class="title">Overall Rank (Average Rank)</h1> |
|
|
|
|
|
<div class="table-card"> |
|
|
|
|
|
<div id="table-slot-overall">
|
|
|
|
|
|
|
|
<table border="1" class="dataframe"> |
|
|
<thead> |
|
|
<tr style="text-align: right;"> |
|
|
<th>Model Name</th> |
|
|
<th>Total Time</th> |
|
|
<th>GPU Util Time</th> |
|
|
<th>Mean Score</th> |
|
|
<th>Overall Rank</th> |
|
|
</tr> |
|
|
</thead> |
|
|
<tbody> |
|
|
<tr> |
|
|
<td>google_gemma-3-12b-it</td> |
|
|
<td>15h 45m</td> |
|
|
<td>14h 8m</td> |
|
|
<td>0.6038</td> |
|
|
<td>1</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-14B (8bit)</td> |
|
|
<td>29h 45m</td> |
|
|
<td>17h 29m</td> |
|
|
<td>0.5961</td> |
|
|
<td>2</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>openchat_openchat-3.6-8b-20240522</td> |
|
|
<td>7h 51m</td> |
|
|
<td>6h 59m</td> |
|
|
<td>0.5871</td> |
|
|
<td>3</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-8B</td> |
|
|
<td>15h 31m</td> |
|
|
<td>13h 44m</td> |
|
|
<td>0.5859</td> |
|
|
<td>4</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-7B-Instruct</td> |
|
|
<td>9h 36m</td> |
|
|
<td>8h 33m</td> |
|
|
<td>0.5788</td> |
|
|
<td>5</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-14B-Instruct (8bit)</td> |
|
|
<td>52h 44m</td> |
|
|
<td>29h 32m</td> |
|
|
<td>0.5775</td> |
|
|
<td>6</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-9B</td> |
|
|
<td>11h 43m</td> |
|
|
<td>10h 26m</td> |
|
|
<td>0.5676</td> |
|
|
<td>7</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-7B-Instruct-1M</td> |
|
|
<td>11h 17m</td> |
|
|
<td>10h 10m</td> |
|
|
<td>0.5672</td> |
|
|
<td>8</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.1-8B-Instruct</td> |
|
|
<td>12h 19m</td> |
|
|
<td>10h 52m</td> |
|
|
<td>0.5653</td> |
|
|
<td>9</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-9B-Chat</td> |
|
|
<td>13h 54m</td> |
|
|
<td>12h 15m</td> |
|
|
<td>0.5621</td> |
|
|
<td>10</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>mistralai_Ministral-8B-Instruct-2410</td> |
|
|
<td>10h 46m</td> |
|
|
<td>9h 27m</td> |
|
|
<td>0.5576</td> |
|
|
<td>11</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Meta-Llama-3-8B-Instruct</td> |
|
|
<td>6h 30m</td> |
|
|
<td>5h 46m</td> |
|
|
<td>0.5528</td> |
|
|
<td>12</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-4B</td> |
|
|
<td>5h 51m</td> |
|
|
<td>5h 3m</td> |
|
|
<td>0.5510</td> |
|
|
<td>13</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>NousResearch_Hermes-2-Pro-Mistral-7B</td> |
|
|
<td>8h 27m</td> |
|
|
<td>7h 28m</td> |
|
|
<td>0.5480</td> |
|
|
<td>14</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>mistralai_Mistral-7B-Instruct-v0.3</td> |
|
|
<td>8h 38m</td> |
|
|
<td>7h 41m</td> |
|
|
<td>0.5451</td> |
|
|
<td>15</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-4b-it</td> |
|
|
<td>4h 51m</td> |
|
|
<td>3h 50m</td> |
|
|
<td>0.5368</td> |
|
|
<td>16</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-6B-Chat</td> |
|
|
<td>8h 4m</td> |
|
|
<td>7h 1m</td> |
|
|
<td>0.5335</td> |
|
|
<td>17</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-6B</td> |
|
|
<td>4h 28m</td> |
|
|
<td>3h 54m</td> |
|
|
<td>0.5312</td> |
|
|
<td>18</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2-7B-Instruct</td> |
|
|
<td>11h 30m</td> |
|
|
<td>10h 11m</td> |
|
|
<td>0.5271</td> |
|
|
<td>19</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-0528-Qwen3-8B</td> |
|
|
<td>17h 57m</td> |
|
|
<td>15h 30m</td> |
|
|
<td>0.5219</td> |
|
|
<td>20</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.2-3B-Instruct</td> |
|
|
<td>7h 12m</td> |
|
|
<td>5h 57m</td> |
|
|
<td>0.5048</td> |
|
|
<td>21</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-3B-Instruct</td> |
|
|
<td>7h 48m</td> |
|
|
<td>6h 30m</td> |
|
|
<td>0.4939</td> |
|
|
<td>22</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-7B</td> |
|
|
<td>27h 21m</td> |
|
|
<td>24h 38m</td> |
|
|
<td>0.4907</td> |
|
|
<td>23</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-llm-7b-chat</td> |
|
|
<td>10h 6m</td> |
|
|
<td>9h 8m</td> |
|
|
<td>0.4869</td> |
|
|
<td>24</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Llama-8B</td> |
|
|
<td>11h 46m</td> |
|
|
<td>10h 36m</td> |
|
|
<td>0.4830</td> |
|
|
<td>25</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-13b-hf</td> |
|
|
<td>19h 21m</td> |
|
|
<td>17h 38m</td> |
|
|
<td>0.4819</td> |
|
|
<td>26</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-13b-chat-hf</td> |
|
|
<td>17h 8m</td> |
|
|
<td>15h 37m</td> |
|
|
<td>0.4813</td> |
|
|
<td>27</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Qwen-7B</td> |
|
|
<td>6h 28m</td> |
|
|
<td>5h 43m</td> |
|
|
<td>0.4644</td> |
|
|
<td>28</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-1.5B-Instruct</td> |
|
|
<td>3h 20m</td> |
|
|
<td>2h 36m</td> |
|
|
<td>0.4608</td> |
|
|
<td>29</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-1.7B</td> |
|
|
<td>4h 25m</td> |
|
|
<td>3h 36m</td> |
|
|
<td>0.4597</td> |
|
|
<td>30</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-7B-Instruct</td> |
|
|
<td>5h 37m</td> |
|
|
<td>4h 57m</td> |
|
|
<td>0.4596</td> |
|
|
<td>31</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-7b-chat-hf</td> |
|
|
<td>6h 57m</td> |
|
|
<td>6h 7m</td> |
|
|
<td>0.4525</td> |
|
|
<td>32</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-7b-hf</td> |
|
|
<td>5h 42m</td> |
|
|
<td>4h 59m</td> |
|
|
<td>0.4516</td> |
|
|
<td>33</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-llm-7b-base</td> |
|
|
<td>7h 11m</td> |
|
|
<td>6h 26m</td> |
|
|
<td>0.4451</td> |
|
|
<td>34</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-math-7b-rl</td> |
|
|
<td>8h 2m</td> |
|
|
<td>7h 12m</td> |
|
|
<td>0.4419</td> |
|
|
<td>35</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.2-1B-Instruct</td> |
|
|
<td>3h 30m</td> |
|
|
<td>2h 35m</td> |
|
|
<td>0.4219</td> |
|
|
<td>36</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-1b-it</td> |
|
|
<td>6h 50m</td> |
|
|
<td>4h 52m</td> |
|
|
<td>0.4013</td> |
|
|
<td>37</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B</td> |
|
|
<td>3h 40m</td> |
|
|
<td>2h 52m</td> |
|
|
<td>0.3986</td> |
|
|
<td>38</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-1.5B-Instruct</td> |
|
|
<td>3h 25m</td> |
|
|
<td>2h 39m</td> |
|
|
<td>0.3838</td> |
|
|
<td>39</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-0.6B</td> |
|
|
<td>3h 45m</td> |
|
|
<td>2h 53m</td> |
|
|
<td>0.3816</td> |
|
|
<td>40</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-0.5B-Instruct</td> |
|
|
<td>2h 34m</td> |
|
|
<td>1h 48m</td> |
|
|
<td>0.3799</td> |
|
|
<td>41</td> |
|
|
</tr> |
|
|
</tbody> |
|
|
</table> |
|
|
|
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
<div class="wrap"> |
|
|
<h1 class="title">Reasoning & Math (Average Rank)</h1> |
|
|
|
|
|
<div class="table-card"> |
|
|
|
|
|
<div id="table-slot-reasoning-math">
|
|
|
|
|
|
|
|
<table border="1" class="dataframe"> |
|
|
<thead> |
|
|
<tr style="text-align: right;"> |
|
|
<th>Model Name</th> |
|
|
<th>Total Time</th> |
|
|
<th>GPU Util Time</th> |
|
|
<th>Reasoning & Math Mean Score</th> |
|
|
<th>Reasoning & Math Avg. Rank</th> |
|
|
</tr> |
|
|
</thead> |
|
|
<tbody> |
|
|
<tr> |
|
|
<td>google_gemma-3-12b-it</td> |
|
|
<td>15h 45m</td> |
|
|
<td>14h 8m</td> |
|
|
<td>0.6266</td> |
|
|
<td>1</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-8B</td> |
|
|
<td>15h 31m</td> |
|
|
<td>13h 44m</td> |
|
|
<td>0.6214</td> |
|
|
<td>2</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-14B (8bit)</td> |
|
|
<td>29h 45m</td> |
|
|
<td>17h 29m</td> |
|
|
<td>0.5860</td> |
|
|
<td>3</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-4B</td> |
|
|
<td>5h 51m</td> |
|
|
<td>5h 3m</td> |
|
|
<td>0.5712</td> |
|
|
<td>4</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-7B-Instruct</td> |
|
|
<td>9h 36m</td> |
|
|
<td>8h 33m</td> |
|
|
<td>0.5541</td> |
|
|
<td>5</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>openchat_openchat-3.6-8b-20240522</td> |
|
|
<td>7h 51m</td> |
|
|
<td>6h 59m</td> |
|
|
<td>0.5505</td> |
|
|
<td>6</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-14B-Instruct (8bit)</td> |
|
|
<td>52h 44m</td> |
|
|
<td>29h 32m</td> |
|
|
<td>0.5488</td> |
|
|
<td>7</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>mistralai_Ministral-8B-Instruct-2410</td> |
|
|
<td>10h 46m</td> |
|
|
<td>9h 27m</td> |
|
|
<td>0.5446</td> |
|
|
<td>8</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-9B-Chat</td> |
|
|
<td>13h 54m</td> |
|
|
<td>12h 15m</td> |
|
|
<td>0.5399</td> |
|
|
<td>9</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-0528-Qwen3-8B</td> |
|
|
<td>17h 57m</td> |
|
|
<td>15h 30m</td> |
|
|
<td>0.5387</td> |
|
|
<td>10</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-4b-it</td> |
|
|
<td>4h 51m</td> |
|
|
<td>3h 50m</td> |
|
|
<td>0.5374</td> |
|
|
<td>11</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.1-8B-Instruct</td> |
|
|
<td>12h 19m</td> |
|
|
<td>10h 52m</td> |
|
|
<td>0.5366</td> |
|
|
<td>12</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Meta-Llama-3-8B-Instruct</td> |
|
|
<td>6h 30m</td> |
|
|
<td>5h 46m</td> |
|
|
<td>0.5286</td> |
|
|
<td>13</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2-7B-Instruct</td> |
|
|
<td>11h 30m</td> |
|
|
<td>10h 11m</td> |
|
|
<td>0.5285</td> |
|
|
<td>14</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-7B-Instruct-1M</td> |
|
|
<td>11h 17m</td> |
|
|
<td>10h 10m</td> |
|
|
<td>0.5245</td> |
|
|
<td>15</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-9B</td> |
|
|
<td>11h 43m</td> |
|
|
<td>10h 26m</td> |
|
|
<td>0.5206</td> |
|
|
<td>16</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>NousResearch_Hermes-2-Pro-Mistral-7B</td> |
|
|
<td>8h 27m</td> |
|
|
<td>7h 28m</td> |
|
|
<td>0.5184</td> |
|
|
<td>17</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-7B</td> |
|
|
<td>27h 21m</td> |
|
|
<td>24h 38m</td> |
|
|
<td>0.5010</td> |
|
|
<td>18</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-6B-Chat</td> |
|
|
<td>8h 4m</td> |
|
|
<td>7h 1m</td> |
|
|
<td>0.5006</td> |
|
|
<td>19</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-7B-Instruct</td> |
|
|
<td>5h 37m</td> |
|
|
<td>4h 57m</td> |
|
|
<td>0.4997</td> |
|
|
<td>20</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Qwen-7B</td> |
|
|
<td>6h 28m</td> |
|
|
<td>5h 43m</td> |
|
|
<td>0.4841</td> |
|
|
<td>21</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>mistralai_Mistral-7B-Instruct-v0.3</td> |
|
|
<td>8h 38m</td> |
|
|
<td>7h 41m</td> |
|
|
<td>0.4704</td> |
|
|
<td>22</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.2-3B-Instruct</td> |
|
|
<td>7h 12m</td> |
|
|
<td>5h 57m</td> |
|
|
<td>0.4688</td> |
|
|
<td>23</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-6B</td> |
|
|
<td>4h 28m</td> |
|
|
<td>3h 54m</td> |
|
|
<td>0.4495</td> |
|
|
<td>24</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-1.7B</td> |
|
|
<td>4h 25m</td> |
|
|
<td>3h 36m</td> |
|
|
<td>0.4493</td> |
|
|
<td>25</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Llama-8B</td> |
|
|
<td>11h 46m</td> |
|
|
<td>10h 36m</td> |
|
|
<td>0.4469</td> |
|
|
<td>26</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-llm-7b-chat</td> |
|
|
<td>10h 6m</td> |
|
|
<td>9h 8m</td> |
|
|
<td>0.4244</td> |
|
|
<td>27</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-13b-chat-hf</td> |
|
|
<td>17h 8m</td> |
|
|
<td>15h 37m</td> |
|
|
<td>0.4143</td> |
|
|
<td>28</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-1.5B-Instruct</td> |
|
|
<td>3h 25m</td> |
|
|
<td>2h 39m</td> |
|
|
<td>0.4085</td> |
|
|
<td>29</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B</td> |
|
|
<td>3h 40m</td> |
|
|
<td>2h 52m</td> |
|
|
<td>0.4009</td> |
|
|
<td>30</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-1.5B-Instruct</td> |
|
|
<td>3h 20m</td> |
|
|
<td>2h 36m</td> |
|
|
<td>0.3874</td> |
|
|
<td>31</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-3B-Instruct</td> |
|
|
<td>7h 48m</td> |
|
|
<td>6h 30m</td> |
|
|
<td>0.3823</td> |
|
|
<td>32</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-13b-hf</td> |
|
|
<td>19h 21m</td> |
|
|
<td>17h 38m</td> |
|
|
<td>0.3719</td> |
|
|
<td>33</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-math-7b-rl</td> |
|
|
<td>8h 2m</td> |
|
|
<td>7h 12m</td> |
|
|
<td>0.3702</td> |
|
|
<td>34</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-7b-chat-hf</td> |
|
|
<td>6h 57m</td> |
|
|
<td>6h 7m</td> |
|
|
<td>0.3674</td> |
|
|
<td>35</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-0.6B</td> |
|
|
<td>3h 45m</td> |
|
|
<td>2h 53m</td> |
|
|
<td>0.3494</td> |
|
|
<td>36</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.2-1B-Instruct</td> |
|
|
<td>3h 30m</td> |
|
|
<td>2h 35m</td> |
|
|
<td>0.3450</td> |
|
|
<td>37</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-llm-7b-base</td> |
|
|
<td>7h 11m</td> |
|
|
<td>6h 26m</td> |
|
|
<td>0.3377</td> |
|
|
<td>38</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-7b-hf</td> |
|
|
<td>5h 42m</td> |
|
|
<td>4h 59m</td> |
|
|
<td>0.3361</td> |
|
|
<td>39</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-1b-it</td> |
|
|
<td>6h 50m</td> |
|
|
<td>4h 52m</td> |
|
|
<td>0.3312</td> |
|
|
<td>40</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-0.5B-Instruct</td> |
|
|
<td>2h 34m</td> |
|
|
<td>1h 48m</td> |
|
|
<td>0.2914</td> |
|
|
<td>41</td> |
|
|
</tr> |
|
|
</tbody> |
|
|
</table> |
|
|
|
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
<div class="wrap"> |
|
|
<h1 class="title">Commonsense & NLI (Average Rank)</h1> |
|
|
|
|
|
<div class="table-card"> |
|
|
|
|
|
<div id="table-slot-commonsense-nli">
|
|
|
|
|
|
|
|
<table border="1" class="dataframe"> |
|
|
<thead> |
|
|
<tr style="text-align: right;"> |
|
|
<th>Model Name</th> |
|
|
<th>Total Time</th> |
|
|
<th>GPU Util Time</th> |
|
|
<th>Commonsense & NLI Mean Score</th> |
|
|
<th>Commonsense & NLI Avg. Rank</th> |
|
|
</tr> |
|
|
</thead> |
|
|
<tbody> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-14B-Instruct (8bit)</td> |
|
|
<td>52h 44m</td> |
|
|
<td>29h 32m</td> |
|
|
<td>0.7941</td> |
|
|
<td>1</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-14B (8bit)</td> |
|
|
<td>29h 45m</td> |
|
|
<td>17h 29m</td> |
|
|
<td>0.7807</td> |
|
|
<td>2</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-12b-it</td> |
|
|
<td>15h 45m</td> |
|
|
<td>14h 8m</td> |
|
|
<td>0.7737</td> |
|
|
<td>3</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-7B-Instruct</td> |
|
|
<td>9h 36m</td> |
|
|
<td>8h 33m</td> |
|
|
<td>0.7730</td> |
|
|
<td>4</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>openchat_openchat-3.6-8b-20240522</td> |
|
|
<td>7h 51m</td> |
|
|
<td>6h 59m</td> |
|
|
<td>0.7726</td> |
|
|
<td>5</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-9B-Chat</td> |
|
|
<td>13h 54m</td> |
|
|
<td>12h 15m</td> |
|
|
<td>0.7691</td> |
|
|
<td>6</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-7B-Instruct-1M</td> |
|
|
<td>11h 17m</td> |
|
|
<td>10h 10m</td> |
|
|
<td>0.7564</td> |
|
|
<td>7</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-8B</td> |
|
|
<td>15h 31m</td> |
|
|
<td>13h 44m</td> |
|
|
<td>0.7468</td> |
|
|
<td>8</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>mistralai_Mistral-7B-Instruct-v0.3</td> |
|
|
<td>8h 38m</td> |
|
|
<td>7h 41m</td> |
|
|
<td>0.7403</td> |
|
|
<td>9</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-6B-Chat</td> |
|
|
<td>8h 4m</td> |
|
|
<td>7h 1m</td> |
|
|
<td>0.7374</td> |
|
|
<td>10</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-3B-Instruct</td> |
|
|
<td>7h 48m</td> |
|
|
<td>6h 30m</td> |
|
|
<td>0.7367</td> |
|
|
<td>11</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>mistralai_Ministral-8B-Instruct-2410</td> |
|
|
<td>10h 46m</td> |
|
|
<td>9h 27m</td> |
|
|
<td>0.7328</td> |
|
|
<td>12</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>NousResearch_Hermes-2-Pro-Mistral-7B</td> |
|
|
<td>8h 27m</td> |
|
|
<td>7h 28m</td> |
|
|
<td>0.7284</td> |
|
|
<td>13</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2-7B-Instruct</td> |
|
|
<td>11h 30m</td> |
|
|
<td>10h 11m</td> |
|
|
<td>0.7274</td> |
|
|
<td>14</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-9B</td> |
|
|
<td>11h 43m</td> |
|
|
<td>10h 26m</td> |
|
|
<td>0.7266</td> |
|
|
<td>15</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-4B</td> |
|
|
<td>5h 51m</td> |
|
|
<td>5h 3m</td> |
|
|
<td>0.7266</td> |
|
|
<td>16</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.1-8B-Instruct</td> |
|
|
<td>12h 19m</td> |
|
|
<td>10h 52m</td> |
|
|
<td>0.7249</td> |
|
|
<td>17</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-6B</td> |
|
|
<td>4h 28m</td> |
|
|
<td>3h 54m</td> |
|
|
<td>0.7199</td> |
|
|
<td>18</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-4b-it</td> |
|
|
<td>4h 51m</td> |
|
|
<td>3h 50m</td> |
|
|
<td>0.7167</td> |
|
|
<td>19</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-13b-hf</td> |
|
|
<td>19h 21m</td> |
|
|
<td>17h 38m</td> |
|
|
<td>0.7157</td> |
|
|
<td>20</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-13b-chat-hf</td> |
|
|
<td>17h 8m</td> |
|
|
<td>15h 37m</td> |
|
|
<td>0.7153</td> |
|
|
<td>21</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Meta-Llama-3-8B-Instruct</td> |
|
|
<td>6h 30m</td> |
|
|
<td>5h 46m</td> |
|
|
<td>0.7147</td> |
|
|
<td>22</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-0528-Qwen3-8B</td> |
|
|
<td>17h 57m</td> |
|
|
<td>15h 30m</td> |
|
|
<td>0.7094</td> |
|
|
<td>23</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-llm-7b-chat</td> |
|
|
<td>10h 6m</td> |
|
|
<td>9h 8m</td> |
|
|
<td>0.7090</td> |
|
|
<td>24</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-7b-chat-hf</td> |
|
|
<td>6h 57m</td> |
|
|
<td>6h 7m</td> |
|
|
<td>0.6978</td> |
|
|
<td>25</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-7b-hf</td> |
|
|
<td>5h 42m</td> |
|
|
<td>4h 59m</td> |
|
|
<td>0.6956</td> |
|
|
<td>26</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Llama-8B</td> |
|
|
<td>11h 46m</td> |
|
|
<td>10h 36m</td> |
|
|
<td>0.6928</td> |
|
|
<td>27</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-llm-7b-base</td> |
|
|
<td>7h 11m</td> |
|
|
<td>6h 26m</td> |
|
|
<td>0.6886</td> |
|
|
<td>28</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-1.5B-Instruct</td> |
|
|
<td>3h 20m</td> |
|
|
<td>2h 36m</td> |
|
|
<td>0.6803</td> |
|
|
<td>29</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.2-3B-Instruct</td> |
|
|
<td>7h 12m</td> |
|
|
<td>5h 57m</td> |
|
|
<td>0.6788</td> |
|
|
<td>30</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-math-7b-rl</td> |
|
|
<td>8h 2m</td> |
|
|
<td>7h 12m</td> |
|
|
<td>0.6711</td> |
|
|
<td>31</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-7B</td> |
|
|
<td>27h 21m</td> |
|
|
<td>24h 38m</td> |
|
|
<td>0.6587</td> |
|
|
<td>32</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-1.7B</td> |
|
|
<td>4h 25m</td> |
|
|
<td>3h 36m</td> |
|
|
<td>0.6442</td> |
|
|
<td>33</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Qwen-7B</td> |
|
|
<td>6h 28m</td> |
|
|
<td>5h 43m</td> |
|
|
<td>0.6422</td> |
|
|
<td>34</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-1b-it</td> |
|
|
<td>6h 50m</td> |
|
|
<td>4h 52m</td> |
|
|
<td>0.6267</td> |
|
|
<td>35</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.2-1B-Instruct</td> |
|
|
<td>3h 30m</td> |
|
|
<td>2h 35m</td> |
|
|
<td>0.6264</td> |
|
|
<td>36</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-7B-Instruct</td> |
|
|
<td>5h 37m</td> |
|
|
<td>4h 57m</td> |
|
|
<td>0.6184</td> |
|
|
<td>37</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-0.5B-Instruct</td> |
|
|
<td>2h 34m</td> |
|
|
<td>1h 48m</td> |
|
|
<td>0.6039</td> |
|
|
<td>38</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B</td> |
|
|
<td>3h 40m</td> |
|
|
<td>2h 52m</td> |
|
|
<td>0.5703</td> |
|
|
<td>39</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-0.6B</td> |
|
|
<td>3h 45m</td> |
|
|
<td>2h 53m</td> |
|
|
<td>0.5696</td> |
|
|
<td>40</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-1.5B-Instruct</td> |
|
|
<td>3h 25m</td> |
|
|
<td>2h 39m</td> |
|
|
<td>0.5181</td> |
|
|
<td>41</td> |
|
|
</tr> |
|
|
</tbody> |
|
|
</table> |
|
|
|
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
<div class="wrap"> |
|
|
<h1 class="title">Knowledge & Reading (Average Rank)</h1> |
|
|
|
|
|
<div class="table-card"> |
|
|
|
|
|
<div id="table-slot"> |
|
|
|
|
|
|
|
|
<table border="1" class="dataframe"> |
|
|
<thead> |
|
|
<tr style="text-align: right;"> |
|
|
<th>Model Name</th> |
|
|
<th>Total Time</th> |
|
|
<th>GPU Util Time</th> |
|
|
<th>Knowledge & Reading Mean Score</th> |
|
|
<th>Knowledge & Reading Avg. Rank</th> |
|
|
</tr> |
|
|
</thead> |
|
|
<tbody> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-9B</td> |
|
|
<td>11h 43m</td> |
|
|
<td>10h 26m</td> |
|
|
<td>0.4369</td> |
|
|
<td>1</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>openchat_openchat-3.6-8b-20240522</td> |
|
|
<td>7h 51m</td> |
|
|
<td>6h 59m</td> |
|
|
<td>0.4136</td> |
|
|
<td>2</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.1-8B-Instruct</td> |
|
|
<td>12h 19m</td> |
|
|
<td>10h 52m</td> |
|
|
<td>0.4127</td> |
|
|
<td>3</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-6B</td> |
|
|
<td>4h 28m</td> |
|
|
<td>3h 54m</td> |
|
|
<td>0.4063</td> |
|
|
<td>4</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>mistralai_Mistral-7B-Instruct-v0.3</td> |
|
|
<td>8h 38m</td> |
|
|
<td>7h 41m</td> |
|
|
<td>0.4045</td> |
|
|
<td>5</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-7B-Instruct-1M</td> |
|
|
<td>11h 17m</td> |
|
|
<td>10h 10m</td> |
|
|
<td>0.3963</td> |
|
|
<td>6</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-14B (8bit)</td> |
|
|
<td>29h 45m</td> |
|
|
<td>17h 29m</td> |
|
|
<td>0.3926</td> |
|
|
<td>7</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Meta-Llama-3-8B-Instruct</td> |
|
|
<td>6h 30m</td> |
|
|
<td>5h 46m</td> |
|
|
<td>0.3923</td> |
|
|
<td>8</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-7B-Instruct</td> |
|
|
<td>9h 36m</td> |
|
|
<td>8h 33m</td> |
|
|
<td>0.3810</td> |
|
|
<td>9</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-12b-it</td> |
|
|
<td>15h 45m</td> |
|
|
<td>14h 8m</td> |
|
|
<td>0.3791</td> |
|
|
<td>10</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>NousResearch_Hermes-2-Pro-Mistral-7B</td> |
|
|
<td>8h 27m</td> |
|
|
<td>7h 28m</td> |
|
|
<td>0.3719</td> |
|
|
<td>11</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>mistralai_Ministral-8B-Instruct-2410</td> |
|
|
<td>10h 46m</td> |
|
|
<td>9h 27m</td> |
|
|
<td>0.3683</td> |
|
|
<td>12</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-14B-Instruct (8bit)</td> |
|
|
<td>52h 44m</td> |
|
|
<td>29h 32m</td> |
|
|
<td>0.3581</td> |
|
|
<td>13</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-8B</td> |
|
|
<td>15h 31m</td> |
|
|
<td>13h 44m</td> |
|
|
<td>0.3566</td> |
|
|
<td>14</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-9B-Chat</td> |
|
|
<td>13h 54m</td> |
|
|
<td>12h 15m</td> |
|
|
<td>0.3467</td> |
|
|
<td>15</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.2-3B-Instruct</td> |
|
|
<td>7h 12m</td> |
|
|
<td>5h 57m</td> |
|
|
<td>0.3438</td> |
|
|
<td>16</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-3B-Instruct</td> |
|
|
<td>7h 48m</td> |
|
|
<td>6h 30m</td> |
|
|
<td>0.3406</td> |
|
|
<td>17</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-13b-hf</td> |
|
|
<td>19h 21m</td> |
|
|
<td>17h 38m</td> |
|
|
<td>0.3374</td> |
|
|
<td>18</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>01-ai_Yi-1.5-6B-Chat</td> |
|
|
<td>8h 4m</td> |
|
|
<td>7h 1m</td> |
|
|
<td>0.3339</td> |
|
|
<td>19</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-4b-it</td> |
|
|
<td>4h 51m</td> |
|
|
<td>3h 50m</td> |
|
|
<td>0.3261</td> |
|
|
<td>20</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-4B</td> |
|
|
<td>5h 51m</td> |
|
|
<td>5h 3m</td> |
|
|
<td>0.3226</td> |
|
|
<td>21</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-7b-hf</td> |
|
|
<td>5h 42m</td> |
|
|
<td>4h 59m</td> |
|
|
<td>0.3018</td> |
|
|
<td>22</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-llm-7b-chat</td> |
|
|
<td>10h 6m</td> |
|
|
<td>9h 8m</td> |
|
|
<td>0.3007</td> |
|
|
<td>23</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2-7B-Instruct</td> |
|
|
<td>11h 30m</td> |
|
|
<td>10h 11m</td> |
|
|
<td>0.2919</td> |
|
|
<td>24</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-1.5B-Instruct</td> |
|
|
<td>3h 20m</td> |
|
|
<td>2h 36m</td> |
|
|
<td>0.2903</td> |
|
|
<td>25</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-13b-chat-hf</td> |
|
|
<td>17h 8m</td> |
|
|
<td>15h 37m</td> |
|
|
<td>0.2864</td> |
|
|
<td>26</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-llm-7b-base</td> |
|
|
<td>7h 11m</td> |
|
|
<td>6h 26m</td> |
|
|
<td>0.2864</td> |
|
|
<td>27</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-0528-Qwen3-8B</td> |
|
|
<td>17h 57m</td> |
|
|
<td>15h 30m</td> |
|
|
<td>0.2834</td> |
|
|
<td>28</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-7B</td> |
|
|
<td>27h 21m</td> |
|
|
<td>24h 38m</td> |
|
|
<td>0.2827</td> |
|
|
<td>29</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Llama-8B</td> |
|
|
<td>11h 46m</td> |
|
|
<td>10h 36m</td> |
|
|
<td>0.2805</td> |
|
|
<td>30</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-3.2-1B-Instruct</td> |
|
|
<td>3h 30m</td> |
|
|
<td>2h 35m</td> |
|
|
<td>0.2731</td> |
|
|
<td>31</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>meta-llama_Llama-2-7b-chat-hf</td> |
|
|
<td>6h 57m</td> |
|
|
<td>6h 7m</td> |
|
|
<td>0.2656</td> |
|
|
<td>32</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_deepseek-math-7b-rl</td> |
|
|
<td>8h 2m</td> |
|
|
<td>7h 12m</td> |
|
|
<td>0.2581</td> |
|
|
<td>33</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-1.7B</td> |
|
|
<td>4h 25m</td> |
|
|
<td>3h 36m</td> |
|
|
<td>0.2567</td> |
|
|
<td>34</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Qwen-7B</td> |
|
|
<td>6h 28m</td> |
|
|
<td>5h 43m</td> |
|
|
<td>0.2340</td> |
|
|
<td>35</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-7B-Instruct</td> |
|
|
<td>5h 37m</td> |
|
|
<td>4h 57m</td> |
|
|
<td>0.2276</td> |
|
|
<td>36</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-0.5B-Instruct</td> |
|
|
<td>2h 34m</td> |
|
|
<td>1h 48m</td> |
|
|
<td>0.2218</td> |
|
|
<td>37</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>google_gemma-3-1b-it</td> |
|
|
<td>6h 50m</td> |
|
|
<td>4h 52m</td> |
|
|
<td>0.2202</td> |
|
|
<td>38</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen3-0.6B</td> |
|
|
<td>3h 45m</td> |
|
|
<td>2h 53m</td> |
|
|
<td>0.2000</td> |
|
|
<td>39</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Qwen_Qwen2.5-Math-1.5B-Instruct</td> |
|
|
<td>3h 25m</td> |
|
|
<td>2h 39m</td> |
|
|
<td>0.1983</td> |
|
|
<td>40</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B</td> |
|
|
<td>3h 40m</td> |
|
|
<td>2h 52m</td> |
|
|
<td>0.1954</td> |
|
|
<td>41</td> |
|
|
</tr> |
|
|
</tbody> |
|
|
</table> |
|
|
|
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
<section class="lb-container"> |
|
|
<section class="card card-doc"> |
|
|
<h3>Project Summary</h3> |
|
|
<p> |
|
|
<strong>Total Machine Runtime:</strong> 18 days 8 hours<br> |
|
|
<strong>Total GPU Time (RTX 5090 @ 100% Utilization):</strong> 14 days 23 hours <br> |
|
|
</p> |
|
|
|
|
|
<p class="env-note"> |
|
|
        🌱 The environmental impact of this project was mitigated by my active use of public transportation. :)
|
|
</p> |
|
|
</section> |
|
|
</section> |
|
|
|
|
|
<script> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(function () {
  'use strict';

  // Enhance the rendered (pandas-style) benchmark table: ensure a proper
  // <thead>, align columns (text left, numbers/durations right), decorate
  // the top-3 ranked rows with medals, and ellipsise long model names.
  const slot = document.getElementById('table-slot');
  if (!slot) return; // markup changed or script moved — nothing to do
  const table = slot.querySelector('table');
  if (!table) return;

  // Some generators emit only bare <tr> rows; synthesise a <thead> from the
  // first row so the sticky-header CSS and the column detection below work.
  let thead = table.querySelector('thead');
  if (!thead) {
    const firstRow = table.querySelector('tr');
    if (firstRow) {
      thead = document.createElement('thead');
      thead.appendChild(firstRow.cloneNode(true));
      table.insertBefore(thead, firstRow);
      firstRow.remove();
      const tbody = table.querySelector('tbody') || table.createTBody();
      // Only move rows that sit directly under <table>; the original
      // selector ('tr') also matched the new thead's row and would have
      // relocated the header back into the body.
      for (const tr of table.querySelectorAll(':scope > tr')) {
        tbody.appendChild(tr);
      }
    }
  }

  // Header labels drive column detection (column names vary per table).
  const headCells = Array.from(
    table.querySelectorAll('thead th, thead td'),
    (th) => th.textContent.trim(),
  );
  const bodyRows = Array.from(table.querySelectorAll('tbody tr'));

  const nameIdx = headCells.findIndex((h) => /model\s*name/i.test(h));
  // Match any "... Rank" header ("Overall Rank", "Avg. Rank", ...).
  // The original /overall\s*rank/ silently no-oped on every table whose
  // rank column wasn't literally named "Overall Rank".
  const rankIdx = headCells.findIndex((h) => /rank/i.test(h));
  const totalIdx = headCells.findIndex((h) => /total\s*time/i.test(h));
  const utilIdx = headCells.findIndex((h) => /gpu\s*util\s*time/i.test(h));

  // Alignment: model name left; rank and duration columns right.
  for (const row of bodyRows) {
    const cells = row.children;
    if (nameIdx >= 0 && cells[nameIdx]) cells[nameIdx].style.textAlign = 'left';
    for (const i of [rankIdx, totalIdx, utilIdx]) {
      if (i >= 0 && cells[i]) cells[i].style.textAlign = 'right';
    }
  }

  // Medal the top-3 ranks (rank text may contain non-digit decoration).
  if (rankIdx >= 0) {
    const medals = { 1: ['top1', '🥇'], 2: ['top2', '🥈'], 3: ['top3', '🥉'] };
    for (const row of bodyRows) {
      const cell = row.children[rankIdx];
      if (!cell) continue;
      const n = parseInt((cell.textContent || '').replace(/[^\d]/g, ''), 10);
      const medal = medals[n];
      if (medal) {
        row.classList.add(medal[0]);
        cell.insertAdjacentHTML('beforeend', `<span class="medal">${medal[1]}</span>`);
      }
    }
  }

  // Keep long model names on one line, ellipsised at a fixed max width.
  if (nameIdx >= 0) {
    for (const row of bodyRows) {
      const c = row.children[nameIdx];
      if (!c) continue;
      c.style.whiteSpace = 'nowrap';
      c.style.textOverflow = 'ellipsis';
      c.style.overflow = 'hidden';
      c.style.maxWidth = '520px';
    }
  }
})();
|
|
</script> |
|
|
</body> |
|
|
</html> |