Spaces:
Running
Running
Upload from GitHub Actions: drop normalization
Browse files- evals/backend.py +105 -15
- frontend/src/App.js +18 -2
- frontend/src/components/DatasetTable.js +1 -1
- frontend/src/components/LanguageTable.js +1 -1
- frontend/src/components/LanguageTierHistoryPlot.js +128 -0
- frontend/src/components/LicenseHistoryPlot.js +117 -0
- frontend/src/components/ModelTable.js +1 -1
- frontend/src/components/ScoreColumns.js +6 -6
evals/backend.py
CHANGED
|
@@ -28,25 +28,12 @@ task_metrics = [
|
|
| 28 |
"classification_accuracy",
|
| 29 |
"mmlu_accuracy",
|
| 30 |
"arc_accuracy",
|
| 31 |
-
"truthfulqa_accuracy",
|
| 32 |
"mgsm_accuracy",
|
| 33 |
]
|
| 34 |
|
| 35 |
-
|
| 36 |
def compute_normalized_average(df, metrics):
|
| 37 |
-
"""Compute average
|
| 38 |
-
|
| 39 |
-
for col in metrics:
|
| 40 |
-
if col in normalized_df.columns:
|
| 41 |
-
col_min = normalized_df[col].min()
|
| 42 |
-
col_max = normalized_df[col].max()
|
| 43 |
-
if col_max > col_min: # Avoid division by zero
|
| 44 |
-
normalized_df[col] = (normalized_df[col] - col_min) / (
|
| 45 |
-
col_max - col_min
|
| 46 |
-
)
|
| 47 |
-
else:
|
| 48 |
-
normalized_df[col] = 0 # If all values are the same, set to 0
|
| 49 |
-
return normalized_df.mean(axis=1, skipna=False)
|
| 50 |
|
| 51 |
|
| 52 |
def make_model_table(scores_df, models):
|
|
@@ -156,6 +143,105 @@ def make_language_table(scores_df, languages):
|
|
| 156 |
return df
|
| 157 |
|
| 158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
app = FastAPI()
|
| 160 |
|
| 161 |
app.add_middleware(CORSMiddleware, allow_origins=["*"])
|
|
@@ -190,6 +276,8 @@ async def data(request: Request):
|
|
| 190 |
countries = make_country_table(make_language_table(df, languages))
|
| 191 |
|
| 192 |
language_table = make_language_table(scores, languages)
|
|
|
|
|
|
|
| 193 |
datasets_df = pd.read_json("data/datasets.json")
|
| 194 |
|
| 195 |
return JSONResponse(content={
|
|
@@ -198,6 +286,8 @@ async def data(request: Request):
|
|
| 198 |
"dataset_table": serialize(datasets_df),
|
| 199 |
"countries": serialize(countries),
|
| 200 |
"machine_translated_metrics": list(machine_translated_metrics),
|
|
|
|
|
|
|
| 201 |
})
|
| 202 |
|
| 203 |
|
|
|
|
| 28 |
"classification_accuracy",
|
| 29 |
"mmlu_accuracy",
|
| 30 |
"arc_accuracy",
|
|
|
|
| 31 |
"mgsm_accuracy",
|
| 32 |
]
|
| 33 |
|
|
|
|
| 34 |
def compute_normalized_average(df, metrics):
    """Return the row-wise mean of the ``metrics`` columns of ``df``.

    Despite the historical name, no normalization is applied. A row yields
    NaN as soon as any of its selected metric values is missing, because
    missing values are not skipped.
    """
    metric_columns = df[metrics]
    return metric_columns.mean(axis="columns", skipna=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def make_model_table(scores_df, models):
|
|
|
|
| 143 |
return df
|
| 144 |
|
| 145 |
|
| 146 |
+
def make_language_tier_history(scores_df, languages, models):
    """Aggregate per-model proficiency over speaker-ranked language tiers.

    Languages are ranked by their ``speakers`` column and sliced into the
    tiers defined below. For every tier, each model's ``proficiency_score``
    is averaged over the tier's languages and joined with model metadata.

    Args:
        scores_df: Long-format scores with ``model``, ``bcp_47``, ``task``,
            ``metric`` and ``score`` columns. Not modified.
        languages: Language table with ``bcp_47`` and ``speakers`` columns.
        models: Model metadata with ``id``, ``name``, ``provider_name``,
            ``creation_date`` and ``size`` columns.

    Returns:
        DataFrame with columns ``model``, ``name``, ``provider_name``,
        ``creation_date`` (ISO string or ``None``), ``size``, ``tier`` and
        ``proficiency_score``; one row per (model, tier) that has scores.
    """
    # Rank languages by speakers
    ranked_langs = languages.sort_values(by="speakers", ascending=False).reset_index(drop=True)

    # Define tiers as positional [start, end) slices into the ranking.
    # NOTE(review): "Top 20-200" slices positions 19..499, i.e. it shares
    # rank 20 with "Top 2-20" and extends past rank 200 -- confirm that the
    # label and the range are meant to differ.
    tier_ranges = {
        "Top 1": (0, 1),
        "Top 2-20": (1, 20),
        "Top 20-200": (19, 500),
    }

    # Work on a copy so the caller's frame does not gain a helper column.
    scores_df = scores_df.copy()
    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]

    # Pivot to get model-language-metric scores
    pivot = scores_df.pivot_table(
        index=["model", "bcp_47"],
        columns="task_metric",
        values="score",
        aggfunc="mean",
    )

    # Ensure all task_metrics columns exist so the average sees every metric.
    for metric in task_metrics:
        if metric not in pivot.columns:
            pivot[metric] = np.nan

    # Calculate proficiency score for each model-language pair
    pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
    pivot = pivot.reset_index()

    # Create all tier-level aggregations (allowing overlapping tiers)
    all_tier_scores = []
    for tier_name, (start, end) in tier_ranges.items():
        tier_langs = ranked_langs.iloc[start:end]["bcp_47"].tolist()
        tier_data = pivot[pivot["bcp_47"].isin(tier_langs)]
        tier_scores = tier_data.groupby("model")["proficiency_score"].mean().reset_index()
        tier_scores["tier"] = tier_name
        all_tier_scores.append(tier_scores)

    tier_scores = pd.concat(all_tier_scores, ignore_index=True)

    # Merge with models data; a left join keeps models without metadata.
    tier_scores = pd.merge(tier_scores, models, left_on="model", right_on="id", how="left")

    # Select relevant columns. Copy so the assignment below targets an
    # independent frame rather than a selection of the merged one.
    tier_scores = tier_scores[
        ["model", "name", "provider_name", "creation_date", "size", "tier", "proficiency_score"]
    ].copy()

    # NaN/NaT (e.g. from unmatched merge rows) are truthy, so a plain
    # truthiness test would not catch them; pd.notna covers None, NaN, NaT.
    tier_scores["creation_date"] = tier_scores["creation_date"].apply(
        lambda x: x.isoformat() if pd.notna(x) else None
    )

    return tier_scores
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def make_license_history(scores_df, models):
    """Compute one proficiency score per model, labelled by license type.

    Args:
        scores_df: Long-format scores with ``model``, ``task``, ``metric``
            and ``score`` columns. Not modified.
        models: Model metadata with ``id``, ``name``, ``provider_name``,
            ``creation_date``, ``size`` and ``type`` columns.

    Returns:
        DataFrame with columns ``model``, ``name``, ``provider_name``,
        ``creation_date`` (ISO string or ``None``), ``size``,
        ``license_type`` and ``proficiency_score``; one row per scored model.
    """
    # Work on a copy so the caller's frame does not gain a helper column.
    scores_df = scores_df.copy()
    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]

    # Pivot to get model-level scores
    pivot = scores_df.pivot_table(
        index="model",
        columns="task_metric",
        values="score",
        aggfunc="mean",
    )

    # Ensure all task_metrics columns exist so the average sees every metric.
    for metric in task_metrics:
        if metric not in pivot.columns:
            pivot[metric] = np.nan

    # Calculate proficiency score for each model
    pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
    pivot = pivot.reset_index()

    # Merge with models data; a left join keeps models without metadata.
    df = pd.merge(pivot, models, left_on="model", right_on="id", how="left")

    # Classify as commercial or open: anything whose type is not exactly
    # "open-source" (including a missing type) is labelled "Commercial".
    df["license_type"] = df["type"].apply(
        lambda x: "Open-source" if x == "open-source" else "Commercial"
    )

    # Select relevant columns. Copy so the assignment below targets an
    # independent frame rather than a selection of the merged one.
    df = df[
        ["model", "name", "provider_name", "creation_date", "size", "license_type", "proficiency_score"]
    ].copy()

    # NaN/NaT (e.g. from unmatched merge rows) are truthy, so a plain
    # truthiness test would not catch them; pd.notna covers None, NaN, NaT.
    df["creation_date"] = df["creation_date"].apply(
        lambda x: x.isoformat() if pd.notna(x) else None
    )

    return df
|
| 243 |
+
|
| 244 |
+
|
| 245 |
app = FastAPI()
|
| 246 |
|
| 247 |
app.add_middleware(CORSMiddleware, allow_origins=["*"])
|
|
|
|
| 276 |
countries = make_country_table(make_language_table(df, languages))
|
| 277 |
|
| 278 |
language_table = make_language_table(scores, languages)
|
| 279 |
+
language_tier_history = make_language_tier_history(scores, languages, models)
|
| 280 |
+
license_history = make_license_history(scores, models)
|
| 281 |
datasets_df = pd.read_json("data/datasets.json")
|
| 282 |
|
| 283 |
return JSONResponse(content={
|
|
|
|
| 286 |
"dataset_table": serialize(datasets_df),
|
| 287 |
"countries": serialize(countries),
|
| 288 |
"machine_translated_metrics": list(machine_translated_metrics),
|
| 289 |
+
"language_tier_history": serialize(language_tier_history),
|
| 290 |
+
"license_history": serialize(license_history),
|
| 291 |
})
|
| 292 |
|
| 293 |
|
frontend/src/App.js
CHANGED
|
@@ -9,6 +9,8 @@ import AutoComplete from './components/AutoComplete'
|
|
| 9 |
import LanguagePlot from './components/LanguagePlot'
|
| 10 |
import SpeakerPlot from './components/SpeakerPlot'
|
| 11 |
import HistoryPlot from './components/HistoryPlot'
|
|
|
|
|
|
|
| 12 |
import CostPlot from './components/CostPlot'
|
| 13 |
import { Carousel } from 'primereact/carousel'
|
| 14 |
import { Dialog } from 'primereact/dialog'
|
|
@@ -62,7 +64,9 @@ function App () {
|
|
| 62 |
<LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
|
| 63 |
<SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
|
| 64 |
<HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
|
| 65 |
-
<
|
|
|
|
|
|
|
| 66 |
]);
|
| 67 |
}, 100);
|
| 68 |
|
|
@@ -112,7 +116,19 @@ function App () {
|
|
| 112 |
width={windowWidth * 0.7}
|
| 113 |
height={windowHeight * 0.6}
|
| 114 |
/>,
|
| 115 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
]);
|
| 117 |
}, 100);
|
| 118 |
|
|
|
|
| 9 |
import LanguagePlot from './components/LanguagePlot'
|
| 10 |
import SpeakerPlot from './components/SpeakerPlot'
|
| 11 |
import HistoryPlot from './components/HistoryPlot'
|
| 12 |
+
import LanguageTierHistoryPlot from './components/LanguageTierHistoryPlot'
|
| 13 |
+
import LicenseHistoryPlot from './components/LicenseHistoryPlot'
|
| 14 |
import CostPlot from './components/CostPlot'
|
| 15 |
import { Carousel } from 'primereact/carousel'
|
| 16 |
import { Dialog } from 'primereact/dialog'
|
|
|
|
| 64 |
<LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
|
| 65 |
<SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
|
| 66 |
<HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
|
| 67 |
+
<LanguageTierHistoryPlot key="tierhistplot-4" data={data} width={750} height={500} />,
|
| 68 |
+
<LicenseHistoryPlot key="licensehistplot-5" data={data} width={750} height={500} />,
|
| 69 |
+
<CostPlot key="costplot-6" data={data} width={750} height={500} />
|
| 70 |
]);
|
| 71 |
}, 100);
|
| 72 |
|
|
|
|
| 116 |
width={windowWidth * 0.7}
|
| 117 |
height={windowHeight * 0.6}
|
| 118 |
/>,
|
| 119 |
+
<LanguageTierHistoryPlot
|
| 120 |
+
key="fs-tierhistplot-4"
|
| 121 |
+
data={data}
|
| 122 |
+
width={windowWidth * 0.7}
|
| 123 |
+
height={windowHeight * 0.6}
|
| 124 |
+
/>,
|
| 125 |
+
<LicenseHistoryPlot
|
| 126 |
+
key="fs-licensehistplot-5"
|
| 127 |
+
data={data}
|
| 128 |
+
width={windowWidth * 0.7}
|
| 129 |
+
height={windowHeight * 0.6}
|
| 130 |
+
/>,
|
| 131 |
+
<CostPlot key="fs-costplot-6" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
|
| 132 |
]);
|
| 133 |
}, 100);
|
| 134 |
|
frontend/src/components/DatasetTable.js
CHANGED
|
@@ -138,7 +138,7 @@ const DatasetTable = ({ data }) => {
|
|
| 138 |
field='name'
|
| 139 |
header='Name'
|
| 140 |
body={nameBodyTemplate}
|
| 141 |
-
style={{ minWidth: '10rem' }}
|
| 142 |
frozen
|
| 143 |
/>
|
| 144 |
<Column
|
|
|
|
| 138 |
field='name'
|
| 139 |
header='Name'
|
| 140 |
body={nameBodyTemplate}
|
| 141 |
+
style={{ minWidth: '10rem', zIndex: 2 }}
|
| 142 |
frozen
|
| 143 |
/>
|
| 144 |
<Column
|
frontend/src/components/LanguageTable.js
CHANGED
|
@@ -149,7 +149,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
|
|
| 149 |
field='language_name'
|
| 150 |
header='Language'
|
| 151 |
body={languageBodyTemplate}
|
| 152 |
-
style={{ minWidth: '10rem' }}
|
| 153 |
filter
|
| 154 |
showFilterMatchModes={false}
|
| 155 |
frozen
|
|
|
|
| 149 |
field='language_name'
|
| 150 |
header='Language'
|
| 151 |
body={languageBodyTemplate}
|
| 152 |
+
style={{ minWidth: '10rem', zIndex: 2 }}
|
| 153 |
filter
|
| 154 |
showFilterMatchModes={false}
|
| 155 |
frozen
|
frontend/src/components/LanguageTierHistoryPlot.js
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useRef, useEffect } from 'react'
|
| 2 |
+
import * as Plot from '@observablehq/plot'
|
| 3 |
+
|
| 4 |
+
const LanguageTierHistoryPlot = ({ data, width = 750, height = 500 }) => {
|
| 5 |
+
const containerRef = useRef()
|
| 6 |
+
|
| 7 |
+
const tierHistory = [...(data.language_tier_history || [])]
|
| 8 |
+
.filter(d => d.proficiency_score !== null && d.creation_date !== null)
|
| 9 |
+
.sort((a, b) => new Date(a.creation_date) - new Date(b.creation_date))
|
| 10 |
+
|
| 11 |
+
// Get unique tiers from data, dynamically
|
| 12 |
+
const tiers = [...new Set(tierHistory.map(d => d.tier))]
|
| 13 |
+
|
| 14 |
+
// Add " languages" suffix for legend display
|
| 15 |
+
const tierWithSuffix = (tier) => `${tier} languages`
|
| 16 |
+
|
| 17 |
+
// Calculate max proficiency over time for each tier
|
| 18 |
+
const tierRecords = {}
|
| 19 |
+
|
| 20 |
+
tiers.forEach(tier => {
|
| 21 |
+
const tierData = tierHistory.filter(d => d.tier === tier)
|
| 22 |
+
const records = []
|
| 23 |
+
let maxScore = 0
|
| 24 |
+
|
| 25 |
+
tierData.forEach(curr => {
|
| 26 |
+
if (curr.proficiency_score > maxScore) {
|
| 27 |
+
maxScore = curr.proficiency_score
|
| 28 |
+
records.push({
|
| 29 |
+
...curr,
|
| 30 |
+
maxScore: maxScore,
|
| 31 |
+
newRecord: true
|
| 32 |
+
})
|
| 33 |
+
} else {
|
| 34 |
+
records.push({
|
| 35 |
+
...curr,
|
| 36 |
+
maxScore: maxScore,
|
| 37 |
+
newRecord: false
|
| 38 |
+
})
|
| 39 |
+
}
|
| 40 |
+
})
|
| 41 |
+
|
| 42 |
+
tierRecords[tier] = records
|
| 43 |
+
})
|
| 44 |
+
|
| 45 |
+
// Flatten for plotting - only show dots for new records
|
| 46 |
+
// Add " languages" suffix to tier for display
|
| 47 |
+
const recordBreakingDots = Object.values(tierRecords)
|
| 48 |
+
.flat()
|
| 49 |
+
.filter(d => d.newRecord)
|
| 50 |
+
.map(d => ({ ...d, tierDisplay: tierWithSuffix(d.tier) }))
|
| 51 |
+
|
| 52 |
+
// Create step function data for each tier
|
| 53 |
+
const stepData = tiers.flatMap(tier => {
|
| 54 |
+
const records = tierRecords[tier].filter(d => d.newRecord)
|
| 55 |
+
if (records.length === 0) return []
|
| 56 |
+
|
| 57 |
+
return [
|
| 58 |
+
...records.map(d => ({ ...d, tierDisplay: tierWithSuffix(d.tier) })),
|
| 59 |
+
{
|
| 60 |
+
tier: tier,
|
| 61 |
+
tierDisplay: tierWithSuffix(tier),
|
| 62 |
+
creation_date: new Date(),
|
| 63 |
+
maxScore: records[records.length - 1]?.maxScore || 0
|
| 64 |
+
}
|
| 65 |
+
]
|
| 66 |
+
})
|
| 67 |
+
|
| 68 |
+
useEffect(() => {
|
| 69 |
+
const plot = Plot.plot({
|
| 70 |
+
width: width,
|
| 71 |
+
height: height,
|
| 72 |
+
subtitle: 'Model performance on language tiers over time',
|
| 73 |
+
x: {
|
| 74 |
+
label: 'Date',
|
| 75 |
+
type: 'time',
|
| 76 |
+
tickFormat: '%Y-%m'
|
| 77 |
+
},
|
| 78 |
+
y: {
|
| 79 |
+
label: 'Language Tier Proficiency Score'
|
| 80 |
+
},
|
| 81 |
+
color: {
|
| 82 |
+
legend: true,
|
| 83 |
+
domain: tiers.map(tierWithSuffix)
|
| 84 |
+
},
|
| 85 |
+
marks: [
|
| 86 |
+
Plot.dot(recordBreakingDots, {
|
| 87 |
+
x: d => new Date(d.creation_date),
|
| 88 |
+
y: d => d.proficiency_score,
|
| 89 |
+
fill: 'tierDisplay',
|
| 90 |
+
stroke: 'tierDisplay',
|
| 91 |
+
title: d =>
|
| 92 |
+
`${d.provider_name} - ${d.name} (${
|
| 93 |
+
d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
|
| 94 |
+
})\nTier: ${d.tier}\nPublished: ${new Date(
|
| 95 |
+
d.creation_date
|
| 96 |
+
).toLocaleDateString()}\nScore: ${d.proficiency_score.toFixed(2)}`,
|
| 97 |
+
tip: true
|
| 98 |
+
}),
|
| 99 |
+
Plot.line(stepData, {
|
| 100 |
+
x: d => new Date(d.creation_date),
|
| 101 |
+
y: d => d.maxScore || 0,
|
| 102 |
+
stroke: 'tierDisplay',
|
| 103 |
+
curve: 'step-after',
|
| 104 |
+
strokeOpacity: 0.5,
|
| 105 |
+
strokeWidth: 2
|
| 106 |
+
})
|
| 107 |
+
]
|
| 108 |
+
})
|
| 109 |
+
containerRef.current.append(plot)
|
| 110 |
+
return () => plot.remove()
|
| 111 |
+
}, [recordBreakingDots, stepData, width, height, tiers])
|
| 112 |
+
|
| 113 |
+
return (
|
| 114 |
+
<div
|
| 115 |
+
ref={containerRef}
|
| 116 |
+
style={{
|
| 117 |
+
width: '100%',
|
| 118 |
+
height: '100%',
|
| 119 |
+
display: 'flex',
|
| 120 |
+
alignItems: 'center',
|
| 121 |
+
justifyContent: 'center'
|
| 122 |
+
}}
|
| 123 |
+
/>
|
| 124 |
+
)
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
export default LanguageTierHistoryPlot
|
| 128 |
+
|
frontend/src/components/LicenseHistoryPlot.js
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useRef, useEffect } from 'react'
|
| 2 |
+
import * as Plot from '@observablehq/plot'
|
| 3 |
+
|
| 4 |
+
const LicenseHistoryPlot = ({ data, width = 750, height = 500 }) => {
|
| 5 |
+
const containerRef = useRef()
|
| 6 |
+
|
| 7 |
+
const licenseHistory = [...(data.license_history || [])]
|
| 8 |
+
.filter(d => d.proficiency_score !== null && d.creation_date !== null)
|
| 9 |
+
.sort((a, b) => new Date(a.creation_date) - new Date(b.creation_date))
|
| 10 |
+
|
| 11 |
+
const licenseTypes = ['Commercial', 'Open-source']
|
| 12 |
+
const licenseRecords = {}
|
| 13 |
+
|
| 14 |
+
licenseTypes.forEach(type => {
|
| 15 |
+
const typeData = licenseHistory.filter(d => d.license_type === type)
|
| 16 |
+
const records = []
|
| 17 |
+
let maxScore = 0
|
| 18 |
+
|
| 19 |
+
typeData.forEach(curr => {
|
| 20 |
+
if (curr.proficiency_score > maxScore) {
|
| 21 |
+
maxScore = curr.proficiency_score
|
| 22 |
+
records.push({
|
| 23 |
+
...curr,
|
| 24 |
+
maxScore: maxScore,
|
| 25 |
+
newRecord: true
|
| 26 |
+
})
|
| 27 |
+
} else {
|
| 28 |
+
records.push({
|
| 29 |
+
...curr,
|
| 30 |
+
maxScore: maxScore,
|
| 31 |
+
newRecord: false
|
| 32 |
+
})
|
| 33 |
+
}
|
| 34 |
+
})
|
| 35 |
+
|
| 36 |
+
licenseRecords[type] = records
|
| 37 |
+
})
|
| 38 |
+
|
| 39 |
+
// Only show dots for new records
|
| 40 |
+
const recordBreakingDots = Object.values(licenseRecords).flat().filter(d => d.newRecord)
|
| 41 |
+
|
| 42 |
+
// Create step function data
|
| 43 |
+
const stepData = licenseTypes.flatMap(type => {
|
| 44 |
+
const records = licenseRecords[type].filter(d => d.newRecord)
|
| 45 |
+
if (records.length === 0) return []
|
| 46 |
+
|
| 47 |
+
return [
|
| 48 |
+
...records,
|
| 49 |
+
{
|
| 50 |
+
license_type: type,
|
| 51 |
+
creation_date: new Date(),
|
| 52 |
+
maxScore: records[records.length - 1]?.maxScore || 0
|
| 53 |
+
}
|
| 54 |
+
]
|
| 55 |
+
})
|
| 56 |
+
|
| 57 |
+
useEffect(() => {
|
| 58 |
+
const plot = Plot.plot({
|
| 59 |
+
width: width,
|
| 60 |
+
height: height,
|
| 61 |
+
subtitle: 'Commercial vs Open-source models over time',
|
| 62 |
+
x: {
|
| 63 |
+
label: 'Date',
|
| 64 |
+
type: 'time',
|
| 65 |
+
tickFormat: '%Y-%m'
|
| 66 |
+
},
|
| 67 |
+
y: {
|
| 68 |
+
label: 'Language Proficiency Score'
|
| 69 |
+
},
|
| 70 |
+
color: {
|
| 71 |
+
legend: true,
|
| 72 |
+
domain: licenseTypes
|
| 73 |
+
},
|
| 74 |
+
marks: [
|
| 75 |
+
Plot.dot(recordBreakingDots, {
|
| 76 |
+
x: d => new Date(d.creation_date),
|
| 77 |
+
y: d => d.proficiency_score,
|
| 78 |
+
fill: 'license_type',
|
| 79 |
+
stroke: 'license_type',
|
| 80 |
+
title: d =>
|
| 81 |
+
`${d.provider_name} - ${d.name} (${
|
| 82 |
+
d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
|
| 83 |
+
})\nType: ${d.license_type}\nPublished: ${new Date(
|
| 84 |
+
d.creation_date
|
| 85 |
+
).toLocaleDateString()}\nScore: ${d.proficiency_score.toFixed(2)}`,
|
| 86 |
+
tip: true
|
| 87 |
+
}),
|
| 88 |
+
Plot.line(stepData, {
|
| 89 |
+
x: d => new Date(d.creation_date),
|
| 90 |
+
y: d => d.maxScore || 0,
|
| 91 |
+
stroke: 'license_type',
|
| 92 |
+
curve: 'step-after',
|
| 93 |
+
strokeOpacity: 0.5,
|
| 94 |
+
strokeWidth: 2
|
| 95 |
+
})
|
| 96 |
+
]
|
| 97 |
+
})
|
| 98 |
+
containerRef.current.append(plot)
|
| 99 |
+
return () => plot.remove()
|
| 100 |
+
}, [recordBreakingDots, stepData, width, height])
|
| 101 |
+
|
| 102 |
+
return (
|
| 103 |
+
<div
|
| 104 |
+
ref={containerRef}
|
| 105 |
+
style={{
|
| 106 |
+
width: '100%',
|
| 107 |
+
height: '100%',
|
| 108 |
+
display: 'flex',
|
| 109 |
+
alignItems: 'center',
|
| 110 |
+
justifyContent: 'center'
|
| 111 |
+
}}
|
| 112 |
+
/>
|
| 113 |
+
)
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
export default LicenseHistoryPlot
|
| 117 |
+
|
frontend/src/components/ModelTable.js
CHANGED
|
@@ -248,7 +248,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
|
|
| 248 |
<Column
|
| 249 |
field='name'
|
| 250 |
header='Model'
|
| 251 |
-
style={{ minWidth: '10rem' }}
|
| 252 |
body={modelBodyTemplate}
|
| 253 |
filter
|
| 254 |
showFilterMatchModes={false}
|
|
|
|
| 248 |
<Column
|
| 249 |
field='name'
|
| 250 |
header='Model'
|
| 251 |
+
style={{ minWidth: '10rem', zIndex: 2 }}
|
| 252 |
body={modelBodyTemplate}
|
| 253 |
filter
|
| 254 |
showFilterMatchModes={false}
|
frontend/src/components/ScoreColumns.js
CHANGED
|
@@ -21,9 +21,9 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 21 |
<Column
|
| 22 |
field='average'
|
| 23 |
header='Proficiency'
|
| 24 |
-
headerTooltip='Language Proficiency Score (average of the scores for each task
|
| 25 |
sortable
|
| 26 |
-
body={scoreBodyTemplate('average', { minScore: 0.
|
| 27 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 28 |
/>,
|
| 29 |
<Column
|
|
@@ -33,7 +33,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 33 |
sortable
|
| 34 |
body={scoreBodyTemplate('translation_from_bleu', {
|
| 35 |
minScore: 0,
|
| 36 |
-
maxScore: 0.
|
| 37 |
machineTranslatedMetrics
|
| 38 |
})}
|
| 39 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
|
@@ -45,7 +45,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 45 |
sortable
|
| 46 |
body={scoreBodyTemplate('translation_to_bleu', {
|
| 47 |
minScore: 0,
|
| 48 |
-
maxScore: 0.
|
| 49 |
machineTranslatedMetrics
|
| 50 |
})}
|
| 51 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
|
@@ -56,8 +56,8 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
|
|
| 56 |
headerTooltip='Classification performance (accuracy on a sample of the SIB-200 / FLORES+ classification benchmark)'
|
| 57 |
sortable
|
| 58 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 59 |
-
minScore: 0,
|
| 60 |
-
maxScore:
|
| 61 |
machineTranslatedMetrics
|
| 62 |
})}
|
| 63 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
|
|
|
| 21 |
<Column
|
| 22 |
field='average'
|
| 23 |
header='Proficiency'
|
| 24 |
+
headerTooltip='Language Proficiency Score (average of the scores for each task)'
|
| 25 |
sortable
|
| 26 |
+
body={scoreBodyTemplate('average', { minScore: 0.3, maxScore: 0.7, machineTranslatedMetrics })}
|
| 27 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 28 |
/>,
|
| 29 |
<Column
|
|
|
|
| 33 |
sortable
|
| 34 |
body={scoreBodyTemplate('translation_from_bleu', {
|
| 35 |
minScore: 0,
|
| 36 |
+
maxScore: 0.4,
|
| 37 |
machineTranslatedMetrics
|
| 38 |
})}
|
| 39 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
|
|
|
| 45 |
sortable
|
| 46 |
body={scoreBodyTemplate('translation_to_bleu', {
|
| 47 |
minScore: 0,
|
| 48 |
+
maxScore: 0.4,
|
| 49 |
machineTranslatedMetrics
|
| 50 |
})}
|
| 51 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
|
|
|
| 56 |
headerTooltip='Classification performance (accuracy on a sample of the SIB-200 / FLORES+ classification benchmark)'
|
| 57 |
sortable
|
| 58 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 59 |
+
minScore: 0.4,
|
| 60 |
+
maxScore: 1,
|
| 61 |
machineTranslatedMetrics
|
| 62 |
})}
|
| 63 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|