Spaces:

fair-forward
/

languagebench

Running

App Files Files Community

davidpomerenke committed 24 days ago

Commit

972026c

verified ·

1 Parent(s): 6f0e312

Upload from GitHub Actions: drop normalization

Browse files

Files changed (8) hide show

evals/backend.py +105 -15
frontend/src/App.js +18 -2
frontend/src/components/DatasetTable.js +1 -1
frontend/src/components/LanguageTable.js +1 -1
frontend/src/components/LanguageTierHistoryPlot.js +128 -0
frontend/src/components/LicenseHistoryPlot.js +117 -0
frontend/src/components/ModelTable.js +1 -1
frontend/src/components/ScoreColumns.js +6 -6

evals/backend.py CHANGED Viewed

@@ -28,25 +28,12 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
-    "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
 def compute_normalized_average(df, metrics):
-    """Compute average of min-max normalized metric columns."""
-    normalized_df = df[metrics].copy()
-    for col in metrics:
-        if col in normalized_df.columns:
-            col_min = normalized_df[col].min()
-            col_max = normalized_df[col].max()
-            if col_max > col_min:  # Avoid division by zero
-                normalized_df[col] = (normalized_df[col] - col_min) / (
-                    col_max - col_min
-                )
-            else:
-                normalized_df[col] = 0  # If all values are the same, set to 0
-    return normalized_df.mean(axis=1, skipna=False)
 def make_model_table(scores_df, models):
@@ -156,6 +143,105 @@ def make_language_table(scores_df, languages):
     return df
 app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"])
@@ -190,6 +276,8 @@ async def data(request: Request):
         countries = make_country_table(make_language_table(df, languages))
     language_table = make_language_table(scores, languages)
     datasets_df = pd.read_json("data/datasets.json")
     return JSONResponse(content={
@@ -198,6 +286,8 @@ async def data(request: Request):
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
         "machine_translated_metrics": list(machine_translated_metrics),
     })

     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
     "mgsm_accuracy",
 ]
 def compute_normalized_average(df, metrics):
+    """Compute simple average across metric columns without normalization.
+
+    The function keeps its historical name, but no min-max normalization is
+    applied: the result is the plain row-wise mean of ``df[metrics]``.
+
+    Args:
+        df: DataFrame that contains every column listed in ``metrics``.
+        metrics: List of column names to average.
+
+    Returns:
+        One value per row of ``df``. Because ``skipna=False`` is used, a row
+        with a missing value in any of the metric columns yields NaN.
+    """
+    return df[metrics].mean(axis=1, skipna=False)
 def make_model_table(scores_df, models):
     return df
+def make_language_tier_history(scores_df, languages, models):
+    # Rank languages by speakers
+    ranked_langs = languages.sort_values(by="speakers", ascending=False).reset_index(drop=True)
+    # Define tiers
+    tier_ranges = {
+        "Top 1": (0, 1),
+        "Top 2-20": (1, 20),
+        "Top 20-200": (19, 500),
+    }
+    # Calculate model-language proficiency scores
+    scores_df = scores_df.copy()
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    # Pivot to get model-language-metric scores
+    pivot = scores_df.pivot_table(
+        index=["model", "bcp_47"],
+        columns="task_metric",
+        values="score",
+        aggfunc="mean"
+    )
+    # Ensure all task_metrics columns exist
+    for metric in task_metrics:
+        if metric not in pivot.columns:
+            pivot[metric] = np.nan
+    # Calculate proficiency score for each model-language pair
+    pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
+    pivot = pivot.reset_index()
+    # Create all tier-level aggregations (allowing overlapping tiers)
+    all_tier_scores = []
+    for tier_name, (start, end) in tier_ranges.items():
+        tier_langs = ranked_langs.iloc[start:end]["bcp_47"].tolist()
+        tier_data = pivot[pivot["bcp_47"].isin(tier_langs)]
+        tier_scores = tier_data.groupby("model")["proficiency_score"].mean().reset_index()
+        tier_scores["tier"] = tier_name
+        all_tier_scores.append(tier_scores)
+    tier_scores = pd.concat(all_tier_scores, ignore_index=True)
+    # Merge with models data
+    tier_scores = pd.merge(tier_scores, models, left_on="model", right_on="id", how="left")
+    # Select relevant columns
+    tier_scores = tier_scores[
+        ["model", "name", "provider_name", "creation_date", "size", "tier", "proficiency_score"]
+    ]
+    tier_scores["creation_date"] = tier_scores["creation_date"].apply(
+        lambda x: x.isoformat() if x else None
+    )
+    return tier_scores
+def make_license_history(scores_df, models):
+    scores_df = scores_df.copy()
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    # Pivot to get model-level scores
+    pivot = scores_df.pivot_table(
+        index="model",
+        columns="task_metric",
+        values="score",
+        aggfunc="mean"
+    )
+    # Ensure all task_metrics columns exist
+    for metric in task_metrics:
+        if metric not in pivot.columns:
+            pivot[metric] = np.nan
+    # Calculate proficiency score for each model
+    pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
+    pivot = pivot.reset_index()
+    # Merge with models data
+    df = pd.merge(pivot, models, left_on="model", right_on="id", how="left")
+    # Classify as commercial or open
+    df["license_type"] = df["type"].apply(
+        lambda x: "Open-source" if x == "open-source" else "Commercial"
+    )
+    # Select relevant columns
+    df = df[
+        ["model", "name", "provider_name", "creation_date", "size", "license_type", "proficiency_score"]
+    ]
+    df["creation_date"] = df["creation_date"].apply(
+        lambda x: x.isoformat() if x else None
+    )
+    return df
 app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"])
         countries = make_country_table(make_language_table(df, languages))
     language_table = make_language_table(scores, languages)
+    language_tier_history = make_language_tier_history(scores, languages, models)
+    license_history = make_license_history(scores, models)
     datasets_df = pd.read_json("data/datasets.json")
     return JSONResponse(content={
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
         "machine_translated_metrics": list(machine_translated_metrics),
+        "language_tier_history": serialize(language_tier_history),
+        "license_history": serialize(license_history),
     })

frontend/src/App.js CHANGED Viewed

@@ -9,6 +9,8 @@ import AutoComplete from './components/AutoComplete'
 import LanguagePlot from './components/LanguagePlot'
 import SpeakerPlot from './components/SpeakerPlot'
 import HistoryPlot from './components/HistoryPlot'
 import CostPlot from './components/CostPlot'
 import { Carousel } from 'primereact/carousel'
 import { Dialog } from 'primereact/dialog'
@@ -62,7 +64,9 @@ function App () {
           <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
           <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
           <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
-          <CostPlot key="costplot-4" data={data} width={750} height={500} />
         ]);
       }, 100);
@@ -112,7 +116,19 @@ function App () {
             width={windowWidth * 0.7}
             height={windowHeight * 0.6}
           />,
-          <CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
         ]);
       }, 100);

 import LanguagePlot from './components/LanguagePlot'
 import SpeakerPlot from './components/SpeakerPlot'
 import HistoryPlot from './components/HistoryPlot'
+import LanguageTierHistoryPlot from './components/LanguageTierHistoryPlot'
+import LicenseHistoryPlot from './components/LicenseHistoryPlot'
 import CostPlot from './components/CostPlot'
 import { Carousel } from 'primereact/carousel'
 import { Dialog } from 'primereact/dialog'
           <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
           <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
           <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
+          <LanguageTierHistoryPlot key="tierhistplot-4" data={data} width={750} height={500} />,
+          <LicenseHistoryPlot key="licensehistplot-5" data={data} width={750} height={500} />,
+          <CostPlot key="costplot-6" data={data} width={750} height={500} />
         ]);
       }, 100);
             width={windowWidth * 0.7}
             height={windowHeight * 0.6}
           />,
+          <LanguageTierHistoryPlot
+            key="fs-tierhistplot-4"
+            data={data}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <LicenseHistoryPlot
+            key="fs-licensehistplot-5"
+            data={data}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <CostPlot key="fs-costplot-6" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
         ]);
       }, 100);

frontend/src/components/DatasetTable.js CHANGED Viewed

@@ -138,7 +138,7 @@ const DatasetTable = ({ data }) => {
         field='name'
         header='Name'
         body={nameBodyTemplate}
-        style={{ minWidth: '10rem' }}
         frozen
       />
       <Column

         field='name'
         header='Name'
         body={nameBodyTemplate}
+        style={{ minWidth: '10rem', zIndex: 2 }}
         frozen
       />
       <Column

frontend/src/components/LanguageTable.js CHANGED Viewed

@@ -149,7 +149,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
         field='language_name'
         header='Language'
         body={languageBodyTemplate}
-        style={{ minWidth: '10rem' }}
         filter
         showFilterMatchModes={false}
         frozen

         field='language_name'
         header='Language'
         body={languageBodyTemplate}
+        style={{ minWidth: '10rem', zIndex: 2 }}
         filter
         showFilterMatchModes={false}
         frozen

frontend/src/components/LanguageTierHistoryPlot.js ADDED Viewed

	@@ -0,0 +1,128 @@

+import { useRef, useEffect } from 'react'
+import * as Plot from '@observablehq/plot'
+const LanguageTierHistoryPlot = ({ data, width = 750, height = 500 }) => {
+  const containerRef = useRef()
+  const tierHistory = [...(data.language_tier_history || [])]
+    .filter(d => d.proficiency_score !== null && d.creation_date !== null)
+    .sort((a, b) => new Date(a.creation_date) - new Date(b.creation_date))
+  // Get unique tiers from data, dynamically
+  const tiers = [...new Set(tierHistory.map(d => d.tier))]
+  // Add " languages" suffix for legend display
+  const tierWithSuffix = (tier) => `${tier} languages`
+  // Calculate max proficiency over time for each tier
+  const tierRecords = {}
+  tiers.forEach(tier => {
+    const tierData = tierHistory.filter(d => d.tier === tier)
+    const records = []
+    let maxScore = 0
+    tierData.forEach(curr => {
+      if (curr.proficiency_score > maxScore) {
+        maxScore = curr.proficiency_score
+        records.push({
+          ...curr,
+          maxScore: maxScore,
+          newRecord: true
+        })
+      } else {
+        records.push({
+          ...curr,
+          maxScore: maxScore,
+          newRecord: false
+        })
+      }
+    })
+    tierRecords[tier] = records
+  })
+  // Flatten for plotting - only show dots for new records
+  // Add " languages" suffix to tier for display
+  const recordBreakingDots = Object.values(tierRecords)
+    .flat()
+    .filter(d => d.newRecord)
+    .map(d => ({ ...d, tierDisplay: tierWithSuffix(d.tier) }))
+  // Create step function data for each tier
+  const stepData = tiers.flatMap(tier => {
+    const records = tierRecords[tier].filter(d => d.newRecord)
+    if (records.length === 0) return []
+    return [
+      ...records.map(d => ({ ...d, tierDisplay: tierWithSuffix(d.tier) })),
+      {
+        tier: tier,
+        tierDisplay: tierWithSuffix(tier),
+        creation_date: new Date(),
+        maxScore: records[records.length - 1]?.maxScore || 0
+      }
+    ]
+  })
+  useEffect(() => {
+    const plot = Plot.plot({
+      width: width,
+      height: height,
+      subtitle: 'Model performance on language tiers over time',
+      x: {
+        label: 'Date',
+        type: 'time',
+        tickFormat: '%Y-%m'
+      },
+      y: {
+        label: 'Language Tier Proficiency Score'
+      },
+      color: {
+        legend: true,
+        domain: tiers.map(tierWithSuffix)
+      },
+      marks: [
+        Plot.dot(recordBreakingDots, {
+          x: d => new Date(d.creation_date),
+          y: d => d.proficiency_score,
+          fill: 'tierDisplay',
+          stroke: 'tierDisplay',
+          title: d =>
+            `${d.provider_name} - ${d.name} (${
+              d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
+            })\nTier: ${d.tier}\nPublished: ${new Date(
+              d.creation_date
+            ).toLocaleDateString()}\nScore: ${d.proficiency_score.toFixed(2)}`,
+          tip: true
+        }),
+        Plot.line(stepData, {
+          x: d => new Date(d.creation_date),
+          y: d => d.maxScore || 0,
+          stroke: 'tierDisplay',
+          curve: 'step-after',
+          strokeOpacity: 0.5,
+          strokeWidth: 2
+        })
+      ]
+    })
+    containerRef.current.append(plot)
+    return () => plot.remove()
+  }, [recordBreakingDots, stepData, width, height, tiers])
+  return (
+    <div
+      ref={containerRef}
+      style={{
+        width: '100%',
+        height: '100%',
+        display: 'flex',
+        alignItems: 'center',
+        justifyContent: 'center'
+      }}
+    />
+  )
+}
+export default LanguageTierHistoryPlot

frontend/src/components/LicenseHistoryPlot.js ADDED Viewed

	@@ -0,0 +1,117 @@

+import { useRef, useEffect } from 'react'
+import * as Plot from '@observablehq/plot'
+const LicenseHistoryPlot = ({ data, width = 750, height = 500 }) => {
+  const containerRef = useRef()
+  const licenseHistory = [...(data.license_history || [])]
+    .filter(d => d.proficiency_score !== null && d.creation_date !== null)
+    .sort((a, b) => new Date(a.creation_date) - new Date(b.creation_date))
+  const licenseTypes = ['Commercial', 'Open-source']
+  const licenseRecords = {}
+  licenseTypes.forEach(type => {
+    const typeData = licenseHistory.filter(d => d.license_type === type)
+    const records = []
+    let maxScore = 0
+    typeData.forEach(curr => {
+      if (curr.proficiency_score > maxScore) {
+        maxScore = curr.proficiency_score
+        records.push({
+          ...curr,
+          maxScore: maxScore,
+          newRecord: true
+        })
+      } else {
+        records.push({
+          ...curr,
+          maxScore: maxScore,
+          newRecord: false
+        })
+      }
+    })
+    licenseRecords[type] = records
+  })
+  // Only show dots for new records
+  const recordBreakingDots = Object.values(licenseRecords).flat().filter(d => d.newRecord)
+  // Create step function data
+  const stepData = licenseTypes.flatMap(type => {
+    const records = licenseRecords[type].filter(d => d.newRecord)
+    if (records.length === 0) return []
+    return [
+      ...records,
+      {
+        license_type: type,
+        creation_date: new Date(),
+        maxScore: records[records.length - 1]?.maxScore || 0
+      }
+    ]
+  })
+  useEffect(() => {
+    const plot = Plot.plot({
+      width: width,
+      height: height,
+      subtitle: 'Commercial vs Open-source models over time',
+      x: {
+        label: 'Date',
+        type: 'time',
+        tickFormat: '%Y-%m'
+      },
+      y: {
+        label: 'Language Proficiency Score'
+      },
+      color: {
+        legend: true,
+        domain: licenseTypes
+      },
+      marks: [
+        Plot.dot(recordBreakingDots, {
+          x: d => new Date(d.creation_date),
+          y: d => d.proficiency_score,
+          fill: 'license_type',
+          stroke: 'license_type',
+          title: d =>
+            `${d.provider_name} - ${d.name} (${
+              d.size?.toLocaleString('en-US', { notation: 'compact' }) || '?B'
+            })\nType: ${d.license_type}\nPublished: ${new Date(
+              d.creation_date
+            ).toLocaleDateString()}\nScore: ${d.proficiency_score.toFixed(2)}`,
+          tip: true
+        }),
+        Plot.line(stepData, {
+          x: d => new Date(d.creation_date),
+          y: d => d.maxScore || 0,
+          stroke: 'license_type',
+          curve: 'step-after',
+          strokeOpacity: 0.5,
+          strokeWidth: 2
+        })
+      ]
+    })
+    containerRef.current.append(plot)
+    return () => plot.remove()
+  }, [recordBreakingDots, stepData, width, height])
+  return (
+    <div
+      ref={containerRef}
+      style={{
+        width: '100%',
+        height: '100%',
+        display: 'flex',
+        alignItems: 'center',
+        justifyContent: 'center'
+      }}
+    />
+  )
+}
+export default LicenseHistoryPlot

frontend/src/components/ModelTable.js CHANGED Viewed

@@ -248,7 +248,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
       <Column
         field='name'
         header='Model'
-        style={{ minWidth: '10rem' }}
         body={modelBodyTemplate}
         filter
         showFilterMatchModes={false}

       <Column
         field='name'
         header='Model'
+        style={{ minWidth: '10rem', zIndex: 2 }}
         body={modelBodyTemplate}
         filter
         showFilterMatchModes={false}

frontend/src/components/ScoreColumns.js CHANGED Viewed

@@ -21,9 +21,9 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
   <Column
     field='average'
     header='Proficiency'
-    headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
     sortable
-    body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5, machineTranslatedMetrics })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
   <Column
@@ -33,7 +33,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
     sortable
     body={scoreBodyTemplate('translation_from_bleu', {
       minScore: 0,
-      maxScore: 0.5,
       machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
@@ -45,7 +45,7 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
     sortable
     body={scoreBodyTemplate('translation_to_bleu', {
       minScore: 0,
-      maxScore: 0.5,
       machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
@@ -56,8 +56,8 @@ const ScoreColumns = (machineTranslatedMetrics = []) => [
     headerTooltip='Classification performance (accuracy on a sample of the SIB-200 / FLORES+ classification benchmark)'
     sortable
     body={scoreBodyTemplate('classification_accuracy', {
-      minScore: 0,
-      maxScore: 0.5,
       machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}

   <Column
     field='average'
     header='Proficiency'
+    headerTooltip='Language Proficiency Score (average of the scores for each task)'
     sortable
+    body={scoreBodyTemplate('average', { minScore: 0.3, maxScore: 0.7, machineTranslatedMetrics })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
   <Column
     sortable
     body={scoreBodyTemplate('translation_from_bleu', {
       minScore: 0,
+      maxScore: 0.4,
       machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
     sortable
     body={scoreBodyTemplate('translation_to_bleu', {
       minScore: 0,
+      maxScore: 0.4,
       machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
     headerTooltip='Classification performance (accuracy on a sample of the SIB-200 / FLORES+ classification benchmark)'
     sortable
     body={scoreBodyTemplate('classification_accuracy', {
+      minScore: 0.4,
+      maxScore: 1,
       machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}