	migrate to fsspec
app.py CHANGED
@@ -9,12 +9,8 @@ import numpy as np
 from datetime import datetime
 
 import gradio as gr
-import huggingface_hub
 import pandas as pd
-
-from huggingface_hub.file_download import repo_folder_name
-from huggingface_hub.hf_api import RepoFile
-from huggingface_hub.utils import EntryNotFoundError
+from datatrove.io import DataFolder
 
 FALLBACK_TOKEN_NAME = "HF_TOKEN"
 
@@ -41,20 +37,20 @@ def get_run_name_seed(run_name):
     run_name, seed = run_name.split("-seed-")
     return run_name, int(seed)
 
-def fetch_repo_structure(
+def fetch_repo_structure(results_uri, oauth_token: gr.OAuthToken | None = None):
     token = os.environ.get(FALLBACK_TOKEN_NAME)
     if oauth_token:
         token = oauth_token.token
 
-
-
-
+    data_folder = DataFolder(results_uri, token=token)
+    runs = [f.removeprefix("details/") for f in data_folder.list_files("details", recursive=False, include_directories=True) if f != "details"]
+
     if not runs:
         return {}, gr.update(choices=[], value=None)
 
     def process_run(run):
-        run_files =
-        return run,
+        run_files = [f.removeprefix(f"details/{run}/") for f in data_folder.list_files(f"details/{run}", recursive=False, include_directories=True) if f != f"details/{run}"]
+        return run, run_files
 
     with ThreadPoolExecutor() as executor:
         results = list(executor.map(process_run, runs))
@@ -86,14 +82,16 @@ def select_runs_by_language(runs, current_selected, language):
         return select_runs_by_regex(runs, current_selected, f".*-{language}-.*")
     return current_selected
 
-def fetch_available_tasks(repo_name, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
+def fetch_available_tasks(results_uri, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
     token = os.environ.get(FALLBACK_TOKEN_NAME)
 
+    data_folder = DataFolder(results_uri, token=token)
     all_tasks = defaultdict(lambda: defaultdict(dict))
+
     for run in runs_to_fetch:
         try:
-            files =
-            parquet_files = [f.
+            files = data_folder.list_files(f"details/{run}/{checkpoint}", recursive=False)
+            parquet_files = [f.split("/")[-1] for f in files if f.endswith('.parquet')]
 
             for full_filename in parquet_files:
                 task_name, date_str = full_filename.replace('.parquet', '').rsplit('_', 1)
@@ -101,8 +99,10 @@ def fetch_available_tasks(repo_name, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
 
                 if run not in all_tasks[task_name] or date > all_tasks[task_name][run]['date']:
                     all_tasks[task_name][run] = {'filename': full_filename, 'date': date}
-        except EntryNotFoundError:
+        except FileNotFoundError:
             print(f"Checkpoint not found for run: {run}")
+
+    print(all_tasks)
 
     available_tasks = {
         task: {run: info['filename'] for run, info in runs.items()}
@@ -112,17 +112,17 @@ def fetch_available_tasks(repo_name, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
 
     return available_tasks
 
-def fetch_run_results(
+def fetch_run_results(results_uri, runs_to_fetch, checkpoint,
                       oauth_token: gr.OAuthToken | None = None, progress=gr.Progress()):
 
-    task_runs_dict = fetch_available_tasks(
+    task_runs_dict = fetch_available_tasks(results_uri, runs_to_fetch, checkpoint)
     task_names = list(task_runs_dict.keys())
     return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
 
 
 def render_table(df, selected_runs, metric_names):
     if df is None or not selected_runs or not metric_names:
-        return None
+        return None, "0"
     kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
     other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
     df = df.drop(columns=other_metrics)
@@ -130,8 +130,9 @@ def render_table(df, selected_runs, metric_names):
     df = shorten_column_names(df, selected_runs, metric_names)
 
     # Sample 100
+    n_samples = len(df)
     df = df.sample(n=min(100, len(df)), random_state=42)
-    return df
+    return df, n_samples
 
 def get_column_widths(df):
     column_widths = []
@@ -170,19 +171,25 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
     return df
 
 
-def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files, progress=gr.Progress()):
+def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_files, progress=gr.Progress()):
     token = os.environ.get(FALLBACK_TOKEN_NAME)
     if not runs_to_fetch or not task_name:
         return None, None, None
 
+
+    print(runs_to_fetch)
+
+    data_folder = DataFolder(f"filecache::{results_uri}", token=token, cache_storage="./results-cache")
+    print(tasks_files)
+
     def fetch_run_file(run_to_fetch):
         file_path = f"details/{run_to_fetch}/{checkpoint}/{tasks_files[task_name][run_to_fetch]}"
         try:
-
-
+            with data_folder.open(file_path, "rb") as f:
+                df = pd.read_parquet(f)
             return df, run_to_fetch
-        except EntryNotFoundError:
-            print(f"File not found: {
+        except FileNotFoundError:
+            print(f"File not found: {tasks_files[task_name][run_to_fetch]}")
             return None, run_to_fetch
 
     with ThreadPoolExecutor() as pool:
@@ -245,7 +252,7 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,
     # Join all prepared DataFrames
     for df, run_name in zip(dfs, run_names):
         prepared_df = prepare_df(df, run_name, task_type)
-        combined_df = combined_df.join(prepared_df, how='outer'
+        combined_df = combined_df.join(prepared_df, how='outer')
 
 
     available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in combined_df.columns if col.startswith("metric_")))
@@ -259,7 +266,7 @@ with gr.Blocks() as demo:
     results_df_full = gr.State(None)
     tasks_files = gr.State({})
     login_button = gr.LoginButton(visible=False)
-
+    results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True)
     with gr.Column():
         gr.Markdown("# FineWeb experiments results explorer")
         with gr.Row():
@@ -277,11 +284,14 @@ with gr.Blocks() as demo:
         task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
         metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
         results_df = gr.Dataframe(interactive=False, wrap=True)
+        with gr.Row():
+            with gr.Column():
+                num_samples = gr.Text(interactive=False, label="# Samples")
 
     # Run selection
     gr.on(
-        triggers=[
-        fn=fetch_repo_structure, inputs=[
+        triggers=[results_uri.change],
+        fn=fetch_repo_structure, inputs=[results_uri], outputs=[runs_checkpoints, selected_runs],
     )
     gr.on(
         triggers=[select_by_regex_button.click],
@@ -306,37 +316,37 @@ with gr.Blocks() as demo:
     gr.on(
         triggers=[fetch_res.click],
        fn=fetch_run_results,
-        inputs=[
+        inputs=[results_uri, selected_runs, checkpoint],
         outputs=[task_name, tasks_files]
     ).then(
         fn=load_task_data,
-        inputs=[
+        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files],
         outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,
         inputs=[results_df_full, selected_runs, metric_names],
-        outputs=[results_df]
+        outputs=[results_df, num_samples]
    )
 
     # Update results when task name or metric changes
     gr.on(
         triggers=[task_name.input],
         fn=load_task_data,
-        inputs=[
+        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files],
         outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,
         inputs=[results_df_full, selected_runs, metric_names],
-        outputs=[results_df]
+        outputs=[results_df, num_samples]
     )
 
     gr.on(
         triggers=[metric_names.input],
         fn=render_table,
         inputs=[results_df_full, selected_runs, metric_names],
-        outputs=[results_df]
+        outputs=[results_df, num_samples]
     )
 
-    demo.load(fn=fetch_repo_structure, inputs=[
+    demo.load(fn=fetch_repo_structure, inputs=[results_uri], outputs=[runs_checkpoints, selected_runs])
 
 demo.launch()
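For readers landing on this commit: datatrove's DataFolder is a thin wrapper over an fsspec filesystem, which is what lets the app replace the huggingface_hub-specific listing and download calls with URI-driven access. Below is a minimal sketch of the access pattern the new code relies on; the bucket and parquet path are hypothetical stand-ins, and the DataFolder calls mirror the ones in the diff above.

import pandas as pd
from datatrove.io import DataFolder

# Hypothetical results location; any fsspec URL (s3://, hf://, local path) works.
results_uri = "s3://my-bucket/evals/"

# Listing, as in fetch_repo_structure: run directories live under "details/".
data_folder = DataFolder(results_uri)  # pass token=... for private storage
runs = [
    f.removeprefix("details/")
    for f in data_folder.list_files("details", recursive=False, include_directories=True)
    if f != "details"
]
print(runs)

# Reading, as in load_task_data: chain fsspec's "filecache::" so each parquet
# file is downloaded once into ./results-cache and re-read from local disk.
cached = DataFolder(f"filecache::{results_uri}", cache_storage="./results-cache")
with cached.open("details/run-a/1000/task_2024-01-01.parquet", "rb") as f:  # hypothetical path
    df = pd.read_parquet(f)
print(len(df))

Chaining filecache:: only on the read path, as load_task_data does, likely keeps directory listings fresh while caching the larger parquet downloads, so re-rendering the table does not re-fetch the same files.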

