Commit 01ea22b · Parent(s): 4f3c2a8
Clémentine committed:
now with a functionning backend
Files changed:
- .gitignore +3 -5
- README.md +6 -2
- app.py +10 -1
- main_backend.py +78 -0
- requirements.txt +3 -1
- scripts/fix_harness_import.py +11 -0
- src/{display/about.py → about.py} +6 -2
- src/backend/manage_requests.py +123 -0
- src/backend/run_eval_suite.py +57 -0
- src/backend/sort_queue.py +28 -0
- src/display/formatting.py +0 -9
- src/display/utils.py +1 -1
- src/envs.py +11 -3
- src/leaderboard/read_evals.py +1 -1
    	
.gitignore CHANGED
@@ -6,10 +6,8 @@ __pycache__/
 *ipynb
 .vscode/
 
-gpt_4_evals/
-human_evals/
 eval-queue/
 eval-results/
-
-
-
+eval-queue-bk/
+eval-results-bk/
+logs/
    	
README.md CHANGED
@@ -12,7 +12,7 @@ license: apache-2.0
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
-Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/
+Most of the variables to change for a default leaderboard are in src/env (replace the path for your leaderboard) and src/about.
 
 Results files should have the following format:
 ```
@@ -33,4 +33,8 @@ Results files should have the following format:
 }
 ```
 
-Request files are created automatically by this tool.
+Request files are created automatically by this tool.
+
+If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+If you want to run your own backend, you only need to change the logic in src/backend/run_eval_suite, which at the moment launches the Eleuther AI Harness.
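As that last README line says, plugging in a different evaluation backend amounts to reimplementing `run_evaluation` in src/backend/run_eval_suite.py with the same signature. A minimal sketch of what such a replacement could look like, keeping the result-writing and upload pattern from the committed version; `score_model` is a hypothetical stand-in for your own scoring logic:

```python
import json
import os
from datetime import datetime

from src.backend.manage_requests import EvalRequest
from src.envs import API


def score_model(model_name: str, task: str, device: str = "cpu") -> float:
    """Hypothetical placeholder: replace with your own evaluation of `model_name` on `task`."""
    return 0.0


def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size,
                   device, local_dir: str, results_repo: str, no_cache=True, limit=None):
    # Run whatever evaluation logic you like instead of the Eleuther AI Harness.
    results = {
        "config": {
            "model_dtype": eval_request.precision,
            "model_name": eval_request.model,
            "model_sha": eval_request.revision,
        },
        "results": {task: {"score": score_model(eval_request.model, task, device)} for task in task_names},
    }

    # Same local-write + hub-upload pattern as the committed run_eval_suite.py.
    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(json.dumps(results, indent=2))

    API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
        repo_id=results_repo,
        repo_type="dataset",
    )
    return results
```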
    	
app.py CHANGED
@@ -1,9 +1,10 @@
+import subprocess
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
-from src.display.about import (
+from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
@@ -30,9 +31,14 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 
+subprocess.run(["python", "scripts/fix_harness_import.py"])
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
+def launch_backend():
+    _ = subprocess.run(["python", "main_backend.py"])
+
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -342,5 +348,8 @@ with demo:
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
+
+restart_space()
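A side note on the comment in the new scheduler line (this is documented APScheduler behaviour, not something stated elsewhere in the commit): `add_job` defaults to `max_instances=1`, so if a `launch_backend` run is still evaluating when the next 100-second tick fires, that tick is skipped with a warning rather than starting a second backend process. A minimal sketch that spells the default out explicitly, with a stand-in job:

```python
import time

from apscheduler.schedulers.background import BackgroundScheduler


def launch_backend():
    # Stand-in for app.py's launch_backend: deliberately slower than the interval
    # to show that overlapping runs are skipped, not queued up.
    time.sleep(300)


scheduler = BackgroundScheduler()
# max_instances=1 is the APScheduler default; it is what makes
# "will only allow one job to be run at the same time" true.
scheduler.add_job(launch_backend, "interval", seconds=100, max_instances=1)
scheduler.start()
```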
    	
main_backend.py ADDED
@@ -0,0 +1,78 @@
+import logging
+import pprint
+
+from huggingface_hub import snapshot_download
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+
+from src.backend.run_eval_suite import run_evaluation
+from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
+from src.backend.sort_queue import sort_models_by_priority
+
+from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT
+from src.about import Tasks, NUM_FEWSHOT
+TASKS_HARNESS = [task.value.benchmark for task in Tasks]
+
+logging.basicConfig(level=logging.ERROR)
+pp = pprint.PrettyPrinter(width=80)
+
+PENDING_STATUS = "PENDING"
+RUNNING_STATUS = "RUNNING"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+
+snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+
+def run_auto_eval():
+    current_pending_status = [PENDING_STATUS]
+
+    # pull the eval dataset from the hub and parse any eval requests
+    # check completed evals and set them to finished
+    check_completed_evals(
+        api=API,
+        checked_status=RUNNING_STATUS,
+        completed_status=FINISHED_STATUS,
+        failed_status=FAILED_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+        hf_repo_results=RESULTS_REPO,
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+    )
+
+    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
+    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    # Sort the evals by priority (first submitted first run)
+    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
+
+    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
+
+    if len(eval_requests) == 0:
+        return
+
+    eval_request = eval_requests[0]
+    pp.pprint(eval_request)
+
+    set_eval_request(
+        api=API,
+        eval_request=eval_request,
+        set_to_status=RUNNING_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    )
+
+    run_evaluation(
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
+        num_fewshot=NUM_FEWSHOT,
+        local_dir=EVAL_RESULTS_PATH_BACKEND,
+        results_repo=RESULTS_REPO,
+        batch_size=1,
+        device=DEVICE,
+        no_cache=True,
+        limit=LIMIT
+    )
+
+
+if __name__ == "__main__":
+    run_auto_eval()
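main_backend.py processes at most one pending request per invocation and then exits; in this commit it is re-launched every 100 seconds by the scheduler job in app.py. If you instead wanted to run the backend as a standalone process from the repository root (not something this commit does), a simple loop would behave the same way, a sketch:

```python
import time

# Importing main_backend also triggers its module-level snapshot_download calls,
# so the queue and results caches are populated before the first pass.
from main_backend import run_auto_eval

if __name__ == "__main__":
    while True:
        run_auto_eval()   # handle at most one PENDING request per pass
        time.sleep(100)   # same cadence as the scheduler job in app.py
```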
    	
requirements.txt CHANGED
@@ -12,4 +12,6 @@ python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
 transformers==4.35.2
-tokenizers>=0.15.0
+tokenizers>=0.15.0
+git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
+accelerate==0.24.1
    	
scripts/fix_harness_import.py ADDED
@@ -0,0 +1,11 @@
+"""This file should be used after pip install -r requirements.
+It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
+It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
+"""
+import os
+
+import lm_eval
+
+if __name__ == "__main__":
+    lm_eval_path = lm_eval.__path__[0]
+    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
    	
src/{display/about.py → about.py} RENAMED
@@ -11,8 +11,12 @@ class Task:
 # Init: to update with your specific keys
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("anli_r1", "acc", "ANLI")
+    task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+TASKS_HARNESS = [task.value.benchmark for task in Tasks]
+
+NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
    	
src/backend/manage_requests.py ADDED
@@ -0,0 +1,123 @@
+import glob
+import json
+from dataclasses import dataclass
+from typing import Optional
+
+from huggingface_hub import HfApi, snapshot_download
+from src.envs import TOKEN
+
+@dataclass
+class EvalRequest:
+    model: str
+    private: bool
+    status: str
+    json_filepath: str
+    weight_type: str = "Original"
+    model_type: str = ""  # pretrained, finetuned, with RL
+    precision: str = ""  # float16, bfloat16, 8bit, 4bit, GPTQ
+    base_model: Optional[str] = None # for adapter models
+    revision: str = "main" # commit
+    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
+    model_type: Optional[str] = None
+    likes: Optional[int] = 0
+    params: Optional[int] = None
+    license: Optional[str] = ""
+
+    def get_model_args(self):
+        model_args = f"pretrained={self.model},revision={self.revision}"
+
+        if self.precision in ["float16", "bfloat16"]:
+            model_args += f",dtype={self.precision}"
+        elif self.precision == "8bit":
+            model_args += ",load_in_8bit=True"
+        elif self.precision == "4bit":
+            model_args += ",load_in_4bit=True"
+        elif self.precision == "GPTQ":
+            # A GPTQ model does not need dtype to be specified,
+            # it will be inferred from the config
+            pass
+        else:
+            raise Exception(f"Unknown precision {self.precision}.")
+
+        return model_args
+
+
+def set_eval_request(
+    api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str
+):
+    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
+    json_filepath = eval_request.json_filepath
+
+    with open(json_filepath) as fp:
+        data = json.load(fp)
+
+    data["status"] = set_to_status
+
+    with open(json_filepath, "w") as f:
+        f.write(json.dumps(data))
+
+    api.upload_file(
+        path_or_fileobj=json_filepath,
+        path_in_repo=json_filepath.replace(local_dir, ""),
+        repo_id=hf_repo,
+        repo_type="dataset",
+    )
+
+
+def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
+    """Get all pending evaluation requests and return a list in which private
+    models appearing first, followed by public models sorted by the number of
+    likes.
+
+    Returns:
+        `list[EvalRequest]`: a list of model info dicts.
+    """
+    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
+    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
+
+    eval_requests = []
+    for json_filepath in json_files:
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+        if data["status"] in job_status:
+            data["json_filepath"] = json_filepath
+            eval_request = EvalRequest(**data)
+            eval_requests.append(eval_request)
+
+    return eval_requests
+
+
+def check_completed_evals(
+    api: HfApi,
+    hf_repo: str,
+    local_dir: str,
+    checked_status: str,
+    completed_status: str,
+    failed_status: str,
+    hf_repo_results: str,
+    local_dir_results: str,
+):
+    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
+    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)
+
+    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
+
+    for eval_request in running_evals:
+        model = eval_request.model
+        print("====================================")
+        print(f"Checking {model}")
+
+        output_path = model
+        output_file = f"{local_dir_results}/{output_path}/results*.json"
+        output_file_exists = len(glob.glob(output_file)) > 0
+
+        if output_file_exists:
+            print(
+                f"EXISTS output file exists for {model} setting it to {completed_status}"
+            )
+            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
+        else:
+            print(
+                f"No result file found for {model} setting it to {failed_status}"
+            )
+            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
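Two small observations on the dataclass above, plus a sketch (hypothetical data, not part of the commit) of the request JSON it expects: `model_type` is declared twice, and since the class body is simply executed top to bottom, the second declaration (`Optional[str] = None`) is the one that takes effect; and `json_filepath` has no default, so `get_eval_requests` injects it before building the object.

```python
from src.backend.manage_requests import EvalRequest

# Minimal shape of a request file in the requests dataset (field names mirror
# EvalRequest; all values here are made up for illustration).
request_data = {
    "model": "some-org/some-model",
    "private": False,
    "status": "PENDING",
    "precision": "float16",
    "revision": "main",
    "submitted_time": "2024-01-01T00:00:00",
    "likes": 0,
    "params": 7,
}

# get_eval_requests() adds json_filepath itself before calling EvalRequest(**data).
eval_request = EvalRequest(json_filepath="eval-queue-bk/some-org/some-model.json", **request_data)

print(eval_request.get_model_args())
# -> pretrained=some-org/some-model,revision=main,dtype=float16
```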
    	
src/backend/run_eval_suite.py ADDED
@@ -0,0 +1,57 @@
+import json
+import os
+import logging
+from datetime import datetime
+
+from lm_eval import tasks, evaluator, utils
+
+from src.envs import RESULTS_REPO, API
+from src.backend.manage_requests import EvalRequest
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+
+def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
+    if limit:
+        print(
+            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
+
+    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
+
+    print(f"Selected Tasks: {task_names}")
+
+    results = evaluator.simple_evaluate(
+        model="hf-causal-experimental", # "hf-causal"
+        model_args=eval_request.get_model_args(),
+        tasks=task_names,
+        num_fewshot=num_fewshot,
+        batch_size=batch_size,
+        device=device,
+        no_cache=no_cache,
+        limit=limit,
+        write_out=True,
+        output_base_path="logs"
+    )
+
+    results["config"]["model_dtype"] = eval_request.precision
+    results["config"]["model_name"] = eval_request.model
+    results["config"]["model_sha"] = eval_request.revision
+
+    dumped = json.dumps(results, indent=2)
+    print(dumped)
+
+    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, "w") as f:
+        f.write(dumped)
+
+    print(evaluator.make_table(results))
+
+    API.upload_file(
+        path_or_fileobj=output_path,
+        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
+        repo_id=results_repo,
+        repo_type="dataset",
+    )
+
+    return results
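One small detail worth flagging (an observation about the code above, not a change in the commit): `datetime.now()` is evaluated twice, so the filename written locally and the `path_in_repo` used for the upload end up with slightly different timestamps. If you want them to match, computing the name once is enough:

```python
from datetime import datetime

# Compute the timestamp a single time so the local file and the uploaded file share a name.
result_filename = f"results_{datetime.now()}.json"
```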
    	
src/backend/sort_queue.py ADDED
@@ -0,0 +1,28 @@
+import re
+from dataclasses import dataclass
+
+from huggingface_hub import HfApi
+
+from src.backend.manage_requests import EvalRequest
+
+
+@dataclass
+class ModelMetadata:
+    likes: int = 0
+    size: int = 15
+
+
+def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
+    private_models = [model for model in models if model.private]
+    public_models = [model for model in models if not model.private]
+
+    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
+
+def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
+
+def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+    return sorted(eval_requests, key=lambda x: x.params, reverse=False)
+
+def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
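A quick illustration of the resulting priority order (the requests below are made up for the example): private models are handled first, and within each group the oldest submission runs first. The `api` argument is accepted but unused by the current implementation, so passing `None` is fine for a local check:

```python
from src.backend.manage_requests import EvalRequest
from src.backend.sort_queue import sort_models_by_priority

requests = [
    EvalRequest(model="org/public-new", private=False, status="PENDING", json_filepath="", submitted_time="2024-02-01T00:00:00"),
    EvalRequest(model="org/private-old", private=True, status="PENDING", json_filepath="", submitted_time="2024-01-01T00:00:00"),
    EvalRequest(model="org/public-old", private=False, status="PENDING", json_filepath="", submitted_time="2023-12-01T00:00:00"),
]

ordered = sort_models_by_priority(api=None, models=requests)
print([r.model for r in ordered])
# -> ['org/private-old', 'org/public-old', 'org/public-new']
```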
    	
src/display/formatting.py CHANGED
@@ -1,12 +1,3 @@
-import os
-from datetime import datetime, timezone
-
-from huggingface_hub import HfApi
-from huggingface_hub.hf_api import ModelInfo
-
-
-API = HfApi()
-
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
    	
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.display.about import Tasks
+from src.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
    	
src/envs.py CHANGED
@@ -2,18 +2,26 @@ import os
 
 from huggingface_hub import HfApi
 
-#
-
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("TOKEN", None) # A read/write token for your org
+
+OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request file
+DEVICE = "cpu" # cuda:0 if you add compute
+LIMIT = 20 # !!!! Should be None for actual evaluations!!!
+# ----------------------------------
 
-OWNER = "demo-leaderboard"
 REPO_ID = f"{OWNER}/leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
+# If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 API = HfApi(token=TOKEN)
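The new `*_BACKEND` paths give the backend its own local copies of the queue and results repos, separate from the `eval-queue` / `eval-results` folders the Gradio app reads, so one side never overwrites the other's working copy. Since `CACHE_PATH` is resolved from `HF_HOME` at import time, a quick way to sanity-check the layout locally (a sketch, not part of the commit):

```python
import os

# Point the caches at a disposable location *before* importing src.envs.
os.environ["HF_HOME"] = "/tmp/leaderboard-cache"

from src.envs import EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND

print(EVAL_REQUESTS_PATH_BACKEND)  # /tmp/leaderboard-cache/eval-queue-bk
print(EVAL_RESULTS_PATH_BACKEND)   # /tmp/leaderboard-cache/eval-results-bk
```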
    	
src/leaderboard/read_evals.py CHANGED
@@ -103,7 +103,7 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model}")
+            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""