Commit 01ea22b · Parent(s): 4f3c2a8
Clémentine committed:
now with a functionning backend
Files changed:
- .gitignore +3 -5
- README.md +6 -2
- app.py +10 -1
- main_backend.py +78 -0
- requirements.txt +3 -1
- scripts/fix_harness_import.py +11 -0
- src/{display/about.py → about.py} +6 -2
- src/backend/manage_requests.py +123 -0
- src/backend/run_eval_suite.py +57 -0
- src/backend/sort_queue.py +28 -0
- src/display/formatting.py +0 -9
- src/display/utils.py +1 -1
- src/envs.py +11 -3
- src/leaderboard/read_evals.py +1 -1
    	
.gitignore CHANGED
@@ -6,10 +6,8 @@ __pycache__/
 *ipynb
 .vscode/
 
-gpt_4_evals/
-human_evals/
 eval-queue/
 eval-results/
-
-
-
+eval-queue-bk/
+eval-results-bk/
+logs/
    	
README.md CHANGED
@@ -12,7 +12,7 @@ license: apache-2.0
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
-Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/
+Most of the variables to change for a default leaderboard are in src/env (replace the path for your leaderboard) and src/about.
 
 Results files should have the following format:
 ```
@@ -33,4 +33,8 @@ Results files should have the following format:
 }
 ```
 
-Request files are created automatically by this tool.
+Request files are created automatically by this tool.
+
+If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+If you want to run your own backend, you only need to change the logic in src/backend/run_eval_suite, which at the moment launches the Eleuther AI Harness.
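As that last README line says, plugging in a different evaluation backend amounts to reimplementing `run_evaluation` in src/backend/run_eval_suite.py with the same signature. A minimal sketch of what such a replacement could look like, keeping the result-writing and upload pattern from the committed version; `score_model` is a hypothetical stand-in for your own scoring logic:

```python
import json
import os
from datetime import datetime

from src.backend.manage_requests import EvalRequest
from src.envs import API


def score_model(model_name: str, task: str, device: str = "cpu") -> float:
    """Hypothetical placeholder: replace with your own evaluation of `model_name` on `task`."""
    return 0.0


def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size,
                   device, local_dir: str, results_repo: str, no_cache=True, limit=None):
    # Run whatever evaluation logic you like instead of the Eleuther AI Harness.
    results = {
        "config": {
            "model_dtype": eval_request.precision,
            "model_name": eval_request.model,
            "model_sha": eval_request.revision,
        },
        "results": {task: {"score": score_model(eval_request.model, task, device)} for task in task_names},
    }

    # Same local-write + hub-upload pattern as the committed run_eval_suite.py.
    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(json.dumps(results, indent=2))

    API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
        repo_id=results_repo,
        repo_type="dataset",
    )
    return results
```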
    	
app.py CHANGED
@@ -1,9 +1,10 @@
+import subprocess
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
-from src.display.about import (
+from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
@@ -30,9 +31,14 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 
+subprocess.run(["python", "scripts/fix_harness_import.py"])
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
+def launch_backend():
+    _ = subprocess.run(["python", "main_backend.py"])
+
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -342,5 +348,8 @@ with demo:
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
+
+restart_space()
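A side note on the comment in the new scheduler line (this is documented APScheduler behaviour, not something stated elsewhere in the commit): `add_job` defaults to `max_instances=1`, so if a `launch_backend` run is still evaluating when the next 100-second tick fires, that tick is skipped with a warning rather than starting a second backend process. A minimal sketch that spells the default out explicitly, with a stand-in job:

```python
import time

from apscheduler.schedulers.background import BackgroundScheduler


def launch_backend():
    # Stand-in for app.py's launch_backend: deliberately slower than the interval
    # to show that overlapping runs are skipped, not queued up.
    time.sleep(300)


scheduler = BackgroundScheduler()
# max_instances=1 is the APScheduler default; it is what makes
# "will only allow one job to be run at the same time" true.
scheduler.add_job(launch_backend, "interval", seconds=100, max_instances=1)
scheduler.start()
```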
    	
main_backend.py ADDED
@@ -0,0 +1,78 @@
+import logging
+import pprint
+
+from huggingface_hub import snapshot_download
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+
+from src.backend.run_eval_suite import run_evaluation
+from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
+from src.backend.sort_queue import sort_models_by_priority
+
+from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT
+from src.about import Tasks, NUM_FEWSHOT
+TASKS_HARNESS = [task.value.benchmark for task in Tasks]
+
+logging.basicConfig(level=logging.ERROR)
+pp = pprint.PrettyPrinter(width=80)
+
+PENDING_STATUS = "PENDING"
+RUNNING_STATUS = "RUNNING"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+
+snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+
+def run_auto_eval():
+    current_pending_status = [PENDING_STATUS]
+
+    # pull the eval dataset from the hub and parse any eval requests
+    # check completed evals and set them to finished
+    check_completed_evals(
+        api=API,
+        checked_status=RUNNING_STATUS,
+        completed_status=FINISHED_STATUS,
+        failed_status=FAILED_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+        hf_repo_results=RESULTS_REPO,
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+    )
+
+    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
+    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    # Sort the evals by priority (first submitted first run)
+    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
+
+    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
+
+    if len(eval_requests) == 0:
+        return
+
+    eval_request = eval_requests[0]
+    pp.pprint(eval_request)
+
+    set_eval_request(
+        api=API,
+        eval_request=eval_request,
+        set_to_status=RUNNING_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    )
+
+    run_evaluation(
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
+        num_fewshot=NUM_FEWSHOT,
+        local_dir=EVAL_RESULTS_PATH_BACKEND,
+        results_repo=RESULTS_REPO,
+        batch_size=1,
+        device=DEVICE,
+        no_cache=True,
+        limit=LIMIT
+    )
+
+
+if __name__ == "__main__":
+    run_auto_eval()
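main_backend.py processes at most one pending request per invocation and then exits; in this commit it is re-launched every 100 seconds by the scheduler job in app.py. If you instead wanted to run the backend as a standalone process from the repository root (not something this commit does), a simple loop would behave the same way, a sketch:

```python
import time

# Importing main_backend also triggers its module-level snapshot_download calls,
# so the queue and results caches are populated before the first pass.
from main_backend import run_auto_eval

if __name__ == "__main__":
    while True:
        run_auto_eval()   # handle at most one PENDING request per pass
        time.sleep(100)   # same cadence as the scheduler job in app.py
```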
    	
requirements.txt CHANGED
@@ -12,4 +12,6 @@ python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
 transformers==4.35.2
-tokenizers>=0.15.0
+tokenizers>=0.15.0
+git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
+accelerate==0.24.1
    	
scripts/fix_harness_import.py ADDED
@@ -0,0 +1,11 @@
+"""This file should be used after pip install -r requirements.
+It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
+It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
+"""
+import os
+
+import lm_eval
+
+if __name__ == "__main__":
+    lm_eval_path = lm_eval.__path__[0]
+    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
    	
src/{display/about.py → about.py} RENAMED
@@ -11,8 +11,12 @@ class Task:
 # Init: to update with your specific keys
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("anli_r1", "acc", "ANLI")
+    task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+TASKS_HARNESS = [task.value.benchmark for task in Tasks]
+
+NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
    	
src/backend/manage_requests.py ADDED
@@ -0,0 +1,123 @@
+import glob
+import json
+from dataclasses import dataclass
+from typing import Optional
+
+from huggingface_hub import HfApi, snapshot_download
+from src.envs import TOKEN
+
+@dataclass
+class EvalRequest:
+    model: str
+    private: bool
+    status: str
+    json_filepath: str
+    weight_type: str = "Original"
+    model_type: str = ""  # pretrained, finetuned, with RL
+    precision: str = ""  # float16, bfloat16, 8bit, 4bit, GPTQ
+    base_model: Optional[str] = None # for adapter models
+    revision: str = "main" # commit
+    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
+    model_type: Optional[str] = None
+    likes: Optional[int] = 0
+    params: Optional[int] = None
+    license: Optional[str] = ""
+
+    def get_model_args(self):
+        model_args = f"pretrained={self.model},revision={self.revision}"
+
+        if self.precision in ["float16", "bfloat16"]:
+            model_args += f",dtype={self.precision}"
+        elif self.precision == "8bit":
+            model_args += ",load_in_8bit=True"
+        elif self.precision == "4bit":
+            model_args += ",load_in_4bit=True"
+        elif self.precision == "GPTQ":
+            # A GPTQ model does not need dtype to be specified,
+            # it will be inferred from the config
+            pass
+        else:
+            raise Exception(f"Unknown precision {self.precision}.")
+
+        return model_args
+
+
+def set_eval_request(
+    api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str
+):
+    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
+    json_filepath = eval_request.json_filepath
+
+    with open(json_filepath) as fp:
+        data = json.load(fp)
+
+    data["status"] = set_to_status
+
+    with open(json_filepath, "w") as f:
+        f.write(json.dumps(data))
+
+    api.upload_file(
+        path_or_fileobj=json_filepath,
+        path_in_repo=json_filepath.replace(local_dir, ""),
+        repo_id=hf_repo,
+        repo_type="dataset",
+    )
+
+
+def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
+    """Get all pending evaluation requests and return a list in which private
+    models appearing first, followed by public models sorted by the number of
+    likes.
+
+    Returns:
+        `list[EvalRequest]`: a list of model info dicts.
+    """
+    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
+    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
+
+    eval_requests = []
+    for json_filepath in json_files:
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+        if data["status"] in job_status:
+            data["json_filepath"] = json_filepath
+            eval_request = EvalRequest(**data)
+            eval_requests.append(eval_request)
+
+    return eval_requests
+
+
+def check_completed_evals(
+    api: HfApi,
+    hf_repo: str,
+    local_dir: str,
+    checked_status: str,
+    completed_status: str,
+    failed_status: str,
+    hf_repo_results: str,
+    local_dir_results: str,
+):
+    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
+    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)
+
+    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
+
+    for eval_request in running_evals:
+        model = eval_request.model
+        print("====================================")
+        print(f"Checking {model}")
+
+        output_path = model
+        output_file = f"{local_dir_results}/{output_path}/results*.json"
+        output_file_exists = len(glob.glob(output_file)) > 0
+
+        if output_file_exists:
+            print(
+                f"EXISTS output file exists for {model} setting it to {completed_status}"
+            )
+            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
+        else:
+            print(
+                f"No result file found for {model} setting it to {failed_status}"
+            )
+            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
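Two small observations on the dataclass above, plus a sketch (hypothetical data, not part of the commit) of the request JSON it expects: `model_type` is declared twice, and since the class body is simply executed top to bottom, the second declaration (`Optional[str] = None`) is the one that takes effect; and `json_filepath` has no default, so `get_eval_requests` injects it before building the object.

```python
from src.backend.manage_requests import EvalRequest

# Minimal shape of a request file in the requests dataset (field names mirror
# EvalRequest; all values here are made up for illustration).
request_data = {
    "model": "some-org/some-model",
    "private": False,
    "status": "PENDING",
    "precision": "float16",
    "revision": "main",
    "submitted_time": "2024-01-01T00:00:00",
    "likes": 0,
    "params": 7,
}

# get_eval_requests() adds json_filepath itself before calling EvalRequest(**data).
eval_request = EvalRequest(json_filepath="eval-queue-bk/some-org/some-model.json", **request_data)

print(eval_request.get_model_args())
# -> pretrained=some-org/some-model,revision=main,dtype=float16
```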
    	
src/backend/run_eval_suite.py ADDED
@@ -0,0 +1,57 @@
+import json
+import os
+import logging
+from datetime import datetime
+
+from lm_eval import tasks, evaluator, utils
+
+from src.envs import RESULTS_REPO, API
+from src.backend.manage_requests import EvalRequest
+
+logging.getLogger("openai").setLevel(logging.WARNING)
+
+def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
+    if limit:
+        print(
+            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
+
+    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
+
+    print(f"Selected Tasks: {task_names}")
+
+    results = evaluator.simple_evaluate(
+        model="hf-causal-experimental", # "hf-causal"
+        model_args=eval_request.get_model_args(),
+        tasks=task_names,
+        num_fewshot=num_fewshot,
+        batch_size=batch_size,
+        device=device,
+        no_cache=no_cache,
+        limit=limit,
+        write_out=True,
+        output_base_path="logs"
+    )
+
+    results["config"]["model_dtype"] = eval_request.precision
+    results["config"]["model_name"] = eval_request.model
+    results["config"]["model_sha"] = eval_request.revision
+
+    dumped = json.dumps(results, indent=2)
+    print(dumped)
+
+    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, "w") as f:
+        f.write(dumped)
+
+    print(evaluator.make_table(results))
+
+    API.upload_file(
+        path_or_fileobj=output_path,
+        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
+        repo_id=results_repo,
+        repo_type="dataset",
+    )
+
+    return results
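One small detail worth flagging (an observation about the code above, not a change in the commit): `datetime.now()` is evaluated twice, so the filename written locally and the `path_in_repo` used for the upload end up with slightly different timestamps. If you want them to match, computing the name once is enough:

```python
from datetime import datetime

# Compute the timestamp a single time so the local file and the uploaded file share a name.
result_filename = f"results_{datetime.now()}.json"
```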
    	
src/backend/sort_queue.py ADDED
@@ -0,0 +1,28 @@
+import re
+from dataclasses import dataclass
+
+from huggingface_hub import HfApi
+
+from src.backend.manage_requests import EvalRequest
+
+
+@dataclass
+class ModelMetadata:
+    likes: int = 0
+    size: int = 15
+
+
+def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
+    private_models = [model for model in models if model.private]
+    public_models = [model for model in models if not model.private]
+
+    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
+
+def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
+
+def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+    return sorted(eval_requests, key=lambda x: x.params, reverse=False)
+
+def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
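A quick illustration of the resulting priority order (the requests below are made up for the example): private models are handled first, and within each group the oldest submission runs first. The `api` argument is accepted but unused by the current implementation, so passing `None` is fine for a local check:

```python
from src.backend.manage_requests import EvalRequest
from src.backend.sort_queue import sort_models_by_priority

requests = [
    EvalRequest(model="org/public-new", private=False, status="PENDING", json_filepath="", submitted_time="2024-02-01T00:00:00"),
    EvalRequest(model="org/private-old", private=True, status="PENDING", json_filepath="", submitted_time="2024-01-01T00:00:00"),
    EvalRequest(model="org/public-old", private=False, status="PENDING", json_filepath="", submitted_time="2023-12-01T00:00:00"),
]

ordered = sort_models_by_priority(api=None, models=requests)
print([r.model for r in ordered])
# -> ['org/private-old', 'org/public-old', 'org/public-new']
```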
    	
src/display/formatting.py CHANGED
@@ -1,12 +1,3 @@
-import os
-from datetime import datetime, timezone
-
-from huggingface_hub import HfApi
-from huggingface_hub.hf_api import ModelInfo
-
-
-API = HfApi()
-
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
    	
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.display.about import Tasks
+from src.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
    	
src/envs.py CHANGED
@@ -2,18 +2,26 @@ import os
 
 from huggingface_hub import HfApi
 
-#
-
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("TOKEN", None) # A read/write token for your org
+
+OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request file
+DEVICE = "cpu" # cuda:0 if you add compute
+LIMIT = 20 # !!!! Should be None for actual evaluations!!!
+# ----------------------------------
 
-OWNER = "demo-leaderboard"
 REPO_ID = f"{OWNER}/leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
+# If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 API = HfApi(token=TOKEN)
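The new `*_BACKEND` paths give the backend its own local copies of the queue and results repos, separate from the `eval-queue` / `eval-results` folders the Gradio app reads, so one side never overwrites the other's working copy. Since `CACHE_PATH` is resolved from `HF_HOME` at import time, a quick way to sanity-check the layout locally (a sketch, not part of the commit):

```python
import os

# Point the caches at a disposable location *before* importing src.envs.
os.environ["HF_HOME"] = "/tmp/leaderboard-cache"

from src.envs import EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND

print(EVAL_REQUESTS_PATH_BACKEND)  # /tmp/leaderboard-cache/eval-queue-bk
print(EVAL_RESULTS_PATH_BACKEND)   # /tmp/leaderboard-cache/eval-results-bk
```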
    	
src/leaderboard/read_evals.py CHANGED
@@ -103,7 +103,7 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model}")
+            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""