ycy committed
Commit d55f05b · 1 Parent(s): c90946a
app.py CHANGED
@@ -33,6 +33,7 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 ### Space initialisation
+# load the evaluation requests and results locally
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -96,7 +97,7 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            #TODO
+
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
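The new comment marks the startup step where the Space mirrors the requests and results datasets locally via `snapshot_download`. Below is a minimal sketch of that initialisation, assuming the local target paths and the repo constants from `src/envs.py` (the literal values are hypothetical expansions of the f-strings, not the committed code):

# Sketch only: constants are assumed stand-ins for those defined in src/envs.py
from huggingface_hub import snapshot_download

QUEUE_REPO = "yan111222/requests"      # assumed expansion of f"{OWNER}/requests"
RESULTS_REPO = "yan111222/results"     # assumed expansion of f"{OWNER}/results"
EVAL_REQUESTS_PATH = "./eval-queue"    # assumed local mirror directories
EVAL_RESULTS_PATH = "./eval-results"

def mirror_eval_datasets() -> None:
    # Pull the evaluation requests dataset into the Space's local cache
    snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
    # Pull the evaluation results dataset the leaderboard is built from
    snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset")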
src/envs.py CHANGED
@@ -10,8 +10,8 @@ OWNER = "yan111222" # Change to your org - don't forget to create a results and
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/CapArena_Auto_1"
-QUEUE_REPO = "demo-leaderboard-backend/requests"
-RESULTS_REPO = "demo-leaderboard-backend/results"
+QUEUE_REPO = f"{OWNER}/requests"
+RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
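The comment on the OWNER line already warns that the `requests` and `results` datasets must exist under the owner's namespace before the Space can sync them. A one-off setup sketch, assuming a write token in the HF_TOKEN environment variable (the names below are assumptions, not part of the commit):

# One-off setup sketch; OWNER and the token variable are assumed values
import os
from huggingface_hub import HfApi

OWNER = "yan111222"
api = HfApi(token=os.environ.get("HF_TOKEN"))

for name in ("requests", "results"):
    # Create the dataset repo if it is missing; harmless if it already exists
    api.create_repo(repo_id=f"{OWNER}/{name}", repo_type="dataset", exist_ok=True)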
src/leaderboard/read_evals.py CHANGED
@@ -156,6 +156,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
 
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
+    """Get the path of the results folder and the requests folder, and return a list of EvalResult objects"""
     model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
@@ -163,7 +164,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
 
-        # Sort the files by date
+        # Sort the files by date (this is where the qualifying result files are obtained)
         try:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
         except dateutil.parser._parser.ParserError:
@@ -171,7 +172,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
-
+
+    # get all of the proper eval result paths
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
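The sort key in this hunk strips the `results_` prefix, the `.json` suffix, and the last seven characters (a `.microseconds` tail) before comparing timestamps. A small self-contained illustration of what it extracts; the filenames are made-up examples of that naming convention:

# Illustration only; filenames are hypothetical results_<timestamp>.<microseconds>.json examples
files = [
    "results_2025-03-01T09-15-00.000001.json",
    "results_2025-02-28T12-34-56.000000.json",
]

sort_key = lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]
# sort_key("results_2025-02-28T12-34-56.000000.json") == "2025-02-28T12-34-56"

files.sort(key=sort_key)
print(files[0])  # oldest result file first: results_2025-02-28T12-34-56.000000.json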
src/populate.py CHANGED
@@ -11,6 +11,45 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+
+    # Example of raw_data
+    """raw_data = [
+        EvalResult(
+            model_name="org1/model1",
+            model_dtype="float32",
+            model_sha="commit_hash1",
+            results={
+                "task1": {"metric1": 0.85, "metric2": 0.90},
+                "task2": {"metric1": 0.75, "metric2": 0.80}
+            },
+            model_type="Pretrained",
+            weight_type="Original",
+            license="MIT",
+            likes=100,
+            params=123456789,
+            submitted_time="2025-02-28T12:34:56Z",
+            status="FINISHED",
+            precision="float32"
+        ),
+        EvalResult(
+            model_name="org2/model2",
+            model_dtype="float32",
+            model_sha="commit_hash2",
+            results={
+                "task1": {"metric1": 0.88, "metric2": 0.92},
+                "task2": {"metric1": 0.78, "metric2": 0.82}
+            },
+            model_type="Fine-tuned",
+            weight_type="Adapter",
+            license="Apache-2.0",
+            likes=200,
+            params=987654321,
+            submitted_time="2025-02-28T12:34:56Z",
+            status="FINISHED",
+            precision="float32"
+        )
+    ]
+    """
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
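The commented example documents the shape of `raw_data` before it is flattened into records and loaded into pandas. A standalone sketch of that records-to-DataFrame step follows; the field names are illustrative rather than the template's exact columns:

# Sketch of the flattening step; records mimic what EvalResult.to_dict() might return
import pandas as pd

all_data_json = [
    {"model": "org1/model1", "precision": "float32", "task1": 0.85, "task2": 0.75},
    {"model": "org2/model2", "precision": "float32", "task1": 0.88, "task2": 0.78},
]

df = pd.DataFrame.from_records(all_data_json)
# A leaderboard usually sorts by an aggregate score; task1 stands in for it here
df = df.sort_values(by="task1", ascending=False)
print(df)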
src/submission/submit.py CHANGED
@@ -14,6 +14,8 @@ from src.submission.check_validity import (
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
+
 def add_new_eval(
     model: str,
     base_model: str,
@@ -22,6 +24,7 @@
     weight_type: str,
     model_type: str,
 ):
+    """Submitting a model to the evaluation queue automatically saves its information to the requests dataset"""
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     if not REQUESTED_MODELS:
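The new docstring states what `add_new_eval` does: the submission is recorded in the requests dataset. A rough sketch of that save step under the same assumptions; the entry fields, file paths, and the `API`/`QUEUE_REPO` values here are illustrative, not the committed code:

# Sketch only; field names, paths, and the API/QUEUE_REPO objects are assumptions
import json
import os
from datetime import datetime, timezone
from huggingface_hub import HfApi

API = HfApi(token=os.environ.get("HF_TOKEN"))
QUEUE_REPO = "yan111222/requests"  # assumed expansion of f"{OWNER}/requests"

eval_entry = {
    "model": "org/model",
    "base_model": "",
    "revision": "main",
    "precision": "float16",
    "weight_type": "Original",
    "model_type": "Pretrained",
    "status": "PENDING",
    "submitted_time": datetime.now(timezone.utc).isoformat(),
}

# Write the request locally, then push it into the requests dataset
local_path = "eval_request.json"
with open(local_path, "w") as f:
    json.dump(eval_entry, f, indent=2)

API.upload_file(
    path_or_fileobj=local_path,
    path_in_repo="org/model_eval_request.json",
    repo_id=QUEUE_REPO,
    repo_type="dataset",
    commit_message="Add org/model to eval queue",
)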