Upload folder using huggingface_hub
Changed files:
- __pycache__/content.cpython-310.pyc +0 -0
- __pycache__/scorer.cpython-310.pyc +0 -0
- app.py +22 -37
- scorer.py +2 -2
__pycache__/content.cpython-310.pyc ADDED
Binary file (4.97 kB).

__pycache__/scorer.cpython-310.pyc ADDED
Binary file (2.11 kB).
app.py CHANGED

@@ -31,35 +31,27 @@ YEAR_VERSION = "2024"
 
 os.makedirs("scored", exist_ok=True)
 
-
-snapshot_download(
-    repo_id="autogenCTF/CTFAIA",
-    repo_type="dataset",
-    local_dir='./CTFAIA',
-    local_dir_use_symlinks=True,
-    token=TOKEN
-)
-
-def print_files_and_sizes(directory):
-    for root, dirs, files in os.walk(directory):
-        for file in files:
-            file_path = os.path.join(root, file)
-            file_size = os.path.getsize(file_path)
-            print(f"File: {file_path} Size: {file_size} bytes")
-
-def get_all_folders(directory):
-    folders = []
-    for item in os.listdir(directory):
-        item_path = os.path.join(directory, item)
-        if os.path.isdir(item_path):
-            folders.append(str(item))
-    return folders
-
+all_version = ['2024', '20240423']
 
-
+contact_infos = load_dataset(
+    CONTACT_DATASET,
+    token=TOKEN,
+    download_mode="force_redownload",
+    ignore_verifications=True
+)
 
+all_gold_dataset = {}
+all_gold_results = {}
 eval_results = {}
 for dataset_version in all_version:
+    all_gold_dataset[dataset_version] = load_dataset(
+        INTERNAL_DATA_DATASET,
+        dataset_version
+    )
+    all_gold_results[dataset_version] = {
+        split: {row["task_name"]: row for row in all_gold_dataset[dataset_version][split]}
+        for split in ["test", "validation"]
+    }
     eval_results[dataset_version] = load_dataset(
         RESULTS_DATASET, dataset_version,
         token=TOKEN,

@@ -67,9 +59,6 @@ for dataset_version in all_version:
         ignore_verifications=True
     )
 
-contact_infos = load_dataset(CONTACT_DATASET, token=TOKEN, download_mode="force_redownload",
-                             ignore_verifications=True)
-
 
 def get_dataframe_from_results(eval_results, split):
     local_df = eval_results[split]

@@ -124,8 +113,9 @@ def add_new_eval(
     print("Adding new eval")
 
     # Check if the combination model/org already exists and prints a warning message if yes
-    if model.lower() in set(
-            [
+    if model.lower() in set(
+            [m.lower() for m in eval_results[dataset_version][val_or_test]["model"]]) and organisation.lower() in set(
+            [o.lower() for o in eval_results[dataset_version][val_or_test]["organisation"]]):
         return format_warning("This model has been already submitted.")
 
     if path_to_file is None:

@@ -141,10 +131,7 @@ def add_new_eval(
     )
 
     # Gold answers
-    gold_results =
-    print_files_and_sizes('./CTFAIA/' + dataset_version)
-    gold_dataset = load_dataset('./CTFAIA/' + dataset_version)
-    gold_results = {split: {row["task_name"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}
+    gold_results = all_gold_results[dataset_version]
 
     # Compute score
     file_path = path_to_file.name

@@ -159,7 +146,6 @@ def add_new_eval(
         except Exception:
             return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
 
-        print(task)
         if "final_answer" not in task:
             raise format_error(f"Line {ix} contains no final_answer key. Please fix it and resubmit your file.")
         answer = task["final_answer"]

@@ -186,7 +172,7 @@ def add_new_eval(
         num_questions["all"] += 1
         num_questions[level] += 1
     for task_name, task in gold_results[val_or_test].items():
-        level = task['Level']
+        level = int(task['Level'])
         total_scores["all"] += 10
         total_scores[level] += 10
 

@@ -212,7 +198,6 @@ def add_new_eval(
         "score_level3": scores[3] / total_scores[3] if total_scores[3] else 0,
     }
     eval_results[dataset_version][val_or_test] = eval_results[dataset_version][val_or_test].add_item(eval_entry)
-    print(eval_results)
     eval_results[dataset_version].push_to_hub(RESULTS_DATASET, config_name=dataset_version, token=TOKEN)
 
     contact_info = {
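For readers skimming the app.py change: the snapshot_download and directory-walking helpers are replaced by loading each gold dataset once at startup and indexing it by split and task_name, so add_new_eval only does a dictionary lookup. A minimal sketch of that lookup shape, using hypothetical stand-in rows rather than the real INTERNAL_DATA_DATASET contents:

# Sketch only: stand-in rows with the fields the diff relies on ("task_name", "Level").
gold_dataset = {
    "validation": [{"task_name": "web_01", "Level": "1", "Final answer": "flag{a}"}],
    "test": [{"task_name": "pwn_07", "Level": "3", "Final answer": "flag{b}"}],
}

# Same comprehension as in the new startup block: index each split by task_name.
gold_results = {
    split: {row["task_name"]: row for row in gold_dataset[split]}
    for split in ["test", "validation"]
}

# add_new_eval can then resolve a submitted task directly and cast its level,
# mirroring the int(task['Level']) change later in the diff.
task = gold_results["test"]["pwn_07"]
level = int(task["Level"])  # -> 3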
scorer.py CHANGED

@@ -41,8 +41,8 @@ def question_scorer(
     if user_task["final_answer"] == val["Final answer"]:
         score = val["Total score"]
     else:
-        for i, item in enumerate(val["score"]):
-            if user_task["score_answer"][i] in
+        for i, item in enumerate(val["score"]["question"]):
+            if user_task["score_answer"][i] in val["score"]["answer"][i] and val["score"]["score"][i] > score:
                 score = item["score"]
     return score
 
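One thing to flag in the new scorer loop: item now ranges over val["score"]["question"], yet the body still assigns score = item["score"], which only works if the question entries are themselves dicts; if val["score"] really holds parallel lists, the assignment presumably wants val["score"]["score"][i]. A hedged sketch of the loop under that parallel-list assumption (field names are taken from the diff; the data layout itself is inferred, not confirmed):

# Sketch, not the repository's code: assumes val["score"] holds parallel lists
# under "question", "answer" and "score", aligned with user_task["score_answer"].
def partial_score(user_task: dict, val: dict) -> float:
    if user_task["final_answer"] == val["Final answer"]:
        return val["Total score"]
    score = 0
    for user_answer, accepted, points in zip(
        user_task["score_answer"], val["score"]["answer"], val["score"]["score"]
    ):
        # Keep the highest partial credit whose accepted answer contains the user's answer.
        if user_answer in accepted and points > score:
            score = points
    return score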