cybermetric80
Files changed:
- app.py  +27 -13
- metric.csv  +6 -0
app.py CHANGED

@@ -15,13 +15,18 @@ with st.sidebar:
     selected_category = st.selectbox("Select Dataset Category", dataset_categories, index=0)
 
     datasets_by_category = {
-        "Multiple Choice": ["secQA"],
+        "Multiple Choice": ["secQA","CyberMetric80"],
     }
     dataset_choice = st.selectbox("Select Dataset", datasets_by_category[selected_category], index=0)
 
     st.divider()
     st.header("Filters & Options")
-    dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    #dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    if dataset_choice == "secQA":
+        dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    else:
+        st.markdown("**Note:** Only CyberMetric80 has been evaluated")
+        dataset_version = "v1"
     # For filtering the leaderboard by model type
     # Note: The available model types will come from the CSV, once loaded.
     # We'll load the CSV later and then update this filter accordingly.

@@ -69,8 +74,8 @@ random_accuracyv2 = estimate_random_accuracy(questionnairev2)
 # For now, if dataset_choice is "secQA", we use "Benchmark.csv"
 if dataset_choice == "secQA":
     file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
-
-    file_path = "
+elif dataset_choice == "CyberMetric80":
+    file_path = "metric.csv"  # Placeholder: update with actual file paths for future datasets
 
 # Function to load and clean CSV data
 @st.cache_data

@@ -91,8 +96,9 @@ def load_data(file_path):
 
     # Convert percentage strings to floats (e.g., "100%" → 1.0)
     for col in ["V1 Accuracy", "V2 Accuracy"]:
-
-
+        if col in df.columns:
+            df[col] = df[col].astype(str).str.replace("%", "").str.strip()
+            df[col] = pd.to_numeric(df[col], errors='coerce') / 100
 
     return df
 

@@ -110,7 +116,12 @@ source_filter = source_filter_placeholder.multiselect(
 df_filtered = df[df["Type"].isin(source_filter)] if source_filter else df
 
 # Choose the correct metric version and compute Accuracy
-df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+#df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+if dataset_choice == "CyberMetric80":
+    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"]
+else:
+    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+
 df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna()  # Drop rows with errors
 
 # Sort by Accuracy descending

@@ -125,7 +136,10 @@ df_filtered = df_filtered[['Rank', 'Model', 'Type', 'Accuracy']]
 tab1, tab2 = st.tabs(["Leaderboard", "About"])
 
 with tab1:
-
+    if dataset_choice == "secQA":
+        st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
+    elif dataset_choice == "CyberMetric80":
+        st.markdown("#### [View the CyberMetric Dataset](https://github.com/cybermetric/CyberMetric)")
 
     # Use columns to display leaderboard and model details side-by-side
     col1, col2 = st.columns([2, 1])

@@ -145,10 +159,10 @@ with tab1:
 
     st.divider()
     # Display the random baseline accuracy above the leaderboard
-
-
-
-
+    if dataset_choice == "secQA":
+        st.markdown("### Random Baseline Accuracy")
+        st.markdown("**{:.2%}** (computed with random guessing on SECQAv1)".format(random_accuracy))
+        st.markdown("**{:.2%}** (computed with random guessing on SECQAv2)".format(random_accuracyv2))
 
     # Footer
     st.markdown("---")

@@ -185,4 +199,4 @@ with tab2:
     [Priam.ai](https://www.priam.ai/)
 
     *This benchmark hub will continue to expand as more models and datasets are released in the cybersecurity NLP space.*
-    """)
+    """)
metric.csv ADDED

@@ -0,0 +1,6 @@
+model name, source, v1 metric
+Google: Gemma 3 27B ,Open Source,95.4022988%
+Google: Gemini Flash 2.0,Proprietary Model,97.7011494%
+Google: Gemini 2.0 Flash Lite,Proprietary Model,95.4022988%
+DeepSeek: R1,Open Source,96.5517241%
+Qwen: QwQ 32B,Open Source,94.2528735%