cybermetric80
Files changed:
- app.py  +27 -13
- metric.csv  +6 -0
app.py CHANGED

@@ -15,13 +15,18 @@ with st.sidebar:
     selected_category = st.selectbox("Select Dataset Category", dataset_categories, index=0)
 
     datasets_by_category = {
-        "Multiple Choice": ["secQA"],
+        "Multiple Choice": ["secQA","CyberMetric80"],
     }
     dataset_choice = st.selectbox("Select Dataset", datasets_by_category[selected_category], index=0)
 
     st.divider()
     st.header("Filters & Options")
-    dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    #dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    if dataset_choice == "secQA":
+        dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    else:
+        st.markdown("**Note:** Only CyberMetric80 has been evaluated")
+        dataset_version = "v1"
     # For filtering the leaderboard by model type
     # Note: The available model types will come from the CSV, once loaded.
     # We'll load the CSV later and then update this filter accordingly.

@@ -69,8 +74,8 @@ random_accuracyv2 = estimate_random_accuracy(questionnairev2)
 # For now, if dataset_choice is "secQA", we use "Benchmark.csv"
 if dataset_choice == "secQA":
     file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
-
-    file_path = "
+elif dataset_choice == "CyberMetric80":
+    file_path = "metric.csv"  # Placeholder: update with actual file paths for future datasets
 
 # Function to load and clean CSV data
 @st.cache_data

@@ -91,8 +96,9 @@ def load_data(file_path):
 
     # Convert percentage strings to floats (e.g., "100%" → 1.0)
     for col in ["V1 Accuracy", "V2 Accuracy"]:
-
-
+        if col in df.columns:
+            df[col] = df[col].astype(str).str.replace("%", "").str.strip()
+            df[col] = pd.to_numeric(df[col], errors='coerce') / 100
 
     return df
 

@@ -110,7 +116,12 @@ source_filter = source_filter_placeholder.multiselect(
 df_filtered = df[df["Type"].isin(source_filter)] if source_filter else df
 
 # Choose the correct metric version and compute Accuracy
-df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+#df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+if dataset_choice == "CyberMetric80":
+    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"]
+else:
+    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+
 df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna()  # Drop rows with errors
 
 # Sort by Accuracy descending

@@ -125,7 +136,10 @@ df_filtered = df_filtered[['Rank', 'Model', 'Type', 'Accuracy']]
 tab1, tab2 = st.tabs(["Leaderboard", "About"])
 
 with tab1:
-
+    if dataset_choice == "secQA":
+        st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
+    elif dataset_choice == "CyberMetric80":
+        st.markdown("#### [View the CyberMetric Dataset](https://github.com/cybermetric/CyberMetric)")
 
     # Use columns to display leaderboard and model details side-by-side
     col1, col2 = st.columns([2, 1])

@@ -145,10 +159,10 @@ with tab1:
 
     st.divider()
     # Display the random baseline accuracy above the leaderboard
-
-
-
-
+    if dataset_choice == "secQA":
+        st.markdown("### Random Baseline Accuracy")
+        st.markdown("**{:.2%}** (computed with random guessing on SECQAv1)".format(random_accuracy))
+        st.markdown("**{:.2%}** (computed with random guessing on SECQAv2)".format(random_accuracyv2))
 
     # Footer
     st.markdown("---")

@@ -185,4 +199,4 @@ with tab2:
     [Priam.ai](https://www.priam.ai/)
 
     *This benchmark hub will continue to expand as more models and datasets are released in the cybersecurity NLP space.*
-    """)
+    """)
metric.csv ADDED

@@ -0,0 +1,6 @@
+model name, source, v1 metric
+Google: Gemma 3 27B ,Open Source,95.4022988%
+Google: Gemini Flash 2.0,Proprietary Model,97.7011494%
+Google: Gemini 2.0 Flash Lite,Proprietary Model,95.4022988%
+DeepSeek: R1,Open Source,96.5517241%
+Qwen: QwQ 32B,Open Source,94.2528735%