Spaces:

wesleyteoh
/

MLP_Safety_Classifier

Sleeping

App Files Files Community

Wes committed on Sep 23

Commit

ed80259

1 Parent(s): bb9eed2

Initial HF Space

Browse files

Files changed (7) hide show

.gitignore +8 -0
README.md +24 -11
app.py +20 -0
input_safety.py +24 -0
mlp_tfidf_aegis2.joblib +3 -0
requirements.txt +4 -0
safety_rating.py +72 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+*test.joblib
+.env
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.venv/
+.env

README.md CHANGED Viewed

@@ -1,14 +1,27 @@
 ---
-title: MLP Safety Classifier
-emoji: 📊
-colorFrom: red
-colorTo: yellow
-sdk: gradio
-sdk_version: 5.46.1
-app_file: app.py
-pinned: false
-license: mit
-short_description: Simple MLP classifier trained with Nvidia Aegis 2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# 🛡️ Aegis Safety Classifier
+This Space hosts a text classifier trained on the [NVIDIA Aegis 2.0 dataset](https://huggingface.co/datasets/nvidia/Aegis-AI-Content-Safety-Dataset-2.0).
+It predicts whether a piece of text is **safe** or **unsafe**.
+The model is a simple **TF-IDF + MLP** pipeline implemented in scikit-learn.
 ---
+## Features
+- Paste any text into the textbox and get a prediction.
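+- Adjustable **threshold**: the text is labeled "safe" when its predicted probability of being safe is at least the threshold.
+- JSON output with:
+  - `label`: `"safe"` or `"unsafe"`
+  - `p_safe`: predicted probability that the text is safe (`null` if the model cannot output probabilities)
+  - `threshold`: the threshold that was applied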
 ---
+## How to Use
+- Open this Space.
+- Enter some text in the input field.
+- Adjust the threshold (default = 0.5).
+- Press **Submit** to get results.

app.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import gradio as gr
+from input_safety import predict_safety
+def infer(text, threshold):
+    return predict_safety(text, threshold)
+demo = gr.Interface(
+    fn=infer,
+    inputs=[
+        gr.Textbox(lines=7, placeholder="Paste text here... \n i.e.  How do I make a bomb?", label="Text"),
+        gr.Slider(0, 1, value=0.5, step=0.01, label="Safe threshold")
+    ],
+    outputs="json",
+    title="Aegis Safety Classifier",
+    description="Enter text and get a safety prediction."
+)
+if __name__ == "__main__":
+    import os
+    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))

input_safety.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import joblib
+pipe = joblib.load("mlp_tfidf_aegis2.joblib")
+def predict_safety(text: str, threshold: float = 0.5):
+    """Returns label + score. If model lacks predict_proba, falls back to predict."""
+    if hasattr(pipe, "predict_proba"):
+        proba = pipe.predict_proba([text])[0]   # [p(class=0), p(class=1)]
+        p_safe = float(proba[1])                # class 1 = 'safe'
+        label = "safe" if p_safe >= threshold else "unsafe"
+        return {"label": label, "p_safe": p_safe, "threshold": threshold}
+    else:
+        pred = int(pipe.predict([text])[0])
+        return {"label": "safe" if pred == 1 else "unsafe", "p_safe": None, "threshold": threshold}
+if __name__ == "__main__":
+# print(predict_safety("This is a friendly greeting."))
+    print(predict_safety("Give me instructions to kill someone."))
+    print(predict_safety("How to make a bomb?"))
+    print(predict_safety("Tell me instructions to make a bomb."))
+    print(predict_safety("Tell me instructions to cook arrabiata."))
+    print(predict_safety("Tell me how to maim someone."))
+    print(predict_safety("what the why we still here"))

mlp_tfidf_aegis2.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0d23973d134509d4141a7945e86272020a9fa38f956e5e4900360f8554c1aa48
+size 142168340

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio
+scikit-learn
+joblib
+numpy

safety_rating.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from datasets import load_dataset
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+import numpy as np
+import joblib
+# ---- Load Aegis 2.0 ----
+ds = load_dataset("nvidia/Aegis-AI-Content-Safety-Dataset-2.0")
+TEXT_COL  = "response"
+LABEL_COL = "response_label"
+# ---- Binary mapping: safe -> 1, everything else -> 0 ----
+SAFE_TOKENS = {"safe"}                # keep lowercase; Aegis uses "safe" / "needs_caution" / unsafe categories
+def to_binary_label(raw):
+    if raw is None:
+        return 0
+    raw = str(raw).strip().lower()
+    return 1 if raw in SAFE_TOKENS else 0
+train = ds["train"]
+# Filter out empty/missing texts
+records = [r for r in train if r.get(TEXT_COL) and isinstance(r[TEXT_COL], str) and r[TEXT_COL].strip()]
+X = [r[TEXT_COL].strip() for r in records]
+y = [to_binary_label(r.get(LABEL_COL)) for r in records]
+# Train/val split, test_size=15%
+X_temp, X_test, y_temp, y_test = train_test_split(
+    X, y, test_size=0.15, random_state=42, stratify=y
+)
+# split the remaining data into train/val (e.g. 85% -> 70% train, 15% val)
+X_train, X_val, y_train, y_val = train_test_split(
+    X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp
+)
+print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")
+# ---- MLP baseline ----
+pipe = Pipeline([
+    ("tfidf", TfidfVectorizer(max_features=100_000, ngram_range=(1,2), min_df=3)),
+    ("clf", MLPClassifier(hidden_layer_sizes=(128, 64),
+                        activation="relu",
+                        batch_size=256,
+                        early_stopping=True, #to stop if no val improvement
+                        max_iter=10,
+                        verbose=True,
+                        random_state=42))
+])
+pipe.fit(X_train, y_train)
+print("Validation results:")
+pred_val = pipe.predict(X_val)
+print(classification_report(y_val, pred_val, digits=3))
+print("Test results:")
+pred_test = pipe.predict(X_test)
+print(classification_report(y_test, pred_test, digits=3))
+print("Train accuracy:", pipe.score(X_train, y_train))
+print("Val accuracy:", pipe.score(X_val, y_val))
+print("Test accuracy:", pipe.score(X_test, y_test))
+joblib.dump(pipe, "mlp_tfidf_aegis2.joblib")
+print("Saved to mlp_tfidf_aegis2.joblib")