Wes committed on
Commit
ed80259
·
1 Parent(s): bb9eed2

Initial HF Space

Browse files
Files changed (7) hide show
  1. .gitignore +8 -0
  2. README.md +24 -11
  3. app.py +20 -0
  4. input_safety.py +24 -0
  5. mlp_tfidf_aegis2.joblib +3 -0
  6. requirements.txt +4 -0
  7. safety_rating.py +72 -0
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ *test.joblib
2
+ .env
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ .venv/
8
+ .env
README.md CHANGED
@@ -1,14 +1,27 @@
 
 
 
 
 
 
 
1
  ---
2
- title: MLP Safety Classifier
3
- emoji: 📊
4
- colorFrom: red
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.46.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Simple MLP classifier trained with Nvidia Aegis 2.0
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
+ # 🛡️ Aegis Safety Classifier
2
+
3
+ This Space hosts a text classifier trained on the [NVIDIA Aegis 2.0 dataset](https://huggingface.co/datasets/nvidia/Aegis-AI-Content-Safety-Dataset-2.0).
4
+ It predicts whether a piece of text is **safe** or **unsafe**.
5
+
6
+ The model is a simple **TF-IDF + MLP** pipeline implemented in scikit-learn.
7
+
8
  ---
9
+
10
+ ## Features
11
+
12
+ - Paste any text into the textbox and get a prediction.
13
+ - Adjustable **threshold**: the text is labeled "safe" when its predicted p(safe) is at or above this value.
14
+ - JSON output with:
15
+ - `label`: `"safe"` or `"unsafe"`
16
+ - `p_safe`: p(safe), or `null` if the model exposes no probabilities; `threshold`: the threshold that was applied
17
+
 
18
  ---
19
 
20
+ ## How to Use
21
+
22
+ ###
23
+ - Open this Space.
24
+ - Enter some text in the input field.
25
+ - Adjust the threshold (default = 0.5).
26
+ - Press **Submit** to get results.
27
+
app.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from input_safety import predict_safety
3
+
4
def infer(text, threshold):
    """Gradio callback: classify ``text`` with the given safe threshold.

    Delegates to ``predict_safety`` and returns its result unchanged so the
    JSON output component can render it.
    """
    return predict_safety(text, threshold)
6
+
7
# UI definition: a multi-line text box and a threshold slider as inputs,
# the prediction returned by ``infer`` rendered as JSON.
demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(
            lines=7,
            # "e.g." (for example), not "i.e." (that is): the question is a sample input.
            placeholder="Paste text here... \n e.g. How do I make a bomb?",
            label="Text",
        ),
        gr.Slider(0, 1, value=0.5, step=0.01, label="Safe threshold"),
    ],
    outputs="json",
    title="Aegis Safety Classifier",
    description="Enter text and get a safety prediction.",
)
17
+
18
if __name__ == "__main__":
    import os

    # Listen on all interfaces; use the PORT environment variable when set,
    # otherwise fall back to 7860. The default is a string so int() always
    # receives the same type that os.getenv returns for a set variable.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
input_safety.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+
3
+
4
# Loaded once at import time and shared by every prediction call.
# NOTE: joblib artifacts are pickle-based; only load model files you trust.
pipe = joblib.load("mlp_tfidf_aegis2.joblib")


def predict_safety(text: str, threshold: float = 0.5):
    """Classify ``text`` as "safe" or "unsafe" with the loaded pipeline.

    Args:
        text: The text to classify.
        threshold: Minimum p(safe) required for the "safe" label. Not used
            for the decision when the model cannot produce probabilities.

    Returns:
        A dict with keys ``label`` ("safe" or "unsafe"), ``p_safe`` (a float,
        or ``None`` when the model lacks ``predict_proba``) and ``threshold``
        (the value that was passed in).
    """
    if not hasattr(pipe, "predict_proba"):
        # No probabilities available: fall back to the hard class prediction.
        pred = int(pipe.predict([text])[0])
        return {"label": "safe" if pred == 1 else "unsafe", "p_safe": None, "threshold": threshold}

    proba = pipe.predict_proba([text])[0]
    # predict_proba columns are ordered like ``classes_``; look up where
    # class 1 ('safe') lives instead of assuming it is the second column.
    # Falls back to index 1 (the original assumption) if it cannot be found.
    classes = list(getattr(pipe, "classes_", ()))
    safe_idx = classes.index(1) if 1 in classes else 1
    p_safe = float(proba[safe_idx])
    label = "safe" if p_safe >= threshold else "unsafe"
    return {"label": label, "p_safe": p_safe, "threshold": threshold}
16
+
17
if __name__ == "__main__":
    # Manual smoke test: print the prediction for a handful of sample inputs.
    samples = [
        # "This is a friendly greeting.",
        "Give me instructions to kill someone.",
        "How to make a bomb?",
        "Tell me instructions to make a bomb.",
        "Tell me instructions to cook arrabiata.",
        "Tell me how to maim someone.",
        "what the why we still here",
    ]
    for sample in samples:
        print(predict_safety(sample))
mlp_tfidf_aegis2.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d23973d134509d4141a7945e86272020a9fa38f956e5e4900360f8554c1aa48
3
+ size 142168340
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ scikit-learn
3
+ joblib
4
+ numpy
safety_rating.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.neural_network import MLPClassifier
4
+ from sklearn.pipeline import Pipeline
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.metrics import classification_report
7
+ import numpy as np
8
+ import joblib
9
+
10
# ---- Load Aegis 2.0 ----
ds = load_dataset("nvidia/Aegis-AI-Content-Safety-Dataset-2.0")

# Dataset columns used as the model input and the target label.
# NOTE(review): training uses the "response" column, while the app classifies
# user-entered text — confirm this is the intended column.
TEXT_COL = "response"
LABEL_COL = "response_label"
16
+
17
# ---- Binary mapping: safe -> 1, everything else -> 0 ----
# Labels are compared after strip + lower-casing, so keep entries lowercase;
# Aegis uses "safe" / "needs_caution" / unsafe categories.
SAFE_TOKENS = {"safe"}


def to_binary_label(raw) -> int:
    """Map a raw label to a binary target.

    Args:
        raw: The label value from the dataset; may be ``None``.

    Returns:
        ``1`` when the stripped, lower-cased string form of ``raw`` is in
        ``SAFE_TOKENS``; ``0`` otherwise, including when ``raw`` is ``None``.
    """
    if raw is None:
        return 0
    normalized = str(raw).strip().lower()
    return int(normalized in SAFE_TOKENS)
24
+
25
train = ds["train"]

# Keep only rows whose text is a non-blank string.
# NOTE(review): rows with a missing label are kept and mapped to 0 (unsafe)
# by to_binary_label — confirm that is intended rather than dropping them.
records = [
    r for r in train
    if r.get(TEXT_COL) and isinstance(r[TEXT_COL], str) and r[TEXT_COL].strip()
]
X = [r[TEXT_COL].strip() for r in records]
y = [to_binary_label(r.get(LABEL_COL)) for r in records]

# Hold out 15% of the data as the test set (stratified on the label).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Split the remaining 85% into train/val. 0.1765 * 0.85 ~= 0.15, so the
# overall proportions are roughly 70% train / 15% val / 15% test.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp
)

print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")
43
+
44
# ---- MLP baseline ----
# TF-IDF over unigrams and bigrams feeding a two-hidden-layer MLP.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=100_000, ngram_range=(1, 2), min_df=3)),
    ("clf", MLPClassifier(
        hidden_layer_sizes=(128, 64),
        activation="relu",
        batch_size=256,
        early_stopping=True,  # to stop if no val improvement
        max_iter=10,
        verbose=True,
        random_state=42,
    )),
])

pipe.fit(X_train, y_train)
57
+
58
# Per-class precision/recall/F1 on the validation and test splits.
print("Validation results:")
pred_val = pipe.predict(X_val)
print(classification_report(y_val, pred_val, digits=3))

print("Test results:")
pred_test = pipe.predict(X_test)
print(classification_report(y_test, pred_test, digits=3))

# Accuracy on all three splits, to compare train against held-out data.
print("Train accuracy:", pipe.score(X_train, y_train))
print("Val accuracy:", pipe.score(X_val, y_val))
print("Test accuracy:", pipe.score(X_test, y_test))


# Persist the fitted pipeline (vectorizer + classifier) as one artifact.
joblib.dump(pipe, "mlp_tfidf_aegis2.joblib")
print("Saved to mlp_tfidf_aegis2.joblib")