# MLP_Safety_Classifier / safety_rating.py
# Wes
# Initial HF Space
# ed80259
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import joblib
# ---- Load Aegis 2.0 ----
# Columns read from each record: the text fed to the classifier and the raw
# label that is later collapsed to a binary target.
TEXT_COL, LABEL_COL = "response", "response_label"
# Load the dataset by its Hugging Face Hub identifier.
DATASET_ID = "nvidia/Aegis-AI-Content-Safety-Dataset-2.0"
ds = load_dataset(DATASET_ID)
# ---- Binary mapping: safe -> 1, everything else -> 0 ----
# Entries must be lowercase: labels are lowercased before the membership test.
# Per the original note, Aegis uses "safe" / "needs_caution" / unsafe categories.
SAFE_TOKENS = {"safe"}


def to_binary_label(raw: object) -> int:
    """Collapse a raw dataset label into a binary safety target.

    Args:
        raw: Label value as read from the dataset. Any object is accepted;
            values other than ``None`` are converted with ``str`` before the
            comparison.

    Returns:
        1 when the stripped, lowercased label is a member of ``SAFE_TOKENS``;
        0 otherwise, including when ``raw`` is ``None``.
    """
    if raw is None:
        # A missing label is counted as "not safe" rather than dropped.
        return 0
    # Normalise whitespace and case so " Safe " and "safe" are treated alike.
    return 1 if str(raw).strip().lower() in SAFE_TOKENS else 0
train = ds["train"]
# Keep only records whose text field is a string with non-whitespace content;
# None, empty strings, blank strings and non-string values are all dropped.
records = []
for row in train:
    text = row.get(TEXT_COL)
    if isinstance(text, str) and text.strip():
        records.append(row)
# Features are the stripped texts; targets are the binarised labels
# (see to_binary_label for how missing labels are handled).
X = [row[TEXT_COL].strip() for row in records]
y = [to_binary_label(row.get(LABEL_COL)) for row in records]
# Step 1: hold out 15% of all examples as the test set, stratified by label.
SPLIT_SEED = 42
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=SPLIT_SEED,
    stratify=y,
)
# Step 2: split the remaining 85% into train/val. Taking 0.1765 of the
# remainder is roughly 15% of the full data, leaving about 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.1765,
    random_state=SPLIT_SEED,
    stratify=y_temp,
)
print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")
# ---- MLP baseline ----
# Text features: TF-IDF over unigrams and bigrams, capped at 100k features,
# with min_df=3 as the document-frequency cut-off.
tfidf_step = TfidfVectorizer(
    max_features=100_000,
    ngram_range=(1, 2),
    min_df=3,
)
# Classifier: two hidden layers (128 then 64 units) with ReLU activations.
mlp_step = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    batch_size=256,
    early_stopping=True,  # stop if there is no validation improvement
    max_iter=10,
    verbose=True,
    random_state=42,
)
# Step names are kept as "tfidf" and "clf" so the saved pipeline exposes the
# same named steps as before.
pipe = Pipeline(steps=[("tfidf", tfidf_step), ("clf", mlp_step)])
pipe.fit(X_train, y_train)
# Per-class reports for the two held-out splits. The heading is printed before
# predicting so the output order matches a straight-line script.
predictions = {}
for split_key, heading, features, targets in (
    ("val", "Validation results:", X_val, y_val),
    ("test", "Test results:", X_test, y_test),
):
    print(heading)
    predictions[split_key] = pipe.predict(features)
    print(classification_report(targets, predictions[split_key], digits=3))
# Expose the predictions under their usual names for any later use.
pred_val = predictions["val"]
pred_test = predictions["test"]
# Overall accuracy on every split, training data included, to make
# over-fitting easy to spot.
for caption, features, targets in (
    ("Train accuracy:", X_train, y_train),
    ("Val accuracy:", X_val, y_val),
    ("Test accuracy:", X_test, y_test),
):
    print(caption, pipe.score(features, targets))
# Persist the whole fitted pipeline (vectorizer + classifier) in one artifact.
MODEL_PATH = "mlp_tfidf_aegis2.joblib"
joblib.dump(pipe, MODEL_PATH)
print("Saved to " + MODEL_PATH)