Upload folder using huggingface_hub
- __init__.py +9 -0
- __pycache__/__init__.cpython-311.pyc +0 -0
- __pycache__/masking.cpython-311.pyc +0 -0
- __pycache__/utils.cpython-311.pyc +0 -0
- cli.py +57 -0
- masking.py +52 -0
- utils.py +25 -0
__init__.py
ADDED
@@ -0,0 +1,9 @@
+# AnonySpark: Lightweight PySpark data anonymization
+from .masking import (
+    mask_email, mask_name, mask_date,
+    mask_ssn, mask_itin, mask_phone,
+    mask_email_udf, mask_name_udf, mask_date_udf,
+    mask_ssn_udf, mask_itin_udf, mask_phone_udf
+)
+
+from .utils import apply_masking
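
These re-exports define the package's entire public surface. A minimal import sketch, assuming the project is importable under the name anonyspark (the package name cli.py itself uses):

# Both the plain masking functions and their Spark UDF wrappers are
# available from the top-level package after this __init__.py.
from anonyspark import mask_email, mask_email_udf, apply_masking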
__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (624 Bytes)

__pycache__/masking.cpython-311.pyc
ADDED
Binary file (2.49 kB)

__pycache__/utils.cpython-311.pyc
ADDED
Binary file (1.15 kB)
cli.py
ADDED
@@ -0,0 +1,57 @@
+
+import argparse
+import json
+import os
+from pyspark.sql import SparkSession
+from anonyspark.masking import (
+    mask_email_udf, mask_name_udf, mask_date_udf,
+    mask_ssn_udf, mask_itin_udf, mask_phone_udf
+)
+
+def apply_masking(df, schema):
+    """
+    Apply masking UDFs based on schema definitions.
+    """
+    for column, dtype in schema.items():
+        if dtype == "email":
+            df = df.withColumn(f"masked_{column}", mask_email_udf(df[column]))
+        elif dtype == "name":
+            df = df.withColumn(f"masked_{column}", mask_name_udf(df[column]))
+        elif dtype == "dob":
+            df = df.withColumn(f"masked_{column}", mask_date_udf(df[column]))
+        elif dtype == "ssn":
+            df = df.withColumn(f"masked_{column}", mask_ssn_udf(df[column]))
+        elif dtype == "itin":
+            df = df.withColumn(f"masked_{column}", mask_itin_udf(df[column]))
+        elif dtype == "phone":
+            df = df.withColumn(f"masked_{column}", mask_phone_udf(df[column]))
+    return df
+
+def main():
+    parser = argparse.ArgumentParser(description="AnonySpark CLI for masking sensitive data.")
+    parser.add_argument('--input', type=str, required=True, help='Path to input CSV file')
+    parser.add_argument('--output', type=str, required=True, help='Directory to save masked output')
+    parser.add_argument('--schema', type=str, required=True, help='Path to masking schema JSON file')
+    args = parser.parse_args()
+
+    # Create output directory if it doesn't exist
+    os.makedirs(args.output, exist_ok=True)
+
+    # Start Spark
+    spark = SparkSession.builder.master("local[*]").appName("AnonysparkCLI").getOrCreate()
+
+    # Load data and schema
+    df = spark.read.csv(args.input, header=True)
+    with open(args.schema, 'r') as f:
+        schema = json.load(f)
+
+    # Apply masking
+    masked_df = apply_masking(df, schema)
+
+    # Save to output directory
+    masked_df.write.mode("overwrite").csv(args.output, header=True)
+
+    print(f"Masked file written to: {args.output}")
+
+if __name__ == "__main__":
+    main()
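
For reference, a sketch of the inputs the CLI expects. The file names below (people.csv, schema.json, masked/) are hypothetical; only the --input/--output/--schema flags and the mask-type keys ("email", "name", "dob", "ssn", "itin", "phone") come from cli.py itself.

import json

# The schema JSON maps column names to mask types handled by apply_masking.
schema = {"full_name": "name", "email": "email", "ssn": "ssn"}
with open("schema.json", "w") as f:
    json.dump(schema, f)

# A tiny input CSV with a header row, as spark.read.csv(header=True) expects.
with open("people.csv", "w") as f:
    f.write("full_name,email,ssn\n")
    f.write("Jane Doe,jane@example.com,123-45-6789\n")

# Then, from a shell with PySpark installed:
#   python cli.py --input people.csv --output masked/ --schema schema.json
# Each configured column gains a masked_<column> sibling, e.g.
# masked_email = "***@example.com", masked_ssn = "***-**-6789".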
masking.py
ADDED
@@ -0,0 +1,52 @@
+__all__ = [
+    "mask_email_udf", "mask_name_udf", "mask_date_udf",
+    "mask_ssn_udf", "mask_itin_udf", "mask_phone_udf"
+]
+
+from pyspark.sql.functions import udf
+from pyspark.sql.types import StringType
+import re
+from datetime import datetime
+
+# Masking functions
+def mask_email(value):
+    if value and "@" in value:
+        user, domain = value.split("@", 1)
+        return "***@" + domain
+    return None
+
+def mask_name(value):
+    if value:
+        return value[0] + "***"
+    return None
+
+def mask_date(value):
+    try:
+        dt = datetime.strptime(value, "%Y-%m-%d")
+        return dt.strftime("***-**-%d")
+    except (ValueError, TypeError):
+        return None
+
+def mask_ssn(value):
+    if value and re.match(r"\d{3}-\d{2}-\d{4}", value):
+        return "***-**-" + value[-4:]
+    return None
+
+def mask_itin(value):
+    if value and re.match(r"9\d{2}-7\d-\d{4}", value):
+        return "***-**-" + value[-4:]
+    return None
+
+def mask_phone(value):
+    if value and re.match(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", value):
+        return "***-***-" + value[-4:]
+    return None
+
+# UDFs for Spark
+mask_email_udf = udf(mask_email, StringType())
+mask_name_udf = udf(mask_name, StringType())
+mask_date_udf = udf(mask_date, StringType())
+mask_ssn_udf = udf(mask_ssn, StringType())
+mask_itin_udf = udf(mask_itin, StringType())
+mask_phone_udf = udf(mask_phone, StringType())
+
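
The plain functions can be exercised without a Spark session, since the UDFs are only thin wrappers. A quick sanity-check sketch with illustrative sample values:

from anonyspark.masking import mask_email, mask_name, mask_date, mask_ssn, mask_phone

assert mask_email("jane@example.com") == "***@example.com"
assert mask_name("Jane") == "J***"
assert mask_date("1990-04-12") == "***-**-12"    # only the day survives
assert mask_ssn("123-45-6789") == "***-**-6789"
assert mask_phone("(555) 123-4567") == "***-***-4567"
assert mask_email(None) is None                   # all maskers map None/invalid to None

Returning None for invalid input is what makes these safe as Spark UDFs: a malformed value becomes a null in the masked column rather than raising mid-job.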
utils.py
ADDED
@@ -0,0 +1,25 @@
+from pyspark.sql.functions import col
+
+def apply_masking(df, schema):
+    """
+    Apply masking UDFs to specified columns based on schema.
+    Schema = { "original_col": "mask_type" }
+    """
+    from .masking import (
+        mask_email_udf, mask_name_udf, mask_date_udf,
+        mask_ssn_udf, mask_itin_udf, mask_phone_udf
+    )
+
+    masking_map = {
+        "email": mask_email_udf,
+        "name": mask_name_udf,
+        "dob": mask_date_udf,
+        "ssn": mask_ssn_udf,
+        "itin": mask_itin_udf,
+        "phone": mask_phone_udf,
+    }
+
+    for col_name, mask_type in schema.items():
+        if mask_type in masking_map:
+            df = df.withColumn(f"masked_{col_name}", masking_map[mask_type](col(col_name)))
+    return df
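
This dict-lookup version generalizes the if/elif chain duplicated in cli.py: unknown mask types are silently skipped, and adding a new type is one entry in masking_map. A minimal usage sketch on an in-memory DataFrame, assuming a local PySpark installation; the column names are illustrative:

from pyspark.sql import SparkSession
from anonyspark.utils import apply_masking

spark = SparkSession.builder.master("local[*]").appName("demo").getOrCreate()
df = spark.createDataFrame(
    [("Jane Doe", "jane@example.com")],
    ["full_name", "email"],
)

# Original columns are kept; masked values land in masked_<column>.
masked = apply_masking(df, {"full_name": "name", "email": "email"})
masked.show(truncate=False)
# masked_full_name = "J***", masked_email = "***@example.com"
spark.stop()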