|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from sklearn.cluster import KMeans |
|
|
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar |
|
|
|
|
|
def define_target(df):
    """Add the binary target column 'IsViolent' derived from 'Category'.

    A row is labelled 1 when its 'Category' value is exactly one of the
    violent crime categories listed below, and 0 otherwise (including
    missing values).

    Args:
        df: DataFrame containing a 'Category' column. It is modified in
            place.

    Returns:
        The same DataFrame with an integer 'IsViolent' column added.
    """
    violent_categories = (
        'ASSAULT',
        'ROBBERY',
        'SEX OFFENSES FORCIBLE',
        'KIDNAPPING',
        'HOMICIDE',
        'ARSON',
    )

    # Vectorized membership test: avoids a Python-level call per row and
    # yields an integer column even when the frame has no rows.
    df['IsViolent'] = df['Category'].isin(violent_categories).astype(int)
    return df
|
|
|
|
|
def extract_temporal_features(df):
    """Add calendar-derived feature columns computed from 'Dates'.

    Columns added: 'Hour', 'Day', 'Month', 'Year', 'DayOfWeek'
    (Monday=0 ... Sunday=6), 'IsWeekend' (1 for Saturday/Sunday) and
    'IsHoliday' (1 when the calendar day is a US federal holiday).

    Args:
        df: DataFrame with a datetime-typed 'Dates' column. It is modified
            in place.

    Returns:
        The same DataFrame with the feature columns added.
    """
    dates = df['Dates']

    df['Hour'] = dates.dt.hour
    df['Day'] = dates.dt.day
    df['Month'] = dates.dt.month
    df['Year'] = dates.dt.year
    df['DayOfWeek'] = dates.dt.dayofweek

    # dayofweek is Monday=0, so 5 and 6 are Saturday and Sunday.
    df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

    # Holidays are stamped at midnight and tz-naive, so compare on the
    # wall-clock calendar day with any timezone dropped.
    days = dates.dt.tz_localize(None) if dates.dt.tz is not None else dates
    days = days.dt.normalize()

    if days.notna().any():
        # Use the normalized bounds: passing a start that carries a
        # time-of-day would exclude a holiday on the first day of the data,
        # because that holiday's midnight stamp falls before the start.
        holidays = calendar().holidays(start=days.min(), end=days.max())
        df['IsHoliday'] = days.isin(holidays).astype(int)
    else:
        # Empty or all-missing dates: there is no range to query.
        df['IsHoliday'] = 0

    return df
|
|
|
|
|
def get_season(month):
    """Return the season name for a month number.

    December-February map to 'Winter', March-May to 'Spring',
    June-August to 'Summer'; every other value maps to 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not matched above (including 9, 10, 11) falls through here.
    return 'Fall'
|
|
|
|
|
def extract_contextual_features(df):
    """Add a 'Season' column computed from the existing 'Month' column.

    Each month value is translated with ``get_season``. The DataFrame is
    modified in place and also returned.
    """
    season_by_row = df['Month'].apply(get_season)
    df['Season'] = season_by_row
    return df
|
|
|
|
|
def extract_location_features(df, n_clusters=10, kmeans_model=None):
    """Add a 'LocationCluster' column from K-Means over the 'X'/'Y' columns.

    Args:
        df: DataFrame with 'X' and 'Y' columns. It is modified in place.
        n_clusters: Number of clusters to fit when no model is supplied.
        kmeans_model: Optional already-fitted model. When given, it is used
            only to predict cluster labels and nothing is fitted.

    Returns:
        A ``(df, model)`` tuple, where ``model`` is the supplied model or
        the newly fitted one.
    """
    coordinates = df[['X', 'Y']]

    # A model was supplied: label the rows with it and hand it back.
    if kmeans_model is not None:
        df['LocationCluster'] = kmeans_model.predict(coordinates)
        return df, kmeans_model

    # No model yet: fit a fresh one on these coordinates (fixed seed).
    fitted_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df['LocationCluster'] = fitted_model.fit_predict(coordinates)
    return df, fitted_model
|
|
|
|
|
def preprocess_pipeline(df, is_train=True, kmeans_model=None):
    """Run every feature-extraction step in order.

    Temporal features are added first, then the contextual (season)
    feature, then location clusters. The target column is added last and
    only when ``is_train`` is truthy.

    Args:
        df: Input DataFrame, passed through each step in turn.
        is_train: Whether to add the target column via ``define_target``.
        kmeans_model: Optional model forwarded to
            ``extract_location_features``.

    Returns:
        A ``(df, kmeans_model)`` tuple with the processed frame and the
        model returned by the location step.
    """
    features = extract_temporal_features(df)
    features = extract_contextual_features(features)
    features, cluster_model = extract_location_features(
        features, kmeans_model=kmeans_model
    )

    if not is_train:
        return features, cluster_model

    return define_target(features), cluster_model
|
|
|
|
|
if __name__ == "__main__":
    # Nothing runs when this module is executed directly; it only
    # provides the functions defined above.
    pass
|
|
|