import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# Crime categories labelled as violent when building the target variable.
VIOLENT_CATEGORIES = (
    'ASSAULT',
    'ROBBERY',
    'SEX OFFENSES FORCIBLE',
    'KIDNAPPING',
    'HOMICIDE',
    'ARSON',
)


def define_target(df):
    """Add the binary target column 'IsViolent' derived from 'Category'.

    A row gets 1 when its 'Category' is one of ``VIOLENT_CATEGORIES`` and 0
    otherwise (missing categories are therefore labelled 0).

    Args:
        df: DataFrame with a 'Category' column. It is modified in place.

    Returns:
        The same DataFrame object with the 'IsViolent' column added.
    """
    # Vectorised membership test instead of a per-row Python lambda.
    df['IsViolent'] = df['Category'].isin(VIOLENT_CATEGORIES).astype(int)
    return df


def extract_temporal_features(df):
    """Derive calendar features from the 'Dates' column.

    Adds 'Hour', 'Day', 'Month', 'Year', 'DayOfWeek' (0=Monday, 6=Sunday),
    'IsWeekend' (1 for Saturday/Sunday) and 'IsHoliday' (1 when the calendar
    day is a US federal holiday according to pandas'
    ``USFederalHolidayCalendar``).

    Args:
        df: DataFrame with a 'Dates' column. Values that are not already of
            a datetime dtype are parsed with ``pd.to_datetime`` for the
            purpose of feature extraction; the 'Dates' column itself is left
            untouched. The frame is modified in place.

    Returns:
        The same DataFrame object with the temporal columns added.
    """
    dates = df['Dates']
    if not pd.api.types.is_datetime64_any_dtype(dates):
        # E.g. strings read from CSV without parse_dates.
        dates = pd.to_datetime(dates)

    df['Hour'] = dates.dt.hour
    df['Day'] = dates.dt.day
    df['Month'] = dates.dt.month
    df['Year'] = dates.dt.year
    df['DayOfWeek'] = dates.dt.dayofweek  # 0=Monday, 6=Sunday
    df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

    # Holidays
    first_seen = dates.min()
    if pd.isna(first_seen):
        # Empty frame or only missing dates: there is no range to look up.
        df['IsHoliday'] = 0
    else:
        # Holidays are stamped at midnight, so the lower bound must be
        # truncated to midnight as well; otherwise a holiday falling on the
        # very first day of the data would be filtered out of the range.
        holidays = calendar().holidays(
            start=first_seen.normalize(), end=dates.max()
        )
        df['IsHoliday'] = dates.dt.normalize().isin(holidays).astype(int)
    return df


def get_season(month):
    """Return the season name for a calendar month number.

    Args:
        month: Month number (1-12).

    Returns:
        'Winter' for 12/1/2, 'Spring' for 3-5, 'Summer' for 6-8 and 'Fall'
        for every other value.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    return 'Fall'


def extract_contextual_features(df):
    """Add the 'Season' column computed from the existing 'Month' column.

    Args:
        df: DataFrame that already contains 'Month' (as produced by
            ``extract_temporal_features``). It is modified in place.

    Returns:
        The same DataFrame object with the 'Season' column added.
    """
    df['Season'] = df['Month'].apply(get_season)
    return df


def extract_location_features(df, n_clusters=10, kmeans_model=None):
    """Assign each row a K-Means cluster id based on its 'X'/'Y' coordinates.

    Args:
        df: DataFrame with 'X' and 'Y' columns. It is modified in place.
        n_clusters: Number of clusters to fit. Only used when no model is
            supplied.
        kmeans_model: Previously fitted model. When None, a new ``KMeans``
            (``random_state=42``, ``n_init=10``) is fitted on ``df``;
            otherwise the given model is only used to predict.

    Returns:
        Tuple ``(df, model)`` where ``df`` has the 'LocationCluster' column
        added and ``model`` is the model that produced the labels.
    """
    coords = df[['X', 'Y']]
    if kmeans_model is None:
        # Fit mode
        kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        df['LocationCluster'] = kmeans_model.fit_predict(coords)
    else:
        # Predict mode
        df['LocationCluster'] = kmeans_model.predict(coords)
    return df, kmeans_model


def preprocess_pipeline(df, is_train=True, kmeans_model=None, n_clusters=10):
    """Run the full preprocessing pipeline on ``df``.

    Steps, in order: temporal features, contextual features (season),
    location clustering and, for training data only, the 'IsViolent' target.

    Args:
        df: DataFrame with 'Dates', 'X' and 'Y' columns, plus 'Category'
            when ``is_train`` is true. It is modified in place.
        is_train: When true, the target column is created from 'Category'.
        kmeans_model: Fitted clustering model to reuse. NOTE: when this is
            None a new model is fitted on ``df`` regardless of ``is_train``,
            so pass the model returned for the training data when
            transforming other data.
        n_clusters: Number of clusters used when a new model is fitted.

    Returns:
        Tuple ``(df, kmeans_model)`` with the processed frame and the
        clustering model that was used.
    """
    df = extract_temporal_features(df)
    df = extract_contextual_features(df)

    # Location features (Clustering)
    df, kmeans_model = extract_location_features(
        df, n_clusters=n_clusters, kmeans_model=kmeans_model
    )

    if is_train:
        df = define_target(df)

    return df, kmeans_model


if __name__ == "__main__":
    # Test
    pass