File size: 2,474 Bytes
100fb60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

def define_target(df):
    """
    Creates the target variable 'IsViolent' based on crime category.

    Adds an integer column 'IsViolent' to ``df`` in place: 1 when the row's
    'Category' value is one of the violent categories listed below, 0
    otherwise (missing categories also map to 0).

    Args:
        df: DataFrame containing a 'Category' column.

    Returns:
        The same DataFrame object, with the 'IsViolent' column added.
    """
    violent_categories = [
        'ASSAULT', 'ROBBERY', 'SEX OFFENSES FORCIBLE', 'KIDNAPPING', 'HOMICIDE', 'ARSON'
    ]

    # Vectorised membership test: no Python-level call per row, and the column
    # stays integer-typed even when the frame has no rows.
    df['IsViolent'] = df['Category'].isin(violent_categories).astype(int)
    return df

def extract_temporal_features(df):
    """
    Extracts temporal features from the 'Dates' column.

    Adds these columns to ``df`` in place: 'Hour', 'Day', 'Month', 'Year',
    'DayOfWeek' (0=Monday, 6=Sunday), 'IsWeekend' (1 for Saturday/Sunday,
    else 0) and 'IsHoliday' (1 when the calendar date is a US federal
    holiday, else 0).

    Args:
        df: DataFrame whose 'Dates' column has a datetime dtype.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    dates = df['Dates']

    df['Hour'] = dates.dt.hour
    df['Day'] = dates.dt.day
    df['Month'] = dates.dt.month
    df['Year'] = dates.dt.year
    df['DayOfWeek'] = dates.dt.dayofweek  # 0=Monday, 6=Sunday

    df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

    # Holidays
    first_seen = dates.min()
    last_seen = dates.max()
    if pd.isna(first_seen):
        # Empty frame or only missing timestamps: the holiday calendar cannot
        # be queried with NaT bounds, and no row can be a holiday anyway.
        df['IsHoliday'] = 0
        return df

    # Holidays are reported as midnight timestamps, so the lower bound has to
    # be floored to midnight; otherwise a holiday falling on the earliest day
    # is dropped whenever the earliest timestamp is later than 00:00.
    holidays = calendar().holidays(start=first_seen.normalize(), end=last_seen)
    df['IsHoliday'] = dates.dt.normalize().isin(holidays).astype(int)

    return df

def get_season(month):
    """
    Returns the season name for a month number.

    December-February is 'Winter', March-May is 'Spring', June-August is
    'Summer'; every other value is reported as 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not matched above falls through to the default season.
    return 'Fall'

def extract_contextual_features(df):
    """
    Adds the contextual 'Season' column to the DataFrame.

    Each value of the existing 'Month' column is passed through
    ``get_season`` and the result is stored in 'Season'. The DataFrame is
    modified in place and also returned.
    """
    month_values = df['Month']
    df['Season'] = month_values.apply(get_season)
    return df

def extract_location_features(df, n_clusters=10, kmeans_model=None):
    """
    Adds a 'LocationCluster' column holding a K-Means label for each row's
    ('X', 'Y') coordinates.

    When ``kmeans_model`` is given it is used as-is to label the rows and
    ``n_clusters`` is not consulted. Otherwise a new KMeans model with
    ``n_clusters`` clusters is fitted on the rows of ``df``.

    Returns:
        A ``(df, model)`` tuple: the same DataFrame with the new column, and
        the model that produced the labels (the supplied one, or the newly
        fitted one).
    """
    coordinates = df[['X', 'Y']]

    if kmeans_model is not None:
        # Predict mode: reuse the model handed in by the caller.
        df['LocationCluster'] = kmeans_model.predict(coordinates)
        return df, kmeans_model

    # Fit mode: learn the clusters from this frame and label it in one pass.
    fitted_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df['LocationCluster'] = fitted_model.fit_predict(coordinates)
    return df, fitted_model

def preprocess_pipeline(df, is_train=True, kmeans_model=None, n_clusters=10):
    """
    Runs the full preprocessing pipeline.

    Steps, in order: temporal features, contextual features, location
    clustering and, when ``is_train`` is true, the 'IsViolent' target.

    Args:
        df: DataFrame to process. The steps read the 'Dates', 'X' and 'Y'
            columns, plus 'Category' when ``is_train`` is true.
        is_train: When true, the target column is created from 'Category'.
        kmeans_model: Clustering model to reuse for the location labels.
            When None, a new model is fitted on ``df``.
        n_clusters: Number of clusters for a newly fitted model; not used
            when ``kmeans_model`` is supplied. The default of 10 matches the
            value the pipeline used before this parameter existed.

    Returns:
        A ``(df, kmeans_model)`` tuple: the processed DataFrame and the
        clustering model that labelled it.
    """
    df = extract_temporal_features(df)
    # Season is derived from 'Month', so this must follow the temporal step.
    df = extract_contextual_features(df)

    # Location features (Clustering)
    df, kmeans_model = extract_location_features(
        df, n_clusters=n_clusters, kmeans_model=kmeans_model
    )

    if is_train:
        df = define_target(df)

    return df, kmeans_model

if __name__ == "__main__":
    # Placeholder for a manual test: running this module directly currently
    # does nothing.
    pass