theolex_streamlit / model.py
Jawad's picture
add fields
a67c43f
raw
history blame
2.28 kB
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import cv
from sklearn.model_selection import train_test_split
def prepare_data(monetary_decision):
    """Build the predictor matrix and the regression target from raw decisions.

    A ``time`` feature is derived as the rounded number of months elapsed
    between each ``decision_date`` and the earliest ``decision_date`` in the
    frame. The categorical predictor columns are converted to the pandas
    ``category`` dtype.

    Args:
        monetary_decision: DataFrame that provides ``decision_date``,
            ``log10_org_revenues``, ``log10_monetary_sanction`` and the
            categorical columns listed in ``col_cat`` below. The caller's
            frame is not modified.

    Returns:
        A ``(predictors, target)`` tuple: ``predictors`` is a DataFrame with
        the numeric columns followed by the categorical ones, ``target`` is
        the ``log10_monetary_sanction`` Series. Both use a fresh 0..n-1 index.
    """
    # Mean Gregorian month length (365.2425 / 12 days). Dividing by an
    # explicit duration avoids np.timedelta64(1, "M"), whose calendar-month
    # unit is ambiguous and is refused by recent pandas releases.
    days_per_month = 30.436875

    # reset_index returns a new frame, so the column added below never leaks
    # back into the caller's DataFrame.
    monetary_decision = monetary_decision.reset_index(drop=True)

    elapsed = monetary_decision.decision_date - monetary_decision.decision_date.min()
    monetary_decision["time"] = (elapsed / pd.Timedelta(days=days_per_month)).round()

    col_num = [
        'log10_org_revenues',
        'time',
    ]
    col_cat = [
        'authorities_country',
        'type',
        'violation_theme',
        'justice_type',
        'org_country',
        'org_currency',
        'org_continent',
        'same_country',
        'org_company_type',
    ]

    # A single astype call returns a new frame, so no values are written into
    # a slice of monetary_decision (no SettingWithCopyWarning).
    predictors = monetary_decision[col_num + col_cat].astype(
        {col: "category" for col in col_cat}
    )
    target = monetary_decision.log10_monetary_sanction
    return predictors, target
def split(predictors, target):
    """Partition predictors and target into train and test subsets.

    20% of the rows are held out for testing, and the shuffle is seeded with
    ``random_state=42`` so repeated calls give the same partition.

    Returns:
        ``(predictors_train, predictors_test, target_train, target_test)``.
    """
    parts = train_test_split(predictors, target, test_size=0.2, random_state=42)
    x_train, x_test, y_train, y_test = parts
    return x_train, x_test, y_train, y_test
def run_training(predictors_train, predictors_test):
    """Cross-validate and then fit an XGBoost squared-error regressor.

    Args:
        predictors_train: Feature matrix used to build the training DMatrix.
        predictors_test: Passed to ``DMatrix`` as ``label``.
            NOTE(review): despite its name, this argument is used as the
            training target -- confirm what callers pass here.

    Returns:
        The booster returned by ``xgb.train``.
    """
    num_round = 1000
    params = dict(
        max_depth=4,
        learning_rate=0.05,
        colsample_bytree=0.3,
        subsample=0.8,
        gamma=0.5,
        objective='reg:squarederror',
    )

    data_train = xgb.DMatrix(
        predictors_train,
        label=predictors_test,
        enable_categorical=True,
    )

    # 3-fold CV with early stopping. The result is only printed; the final
    # fit below still runs the full num_round boosting rounds.
    cv_history = xgb.cv(
        params=params,
        dtrain=data_train,
        num_boost_round=num_round,
        nfold=3,
        metrics="rmse",
        early_stopping_rounds=10,
        as_pandas=True,
        seed=123,
    )
    print(cv_history)

    return xgb.train(params, data_train, num_round)
def predict(model, predictors):
    """Return ``model.predict`` evaluated on ``predictors``.

    The predictors are wrapped in an ``xgb.DMatrix`` with categorical support
    enabled before being handed to the model.
    """
    dmatrix = xgb.DMatrix(predictors, enable_categorical=True)
    predictions = model.predict(dmatrix)
    return predictions
def features_importance(model):
    """Return the model's gain scores as a Series sorted in ascending order.

    The scores come from ``model.get_score(importance_type='gain')``; the
    resulting Series is indexed by the keys of that mapping.
    """
    gain_by_feature = model.get_score(importance_type='gain')
    ranking = pd.Series(gain_by_feature)
    return ranking.sort_values()