"""theolex_streamlit/model.py: XGBoost training and prediction utilities for monetary sanctions."""
import itertools
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import cv
from sklearn.model_selection import train_test_split


def prepare_predictors(monetary_decision, col_num, col_cat):
    """Select the numeric and categorical predictor columns and cast categoricals."""
    monetary_decision = monetary_decision.reset_index(drop=True)
    # Copy the selection so the categorical cast below does not trigger a SettingWithCopyWarning.
    predictors = monetary_decision[col_num + col_cat].copy()
    for col in col_cat:
        predictors[col] = predictors[col].astype("category")
    return predictors


def prepare_data(monetary_decision, col_num, col_cat):
    """Build the (predictors, target) pair; the target is the log10 of the monetary sanction."""
    predictors = prepare_predictors(monetary_decision, col_num, col_cat)
    target = monetary_decision.log10_monetary_sanction
    return predictors, target


def split(predictors, target):
    """Hold out 20% of the rows for evaluation, with a fixed seed for reproducibility."""
    predictors_train, predictors_test, target_train, target_test = train_test_split(
        predictors, target, test_size=0.2, random_state=50
    )
    return predictors_train, predictors_test, target_train, target_test


def run_cv_training(predictors_train, target_train):
    """Grid-search a few XGBoost hyper-parameters with 2-fold cross-validation on RMSE."""
    data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
    xgb_cv = []
    # (best test RMSE, best params, best boosting round); start with a large RMSE sentinel.
    best_params = (100, {}, 10)
    for eta, max_depth, colsample in itertools.product([0.05, 0.01], [10, 15], [0.3, 0.8]):
        prefix = f"{eta}_{max_depth}_{colsample}"
        params = {
            'learning_rate': eta,
            'max_depth': max_depth,
            'colsample_bytree': colsample,
            # 'gamma': 0.5,
            'subsample': 0.8,
            'objective': 'reg:squarederror'}
        cv_results = cv(dtrain=data_train, params=params, nfold=2,
                        num_boost_round=1000, early_stopping_rounds=3,
                        metrics="rmse", as_pandas=True, seed=50)
        best_value = cv_results['test-rmse-mean'].values[-1]
        # Index of the last boosting round kept after early stopping.
        best_round = cv_results.index[-1]
        # Keep the last 10 CV rows for this configuration, columns prefixed by its hyper-parameters.
        xgb_cv.append(
            cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns})
            .tail(10).reset_index())
        if best_value < best_params[0]:
            best_params = (best_value, params, best_round)
    return pd.concat(xgb_cv, axis=1), best_params


def run_training(predictors_train, target_train, params, num_rounds):
    """Train a final booster on the full training split with the selected parameters."""
    data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
    return xgb.train(params, data_train, num_rounds)


def predict(model, predictors):
    """Predict log10 monetary sanctions for new observations."""
    data = xgb.DMatrix(predictors, enable_categorical=True)
    return model.predict(data)


def features_importance(model):
    """Return per-feature gain importance from the trained booster, sorted ascending."""
    return pd.Series(model.get_score(importance_type='gain')).sort_values()
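

if __name__ == "__main__":
    # Minimal usage sketch chaining the functions above (not part of the Streamlit app).
    # The CSV path and the predictor column names are illustrative assumptions, not the
    # project's real schema: the dataframe only needs the chosen predictor columns plus
    # a 'log10_monetary_sanction' target column.
    monetary_decision = pd.read_csv("monetary_decisions.csv")  # hypothetical input file
    col_num = ["decision_year"]                                # hypothetical numeric predictors
    col_cat = ["authority", "sector"]                          # hypothetical categorical predictors

    predictors, target = prepare_data(monetary_decision, col_num, col_cat)
    predictors_train, predictors_test, target_train, target_test = split(predictors, target)

    # Cross-validated grid search, then refit the best configuration on the training split.
    cv_history, (best_rmse, best_params, best_round) = run_cv_training(predictors_train, target_train)
    model = run_training(predictors_train, target_train, best_params, best_round)

    print("best CV RMSE:", best_rmse)
    print(predict(model, predictors_test)[:5])
    print(features_importance(model))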