import itertools

import pandas as pd
import xgboost as xgb
from xgboost import cv
from sklearn.model_selection import train_test_split


def prepare_predictors(monetary_decision, col_num, col_cat):
    # Select the numeric and categorical feature columns on a fresh copy
    # so the dtype conversion below does not warn about chained assignment.
    monetary_decision = monetary_decision.reset_index(drop=True)
    predictors = monetary_decision[col_num + col_cat].copy()
    # XGBoost's native categorical support requires the pandas "category" dtype.
    for col in col_cat:
        predictors[col] = predictors[col].astype("category")
    return predictors


def prepare_data(monetary_decision, col_num, col_cat):
    # Pair the feature matrix with the regression target (log10 of the sanction amount).
    predictors = prepare_predictors(monetary_decision, col_num, col_cat)
    target = monetary_decision.log10_monetary_sanction
    return predictors, target


def split(predictors, target):
    # 80/20 train/test split with a fixed seed for reproducibility.
    predictors_train, predictors_test, target_train, target_test = train_test_split(predictors,
                                                                                    target,
                                                                                    test_size=0.2,
                                                                                    random_state=50)
    return predictors_train, predictors_test, target_train, target_test


def run_cv_training(predictors_train, target_train):
    # Grid-search a few hyper-parameters with xgboost cross-validation.
    data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
    xgb_cv = []
    # (best test RMSE, params, number of boosting rounds); start from an unbeatable sentinel.
    best_params = (float("inf"), {}, 10)
    for eta, max_depth, colsample in itertools.product([0.05, 0.01], [10, 15], [0.3, 0.8]):
        prefix = f"{eta}_{max_depth}_{colsample}"
        params = {
            'learning_rate': eta,
            'max_depth': max_depth,
            'colsample_bytree': colsample,
            # 'gamma': 0.5,
            'subsample': 0.8,
            'objective': 'reg:squarederror'}
        cv_results = cv(dtrain=data_train, params=params, nfold=2,
                        num_boost_round=1000, early_stopping_rounds=3, metrics="rmse", as_pandas=True, seed=50)
        # With early stopping, cv() truncates the history at the best iteration,
        # so the last row holds the best score and its 0-based round index.
        best_value = cv_results['test-rmse-mean'].values[-1]
        best_round = cv_results.index[-1] + 1  # +1: train for index + 1 rounds
        xgb_cv.append(
            cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns}).tail(10).reset_index())
        if best_value < best_params[0]:
            best_params = (best_value, params, best_round)

    return pd.concat(xgb_cv, axis=1), best_params


def run_training(predictors_train, target_train, params, num_rounds):
    # Fit a final booster on the full training set with the chosen hyper-parameters.
    data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
    return xgb.train(params, data_train, num_rounds)


def predict(model, predictors):
    # Wrap the features in a DMatrix so categorical columns are handled natively.
    data = xgb.DMatrix(predictors, enable_categorical=True)
    return model.predict(data)


def features_importance(model):
    # Average gain per feature, sorted ascending (least to most important last).
    return pd.Series(model.get_score(importance_type='gain')).sort_values()
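

# Usage sketch: a minimal end-to-end run of the functions above. The input
# path and the feature column names below are placeholders; the only hard
# requirement is a DataFrame with a `log10_monetary_sanction` column plus
# the numeric and categorical feature columns passed in.
if __name__ == "__main__":
    monetary_decision = pd.read_csv("monetary_decision.csv")  # placeholder path
    col_num = ["decision_year"]         # placeholder numeric feature(s)
    col_cat = ["country", "sector"]     # placeholder categorical feature(s)

    predictors, target = prepare_data(monetary_decision, col_num, col_cat)
    predictors_train, predictors_test, target_train, target_test = split(predictors, target)

    cv_table, (best_rmse, params, n_rounds) = run_cv_training(predictors_train, target_train)
    model = run_training(predictors_train, target_train, params, n_rounds)

    print("best CV RMSE:", best_rmse)
    print("sample predictions:", predict(model, predictors_test)[:5])
    print(features_importance(model))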