import itertools

import pandas as pd
import xgboost as xgb
from xgboost import cv
from sklearn.model_selection import train_test_split


def prepare_predictors(monetary_decision, col_num, col_cat):
    # Select the numeric and categorical feature columns on a fresh copy
    # so the dtype conversion below does not warn about chained assignment.
    monetary_decision = monetary_decision.reset_index(drop=True)
    predictors = monetary_decision[col_num + col_cat].copy()
    # XGBoost's native categorical support requires the pandas "category" dtype.
    for col in col_cat:
        predictors[col] = predictors[col].astype("category")
    return predictors


def prepare_data(monetary_decision, col_num, col_cat):
    # Pair the feature matrix with the regression target (log10 of the sanction amount).
    predictors = prepare_predictors(monetary_decision, col_num, col_cat)
    target = monetary_decision.log10_monetary_sanction
    return predictors, target


def split(predictors, target):
    # 80/20 train/test split with a fixed seed for reproducibility.
    predictors_train, predictors_test, target_train, target_test = train_test_split(predictors,
                                                                                    target,
                                                                                    test_size=0.2,
                                                                                    random_state=50)
    return predictors_train, predictors_test, target_train, target_test


def run_cv_training(predictors_train, target_train):
    # Grid-search a few hyper-parameters with xgboost cross-validation.
    data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
    xgb_cv = []
    # (best test RMSE, params, number of boosting rounds); start from an unbeatable sentinel.
    best_params = (float("inf"), {}, 10)
    for eta, max_depth, colsample in itertools.product([0.05, 0.01], [10, 15], [0.3, 0.8]):
        prefix = f"{eta}_{max_depth}_{colsample}"
        params = {
            'learning_rate': eta,
            'max_depth': max_depth,
            'colsample_bytree': colsample,
            # 'gamma': 0.5,
            'subsample': 0.8,
            'objective': 'reg:squarederror'}
        cv_results = cv(dtrain=data_train, params=params, nfold=2,
                        num_boost_round=1000, early_stopping_rounds=3, metrics="rmse", as_pandas=True, seed=50)
        # With early stopping, cv() truncates the history at the best iteration,
        # so the last row holds the best score and its 0-based round index.
        best_value = cv_results['test-rmse-mean'].values[-1]
        best_round = cv_results.index[-1] + 1  # +1: train for index + 1 rounds
        xgb_cv.append(
            cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns}).tail(10).reset_index())
        if best_value < best_params[0]:
            best_params = (best_value, params, best_round)

    return pd.concat(xgb_cv, axis=1), best_params


def run_training(predictors_train, target_train, params, num_rounds):
    # Fit a final booster on the full training set with the chosen hyper-parameters.
    data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
    return xgb.train(params, data_train, num_rounds)


def predict(model, predictors):
    # Wrap the features in a DMatrix so categorical columns are handled natively.
    data = xgb.DMatrix(predictors, enable_categorical=True)
    return model.predict(data)


def features_importance(model):
    # Average gain per feature, sorted ascending (least to most important last).
    return pd.Series(model.get_score(importance_type='gain')).sort_values()
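

# Usage sketch: a minimal end-to-end run of the functions above. The input
# path and the feature column names below are placeholders; the only hard
# requirement is a DataFrame with a `log10_monetary_sanction` column plus
# the numeric and categorical feature columns passed in.
if __name__ == "__main__":
    monetary_decision = pd.read_csv("monetary_decision.csv")  # placeholder path
    col_num = ["decision_year"]         # placeholder numeric feature(s)
    col_cat = ["country", "sector"]     # placeholder categorical feature(s)

    predictors, target = prepare_data(monetary_decision, col_num, col_cat)
    predictors_train, predictors_test, target_train, target_test = split(predictors, target)

    cv_table, (best_rmse, params, n_rounds) = run_cv_training(predictors_train, target_train)
    model = run_training(predictors_train, target_train, params, n_rounds)

    print("best CV RMSE:", best_rmse)
    print("sample predictions:", predict(model, predictors_test)[:5])
    print(features_importance(model))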