Spaces:
Runtime error
Runtime error
| import itertools | |
| import numpy as np | |
| import pandas as pd | |
| import xgboost as xgb | |
| from xgboost import cv | |
| from sklearn.model_selection import train_test_split | |
def prepare_predictors(monetary_decision, col_num, col_cat):
    """Select the predictor columns and mark the categorical ones.

    Args:
        monetary_decision: DataFrame containing at least the columns named
            in ``col_num`` and ``col_cat``.
        col_num: Names of the numerical predictor columns.
        col_cat: Names of the categorical predictor columns.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index holding the
        ``col_num`` columns followed by the ``col_cat`` columns, the latter
        converted to the pandas ``category`` dtype. The input frame is not
        modified.
    """
    columns = [*col_num, *col_cat]
    # Work on an explicit copy: assigning into a bare column selection
    # triggers pandas' SettingWithCopyWarning and leaves it ambiguous
    # whether the caller's frame is touched.
    predictors = monetary_decision.reset_index(drop=True)[columns].copy()
    for col in col_cat:
        predictors[col] = predictors[col].astype("category")
    return predictors
def prepare_data(monetary_decision, col_num, col_cat):
    """Build the predictor frame and the regression target.

    Args:
        monetary_decision: DataFrame containing the predictor columns and a
            ``log10_monetary_sanction`` column.
        col_num: Names of the numerical predictor columns.
        col_cat: Names of the categorical predictor columns.

    Returns:
        A ``(predictors, target)`` pair. ``predictors`` is the output of
        ``prepare_predictors``; ``target`` is the
        ``log10_monetary_sanction`` column. Both carry a fresh ``0..n-1``
        index.
    """
    predictors = prepare_predictors(monetary_decision, col_num, col_cat)
    # prepare_predictors resets the index of its result, so reset the
    # target as well; otherwise the two objects are only aligned by
    # position and any label-based join of them would mismatch rows.
    target = monetary_decision["log10_monetary_sanction"].reset_index(drop=True)
    return predictors, target
def split(predictors, target, test_size=0.2, random_state=50):
    """Split predictors and target into train and test subsets.

    Args:
        predictors: Feature table to split.
        target: Target values, in the same row order as ``predictors``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2,
            the value this function previously hard-coded.
        random_state: Forwarded to ``train_test_split``; defaults to 50,
            the value this function previously hard-coded.

    Returns:
        A tuple ``(predictors_train, predictors_test, target_train,
        target_test)``.
    """
    predictors_train, predictors_test, target_train, target_test = train_test_split(
        predictors,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return predictors_train, predictors_test, target_train, target_test
def run_cv_training(predictors_train, target_train):
    """Cross-validate a small grid of XGBoost regression hyper-parameters.

    Every combination of ``learning_rate`` (0.05, 0.01), ``max_depth``
    (10, 15) and ``colsample_bytree`` (0.3, 0.8) is passed to
    ``xgboost.cv`` with ``nfold=2``, ``num_boost_round=1000``,
    ``early_stopping_rounds=3``, the ``rmse`` metric and ``seed=50``.

    Args:
        predictors_train: Training features; categorical columns are
            enabled via ``enable_categorical=True``.
        target_train: Training target values used as the DMatrix label.

    Returns:
        A ``(cv_log, best_params)`` pair.

        ``cv_log`` is a DataFrame made of the last 10 rows of each
        combination's CV results placed side by side. Metric columns are
        prefixed with ``"{learning_rate}_{max_depth}_{colsample_bytree}_"``;
        each combination also contributes an unprefixed ``index`` column
        produced by ``reset_index()``.

        ``best_params`` is ``(best_rmse, params, best_round)`` for the
        combination whose final ``test-rmse-mean`` is lowest.
        NOTE(review): ``best_round`` is the last row label of the CV
        results, i.e. 0-based; if it is used as a boosting-round count the
        caller probably wants ``best_round + 1`` — confirm against the
        caller.
    """
    data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)

    learning_rates = [0.05, 0.01]
    max_depths = [10, 15]
    colsample_ratios = [0.3, 0.8]

    cv_tails = []
    # Start from +inf rather than an arbitrary RMSE threshold so the best
    # combination is always retained, whatever the scale of the target.
    best_params = (float("inf"), {}, 10)
    for eta, max_depth, colsample in itertools.product(learning_rates, max_depths, colsample_ratios):
        prefix = f"{eta}_{max_depth}_{colsample}"
        params = {
            'learning_rate': eta,
            'max_depth': max_depth,
            'colsample_bytree': colsample,
            'subsample': 0.8,
            'objective': 'reg:squarederror',
        }
        cv_results = cv(
            dtrain=data_train,
            params=params,
            nfold=2,
            num_boost_round=1000,
            early_stopping_rounds=3,
            metrics="rmse",
            as_pandas=True,
            seed=50,
        )
        best_value = cv_results['test-rmse-mean'].values[-1]
        best_round = cv_results.index[-1]

        # Keep only the tail of each run for the side-by-side log.
        renamed = cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns})
        cv_tails.append(renamed.tail(10).reset_index())

        if best_value < best_params[0]:
            best_params = (best_value, params, best_round)
    return pd.concat(cv_tails, axis=1), best_params
def run_training(predictors_train, target_train, params, num_rounds):
    """Train an XGBoost booster on the given training data.

    Args:
        predictors_train: Training features; categorical columns are
            enabled via ``enable_categorical=True``.
        target_train: Training target values used as the DMatrix label.
        params: Parameter dict passed to ``xgb.train``.
        num_rounds: Number of boosting rounds passed to ``xgb.train``.

    Returns:
        The value returned by ``xgb.train``.
    """
    training_matrix = xgb.DMatrix(
        predictors_train,
        label=target_train,
        enable_categorical=True,
    )
    booster = xgb.train(params, training_matrix, num_boost_round=num_rounds)
    return booster
def predict(model, predictors):
    """Return the model's predictions for ``predictors``.

    Args:
        model: Object exposing ``predict(DMatrix)``.
        predictors: Features to score; categorical columns are enabled via
            ``enable_categorical=True``.

    Returns:
        Whatever ``model.predict`` returns for the wrapped features.
    """
    scoring_matrix = xgb.DMatrix(predictors, enable_categorical=True)
    predictions = model.predict(scoring_matrix)
    return predictions
def features_importance(model):
    """Return the model's gain-based feature scores, sorted ascending.

    Args:
        model: Object exposing ``get_score(importance_type=...)`` that
            returns a mapping of feature name to score.

    Returns:
        A pandas Series indexed by feature name, ordered from the lowest
        to the highest score.
    """
    gain_by_feature = model.get_score(importance_type='gain')
    scores = pd.Series(gain_by_feature)
    return scores.sort_values()