Spaces:
Runtime error
Runtime error
add automl
Browse files- data_processing.py +1 -1
- model.py +14 -2
- stream_app.py +32 -23
data_processing.py
CHANGED
|
@@ -87,7 +87,7 @@ def get_monetary_dataframe(decision_scope):
|
|
| 87 |
time = round((pd.to_datetime('today').date() - monetary_decision.decision_date) / np.timedelta64(1, "M"))
|
| 88 |
monetary_decision['time'] = time
|
| 89 |
# sort by date
|
| 90 |
-
monetary_decision = monetary_decision.sort_values('time', ascending=False)
|
| 91 |
return monetary_decision
|
| 92 |
|
| 93 |
|
|
|
|
| 87 |
time = round((pd.to_datetime('today').date() - monetary_decision.decision_date) / np.timedelta64(1, "M"))
|
| 88 |
monetary_decision['time'] = time
|
| 89 |
# sort by date
|
| 90 |
+
monetary_decision = monetary_decision.sort_values('time', ascending=False).reset_index(drop=True)
|
| 91 |
return monetary_decision
|
| 92 |
|
| 93 |
|
model.py
CHANGED
|
@@ -2,7 +2,8 @@ import itertools
|
|
| 2 |
import pandas as pd
|
| 3 |
import xgboost as xgb
|
| 4 |
from xgboost import cv
|
| 5 |
-
from sklearn.model_selection import TimeSeriesSplit
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
def prepare_predictors(monetary_decision, col_num, col_cat):
|
|
@@ -28,7 +29,9 @@ def split(predictors, target, test_size=0.2):
|
|
| 28 |
predictors_test = predictors.tail(nb_test)
|
| 29 |
target_train = target.head(nb_train)
|
| 30 |
target_test = target_train.tail(nb_test)
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
def run_cv_training(predictors_train, target_train):
|
|
@@ -65,6 +68,15 @@ def run_training(predictors_train, target_train, params, num_rounds):
|
|
| 65 |
return xgb.train(params, data_train, num_rounds)
|
| 66 |
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
def predict(model, predictors):
    """Run ``model.predict`` on ``predictors`` wrapped in an ``xgb.DMatrix``.

    The matrix is built with ``enable_categorical=True``.
    """
    return model.predict(xgb.DMatrix(predictors, enable_categorical=True))
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import xgboost as xgb
|
| 4 |
from xgboost import cv
|
| 5 |
+
from sklearn.model_selection import TimeSeriesSplit, train_test_split
|
| 6 |
+
from flaml import AutoML
|
| 7 |
|
| 8 |
|
| 9 |
def prepare_predictors(monetary_decision, col_num, col_cat):
|
|
|
|
| 29 |
predictors_test = predictors.tail(nb_test)
|
| 30 |
target_train = target.head(nb_train)
|
| 31 |
target_test = target_train.tail(nb_test)
|
| 32 |
+
predictors_train, predictors_test, target_train, target_test = train_test_split(predictors, target, test_size=0.2,
|
| 33 |
+
random_state=42)
|
| 34 |
+
return predictors_train.reset_index(drop=True), predictors_test, target_train, target_test
|
| 35 |
|
| 36 |
|
| 37 |
def run_cv_training(predictors_train, target_train):
|
|
|
|
| 68 |
return xgb.train(params, data_train, num_rounds)
|
| 69 |
|
| 70 |
|
| 71 |
+
def automl_training(predictors_train, target_train):
    """Fit a FLAML ``AutoML`` instance on the training data and return it.

    The search is configured as a regression task, with the estimator list
    limited to ``"lgbm"`` and the split type left on ``"auto"``.

    Args:
        predictors_train: Training features, passed to ``AutoML.fit`` as the
            first positional argument.
        target_train: Training target, passed as the second positional
            argument.

    Returns:
        The fitted ``AutoML`` object.
    """
    fit_options = {
        "task": "regression",
        "estimator_list": ["lgbm"],
        "split_type": "auto",
    }
    searcher = AutoML()
    searcher.fit(predictors_train, target_train, **fit_options)
    return searcher
|
| 78 |
+
|
| 79 |
+
|
| 80 |
def predict(model, predictors):
    """Run ``model.predict`` on ``predictors`` wrapped in an ``xgb.DMatrix``.

    The matrix is built with ``enable_categorical=True``.
    """
    return model.predict(xgb.DMatrix(predictors, enable_categorical=True))
|
stream_app.py
CHANGED
|
@@ -10,7 +10,7 @@ import plotly.figure_factory as ff
|
|
| 10 |
import scipy
|
| 11 |
import numpy as np
|
| 12 |
from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
|
| 13 |
-
from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training
|
| 14 |
|
| 15 |
|
| 16 |
def _max_width_():
|
|
@@ -178,9 +178,9 @@ col_cat_all = ['authorities_country',
|
|
| 178 |
|
| 179 |
st.sidebar.title("Training params")
|
| 180 |
col_num = st.sidebar.multiselect('Numeric variables',
|
| 181 |
-
col_num_all, default=
|
| 182 |
col_cat = st.sidebar.multiselect('Categorical variables',
|
| 183 |
-
col_cat_all, default=
|
| 184 |
# train the model
|
| 185 |
predictors, target = prepare_data(monetary_decision, col_num, col_cat)
|
| 186 |
if st.button('Run training'):
|
|
@@ -204,27 +204,30 @@ if st.button('Run training'):
|
|
| 204 |
# Run cross validation
|
| 205 |
st.subheader("Cross validation error")
|
| 206 |
with st.spinner('Wait for it...'):
|
| 207 |
-
xgb_cv, best_params = run_cv_training(predictors_train, target_train)
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
|
|
|
| 218 |
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
|
|
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
|
|
|
| 228 |
|
| 229 |
fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
|
| 230 |
fig.update_layout(width=1000,
|
|
@@ -234,9 +237,15 @@ if st.button('Run training'):
|
|
| 234 |
st.plotly_chart(fig)
|
| 235 |
|
| 236 |
st.subheader("Plot features importance for the trained model")
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
orientation='h',
|
| 241 |
width=1000,
|
| 242 |
template="simple_white",
|
|
@@ -301,7 +310,7 @@ st.dataframe(df_to_predict)
|
|
| 301 |
|
| 302 |
if prediction_model:
|
| 303 |
try:
|
| 304 |
-
predicted = predict(
|
| 305 |
st.metric(label="Monetary sanction prediction", value=f"{'{:,.2f}'.format(10**(predicted[0]-3))} K$")
|
| 306 |
print(predicted)
|
| 307 |
except ValueError:
|
|
|
|
| 10 |
import scipy
|
| 11 |
import numpy as np
|
| 12 |
from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
|
| 13 |
+
from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, automl_training
|
| 14 |
|
| 15 |
|
| 16 |
def _max_width_():
|
|
|
|
| 178 |
|
| 179 |
st.sidebar.title("Training params")
|
| 180 |
col_num = st.sidebar.multiselect('Numeric variables',
|
| 181 |
+
col_num_all, default=col_num_all)
|
| 182 |
col_cat = st.sidebar.multiselect('Categorical variables',
|
| 183 |
+
col_cat_all, default=col_cat_all)
|
| 184 |
# train the model
|
| 185 |
predictors, target = prepare_data(monetary_decision, col_num, col_cat)
|
| 186 |
if st.button('Run training'):
|
|
|
|
| 204 |
# Run cross validation
|
| 205 |
st.subheader("Cross validation error")
|
| 206 |
with st.spinner('Wait for it...'):
|
| 207 |
+
#xgb_cv, best_params = run_cv_training(predictors_train, target_train)
|
| 208 |
|
| 209 |
+
#st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
|
| 210 |
+
#st.subheader("Selected variables")
|
| 211 |
+
#st.json(best_params)
|
| 212 |
|
| 213 |
+
# Train final
|
| 214 |
+
#xgb_model = run_training(predictors_train, target_train, best_params["params"], best_params["best_round"])
|
| 215 |
|
| 216 |
+
xgb_model = automl_training(predictors_train, target_train)
|
| 217 |
+
# save model to file
|
| 218 |
+
pickle.dump(xgb_model, open("xgb_model.pickle.dat", "wb"))
|
| 219 |
|
| 220 |
+
# Evaluate model error
|
| 221 |
+
#target_train_predicted = predict(xgb_model, predictors_train)
|
| 222 |
+
target_train_predicted = xgb_model.predict(predictors_train)
|
| 223 |
+
training_bias = np.mean(target_train_predicted - target_train)
|
| 224 |
+
st.metric(label="Training bias", value=training_bias)
|
| 225 |
|
| 226 |
+
#target_test_predicted = predict(xgb_model, predictors_test)
|
| 227 |
+
target_test_predicted = xgb_model.predict(predictors_test)
|
| 228 |
+
test_errors = target_test_predicted - target_test
|
| 229 |
+
test_bias = np.mean(test_errors)
|
| 230 |
+
st.metric(label="Test bias", value=test_bias)
|
| 231 |
|
| 232 |
fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
|
| 233 |
fig.update_layout(width=1000,
|
|
|
|
| 237 |
st.plotly_chart(fig)
|
| 238 |
|
| 239 |
st.subheader("Plot features importance for the trained model")
|
| 240 |
+
print("predictors_train shape: ", predictors_train.columns)
|
| 241 |
+
xgb_features_importance = pd.DataFrame([xgb_model.model.estimator.feature_importances_],
|
| 242 |
+
columns=predictors_train.columns)
|
| 243 |
+
print(xgb_features_importance)
|
| 244 |
+
#st.dataframe(xgb_features_importance)
|
| 245 |
+
|
| 246 |
+
# xgb_features_importance = features_importance(xgb_model)
|
| 247 |
+
#
|
| 248 |
+
fig = px.bar(xgb_features_importance.T,
|
| 249 |
orientation='h',
|
| 250 |
width=1000,
|
| 251 |
template="simple_white",
|
|
|
|
| 310 |
|
| 311 |
if prediction_model:
|
| 312 |
try:
|
| 313 |
+
predicted = prediction_model.predict(df_to_predict)
|
| 314 |
st.metric(label="Monetary sanction prediction", value=f"{'{:,.2f}'.format(10**(predicted[0]-3))} K$")
|
| 315 |
print(predicted)
|
| 316 |
except ValueError:
|