Jawad committed
Commit 8fc52b2 · 1 Parent(s): b3c7fc7

add automl'

Files changed (3):
  1. data_processing.py +1 -1
  2. model.py +14 -2
  3. stream_app.py +32 -23
data_processing.py CHANGED
@@ -87,7 +87,7 @@ def get_monetary_dataframe(decision_scope):
     time = round((pd.to_datetime('today').date() - monetary_decision.decision_date) / np.timedelta64(1, "M"))
     monetary_decision['time'] = time
     # sort by date
-    monetary_decision = monetary_decision.sort_values('time', ascending=False)
+    monetary_decision = monetary_decision.sort_values('time', ascending=False).reset_index(drop=True)
     return monetary_decision

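A quick aside on the reset_index(drop=True) added above (a standalone toy sketch, not part of the commit): sort_values alone keeps the original index labels, so a later positional split can be confusing; resetting gives a clean 0..n-1 index after the sort.

    import pandas as pd

    # Toy stand-in for monetary_decision; only the 'time' column matters here.
    df = pd.DataFrame({'time': [3, 12, 7]})

    print(df.sort_values('time', ascending=False).index.tolist())
    # [1, 2, 0] -- the old labels travel with the rows
    print(df.sort_values('time', ascending=False).reset_index(drop=True).index.tolist())
    # [0, 1, 2] -- clean positional index after the sort
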
model.py CHANGED
@@ -2,7 +2,8 @@ import itertools
 import pandas as pd
 import xgboost as xgb
 from xgboost import cv
-from sklearn.model_selection import TimeSeriesSplit
+from sklearn.model_selection import TimeSeriesSplit, train_test_split
+from flaml import AutoML


 def prepare_predictors(monetary_decision, col_num, col_cat):

@@ -28,7 +29,9 @@ def split(predictors, target, test_size=0.2):
     predictors_test = predictors.tail(nb_test)
     target_train = target.head(nb_train)
     target_test = target_train.tail(nb_test)
-    return predictors_train, predictors_test, target_train, target_test
+    predictors_train, predictors_test, target_train, target_test = train_test_split(predictors, target, test_size=0.2,
+                                                                                     random_state=42)
+    return predictors_train.reset_index(drop=True), predictors_test, target_train, target_test


 def run_cv_training(predictors_train, target_train):

@@ -65,6 +68,15 @@ def run_training(predictors_train, target_train, params, num_rounds):
     return xgb.train(params, data_train, num_rounds)


+def automl_training(predictors_train, target_train):
+    automl = AutoML()
+    automl.fit(predictors_train, target_train,
+               task="regression",
+               estimator_list=["lgbm"],
+               split_type="auto")
+    return automl
+
+
 def predict(model, predictors):
     data = xgb.DMatrix(predictors, enable_categorical=True)
     return model.predict(data)
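
For context, a minimal sketch of how the new automl_training helper is meant to be used, assuming flaml (with its LightGBM learner) is installed; the toy frames and the time_budget argument are illustrative, not part of the commit:

    import numpy as np
    import pandas as pd
    from flaml import AutoML

    # Hypothetical stand-ins for the real predictors/target produced upstream.
    rng = np.random.default_rng(0)
    predictors_train = pd.DataFrame({'log10_org_revenues': rng.random(200),
                                     'time': rng.integers(1, 120, 200)})
    target_train = pd.Series(rng.random(200))

    automl = AutoML()
    automl.fit(predictors_train, target_train,
               task="regression",            # same settings as automl_training above
               estimator_list=["lgbm"],      # restrict the search to LightGBM
               split_type="auto",
               time_budget=30)               # optional cap on search time, in seconds

    print(automl.best_config)                # hyperparameters FLAML settled on
    preds = automl.predict(predictors_train) # the predict() call stream_app.py now relies on

Note that split() now draws a shuffled train_test_split(random_state=42) instead of the earlier time-ordered head/tail split, so the holdout is no longer strictly chronological.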
stream_app.py CHANGED
@@ -10,7 +10,7 @@ import plotly.figure_factory as ff
 import scipy
 import numpy as np
 from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
-from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training
+from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, automl_training


 def _max_width_():

@@ -178,9 +178,9 @@ col_cat_all = ['authorities_country',

 st.sidebar.title("Training params")
 col_num = st.sidebar.multiselect('Numeric variables',
-                                 col_num_all, default=['log10_org_revenues'])
+                                 col_num_all, default=col_num_all)
 col_cat = st.sidebar.multiselect('Categorical variables',
-                                 col_cat_all, default=['violation_theme', 'org_continent'])
+                                 col_cat_all, default=col_cat_all)
 # train the model
 predictors, target = prepare_data(monetary_decision, col_num, col_cat)
 if st.button('Run training'):

@@ -204,27 +204,30 @@ if st.button('Run training'):
     # Run cross validation
     st.subheader("Cross validation error")
     with st.spinner('Wait for it...'):
-        xgb_cv, best_params = run_cv_training(predictors_train, target_train)
+        #xgb_cv, best_params = run_cv_training(predictors_train, target_train)

-        st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
-        st.subheader("Selected variables")
-        st.json(best_params)
+        #st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
+        #st.subheader("Selected variables")
+        #st.json(best_params)

         # Train final
-        xgb_model = run_training(predictors_train, target_train, best_params["params"], best_params["best_round"])
+        #xgb_model = run_training(predictors_train, target_train, best_params["params"], best_params["best_round"])

+        xgb_model = automl_training(predictors_train, target_train)
         # save model to file
         pickle.dump(xgb_model, open("xgb_model.pickle.dat", "wb"))

         # Evaluate model error
-        target_train_predicted = predict(xgb_model, predictors_train)
+        #target_train_predicted = predict(xgb_model, predictors_train)
+        target_train_predicted = xgb_model.predict(predictors_train)
         training_bias = np.mean(target_train_predicted - target_train)
         st.metric(label="Training bias", value=training_bias)

-        target_test_predicted = predict(xgb_model, predictors_test)
+        #target_test_predicted = predict(xgb_model, predictors_test)
+        target_test_predicted = xgb_model.predict(predictors_test)
         test_errors = target_test_predicted - target_test
         test_bias = np.mean(test_errors)
         st.metric(label="Test bias", value=test_bias)

     fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
     fig.update_layout(width=1000,

@@ -234,9 +237,15 @@ if st.button('Run training'):
     st.plotly_chart(fig)

     st.subheader("Plot features importance for the trained model")
-    xgb_features_importance = features_importance(xgb_model)
-
-    fig = px.bar(xgb_features_importance,
+    print("predictors_train shape: ", predictors_train.columns)
+    xgb_features_importance = pd.DataFrame([xgb_model.model.estimator.feature_importances_],
+                                           columns=predictors_train.columns)
+    print(xgb_features_importance)
+    #st.dataframe(xgb_features_importance)
+
+    # xgb_features_importance = features_importance(xgb_model)
+    #
+    fig = px.bar(xgb_features_importance.T,
                  orientation='h',
                  width=1000,
                  template="simple_white",

@@ -301,7 +310,7 @@ st.dataframe(df_to_predict)

 if prediction_model:
     try:
-        predicted = predict(prediction_model, df_to_predict)
+        predicted = prediction_model.predict(df_to_predict)
         st.metric(label="Monetary sanction prediction", value=f"{'{:,.2f}'.format(10**(predicted[0]-3))} K$")
         print(predicted)
     except ValueError:
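
One small worked example for the display line above (an editor's illustration, assuming the target is the log10 of the sanction in dollars, which is what the K$ label implies): 10**(predicted[0] - 3) equals 10**predicted[0] / 1000, i.e. the predicted amount converted to thousands of dollars.

    # Hypothetical model output: log10 of the sanction in dollars.
    predicted_log10 = 6.3
    amount_dollars = 10 ** predicted_log10            # about 1,995,262 dollars
    amount_k_dollars = 10 ** (predicted_log10 - 3)    # about 1,995.26, the same amount in K$
    print(f"{amount_k_dollars:,.2f} K$")              # mirrors the st.metric formatting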