Jawad committed on
Commit
b1bb2ef
·
1 Parent(s): c359068

add predict sample

Browse files
Files changed (3) hide show
  1. data_processing.py +4 -3
  2. model.py +8 -16
  3. stream_app.py +44 -21
data_processing.py CHANGED
@@ -43,7 +43,7 @@ def load_data():
43
  def process_data(data):
44
  decisions = pd.DataFrame(data['decisions'])
45
  decisions['year'] = pd.to_datetime(decisions['decision_date']).dt.year
46
- decisions.monetary_sanction = decisions.monetary_sanction.astype(float)
47
  # keep validated decisions
48
  decisions = decisions[decisions.status == 'V']
49
  decisions.decision_date = pd.to_datetime(decisions['decision_date']).dt.date
@@ -63,7 +63,6 @@ def process_data(data):
63
  decisions = decisions.merge(organizations, left_on='organizations', right_on='org_id')
64
  # remove Individual
65
  decisions = decisions[decisions.org_company_type != "Individual"]
66
-
67
  # work on authorities
68
  authorities = pd.DataFrame(data['authorities'])
69
  authorities.index = authorities.url.apply(get_id)
@@ -79,12 +78,14 @@ def process_data(data):
79
  def get_monetary_dataframe(decision_scope):
80
  monetary_decision = decision_scope[decision_scope.monetary_sanction > 0]
81
  monetary_decision['has_revenues'] = (monetary_decision.org_revenues != "")
82
- monetary_decision['org_revenues'] = monetary_decision.org_revenues.str.replace('', '0').astype(float)
83
  monetary_decision['log10_org_revenues'] = monetary_decision.org_revenues.apply(lambda x: np.log10(x+1))
84
  monetary_decision['log10_monetary_sanction'] = monetary_decision.monetary_sanction.apply(lambda x: np.log10(x+1))
85
  monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
86
  monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
87
  monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
 
 
88
  return monetary_decision
89
 
90
 
 
43
  def process_data(data):
44
  decisions = pd.DataFrame(data['decisions'])
45
  decisions['year'] = pd.to_datetime(decisions['decision_date']).dt.year
46
+ decisions.monetary_sanction = pd.to_numeric(decisions.monetary_sanction, errors='coerce').fillna(0)
47
  # keep validated decisions
48
  decisions = decisions[decisions.status == 'V']
49
  decisions.decision_date = pd.to_datetime(decisions['decision_date']).dt.date
 
63
  decisions = decisions.merge(organizations, left_on='organizations', right_on='org_id')
64
  # remove Individual
65
  decisions = decisions[decisions.org_company_type != "Individual"]
 
66
  # work on authorities
67
  authorities = pd.DataFrame(data['authorities'])
68
  authorities.index = authorities.url.apply(get_id)
 
def get_monetary_dataframe(decision_scope):
    """Return the decisions that carry a monetary sanction, with derived features.

    Only rows with ``monetary_sanction > 0`` are kept. The input frame is left
    untouched; the result is an independent copy with these columns added or
    rewritten:

    * ``has_revenues``: True where ``org_revenues`` was not the empty string.
    * ``org_revenues``: converted to numbers, unparsable values become 0.
    * ``log10_org_revenues`` / ``log10_monetary_sanction``: ``log10(x + 1)``.
    * ``same_country``: organization and authority countries are equal.
    * ``monetary_sanction_rate`` and its ``log10``: sanction / revenues
      (infinite where revenues are 0).
    * ``time``: whole months elapsed since the earliest decision in the result.

    Args:
        decision_scope: DataFrame with numeric ``monetary_sanction`` plus
            ``org_revenues``, ``org_country``, ``authorities_country`` and
            ``decision_date`` columns.

    Returns:
        A new DataFrame holding the filtered rows and the columns above.
    """
    # Copy the filtered rows: adding columns to a boolean-mask slice raises
    # SettingWithCopyWarning and could write through to the caller's frame.
    monetary_decision = decision_scope[decision_scope.monetary_sanction > 0].copy()

    # Must be computed before org_revenues is coerced to numbers below.
    monetary_decision['has_revenues'] = (monetary_decision.org_revenues != "")
    monetary_decision['org_revenues'] = pd.to_numeric(monetary_decision.org_revenues, errors='coerce').fillna(0)

    # The +1 keeps zero revenues / sanctions finite on the log scale.
    monetary_decision['log10_org_revenues'] = np.log10(monetary_decision.org_revenues + 1)
    monetary_decision['log10_monetary_sanction'] = np.log10(monetary_decision.monetary_sanction + 1)
    monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)

    # Rows whose revenues were coerced to 0 get an infinite rate here.
    monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction / monetary_decision.org_revenues
    monetary_decision['log10_monetary_sanction_rate'] = np.log10(monetary_decision.monetary_sanction_rate)

    # Month-unit timedeltas (np.timedelta64(1, "M")) are rejected by current
    # pandas, so measure elapsed time against a fixed average Gregorian month
    # (365.2425 / 12 = 30.436875 days) on real datetimes instead.
    decision_dates = pd.to_datetime(monetary_decision.decision_date)
    elapsed = decision_dates - decision_dates.min()
    monetary_decision['time'] = (elapsed / pd.Timedelta(days=30.436875)).round()
    return monetary_decision
90
 
91
 
model.py CHANGED
@@ -6,25 +6,17 @@ from xgboost import cv
6
  from sklearn.model_selection import train_test_split
7
 
8
 
9
- def prepare_data(monetary_decision):
10
  monetary_decision = monetary_decision.reset_index(drop=True)
11
- time = round((monetary_decision.decision_date - monetary_decision.decision_date.min()) / np.timedelta64(1, "M"))
12
- monetary_decision.loc[:, ('time')] = time
13
- col_num = ['log10_org_revenues',
14
- 'time']
15
- col_cat = ['authorities_country',
16
- 'type',
17
- 'violation_theme',
18
- 'justice_type',
19
- 'org_country',
20
- 'org_currency',
21
- 'org_continent',
22
- 'same_country',
23
- 'org_company_type']
24
  predictors = monetary_decision[col_num + col_cat]
25
- target = monetary_decision.log10_monetary_sanction
26
  for col in col_cat:
27
  predictors[col] = predictors[col].astype("category")
 
 
 
 
 
 
28
  return predictors, target
29
 
30
 
@@ -46,7 +38,7 @@ def run_cv_training(predictors_train, target_train):
46
  'learning_rate': eta,
47
  'max_depth': max_depth,
48
  'colsample_bytree': col_num,
49
- #'gamma': 0.5,
50
  'subsample': 0.8,
51
  'objective': 'reg:squarederror'}
52
  cv_results = cv(dtrain=data_train, params=params, nfold=2,
 
6
  from sklearn.model_selection import train_test_split
7
 
8
 
def prepare_predictors(monetary_decision, col_num, col_cat):
    """Build the predictor frame used by the model.

    Args:
        monetary_decision: DataFrame containing at least the columns named in
            ``col_num`` and ``col_cat``.
        col_num: names of the numeric predictor columns.
        col_cat: names of the categorical predictor columns.

    Returns:
        A new DataFrame, re-indexed from 0, holding ``col_num`` followed by
        ``col_cat``, with every ``col_cat`` column cast to the pandas
        ``category`` dtype. Columns that are already categorical keep their
        existing categories.

    Raises:
        KeyError: if a requested column is missing from ``monetary_decision``.
    """
    columns = list(col_num) + list(col_cat)
    # Take an explicit copy of the selection: casting columns on a slice of
    # the caller's frame raises SettingWithCopyWarning and may not stick.
    predictors = monetary_decision.reset_index(drop=True)[columns].copy()
    for col in col_cat:
        predictors[col] = predictors[col].astype("category")
    return predictors
15
+
16
+
def prepare_data(monetary_decision, col_num, col_cat):
    """Split a monetary-decision frame into model predictors and target.

    Args:
        monetary_decision: DataFrame with a ``log10_monetary_sanction`` column
            plus the columns named in ``col_num`` and ``col_cat``.
        col_num: names of the numeric predictor columns.
        col_cat: names of the categorical predictor columns.

    Returns:
        A ``(predictors, target)`` tuple: the frame built by
        ``prepare_predictors`` and the ``log10_monetary_sanction`` series,
        both indexed 0..n-1 in the same row order.
    """
    predictors = prepare_predictors(monetary_decision, col_num, col_cat)
    # prepare_predictors renumbers its rows from 0; renumber the target the
    # same way so label-based alignment between the two cannot mismatch rows.
    target = monetary_decision.log10_monetary_sanction.reset_index(drop=True)
    return predictors, target
21
 
22
 
 
38
  'learning_rate': eta,
39
  'max_depth': max_depth,
40
  'colsample_bytree': col_num,
41
+ # 'gamma': 0.5,
42
  'subsample': 0.8,
43
  'objective': 'reg:squarederror'}
44
  cv_results = cv(dtrain=data_train, params=params, nfold=2,
stream_app.py CHANGED
@@ -9,7 +9,7 @@ import plotly.figure_factory as ff
9
  import scipy
10
  import numpy as np
11
  from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
12
- from model import prepare_data, run_training, split, predict, features_importance, run_cv_training
13
 
14
 
15
  def _max_width_():
@@ -158,45 +158,63 @@ with st.expander("Data exploration"):
158
  width=1000, height=600)
159
  st.plotly_chart(fig)
160
 
161
-
162
  ##############################################
163
  ####
164
  # build ML model
165
  ####
166
  ##############################################
167
  st.title("Training phase")
168
-
169
- predictors, target = prepare_data(monetary_decision)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  # train the model
 
171
  if st.button('Run training'):
172
  with st.expander("Training results"):
 
173
  st.write(f"dataset size: {monetary_decision.shape[0]}")
174
- st.markdown("Plot taget distribution: log 10 of monetary sanctions")
175
- fig = ff.create_distplot([target], [' log 10 of monetary sanctions'], bin_size=0.1)
176
  fig.update_layout(width=1000,
177
  template="simple_white",
178
  height=600,
179
  bargap=0.01)
180
  st.plotly_chart(fig)
181
 
182
- # split data set
183
  predictors_train, predictors_test, target_train, target_test = split(predictors, target)
184
  st.subheader("Split dataset between training and test:")
185
  st.metric(label="Training size", value=predictors_train.shape[0])
186
  st.metric(label="Test size", value=predictors_test.shape[0])
187
 
188
- # run cross validation
189
  st.subheader("Cross validation error")
190
- xgb_cv, best_params = run_cv_training(predictors_train, target_train)
 
191
 
192
  st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
193
  st.subheader("Selected variables")
194
  st.json(best_params)
195
 
196
- # train final model
197
  xgb_model = run_training(predictors_train, target_train, best_params[1], best_params[2])
198
 
199
- # evaluate model error
200
  target_train_predicted = predict(xgb_model, predictors_train)
201
  training_bias = np.mean(target_train_predicted - target_train)
202
  st.metric(label="Training bias", value=training_bias)
@@ -261,16 +279,21 @@ if st.button('Run training'):
261
  R_sq = corr_matrix[0, 1] ** 2
262
  st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%")
263
 
 
 
264
 
265
- st.subheader("Plot predicted vs real")
266
- #st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%")
267
-
268
- print(stats.pearsonr(test_errors, target_test_predicted))
269
-
270
 
 
 
 
 
 
 
 
 
271
 
272
- st.sidebar.title("Organizations view")
273
- col_x = ['log10_org_revenues', 'authorities_country', 'violation_theme', 'org_country', 'org_company_type']
274
- sample_revenues = st.sidebar.number_input('Yearly revenues', value=1000000)
275
- authority = st.sidebar.selectbox('Organization country', predictors.org_country.cat.categories)
276
- authority = st.sidebar.selectbox('Organization activity', predictors.org_company_type.cat.categories)
 
9
  import scipy
10
  import numpy as np
11
  from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
12
+ from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training
13
 
14
 
15
  def _max_width_():
 
158
  width=1000, height=600)
159
  st.plotly_chart(fig)
160
 
 
161
  ##############################################
162
  ####
163
  # build ML model
164
  ####
165
  ##############################################
166
  st.title("Training phase")
167
+ xgb_model = None
168
+ col_num_all = ['log10_org_revenues',
169
+ 'time']
170
+ col_cat_all = ['authorities_country',
171
+ 'type',
172
+ 'violation_theme',
173
+ 'justice_type',
174
+ 'org_country',
175
+ 'org_currency',
176
+ 'org_continent',
177
+ 'same_country',
178
+ 'org_company_type']
179
+
180
+ st.sidebar.title("Training params")
181
+ col_num = st.sidebar.multiselect('Numeric variables',
182
+ col_num_all, col_num_all)
183
+ col_cat = st.sidebar.multiselect('Categorical variables',
184
+ col_cat_all, col_cat_all)
185
  # train the model
186
+ predictors, target = prepare_data(monetary_decision, col_num, col_cat)
187
  if st.button('Run training'):
188
  with st.expander("Training results"):
189
+ # Study distribution
190
  st.write(f"dataset size: {monetary_decision.shape[0]}")
191
+ st.markdown("Plot target distribution: log 10 of monetary sanctions")
192
+ fig = ff.create_distplot([target], ['log 10 of monetary sanctions'], bin_size=0.1)
193
  fig.update_layout(width=1000,
194
  template="simple_white",
195
  height=600,
196
  bargap=0.01)
197
  st.plotly_chart(fig)
198
 
199
+ # Split data set
200
  predictors_train, predictors_test, target_train, target_test = split(predictors, target)
201
  st.subheader("Split dataset between training and test:")
202
  st.metric(label="Training size", value=predictors_train.shape[0])
203
  st.metric(label="Test size", value=predictors_test.shape[0])
204
 
205
+ # Run cross validation
206
  st.subheader("Cross validation error")
207
+ with st.spinner('Wait for it...'):
208
+ xgb_cv, best_params = run_cv_training(predictors_train, target_train)
209
 
210
  st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
211
  st.subheader("Selected variables")
212
  st.json(best_params)
213
 
214
+ # Train final model
215
  xgb_model = run_training(predictors_train, target_train, best_params[1], best_params[2])
216
 
217
+ # Evaluate model error
218
  target_train_predicted = predict(xgb_model, predictors_train)
219
  training_bias = np.mean(target_train_predicted - target_train)
220
  st.metric(label="Training bias", value=training_bias)
 
279
  R_sq = corr_matrix[0, 1] ** 2
280
  st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%")
281
 
282
+ st.subheader("Residuals & homoscedasticity")
283
+ # st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%")
284
 
285
+ print(stats.pearsonr(test_errors, target_test))
 
 
 
 
286
 
# Collect one hypothetical organization from the UI and, when a model has been
# trained during this run, score it.
st.title("Organizations view")
col1, col2, col3 = st.columns(3)
to_predict = {}
with col1:
    # NOTE(review): the original nesting of these widgets is not recoverable
    # from this view; all inputs are assumed to live in the first column.
    revenues = st.number_input('Yearly revenues', value=100000000)
    # Same log10(x + 1) transform that get_monetary_dataframe uses to build the
    # training feature, so the sample is on the scale the model was fitted on.
    to_predict['log10_org_revenues'] = [np.log10(revenues + 1)]
    for col in col_num:
        # prepare_predictors selects every column in col_num (e.g. 'time'); a
        # sample missing one raises KeyError. Fall back to the largest value
        # present in the training predictors.
        if col not in to_predict:
            to_predict[col] = [float(predictors[col].max())]
    for col in col_cat:
        categories = predictors[col].cat.categories
        choice = st.selectbox(f'{col}', categories)
        # Carry the full training category set; casting a one-row column to
        # "category" would otherwise leave a single category.
        to_predict[col] = pd.Categorical([choice], categories=categories)
print(to_predict)

df_to_predict = prepare_predictors(pd.DataFrame.from_dict(to_predict), col_num, col_cat)
# Compare against None explicitly instead of relying on the model's truthiness.
if xgb_model is not None:
    predicted = predict(xgb_model, df_to_predict)
    print(predicted)