JawadRouen committed on
Commit
b0bca26
·
unverified ·
2 Parent(s): f9dedaa a61cdb6

Merge pull request #4 from THEOLEX-IO/report

Browse files
Files changed (5) hide show
  1. .gitignore +2 -1
  2. data_processing.py +3 -1
  3. exploration_app.py +94 -0
  4. model.py +33 -14
  5. stream_app.py +48 -29
.gitignore CHANGED
@@ -117,4 +117,5 @@ back/
117
  esdata
118
 
119
  #data
120
- *.csv
 
 
117
  esdata
118
 
119
  #data
120
+ *.csv
121
+ *.dat
data_processing.py CHANGED
@@ -84,8 +84,10 @@ def get_monetary_dataframe(decision_scope):
84
  monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
85
  monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
86
  monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
87
- time = round((monetary_decision.decision_date - monetary_decision.decision_date.min()) / np.timedelta64(1, "M"))
88
  monetary_decision['time'] = time
 
 
89
  return monetary_decision
90
 
91
 
 
84
  monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
85
  monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
86
  monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
87
+ time = round((pd.to_datetime('today').date() - monetary_decision.decision_date) / np.timedelta64(1, "M"))
88
  monetary_decision['time'] = time
89
+ # sort by date
90
+ monetary_decision = monetary_decision.sort_values('time', ascending=False).reset_index(drop=True)
91
  return monetary_decision
92
 
93
 
exploration_app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import pickle
3
+ import pandas as pd
4
+ import streamlit as st
5
+ from scipy import stats
6
+
7
+ import plotly.express as px
8
+ import plotly.figure_factory as ff
9
+
10
+ import scipy
11
+ import numpy as np
12
+ from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
13
+ from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, \
14
+ automl_training
15
+
16
+
17
def _max_width_(max_width_px=1500):
    """Widen the Streamlit main container by injecting a CSS rule.

    :param max_width_px: maximum width of the block container, in pixels.
        Defaults to 1500, the value that used to be hard-coded.
    """
    max_width_str = f"max-width: {max_width_px}px;"
    # Doubled braces are literal CSS braces inside the f-string.
    st.markdown(
        f"""
    <style>
    .reportview-container .main .block-container{{
        {max_width_str}
    }}
    </style>
    """,
        unsafe_allow_html=True,
    )
29
+
30
+
31
# force screen width
_max_width_()

st.title("Data Analysis 🌎 📃")
st.write("by [Theolex](https://www.theolex.io/)")

# load and process data
data = load_data()
decisions, organizations, authorities = process_data(data)


def _index_or_first(options, preferred):
    """Return the position of `preferred` in `options`, or 0 when it is absent."""
    try:
        return options.index(preferred)
    except ValueError:
        # The preferred default is not in the data: fall back to the first option
        # instead of crashing the whole page.
        return 0


col1, col2, col3, col4 = st.columns(4)

with col1:
    authorities_country = st.selectbox('Authority country', authorities.country.unique())

with col2:
    # index 4 pre-selects the fifth entry of range(1, 11), i.e. 5 years
    nb_years = st.selectbox('Number of years', range(1, 11), 4)

with col3:
    list_continents = decisions.org_continent.unique().tolist()
    org_continent = st.selectbox("Company's continent", list_continents,
                                 _index_or_first(list_continents, "europe"))

with col4:
    list_company_types = decisions.org_company_type.unique().tolist()
    org_company_type = st.selectbox("Company's activity", list_company_types,
                                    _index_or_first(list_company_types, "Banking & Finance"))

st.subheader(f"Which {authorities_country} regulators and prosecutors have been "
             f"the most active in enforcement actions against {org_continent} "
             f"{org_company_type} companies in the last {nb_years} years?")

# apply filters
# Build the set of selected authority names once instead of once per row.
select_auth = set(authorities[authorities.country == authorities_country].name)
authority_filter = decisions.authorities_name.apply(lambda names: not select_auth.isdisjoint(names))
# "Last N years" is relative to the current year, not to a hard-coded one.
current_year = pd.Timestamp.today().year
year_filter = (decisions.year >= (current_year - nb_years))
org_continent_filter = (decisions.org_continent == org_continent)
org_company_type_filter = (decisions.org_company_type == org_company_type)
decision_scope = decisions[authority_filter & year_filter & org_continent_filter & org_company_type_filter]

# One row per (decision, authority). A decision may involve authorities from
# several countries, so keep only the rows of the selected country's authorities;
# otherwise foreign co-authorities would be counted in the ranking below.
decision_scope = decision_scope.explode("authorities_name")
decision_scope = decision_scope[decision_scope.authorities_name.isin(select_auth)]
top_auths = decision_scope.groupby(['authorities_name'])['authorities_name'].count().sort_values(ascending=False).head(
    5)

fig = px.bar(top_auths,
             template="simple_white",
             color_continuous_scale='RdBu',
             width=1200, height=600)
st.plotly_chart(fig)

with st.expander("Explore cases"):
    st.dataframe(decision_scope[['authorities_name', 'org_name', 'decision_date', 'monetary_sanction', 'org_country',
                                 'org_company_type']])
83
+
84
+ # st.subheader("What are the top 10 negotiated settlements in France "
85
+ # "(involving French or foreign authorities) in the last 5 years?")
86
+ #
87
+ # st.subheader(
88
+ # "What are the top 3 areas (sanctions, anti-corruption, fraud, market manipulation, tax, etc.) "
89
+ # "of enforcement against banks in Germany in the last 3 years?")
90
+ #
91
+ # st.subheader("What are the largest enforcement actions involving French banks in the last 5 years?")
92
+ #
93
+ # st.subheader(
94
+ # "Which US regulators have imposed the largest penalties against financial institutions in the last 3 years?")
model.py CHANGED
@@ -1,9 +1,9 @@
1
  import itertools
2
- import numpy as np
3
  import pandas as pd
4
  import xgboost as xgb
5
  from xgboost import cv
6
- from sklearn.model_selection import train_test_split
 
7
 
8
 
9
  def prepare_predictors(monetary_decision, col_num, col_cat):
@@ -20,19 +20,25 @@ def prepare_data(monetary_decision, col_num, col_cat):
20
  return predictors, target
21
 
22
 
23
- def split(predictors, target):
24
- predictors_train, predictors_test, target_train, target_test = train_test_split(predictors,
25
- target,
26
- test_size=0.2,
27
- random_state=50)
28
- return predictors_train, predictors_test, target_train, target_test
 
 
 
 
 
 
29
 
30
 
31
  def run_cv_training(predictors_train, target_train):
32
  data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
33
  xgb_csv = []
34
- best_params = (100, {}, 10)
35
- for eta, max_depth, col_num in itertools.product([0.05, 0.01], [10, 15], [0.3, 0.8]):
36
  prefix = f"{str(eta)}_{str(max_depth)}_{str(col_num)}"
37
  params = {
38
  'learning_rate': eta,
@@ -41,14 +47,18 @@ def run_cv_training(predictors_train, target_train):
41
  # 'gamma': 0.5,
42
  'subsample': 0.8,
43
  'objective': 'reg:squarederror'}
44
- cv_results = cv(dtrain=data_train, params=params, nfold=2,
45
- num_boost_round=1000, early_stopping_rounds=3, metrics="rmse", as_pandas=True, seed=50)
 
 
 
 
46
  best_value = cv_results['test-rmse-mean'].values[-1]
47
  best_round = cv_results.index[-1]
48
  xgb_csv.append(
49
  cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns}).tail(10).reset_index())
50
- if best_value < best_params[0]:
51
- best_params = (best_value, params, best_round)
52
 
53
  return pd.concat(xgb_csv, axis=1), best_params
54
 
@@ -58,6 +68,15 @@ def run_training(predictors_train, target_train, params, num_rounds):
58
  return xgb.train(params, data_train, num_rounds)
59
 
60
 
 
 
 
 
 
 
 
 
 
61
  def predict(model, predictors):
62
  data = xgb.DMatrix(predictors, enable_categorical=True)
63
  return model.predict(data)
 
1
  import itertools
 
2
  import pandas as pd
3
  import xgboost as xgb
4
  from xgboost import cv
5
+ from sklearn.model_selection import TimeSeriesSplit, train_test_split
6
+ from flaml import AutoML
7
 
8
 
9
  def prepare_predictors(monetary_decision, col_num, col_cat):
 
20
  return predictors, target
21
 
22
 
23
def split(predictors, target, test_size=0.2, shuffle=True, random_state=42):
    """Split predictors and target into train and test subsets.

    :param predictors: pandas DataFrame of features, one row per observation.
    :param target: pandas Series of labels, same length as ``predictors``.
    :param test_size: fraction of rows assigned to the test subset. It is now
        honored in both modes (it used to be ignored in favor of a fixed 0.2).
    :param shuffle: when True (default, the previously effective behavior) rows
        are split randomly with ``train_test_split``; when False the row order is
        kept, the first rows go to train and the last rows go to test, which is
        the split to use when the caller has ordered the rows by date.
    :param random_state: seed used for the random split.
    :return: ``(predictors_train, predictors_test, target_train, target_test)``.
        Both train objects get a fresh 0..n-1 index so they stay aligned with
        each other; the test objects keep their original index.
    :raises ValueError: if ``predictors`` and ``target`` differ in length.
    """
    nb_rows = len(target)
    if nb_rows != predictors.shape[0]:
        raise ValueError(
            f"predictors has {predictors.shape[0]} rows but target has {nb_rows}")

    if shuffle:
        predictors_train, predictors_test, target_train, target_test = train_test_split(
            predictors, target, test_size=test_size, random_state=random_state)
    else:
        nb_test = round(test_size * nb_rows)
        nb_train = nb_rows - nb_test
        predictors_train = predictors.iloc[:nb_train]
        predictors_test = predictors.iloc[nb_train:]
        target_train = target.iloc[:nb_train]
        # The test labels come from the full target, not from the train slice.
        target_test = target.iloc[nb_train:]

    return (predictors_train.reset_index(drop=True), predictors_test,
            target_train.reset_index(drop=True), target_test)
35
 
36
 
37
  def run_cv_training(predictors_train, target_train):
38
  data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
39
  xgb_csv = []
40
+ best_params = {"best_value": 100, "params": {}, "best_round": 10}
41
+ for eta, max_depth, col_num in itertools.product([0.05, 0.01], [5, 15], [0.3, 0.5]):
42
  prefix = f"{str(eta)}_{str(max_depth)}_{str(col_num)}"
43
  params = {
44
  'learning_rate': eta,
 
47
  # 'gamma': 0.5,
48
  'subsample': 0.8,
49
  'objective': 'reg:squarederror'}
50
+ # build CV folds
51
+ nb_folds = 4
52
+ folds = TimeSeriesSplit(n_splits=nb_folds)
53
+ cv_results = cv(dtrain=data_train, params=params, folds=folds,
54
+ num_boost_round=1000, early_stopping_rounds=3, metrics="rmse",
55
+ as_pandas=True, seed=50)
56
  best_value = cv_results['test-rmse-mean'].values[-1]
57
  best_round = cv_results.index[-1]
58
  xgb_csv.append(
59
  cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns}).tail(10).reset_index())
60
+ if best_value < best_params["best_value"]:
61
+ best_params = {"best_value": best_value, "params": params, "best_round": best_round}
62
 
63
  return pd.concat(xgb_csv, axis=1), best_params
64
 
 
68
  return xgb.train(params, data_train, num_rounds)
69
 
70
 
71
def automl_training(predictors_train, target_train):
    """Fit a FLAML ``AutoML`` regression search and return the fitted object.

    The search is limited to the "lgbm" estimator and lets FLAML pick the
    resampling split ("auto").
    """
    fit_settings = {
        "task": "regression",
        "estimator_list": ["lgbm"],
        "split_type": "auto",
    }
    searcher = AutoML()
    searcher.fit(predictors_train, target_train, **fit_settings)
    return searcher
78
+
79
+
80
def predict(model, predictors):
    """Return ``model``'s predictions for ``predictors``.

    The predictors are wrapped in an ``xgb.DMatrix`` with categorical support
    enabled before being passed to ``model.predict``.
    """
    return model.predict(xgb.DMatrix(predictors, enable_categorical=True))
stream_app.py CHANGED
@@ -1,4 +1,5 @@
1
  # -*- coding: utf-8 -*-
 
2
  import pandas as pd
3
  import streamlit as st
4
  from scipy import stats
@@ -9,7 +10,7 @@ import plotly.figure_factory as ff
9
  import scipy
10
  import numpy as np
11
  from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
12
- from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training
13
 
14
 
15
  def _max_width_():
@@ -60,6 +61,7 @@ st.subheader("Dataset Description")
60
 
61
  st.metric('Number of validated decisions linked to organisations (and not individuals)', decision_scope.shape[0])
62
 
 
63
  st.metric('Decisions with monetary sanctions',
64
  decision_scope[decision_scope.monetary_sanction > 0].shape[0])
65
 
@@ -164,7 +166,6 @@ with st.expander("Data exploration"):
164
  ####
165
  ##############################################
166
  st.title("Training phase")
167
- xgb_model = None
168
  col_num_all = ['log10_org_revenues',
169
  'time']
170
  col_cat_all = ['authorities_country',
@@ -172,16 +173,15 @@ col_cat_all = ['authorities_country',
172
  'violation_theme',
173
  'justice_type',
174
  'org_country',
175
- 'org_currency',
176
  'org_continent',
177
  'same_country',
178
  'org_company_type']
179
 
180
  st.sidebar.title("Training params")
181
  col_num = st.sidebar.multiselect('Numeric variables',
182
- col_num_all, col_num_all)
183
  col_cat = st.sidebar.multiselect('Categorical variables',
184
- col_cat_all, col_cat_all)
185
  # train the model
186
  predictors, target = prepare_data(monetary_decision, col_num, col_cat)
187
  if st.button('Run training'):
@@ -189,7 +189,7 @@ if st.button('Run training'):
189
  # Study distribution
190
  st.write(f"dataset size: {monetary_decision.shape[0]}")
191
  st.markdown("Plot target distribution: log 10 of monetary sanctions")
192
- fig = ff.create_distplot([target], ['log 10 of monetary sanctions'], bin_size=0.1)
193
  fig.update_layout(width=1000,
194
  template="simple_white",
195
  height=600,
@@ -197,7 +197,7 @@ if st.button('Run training'):
197
  st.plotly_chart(fig)
198
 
199
  # Split data set
200
- predictors_train, predictors_test, target_train, target_test = split(predictors, target)
201
  st.subheader("Split dataset between training and test:")
202
  st.metric(label="Training size", value=predictors_train.shape[0])
203
  st.metric(label="Test size", value=predictors_test.shape[0])
@@ -205,24 +205,30 @@ if st.button('Run training'):
205
  # Run cross validation
206
  st.subheader("Cross validation error")
207
  with st.spinner('Wait for it...'):
208
- xgb_cv, best_params = run_cv_training(predictors_train, target_train)
 
 
 
 
209
 
210
- st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
211
- st.subheader("Selected variables")
212
- st.json(best_params)
213
 
214
- # Train final model
215
- xgb_model = run_training(predictors_train, target_train, best_params[1], best_params[2])
 
216
 
217
- # Evaluate model error
218
- target_train_predicted = predict(xgb_model, predictors_train)
219
- training_bias = np.mean(target_train_predicted - target_train)
220
- st.metric(label="Training bias", value=training_bias)
 
221
 
222
- target_test_predicted = predict(xgb_model, predictors_test)
223
- test_errors = target_test_predicted - target_test
224
- test_bias = np.mean(test_errors)
225
- st.metric(label="Test bias", value=test_bias)
 
226
 
227
  fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
228
  fig.update_layout(width=1000,
@@ -232,9 +238,15 @@ if st.button('Run training'):
232
  st.plotly_chart(fig)
233
 
234
  st.subheader("Plot features importance for the trained model")
235
- xgb_features_importance = features_importance(xgb_model)
236
-
237
- fig = px.bar(xgb_features_importance,
 
 
 
 
 
 
238
  orientation='h',
239
  width=1000,
240
  template="simple_white",
@@ -285,15 +297,22 @@ if st.button('Run training'):
285
  print(stats.pearsonr(test_errors, target_test))
286
 
287
  st.title("Organizations view")
288
- col1, col2, col3 = st.columns(3)
 
289
  to_predict = {}
290
  with col1:
291
  to_predict['log10_org_revenues'] = [np.log10(st.number_input('Yearly revenues', value=100000000))]
 
292
  for col in col_cat:
293
  to_predict[col] = [st.selectbox(f'{col}', predictors[col].cat.categories)]
294
- print(to_predict)
295
 
296
- df_to_predict = prepare_predictors(pd.DataFrame.from_dict(to_predict), col_num, col_cat)
297
- if xgb_model:
298
- predicted = predict(xgb_model, df_to_predict)
 
 
 
 
299
  print(predicted)
 
 
 
1
  # -*- coding: utf-8 -*-
2
+ import pickle
3
  import pandas as pd
4
  import streamlit as st
5
  from scipy import stats
 
10
  import scipy
11
  import numpy as np
12
  from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
13
+ from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, automl_training
14
 
15
 
16
  def _max_width_():
 
61
 
62
  st.metric('Number of validated decisions linked to organisations (and not individuals)', decision_scope.shape[0])
63
 
64
+
65
  st.metric('Decisions with monetary sanctions',
66
  decision_scope[decision_scope.monetary_sanction > 0].shape[0])
67
 
 
166
  ####
167
  ##############################################
168
  st.title("Training phase")
 
169
  col_num_all = ['log10_org_revenues',
170
  'time']
171
  col_cat_all = ['authorities_country',
 
173
  'violation_theme',
174
  'justice_type',
175
  'org_country',
 
176
  'org_continent',
177
  'same_country',
178
  'org_company_type']
179
 
180
  st.sidebar.title("Training params")
181
  col_num = st.sidebar.multiselect('Numeric variables',
182
+ col_num_all, default=col_num_all)
183
  col_cat = st.sidebar.multiselect('Categorical variables',
184
+ col_cat_all, default=col_cat_all)
185
  # train the model
186
  predictors, target = prepare_data(monetary_decision, col_num, col_cat)
187
  if st.button('Run training'):
 
189
  # Study distribution
190
  st.write(f"dataset size: {monetary_decision.shape[0]}")
191
  st.markdown("Plot target distribution: log 10 of monetary sanctions")
192
+ fig = ff.create_distplot([target], ['log 10 of monetary sanctions'], bin_size=0.05)
193
  fig.update_layout(width=1000,
194
  template="simple_white",
195
  height=600,
 
197
  st.plotly_chart(fig)
198
 
199
  # Split data set
200
+ predictors_train, predictors_test, target_train, target_test = split(predictors, target, test_size=0.05)
201
  st.subheader("Split dataset between training and test:")
202
  st.metric(label="Training size", value=predictors_train.shape[0])
203
  st.metric(label="Test size", value=predictors_test.shape[0])
 
205
  # Run cross validation
206
  st.subheader("Cross validation error")
207
  with st.spinner('Wait for it...'):
208
+ #xgb_cv, best_params = run_cv_training(predictors_train, target_train)
209
+
210
+ #st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
211
+ #st.subheader("Selected variables")
212
+ #st.json(best_params)
213
 
214
+ # Train final
215
+ #xgb_model = run_training(predictors_train, target_train, best_params["params"], best_params["best_round"])
 
216
 
217
+ xgb_model = automl_training(predictors_train, target_train)
218
+ # save model to file
219
+ pickle.dump(xgb_model, open("xgb_model.pickle.dat", "wb"))
220
 
221
+ # Evaluate model error
222
+ #target_train_predicted = predict(xgb_model, predictors_train)
223
+ target_train_predicted = xgb_model.predict(predictors_train)
224
+ training_bias = np.mean(target_train_predicted - target_train)
225
+ st.metric(label="Training bias", value=training_bias)
226
 
227
+ #target_test_predicted = predict(xgb_model, predictors_test)
228
+ target_test_predicted = xgb_model.predict(predictors_test)
229
+ test_errors = target_test_predicted - target_test
230
+ test_bias = np.mean(test_errors)
231
+ st.metric(label="Test bias", value=test_bias)
232
 
233
  fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
234
  fig.update_layout(width=1000,
 
238
  st.plotly_chart(fig)
239
 
240
  st.subheader("Plot features importance for the trained model")
241
+ print("predictors_train shape: ", predictors_train.columns)
242
+ xgb_features_importance = pd.DataFrame([xgb_model.model.estimator.feature_importances_],
243
+ columns=predictors_train.columns)
244
+ print(xgb_features_importance)
245
+ #st.dataframe(xgb_features_importance)
246
+
247
+ # xgb_features_importance = features_importance(xgb_model)
248
+ #
249
+ fig = px.bar(xgb_features_importance.T,
250
  orientation='h',
251
  width=1000,
252
  template="simple_white",
 
297
  print(stats.pearsonr(test_errors, target_test))
298
 
299
  st.title("Organizations view")
300
+ prediction_model = pickle.load(open("xgb_model.pickle.dat", "rb"))
301
+ col1, _, _ = st.columns(3)
302
  to_predict = {}
303
  with col1:
304
  to_predict['log10_org_revenues'] = [np.log10(st.number_input('Yearly revenues', value=100000000))]
305
+ to_predict['time'] = 0
306
  for col in col_cat:
307
  to_predict[col] = [st.selectbox(f'{col}', predictors[col].cat.categories)]
 
308
 
309
+ df_to_predict = prepare_predictors(pd.DataFrame.from_dict(to_predict), col_num, col_cat)
310
+ st.dataframe(df_to_predict)
311
+
312
+ if prediction_model:
313
+ try:
314
+ predicted = prediction_model.predict(df_to_predict)
315
+ st.metric(label="Monetary sanction prediction", value=f"{'{:,.2f}'.format(10**(predicted[0]-3))} K$")
316
  print(predicted)
317
+ except ValueError:
318
+ st.subheader("You need to rerun training !")