Merge pull request #4 from THEOLEX-IO/report

Files changed:
- .gitignore +2 -1
- data_processing.py +3 -1
- exploration_app.py +94 -0
- model.py +33 -14
- stream_app.py +48 -29
.gitignore CHANGED

@@ -117,4 +117,5 @@ back/
 esdata
 
 #data
-*.csv
+*.csv
+*.dat
data_processing.py CHANGED

@@ -84,8 +84,10 @@ def get_monetary_dataframe(decision_scope):
     monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
     monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
     monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
-    time = round((…
+    time = round((pd.to_datetime('today').date() - monetary_decision.decision_date) / np.timedelta64(1, "M"))
     monetary_decision['time'] = time
+    # sort by date
+    monetary_decision = monetary_decision.sort_values('time', ascending=False).reset_index(drop=True)
     return monetary_decision
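A note on the new `time` feature above: it converts each decision's age into months by dividing a timedelta by `np.timedelta64(1, "M")`. That works on the pandas version this repo targets, but recent pandas releases reject the 'M' unit in timedelta arithmetic. A minimal, version-safe sketch of the same month count, with made-up dates standing in for monetary_decision.decision_date:

import pandas as pd

# hypothetical stand-in for monetary_decision.decision_date
decision_date = pd.Series(pd.to_datetime(["2020-01-15", "2021-06-01"]))

today = pd.Timestamp.today()
# whole calendar months elapsed, computed from year/month fields so no
# 'M' timedelta unit is needed
age_months = (today.year - decision_date.dt.year) * 12 + (today.month - decision_date.dt.month)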
exploration_app.py ADDED

@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+import pickle
+import pandas as pd
+import streamlit as st
+from scipy import stats
+
+import plotly.express as px
+import plotly.figure_factory as ff
+
+import scipy
+import numpy as np
+from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
+from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, \
+    automl_training
+
+
+def _max_width_():
+    max_width_str = f"max-width: 1500px;"
+    st.markdown(
+        f"""
+        <style>
+        .reportview-container .main .block-container{{
+            {max_width_str}
+        }}
+        </style>
+        """,
+        unsafe_allow_html=True,
+    )
+
+
+# force screen width
+_max_width_()
+
+st.title("Data Analysis 🌎 📃")
+st.write("by [Theolex](https://www.theolex.io/)")
+
+# load and process data
+data = load_data()
+decisions, organizations, authorities = process_data(data)
+
+col1, col2, col3, col4 = st.columns(4)
+
+with col1:
+    authorities_country = st.selectbox('Authority country', authorities.country.unique())
+
+with col2:
+    nb_years = st.selectbox('Number of years', range(1, 11), 4)
+
+with col3:
+    list_continents = decisions.org_continent.unique().tolist()
+    org_continent = st.selectbox("Company's continent", list_continents, list_continents.index("europe"))
+
+with col4:
+    list_company_types = decisions.org_company_type.unique().tolist()
+    org_company_type = st.selectbox("Company's activity", list_company_types,
+                                    list_company_types.index("Banking & Finance"))
+
+st.subheader(f"Which {authorities_country} regulators and prosecutors have been "
+             f"the most active in enforcement actions against {org_continent} "
+             f"{org_company_type} companies in the last {nb_years} years?")
+
+# apply filters
+select_auth = authorities[authorities.country == authorities_country].name.sort_values()
+authority_filter = decisions.authorities_name.apply(lambda a: bool(set(select_auth) & set(a)))
+year_filter = (decisions.year >= (2021 - nb_years))
+org_continent_filter = (decisions.org_continent == org_continent)
+org_company_type_filter = (decisions.org_company_type == org_company_type)
+decision_scope = decisions[authority_filter & year_filter & org_continent_filter & org_company_type_filter]
+
+decision_scope = decision_scope.explode("authorities_name")
+top_auths = decision_scope.groupby(['authorities_name'])['authorities_name'].count().sort_values(ascending=False).head(
+    5)
+
+fig = px.bar(top_auths,
+             template="simple_white",
+             color_continuous_scale='RdBu',
+             width=1200, height=600)
+st.plotly_chart(fig)
+
+with st.expander("Explore cases"):
+    st.dataframe(decision_scope[['authorities_name', 'org_name', 'decision_date', 'monetary_sanction', 'org_country',
+                                 'org_company_type']])
+
+# st.subheader("What are the top 10 negotiated settlements in France "
+#              "(involving French or foreign authorities) in the last 5 years?")
+#
+# st.subheader(
+#     "What are the top 3 areas (sanctions, anti-corruption, fraud, market manipulation, tax, etc.) "
+#     "of enforcement against banks in Germany in the last 3 years?")
+#
+# st.subheader("What are the largest enforcement actions involving French banks in the last 5 years?")
+#
+# st.subheader(
+#     "Which US regulators have imposed the largest penalties against financial institutions in the last 3 years?")
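A note on the filter logic in this new app: `authorities_name` holds a list of names per decision, so a row matches when its list shares at least one name with the selected authorities, and `explode` then emits one row per (decision, authority) pair, which makes the top-5 `groupby` count straightforward. A minimal sketch with made-up rows:

import pandas as pd

decisions = pd.DataFrame({
    "org_name": ["A Corp", "B Bank", "C Ltd"],
    "authorities_name": [["SEC", "DOJ"], ["AMF"], ["SEC"]],
})
select_auth = {"SEC", "FCA"}

# keep decisions involving at least one selected authority
mask = decisions.authorities_name.apply(lambda names: bool(select_auth & set(names)))
scope = decisions[mask].explode("authorities_name")

# one count per authority, as in the top-5 bar chart
counts = scope.groupby("authorities_name")["authorities_name"].count()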
model.py CHANGED

@@ -1,9 +1,9 @@
 import itertools
-import numpy as np
 import pandas as pd
 import xgboost as xgb
 from xgboost import cv
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import TimeSeriesSplit, train_test_split
+from flaml import AutoML
 
 
 def prepare_predictors(monetary_decision, col_num, col_cat):

@@ -20,19 +20,25 @@ def prepare_data(monetary_decision, col_num, col_cat):
     return predictors, target
 
 
-def split(predictors, target):
-    …
-    …
-    …
-    …
-    …
+def split(predictors, target, test_size=0.2):
+    df_len = len(target)
+    assert df_len == predictors.shape[0]
+    nb_test = round(test_size * df_len)
+    nb_train = df_len - nb_test
+    predictors_train = predictors.head(nb_train)
+    predictors_test = predictors.tail(nb_test)
+    target_train = target.head(nb_train)
+    target_test = target_train.tail(nb_test)
+    predictors_train, predictors_test, target_train, target_test = train_test_split(predictors, target, test_size=0.2,
+                                                                                    random_state=42)
+    return predictors_train.reset_index(drop=True), predictors_test, target_train, target_test
 
 
 def run_cv_training(predictors_train, target_train):
     data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
     xgb_csv = []
-    best_params = …
-    for eta, max_depth, col_num in itertools.product([0.05, 0.01], […
+    best_params = {"best_value": 100, "params": {}, "best_round": 10}
+    for eta, max_depth, col_num in itertools.product([0.05, 0.01], [5, 15], [0.3, 0.5]):
         prefix = f"{str(eta)}_{str(max_depth)}_{str(col_num)}"
         params = {
             'learning_rate': eta,

@@ -41,14 +47,18 @@ def run_cv_training(predictors_train, target_train):
             # 'gamma': 0.5,
             'subsample': 0.8,
             'objective': 'reg:squarederror'}
-        …
-        …
+        # build CV folds
+        nb_folds = 4
+        folds = TimeSeriesSplit(n_splits=nb_folds)
+        cv_results = cv(dtrain=data_train, params=params, folds=folds,
+                        num_boost_round=1000, early_stopping_rounds=3, metrics="rmse",
+                        as_pandas=True, seed=50)
         best_value = cv_results['test-rmse-mean'].values[-1]
         best_round = cv_results.index[-1]
         xgb_csv.append(
             cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns}).tail(10).reset_index())
-        if best_value < best_params[…
-        best_params = …
+        if best_value < best_params["best_value"]:
+            best_params = {"best_value": best_value, "params": params, "best_round": best_round}
 
     return pd.concat(xgb_csv, axis=1), best_params

@@ -58,6 +68,15 @@ def run_training(predictors_train, target_train, params, num_rounds):
     return xgb.train(params, data_train, num_rounds)
 
 
+def automl_training(predictors_train, target_train):
+    automl = AutoML()
+    automl.fit(predictors_train, target_train,
+               task="regression",
+               estimator_list=["lgbm"],
+               split_type="auto")
+    return automl
+
+
 def predict(model, predictors):
     data = xgb.DMatrix(predictors, enable_categorical=True)
     return model.predict(data)
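A note on the cross-validation change above: `TimeSeriesSplit` assumes rows are in chronological order and builds expanding-window folds in which each validation block sits after its training block, and `xgb.cv` accepts such a splitter through its `folds` argument, as the diff does. A minimal sketch of the fold layout on dummy data (12 ordered rows, 4 splits):

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(12).reshape(-1, 1)  # 12 rows in chronological order

# each fold trains on a prefix of history and validates on the block after it
for train_idx, test_idx in TimeSeriesSplit(n_splits=4).split(X):
    print(train_idx, test_idx)
# [0 1 2 3] [4 5]
# [0 1 2 3 4 5] [6 7]
# [0 1 2 3 4 5 6 7] [8 9]
# [0 1 2 3 4 5 6 7 8 9] [10 11]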
stream_app.py CHANGED

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import pickle
 import pandas as pd
 import streamlit as st
 from scipy import stats

@@ -9,7 +10,7 @@ import plotly.figure_factory as ff
 import scipy
 import numpy as np
 from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
-from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training
+from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, automl_training
 
 
 def _max_width_():

@@ -60,6 +61,7 @@ st.subheader("Dataset Description")
 
 st.metric('Number of validated decisions linked to organisations (and not individuals)', decision_scope.shape[0])
 
+
 st.metric('Decisions with monetary sanctions',
           decision_scope[decision_scope.monetary_sanction > 0].shape[0])
 

@@ -164,7 +166,6 @@ with st.expander("Data exploration"):
 ####
 ##############################################
 st.title("Training phase")
-xgb_model = None
 col_num_all = ['log10_org_revenues',
                'time']
 col_cat_all = ['authorities_country',

@@ -172,16 +173,15 @@ col_cat_all = ['authorities_country',
                'violation_theme',
                'justice_type',
                'org_country',
-               'org_currency',
                'org_continent',
                'same_country',
                'org_company_type']
 
 st.sidebar.title("Training params")
 col_num = st.sidebar.multiselect('Numeric variables',
-                                 col_num_all, col_num_all)
+                                 col_num_all, default=col_num_all)
 col_cat = st.sidebar.multiselect('Categorical variables',
-                                 col_cat_all, col_cat_all)
+                                 col_cat_all, default=col_cat_all)
 # train the model
 predictors, target = prepare_data(monetary_decision, col_num, col_cat)
 if st.button('Run training'):

@@ -189,7 +189,7 @@ if st.button('Run training'):
     # Study distribution
     st.write(f"dataset size: {monetary_decision.shape[0]}")
     st.markdown("Plot target distribution: log 10 of monetary sanctions")
-    fig = ff.create_distplot([target], ['log 10 of monetary sanctions'], bin_size=0.…
+    fig = ff.create_distplot([target], ['log 10 of monetary sanctions'], bin_size=0.05)
     fig.update_layout(width=1000,
                       template="simple_white",
                       height=600,

@@ -197,7 +197,7 @@ if st.button('Run training'):
     st.plotly_chart(fig)
 
     # Split data set
-    predictors_train, predictors_test, target_train, target_test = split(predictors, target)
+    predictors_train, predictors_test, target_train, target_test = split(predictors, target, test_size=0.05)
     st.subheader("Split dataset between training and test:")
     st.metric(label="Training size", value=predictors_train.shape[0])
     st.metric(label="Test size", value=predictors_test.shape[0])

@@ -205,24 +205,30 @@ if st.button('Run training'):
     # Run cross validation
     st.subheader("Cross validation error")
     with st.spinner('Wait for it...'):
-        xgb_cv, best_params = run_cv_training(predictors_train, target_train)
+        #xgb_cv, best_params = run_cv_training(predictors_train, target_train)
 
-    st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
-    st.subheader("Selected variables")
-    st.json(best_params)
+    #st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
+    #st.subheader("Selected variables")
+    #st.json(best_params)
 
-    # Train final
-    xgb_model = run_training(predictors_train, target_train, best_params["params"], best_params["best_round"])
+    # Train final
+    #xgb_model = run_training(predictors_train, target_train, best_params["params"], best_params["best_round"])
 
-    # Evaluate model error
-    target_train_predicted = predict(xgb_model, predictors_train)
-    training_bias = np.mean(target_train_predicted - target_train)
-    st.metric(label="Training bias", value=training_bias)
+    xgb_model = automl_training(predictors_train, target_train)
+    # save model to file
+    pickle.dump(xgb_model, open("xgb_model.pickle.dat", "wb"))
 
-    target_test_predicted = predict(xgb_model, predictors_test)
-    test_errors = target_test_predicted - target_test
-    test_bias = np.mean(test_errors)
-    st.metric(label="Test bias", value=test_bias)
+    # Evaluate model error
+    #target_train_predicted = predict(xgb_model, predictors_train)
+    target_train_predicted = xgb_model.predict(predictors_train)
+    training_bias = np.mean(target_train_predicted - target_train)
+    st.metric(label="Training bias", value=training_bias)
+
+    #target_test_predicted = predict(xgb_model, predictors_test)
+    target_test_predicted = xgb_model.predict(predictors_test)
+    test_errors = target_test_predicted - target_test
+    test_bias = np.mean(test_errors)
+    st.metric(label="Test bias", value=test_bias)
 
     fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
     fig.update_layout(width=1000,

@@ -232,9 +238,15 @@ if st.button('Run training'):
     st.plotly_chart(fig)
 
     st.subheader("Plot features importance for the trained model")
-    xgb_features_importance = features_importance(xgb_model)
-
-    fig = px.bar(xgb_features_importance.T,
+    print("predictors_train shape: ", predictors_train.columns)
+    xgb_features_importance = pd.DataFrame([xgb_model.model.estimator.feature_importances_],
+                                           columns=predictors_train.columns)
+    print(xgb_features_importance)
+    #st.dataframe(xgb_features_importance)
+
+    # xgb_features_importance = features_importance(xgb_model)
+    #
+    fig = px.bar(xgb_features_importance.T,
                  orientation='h',
                  width=1000,
                  template="simple_white",

@@ -285,15 +297,22 @@ if st.button('Run training'):
     print(stats.pearsonr(test_errors, target_test))
 
 st.title("Organizations view")
-…
+prediction_model = pickle.load(open("xgb_model.pickle.dat", "rb"))
+col1, _, _ = st.columns(3)
 to_predict = {}
 with col1:
     to_predict['log10_org_revenues'] = [np.log10(st.number_input('Yearly revenues', value=100000000))]
+    to_predict['time'] = 0
 for col in col_cat:
     to_predict[col] = [st.selectbox(f'{col}', predictors[col].cat.categories)]
-    print(to_predict)
 
-…
-…
-…
+df_to_predict = prepare_predictors(pd.DataFrame.from_dict(to_predict), col_num, col_cat)
+st.dataframe(df_to_predict)
+
+if prediction_model:
+    try:
+        predicted = prediction_model.predict(df_to_predict)
+        st.metric(label="Monetary sanction prediction", value=f"{'{:,.2f}'.format(10**(predicted[0]-3))} K$")
         print(predicted)
+    except ValueError:
+        st.subheader("You need to rerun training !")