Spaces:

Theolex
/

theolex_streamlit

Runtime error

App Files Files Community

JawadRouen committed on Dec 18, 2021

Commit

b0bca26

unverified ·

2 Parent(s): f9dedaa a61cdb6

Merge pull request #4 from THEOLEX-IO/report

Browse files

Files changed (5) hide show

.gitignore +2 -1
data_processing.py +3 -1
exploration_app.py +94 -0
model.py +33 -14
stream_app.py +48 -29

.gitignore CHANGED Viewed

@@ -117,4 +117,5 @@ back/
 esdata
 #data
-*.csv

 esdata
 #data
+*.csv
+*.dat

data_processing.py CHANGED Viewed

@@ -84,8 +84,10 @@ def get_monetary_dataframe(decision_scope):
     monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
     monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
     monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
-    time = round((monetary_decision.decision_date - monetary_decision.decision_date.min()) / np.timedelta64(1, "M"))
     monetary_decision['time'] = time
     return monetary_decision

     monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
     monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
     monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
+    time = round((pd.to_datetime('today').date() - monetary_decision.decision_date) / np.timedelta64(1, "M"))
     monetary_decision['time'] = time
+    # sort by date
+    monetary_decision = monetary_decision.sort_values('time', ascending=False).reset_index(drop=True)
     return monetary_decision

exploration_app.py ADDED Viewed

	@@ -0,0 +1,94 @@

+# -*- coding: utf-8 -*-
+import pickle
+import pandas as pd
+import streamlit as st
+from scipy import stats
+import plotly.express as px
+import plotly.figure_factory as ff
+import scipy
+import numpy as np
+from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
+from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, \
+    automl_training
+def _max_width_():
+    """Inject a <style> block that sets max-width: 1500px on the report view's block container."""
+    width_rule = "max-width: 1500px;"
+    # Assembled line by line; the resulting markup is identical to a single triple-quoted template.
+    css = (
+        "\n    <style>\n"
+        "    .reportview-container .main .block-container{\n"
+        f"        {width_rule}\n"
+        "    }\n"
+        "    </style>\n"
+        "    "
+    )
+    st.markdown(css, unsafe_allow_html=True)
+def _default_index(options, preferred):
+    """Return the position of `preferred` in `options`, or 0 when it is absent."""
+    return options.index(preferred) if preferred in options else 0
+# force screen width
+_max_width_()
+st.title("Data Analysis 🌎 📃")
+st.write("by [Theolex](https://www.theolex.io/)")
+# load and process data
+data = load_data()
+decisions, organizations, authorities = process_data(data)
+# one selector per column: authority country, look-back window, company continent, company activity
+col1, col2, col3, col4 = st.columns(4)
+with col1:
+    authorities_country = st.selectbox('Authority country', authorities.country.unique())
+with col2:
+    # index 4 preselects the value 5 in range(1, 11)
+    nb_years = st.selectbox('Number of years', range(1, 11), 4)
+with col3:
+    list_continents = decisions.org_continent.unique().tolist()
+    # fall back to the first option instead of raising ValueError when "europe" is not in the data
+    org_continent = st.selectbox("Company's continent", list_continents,
+                                 _default_index(list_continents, "europe"))
+with col4:
+    list_company_types = decisions.org_company_type.unique().tolist()
+    # same fallback for the preferred default activity
+    org_company_type = st.selectbox("Company's activity", list_company_types,
+                                    _default_index(list_company_types, "Banking & Finance"))
+st.subheader(f"Which {authorities_country} regulators and prosecutors have been "
+             f"the most active in enforcement actions against {org_continent} "
+             f"{org_company_type} companies in the last {nb_years} years?")
+# apply filters
+select_auth = authorities[authorities.country == authorities_country].name.sort_values()
+# build the lookup set once rather than once per decision row
+select_auth_names = set(select_auth)
+# keep decisions whose authorities_name collection shares at least one name with the selected country
+authority_filter = decisions.authorities_name.apply(lambda a: bool(select_auth_names & set(a)))
+# "the last N years" is relative to the current year rather than a hard-coded 2021
+current_year = pd.Timestamp.today().year
+year_filter = (decisions.year >= (current_year - nb_years))
+org_continent_filter = (decisions.org_continent == org_continent)
+org_company_type_filter = (decisions.org_company_type == org_company_type)
+decision_scope = decisions[authority_filter & year_filter & org_continent_filter & org_company_type_filter]
+# one row per (decision, authority) so that each authority is counted separately
+decision_scope = decision_scope.explode("authorities_name")
+top_auths = decision_scope.groupby(['authorities_name'])['authorities_name'].count().sort_values(ascending=False).head(
+    5)
+fig = px.bar(top_auths,
+             template="simple_white",
+             color_continuous_scale='RdBu',
+             width=1200, height=600)
+st.plotly_chart(fig)
+with st.expander("Explore cases"):
+    st.dataframe(decision_scope[['authorities_name', 'org_name', 'decision_date', 'monetary_sanction', 'org_country',
+                                 'org_company_type']])
+# st.subheader("What are the top 10 negotiated settlements in France "
+#              "(involving French or foreign authorities) in the last 5 years?")
+#
+# st.subheader(
+#     "What are the top 3 areas (sanctions, anti-corruption, fraud, market manipulation, tax, etc.) "
+#     "of enforcement against banks in Germany in the last 3 years?")
+#
+# st.subheader("What are the largest enforcement actions involving French banks in the last 5 years?")
+#
+# st.subheader(
+#     "Which US regulators have imposed the largest penalties against financial institutions in the last 3 years?")

model.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import itertools
-import numpy as np
 import pandas as pd
 import xgboost as xgb
 from xgboost import cv
-from sklearn.model_selection import train_test_split
 def prepare_predictors(monetary_decision, col_num, col_cat):
@@ -20,19 +20,25 @@ def prepare_data(monetary_decision, col_num, col_cat):
     return predictors, target
-def split(predictors, target):
-    predictors_train, predictors_test, target_train, target_test = train_test_split(predictors,
-                                                                                    target,
-                                                                                    test_size=0.2,
-                                                                                    random_state=50)
-    return predictors_train, predictors_test, target_train, target_test
 def run_cv_training(predictors_train, target_train):
     data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
     xgb_csv = []
-    best_params = (100, {}, 10)
-    for eta, max_depth, col_num in itertools.product([0.05, 0.01], [10, 15], [0.3, 0.8]):
         prefix = f"{str(eta)}_{str(max_depth)}_{str(col_num)}"
         params = {
             'learning_rate': eta,
@@ -41,14 +47,18 @@ def run_cv_training(predictors_train, target_train):
             # 'gamma': 0.5,
             'subsample': 0.8,
             'objective': 'reg:squarederror'}
-        cv_results = cv(dtrain=data_train, params=params, nfold=2,
-                        num_boost_round=1000, early_stopping_rounds=3, metrics="rmse", as_pandas=True, seed=50)
         best_value = cv_results['test-rmse-mean'].values[-1]
         best_round = cv_results.index[-1]
         xgb_csv.append(
             cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns}).tail(10).reset_index())
-        if best_value < best_params[0]:
-            best_params = (best_value, params, best_round)
     return pd.concat(xgb_csv, axis=1), best_params
@@ -58,6 +68,15 @@ def run_training(predictors_train, target_train, params, num_rounds):
     return xgb.train(params, data_train, num_rounds)
 def predict(model, predictors):
     data = xgb.DMatrix(predictors, enable_categorical=True)
     return model.predict(data)

 import itertools
 import pandas as pd
 import xgboost as xgb
 from xgboost import cv
+from sklearn.model_selection import TimeSeriesSplit, train_test_split
+from flaml import AutoML
 def prepare_predictors(monetary_decision, col_num, col_cat):
     return predictors, target
+def split(predictors, target, test_size=0.2):
+    """Randomly split predictors and target into training and test subsets.
+
+    Args:
+        predictors: feature table; must have as many rows as `target`.
+        target: values to predict, aligned row by row with `predictors`.
+        test_size: share of the rows held out for testing (default 0.2).
+
+    Returns:
+        A tuple (predictors_train, predictors_test, target_train, target_test).
+        As before, only predictors_train is returned with a fresh 0..n-1 index.
+
+    Raises:
+        ValueError: if `predictors` and `target` do not have the same number of rows.
+    """
+    if len(target) != predictors.shape[0]:
+        raise ValueError("predictors and target must have the same number of rows")
+    # The requested test_size is now forwarded; the split used to be fixed at 20%
+    # regardless of the argument. The seed stays fixed so reruns give the same split.
+    predictors_train, predictors_test, target_train, target_test = train_test_split(predictors, target,
+                                                                                    test_size=test_size,
+                                                                                    random_state=42)
+    return predictors_train.reset_index(drop=True), predictors_test, target_train, target_test
 def run_cv_training(predictors_train, target_train):
     data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
     xgb_csv = []
+    best_params = {"best_value": 100, "params": {}, "best_round": 10}
+    for eta, max_depth, col_num in itertools.product([0.05, 0.01], [5, 15], [0.3, 0.5]):
         prefix = f"{str(eta)}_{str(max_depth)}_{str(col_num)}"
         params = {
             'learning_rate': eta,
             # 'gamma': 0.5,
             'subsample': 0.8,
             'objective': 'reg:squarederror'}
+        # build CV folds
+        nb_folds = 4
+        folds = TimeSeriesSplit(n_splits=nb_folds)
+        cv_results = cv(dtrain=data_train, params=params, folds=folds,
+                        num_boost_round=1000, early_stopping_rounds=3, metrics="rmse",
+                        as_pandas=True, seed=50)
         best_value = cv_results['test-rmse-mean'].values[-1]
         best_round = cv_results.index[-1]
         xgb_csv.append(
             cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns}).tail(10).reset_index())
+        if best_value < best_params["best_value"]:
+            best_params = {"best_value": best_value, "params": params, "best_round": best_round}
     return pd.concat(xgb_csv, axis=1), best_params
     return xgb.train(params, data_train, num_rounds)
+def automl_training(predictors_train, target_train):
+    """Fit an AutoML regression search limited to the "lgbm" estimator and return the fitted object."""
+    fit_settings = {
+        "task": "regression",
+        "estimator_list": ["lgbm"],
+        "split_type": "auto",
+    }
+    learner = AutoML()
+    learner.fit(predictors_train, target_train, **fit_settings)
+    return learner
 def predict(model, predictors):
     data = xgb.DMatrix(predictors, enable_categorical=True)
     return model.predict(data)

stream_app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 import pandas as pd
 import streamlit as st
 from scipy import stats
@@ -9,7 +10,7 @@ import plotly.figure_factory as ff
 import scipy
 import numpy as np
 from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
-from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training
 def _max_width_():
@@ -60,6 +61,7 @@ st.subheader("Dataset Description")
 st.metric('Number of validated decisions linked to organisations (and not individuals)', decision_scope.shape[0])
 st.metric('Decisions with monetary sanctions',
           decision_scope[decision_scope.monetary_sanction > 0].shape[0])
@@ -164,7 +166,6 @@ with st.expander("Data exploration"):
 ####
 ##############################################
 st.title("Training phase")
-xgb_model = None
 col_num_all = ['log10_org_revenues',
                'time']
 col_cat_all = ['authorities_country',
@@ -172,16 +173,15 @@ col_cat_all = ['authorities_country',
                'violation_theme',
                'justice_type',
                'org_country',
-               'org_currency',
                'org_continent',
                'same_country',
                'org_company_type']
 st.sidebar.title("Training params")
 col_num = st.sidebar.multiselect('Numeric variables',
-                                 col_num_all, col_num_all)
 col_cat = st.sidebar.multiselect('Categorical variables',
-                                 col_cat_all, col_cat_all)
 # train the model
 predictors, target = prepare_data(monetary_decision, col_num, col_cat)
 if st.button('Run training'):
@@ -189,7 +189,7 @@ if st.button('Run training'):
         # Study distribution
         st.write(f"dataset size: {monetary_decision.shape[0]}")
         st.markdown("Plot target distribution: log 10 of monetary sanctions")
-        fig = ff.create_distplot([target], ['log 10 of monetary sanctions'], bin_size=0.1)
         fig.update_layout(width=1000,
                           template="simple_white",
                           height=600,
@@ -197,7 +197,7 @@ if st.button('Run training'):
         st.plotly_chart(fig)
         # Split data set
-        predictors_train, predictors_test, target_train, target_test = split(predictors, target)
         st.subheader("Split dataset between training and test:")
         st.metric(label="Training size", value=predictors_train.shape[0])
         st.metric(label="Test size", value=predictors_test.shape[0])
@@ -205,24 +205,30 @@ if st.button('Run training'):
         # Run cross validation
         st.subheader("Cross validation error")
         with st.spinner('Wait for it...'):
-            xgb_cv, best_params = run_cv_training(predictors_train, target_train)
-        st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
-        st.subheader("Selected variables")
-        st.json(best_params)
-        # Train final model
-        xgb_model = run_training(predictors_train, target_train, best_params[1], best_params[2])
-        # Evaluate model error
-        target_train_predicted = predict(xgb_model, predictors_train)
-        training_bias = np.mean(target_train_predicted - target_train)
-        st.metric(label="Training bias", value=training_bias)
-        target_test_predicted = predict(xgb_model, predictors_test)
-        test_errors = target_test_predicted - target_test
-        test_bias = np.mean(test_errors)
-        st.metric(label="Test bias", value=test_bias)
         fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
         fig.update_layout(width=1000,
@@ -232,9 +238,15 @@ if st.button('Run training'):
         st.plotly_chart(fig)
         st.subheader("Plot features importance for the trained model")
-        xgb_features_importance = features_importance(xgb_model)
-        fig = px.bar(xgb_features_importance,
                      orientation='h',
                      width=1000,
                      template="simple_white",
@@ -285,15 +297,22 @@ if st.button('Run training'):
         print(stats.pearsonr(test_errors, target_test))
 st.title("Organizations view")
-col1, col2, col3 = st.columns(3)
 to_predict = {}
 with col1:
     to_predict['log10_org_revenues'] = [np.log10(st.number_input('Yearly revenues', value=100000000))]
     for col in col_cat:
         to_predict[col] = [st.selectbox(f'{col}', predictors[col].cat.categories)]
-    print(to_predict)
-    df_to_predict = prepare_predictors(pd.DataFrame.from_dict(to_predict), col_num, col_cat)
-    if xgb_model:
-        predicted = predict(xgb_model, df_to_predict)
         print(predicted)

 # -*- coding: utf-8 -*-
+import pickle
 import pandas as pd
 import streamlit as st
 from scipy import stats
 import scipy
 import numpy as np
 from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
+from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, automl_training
 def _max_width_():
 st.metric('Number of validated decisions linked to organisations (and not individuals)', decision_scope.shape[0])
 st.metric('Decisions with monetary sanctions',
           decision_scope[decision_scope.monetary_sanction > 0].shape[0])
 ####
 ##############################################
 st.title("Training phase")
 col_num_all = ['log10_org_revenues',
                'time']
 col_cat_all = ['authorities_country',
                'violation_theme',
                'justice_type',
                'org_country',
                'org_continent',
                'same_country',
                'org_company_type']
 st.sidebar.title("Training params")
 col_num = st.sidebar.multiselect('Numeric variables',
+                                 col_num_all, default=col_num_all)
 col_cat = st.sidebar.multiselect('Categorical variables',
+                                 col_cat_all, default=col_cat_all)
 # train the model
 predictors, target = prepare_data(monetary_decision, col_num, col_cat)
 if st.button('Run training'):
         # Study distribution
         st.write(f"dataset size: {monetary_decision.shape[0]}")
         st.markdown("Plot target distribution: log 10 of monetary sanctions")
+        fig = ff.create_distplot([target], ['log 10 of monetary sanctions'], bin_size=0.05)
         fig.update_layout(width=1000,
                           template="simple_white",
                           height=600,
         st.plotly_chart(fig)
         # Split data set
+        predictors_train, predictors_test, target_train, target_test = split(predictors, target, test_size=0.05)
         st.subheader("Split dataset between training and test:")
         st.metric(label="Training size", value=predictors_train.shape[0])
         st.metric(label="Test size", value=predictors_test.shape[0])
         # Run cross validation
         st.subheader("Cross validation error")
         with st.spinner('Wait for it...'):
+            #xgb_cv, best_params = run_cv_training(predictors_train, target_train)
+            #st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
+            #st.subheader("Selected variables")
+            #st.json(best_params)
+            # Train final
+            #xgb_model = run_training(predictors_train, target_train, best_params["params"], best_params["best_round"])
+            xgb_model = automl_training(predictors_train, target_train)
+            # save model to file
+            pickle.dump(xgb_model, open("xgb_model.pickle.dat", "wb"))
+            # Evaluate model error
+            #target_train_predicted = predict(xgb_model, predictors_train)
+            target_train_predicted = xgb_model.predict(predictors_train)
+            training_bias = np.mean(target_train_predicted - target_train)
+            st.metric(label="Training bias", value=training_bias)
+            #target_test_predicted = predict(xgb_model, predictors_test)
+            target_test_predicted = xgb_model.predict(predictors_test)
+            test_errors = target_test_predicted - target_test
+            test_bias = np.mean(test_errors)
+            st.metric(label="Test bias", value=test_bias)
         fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
         fig.update_layout(width=1000,
         st.plotly_chart(fig)
         st.subheader("Plot features importance for the trained model")
+        print("predictors_train shape: ", predictors_train.columns)
+        xgb_features_importance = pd.DataFrame([xgb_model.model.estimator.feature_importances_],
+                                               columns=predictors_train.columns)
+        print(xgb_features_importance)
+        #st.dataframe(xgb_features_importance)
+        # xgb_features_importance = features_importance(xgb_model)
+        #
+        fig = px.bar(xgb_features_importance.T,
                      orientation='h',
                      width=1000,
                      template="simple_white",
         print(stats.pearsonr(test_errors, target_test))
 st.title("Organizations view")
+# The pickle is only written by a training run, and *.dat files are git-ignored,
+# so on a fresh checkout the file does not exist yet: fall back to "no model"
+# instead of crashing the whole app while the script is loading.
+try:
+    with open("xgb_model.pickle.dat", "rb") as model_file:
+        # NOTE(review): pickle.load can execute arbitrary code; only load files written by this app.
+        prediction_model = pickle.load(model_file)
+except FileNotFoundError:
+    prediction_model = None
+col1, _, _ = st.columns(3)
 to_predict = {}
 with col1:
     to_predict['log10_org_revenues'] = [np.log10(st.number_input('Yearly revenues', value=100000000))]
+    # one-element list like the other entries; presumably 0 means "decided today" — confirm against data_processing
+    to_predict['time'] = [0]
     for col in col_cat:
         to_predict[col] = [st.selectbox(f'{col}', predictors[col].cat.categories)]
+df_to_predict = prepare_predictors(pd.DataFrame.from_dict(to_predict), col_num, col_cat)
+st.dataframe(df_to_predict)
+if prediction_model is None:
+    st.subheader("You need to rerun training !")
+else:
+    try:
+        predicted = prediction_model.predict(df_to_predict)
+        # the model output is treated as log10 of the sanction; subtracting 3 expresses it in thousands
+        st.metric(label="Monetary sanction prediction", value=f"{10 ** (predicted[0] - 3):,.2f} K$")
         print(predicted)
+    except ValueError:
+        # presumably raised when the selected columns no longer match the saved model — confirm
+        st.subheader("You need to rerun training !")