Spaces:

Theolex
/

theolex_streamlit

Runtime error

App Files Files Community

Jawad committed on Nov 10, 2021

Commit

fe0f3db

1 Parent(s): 2715942

add training

Browse files

Files changed (3) hide show

data_processing.py +2 -0
model.py +53 -0
stream_app.py +120 -4

data_processing.py CHANGED Viewed

@@ -82,6 +82,8 @@ def get_monetary_dataframe(decision_scope):
     monetary_decision['log10_org_revenues'] = monetary_decision.org_revenues.apply(np.log10)
     monetary_decision['log10_monetary_sanction'] = monetary_decision.monetary_sanction.apply(np.log10)
     monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
     return monetary_decision

     monetary_decision['log10_org_revenues'] = monetary_decision.org_revenues.apply(np.log10)
     monetary_decision['log10_monetary_sanction'] = monetary_decision.monetary_sanction.apply(np.log10)
     monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
+    monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
+    monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
     return monetary_decision

model.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.model_selection import train_test_split
+def prepare_data(monetary_decision):
+    monetary_decision = monetary_decision.reset_index(drop=True)
+    time = round((monetary_decision.decision_date - monetary_decision.decision_date.min()) / np.timedelta64(1, "M"))
+    monetary_decision.loc[:, ('time')] = time
+    col_num = ['log10_org_revenues',
+               'time']
+    col_cat = ['authorities_country',
+               'type',
+               'violation_theme',
+               'justice_type',
+               'org_country',
+               'org_currency',
+               'org_continent',
+               'same_country',
+               'org_company_type']
+    predictors = monetary_decision[col_num + col_cat]
+    target = monetary_decision.log10_monetary_sanction
+    for col in col_cat:
+        predictors[col] = predictors[col].astype("category")
+    return predictors, target
+def split(predictors, target):
+    predictors_train, predictors_test, target_train, target_test = train_test_split(predictors,
+                                                                          target,
+                                                                          test_size=0.2,
+                                                                          random_state=42)
+    return predictors_train, predictors_test, target_train, target_test
+def run_training(predictors_train, predictors_test):
+    data_train = xgb.DMatrix(predictors_train, label=predictors_test, enable_categorical=True)
+    param = {'max_depth': 5,
+             'learning_rate': .2,
+             'colsample_bytree': 0.3,
+             'objective': 'reg:squarederror'}
+    num_round = 50
+    return xgb.train(param, data_train, num_round)
+def predict(model, predictors):
+    data = xgb.DMatrix(predictors, enable_categorical=True)
+    return model.predict(data)
+def features_importance(model):
+    return pd.Series(model.get_score(importance_type='gain')).sort_values()

stream_app.py CHANGED Viewed

@@ -1,9 +1,14 @@
 # -*- coding: utf-8 -*-
 import streamlit as st
 import plotly.express as px
 from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
 def _max_width_():
@@ -30,12 +35,15 @@ st.write("by [Teolex](https://www.theolex.io/)")
 data = load_data()
 decisions, organizations, authorities = process_data(data)
-st.sidebar.title("Parameters")
-authorities_country = st.sidebar.selectbox('Authority country', authorities.country.unique())
-select_auth = authorities[authorities.country == authorities_country].name.sort_values()
-authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
 min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2010, 2021))
 # apply filters
@@ -90,6 +98,43 @@ fig = px.scatter(monetary_decision,
                  width=1000, height=600)
 st.plotly_chart(fig)
 st.subheader("Sum of monetary sanctions over time ")
 st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")
 chart_data = get_themes_per_year(monetary_decision)
@@ -101,3 +146,74 @@ fig = px.area(chart_data, x="year",
               line_group="violation_theme",
               width=1000, height=600)
 st.plotly_chart(fig)

 # -*- coding: utf-8 -*-
+import pandas as pd
 import streamlit as st
 import plotly.express as px
+import plotly.figure_factory as ff
+import scipy
+import numpy as np
 from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
+from model import prepare_data, run_training, split, predict, features_importance
 def _max_width_():
 data = load_data()
 decisions, organizations, authorities = process_data(data)
+st.sidebar.title("Authorities parameters")
+authorities_country = st.sidebar.selectbox('Authority country', ['All', *authorities.country.unique()])
+if authorities_country != 'All':
+    select_auth = authorities[authorities.country == authorities_country].name.sort_values()
+else:
+    select_auth = authorities.name.sort_values()
+authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
 min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2010, 2021))
 # apply filters
                  width=1000, height=600)
 st.plotly_chart(fig)
+fig = px.scatter(monetary_decision[~monetary_decision.org_revenues.isnull()],
+                 x="decision_date",
+                 size="log10_monetary_sanction",
+                 y="org_revenues",
+                 log_y=True,
+                 template="simple_white",
+                 color="same_country",
+                 hover_name="monetary_sanction",
+                 width=1000, height=600)
+st.plotly_chart(fig)
+fig = px.histogram(monetary_decision, x="log10_monetary_sanction",
+                   # y="log10_org_revenues",
+                   color="same_country",
+                   marginal="box",  # or violin, rug
+                   template="simple_white",
+                   width=1000, height=600, nbins=40, opacity=0.5,
+                   hover_data=monetary_decision.columns)
+st.plotly_chart(fig)
+fig = px.histogram(monetary_decision, x="log10_monetary_sanction_rate",
+                   # y="log10_org_revenues",
+                   color="same_country",
+                   marginal="box",  # or violin, rug
+                   template="simple_white",
+                   width=1000, height=600, nbins=40, opacity=0.5,
+                   hover_data=monetary_decision.columns)
+st.plotly_chart(fig)
+p = scipy.stats.ks_2samp(monetary_decision[monetary_decision.same_country]['log10_monetary_sanction_rate'],
+                         monetary_decision[~monetary_decision.same_country]['log10_monetary_sanction_rate']
+                         , alternative='two-sided', mode='auto')
+st.metric(label="p-value", value=f"{round(p.pvalue, 2)}%")
 st.subheader("Sum of monetary sanctions over time ")
 st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")
 chart_data = get_themes_per_year(monetary_decision)
               line_group="violation_theme",
               width=1000, height=600)
 st.plotly_chart(fig)
+st.sidebar.title("Organizations view")
+col_x = ['log10_org_revenues', 'authorities_country', 'violation_theme', 'org_country', 'org_company_type']
+predictors, target = prepare_data(monetary_decision)
+st.title("Training phase")
+st.markdown("Plot taget distribution: log 10 of monetary sanctions")
+fig = ff.create_distplot([target], [' log 10 of monetary sanctions'], bin_size=0.1)
+fig.update_layout(width=1000,
+                  template="simple_white",
+                  height=600,
+                  bargap=0.01)
+st.plotly_chart(fig)
+# split data set
+predictors_train, predictors_test, target_train, target_test = split(predictors, target)
+# train the model
+xgb_model = run_training(predictors_train, target_train)
+# evaluate model error
+target_train_predicted = predict(xgb_model, predictors_train)
+training_bias = np.mean(target_train_predicted - target_train)
+st.metric(label="Training bias", value=training_bias)
+target_test_predicted = predict(xgb_model, predictors_test)
+test_errors = target_test_predicted - target_test
+test_bias = np.mean(test_errors)
+st.metric(label="Test bias", value=test_bias)
+fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.1)
+fig.update_layout(width=1000,
+                  template="simple_white",
+                  height=600,
+                  bargap=0.01)
+st.plotly_chart(fig)
+st.subheader("Plot features importance for the trained model")
+xgb_features_importance = features_importance(xgb_model)
+fig = px.bar(xgb_features_importance,
+             orientation='h',
+             width=1000,
+             template="simple_white",
+             height=600,
+             )
+st.plotly_chart(fig)
+st.subheader("Plot predicted vs real")
+import plotly.graph_objs as go
+compare = pd.concat([pd.DataFrame({'target': target_test, 'predicted': target_test_predicted, 'sample': 'test'}),
+                     pd.DataFrame({'target': target_train, 'predicted': target_train_predicted, 'sample': 'train'})])
+fig = px.scatter(
+    compare,
+    x='predicted',
+    y='target',
+    color='sample',
+    marginal_y="violin",
+    width=1000,
+    template="simple_white",
+    height=600,
+    trendline="ols")
+st.plotly_chart(fig)
+sample_revenues = st.sidebar.number_input('Yearly revenues', value=1000000)
+authority = st.sidebar.selectbox('Organization country', predictors.org_country.cat.categories)
+authority = st.sidebar.selectbox('Organization activity', predictors.org_company_type.cat.categories)