# -*- coding: utf-8 -*-
import pandas as pd
import streamlit as st
from scipy import stats
import plotly.express as px
import plotly.figure_factory as ff
import numpy as np

from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training


def _max_width_():
    # inject CSS to widen the main block container beyond Streamlit's default
    max_width_str = "max-width: 1500px;"
    st.markdown(
        f"""
        <style>
        .reportview-container .main .block-container{{
            {max_width_str}
        }}
        </style>
        """,
        unsafe_allow_html=True,
    )


# force screen width
_max_width_()

st.title("Data Analysis 🌎 📃")
st.write("by [Theolex](https://www.theolex.io/)")

# load and process data
data = load_data()
decisions, organizations, authorities = process_data(data)

# sidebar filters on the issuing authority and the decision year
st.sidebar.title("Authorities parameters")
authorities_country = st.sidebar.selectbox('Authority country', ['All', *authorities.country.unique()])
if authorities_country != 'All':
    select_auth = authorities[authorities.country == authorities_country].name.sort_values()
else:
    select_auth = authorities.name.sort_values()
authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2008, 2021))

# apply filters
if authority != 'All':
    authority_filter = decisions.authorities_name.apply(lambda a: authority in a)
else:
    authority_filter = decisions.authorities_name.apply(lambda a: bool(set(select_auth) & set(a)))
year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)
decision_scope = decisions[authority_filter & year_filter]

st.subheader("Dataset Description")
st.metric('Number of validated decisions linked to organisations (and not individuals)', decision_scope.shape[0])
st.metric('Decisions with monetary sanctions', decision_scope[decision_scope.monetary_sanction > 0].shape[0])

# explore monetary sanctions
monetary_decision = get_monetary_dataframe(decision_scope)
st.metric('Decisions with organizations that have published yearly revenues', sum(monetary_decision.has_revenues))

##
# Plot Graphs
##
with st.expander("Data exploration"):
    st.subheader("The organizations' sectors targeted by the sanctions:")
    st.markdown("The graph shows the cumulated monetary sanction for the current filters")
    fig = px.treemap(monetary_decision,
                     path=['org_company_type'],
                     color='org_revenues',
                     color_continuous_scale='RdBu',
                     template="simple_white",
                     values='monetary_sanction',
                     width=1000, height=600)
    st.plotly_chart(fig)

    st.subheader("The organizations' regions targeted by the sanctions:")
    st.markdown("The graph shows the cumulated monetary sanction for the current filters")
    fig = px.treemap(monetary_decision[~monetary_decision.org_continent.isnull()],
                     path=['org_continent', 'org_country'],
                     color_continuous_scale='RdBu',
                     template="simple_white",
                     values='monetary_sanction',
                     width=1000, height=600)
    st.plotly_chart(fig)

    st.subheader("Revenues vs monetary sanctions representation")
    st.markdown("The graph shows the cumulated monetary sanction for the current filters")
    fig = px.scatter(monetary_decision, x="org_revenues", y="monetary_sanction",
                     log_x=True, log_y=True,
                     template="simple_white",
                     color="same_country",
                     color_continuous_scale='RdBu',
                     hover_name="org_name",
                     width=1000, height=600)
    st.plotly_chart(fig)

    fig = px.scatter(monetary_decision[~monetary_decision.org_revenues.isnull()],
                     x="decision_date",
                     size="log10_monetary_sanction",
                     y="org_revenues",
                     log_y=True,
                     template="simple_white",
                     color="same_country",
                     hover_name="monetary_sanction",
                     width=1000, height=600)
    st.plotly_chart(fig)
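    # Illustrative addition (not in the original app): quantify the revenues vs
    # monetary sanctions relationship shown above with a simple linear fit on the
    # log10 scales. Assumes get_monetary_dataframe provides the log10_org_revenues
    # and log10_monetary_sanction columns, as the training section below suggests.
    fit_data = monetary_decision[["log10_org_revenues", "log10_monetary_sanction"]].dropna()
    fit_result = stats.linregress(fit_data.log10_org_revenues, fit_data.log10_monetary_sanction)
    st.caption(
        f"Linear fit on log10 scales: slope={fit_result.slope:.2f}, "
        f"R²={fit_result.rvalue ** 2:.2f}, p-value={fit_result.pvalue:.3f}"
    )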

    fig = px.histogram(monetary_decision, x="log10_monetary_sanction",
                       # y="log10_org_revenues",
                       color="same_country",
                       marginal="box",  # or violin, rug
                       template="simple_white",
                       width=1000, height=600,
                       nbins=40, opacity=0.5,
                       hover_data=monetary_decision.columns)
    st.plotly_chart(fig)

    fig = px.histogram(monetary_decision, x="log10_monetary_sanction_rate",
                       # y="log10_org_revenues",
                       color="same_country",
                       marginal="box",  # or violin, rug
                       template="simple_white",
                       width=1000, height=600,
                       nbins=40, opacity=0.5,
                       hover_data=monetary_decision.columns)
    st.plotly_chart(fig)

    # Kolmogorov-Smirnov test: are sanction rates distributed differently when the
    # organization and the sanctioning authority are based in the same country?
    p = stats.ks_2samp(monetary_decision[monetary_decision.same_country]['log10_monetary_sanction_rate'],
                       monetary_decision[~monetary_decision.same_country]['log10_monetary_sanction_rate'],
                       alternative='two-sided', mode='auto')
    st.metric(label="KS test p-value", value=f"{round(100 * p.pvalue, 2)}%")

    st.subheader("Sum of monetary sanctions over time")
    st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")
    chart_data = get_themes_per_year(monetary_decision)
    fig = px.area(chart_data, x="year", y="monetary_sanction", color="violation_theme",
                  template="simple_white",
                  # groupnorm="fraction",
                  line_group="violation_theme",
                  width=1000, height=600)
    st.plotly_chart(fig)

##############################################
####
# build ML model
####
##############################################
st.title("Training phase")
# the trained model only exists during the rerun triggered by the training button
xgb_model = None

col_num_all = ['log10_org_revenues', 'time']
col_cat_all = ['authorities_country', 'type', 'violation_theme', 'justice_type', 'org_country', 'org_currency',
               'org_continent', 'same_country', 'org_company_type']

st.sidebar.title("Training params")
col_num = st.sidebar.multiselect('Numeric variables', col_num_all, col_num_all)
col_cat = st.sidebar.multiselect('Categorical variables', col_cat_all, col_cat_all)

# train the model
predictors, target = prepare_data(monetary_decision, col_num, col_cat)

if st.button('Run training'):
    with st.expander("Training results"):
        # Study the target distribution
        st.write(f"Dataset size: {monetary_decision.shape[0]}")
        st.markdown("Plot target distribution: log 10 of monetary sanctions")
        fig = ff.create_distplot([target], ['log 10 of monetary sanctions'], bin_size=0.1)
        fig.update_layout(width=1000, template="simple_white", height=600, bargap=0.01)
        st.plotly_chart(fig)

        # Split data set
        predictors_train, predictors_test, target_train, target_test = split(predictors, target)
        st.subheader("Split dataset between training and test:")
        st.metric(label="Training size", value=predictors_train.shape[0])
        st.metric(label="Test size", value=predictors_test.shape[0])

        # Run cross validation
        st.subheader("Cross validation error")
        with st.spinner('Wait for it...'):
            xgb_cv, best_params = run_cv_training(predictors_train, target_train)
        st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
        st.subheader("Selected hyper-parameters")
        st.json(best_params)

        # Train final model
        xgb_model = run_training(predictors_train, target_train, best_params[1], best_params[2])

        # Evaluate model error
        target_train_predicted = predict(xgb_model, predictors_train)
        training_bias = np.mean(target_train_predicted - target_train)
        st.metric(label="Training bias", value=training_bias)

        target_test_predicted = predict(xgb_model, predictors_test)
        test_errors = target_test_predicted - target_test
        test_bias = np.mean(test_errors)
        st.metric(label="Test bias", value=test_bias)

        fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
        fig.update_layout(width=1000, template="simple_white", height=600, bargap=0.01)
        st.plotly_chart(fig)
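        # Illustrative addition (not in the original app): summarise the test errors
        # shown above with RMSE and MAE on the log10 scale, using numpy only.
        test_rmse = float(np.sqrt(np.mean(np.square(test_errors))))
        test_mae = float(np.mean(np.abs(test_errors)))
        st.metric(label="Test RMSE (log10 scale)", value=round(test_rmse, 3))
        st.metric(label="Test MAE (log10 scale)", value=round(test_mae, 3))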
template="simple_white", height=600, bargap=0.01) st.plotly_chart(fig) st.subheader("Plot features importance for the trained model") xgb_features_importance = features_importance(xgb_model) fig = px.bar(xgb_features_importance, orientation='h', width=1000, template="simple_white", height=600, ) st.plotly_chart(fig) st.subheader("Plot predicted vs real") compare = pd.concat( [pd.DataFrame({'target': target_test, 'predicted': target_test_predicted, 'sample': 'test'}), pd.DataFrame({'target': target_train, 'predicted': target_train_predicted, 'sample': 'train'})]) fig = px.scatter( compare, x='predicted', y='target', color='sample', marginal_y="violin", width=1000, template="simple_white", height=600, trendline="ols") st.plotly_chart(fig) naive_error_std = np.std(target_train - np.mean(target_train_predicted)) model_error_std = np.std(target_train - target_train_predicted) st.metric(label="Naive error standard deviation", value=naive_error_std) st.metric(label="Model error standard deviation", value=model_error_std) corr_matrix = np.corrcoef(target_train, target_train_predicted) R_sq = corr_matrix[0, 1] ** 2 st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%") naive_error_std = np.std(target_test - np.mean(target_test_predicted)) model_error_std = np.std(target_test - target_test_predicted) st.metric(label="Naive error standard deviation", value=naive_error_std) st.metric(label="Model error standard deviation", value=model_error_std) corr_matrix = np.corrcoef(target_test, target_test_predicted) R_sq = corr_matrix[0, 1] ** 2 st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%") st.subheader("Residuals & homoscedasticity") # st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%") print(stats.pearsonr(test_errors, target_test)) st.title("Organizations view") col1, col2, col3 = st.columns(3) to_predict = {} with col1: to_predict['log10_org_revenues'] = [np.log10(st.number_input('Yearly revenues', value=100000000))] for col in col_cat: to_predict[col] = [st.selectbox(f'{col}', predictors[col].cat.categories)] print(to_predict) df_to_predict = prepare_predictors(pd.DataFrame.from_dict(to_predict), col_num, col_cat) if xgb_model: predicted = predict(xgb_model, df_to_predict) print(predicted)