# -*- coding: utf-8 -*-
import pickle
import pandas as pd
import streamlit as st
from scipy import stats
import plotly.express as px
import plotly.figure_factory as ff
import scipy
import numpy as np
from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, automl_training
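# data_processing and model are local helper modules: based on how they are used
# below, load_data/process_data return the decisions, organizations and authorities
# tables, get_monetary_dataframe builds the per-decision frame with monetary-sanction
# and revenue columns, and the model helpers handle encoding, splitting and training.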


def _max_width_():
    max_width_str = "max-width: 1500px;"
    st.markdown(
        f"""
        <style>
        .reportview-container .main .block-container{{
            {max_width_str}
        }}
        </style>
        """,
        unsafe_allow_html=True,
    )


# force screen width
_max_width_()
| st.title("Data Analysis π π") | |
| st.write("by [Theolex](https://www.theolex.io/)") | |
| # load and process data | |
| data = load_data() | |
| decisions, organizations, authorities = process_data(data) | |

st.sidebar.title("Authorities parameters")
authorities_country = st.sidebar.selectbox('Authority country', ['All', *authorities.country.unique()])
if authorities_country != 'All':
    select_auth = authorities[authorities.country == authorities_country].name.sort_values()
else:
    select_auth = authorities.name.sort_values()
authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2008, 2021))
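
# Keep a decision when it involves the selected authority (or, with "All", any
# authority from the selected country) and its year falls within the chosen range.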
# apply filters
if authority != 'All':
    authority_filter = decisions.authorities_name.apply(lambda a: authority in a)
else:
    authority_filter = decisions.authorities_name.apply(lambda a: bool(set(select_auth) & set(a)))
year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)
decision_scope = decisions[authority_filter & year_filter]
| st.subheader("Dataset Description") | |
| st.metric('Number of validated decisions linked to organisations (and not individuals)', decision_scope.shape[0]) | |
| st.metric('Decisions with monetary sanctions', | |
| decision_scope[decision_scope.monetary_sanction > 0].shape[0]) | |
| # explore monetary sanctions | |
| monetary_decision = get_monetary_dataframe(decision_scope) | |
| st.metric('Decisions with organizations that have published yearly revenues', sum(monetary_decision.has_revenues)) | |

##
# Plot Graphs
##
with st.expander("Data exploration"):
    st.subheader("The organizations' sectors targeted by the sanctions:")
    st.markdown("The graph shows the cumulative monetary sanctions for the current filters")
    fig = px.treemap(monetary_decision,
                     path=['org_company_type'],
                     color='org_revenues',
                     color_continuous_scale='RdBu',
                     template="simple_white",
                     values='monetary_sanction',
                     width=1000, height=600)
    st.plotly_chart(fig)

    st.subheader("The organizations' regions targeted by the sanctions:")
    st.markdown("The graph shows the cumulative monetary sanctions for the current filters")
    fig = px.treemap(monetary_decision[~monetary_decision.org_continent.isnull()],
                     path=['org_continent', 'org_country'],
                     color_continuous_scale='RdBu',
                     template="simple_white",
                     values='monetary_sanction',
                     width=1000, height=600)
    st.plotly_chart(fig)

    st.subheader("Revenues vs monetary sanctions representation")
    st.markdown("The graph shows the cumulative monetary sanctions for the current filters")
    fig = px.scatter(monetary_decision,
                     x="org_revenues",
                     y="monetary_sanction",
                     log_x=True,
                     log_y=True,
                     template="simple_white",
                     color="same_country",
                     color_continuous_scale='RdBu',
                     hover_name="org_name",
                     width=1000, height=600)
    st.plotly_chart(fig)

    fig = px.scatter(monetary_decision[~monetary_decision.org_revenues.isnull()],
                     x="decision_date",
                     size="log10_monetary_sanction",
                     y="org_revenues",
                     log_y=True,
                     template="simple_white",
                     color="same_country",
                     hover_name="monetary_sanction",
                     width=1000, height=600)
    st.plotly_chart(fig)

    fig = px.histogram(monetary_decision, x="log10_monetary_sanction",
                       # y="log10_org_revenues",
                       color="same_country",
                       marginal="box",  # or violin, rug
                       template="simple_white",
                       width=1000, height=600, nbins=40, opacity=0.5,
                       hover_data=monetary_decision.columns)
    st.plotly_chart(fig)

    fig = px.histogram(monetary_decision, x="log10_monetary_sanction_rate",
                       # y="log10_org_revenues",
                       color="same_country",
                       marginal="box",  # or violin, rug
                       template="simple_white",
                       width=1000, height=600, nbins=40, opacity=0.5,
                       hover_data=monetary_decision.columns)
    st.plotly_chart(fig)
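
    # Two-sample Kolmogorov-Smirnov test: are the sanction-rate distributions of
    # same-country and cross-country cases drawn from the same distribution?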
    p = scipy.stats.ks_2samp(monetary_decision[monetary_decision.same_country]['log10_monetary_sanction_rate'],
                             monetary_decision[~monetary_decision.same_country]['log10_monetary_sanction_rate'],
                             alternative='two-sided', mode='auto')
    st.metric(label="p-value", value=f"{round(100 * p.pvalue, 2)}%")
| st.subheader("Sum of monetary sanctions over time ") | |
| st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme") | |
| chart_data = get_themes_per_year(monetary_decision) | |
| fig = px.area(chart_data, x="year", | |
| y="monetary_sanction", | |
| color="violation_theme", | |
| template="simple_white", | |
| # groupnorm="fraction", | |
| line_group="violation_theme", | |
| width=1000, height=600) | |
| st.plotly_chart(fig) | |

##############################################
####
# build ML model
####
##############################################
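# The model predicts the (log10) monetary sanction of a decision from numeric and
# categorical features of the organization and of the decision itself.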
| st.title("Training phase") | |
| col_num_all = ['log10_org_revenues', | |
| 'time'] | |
| col_cat_all = ['authorities_country', | |
| 'type', | |
| 'violation_theme', | |
| 'justice_type', | |
| 'org_country', | |
| 'org_continent', | |
| 'same_country', | |
| 'org_company_type'] | |
| st.sidebar.title("Training params") | |
| col_num = st.sidebar.multiselect('Numeric variables', | |
| col_num_all, default=col_num_all) | |
| col_cat = st.sidebar.multiselect('Categorical variables', | |
| col_cat_all, default=col_cat_all) | |
| # train the model | |
| predictors, target = prepare_data(monetary_decision, col_num, col_cat) | |
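
# prepare_data is expected to return the encoded predictor frame (categorical columns
# as pandas categories) and the target, i.e. the log10 of the monetary sanction, as
# suggested by the distribution plot below.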
if st.button('Run training'):
    with st.expander("Training results"):
        # Study distribution
        st.write(f"dataset size: {monetary_decision.shape[0]}")
        st.markdown("Plot target distribution: log 10 of monetary sanctions")
        fig = ff.create_distplot([target], ['log 10 of monetary sanctions'], bin_size=0.05)
        fig.update_layout(width=1000,
                          template="simple_white",
                          height=600,
                          bargap=0.01)
        st.plotly_chart(fig)

        # Split data set
        predictors_train, predictors_test, target_train, target_test = split(predictors, target, test_size=0.05)
        st.subheader("Split dataset between training and test:")
        st.metric(label="Training size", value=predictors_train.shape[0])
        st.metric(label="Test size", value=predictors_test.shape[0])

        # Run cross validation
        st.subheader("Cross validation error")
        with st.spinner('Wait for it...'):
            # xgb_cv, best_params = run_cv_training(predictors_train, target_train)
            # st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
            # st.subheader("Selected variables")
            # st.json(best_params)
            # Train final
            # xgb_model = run_training(predictors_train, target_train, best_params["params"], best_params["best_round"])
            xgb_model = automl_training(predictors_train, target_train)
            # save model to file
            pickle.dump(xgb_model, open("xgb_model.pickle.dat", "wb"))
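
        # The bias metrics below are mean residuals (predicted minus actual, in log10
        # units); values far from zero indicate systematic over- or under-prediction.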
        # Evaluate model error
        # target_train_predicted = predict(xgb_model, predictors_train)
        target_train_predicted = xgb_model.predict(predictors_train)
        training_bias = np.mean(target_train_predicted - target_train)
        st.metric(label="Training bias", value=training_bias)
        # target_test_predicted = predict(xgb_model, predictors_test)
        target_test_predicted = xgb_model.predict(predictors_test)
        test_errors = target_test_predicted - target_test
        test_bias = np.mean(test_errors)
        st.metric(label="Test bias", value=test_bias)
        fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
        fig.update_layout(width=1000,
                          template="simple_white",
                          height=600,
                          bargap=0.01)
        st.plotly_chart(fig)
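
        # Feature importances are read from the fitted estimator inside the AutoML
        # wrapper; the .model.estimator attribute path is specific to the object
        # returned by automl_training and may need adjusting if that changes.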
| st.subheader("Plot features importance for the trained model") | |
| print("predictors_train shape: ", predictors_train.columns) | |
| xgb_features_importance = pd.DataFrame([xgb_model.model.estimator.feature_importances_], | |
| columns=predictors_train.columns) | |
| print(xgb_features_importance) | |
| #st.dataframe(xgb_features_importance) | |
| # xgb_features_importance = features_importance(xgb_model) | |
| # | |
| fig = px.bar(xgb_features_importance.T, | |
| orientation='h', | |
| width=1000, | |
| template="simple_white", | |
| height=600, | |
| ) | |
| st.plotly_chart(fig) | |
| st.subheader("Plot predicted vs real") | |
| compare = pd.concat( | |
| [pd.DataFrame({'target': target_test, 'predicted': target_test_predicted, 'sample': 'test'}), | |
| pd.DataFrame({'target': target_train, 'predicted': target_train_predicted, 'sample': 'train'})]) | |
| fig = px.scatter( | |
| compare, | |
| x='predicted', | |
| y='target', | |
| color='sample', | |
| marginal_y="violin", | |
| width=1000, | |
| template="simple_white", | |
| height=600, | |
| trendline="ols") | |
| st.plotly_chart(fig) | |
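
        # The naive baseline predicts a single constant for every decision, so its
        # error standard deviation is essentially the spread of the target itself;
        # R^2 is the squared correlation between predictions and actual values.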
        naive_error_std = np.std(target_train - np.mean(target_train_predicted))
        model_error_std = np.std(target_train - target_train_predicted)
        st.metric(label="Naive error standard deviation (train)", value=naive_error_std)
        st.metric(label="Model error standard deviation (train)", value=model_error_std)
        corr_matrix = np.corrcoef(target_train, target_train_predicted)
        R_sq = corr_matrix[0, 1] ** 2
        st.metric(label="Explained variation thanks to the model (train R^2)", value=f"{round(100 * R_sq, 2)}%")
        naive_error_std = np.std(target_test - np.mean(target_test_predicted))
        model_error_std = np.std(target_test - target_test_predicted)
        st.metric(label="Naive error standard deviation (test)", value=naive_error_std)
        st.metric(label="Model error standard deviation (test)", value=model_error_std)
        corr_matrix = np.corrcoef(target_test, target_test_predicted)
        R_sq = corr_matrix[0, 1] ** 2
        st.metric(label="Explained variation thanks to the model (test R^2)", value=f"{round(100 * R_sq, 2)}%")
| st.subheader("Residuals & homoscedasticity") | |
| # st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%") | |
| print(stats.pearsonr(test_errors, target_test)) | |
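
# Interactive what-if view: the saved model is loaded from disk so predictions can be
# made for a hypothetical organization without retraining.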
| st.title("Organizations view") | |
| prediction_model = pickle.load(open("xgb_model.pickle.dat", "rb")) | |
| col1, _, _ = st.columns(3) | |
| to_predict = {} | |
| with col1: | |
| to_predict['log10_org_revenues'] = [np.log10(st.number_input('Yearly revenues', value=100000000))] | |
| to_predict['time'] = 0 | |
| for col in col_cat: | |
| to_predict[col] = [st.selectbox(f'{col}', predictors[col].cat.categories)] | |
| df_to_predict = prepare_predictors(pd.DataFrame.from_dict(to_predict), col_num, col_cat) | |
| st.dataframe(df_to_predict) | |
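
# Prediction can raise a ValueError when the saved model was trained with a different
# set of columns than the current sidebar selection; in that case ask for a retrain.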
if prediction_model:
    try:
        predicted = prediction_model.predict(df_to_predict)
        # target is log10 of the sanction; convert back to thousands of dollars
        st.metric(label="Monetary sanction prediction", value=f"{10 ** (predicted[0] - 3):,.2f} K$")
        print(predicted)
    except ValueError:
        st.subheader("You need to rerun training!")