Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import scipy
import scipy.stats
import streamlit as st

from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
from model import prepare_data, run_training, split, predict, features_importance
def _max_width_(max_width_px: int = 1500) -> None:
    """Inject a CSS rule that caps the width of the main block container.

    Args:
        max_width_px: Maximum width, in pixels, written into the
            ``max-width`` rule. Defaults to 1500, the value that was
            previously hard-coded.
    """
    # The CSS is assembled without leading indentation so that no Markdown
    # renderer can mistake the <style> element for an indented code block.
    css = (
        "<style>\n"
        ".reportview-container .main .block-container{\n"
        f"max-width: {max_width_px}px;\n"
        "}\n"
        "</style>\n"
    )
    # unsafe_allow_html is required for the raw <style> tag to be emitted.
    st.markdown(css, unsafe_allow_html=True)
# Widen the page before anything else is rendered.
_max_width_()

# Page heading and attribution link.
st.title("Data Analysis π π")
st.write("by [Teolex](https://www.theolex.io/)")

# Load the raw data, then split it into the three frames used below.
data = load_data()
processed_frames = process_data(data)
decisions, organizations, authorities = processed_frames
# Sidebar: choose an authority country, an authority and a year range.
st.sidebar.title("Authorities parameters")
country_choices = ['All', *authorities.country.unique()]
authorities_country = st.sidebar.selectbox('Authority country', country_choices)

# Only authorities of the chosen country are offered, unless 'All' is selected.
listed_authorities = (
    authorities
    if authorities_country == 'All'
    else authorities[authorities.country == authorities_country]
)
select_auth = listed_authorities.name.sort_values()
authority = st.sidebar.selectbox('Authority', ['All', *select_auth])

min_year, max_year = st.sidebar.slider(
    'Decisions year',
    min_value=2001,
    max_value=2021,
    value=(2010, 2021),
)
# Apply the sidebar filters.
# A decision is kept when it involves the selected authority or, when 'All'
# is chosen, at least one of the authorities currently offered in the sidebar.
if authority != 'All':
    authority_filter = decisions.authorities_name.apply(lambda names: authority in names)
else:
    # Build the lookup set once instead of once per decision row.
    allowed_authorities = set(select_auth)
    authority_filter = decisions.authorities_name.apply(
        lambda names: not allowed_authorities.isdisjoint(names)
    )

# Series.between is inclusive on both bounds, like the slider range.
year_filter = decisions.year.between(min_year, max_year)
decision_scope = decisions[authority_filter & year_filter]

# Derive the monetary-sanction view of the filtered decisions.
monetary_decision = get_monetary_dataframe(decision_scope)
##
# Charts
##
st.subheader("The organizations' sectors targeted by the sanctions: ")
st.markdown("The graph shows the cumulated monetary sanction for the current filters")

# One tile per company type: sized by sanction amount, coloured by revenues.
sector_treemap_options = dict(
    path=['org_company_type'],
    values='monetary_sanction',
    color='org_revenues',
    color_continuous_scale='RdBu',
    template="simple_white",
    width=1000,
    height=600,
)
fig = px.treemap(monetary_decision, **sector_treemap_options)
st.plotly_chart(fig)
st.subheader("The organizations' regions targeted by the sanctions: ")
st.markdown("The graph shows the cumulated monetary sanction for the current filters")

# Rows without a continent are left out of the continent > country hierarchy.
located_decisions = monetary_decision[monetary_decision.org_continent.notna()]
fig = px.treemap(
    located_decisions,
    path=['org_continent', 'org_country'],
    values='monetary_sanction',
    color_continuous_scale='RdBu',
    template="simple_white",
    width=1000,
    height=600,
)
st.plotly_chart(fig)
st.subheader("Revenues vs monetary sanctions representation ")
st.markdown("The graph shows the cumulated monetary sanction for the current filters")

# Sanction amount against organization revenues, both on logarithmic axes.
fig = px.scatter(
    monetary_decision,
    x="org_revenues",
    y="monetary_sanction",
    color="same_country",
    hover_name="org_name",
    log_x=True,
    log_y=True,
    color_continuous_scale='RdBu',
    template="simple_white",
    width=1000,
    height=600,
)
st.plotly_chart(fig)

# Revenues (log axis) over decision dates, restricted to rows that have
# revenues; marker size follows the log10 of the sanction.
decisions_with_revenues = monetary_decision[monetary_decision.org_revenues.notna()]
fig = px.scatter(
    decisions_with_revenues,
    x="decision_date",
    y="org_revenues",
    size="log10_monetary_sanction",
    color="same_country",
    hover_name="monetary_sanction",
    log_y=True,
    template="simple_white",
    width=1000,
    height=600,
)
st.plotly_chart(fig)
# Two histograms with identical styling: first the log10 sanction amount,
# then the log10 sanction rate. Each is split by same_country and topped with
# a box-plot marginal ("violin" and "rug" are the other marginal options).
for histogram_column in ("log10_monetary_sanction", "log10_monetary_sanction_rate"):
    fig = px.histogram(
        monetary_decision,
        x=histogram_column,
        color="same_country",
        marginal="box",
        template="simple_white",
        width=1000,
        height=600,
        nbins=40,
        opacity=0.5,
        hover_data=monetary_decision.columns,
    )
    st.plotly_chart(fig)
# Two-sample Kolmogorov-Smirnov test comparing the log10 sanction rate of
# rows flagged same_country against the remaining rows.
rate_column = 'log10_monetary_sanction_rate'
is_same_country = monetary_decision.same_country
# Missing rates carry no information for the test, so they are dropped.
same_country_rates = monetary_decision.loc[is_same_country, rate_column].dropna()
other_country_rates = monetary_decision.loc[~is_same_country, rate_column].dropna()

if same_country_rates.empty or other_country_rates.empty:
    # ks_2samp raises on an empty sample, which narrow sidebar filters can
    # produce; report the statistic as unavailable instead of crashing.
    p = None
    st.metric(label="p-value", value="n/a")
else:
    # The test method is left at its default ('auto'), so the call does not
    # depend on that keyword's name (renamed from `mode` to `method`).
    p = scipy.stats.ks_2samp(same_country_rates, other_country_rates,
                             alternative='two-sided')
    # Scale to a percentage so the '%' suffix is truthful (0.03 -> "3.00%").
    st.metric(label="p-value", value=f"{p.pvalue:.2%}")
st.subheader("Sum of monetary sanctions over time ")
st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")

# One stacked area per violation theme (pass groupnorm="fraction" to show
# shares instead of amounts).
chart_data = get_themes_per_year(monetary_decision)
fig = px.area(
    chart_data,
    x="year",
    y="monetary_sanction",
    color="violation_theme",
    line_group="violation_theme",
    template="simple_white",
    width=1000,
    height=600,
)
st.plotly_chart(fig)
st.sidebar.title("Organizations view")
col_x = [
    'log10_org_revenues',
    'authorities_country',
    'violation_theme',
    'org_country',
    'org_company_type',
]

# Build the model inputs from the filtered monetary decisions.
predictors, target = prepare_data(monetary_decision)

st.title("Training phase")
st.markdown("Plot taget distribution: log 10 of monetary sanctions")

# Distribution plot of the target with 0.1-wide bins.
fig = ff.create_distplot(
    hist_data=[target],
    group_labels=[' log 10 of monetary sanctions'],
    bin_size=0.1,
)
fig.update_layout(template="simple_white", width=1000, height=600, bargap=0.01)
st.plotly_chart(fig)
# Hold out a test set, then fit the model on the training part.
predictors_train, predictors_test, target_train, target_test = split(predictors, target)
xgb_model = run_training(predictors_train, target_train)

# Bias = mean signed error (prediction minus target), first on the training set...
target_train_predicted = predict(xgb_model, predictors_train)
train_errors = target_train_predicted - target_train
training_bias = np.mean(train_errors)
st.metric(label="Training bias", value=training_bias)

# ...then on the held-out test set.
target_test_predicted = predict(xgb_model, predictors_test)
test_errors = target_test_predicted - target_test
test_bias = np.mean(test_errors)
st.metric(label="Test bias", value=test_bias)

# Distribution plot of the signed test errors with 0.1-wide bins.
fig = ff.create_distplot(
    hist_data=[test_errors],
    group_labels=['errors distribution'],
    bin_size=0.1,
)
fig.update_layout(template="simple_white", width=1000, height=600, bargap=0.01)
st.plotly_chart(fig)
st.subheader("Plot features importance for the trained model")

# Horizontal bar chart of whatever features_importance returns for the model.
xgb_features_importance = features_importance(xgb_model)
importance_chart_options = dict(
    orientation='h',
    template="simple_white",
    width=1000,
    height=600,
)
fig = px.bar(xgb_features_importance, **importance_chart_options)
st.plotly_chart(fig)
st.subheader("Plot predicted vs real")
import plotly.graph_objs as go

# Stack test and train results in one frame, tagged by the sample they come from.
test_results = pd.DataFrame({
    'target': target_test,
    'predicted': target_test_predicted,
    'sample': 'test',
})
train_results = pd.DataFrame({
    'target': target_train,
    'predicted': target_train_predicted,
    'sample': 'train',
})
compare = pd.concat([test_results, train_results])

# Scatter of target against prediction, coloured by sample, with an OLS
# trendline and a violin marginal on the y axis.
fig = px.scatter(
    compare,
    x='predicted',
    y='target',
    color='sample',
    marginal_y="violin",
    trendline="ols",
    template="simple_white",
    width=1000,
    height=600,
)
st.plotly_chart(fig)
# Sidebar inputs describing a sample organization.
# Each widget result gets its own name: previously both select boxes were
# assigned to `authority`, which overwrote the authority filter selection and
# discarded the chosen country.
sample_revenues = st.sidebar.number_input('Yearly revenues', value=1000000)
sample_org_country = st.sidebar.selectbox(
    'Organization country', predictors.org_country.cat.categories
)
sample_org_company_type = st.sidebar.selectbox(
    'Organization activity', predictors.org_company_type.cat.categories
)