theolex_streamlit / stream_app.py
Jawad's picture
add traning
fe0f3db
raw
history blame
7.85 kB
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import scipy
import scipy.stats
import streamlit as st

from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
from model import prepare_data, run_training, split, predict, features_importance
def _max_width_(max_width_px: int = 1500) -> None:
    """Inject a CSS rule that caps the width of the page's main container.

    Args:
        max_width_px: Maximum width, in pixels, written into the
            ``max-width`` rule. Defaults to 1500, the value that used to be
            hard-coded, so ``_max_width_()`` emits the same CSS as before.
    """
    # NOTE(review): the selector relies on Streamlit's generated class names;
    # confirm it still matches the Streamlit version this app is deployed on.
    css = (
        "\n<style>\n"
        ".reportview-container .main .block-container{\n"
        f"max-width: {max_width_px}px;\n"
        "}\n"
        "</style>\n"
    )
    # The payload is a raw <style> tag, so HTML rendering has to be enabled.
    st.markdown(css, unsafe_allow_html=True)
# Widen the page before anything else is rendered.
_max_width_()

# Page header. The second emoji used to be stored as mis-decoded bytes
# and showed up as garbage characters in the title.
st.title("Data Analysis 🌎 📃")
st.write("by [Theolex](https://www.theolex.io/)")

# Load the raw data and split it into the three frames used by the rest of
# the script.
data = load_data()
decisions, organizations, authorities = process_data(data)
# Sidebar: pick the authority (optionally narrowed by country) and the range
# of decision years to analyse.
st.sidebar.title("Authorities parameters")

country_choices = ['All'] + list(authorities.country.unique())
authorities_country = st.sidebar.selectbox('Authority country', country_choices)

# Authorities offered in the second select box: every authority, or only
# those of the chosen country; sorted by name either way.
authorities_in_country = (
    authorities
    if authorities_country == 'All'
    else authorities[authorities.country == authorities_country]
)
select_auth = authorities_in_country.name.sort_values()

authority = st.sidebar.selectbox('Authority', ['All'] + list(select_auth))

# The slider returns the (lower, upper) bounds selected by the user.
min_year, max_year = st.sidebar.slider(
    'Decisions year',
    min_value=2001,
    max_value=2021,
    value=(2010, 2021),
)
# apply filters
# Each entry of `decisions.authorities_name` is tested for membership, so it
# is treated as a collection of authority names.
if authority != 'All':
    # A single authority was picked: keep the decisions that list it.
    authority_filter = decisions.authorities_name.apply(lambda names: authority in names)
else:
    # No specific authority: keep the decisions that list at least one of the
    # authorities currently offered in the sidebar. The lookup set is built
    # once here instead of once per row inside the lambda.
    authorities_in_scope = set(select_auth)
    authority_filter = decisions.authorities_name.apply(
        lambda names: not authorities_in_scope.isdisjoint(names)
    )

# Both year bounds are inclusive.
year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)
decision_scope = decisions[authority_filter & year_filter]

# explore monetary sanctions
monetary_decision = get_monetary_dataframe(decision_scope)
##
# Plot Graphs
##
# Options shared by the two treemaps below; tile area is driven by the
# `monetary_sanction` column in both.
treemap_options = dict(
    values='monetary_sanction',
    color_continuous_scale='RdBu',
    template="simple_white",
    width=1000,
    height=600,
)

# Treemap 1: one tile per company type, coloured by `org_revenues`.
st.subheader("The organizations' sectors targeted by the sanctions: ")
st.markdown("The graph shows the cumulated monetary sanction for the current filters")
sector_fig = px.treemap(
    monetary_decision,
    path=['org_company_type'],
    color='org_revenues',
    **treemap_options,
)
st.plotly_chart(sector_fig)

# Treemap 2: continent > country hierarchy; rows with no continent are
# excluded before plotting.
st.subheader("The organizations' regions targeted by the sanctions: ")
st.markdown("The graph shows the cumulated monetary sanction for the current filters")
has_continent = monetary_decision.org_continent.notnull()
region_fig = px.treemap(
    monetary_decision[has_continent],
    path=['org_continent', 'org_country'],
    **treemap_options,
)
st.plotly_chart(region_fig)
st.subheader("Revenues vs monetary sanctions representation ")
st.markdown("The graph shows the cumulated monetary sanction for the current filters")

# Layout options common to both scatter plots of this section.
scatter_layout = dict(template="simple_white", width=1000, height=600)

# Scatter 1: revenues (x) against sanction amount (y), both on log axes,
# coloured by the `same_country` column.
revenue_vs_sanction_fig = px.scatter(
    monetary_decision,
    x="org_revenues",
    y="monetary_sanction",
    color="same_country",
    hover_name="org_name",
    log_x=True,
    log_y=True,
    color_continuous_scale='RdBu',
    **scatter_layout,
)
st.plotly_chart(revenue_vs_sanction_fig)

# Scatter 2: revenues over decision date, marker size taken from the
# `log10_monetary_sanction` column. Rows with missing revenues are dropped
# first.
with_revenues = monetary_decision[monetary_decision.org_revenues.notnull()]
revenue_timeline_fig = px.scatter(
    with_revenues,
    x="decision_date",
    y="org_revenues",
    size="log10_monetary_sanction",
    color="same_country",
    hover_name="monetary_sanction",
    log_y=True,
    **scatter_layout,
)
st.plotly_chart(revenue_timeline_fig)
# Histograms (with a box-plot margin) of the log10 sanction and of the log10
# sanction rate, one colour per value of the `same_country` column.
for histogram_column in ("log10_monetary_sanction", "log10_monetary_sanction_rate"):
    fig = px.histogram(monetary_decision, x=histogram_column,
                       color="same_country",
                       marginal="box",  # or violin, rug
                       template="simple_white",
                       width=1000, height=600, nbins=40, opacity=0.5,
                       hover_data=monetary_decision.columns)
    st.plotly_chart(fig)

# Two-sample Kolmogorov-Smirnov test comparing the sanction-rate distribution
# of the rows where `same_country` is true with the rows where it is false.
# Missing rates are dropped so they cannot distort or invalidate the statistic.
same_country_mask = monetary_decision.same_country
rate_same_country = monetary_decision[same_country_mask]['log10_monetary_sanction_rate'].dropna()
rate_other_country = monetary_decision[~same_country_mask]['log10_monetary_sanction_rate'].dropna()

if rate_same_country.empty or rate_other_country.empty:
    # The current filters left one group without data: there is nothing to
    # compare, so report that instead of failing inside the test.
    p = None
    st.metric(label="p-value", value="n/a")
else:
    # The test-method argument is left at its default ('auto'); its keyword
    # name differs between SciPy releases, so it is not spelled out here.
    p = scipy.stats.ks_2samp(rate_same_country, rate_other_country,
                             alternative='two-sided')
    # `pvalue` is a probability in [0, 1]; the "%" format scales it by 100 so
    # the number shown really is a percentage.
    st.metric(label="p-value", value=f"{p.pvalue:.2%}")
st.subheader("Sum of monetary sanctions over time ")
st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")

# Data for the area chart, derived from the filtered sanctions.
chart_data = get_themes_per_year(monetary_decision)

# One stacked area per violation theme.
# (px.area also accepts groupnorm="fraction" to plot shares instead of amounts.)
area_options = {
    "x": "year",
    "y": "monetary_sanction",
    "color": "violation_theme",
    "line_group": "violation_theme",
    "template": "simple_white",
    "width": 1000,
    "height": 600,
}
themes_fig = px.area(chart_data, **area_options)
st.plotly_chart(themes_fig)
st.sidebar.title("Organizations view")

# NOTE(review): `col_x` is not read anywhere in the visible script; it is kept
# in case other code relies on it. Confirm and remove if it is dead.
col_x = ['log10_org_revenues', 'authorities_country', 'violation_theme', 'org_country', 'org_company_type']

# Build the model inputs from the filtered monetary sanctions.
predictors, target = prepare_data(monetary_decision)

st.title("Training phase")
st.markdown("Plot target distribution: log 10 of monetary sanctions")

# Distribution plot of the target values.
fig = ff.create_distplot([target], [' log 10 of monetary sanctions'], bin_size=0.1)
fig.update_layout(width=1000,
                  template="simple_white",
                  height=600,
                  bargap=0.01)
st.plotly_chart(fig)
# Split the prepared data into a training part and a test part.
predictors_train, predictors_test, target_train, target_test = split(predictors, target)

# Fit the model on the training part only.
xgb_model = run_training(predictors_train, target_train)

# Training bias: mean of (prediction - target) on the training part.
target_train_predicted = predict(xgb_model, predictors_train)
train_errors = target_train_predicted - target_train
training_bias = np.mean(train_errors)
st.metric(label="Training bias", value=training_bias)

# Test bias: the same measure on the held-out part.
target_test_predicted = predict(xgb_model, predictors_test)
test_errors = target_test_predicted - target_test
test_bias = np.mean(test_errors)
st.metric(label="Test bias", value=test_bias)

# Distribution plot of the individual test errors.
errors_layout = dict(width=1000, height=600, template="simple_white", bargap=0.01)
errors_fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.1)
errors_fig.update_layout(**errors_layout)
st.plotly_chart(errors_fig)
st.subheader("Plot features importance for the trained model")

# Horizontal bar chart of whatever `features_importance` returns for the
# fitted model.
xgb_features_importance = features_importance(xgb_model)
importance_fig = px.bar(
    xgb_features_importance,
    orientation='h',
    template="simple_white",
    width=1000,
    height=600,
)
st.plotly_chart(importance_fig)
st.subheader("Plot predicted vs real")

# Stack the test and train results into one frame; the constant `sample`
# column records which split each row came from and drives the colouring.
test_frame = pd.DataFrame({'target': target_test,
                           'predicted': target_test_predicted,
                           'sample': 'test'})
train_frame = pd.DataFrame({'target': target_train,
                            'predicted': target_train_predicted,
                            'sample': 'train'})
compare = pd.concat([test_frame, train_frame])

# Predicted (x) against real (y) values, with a violin margin on the y axis
# and an OLS trendline.
fig = px.scatter(
    compare,
    x='predicted',
    y='target',
    color='sample',
    marginal_y="violin",
    width=1000,
    template="simple_white",
    height=600,
    trendline="ols")
st.plotly_chart(fig)
# Sidebar inputs describing one organization. Each answer gets its own
# variable: both select boxes used to be stored in `authority`, which threw
# away the country choice and overwrote the authority picked for filtering.
sample_revenues = st.sidebar.number_input('Yearly revenues', value=1000000)
sample_country = st.sidebar.selectbox('Organization country', predictors.org_country.cat.categories)
sample_activity = st.sidebar.selectbox('Organization activity', predictors.org_company_type.cat.categories)