# theolex_streamlit/stream_app.py
# -*- coding: utf-8 -*-
import pickle
import pandas as pd
import streamlit as st
from scipy import stats
import plotly.express as px
import plotly.figure_factory as ff
import numpy as np
from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
from model import prepare_predictors, prepare_data, run_training, split, predict, features_importance, run_cv_training, automl_training
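# Local project modules: `data_processing` loads the Theolex decisions dataset and
# derives the dataframes used below; `model` wraps the predictor preparation,
# train/test split and (Auto)ML training helpers.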
def _max_width_():
    """Widen Streamlit's main block container beyond its default max width."""
    max_width_str = "max-width: 1500px;"
    st.markdown(
        f"""
        <style>
        .reportview-container .main .block-container{{
            {max_width_str}
        }}
        </style>
        """,
        unsafe_allow_html=True,
    )
# force screen width
_max_width_()
st.title("Data Analysis 🌎 πŸ“ƒ")
st.write("by [Theolex](https://www.theolex.io/)")
# load and process data
data = load_data()
decisions, organizations, authorities = process_data(data)
st.sidebar.title("Authorities parameters")
authorities_country = st.sidebar.selectbox('Authority country', ['All', *authorities.country.unique()])
if authorities_country != 'All':
    select_auth = authorities[authorities.country == authorities_country].name.sort_values()
else:
    select_auth = authorities.name.sort_values()
authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2008, 2021))
# apply filters
if authority != 'All':
    authority_filter = decisions.authorities_name.apply(lambda a: authority in a)
else:
    # keep decisions issued by any of the authorities matching the country filter
    authority_filter = decisions.authorities_name.apply(lambda a: bool(set(select_auth) & set(a)))
year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)
decision_scope = decisions[authority_filter & year_filter]
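# `authorities_name` holds the list of authorities attached to each decision, so a
# decision issued jointly by several authorities is kept as soon as one of them
# matches the sidebar selection.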
st.subheader("Dataset Description")
st.metric('Number of validated decisions linked to organizations (and not individuals)', decision_scope.shape[0])
st.metric('Decisions with monetary sanctions',
          decision_scope[decision_scope.monetary_sanction > 0].shape[0])
# explore monetary sanctions
monetary_decision = get_monetary_dataframe(decision_scope)
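# `get_monetary_dataframe` presumably enriches the decisions with organization
# revenue information (the `has_revenues` flag and the `log10_*` columns used in
# the plots and as the model target below).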
st.metric('Decisions with organizations that have published yearly revenues', sum(monetary_decision.has_revenues))
##
# Plot Graphs
##
with st.expander("Data exploration"):
    st.subheader("The organizations' sectors targeted by the sanctions:")
    st.markdown("The graph shows the cumulative monetary sanctions for the current filters")
    fig = px.treemap(monetary_decision,
                     path=['org_company_type'],
                     color='org_revenues',
                     color_continuous_scale='RdBu',
                     template="simple_white",
                     values='monetary_sanction',
                     width=1000, height=600)
    st.plotly_chart(fig)
    st.subheader("The organizations' regions targeted by the sanctions:")
    st.markdown("The graph shows the cumulative monetary sanctions for the current filters")
    fig = px.treemap(monetary_decision[~monetary_decision.org_continent.isnull()],
                     path=['org_continent', 'org_country'],
                     color_continuous_scale='RdBu',
                     template="simple_white",
                     values='monetary_sanction',
                     width=1000, height=600)
    st.plotly_chart(fig)
    st.subheader("Revenues vs monetary sanctions")
    st.markdown("The graph shows the cumulative monetary sanctions for the current filters")
    fig = px.scatter(monetary_decision,
                     x="org_revenues",
                     y="monetary_sanction",
                     log_x=True,
                     log_y=True,
                     template="simple_white",
                     color="same_country",
                     color_continuous_scale='RdBu',
                     hover_name="org_name",
                     width=1000, height=600)
    st.plotly_chart(fig)
    fig = px.scatter(monetary_decision[~monetary_decision.org_revenues.isnull()],
                     x="decision_date",
                     size="log10_monetary_sanction",
                     y="org_revenues",
                     log_y=True,
                     template="simple_white",
                     color="same_country",
                     hover_name="monetary_sanction",
                     width=1000, height=600)
    st.plotly_chart(fig)
    fig = px.histogram(monetary_decision, x="log10_monetary_sanction",
                       # y="log10_org_revenues",
                       color="same_country",
                       marginal="box",  # or violin, rug
                       template="simple_white",
                       width=1000, height=600, nbins=40, opacity=0.5,
                       hover_data=monetary_decision.columns)
    st.plotly_chart(fig)
    fig = px.histogram(monetary_decision, x="log10_monetary_sanction_rate",
                       # y="log10_org_revenues",
                       color="same_country",
                       marginal="box",  # or violin, rug
                       template="simple_white",
                       width=1000, height=600, nbins=40, opacity=0.5,
                       hover_data=monetary_decision.columns)
    st.plotly_chart(fig)
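    # The two histograms above compare the distributions of sanction size and
    # sanction/revenue rate between domestic cases (authority and organization in
    # the same country) and cross-border ones; the Kolmogorov-Smirnov test below
    # checks whether the two `same_country` groups could come from the same
    # distribution.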
    p = stats.ks_2samp(monetary_decision[monetary_decision.same_country]['log10_monetary_sanction_rate'],
                       monetary_decision[~monetary_decision.same_country]['log10_monetary_sanction_rate'],
                       alternative='two-sided', mode='auto')
    st.metric(label="p-value", value=f"{round(100 * p.pvalue, 2)}%")
    st.subheader("Sum of monetary sanctions over time")
    st.markdown("The graph shows the cumulative monetary sanctions per year for each violation theme")
    chart_data = get_themes_per_year(monetary_decision)
    fig = px.area(chart_data, x="year",
                  y="monetary_sanction",
                  color="violation_theme",
                  template="simple_white",
                  # groupnorm="fraction",
                  line_group="violation_theme",
                  width=1000, height=600)
    st.plotly_chart(fig)
##############################################
####
# build ML model
####
##############################################
st.title("Training phase")
col_num_all = ['log10_org_revenues',
               'time']
col_cat_all = ['authorities_country',
               'type',
               'violation_theme',
               'justice_type',
               'org_country',
               'org_continent',
               'same_country',
               'org_company_type']
st.sidebar.title("Training parameters")
col_num = st.sidebar.multiselect('Numeric variables',
                                 col_num_all, default=col_num_all)
col_cat = st.sidebar.multiselect('Categorical variables',
                                 col_cat_all, default=col_cat_all)
# train the model
predictors, target = prepare_data(monetary_decision, col_num, col_cat)
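# `prepare_data` is assumed to return the target (log10 of the monetary sanction,
# judging by the distribution plot below) and a predictor frame whose categorical
# columns are pandas categories (the prediction form at the bottom reads
# `predictors[col].cat.categories`).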
if st.button('Run training'):
    with st.expander("Training results"):
        # Study the target distribution
        st.write(f"dataset size: {monetary_decision.shape[0]}")
        st.markdown("Plot target distribution: log10 of monetary sanctions")
        fig = ff.create_distplot([target], ['log10 of monetary sanctions'], bin_size=0.05)
        fig.update_layout(width=1000,
                          template="simple_white",
                          height=600,
                          bargap=0.01)
        st.plotly_chart(fig)
        # Split the dataset
        predictors_train, predictors_test, target_train, target_test = split(predictors, target, test_size=0.05)
        st.subheader("Split dataset between training and test:")
        st.metric(label="Training size", value=predictors_train.shape[0])
        st.metric(label="Test size", value=predictors_test.shape[0])
        # Run cross validation
        st.subheader("Cross validation error")
        with st.spinner('Wait for it...'):
            # xgb_cv, best_params = run_cv_training(predictors_train, target_train)
            # st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
            # st.subheader("Selected variables")
            # st.json(best_params)
            # Train the final model
            # xgb_model = run_training(predictors_train, target_train, best_params["params"], best_params["best_round"])
            xgb_model = automl_training(predictors_train, target_train)
            # save the model to a file
            pickle.dump(xgb_model, open("xgb_model.pickle.dat", "wb"))
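        # `automl_training` is assumed to return a fitted wrapper exposing
        # `.predict` and `.model.estimator.feature_importances_` (used below); the
        # pickled file is reloaded by the "Organizations view" section at the
        # bottom of the page.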
        # Evaluate the model error
        # target_train_predicted = predict(xgb_model, predictors_train)
        target_train_predicted = xgb_model.predict(predictors_train)
        training_bias = np.mean(target_train_predicted - target_train)
        st.metric(label="Training bias", value=training_bias)
        # target_test_predicted = predict(xgb_model, predictors_test)
        target_test_predicted = xgb_model.predict(predictors_test)
        test_errors = target_test_predicted - target_test
        test_bias = np.mean(test_errors)
        st.metric(label="Test bias", value=test_bias)
        fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.2)
        fig.update_layout(width=1000,
                          template="simple_white",
                          height=600,
                          bargap=0.01)
        st.plotly_chart(fig)
st.subheader("Plot features importance for the trained model")
print("predictors_train shape: ", predictors_train.columns)
xgb_features_importance = pd.DataFrame([xgb_model.model.estimator.feature_importances_],
columns=predictors_train.columns)
print(xgb_features_importance)
#st.dataframe(xgb_features_importance)
# xgb_features_importance = features_importance(xgb_model)
#
fig = px.bar(xgb_features_importance.T,
orientation='h',
width=1000,
template="simple_white",
height=600,
)
st.plotly_chart(fig)
st.subheader("Plot predicted vs real")
compare = pd.concat(
[pd.DataFrame({'target': target_test, 'predicted': target_test_predicted, 'sample': 'test'}),
pd.DataFrame({'target': target_train, 'predicted': target_train_predicted, 'sample': 'train'})])
fig = px.scatter(
compare,
x='predicted',
y='target',
color='sample',
marginal_y="violin",
width=1000,
template="simple_white",
height=600,
trendline="ols")
st.plotly_chart(fig)
        # Compare the model against a naive predictor (always predicting the mean target)
        naive_error_std = np.std(target_train - np.mean(target_train))
        model_error_std = np.std(target_train - target_train_predicted)
        st.metric(label="Naive error standard deviation (train)", value=naive_error_std)
        st.metric(label="Model error standard deviation (train)", value=model_error_std)
        corr_matrix = np.corrcoef(target_train, target_train_predicted)
        R_sq = corr_matrix[0, 1] ** 2
        st.metric(label="Explained variation thanks to the model (R^2, train)", value=f"{round(100 * R_sq, 2)}%")
        naive_error_std = np.std(target_test - np.mean(target_test))
        model_error_std = np.std(target_test - target_test_predicted)
        st.metric(label="Naive error standard deviation (test)", value=naive_error_std)
        st.metric(label="Model error standard deviation (test)", value=model_error_std)
        corr_matrix = np.corrcoef(target_test, target_test_predicted)
        R_sq = corr_matrix[0, 1] ** 2
        st.metric(label="Explained variation thanks to the model (R^2, test)", value=f"{round(100 * R_sq, 2)}%")
        st.subheader("Residuals & homoscedasticity")
        # Pearson correlation between residuals and targets: a strong correlation
        # would hint at heteroscedastic errors
        print(stats.pearsonr(test_errors, target_test))
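# Interactive prediction: rebuild a one-row predictor frame from the user's inputs
# and score it with the last trained model.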
st.title("Organizations view")
# load the last trained model if one has been saved
try:
    prediction_model = pickle.load(open("xgb_model.pickle.dat", "rb"))
except FileNotFoundError:
    prediction_model = None
col1, _, _ = st.columns(3)
to_predict = {}
with col1:
    to_predict['log10_org_revenues'] = [np.log10(st.number_input('Yearly revenues', value=100000000))]
    to_predict['time'] = [0]
    for col in col_cat:
        to_predict[col] = [st.selectbox(f'{col}', predictors[col].cat.categories)]
df_to_predict = prepare_predictors(pd.DataFrame.from_dict(to_predict), col_num, col_cat)
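# `prepare_predictors` is assumed to apply the same encoding as `prepare_data` so
# that the one-row frame matches the training layout, e.g. (hypothetical values)
#   {'log10_org_revenues': [8.0], 'time': [0], 'violation_theme': ['corruption'], ...}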
st.dataframe(df_to_predict)
if prediction_model:
    try:
        predicted = prediction_model.predict(df_to_predict)
        # the model predicts log10(sanction in $); convert to thousands of dollars
        st.metric(label="Monetary sanction prediction", value=f"{10 ** (predicted[0] - 3):,.2f} K$")
        print(predicted)
    except ValueError:
        st.subheader("You need to rerun the training!")