Spaces:
Runtime error
Runtime error
init
Browse files- data_processing.py +73 -0
- requirements.txt +84 -2
- stream_app.py +51 -106
data_processing.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import requests
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# util functions
|
| 7 |
+
def get_id(x):
    """Extract the integer id embedded in a resource URL.

    The id is the second-to-last ``/``-separated segment of ``x``, which is
    where it sits in a URL that ends with a trailing slash
    (e.g. ``".../decisions/42/"`` gives ``42``).

    Raises:
        IndexError: if ``x`` contains no ``/`` at all.
        ValueError: if the selected segment is not an integer literal.
    """
    # Only the last two separators matter, so split from the right.
    segments = x.rsplit("/", 2)
    return int(segments[-2])
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def get_dict(df, col):
    """Return column ``col`` of ``df`` as an ``{index label: value}`` dict."""
    column = df[col]
    return column.to_dict()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def replace_lis_val(df, col):
    """Build a translator from lists of index labels to ``df[col]`` values.

    Args:
        df: DataFrame whose index holds the labels to be translated.
        col: Name of the column providing the replacement values.

    Returns:
        A function taking an iterable of index labels and returning the list
        of corresponding ``df[col]`` values, in the same order. That function
        raises ``KeyError`` for a label that is not in ``df``'s index.
    """
    # Build the label -> value mapping once, when the translator is created,
    # instead of converting the whole column again for every single element.
    lookup = df[col].to_dict()

    def _replace(labels):
        return [lookup[label] for label in labels]

    return _replace
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def mode(lst):
    """Return the most frequent element of ``lst``.

    Args:
        lst: A sequence of hashable values.

    Returns:
        The value occurring most often, or ``None`` when ``lst`` is empty.
        When several values share the highest count, the one that appears
        first in ``lst`` is returned, so the result is reproducible.
    """
    from collections import Counter

    if not lst:
        return None

    # Counter keeps first-seen order and max() returns the first maximum,
    # which gives a single counting pass and a deterministic tie-break.
    counts = Counter(lst)
    return max(counts, key=counts.get)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@st.cache
def load_data():
    """Download the validated decisions and their linked records.

    Sends one GET request to the Theolex ``/decisions/`` endpoint asking for
    up to 4000 decisions with ``status=V`` and with the related violations,
    organizations and authorities included in the response.

    The API token is read from the ``THEOLEX_API_TOKEN`` environment variable
    when it is set.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx
            status, so that an error body is never returned (and cached) as
            if it were data.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    import os

    url = "https://www.theolex.io/data"
    validated_filter = "status=V"
    linked_dataset = "include[]=violations.*&include[]=organizations.*&include[]=authorities.*"
    url_d = f"{url}/decisions/?per_page=4000&{validated_filter}&{linked_dataset}"

    # SECURITY NOTE(review): this token is committed to the repository and
    # must be considered leaked. It is kept only as a fallback so existing
    # deployments keep working; rotate it, provide the new one through
    # THEOLEX_API_TOKEN, and then delete the literal below.
    token = os.getenv("THEOLEX_API_TOKEN", "8d55a74628aee8122b7a5a1a51f7caad6d613ec1")

    # Without a timeout a stalled connection would block the app forever.
    timeout_seconds = 60
    response = requests.get(
        url_d,
        headers={'authorization': f'Token {token}',
                 'accept': 'application/json'},
        timeout=timeout_seconds,
    )
    response.raise_for_status()

    return response.json()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def process_data(data):
    """Turn the raw API payload into analysis-ready DataFrames.

    Args:
        data: Mapping with the keys ``'decisions'``, ``'organizations'`` and
            ``'authorities'``, each holding the records used to build the
            corresponding DataFrame.

    Returns:
        A ``(decisions, organizations, authorities)`` tuple:

        * ``decisions``: one row per (decision, organization) pair for
          decisions with status ``'V'``, joined with the organization columns
          (prefixed ``org_``), without organizations whose company type is
          ``"Individual"``, and with the extra columns ``authorities_name``
          (list of authority names) and ``authorities_country`` (most
          frequent country among the decision's authorities).
        * ``organizations``: the selected organization columns, each prefixed
          with ``org_``.
        * ``authorities``: ``country``, ``type`` and ``name`` indexed by the
          id parsed from each authority URL, with ``country`` lower-cased and
          stripped.
    """
    # --- decisions -------------------------------------------------------
    decisions = pd.DataFrame(data['decisions'])
    # Work on an explicit copy so the column assignments below modify an
    # independent frame rather than a filtered slice of the original.
    decisions = decisions[decisions.status == 'V'].copy()

    # Parse the dates once and derive both the year and the plain date.
    decision_dt = pd.to_datetime(decisions['decision_date'])
    decisions['year'] = decision_dt.dt.year
    decisions['decision_date'] = decision_dt.dt.date
    decisions['monetary_sanction'] = decisions['monetary_sanction'].astype(float)

    decision_col = ['violations', 'authorities', 'organizations', 'country_of_violation', 'type', 'justice_type',
                    'defendant', 'decision_date', 'monetary_sanction', 'nature_de_sanction', 'violation_theme', 'year']
    # One row per organization involved in a decision.
    decisions = decisions[decision_col].explode('organizations')

    # --- organizations ---------------------------------------------------
    organizations = pd.DataFrame(data['organizations'])
    organizations['id'] = organizations.url.apply(get_id)
    organizations = organizations[["id", "name", "company_type", "revenues", "currency", "country", "lei"]]
    # Prefix every column so they cannot clash with decision columns.
    organizations = organizations.add_prefix('org_')

    # Default (inner) merge: rows without a matching organization are dropped.
    decisions = decisions.merge(organizations, left_on='organizations', right_on='org_id')

    # remove Individual
    decisions = decisions[decisions.org_company_type != "Individual"].copy()
    decisions['org_country'] = decisions['org_country'].str.lower().str.strip()

    # --- authorities -----------------------------------------------------
    authorities = pd.DataFrame(data['authorities'])
    authorities.index = authorities.url.apply(get_id)
    authorities = authorities[["country", "type", "name"]].copy()
    # Expand the short country codes; any other value is kept unchanged.
    # NOTE(review): ISO 3166 uses 'DE' for Germany and 'GB' for the United
    # Kingdom ('GE' is Georgia) -- confirm which codes the API really sends.
    countries = {'FR': 'france', 'US': 'United States', 'UK': 'United Kingdom', 'GE': 'Germany'}
    authorities['country'] = (
        authorities['country'].apply(lambda v: countries.get(v, v)).str.lower().str.strip()
    )

    # Translate each decision's list of authority ids into names / countries.
    decisions['authorities_name'] = decisions.authorities.apply(replace_lis_val(authorities, 'name'))
    decisions['authorities_country'] = decisions.authorities.apply(replace_lis_val(authorities, 'country')).apply(mode)

    return decisions, organizations, authorities
|
requirements.txt
CHANGED
|
@@ -1,7 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
numpy==1.19.5
|
|
|
|
| 2 |
pandas==1.2.4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
plotly==5.3.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
seaborn==0.11.2
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
streamlit==0.89.0
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
altair==4.1.0
|
| 2 |
+
argon2-cffi==21.1.0
|
| 3 |
+
astor==0.8.1
|
| 4 |
+
attrs==21.2.0
|
| 5 |
+
backcall==0.2.0
|
| 6 |
+
backports.zoneinfo==0.2.1
|
| 7 |
+
base58==2.1.0
|
| 8 |
+
bleach==4.1.0
|
| 9 |
+
blinker==1.4
|
| 10 |
+
cachetools==4.2.4
|
| 11 |
+
certifi==2021.5.30
|
| 12 |
+
cffi==1.14.6
|
| 13 |
+
charset-normalizer==2.0.6
|
| 14 |
+
click==7.1.2
|
| 15 |
+
cycler==0.10.0
|
| 16 |
+
debugpy==1.5.0
|
| 17 |
+
decorator==5.1.0
|
| 18 |
+
defusedxml==0.7.1
|
| 19 |
+
entrypoints==0.3
|
| 20 |
+
gitdb==4.0.7
|
| 21 |
+
GitPython==3.1.24
|
| 22 |
+
idna==3.2
|
| 23 |
+
ipykernel==6.4.1
|
| 24 |
+
ipython==7.28.0
|
| 25 |
+
ipython-genutils==0.2.0
|
| 26 |
+
ipywidgets==7.6.5
|
| 27 |
+
jedi==0.18.0
|
| 28 |
+
Jinja2==3.0.2
|
| 29 |
+
jsonschema==4.0.1
|
| 30 |
+
jupyter-client==7.0.6
|
| 31 |
+
jupyter-core==4.8.1
|
| 32 |
+
jupyterlab-pygments==0.1.2
|
| 33 |
+
jupyterlab-widgets==1.0.2
|
| 34 |
+
kiwisolver==1.3.2
|
| 35 |
+
MarkupSafe==2.0.1
|
| 36 |
+
matplotlib==3.4.3
|
| 37 |
+
matplotlib-inline==0.1.3
|
| 38 |
+
mistune==0.8.4
|
| 39 |
+
nbclient==0.5.4
|
| 40 |
+
nbconvert==6.2.0
|
| 41 |
+
nbformat==5.1.3
|
| 42 |
+
nest-asyncio==1.5.1
|
| 43 |
+
notebook==6.4.4
|
| 44 |
numpy==1.19.5
|
| 45 |
+
packaging==21.0
|
| 46 |
pandas==1.2.4
|
| 47 |
+
pandocfilters==1.5.0
|
| 48 |
+
parso==0.8.2
|
| 49 |
+
patsy==0.5.2
|
| 50 |
+
pexpect==4.8.0
|
| 51 |
+
pickleshare==0.7.5
|
| 52 |
+
Pillow==8.3.2
|
| 53 |
plotly==5.3.1
|
| 54 |
+
prometheus-client==0.11.0
|
| 55 |
+
prompt-toolkit==3.0.20
|
| 56 |
+
protobuf==3.18.1
|
| 57 |
+
ptyprocess==0.7.0
|
| 58 |
+
pyarrow==5.0.0
|
| 59 |
+
pycparser==2.20
|
| 60 |
+
pydeck==0.7.0
|
| 61 |
+
Pygments==2.10.0
|
| 62 |
+
pyparsing==2.4.7
|
| 63 |
+
pyrsistent==0.18.0
|
| 64 |
+
python-dateutil==2.8.2
|
| 65 |
+
pytz==2021.3
|
| 66 |
+
pyzmq==22.3.0
|
| 67 |
+
requests==2.26.0
|
| 68 |
+
scipy==1.7.1
|
| 69 |
seaborn==0.11.2
|
| 70 |
+
Send2Trash==1.8.0
|
| 71 |
+
six==1.16.0
|
| 72 |
+
smmap==4.0.0
|
| 73 |
+
statsmodels==0.13.0
|
| 74 |
streamlit==0.89.0
|
| 75 |
+
tenacity==8.0.1
|
| 76 |
+
terminado==0.12.1
|
| 77 |
+
testpath==0.5.0
|
| 78 |
+
toml==0.10.2
|
| 79 |
+
toolz==0.11.1
|
| 80 |
+
tornado==6.1
|
| 81 |
+
traitlets==5.1.0
|
| 82 |
+
typing-extensions==3.10.0.2
|
| 83 |
+
tzlocal==3.0
|
| 84 |
+
urllib3==1.26.7
|
| 85 |
+
validators==0.18.2
|
| 86 |
+
watchdog==2.1.6
|
| 87 |
+
wcwidth==0.2.5
|
| 88 |
+
webencodings==0.5.1
|
| 89 |
+
widgetsnbextension==3.5.1
|
stream_app.py
CHANGED
|
@@ -1,110 +1,55 @@
|
|
| 1 |
-
|
| 2 |
import streamlit as st
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
-
from datetime import datetime
|
| 5 |
-
import pandas as pd
|
| 6 |
import plotly.express as px
|
| 7 |
-
from plotly.subplots import make_subplots
|
| 8 |
-
import plotly.graph_objects as go
|
| 9 |
-
import matplotlib.pyplot as plt
|
| 10 |
-
from streamlit.elements.arrow_altair import ChartType
|
| 11 |
-
from streamlit.type_util import data_frame_to_bytes
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
#The purpose of this script is to create interactive graphs
|
| 15 |
-
#on an application through streamlit and plotly. These graphs will make it possible to
|
| 16 |
-
#To answer certain questions about the link between the variables
|
| 17 |
-
#Principally to see the distribution of fines according to the types of violations, the countries of jurisdiction ...
|
| 18 |
-
|
| 19 |
-
#Define the title of the application
|
| 20 |
-
st.title('Data exploration & Visualization')
|
| 21 |
-
|
| 22 |
-
#Description of the application
|
| 23 |
-
st.markdown('This application allows you to create interactive charts with streamlit in order to answer a number of data-related questions and to better highlight the information contained in the dataset.')
|
| 24 |
-
|
| 25 |
-
#Read the data from a CSV file
|
| 26 |
-
df = pd.read_csv("theolex.csv")
|
| 27 |
-
|
| 28 |
-
# Get the max year from date
|
| 29 |
-
df.decision_date = pd.to_datetime(df.decision_date)
|
| 30 |
-
max_year = df.decision_date.max().year
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
#Define the variable that allows filtering to select boxes
|
| 34 |
-
d = st.select_slider('Select year interval', options=range(max_year-10, max_year))
|
| 35 |
-
|
| 36 |
-
# Plot a bar_chart graph to show the amount monetary sanction per organization
|
| 37 |
-
df = df.set_index('name').sort_values('monetary_sanction_usd', ascending=False)
|
| 38 |
-
st.header('Amount of monetary sanction per oranization name')
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
st.bar_chart(df[df.decision_date.dt.year <= d].head(10)['monetary_sanction_usd'], height=500)
|
| 42 |
-
st.markdown('From this graph we can find the companies that have the highest amount of monetary sanction for selected year.')
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
#---------------------------------------------------------------------------------------
|
| 46 |
-
# Plot a bar_chart graph to show the amount monetary sanction per nature of violation
|
| 47 |
-
|
| 48 |
-
st.header('Amount of monetary sanction per nature of violation')
|
| 49 |
-
|
| 50 |
-
df = df.set_index('nature_of_violations').sort_values('monetary_sanction_usd', ascending=False)
|
| 51 |
-
|
| 52 |
-
st.bar_chart(df[df.decision_date.dt.year <= d].head(10)['monetary_sanction_usd'], height=500)
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
#----------------------------------------------------------------------------------------------------#
|
| 59 |
-
# Plot a bar_chart graph to show the amount monetary sanction per country of the authorities
|
| 60 |
-
|
| 61 |
-
df_filter = pd.read_csv("monetary_year.csv")
|
| 62 |
-
df_filter.decision_date = pd.to_datetime(df_filter.decision_date)
|
| 63 |
-
st.header('Amount of monetary sanction per country of the Authorities')
|
| 64 |
-
|
| 65 |
-
df_filter = df_filter.set_index('country_authorities').sort_values('monetary_sanction', ascending=False)
|
| 66 |
-
|
| 67 |
-
st.bar_chart(df_filter[df_filter.decision_date.dt.year <= d].head(10)['monetary_sanction'], height=500)
|
| 68 |
-
#--------------------------------------------------------------------------------------------------
|
| 69 |
-
|
| 70 |
-
#Here we want to see for a given country, how does the amount of monetary sanction changes per year
|
| 71 |
-
|
| 72 |
-
d = st.selectbox('Select the country of the organisation', options=list(df_filter['country_organisation'].unique()))
|
| 73 |
-
df_filter.decision_date = pd.to_datetime(df_filter.decision_date)
|
| 74 |
-
#df_filter.year=df_filter.decision_date.dt.year
|
| 75 |
-
|
| 76 |
-
df_filter = df_filter.set_index('year').sort_values('monetary_sanction', ascending=True)
|
| 77 |
-
|
| 78 |
-
st.header('Amount of monetary sanction per year for a selected country of organisation')
|
| 79 |
-
|
| 80 |
-
st.line_chart(df_filter[df_filter.country_organisation == d].head(10)['monetary_sanction'])
|
| 81 |
-
st.markdown('From this grap shows the urve of the monetary sanction from 2010 to 2020 for eac contry. For example, if we check for US we can see that the amount monetary sanction increase from 2012 to to 2019 and decreases in 2019.')
|
| 82 |
-
|
| 83 |
-
#-----------------------------------------------------------------------------------------------------
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
# st.header('Amount of monetary sanction compare to the revenue of the company')
|
| 88 |
-
|
| 89 |
-
# #df_filter = df_filter.sort_values('revenues_organisation', ascending=False)
|
| 90 |
-
|
| 91 |
-
# df_filter = df_filter.set_index('revenues_organisation')
|
| 92 |
-
|
| 93 |
-
# #st.area_chart(df_filter.head(10)['monetary_sanction'])
|
| 94 |
-
|
| 95 |
-
# df_agg = df_filter.groupby('year').agg({'monetary_sanction': 'count'})
|
| 96 |
-
# st.bar_chart(df_agg)
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
#df_agg=df_filter.groupby('year').agg({'monetary_sanction': 'count'})
|
| 100 |
-
st.header('The number of monetary sanction per country and per year')
|
| 101 |
-
|
| 102 |
-
df_agg = pd.read_csv("agg_countMt.csv")
|
| 103 |
-
d = st.select_slider('Select year interval', options=list(df_agg['year'].unique()))
|
| 104 |
-
df_agg= df_agg.set_index('country_organisation')
|
| 105 |
-
|
| 106 |
-
df_agg.columns=['year', 'num_monetary_sanction']
|
| 107 |
-
st.bar_chart(df_agg[df_agg.year == d].head(10)['num_monetary_sanction'])
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
import streamlit as st
|
| 3 |
+
|
| 4 |
import numpy as np
|
|
|
|
|
|
|
| 5 |
import plotly.express as px
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
from data_processing import load_data, process_data
|
| 8 |
+
|
| 9 |
+
st.title("Data Analysis 🌎 📃")
st.write("by [Teolex](https://www.theolex.io/)")

# load and process data
data = load_data()
decisions, organizations, authorities = process_data(data)

# --- sidebar parameters ---------------------------------------------------
st.sidebar.title("Parameters")
authorities_country = st.sidebar.selectbox('Authority country', authorities.country.unique())

# Authorities of the chosen country, offered together with an 'All' entry.
select_auth = authorities[authorities.country == authorities_country].name.sort_values()
authority = st.sidebar.selectbox('Authority', ['All', *select_auth])

min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2010, 2021))

# --- apply filters --------------------------------------------------------
if authority != 'All':
    # Keep decisions that involve the selected authority.
    authority_filter = decisions.authorities_name.apply(lambda a: authority in a)
else:
    # Keep decisions that involve at least one authority of the country.
    # The set is built once here rather than once per decision row.
    country_authorities = set(select_auth)
    authority_filter = decisions.authorities_name.apply(lambda a: bool(country_authorities & set(a)))

year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)

decision_scope = decisions[authority_filter & year_filter]

## explore monetary sanctions
monetary_decision = decision_scope[decision_scope.monetary_sanction > 0]
# Revenues must be present to be converted to float: drop both empty
# strings and missing values. Copy so the new columns are added to an
# independent frame rather than to a filtered slice.
has_revenues = monetary_decision.org_revenues.notna() & (monetary_decision.org_revenues != "")
monetary_decision = monetary_decision[has_revenues].copy()
monetary_decision['org_revenues'] = monetary_decision.org_revenues.astype(float)
monetary_decision['log10_org_revenues'] = monetary_decision.org_revenues.apply(np.log10)
monetary_decision['log10_monetary_sanction'] = monetary_decision.monetary_sanction.apply(np.log10)
monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)

# Sanction amount against organization revenues, both on log axes.
fig = px.scatter(monetary_decision,
                 x="org_revenues",
                 y="monetary_sanction",
                 log_x=True,
                 log_y=True,
                 template="simple_white",
                 color="authorities_country",
                 trendline="ols",
                 hover_name="org_name")
st.plotly_chart(fig)
st.markdown("Comments...")
st.subheader("1.1 Main takeaways: ")
st.write("sample text.")
|