Jawad committed on
Commit
cfd4139
·
1 Parent(s): 196857c
Files changed (3) hide show
  1. data_processing.py +73 -0
  2. requirements.txt +84 -2
  3. stream_app.py +51 -106
data_processing.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ import pandas as pd
4
+
5
+
6
+ # util functions
7
def get_id(x):
    """Return the integer id stored in the last path segment of a resource URL.

    Trailing slashes are ignored, so ``".../decisions/12/"`` and
    ``".../decisions/12"`` both yield ``12``.

    Args:
        x: URL (or path) string whose final non-empty segment is the id.

    Returns:
        The id as an ``int``.

    Raises:
        ValueError: If the final segment is not an integer.
    """
    # Strip trailing "/" first so the id is always the last segment,
    # whether or not the URL ends with a slash.
    return int(x.rstrip("/").rsplit("/", 1)[-1])
9
+
10
+
11
def get_dict(df, col):
    """Return column ``col`` of ``df`` as a ``{index label: value}`` dict."""
    return df[col].to_dict()


def replace_lis_val(df, col):
    """Build a function that maps a list of index labels to ``df[col]`` values.

    The lookup table is built once, when this factory is called, instead of
    once per element of every list passed to the returned function.

    Args:
        df: DataFrame whose index holds the labels to look up.
        col: Name of the column providing the replacement values.

    Returns:
        A function taking an iterable of index labels and returning the list
        of matching ``df[col]`` values, in the same order. It raises
        ``KeyError`` for a label that is not in ``df``'s index.
    """
    lookup = get_dict(df, col)

    def _replace(labels):
        return [lookup[label] for label in labels]

    return _replace
20
+
21
+
22
def mode(lst):
    """Return the most frequent element of ``lst``.

    Ties are resolved in favour of the value that appears first in ``lst``,
    so the result is the same on every run.

    Args:
        lst: List of hashable values; may be empty or ``None``.

    Returns:
        The most common value, or ``None`` when ``lst`` is empty or ``None``.
    """
    from collections import Counter  # local import keeps the block self-contained

    if not lst:
        return None
    # Counter keeps first-seen order, and most_common() returns the first
    # entry among equal counts, so ties go to the earliest value.
    return Counter(lst).most_common(1)[0][0]
25
+
26
+
27
@st.cache
def load_data():
    """Fetch validated decisions and their linked records from the Theolex API.

    Requests up to 4000 decisions with ``status=V`` and asks the API to
    include the related violations, organizations and authorities.

    The API token is read from the ``THEOLEX_API_TOKEN`` environment variable
    when it is set; otherwise the token committed with this file is used.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status. Raising
            here also keeps ``st.cache`` from memoising an error payload.
        requests.Timeout: If the server does not answer in time.
    """
    import os  # local import keeps the block self-contained

    url = "https://www.theolex.io/data"
    validated_filter = "status=V"
    linked_dataset = "include[]=violations.*&include[]=organizations.*&include[]=authorities.*"
    url_d = f"{url}/decisions/?per_page=4000&{validated_filter}&{linked_dataset}"

    # NOTE(review): a token committed to source control should be treated as
    # leaked. Rotate it, supply the new one via THEOLEX_API_TOKEN, and delete
    # this fallback.
    token = os.environ.get("THEOLEX_API_TOKEN") or "8d55a74628aee8122b7a5a1a51f7caad6d613ec1"

    response = requests.get(
        url_d,
        headers={'authorization': f'Token {token}',
                 'accept': 'application/json'},
        # (connect, read) seconds; without a timeout a stalled server hangs the app.
        timeout=(10, 120),
    )
    # Fail loudly on auth or server errors instead of returning the error body.
    response.raise_for_status()

    return response.json()
38
+
39
+
40
def process_data(data):
    """Turn the raw API payload into three analysis-ready DataFrames.

    Args:
        data: Mapping with ``'decisions'``, ``'organizations'`` and
            ``'authorities'`` entries, each convertible to a DataFrame
            (presumably the JSON returned by ``load_data``).

    Returns:
        A ``(decisions, organizations, authorities)`` tuple:

        * ``decisions`` has one row per (decision, organization) pair for
          decisions with status ``'V'``, joined with the ``org_*`` columns.
          Rows whose organization is an ``"Individual"`` are removed, and
          ``authorities_name`` / ``authorities_country`` are added.
        * ``organizations`` holds the selected organization columns, each
          prefixed with ``org_``.
        * ``authorities`` is indexed by authority id with ``country``,
          ``type`` and ``name`` columns; ``country`` is lower-cased.
    """
    # work on decisions
    decisions = pd.DataFrame(data['decisions'])
    parsed_dates = pd.to_datetime(decisions['decision_date'])  # parse once, reuse below
    decisions['year'] = parsed_dates.dt.year
    decisions['monetary_sanction'] = decisions.monetary_sanction.astype(float)
    # .copy() so the assignment below writes to an independent frame rather
    # than a slice of the original (avoids SettingWithCopyWarning).
    decisions = decisions[decisions.status == 'V'].copy()
    decisions['decision_date'] = parsed_dates.dt.date  # aligned on the index
    decision_col = ['violations', 'authorities', 'organizations', 'country_of_violation', 'type', 'justice_type',
                    'defendant', 'decision_date', 'monetary_sanction', 'nature_de_sanction', 'violation_theme', 'year']
    decisions = decisions[decision_col]
    # one row per organization involved in a decision
    decisions = decisions.explode('organizations')

    # work on organisations
    organizations = pd.DataFrame(data['organizations'])
    organizations['id'] = organizations.url.apply(get_id)
    organizations = organizations[["id", "name", "company_type", "revenues", "currency", "country", "lei"]]
    organizations = organizations.add_prefix('org_')
    # Inner join (the pandas default): decisions without a matching
    # organization are dropped.
    decisions = decisions.merge(organizations, left_on='organizations', right_on='org_id')

    # remove Individual
    decisions = decisions[decisions.org_company_type != "Individual"].copy()
    decisions['org_country'] = decisions.org_country.str.lower().str.strip()

    # work on authorities
    authorities = pd.DataFrame(data['authorities'])
    authorities.index = authorities.url.apply(get_id)
    authorities = authorities[["country", "type", "name"]].copy()
    # Expand known country codes to full names; other values pass through.
    countries = {'FR': 'france', 'US': 'United States', 'UK': 'United Kingdom', 'GE': 'Germany'}
    authorities['country'] = authorities.country.apply(lambda v: countries.get(v, v)).str.lower().str.strip()

    # Each decision carries a list of authority ids: map them to names, and
    # keep the most frequent authority country per decision.
    decisions['authorities_name'] = decisions.authorities.apply(replace_lis_val(authorities, 'name'))
    decisions['authorities_country'] = decisions.authorities.apply(replace_lis_val(authorities, 'country')).apply(mode)

    return decisions, organizations, authorities
requirements.txt CHANGED
@@ -1,7 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  numpy==1.19.5
 
2
  pandas==1.2.4
 
 
 
 
 
 
3
  plotly==5.3.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  seaborn==0.11.2
 
 
 
 
5
  streamlit==0.89.0
6
-
7
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==4.1.0
2
+ argon2-cffi==21.1.0
3
+ astor==0.8.1
4
+ attrs==21.2.0
5
+ backcall==0.2.0
6
+ backports.zoneinfo==0.2.1
7
+ base58==2.1.0
8
+ bleach==4.1.0
9
+ blinker==1.4
10
+ cachetools==4.2.4
11
+ certifi==2021.5.30
12
+ cffi==1.14.6
13
+ charset-normalizer==2.0.6
14
+ click==7.1.2
15
+ cycler==0.10.0
16
+ debugpy==1.5.0
17
+ decorator==5.1.0
18
+ defusedxml==0.7.1
19
+ entrypoints==0.3
20
+ gitdb==4.0.7
21
+ GitPython==3.1.24
22
+ idna==3.2
23
+ ipykernel==6.4.1
24
+ ipython==7.28.0
25
+ ipython-genutils==0.2.0
26
+ ipywidgets==7.6.5
27
+ jedi==0.18.0
28
+ Jinja2==3.0.2
29
+ jsonschema==4.0.1
30
+ jupyter-client==7.0.6
31
+ jupyter-core==4.8.1
32
+ jupyterlab-pygments==0.1.2
33
+ jupyterlab-widgets==1.0.2
34
+ kiwisolver==1.3.2
35
+ MarkupSafe==2.0.1
36
+ matplotlib==3.4.3
37
+ matplotlib-inline==0.1.3
38
+ mistune==0.8.4
39
+ nbclient==0.5.4
40
+ nbconvert==6.2.0
41
+ nbformat==5.1.3
42
+ nest-asyncio==1.5.1
43
+ notebook==6.4.4
44
  numpy==1.19.5
45
+ packaging==21.0
46
  pandas==1.2.4
47
+ pandocfilters==1.5.0
48
+ parso==0.8.2
49
+ patsy==0.5.2
50
+ pexpect==4.8.0
51
+ pickleshare==0.7.5
52
+ Pillow==8.3.2
53
  plotly==5.3.1
54
+ prometheus-client==0.11.0
55
+ prompt-toolkit==3.0.20
56
+ protobuf==3.18.1
57
+ ptyprocess==0.7.0
58
+ pyarrow==5.0.0
59
+ pycparser==2.20
60
+ pydeck==0.7.0
61
+ Pygments==2.10.0
62
+ pyparsing==2.4.7
63
+ pyrsistent==0.18.0
64
+ python-dateutil==2.8.2
65
+ pytz==2021.3
66
+ pyzmq==22.3.0
67
+ requests==2.26.0
68
+ scipy==1.7.1
69
  seaborn==0.11.2
70
+ Send2Trash==1.8.0
71
+ six==1.16.0
72
+ smmap==4.0.0
73
+ statsmodels==0.13.0
74
  streamlit==0.89.0
75
+ tenacity==8.0.1
76
+ terminado==0.12.1
77
+ testpath==0.5.0
78
+ toml==0.10.2
79
+ toolz==0.11.1
80
+ tornado==6.1
81
+ traitlets==5.1.0
82
+ typing-extensions==3.10.0.2
83
+ tzlocal==3.0
84
+ urllib3==1.26.7
85
+ validators==0.18.2
86
+ watchdog==2.1.6
87
+ wcwidth==0.2.5
88
+ webencodings==0.5.1
89
+ widgetsnbextension==3.5.1
stream_app.py CHANGED
@@ -1,110 +1,55 @@
1
- from os import write
2
  import streamlit as st
 
3
  import numpy as np
4
- from datetime import datetime
5
- import pandas as pd
6
  import plotly.express as px
7
- from plotly.subplots import make_subplots
8
- import plotly.graph_objects as go
9
- import matplotlib.pyplot as plt
10
- from streamlit.elements.arrow_altair import ChartType
11
- from streamlit.type_util import data_frame_to_bytes
12
-
13
-
14
- #The purpose of this script is to create interactive graphs
15
- #on an application through streamlit and plotly. These graphs will make it possible to
16
- #To answer certain questions about the link between the variables
17
- #Principle to see the distribution of fines according Define the variable that allows filtering to select boxes the variable that allows filtering to select boxes the types of violations, the countries of jurisdiction ...
18
-
19
- #Define the title of the application
20
- st.title('Data exploration & Visualization')
21
-
22
- #Description of the application
23
- st.markdown('This application allows you to create interactive charts with streamlit in order to answer a number of data-related questions and to better highlight the information contained in the dataset.')
24
-
25
- #Read the data from a cvs file
26
- df = pd.read_csv("theolex.csv")
27
-
28
- # Get the max year from date
29
- df.decision_date = pd.to_datetime(df.decision_date)
30
- max_year = df.decision_date.max().year
31
-
32
-
33
- #Define the variable that allows filtering to select boxes
34
- d = st.select_slider('Select year interval', options=range(max_year-10, max_year))
35
-
36
- # Plot a bar_chart graph to show the amount monetary sanction per organization
37
- df = df.set_index('name').sort_values('monetary_sanction_usd', ascending=False)
38
- st.header('Amount of monetary sanction per oranization name')
39
-
40
-
41
- st.bar_chart(df[df.decision_date.dt.year <= d].head(10)['monetary_sanction_usd'], height=500)
42
- st.markdown('From this graph we can find the companies that have the highest amount of monetary sanction for selected year.')
43
-
44
-
45
- #---------------------------------------------------------------------------------------
46
- # Plot a bar_chart graph to show the amount monetary sanction per nature of violation
47
-
48
- st.header('Amount of monetary sanction per nature of violation')
49
-
50
- df = df.set_index('nature_of_violations').sort_values('monetary_sanction_usd', ascending=False)
51
-
52
- st.bar_chart(df[df.decision_date.dt.year <= d].head(10)['monetary_sanction_usd'], height=500)
53
-
54
-
55
-
56
-
57
-
58
- #----------------------------------------------------------------------------------------------------#
59
- # Plot a bar_chart graph to show the amount monetary sanction per country of the authorities
60
-
61
- df_filter = pd.read_csv("monetary_year.csv")
62
- df_filter.decision_date = pd.to_datetime(df_filter.decision_date)
63
- st.header('Amount of monetary sanction per country of the Authorities')
64
-
65
- df_filter = df_filter.set_index('country_authorities').sort_values('monetary_sanction', ascending=False)
66
-
67
- st.bar_chart(df_filter[df_filter.decision_date.dt.year <= d].head(10)['monetary_sanction'], height=500)
68
- #--------------------------------------------------------------------------------------------------
69
-
70
- #Here we want to see for a given country, how does the amount of monetary sanction changes per year
71
-
72
- d = st.selectbox('Select the country of the organisation', options=list(df_filter['country_organisation'].unique()))
73
- df_filter.decision_date = pd.to_datetime(df_filter.decision_date)
74
- #df_filter.year=df_filter.decision_date.dt.year
75
-
76
- df_filter = df_filter.set_index('year').sort_values('monetary_sanction', ascending=True)
77
-
78
- st.header('Amount of monetary sanction per year for a selected country of organisation')
79
-
80
- st.line_chart(df_filter[df_filter.country_organisation == d].head(10)['monetary_sanction'])
81
- st.markdown('From this grap shows the urve of the monetary sanction from 2010 to 2020 for eac contry. For example, if we check for US we can see that the amount monetary sanction increase from 2012 to to 2019 and decreases in 2019.')
82
-
83
- #-----------------------------------------------------------------------------------------------------
84
-
85
-
86
-
87
- # st.header('Amount of monetary sanction compare to the revenue of the company')
88
-
89
- # #df_filter = df_filter.sort_values('revenues_organisation', ascending=False)
90
-
91
- # df_filter = df_filter.set_index('revenues_organisation')
92
-
93
- # #st.area_chart(df_filter.head(10)['monetary_sanction'])
94
-
95
- # df_agg = df_filter.groupby('year').agg({'monetary_sanction': 'count'})
96
- # st.bar_chart(df_agg)
97
-
98
-
99
- #df_agg=df_filter.groupby('year').agg({'monetary_sanction': 'count'})
100
- st.header('The number of monetary sanction per country and per year')
101
-
102
- df_agg = pd.read_csv("agg_countMt.csv")
103
- d = st.select_slider('Select year interval', options=list(df_agg['year'].unique()))
104
- df_agg= df_agg.set_index('country_organisation')
105
-
106
- df_agg.columns=['year', 'num_monetary_sanction']
107
- st.bar_chart(df_agg[df_agg.year == d].head(10)['num_monetary_sanction'])
108
 
109
- #df_agg
110
- #ratio entre le revenue et le montant des sanction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
  import streamlit as st
3
+
4
  import numpy as np
 
 
5
  import plotly.express as px
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ from data_processing import load_data, process_data
8
+
9
st.title("Data Analysis 🌎 📃")
st.write("by [Teolex](https://www.theolex.io/)")

# load and process data
data = load_data()
decisions, organizations, authorities = process_data(data)

# sidebar: choose an authority country, optionally one authority, and a year range
st.sidebar.title("Parameters")
authorities_country = st.sidebar.selectbox('Authority country', authorities.country.unique())

select_auth = authorities[authorities.country == authorities_country].name.sort_values()
authority = st.sidebar.selectbox('Authority', ['All', *select_auth])

min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2010, 2021))

# apply filters
if authority != 'All':
    # keep decisions issued by the selected authority
    authority_filter = decisions.authorities_name.apply(lambda a: authority in a)
else:
    # keep decisions issued by any authority of the selected country;
    # the set is built once instead of once per row
    country_authorities = set(select_auth)
    authority_filter = decisions.authorities_name.apply(lambda a: bool(country_authorities & set(a)))

year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)

decision_scope = decisions[authority_filter & year_filter]

## explore monetary sanctions
monetary_decision = decision_scope[decision_scope.monetary_sanction > 0]
# .copy() so the derived columns below are written to an independent frame,
# not to a slice of `decisions` (avoids SettingWithCopyWarning).
monetary_decision = monetary_decision[monetary_decision.org_revenues != ""].copy()
monetary_decision['org_revenues'] = monetary_decision.org_revenues.astype(float)
monetary_decision['log10_org_revenues'] = monetary_decision.org_revenues.apply(np.log10)
monetary_decision['log10_monetary_sanction'] = monetary_decision.monetary_sanction.apply(np.log10)
monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)

# sanction amount against organization revenue, both on log axes,
# with an OLS trendline per authority country
fig = px.scatter(monetary_decision,
                 x="org_revenues",
                 y="monetary_sanction",
                 log_x=True,
                 log_y=True,
                 template="simple_white",
                 color="authorities_country",
                 trendline="ols",
                 hover_name="org_name")
st.plotly_chart(fig)
st.markdown("Comments...")
st.subheader("1.1 Main takeaways: ")
st.write("sample text.")