theolex_streamlit / data_processing.py
Jawad's picture
add training
fe0f3db
raw
history blame
4.1 kB
import streamlit as st
import requests
import pandas as pd
import numpy as np
from data import countries, country_ref
# util functions
def get_id(x):
    """Return the integer id found in the second-to-last ``/`` segment of *x*.

    For a URL that ends with a trailing slash (``.../42/``) this is the
    final path component.

    Raises:
        IndexError: if *x* contains no ``/`` at all.
        ValueError: if the selected segment is not an integer literal.
    """
    # Only the two right-most separators matter, so split from the right.
    segments = x.rsplit("/", 2)
    return int(segments[-2])
def get_dict(df, col):
    """Return column *col* of *df* as an ``{index label: value}`` dictionary."""
    column = df[col]
    return column.to_dict()
def replace_lis_val(df, col):
    """Build a function that translates index labels of *df* into ``df[col]`` values.

    The ``{index label: value}`` lookup table is built once, when this factory
    is called, instead of once per translated element. Changes made to *df*
    after the factory call are therefore not reflected by the returned
    function.

    Args:
        df: DataFrame whose index holds the labels to look up.
        col: Name of the column that supplies the replacement values.

    Returns:
        A callable that takes an iterable of index labels and returns the
        list of matching ``df[col]`` values, in the same order. It raises
        ``KeyError`` for a label that is not present in the index of *df*.
    """
    lookup = df[col].to_dict()

    def _replace(labels):
        return [lookup[label] for label in labels]

    return _replace
def mode(lst):
    """Return the most frequent element of *lst*, or ``None`` when it is empty.

    When several elements share the highest count, the one that appears
    first in *lst* is returned, so the result is the same on every run.
    Elements must be hashable.
    """
    from collections import Counter

    if not lst:
        return None
    # most_common orders equal counts by first occurrence.
    return Counter(lst).most_common(1)[0][0]
@st.cache
def load_data():
    """Download the validated decisions and their linked records as JSON.

    A single request is made to the ``/decisions/`` endpoint, filtered on
    ``status=V`` and asking for the linked violations, organizations and
    authorities to be included. At most 4000 decisions are requested
    (``per_page=4000``).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if connecting or reading takes too long.
    """
    url = "https://www.theolex.io/data"
    validated_filter = "status=V"
    linked_dataset = "include[]=violations.*&include[]=organizations.*&include[]=authorities.*"
    url_d = f"{url}/decisions/?per_page=4000&{validated_filter}&{linked_dataset}"
    # NOTE(security): this API token is committed in clear text. It should be
    # rotated and supplied through configuration (e.g. Streamlit secrets or an
    # environment variable) instead of living in the source.
    headers = {
        'authorization': 'Token 8d55a74628aee8122b7a5a1a51f7caad6d613ec1',
        'accept': 'application/json',
    }
    # (connect, read) timeouts: without them a stalled server blocks the app
    # forever. The read timeout is generous because the payload is large.
    response = requests.get(url_d, headers=headers, timeout=(10, 120))
    # Fail loudly on an HTTP error so an error payload is never parsed and
    # cached as if it were data.
    response.raise_for_status()
    return response.json()
def process_data(data):
    """Turn the raw payload into decision, organization and authority tables.

    Args:
        data: Mapping with ``'decisions'``, ``'organizations'`` and
            ``'authorities'`` entries, each of which can be passed to
            ``pd.DataFrame``.

    Returns:
        A tuple ``(decisions, organizations, authorities)``:

        * ``decisions`` - one row per (validated decision, organization)
          pair, joined with the organization columns (prefixed ``org_``),
          without organizations whose company type is ``"Individual"``, and
          with ``authorities_name`` / ``authorities_country`` added.
        * ``organizations`` - the selected organization columns plus
          ``continent``, all prefixed with ``org_``.
        * ``authorities`` - ``country``, ``type`` and ``name`` indexed by
          the id extracted from each authority URL.
    """
    # --- decisions ---------------------------------------------------------
    decisions = pd.DataFrame(data['decisions'])
    decision_dates = pd.to_datetime(decisions['decision_date'])
    decisions['year'] = decision_dates.dt.year
    decisions['decision_date'] = decision_dates.dt.date
    decisions['monetary_sanction'] = decisions['monetary_sanction'].astype(float)
    decision_col = ['violations', 'authorities', 'organizations', 'country_of_violation', 'type', 'justice_type',
                    'defendant', 'decision_date', 'monetary_sanction', 'nature_de_sanction', 'violation_theme', 'year']
    # Keep validated decisions only, restricted to the columns used downstream.
    decisions = decisions.loc[decisions['status'] == 'V', decision_col]
    # One row per organization referenced by a decision.
    decisions = decisions.explode('organizations')

    # --- organizations -----------------------------------------------------
    organizations = pd.DataFrame(data['organizations'])
    organizations['id'] = organizations['url'].apply(get_id)
    # Normalise the country through the `countries` mapping; values it does
    # not know are kept as they are.
    organizations['country'] = (
        organizations['country'].str.lower().str.strip().apply(lambda v: countries.get(v, v))
    )
    # .copy() so the column added below lands on an independent frame.
    organizations = organizations[
        ["id", "name", "company_type", "revenues", "currency", "country", "lei"]
    ].copy()
    # `country_ref` presumably maps a country to its continent (defined in the
    # `data` module); unknown countries fall back to the country value itself.
    organizations['continent'] = organizations['country'].apply(lambda v: country_ref.get(v, v))
    organizations.columns = ['org_' + col for col in organizations.columns]

    # Inner join: decisions whose organization id is not in `organizations`
    # are dropped.
    decisions = decisions.merge(organizations, left_on='organizations', right_on='org_id')
    # remove Individual
    decisions = decisions[decisions['org_company_type'] != "Individual"].copy()

    # --- authorities -------------------------------------------------------
    authorities = pd.DataFrame(data['authorities'])
    authorities.index = authorities['url'].apply(get_id)
    authorities = authorities[["country", "type", "name"]].copy()
    authorities['country'] = (
        authorities['country'].str.lower().str.strip().apply(lambda v: countries.get(v, v))
    )

    # Translate each decision's list of authority ids into names, and into the
    # most frequent authority country.
    decisions['authorities_name'] = decisions['authorities'].apply(replace_lis_val(authorities, 'name'))
    decisions['authorities_country'] = (
        decisions['authorities'].apply(replace_lis_val(authorities, 'country')).apply(mode)
    )
    return decisions, organizations, authorities
def get_monetary_dataframe(decision_scope):
    """Return the decisions that carry a monetary sanction, with derived metrics.

    Rows are kept when ``monetary_sanction`` is strictly positive and
    ``org_revenues`` is not the empty string. The result is an independent
    copy; *decision_scope* is left untouched.

    Added or converted columns:

    * ``org_revenues`` - converted to float.
    * ``log10_org_revenues`` / ``log10_monetary_sanction`` - base-10 logs.
    * ``same_country`` - whether ``org_country`` equals ``authorities_country``.
    * ``monetary_sanction_rate`` - sanction divided by revenues.
    * ``log10_monetary_sanction_rate`` - base-10 log of that rate.

    Revenues equal to zero are not filtered out, so they produce infinite
    values in the log and rate columns.
    """
    has_sanction = decision_scope['monetary_sanction'] > 0
    has_revenues = decision_scope['org_revenues'] != ""
    # Explicit copy: the columns below must not be written onto a slice of
    # the caller's frame.
    monetary_decision = decision_scope[has_sanction & has_revenues].copy()

    monetary_decision['org_revenues'] = monetary_decision['org_revenues'].astype(float)
    monetary_decision['log10_org_revenues'] = monetary_decision['org_revenues'].apply(np.log10)
    monetary_decision['log10_monetary_sanction'] = monetary_decision['monetary_sanction'].apply(np.log10)
    monetary_decision['same_country'] = (
        monetary_decision['org_country'] == monetary_decision['authorities_country']
    )
    monetary_decision['monetary_sanction_rate'] = (
        monetary_decision['monetary_sanction'] / monetary_decision['org_revenues']
    )
    monetary_decision['log10_monetary_sanction_rate'] = (
        monetary_decision['monetary_sanction_rate'].apply(np.log10)
    )
    return monetary_decision
def get_themes_per_year(monetary_decision):
    """Sum the monetary sanctions for every (year, violation theme) pair.

    Returns a long-format DataFrame with the columns ``year``,
    ``violation_theme`` and ``monetary_sanction``, one row per pair present
    in *monetary_decision*.
    """
    by_year_and_theme = monetary_decision.groupby(['year', 'violation_theme'])
    sanction_totals = by_year_and_theme['monetary_sanction'].sum()
    return sanction_totals.reset_index()