File size: 2,924 Bytes
010fa37
 
 
 
 
 
 
 
a67c43f
010fa37
 
 
 
 
 
 
 
 
 
 
 
a67c43f
 
 
010fa37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2715942
010fa37
 
 
 
 
 
 
 
 
 
 
 
 
2715942
010fa37
 
 
 
a67c43f
 
 
 
 
 
 
 
 
 
 
010fa37
a67c43f
2715942
010fa37
 
 
2715942
010fa37
2715942
010fa37
 
 
 
 
 
 
 
 
a67c43f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
import streamlit as st
import requests
import pandas as pd
from data import headers

st.title("Production scores")
st.sidebar.title("Parameters")
# Source-type filter; 'all' disables filtering.
# NOTE(review): the label says 'Authority country' but the options are US
# authority acronyms (cftc/doj/cfbp/sec) — label may be a misnomer; confirm.
source_type_value = st.sidebar.selectbox('Authority country', ['all', 'cftc', 'doj', 'cfbp', 'sec'])

# Cut-off date: only data sources created on/after this date are kept (used in load_data).
created_at = '2021-01-01'


# load data
@st.cache
def load_data(source_type):
    """Fetch data sources from the theolex API and return per-decision field history.

    Parameters
    ----------
    source_type : str
        Source-type filter (e.g. 'cftc', 'doj', 'cfbp', 'sec');
        'all' disables the filter.

    Returns
    -------
    list[tuple]
        (decision_id, history DataFrame) pairs, one row per history entry;
        decisions whose history is empty are excluded.
    """
    def get_decision_hist(d_id):
        # One request per decision id; history is exposed under the decision.
        url = f"https://www.theolex.io/data/decisions/{d_id}/return_hist/"
        # timeout added so a stuck request cannot hang the Streamlit rerun forever
        res = requests.get(url, headers=headers, timeout=30)
        return res.json()

    # Fix: the original base URL ended with '&' and then '&source_type=...' was
    # appended, producing '?per_page=4000&&source_type=...' (double ampersand).
    url_d = "https://www.theolex.io/data/data_source/?per_page=4000"
    if source_type != 'all':
        url_d = f"{url_d}&source_type={source_type}"
    response = requests.get(url_d, headers=headers, timeout=60)
    data = response.json()
    data_sources = pd.DataFrame(data['data_sources'])

    # filter per date (module-level cut-off)
    data_sources = data_sources[data_sources.created_at >= created_at]

    # get decisions history; drop decisions whose history came back empty
    data_list = [(_id, get_decision_hist(_id)) for _id in data_sources['decision_id']]
    return [(_id, pd.DataFrame(pd.DataFrame(data).fields.to_dict()).T)
            for _id, data in data_list if len(data) > 0]


df_list = load_data(source_type_value)

# Split each decision's history into the rows written by the airflow pipeline
# (status 'P', history_user 45) and the rows validated by a human (status 'V').
# A single pass over df_list fills both mappings.
processed_decisions = {}
validated_decisions = {}
for decision_id, decision in df_list:
    pipeline_rows = decision[(decision.status == 'P') & (decision.history_user == 45)]
    if len(pipeline_rows) > 0:
        processed_decisions[decision_id] = pipeline_rows
    human_rows = decision[(decision.status == 'V')]
    if len(human_rows) > 0:
        validated_decisions[decision_id] = human_rows

# Evaluation scope: decisions that have both a processed and a validated version.
scope = list(set(processed_decisions) & set(validated_decisions))
st.metric(label="Number of elements", value=len(scope))

# Compare the pipeline's last answer with the validator's last answer,
# field by field, for every decision in scope.
all_fields = [
    'monetary_sanction', 'currency', 'justice_type', 'decision_date',
    'defendant', 'monitor', 'nature_de_sanction', 'nature_of_violations',
    'reference', 'type', 'country_of_violation',
]
compare_list = st.sidebar.multiselect('Fields to evaluate',
                                      all_fields, all_fields)

result = {}
details = {}
for decision_id in scope:
    # .iloc[-1] == most recent history entry of each kind
    last_processed = processed_decisions[decision_id].iloc[-1].to_dict()
    last_validated = validated_decisions[decision_id].iloc[-1].to_dict()
    details[decision_id] = {field: (last_processed[field], last_validated[field])
                            for field in compare_list}
    result[decision_id] = {field: last_processed[field] == last_validated[field]
                           for field in compare_list}

# Per-field accuracy: mean of the boolean match matrix, one score per field.
st.subheader("Accuracy scores:")
st.dataframe(pd.DataFrame(result).T.mean())

# Raw (processed, validated) value pairs for manual inspection.
st.subheader("fields results:")
st.json(details)