Spaces:

Theolex
/

theolex_streamlit

Runtime error

App Files Files Community

Jawad committed on Nov 22, 2021

Commit

b0e8abd

1 Parent(s): 1678afb

add cross validation

Browse files

Files changed (3) hide show

model.py +33 -16
requirements.txt +19 -1
stream_app.py +11 -3

model.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import numpy as np
 import pandas as pd
 import xgboost as xgb
@@ -29,24 +30,40 @@ def prepare_data(monetary_decision):
 def split(predictors, target):
     predictors_train, predictors_test, target_train, target_test = train_test_split(predictors,
-                                                                          target,
-                                                                          test_size=0.2,
-                                                                          random_state=42)
     return predictors_train, predictors_test, target_train, target_test
-def run_training(predictors_train, predictors_test):
-    data_train = xgb.DMatrix(predictors_train, label=predictors_test, enable_categorical=True)
-    params = {'max_depth': 4,
-             'learning_rate': 0.05,
-             'colsample_bytree': 0.3,
-             'subsample': 0.8,
-             'gamma': 0.5,
-             'objective': 'reg:squarederror'}
-    num_round = 1000
-    #xgb_cv = cv(dtrain=data_train, params=params, nfold=3,
-    #            num_boost_round=1000, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)
-    return xgb.train(params, data_train, num_round)
 def predict(model, predictors):
@@ -55,4 +72,4 @@ def predict(model, predictors):
 def features_importance(model):
-    return pd.Series(model.get_score(importance_type='gain')).sort_values()

+import itertools
 import numpy as np
 import pandas as pd
 import xgboost as xgb
 def split(predictors, target):
+    """Split predictors and target into train and test subsets.
+
+    Calls ``train_test_split`` with ``test_size=0.2`` and a fixed
+    ``random_state=42``.
+
+    Returns:
+        The tuple ``(predictors_train, predictors_test, target_train,
+        target_test)`` exactly as produced by ``train_test_split``.
+    """
     predictors_train, predictors_test, target_train, target_test = train_test_split(predictors,
+                                                                                    target,
+                                                                                    test_size=0.2,
+                                                                                    random_state=42)
     return predictors_train, predictors_test, target_train, target_test
+def run_cv_training(predictors_train, target_train):
+    """Grid-search XGBoost hyper-parameters with cross-validation.
+
+    Every combination of learning rate, maximum tree depth and
+    ``colsample_bytree`` ratio is scored with 2-fold cross-validation on
+    RMSE, using early stopping after 3 rounds without improvement.
+
+    Args:
+        predictors_train: Training features, passed to ``xgb.DMatrix``.
+        target_train: Training labels, passed to ``xgb.DMatrix`` as ``label``.
+
+    Returns:
+        A tuple ``(cv_table, best_params)``. ``cv_table`` places the last 10
+        cross-validation rows of every combination side by side, with each
+        column prefixed by ``"<eta>_<max_depth>_<col_num>_"``.
+        ``best_params`` is ``(best_rmse, params, num_rounds)`` for the
+        combination with the lowest final mean test RMSE.
+    """
+    data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
+    cv_tails = []
+    # Start from +inf so the first evaluated combination always becomes the
+    # incumbent; a finite sentinel would leave ``params`` empty whenever
+    # every RMSE exceeded it.
+    best_params = (float("inf"), {}, 10)
+    for eta, max_depth, col_num in itertools.product([0.05, 0.01], [10, 15], [0.3, 0.8]):
+        prefix = f"{eta}_{max_depth}_{col_num}"
+        params = {
+            'learning_rate': eta,
+            'max_depth': max_depth,
+            'colsample_bytree': col_num,
+            # 'gamma' is left at the library default (the 0.5 setting is disabled).
+            'subsample': 0.8,
+            'objective': 'reg:squarederror'}
+        # Call through the ``xgb`` module so no separate ``cv`` import is needed.
+        cv_results = xgb.cv(dtrain=data_train, params=params, nfold=2,
+                            num_boost_round=1000, early_stopping_rounds=3, metrics="rmse", as_pandas=True, seed=123)
+        best_value = float(cv_results['test-rmse-mean'].values[-1])
+        # The result index is zero-based, so the number of boosting rounds
+        # kept is the row count, not the last index label.
+        best_round = len(cv_results)
+        cv_tails.append(
+            cv_results.rename(columns={col: f'{prefix}_{col}' for col in cv_results.columns}).tail(10).reset_index())
+        if best_value < best_params[0]:
+            best_params = (best_value, params, best_round)
+    return pd.concat(cv_tails, axis=1), best_params
+def run_training(predictors_train, target_train, params, num_rounds):
+    """Train an XGBoost model on the training data.
+
+    Args:
+        predictors_train: Training features, wrapped in an ``xgb.DMatrix``.
+        target_train: Training labels, passed as the DMatrix ``label``.
+        params: Parameter dictionary forwarded to ``xgb.train``.
+        num_rounds: Number of boosting rounds forwarded to ``xgb.train``.
+
+    Returns:
+        Whatever ``xgb.train`` returns for the given parameters and data.
+    """
+    data_train = xgb.DMatrix(predictors_train, label=target_train, enable_categorical=True)
+    return xgb.train(params, data_train, num_rounds)
 def predict(model, predictors):
 def features_importance(model):
+    """Return the model's 'gain' importance scores as a sorted pandas Series.
+
+    The scores come from ``model.get_score(importance_type='gain')`` and are
+    ordered with ``sort_values()`` (ascending by default).
+    """
+    return pd.Series(model.get_score(importance_type='gain')).sort_values()

requirements.txt CHANGED Viewed

@@ -12,6 +12,7 @@ certifi==2021.5.30
 cffi==1.14.6
 charset-normalizer==2.0.6
 click==7.1.2
 cycler==0.10.0
 debugpy==1.5.0
 decorator==5.1.0
@@ -20,12 +21,14 @@ entrypoints==0.3
 gitdb==4.0.7
 GitPython==3.1.24
 idna==3.2
 ipykernel==6.4.1
 ipython==7.28.0
 ipython-genutils==0.2.0
 ipywidgets==7.6.5
 jedi==0.18.0
 Jinja2==3.0.2
 jsonschema==4.0.1
 jupyter-client==7.0.6
 jupyter-core==4.8.1
@@ -51,31 +54,45 @@ pexpect==4.8.0
 pickleshare==0.7.5
 Pillow==8.3.2
 plotly==5.3.1
 prometheus-client==0.11.0
 prompt-toolkit==3.0.20
 protobuf==3.18.1
 ptyprocess==0.7.0
 pyarrow==5.0.0
 pycparser==2.20
 pydeck==0.7.0
 Pygments==2.10.0
 pyparsing==2.4.7
 pyrsistent==0.18.0
 python-dateutil==2.8.2
 pytz==2021.3
 pyzmq==22.3.0
 requests==2.26.0
 scipy==1.7.1
 seaborn==0.11.2
 Send2Trash==1.8.0
 six==1.16.0
 smmap==4.0.0
 statsmodels==0.13.0
-streamlit==0.89.0
 tenacity==8.0.1
 terminado==0.12.1
 testpath==0.5.0
 toml==0.10.2
 toolz==0.11.1
 tornado==6.1
 traitlets==5.1.0
@@ -87,3 +104,4 @@ watchdog==2.1.6
 wcwidth==0.2.5
 webencodings==0.5.1
 widgetsnbextension==3.5.1

 cffi==1.14.6
 charset-normalizer==2.0.6
 click==7.1.2
+coverage==6.1.1
 cycler==0.10.0
 debugpy==1.5.0
 decorator==5.1.0
 gitdb==4.0.7
 GitPython==3.1.24
 idna==3.2
+iniconfig==1.1.1
 ipykernel==6.4.1
 ipython==7.28.0
 ipython-genutils==0.2.0
 ipywidgets==7.6.5
 jedi==0.18.0
 Jinja2==3.0.2
+joblib==1.1.0
 jsonschema==4.0.1
 jupyter-client==7.0.6
 jupyter-core==4.8.1
 pickleshare==0.7.5
 Pillow==8.3.2
 plotly==5.3.1
+pluggy==1.0.0
+pprintpp==0.4.0
 prometheus-client==0.11.0
 prompt-toolkit==3.0.20
 protobuf==3.18.1
 ptyprocess==0.7.0
+py==1.10.0
 pyarrow==5.0.0
+pycountry==20.7.3
+pycountry-convert==0.7.2
 pycparser==2.20
 pydeck==0.7.0
 Pygments==2.10.0
+Pympler==0.9
 pyparsing==2.4.7
 pyrsistent==0.18.0
+pytest==6.2.5
+pytest-cov==3.0.0
+pytest-mock==3.6.1
 python-dateutil==2.8.2
 pytz==2021.3
 pyzmq==22.3.0
+repoze.lru==0.7
 requests==2.26.0
+scikit-learn==1.0.1
 scipy==1.7.1
 seaborn==0.11.2
 Send2Trash==1.8.0
 six==1.16.0
+sklearn==0.0
 smmap==4.0.0
 statsmodels==0.13.0
+streamlit==1.2.0
 tenacity==8.0.1
 terminado==0.12.1
 testpath==0.5.0
+threadpoolctl==3.0.0
 toml==0.10.2
+tomli==1.2.2
 toolz==0.11.1
 tornado==6.1
 traitlets==5.1.0
 wcwidth==0.2.5
 webencodings==0.5.1
 widgetsnbextension==3.5.1
+xgboost==1.5.0

stream_app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import plotly.figure_factory as ff
 import scipy
 import numpy as np
 from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
-from model import prepare_data, run_training, split, predict, features_importance
 def _max_width_():
@@ -57,7 +57,7 @@ decision_scope = decisions[authority_filter & year_filter]
 st.subheader("Dataset Description")
-st.metric('Number of validated decisions liked to organisations (and not individuals)', decision_scope.shape[0])
 st.metric('Decisions with monetary sanctions',
           decision_scope[decision_scope.monetary_sanction > 0].shape[0])
@@ -184,7 +184,15 @@ if st.button('Run training'):
         st.metric(label="Training size", value=predictors_train.shape[0])
         st.metric(label="Test size", value=predictors_test.shape[0])
-        xgb_model = run_training(predictors_train, target_train)
         # evaluate model error
         target_train_predicted = predict(xgb_model, predictors_train)

 import scipy
 import numpy as np
 from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
+from model import prepare_data, run_training, split, predict, features_importance, run_cv_training
 def _max_width_():
 st.subheader("Dataset Description")
+st.metric('Number of validated decisions linked to organisations (and not individuals)', decision_scope.shape[0])
 st.metric('Decisions with monetary sanctions',
           decision_scope[decision_scope.monetary_sanction > 0].shape[0])
         st.metric(label="Training size", value=predictors_train.shape[0])
         st.metric(label="Test size", value=predictors_test.shape[0])
+        #run cross validation
+        st.subheader("Cross validation error")
+        xgb_cv, best_params = run_cv_training(predictors_train, target_train)
+        print(best_params)
+        st.json(best_params)
+        xgb_cv.to_csv('cv_results.csv')
+        st.line_chart(xgb_cv[[col for col in xgb_cv.columns if "mean" in col]])
+        xgb_model = run_training(predictors_train, target_train, best_params[1], best_params[2])
         # evaluate model error
         target_train_predicted = predict(xgb_model, predictors_train)