From c3b34cb3704e4fffcd358e7f7e5b72c91435625f Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Fri, 26 Jul 2019 15:00:49 +0200 Subject: [PATCH 01/20] [UPD] remove py27 & add py37 --- .travis.yml | 36 +++++++++++++++++++++++------------- VERSION.txt | 2 +- docs/contributing.rst | 2 +- docs/history.rst | 5 +++++ docs/installation.rst | 2 +- setup.py | 4 ++-- 6 files changed, 33 insertions(+), 18 deletions(-) diff --git a/.travis.yml b/.travis.yml index 98dbfdd7..d3d76943 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,48 +1,51 @@ language: python matrix: include: - - os: linux - python: '2.7' + - os: linux python: '3.5' - os: linux python: '3.6' + - os: linux + python: '3.7' + - os: osx language: generic - python: '2.7' + python: '3.5' before_install: - brew update - brew install libomp - brew upgrade pyenv - brew install pyenv-virtualenv - - pyenv install 2.7.15 + - pyenv install 3.5.6 - eval "$(pyenv init -)" - - pyenv virtualenv 2.7.15 venv + - pyenv virtualenv 3.5.6 venv - pyenv activate venv - os: osx language: generic - python: '3.5' + python: '3.6' before_install: - brew update - brew install libomp - brew upgrade pyenv - brew install pyenv-virtualenv - - pyenv install 3.5.6 + - pyenv install 3.6.7 - eval "$(pyenv init -)" - - pyenv virtualenv 3.5.6 venv + - pyenv virtualenv 3.6.7 venv - pyenv activate venv - os: osx language: generic - python: '3.6' + python: '3.7' before_install: - brew update - brew install libomp - brew upgrade pyenv - brew install pyenv-virtualenv - - pyenv install 3.6.7 + - pyenv install 3.7.2 - eval "$(pyenv init -)" - - pyenv virtualenv 3.6.7 venv + - pyenv virtualenv 3.7.2 venv - pyenv activate venv + - os: windows language: sh python: '3.5' @@ -55,6 +58,13 @@ matrix: before_install: - choco install python --version 3.6.7 - export PATH="/c/Python36:/c/Python36/Scripts:$PATH" + - os: windows + language: sh + python: '3.7' + before_install: + - choco install python --version 3.7.2 + - export PATH="/c/Python37:/c/Python37/Scripts:$PATH" + install: - pip install coverage - pip install codecov @@ -63,9 +73,9 @@ install: script: - python setup.py install - cd tests -- if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" = "3.6" ] ; then +- if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" = "3.7" ] ; then coverage run -m --source=../mlbox/ pytest; fi -- if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" != "3.6" ] ; then +- if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" != "3.7" ] ; then pytest; fi - if [ "$TRAVIS_OS_NAME" = "osx" ] ; then pytest; fi - if [ "$TRAVIS_OS_NAME" = "windows" ] ; then pytest; fi diff --git a/VERSION.txt b/VERSION.txt index faef31a4..a3df0a69 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -0.7.0 +0.8.0 diff --git a/docs/contributing.rst b/docs/contributing.rst index 01c135d1..b017eea0 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -69,7 +69,7 @@ Ready to contribute? Here's how to set up `mlbox` for local development. 4. Install your local copy into a virtualenv following this commands to set up your fork for local development:: - $ virtualenv mlboxenv --python=python3.6 + $ virtualenv mlboxenv --python=python3.7 $ cd mlboxenv/ $ python setup.py develop diff --git a/docs/history.rst b/docs/history.rst index 4dee6b58..3af86b65 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -85,3 +85,8 @@ History * add tests * improve documentation & examples * minor changes in the package architecture + +0.8.0 (2019-08-01) +------------------ +* remove python 2.7 & add python 3.7 versions +* update package dependencies diff --git a/docs/installation.rst b/docs/installation.rst index d5a773f9..4be11ac7 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -8,7 +8,7 @@ Compatibilities --------------- * *Operating systems:* **Linux**, **MacOS** & **Windows**. -* *Python versions:* **2.7** (except on Windows), **3.5** - **3.6**. & **64-bit version** only (32-bit python is not supported) +* *Python versions:* **3.5** - **3.7**. & **64-bit version** only (32-bit python is not supported) Basic requirements diff --git a/setup.py b/setup.py index d1c3c4b8..e534b3d2 100644 --- a/setup.py +++ b/setup.py @@ -63,9 +63,9 @@ 'Operating System :: Microsoft :: Windows', 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6' + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7' ], test_suite='tests', tests_require=requirements From b6817add95cf2f7f7523470afcabf3d10b0a3f37 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Fri, 26 Jul 2019 15:18:25 +0200 Subject: [PATCH 02/20] [UPD] update requirements for py37 --- requirements.txt | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index b3e920fe..a7f58124 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,12 @@ -numpy>=1.16.3 -scipy==1.2.1 -matplotlib==2.2.4 -hyperopt==0.1 -Keras==2.1.2 -pandas==0.21.0 -joblib==0.11 -scikit-learn==0.19.0 -tensorflow==1.13.1 +numpy==1.16.4 +scipy==1.3.0 +matplotlib==3.0.3 +hyperopt==0.1.2 +Keras==2.2.4 +pandas==0.25.0 +joblib==0.13.2 +scikit-learn==0.21.2 +tensorflow==1.14.0 lightgbm==2.2.3 -networkx==1.11 tables==3.5.2 xlrd==1.2.0 From bec791e4565f2cd83a78e3aca54d9cf3061935f7 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Fri, 26 Jul 2019 15:36:11 +0200 Subject: [PATCH 03/20] [UPD] update requirements for py37 --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index a7f58124..7e4b9997 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ numpy==1.16.4 scipy==1.3.0 -matplotlib==3.0.3 +matplotlib==2.2.4 hyperopt==0.1.2 Keras==2.2.4 -pandas==0.25.0 +pandas==0.24.2 joblib==0.13.2 -scikit-learn==0.21.2 +scikit-learn==0.20.3 tensorflow==1.14.0 lightgbm==2.2.3 tables==3.5.2 From 836ae8dac0e72a2685728ec94e73d8750a740387 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Mon, 29 Jul 2019 10:50:05 +0200 Subject: [PATCH 04/20] [FIX] fix bug neg_log_loss scoring --- examples/classification/classification.py | 2 +- mlbox/optimisation/optimiser.py | 12 ++++++------ tests/test_optimiser.py | 2 +- tests/test_predictor.py | 4 +--- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/classification/classification.py b/examples/classification/classification.py index 60b0d5e0..d8250887 100644 --- a/examples/classification/classification.py +++ b/examples/classification/classification.py @@ -24,7 +24,7 @@ # Tuning # Declare an optimiser. Scoring possibilities for classification lie in : -# {"accuracy", "roc_auc", "f1", "log_loss", "precision", "recall"} +# {"accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"} opt = Optimiser(scoring='accuracy', n_folds=3) opt.evaluate(None, dict) diff --git a/mlbox/optimisation/optimiser.py b/mlbox/optimisation/optimiser.py index 62b481d3..ffdb9387 100644 --- a/mlbox/optimisation/optimiser.py +++ b/mlbox/optimisation/optimiser.py @@ -39,11 +39,11 @@ class Optimiser(): scoring : str, callable or None. default: None A string or a scorer callable object. - If None, "log_loss" is used for classification and + If None, "neg_log_loss" is used for classification and "mean_squared_error" for regression Available scorings for classification : {"accuracy","roc_auc", "f1", - "log_loss", "precision", "recall"} + "neg_log_loss", "precision", "recall"} Available scorings for regression : {"mean_absolute_error", "mean_squared_error","median_absolute_error","r2"} @@ -211,7 +211,7 @@ def evaluate(self, params, df): auc = False if (self.scoring is None): - self.scoring = 'log_loss' + self.scoring = 'neg_log_loss' elif (self.scoring == 'roc_auc'): auc = True @@ -222,12 +222,12 @@ def evaluate(self, params, df): else: if (type(self.scoring) == str): if (self.scoring in ["accuracy", "roc_auc", "f1", - "log_loss", "precision", "recall"]): + "neg_log_loss", "precision", "recall"]): pass else: warnings.warn("Invalid scoring metric. " - "log_loss is used instead.") - self.scoring = 'log_loss' + "neg_log_loss is used instead.") + self.scoring = 'neg_log_loss' else: pass diff --git a/tests/test_optimiser.py b/tests/test_optimiser.py index 0518dedc..6fdbc98c 100644 --- a/tests/test_optimiser.py +++ b/tests/test_optimiser.py @@ -84,7 +84,7 @@ def test_evaluate_classification_optimiser(): assert len(record) == 1 with pytest.warns(UserWarning) as record: score = opt.evaluate(None, dict) - assert opt.scoring == "log_loss" + assert opt.scoring == "neg_log_loss" def test_evaluate_regression_optimiser(): diff --git a/tests/test_predictor.py b/tests/test_predictor.py index eb2b1b1b..6dbedabc 100644 --- a/tests/test_predictor.py +++ b/tests/test_predictor.py @@ -64,9 +64,7 @@ def test_fit_predict_predictor_classification(): space = {'ne__numerical_strategy': {"search": "choice", "space": [0]}, 'ce__strategy': {"search": "choice", - "space": ["label_encoding", - "random_projection", - "entity_embedding"]}, + "space": ["entity_embedding"]}, 'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]}, 'est__max_depth': {"search": "choice", From af9dad3a873c2b15a9a19115f8a709cd2cf34057 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Mon, 29 Jul 2019 11:41:38 +0200 Subject: [PATCH 05/20] [FIX] add feature importances tests --- tests/test_classifier.py | 4 ++++ tests/test_optimiser.py | 4 +--- tests/test_predictor.py | 4 +--- tests/test_regressor.py | 4 ++++ 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/test_classifier.py b/tests/test_classifier.py index a858b0cb..cfc95631 100644 --- a/tests/test_classifier.py +++ b/tests/test_classifier.py @@ -76,6 +76,10 @@ def test_feature_importances_classifier(): classifier.feature_importances() df_train = pd.read_csv("data_for_tests/clean_train.csv") y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) + classifier.set_params(strategy="LightGBM") + classifier.fit(df_train, y_train) + importance = classifier.feature_importances() + assert importance != {} classifier.set_params(strategy="Linear") classifier.fit(df_train, y_train) importance = classifier.feature_importances() diff --git a/tests/test_optimiser.py b/tests/test_optimiser.py index 6fdbc98c..23231c7f 100644 --- a/tests/test_optimiser.py +++ b/tests/test_optimiser.py @@ -148,9 +148,7 @@ def test_evaluate_and_optimise_classification(): space = {'ne__numerical_strategy': {"search": "choice", "space": [0]}, 'ce__strategy': {"search": "choice", - "space": ["label_encoding", - "random_projection", - "entity_embedding"]}, + "space": ["label_encoding"]}, 'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]}, 'est__max_depth': {"search": "choice", diff --git a/tests/test_predictor.py b/tests/test_predictor.py index 6dbedabc..cccc09f4 100644 --- a/tests/test_predictor.py +++ b/tests/test_predictor.py @@ -110,9 +110,7 @@ def test_fit_predict_predictor_regression(mock_show): 'ne__numerical_strategy': {"search": "choice", "space": [0]}, 'ce__strategy': {"search": "choice", - "space": ["label_encoding", - "random_projection", - "entity_embedding"]}, + "space": ["random_projection"]}, 'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]}, 'est__max_depth': {"search": "choice", diff --git a/tests/test_regressor.py b/tests/test_regressor.py index 6a9aea1c..4d7d4501 100644 --- a/tests/test_regressor.py +++ b/tests/test_regressor.py @@ -79,6 +79,10 @@ def test_feature_importances_regressor(): regressor.feature_importances() df_train = pd.read_csv("data_for_tests/clean_train.csv") y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True) + regressor.set_params(strategy="LightGBM") + regressor.fit(df_train, y_train) + importance = regressor.feature_importances() + assert importance != {} regressor.set_params(strategy="Linear") regressor.fit(df_train, y_train) importance = regressor.feature_importances() From 91d3ab919c3f5f97912603460ce64f7d8cd7639b Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Mon, 29 Jul 2019 11:53:45 +0200 Subject: [PATCH 06/20] [FIX] remove brew update --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index d3d76943..207e3ccc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,6 @@ matrix: language: generic python: '3.5' before_install: - - brew update - brew install libomp - brew upgrade pyenv - brew install pyenv-virtualenv @@ -25,7 +24,6 @@ matrix: language: generic python: '3.6' before_install: - - brew update - brew install libomp - brew upgrade pyenv - brew install pyenv-virtualenv @@ -37,7 +35,6 @@ matrix: language: generic python: '3.7' before_install: - - brew update - brew install libomp - brew upgrade pyenv - brew install pyenv-virtualenv From 02b1f125c1f25e01c688191f6297f7aaf798fc79 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Mon, 29 Jul 2019 15:46:16 +0200 Subject: [PATCH 07/20] [FIX] metrics compliance with sklearn==0.20 --- VERSION.txt | 2 +- docs/history.rst | 8 ++++++-- examples/regression/regression.py | 2 +- mlbox/optimisation/optimiser.py | 18 +++++++++--------- requirements.txt | 2 +- 5 files changed, 18 insertions(+), 14 deletions(-) diff --git a/VERSION.txt b/VERSION.txt index a3df0a69..ac39a106 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -0.8.0 +0.9.0 diff --git a/docs/history.rst b/docs/history.rst index 3af86b65..4c779c97 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -86,7 +86,11 @@ History * improve documentation & examples * minor changes in the package architecture -0.8.0 (2019-08-01) +0.8.0 (2019-07-29) ------------------ -* remove python 2.7 & add python 3.7 versions +* remove support for python 2.7 version + +0.9.0 (2019-08-01) +------------------ +* add python 3.7 version * update package dependencies diff --git a/examples/regression/regression.py b/examples/regression/regression.py index 25b80e4c..af54dae8 100644 --- a/examples/regression/regression.py +++ b/examples/regression/regression.py @@ -34,7 +34,7 @@ needs_proba=False) # Declare an optimiser. You can declare your own score # as presented here or use one in -# {"mean_absolute_error", "mean_squared_error","median_absolute_error","r2"} +# {"neg_mean_absolute_error", "neg_mean_squared_error","neg_median_absolute_error","r2"} opt = Optimiser(scoring=mape, n_folds=3) opt.evaluate(None, dict) diff --git a/mlbox/optimisation/optimiser.py b/mlbox/optimisation/optimiser.py index ffdb9387..951f58b7 100644 --- a/mlbox/optimisation/optimiser.py +++ b/mlbox/optimisation/optimiser.py @@ -40,13 +40,13 @@ class Optimiser(): A string or a scorer callable object. If None, "neg_log_loss" is used for classification and - "mean_squared_error" for regression + "neg_mean_squared_error" for regression Available scorings for classification : {"accuracy","roc_auc", "f1", "neg_log_loss", "precision", "recall"} - Available scorings for regression : {"mean_absolute_error", - "mean_squared_error","median_absolute_error","r2"} + Available scorings for regression : {"neg_mean_absolute_error", + "neg_mean_squared_error","neg_median_absolute_error","r2"} n_folds : int, default = 2 The number of folds for cross validation (stratified for classification) @@ -275,18 +275,18 @@ def evaluate(self, params, df): auc = False if (self.scoring is None): - self.scoring = "mean_squared_error" + self.scoring = "neg_mean_squared_error" else: if (type(self.scoring) == str): - if (self.scoring in ["mean_absolute_error", - "mean_squared_error", - "median_absolute_error", + if (self.scoring in ["neg_mean_absolute_error", + "neg_mean_squared_error", + "neg_median_absolute_error", "r2"]): pass else: warnings.warn("Invalid scoring metric. " - "mean_squarred_error is used instead.") - self.scoring = 'mean_squared_error' + "neg_mean_squarred_error is used instead.") + self.scoring = 'neg_mean_squared_error' else: pass diff --git a/requirements.txt b/requirements.txt index 7e4b9997..3d824a34 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -numpy==1.16.4 +numpy==1.17.0 scipy==1.3.0 matplotlib==2.2.4 hyperopt==0.1.2 From 34e8a7cb5ae0565acd04ad171d06cbafb97e8c26 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Mon, 29 Jul 2019 17:19:34 +0200 Subject: [PATCH 08/20] [FIX] add codecov token --- .codecov.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.codecov.yml b/.codecov.yml index dfc378f0..f2f5d301 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -1,12 +1,11 @@ codecov: + token: b03fd907-a5af-4638-b0a8-23075ad380a4 notify: require_ci_to_pass: yes - coverage: precision: 2 round: up range: "50...100" - status: project: default: @@ -19,4 +18,17 @@ coverage: # Be tolerant on slight code coverage diff on PRs to limit # noisy red coverage status on github PRs. target: auto - threshold: 1% \ No newline at end of file + threshold: 1% + changes: no +parsers: + gcov: + branch_detection: + conditional: yes + loop: yes + method: no + macro: no + +comment: + layout: "header, diff" + behavior: default + require_changes: no \ No newline at end of file From f68f07276c8751e8b3f819415dec1ba233d350cd Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Tue, 30 Jul 2019 10:44:59 +0200 Subject: [PATCH 09/20] [UPD] version 0.8.1 --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index ac39a106..6f4eebdf 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -0.9.0 +0.8.1 From c0ddf593ca389d38fdf01b565f638a24e7d60b8d Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Mon, 5 Aug 2019 15:42:26 +0200 Subject: [PATCH 10/20] [UPD] update requirements --- requirements.txt | 6 +++--- tests/.DS_Store | Bin 8196 -> 8196 bytes tests/test_categorical_encoder.py | 13 +++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3d824a34..86e18bdc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ numpy==1.17.0 scipy==1.3.0 -matplotlib==2.2.4 +matplotlib==3.0.3 hyperopt==0.1.2 Keras==2.2.4 -pandas==0.24.2 +pandas==0.25.0 joblib==0.13.2 -scikit-learn==0.20.3 +scikit-learn==0.21.3 tensorflow==1.14.0 lightgbm==2.2.3 tables==3.5.2 diff --git a/tests/.DS_Store b/tests/.DS_Store index 07c30e908b948e4be9f9d12cf40d70112a71e03c..dd9ee90b03fbb69c56c870d15c1765b3dc4a9f3a 100644 GIT binary patch delta 46 zcmZp1XmOa}&nUJrU^hRb*k&Gq5SGoeL~@xYHdt+Dm-x=YSTb2wOk#7O=r*Q_4dwt{ CQV!h! delta 71 zcmZp1XmOa}&nUhzU^hRb_+}n~5EfBhh7yKUhGHNY&ydTI!;r*~&rmV>poj<)W9#NX aks9XB>=NHuCQlL*nj9d;viX+C2POcjTNE+? diff --git a/tests/test_categorical_encoder.py b/tests/test_categorical_encoder.py index 02eec8bb..630a32f7 100644 --- a/tests/test_categorical_encoder.py +++ b/tests/test_categorical_encoder.py @@ -80,3 +80,16 @@ def test_transform_encoder(): encoder.fit(df, df["Survived"]) df_encoded = encoder.transform(df) assert (df.columns == df_encoded.columns).all() + encoder.set_params(strategy="dummification") + encoder.fit(df, df["Survived"]) + df_encoded = encoder.transform(df) + assert type(df_encoded) == pd.SparseDataFrame + encoder.set_params(strategy="random_projection") + encoder.fit(df, df["Survived"]) + df_encoded = encoder.transform(df) + assert type(df_encoded) == pd.DataFrame + encoder.set_params(strategy="entity_embedding") + encoder.fit(df, df["Survived"]) + df_encoded = encoder.transform(df) + assert type(df_encoded) == pd.DataFrame + From 37301322a8d48b33fe1efe5f2255b33fcd80bfb5 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Mon, 5 Aug 2019 15:59:16 +0200 Subject: [PATCH 11/20] [UPD] update tests for categorical encoder --- docs/history.rst | 2 +- tests/test_categorical_encoder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/history.rst b/docs/history.rst index 4c779c97..55913cd1 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -90,7 +90,7 @@ History ------------------ * remove support for python 2.7 version -0.9.0 (2019-08-01) +0.9.0 (2019-08-29) ------------------ * add python 3.7 version * update package dependencies diff --git a/tests/test_categorical_encoder.py b/tests/test_categorical_encoder.py index 630a32f7..523a696e 100644 --- a/tests/test_categorical_encoder.py +++ b/tests/test_categorical_encoder.py @@ -83,7 +83,7 @@ def test_transform_encoder(): encoder.set_params(strategy="dummification") encoder.fit(df, df["Survived"]) df_encoded = encoder.transform(df) - assert type(df_encoded) == pd.SparseDataFrame + assert (type(df_encoded) == pd.SparseDataFrame) | (type(df_encoded) == pd.DataFrame) encoder.set_params(strategy="random_projection") encoder.fit(df, df["Survived"]) df_encoded = encoder.transform(df) From b5959674f4f6fe9e94ee99630c5e0ad22bc1e936 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Tue, 6 Aug 2019 12:11:51 +0200 Subject: [PATCH 12/20] [UPD] virtualenv setup & docs --- .gitignore | 1 + docs/contributing.rst | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index bbbcedf3..299050f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docs/contributing.rst b/docs/contributing.rst index b017eea0..2280adb8 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -59,18 +59,19 @@ Ready to contribute? Here's how to set up `mlbox` for local development. 1. Fork the `mlbox` repo on GitHub. -2. Clone your fork locally:: +2. Clone your fork:: $ git clone git@github.com:your_name_here/mlbox.git -3. If you have virtualenvwrapper install, skip this step. Either, run the following:: +3. If you have virtualenv installed, skip this step. Either, run the following:: $ pip install virtualenv 4. Install your local copy into a virtualenv following this commands to set up your fork for local development:: - $ virtualenv mlboxenv --python=python3.7 - $ cd mlboxenv/ + $ cd MLBox + $ virtualenv env + $ source env/bin/activate $ python setup.py develop If you have any troubles with the setup, please refer to the `installation guide `__ @@ -79,7 +80,9 @@ If you have any troubles with the setup, please refer to the `installation guide $ git checkout -b name-of-your-bugfix-or-feature -Now you can make your changes locally. +**Now you're set, you can make your changes locally.** + +NOTE : each time you work on your branch, you will need to activate the virtualenv: ``source env/bin/activate``. To deactivate it, simply run: ``deactivate``. 6. When you're done making changes, check that your changes pass the tests. From 726982bf32363d1fdbfb6ab6ff80b64bc644b9a5 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Tue, 6 Aug 2019 17:22:09 +0200 Subject: [PATCH 13/20] [UPD] update contributing doc --- docs/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing.rst b/docs/contributing.rst index 2280adb8..097ef64a 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -82,7 +82,7 @@ If you have any troubles with the setup, please refer to the `installation guide **Now you're set, you can make your changes locally.** -NOTE : each time you work on your branch, you will need to activate the virtualenv: ``source env/bin/activate``. To deactivate it, simply run: ``deactivate``. +NOTE : each time you work on your branch, you will need to activate the virtualenv: ``$ source env/bin/activate``. To deactivate it, simply run: ``$ deactivate``. 6. When you're done making changes, check that your changes pass the tests. From 8e68201f733fb5425632d2ae6c35898fb3e46630 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Tue, 6 Aug 2019 17:24:39 +0200 Subject: [PATCH 14/20] =?UTF-8?q?[UPD]=20version=200.8.1=20is=20out=20?= =?UTF-8?q?=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index 6f4eebdf..c18d72be 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -0.8.1 +0.8.1 \ No newline at end of file From 72700eb1c61e6697d1f7a68f1d2c6e12fdb528da Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Fri, 9 Aug 2019 22:38:14 +0200 Subject: [PATCH 15/20] [UPD] pep8 style --- mlbox/optimisation/optimiser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mlbox/optimisation/optimiser.py b/mlbox/optimisation/optimiser.py index 951f58b7..5c24fb3f 100644 --- a/mlbox/optimisation/optimiser.py +++ b/mlbox/optimisation/optimiser.py @@ -451,7 +451,6 @@ def evaluate(self, params, df): return score - def optimise(self, space, df, max_evals=40): """Optimises the Pipeline. From 61e3c2393869b7d8145512bedb8d2a052db82a77 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Tue, 13 Aug 2019 10:57:52 +0200 Subject: [PATCH 16/20] [FIX] first fix for multiclass metric --- examples/regression/regression.py | 2 +- mlbox/optimisation/optimiser.py | 63 +++++++++++++++++++------------ mlbox/preprocessing/reader.py | 8 +++- 3 files changed, 47 insertions(+), 26 deletions(-) diff --git a/examples/regression/regression.py b/examples/regression/regression.py index af54dae8..8a112e87 100644 --- a/examples/regression/regression.py +++ b/examples/regression/regression.py @@ -34,7 +34,7 @@ needs_proba=False) # Declare an optimiser. You can declare your own score # as presented here or use one in -# {"neg_mean_absolute_error", "neg_mean_squared_error","neg_median_absolute_error","r2"} +# {"neg_mean_absolute_error", "neg_mean_squared_error", "neg_mean_squared_log_error", "neg_median_absolute_error","r2"} opt = Optimiser(scoring=mape, n_folds=3) opt.evaluate(None, dict) diff --git a/mlbox/optimisation/optimiser.py b/mlbox/optimisation/optimiser.py index 5c24fb3f..56175bf1 100644 --- a/mlbox/optimisation/optimiser.py +++ b/mlbox/optimisation/optimiser.py @@ -46,7 +46,7 @@ class Optimiser(): "neg_log_loss", "precision", "recall"} Available scorings for regression : {"neg_mean_absolute_error", - "neg_mean_squared_error","neg_median_absolute_error","r2"} + "neg_mean_squared_error", "neg_mean_squared_log_error", "neg_median_absolute_error","r2"} n_folds : int, default = 2 The number of folds for cross validation (stratified for classification) @@ -176,6 +176,10 @@ def evaluate(self, params, df): classes_to_drop = counts[counts < self.n_folds].index mask_to_drop = df['target'].apply(lambda x: x in classes_to_drop) indexes_to_drop = df['target'][mask_to_drop].index + n_classes = len(counts) - len(classes_to_drop) + + if n_classes == 1: + raise ValueError("Your target has not enough classes. You can't run the optimiser") cv = StratifiedKFold(n_splits=self.n_folds, shuffle=True, @@ -208,29 +212,36 @@ def evaluate(self, params, df): # Default scoring for classification - auc = False - if (self.scoring is None): - self.scoring = 'neg_log_loss' - - elif (self.scoring == 'roc_auc'): - auc = True - self.scoring = make_scorer(lambda y_true, y_pred: roc_auc_score(pd.get_dummies(y_true), y_pred), # noqa - greater_is_better=True, - needs_proba=True) + scoring = 'neg_log_loss' # works also for multiclass pb + scoring_func = 'neg_log_loss' else: if (type(self.scoring) == str): if (self.scoring in ["accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"]): - pass + + scoring = self.scoring + + # binary classification + if n_classes <= 2: + scoring_func = self.scoring + + # multiclass classification + else: + + # TODO !!! + else: warnings.warn("Invalid scoring metric. " "neg_log_loss is used instead.") - self.scoring = 'neg_log_loss' + + scoring = 'neg_log_loss' + scoring_func = 'neg_log_loss' else: - pass + scoring = "custom_scoring" + scoring_func = self.scoring ########################################## # Regression @@ -272,23 +283,30 @@ def evaluate(self, params, df): # Default scoring for regression - auc = False - if (self.scoring is None): - self.scoring = "neg_mean_squared_error" + scoring = "neg_mean_squared_error" + scoring_func = "neg_mean_squared_error" + else: if (type(self.scoring) == str): if (self.scoring in ["neg_mean_absolute_error", "neg_mean_squared_error", + "neg_mean_squared_log_error", "neg_median_absolute_error", "r2"]): - pass + + scoring = self.scoring + scoring_func = self.scoring + else: warnings.warn("Invalid scoring metric. " "neg_mean_squarred_error is used instead.") - self.scoring = 'neg_mean_squared_error' + + scoring = 'neg_mean_squared_error' + scoring_func = 'neg_mean_squared_error' else: - pass + scoring = "custom_scoring" + scoring_func = self.scoring else: raise ValueError("Impossible to determine the task. " @@ -411,7 +429,7 @@ def evaluate(self, params, df): scores = cross_val_score(estimator=pp, X=df['train'].drop(indexes_to_drop), y=df['target'].drop(indexes_to_drop), - scoring=self.scoring, + scoring=scoring_func, cv=cv) score = np.mean(scores) @@ -438,12 +456,9 @@ def evaluate(self, params, df): for i, s in enumerate(scores[:-1]): out = out + "fold " + str(i + 1) + " = " + str(s) + ", " - if (auc): - self.scoring = "roc_auc" - if (self.verbose): print("") - print("MEAN SCORE : " + str(self.scoring) + " = " + str(score)) + print("MEAN SCORE : " + str(scoring) + " = " + str(score)) print("VARIANCE : " + str(np.std(scores)) + out + "fold " + str(i + 2) + " = " + str(scores[-1]) + ")") print("CPU time: %s seconds" % (time.time() - start_time)) diff --git a/mlbox/preprocessing/reader.py b/mlbox/preprocessing/reader.py index 36b00cb4..34241a55 100644 --- a/mlbox/preprocessing/reader.py +++ b/mlbox/preprocessing/reader.py @@ -5,6 +5,7 @@ import pickle import os import time +import warnings import numpy as np import pandas as pd from sklearn.preprocessing import LabelEncoder @@ -506,8 +507,9 @@ def train_test_split(self, Lpath, target_name): ############################################################## task = "regression" + count = y_train.nunique() - if (y_train.nunique() <= 2): + if (count <= 2): task = "classification" else: @@ -532,6 +534,10 @@ def train_test_split(self, Lpath, target_name): name=target_name, dtype='int') + if count == 1: + warnings.warn("Your target set has only one class ! Please check it is correct, " + "otherwise there is no need to use MLBox...") + else: if (self.verbose): print(y_train.describe()) From 1178a07b4bc6c3a463a3c68ebb38d88ce2132004 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Tue, 13 Aug 2019 11:31:29 +0200 Subject: [PATCH 17/20] [UPD] remove codecov.yml --- .codecov.yml | 34 --------------------------------- mlbox/optimisation/optimiser.py | 2 +- 2 files changed, 1 insertion(+), 35 deletions(-) delete mode 100644 .codecov.yml diff --git a/.codecov.yml b/.codecov.yml deleted file mode 100644 index f2f5d301..00000000 --- a/.codecov.yml +++ /dev/null @@ -1,34 +0,0 @@ -codecov: - token: b03fd907-a5af-4638-b0a8-23075ad380a4 - notify: - require_ci_to_pass: yes -coverage: - precision: 2 - round: up - range: "50...100" - status: - project: - default: - # Commits pushed to master should not make the overall - # project coverage decrease by more than 1% - target: auto - threshold: 1% - patch: - default: - # Be tolerant on slight code coverage diff on PRs to limit - # noisy red coverage status on github PRs. - target: auto - threshold: 1% - changes: no -parsers: - gcov: - branch_detection: - conditional: yes - loop: yes - method: no - macro: no - -comment: - layout: "header, diff" - behavior: default - require_changes: no \ No newline at end of file diff --git a/mlbox/optimisation/optimiser.py b/mlbox/optimisation/optimiser.py index 56175bf1..37a6c7e9 100644 --- a/mlbox/optimisation/optimiser.py +++ b/mlbox/optimisation/optimiser.py @@ -229,7 +229,7 @@ def evaluate(self, params, df): # multiclass classification else: - + pass # TODO !!! else: From aae8ae7edd0216275723aa28a82d6b409ac49ae5 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Thu, 22 Aug 2019 22:16:49 +0200 Subject: [PATCH 18/20] [FIX] fix metrics issue --- mlbox/optimisation/optimiser.py | 77 +++++++++++++++------------------ 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/mlbox/optimisation/optimiser.py b/mlbox/optimisation/optimiser.py index 37a6c7e9..bb2e107d 100644 --- a/mlbox/optimisation/optimiser.py +++ b/mlbox/optimisation/optimiser.py @@ -10,7 +10,7 @@ from hyperopt import fmin, hp, tpe from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold from sklearn.pipeline import Pipeline -from sklearn.metrics import roc_auc_score, make_scorer +from sklearn.metrics import SCORERS, make_scorer, roc_auc_score from ..encoding.na_encoder import NA_encoder from ..encoding.categorical_encoder import Categorical_encoder @@ -42,11 +42,8 @@ class Optimiser(): If None, "neg_log_loss" is used for classification and "neg_mean_squared_error" for regression - Available scorings for classification : {"accuracy","roc_auc", "f1", - "neg_log_loss", "precision", "recall"} - - Available scorings for regression : {"neg_mean_absolute_error", - "neg_mean_squared_error", "neg_mean_squared_log_error", "neg_median_absolute_error","r2"} + Available scorings can be found in the module sklearn.metrics: + https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules n_folds : int, default = 2 The number of folds for cross validation (stratified for classification) @@ -213,35 +210,41 @@ def evaluate(self, params, df): # Default scoring for classification if (self.scoring is None): - scoring = 'neg_log_loss' # works also for multiclass pb - scoring_func = 'neg_log_loss' + self.scoring = 'neg_log_loss' # works also for multiclass pb else: if (type(self.scoring) == str): - if (self.scoring in ["accuracy", "roc_auc", "f1", - "neg_log_loss", "precision", "recall"]): + if (self.scoring not in list(SCORERS.keys())): + + warnings.warn("Unknown or invalid scoring metric. " + "neg_log_loss is used instead.") + + self.scoring = 'neg_log_loss' - scoring = self.scoring + else: # binary classification if n_classes <= 2: - scoring_func = self.scoring + pass # multiclass classification else: - pass - # TODO !!! + warnings.warn("This is a multiclass problem. Please make sure that your scoring metric is " + "appropriate.") - else: - warnings.warn("Invalid scoring metric. " - "neg_log_loss is used instead.") + if self.scoring+"_weighted" in list(SCORERS.keys()): - scoring = 'neg_log_loss' - scoring_func = 'neg_log_loss' + warnings.warn("Weighted strategy for the scoring metric is used.") + self.scoring = self.scoring + "_weighted" + # specific scenarios + else: + if self.scoring == "roc_auc": + self.scoring = make_scorer(lambda y_true, y_pred: roc_auc_score(pd.get_dummies(y_true), y_pred), # noqa + greater_is_better=True, + needs_proba=True) else: - scoring = "custom_scoring" - scoring_func = self.scoring + pass ########################################## # Regression @@ -284,29 +287,21 @@ def evaluate(self, params, df): # Default scoring for regression if (self.scoring is None): - scoring = "neg_mean_squared_error" - scoring_func = "neg_mean_squared_error" + self.scoring = "neg_mean_squared_error" else: if (type(self.scoring) == str): - if (self.scoring in ["neg_mean_absolute_error", - "neg_mean_squared_error", - "neg_mean_squared_log_error", - "neg_median_absolute_error", - "r2"]): + if (self.scoring not in list(SCORERS.keys())): - scoring = self.scoring - scoring_func = self.scoring + warnings.warn("Unknown or invalid scoring metric. " + "neg_mean_squared_error is used instead.") - else: - warnings.warn("Invalid scoring metric. " - "neg_mean_squarred_error is used instead.") + self.scoring = 'neg_mean_squared_error' - scoring = 'neg_mean_squared_error' - scoring_func = 'neg_mean_squared_error' + else: + pass else: - scoring = "custom_scoring" - scoring_func = self.scoring + pass else: raise ValueError("Impossible to determine the task. " @@ -429,7 +424,7 @@ def evaluate(self, params, df): scores = cross_val_score(estimator=pp, X=df['train'].drop(indexes_to_drop), y=df['target'].drop(indexes_to_drop), - scoring=scoring_func, + scoring=self.scoring, cv=cv) score = np.mean(scores) @@ -444,8 +439,8 @@ def evaluate(self, params, df): if (score == -np.inf): warnings.warn("An error occurred while computing the cross " - "validation mean score. Check the parameter values " - "and your scoring function.") + "validation mean score. Please check that the parameter values are correct " + "and that your scoring function is valid and appropriate to the task.") ########################################## # Reporting scores @@ -458,7 +453,7 @@ def evaluate(self, params, df): if (self.verbose): print("") - print("MEAN SCORE : " + str(scoring) + " = " + str(score)) + print("MEAN SCORE : " + str(self.scoring) + " = " + str(score)) print("VARIANCE : " + str(np.std(scores)) + out + "fold " + str(i + 2) + " = " + str(scores[-1]) + ")") print("CPU time: %s seconds" % (time.time() - start_time)) From 0ace9fe8762999b0fd6ee4d3247211555e700e79 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Fri, 23 Aug 2019 10:45:40 +0200 Subject: [PATCH 19/20] [UPD] update history --- docs/history.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/history.rst b/docs/history.rst index 55913cd1..2cdfa50c 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -90,7 +90,7 @@ History ------------------ * remove support for python 2.7 version -0.9.0 (2019-08-29) +0.8.1 (2019-08-29) ------------------ * add python 3.7 version * update package dependencies From d95d8634f9d178be723561de8e998a3ddf32f806 Mon Sep 17 00:00:00 2001 From: Axel DEROMBLAY Date: Fri, 23 Aug 2019 11:28:03 +0200 Subject: [PATCH 20/20] [UPD] update codecov token --- codecov.yml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 codecov.yml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000..a5e412ae --- /dev/null +++ b/codecov.yml @@ -0,0 +1,34 @@ +codecov: + token: 989a47e4-aa64-4cbd-8516-52d00e1eb129 + notify: + require_ci_to_pass: yes +coverage: + precision: 2 + round: up + range: "50...100" + status: + project: + default: + # Commits pushed to master should not make the overall + # project coverage decrease by more than 1% + target: auto + threshold: 1% + patch: + default: + # Be tolerant on slight code coverage diff on PRs to limit + # noisy red coverage status on github PRs. + target: auto + threshold: 1% + changes: no +parsers: + gcov: + branch_detection: + conditional: yes + loop: yes + method: no + macro: no + +comment: + layout: "header, diff" + behavior: default + require_changes: no \ No newline at end of file