Merge pull request #79 from AxeldeRomblay/0.8.1
release 0.8.1
AxeldeRomblay authored Aug 25, 2019
2 parents eb6923c + 68e5835 commit 4ad2f66
Showing 19 changed files with 133 additions and 78 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
26 changes: 21 additions & 5 deletions .travis.yml
@@ -1,12 +1,12 @@
language: python
matrix:
include:

- os: linux
python: '3.5'
- os: linux
python: '3.6'

- os: linux
python: '3.7'
- os: osx
language: generic
python: '3.5'
@@ -29,7 +29,17 @@ matrix:
- eval "$(pyenv init -)"
- pyenv virtualenv 3.6.7 venv
- pyenv activate venv

- os: osx
language: generic
python: '3.7'
before_install:
- brew install libomp
- brew upgrade pyenv
- brew install pyenv-virtualenv
- pyenv install 3.7.2
- eval "$(pyenv init -)"
- pyenv virtualenv 3.7.2 venv
- pyenv activate venv
- os: windows
language: sh
python: '3.5'
@@ -42,6 +52,12 @@ matrix:
before_install:
- choco install python --version 3.6.7
- export PATH="/c/Python36:/c/Python36/Scripts:$PATH"
- os: windows
language: sh
python: '3.7'
before_install:
- choco install python --version 3.7.2
- export PATH="/c/Python37:/c/Python37/Scripts:$PATH"
install:
- pip install coverage
- pip install codecov
@@ -50,9 +66,9 @@ install:
script:
- python setup.py install
- cd tests
- if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" = "3.6" ] ; then
- if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" = "3.7" ] ; then
coverage run -m --source=../mlbox/ pytest; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" != "3.6" ] ; then
- if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$TRAVIS_PYTHON_VERSION" != "3.7" ] ; then
pytest; fi
- if [ "$TRAVIS_OS_NAME" = "osx" ] ; then pytest; fi
- if [ "$TRAVIS_OS_NAME" = "windows" ] ; then pytest; fi
2 changes: 1 addition & 1 deletion VERSION.txt
@@ -1 +1 @@
0.8.0
0.8.1
2 changes: 1 addition & 1 deletion .codecov.yml → codecov.yml
@@ -1,5 +1,5 @@
codecov:
token: b03fd907-a5af-4638-b0a8-23075ad380a4
token: 989a47e4-aa64-4cbd-8516-52d00e1eb129
notify:
require_ci_to_pass: yes
coverage:
13 changes: 8 additions & 5 deletions docs/contributing.rst
@@ -59,18 +59,19 @@ Ready to contribute? Here's how to set up `mlbox` for local development.

1. Fork the `mlbox` repo on GitHub.

2. Clone your fork locally::
2. Clone your fork::

$ git clone git@github.com:your_name_here/mlbox.git

3. If you have virtualenvwrapper install, skip this step. Either, run the following::
3. If you have virtualenv installed, skip this step. Otherwise, run the following::

$ pip install virtualenv
4. Install your local copy into a virtualenv by following these commands to set up your fork for local development::

$ virtualenv mlboxenv --python=python3.6
$ cd mlboxenv/
$ cd MLBox
$ virtualenv env
$ source env/bin/activate
$ python setup.py develop

If you have any troubles with the setup, please refer to the `installation guide <https://mlbox.readthedocs.io/en/latest/installation.html>`__
@@ -79,7 +80,9 @@ If you have any troubles with the setup, please refer to the `installation guide

$ git checkout -b name-of-your-bugfix-or-feature

Now you can make your changes locally.
**Now you're set; you can make your changes locally.**

NOTE: each time you work on your branch, you will need to activate the virtualenv: ``$ source env/bin/activate``. To deactivate it, simply run: ``$ deactivate``.

6. When you're done making changes, check that your changes pass the tests.

7 changes: 6 additions & 1 deletion docs/history.rst
@@ -88,4 +88,9 @@ History

0.8.0 (2019-07-29)
------------------
* remove support for python 2.7 version
* remove support for python 2.7 version

0.8.1 (2019-08-29)
------------------
* add python 3.7 version
* update package dependencies
2 changes: 1 addition & 1 deletion docs/installation.rst
@@ -8,7 +8,7 @@ Compatibilities
---------------

* *Operating systems:* **Linux**, **MacOS** & **Windows**.
* *Python versions:* **3.5** - **3.6**. & **64-bit version** only (32-bit python is not supported)
* *Python versions:* **3.5** - **3.7**. & **64-bit version** only (32-bit python is not supported)


Basic requirements
2 changes: 1 addition & 1 deletion examples/classification/classification.py
@@ -24,7 +24,7 @@

# Tuning
# Declare an optimiser. Scoring possibilities for classification lie in :
# {"accuracy", "roc_auc", "f1", "log_loss", "precision", "recall"}
# {"accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"}
opt = Optimiser(scoring='accuracy', n_folds=3)
opt.evaluate(None, dict)
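As an aside, a hedged usage sketch (not part of this commit): any scorer name from the updated list can be passed straight to the Optimiser. The variable `data` is an assumption, standing in for the dataset dictionary (named `dict` earlier in this example).

from mlbox.optimisation import Optimiser

# Hypothetical sketch: `data` is assumed to be the dataset dictionary produced
# by the preprocessing steps earlier in this example (named `dict` there).
opt = Optimiser(scoring='neg_log_loss', n_folds=3)
score = opt.evaluate(None, data)  # None -> evaluate the default pipeline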

2 changes: 1 addition & 1 deletion examples/regression/regression.py
@@ -34,7 +34,7 @@
needs_proba=False)
# Declare an optimiser. You can declare your own score
# as presented here or use one in
# {"mean_absolute_error", "mean_squared_error","median_absolute_error","r2"}
# {"neg_mean_absolute_error", "neg_mean_squared_error", "neg_mean_squared_log_error", "neg_median_absolute_error","r2"}
opt = Optimiser(scoring=mape, n_folds=3)
opt.evaluate(None, dict)
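For context, a hedged sketch of how a custom score like the `mape` used above can be built with scikit-learn's make_scorer; the helper name mape_error is illustrative and not taken from this diff.

import numpy as np
from sklearn.metrics import make_scorer

# Illustrative sketch: lower percentage error is better, hence
# greater_is_better=False; needs_proba=False as in the example above.
def mape_error(y_true, y_pred):
    return 100. * np.mean(np.abs((y_true - y_pred) / y_true))

mape = make_scorer(mape_error, greater_is_better=False, needs_proba=False)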

Expand Down
89 changes: 49 additions & 40 deletions mlbox/optimisation/optimiser.py
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@
from hyperopt import fmin, hp, tpe
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.metrics import SCORERS, make_scorer, roc_auc_score

from ..encoding.na_encoder import NA_encoder
from ..encoding.categorical_encoder import Categorical_encoder
@@ -39,14 +39,11 @@ class Optimiser():
scoring : str, callable or None. default: None
A string or a scorer callable object.
If None, "log_loss" is used for classification and
"mean_squared_error" for regression
If None, "neg_log_loss" is used for classification and
"neg_mean_squared_error" for regression
Available scorings for classification : {"accuracy","roc_auc", "f1",
"log_loss", "precision", "recall"}
Available scorings for regression : {"mean_absolute_error",
"mean_squared_error","median_absolute_error","r2"}
Available scorings can be found in the module sklearn.metrics:
https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules
n_folds : int, default = 2
The number of folds for cross validation (stratified for classification)
@@ -176,6 +173,10 @@ def evaluate(self, params, df):
classes_to_drop = counts[counts < self.n_folds].index
mask_to_drop = df['target'].apply(lambda x: x in classes_to_drop)
indexes_to_drop = df['target'][mask_to_drop].index
n_classes = len(counts) - len(classes_to_drop)

if n_classes == 1:
raise ValueError("Your target does not have enough classes. You can't run the optimiser")

cv = StratifiedKFold(n_splits=self.n_folds,
shuffle=True,
@@ -208,27 +209,40 @@ def evaluate(self, params, df):

# Default scoring for classification

auc = False

if (self.scoring is None):
self.scoring = 'log_loss'

elif (self.scoring == 'roc_auc'):
auc = True
self.scoring = make_scorer(lambda y_true, y_pred: roc_auc_score(pd.get_dummies(y_true), y_pred), # noqa
greater_is_better=True,
needs_proba=True)
self.scoring = 'neg_log_loss' # works also for multiclass pb

else:
if (type(self.scoring) == str):
if (self.scoring in ["accuracy", "roc_auc", "f1",
"log_loss", "precision", "recall"]):
pass
if (self.scoring not in list(SCORERS.keys())):

warnings.warn("Unknown or invalid scoring metric. "
"neg_log_loss is used instead.")

self.scoring = 'neg_log_loss'

else:
warnings.warn("Invalid scoring metric. "
"log_loss is used instead.")
self.scoring = 'log_loss'

# binary classification
if n_classes <= 2:
pass

# multiclass classification
else:
warnings.warn("This is a multiclass problem. Please make sure that your scoring metric is "
"appropriate.")

if self.scoring+"_weighted" in list(SCORERS.keys()):

warnings.warn("Weighted strategy for the scoring metric is used.")
self.scoring = self.scoring + "_weighted"

# specific scenarios
else:
if self.scoring == "roc_auc":
self.scoring = make_scorer(lambda y_true, y_pred: roc_auc_score(pd.get_dummies(y_true), y_pred), # noqa
greater_is_better=True,
needs_proba=True)
else:
pass
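The branch above validates a user-supplied scoring string against scikit-learn's SCORERS registry and falls back to neg_log_loss otherwise; a simplified stand-alone sketch of that idea follows (not MLBox's exact code path, and the function name resolve_scoring is illustrative).

import warnings
from sklearn.metrics import SCORERS

def resolve_scoring(scoring, default="neg_log_loss"):
    # Keep any scorer name registered in sklearn.metrics.SCORERS,
    # otherwise warn and fall back to a sensible default.
    if scoring is None:
        return default
    if isinstance(scoring, str) and scoring not in SCORERS:
        warnings.warn("Unknown or invalid scoring metric. "
                      "%s is used instead." % default)
        return default
    return scoring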

@@ -272,21 +286,20 @@ def evaluate(self, params, df):

# Default scoring for regression

auc = False

if (self.scoring is None):
self.scoring = "mean_squared_error"
self.scoring = "neg_mean_squared_error"

else:
if (type(self.scoring) == str):
if (self.scoring in ["mean_absolute_error",
"mean_squared_error",
"median_absolute_error",
"r2"]):
pass
if (self.scoring not in list(SCORERS.keys())):

warnings.warn("Unknown or invalid scoring metric. "
"neg_mean_squared_error is used instead.")

self.scoring = 'neg_mean_squared_error'

else:
warnings.warn("Invalid scoring metric. "
"mean_squarred_error is used instead.")
self.scoring = 'mean_squared_error'
pass
else:
pass

@@ -426,8 +439,8 @@ def evaluate(self, params, df):

if (score == -np.inf):
warnings.warn("An error occurred while computing the cross "
"validation mean score. Check the parameter values "
"and your scoring function.")
"validation mean score. Please check that the parameter values are correct "
"and that your scoring function is valid and appropriate to the task.")

##########################################
# Reporting scores
@@ -438,9 +451,6 @@ def evaluate(self, params, df):
for i, s in enumerate(scores[:-1]):
out = out + "fold " + str(i + 1) + " = " + str(s) + ", "

if (auc):
self.scoring = "roc_auc"

if (self.verbose):
print("")
print("MEAN SCORE : " + str(self.scoring) + " = " + str(score))
Expand All @@ -451,7 +461,6 @@ def evaluate(self, params, df):

return score


def optimise(self, space, df, max_evals=40):

"""Optimises the Pipeline.
8 changes: 7 additions & 1 deletion mlbox/preprocessing/reader.py
@@ -5,6 +5,7 @@
import pickle
import os
import time
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
@@ -506,8 +507,9 @@ def train_test_split(self, Lpath, target_name):
##############################################################

task = "regression"
count = y_train.nunique()

if (y_train.nunique() <= 2):
if (count <= 2):
task = "classification"

else:
Expand All @@ -532,6 +534,10 @@ def train_test_split(self, Lpath, target_name):
name=target_name,
dtype='int')

if count == 1:
warnings.warn("Your target set has only one class ! Please check it is correct, "
"otherwise there is no need to use MLBox...")

else:
if (self.verbose):
print(y_train.describe())
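The task-detection heuristic touched here (classification when the target has at most two distinct values, with a warning if it has only one) can be summarised in a small stand-alone sketch; this is a simplification of the reader logic, and detect_task is an illustrative name.

import warnings

def detect_task(y_train):
    # y_train is assumed to be a pandas Series, as in the reader.
    # Count distinct target values to decide the task, and warn when the
    # target contains a single class only.
    count = y_train.nunique()
    if count == 1:
        warnings.warn("Your target set has only one class! Please check it is "
                      "correct, otherwise there is no need to use MLBox...")
    return "classification" if count <= 2 else "regression"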
19 changes: 9 additions & 10 deletions requirements.txt
@@ -1,13 +1,12 @@
numpy==1.16.3
scipy==1.2.1
matplotlib==2.2.4
hyperopt==0.1
Keras==2.1.2
pandas==0.21.0
joblib==0.11
scikit-learn==0.19.0
tensorflow==1.13.1
numpy==1.17.0
scipy==1.3.0
matplotlib==3.0.3
hyperopt==0.1.2
Keras==2.2.4
pandas==0.25.0
joblib==0.13.2
scikit-learn==0.21.3
tensorflow==1.14.0
lightgbm==2.2.3
networkx==1.11
tables==3.5.2
xlrd==1.2.0
3 changes: 2 additions & 1 deletion setup.py
@@ -64,7 +64,8 @@
'Operating System :: POSIX :: Linux',

'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6'
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7'
],
test_suite='tests',
tests_require=requirements
Binary file modified tests/.DS_Store
13 changes: 13 additions & 0 deletions tests/test_categorical_encoder.py
@@ -80,3 +80,16 @@ def test_transform_encoder():
encoder.fit(df, df["Survived"])
df_encoded = encoder.transform(df)
assert (df.columns == df_encoded.columns).all()
encoder.set_params(strategy="dummification")
encoder.fit(df, df["Survived"])
df_encoded = encoder.transform(df)
assert (type(df_encoded) == pd.SparseDataFrame) | (type(df_encoded) == pd.DataFrame)
encoder.set_params(strategy="random_projection")
encoder.fit(df, df["Survived"])
df_encoded = encoder.transform(df)
assert type(df_encoded) == pd.DataFrame
encoder.set_params(strategy="entity_embedding")
encoder.fit(df, df["Survived"])
df_encoded = encoder.transform(df)
assert type(df_encoded) == pd.DataFrame

4 changes: 4 additions & 0 deletions tests/test_classifier.py
@@ -76,6 +76,10 @@ def test_feature_importances_classifier():
classifier.feature_importances()
df_train = pd.read_csv("data_for_tests/clean_train.csv")
y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
classifier.set_params(strategy="LightGBM")
classifier.fit(df_train, y_train)
importance = classifier.feature_importances()
assert importance != {}
classifier.set_params(strategy="Linear")
classifier.fit(df_train, y_train)
importance = classifier.feature_importances()