Source code for nltools.cross_validation

"""
Cross-Validation Data Classes
=============================

Scikit-learn compatible classes for performing various
types of cross-validation

"""

__all__ = ["KFoldStratified", "set_cv"]
__author__ = ["Luke Chang"]
__license__ = "MIT"

from sklearn.model_selection._split import _BaseKFold
from sklearn.utils.validation import check_array
import numpy as np
import pandas as pd


[docs]class KFoldStratified(_BaseKFold):
    """K-Folds cross validation iterator which stratifies continuous data
    (unlike scikit-learn equivalent).

    Provides train/test indices to split data in train test sets. Split
    dataset into k consecutive folds while ensuring that same subject is
    held out within each fold.  Each fold is then used a validation set
    once while the k - 1 remaining folds form the training set.
    Extension of KFold from scikit-learn cross_validation model

    Args:
        n_splits: int, default=3
            Number of folds. Must be at least 2.
        shuffle: boolean, optional
            Whether to shuffle the data before splitting into batches.
        random_state: None, int or RandomState
            Pseudo-random number generator state used for random
            sampling. If None, use default numpy RNG for shuffling

    """

    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        super(KFoldStratified, self).__init__(
            n_splits=n_splits, shuffle=shuffle, random_state=random_state
        )

    def _make_test_folds(self, X, y=None, groups=None):
        y = pd.DataFrame(y)
        y_sort = y.sort_values(0)
        test_folds = np.nan * np.ones(len(y_sort))
        for k in range(self.n_splits):
            test_idx = y_sort.index[np.arange(k, len(y_sort), self.n_splits)]
            test_folds[y_sort.iloc[test_idx].index] = k
        return test_folds

    def _iter_test_masks(self, X, y=None, groups=None):
        test_folds = self._make_test_folds(X, y)
        for i in range(self.n_splits):
            yield test_folds == i

[docs]    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Args:
            X : array-like, shape (n_samples, n_features)
                Training data, where n_samples is the number of samples
                and n_features is the number of features.
                Note that providing ``y`` is sufficient to generate the splits
                and hence ``np.zeros(n_samples)`` may be used as a placeholder
                for ``X`` instead of actual training data.
            y : array-like, shape (n_samples,)
                The target variable for supervised learning problems.
                Stratification is done based on the y labels.
            groups : (object) Always ignored, exists for compatibility.

        Returns:
            train : (ndarray) The training set indices for that split.
            test : (ndarray) The testing set indices for that split.

        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(KFoldStratified, self).split(X, y, groups)


[docs]def set_cv(Y=None, cv_dict=None, return_generator=True):
    """Helper function to create a sci-kit learn compatible cv object using
    common parameters for prediction analyses.

    Args:
        Y:  (pd.DataFrame) Pandas Dataframe of Y labels
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
        return_generator (bool): return a cv generator instead of an instance; default True
    Returns:
        cv: a scikit-learn model-selection generator

    """

    if isinstance(cv_dict, dict):
        if cv_dict["type"] == "kfolds":
            if "subject_id" in cv_dict:  # Hold out subjects within each fold
                from sklearn.model_selection import GroupKFold

                cv_inst = GroupKFold(n_splits=cv_dict["n_folds"])
                cv = cv_inst.split(
                    X=np.zeros(len(Y)), y=Y, groups=cv_dict["subject_id"]
                )
            elif "stratified" in cv_dict:  # Stratified K-Folds Continuous
                from nltools.cross_validation import KFoldStratified

                cv_inst = KFoldStratified(n_splits=cv_dict["n_folds"])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
            else:  # Normal K-Folds
                from sklearn.model_selection import KFold

                cv_inst = KFold(n_splits=cv_dict["n_folds"])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
        elif cv_dict["type"] == "loso":  # Leave One Subject Out
            from sklearn.model_selection import LeaveOneGroupOut

            cv_inst = LeaveOneGroupOut()
            cv = cv_inst.split(X=np.zeros(len(Y)), y=Y, groups=cv_dict["subject_id"])
        else:
            raise ValueError(
                """Make sure you specify a dictionary of
                            {'type': 'kfolds', 'n_folds': n},
                            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
                            {'type': 'kfolds', 'n_folds': n,
                            'subject_id': holdout}, or {'type': 'loso',
                            'subject_id': holdout}, where n = number of folds,
                            and subject = vector of subject ids that
                            corresponds to self.Y"""
            )
    else:
        raise ValueError("Make sure 'cv_dict' is a dictionary.")
    if return_generator:
        return cv
    else:
        return cv_inst