# Source code for nltools.analysis

"""
    NeuroLearn Analysis Tools
    =========================
    These tools provide the ability to quickly run
    machine-learning analyses on imaging data
"""

__all__ = ["Roc"]
__author__ = ["Luke Chang"]
__license__ = "MIT"

import pandas as pd
import numpy as np
from nltools.plotting import roc_plot
from scipy.stats import norm, binom_test
from sklearn.metrics import auc
from copy import deepcopy


class Roc(object):
    """Roc Class

    The Roc class is based on Tor Wager's Matlab roc_plot.m function and
    allows a user to easily run different types of receiver operating
    characteristic curves. For example, one might be interested in single
    interval or forced choice.

    Args:
        input_values: nibabel data instance
        binary_outcome: vector of training labels
        threshold_type: ['optimal_overall', 'optimal_balanced',
                        'minimum_sdt_bias']
        forced_choice: index indicating position for each unique subject
                        (default=None)
        **kwargs: Additional keyword arguments to pass to the prediction
                    algorithm

    """

    def __init__(
        self,
        input_values=None,
        binary_outcome=None,
        threshold_type="optimal_overall",
        forced_choice=None,
        **kwargs
    ):
        if len(input_values) != len(binary_outcome):
            raise ValueError(
                "Data Problem: input_values and binary_outcome "
                "are different lengths."
            )

        # binary_outcome must contain at least one positive (True) case
        if not any(binary_outcome):
            raise ValueError(
                "Data Problem: binary_outcome must contain at least one True value."
            )

        thr_type = ["optimal_overall", "optimal_balanced", "minimum_sdt_bias"]
        if threshold_type not in thr_type:
            raise ValueError(
                "threshold_type must be ['optimal_overall', "
                "'optimal_balanced', 'minimum_sdt_bias']"
            )

        self.input_values = deepcopy(input_values)
        self.binary_outcome = deepcopy(binary_outcome)
        self.threshold_type = deepcopy(threshold_type)
        self.forced_choice = deepcopy(forced_choice)

        if isinstance(self.binary_outcome, pd.DataFrame):
            self.binary_outcome = np.array(self.binary_outcome).flatten()

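    # Illustrative sketch (not part of the original source; all variable names
    # below are hypothetical): in a forced-choice design each subject
    # contributes exactly one "true" and one "false" trial, and forced_choice
    # holds the subject id for every row of the data, e.g.
    #
    #     scores  = np.array([0.8, 0.2, 0.6, 0.4])
    #     outcome = np.array([True, False, True, False])
    #     subs    = np.array([0, 0, 1, 1])  # two subjects, two trials each
    #     roc = Roc(input_values=scores, binary_outcome=outcome,
    #               forced_choice=subs)
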
    def calculate(
        self,
        input_values=None,
        binary_outcome=None,
        criterion_values=None,
        threshold_type="optimal_overall",
        forced_choice=None,
        balanced_acc=False,
    ):
        """Calculate the Receiver Operating Characteristic (ROC) for
        single-interval classification.

        Args:
            input_values: nibabel data instance
            binary_outcome: vector of training labels
            criterion_values: (optional) criterion values for calculating
                            fpr & tpr
            threshold_type: ['optimal_overall', 'optimal_balanced',
                            'minimum_sdt_bias']
            forced_choice: index indicating position for each unique subject
                            (default=None)
            balanced_acc: balanced accuracy for single-interval
                        classification (bool). THIS IS NOT COMPLETELY
                        IMPLEMENTED BECAUSE IT AFFECTS ACCURACY ESTIMATES,
                        BUT NOT P-VALUES OR THRESHOLD AT WHICH TO EVALUATE
                        SENS/SPEC

        """

        if input_values is not None:
            self.input_values = deepcopy(input_values)
        if binary_outcome is not None:
            self.binary_outcome = deepcopy(binary_outcome)

        # Create criterion values
        if criterion_values is not None:
            self.criterion_values = deepcopy(criterion_values)
        else:
            self.criterion_values = np.linspace(
                np.min(self.input_values.squeeze()),
                np.max(self.input_values.squeeze()),
                num=50 * len(self.binary_outcome),
            )

        if forced_choice is not None:
            self.forced_choice = deepcopy(forced_choice)

        if self.forced_choice is not None:
            sub_idx = np.unique(self.forced_choice)
            if len(sub_idx) != len(self.binary_outcome) / 2:
                raise ValueError(
                    "Make sure that subject ids are correct for 'forced_choice'."
                )
            if len(
                set(sub_idx).union(
                    set(np.array(self.forced_choice)[self.binary_outcome])
                )
            ) != len(sub_idx):
                raise ValueError("Issue with forced_choice subject labels.")
            if len(
                set(sub_idx).union(
                    set(np.array(self.forced_choice)[~self.binary_outcome])
                )
            ) != len(sub_idx):
                raise ValueError("Issue with forced_choice subject labels.")
            # Center each subject's pair of scores on the subject mean so that
            # a fixed threshold of zero implements the forced-choice comparison
            for sub in sub_idx:
                sub_mn = (
                    self.input_values[
                        (self.forced_choice == sub) & (self.binary_outcome)
                    ]
                    + self.input_values[
                        (self.forced_choice == sub) & (~self.binary_outcome)
                    ]
                )[0] / 2
                self.input_values[
                    (self.forced_choice == sub) & (self.binary_outcome)
                ] = (
                    self.input_values[
                        (self.forced_choice == sub) & (self.binary_outcome)
                    ][0]
                    - sub_mn
                )
                self.input_values[
                    (self.forced_choice == sub) & (~self.binary_outcome)
                ] = (
                    self.input_values[
                        (self.forced_choice == sub) & (~self.binary_outcome)
                    ][0]
                    - sub_mn
                )
            self.class_thr = 0

        # Calculate true positive and false positive rate
        self.tpr = np.zeros(self.criterion_values.shape)
        self.fpr = np.zeros(self.criterion_values.shape)
        for i, x in enumerate(self.criterion_values):
            wh = self.input_values >= x
            self.tpr[i] = np.sum(wh[self.binary_outcome]) / np.sum(self.binary_outcome)
            self.fpr[i] = np.sum(wh[~self.binary_outcome]) / np.sum(
                ~self.binary_outcome
            )
        self.n_true = np.sum(self.binary_outcome)
        self.n_false = np.sum(~self.binary_outcome)
        self.auc = auc(self.fpr, self.tpr)

        # Get criterion threshold
        if self.forced_choice is None:
            self.threshold_type = threshold_type
            if threshold_type == "optimal_balanced":
                mn = (self.tpr + self.fpr) / 2
                self.class_thr = self.criterion_values[np.argmax(mn)]
            elif threshold_type == "optimal_overall":
                n_corr_t = self.tpr * self.n_true
                n_corr_f = (1 - self.fpr) * self.n_false
                sm = n_corr_t + n_corr_f
                self.class_thr = self.criterion_values[np.argmax(sm)]
            elif threshold_type == "minimum_sdt_bias":
                # Calculate Macmillan and Creelman (2005) response bias (c_bias)
                c_bias = (
                    norm.ppf(np.maximum(0.0001, np.minimum(0.9999, self.tpr)))
                    + norm.ppf(np.maximum(0.0001, np.minimum(0.9999, self.fpr)))
                ) / float(2)
                self.class_thr = self.criterion_values[np.argmin(abs(c_bias))]

        # Calculate output
        self.false_positive = (self.input_values >= self.class_thr) & (
            ~self.binary_outcome
        )
        self.false_negative = (self.input_values < self.class_thr) & (
            self.binary_outcome
        )
        self.misclass = (self.false_negative) | (self.false_positive)
        self.true_positive = (self.binary_outcome) & (~self.misclass)
        self.true_negative = (~self.binary_outcome) & (~self.misclass)
        self.sensitivity = (
            np.sum(self.input_values[self.binary_outcome] >= self.class_thr)
            / self.n_true
        )
        self.specificity = (
            1
            - np.sum(self.input_values[~self.binary_outcome] >= self.class_thr)
            / self.n_false
        )
        self.ppv = np.sum(self.true_positive) / (
            np.sum(self.true_positive) + np.sum(self.false_positive)
        )

        if self.forced_choice is not None:
            self.true_positive = self.true_positive[self.binary_outcome]
            self.true_negative = self.true_negative[~self.binary_outcome]
            self.false_negative = self.false_negative[self.binary_outcome]
            self.false_positive = self.false_positive[~self.binary_outcome]
            self.misclass = (self.false_positive) | (self.false_negative)

        # Calculate accuracy
        if balanced_acc:
            self.accuracy = np.mean(
                [self.sensitivity, self.specificity]
            )  # See Brodersen, Ong, Stephan, Buhmann (2010)
        else:
            self.accuracy = 1 - np.mean(self.misclass)

        # Calculate p-value using a binomial test
        # (a hierarchical version of the binomial test could be added here)
        self.n = len(self.misclass)
        self.accuracy_p = binom_test(int(np.sum(~self.misclass)), self.n, p=0.5)
        # Standard error of a binomial proportion: sqrt(p * (1 - p) / n)
        self.accuracy_se = np.sqrt(
            np.mean(~self.misclass) * (1 - np.mean(~self.misclass)) / self.n
        )

    def plot(self, plot_method="gaussian", balanced_acc=False, **kwargs):
        """Create ROC Plot

        Create a specific kind of ROC curve plot, based on input values
        along a continuous distribution and a binary outcome variable
        (logical).

        Args:
            plot_method: type of plot ['gaussian','observed']
            balanced_acc: balanced accuracy for single-interval
                        classification (bool)
            **kwargs: Additional keyword arguments to pass to the prediction
                    algorithm

        Returns:
            fig

        """

        self.calculate(balanced_acc=balanced_acc)  # Calculate ROC parameters

        if plot_method == "gaussian":
            if self.forced_choice is not None:
                sub_idx = np.unique(self.forced_choice)
                diff_scores = []
                for sub in sub_idx:
                    diff_scores.append(
                        self.input_values[
                            (self.forced_choice == sub) & (self.binary_outcome)
                        ][0]
                        - self.input_values[
                            (self.forced_choice == sub) & (~self.binary_outcome)
                        ][0]
                    )
                diff_scores = np.array(diff_scores)
                mn_diff = np.mean(diff_scores)
                d = mn_diff / np.std(diff_scores)
                pooled_sd = np.std(diff_scores) / np.sqrt(2)
                d_a_model = mn_diff / pooled_sd
                expected_acc = 1 - norm.cdf(0, d, 1)
                self.sensitivity = expected_acc
                self.specificity = expected_acc
                self.ppv = self.sensitivity / (self.sensitivity + 1 - self.specificity)
                self.auc = norm.cdf(d_a_model / np.sqrt(2))
                x = np.arange(-3, 3, 0.1)
                self.tpr_smooth = 1 - norm.cdf(x, d, 1)
                self.fpr_smooth = 1 - norm.cdf(x, -d, 1)
            else:
                mn_true = np.mean(self.input_values[self.binary_outcome])
                mn_false = np.mean(self.input_values[~self.binary_outcome])
                var_true = np.var(self.input_values[self.binary_outcome])
                var_false = np.var(self.input_values[~self.binary_outcome])
                # Pooled standard deviation across the true and false classes
                pooled_sd = np.sqrt(
                    (var_true * (self.n_true - 1) + var_false * (self.n_false - 1))
                    / (self.n_true + self.n_false - 2)
                )
                d = (mn_true - mn_false) / pooled_sd
                z_true = mn_true / pooled_sd
                z_false = mn_false / pooled_sd
                x = np.arange(z_false - 3, z_true + 3, 0.1)
                self.tpr_smooth = 1 - (norm.cdf(x, z_true, 1))
                self.fpr_smooth = 1 - (norm.cdf(x, z_false, 1))
            self.aucn = auc(self.fpr_smooth, self.tpr_smooth)
            fig = roc_plot(self.fpr_smooth, self.tpr_smooth)
        elif plot_method == "observed":
            fig = roc_plot(self.fpr, self.tpr)
        else:
            raise ValueError("plot_method must be 'gaussian' or 'observed'")

        return fig
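
    # Illustrative note (not part of the original source): plot() calls
    # calculate() itself, so it can be used directly on an Roc instance.
    # With plot_method="gaussian" a normal-distribution model of the scores is
    # fit and the smoothed curve (fpr_smooth, tpr_smooth) is drawn; with
    # plot_method="observed" the empirical (fpr, tpr) pairs from calculate()
    # are drawn instead, e.g.
    #
    #     fig = roc.plot(plot_method="observed")  # 'roc' is an Roc instance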

    def summary(self):
        """Display a formatted summary of ROC analysis."""

        print("------------------------")
        print(".:ROC Analysis Summary:.")
        print("------------------------")
        print("{:20s}".format("Accuracy:") + "{:.2f}".format(self.accuracy))
        print("{:20s}".format("Accuracy SE:") + "{:.2f}".format(self.accuracy_se))
        print("{:20s}".format("Accuracy p-value:") + "{:.2f}".format(self.accuracy_p))
        print("{:20s}".format("Sensitivity:") + "{:.2f}".format(self.sensitivity))
        print("{:20s}".format("Specificity:") + "{:.2f}".format(self.specificity))
        print("{:20s}".format("AUC:") + "{:.2f}".format(self.auc))
        print("{:20s}".format("PPV:") + "{:.2f}".format(self.ppv))
        print("------------------------")
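

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the nltools source). The simulated
# scores, class sizes, and variable names below are hypothetical; they simply
# stand in for continuous pattern-expression values and boolean labels.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    np.random.seed(0)
    n_per_class = 50

    # "Signal present" scores are drawn from a distribution shifted above the
    # "signal absent" scores, so classification should be above chance.
    scores = np.concatenate(
        [
            np.random.normal(1.0, 1.0, n_per_class),
            np.random.normal(0.0, 1.0, n_per_class),
        ]
    )
    outcome = np.concatenate(
        [np.ones(n_per_class, dtype=bool), np.zeros(n_per_class, dtype=bool)]
    )

    roc = Roc(
        input_values=scores,
        binary_outcome=outcome,
        threshold_type="optimal_overall",
    )
    roc.calculate()  # sweep criterion values; fills tpr, fpr, auc, accuracy, ...
    roc.summary()    # print accuracy, SE, p-value, sensitivity, specificity, AUC, PPV
    # fig = roc.plot(plot_method="observed")  # draws the ROC via nltools.plotting.roc_plot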