Source code for neuromaps.stats

# -*- coding: utf-8 -*-
"""Functions for statistical analyses."""

from functools import partial

import numpy as np
from scipy import special, stats as sstats
try:
    from scipy.stats._stats_py import _chk2_asarray  # scipy >= 1.8.0
except ImportError:
    from scipy.stats.stats import _chk2_asarray  # scipy < 1.8.0
from sklearn.utils.validation import check_random_state

from neuromaps.images import load_data


def compare_images(src, trg, metric='pearsonr', ignore_zero=True, nulls=None,
                   nan_policy='omit', return_nulls=False):
    """
    Compare images `src` and `trg`.

    If `src` and `trg` represent data from multiple hemispheres the data are
    concatenated across hemispheres prior to comparison.

    Parameters
    ----------
    src, trg : tuple or str or os.PathLike or img_like or array-like
        Images (nib.Nifti1Image or nib.GiftiImage) or parcellated data to be
        compared.
    metric : {'pearsonr', 'spearmanr', callable}, optional
        Type of similarity metric to use to compare `src` and `trg` images. If
        a callable function is provided it must accept two inputs and return a
        single value (the similarity metric). Default: 'pearsonr'
    ignore_zero : bool, optional
        Whether to perform comparisons ignoring all zero values in `src` and
        `trg` data. Default: True
    nulls : array_like, optional
        Null data for `src` to use in generating a non-parametric p-value.
        If not specified a parametric p-value is generated. Default: None
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' propagates
        the nan values to the callable metric (will return nan if the metric
        is `pearsonr` or `spearmanr`), 'raise' throws an error, 'omit'
        performs the calculations ignoring nan values. Default: 'omit'
    return_nulls : bool, optional
        Whether to return the null distribution of comparisons. Can only be
        set to True if `nulls` is not None. Default: False

    Returns
    -------
    similarity : float
        Comparison metric between `src` and `trg`
    pvalue : float
        The p-value of `similarity`. Only returned if `nulls` is not None.
    nulls : (n_perm,) array_like
        Null distribution of similarity metrics. Only returned if
        `return_nulls` is True.
    """
    methods = ('pearsonr', 'spearmanr')
    if metric not in methods:
        if not callable(metric):
            raise ValueError(f'Invalid `metric`: {metric}')
        elif not np.isscalar(metric([1, 1], [1, 1])):
            raise ValueError('Provided callable `metric` must accept two '
                             'inputs and return a single value.')

    if return_nulls and nulls is None:
        raise ValueError('`return_nulls` cannot be True when `nulls` is '
                         'None.')

    srcdata, trgdata = load_data(src), load_data(trg)

    # drop NaNs (if nan_policy == 'omit') and zeros (if ignore_zero=True)
    zeromask = np.zeros(len(srcdata), dtype=bool)
    if ignore_zero:
        zeromask = np.logical_or(np.isclose(srcdata, 0),
                                 np.isclose(trgdata, 0))
    nanmask = np.logical_or(np.isnan(srcdata), np.isnan(trgdata))
    if nan_policy == 'raise':
        if np.any(nanmask):
            raise ValueError('Inputs contain nan')
        # no nans present, so only the zero mask applies
        mask = np.logical_not(zeromask)
    elif nan_policy == 'omit':
        mask = np.logical_and(np.logical_not(zeromask),
                              np.logical_not(nanmask))
    elif nan_policy == 'propagate':
        mask = np.logical_not(zeromask)
    srcdata, trgdata = srcdata[mask], trgdata[mask]

    if metric in methods:
        if metric == 'spearmanr':
            srcdata = sstats.rankdata(srcdata)
            trgdata = sstats.rankdata(trgdata)
        metric = partial(efficient_pearsonr, return_pval=False)

    if nulls is not None:
        n_perm = nulls.shape[-1]
        nulls = nulls[mask]
        return permtest_metric(srcdata, trgdata, metric, n_perm=n_perm,
                               nulls=nulls, nan_policy=nan_policy,
                               return_nulls=return_nulls)

    return metric(srcdata, trgdata)
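

# Illustrative usage sketch (hypothetical helper, not part of the neuromaps
# API): a minimal comparison of two simulated parcellated maps. The simulated
# arrays below stand in for real annotations; per the docstring above,
# `compare_images` also accepts NIfTI/GIFTI images or paths to them.
def _example_compare_images():
    rng = np.random.default_rng(1234)
    src = rng.standard_normal(100)               # simulated parcellated map
    trg = 0.5 * src + rng.standard_normal(100)   # correlated second map
    # with no `nulls` supplied, only the similarity value is returned
    return compare_images(src, trg, metric='pearsonr')
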

def permtest_metric(a, b, metric='pearsonr', n_perm=1000, seed=0, nulls=None,
                    nan_policy='propagate', return_nulls=False):
    """
    Generate non-parametric p-value of `a` and `b` for `metric`.

    Calculates two-tailed p-value for hypothesis of whether samples `a` and
    `b` are related using permutation tests.

    Parameters
    ----------
    a, b : (N,) array_like
        Sample observations. These arrays must have the same length
    metric : {'pearsonr', 'spearmanr', callable}, optional
        Type of similarity metric to use to compare `a` and `b`. If a callable
        function is provided it must accept two inputs and return a single
        value (the similarity metric). Default: 'pearsonr'
    n_perm : int, optional
        Number of permutations to assess. Unless `a` and `b` are very small
        this will approximate a randomization test via Monte Carlo
        simulations. Default: 1000
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation. Set to None for pseudo-randomness.
        Default: 0
    nulls : (N, P) array_like, optional
        Null array used in place of shuffled `a` array to compute null
        distribution of correlations. Array must have the same length as `a`
        and `b`. Providing this will override the value supplied to `n_perm`.
        When not specified a standard permutation is used to shuffle `a`.
        Default: None
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when inputs contain nan. 'propagate' returns
        nan, 'raise' throws an error, 'omit' performs the calculations
        ignoring nan values. Default: 'propagate'
    return_nulls : bool, optional
        Whether to return the null distribution of comparisons.
        Default: False

    Returns
    -------
    similarity : float
        Similarity metric
    pvalue : float
        Non-parametric p-value
    nulls : (n_perm,) array_like
        Null distribution of similarity metrics. Only returned if
        `return_nulls` is True.

    Notes
    -----
    The lowest p-value that can be returned by this function is equal to
    1 / (`n_perm` + 1).
    """
    def nan_wrap(a, b, nan_policy='propagate'):
        nanmask = np.logical_or(np.isnan(a), np.isnan(b))
        if nan_policy == 'raise':
            if np.any(nanmask):
                raise ValueError('Input contains nan')
        elif nan_policy == 'omit':
            a, b = a[~nanmask], b[~nanmask]
        return metric(a, b)

    a, b, _ = _chk2_asarray(a, b, 0)
    rs = check_random_state(seed)

    if len(a) != len(b):
        raise ValueError('Provided arrays do not have same length')

    if a.size == 0 or b.size == 0:
        return np.nan, np.nan

    methods = ('pearsonr', 'spearmanr')
    if metric in methods:
        if metric == 'spearmanr':
            a, b = sstats.rankdata(a), sstats.rankdata(b)
        compfunc = partial(efficient_pearsonr, return_pval=False)
    else:
        compfunc = nan_wrap

    if nulls is not None:
        n_perm = nulls.shape[-1]

    # divide by one forces coercion to float if ndim = 0
    true_sim = compfunc(a, b, nan_policy=nan_policy) / 1
    abs_true = np.abs(true_sim)

    permutations = np.ones(true_sim.shape)
    nulldist = np.zeros((n_perm, ) + true_sim.shape)
    for perm in range(n_perm):
        # permute `a` and determine whether correlations exceed original
        ap = a[rs.permutation(len(a))] if nulls is None else nulls[:, perm]
        nullcomp = compfunc(ap, b, nan_policy=nan_policy)
        permutations += np.abs(nullcomp) >= abs_true
        nulldist[perm] = nullcomp

    pvals = permutations / (n_perm + 1)  # + 1 in denom accounts for true_sim

    if return_nulls:
        return true_sim, pvals, nulldist

    return true_sim, pvals
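

# Illustrative usage sketch (hypothetical helper, not part of the neuromaps
# API): a permutation test on two simulated vectors, shuffling `a` 1000 times
# to build the null distribution of Pearson correlations.
def _example_permtest_metric():
    rng = np.random.default_rng(1234)
    a = rng.standard_normal(50)
    b = 0.5 * a + rng.standard_normal(50)
    # returns the observed correlation and its two-tailed permutation p-value
    return permtest_metric(a, b, metric='pearsonr', n_perm=1000, seed=0)
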

def efficient_pearsonr(a, b, ddof=1, nan_policy='propagate',
                       return_pval=True):
    """
    Compute correlation of matching columns in `a` and `b`.

    Parameters
    ----------
    a, b : array_like
        Sample observations. These arrays must have the same length and
        either an equivalent number of columns or be broadcastable.
    ddof : int, optional
        Degrees of freedom correction in the calculation of the standard
        deviation. Default: 1
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns
        nan, 'raise' throws an error, 'omit' performs the calculations
        ignoring nan values. Default: 'propagate'
    return_pval : bool, optional
        Whether to also return two-tailed p-values for the correlations.
        Default: True

    Returns
    -------
    corr : float or numpy.ndarray
        Pearson's correlation coefficient between matching columns of inputs
    pval : float or numpy.ndarray
        Two-tailed p-values. Only returned if `return_pval` is True

    Notes
    -----
    If either input contains nan and nan_policy is set to 'omit', both arrays
    will be masked to omit the nan entries.
    """
    a, b, axis = _chk2_asarray(a, b, 0)
    if len(a) != len(b):
        raise ValueError('Provided arrays do not have same length')

    if a.size == 0 or b.size == 0:
        return np.nan, np.nan

    if nan_policy not in ('propagate', 'raise', 'omit'):
        raise ValueError(f'Value for nan_policy "{nan_policy}" not allowed')

    a, b = a.reshape(len(a), -1), b.reshape(len(b), -1)
    if (a.shape[1] != b.shape[1]):
        a, b = np.broadcast_arrays(a, b)

    mask = np.logical_or(np.isnan(a), np.isnan(b))
    if nan_policy == 'raise' and np.any(mask):
        raise ValueError('Input contains nan')
    elif nan_policy == 'omit':
        # avoid making copies of the data, if possible
        a = np.ma.masked_array(a, mask, copy=False, fill_value=np.nan)
        b = np.ma.masked_array(b, mask, copy=False, fill_value=np.nan)

    with np.errstate(invalid='ignore'):
        corr = (sstats.zscore(a, ddof=ddof, nan_policy=nan_policy)
                * sstats.zscore(b, ddof=ddof, nan_policy=nan_policy))

    sumfunc, n_obs = np.sum, len(a)
    if nan_policy == 'omit':
        corr = corr.filled(np.nan)
        sumfunc = np.nansum
        n_obs = np.squeeze(np.sum(np.logical_not(np.isnan(corr)), axis=0))

    corr = sumfunc(corr, axis=0) / (n_obs - 1)
    # divide by one forces coercion to float if ndim = 0
    corr = np.squeeze(np.clip(corr, -1, 1)) / 1

    if return_pval:
        # taken from scipy.stats
        ab = (n_obs / 2) - 1
        prob = 2 * special.btdtr(ab, ab, 0.5 * (1 - np.abs(corr)))
        return corr, prob

    return corr
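

# Illustrative usage sketch (hypothetical helper, not part of the neuromaps
# API): column-wise Pearson correlations between two (n_obs, n_maps) arrays,
# yielding one coefficient and one two-tailed p-value per column.
def _example_efficient_pearsonr():
    rng = np.random.default_rng(1234)
    a = rng.standard_normal((200, 5))
    b = a + rng.standard_normal((200, 5))
    corr, pval = efficient_pearsonr(a, b)  # corr.shape == pval.shape == (5,)
    return corr, pval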