Source code for gseapy.stats

# -*- coding: utf-8 -*-
import numpy as np
from scipy.stats import fisher_exact, hypergeom



[docs]
def calc_pvalues(query, gene_sets, background=20000, **kwargs):
    """calculate pvalues for all categories in the graph

    :param set query: set of identifiers for which the p value is calculated
    :param dict gene_sets: gmt file dict after background was set
    :param set background: total number of genes in your annotated database.
    :returns: pvalues
              x: overlapped gene number
              n: length of gene_set which belongs to each terms
              hits: overlapped gene names.


    For 2*2 contingency table:
    =============================================================================
                         |   in  query  |  not in query |    row total
    =>      in gene_set  |        a     |       b       |       a+b
    =>  not in gene_set  |        c     |       d       |       c+d
           column total                                 | a+b+c+d = anno database
    =============================================================================
    background genes number = a + b + c + d.

    Then, in R
        x=a     the number of white balls drawn without replacement
                from an urn which contains both black and white balls.
        m=a+b   the number of white balls in the urn
        n=c+d   the number of black balls in the urn
        k=a+c   the number of balls drawn from the urn

    In Scipy:
    for args in scipy.hypergeom.sf(k, M, n, N, loc=0):
        M: the total number of objects,
        n: the total number of Type I objects.
        k: the random variate represents the number of Type I objects in N drawn
           without replacement from the total population.

    Therefore, these two functions are the same when using parameters from 2*2 table:
    R:     >   phyper(x-1, m, n, k, lower.tail=FALSE)
    Scipy: >>> hypergeom.sf(x-1, m+n, m, k)

    For Odds ratio in Enrichr (see https://maayanlab.cloud/Enrichr/help#background&q=4)

        oddsRatio = (1.0 * x * d) / Math.max(1.0 * b * c, 1)

    where:

        x are the overlapping genes,
        b (m-x) are the genes in the annotated set - overlapping genes,
        c (k-x) are the genes in the input set - overlapping genes,
        d (bg-m-k+x) are the 20,000 genes (or total genes in the background) - genes in the annotated set - genes in the input set + overlapping genes

    """

    query = set(query)
    vals = []
    # background should be all genes in annotated database
    # such as go, kegg et.al.
    if isinstance(background, set):
        bg = len(background)  # total number in your annotated database
        # filter genes that not found in annotated database
        query = query.intersection(background)
    elif isinstance(background, int):
        bg = background
    else:
        raise ValueError("background should be set or int object")
    # number of genes in your query data
    k = len(query)
    # pval
    subsets = sorted(gene_sets.keys())
    for s in subsets:
        category = set(gene_sets.get(s))
        # the categories should be only exist in custom background too
        if isinstance(background, set):
            category = category.intersection(background)
        hits = query.intersection(category)
        x = len(hits)  # overlap hits
        if x < 1:
            continue
        m = len(category)
        # pVal = hypergeom.sf(hitCount-1,popTotal,bgHits,queryTotal)
        # p(X >= hitCounts)
        pval = hypergeom.sf(x - 1, bg, m, k)
        # oddr, pval2 = odds_ratio_calc(bg, k, m, x)
        # expect_count = k*m/bg
        # oddr= x / expect_count
        # oddr= (x*(bg-m))/(m*(k-x)) # thanks to @sreichl.
        oddr = ((x + 0.5) * (bg - m + 0.5)) / (
            (m + 0.5) * (k - x + 0.5)
        )  # Haldane-Anscombe correction, issue #132
        vals.append((s, pval, oddr, x, m, hits))

    return zip(*vals)



# def odds_ratio_calc(bg_n, gene_list_n, gene_set_n, overlap_n):
#     """
#     bg_n = number of background genes
#     gene_list_n = number of genes in the gene list (ie query genes)
#     gene_set_n = number of genes in the (corrected by background) gene set (eg pathways/GO terms)
#     overlap_n = number of genes overlapping with between the (corrected by background) gene set and the gene list
#     """

#     # make contingency table
#     table=np.array([[gene_set_n, bg_n-gene_set_n],[overlap_n, gene_list_n-overlap_n]])
#     # perform Fisher's exact test
#     oddsratio, pvalue = fisher_exact(table)
#     # return (inverse) oddsratio
#     return 1/oddsratio, pvalue


def _ecdf(x):
    nobs = len(x)
    return np.arange(1, nobs + 1) / float(nobs)



[docs]
def fdrcorrection(pvals, alpha=0.05):
    """benjamini hocheberg fdr correction. inspired by statsmodels"""
    # Implement copy from GOATools.
    pvals = np.asarray(pvals)
    pvals_sortind = np.argsort(pvals)
    pvals_sorted = np.take(pvals, pvals_sortind)

    ecdffactor = _ecdf(pvals_sorted)
    reject = pvals_sorted <= ecdffactor * alpha
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
        reject[:rejectmax] = True
    pvals_corrected_raw = pvals_sorted / ecdffactor
    pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
    del pvals_corrected_raw
    pvals_corrected[pvals_corrected > 1] = 1
    pvals_corrected_ = np.empty_like(pvals_corrected)
    pvals_corrected_[pvals_sortind] = pvals_corrected
    del pvals_corrected
    reject_ = np.empty_like(reject)
    reject_[pvals_sortind] = reject
    return reject_, pvals_corrected_




[docs]
def multiple_testing_correction(ps, alpha=0.05, method="benjamini-hochberg", **kwargs):
    """correct pvalues for multiple testing and add corrected `q` value

    :param ps: list of pvalues
    :param alpha: significance level default : 0.05
    :param method: multiple testing correction method [bonferroni|benjamini-hochberg]
    :returns (q, rej): two lists of q-values and rejected nodes
    """
    # Implement copy from GOATools.
    _p = np.array(ps)
    q = _p.copy()
    rej = _p.copy()
    mask = ~np.isnan(_p)
    p = _p[mask]
    if method == "bonferroni":
        q[mask] = p * len(p)
        rej[mask] = q[mask] < alpha
    elif method == "benjamini-hochberg":
        _rej, _q = fdrcorrection(p, alpha)
        rej[mask] = _rej
        q[mask] = _q
    else:
        raise ValueError(method)
    return q, rej