Source code for gseapy

import warnings
from importlib.metadata import PackageNotFoundError, version
from typing import Dict, Iterable, List, Optional, Tuple, Union

import pandas as pd

from .biomart import Biomart
from .enrichr import Enrichr
from .gsea import GSEA, Prerank, Replot
from .gsva import GSVA
from .msigdb import Msigdb
from .parser import get_library, get_library_name, read_gmt
from .plot import barplot, dotplot, enrichment_map, gseaplot, gseaplot2, heatmap
from .ssgsea import SingleSampleGSEA
from .utils import GOFilter

# Expose the installed distribution's version as ``gseapy.__version__``.
# When no package metadata can be found for "gseapy", fall back to the
# placeholder "unknown" instead of failing at import time.
try:
    __version__ = version("gseapy")
except PackageNotFoundError:
    __version__ = "unknown"



[docs]
def gsea(
    data: Union[pd.DataFrame, str],
    gene_sets: Union[List[str], str, Dict[str, str]],
    cls: Union[List[str], str],
    *,
    organism: str = "human",
    outdir: Optional[str] = None,
    min_size: int = 15,
    max_size: int = 500,
    permutation_num: int = 1000,
    weight: float = 1.0,
    permutation_type: str = "phenotype",
    method: str = "signal_to_noise",
    ascending: bool = False,
    threads: int = 4,
    figsize: Tuple[float, float] = (6.5, 6),
    format: str = "pdf",
    graph_num: int = 20,
    no_plot: bool = False,
    seed: int = 123,
    verbose: bool = False,
    **kwargs,
) -> GSEA:
    """Run Gene Set Enrichment Analysis and return the completed GSEA object.

    A :class:`GSEA` instance is built from the arguments below, its ``run()``
    method is called, and the instance is returned.

    :param data: Gene expression data table: a pandas DataFrame or a gct file.

    :param gene_sets: Enrichr Library name or .gmt gene sets file or dict of gene sets. Same input with GSEA.
               NOTE: If multiple gene sets are provided, the FDR null distribution will be based on the combined gene sets.
               This may lead to slight differences in FDR values compared to running GSEA separately for each gene set.
               See github issue for more details: https://github.com/zqfang/GSEApy/issues/323

    :param cls: A list or a .cls file format required for GSEA.

    :param str organism: Organism for Enrichr library names (human, mouse, yeast, fly, fish, worm);
                         does not affect custom gene sets (gmt or dict).

    :param str outdir: Results output directory. If None, nothing will be written to disk.

    :param int min_size: Minimum allowed number of genes from gene set also the data set. Default: 15.

    :param int max_size: Maximum allowed number of genes from gene set also the data set. Default: 500.

    :param int permutation_num: Number of permutations. Default: 1000.
                                Minimal possible nominal p-value is about 1/nperm.

    :param float weight: Refer to :func:`algorithm.enrichment_score`. Default: 1.

    :param str permutation_type: Type of permutation reshuffling,
                                 choose from {"phenotype": 'sample.labels' , "gene_set" : gene.labels}.

    :param method:  The method used to calculate a correlation or ranking. Default: 'signal_to_noise'.
                   Available methods are:

                   1. 'signal_to_noise'

                      You must have at least three samples for each phenotype to use this metric.
                      The larger the signal-to-noise ratio, the larger the differences of the means
                      (scaled by the standard deviations); that is, the more distinct
                      the gene expression is in each phenotype and the more the gene acts as a “class marker.”

                   2. 't_test'

                      Uses the difference of means scaled by the standard deviation and number of samples.
                      Note: You must have at least three samples for each phenotype to use this metric.
                      The larger the tTest ratio, the more distinct the gene expression is in each phenotype
                      and the more the gene acts as a “class marker.”

                   3. 'ratio_of_classes' (also referred to as fold change).

                      Uses the ratio of class means to calculate fold change for natural scale data.

                   4. 'diff_of_classes'

                      Uses the difference of class means to calculate fold change for natural scale data

                   5. 'log2_ratio_of_classes'

                      Uses the log2 ratio of class means to calculate fold change for natural scale data.
                      This is the recommended statistic for calculating fold change for log scale data.

    :param bool ascending: Sorting order of rankings. Default: False.

    :param int threads: Number of threads you are going to use. Default: 4.

    :param figsize: Matplotlib figsize, accept a tuple or list, e.g. (width, height). Default: (6.5, 6).

    :param str format: Matplotlib figure format. Default: 'pdf'.

    :param int graph_num: Plot graphs for top sets of each phenotype. Default: 20.

    :param bool no_plot: If equals to True, no figure will be drawn. Default: False.

    :param int seed: Random seed, expects an integer. Default: 123.

    :param bool verbose: Increase output verbosity, print out progress of your job. Default: False.

    :param kwargs: Only the deprecated aliases ``processes`` (use ``threads``) and
                   ``weighted_score_type`` (use ``weight``) are read; each triggers a
                   ``DeprecationWarning`` and overrides its replacement. Any other
                   extra keyword is ignored.

    :return: Return a GSEA obj. All results store to a dictionary, obj.results,
             where contains::

                 | {
                 |  term: gene set name,
                 |  es: enrichment score,
                 |  nes: normalized enrichment score,
                 |  pval:  Nominal p-value (from the null distribution of the gene set),
                 |  fdr: FDR qvalue (adjusted False Discovery Rate),
                 |  fwerp: Family wise error rate p-values,
                 |  tag %: Percent of gene set before running enrichment peak (ES),
                 |  gene %: Percent of gene list before running enrichment peak (ES),
                 |  lead_genes: leading edge genes (gene hits before running enrichment peak),
                 |  matched genes: genes matched to the data,
                 | }

    """
    # Gather the constructor arguments up front so the deprecated aliases
    # below can override their modern counterparts in a single place.
    settings = {
        "data": data,
        "gene_sets": gene_sets,
        "classes": cls,
        "organism": organism,
        "outdir": outdir,
        "min_size": min_size,
        "max_size": max_size,
        "permutation_num": permutation_num,
        "weight": weight,
        "permutation_type": permutation_type,
        "method": method,
        "ascending": ascending,
        "threads": threads,
        "figsize": figsize,
        "format": format,
        "graph_num": graph_num,
        "no_plot": no_plot,
        "seed": seed,
        "verbose": verbose,
    }

    renamed = (("processes", "threads"), ("weighted_score_type", "weight"))
    for legacy, current in renamed:
        if legacy in kwargs:
            # stacklevel=2 attributes the warning to whoever called gsea().
            warnings.warn(
                f"{legacy} is deprecated; use {current}", DeprecationWarning, 2
            )
            settings[current] = kwargs[legacy]

    analysis = GSEA(**settings)
    analysis.run()
    return analysis




[docs]
def ssgsea(
    data: Union[pd.Series, pd.DataFrame, str],
    gene_sets: Union[List[str], str, Dict[str, str]],
    *,
    organism: str = "human",
    outdir: Optional[str] = None,
    sample_norm_method: Optional[str] = "rank",
    correl_norm_type: Optional[str] = "rank",
    min_size: int = 15,
    max_size: int = 500,
    permutation_num: Optional[int] = None,
    weight: float = 0.25,
    ascending: bool = False,
    threads: int = 4,
    figsize: Tuple[float, float] = (6.5, 6),
    format: str = "pdf",
    graph_num: int = 20,
    no_plot: bool = True,
    seed: int = 123,
    verbose: bool = False,
    **kwargs,
) -> SingleSampleGSEA:
    """Run single-sample GSEA (ssGSEA) and return the completed object.

    A :class:`SingleSampleGSEA` instance is built from the arguments below,
    its ``run()`` method is called, and the instance is returned.

    :param data: Expression table, pd.Series, pd.DataFrame, GCT file, or .rnk file format.

    :param gene_sets: Enrichr Library name or .gmt gene sets file or dict of gene sets. Same input with GSEA.

    :param str organism: Organism for Enrichr library names (human, mouse, yeast, fly, fish, worm);
                         does not affect custom gene sets (gmt or dict).

    :param outdir: Results output directory. If None, nothing will be written to disk.

    :param str sample_norm_method: Sample normalization method. Choose from {'rank', 'log', 'log_rank', None}. Default: rank.
               This argument will be used for ordering genes.

               1. 'rank': Rank your expression data, and transform by 10000*rank_dat/gene_numbers
               2. 'log' : Do not rank, but transform data by log(data + exp(1)), while data = data[data<1] =1.
               3. 'log_rank': Rank your expression data, and transform by log(10000*rank_dat/gene_numbers+ exp(1))
               4. None or 'custom': Do nothing, and use your own rank value to calculate enrichment score.

    see here: https://github.com/GSEA-MSigDB/ssGSEAProjection-gpmodule/blob/master/src/ssGSEAProjection.Library.R, line 86

    :param str correl_norm_type: Correlation normalization type. Choose from {'rank', 'symrank', 'zscore', None}. Default: rank.
            After ordering genes by sample_norm_method, a further data transformation can be applied to get the enrichment score.

            when weight == 0, sample_norm_method and correl_norm_type do not matter;
            when weight > 0, the combination of sample_norm_method and correl_norm_type
            dictate how the gene expression values in input data are transformed
            to obtain the score -- use this setting with care (the transformations
            can skew scores towards +ve or -ve values)

            sample_norm_method first transforms and ranks the original data; the result is named
            correl_vector for each sample. Then correl_vector is transformed again by

            1. correl_norm_type is None or 'rank' :  do nothing, genes are weighted by actual correl_vector.
            2. correl_norm_type =='symrank': symmetric ranking.
            3. correl_norm_type =='zscore':  standardizes the correl_vector before using them to calculate scores.

    :param int min_size: Minimum allowed number of genes from gene set also the data set. Default: 15.

    :param int max_size: Maximum allowed number of genes from gene set also the data set. Default: 500.

    :param int permutation_num: Number of permutations. Default: None (earlier documentation
                                describes the ssGSEA default as 0 -- confirm how SingleSampleGSEA
                                treats None). If you want pval and fdr from the ssgsea method,
                                set this to an integer.

    :param float weight: Refer to :func:`algorithm.enrichment_score`. Default: 0.25.

    :param bool ascending: Sorting order of rankings. Default: False.

    :param int threads: Number of threads you are going to use. Default: 4.

    :param figsize: Matplotlib figsize, accept a tuple or list, e.g. (width, height). Default: (6.5, 6).

    :param str format: Matplotlib figure format. Default: 'pdf'.

    :param int graph_num: Plot graphs for top sets of each phenotype. Default: 20.

    :param bool no_plot: If equals to True, no figure will be drawn. Default: True.

    :param int seed: Random seed, expects an integer. Default: 123.

    :param bool verbose: Increase output verbosity, print out progress of your job. Default: False.

    :param kwargs: Only the deprecated aliases ``processes`` (use ``threads``) and
                   ``weighted_score_type`` (use ``weight``) are read; each triggers a
                   ``DeprecationWarning`` and overrides its replacement. Any other
                   extra keyword is ignored.

    :return: Return a ssGSEA obj.
             All results store to  a dictionary, access enrichment score or
             normalized enrichment score by obj.res2d or obj.results.
             if permutation_num > 0, additional results contain::

                 | {
                 |  term: gene set name,
                 |  es: enrichment score,
                 |  nes: normalized enrichment score,
                 |  pval:  Nominal p-value (from the null distribution of the gene set) (if permutation_num > 0),
                 |  fdr: FDR qvalue (adjusted FDR) (if permutation_num > 0),
                 |  fwerp: Family wise error rate p-values (if permutation_num > 0),
                 |  tag %: Percent of gene set before running enrichment peak (ES),
                 |  gene %: Percent of gene list before running enrichment peak (ES),
                 |  lead_genes: leading edge genes (gene hits before running enrichment peak),
                 |  matched genes: genes matched to the data,
                 | }

    """
    # Gather the constructor arguments up front so the deprecated aliases
    # below can override their modern counterparts in a single place.
    settings = {
        "data": data,
        "gene_sets": gene_sets,
        "organism": organism,
        "outdir": outdir,
        "sample_norm_method": sample_norm_method,
        "correl_norm_type": correl_norm_type,
        "min_size": min_size,
        "max_size": max_size,
        "permutation_num": permutation_num,
        "weight": weight,
        "ascending": ascending,
        "threads": threads,
        "figsize": figsize,
        "format": format,
        "graph_num": graph_num,
        "no_plot": no_plot,
        "seed": seed,
        "verbose": verbose,
    }

    renamed = (("processes", "threads"), ("weighted_score_type", "weight"))
    for legacy, current in renamed:
        if legacy in kwargs:
            # stacklevel=2 attributes the warning to whoever called ssgsea().
            warnings.warn(
                f"{legacy} is deprecated; use {current}", DeprecationWarning, 2
            )
            settings[current] = kwargs[legacy]

    single_sample = SingleSampleGSEA(**settings)
    single_sample.run()
    return single_sample




[docs]
def prerank(
    rnk: Union[pd.DataFrame, pd.Series, str],
    gene_sets: Union[List[str], str, Dict[str, str]],
    *,
    organism: str = "human",
    outdir: Optional[str] = None,
    pheno_pos: str = "Pos",
    pheno_neg: str = "Neg",
    min_size: int = 15,
    max_size: int = 500,
    permutation_num: int = 1000,
    weight: float = 1.0,
    ascending: Optional[bool] = False,
    threads: int = 4,
    figsize: Tuple[float, float] = (6.5, 6),
    format: str = "pdf",
    graph_num: int = 20,
    no_plot: bool = False,
    seed: int = 123,
    verbose: bool = False,
    method: str = "permutation",
    sample_size: int = 101,
    eps: float = 1e-50,
    **kwargs,
) -> Prerank:
    """Run GSEA on a user-supplied pre-ranked correlation and return the completed object.

    A :class:`Prerank` instance is built from the arguments below, its
    ``run()`` method is called, and the instance is returned.

    :param rnk: Pre-ranked correlation table or pandas DataFrame. Same input with ``GSEA`` .rnk file.

    :param gene_sets: Enrichr Library name or .gmt gene sets file or dict of gene sets. Same input with GSEA.
               NOTE: If multiple gene sets are provided, the FDR null distribution will be based on the combined gene sets.
               This may lead to slight differences in FDR values compared to running GSEA separately for each gene set.
               See github issue for more details: https://github.com/zqfang/GSEApy/issues/323

    :param str organism: Organism for Enrichr library names (human, mouse, yeast, fly, fish, worm);
                         does not affect custom gene sets (gmt or dict).

    :param outdir: Results output directory. If None, nothing will be written to disk.

    :param str pheno_pos: Forwarded to ``Prerank`` as ``pheno_pos``. Default: 'Pos'.

    :param str pheno_neg: Forwarded to ``Prerank`` as ``pheno_neg``. Default: 'Neg'.

    :param int min_size: Minimum allowed number of genes from gene set also the data set. Default: 15.

    :param int max_size: Maximum allowed number of genes from gene set also the data set. Default: 500.

    :param int permutation_num: Number of permutations. Default: 1000.
                                Minimal possible nominal p-value is about 1/nperm.

    :param float weight: Refer to :func:`algorithm.enrichment_score`. Default: 1.

    :param bool ascending: Sorting order of rankings. Default: False for descending. If None, do not sort the ranking.

    :param int threads: Number of threads you are going to use. Default: 4.

    :param figsize: Matplotlib figsize, accept a tuple or list, e.g. (width, height). Default: (6.5, 6).

    :param str format: Matplotlib figure format. Default: 'pdf'.

    :param int graph_num: Plot graphs for top sets of each phenotype. Default: 20.

    :param bool no_plot: If equals to True, no figure will be drawn. Default: False.

    :param int seed: Random seed, expects an integer. Default: 123.

    :param bool verbose: Increase output verbosity, print out progress of your job. Default: False.

    :param str method: P-value / significance estimation procedure. Default: 'permutation'.
                       Choose from:

                       1. 'permutation'

                          Classic gene-set permutation: the null distribution of ES is built by
                          permuting gene-set membership ``permutation_num`` times. NES, nominal
                          p-value and FDR are derived from this null. Supports both a single
                          preranked list and a multi-column ranking DataFrame.

                       2. 'multilevel'

                          fgsea multilevel procedure (a faithful port of the fgsea C++ core).
                          Estimates arbitrarily small p-values via adaptive multilevel sampling
                          instead of plain permutation, so it can resolve significance well below
                          ``1 / permutation_num``. NES is computed from fgsea's random-gene-set
                          null (``NES = ES / mean(same-sign null ES)``), which differs by design
                          from the classic permutation NES. Supports a single preranked list only
                          (a multi-column DataFrame raises ``NotImplementedError``).

    :param int sample_size: Only used when ``method='multilevel'``. Sample size for the multilevel
                            split step of the fgsea algorithm; larger values give more accurate
                            (but slower) tail p-value estimates. Default: 101.

    :param float eps: Only used when ``method='multilevel'``. Lower boundary for the estimated
                      p-value; p-values smaller than ``eps`` are reported as ``eps``. Set to 0 to
                      estimate p-values as small as machine precision allows. Default: 1e-50.

    :param kwargs: Only the deprecated aliases ``processes`` (use ``threads``) and
                   ``weighted_score_type`` (use ``weight``) are read; each triggers a
                   ``DeprecationWarning`` and overrides its replacement. Any other
                   extra keyword is ignored.

    :return: Return a Prerank obj. All results store to  a dictionary, obj.results,
             where contains::

                 | {
                 |  term: gene set name,
                 |  es: enrichment score,
                 |  nes: normalized enrichment score,
                 |  pval:  Nominal p-value (from the null distribution of the gene set),
                 |  fdr: FDR qvalue (adjusted False Discovery Rate),
                 |  fwerp: Family wise error rate p-values,
                 |  tag %: Percent of gene set before running enrichment peak (ES),
                 |  gene %: Percent of gene list before running enrichment peak (ES),
                 |  lead_genes: leading edge genes (gene hits before running enrichment peak),
                 |  matched genes: genes matched to the data,
                 | }

    """
    # Gather the constructor arguments up front so the deprecated aliases
    # below can override their modern counterparts in a single place.
    settings = {
        "rnk": rnk,
        "gene_sets": gene_sets,
        "organism": organism,
        "outdir": outdir,
        "pheno_pos": pheno_pos,
        "pheno_neg": pheno_neg,
        "min_size": min_size,
        "max_size": max_size,
        "permutation_num": permutation_num,
        "weight": weight,
        "ascending": ascending,
        "threads": threads,
        "figsize": figsize,
        "format": format,
        "graph_num": graph_num,
        "no_plot": no_plot,
        "seed": seed,
        "verbose": verbose,
        "method": method,
        "sample_size": sample_size,
        "eps": eps,
    }

    renamed = (("processes", "threads"), ("weighted_score_type", "weight"))
    for legacy, current in renamed:
        if legacy in kwargs:
            # stacklevel=2 attributes the warning to whoever called prerank().
            warnings.warn(
                f"{legacy} is deprecated; use {current}", DeprecationWarning, 2
            )
            settings[current] = kwargs[legacy]

    preranked = Prerank(**settings)
    preranked.run()
    return preranked




[docs]
def replot(
    indir,
    outdir="GSEA_Replot",
    weight=1,
    min_size=3,
    max_size=1000,
    figsize=(6.5, 6),
    format="pdf",
    verbose=False,
    *args,
    **kwargs,
):
    """Reproduce GSEA desktop output figures from an existing results directory.

    A :class:`Replot` instance is built from the arguments below and its
    ``run()`` method is called. Nothing is returned.

    :param indir: GSEA desktop results directory. Its sub folder must contain the edb file folder.

    :param outdir: Output directory. Default: 'GSEA_Replot'.

    :param float weight: Weighted score type. Choose from {0,1,1.5,2}. Default: 1.

    :param int min_size: Min size of input genes presented in Gene Sets. Default: 3.

    :param int max_size: Max size of input genes presented in Gene Sets. Default: 1000.
                     You are not encouraged to use min_size, or max_size argument in :func:`replot` function.
                     Because gmt file has already been filtered.

    :param figsize: Matplotlib output figure figsize. Default: (6.5, 6).

    :param str format: Matplotlib output figure format. Default: 'pdf'.

    :param bool verbose: Increase output verbosity, print out progress of your job. Default: False.

    :param args: Extra positional arguments are accepted but not used.

    :param kwargs: Only the deprecated alias ``weighted_score_type`` (use ``weight``)
                   is read; it triggers a ``DeprecationWarning`` and overrides
                   ``weight``. Any other extra keyword is ignored.

    :return: None. New figures are generated with the selected figure format. Default: 'pdf'.

    """
    legacy_key = "weighted_score_type"
    if legacy_key in kwargs:
        # stacklevel=2 attributes the warning to whoever called replot().
        warnings.warn(
            "weighted_score_type is deprecated; use weight", DeprecationWarning, 2
        )
        weight = kwargs[legacy_key]

    # Replot receives its arguments positionally, in exactly this order.
    Replot(indir, outdir, weight, min_size, max_size, figsize, format, verbose).run()




[docs]
def enrichr(
    gene_list: Union[str, List[str], Tuple[str, ...], pd.Series, pd.DataFrame],
    gene_sets: Union[List[str], str, Dict[str, List[str]]],
    *,
    organism: str = "human",
    outdir: Optional[str] = None,
    background: Optional[Union[List[str], int, str]] = None,
    cutoff: float = 0.05,
    format: str = "pdf",
    figsize: Tuple[float, float] = (6.5, 6),
    top_term: int = 10,
    no_plot: bool = False,
    verbose: bool = False,
) -> Enrichr:
    """Enrichr API.

    An :class:`Enrichr` instance is built from the arguments below, then its
    ``set_organism()`` and ``run()`` methods are called and the instance is returned.

    :param gene_list: str, list, tuple, pd.Series, pd.DataFrame. Also supports input txt file path with one gene id per row.
                      The gene identifiers in `gene_list` should match the type used in `gene_sets`.

    :param gene_sets: str, list of Enrichr Library name(s), or custom defined gene_sets (dict, or gmt file).

                      Examples:

                      Input Enrichr Libraries (https://maayanlab.cloud/Enrichr/#stats):
                        str: 'KEGG_2016'
                        list: ['KEGG_2016','KEGG_2013']
                        Use comma to separate each other, e.g. "KEGG_2016,huMAP,GO_Biological_Process_2018"

                      Input custom files:
                        dict: gene_sets={'A':['gene1', 'gene2',...],
                                        'B':['gene2', 'gene4',...], ...}
                        gmt: "genes.gmt"

                      see also the online docs:
                      https://gseapy.readthedocs.io/en/latest/gseapy_example.html#2.-Enrichr-Example

    :param organism: Organism for Enrichr library names (human, mouse, yeast, fly, fish, worm).
                     Does not affect gmt or dict input of `gene_sets`.

    :param outdir:   Output file directory

    :param background: Background gene set for statistical testing. Type: None | int | list | str.

                       **When is this used?**
                       - Only applies to CUSTOM gene sets (gmt file or dict)
                       - Ignored when using Enrichr library names (e.g., 'KEGG_2016')

                       **Default behavior:**
                       - If None: All genes in your gene_sets will be used as background

                       **Recommended usage (3 options):**

                       Option 1: Gene list (RECOMMENDED)
                           Provide your experiment-specific background genes:

                           background=['gene1', 'gene2', 'gene3', ...]

                           Example: All expressed genes from your RNA-seq experiment
                           Note: Gene identifiers must match those in your gene_sets

                       Option 2: Gene count (simple but less accurate)
                           Specify total number of genes tested:

                           background=20000  # total genes in your experiment

                           Warning: Assumes all genes could be detected. May affect
                           statistical accuracy if gene sets contain genes not in your
                           actual background.

                       Option 3: BioMart dataset (automatic download)
                           Use a BioMart database name for genome-wide background:

                           background='hsapiens_gene_ensembl'  # human genes
                           background='mmusculus_gene_ensembl' # mouse genes

                           The program downloads all annotated genes with Entrez IDs.
                           First download may take a few minutes; results are cached.

                           Cached location: ~/.cache/gseapy/{dataset}.background.genes.txt

                       **Why does background matter?**
                       Background genes define the "universe" for hypergeometric testing.
                       Using the correct background (e.g., detected genes in your
                       experiment) improves statistical accuracy compared to using all
                       possible genes in a genome.

    :param cutoff:   Show enriched terms which Adjusted P-value < cutoff.
                     Only affects the output figure, not the final output file. Default: 0.05

    :param format:  Output figure format supported by matplotlib,('pdf','png','eps'...). Default: 'pdf'.

    :param figsize: Matplotlib figsize, accept a tuple or list, e.g. (width,height). Default: (6.5,6).

    :param int top_term: Forwarded to ``Enrichr`` as ``top_term``; presumably the number of
                         top terms shown in the figure -- confirm in ``Enrichr``. Default: 10.

    :param bool no_plot: If equals to True, no figure will be drawn. Default: False.

    :param bool verbose: Increase output verbosity, print out progress of your job, Default: False.

    :return: An Enrichr object, which obj.res2d stores your last query, obj.results stores your all queries.

    """
    # Enrichr receives its arguments positionally, in exactly this order.
    enr = Enrichr(
        gene_list,
        gene_sets,
        organism,
        outdir,
        background,
        cutoff,
        format,
        figsize,
        top_term,
        no_plot,
        verbose,
    )
    # The organism must be set on the object before the query is run.
    enr.set_organism()
    enr.run()

    return enr




[docs]
def enrich(
    gene_list: Iterable[str],
    gene_sets: Union[List[str], str, Dict[str, List[str]]],
    *,
    background: Optional[Union[List[str], int, str]] = None,
    outdir: Optional[str] = None,
    cutoff: float = 0.05,
    format: str = "pdf",
    figsize: Tuple[float, float] = (6.5, 6),
    top_term: int = 10,
    no_plot: bool = False,
    verbose: bool = False,
) -> Enrichr:
    """Perform over-representation analysis (hypergeometric test).

    An :class:`Enrichr` instance is built from the arguments below (with the
    organism fixed to "human"), then its ``set_organism()`` and ``run()``
    methods are called and the instance is returned.

    :param gene_list: str, list, tuple, series, dataframe. Also support input txt file with one gene id per row.
                      The input `identifier` should be the same type to `gene_sets`.

    :param gene_sets: custom defined gene_sets (dict, or gmt file).
                      A string is split on "," into a list before being handed to ``Enrichr``.

                      Examples:

                        dict: gene_sets={'A':['gene1', 'gene2',...],
                                        'B':['gene2', 'gene4',...], ...}
                        gmt: "genes.gmt"

    :param background: Background gene set for statistical testing. Type: None | int | list | str.

                       **When is this used?**
                       - Only applies to CUSTOM gene sets (gmt file or dict)
                       - Ignored when using Enrichr library names (e.g., 'KEGG_2016')

                       **Default behavior:**
                       - If None: All genes in your gene_sets will be used as background

                       **Recommended usage (3 options):**

                       Option 1: Gene list (RECOMMENDED)
                           Provide your experiment-specific background genes:

                           background=['gene1', 'gene2', 'gene3', ...]

                           Example: All expressed genes from your RNA-seq experiment
                           Note: Gene identifiers must match those in your gene_sets

                       Option 2: Gene count (simple but less accurate)
                           Specify total number of genes tested:

                           background=20000  # total genes in your experiment

                           Warning: Assumes all genes could be detected. May affect
                           statistical accuracy if gene sets contain genes not in your
                           actual background.

                       Option 3: BioMart dataset (automatic download)
                           Use a BioMart database name for genome-wide background:

                           background='hsapiens_gene_ensembl'  # human genes
                           background='mmusculus_gene_ensembl' # mouse genes

                           The program downloads all annotated genes with Entrez IDs.
                           First download may take a few minutes; results are cached.

                           Cached location: ~/.cache/gseapy/{dataset}.background.genes.txt

                       **Why does background matter?**
                       Background genes define the "universe" for hypergeometric testing.
                       Using the correct background (e.g., detected genes in your
                       experiment) improves statistical accuracy compared to using all
                       possible genes in a genome.

    :param outdir:   Output file directory

    :param cutoff:   Show enriched terms which Adjusted P-value < cutoff.
                     Only affects the output figure, not the final output file. Default: 0.05

    :param format:  Output figure format supported by matplotlib,('pdf','png','eps'...). Default: 'pdf'.

    :param figsize: Matplotlib figsize, accept a tuple or list, e.g. (width,height). Default: (6.5,6).

    :param int top_term: Forwarded to ``Enrichr`` as ``top_term``; presumably the number of
                         top terms shown in the figure -- confirm in ``Enrichr``. Default: 10.

    :param bool no_plot: If equals to True, no figure will be drawn. Default: False.

    :param bool verbose: Increase output verbosity, print out progress of your job, Default: False.

    :return: An Enrichr object, which obj.res2d stores your last query, obj.results stores your all queries.

    """
    # Enrichr requires an organism argument; the original author notes that
    # its value has no effect for this entry point.
    organism = "human"
    # A single string may name several comma-separated inputs; normalise it to a list.
    _gene_sets = gene_sets.split(",") if isinstance(gene_sets, str) else gene_sets

    # Enrichr receives its arguments positionally, in exactly this order.
    enr = Enrichr(
        gene_list,
        _gene_sets,
        organism,
        outdir,
        background,
        cutoff,
        format,
        figsize,
        top_term,
        no_plot,
        verbose,
    )
    # The organism must be set on the object before the query is run.
    enr.set_organism()
    enr.run()

    return enr



def gsva(
    data: Union[pd.DataFrame, pd.Series, str],
    gene_sets: Union[List[str], str, Dict[str, str]],
    *,
    organism: str = "human",
    outdir: Optional[str] = None,
    kcdf: Optional[str] = "Gaussian",
    weight: float = 1.0,
    mx_diff: bool = True,
    abs_rnk: bool = False,
    min_size: int = 15,
    max_size: int = 1000,
    threads: int = 4,
    seed: int = 123,
    verbose: bool = False,
    **kwargs,
) -> GSVA:
    """Run Gene Set Variation Analysis (GSVA) and return the completed object.

    A :class:`GSVA` instance is built from the arguments below, its ``run()``
    method is called, and the instance is returned.

    :param data: Expression table, pd.Series, pd.DataFrame, GCT file

    :param gene_sets: Enrichr Library name or .gmt gene sets file or dict of gene sets. Same input with GSEA.

    :param str organism: Organism for Enrichr library names (human, mouse, yeast, fly, fish, worm);
                         does not affect custom gene sets (gmt or dict).

    :param outdir: Results output directory. If None, nothing will be written to disk.

    :param str kcdf: "Gaussian", "Possion" or None. Default: "Gaussian".
                     The non-parametric estimation of the cumulative distribution function.
                     NOTE(review): "Possion" is the spelling used by the existing documentation;
                     confirm the exact string ``GSVA`` accepts (possibly "Poisson").

    :param float weight: tau of gsva. Default: 1.

    :param bool mx_diff: Offers two approaches to calculate the enrichment statistic (ES) from the KS random walk statistic.
                         If True (default), ES is calculated as the difference
                         between the maximum positive and negative random walk deviations.
                         If False, ES is calculated as the maximum positive to 0.

    :param bool abs_rnk: Flag used only when mx_diff=True.
                         If False (default), the enrichment statistic (ES) is calculated taking the magnitude difference
                         between the largest positive and negative random walk deviations.
                         If True, feature sets with features enriched on either extreme (high or low)
                         will be regarded as 'highly' activated.

    see GSVA doc: https://rdrr.io/bioc/GSVA/man/gsva.html

    :param int min_size: Minimum allowed number of genes from gene set also the data set. Default: 15.

    :param int max_size: Maximum allowed number of genes from gene set also the data set. Default: 1000.

    :param int threads: Number of threads you are going to use. Default: 4.

    :param int seed: Random seed, expects an integer. Default: 123.

    :param bool verbose: Increase output verbosity, print out progress of your job. Default: False.

    :param kwargs: Extra keyword arguments are accepted but not used.

    :return: Return a GSVA obj.
             All results can be accessed by obj.results or obj.res2d.
    """
    settings = {
        "data": data,
        "gene_sets": gene_sets,
        "organism": organism,
        "outdir": outdir,
        "kcdf": kcdf,
        "weight": weight,
        "mx_diff": mx_diff,
        "abs_rnk": abs_rnk,
        "min_size": min_size,
        "max_size": max_size,
        "threads": threads,
        "seed": seed,
        "verbose": verbose,
    }
    variation = GSVA(**settings)
    variation.run()
    return variation


def gofilter(
    df: pd.DataFrame,
    min_level: int = 1,
    max_level: int = 20,
) -> pd.DataFrame:
    """Keep only the GO enrichment results whose term level lies in a given range.

    Terms whose GO level falls within ``[min_level, max_level]`` (both ends
    inclusive) are retained.  The GO level is the depth of a term in the Gene
    Ontology hierarchy, counted as the number of ``is_a`` ancestors
    (root = level 0, direct children of root = level 1, etc.), and is
    retrieved from the `QuickGO <https://www.ebi.ac.uk/QuickGO/>`_ REST API.

    The function mirrors ``gofilter`` from R's clusterProfiler
    (https://rdrr.io/bioc/clusterProfiler/man/gofilter.html).  The work is
    delegated to a freshly created :class:`GOFilter` and its ``filter`` method.

    .. note::
       Requires internet access to query the QuickGO API.
       Only has an effect when the ``Term`` column contains ``GO:XXXXXXX``
       identifiers (e.g. results from GO enrichment libraries).

    Parameters
    ----------
    df : pd.DataFrame
        Enrichment result DataFrame, e.g. ``enr.res2d`` or ``enr.results``.
    min_level : int
        Minimum GO level to keep (inclusive).  Increase this value to exclude
        very general (high-level) terms.  Default: ``1``.
    max_level : int
        Maximum GO level to keep (inclusive).  Decrease this value to exclude
        very specific (low-level) terms.  Default: ``20``.

    Returns
    -------
    pd.DataFrame
        A filtered copy of *df*.

    Examples
    --------
    >>> import gseapy
    >>> enr = gseapy.enrichr(
    ...     gene_list=my_genes,
    ...     gene_sets="GO_Biological_Process_2021",
    ...     organism="human",
    ...     outdir=None,
    ...     no_plot=True,
    ... )
    >>> filtered = gseapy.gofilter(enr.results, min_level=3, max_level=8)
    """
    level_filter = GOFilter()
    kept = level_filter.filter(df, min_level=min_level, max_level=max_level)
    return kept


# Public API of the package: the names exported by ``from gseapy import *``.
__all__ = [
    # Plotting helpers imported from .plot
    "dotplot",
    "barplot",
    "enrichment_map",
    "heatmap",
    "gseaplot",
    "gseaplot2",
    # Convenience functions defined in this module
    "replot",
    "prerank",
    "gsea",
    "gsva",
    "ssgsea",
    "enrichr",
    "enrich",
    "gofilter",
    # Classes imported from the sub-modules
    "Replot",
    "Prerank",
    "GSEA",
    "GSVA",
    "SingleSampleGSEA",
    "Enrichr",
    "GOFilter",
    "Biomart",
    "Msigdb",
    # Gene-set library helpers imported from .parser
    "get_library",
    "get_library_name",
    "read_gmt",
]