3. scRNA-seq Example¶
Examples to use GSEApy
for scRNA-seq data
[1]:
%load_ext autoreload
%autoreload 2
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
[2]:
import gseapy as gp
import scanpy as sc
[3]:
gp.__version__
[3]:
'1.0.5'
3.1. Read Demo Data¶
Convert demo data from seurat to scanpy
## R code
library(Seurat)
library(SeuratDisk)
ifnb = SeuratData::LoadData("ifnb")
SaveH5Seurat(ifnb, "ifnb.h5seurat", overwrite = T)
Convert("ifnb.h5seurat", "ifnb.h5ad", overwrite = T)
[4]:
adata = sc.read_h5ad("data/ifnb.h5ad") # data from SeuratData::ifnb
[5]:
adata.obs.head()
[5]:
orig.ident | nCount_RNA | nFeature_RNA | stim | seurat_annotations | |
---|---|---|---|---|---|
AAACATACATTTCC.1 | IMMUNE_CTRL | 3017.0 | 877 | CTRL | CD14 Mono |
AAACATACCAGAAA.1 | IMMUNE_CTRL | 2481.0 | 713 | CTRL | CD14 Mono |
AAACATACCTCGCT.1 | IMMUNE_CTRL | 3420.0 | 850 | CTRL | CD14 Mono |
AAACATACCTGGTA.1 | IMMUNE_CTRL | 3156.0 | 1109 | CTRL | pDC |
AAACATACGATGAA.1 | IMMUNE_CTRL | 1868.0 | 634 | CTRL | CD4 Memory T |
[6]:
adata.layers['counts'] = adata.X # Save raw counts
[7]:
# preprocessing
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.layers['lognorm'] = adata.X
[8]:
adata.obs.groupby('seurat_annotations')['stim'].value_counts()
[8]:
seurat_annotations stim
B STIM 571
CTRL 407
B Activated STIM 203
CTRL 185
CD14 Mono CTRL 2215
STIM 2147
CD16 Mono STIM 537
CTRL 507
CD4 Memory T STIM 903
CTRL 859
CD4 Naive T STIM 1526
CTRL 978
CD8 T STIM 462
CTRL 352
DC CTRL 258
STIM 214
Eryth STIM 32
CTRL 23
Mk STIM 121
CTRL 115
NK STIM 321
CTRL 298
T activated STIM 333
CTRL 300
pDC STIM 81
CTRL 51
Name: stim, dtype: int64
[9]:
# set STIM as class 0, CTRL as class 1, to make categorical
adata.obs['stim'] = pd.Categorical(adata.obs['stim'], categories=["STIM", "CTRL"], ordered=True)
indices = adata.obs.sort_values(['seurat_annotations', 'stim']).index
adata = adata[indices,:]
[10]:
# # # subset and write GCT and CLS file
# outdir = "ifnb/"
# for cell in adata.obs.seurat_annotations.unique():
# bdata = adata[adata.obs.seurat_annotations == cell ]
# groups = bdata.obs['stim'].to_list()
# cls_dict = bdata.obs['stim'].to_dict()
# gs = bdata.to_df().T
# gs.index.name = "NAME"
# gs_std = gs.groupby(by=cls_dict, axis=1).std()
# gs = gs[gs_std.sum(axis=1) > 0]
# gs= gs + 1e-08 # we don't like zeros!!!
# gs.insert(0, column="Description", value=cell,)
# outname = os.path.join( outdir, cell + ".gct")
# outcls = os.path.join(outdir, cell +".cls")
# s_len = gs.shape[1] - 1
# with open(outname,"w") as correct:
# line1="#1.2\n"+f"{gs.shape[0]}\t{s_len}\n"
# correct.write(line1)
# gs.to_csv(correct, sep="\t")
# with open(outcls, "w") as cl:
# line = f"{len(groups)} 2 1\n# STIM CTRL\n"
# cl.write(line)
# cl.write(" ".join(groups) + "\n")
# print(outname)
[11]:
# subset data
bdata = adata[adata.obs.seurat_annotations == "CD14 Mono"].copy()
bdata
[11]:
AnnData object with n_obs × n_vars = 4362 × 14053
obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'stim', 'seurat_annotations'
var: 'features'
uns: 'log1p'
layers: 'counts', 'lognorm'
3.2. GSEA¶
[12]:
import time
t1 = time.time()
# NOTE: To speed up, use gp.prerank instead with your own ranked list.
res = gp.gsea(data=bdata.to_df().T, # row -> genes, column-> samples
gene_sets="GO_Biological_Process_2021",
cls=bdata.obs.stim,
permutation_num=1000,
permutation_type='phenotype',
outdir=None,
method='s2n', # signal_to_noise
threads= 16)
t2=time.time()
print(t2-t1)
60.238986015319824
[13]:
res.res2d.head(10)
[13]:
Name | Term | ES | NES | NOM p-val | FDR q-val | FWER p-val | Tag % | Gene % | Lead_genes | |
---|---|---|---|---|---|---|---|---|---|---|
0 | gsea | cytokine-mediated signaling pathway (GO:0019221) | 0.685491 | 3.759972 | 0.0 | 0.0 | 0.0 | 140/490 | 9.03% | ISG15;IFIT3;IFIT1;RSAD2;ISG20;CXCL10;IFITM3;CX... |
1 | gsea | innate immune response (GO:0045087) | 0.784391 | 3.66143 | 0.0 | 0.0 | 0.0 | 56/188 | 6.30% | ISG15;IFIT1;CXCL10;IFITM3;APOBEC3A;MX1;IFI6;OA... |
2 | gsea | regulation of immune response (GO:0050776) | 0.759354 | 3.549856 | 0.0 | 0.0 | 0.0 | 49/140 | 8.77% | RSAD2;IRF7;PLSCR1;HERC5;IL4I1;SLAMF7;IFITM1;HL... |
3 | gsea | defense response to virus (GO:0051607) | 0.903464 | 3.438759 | 0.0 | 0.0 | 0.0 | 42/108 | 2.85% | ISG15;IFIT3;IFIT1;RSAD2;ISG20;CXCL10;IFITM3;AP... |
4 | gsea | response to cytokine (GO:0034097) | 0.718931 | 3.37735 | 0.0 | 0.0 | 0.0 | 37/120 | 7.26% | ISG15;IFITM3;MX1;IFITM2;PLSCR1;MX2;BST2;EIF2AK... |
5 | gsea | defense response to symbiont (GO:0140546) | 0.904717 | 3.362051 | 0.0 | 0.0 | 0.0 | 49/100 | 4.90% | ISG15;IFIT3;IFIT1;RSAD2;ISG20;IFITM3;APOBEC3A;... |
6 | gsea | cellular response to interferon-gamma (GO:0071... | 0.792726 | 3.327923 | 0.0 | 0.0 | 0.0 | 49/99 | 7.18% | CCL8;OAS1;MT2A;OASL;IRF7;GBP1;GBP4;CCL2;OAS3;O... |
7 | gsea | regulation of interferon-beta production (GO:0... | 0.856704 | 3.259412 | 0.0 | 0.0 | 0.0 | 14/44 | 4.94% | ISG15;OAS1;IRF7;DDX58;IFIH1;OAS3;OAS2;DHX58;HS... |
8 | gsea | RNA splicing, via transesterification reaction... | -0.626583 | -3.225436 | 0.0 | 0.0 | 0.0 | 128/234 | 19.45% | YBX1;PABPC1;HNRNPA1;DDX5;SRSF9;HNRNPM;RBMX;SF3... |
9 | gsea | gene expression (GO:0010467) | -0.70455 | -3.219153 | 0.0 | 0.0 | 0.0 | 134/322 | 10.13% | RPL6;RPL7;RPL15;RPL10;RPS3A;RPS6;RPL8;RPL21;RP... |
[14]:
res.ranking.shape # raking metric
[14]:
(13216,)
[15]:
## Heatmap of gene expression
i = 7
genes = res.res2d.Lead_genes.iloc[i].split(";")
ax = gp.heatmap(df = res.heatmat.loc[genes],
z_score=None,
title=res.res2d.Term.iloc[i],
figsize=(6,5),
cmap=plt.cm.viridis,
xticklabels=False)

[16]:
term = res.res2d.Term
# gp.gseaplot(res.ranking, term=term[i], **res.results[term[i]])
axs = res.plot(terms=term[:5])

3.3. DEG Analysis¶
[17]:
# find degs
sc.tl.rank_genes_groups(bdata,
groupby='stim',
use_raw=False,
layer='lognorm',
method='wilcoxon',
groups=["STIM"],
reference='CTRL')
[18]:
bdata.X.max() # already log1p
[18]:
8.065909516515664
[19]:
sc.pl.rank_genes_groups(bdata, n_genes=25, sharey=False)

[20]:
# get deg result
result = bdata.uns['rank_genes_groups']
groups = result['names'].dtype.names
degs = pd.DataFrame(
{group + '_' + key: result[key][group]
for group in groups for key in ['names','scores', 'pvals','pvals_adj','logfoldchanges']})
[21]:
degs.head()
[21]:
STIM_names | STIM_scores | STIM_pvals | STIM_pvals_adj | STIM_logfoldchanges | |
---|---|---|---|---|---|
0 | ISG15 | 57.165920 | 0.0 | 0.0 | 8.660480 |
1 | ISG20 | 57.010372 | 0.0 | 0.0 | 6.850681 |
2 | IFITM3 | 56.890392 | 0.0 | 0.0 | 6.320490 |
3 | APOBEC3A | 56.770397 | 0.0 | 0.0 | 6.616682 |
4 | IFIT3 | 56.569122 | 0.0 | 0.0 | 8.313443 |
[22]:
degs.shape
[22]:
(14053, 5)
3.4. Over-representation analysis (Enrichr API)¶
[23]:
# subset up or down regulated genes
degs_sig = degs[degs.STIM_pvals_adj < 0.05]
degs_up = degs_sig[degs_sig.STIM_logfoldchanges > 0]
degs_dw = degs_sig[degs_sig.STIM_logfoldchanges < 0]
[24]:
degs_up.shape
[24]:
(687, 5)
[25]:
degs_dw.shape
[25]:
(1030, 5)
[26]:
# Enricr API
enr_up = gp.enrichr(degs_up.STIM_names,
gene_sets='GO_Biological_Process_2021',
outdir=None)
[27]:
# trim (go:...)
enr_up.res2d.Term = enr_up.res2d.Term.str.split(" \(GO").str[0]
[28]:
# dotplot
gp.dotplot(enr_up.res2d, figsize=(3,5), title="Up", cmap = plt.cm.autumn_r)
plt.show()

[29]:
enr_dw = gp.enrichr(degs_dw.STIM_names,
gene_sets='GO_Biological_Process_2021',
outdir=None)
[30]:
enr_dw.res2d.Term = enr_dw.res2d.Term.str.split(" \(GO").str[0]
gp.dotplot(enr_dw.res2d,
figsize=(3,5),
title="Down",
cmap = plt.cm.winter_r,
size=5)
plt.show()

[31]:
# concat results
enr_up.res2d['UP_DW'] = "UP"
enr_dw.res2d['UP_DW'] = "DOWN"
enr_res = pd.concat([enr_up.res2d.head(), enr_dw.res2d.head()])
[32]:
from gseapy.scipalette import SciPalette
sci = SciPalette()
NbDr = sci.create_colormap()
# NbDr
[33]:
# display multi-datasets
ax = gp.dotplot(enr_res,figsize=(3,5),
x='UP_DW',
x_order = ["UP","DOWN"],
title="GO_BP",
cmap = NbDr.reversed(),
size=3,
show_ring=True)
ax.set_xlabel("")
plt.show()

[34]:
ax = gp.barplot(enr_res, figsize=(3,5),
group ='UP_DW',
title ="GO_BP",
color = ['b','r'])

3.5. Network Visualization¶
[35]:
import networkx as nx
[36]:
res.res2d.head()
[36]:
Name | Term | ES | NES | NOM p-val | FDR q-val | FWER p-val | Tag % | Gene % | Lead_genes | |
---|---|---|---|---|---|---|---|---|---|---|
0 | gsea | cytokine-mediated signaling pathway (GO:0019221) | 0.685491 | 3.759972 | 0.0 | 0.0 | 0.0 | 140/490 | 9.03% | ISG15;IFIT3;IFIT1;RSAD2;ISG20;CXCL10;IFITM3;CX... |
1 | gsea | innate immune response (GO:0045087) | 0.784391 | 3.66143 | 0.0 | 0.0 | 0.0 | 56/188 | 6.30% | ISG15;IFIT1;CXCL10;IFITM3;APOBEC3A;MX1;IFI6;OA... |
2 | gsea | regulation of immune response (GO:0050776) | 0.759354 | 3.549856 | 0.0 | 0.0 | 0.0 | 49/140 | 8.77% | RSAD2;IRF7;PLSCR1;HERC5;IL4I1;SLAMF7;IFITM1;HL... |
3 | gsea | defense response to virus (GO:0051607) | 0.903464 | 3.438759 | 0.0 | 0.0 | 0.0 | 42/108 | 2.85% | ISG15;IFIT3;IFIT1;RSAD2;ISG20;CXCL10;IFITM3;AP... |
4 | gsea | response to cytokine (GO:0034097) | 0.718931 | 3.37735 | 0.0 | 0.0 | 0.0 | 37/120 | 7.26% | ISG15;IFITM3;MX1;IFITM2;PLSCR1;MX2;BST2;EIF2AK... |
[37]:
# res.res2d.to_csv("data/test.out.txt", sep="\t", index=False)
[38]:
nodes, edges = gp.enrichment_map(res.res2d)
[39]:
nodes.head()
[39]:
Name | Term | ES | NES | NOM p-val | FDR q-val | FWER p-val | Tag % | Gene % | Lead_genes | p_inv | Hits_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
node_idx | ||||||||||||
0 | gsea | gene expression (GO:0010467) | -0.70455 | -3.219153 | 0.0 | 0.000009 | 0.0 | 134/322 | 10.13% | RPL6;RPL7;RPL15;RPL10;RPS3A;RPS6;RPL8;RPL21;RP... | 5.061359 | 0.416149 |
1 | gsea | RNA splicing, via transesterification reaction... | -0.626583 | -3.225436 | 0.0 | 0.000009 | 0.0 | 128/234 | 19.45% | YBX1;PABPC1;HNRNPA1;DDX5;SRSF9;HNRNPM;RBMX;SF3... | 5.061359 | 0.547009 |
2 | gsea | regulation of interferon-beta production (GO:0... | 0.856704 | 3.259412 | 0.0 | 0.000009 | 0.0 | 14/44 | 4.94% | ISG15;OAS1;IRF7;DDX58;IFIH1;OAS3;OAS2;DHX58;HS... | 5.061359 | 0.318182 |
3 | gsea | cellular response to interferon-gamma (GO:0071... | 0.792726 | 3.327923 | 0.0 | 0.000009 | 0.0 | 49/99 | 7.18% | CCL8;OAS1;MT2A;OASL;IRF7;GBP1;GBP4;CCL2;OAS3;O... | 5.061359 | 0.494949 |
4 | gsea | defense response to symbiont (GO:0140546) | 0.904717 | 3.362051 | 0.0 | 0.000009 | 0.0 | 49/100 | 4.90% | ISG15;IFIT3;IFIT1;RSAD2;ISG20;IFITM3;APOBEC3A;... | 5.061359 | 0.490000 |
[40]:
edges.head()
[40]:
src_idx | targ_idx | src_name | targ_name | jaccard_coef | overlap_coef | overlap_genes | |
---|---|---|---|---|---|---|---|
0 | 0 | 1 | gene expression (GO:0010467) | RNA splicing, via transesterification reaction... | 0.110169 | 0.203125 | POLR2F,RBM8A,HNRNPK,POLR2E,SYNCRIP,SRSF6,SRRM1... |
1 | 0 | 8 | gene expression (GO:0010467) | cellular macromolecule biosynthetic process (G... | 0.645390 | 0.928571 | RPL26,RPL36,RPS2,RPL41,POLR2J,RPL24,RPL29,RPS2... |
2 | 1 | 8 | RNA splicing, via transesterification reaction... | cellular macromolecule biosynthetic process (G... | 0.022624 | 0.051020 | POLR2J,POLR2G,POLR2F,POLR2E,POLR2L |
3 | 2 | 3 | regulation of interferon-beta production (GO:0... | cellular response to interferon-gamma (GO:0071... | 0.105263 | 0.428571 | IRF1,OAS2,OAS1,TLR4,OAS3,IRF7 |
4 | 2 | 4 | regulation of interferon-beta production (GO:0... | defense response to symbiont (GO:0140546) | 0.188679 | 0.714286 | IRF1,TLR7,DDX58,OAS2,ISG15,LILRB1,OAS1,IFIH1,O... |
[41]:
# build graph
G = nx.from_pandas_edgelist(edges,
source='src_idx',
target='targ_idx',
edge_attr=['jaccard_coef', 'overlap_coef', 'overlap_genes'])
[42]:
fig, ax = plt.subplots(figsize=(8, 8))
# init node cooridnates
pos=nx.layout.spiral_layout(G)
#node_size = nx.get_node_attributes()
# draw node
nx.draw_networkx_nodes(G,
pos=pos,
cmap=plt.cm.RdYlBu,
node_color=list(nodes.NES),
node_size=list(nodes.Hits_ratio *1000))
# draw node label
nx.draw_networkx_labels(G,
pos=pos,
labels=nodes.Term.to_dict())
# draw edge
edge_weight = nx.get_edge_attributes(G, 'jaccard_coef').values()
nx.draw_networkx_edges(G,
pos=pos,
width=list(map(lambda x: x*10, edge_weight)),
edge_color='#CDDBD4')
plt.show()

[ ]: