In [1]:
Copied!
import scanpy as sc
import pandas as pd
import numpy as np
import omicverse as ov
import matplotlib.pyplot as plt
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, facecolor='white')
import scanpy as sc
import pandas as pd
import numpy as np
import omicverse as ov
import matplotlib.pyplot as plt
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, facecolor='white')
____ _ _ __ / __ \____ ___ (_)___| | / /__ _____________ / / / / __ `__ \/ / ___/ | / / _ \/ ___/ ___/ _ \ / /_/ / / / / / / / /__ | |/ / __/ / (__ ) __/ \____/_/ /_/ /_/_/\___/ |___/\___/_/ /____/\___/ Version: 1.6.2, Tutorials: https://omicverse.readthedocs.io/
In [2]:
Copied!
bulk_adata=sc.read('bulk_adata_ad.h5ad')
bulk_adata
bulk_adata=sc.read('bulk_adata_ad.h5ad')
bulk_adata
Out[2]:
AnnData object with n_obs × n_vars = 90 × 16504 obs: 'Unnamed: 0', 'RNAIsolation.Group', 'Dissection.Group', 'Sample.ID', 'Case.Year', 'Case.Num', 'Region', 'Neuropath.Dx.1', 'Neuropath.Dx.2', 'Age', 'Sex', 'PMI', 'APoE', 'Clinical.Syndrome', 'Tangle.Stage', 'Plaque.Stage', 'Diagnosis', 'Plaques.Tangles', 'Braak...Braak.Stage', 'Dissection.By', 'RNA.Isolation.By', 'RIN', 'Library.Group', 'Sequencing.Group', 'TOTAL_READS', 'PF_READS', 'PF_READS_ALIGNED', 'PCT_PF_READS_ALIGNED', 'PF_ALIGNED_BASES', 'PF_HQ_ALIGNED_READS', 'PF_HQ_ALIGNED_BASES', 'PF_HQ_ALIGNED_Q20_BASES', 'PF_MISMATCH_RATE', 'PF_HQ_ERROR_RATE', 'PF_INDEL_RATE', 'READS_ALIGNED_IN_PAIRS', 'STRAND_BALANCE', 'PCT_CHIMERAS', 'PCT_ADAPTER', 'UNPAIRED_READS_EXAMINED', 'UNMAPPED_READS', 'READ_PAIR_DUPLICATES', 'READ_PAIR_OPTICAL_DUPLICATES', 'PERCENT_DUPLICATION', 'ESTIMATED_LIBRARY_SIZE', 'PF_BASES', 'PF_ALIGNED_BASES.1', 'RIBOSOMAL_BASES', 'CODING_BASES', 'UTR_BASES', 'INTRONIC_BASES', 'INTERGENIC_BASES', 'CORRECT_STRAND_READS', 'INCORRECT_STRAND_READS', 'PCT_RIBOSOMAL_BASES', 'PCT_CODING_BASES', 'PCT_UTR_BASES', 'PCT_INTRONIC_BASES', 'PCT_INTERGENIC_BASES', 'PCT_MRNA_BASES', 'PCT_USABLE_BASES', 'PCT_CORRECT_STRAND_READS', 'MEDIAN_CV_COVERAGE', 'MEDIAN_5PRIME_BIAS', 'MEDIAN_3PRIME_BIAS', 'MEDIAN_5PRIME_TO_3PRIME_BIAS', 'TOTAL_CLUSTERS', 'ALIGNED_READS', 'AT_DROPOUT', 'GC_DROPOUT', 'Seq.PC1', 'Seq.PC2', 'Seq.PC3', 'Seq.PC4', 'Seq.PC5' var: 'symbol', 'gene_id'
In [ ]:
Copied!
adata.obs['name']=adata.obs.index
adata.obs['name']=adata.obs['name'].astype('category')
adata.obs['name']=adata.obs.index
adata.obs['name']=adata.obs['name'].astype('category')
In [4]:
Copied!
bulk_adata.obs['Region'].value_counts()
bulk_adata.obs['Region'].value_counts()
Out[4]:
FC 90 Name: Region, dtype: int64
In [5]:
Copied!
bulk_adata.obs['Neuropath.Dx.1'].value_counts()
bulk_adata.obs['Neuropath.Dx.1'].value_counts()
Out[5]:
Alzheimer's disease 44 Normal (Mild Braak Changes) 34 Normal - No Pathology Detected 8 Normal (Mild Vascular Changes) 4 Name: Neuropath.Dx.1, dtype: int64
In [6]:
Copied!
bulk_adata.obs['Neuropath.Dx.2'].value_counts()
bulk_adata.obs['Neuropath.Dx.2'].value_counts()
Out[6]:
Unused 77 Hippocampal Sclerosis 8 Acute Cerebral Hypoxia 1 Diffuse Leukencephalopathy 1 Meningiom- Left Hemi (frozen tissue from right hemi) 1 Meningioma - right hemi (frozen tissue from left hemi) 1 Other 1 Name: Neuropath.Dx.2, dtype: int64
In [7]:
Copied!
bulk_adata.obs['APoE'].value_counts()
bulk_adata.obs['APoE'].value_counts()
Out[7]:
e33 26 e43 26 e32 8 e42 3 e44 3 Name: APoE, dtype: int64
In [8]:
Copied!
bulk_adata.obs['Clinical.Syndrome'].value_counts()
bulk_adata.obs['Clinical.Syndrome'].value_counts()
Out[8]:
[Dem] 32 [MCI] 6 [NonADDEM] 6 [Norm] 4 [QCI] 2 Name: Clinical.Syndrome, dtype: int64
In [9]:
Copied!
bulk_adata.obs['Tangle.Stage'].value_counts()
bulk_adata.obs['Tangle.Stage'].value_counts()
Out[9]:
Stage 6 24 Stage 5 21 Stage 1 15 Stage 2 13 Stage 3 11 Stage 4 5 Name: Tangle.Stage, dtype: int64
In [10]:
Copied!
bulk_adata.obs['Plaque.Stage'].value_counts()
bulk_adata.obs['Plaque.Stage'].value_counts()
Out[10]:
Stage C 27 Stage A 26 Stage B 25 None 11 Name: Plaque.Stage, dtype: int64
In [11]:
Copied!
bulk_adata.obs['Diagnosis'].value_counts()
bulk_adata.obs['Diagnosis'].value_counts()
Out[11]:
Control 46 AD 44 Name: Diagnosis, dtype: int64
In [12]:
Copied!
bulk_adata.var_names_make_unique()
bulk_adata.var_names_make_unique()
Different expression object¶
In [13]:
Copied!
data=bulk_adata.T.to_df()
data=bulk_adata.T.to_df()
In [14]:
Copied!
dds=ov.bulk.pyDEG(data)
dds=ov.bulk.pyDEG(data)
In [15]:
Copied!
dds.drop_duplicates_index()
print('... drop_duplicates_index success')
dds.drop_duplicates_index()
print('... drop_duplicates_index success')
... drop_duplicates_index success
In [16]:
Copied!
dds.normalize()
print('... estimateSizeFactors and normalize success')
dds.normalize()
print('... estimateSizeFactors and normalize success')
... estimateSizeFactors and normalize success
In [17]:
Copied!
data.head()
data.head()
Out[17]:
Sample-1 | Sample-10 | Sample-100 | Sample-101 | Sample-11 | Sample-12 | Sample-13 | Sample-14 | Sample-16 | Sample-17 | ... | Sample-89 | Sample-90 | Sample-91 | Sample-92 | Sample-93 | Sample-94 | Sample-95 | Sample-97 | Sample-98 | Sample-99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
symbol | |||||||||||||||||||||
CELF3 | 5.411992 | 5.372127 | 5.918649 | 6.533444 | 5.701608 | 5.707154 | 6.424472 | 6.555146 | 6.126113 | 6.940997 | ... | 5.634366 | 6.583671 | 5.788903 | 6.126700 | 6.589674 | 6.446354 | 6.130510 | 4.964971 | 6.207335 | 5.715709 |
FCRL5 | 7.547789 | 7.615529 | 7.766842 | 7.622182 | 7.623010 | 7.460036 | 7.553843 | 7.342730 | 7.310592 | 7.635867 | ... | 7.390826 | 8.098926 | 7.313549 | 7.729042 | 8.356948 | 7.326740 | 7.627201 | 7.177275 | 7.831815 | 7.543531 |
SETP11 | 5.889153 | 5.115322 | 4.894407 | 5.754578 | 5.770548 | 5.400205 | 5.892261 | 5.409041 | 5.540659 | 5.976911 | ... | 4.897572 | 5.088561 | 5.054412 | 5.981792 | 5.487043 | 5.619745 | 5.443136 | 3.290396 | 6.425835 | 5.221976 |
ATP8B2 | 8.669230 | 7.934054 | 8.232009 | 8.193802 | 8.276058 | 8.466156 | 8.377337 | 8.226151 | 7.897574 | 8.919280 | ... | 7.910346 | 8.488744 | 7.982265 | 8.152369 | 8.640233 | 8.281754 | 8.195569 | 7.644456 | 8.232993 | 8.382407 |
NECTIN2 | 8.109728 | 7.792938 | 8.040775 | 7.943382 | 8.304275 | 8.599941 | 8.523495 | 7.900356 | 7.760688 | 8.762688 | ... | 8.399589 | 8.340689 | 8.203156 | 7.808631 | 7.486781 | 7.568620 | 7.827593 | 7.655904 | 8.766994 | 7.659839 |
5 rows × 90 columns
In [18]:
Copied!
treatment_groups=bulk_adata.obs.loc[bulk_adata.obs['Diagnosis']=='AD'].index
control_groups=bulk_adata.obs.loc[bulk_adata.obs['Neuropath.Dx.1']=='Normal - No Pathology Detected'].index
result=dds.deg_analysis(treatment_groups,control_groups,method='ttest')
result.head()
treatment_groups=bulk_adata.obs.loc[bulk_adata.obs['Diagnosis']=='AD'].index
control_groups=bulk_adata.obs.loc[bulk_adata.obs['Neuropath.Dx.1']=='Normal - No Pathology Detected'].index
result=dds.deg_analysis(treatment_groups,control_groups,method='ttest')
result.head()
Out[18]:
pvalue | qvalue | FoldChange | -log(pvalue) | -log(qvalue) | BaseMean | log2(BaseMean) | log2FC | abs(log2FC) | size | sig | |
---|---|---|---|---|---|---|---|---|---|---|---|
symbol | |||||||||||
CELF3 | 0.269397 | 0.853069 | 1.027478 | 0.569606 | 0.069016 | 5.961489 | 2.575673 | 0.039108 | 0.039108 | 0.102748 | normal |
FCRL5 | 0.154940 | 0.783915 | 1.012305 | 0.809838 | 0.105731 | 7.625608 | 2.930852 | 0.017644 | 0.017644 | 0.101230 | normal |
SETP11 | 0.049244 | 0.647936 | 1.069283 | 1.307650 | 0.188468 | 5.408247 | 2.435161 | 0.096643 | 0.096643 | 0.106928 | normal |
ATP8B2 | 0.748306 | 0.977829 | 1.003292 | 0.125921 | 0.009737 | 8.361662 | 3.063790 | 0.004742 | 0.004742 | 0.100329 | normal |
NECTIN2 | 0.300599 | 0.866719 | 1.018567 | 0.522013 | 0.062122 | 8.251357 | 3.044631 | 0.026541 | 0.026541 | 0.101857 | normal |
In [19]:
Copied!
result.qvalue=result.pvalue
result['-log(qvalue)']=result['-log(pvalue)']
result.qvalue=result.pvalue
result['-log(qvalue)']=result['-log(pvalue)']
In [20]:
Copied!
print(result.shape)
result=result.loc[result['log2(BaseMean)']>1]
print(result.shape)
print(result.shape)
result=result.loc[result['log2(BaseMean)']>1]
print(result.shape)
(16504, 11) (16392, 11)
In [21]:
Copied!
dds.result=result.loc[result['abs(log2FC)']<1]
dds.result=result.loc[result['abs(log2FC)']<1]
In [22]:
Copied!
dds.result['log2FC']=dds.result['log2FC'].fillna(0)
dds.result['log2FC']=dds.result['log2FC'].fillna(0)
In [23]:
Copied!
# -1 means automatically calculates
dds.foldchange_set(fc_threshold=0.15,
pval_threshold=0.05,
logp_max=6)
# -1 means automatically calculates
dds.foldchange_set(fc_threshold=0.15,
pval_threshold=0.05,
logp_max=6)
... Fold change threshold: 0.15
In [24]:
Copied!
dds.result.to_csv('deg_res.csv')
dds.result.to_csv('deg_res.csv')
In [26]:
Copied!
dds.plot_volcano(title='DEG Analysis',figsize=(4,4),
plot_genes=result.loc[result['pvalue']<0.05].sort_values('abs(log2FC)',ascending=False).index[:10],
#plot_genes_num=20,
plot_genes_fontsize=12,)
plt.savefig("figures/deg_vol.png",dpi=300,bbox_inches = 'tight')
plt.savefig("pdf/deg_vol.pdf",dpi=300,bbox_inches = 'tight')
#plt.xlim(-1,1)
dds.plot_volcano(title='DEG Analysis',figsize=(4,4),
plot_genes=result.loc[result['pvalue']<0.05].sort_values('abs(log2FC)',ascending=False).index[:10],
#plot_genes_num=20,
plot_genes_fontsize=12,)
plt.savefig("figures/deg_vol.png",dpi=300,bbox_inches = 'tight')
plt.savefig("pdf/deg_vol.pdf",dpi=300,bbox_inches = 'tight')
#plt.xlim(-1,1)
In [27]:
Copied!
result.loc[result['pvalue']<0.05].sort_values('abs(log2FC)',ascending=False).index[:10]
result.loc[result['pvalue']<0.05].sort_values('abs(log2FC)',ascending=False).index[:10]
Out[27]:
Index(['S100A11', 'TICAM2', 'GJA9', 'SMIM11B', 'CLIC6', 'STEAP1', 'NPIPB15', 'GJA5', 'PDIA5', 'SNORA36A'], dtype='object', name='symbol')
In [28]:
Copied!
data.loc['CLEC5A']
data.loc['CLEC5A']
Out[28]:
Sample-1 1.299839 Sample-10 0.011549 Sample-100 -0.257921 Sample-101 0.654572 Sample-11 1.262795 ... Sample-94 -1.603970 Sample-95 -1.081690 Sample-97 1.104223 Sample-98 1.090415 Sample-99 0.047309 Name: CLEC5A, Length: 90, dtype: float32
In [29]:
Copied!
from statsmodels.stats.multitest import fdrcorrection
qvalue=fdrcorrection(np.nan_to_num(np.array(result.pvalue),0), alpha=0.1, method='indep', is_sorted=False)
from statsmodels.stats.multitest import fdrcorrection
qvalue=fdrcorrection(np.nan_to_num(np.array(result.pvalue),0), alpha=0.1, method='indep', is_sorted=False)
In [31]:
Copied!
dds.plot_boxplot(genes=result.loc[result['pvalue']<0.05].sort_values('abs(log2FC)',ascending=False).index[:10],
treatment_groups=treatment_groups,
control_groups=control_groups,figsize=(6,4),fontsize=12,
legend_bbox=(1,0.55))
#labels=ax.ax_row_colors.xaxis.get_ticklabels()
plt.xticks(rotation=45, horizontalalignment='right',fontsize=12)
plt.savefig("figures/deg_exp1.png",dpi=300,bbox_inches = 'tight')
plt.savefig("pdf/deg_exp1.pdf",dpi=300,bbox_inches = 'tight')
dds.plot_boxplot(genes=result.loc[result['pvalue']<0.05].sort_values('abs(log2FC)',ascending=False).index[:10],
treatment_groups=treatment_groups,
control_groups=control_groups,figsize=(6,4),fontsize=12,
legend_bbox=(1,0.55))
#labels=ax.ax_row_colors.xaxis.get_ticklabels()
plt.xticks(rotation=45, horizontalalignment='right',fontsize=12)
plt.savefig("figures/deg_exp1.png",dpi=300,bbox_inches = 'tight')
plt.savefig("pdf/deg_exp1.pdf",dpi=300,bbox_inches = 'tight')
In [33]:
Copied!
from omicverse.utils import plot_boxplot
genes=result.loc[result['pvalue']<0.05].sort_values('abs(log2FC)',ascending=False).index[:10]
treatment_groups=treatment_groups
control_groups=control_groups
figsize=(6,4)
fontsize=12
legend_bbox=(1,0.55)
treatment_name='Alzheimer'
control_name='Control'
p_data=pd.DataFrame(columns=['Value','Gene','Type'])
for gene in genes:
plot_data1=pd.DataFrame()
plot_data1['Value']=dds.data[treatment_groups].loc[gene].values
plot_data1['Gene']=gene
plot_data1['Type']=treatment_name
plot_data2=pd.DataFrame()
plot_data2['Value']=dds.data[control_groups].loc[gene].values
plot_data2['Gene']=gene
plot_data2['Type']=control_name
plot_data=pd.concat([plot_data1,plot_data2],axis=0)
p_data=pd.concat([p_data,plot_data],axis=0)
fig,ax=plot_boxplot(p_data,hue='Type',x_value='Gene',y_value='Value',palette=["#a64d79","#674ea7"],
figsize=figsize,fontsize=fontsize,title='Gene Expression',
legend_bbox=legend_bbox,legend_ncol=1,)
#labels=ax.ax_row_colors.xaxis.get_ticklabels()
plt.xticks(rotation=45, horizontalalignment='right',fontsize=12)
plt.savefig("figures/deg_exp2.png",dpi=300,bbox_inches = 'tight')
plt.savefig("pdf/deg_exp2.pdf",dpi=300,bbox_inches = 'tight')
from omicverse.utils import plot_boxplot
genes=result.loc[result['pvalue']<0.05].sort_values('abs(log2FC)',ascending=False).index[:10]
treatment_groups=treatment_groups
control_groups=control_groups
figsize=(6,4)
fontsize=12
legend_bbox=(1,0.55)
treatment_name='Alzheimer'
control_name='Control'
p_data=pd.DataFrame(columns=['Value','Gene','Type'])
for gene in genes:
plot_data1=pd.DataFrame()
plot_data1['Value']=dds.data[treatment_groups].loc[gene].values
plot_data1['Gene']=gene
plot_data1['Type']=treatment_name
plot_data2=pd.DataFrame()
plot_data2['Value']=dds.data[control_groups].loc[gene].values
plot_data2['Gene']=gene
plot_data2['Type']=control_name
plot_data=pd.concat([plot_data1,plot_data2],axis=0)
p_data=pd.concat([p_data,plot_data],axis=0)
fig,ax=plot_boxplot(p_data,hue='Type',x_value='Gene',y_value='Value',palette=["#a64d79","#674ea7"],
figsize=figsize,fontsize=fontsize,title='Gene Expression',
legend_bbox=legend_bbox,legend_ncol=1,)
#labels=ax.ax_row_colors.xaxis.get_ticklabels()
plt.xticks(rotation=45, horizontalalignment='right',fontsize=12)
plt.savefig("figures/deg_exp2.png",dpi=300,bbox_inches = 'tight')
plt.savefig("pdf/deg_exp2.pdf",dpi=300,bbox_inches = 'tight')
Geneset Enrichment¶
In [40]:
Copied!
pathway_dict=ov.utils.geneset_prepare('../baby_analysis/genesets/WikiPathway_2021_Human.txt',organism='Human')
pathway_dict=ov.utils.geneset_prepare('../baby_analysis/genesets/WikiPathway_2021_Human.txt',organism='Human')
In [41]:
Copied!
import omicverse as ov
deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()
enr=ov.bulk.geneset_enrichment(gene_list=deg_genes,
pathways_dict=pathway_dict,
pvalue_type='auto',
background=result.index.tolist(),
organism='Human')
import omicverse as ov
deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()
enr=ov.bulk.geneset_enrichment(gene_list=deg_genes,
pathways_dict=pathway_dict,
pvalue_type='auto',
background=result.index.tolist(),
organism='Human')
In [ ]:
Copied!
#Pathway Enrichment
deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()
#pathway_dict=ov.utils.geneset_prepare('WikiPathway_2021_Human.txt',
# organism='Human')
enr=ov.bulk.geneset_enrichment(gene_list=deg_genes,
pathways_dict=pathway_dict,
pvalue_type='auto',
background=result.index.tolist(),
organism='Human')
#Pathway Enrichment
deg_genes=dds.result.loc[dds.result['sig']!='normal'].index.tolist()
#pathway_dict=ov.utils.geneset_prepare('WikiPathway_2021_Human.txt',
# organism='Human')
enr=ov.bulk.geneset_enrichment(gene_list=deg_genes,
pathways_dict=pathway_dict,
pvalue_type='auto',
background=result.index.tolist(),
organism='Human')
In [222]:
Copied!
ov.utils.pyomic_plot_set()
ov.utils.pyomic_plot_set()
In [224]:
Copied!
import gseapy as gp
help(gp.prerank)
import gseapy as gp
help(gp.prerank)
Help on function prerank in module gseapy.gsea: prerank(rnk, gene_sets, outdir='GSEA_Prerank', pheno_pos='Pos', pheno_neg='Neg', min_size=15, max_size=500, permutation_num=1000, weighted_score_type=1, ascending=False, processes=1, figsize=(6.5, 6), format='pdf', graph_num=20, no_plot=False, seed=None, verbose=False) Run Gene Set Enrichment Analysis with pre-ranked correlation defined by user. :param rnk: pre-ranked correlation table or pandas DataFrame. Same input with ``GSEA`` .rnk file. :param gene_sets: Enrichr Library name or .gmt gene sets file or dict of gene sets. Same input with GSEA. :param outdir: results output directory. :param int permutation_num: Number of permutations for significance computation. Default: 1000. :param int min_size: Minimum allowed number of genes from gene set also the data set. Default: 15. :param int max_size: Maximum allowed number of genes from gene set also the data set. Defaults: 500. :param str weighted_score_type: Refer to :func:`algorithm.enrichment_score`. Default:1. :param bool ascending: Sorting order of rankings. Default: False. :param int processes: Number of Processes you are going to use. Default: 1. :param list figsize: Matplotlib figsize, accept a tuple or list, e.g. [width,height]. Default: [6.5,6]. :param str format: Matplotlib figure format. Default: 'pdf'. :param int graph_num: Plot graphs for top sets of each phenotype. :param bool no_plot: If equals to True, no figure will be drawn. Default: False. :param seed: Random seed. expect an integer. Default:None. :param bool verbose: Bool, increase output verbosity, print out progress of your job, Default: False. :return: Return a Prerank obj. All results store to a dictionary, obj.results, where contains:: | {es: enrichment score, | nes: normalized enrichment score, | p: P-value, | fdr: FDR, | size: gene set size, | matched_size: genes matched to the data, | genes: gene names from the data set | ledge_genes: leading edge genes}
In [43]:
Copied!
dds.result['fcsign']=np.sign(dds.result['log2FC'])
dds.result['logp']=-np.log10(dds.result['pvalue'])
dds.result['metric']=dds.result['logp']/dds.result['fcsign']
dds.result['fcsign']=np.sign(dds.result['log2FC'])
dds.result['logp']=-np.log10(dds.result['pvalue'])
dds.result['metric']=dds.result['logp']/dds.result['fcsign']
In [45]:
Copied!
rnk=pd.DataFrame()
rnk['gene_name']=dds.result.index
rnk['rnk']=dds.result['metric'].values
rnk=rnk.sort_values(by=['rnk'],ascending=False)
k=1
total=0
for i in range(len(rnk)):
if rnk.loc[i,'rnk']==np.inf:
total+=1
#200跟274根据你的数据进行更改,保证inf比你数据最大的大,-inf比数据最小的小就好
for i in range(len(rnk)):
if rnk.loc[i,'rnk']==np.inf:
rnk.loc[i,'rnk']=200+(total-k)
k+=1
elif rnk.loc[i,'rnk']==-np.inf:
rnk.loc[i,'rnk']=-(274+k)
k+=1
#rnk=rnk.replace(np.inf,300)
#rnk=rnk.replace(-np.inf,-300)
rnk.head()
rnk=pd.DataFrame()
rnk['gene_name']=dds.result.index
rnk['rnk']=dds.result['metric'].values
rnk=rnk.sort_values(by=['rnk'],ascending=False)
k=1
total=0
for i in range(len(rnk)):
if rnk.loc[i,'rnk']==np.inf:
total+=1
#200跟274根据你的数据进行更改,保证inf比你数据最大的大,-inf比数据最小的小就好
for i in range(len(rnk)):
if rnk.loc[i,'rnk']==np.inf:
rnk.loc[i,'rnk']=200+(total-k)
k+=1
elif rnk.loc[i,'rnk']==-np.inf:
rnk.loc[i,'rnk']=-(274+k)
k+=1
#rnk=rnk.replace(np.inf,300)
#rnk=rnk.replace(-np.inf,-300)
rnk.head()
Out[45]:
gene_name | rnk | |
---|---|---|
12671 | GK3P | 4.005219 |
8071 | GATAD2A | 3.509306 |
3839 | TTC26 | 3.497201 |
10416 | BBS1 | 3.431648 |
13884 | CYP1A2 | 3.366436 |
In [47]:
Copied!
import gseapy as gp
pre_res = gp.prerank(rnk=rnk, gene_sets=pathway_dict,
processes=8,
permutation_num=100, # reduce number to speed up testing
outdir='gsea', format='png', seed=112)
import gseapy as gp
pre_res = gp.prerank(rnk=rnk, gene_sets=pathway_dict,
processes=8,
permutation_num=100, # reduce number to speed up testing
outdir='gsea', format='png', seed=112)
In [48]:
Copied!
pre_res.res2d.head()
pre_res.res2d.head()
Out[48]:
es | nes | pval | fdr | geneset_size | matched_size | genes | ledge_genes | |
---|---|---|---|---|---|---|---|---|
Term | ||||||||
Cholesterol metabolism (includes both Bloch and Kandutsch-Russell pathways) WP4718 | -0.631080 | -2.046630 | 0.0 | 0.000000 | 46 | 42 | CYP46A1;ACOT2;EBP;MVD;GGPS1;FASN;ELOVL4;MVK;SO... | LBR;FADS1;ELOVL5;ACSL4;SC5D;MYLIP;LSS;TM7SF2;F... |
Hematopoietic Stem Cell Differentiation WP2849 | 0.649242 | 2.076468 | 0.0 | 0.000000 | 55 | 39 | SPI1;RUNX1;KCNH2;ITGB3;RHOH;FLI1;NCKAP1L;LYL1;... | SPI1;RUNX1;KCNH2;ITGB3;RHOH;FLI1;NCKAP1L;LYL1;... |
Electron Transport Chain (OXPHOS system in mitochondria) WP111 | -0.539331 | -1.967466 | 0.0 | 0.004447 | 103 | 87 | SLC25A6;NDUFA3;UQCR10;SCO1;ATP5MF;UQCRH;SURF1;... | NDUFAB1;NDUFA12;COX17;NDUFA10;ATP5MC1;ATP5MC3;... |
Sphingolipid Metabolism (general overview) WP4725 | -0.682113 | -1.874293 | 0.0 | 0.010672 | 24 | 22 | SGPL1;CERS1;UGCG;CERS6;DEGS2;SGPP2;SGMS2;CERK;... | PLPP3;DEGS1;CERS2;KDSR;SGMS1;UGT8;SMPD1;SPTLC1... |
Oxidative phosphorylation WP623 | -0.563671 | -1.875864 | 0.0 | 0.013340 | 60 | 51 | NDUFA3;ATP5MF;NDUFA4;NDUFB6;NDUFV3;NDUFV2;NDUF... | NDUFAB1;NDUFA10;ATP5MC1;ATP5MC3;NDUFB5;ATP5PD;... |
In [49]:
Copied!
terms = pre_res.res2d.index
from gseapy.plot import gseaplot,GSEAPlot
# to save your figure, make sure that ofname is not None
gseaplot(rank_metric=pre_res.ranking, term=terms[3],figsize=(3,4), **pre_res.results[terms[0]])
plt.title('')
terms = pre_res.res2d.index
from gseapy.plot import gseaplot,GSEAPlot
# to save your figure, make sure that ofname is not None
gseaplot(rank_metric=pre_res.ranking, term=terms[3],figsize=(3,4), **pre_res.results[terms[0]])
plt.title('')
In [50]:
Copied!
fig=GSEAPlot(rank_metric=pre_res.ranking, term=terms[3],figsize=(3,4), **pre_res.results[terms[0]])
fig=GSEAPlot(rank_metric=pre_res.ranking, term=terms[3],figsize=(3,4), **pre_res.results[terms[0]])
<Figure size 240x320 with 0 Axes>
In [55]:
Copied!
terms[2]
terms[2]
Out[55]:
'Electron Transport Chain (OXPHOS system in mitochondria) WP111'
In [56]:
Copied!
g = GSEAPlot(
rank_metric=pre_res.ranking, term=terms[1],figsize=(3,4),cmap='RdBu_r',
**pre_res.results[terms[2]]
)
g.fig.suptitle('Electron Transport Chain',fontsize=12,y=0.95)
g.add_axes()
g.fig.savefig("figures/gseapy_1.png",dpi=300,bbox_inches = 'tight')
g.fig.savefig("pdf/gseapy_1.pdf",dpi=300,bbox_inches = 'tight')
g = GSEAPlot(
rank_metric=pre_res.ranking, term=terms[1],figsize=(3,4),cmap='RdBu_r',
**pre_res.results[terms[2]]
)
g.fig.suptitle('Electron Transport Chain',fontsize=12,y=0.95)
g.add_axes()
g.fig.savefig("figures/gseapy_1.png",dpi=300,bbox_inches = 'tight')
g.fig.savefig("pdf/gseapy_1.pdf",dpi=300,bbox_inches = 'tight')
In [59]:
Copied!
terms[3]
terms[3]
Out[59]:
'Sphingolipid Metabolism (general overview) WP4725'
In [60]:
Copied!
g = GSEAPlot(
rank_metric=pre_res.ranking, term=terms[1],figsize=(3,4),cmap='RdBu_r',
**pre_res.results[terms[3]]
)
g.fig.suptitle('Sphingolipid Metabolism',fontsize=12,y=0.95)
g.add_axes()
g.fig.savefig("figures/gseapy_2.png",dpi=300,bbox_inches = 'tight')
g.fig.savefig("pdf/gseapy_2.pdf",dpi=300,bbox_inches = 'tight')
g = GSEAPlot(
rank_metric=pre_res.ranking, term=terms[1],figsize=(3,4),cmap='RdBu_r',
**pre_res.results[terms[3]]
)
g.fig.suptitle('Sphingolipid Metabolism',fontsize=12,y=0.95)
g.add_axes()
g.fig.savefig("figures/gseapy_2.png",dpi=300,bbox_inches = 'tight')
g.fig.savefig("pdf/gseapy_2.pdf",dpi=300,bbox_inches = 'tight')
In [254]:
Copied!
help(gseaplot)
help(gseaplot)
Help on function gseaplot in module gseapy.plot: gseaplot(rank_metric, term, hit_indices, nes, pval, fdr, RES, pheno_pos='', pheno_neg='', figsize=(6, 5.5), cmap='seismic', ofname=None, **kwargs) This is the main function for reproducing the gsea plot. :param rank_metric: pd.Series for rankings, rank_metric.values. :param term: gene_set name :param hit_indices: hits indices of rank_metric.index presented in gene set S. :param nes: Normalized enrichment scores. :param pval: nominal p-value. :param fdr: false discovery rate. :param RES: running enrichment scores. :param pheno_pos: phenotype label, positive correlated. :param pheno_neg: phenotype label, negative correlated. :param figsize: matplotlib figsize. :param ofname: output file name. If None, don't save figure
In [61]:
Copied!
enrich_res=pre_res.res2d[pre_res.res2d['fdr']<0.05]
enrich_res['logp']=-np.log(enrich_res['fdr']+0.0001)
enrich_res['logc']=-enrich_res['nes']
enrich_res['num']=enrich_res['matched_size']
enrich_res['fraction']=enrich_res['matched_size']/enrich_res['geneset_size']
enrich_res['Term']=enrich_res.index.tolist()
enrich_res=pre_res.res2d[pre_res.res2d['fdr']<0.05]
enrich_res['logp']=-np.log(enrich_res['fdr']+0.0001)
enrich_res['logc']=-enrich_res['nes']
enrich_res['num']=enrich_res['matched_size']
enrich_res['fraction']=enrich_res['matched_size']/enrich_res['geneset_size']
enrich_res['Term']=enrich_res.index.tolist()
In [62]:
Copied!
enrich_res['P-value']=enrich_res['fdr']
enrich_res['P-value']=enrich_res['fdr']
In [66]:
Copied!
ov.bulk.geneset_plot(enrich_res,figsize=(2,5),fig_title='Wiki Pathway enrichment',
cmap='Reds',text_maxsize=30,custom_ticks=[5,8])
plt.savefig("figures/gseapy_scatter.png",dpi=300,bbox_inches = 'tight')
plt.savefig("pdf/gseapy_scatter.png",dpi=300,bbox_inches = 'tight')
ov.bulk.geneset_plot(enrich_res,figsize=(2,5),fig_title='Wiki Pathway enrichment',
cmap='Reds',text_maxsize=30,custom_ticks=[5,8])
plt.savefig("figures/gseapy_scatter.png",dpi=300,bbox_inches = 'tight')
plt.savefig("pdf/gseapy_scatter.png",dpi=300,bbox_inches = 'tight')
In [ ]:
Copied!