Api scgsea
omicverse.single.geneset_aucell(adata, geneset_name, geneset, AUC_threshold=0.01, seed=42)
¶
Calculate the AUC-ell score for a given gene set.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
AnnData object Annotated data matrix containing gene expression data. |
required | |
geneset_name |
str Name of the gene set. |
required | |
geneset |
list of str List of gene symbols for the gene set. |
required | |
AUC_threshold |
float, optional AUC threshold used to determine significant interactions (default is 0.01). |
0.01
|
|
seed |
int, optional Seed used to initialize the random number generator (default is 42). |
42
|
Returns:
Type | Description |
---|---|
None Adds a column to the 'obs' attribute of the adata object containing the AUC-ell score for the gene set. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_scgsea.py
def geneset_aucell(adata,geneset_name,geneset,AUC_threshold=0.01,seed=42):
"""
Calculate the AUC-ell score for a given gene set.
Arguments:
adata: AnnData object
Annotated data matrix containing gene expression data.
geneset_name: str
Name of the gene set.
geneset: list of str
List of gene symbols for the gene set.
AUC_threshold: float, optional
AUC threshold used to determine significant interactions (default is 0.01).
seed: int, optional
Seed used to initialize the random number generator (default is 42).
Returns:
None
Adds a column to the 'obs' attribute of the adata object containing the AUC-ell score for the gene set.
"""
check_ctxcore()
global ctxcore_install
if ctxcore_install==True:
global aucs
global GeneSignature
from ctxcore.recovery import aucs
from ctxcore.genesig import GeneSignature
matrix = adata.to_df()
percentiles = derive_auc_threshold(matrix)
auc_threshold = percentiles[AUC_threshold]
df_rnk=create_rankings(matrix, seed)
rnk=df_rnk.iloc[:, df_rnk.columns.isin(geneset)]
if rnk.empty or (float(len(rnk.columns)) / float(len(geneset))) < 0.80:
print(
f"Less than 80% of the genes in {geneset_name} are present in the "
"expression matrix."
)
adata.obs['{}_aucell'.format(geneset_name)]=np.zeros(shape=(df_rnk.shape[0]), dtype=np.float64)
else:
weights = np.array([1 for i in geneset])
adata.obs['{}_aucell'.format(geneset_name)]=aucs(rnk,
len(df_rnk.columns),weights,auc_threshold
)
omicverse.single.pathway_aucell(adata, pathway_names, pathways_dict, AUC_threshold=0.01, seed=42)
¶
Calculates the area under the curve (AUC) for a set of pathways in an AnnData object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
AnnData object AnnData object containing the data. |
required | |
pathway_names |
list of str Names of the pathways to analyze. |
required | |
pathways_dict |
dict Dictionary containing the gene sets for each pathway. |
required | |
AUC_threshold |
float, optional (default: 0.01) AUC threshold to use for determining significant gene-pathway associations. |
0.01
|
|
seed |
int, optional (default: 42) Random seed for reproducibility. |
42
|
Returns:
Type | Description |
---|---|
None
The function modifies the |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_scgsea.py
def pathway_aucell(adata,pathway_names,pathways_dict,AUC_threshold=0.01,seed=42):
"""
Calculates the area under the curve (AUC) for a set of pathways in an AnnData object.
Arguments:
adata: AnnData object
AnnData object containing the data.
pathway_names: list of str
Names of the pathways to analyze.
pathways_dict: dict
Dictionary containing the gene sets for each pathway.
AUC_threshold: float, optional (default: 0.01)
AUC threshold to use for determining significant gene-pathway associations.
seed: int, optional (default: 42)
Random seed for reproducibility.
Returns:
None
The function modifies the `adata.obs` attribute of the input AnnData object.
"""
check_ctxcore()
global ctxcore_install
if ctxcore_install==True:
global aucs
global GeneSignature
from ctxcore.recovery import aucs
from ctxcore.genesig import GeneSignature
matrix = adata.to_df()
percentiles = derive_auc_threshold(matrix)
auc_threshold = percentiles[AUC_threshold]
df_rnk=create_rankings(matrix, seed)
for pathway_name in pathway_names:
pathway_genes=pathways_dict[pathway_name]
rnk=df_rnk.iloc[:, df_rnk.columns.isin(pathway_genes)]
if rnk.empty or (float(len(rnk.columns)) / float(len(pathway_genes))) < 0.80:
print(
f"Less than 80% of the genes in {pathway_name} are present in the "
"expression matrix."
)
adata.obs['{}_aucell'.format(pathway_name)]=np.zeros(shape=(df_rnk.shape[0]), dtype=np.float64)
else:
weights = np.array([1 for i in pathway_genes])
adata.obs['{}_aucell'.format(pathway_name)]=aucs(rnk,
len(df_rnk.columns),weights,auc_threshold
)
omicverse.single.pathway_aucell_enrichment(adata, pathways_dict, AUC_threshold=0.01, seed=42, num_workers=1)
¶
Enriches cell annotations with pathway activity scores using the AUC-ell method.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
AnnData object AnnData object containing the expression matrix. |
required | |
pathways_dict |
dict A dictionary where keys are pathway names and values are lists of genes associated with each pathway. |
required | |
AUC_threshold |
float, optional The threshold for calculating the area under the curve (AUC) values using the AUC-ell method. The default is 0.01. |
0.01
|
|
seed |
int, optional The seed to use for the random number generator. The default is 42. |
42
|
|
num_workers |
int, optional The number of workers to use for parallel processing. The default is 1. |
1
|
Returns:
Name | Type | Description |
---|---|---|
adata_aucs | AnnData object AnnData object containing the pathway activity scores for each cell in the input AnnData object. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_scgsea.py
def pathway_aucell_enrichment(adata,pathways_dict,AUC_threshold=0.01,seed=42,num_workers=1):
"""
Enriches cell annotations with pathway activity scores using the AUC-ell method.
Arguments:
adata: AnnData object
AnnData object containing the expression matrix.
pathways_dict: dict
A dictionary where keys are pathway names and values are lists of genes associated with each pathway.
AUC_threshold: float, optional
The threshold for calculating the area under the curve (AUC) values using the AUC-ell method. The default is 0.01.
seed: int, optional
The seed to use for the random number generator. The default is 42.
num_workers: int, optional
The number of workers to use for parallel processing. The default is 1.
Returns:
adata_aucs: AnnData object
AnnData object containing the pathway activity scores for each cell in the input AnnData object.
"""
check_ctxcore()
global ctxcore_install
if ctxcore_install==True:
global aucs
global GeneSignature
from ctxcore.recovery import aucs
from ctxcore.genesig import GeneSignature
test_gmt=[]
for i in pathways_dict.keys():
test_gmt.append(GeneSignature(name=i,gene2weight=dict(zip(pathways_dict[i],[1 for i in pathways_dict[i]]))))
matrix = adata.to_df()
percentiles = derive_auc_threshold(matrix)
auc_threshold = percentiles[AUC_threshold]
aucs_mtx = aucell(matrix, signatures=test_gmt, auc_threshold=auc_threshold, num_workers=num_workers)
adata_aucs=anndata.AnnData(aucs_mtx)
return adata_aucs
omicverse.single.pathway_enrichment(adata, pathways_dict, organism='Human', group_by='louvain', cutoff=0.05, logfc_threshold=2, pvalue_type='adjust', plot=True)
¶
Perform pathway enrichment analysis on gene expression data.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData Annotated data matrix containing gene expression data. |
required | |
pathways_dict |
dict A dictionary of gene sets with their gene members. |
required | |
organism |
str, optional (default: 'Human') The organism to be used for the enrichment analysis. Can be either 'Human' or 'Mouse'. |
'Human'
|
|
group_by |
str, optional (default: 'louvain') The group label of the cells in adata.obs to perform the enrichment analysis on. |
'louvain'
|
|
cutoff |
float, optional (default: 0.05) The adjusted p-value cutoff used to filter enriched pathways. |
0.05
|
|
logfc_threshold |
float, optional (default: 2) The log2 fold change cutoff used to define differentially expressed genes. |
2
|
|
pvalue_type |
str, optional (default: 'adjust') The type of p-value used for filtering enriched pathways. Can be either 'adjust' or 'raw'. |
'adjust'
|
|
plot |
bool, optional (default: True) If True, generate a bar plot for each cluster of enriched pathways. |
True
|
Returns:
Name | Type | Description |
---|---|---|
enrich_res | pandas.DataFrame A pandas dataframe containing the enriched pathways information including term name, p-value, adjusted p-value, overlap genes, odds ratio, log2 fold change, and cluster name. |
Examples:
>>> # Perform pathway enrichment analysis on adata using pathways_dict
>>> res = pathway_enrichment(adata, pathways_dict)
Reference
The code for pathway_enrichment() function was adapted from scanpy workflows: https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html#Gene-set-enrichment-analysis.
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_scgsea.py
def pathway_enrichment(adata, pathways_dict,organism='Human',group_by='louvain',
cutoff=0.05, logfc_threshold=2,pvalue_type='adjust',plot=True):
"""
Perform pathway enrichment analysis on gene expression data.
Arguments:
adata: anndata.AnnData
Annotated data matrix containing gene expression data.
pathways_dict: dict
A dictionary of gene sets with their gene members.
organism: str, optional (default: 'Human')
The organism to be used for the enrichment analysis. Can be either 'Human' or 'Mouse'.
group_by: str, optional (default: 'louvain')
The group label of the cells in adata.obs to perform the enrichment analysis on.
cutoff: float, optional (default: 0.05)
The adjusted p-value cutoff used to filter enriched pathways.
logfc_threshold: float, optional (default: 2)
The log2 fold change cutoff used to define differentially expressed genes.
pvalue_type: str, optional (default: 'adjust')
The type of p-value used for filtering enriched pathways. Can be either 'adjust' or 'raw'.
plot: bool, optional (default: True)
If True, generate a bar plot for each cluster of enriched pathways.
Returns:
enrich_res: pandas.DataFrame
A pandas dataframe containing the enriched pathways information including term name,
p-value, adjusted p-value, overlap genes, odds ratio, log2 fold change, and cluster name.
Examples:
>>> # Perform pathway enrichment analysis on adata using pathways_dict
>>> res = pathway_enrichment(adata, pathways_dict)
Reference:
The code for pathway_enrichment() function was adapted from scanpy workflows:
https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html#Gene-set-enrichment-analysis.
"""
#import gseapy as gp
from ..externel.gseapy import enrichr
df_list = []
cluster_list = []
celltypes = sorted(adata.obs[group_by].unique())
for celltype in celltypes:
degs = sc.get.rank_genes_groups_df(adata, group=celltype, key='rank_genes_groups', log2fc_min=logfc_threshold,
pval_cutoff=cutoff)['names'].squeeze()
if isinstance(degs, str):
degs = [degs.strip()]
else:
degs = degs.str.strip().tolist()
if not degs:
continue
if (organism == 'Mouse') or (organism == 'mouse') or (organism == 'mm'):
background='mmusculus_gene_ensembl'
elif (organism == 'Human') or (organism == 'human') or (organism == 'hs'):
background='hsapiens_gene_ensembl'
else:
background=adata.var.index.tolist()
enr = enrichr(gene_list=degs,
description='',
gene_sets=pathways_dict,
organism=organism,
background=background,
cutoff=0.5,
)
if (enr is not None) and hasattr(enr, 'res2d') and (enr.res2d.shape[0] > 0):
df_list.append(enr.res2d)
cluster_list.append(celltype)
columns = ['Cluster', 'Gene_set', 'Term', 'Overlap', 'Odds Ratio','P-value', 'Adjusted P-value', 'Genes']
df = pd.DataFrame(columns = columns)
for cluster_ind, df_ in zip(cluster_list, df_list):
df_ = df_[df_['Adjusted P-value'] <= cutoff]
df_ = df_.assign(Cluster = cluster_ind)
if (df_.shape[0] > 0):
df = pd.concat([df, df_[columns]], sort=False)
df_tmp = df_.loc[:, ['Term', 'Adjusted P-value']][:min(10, df_.shape[0])]
df_tmp['Term'] = [x.split('(',1)[0] for x in df_tmp['Term']]
df_tmp['-log_adj_p'] = - np.log10(df_tmp['Adjusted P-value'])
df_tmp = df_tmp.sort_values(by='-log_adj_p', ascending=True)
if plot==True:
ax = df_tmp.plot.barh(y='-log_adj_p', x='Term', legend=False, grid=False, figsize=(12,4))
ax.set_title('Cluster {}'.format(cluster_ind))
ax.set_ylabel('')
ax.set_xlabel('-log(Adjusted P-value)')
#pp.savefig(ax.figure, bbox_inches='tight')
#plt.close()
else:
print('No pathway with an adjusted P-value less than the cutoff (={}) for cluster {}'.format(cutoff, cluster_ind))
if pvalue_type=='adjust':
enrich_res=df[df['Adjusted P-value']<cutoff]
enrich_res['logp']=-np.log(enrich_res['Adjusted P-value'])
else:
enrich_res=df[df['P-value']<cutoff]
enrich_res['logp']=-np.log(enrich_res['P-value'])
enrich_res['logc']=np.log(enrich_res['Odds Ratio'])
enrich_res['num']=[int(i.split('/')[0]) for i in enrich_res['Overlap']]
enrich_res['fraction']=[int(i.split('/')[0])/int(i.split('/')[1]) for i in enrich_res['Overlap']]
return enrich_res
omicverse.single.pathway_enrichment_plot(enrich_res, term_num=5, return_table=False, figsize=(3, 10), plot_title='', **kwds)
¶
Visualize the pathway enrichment analysis results as a heatmap.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
enrich_res |
pandas DataFrame The output from the pathway_enrichment() function. |
required | |
term_num |
int, optional The number of enriched terms to display for each cluster. Default is 5. |
5
|
|
return_table |
bool, optional Whether to return the heatmap table as a DataFrame. Default is False. |
False
|
|
figsize |
tuple, optional The size of the plot. Default is (3,10). |
(3, 10)
|
|
plot_title |
str, optional The title of the plot. Default is an empty string. |
''
|
|
**kwds |
optional Other keyword arguments to pass to the seaborn heatmap function. |
{}
|
Returns:
Name | Type | Description |
---|---|---|
ax | matplotlib Axes object The heatmap plot. |
Examples:
>>> res = pathway_enrichment(adata, pathways_dict)
>>> pathway_enrichment_plot(res, term_num=10, return_table=True, figsize=(6,12), cmap='Blues')
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_scgsea.py
def pathway_enrichment_plot(enrich_res,term_num=5,return_table=False,figsize=(3,10),plot_title='',**kwds):
"""
Visualize the pathway enrichment analysis results as a heatmap.
Parameters:
enrich_res: pandas DataFrame
The output from the pathway_enrichment() function.
term_num: int, optional
The number of enriched terms to display for each cluster. Default is 5.
return_table: bool, optional
Whether to return the heatmap table as a DataFrame. Default is False.
figsize: tuple, optional
The size of the plot. Default is (3,10).
plot_title: str, optional
The title of the plot. Default is an empty string.
**kwds: optional
Other keyword arguments to pass to the seaborn heatmap function.
Returns:
ax: matplotlib Axes object
The heatmap plot.
Examples:
>>> res = pathway_enrichment(adata, pathways_dict)
>>> pathway_enrichment_plot(res, term_num=10, return_table=True, figsize=(6,12), cmap='Blues')
"""
celltypes=enrich_res['Cluster'].unique()
plot_heatmap=pd.DataFrame(index=celltypes)
for celltype in celltypes:
res_test=enrich_res.loc[enrich_res['Cluster']==celltype].iloc[:term_num]
plot_heatmap_test=pd.DataFrame(res_test[['Term','logp']])
plot_heatmap_test=plot_heatmap_test.set_index(plot_heatmap_test['Term'])
del plot_heatmap_test['Term']
#plot_heatmap[plot_heatmap_test.index]=0
for i in plot_heatmap_test.index:
if len(enrich_res.loc[(enrich_res['Cluster']==celltype)&(enrich_res['Term']==i),'logp'].values)!=0:
plot_heatmap.loc[celltype,i]=enrich_res.loc[(enrich_res['Cluster']==celltype)&(enrich_res['Term']==i),'logp'].values[0]
else:
plot_heatmap.loc[celltype,i]=0
plot_heatmap.fillna(0,inplace=True)
if return_table==True:
return plot_heatmap
fig,ax=plt.subplots(figsize=figsize)
axr=sns.heatmap(plot_heatmap.T,ax=ax,**kwds)
axr.tick_params(right=True, top=False,left=False, labelright=True, labelleft=False,labeltop=False,rotation=0)
ax.set_title(plot_title,fontsize=12)
ax.set_yticklabels(ax.get_yticklabels(), rotation = 0, fontsize = 10)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 10)
return ax