Api scsa
omicverse.single.pySCSA
¶
Bases: object
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
class pySCSA(object):
    r"""Cluster-level cell type annotation of scRNA-seq data with SCSA.

    An instance stores the annotation settings and the path of the marker
    database, and offers helpers to run the annotation, print a summary of
    the result and copy the predicted labels onto an AnnData object.
    """

    def __init__(self,adata:anndata.AnnData,
                 foldchange:float=1.5,pvalue:float=0.05,
                 output:str='temp/rna_anno.txt',
                 model_path:str='',
                 outfmt:str='txt',Gensymbol:bool=True,
                 species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
                 celltype:str='normal',norefdb:bool=False,cellrange:str=None,
                 noprint:bool=True,list_tissue:bool=False) -> None:
        r"""Initialize the pySCSA class

        Arguments:
            adata: AnnData object of scRNA-seq after preprocessing
            foldchange: Fold change threshold for marker filtering. (1.5)
            pvalue: P-value threshold for marker filtering. (0.05)
            output: Output file for marker annotation.(temp/rna_anno.txt)
            model_path: Path to the Database for annotation. If empty, the database is fetched with
                `data_downloader` into `temp/pySCSA_2023_v2_plus.db`.
            outfmt: Output format for marker annotation. (txt)
            Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation.
            species: Species for annotation. Only used for cellmarker database. ('Human',['Mouse'])
            weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
            tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
            target: Target to annotation class in Database. (cellmarker,[cancersea,panglaodb])
            celltype: Cell type for annotation. (normal,[cancer])
            norefdb: Only using user-defined marker database for annotation.
            cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.)
            noprint: Do not print any detail results.
            list_tissue: List all available tissues in the database.

        """
        # The intermediate files ('temp/rna.csv', downloaded database) live in
        # ./temp; failing to create it is reported but deliberately not fatal.
        try:
            if not os.path.isdir('temp'):
                print("...Creating directory {}".format('temp'))
                os.makedirs('temp', exist_ok=True)
        except OSError as e:
            print("...Unable to create directory {}. Reason {}".format('temp',e))

        self.adata=adata
        self.foldchange=foldchange
        self.pvalue=pvalue
        self.output=output
        self.outfmt=outfmt
        self.Gensymbol=Gensymbol
        self.species=species
        self.weight=weight
        self.tissue=tissue
        self.celltype=celltype
        self.norefdb=norefdb
        self.noprint=noprint
        self.list_tissue=list_tissue
        self.target=target
        self.cellrange=cellrange
        if model_path =='':
            self.model_path=data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                            path='temp/pySCSA_2023_v2_plus.db',title='whole')
        else:
            self.model_path=model_path

    def _scsa_kwargs(self,cluster:str)->dict:
        r"""Collect the keyword arguments shared by every SCSA call.

        Arguments:
            cluster: Value forwarded as the `cluster` option.

        Returns:
            kwargs: Keyword arguments built from the settings stored on the instance.

        """
        return dict(foldchange=self.foldchange,
                    weight=self.weight,
                    pvalue=self.pvalue,
                    tissue=self.tissue,
                    species=self.species,
                    target=self.target,
                    norefdb=self.norefdb,
                    MarkerDB=None,
                    db=self.model_path,
                    noprint=self.noprint,
                    input="temp/rna.csv",
                    output=self.output,
                    source="scanpy",
                    cluster=cluster,
                    fc=self.foldchange,
                    outfmt=self.outfmt,
                    celltype=self.celltype,
                    Gensymbol=self.Gensymbol,
                    list_tissue=self.list_tissue,
                    cellrange=self.cellrange)

    def get_model_tissue(self,species:str="Human")->None:
        r"""List all available tissues in the database.

        The annotator is configured from the instance settings (including
        `self.species`); the `species` argument is only passed to
        `get_list_tissue`.

        Arguments:
            species: Species for annotation. Only used for cellmarker database. ('Human',['Mouse'])

        """
        anno = Annotator(**self._scsa_kwargs('all'))
        anno.load_pickle_module(self.model_path)
        anno.get_list_tissue(species)

    def cell_anno(self,clustertype:str='leiden',
                  cluster:str='all',rank_rep=False)->pd.DataFrame:
        r"""Annotate cell type for each cluster.

        Arguments:
            clustertype: Clustering name used in scanpy. (leiden)
            cluster: Only deal with one cluster of marker genes. (all,[1],[1,2,3],[...])
            rank_rep: Forwarded unchanged to `data_preprocess`.

        Returns:
            result: Annotation table read back from `self.output`; also stored as `self.result`.

        """
        dat=data_preprocess(self.adata,clustertype=clustertype,path='temp/rna.csv',rank_rep=rank_rep)
        dat.to_csv('temp/rna.csv')

        print('...Auto annotate cell')

        p = Process()
        p.run_cmd_p(**self._scsa_kwargs(cluster))

        # Read the file the annotation was written to; a hard-coded default
        # path would be missing or stale whenever `output` was customised.
        result=pd.read_csv(self.output,sep='\t')
        self.result=result
        return result

    def cell_anno_print(self)->None:
        r"""print the annotation result

        For every cluster the best candidate is printed with a `Nice:` prefix
        when its Z-score is more than twice the runner-up's. Otherwise the top
        candidates (at most two) are printed joined by `|`. A cluster with a
        single candidate has no runner-up and is printed without the prefix.
        """
        # unique() keeps first-appearance order, so the output is deterministic.
        for i in self.result['Cluster'].unique():
            test=self.result.loc[self.result['Cluster']==i].iloc[:2]
            best=test.iloc[0]
            if len(test)>1 and best['Z-score']>test.iloc[1]['Z-score']*2:
                print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,best['Cell Type'],
                                                                       np.around(best['Z-score'],3)))
            else:
                print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,('|').join(test['Cell Type'].values.tolist()),
                                                                  ('|').join(np.around(test['Z-score'].values,3).astype(str).tolist())))

    def cell_auto_anno(self,adata:anndata.AnnData,
                       clustertype:str='leiden',key='scsa_celltype')->None:
        r"""Add cell type annotation to anndata.obs[key]

        Each cluster receives the cell type of its first row in `self.result`;
        clusters without any row are labelled 'Unknown'. Labels are compared
        as strings, so non-numeric cluster names are supported.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. (leiden)
            key: Name of the `adata.obs` column that receives the labels. (scsa_celltype)

        """
        top_hits=self.result.drop_duplicates(subset='Cluster',keep='first')
        scsa_anno=dict(zip(top_hits['Cluster'].astype(str),top_hits['Cell Type']))
        labels=adata.obs[clustertype].astype(str).map(scsa_anno).fillna('Unknown')
        adata.obs[key] = labels.astype('category')
        print('...cell type added to {} on obs of anndata'.format(key))

    def get_celltype_marker(self,adata:anndata.AnnData,
                            clustertype:str='leiden',
                            log2fc_min:int=2,scores_type='scores',
                            pval_cutoff:float=0.05,rank:bool=True)->dict:
        r"""Get marker genes for each clusters.

        Thin wrapper around the module-level `get_celltype_marker` function.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. (leiden)
            log2fc_min: Minimum log2 fold change of marker genes. (2)
            scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
            pval_cutoff: Maximum p value of marker genes. (0.05)
            rank: Forwarded as `rank` to the module-level function. (True)

        Returns:
            cellmarker: A dictionary of marker genes for each clusters.

        """
        print('...get cell type marker')
        cell_marker_dict=get_celltype_marker(adata=adata,
                                             clustertype=clustertype,
                                             log2fc_min=log2fc_min,scores_type=scores_type,
                                             pval_cutoff=pval_cutoff,rank=rank)
        return cell_marker_dict
__init__(adata, foldchange=1.5, pvalue=0.05, output='temp/rna_anno.txt', model_path='', outfmt='txt', Gensymbol=True, species='Human', weight=100, tissue='All', target='cellmarker', celltype='normal', norefdb=False, cellrange=None, noprint=True, list_tissue=False)
¶
Initialize the pySCSA class
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
AnnData object of scRNA-seq after preprocessing |
required |
foldchange |
float
|
Fold change threshold for marker filtering. (1.5) |
1.5
|
pvalue |
float
|
P-value threshold for marker filtering. (0.05) |
0.05
|
output |
str
|
Output file for marker annotation.(temp/rna_anno.txt) |
'temp/rna_anno.txt'
|
model_path |
str
|
Path to the Database for annotation. If not provided, the model will be downloaded from the internet. |
''
|
outfmt |
str
|
Output format for marker annotation. (txt) |
'txt'
|
Gensymbol |
bool
|
Using gene symbol ID instead of ensembl ID in input file for calculation. |
True
|
species |
str
|
Species for annotation. Only used for cellmarker database. ('Human',['Mouse']) |
'Human'
|
weight |
int
|
Weight threshold for marker filtering from cellranger v1.0 results. (100) |
100
|
tissue |
str
|
Tissue for annotation. You can use `get_model_tissue` to see the available tissues. ('All') |
'All'
|
target |
str
|
Target to annotation class in Database. (cellmarker,[cancersea,panglaodb]) |
'cellmarker'
|
celltype |
str
|
Cell type for annotation. (normal,[cancer]) |
'normal'
|
norefdb |
bool
|
Only using user-defined marker database for annotation. |
False
|
noprint |
bool
|
Do not print any detail results. |
True
|
list_tissue |
bool
|
List all available tissues in the database. |
False
|
cellrange |
str
|
Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) |
None
|
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def __init__(self,adata:anndata.AnnData,
             foldchange:float=1.5,pvalue:float=0.05,
             output:str='temp/rna_anno.txt',
             model_path:str='',
             outfmt:str='txt',Gensymbol:bool=True,
             species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
             celltype:str='normal',norefdb:bool=False,cellrange:str=None,
             noprint:bool=True,list_tissue:bool=False) -> None:
    r"""Store the annotation settings and resolve the marker database path.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        foldchange: Fold change threshold for marker filtering. (1.5)
        pvalue: P-value threshold for marker filtering. (0.05)
        output: Output file for marker annotation.(temp/rna_anno.txt)
        model_path: Path to the Database for annotation. When left empty the database is obtained
            through `data_downloader` and stored as `temp/pySCSA_2023_v2_plus.db`.
        outfmt: Output format for marker annotation. (txt)
        Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation.
        species: Species for annotation. Only used for cellmarker database. ('Human',['Mouse'])
        weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
        tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
        target: Target to annotation class in Database. (cellmarker,[cancersea,panglaodb])
        celltype: Cell type for annotation. (normal,[cancer])
        norefdb: Only using user-defined marker database for annotation.
        cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.)
        noprint: Do not print any detail results.
        list_tissue: List all available tissues in the database.

    """
    # Working directory for intermediate files; a failure to create it is
    # reported on stdout and otherwise ignored.
    workdir = 'temp'
    try:
        if not os.path.isdir(workdir):
            print("...Creating directory {}".format(workdir))
            os.makedirs(workdir, exist_ok=True)
    except OSError as err:
        print("...Unable to create directory {}. Reason {}".format(workdir, err))

    # Every setting is kept on the instance under the name of its parameter.
    settings = {
        'adata': adata,
        'foldchange': foldchange,
        'pvalue': pvalue,
        'output': output,
        'outfmt': outfmt,
        'Gensymbol': Gensymbol,
        'species': species,
        'weight': weight,
        'tissue': tissue,
        'celltype': celltype,
        'norefdb': norefdb,
        'noprint': noprint,
        'list_tissue': list_tissue,
        'target': target,
        'cellrange': cellrange,
    }
    for attribute, value in settings.items():
        setattr(self, attribute, value)

    # An empty path means "use the downloaded default database".
    if model_path == '':
        model_path = data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                     path='temp/pySCSA_2023_v2_plus.db', title='whole')
    self.model_path = model_path
cell_anno(clustertype='leiden', cluster='all', rank_rep=False)
¶
Annotate cell type for each cluster.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
clustertype |
str
|
Clustering name used in scanpy. (leiden) |
'leiden'
|
cluster |
str
|
Only deal with one cluster of marker genes. (all,[1],[1,2,3],[...]) |
'all'
|
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def cell_anno(self,clustertype:str='leiden',
              cluster:str='all',rank_rep=False)->pd.DataFrame:
    r"""Annotate cell type for each cluster.

    Arguments:
        clustertype: Clustering name used in scanpy. (leiden)
        cluster: Only deal with one cluster of marker genes. (all,[1],[1,2,3],[...])
        rank_rep: Forwarded unchanged to `data_preprocess`.

    Returns:
        result: Annotation table read back from `self.output`; also stored as `self.result`.

    """
    dat=data_preprocess(self.adata,clustertype=clustertype,path='temp/rna.csv',rank_rep=rank_rep)
    dat.to_csv('temp/rna.csv')

    print('...Auto annotate cell')

    p = Process()
    p.run_cmd_p(foldchange=self.foldchange,
                weight=self.weight,
                pvalue=self.pvalue,
                tissue=self.tissue,
                species=self.species,
                target=self.target,
                norefdb=self.norefdb,
                MarkerDB=None,
                db=self.model_path,
                noprint=self.noprint,
                input="temp/rna.csv",
                output=self.output,
                source="scanpy",
                cluster=cluster,
                fc=self.foldchange,
                outfmt=self.outfmt,
                celltype=self.celltype,
                Gensymbol=self.Gensymbol,
                list_tissue=self.list_tissue,
                cellrange=self.cellrange)

    # Read the file the annotation was written to; a hard-coded default path
    # would be missing or stale whenever `output` was customised.
    result=pd.read_csv(self.output,sep='\t')
    self.result=result
    return result
cell_anno_print()
¶
print the annotation result
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def cell_anno_print(self)->None:
    r"""print the annotation result

    For every cluster the best candidate is printed with a `Nice:` prefix when
    its Z-score is more than twice the runner-up's. Otherwise the top
    candidates (at most two) are printed joined by `|`. A cluster with a
    single candidate has no runner-up and is printed without the prefix.
    """
    # unique() keeps first-appearance order, so the output is deterministic.
    for i in self.result['Cluster'].unique():
        test=self.result.loc[self.result['Cluster']==i].iloc[:2]
        best=test.iloc[0]
        # Guard on len(test): a single-row cluster has no second candidate.
        if len(test)>1 and best['Z-score']>test.iloc[1]['Z-score']*2:
            print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,best['Cell Type'],
                                                                   np.around(best['Z-score'],3)))
        else:
            print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,('|').join(test['Cell Type'].values.tolist()),
                                                              ('|').join(np.around(test['Z-score'].values,3).astype(str).tolist())))
cell_auto_anno(adata, clustertype='leiden', key='scsa_celltype')
¶
Add cell type annotation to `adata.obs[key]` (the default key is 'scsa_celltype').
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
anndata object |
required |
clustertype |
str
|
Clustering name used in scanpy. (leiden) |
'leiden'
|
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def cell_auto_anno(self,adata:anndata.AnnData,
                   clustertype:str='leiden',key='scsa_celltype')->None:
    r"""Add cell type annotation to anndata.obs[key]

    Each cluster receives the cell type of its first row in `self.result`;
    clusters without any row are labelled 'Unknown'. Labels are compared as
    strings, so non-numeric cluster names are supported.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        key: Name of the `adata.obs` column that receives the labels. (scsa_celltype)

    """
    # First row per cluster = the candidate listed first in the result table.
    top_hits=self.result.drop_duplicates(subset='Cluster',keep='first')
    scsa_anno=dict(zip(top_hits['Cluster'].astype(str),top_hits['Cell Type']))
    # String comparison avoids int() failures on labels such as 'B'.
    labels=adata.obs[clustertype].astype(str).map(scsa_anno).fillna('Unknown')
    adata.obs[key] = labels.astype('category')
    print('...cell type added to {} on obs of anndata'.format(key))
get_model_tissue(species='Human')
¶
List all available tissues in the database.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
species |
str
|
Species for annotation. Only used for cellmarker database. ('Human',['Mouse']) |
'Human'
|
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def get_model_tissue(self,species:str="Human")->None:
    r"""Show the tissues the marker database knows about.

    An `Annotator` is configured from the settings stored on the instance
    (including `self.species`), the database at `self.model_path` is loaded
    into it, and `get_list_tissue` is called with the `species` argument.

    Arguments:
        species: Species for annotation. Only used for cellmarker database. ('Human',['Mouse'])

    """
    annotator_options = {
        'foldchange': self.foldchange,
        'weight': self.weight,
        'pvalue': self.pvalue,
        'tissue': self.tissue,
        'species': self.species,
        'target': self.target,
        'norefdb': self.norefdb,
        'MarkerDB': None,
        'db': self.model_path,
        'noprint': self.noprint,
        'input': "temp/rna.csv",
        'output': self.output,
        'source': "scanpy",
        'cluster': 'all',
        'fc': self.foldchange,
        'outfmt': self.outfmt,
        'celltype': self.celltype,
        'Gensymbol': self.Gensymbol,
        'list_tissue': self.list_tissue,
        'cellrange': self.cellrange,
    }
    tissue_lister = Annotator(**annotator_options)
    tissue_lister.load_pickle_module(self.model_path)
    tissue_lister.get_list_tissue(species)
get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=True)
¶
Get marker genes for each clusters.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
anndata object |
required |
clustertype |
str
|
Clustering name used in scanpy. (leiden) |
'leiden'
|
log2fc_min |
int
|
Minimum log2 fold change of marker genes. (2) |
2
|
pval_cutoff |
float
|
Maximum p value of marker genes. (0.05) |
0.05
|
rank |
bool
|
Whether to rank genes by wilcoxon test. (True) |
True
|
scores_type |
The type of scores. Can be selected from `scores` and `logfoldchanges`. |
'scores'
|
Returns:
Name | Type | Description |
---|---|---|
cellmarker |
dict
|
A dictionary of marker genes for each clusters. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def get_celltype_marker(self,adata:anndata.AnnData,
                        clustertype:str='leiden',
                        log2fc_min:int=2,scores_type='scores',
                        pval_cutoff:float=0.05,rank:bool=True)->dict:
    r"""Collect the marker genes of every cluster.

    Method wrapper that prints a progress line and delegates to the
    module-level function of the same name.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Forwarded as `rank` to the module-level function. (True)

    Returns:
        cellmarker: A dictionary of marker genes for each clusters.

    """
    print('...get cell type marker')
    forwarded = {
        'adata': adata,
        'clustertype': clustertype,
        'log2fc_min': log2fc_min,
        'scores_type': scores_type,
        'pval_cutoff': pval_cutoff,
        'rank': rank,
    }
    return get_celltype_marker(**forwarded)
omicverse.single.scanpy_lazy(adata, min_genes=200, min_cells=3, drop_doublet=True, n_genes_by_counts=4300, pct_counts_mt=25, target_sum=10000.0, min_mean=0.0125, max_mean=3, min_disp=0.5, max_value=10, n_comps=100, svd_solver='auto', n_neighbors=15, random_state=112, n_pcs=50)
¶
scanpy lazy analysis
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
AnnData object |
required |
min_genes |
int
|
the min number of genes |
200
|
min_cells |
int
|
the min number of cells |
3
|
drop_doublet |
bool
|
whether to drop doublet |
True
|
n_genes_by_counts |
int
|
the max number of genes |
4300
|
pct_counts_mt |
int
|
the max proportion of mito-genes |
25
|
target_sum |
float
|
the total count every cell is normalized to (`target_sum` of `sc.pp.normalize_total`) |
10000.0
|
min_mean |
float
|
the min mean of genes |
0.0125
|
max_mean |
int
|
the max mean of genes |
3
|
min_disp |
float
|
the min dispersion of genes |
0.5
|
max_value |
int
|
the max value of genes |
10
|
n_comps |
int
|
the number of components |
100
|
svd_solver |
str
|
the solver of svd |
'auto'
|
n_neighbors |
int
|
the number of neighbors |
15
|
random_state |
int
|
the random state |
112
|
n_pcs |
int
|
the number of pcs |
50
|
Returns:
Name | Type | Description |
---|---|---|
adata |
anndata.AnnData
|
AnnData object |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def scanpy_lazy(adata:anndata.AnnData,min_genes:int=200,min_cells:int=3,drop_doublet:bool=True,
                n_genes_by_counts:int=4300,pct_counts_mt:int=25,
                target_sum:float=1e4,min_mean:float=0.0125, max_mean:int=3, min_disp:float=0.5,max_value:int=10,
                n_comps:int=100, svd_solver:str="auto",
                n_neighbors:int=15, random_state:int = 112, n_pcs:int=50,
                )->anndata.AnnData:
    r"""scanpy lazy analysis

    Runs, in order: cell/gene filtering, optional doublet removal, QC
    filtering, normalization, log1p, highly-variable-gene selection, scaling,
    PCA, neighbors, leiden, PAGA and a PAGA-initialised UMAP.

    Arguments:
        adata: AnnData object. The initial cell/gene filtering is applied to it in place.
        min_genes: the min number of genes
        min_cells: the min number of cells
        drop_doublet: whether to drop doublet
        n_genes_by_counts: the max number of genes
        pct_counts_mt: the max proportion of mito-genes
        target_sum: the total count every cell is normalized to
        min_mean: the min mean of genes
        max_mean: the max mean of genes
        min_disp: the min dispersion of genes
        max_value: the max value of genes
        n_comps: the number of components
        svd_solver: the solver of svd
        n_neighbors: the number of neighbors
        random_state: the random state
        n_pcs: the number of pcs

    Returns:
        adata: AnnData object restricted to the highly variable genes; the
            log-normalized data of all genes is kept in `adata.raw` and the
            counts before normalization in `adata.layers['counts']`.

    """
    # Filter cells and genes.
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)

    # Remove predicted doublets. Every subset below is materialised with
    # .copy() so later assignments never write into an AnnData view.
    if drop_doublet:
        sc.external.pp.scrublet(adata)  # estimates doublets
        adata = adata[adata.obs['predicted_doublet'] == False].copy()  # noqa: E712

    # Flag genes whose name starts with 'MT-' as mitochondrial and compute QC metrics.
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    passes_qc = ((adata.obs.n_genes_by_counts < n_genes_by_counts)
                 & (adata.obs.pct_counts_mt < pct_counts_mt))
    adata = adata[passes_qc, :].copy()

    # Keep the counts as they are before normalization.
    adata.layers["counts"] = adata.X.copy()
    # Normalize every cell to `target_sum` total counts, then log-transform.
    sc.pp.normalize_total(adata, target_sum=target_sum)
    sc.pp.log1p(adata)

    # Select highly variable genes; keep the full gene set in .raw first.
    sc.pp.highly_variable_genes(adata, min_mean=min_mean, max_mean=max_mean, min_disp=min_disp)
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable].copy()

    # Scale and reduce dimensionality.
    sc.pp.scale(adata, max_value=max_value)
    sc.tl.pca(adata, n_comps=n_comps, svd_solver=svd_solver)

    # Neighbor graph, clustering and embedding.
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, random_state = random_state, n_pcs=n_pcs)
    sc.tl.leiden(adata)
    sc.tl.paga(adata)
    sc.pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph
    sc.tl.umap(adata, init_pos='paga')
    return adata
omicverse.single.scanpy_cellanno_from_dict(adata, anno_dict, anno_name='major', clustertype='leiden')
¶
add cell type annotation from dict to anndata object
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
AnnData object of scRNA-seq after preprocessing |
required |
anno_dict |
dict
|
dict of cell type annotation. Key is the cluster name, value is the cell type name, like `{'0':'B cell','1':'T cell'}` |
required |
anno_name |
str
|
the name of annotation |
'major'
|
clustertype |
str
|
Clustering name used in scanpy. (leiden) |
'leiden'
|
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def scanpy_cellanno_from_dict(adata:anndata.AnnData,
                              anno_dict:dict,
                              anno_name:str='major',
                              clustertype:str='leiden',
                              )->None:
    r"""Label cells by translating their cluster through a user dictionary.

    The result is stored as a categorical column named
    `<anno_name>_celltype` in `adata.obs`.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        anno_dict: dict of cell type annotation. key is the cluster name, value is the cell type name.like `{'0':'B cell','1':'T cell'}`
        anno_name: the name of annotation
        clustertype: Clustering name used in scanpy. (leiden)

    """
    target_column = anno_name+'_celltype'
    translated = adata.obs[clustertype].map(anno_dict)
    adata.obs[target_column] = translated.astype('category')
    print('...cell type added to {}_celltype on obs of anndata'.format(anno_name))
omicverse.single.get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=False, key='rank_genes_groups', method='wilcoxon', foldchange=None, topgenenumber=10)
¶
Get marker genes for each clusters.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
anndata object |
required |
clustertype |
str
|
Clustering name used in scanpy. (leiden) |
'leiden'
|
log2fc_min |
int
|
Minimum log2 fold change of marker genes. (2) |
2
|
pval_cutoff |
float
|
Maximum p value of marker genes. (0.05) |
0.05
|
rank |
bool
|
Whether the gene ranking has already been computed. If False, `sc.tl.rank_genes_groups` is run first with `method`. (False) |
False
|
scores_type |
The type of scores. Can be selected from `scores` and `logfoldchanges`. |
'scores'
|
Returns:
Name | Type | Description |
---|---|---|
cellmarker |
dict
|
A dictionary of marker genes for each clusters. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def _auto_score_threshold(scores):
    r"""Derive a score cutoff from the histogram of `scores`.

    The cutoff is the midpoint of the 5th and 6th largest positive bin edges
    of `np.histogram(scores)`. If fewer than six edges are positive, 0.0 is
    returned so that positively scored genes are kept instead of raising.
    """
    edges = np.histogram(scores)[1]
    positive_edges = edges[edges > 0]
    if positive_edges.size < 6:
        return 0.0
    return (positive_edges[-5] + positive_edges[-6]) / 2


def get_celltype_marker(adata:anndata.AnnData,
                        clustertype:str='leiden',
                        log2fc_min:int=2,scores_type='scores',
                        pval_cutoff:float=0.05,rank:bool=False,
                        key='rank_genes_groups',method='wilcoxon',
                        foldchange=None,topgenenumber=10)->dict:
    r"""Get marker genes for each clusters.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Whether the gene ranking is already stored under `key`. If False,
            `sc.tl.rank_genes_groups` is run first. (False)
        key: Key of the ranking result in `adata.uns`. (rank_genes_groups)
        method: Test method used when the ranking is computed here. (wilcoxon)
        foldchange: Minimum value of `scores_type` for a gene to be kept. If None,
            a cutoff is derived separately for every cluster from the histogram
            of its scores.
        topgenenumber: Maximum number of marker genes kept per cluster. (10)

    Returns:
        cellmarker: A dictionary of marker genes for each clusters.

    """
    print('...get cell type marker')
    celltypes = sorted(adata.obs[clustertype].unique())
    if not rank:
        # Store the ranking under `key`, the same key it is read back from.
        sc.tl.rank_genes_groups(adata, clustertype, method=method, key_added=key)

    cell_marker_dict={}
    for celltype in celltypes:
        degs = sc.get.rank_genes_groups_df(adata, group=celltype, key=key, log2fc_min=log2fc_min,
                                           pval_cutoff=pval_cutoff)
        # Use a per-cluster local so an automatically derived cutoff does not
        # leak from the first cluster into all the following ones.
        if foldchange is None:
            threshold = _auto_score_threshold(degs[scores_type])
        else:
            threshold = foldchange
        cellmarker = degs.loc[degs[scores_type]>threshold]['names'].values[:topgenenumber]
        cell_marker_dict[celltype] = list(cellmarker)
    return cell_marker_dict