Api scsa
omicverse.single.pySCSA
¶
Bases: object
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
class pySCSA(object):
    r"""Run SCSA marker-based cell-type annotation on a clustered AnnData object.

    The instance stores the annotation settings given to the constructor,
    writes its intermediate marker table to ``temp/rna.csv`` and keeps the most
    recent annotation table in ``self.result`` once `cell_anno` has been called.
    """

    def __init__(self,adata:anndata.AnnData,
                 foldchange:float=1.5,pvalue:float=0.05,
                 output:str='temp/rna_anno.txt',
                 model_path:str='',
                 outfmt:str='txt',Gensymbol:bool=True,
                 species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
                 celltype:str='normal',norefdb:bool=False,cellrange:str=None,
                 noprint:bool=True,list_tissue:bool=False) -> None:
        r"""Initialize the pySCSA class.

        Arguments:
            adata: AnnData object of scRNA-seq after preprocessing
            foldchange: Fold change threshold for marker filtering. (1.5)
            pvalue: P-value threshold for marker filtering. (0.05)
            output: Output file for marker annotation. ('temp/rna_anno.txt')
            model_path: Path to the Database for annotation. If not provided, the model will be downloaded from the internet. ('')
            outfmt: Output format for marker annotation. ('txt')
            Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation. (True)
            species: Species for annotation. Only used for cellmarker database. ('Human')
            weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
            tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
            target: Target to annotation class in Database. ('cellmarker')
            celltype: Cell type for annotation. ('normal')
            norefdb: Only using user-defined marker database for annotation. (False)
            cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) (None)
            noprint: Do not print any detail results. (True)
            list_tissue: List all available tissues in the database. (False)

        Returns:
            None
        """
        # The intermediate marker table and the default database download both
        # live under ./temp, so make sure it exists. Failure is reported but
        # not fatal (best effort, as before).
        try:
            if not os.path.isdir('temp'):
                print("...Creating directory {}".format('temp'))
                os.makedirs('temp', exist_ok=True)
        except OSError as e:
            print("...Unable to create directory {}. Reason {}".format('temp',e))

        self.adata=adata
        self.foldchange=foldchange
        self.pvalue=pvalue
        self.output=output
        self.outfmt=outfmt
        self.Gensymbol=Gensymbol
        self.species=species
        self.weight=weight
        self.tissue=tissue
        self.celltype=celltype
        self.norefdb=norefdb
        self.noprint=noprint
        self.list_tissue=list_tissue
        self.target=target
        self.cellrange=cellrange

        # An empty model_path means "fetch the bundled database".
        if model_path =='':
            self.model_path=data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                            path='temp/pySCSA_2023_v2_plus.db',title='whole')
        else:
            self.model_path=model_path

    def _scsa_options(self,cluster:str)->dict:
        r"""Collect the keyword arguments shared by `Annotator` and `Process.run_cmd_p`.

        Arguments:
            cluster: Cluster selector forwarded to SCSA ('all' or one cluster).

        Returns:
            options: Keyword arguments built from the stored settings.
        """
        return dict(foldchange=self.foldchange,
                    weight=self.weight,
                    pvalue=self.pvalue,
                    tissue=self.tissue,
                    species=self.species,
                    target=self.target,
                    norefdb=self.norefdb,
                    MarkerDB=None,
                    db=self.model_path,
                    noprint=self.noprint,
                    input="temp/rna.csv",
                    output=self.output,
                    source="scanpy",
                    cluster=cluster,
                    fc=self.foldchange,
                    outfmt=self.outfmt,
                    celltype=self.celltype,
                    Gensymbol=self.Gensymbol,
                    list_tissue=self.list_tissue,
                    cellrange=self.cellrange)

    def get_model_tissue(self,species:str="Human")->None:
        r"""List all available tissues in the database.

        Arguments:
            species: Species for annotation. Only used for cellmarker database. ('Human')

        Returns:
            None
        """
        anno = Annotator(**self._scsa_options('all'))
        anno.load_pickle_module(self.model_path)
        anno.get_list_tissue(species)

    def cell_anno(self,clustertype:str='leiden',
                  cluster:str='all',rank_rep=False)->pd.DataFrame:
        r"""Annotate cell type for each cluster.

        Arguments:
            clustertype: Clustering name used in scanpy. ('leiden')
            cluster: Only deal with one cluster of marker genes. ('all')
            rank_rep: Whether to repeat ranking. (False)

        Returns:
            result: Annotation result as DataFrame
        """
        dat=data_preprocess(self.adata,clustertype=clustertype,path='temp/rna.csv',rank_rep=rank_rep)
        dat.to_csv('temp/rna.csv')
        print('...Auto annotate cell')
        p = Process()
        p.run_cmd_p(**self._scsa_options(cluster))
        # SCSA is asked to write to self.output, so read the table back from
        # that same path rather than from the default file name.
        result=pd.read_csv(self.output,sep='\t')
        self.result=result
        add_reference(self.adata,'pySCSA','cell annotation with SCSA')
        return result

    def cell_anno_print(self)->None:
        r"""Print the annotation result.

        For every cluster the two first candidate rows of ``self.result`` are
        inspected. When the first Z-score is more than twice the second one,
        only the first cell type is printed (prefixed with ``Nice:``);
        otherwise the inspected candidates are printed joined by ``|``.

        Returns:
            None
        """
        for i in set(self.result['Cluster']):
            top=self.result.loc[self.result['Cluster']==i].iloc[:2]
            if top.empty:
                # e.g. a NaN cluster label never compares equal to itself
                continue
            # A runner-up only exists when the cluster has at least two rows.
            if len(top)>1 and top.iloc[0]['Z-score']>top.iloc[1]['Z-score']*2:
                print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,top.iloc[0]['Cell Type'],
                                                                        np.around(top.iloc[0]['Z-score'],3)))
            else:
                print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,('|').join(top['Cell Type'].values.tolist()),
                                                                   ('|').join(np.around(top['Z-score'].values,3).astype(str).tolist())))

    def cell_auto_anno(self,adata:anndata.AnnData,
                       clustertype:str='leiden',key='scsa_celltype')->None:
        r"""Add cell type annotation to anndata.obs['scsa_celltype'].

        Each cluster label found in ``adata.obs[clustertype]`` is assigned the
        cell type of its first row in ``self.result``; labels without a row
        become 'Unknown'. Labels are matched numerically when they can be
        converted to ``int`` and by their text form otherwise, so non-numeric
        cluster names are supported as well.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. ('leiden')
            key: Key to store cell type annotation. ('scsa_celltype')

        Returns:
            None
        """
        clusters=self.result['Cluster']
        clusters_text=clusters.astype(str)
        scsa_anno={}
        for label in adata.obs[clustertype].value_counts().index:
            mask=clusters_text==str(label)
            try:
                mask=mask|(clusters==int(label))
            except (TypeError, ValueError):
                # non-numeric label: rely on the text comparison only
                pass
            if mask.any():
                scsa_anno[str(label)]=self.result.loc[mask].iloc[0]['Cell Type']
            else:
                scsa_anno[str(label)]='Unknown'
        adata.obs[key] = adata.obs[clustertype].map(scsa_anno).astype('category')
        print('...cell type added to {} on obs of anndata'.format(key))

    def get_celltype_marker(self,adata:anndata.AnnData,
                            clustertype:str='leiden',
                            log2fc_min:int=2,scores_type='scores',
                            pval_cutoff:float=0.05,rank:bool=True,
                            unique:bool=True,global_unique:bool=False)->dict:
        r"""Get marker genes for each clusters.

        Thin wrapper that forwards every argument to the module-level
        `get_celltype_marker` function.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. (leiden)
            log2fc_min: Minimum log2 fold change of marker genes. (2)
            pval_cutoff: Maximum p value of marker genes. (0.05)
            rank: Whether to rank genes by wilcoxon test. (True)
            scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
            unique: Whether to remove duplicates within each cell type. (True)
            global_unique: Whether to remove duplicates across all cell types. (False)

        Returns:
            cellmarker: A dictionary of marker genes for each clusters.
        """
        print('...get cell type marker')
        cell_marker_dict=get_celltype_marker(adata=adata,
                                             clustertype=clustertype,
                                             log2fc_min=log2fc_min,scores_type=scores_type,
                                             pval_cutoff=pval_cutoff,rank=rank,
                                             unique=unique,global_unique=global_unique)
        return cell_marker_dict
__init__(adata, foldchange=1.5, pvalue=0.05, output='temp/rna_anno.txt', model_path='', outfmt='txt', Gensymbol=True, species='Human', weight=100, tissue='All', target='cellmarker', celltype='normal', norefdb=False, cellrange=None, noprint=True, list_tissue=False)
¶
Initialize the pySCSA class.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
AnnData object of scRNA-seq after preprocessing |
required |
foldchange |
float
|
Fold change threshold for marker filtering. (1.5) |
1.5
|
pvalue |
float
|
P-value threshold for marker filtering. (0.05) |
0.05
|
output |
str
|
Output file for marker annotation. ('temp/rna_anno.txt') |
'temp/rna_anno.txt'
|
model_path |
str
|
Path to the Database for annotation. If not provided, the model will be downloaded from the internet. ('') |
''
|
outfmt |
str
|
Output format for marker annotation. ('txt') |
'txt'
|
Gensymbol |
bool
|
Using gene symbol ID instead of ensembl ID in input file for calculation. (True) |
True
|
species |
str
|
Species for annotation. Only used for cellmarker database. ('Human') |
'Human'
|
weight |
int
|
Weight threshold for marker filtering from cellranger v1.0 results. (100) |
100
|
tissue |
str
|
Tissue for annotation. You can use `get_model_tissue` to see the available tissues. ('All') |
'All'
|
target |
str
|
Target to annotation class in Database. ('cellmarker') |
'cellmarker'
|
celltype |
str
|
Cell type for annotation. ('normal') |
'normal'
|
norefdb |
bool
|
Only using user-defined marker database for annotation. (False) |
False
|
cellrange |
str
|
Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) (None) |
None
|
noprint |
bool
|
Do not print any detail results. (True) |
True
|
list_tissue |
bool
|
List all available tissues in the database. (False) |
False
|
Returns:
Type | Description |
---|---|
None
|
None |
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def __init__(self,adata:anndata.AnnData,
             foldchange:float=1.5,pvalue:float=0.05,
             output:str='temp/rna_anno.txt',
             model_path:str='',
             outfmt:str='txt',Gensymbol:bool=True,
             species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
             celltype:str='normal',norefdb:bool=False,cellrange:str=None,
             noprint:bool=True,list_tissue:bool=False) -> None:
    r"""Store the annotation settings and locate the SCSA database.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        foldchange: Fold change threshold for marker filtering. (1.5)
        pvalue: P-value threshold for marker filtering. (0.05)
        output: Output file for marker annotation. ('temp/rna_anno.txt')
        model_path: Path to the Database for annotation. If not provided, the model will be downloaded from the internet. ('')
        outfmt: Output format for marker annotation. ('txt')
        Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation. (True)
        species: Species for annotation. Only used for cellmarker database. ('Human')
        weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
        tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
        target: Target to annotation class in Database. ('cellmarker')
        celltype: Cell type for annotation. ('normal')
        norefdb: Only using user-defined marker database for annotation. (False)
        cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) (None)
        noprint: Do not print any detail results. (True)
        list_tissue: List all available tissues in the database. (False)

    Returns:
        None
    """
    # Ensure the working directory exists; a failure is reported, not raised.
    try:
        temp_exists = os.path.isdir('temp')
        if not temp_exists:
            print("...Creating directory {}".format('temp'))
            os.makedirs('temp', exist_ok=True)
    except OSError as e:
        print("...Unable to create directory {}. Reason {}".format('temp',e))

    # Input data and output location.
    self.adata = adata
    self.output = output
    self.outfmt = outfmt

    # Marker filtering thresholds.
    self.foldchange = foldchange
    self.pvalue = pvalue
    self.weight = weight

    # Database selection options.
    self.species = species
    self.tissue = tissue
    self.target = target
    self.celltype = celltype
    self.cellrange = cellrange
    self.norefdb = norefdb

    # Miscellaneous switches.
    self.Gensymbol = Gensymbol
    self.noprint = noprint
    self.list_tissue = list_tissue

    # Use the caller's database when given, otherwise download the default one.
    if model_path != '':
        self.model_path = model_path
    else:
        self.model_path = data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                          path='temp/pySCSA_2023_v2_plus.db',title='whole')
cell_anno(clustertype='leiden', cluster='all', rank_rep=False)
¶
Annotate cell type for each cluster.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
clustertype |
str
|
Clustering name used in scanpy. ('leiden') |
'leiden'
|
cluster |
str
|
Only deal with one cluster of marker genes. ('all') |
'all'
|
rank_rep |
Whether to repeat ranking. (False) |
False
|
Returns:
Name | Type | Description |
---|---|---|
result |
pd.DataFrame
|
Annotation result as DataFrame |
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def cell_anno(self,clustertype:str='leiden',
              cluster:str='all',rank_rep=False)->pd.DataFrame:
    r"""Annotate cell type for each cluster.

    The marker table produced by `data_preprocess` is written to
    ``temp/rna.csv``, SCSA is run on it, and the annotation table SCSA writes
    to ``self.output`` is loaded, stored in ``self.result`` and returned.

    Arguments:
        clustertype: Clustering name used in scanpy. ('leiden')
        cluster: Only deal with one cluster of marker genes. ('all')
        rank_rep: Whether to repeat ranking. (False)

    Returns:
        result: Annotation result as DataFrame
    """
    dat=data_preprocess(self.adata,clustertype=clustertype,path='temp/rna.csv',rank_rep=rank_rep)
    dat.to_csv('temp/rna.csv')
    print('...Auto annotate cell')
    p = Process()
    p.run_cmd_p(foldchange=self.foldchange,
                weight=self.weight,
                pvalue=self.pvalue,
                tissue=self.tissue,
                species=self.species,
                target=self.target,
                norefdb=self.norefdb,
                MarkerDB=None,
                db=self.model_path,
                noprint=self.noprint,
                input="temp/rna.csv",
                output=self.output,
                source="scanpy",
                cluster=cluster,
                fc=self.foldchange,
                outfmt=self.outfmt,
                celltype=self.celltype,
                Gensymbol=self.Gensymbol,
                list_tissue=self.list_tissue,
                cellrange=self.cellrange)
    # SCSA is asked to write to self.output, so read the table back from that
    # same path; the previous hard-coded 'temp/rna_anno.txt' only matched the
    # default value of `output`.
    result=pd.read_csv(self.output,sep='\t')
    self.result=result
    add_reference(self.adata,'pySCSA','cell annotation with SCSA')
    return result
cell_anno_print()
¶
Print the annotation result.
Returns:
Type | Description |
---|---|
None
|
None |
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def cell_anno_print(self)->None:
    r"""Print the annotation result.

    For every cluster the two first candidate rows of ``self.result`` are
    inspected. When the first Z-score is more than twice the second one, only
    the first cell type is printed (prefixed with ``Nice:``); otherwise the
    inspected candidates are printed joined by ``|``. A cluster with a single
    candidate row is printed in the second form.

    Returns:
        None
    """
    for i in set(self.result['Cluster']):
        top=self.result.loc[self.result['Cluster']==i].iloc[:2]
        if top.empty:
            # e.g. a NaN cluster label never compares equal to itself
            continue
        # A runner-up only exists when the cluster has at least two rows;
        # indexing row 1 unconditionally raised IndexError otherwise.
        if len(top)>1 and top.iloc[0]['Z-score']>top.iloc[1]['Z-score']*2:
            print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,top.iloc[0]['Cell Type'],
                                                                    np.around(top.iloc[0]['Z-score'],3)))
        else:
            print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,('|').join(top['Cell Type'].values.tolist()),
                                                               ('|').join(np.around(top['Z-score'].values,3).astype(str).tolist())))
cell_auto_anno(adata, clustertype='leiden', key='scsa_celltype')
¶
Add cell type annotation to anndata.obs['scsa_celltype'].
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
anndata object |
required |
clustertype |
str
|
Clustering name used in scanpy. ('leiden') |
'leiden'
|
key |
Key to store cell type annotation. ('scsa_celltype') |
'scsa_celltype'
|
Returns:
Type | Description |
---|---|
None
|
None |
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def cell_auto_anno(self,adata:anndata.AnnData,
                   clustertype:str='leiden',key='scsa_celltype')->None:
    r"""Add cell type annotation to anndata.obs['scsa_celltype'].

    Each cluster label found in ``adata.obs[clustertype]`` is assigned the cell
    type of its first row in ``self.result``; labels without a row become
    'Unknown'. Labels are matched numerically when they can be converted to
    ``int`` and by their text form otherwise, so non-numeric cluster names no
    longer raise ``ValueError``.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. ('leiden')
        key: Key to store cell type annotation. ('scsa_celltype')

    Returns:
        None
    """
    clusters=self.result['Cluster']
    clusters_text=clusters.astype(str)
    scsa_anno={}
    # value_counts() is evaluated once; its index holds the distinct labels.
    for label in adata.obs[clustertype].value_counts().index:
        mask=clusters_text==str(label)
        try:
            mask=mask|(clusters==int(label))
        except (TypeError, ValueError):
            # non-numeric label: rely on the text comparison only
            pass
        if mask.any():
            scsa_anno[str(label)]=self.result.loc[mask].iloc[0]['Cell Type']
        else:
            scsa_anno[str(label)]='Unknown'
    adata.obs[key] = adata.obs[clustertype].map(scsa_anno).astype('category')
    print('...cell type added to {} on obs of anndata'.format(key))
get_model_tissue(species='Human')
¶
List all available tissues in the database.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
species |
str
|
Species for annotation. Only used for cellmarker database. ('Human') |
'Human'
|
Returns:
Type | Description |
---|---|
None
|
None |
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def get_model_tissue(self,species:str="Human")->None:
    r"""Show the tissues the annotation database offers for a species.

    An `Annotator` is configured from the stored settings, the database at
    ``self.model_path`` is loaded into it, and it is asked to list the tissues.

    Arguments:
        species: Species for annotation. Only used for cellmarker database. ('Human')

    Returns:
        None
    """
    # Keyword arguments mirror the stored settings; the cluster is fixed to 'all'.
    options = {
        'foldchange': self.foldchange,
        'weight': self.weight,
        'pvalue': self.pvalue,
        'tissue': self.tissue,
        'species': self.species,
        'target': self.target,
        'norefdb': self.norefdb,
        'MarkerDB': None,
        'db': self.model_path,
        'noprint': self.noprint,
        'input': "temp/rna.csv",
        'output': self.output,
        'source': "scanpy",
        'cluster': 'all',
        'fc': self.foldchange,
        'outfmt': self.outfmt,
        'celltype': self.celltype,
        'Gensymbol': self.Gensymbol,
        'list_tissue': self.list_tissue,
        'cellrange': self.cellrange,
    }
    annotator = Annotator(**options)
    annotator.load_pickle_module(self.model_path)
    annotator.get_list_tissue(species)
get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=True, unique=True, global_unique=False)
¶
Get marker genes for each clusters.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
anndata object |
required |
clustertype |
str
|
Clustering name used in scanpy. (leiden) |
'leiden'
|
log2fc_min |
int
|
Minimum log2 fold change of marker genes. (2) |
2
|
pval_cutoff |
float
|
Maximum p value of marker genes. (0.05) |
0.05
|
rank |
bool
|
Whether to rank genes by wilcoxon test. (True) |
True
|
scores_type |
The type of scores. Can be selected from `scores` and `logfoldchanges`. |
'scores'
|
|
unique |
bool
|
Whether to remove duplicates within each cell type. (True) |
True
|
global_unique |
bool
|
Whether to remove duplicates across all cell types. (False) |
False
|
Returns:
Name | Type | Description |
---|---|---|
cellmarker |
dict
|
A dictionary of marker genes for each clusters. |
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def get_celltype_marker(self,adata:anndata.AnnData,
                        clustertype:str='leiden',
                        log2fc_min:int=2,scores_type='scores',
                        pval_cutoff:float=0.05,rank:bool=True,
                        unique:bool=True,global_unique:bool=False)->dict:
    r"""Collect marker genes per cluster.

    Prints a progress message and forwards every argument to the module-level
    `get_celltype_marker` function, returning its result unchanged.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Whether to rank genes by wilcoxon test. (True)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        unique: Whether to remove duplicates within each cell type. (True)
        global_unique: Whether to remove duplicates across all cell types. (False)

    Returns:
        cellmarker: A dictionary of marker genes for each clusters.
    """
    print('...get cell type marker')
    forwarded = dict(adata=adata,
                     clustertype=clustertype,
                     log2fc_min=log2fc_min,
                     scores_type=scores_type,
                     pval_cutoff=pval_cutoff,
                     rank=rank,
                     unique=unique,
                     global_unique=global_unique)
    return get_celltype_marker(**forwarded)
omicverse.single.scanpy_lazy(adata, min_genes=200, min_cells=3, drop_doublet=True, n_genes_by_counts=4300, pct_counts_mt=25, target_sum=10000.0, min_mean=0.0125, max_mean=3, min_disp=0.5, max_value=10, n_comps=100, svd_solver='auto', n_neighbors=15, random_state=112, n_pcs=50)
¶
Scanpy lazy analysis pipeline.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
AnnData object |
required |
min_genes |
int
|
The min number of genes. (200) |
200
|
min_cells |
int
|
The min number of cells. (3) |
3
|
drop_doublet |
bool
|
Whether to drop doublet. (True) |
True
|
n_genes_by_counts |
int
|
The max number of genes. (4300) |
4300
|
pct_counts_mt |
int
|
The max proportion of mito-genes. (25) |
25
|
target_sum |
float
|
Total count each cell is normalized to (passed to `sc.pp.normalize_total` as `target_sum`). (1e4) |
10000.0
|
min_mean |
float
|
The min mean of genes. (0.0125) |
0.0125
|
max_mean |
int
|
The max mean of genes. (3) |
3
|
min_disp |
float
|
The min dispersion of genes. (0.5) |
0.5
|
max_value |
int
|
The max value of genes. (10) |
10
|
n_comps |
int
|
The number of components. (100) |
100
|
svd_solver |
str
|
The solver of svd. ('auto') |
'auto'
|
n_neighbors |
int
|
The number of neighbors. (15) |
15
|
random_state |
int
|
The random state. (112) |
112
|
n_pcs |
int
|
The number of pcs. (50) |
50
|
Returns:
Name | Type | Description |
---|---|---|
adata |
anndata.AnnData
|
AnnData object |
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def scanpy_lazy(adata:anndata.AnnData,min_genes:int=200,min_cells:int=3,drop_doublet:bool=True,
                n_genes_by_counts:int=4300,pct_counts_mt:int=25,
                target_sum:float=1e4,min_mean:float=0.0125, max_mean:int=3, min_disp:float=0.5,max_value:int=10,
                n_comps:int=100, svd_solver:str="auto",
                n_neighbors:int=15, random_state:int = 112, n_pcs:int=50,
                )->anndata.AnnData:
    r"""Scanpy lazy analysis pipeline.

    Runs, in order: cell/gene filtering, optional doublet removal, QC
    filtering, normalization, log1p, highly-variable-gene selection, scaling,
    PCA, neighbors, leiden, PAGA and UMAP. The first filtering steps operate
    on the object that was passed in; every subset is materialized with
    ``.copy()`` so later writes never target an AnnData view.

    Arguments:
        adata: AnnData object
        min_genes: The min number of genes. (200)
        min_cells: The min number of cells. (3)
        drop_doublet: Whether to drop doublet. (True)
        n_genes_by_counts: The max number of genes. (4300)
        pct_counts_mt: The max proportion of mito-genes. (25)
        target_sum: Total count each cell is normalized to. (1e4)
        min_mean: The min mean of genes. (0.0125)
        max_mean: The max mean of genes. (3)
        min_disp: The min dispersion of genes. (0.5)
        max_value: The max value of genes. (10)
        n_comps: The number of components. (100)
        svd_solver: The solver of svd. ('auto')
        n_neighbors: The number of neighbors. (15)
        random_state: The random state. (112)
        n_pcs: The number of pcs. (50)

    Returns:
        adata: AnnData object
    """
    #filter cells and genes
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    #filter the doublets cells
    if drop_doublet:
        sc.external.pp.scrublet(adata) #estimates doublets
        # keep only cells not flagged as doublets; copy so we own the data
        adata = adata[adata.obs['predicted_doublet'] == False].copy()
    #calculate the proportion of mito-genes
    # genes whose name starts with 'MT-' are flagged as mitochondrial
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    # apply both QC thresholds (equivalent to filtering one after the other)
    qc_pass = (adata.obs.n_genes_by_counts < n_genes_by_counts) & (adata.obs.pct_counts_mt < pct_counts_mt)
    adata = adata[qc_pass, :].copy()
    #save the raw counts
    adata.layers["counts"] = adata.X.copy()
    #normalization: scale every cell to `target_sum` total counts
    sc.pp.normalize_total(adata, target_sum=target_sum)
    #log
    sc.pp.log1p(adata)
    #select high-variable genes
    sc.pp.highly_variable_genes(adata, min_mean=min_mean, max_mean=max_mean, min_disp=min_disp)
    #keep the log-normalized full matrix in .raw, then restrict to the selected genes
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable].copy()
    #scale
    sc.pp.scale(adata, max_value=max_value)
    #pca analysis
    sc.tl.pca(adata, n_comps=n_comps, svd_solver=svd_solver)
    #cell neighbors graph construct
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, random_state = random_state, n_pcs=n_pcs)
    #clustering
    sc.tl.leiden(adata)
    #paga is computed (without drawing) so umap can be initialised from it
    sc.tl.paga(adata)
    sc.pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph
    #umap
    sc.tl.umap(adata, init_pos='paga')
    return adata
omicverse.single.scanpy_cellanno_from_dict(adata, anno_dict, anno_name='major', clustertype='leiden')
¶
Add cell type annotation from dict to anndata object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
AnnData object of scRNA-seq after preprocessing |
required |
anno_dict |
dict
|
Dict of cell type annotation. Key is the cluster name, value is the cell type name, like `{'0':'B cell','1':'T cell'}`. |
required |
anno_name |
str
|
The name of annotation. ('major') |
'major'
|
clustertype |
str
|
Clustering name used in scanpy. ('leiden') |
'leiden'
|
Returns:
Type | Description |
---|---|
None
|
None |
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def scanpy_cellanno_from_dict(adata:anndata.AnnData,
                              anno_dict:dict,
                              anno_name:str='major',
                              clustertype:str='leiden',
                              )->None:
    r"""Translate cluster labels into cell types using a user-supplied dict.

    The result is stored as a categorical column named
    ``<anno_name>_celltype`` in ``adata.obs``.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        anno_dict: Dict of cell type annotation. key is the cluster name, value is the cell type name.like `{'0':'B cell','1':'T cell'}`
        anno_name: The name of annotation. ('major')
        clustertype: Clustering name used in scanpy. ('leiden')

    Returns:
        None
    """
    target_column = anno_name+'_celltype'
    mapped_labels = adata.obs[clustertype].map(anno_dict)
    adata.obs[target_column] = mapped_labels.astype('category')
    print('...cell type added to {}_celltype on obs of anndata'.format(anno_name))
omicverse.single.get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=False, key='rank_genes_groups', method='wilcoxon', foldchange=None, topgenenumber=10, unique=True, global_unique=False)
¶
Get marker genes for each clusters.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
anndata object |
required |
clustertype |
str
|
Clustering name used in scanpy. (leiden) |
'leiden'
|
log2fc_min |
int
|
Minimum log2 fold change of marker genes. (2) |
2
|
pval_cutoff |
float
|
Maximum p value of marker genes. (0.05) |
0.05
|
rank |
bool
|
Whether to rank genes by wilcoxon test. (False) |
False
|
scores_type |
The type of scores. Can be selected from `scores` and `logfoldchanges`. |
'scores'
|
|
unique |
Whether to remove duplicates within each cell type. (True) |
True
|
|
global_unique |
Whether to remove duplicates across all cell types. (False) |
False
|
Returns:
Name | Type | Description |
---|---|---|
cellmarker |
dict
|
A dictionary of marker genes for each clusters. |
Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def get_celltype_marker(adata:anndata.AnnData,
                        clustertype:str='leiden',
                        log2fc_min:int=2,scores_type='scores',
                        pval_cutoff:float=0.05,rank:bool=False,
                        key='rank_genes_groups',method='wilcoxon',
                        foldchange=None,topgenenumber=10,unique=True,
                        global_unique=False)->dict:
    r"""Get marker genes for each clusters.

    For every cluster the differential-expression table stored under `key` is
    filtered by `log2fc_min` and `pval_cutoff`, genes whose `scores_type`
    value exceeds a threshold are kept, and the first `topgenenumber` of them
    are returned.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Whether to run `sc.tl.rank_genes_groups` when `key` is not yet present in `adata.uns`. (False)
        key: Key in `adata.uns` holding the ranking results. ('rank_genes_groups')
        method: Test method passed to `sc.tl.rank_genes_groups`. ('wilcoxon')
        foldchange: Threshold applied to `scores_type`. When None, a threshold is derived separately for each cluster from the histogram of its scores. (None)
        topgenenumber: Maximum number of marker genes kept per cluster. (10)
        unique: Whether to remove duplicates within each cell type. (True)
        global_unique: Whether to remove duplicates across all cell types. (False)

    Returns:
        cellmarker: A dictionary of marker genes for each clusters.
    """
    print('...get cell type marker')
    celltypes = sorted(adata.obs[clustertype].unique())
    cell_marker_dict={}
    # Rank only when asked to and when no results exist under `key` yet; the
    # results are stored under the same `key` they are read from below.
    if rank and key not in adata.uns.keys():
        sc.tl.rank_genes_groups(adata, clustertype, method=method, key_added=key)
    for celltype in celltypes:
        degs = sc.get.rank_genes_groups_df(adata, group=celltype, key=key, log2fc_min=log2fc_min,
                                           pval_cutoff=pval_cutoff)
        # Use a per-cluster threshold: previously the value derived for the
        # first cluster overwrote `foldchange` and was reused for all others.
        threshold=foldchange
        if threshold is None:
            edges=np.histogram(degs[scores_type])[1]
            positive_edges=edges[edges>0]
            if positive_edges.size>=6:
                # midpoint between the 5th and 6th largest positive bin edges
                threshold=(positive_edges[-5]+positive_edges[-6])/2
            else:
                # too few positive bin edges: fall back to the mean score
                threshold=degs[scores_type].mean()
        cellmarker=degs.loc[degs[scores_type]>threshold]['names'].values[:topgenenumber]
        if unique:
            # dict.fromkeys drops duplicates while keeping the ranking order
            cellmarker=list(dict.fromkeys(cellmarker))
        cell_marker_dict[celltype]=cellmarker
    # Global uniqueness across all cell types
    if global_unique:
        used_genes = set()
        for celltype in celltypes:
            if celltype in cell_marker_dict:
                # Filter out genes that have been used in previous cell types
                unique_genes = [gene for gene in cell_marker_dict[celltype] if gene not in used_genes]
                cell_marker_dict[celltype] = unique_genes
                used_genes.update(unique_genes)
    return cell_marker_dict