Skip to content

Api scsa

omicverse.single.pySCSA

Bases: object

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
class pySCSA(object):
    r"""Annotate the clusters of an AnnData object with cell types.

    The instance stores the annotation settings. `cell_anno` runs the
    annotation through `Process.run_cmd_p` and keeps the resulting table in
    `self.result`; the remaining methods print that table or copy the
    top-ranked cell type of every cluster back into an AnnData object.
    """

    def __init__(self,adata:anndata.AnnData,
                foldchange:float=1.5,pvalue:float=0.05,
                output:str='temp/rna_anno.txt',
                model_path:str='',
                outfmt:str='txt',Gensymbol:bool=True,
                species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
                celltype:str='normal',norefdb:bool=False,cellrange:str=None,
                noprint:bool=True,list_tissue:bool=False) -> None:

        r"""Initialize the pySCSA class

        Arguments:
            adata: AnnData object of scRNA-seq after preprocessing
            foldchange: Fold change threshold for marker filtering. (1.5)
            pvalue: P-value threshold for marker filtering. (0.05)
            output: Output file for marker annotation.(temp/rna_anno.txt)
            model_path: Path to the Database for annotation. If not provided, the model will be downloaded from the internet.
            outfmt: Output format for marker annotation. (txt)
            Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation.
            species: Species for annotation. Only used for cellmarker database. ('Human',['Mouse'])
            weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
            tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
            target: Target to annotation class in Database. (cellmarker,[cancersea,panglaodb])
            celltype: Cell type for annotation. (normal,[cancer])
            norefdb: Only using user-defined marker database for annotation.
            noprint: Do not print any detail results.
            list_tissue: List all available tissues in the database.
            cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.)

        """

        # Create the working directory used for the intermediate marker table
        # and the downloaded database. A failure is only reported, not raised.
        try:
            if not os.path.isdir('temp'):
                print("...Creating directory {}".format('temp'))
                os.makedirs('temp', exist_ok=True)
        except OSError as e:
            print("...Unable to create directory {}. Reason {}".format('temp',e))

        self.adata=adata
        self.foldchange=foldchange
        self.pvalue=pvalue
        self.output=output
        self.outfmt=outfmt
        self.Gensymbol=Gensymbol
        self.species=species
        self.weight=weight
        self.tissue=tissue
        self.celltype=celltype
        self.norefdb=norefdb
        self.noprint=noprint
        self.list_tissue=list_tissue
        self.target=target
        self.cellrange=cellrange
        # An empty model_path means "fetch the database"; whatever
        # data_downloader returns is kept as the path.
        if model_path =='':
            self.model_path=data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                            path='temp/pySCSA_2023_v2_plus.db',title='whole')
        else:
            self.model_path=model_path

    def get_model_tissue(self,species:str="Human")->None:
        r"""List all available tissues in the database.

        Arguments:
            species: Species passed to `Annotator.get_list_tissue`. ('Human',['Mouse'])

        """

        # The Annotator itself is configured with the instance settings
        # (including self.species); only the listing call uses `species`.
        anno = Annotator(foldchange=self.foldchange,
                    weight=self.weight,
                    pvalue=self.pvalue,
                    tissue=self.tissue,
                    species=self.species,
                    target=self.target,
                    norefdb=self.norefdb,
                    MarkerDB=None,
                    db=self.model_path,
                    noprint=self.noprint,
                    input="temp/rna.csv",
                    output=self.output,
                    source="scanpy",
                    cluster='all',
                    fc=self.foldchange,
                    outfmt=self.outfmt,
                    celltype=self.celltype,
                    Gensymbol=self.Gensymbol,
                    list_tissue=self.list_tissue,
                    cellrange=self.cellrange)
        anno.load_pickle_module(self.model_path)
        anno.get_list_tissue(species)


    def cell_anno(self,clustertype:str='leiden',
                  cluster:str='all',rank_rep=False)->pd.DataFrame:
        r"""Annotate cell type for each cluster.

        Arguments:
            clustertype: Clustering name used in scanpy. (leiden)
            cluster: Only deal with one cluster of marker genes. (all,[1],[1,2,3],[...])
            rank_rep: Forwarded unchanged to `data_preprocess`. (False)

        Returns:
            result: The annotation table read back from `self.output`; it is also stored in `self.result`.
        """

        dat=data_preprocess(self.adata,clustertype=clustertype,path='temp/rna.csv',rank_rep=rank_rep)
        dat.to_csv('temp/rna.csv')

        print('...Auto annotate cell')

        p = Process()
        p.run_cmd_p(foldchange=self.foldchange,
                    weight=self.weight,
                    pvalue=self.pvalue,
                    tissue=self.tissue,
                    species=self.species,
                    target=self.target,
                    norefdb=self.norefdb,
                    MarkerDB=None,
                    db=self.model_path,
                    noprint=self.noprint,
                    input="temp/rna.csv",
                    output=self.output,
                    source="scanpy",
                    cluster=cluster,
                    fc=self.foldchange,
                    outfmt=self.outfmt,
                    celltype=self.celltype,
                    Gensymbol=self.Gensymbol,
                    list_tissue=self.list_tissue,
                    cellrange=self.cellrange)

        # Read the file the annotation was written to. A hard-coded
        # 'temp/rna_anno.txt' here would ignore a user-supplied `output`.
        # NOTE(review): the table is parsed as tab-separated text, which
        # presumably only matches outfmt='txt' - confirm for other formats.
        result=pd.read_csv(self.output,sep='\t')
        self.result=result
        return result

    def cell_anno_print(self)->None:
        r"""Print the top annotation candidates of every cluster.

        Requires `cell_anno` to have been run, because it reads `self.result`.
        A line is prefixed with ``Nice:`` when the first candidate's Z-score is
        more than twice that of the second one; otherwise the first two
        candidates are printed joined by ``|``.
        """
        for i in set(self.result['Cluster']):
            test=self.result.loc[self.result['Cluster']==i].iloc[:2]
            # The comparison needs a runner-up. A cluster with a single
            # candidate row falls through to the plain format instead of
            # raising IndexError on test.iloc[1].
            if len(test)>1 and test.iloc[0]['Z-score']>test.iloc[1]['Z-score']*2:
                print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,test.iloc[0]['Cell Type'],
                                                            np.around(test.iloc[0]['Z-score'],3)))
            else:
                print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,('|').join(test['Cell Type'].values.tolist()),
                                                            ('|').join(np.around(test['Z-score'].values,3).astype(str).tolist())))

    def cell_auto_anno(self,adata:anndata.AnnData,
                       clustertype:str='leiden',key='scsa_celltype')->None:
        r"""Add cell type annotation to anndata.obs[key]

        Every cluster receives the cell type of its first row in `self.result`;
        clusters without any row are labelled 'Unknown'.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. (leiden)
            key: Name of the `adata.obs` column that receives the annotation. (scsa_celltype)
        """
        def _norm(label):
            # Integer-like labels ('0', 0) are unified so they match whatever
            # dtype the result table was parsed with; any other label is
            # compared as text instead of crashing in int().
            try:
                return str(int(label))
            except (TypeError, ValueError):
                return str(label)

        # First row per cluster is the candidate used as annotation.
        best=self.result.drop_duplicates(subset='Cluster',keep='first')
        top_celltype=dict(zip([_norm(c) for c in best['Cluster']],best['Cell Type']))

        labels=adata.obs[clustertype].value_counts().index
        scsa_anno={str(i):top_celltype.get(_norm(i),'Unknown') for i in labels}
        adata.obs[key] = adata.obs[clustertype].map(scsa_anno).astype('category')
        print('...cell type added to {} on obs of anndata'.format(key))

    def get_celltype_marker(self,adata:anndata.AnnData,
                            clustertype:str='leiden',
                            log2fc_min:int=2,scores_type='scores',
                            pval_cutoff:float=0.05,rank:bool=True)->dict:
        r"""Get marker genes for each cluster.

        Thin wrapper that forwards every argument to the module-level
        `get_celltype_marker` function.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. (leiden)
            log2fc_min: Minimum log2 fold change of marker genes. (2)
            pval_cutoff: Maximum p value of marker genes. (0.05)
            rank: Forwarded as `rank` to the module-level function. (True)
            scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`

        Returns:
            cellmarker: A dictionary of marker genes for each cluster.
        """
        print('...get cell type marker')
        cell_marker_dict=get_celltype_marker(adata=adata,
                            clustertype=clustertype,
                            log2fc_min=log2fc_min,scores_type=scores_type,
                            pval_cutoff=pval_cutoff,rank=rank)

        return cell_marker_dict

__init__(adata, foldchange=1.5, pvalue=0.05, output='temp/rna_anno.txt', model_path='', outfmt='txt', Gensymbol=True, species='Human', weight=100, tissue='All', target='cellmarker', celltype='normal', norefdb=False, cellrange=None, noprint=True, list_tissue=False)

Initialize the pySCSA class

Parameters:

Name Type Description Default
adata anndata.AnnData

AnnData object of scRNA-seq after preprocessing

required
foldchange float

Fold change threshold for marker filtering. (1.5)

1.5
pvalue float

P-value threshold for marker filtering. (0.05)

0.05
output str

Output file for marker annotation.(temp/rna_anno.txt)

'temp/rna_anno.txt'
model_path str

Path to the Database for annotation. If not provided, the model will be downloaded from the internet.

''
outfmt str

Output format for marker annotation. (txt)

'txt'
Gensymbol bool

Using gene symbol ID instead of ensembl ID in input file for calculation.

True
species str

Species for annotation. Only used for cellmarker database. ('Human',['Mouse'])

'Human'
weight int

Weight threshold for marker filtering from cellranger v1.0 results. (100)

100
tissue str

Tissue for annotation. you can use get_model_tissue to see the available tissues. ('All')

'All'
target str

Target to annotation class in Database. (cellmarker,[cancersea,panglaodb])

'cellmarker'
celltype str

Cell type for annotation. (normal,[cancer])

'normal'
norefdb bool

Only using user-defined marker database for annotation.

False
noprint bool

Do not print any detail results.

True
list_tissue bool

List all available tissues in the database.

False
cellrange str

Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.)

None
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def __init__(self,adata:anndata.AnnData,
            foldchange:float=1.5,pvalue:float=0.05,
            output:str='temp/rna_anno.txt',
            model_path:str='',
            outfmt:str='txt',Gensymbol:bool=True,
            species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
            celltype:str='normal',norefdb:bool=False,cellrange:str=None,
            noprint:bool=True,list_tissue:bool=False) -> None:

    r"""Store the annotation settings and locate the marker database.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        foldchange: Fold change threshold for marker filtering. (1.5)
        pvalue: P-value threshold for marker filtering. (0.05)
        output: Output file for marker annotation.(temp/rna_anno.txt)
        model_path: Path to the Database for annotation. If left empty, the database is fetched with `data_downloader`.
        outfmt: Output format for marker annotation. (txt)
        Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation.
        species: Species for annotation. Only used for cellmarker database. ('Human',['Mouse'])
        weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
        tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
        target: Target to annotation class in Database. (cellmarker,[cancersea,panglaodb])
        celltype: Cell type for annotation. (normal,[cancer])
        norefdb: Only using user-defined marker database for annotation.
        cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.)
        noprint: Do not print any detail results.
        list_tissue: List all available tissues in the database.

    """

    # Working directory for intermediate files; a failure to create it is
    # reported on stdout but deliberately not raised.
    workdir='temp'
    try:
        if not os.path.isdir(workdir):
            print("...Creating directory {}".format(workdir))
            os.makedirs(workdir, exist_ok=True)
    except OSError as err:
        print("...Unable to create directory {}. Reason {}".format(workdir,err))

    # Data and output settings
    self.adata=adata
    self.output=output
    self.outfmt=outfmt

    # Marker filtering thresholds
    self.foldchange=foldchange
    self.pvalue=pvalue
    self.weight=weight

    # Database selection
    self.Gensymbol=Gensymbol
    self.species=species
    self.tissue=tissue
    self.target=target
    self.celltype=celltype
    self.cellrange=cellrange
    self.norefdb=norefdb

    # Reporting switches
    self.noprint=noprint
    self.list_tissue=list_tissue

    # A user-supplied path is used verbatim; otherwise the database is
    # downloaded and the value returned by data_downloader is kept.
    if model_path !='':
        self.model_path=model_path
    else:
        self.model_path=data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                        path='temp/pySCSA_2023_v2_plus.db',title='whole')

cell_anno(clustertype='leiden', cluster='all', rank_rep=False)

Annotate cell type for each cluster.

Parameters:

Name Type Description Default
clustertype str

Clustering name used in scanpy. (leiden)

'leiden'
cluster str

Only deal with one cluster of marker genes. (all,[1],[1,2,3],[...])

'all'
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def cell_anno(self,clustertype:str='leiden',
              cluster:str='all',rank_rep=False)->pd.DataFrame:
    r"""Annotate cell type for each cluster.

    The marker table produced by `data_preprocess` is written to
    'temp/rna.csv', the annotation is run through `Process.run_cmd_p`, and
    the table written to `self.output` is read back.

    Arguments:
        clustertype: Clustering name used in scanpy. (leiden)
        cluster: Only deal with one cluster of marker genes. (all,[1],[1,2,3],[...])
        rank_rep: Forwarded unchanged to `data_preprocess`. (False)

    Returns:
        result: The annotation table; it is also stored in `self.result`.
    """

    marker_csv='temp/rna.csv'
    dat=data_preprocess(self.adata,clustertype=clustertype,path=marker_csv,rank_rep=rank_rep)
    dat.to_csv(marker_csv)

    print('...Auto annotate cell')

    p = Process()
    p.run_cmd_p(foldchange=self.foldchange,
                weight=self.weight,
                pvalue=self.pvalue,
                tissue=self.tissue,
                species=self.species,
                target=self.target,
                norefdb=self.norefdb,
                MarkerDB=None,
                db=self.model_path,
                noprint=self.noprint,
                input=marker_csv,
                output=self.output,
                source="scanpy",
                cluster=cluster,
                fc=self.foldchange,
                outfmt=self.outfmt,
                celltype=self.celltype,
                Gensymbol=self.Gensymbol,
                list_tissue=self.list_tissue,
                cellrange=self.cellrange)

    # Read the file the annotation was actually written to. A hard-coded
    # 'temp/rna_anno.txt' here would ignore a user-supplied `output`.
    # NOTE(review): the table is parsed as tab-separated text, which
    # presumably only matches outfmt='txt' - confirm for other formats.
    result=pd.read_csv(self.output,sep='\t')
    self.result=result
    return result

cell_anno_print()

print the annotation result

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def cell_anno_print(self)->None:
    r"""Print the top annotation candidates of every cluster.

    Requires `cell_anno` to have been run, because it reads `self.result`
    (columns 'Cluster', 'Cell Type' and 'Z-score'). A line is prefixed with
    ``Nice:`` when the first candidate's Z-score is more than twice that of
    the second one; otherwise the first two candidates are printed joined
    by ``|``.
    """
    for i in set(self.result['Cluster']):
        # Only the first two candidate rows of the cluster are inspected.
        test=self.result.loc[self.result['Cluster']==i].iloc[:2]
        # The comparison needs a runner-up. A cluster with a single candidate
        # row falls through to the plain format instead of raising
        # IndexError on test.iloc[1].
        if len(test)>1 and test.iloc[0]['Z-score']>test.iloc[1]['Z-score']*2:
            print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,test.iloc[0]['Cell Type'],
                                                        np.around(test.iloc[0]['Z-score'],3)))
        else:
            print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,('|').join(test['Cell Type'].values.tolist()),
                                                        ('|').join(np.around(test['Z-score'].values,3).astype(str).tolist())))

cell_auto_anno(adata, clustertype='leiden', key='scsa_celltype')

Add cell type annotation to adata.obs[key] (the default key is 'scsa_celltype')

Parameters:

Name Type Description Default
adata anndata.AnnData

anndata object

required
clustertype str

Clustering name used in scanpy. (leiden)

'leiden'
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def cell_auto_anno(self,adata:anndata.AnnData,
                   clustertype:str='leiden',key='scsa_celltype')->None:
    r"""Add cell type annotation to anndata.obs[key]

    Every cluster receives the cell type of its first row in `self.result`
    (so `cell_anno` must have been run); clusters without any row are
    labelled 'Unknown'.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        key: Name of the `adata.obs` column that receives the annotation. (scsa_celltype)
    """
    def _norm(label):
        # Integer-like labels ('0', 0) are unified so they match whatever
        # dtype the result table was parsed with; any other label is compared
        # as text instead of crashing in int().
        try:
            return str(int(label))
        except (TypeError, ValueError):
            return str(label)

    # First row per cluster is the candidate used as annotation.
    best=self.result.drop_duplicates(subset='Cluster',keep='first')
    top_celltype=dict(zip([_norm(c) for c in best['Cluster']],best['Cell Type']))

    # The cluster labels are computed once and reused for the mapping.
    labels=adata.obs[clustertype].value_counts().index
    scsa_anno={str(i):top_celltype.get(_norm(i),'Unknown') for i in labels}
    adata.obs[key] = adata.obs[clustertype].map(scsa_anno).astype('category')
    print('...cell type added to {} on obs of anndata'.format(key))

get_model_tissue(species='Human')

List all available tissues in the database.

Parameters:

Name Type Description Default
species str

Species for annotation. Only used for cellmarker database. ('Human',['Mouse'])

'Human'
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def get_model_tissue(self,species:str="Human")->None:
    r"""Show the tissues the annotation database provides for a species.

    An `Annotator` is built from the settings stored on the instance, the
    database at `self.model_path` is loaded into it, and its tissue listing
    is requested.

    Arguments:
        species: Species handed to `Annotator.get_list_tissue`. ('Human',['Mouse'])
            The Annotator itself is still constructed with `self.species`.

    """

    # Same keyword set as used for the annotation run, with cluster fixed to 'all'.
    annotator_settings = dict(
        foldchange=self.foldchange,
        weight=self.weight,
        pvalue=self.pvalue,
        tissue=self.tissue,
        species=self.species,
        target=self.target,
        norefdb=self.norefdb,
        MarkerDB=None,
        db=self.model_path,
        noprint=self.noprint,
        input="temp/rna.csv",
        output=self.output,
        source="scanpy",
        cluster='all',
        fc=self.foldchange,
        outfmt=self.outfmt,
        celltype=self.celltype,
        Gensymbol=self.Gensymbol,
        list_tissue=self.list_tissue,
        cellrange=self.cellrange,
    )
    tissue_lister = Annotator(**annotator_settings)
    tissue_lister.load_pickle_module(self.model_path)
    tissue_lister.get_list_tissue(species)

get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=True)

Get marker genes for each cluster.

Parameters:

Name Type Description Default
adata anndata.AnnData

anndata object

required
clustertype str

Clustering name used in scanpy. (leiden)

'leiden'
log2fc_min int

Minimum log2 fold change of marker genes. (2)

2
pval_cutoff float

Maximum p value of marker genes. (0.05)

0.05
rank bool

Whether the genes have already been ranked; when False, `sc.tl.rank_genes_groups` is run first. (True)

True
scores_type

The type of scores. can be selected from scores and logfoldchanges

'scores'

Returns:

Name Type Description
cellmarker dict

A dictionary of marker genes for each clusters.

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def get_celltype_marker(self,adata:anndata.AnnData,
                        clustertype:str='leiden',
                        log2fc_min:int=2,scores_type='scores',
                        pval_cutoff:float=0.05,rank:bool=True)->dict:
    r"""Collect marker genes per cluster.

    Prints a progress line and forwards all arguments, by keyword, to
    `get_celltype_marker`.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Forwarded unchanged as `rank`. (True)

    Returns:
        cellmarker: A dictionary of marker genes for each cluster.
    """
    print('...get cell type marker')
    forwarded = {
        'clustertype': clustertype,
        'log2fc_min': log2fc_min,
        'scores_type': scores_type,
        'pval_cutoff': pval_cutoff,
        'rank': rank,
    }
    return get_celltype_marker(adata=adata, **forwarded)

omicverse.single.scanpy_lazy(adata, min_genes=200, min_cells=3, drop_doublet=True, n_genes_by_counts=4300, pct_counts_mt=25, target_sum=10000.0, min_mean=0.0125, max_mean=3, min_disp=0.5, max_value=10, n_comps=100, svd_solver='auto', n_neighbors=15, random_state=112, n_pcs=50)

scanpy lazy analysis

Parameters:

Name Type Description Default
adata anndata.AnnData

AnnData object

required
min_genes int

the min number of genes

200
min_cells int

the min number of cells

3
drop_doublet bool

whether to drop doublet

True
n_genes_by_counts int

the max number of genes

4300
pct_counts_mt int

the max proportion of mito-genes

25
target_sum float

the total counts each cell is normalized to (passed to `sc.pp.normalize_total` as `target_sum`)

10000.0
min_mean float

the min mean of genes

0.0125
max_mean int

the max mean of genes

3
min_disp float

the min dispersion of genes

0.5
max_value int

the max value of genes

10
n_comps int

the number of components

100
svd_solver str

the solver of svd

'auto'
n_neighbors int

the number of neighbors

15
random_state int

the random state

112
n_pcs int

the number of pcs

50

Returns:

Name Type Description
adata anndata.AnnData

AnnData object

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def scanpy_lazy(adata:anndata.AnnData,min_genes:int=200,min_cells:int=3,drop_doublet:bool=True,
                n_genes_by_counts:int=4300,pct_counts_mt:int=25,
                target_sum:float=1e4,min_mean:float=0.0125, max_mean:int=3, min_disp:float=0.5,max_value:int=10,
                n_comps:int=100, svd_solver:str="auto",
                n_neighbors:int=15, random_state:int = 112, n_pcs:int=50,
                )->anndata.AnnData:
    r"""scanpy lazy analysis

    Runs, in order: cell/gene filtering, optional doublet removal, QC
    filtering, normalization, log-transform, highly-variable gene selection,
    scaling, PCA, neighbor graph, leiden clustering, PAGA and UMAP.

    The filtered counts are kept in `layers['counts']` and the log-normalized
    object (all genes) is kept in `.raw`; the returned object contains only
    the highly variable genes. The first filtering steps are called directly
    on the object passed in, so the caller's `adata` is modified as well.

    Arguments:
        adata: AnnData object
        min_genes: the min number of genes
        min_cells: the min number of cells
        drop_doublet: whether to drop doublet
        n_genes_by_counts: cells must have fewer detected genes than this value
        pct_counts_mt: cells must have a lower percentage of mito-gene counts than this value
        target_sum: the total counts each cell is normalized to
        min_mean: the min mean of genes
        max_mean: the max mean of genes
        min_disp: the min dispersion of genes
        max_value: the max value of genes
        n_comps: the number of components
        svd_solver: the solver of svd
        n_neighbors: the number of neighbors
        random_state: the random state
        n_pcs: the number of pcs

    Returns:
        adata: AnnData object
    """
    #filter cells and genes
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    #filter the doublets cells
    if drop_doublet:
        sc.external.pp.scrublet(adata) #estimates doublets
        singlet = ~adata.obs['predicted_doublet'].astype(bool)
        # .copy() turns the subset into a real object; the steps below write
        # to .var/.obs/.layers, which must not be done on a view.
        adata = adata[singlet].copy()
    #calculate the proportion of mito-genes
    # NOTE(review): only the upper-case 'MT-' prefix is matched.
    adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    passes_qc = ((adata.obs.n_genes_by_counts < n_genes_by_counts)
                 & (adata.obs.pct_counts_mt < pct_counts_mt))
    adata = adata[passes_qc, :].copy()
    #save the raw counts
    adata.layers["counts"] = adata.X.copy()
    #normalize every cell to `target_sum` total counts
    sc.pp.normalize_total(adata, target_sum=target_sum)
    #log
    sc.pp.log1p(adata)
    #select high-variable genes
    sc.pp.highly_variable_genes(adata, min_mean=min_mean, max_mean=max_mean, min_disp=min_disp)
    #keep the log-normalized data of all genes, then restrict to the variable genes
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable].copy()
    #scale
    sc.pp.scale(adata, max_value=max_value)
    #pca analysis
    sc.tl.pca(adata, n_comps=n_comps, svd_solver=svd_solver)
    #cell neighbors graph construct
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, random_state = random_state, n_pcs=n_pcs)
    #clustering
    sc.tl.leiden(adata)
    #umap initialised from the PAGA layout
    sc.tl.paga(adata)
    sc.pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph
    sc.tl.umap(adata, init_pos='paga')
    return adata

omicverse.single.scanpy_cellanno_from_dict(adata, anno_dict, anno_name='major', clustertype='leiden')

add cell type annotation from dict to anndata object

Parameters:

Name Type Description Default
adata anndata.AnnData

AnnData object of scRNA-seq after preprocessing

required
anno_dict dict

dict of cell type annotation. The key is the cluster name and the value is the cell type name, like {'0':'B cell','1':'T cell'}

required
anno_name str

the name of annotation

'major'
clustertype str

Clustering name used in scanpy. (leiden)

'leiden'
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def scanpy_cellanno_from_dict(adata:anndata.AnnData,
                               anno_dict:dict,
                               anno_name:str='major',
                               clustertype:str='leiden',
                               )->None:
    r"""Write a user-supplied cluster-to-cell-type mapping into `adata.obs`.

    The labels in `adata.obs[clustertype]` are mapped through `anno_dict` and
    stored as a categorical column named `<anno_name>_celltype`.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        anno_dict: dict of cell type annotation. The key is the cluster name and the value is the cell type name, like `{'0':'B cell','1':'T cell'}`
        anno_name: the name of annotation; the obs column is `anno_name+'_celltype'`
        clustertype: Clustering name used in scanpy. (leiden)

    """

    target_column = anno_name+'_celltype'
    mapped_labels = adata.obs[clustertype].map(anno_dict)
    adata.obs[target_column] = mapped_labels.astype('category')
    print('...cell type added to {}_celltype on obs of anndata'.format(anno_name))

omicverse.single.get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=False, key='rank_genes_groups', method='wilcoxon', foldchange=None, topgenenumber=10)

Get marker genes for each cluster.

Parameters:

Name Type Description Default
adata anndata.AnnData

anndata object

required
clustertype str

Clustering name used in scanpy. (leiden)

'leiden'
log2fc_min int

Minimum log2 fold change of marker genes. (2)

2
pval_cutoff float

Maximum p value of marker genes. (0.05)

0.05
rank bool

Whether the genes have already been ranked; when False, `sc.tl.rank_genes_groups` is run first. (False)

False
scores_type

The type of scores. can be selected from scores and logfoldchanges

'scores'

Returns:

Name Type Description
cellmarker dict

A dictionary of marker genes for each clusters.

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_anno.py
def get_celltype_marker(adata:anndata.AnnData,
                            clustertype:str='leiden',
                            log2fc_min:int=2,scores_type='scores',
                            pval_cutoff:float=0.05,rank:bool=False,
                            key='rank_genes_groups',method='wilcoxon',
                            foldchange=None,topgenenumber=10)->dict:
    r"""Get marker genes for each cluster.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Whether the genes have already been ranked. When False, `sc.tl.rank_genes_groups` is run first. (False)
        key: Key under which the ranking is stored and read. (rank_genes_groups)
        method: Test method passed to `sc.tl.rank_genes_groups`. (wilcoxon)
        foldchange: Genes must have a `scores_type` value above this threshold. When None, a threshold is derived separately for every cluster from the histogram of its values. (None)
        topgenenumber: Maximum number of marker genes kept per cluster. (10)

    Returns:
        cellmarker: A dictionary of marker genes for each cluster.
    """
    print('...get cell type marker')
    celltypes = sorted(adata.obs[clustertype].unique())
    if not rank:
        # Store the ranking under the same key it is read from below.
        sc.tl.rank_genes_groups(adata, clustertype, method=method, key_added=key)

    cell_marker_dict = {}
    for celltype in celltypes:
        degs = sc.get.rank_genes_groups_df(adata, group=celltype, key=key, log2fc_min=log2fc_min,
                                        pval_cutoff=pval_cutoff)
        if foldchange is None:
            # Automatic threshold: midpoint of the 5th- and 6th-largest
            # positive histogram bin edges. It is kept in a local variable so
            # one cluster's threshold is never reused for the next cluster.
            # NOTE(review): raises IndexError when fewer than six bin edges
            # are positive.
            bin_edges = np.histogram(degs[scores_type])[1]
            positive_edges = bin_edges[bin_edges > 0]
            threshold = (positive_edges[-5] + positive_edges[-6]) / 2
        else:
            threshold = foldchange
        passing = degs.loc[degs[scores_type] > threshold, 'names']
        cell_marker_dict[celltype] = list(passing.values[:topgenenumber])

    return cell_marker_dict