Skip to content

Api scsa

omicverse.single.pySCSA

Bases: object

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
class pySCSA(object):
    r"""Cell-type annotation of clustered scRNA-seq data with SCSA.

    The instance stores the annotation settings. `cell_anno` runs the
    annotation and keeps the result table in `self.result`, which
    `cell_anno_print` and `cell_auto_anno` consume afterwards.
    """

    def __init__(self,adata:anndata.AnnData,
                foldchange:float=1.5,pvalue:float=0.05,
                output:str='temp/rna_anno.txt',
                model_path:str='',
                outfmt:str='txt',Gensymbol:bool=True,
                species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
                celltype:str='normal',norefdb:bool=False,cellrange:str=None,
                noprint:bool=True,list_tissue:bool=False) -> None:

        r"""Initialize the pySCSA class.

        Arguments:
            adata: AnnData object of scRNA-seq after preprocessing
            foldchange: Fold change threshold for marker filtering. (1.5)
            pvalue: P-value threshold for marker filtering. (0.05)
            output: Output file for marker annotation. ('temp/rna_anno.txt')
            model_path: Path to the Database for annotation. If empty, the database is fetched with `data_downloader` into 'temp/pySCSA_2023_v2_plus.db'. ('')
            outfmt: Output format for marker annotation. ('txt')
            Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation. (True)
            species: Species for annotation. Only used for cellmarker database. ('Human')
            weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
            tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
            target: Target to annotation class in Database. ('cellmarker')
            celltype: Cell type for annotation. ('normal')
            norefdb: Only using user-defined marker database for annotation. (False)
            cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) (None)
            noprint: Do not print any detail results. (True)
            list_tissue: List all available tissues in the database. (False)

        Returns:
            None
        """

        # The working directory 'temp' holds the marker table and, by default,
        # the downloaded database and the annotation output. A failure to
        # create it is reported but deliberately not raised.
        try:
            if not os.path.isdir('temp'):
                print("...Creating directory {}".format('temp'))
                os.makedirs('temp', exist_ok=True)
        except OSError as e:
            print("...Unable to create directory {}. Reason {}".format('temp',e))

        self.adata=adata
        self.foldchange=foldchange
        self.pvalue=pvalue
        self.output=output
        self.outfmt=outfmt
        self.Gensymbol=Gensymbol
        self.species=species
        self.weight=weight
        self.tissue=tissue
        self.celltype=celltype
        self.norefdb=norefdb
        self.noprint=noprint
        self.list_tissue=list_tissue
        self.target=target
        self.cellrange=cellrange
        if model_path =='':
            self.model_path=data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                            path='temp/pySCSA_2023_v2_plus.db',title='whole')
        else:
            self.model_path=model_path

    def get_model_tissue(self,species:str="Human")->None:
        r"""List all available tissues in the database.

        Arguments:
            species: Species forwarded to `Annotator.get_list_tissue`. ('Human')

        Returns:
            None
        """

        anno = Annotator(foldchange=self.foldchange,
                    weight=self.weight,
                    pvalue=self.pvalue,
                    tissue=self.tissue,
                    species=self.species,
                    target=self.target,
                    norefdb=self.norefdb,
                    MarkerDB=None,
                    db=self.model_path,
                    noprint=self.noprint,
                    input="temp/rna.csv",
                    output=self.output,
                    source="scanpy",
                    cluster='all',
                    fc=self.foldchange,
                    outfmt=self.outfmt,
                    celltype=self.celltype,
                    Gensymbol=self.Gensymbol,
                    list_tissue=self.list_tissue,
                    cellrange=self.cellrange)
        anno.load_pickle_module(self.model_path)
        anno.get_list_tissue(species)


    def cell_anno(self,clustertype:str='leiden',
                  cluster:str='all',rank_rep=False)->pd.DataFrame:
        r"""Annotate cell type for each cluster.

        Arguments:
            clustertype: Clustering name used in scanpy. ('leiden')
            cluster: Only deal with one cluster of marker genes. ('all')
            rank_rep: Whether to repeat ranking. (False)

        Returns:
            result: Annotation result as DataFrame (also stored in `self.result`)
        """

        dat=data_preprocess(self.adata,clustertype=clustertype,path='temp/rna.csv',rank_rep=rank_rep)
        dat.to_csv('temp/rna.csv')

        print('...Auto annotate cell')

        p = Process()
        p.run_cmd_p(foldchange=self.foldchange,
                    weight=self.weight,
                    pvalue=self.pvalue,
                    tissue=self.tissue,
                    species=self.species,
                    target=self.target,
                    norefdb=self.norefdb,
                    MarkerDB=None,
                    db=self.model_path,
                    noprint=self.noprint,
                    input="temp/rna.csv",
                    output=self.output,
                    source="scanpy",
                    cluster=cluster,
                    fc=self.foldchange,
                    outfmt=self.outfmt,
                    celltype=self.celltype,
                    Gensymbol=self.Gensymbol,
                    list_tissue=self.list_tissue,
                    cellrange=self.cellrange)

        # The annotator was given output=self.output, so read the result from
        # that same path rather than from the hard-coded default location.
        # NOTE(review): a tab-separated file is assumed, i.e. outfmt='txt'.
        result=pd.read_csv(self.output,sep='\t')
        self.result=result
        add_reference(self.adata,'pySCSA','cell annotation with SCSA')
        return result

    def cell_anno_print(self)->None:
        r"""Print the annotation result.

        For every cluster the two first rows of `self.result` are inspected.
        When the first Z-score is more than twice the second one, a single
        'Nice:' line is printed; otherwise the available candidates are
        printed joined by '|'. A cluster with only one candidate row is
        printed in the plain (non-'Nice') form.

        Returns:
            None
        """
        # drop_duplicates keeps the order in which clusters appear in the table
        for cluster_id in self.result['Cluster'].drop_duplicates():
            candidates=self.result.loc[self.result['Cluster']==cluster_id].iloc[:2]
            if candidates.empty:
                # e.g. a NaN cluster label never compares equal to itself
                continue
            best=candidates.iloc[0]
            # Only compare against a runner-up when one exists.
            if len(candidates)>1 and best['Z-score']>candidates.iloc[1]['Z-score']*2:
                print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(cluster_id,best['Cell Type'],
                                                            np.around(best['Z-score'],3)))
            else:
                print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(cluster_id,('|').join(candidates['Cell Type'].values.tolist()),
                                                            ('|').join(np.around(candidates['Z-score'].values,3).astype(str).tolist())))

    def cell_auto_anno(self,adata:anndata.AnnData,
                       clustertype:str='leiden',key='scsa_celltype')->None:
        r"""Add cell type annotation to anndata.obs[key].

        Each cluster label found in `adata.obs[clustertype]` is mapped to the
        'Cell Type' of the first matching row of `self.result`; labels without
        a matching row become 'Unknown'.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. ('leiden')
            key: Key to store cell type annotation. ('scsa_celltype')

        Returns:
            None
        """
        result_clusters=self.result['Cluster']
        result_clusters_str=result_clusters.astype(str)
        scsa_anno={}
        for label in adata.obs[clustertype].value_counts().index:
            # Match on the textual label so non-numeric cluster names work ...
            hit=result_clusters_str==str(label)
            try:
                # ... and keep the numeric comparison for integer-like labels.
                hit=hit|(result_clusters==int(label))
            except (TypeError,ValueError):
                pass
            matched=self.result.loc[hit]
            scsa_anno[str(label)]=matched.iloc[0]['Cell Type'] if len(matched)>0 else 'Unknown'
        adata.obs[key] = adata.obs[clustertype].map(scsa_anno).astype('category')
        print('...cell type added to {} on obs of anndata'.format(key))

    def get_celltype_marker(self,adata:anndata.AnnData,
                            clustertype:str='leiden',
                            log2fc_min:int=2,scores_type='scores',
                            pval_cutoff:float=0.05,rank:bool=True,
                            unique:bool=True,global_unique:bool=False)->dict:
        r"""Get marker genes for each cluster.

        Thin wrapper that forwards every argument to the module-level
        `get_celltype_marker` function.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. (leiden)
            log2fc_min: Minimum log2 fold change of marker genes. (2)
            pval_cutoff: Maximum p value of marker genes. (0.05)
            rank: Whether to rank genes by wilcoxon test. (True)
            scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
            unique: Whether to remove duplicates within each cell type. (True)
            global_unique: Whether to remove duplicates across all cell types. (False)

        Returns:
            cellmarker: A dictionary of marker genes for each cluster.
        """
        print('...get cell type marker')
        cell_marker_dict=get_celltype_marker(adata=adata,
                            clustertype=clustertype,
                            log2fc_min=log2fc_min,scores_type=scores_type,
                            pval_cutoff=pval_cutoff,rank=rank,
                            unique=unique,global_unique=global_unique)

        return cell_marker_dict

__init__(adata, foldchange=1.5, pvalue=0.05, output='temp/rna_anno.txt', model_path='', outfmt='txt', Gensymbol=True, species='Human', weight=100, tissue='All', target='cellmarker', celltype='normal', norefdb=False, cellrange=None, noprint=True, list_tissue=False)

Initialize the pySCSA class.

Parameters:

Name Type Description Default
adata anndata.AnnData

AnnData object of scRNA-seq after preprocessing

required
foldchange float

Fold change threshold for marker filtering. (1.5)

1.5
pvalue float

P-value threshold for marker filtering. (0.05)

0.05
output str

Output file for marker annotation. ('temp/rna_anno.txt')

'temp/rna_anno.txt'
model_path str

Path to the Database for annotation. If not provided, the model will be downloaded from the internet. ('')

''
outfmt str

Output format for marker annotation. ('txt')

'txt'
Gensymbol bool

Using gene symbol ID instead of ensembl ID in input file for calculation. (True)

True
species str

Species for annotation. Only used for cellmarker database. ('Human')

'Human'
weight int

Weight threshold for marker filtering from cellranger v1.0 results. (100)

100
tissue str

Tissue for annotation. you can use get_model_tissue to see the available tissues. ('All')

'All'
target str

Target to annotation class in Database. ('cellmarker')

'cellmarker'
celltype str

Cell type for annotation. ('normal')

'normal'
norefdb bool

Only using user-defined marker database for annotation. (False)

False
cellrange str

Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) (None)

None
noprint bool

Do not print any detail results. (True)

True
list_tissue bool

List all available tissues in the database. (False)

False

Returns:

Type Description
None

None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def __init__(self,adata:anndata.AnnData,
            foldchange:float=1.5,pvalue:float=0.05,
            output:str='temp/rna_anno.txt',
            model_path:str='',
            outfmt:str='txt',Gensymbol:bool=True,
            species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
            celltype:str='normal',norefdb:bool=False,cellrange:str=None,
            noprint:bool=True,list_tissue:bool=False) -> None:

    r"""Store the annotation settings and locate the SCSA database.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        foldchange: Fold change threshold for marker filtering. (1.5)
        pvalue: P-value threshold for marker filtering. (0.05)
        output: Output file for marker annotation. ('temp/rna_anno.txt')
        model_path: Path to the Database for annotation. If empty, the database is fetched with `data_downloader` into 'temp/pySCSA_2023_v2_plus.db'. ('')
        outfmt: Output format for marker annotation. ('txt')
        Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation. (True)
        species: Species for annotation. Only used for cellmarker database. ('Human')
        weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
        tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
        target: Target to annotation class in Database. ('cellmarker')
        celltype: Cell type for annotation. ('normal')
        norefdb: Only using user-defined marker database for annotation. (False)
        cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) (None)
        noprint: Do not print any detail results. (True)
        list_tissue: List all available tissues in the database. (False)

    Returns:
        None
    """

    # Make sure the working directory exists; a failure is only reported.
    workdir_missing = not os.path.isdir('temp')
    try:
        if workdir_missing:
            print("...Creating directory {}".format('temp'))
            os.makedirs('temp', exist_ok=True)
    except OSError as err:
        print("...Unable to create directory {}. Reason {}".format('temp',err))

    # Keep every setting on the instance under its parameter name.
    settings = {
        'adata': adata,
        'foldchange': foldchange,
        'pvalue': pvalue,
        'output': output,
        'outfmt': outfmt,
        'Gensymbol': Gensymbol,
        'species': species,
        'weight': weight,
        'tissue': tissue,
        'celltype': celltype,
        'norefdb': norefdb,
        'noprint': noprint,
        'list_tissue': list_tissue,
        'target': target,
        'cellrange': cellrange,
    }
    for attr_name, attr_value in settings.items():
        setattr(self, attr_name, attr_value)

    # A user-supplied database path wins; otherwise download the default one.
    if model_path != '':
        self.model_path = model_path
    else:
        self.model_path = data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                          path='temp/pySCSA_2023_v2_plus.db',title='whole')

cell_anno(clustertype='leiden', cluster='all', rank_rep=False)

Annotate cell type for each cluster.

Parameters:

Name Type Description Default
clustertype str

Clustering name used in scanpy. ('leiden')

'leiden'
cluster str

Only deal with one cluster of marker genes. ('all')

'all'
rank_rep

Whether to repeat ranking. (False)

False

Returns:

Name Type Description
result pd.DataFrame

Annotation result as DataFrame

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def cell_anno(self,clustertype:str='leiden',
              cluster:str='all',rank_rep=False)->pd.DataFrame:
    r"""Annotate cell type for each cluster.

    The marker table produced by `data_preprocess` is written to
    'temp/rna.csv', the annotator is run on it, and the result file is
    loaded into a DataFrame.

    Arguments:
        clustertype: Clustering name used in scanpy. ('leiden')
        cluster: Only deal with one cluster of marker genes. ('all')
        rank_rep: Whether to repeat ranking. (False)

    Returns:
        result: Annotation result as DataFrame (also stored in `self.result`)
    """

    dat=data_preprocess(self.adata,clustertype=clustertype,path='temp/rna.csv',rank_rep=rank_rep)
    dat.to_csv('temp/rna.csv')

    print('...Auto annotate cell')

    p = Process()
    p.run_cmd_p(foldchange=self.foldchange,
                weight=self.weight,
                pvalue=self.pvalue,
                tissue=self.tissue,
                species=self.species,
                target=self.target,
                norefdb=self.norefdb,
                MarkerDB=None,
                db=self.model_path,
                noprint=self.noprint,
                input="temp/rna.csv",
                output=self.output,
                source="scanpy",
                cluster=cluster,
                fc=self.foldchange,
                outfmt=self.outfmt,
                celltype=self.celltype,
                Gensymbol=self.Gensymbol,
                list_tissue=self.list_tissue,
                cellrange=self.cellrange)

    # The annotator was given output=self.output, so read the result from
    # that same path rather than from the hard-coded default location.
    # NOTE(review): a tab-separated file is assumed, i.e. outfmt='txt'.
    result=pd.read_csv(self.output,sep='\t')
    self.result=result
    add_reference(self.adata,'pySCSA','cell annotation with SCSA')
    return result

cell_anno_print()

Print the annotation result.

Returns:

Type Description
None

None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def cell_anno_print(self)->None:
    r"""Print the annotation result.

    For every cluster the two first rows of `self.result` are inspected.
    When the first Z-score is more than twice the second one, a single
    'Nice:' line is printed; otherwise the available candidates are printed
    joined by '|'. A cluster with only one candidate row is printed in the
    plain (non-'Nice') form instead of raising IndexError.

    Returns:
        None
    """
    # drop_duplicates keeps the order in which clusters appear in the table
    for cluster_id in self.result['Cluster'].drop_duplicates():
        candidates=self.result.loc[self.result['Cluster']==cluster_id].iloc[:2]
        if candidates.empty:
            # e.g. a NaN cluster label never compares equal to itself
            continue
        best=candidates.iloc[0]
        # Only compare against a runner-up when one exists.
        if len(candidates)>1 and best['Z-score']>candidates.iloc[1]['Z-score']*2:
            print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(cluster_id,best['Cell Type'],
                                                        np.around(best['Z-score'],3)))
        else:
            print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(cluster_id,('|').join(candidates['Cell Type'].values.tolist()),
                                                        ('|').join(np.around(candidates['Z-score'].values,3).astype(str).tolist())))

cell_auto_anno(adata, clustertype='leiden', key='scsa_celltype')

Add cell type annotation to anndata.obs['scsa_celltype'].

Parameters:

Name Type Description Default
adata anndata.AnnData

anndata object

required
clustertype str

Clustering name used in scanpy. ('leiden')

'leiden'
key

Key to store cell type annotation. ('scsa_celltype')

'scsa_celltype'

Returns:

Type Description
None

None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def cell_auto_anno(self,adata:anndata.AnnData,
                   clustertype:str='leiden',key='scsa_celltype')->None:
    r"""Add cell type annotation to anndata.obs[key].

    Each cluster label found in `adata.obs[clustertype]` is mapped to the
    'Cell Type' of the first matching row of `self.result`; labels without
    a matching row become 'Unknown'. Labels are matched textually, so
    non-numeric cluster names no longer raise ValueError; integer-like
    labels are additionally matched numerically as before.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. ('leiden')
        key: Key to store cell type annotation. ('scsa_celltype')

    Returns:
        None
    """
    result_clusters=self.result['Cluster']
    result_clusters_str=result_clusters.astype(str)
    scsa_anno={}
    for label in adata.obs[clustertype].value_counts().index:
        hit=result_clusters_str==str(label)
        try:
            # Keep the original numeric comparison for integer-like labels.
            hit=hit|(result_clusters==int(label))
        except (TypeError,ValueError):
            pass
        matched=self.result.loc[hit]
        scsa_anno[str(label)]=matched.iloc[0]['Cell Type'] if len(matched)>0 else 'Unknown'
    adata.obs[key] = adata.obs[clustertype].map(scsa_anno).astype('category')
    print('...cell type added to {} on obs of anndata'.format(key))

get_model_tissue(species='Human')

List all available tissues in the database.

Parameters:

Name Type Description Default
species str

Species for annotation. Only used for cellmarker database. ('Human')

'Human'

Returns:

Type Description
None

None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def get_model_tissue(self,species:str="Human")->None:
    r"""Ask the annotator for the tissues available in the database.

    An `Annotator` is built from the settings stored on the instance, the
    database at `self.model_path` is loaded, and `get_list_tissue` is
    called for the requested species.

    Arguments:
        species: Species forwarded to `Annotator.get_list_tissue`. ('Human')

    Returns:
        None
    """

    # Same keyword set the annotation run uses, with cluster fixed to 'all'.
    annotator_options = dict(
        foldchange=self.foldchange,
        weight=self.weight,
        pvalue=self.pvalue,
        tissue=self.tissue,
        species=self.species,
        target=self.target,
        norefdb=self.norefdb,
        MarkerDB=None,
        db=self.model_path,
        noprint=self.noprint,
        input="temp/rna.csv",
        output=self.output,
        source="scanpy",
        cluster='all',
        fc=self.foldchange,
        outfmt=self.outfmt,
        celltype=self.celltype,
        Gensymbol=self.Gensymbol,
        list_tissue=self.list_tissue,
        cellrange=self.cellrange,
    )
    annotator = Annotator(**annotator_options)
    annotator.load_pickle_module(self.model_path)
    annotator.get_list_tissue(species)

get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=True, unique=True, global_unique=False)

Get marker genes for each cluster.

Parameters:

Name Type Description Default
adata anndata.AnnData

anndata object

required
clustertype str

Clustering name used in scanpy. (leiden)

'leiden'
log2fc_min int

Minimum log2 fold change of marker genes. (2)

2
pval_cutoff float

Maximum p value of marker genes. (0.05)

0.05
rank bool

Whether to rank genes by wilcoxon test. (True)

True
scores_type

The type of scores. can be selected from scores and logfoldchanges

'scores'
unique bool

Whether to remove duplicates within each cell type. (True)

True
global_unique bool

Whether to remove duplicates across all cell types. (False)

False

Returns:

Name Type Description
cellmarker dict

A dictionary of marker genes for each cluster.

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def get_celltype_marker(self,adata:anndata.AnnData,
                        clustertype:str='leiden',
                        log2fc_min:int=2,scores_type='scores',
                        pval_cutoff:float=0.05,rank:bool=True,
                        unique:bool=True,global_unique:bool=False)->dict:
    r"""Collect marker genes per cluster via the module-level helper.

    Every argument is forwarded unchanged to the module-level
    `get_celltype_marker` function.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Whether to rank genes by wilcoxon test. (True)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        unique: Whether to remove duplicates within each cell type. (True)
        global_unique: Whether to remove duplicates across all cell types. (False)

    Returns:
        cellmarker: A dictionary of marker genes for each cluster.
    """
    print('...get cell type marker')
    forwarded = dict(
        adata=adata,
        clustertype=clustertype,
        log2fc_min=log2fc_min,
        scores_type=scores_type,
        pval_cutoff=pval_cutoff,
        rank=rank,
        unique=unique,
        global_unique=global_unique,
    )
    return get_celltype_marker(**forwarded)

omicverse.single.scanpy_lazy(adata, min_genes=200, min_cells=3, drop_doublet=True, n_genes_by_counts=4300, pct_counts_mt=25, target_sum=10000.0, min_mean=0.0125, max_mean=3, min_disp=0.5, max_value=10, n_comps=100, svd_solver='auto', n_neighbors=15, random_state=112, n_pcs=50)

Scanpy lazy analysis pipeline.

Parameters:

Name Type Description Default
adata anndata.AnnData

AnnData object

required
min_genes int

The min number of genes. (200)

200
min_cells int

The min number of cells. (3)

3
drop_doublet bool

Whether to drop doublet. (True)

True
n_genes_by_counts int

The max number of genes. (4300)

4300
pct_counts_mt int

The max proportion of mito-genes. (25)

25
target_sum float

Target total counts per cell after normalization; passed to sc.pp.normalize_total as target_sum. (1e4)

10000.0
min_mean float

The min mean of genes. (0.0125)

0.0125
max_mean int

The max mean of genes. (3)

3
min_disp float

The min dispersion of genes. (0.5)

0.5
max_value int

The max value of genes. (10)

10
n_comps int

The number of components. (100)

100
svd_solver str

The solver of svd. ('auto')

'auto'
n_neighbors int

The number of neighbors. (15)

15
random_state int

The random state. (112)

112
n_pcs int

The number of pcs. (50)

50

Returns:

Name Type Description
adata anndata.AnnData

AnnData object

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def scanpy_lazy(adata:anndata.AnnData,min_genes:int=200,min_cells:int=3,drop_doublet:bool=True,
                n_genes_by_counts:int=4300,pct_counts_mt:int=25,
                target_sum:float=1e4,min_mean:float=0.0125, max_mean:int=3, min_disp:float=0.5,max_value:int=10,
                n_comps:int=100, svd_solver:str="auto",
                n_neighbors:int=15, random_state:int = 112, n_pcs:int=50,
                )->anndata.AnnData:
    r"""Run a fixed scanpy pipeline from QC filtering to a UMAP embedding.

    Steps, in order: cell/gene filtering, optional doublet removal,
    mitochondrial QC filtering, normalization, log1p, highly-variable gene
    selection, scaling, PCA, neighbors, leiden, PAGA and UMAP.

    The local name `adata` is rebound to subsets several times, so callers
    must use the returned object rather than the one they passed in.

    Arguments:
        adata: AnnData object
        min_genes: Passed to `sc.pp.filter_cells` as `min_genes`. (200)
        min_cells: Passed to `sc.pp.filter_genes` as `min_cells`. (3)
        drop_doublet: Whether to run scrublet and drop predicted doublets. (True)
        n_genes_by_counts: Cells are kept only if `obs.n_genes_by_counts` is below this value. (4300)
        pct_counts_mt: Cells are kept only if `obs.pct_counts_mt` is below this value. (25)
        target_sum: Passed to `sc.pp.normalize_total` as `target_sum`. (1e4)
        min_mean: Passed to `sc.pp.highly_variable_genes` as `min_mean`. (0.0125)
        max_mean: Passed to `sc.pp.highly_variable_genes` as `max_mean`. (3)
        min_disp: Passed to `sc.pp.highly_variable_genes` as `min_disp`. (0.5)
        max_value: Passed to `sc.pp.scale` as `max_value`. (10)
        n_comps: Passed to `sc.tl.pca` as `n_comps`. (100)
        svd_solver: Passed to `sc.tl.pca` as `svd_solver`. ('auto')
        n_neighbors: Passed to `sc.pp.neighbors` as `n_neighbors`. (15)
        random_state: Passed to `sc.pp.neighbors` as `random_state`. (112)
        n_pcs: Passed to `sc.pp.neighbors` as `n_pcs`. (50)

    Returns:
        adata: AnnData object restricted to the highly variable genes; the
            object as it was before that restriction is stored in `adata.raw`.
    """
    # Filter cells and genes (operates on the object that was passed in).
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    # Optionally estimate doublets and keep only cells not flagged as doublets.
    if drop_doublet:
        sc.external.pp.scrublet(adata) # adds obs['predicted_doublet'], which is read on the next line
        adata = adata[adata.obs['predicted_doublet'] == False] # subset; `adata` now refers to a new object
    # Flag genes whose name starts with 'MT-' as mitochondrial and compute QC metrics.
    adata.var['mt'] = adata.var_names.str.startswith('MT-')  # NOTE(review): the 'MT-' prefix presumably targets human gene symbols - confirm for other species
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    # Both thresholds are strict upper bounds.
    adata = adata[adata.obs.n_genes_by_counts < n_genes_by_counts, :]
    adata = adata[adata.obs.pct_counts_mt < pct_counts_mt, :]
    # Keep a copy of X as it is before normalization (presumably raw counts - depends on the input).
    adata.layers["counts"] = adata.X.copy()
    # Normalize every cell to `target_sum` total counts.
    sc.pp.normalize_total(adata, target_sum=target_sum)
    # Log-transform.
    sc.pp.log1p(adata)
    # Select highly variable genes.
    sc.pp.highly_variable_genes(adata, min_mean=min_mean, max_mean=max_mean, min_disp=min_disp)
    # Store the log-normalized, all-gene state in .raw before restricting to HVGs.
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    # Scale, clipping at `max_value`.
    sc.pp.scale(adata, max_value=max_value)
    # PCA.
    sc.tl.pca(adata, n_comps=n_comps, svd_solver=svd_solver)
    # Neighborhood graph.
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, random_state = random_state, n_pcs=n_pcs)
    # Leiden clustering, then PAGA; sc.pl.paga is called before UMAP because UMAP uses init_pos='paga'.
    sc.tl.leiden(adata)
    sc.tl.paga(adata)
    sc.pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph
    sc.tl.umap(adata, init_pos='paga')
    return adata

omicverse.single.scanpy_cellanno_from_dict(adata, anno_dict, anno_name='major', clustertype='leiden')

Add cell type annotation from dict to anndata object.

Parameters:

Name Type Description Default
adata anndata.AnnData

AnnData object of scRNA-seq after preprocessing

required
anno_dict dict

Dict of cell type annotation. The key is the cluster name and the value is the cell type name, e.g. {'0':'B cell','1':'T cell'}

required
anno_name str

The name of annotation. ('major')

'major'
clustertype str

Clustering name used in scanpy. ('leiden')

'leiden'

Returns:

Type Description
None

None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def scanpy_cellanno_from_dict(adata:anndata.AnnData,
                               anno_dict:dict,
                               anno_name:str='major',
                               clustertype:str='leiden',
                               )->None:
    r"""Label every cell with the cell type assigned to its cluster.

    The cluster labels in `adata.obs[clustertype]` are translated through
    `anno_dict` and stored as a categorical column named
    `<anno_name>_celltype` in `adata.obs`.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        anno_dict: Mapping from cluster name to cell type name, e.g. `{'0':'B cell','1':'T cell'}`
        anno_name: Prefix of the new obs column. ('major')
        clustertype: Clustering name used in scanpy. ('leiden')

    Returns:
        None
    """

    target_column = anno_name+'_celltype'
    cluster_labels = adata.obs[clustertype]
    celltype_labels = cluster_labels.map(anno_dict)
    adata.obs[target_column] = celltype_labels.astype('category')
    print('...cell type added to {}_celltype on obs of anndata'.format(anno_name))

omicverse.single.get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=False, key='rank_genes_groups', method='wilcoxon', foldchange=None, topgenenumber=10, unique=True, global_unique=False)

Get marker genes for each cluster.

Parameters:

Name Type Description Default
adata anndata.AnnData

anndata object

required
clustertype str

Clustering name used in scanpy. (leiden)

'leiden'
log2fc_min int

Minimum log2 fold change of marker genes. (2)

2
pval_cutoff float

Maximum p value of marker genes. (0.05)

0.05
rank bool

Whether to rank genes by wilcoxon test. (False)

False
scores_type

The type of scores. can be selected from scores and logfoldchanges

'scores'
unique

Whether to remove duplicates within each cell type. (True)

True
global_unique

Whether to remove duplicates across all cell types. (False)

False

Returns:

Name Type Description
cellmarker dict

A dictionary of marker genes for each cluster.

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py
def get_celltype_marker(adata:anndata.AnnData,
                            clustertype:str='leiden',
                            log2fc_min:int=2,scores_type='scores',
                            pval_cutoff:float=0.05,rank:bool=False,
                            key='rank_genes_groups',method='wilcoxon',
                            foldchange=None,topgenenumber=10,unique=True,
                            global_unique=False)->dict:
    r"""Get marker genes for each cluster.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Whether to run `sc.tl.rank_genes_groups` when `key` is not yet in `adata.uns`. (False)
        key: Key in `adata.uns` holding the ranking results. ('rank_genes_groups')
        method: Test method passed to `sc.tl.rank_genes_groups`. ('wilcoxon')
        foldchange: Minimum value of `scores_type` for a gene to be kept. If None, a threshold is derived for every cluster from the histogram of its scores. (None)
        topgenenumber: Maximum number of marker genes kept per cluster. (10)
        unique: Whether to remove duplicates within each cell type. (True)
        global_unique: Whether to remove duplicates across all cell types. (False)

    Returns:
        cellmarker: A dictionary of marker genes for each cluster.
    """
    print('...get cell type marker')
    celltypes = sorted(adata.obs[clustertype].unique())

    # Rank only when asked to and when no result is stored under `key` yet;
    # store the new result under that same key so the lookups below find it.
    if rank and key not in adata.uns.keys():
        sc.tl.rank_genes_groups(adata, clustertype, method=method, key_added=key)

    cell_marker_dict={}
    for celltype in celltypes:
        degs = sc.get.rank_genes_groups_df(adata, group=celltype, key=key, log2fc_min=log2fc_min,
                                        pval_cutoff=pval_cutoff)
        # Use a per-cluster threshold: writing the derived value back into
        # `foldchange` would make the first cluster's threshold apply to all
        # following clusters.
        threshold=foldchange
        if threshold is None:
            bin_edges=np.histogram(degs[scores_type])[1]
            positive_edges=np.where(bin_edges>0)[0]
            try:
                # midpoint of the 5th and 6th positive bin edges counted from the top
                threshold=(bin_edges[positive_edges[-5]]+bin_edges[positive_edges[-6]])/2
            except IndexError:
                # fewer than six positive edges: fall back to the mean score
                threshold=degs[scores_type].mean()

        cellmarker=degs.loc[degs[scores_type]>threshold]['names'].values[:topgenenumber]
        cell_marker_dict[celltype]=cellmarker

    if unique:
        # dict.fromkeys removes duplicates while keeping the ranking order
        for group_name in cell_marker_dict:
            cell_marker_dict[group_name]=list(dict.fromkeys(cell_marker_dict[group_name]))

    # Global uniqueness across all cell types
    if global_unique:
        used_genes = set()
        for celltype in celltypes:
            if celltype in cell_marker_dict:
                # Filter out genes that have been used in previous cell types
                unique_genes = [gene for gene in cell_marker_dict[celltype] if gene not in used_genes]
                cell_marker_dict[celltype] = unique_genes
                used_genes.update(unique_genes)

    return cell_marker_dict