Api scsa

`omicverse.single.pySCSA` ¶

Bases: object

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

class pySCSA(object):
    r"""Annotate clusters of an AnnData object with cell types using SCSA.

    Typical use: construct the object, call `cell_anno` to run the annotation,
    then `cell_anno_print` / `cell_auto_anno` to inspect or store the result.
    """

    def __init__(self,adata:anndata.AnnData,
                foldchange:float=1.5,pvalue:float=0.05,
                output:str='temp/rna_anno.txt',
                model_path:str='',
                outfmt:str='txt',Gensymbol:bool=True,
                species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
                celltype:str='normal',norefdb:bool=False,cellrange:str=None,
                noprint:bool=True,list_tissue:bool=False) -> None:

        r"""Initialize the pySCSA class.

        Creates the working directory ``temp`` (best effort) and stores all
        settings on the instance. When `model_path` is empty the marker
        database is fetched with `data_downloader` into ``temp``.

        Arguments:
            adata: AnnData object of scRNA-seq after preprocessing
            foldchange: Fold change threshold for marker filtering. (1.5)
            pvalue: P-value threshold for marker filtering. (0.05)
            output: Output file for marker annotation. ('temp/rna_anno.txt')
            model_path: Path to the Database for annotation. If not provided, the model will be downloaded from the internet. ('')
            outfmt: Output format for marker annotation. ('txt')
            Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation. (True)
            species: Species for annotation. Only used for cellmarker database. ('Human')
            weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
            tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
            target: Target to annotation class in Database. ('cellmarker')
            celltype: Cell type for annotation. ('normal')
            norefdb: Only using user-defined marker database for annotation. (False)
            cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) (None)
            noprint: Do not print any detail results. (True)
            list_tissue: List all available tissues in the database. (False)

        Returns:
            None
        """

        # Create the working directory; a failure is reported but not fatal
        # so that construction still succeeds (deliberate best effort).
        try:
            if not os.path.isdir('temp'):
                print("...Creating directory {}".format('temp'))
                os.makedirs('temp', exist_ok=True)
        except OSError as e:
            print("...Unable to create directory {}. Reason {}".format('temp',e))

        self.adata=adata
        self.foldchange=foldchange
        self.pvalue=pvalue
        self.output=output
        self.outfmt=outfmt
        self.Gensymbol=Gensymbol
        self.species=species
        self.weight=weight
        self.tissue=tissue
        self.celltype=celltype
        self.norefdb=norefdb
        self.noprint=noprint
        self.list_tissue=list_tissue
        self.target=target
        self.cellrange=cellrange
        if model_path =='':
            # No database given: download the default one into the working directory.
            self.model_path=data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                            path='temp/pySCSA_2023_v2_plus.db',title='whole')
        else:
            self.model_path=model_path

    def get_model_tissue(self,species:str="Human")->None:
        r"""List all available tissues in the database.

        Builds an `Annotator` from the stored settings, loads the database at
        `self.model_path` and asks it to list the tissues for `species`.

        Arguments:
            species: Species for annotation. Only used for cellmarker database. ('Human')

        Returns:
            None
        """

        anno = Annotator(foldchange=self.foldchange,
                    weight=self.weight,
                    pvalue=self.pvalue,
                    tissue=self.tissue,
                    species=self.species,
                    target=self.target,
                    norefdb=self.norefdb,
                    MarkerDB=None,
                    db=self.model_path,
                    noprint=self.noprint,
                    input="temp/rna.csv",
                    output=self.output,
                    source="scanpy",
                    cluster='all',
                    fc=self.foldchange,
                    outfmt=self.outfmt,
                    celltype=self.celltype,
                    Gensymbol=self.Gensymbol,
                    list_tissue=self.list_tissue,
                    cellrange=self.cellrange)
        anno.load_pickle_module(self.model_path)
        anno.get_list_tissue(species)


    def cell_anno(self,clustertype:str='leiden',
                  cluster:str='all',rank_rep=False)->pd.DataFrame:
        r"""Annotate cell type for each cluster.

        Writes the preprocessed marker table to ``temp/rna.csv``, runs the
        annotation, then loads the annotation file from `self.output`. The
        table is also stored on `self.result`.

        Arguments:
            clustertype: Clustering name used in scanpy. ('leiden')
            cluster: Only deal with one cluster of marker genes. ('all')
            rank_rep: Whether to repeat ranking. (False)

        Returns:
            result: Annotation result as DataFrame
        """

        dat=data_preprocess(self.adata,clustertype=clustertype,path='temp/rna.csv',rank_rep=rank_rep)
        dat.to_csv('temp/rna.csv')

        print('...Auto annotate cell')

        p = Process()
        p.run_cmd_p(foldchange=self.foldchange,
                    weight=self.weight,
                    pvalue=self.pvalue,
                    tissue=self.tissue,
                    species=self.species,
                    target=self.target,
                    norefdb=self.norefdb,
                    MarkerDB=None,
                    db=self.model_path,
                    noprint=self.noprint,
                    input="temp/rna.csv",
                    output=self.output,
                    source="scanpy",
                    cluster=cluster,
                    fc=self.foldchange,
                    outfmt=self.outfmt,
                    celltype=self.celltype,
                    Gensymbol=self.Gensymbol,
                    list_tissue=self.list_tissue,
                    cellrange=self.cellrange)

        # Read back the file the annotator was told to write (`self.output`),
        # not a fixed path, so a custom `output` is honoured.
        # NOTE(review): assumes a tab-separated text file (outfmt='txt') - confirm for other formats.
        result=pd.read_csv(self.output,sep='\t')
        self.result=result
        add_reference(self.adata,'pySCSA','cell annotation with SCSA')
        return result

    def cell_anno_print(self)->None:
        r"""Print the annotation result.

        For every cluster in `self.result` the first two rows are inspected.
        A line prefixed with ``Nice:`` is printed when the first Z-score is
        more than twice the second one, or when the cluster has only one
        candidate; otherwise both candidates are printed joined by ``|``.

        Returns:
            None
        """
        for i in set(self.result['Cluster']):
            test=self.result.loc[self.result['Cluster']==i].iloc[:2]
            if test.empty:
                # e.g. a NaN cluster label never compares equal to itself
                continue
            # A single candidate has no runner-up to compare against.
            if len(test)<2 or test.iloc[0]['Z-score']>test.iloc[1]['Z-score']*2:
                print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,test.iloc[0]['Cell Type'],
                                                            np.around(test.iloc[0]['Z-score'],3)))
            else:
                print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,('|').join(test['Cell Type'].values.tolist()),
                                                            ('|').join(np.around(test['Z-score'].values,3).astype(str).tolist())))

    def cell_auto_anno(self,adata:anndata.AnnData,
                       clustertype:str='leiden',key='scsa_celltype')->None:
        r"""Add cell type annotation to anndata.obs[key].

        Each cluster label in `adata.obs[clustertype]` receives the cell type
        of the first row of `self.result` for that cluster; labels without a
        row become 'Unknown'. Labels are matched as integers when they can be
        converted and as plain strings otherwise.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. ('leiden')
            key: Key to store cell type annotation. ('scsa_celltype')

        Returns:
            None
        """
        def _cluster_key(value):
            # '0', 0 and 0.0 all normalise to '0'; non-numeric labels stay as text.
            try:
                return str(int(value))
            except (TypeError, ValueError):
                return str(value)

        # First row per cluster wins, matching the original `.iloc[0]` lookup.
        best={}
        for cluster_id,cell_type in zip(self.result['Cluster'],self.result['Cell Type']):
            best.setdefault(_cluster_key(cluster_id),cell_type)

        labels=adata.obs[clustertype]
        # Key the mapping on the real label values so `.map` matches whatever
        # dtype the obs column has.
        scsa_anno={label:best.get(_cluster_key(label),'Unknown')
                   for label in labels.value_counts().index}
        adata.obs[key] = labels.map(scsa_anno).astype('category')
        print('...cell type added to {} on obs of anndata'.format(key))

    def get_celltype_marker(self,adata:anndata.AnnData,
                            clustertype:str='leiden',
                            log2fc_min:int=2,scores_type='scores',
                            pval_cutoff:float=0.05,rank:bool=True,
                            unique:bool=True,global_unique:bool=False)->dict:
        r"""Get marker genes for each cluster.

        Thin wrapper that forwards all arguments to the module-level
        `get_celltype_marker` function.

        Arguments:
            adata: anndata object
            clustertype: Clustering name used in scanpy. (leiden)
            log2fc_min: Minimum log2 fold change of marker genes. (2)
            pval_cutoff: Maximum p value of marker genes. (0.05)
            rank: Whether to rank genes by wilcoxon test. (True)
            scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
            unique: Whether to remove duplicates within each cell type. (True)
            global_unique: Whether to remove duplicates across all cell types. (False)

        Returns:
            cellmarker: A dictionary of marker genes for each cluster.
        """
        print('...get cell type marker')
        cell_marker_dict=get_celltype_marker(adata=adata,
                            clustertype=clustertype,
                            log2fc_min=log2fc_min,scores_type=scores_type,
                            pval_cutoff=pval_cutoff,rank=rank,
                            unique=unique,global_unique=global_unique)

        return cell_marker_dict

`__init__(adata, foldchange=1.5, pvalue=0.05, output='temp/rna_anno.txt', model_path='', outfmt='txt', Gensymbol=True, species='Human', weight=100, tissue='All', target='cellmarker', celltype='normal', norefdb=False, cellrange=None, noprint=True, list_tissue=False)` ¶

Initialize the pySCSA class.

Parameters:

Name	Type	Description	Default
`adata`	`anndata.AnnData`	AnnData object of scRNA-seq after preprocessing	required
`foldchange`	`float`	Fold change threshold for marker filtering. (1.5)	`1.5`
`pvalue`	`float`	P-value threshold for marker filtering. (0.05)	`0.05`
`output`	`str`	Output file for marker annotation. ('temp/rna_anno.txt')	`'temp/rna_anno.txt'`
`model_path`	`str`	Path to the Database for annotation. If not provided, the model will be downloaded from the internet. ('')	`''`
`outfmt`	`str`	Output format for marker annotation. ('txt')	`'txt'`
`Gensymbol`	`bool`	Using gene symbol ID instead of ensembl ID in input file for calculation. (True)	`True`
`species`	`str`	Species for annotation. Only used for cellmarker database. ('Human')	`'Human'`
`weight`	`int`	Weight threshold for marker filtering from cellranger v1.0 results. (100)	`100`
`tissue`	`str`	Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')	`'All'`
`target`	`str`	Target to annotation class in Database. ('cellmarker')	`'cellmarker'`
`celltype`	`str`	Cell type for annotation. ('normal')	`'normal'`
`norefdb`	`bool`	Only using user-defined marker database for annotation. (False)	`False`
`cellrange`	`str`	Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) (None)	`None`
`noprint`	`bool`	Do not print any detail results. (True)	`True`
`list_tissue`	`bool`	List all available tissues in the database. (False)	`False`

Returns:

Type	Description
`None`	None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

def __init__(self,adata:anndata.AnnData,
            foldchange:float=1.5,pvalue:float=0.05,
            output:str='temp/rna_anno.txt',
            model_path:str='',
            outfmt:str='txt',Gensymbol:bool=True,
            species:str='Human',weight:int=100,tissue:str='All',target:str='cellmarker',
            celltype:str='normal',norefdb:bool=False,cellrange:str=None,
            noprint:bool=True,list_tissue:bool=False) -> None:

    r"""Set up a pySCSA annotator.

    Makes sure the ``temp`` working directory exists (a failure is only
    reported), records every setting on the instance and resolves the
    marker database: an empty `model_path` triggers a download through
    `data_downloader`.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        foldchange: Fold change threshold for marker filtering. (1.5)
        pvalue: P-value threshold for marker filtering. (0.05)
        output: Output file for marker annotation. ('temp/rna_anno.txt')
        model_path: Path to the Database for annotation. If not provided, the model will be downloaded from the internet. ('')
        outfmt: Output format for marker annotation. ('txt')
        Gensymbol: Using gene symbol ID instead of ensembl ID in input file for calculation. (True)
        species: Species for annotation. Only used for cellmarker database. ('Human')
        weight: Weight threshold for marker filtering from cellranger v1.0 results. (100)
        tissue: Tissue for annotation. you can use `get_model_tissue` to see the available tissues. ('All')
        target: Target to annotation class in Database. ('cellmarker')
        celltype: Cell type for annotation. ('normal')
        norefdb: Only using user-defined marker database for annotation. (False)
        cellrange: Cell sub_type for annotation. (if you input T cell, it will only provide T helper cell, T cytotoxic cell, T regulatory cell, etc.) (None)
        noprint: Do not print any detail results. (True)
        list_tissue: List all available tissues in the database. (False)

    Returns:
        None
    """

    # Working directory for intermediate files; creation is best effort.
    workdir='temp'
    try:
        if not os.path.isdir(workdir):
            print("...Creating directory {}".format(workdir))
            os.makedirs(workdir, exist_ok=True)
    except OSError as err:
        print("...Unable to create directory {}. Reason {}".format(workdir,err))

    # Store every setting under an attribute of the same name.
    settings=(
        ('adata',adata),
        ('foldchange',foldchange),
        ('pvalue',pvalue),
        ('output',output),
        ('outfmt',outfmt),
        ('Gensymbol',Gensymbol),
        ('species',species),
        ('weight',weight),
        ('tissue',tissue),
        ('celltype',celltype),
        ('norefdb',norefdb),
        ('noprint',noprint),
        ('list_tissue',list_tissue),
        ('target',target),
        ('cellrange',cellrange),
    )
    for attr_name,attr_value in settings:
        setattr(self,attr_name,attr_value)

    # An explicit database path wins; otherwise fetch the default database.
    if model_path !='':
        self.model_path=model_path
    else:
        self.model_path=data_downloader(url='https://figshare.com/ndownloader/files/41369037',
                                        path='temp/pySCSA_2023_v2_plus.db',title='whole')

`cell_anno(clustertype='leiden', cluster='all', rank_rep=False)` ¶

Annotate cell type for each cluster.

Parameters:

Name	Type	Description	Default
`clustertype`	`str`	Clustering name used in scanpy. ('leiden')	`'leiden'`
`cluster`	`str`	Only deal with one cluster of marker genes. ('all')	`'all'`
`rank_rep`		Whether to repeat ranking. (False)	`False`

Returns:

Name	Type	Description
`result`	`pd.DataFrame`	Annotation result as DataFrame

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

def cell_anno(self,clustertype:str='leiden',
              cluster:str='all',rank_rep=False)->pd.DataFrame:
    r"""Annotate cell type for each cluster.

    Writes the preprocessed marker table to ``temp/rna.csv``, runs the
    annotation, then loads the annotation file from `self.output`. The table
    is also stored on `self.result`.

    Arguments:
        clustertype: Clustering name used in scanpy. ('leiden')
        cluster: Only deal with one cluster of marker genes. ('all')
        rank_rep: Whether to repeat ranking. (False)

    Returns:
        result: Annotation result as DataFrame
    """

    dat=data_preprocess(self.adata,clustertype=clustertype,path='temp/rna.csv',rank_rep=rank_rep)
    dat.to_csv('temp/rna.csv')

    print('...Auto annotate cell')

    p = Process()
    p.run_cmd_p(foldchange=self.foldchange,
                weight=self.weight,
                pvalue=self.pvalue,
                tissue=self.tissue,
                species=self.species,
                target=self.target,
                norefdb=self.norefdb,
                MarkerDB=None,
                db=self.model_path,
                noprint=self.noprint,
                input="temp/rna.csv",
                output=self.output,
                source="scanpy",
                cluster=cluster,
                fc=self.foldchange,
                outfmt=self.outfmt,
                celltype=self.celltype,
                Gensymbol=self.Gensymbol,
                list_tissue=self.list_tissue,
                cellrange=self.cellrange)

    # Read back the file the annotator was told to write (`self.output`)
    # rather than a fixed path, so a custom `output` is honoured.
    # NOTE(review): assumes a tab-separated text file (outfmt='txt') - confirm for other formats.
    result=pd.read_csv(self.output,sep='\t')
    self.result=result
    add_reference(self.adata,'pySCSA','cell annotation with SCSA')
    return result

`cell_anno_print()` ¶

Print the annotation result.

Returns:

Type	Description
`None`	None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

def cell_anno_print(self)->None:
    r"""Print the annotation result.

    For every cluster in `self.result` the first two rows are inspected.
    A line prefixed with ``Nice:`` is printed when the first Z-score is more
    than twice the second one, or when the cluster has only one candidate;
    otherwise both candidates are printed joined by ``|``.

    Returns:
        None
    """
    for i in set(self.result['Cluster']):
        test=self.result.loc[self.result['Cluster']==i].iloc[:2]
        if test.empty:
            # e.g. a NaN cluster label never compares equal to itself
            continue
        # A single candidate has no runner-up to compare against.
        if len(test)<2 or test.iloc[0]['Z-score']>test.iloc[1]['Z-score']*2:
            print('Nice:Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,test.iloc[0]['Cell Type'],
                                                        np.around(test.iloc[0]['Z-score'],3)))
        else:
            print('Cluster:{}\tCell_type:{}\tZ-score:{}'.format(i,('|').join(test['Cell Type'].values.tolist()),
                                                        ('|').join(np.around(test['Z-score'].values,3).astype(str).tolist())))

`cell_auto_anno(adata, clustertype='leiden', key='scsa_celltype')` ¶

Add cell type annotation to `anndata.obs[key]` (by default `anndata.obs['scsa_celltype']`).

Parameters:

Name	Type	Description	Default
`adata`	`anndata.AnnData`	anndata object	required
`clustertype`	`str`	Clustering name used in scanpy. ('leiden')	`'leiden'`
`key`		Key to store cell type annotation. ('scsa_celltype')	`'scsa_celltype'`

Returns:

Type	Description
`None`	None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

def cell_auto_anno(self,adata:anndata.AnnData,
                   clustertype:str='leiden',key='scsa_celltype')->None:
    r"""Add cell type annotation to anndata.obs[key].

    Each cluster label in `adata.obs[clustertype]` receives the cell type of
    the first row of `self.result` for that cluster; labels without a row
    become 'Unknown'. Labels are matched as integers when they can be
    converted and as plain strings otherwise, so non-numeric cluster names
    no longer raise.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. ('leiden')
        key: Key to store cell type annotation. ('scsa_celltype')

    Returns:
        None
    """
    def _cluster_key(value):
        # '0', 0 and 0.0 all normalise to '0'; non-numeric labels stay as text.
        try:
            return str(int(value))
        except (TypeError, ValueError):
            return str(value)

    # First row per cluster wins, matching the original `.iloc[0]` lookup.
    best={}
    for cluster_id,cell_type in zip(self.result['Cluster'],self.result['Cell Type']):
        best.setdefault(_cluster_key(cluster_id),cell_type)

    labels=adata.obs[clustertype]
    # Key the mapping on the real label values so `.map` matches whatever
    # dtype the obs column has.
    scsa_anno={label:best.get(_cluster_key(label),'Unknown')
               for label in labels.value_counts().index}
    adata.obs[key] = labels.map(scsa_anno).astype('category')
    print('...cell type added to {} on obs of anndata'.format(key))

`get_model_tissue(species='Human')` ¶

List all available tissues in the database.

Parameters:

Name	Type	Description	Default
`species`	`str`	Species for annotation. Only used for cellmarker database. ('Human')	`'Human'`

Returns:

Type	Description
`None`	None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

def get_model_tissue(self,species:str="Human")->None:
    r"""Show the tissues available in the annotation database.

    An `Annotator` is configured from the settings stored on the instance,
    the database at `self.model_path` is loaded into it, and it is asked to
    list the tissues for `species`.

    Arguments:
        species: Species for annotation. Only used for cellmarker database. ('Human')

    Returns:
        None
    """

    # Same options, in the same order, that the annotation run itself uses.
    annotator_options=dict(
        foldchange=self.foldchange,
        weight=self.weight,
        pvalue=self.pvalue,
        tissue=self.tissue,
        species=self.species,
        target=self.target,
        norefdb=self.norefdb,
        MarkerDB=None,
        db=self.model_path,
        noprint=self.noprint,
        input="temp/rna.csv",
        output=self.output,
        source="scanpy",
        cluster='all',
        fc=self.foldchange,
        outfmt=self.outfmt,
        celltype=self.celltype,
        Gensymbol=self.Gensymbol,
        list_tissue=self.list_tissue,
        cellrange=self.cellrange,
    )
    annotator=Annotator(**annotator_options)
    annotator.load_pickle_module(self.model_path)
    annotator.get_list_tissue(species)

`get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=True, unique=True, global_unique=False)` ¶

Get marker genes for each cluster.

Parameters:

Name	Type	Description	Default
`adata`	`anndata.AnnData`	anndata object	required
`clustertype`	`str`	Clustering name used in scanpy. (leiden)	`'leiden'`
`log2fc_min`	`int`	Minimum log2 fold change of marker genes. (2)	`2`
`pval_cutoff`	`float`	Maximum p value of marker genes. (0.05)	`0.05`
`rank`	`bool`	Whether to rank genes by wilcoxon test. (True)	`True`
`scores_type`		The type of scores. can be selected from `scores` and `logfoldchanges`	`'scores'`
`unique`	`bool`	Whether to remove duplicates within each cell type. (True)	`True`
`global_unique`	`bool`	Whether to remove duplicates across all cell types. (False)	`False`

Returns:

Name	Type	Description
`cellmarker`	`dict`	A dictionary of marker genes for each clusters.

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

def get_celltype_marker(self,adata:anndata.AnnData,
                        clustertype:str='leiden',
                        log2fc_min:int=2,scores_type='scores',
                        pval_cutoff:float=0.05,rank:bool=True,
                        unique:bool=True,global_unique:bool=False)->dict:
    r"""Collect marker genes per cluster.

    Prints a progress line and hands every argument, by keyword, to the
    module-level `get_celltype_marker` function, returning its result.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Whether to rank genes by wilcoxon test. (True)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        unique: Whether to remove duplicates within each cell type. (True)
        global_unique: Whether to remove duplicates across all cell types. (False)

    Returns:
        cellmarker: A dictionary of marker genes for each cluster.
    """
    print('...get cell type marker')
    marker_options={
        'adata':adata,
        'clustertype':clustertype,
        'log2fc_min':log2fc_min,
        'scores_type':scores_type,
        'pval_cutoff':pval_cutoff,
        'rank':rank,
        'unique':unique,
        'global_unique':global_unique,
    }
    return get_celltype_marker(**marker_options)

`omicverse.single.scanpy_lazy(adata, min_genes=200, min_cells=3, drop_doublet=True, n_genes_by_counts=4300, pct_counts_mt=25, target_sum=10000.0, min_mean=0.0125, max_mean=3, min_disp=0.5, max_value=10, n_comps=100, svd_solver='auto', n_neighbors=15, random_state=112, n_pcs=50)` ¶

Scanpy lazy analysis pipeline.

Parameters:

Name	Type	Description	Default
`adata`	`anndata.AnnData`	AnnData object	required
`min_genes`	`int`	The min number of genes. (200)	`200`
`min_cells`	`int`	The min number of cells. (3)	`3`
`drop_doublet`	`bool`	Whether to drop doublet. (True)	`True`
`n_genes_by_counts`	`int`	The max number of genes. (4300)	`4300`
`pct_counts_mt`	`int`	The max proportion of mito-genes. (25)	`25`
`target_sum`	`float`	Target total counts per cell used for normalization. (1e4)	`10000.0`
`min_mean`	`float`	The min mean of genes. (0.0125)	`0.0125`
`max_mean`	`int`	The max mean of genes. (3)	`3`
`min_disp`	`float`	The min dispersion of genes. (0.5)	`0.5`
`max_value`	`int`	The max value of genes. (10)	`10`
`n_comps`	`int`	The number of components. (100)	`100`
`svd_solver`	`str`	The solver of svd. ('auto')	`'auto'`
`n_neighbors`	`int`	The number of neighbors. (15)	`15`
`random_state`	`int`	The random state. (112)	`112`
`n_pcs`	`int`	The number of pcs. (50)	`50`

Returns:

Name	Type	Description
`adata`	`anndata.AnnData`	AnnData object

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

def scanpy_lazy(adata:anndata.AnnData,min_genes:int=200,min_cells:int=3,drop_doublet:bool=True,
                n_genes_by_counts:int=4300,pct_counts_mt:int=25,
                target_sum:float=1e4,min_mean:float=0.0125, max_mean:int=3, min_disp:float=0.5,max_value:int=10,
                n_comps:int=100, svd_solver:str="auto",
                n_neighbors:int=15, random_state:int = 112, n_pcs:int=50,
                )->anndata.AnnData:
    r"""Scanpy lazy analysis pipeline.

    Runs, in order: cell/gene filtering, optional doublet removal, QC
    filtering, normalization, log1p, highly-variable-gene selection, scaling,
    PCA, neighbors, leiden, PAGA and UMAP. The name `adata` is rebound to
    subsets several times, so callers should use the returned object.

    Arguments:
        adata: AnnData object
        min_genes: Passed to `sc.pp.filter_cells` as `min_genes`. (200)
        min_cells: Passed to `sc.pp.filter_genes` as `min_cells`. (3)
        drop_doublet: Whether to run scrublet and drop predicted doublets. (True)
        n_genes_by_counts: Cells are kept only when their `n_genes_by_counts` is strictly below this value. (4300)
        pct_counts_mt: Cells are kept only when their `pct_counts_mt` is strictly below this value. (25)
        target_sum: Passed to `sc.pp.normalize_total` as `target_sum`. (1e4)
        min_mean: Passed to `sc.pp.highly_variable_genes` as `min_mean`. (0.0125)
        max_mean: Passed to `sc.pp.highly_variable_genes` as `max_mean`. (3)
        min_disp: Passed to `sc.pp.highly_variable_genes` as `min_disp`. (0.5)
        max_value: Passed to `sc.pp.scale` as `max_value`. (10)
        n_comps: Passed to `sc.tl.pca` as `n_comps`. (100)
        svd_solver: Passed to `sc.tl.pca` as `svd_solver`. ('auto')
        n_neighbors: Passed to `sc.pp.neighbors` as `n_neighbors`. (15)
        random_state: Passed to `sc.pp.neighbors` as `random_state`. (112)
        n_pcs: Passed to `sc.pp.neighbors` as `n_pcs`. (50)

    Returns:
        adata: AnnData object
    """
    # Basic filtering of cells and genes.
    # NOTE(review): these calls presumably modify the passed object in place - confirm.
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    # Optional doublet removal
    if drop_doublet:
        sc.external.pp.scrublet(adata) # doublet prediction; 'predicted_doublet' is read from obs below
        adata = adata[adata.obs['predicted_doublet'] == False] # keep only cells not flagged as doublets
    # Flag mitochondrial genes by the 'MT-' name prefix and compute QC metrics
    adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    # Keep cells strictly below both QC thresholds
    adata = adata[adata.obs.n_genes_by_counts < n_genes_by_counts, :]
    adata = adata[adata.obs.pct_counts_mt < pct_counts_mt, :]
    # Keep a copy of the matrix as it is before normalization
    adata.layers["counts"] = adata.X.copy()
    # Normalize each cell to `target_sum` total counts (default 1e4)
    sc.pp.normalize_total(adata, target_sum=target_sum)
    # Log-transform
    sc.pp.log1p(adata)
    # Select highly variable genes
    sc.pp.highly_variable_genes(adata, min_mean=min_mean, max_mean=max_mean, min_disp=min_disp)
    # Store the normalized, log-transformed data in .raw before subsetting to highly variable genes
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    # Scale, clipping at `max_value`
    sc.pp.scale(adata, max_value=max_value)
    # PCA
    sc.tl.pca(adata, n_comps=n_comps, svd_solver=svd_solver)
    # Build the cell neighborhood graph
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, random_state = random_state, n_pcs=n_pcs)
    # Leiden clustering, then PAGA, then UMAP initialised from PAGA
    sc.tl.leiden(adata)
    sc.tl.paga(adata)
    # NOTE(review): presumably required so that init_pos='paga' below has PAGA positions - confirm.
    sc.pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph
    sc.tl.umap(adata, init_pos='paga')
    return adata

`omicverse.single.scanpy_cellanno_from_dict(adata, anno_dict, anno_name='major', clustertype='leiden')` ¶

Add cell type annotation from dict to anndata object.

Parameters:

Name	Type	Description	Default
`adata`	`anndata.AnnData`	AnnData object of scRNA-seq after preprocessing	required
`anno_dict`	`dict`	Dict of cell type annotation. key is the cluster name, value is the cell type name.like `{'0':'B cell','1':'T cell'}`	required
`anno_name`	`str`	The name of annotation. ('major')	`'major'`
`clustertype`	`str`	Clustering name used in scanpy. ('leiden')	`'leiden'`

Returns:

Type	Description
`None`	None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

def scanpy_cellanno_from_dict(adata:anndata.AnnData,
                               anno_dict:dict,
                               anno_name:str='major',
                               clustertype:str='leiden',
                               )->None:
    r"""Write a cluster-to-cell-type mapping into `adata.obs`.

    The cluster labels in `adata.obs[clustertype]` are translated through
    `anno_dict` and stored as a categorical column named
    `<anno_name>_celltype`.

    Arguments:
        adata: AnnData object of scRNA-seq after preprocessing
        anno_dict: Dict of cell type annotation. key is the cluster name, value is the cell type name.like `{'0':'B cell','1':'T cell'}`
        anno_name: The name of annotation. ('major')
        clustertype: Clustering name used in scanpy. ('leiden')

    Returns:
        None
    """

    cluster_labels=adata.obs[clustertype]
    cell_types=cluster_labels.map(anno_dict)
    target_column=anno_name+'_celltype'
    adata.obs[target_column]=cell_types.astype('category')
    print('...cell type added to {}_celltype on obs of anndata'.format(anno_name))

`omicverse.single.get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=False, key='rank_genes_groups', method='wilcoxon', foldchange=None, topgenenumber=10, unique=True, global_unique=False)` ¶

Get marker genes for each cluster.

Parameters:

Name	Type	Description	Default
`adata`	`anndata.AnnData`	anndata object	required
`clustertype`	`str`	Clustering name used in scanpy. (leiden)	`'leiden'`
`log2fc_min`	`int`	Minimum log2 fold change of marker genes. (2)	`2`
`pval_cutoff`	`float`	Maximum p value of marker genes. (0.05)	`0.05`
`rank`	`bool`	Whether to run `rank_genes_groups` first when its result is not yet stored in `adata.uns`. (False)	`False`
`scores_type`		The type of scores. can be selected from `scores` and `logfoldchanges`	`'scores'`
`unique`		Whether to remove duplicates within each cell type. (True)	`True`
`global_unique`		Whether to remove duplicates across all cell types. (False)	`False`

Returns:

Name	Type	Description
`cellmarker`	`dict`	A dictionary of marker genes for each clusters.

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/single/_anno.py

def get_celltype_marker(adata:anndata.AnnData,
                            clustertype:str='leiden',
                            log2fc_min:int=2,scores_type='scores',
                            pval_cutoff:float=0.05,rank:bool=False,
                            key='rank_genes_groups',method='wilcoxon',
                            foldchange=None,topgenenumber=10,unique=True,
                            global_unique=False)->dict:
    r"""Get marker genes for each cluster.

    For every group in `adata.obs[clustertype]` the differential-expression
    table stored under `key` is filtered by `log2fc_min` and `pval_cutoff`,
    then by a score threshold, and the first `topgenenumber` gene names are
    kept.

    Arguments:
        adata: anndata object
        clustertype: Clustering name used in scanpy. (leiden)
        log2fc_min: Minimum log2 fold change of marker genes. (2)
        scores_type: The type of scores. can be selected from `scores` and `logfoldchanges`
        pval_cutoff: Maximum p value of marker genes. (0.05)
        rank: Whether to run `sc.tl.rank_genes_groups` when `key` is not yet in `adata.uns`. (False)
        key: Key in `adata.uns` holding the ranking result. ('rank_genes_groups')
        method: Test method passed to `sc.tl.rank_genes_groups`. ('wilcoxon')
        foldchange: Minimum value of `scores_type` for a gene to be kept. When None, a
            threshold is derived separately for each cluster from the histogram of its scores. (None)
        topgenenumber: Maximum number of genes kept per cluster. (10)
        unique: Whether to remove duplicates within each cell type. (True)
        global_unique: Whether to remove duplicates across all cell types. (False)

    Returns:
        cellmarker: A dictionary of marker genes for each cluster.
    """
    print('...get cell type marker')
    celltypes = sorted(adata.obs[clustertype].unique())
    cell_marker_dict={}
    # Compute the ranking only when asked to and when it is not stored yet;
    # store it under `key` so that the lookup below finds it.
    if rank and key not in adata.uns:
        sc.tl.rank_genes_groups(adata, clustertype, method=method, key_added=key)
    for celltype in celltypes:
        degs = sc.get.rank_genes_groups_df(adata, group=celltype, key=key, log2fc_min=log2fc_min,
                                        pval_cutoff=pval_cutoff)
        scores = degs[scores_type]
        if foldchange is None:
            # Automatic threshold, derived per cluster: midpoint of the 5th and
            # 6th largest positive histogram bin edges. The caller's `foldchange`
            # is left untouched so one cluster's threshold is not reused for the rest.
            edges = np.histogram(scores)[1]
            positive_edges = edges[edges>0]
            try:
                threshold = (positive_edges[-5]+positive_edges[-6])/2
            except IndexError:
                # Fewer than six positive edges: fall back to the mean score.
                threshold = scores.mean()
        else:
            threshold = foldchange

        cell_marker_dict[celltype] = degs.loc[scores>threshold]['names'].values[:topgenenumber]
    if unique:
        # dict.fromkeys drops duplicates while keeping the ranked order.
        for celltype in celltypes:
            cell_marker_dict[celltype] = list(dict.fromkeys(cell_marker_dict[celltype]))

    # Global uniqueness across all cell types
    if global_unique:
        used_genes = set()
        for celltype in celltypes:
            # Filter out genes that have been used in previous cell types
            unique_genes = [gene for gene in cell_marker_dict[celltype] if gene not in used_genes]
            cell_marker_dict[celltype] = unique_genes
            used_genes.update(unique_genes)

    return cell_marker_dict

Api scsa

omicverse.single.pySCSA ¶

__init__(adata, foldchange=1.5, pvalue=0.05, output='temp/rna_anno.txt', model_path='', outfmt='txt', Gensymbol=True, species='Human', weight=100, tissue='All', target='cellmarker', celltype='normal', norefdb=False, cellrange=None, noprint=True, list_tissue=False) ¶

cell_anno(clustertype='leiden', cluster='all', rank_rep=False) ¶

cell_anno_print() ¶

cell_auto_anno(adata, clustertype='leiden', key='scsa_celltype') ¶

get_model_tissue(species='Human') ¶

get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=True, unique=True, global_unique=False) ¶

omicverse.single.scanpy_lazy(adata, min_genes=200, min_cells=3, drop_doublet=True, n_genes_by_counts=4300, pct_counts_mt=25, target_sum=10000.0, min_mean=0.0125, max_mean=3, min_disp=0.5, max_value=10, n_comps=100, svd_solver='auto', n_neighbors=15, random_state=112, n_pcs=50) ¶

omicverse.single.scanpy_cellanno_from_dict(adata, anno_dict, anno_name='major', clustertype='leiden') ¶

omicverse.single.get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=False, key='rank_genes_groups', method='wilcoxon', foldchange=None, topgenenumber=10, unique=True, global_unique=False) ¶

`omicverse.single.pySCSA` ¶

`__init__(adata, foldchange=1.5, pvalue=0.05, output='temp/rna_anno.txt', model_path='', outfmt='txt', Gensymbol=True, species='Human', weight=100, tissue='All', target='cellmarker', celltype='normal', norefdb=False, cellrange=None, noprint=True, list_tissue=False)` ¶

`cell_anno(clustertype='leiden', cluster='all', rank_rep=False)` ¶

`cell_anno_print()` ¶

`cell_auto_anno(adata, clustertype='leiden', key='scsa_celltype')` ¶

`get_model_tissue(species='Human')` ¶

`get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=True, unique=True, global_unique=False)` ¶

`omicverse.single.scanpy_lazy(adata, min_genes=200, min_cells=3, drop_doublet=True, n_genes_by_counts=4300, pct_counts_mt=25, target_sum=10000.0, min_mean=0.0125, max_mean=3, min_disp=0.5, max_value=10, n_comps=100, svd_solver='auto', n_neighbors=15, random_state=112, n_pcs=50)` ¶

`omicverse.single.scanpy_cellanno_from_dict(adata, anno_dict, anno_name='major', clustertype='leiden')` ¶

`omicverse.single.get_celltype_marker(adata, clustertype='leiden', log2fc_min=2, scores_type='scores', pval_cutoff=0.05, rank=False, key='rank_genes_groups', method='wilcoxon', foldchange=None, topgenenumber=10, unique=True, global_unique=False)` ¶