Skip to content

Api pp

omicverse.pp.preprocess(adata, mode='shiftlog|pearson', target_sum=50 * 10000.0, n_HVGs=2000, organism='human', no_cc=False, batch_key=None)

Preprocesses the AnnData object adata using either a scanpy or a pearson residuals workflow for size normalization and highly variable genes (HVGs) selection, and calculates signature scores if necessary.

Parameters:

Name Type Description Default
adata

The data matrix.

required
mode

The mode for size normalization and HVGs selection.

'shiftlog|pearson'
target_sum

The target total count after normalization.

50 * 10000.0
n_HVGs

the number of HVGs to select.

2000
organism

The organism of the data. It can be either 'human' or 'mouse'.

'human'
no_cc

Whether to remove cc-correlated genes from HVGs.

False

Returns:

Name Type Description
adata

The preprocessed data matrix.

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def preprocess(adata, mode='shiftlog|pearson', target_sum=50*1e4, n_HVGs=2000,
    organism='human', no_cc=False,batch_key=None,):
    """
    Preprocesses the AnnData object adata using either a scanpy or 
    a pearson residuals workflow for size normalization
    and highly variable genes (HVGs) selection, and calculates signature scores if necessary. 

    Arguments:
        adata: The data matrix.
        mode: The mode for size normalization and HVGs selection. 
        It can be either 'scanpy' or 'pearson'. If 'scanpy', 
        performs size normalization using scanpy's normalize_total() function and selects HVGs 
        using pegasus' highly_variable_features() function with batch correction. 
        If 'pearson', selects HVGs sing scanpy's experimental.pp.highly_variable_genes() function
        with pearson residuals method and performs 
        size normalization using scanpy's experimental.pp.normalize_pearson_residuals() function. 
        target_sum: The target total count after normalization.
        n_HVGs: the number of HVGs to select.
        organism: The organism of the data. It can be either 'human' or 'mouse'. 
        no_cc: Whether to remove cc-correlated genes from HVGs.

    Returns:
        adata: The preprocessed data matrix. 
    """

    # Log-normalization, HVGs identification
    adata.layers['counts'] = adata.X.copy()
    print('Begin robust gene identification')
    identify_robust_genes(adata, percent_cells=0.05)
    adata = adata[:, adata.var['robust']]
    print('End of robust gene identification.')
    method_list = mode.split('|')
    print(f'Begin size normalization: {method_list[0]} and HVGs selection {method_list[1]}')
    if settings.mode == 'cpu':
        data_load_start = time.time()
        if method_list[0] == 'shiftlog': # Size normalization + scanpy batch aware HVGs selection
            sc.pp.normalize_total(
                adata,
                target_sum=target_sum,
                exclude_highly_expressed=True,
                max_fraction=0.2,
            )
            sc.pp.log1p(adata)
        elif method_list[0] == 'pearson':
            # Perason residuals workflow
            sc.experimental.pp.normalize_pearson_residuals(adata)

        if method_list[1] == 'pearson': # Size normalization + scanpy batch aware HVGs selection
            sc.experimental.pp.highly_variable_genes(
                adata,
                flavor="pearson_residuals",
                layer='counts',
                n_top_genes=n_HVGs,
                batch_key=batch_key,
            )
            if no_cc:
                remove_cc_genes(adata, organism=organism, corr_threshold=0.1)
        elif method_list[1] == 'seurat':
            sc.pp.highly_variable_genes(
                adata,
                flavor="seurat_v3",
                layer='counts',
                n_top_genes=n_HVGs,
                batch_key=batch_key,
            )
            if no_cc:
                remove_cc_genes(adata, organism=organism, corr_threshold=0.1)
        data_load_end = time.time()
        print(f'Time to analyze data in cpu: {data_load_end - data_load_start} seconds.')
    else:
        import rapids_singlecell as rsc
        data_load_start = time.time()
        if method_list[0] == 'shiftlog': # Size normalization + scanpy batch aware HVGs selection
            rsc.pp.normalize_total(adata, target_sum=target_sum)
            rsc.pp.log1p(adata)
        elif method_list[0] == 'pearson':
            # Perason residuals workflow
            rsc.pp.normalize_pearson_residuals(adata)
        if method_list[1] == 'pearson': # Size normalization + scanpy batch aware HVGs selection
            rsc.pp.highly_variable_genes(
                adata, 
                flavor="pearson_residuals",
                layer='counts',
                n_top_genes=n_HVGs,
                batch_key=batch_key,
            )
        elif method_list[1] == 'seurat':
            rsc.pp.highly_variable_genes(
                adata,
                flavor="seurat_v3",
                layer='counts',
                n_top_genes=n_HVGs,
                batch_key=batch_key,
            )
        data_load_end = time.time()
        print(f'Time to analyze data in gpu: {data_load_end - data_load_start} seconds.')


    adata.var = adata.var.drop(columns=['highly_variable_features'])
    adata.var['highly_variable_features'] = adata.var['highly_variable']
    adata.var = adata.var.drop(columns=['highly_variable'])
    #adata.var = adata.var.rename(columns={'means':'mean', 'variances':'var'})
    print(f'End of size normalization: {method_list[0]} and HVGs selection {method_list[1]}')
    return adata

omicverse.pp.highly_variable_features(data, batch=None, flavor='pegasus', n_top=2000, span=0.02, min_disp=0.5, max_disp=np.inf, min_mean=0.0125, max_mean=7, n_jobs=-1)

Highly variable features (HVF) selection. The input data should be logarithmized.

Parameters:

Name Type Description Default
data anndata.AnnData

Annotated data matrix with rows for cells and columns for genes.

required
batch str

A key in data.obs specifying batch information.

None
flavor str

The HVF selection method to use.

'pegasus'
n_top int

Number of genes to be selected as HVF. if None, no gene will be selected.

2000
span float

Only applicable when flavor is "pegasus".

0.02
min_disp float

Minimum normalized dispersion.

0.5
max_disp float

Maximum normalized dispersion. Set it to np.inf for infinity bound.

np.inf
min_mean float

Minimum mean.

0.0125
max_mean float

Maximum mean.

7
n_jobs int

Number of threads to be used during calculation.

-1

Update adata.var: * highly_variable_features: replace with Boolean type array indicating the selected highly variable features.

Examples

ov.pp.highly_variable_features(data) ov.pp.highly_variable_features(data, batch="Channel")

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def highly_variable_features(
    data: anndata.AnnData,
    batch: str = None,
    flavor: str = "pegasus",
    n_top: int = 2000,
    span: float = 0.02,
    min_disp: float = 0.5,
    max_disp: float = np.inf,
    min_mean: float = 0.0125,
    max_mean: float = 7,
    n_jobs: int = -1,
) -> None:
    """ Highly variable features (HVF) selection. The input data should be logarithmized.

    Arguments:
        data: Annotated data matrix with rows for cells and columns for genes.
        batch: A key in data.obs specifying batch information. 
        If `batch` is not set, do not consider batch effects in selecting highly variable features. 
        Otherwise, if `data.obs[batch]` is not categorical, 
        `data.obs[batch]` will be automatically converted into categorical 
        before highly variable feature selection.
        flavor: The HVF selection method to use. 
        Available choices are ``"pegasus"`` or ``"Seurat"``.
        n_top: Number of genes to be selected as HVF. if ``None``, no gene will be selected.
        span: Only applicable when ``flavor`` is ``"pegasus"``. 
        The smoothing factor used by *scikit-learn loess* model in pegasus HVF selection method.
        min_disp: Minimum normalized dispersion.
        max_disp: Maximum normalized dispersion. Set it to ``np.inf`` for infinity bound.
        min_mean: Minimum mean.
        max_mean: Maximum mean.
        n_jobs: Number of threads to be used during calculation. 
        If ``-1``, all physical CPU cores will be used.


    Update ``adata.var``:
        * ``highly_variable_features``: replace with Boolean type array 
        indicating the selected highly variable features.

    Examples
    --------
    >>> ov.pp.highly_variable_features(data)
    >>> ov.pp.highly_variable_features(data, batch="Channel")
    """

    if flavor == "pegasus":
        select_hvf_pegasus(data, batch, n_top=n_top, span=span)
    else:
        assert flavor == "Seurat"
        select_hvf_seurat(
            data,
            batch,
            n_top=n_top,
            min_disp=min_disp,
            max_disp=max_disp,
            min_mean=min_mean,
            max_mean=max_mean,
            n_jobs=n_jobs,
        )

    data.uns.pop("_tmp_fmat_highly_variable_features", None) # Pop up cached feature matrix

    print(f"{data.var['highly_variable_features'].sum()} \
        highly variable features have been selected.")

omicverse.pp.remove_cc_genes(adata, organism='human', corr_threshold=0.1)

Update adata.var['highly_variable_features'] discarding cc correlated genes. Taken from Cospar, Wang et al., 2023.

Parameters:

Name Type Description Default
adata anndata.AnnData

Annotated data matrix with rows for cells and columns for genes.

required
organism str

Organism of the dataset. Available choices are "human" or "mouse".

'human'
corr_threshold float

Threshold for correlation with cc genes.

0.1
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def remove_cc_genes(adata:anndata.AnnData, organism:str='human', corr_threshold:float=0.1):
    """
    Update adata.var['highly_variable_features'] discarding cc correlated genes. 
    Taken from Cospar, Wang et al., 2023.

    Arguments:
        adata: Annotated data matrix with rows for cells and columns for genes.
        organism: Organism of the dataset. Available choices are ``"human"`` or ``"mouse"``.
        corr_threshold: Threshold for correlation with cc genes. 
        Genes having a correlation with cc genes > corr_threshold will be discarded.
    """
    # Get cc genes
    cycling_genes = load_signatures_from_file(predefined_signatures[f'cell_cycle_{organism}'])
    cc_genes = list(set(cycling_genes['G1/S']) | set(cycling_genes['G2/M']))
    cc_genes = [ x for x in cc_genes if x in adata.var_names ]
    # Compute corr
    cc_expression = adata[:, cc_genes].X.A.T
    hvgs = adata.var_names[adata.var['highly_variable_features']]
    hvgs_expression = adata[:, hvgs].X.A.T
    cc_corr = corr2_coeff(hvgs_expression, cc_expression)

    # Discard genes having the maximum correlation with one of the cc > corr_threshold
    max_corr = np.max(abs(cc_corr), 1)
    hvgs_no_cc = hvgs[max_corr < corr_threshold]
    print(
        f'Number of selected non-cycling highly variable genes: {hvgs_no_cc.size}\n'
        f'{np.sum(max_corr > corr_threshold)} cell cycle correlated genes will be removed...'
    )
    # Update
    adata.var['highly_variable_features'] = adata.var_names.isin(hvgs_no_cc)

omicverse.pp.identify_robust_genes(data, percent_cells=0.05)

Identify robust genes as candidates for HVG selection and remove genes that are not expressed in any cells.

Parameters:

Name Type Description Default
data anndata.AnnData

Use current selected modality in data, which should contain one RNA expression matrix.

required
percent_cells float

Only assign genes to be robust that are expressed in at least

0.05

Update data.var:

* ``n_cells``: Total number of cells in which each gene is measured.
* ``percent_cells``: Percent of cells in which each gene is measured.
* ``robust``: Boolean type indicating if a gene is robust based on the QC metrics.
* ``highly_variable_features``: Boolean type indicating if a gene 
is a highly variable feature. 
By default, set all robust genes as highly variable features.
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def identify_robust_genes(data: anndata.AnnData, percent_cells: float = 0.05) -> None:
    """ 
    Identify robust genes as candidates for HVG selection and remove genes 
    that are not expressed in any cells.

    Arguments:
        data: Use current selected modality in data, which should contain one RNA expression matrix.
        percent_cells: Only assign genes to be ``robust`` that are expressed in at least 
        ``percent_cells`` % of cells.


    Update ``data.var``:

        * ``n_cells``: Total number of cells in which each gene is measured.
        * ``percent_cells``: Percent of cells in which each gene is measured.
        * ``robust``: Boolean type indicating if a gene is robust based on the QC metrics.
        * ``highly_variable_features``: Boolean type indicating if a gene 
        is a highly variable feature. 
        By default, set all robust genes as highly variable features.

    """

    prior_n = data.shape[1]

    if issparse(data.X):
        data.var["n_cells"] = data.X.getnnz(axis=0)
        data._inplace_subset_var(data.var["n_cells"] > 0)
        data.var["percent_cells"] = (data.var["n_cells"] / data.shape[0]) * 100
        data.var["robust"] = data.var["percent_cells"] >= percent_cells
    else:
        data.var["robust"] = True

    data.var["highly_variable_features"] = data.var["robust"]  
    # default all robust genes are "highly" variable
    print(f"After filtration, {data.shape[1]}/{prior_n} genes are kept. \
    Among {data.shape[1]} genes, {data.var['robust'].sum()} genes are robust.")

omicverse.pp.scale(adata, max_value=10, layers_add='scaled')

Scale the input AnnData object.

Parameters:

Name Type Description Default
adata

Annotated data matrix with n_obs x n_vars shape.

required

Returns:

Name Type Description
adata

Annotated data matrix with n_obs x n_vars shape.

Adds a new layer called 'scaled' that stores the expression matrix that has been scaled to unit variance and zero mean.

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def scale(adata,max_value=10,layers_add='scaled'):
    """
    Scale the input AnnData object.

    Arguments:
        adata : Annotated data matrix with n_obs x n_vars shape.

    Returns:
        adata : Annotated data matrix with n_obs x n_vars shape. 
        Adds a new layer called 'scaled' that stores
            the expression matrix that has been scaled to unit variance and zero mean.

    """
    if settings.mode == 'cpu':
        adata_mock = sc.pp.scale(adata, copy=True,max_value=max_value)
        adata.layers[layers_add] = adata_mock.X.copy()
        del adata_mock
    else:
        import rapids_singlecell as rsc
        adata.layers['scaled']=rsc.pp.scale(adata, max_value=max_value,inplace=False)

omicverse.pp.regress(adata)

Regress out covariates from the input AnnData object.

Parameters:

Name Type Description Default
adata

Annotated data matrix with n_obs x n_vars shape.

required

Returns:

Name Type Description
adata

Annotated data matrix with n_obs x n_vars shape.

Adds a new layer called 'regressed' that stores the expression matrix with covariates regressed out.

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def regress(adata):
    """
    Regress out covariates from the input AnnData object.

    Arguments:
        adata : Annotated data matrix with n_obs x n_vars shape. 
        Should contain columns 'mito_perc' and 'nUMIs'that represent the percentage of 
        mitochondrial genes and the total number of UMI counts, respectively.

    Returns:
        adata : Annotated data matrix with n_obs x n_vars shape. 
        Adds a new layer called 'regressed' that stores
            the expression matrix with covariates regressed out.

    """
    if settings.mode == 'cpu':
        adata_mock = sc.pp.regress_out(adata, ['mito_perc', 'nUMIs'], n_jobs=8, copy=True)
        adata.layers['regressed'] = adata_mock.X.copy()
        del adata_mock
    else:
        import rapids_singlecell as rsc
        adata.layers['regressed']=rsc.pp.regress_out(adata, ['mito_perc', 'nUMIs'], inplace=False)

omicverse.pp.regress_and_scale(adata)

Regress out covariates from the input AnnData object and scale the resulting expression matrix.

Parameters:

Name Type Description Default
adata

Annotated data matrix with n_obs x n_vars shape.

required

Returns:

Name Type Description
adata

Annotated data matrix with n_obs x n_vars shape.

Adds a new layer called 'regressed_and_scaled' that stores the expression matrix with covariates regressed out and then scaled.

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def regress_and_scale(adata):
    """
    Regress out covariates from the input AnnData object and scale the resulting expression matrix.

    Arguments:
        adata : Annotated data matrix with n_obs x n_vars shape. 
        Should contain a layer called 'regressed'
            that stores the expression matrix with covariates regressed out.

    Returns:
        adata : Annotated data matrix with n_obs x n_vars shape. 
        Adds a new layer called 'regressed_and_scaled'
            that stores the expression matrix with covariates regressed out and then scaled.

    """
    if 'regressed' not in adata.layers:
        raise KeyError('Regress out covariates first!')
    adata_mock= adata.copy()
    adata_mock.X = adata_mock.layers['regressed']
    scale(adata_mock)
    adata.layers['regressed_and_scaled'] = adata_mock.layers['scaled']

    return adata

omicverse.pp.pca(adata, n_pcs=50, layer='scaled', inplace=True, **kwargs)

Performs Principal Component Analysis (PCA) on the data stored in a scanpy AnnData object.

Parameters:

Name Type Description Default
adata

Annotated data matrix with rows representing cells

required
n_pcs

Number of principal components to calculate.

50
layer

The name of the layer in adata where the data to be analyzed is stored.

'scaled'

Returns:

Name Type Description
adata

The original AnnData object with the calculated PCA embeddings

and other information stored in its obsm, varm, and uns fields.

Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def pca(adata, n_pcs=50, layer='scaled',inplace=True,**kwargs):
    """
    Performs Principal Component Analysis (PCA) on the data stored in a scanpy AnnData object.

    Arguments:
        adata : Annotated data matrix with rows representing cells 
        and columns representing features.
        n_pcs : Number of principal components to calculate.
        layer : The name of the layer in `adata` where the data to be analyzed is stored. 
        Defaults to the 'scaled' layer,
            and falls back to 'lognorm' if that layer does not exist. 
        Raises a KeyError if the specified layer is not present.

    Returns:
        adata : The original AnnData object with the calculated PCA embeddings 
        and other information stored in its `obsm`, `varm`,
            and `uns` fields.
    """
    if 'lognorm' not in adata.layers:
        adata.layers['lognorm'] = adata.X
    if layer in adata.layers:
        X = adata.layers[layer]
        key = f'{layer}|original'
    else:
        raise KeyError(f'Selected layer {layer} is not present. Compute it first!')
    if settings.mode == 'cpu':
        if sc.__version__ <'1.10':
            adata_mock=sc.AnnData(adata.layers[layer],obs=adata.obs,var=adata.var)
            sc.pp.pca(adata_mock, n_comps=n_pcs)
            adata.obsm[key + '|X_pca'] = adata_mock.obsm['X_pca']
            adata.varm[key + '|pca_loadings'] = adata_mock.varm['PCs']
            adata.uns[key + '|pca_var_ratios'] = adata_mock.uns['pca']['variance_ratio']
            adata.uns[key + '|cum_sum_eigenvalues'] = adata_mock.uns['pca']['variance']
        else:
            sc.pp.pca(adata, layer=layer,n_comps=n_pcs,**kwargs)
            adata.obsm[key + '|X_pca'] = adata.obsm['X_pca']
            adata.varm[key + '|pca_loadings'] = adata.varm['PCs']
            adata.uns[key + '|pca_var_ratios'] = adata.uns['pca']['variance_ratio']
            adata.uns[key + '|cum_sum_eigenvalues'] = adata.uns['pca']['variance']

    else:
        import rapids_singlecell as rsc
        rsc.pp.pca(adata, layer=layer,n_comps=n_pcs)
        adata.obsm[key + '|X_pca'] = adata.obsm['X_pca']
        adata.varm[key + '|pca_loadings'] = adata.varm['PCs']
        adata.uns[key + '|pca_var_ratios'] = adata.uns['pca']['variance_ratio']
        adata.uns[key + '|cum_sum_eigenvalues'] = adata.uns['pca']['variance']
    if inplace:
        return None
    else:
        return adata