Api pp
omicverse.pp.preprocess(adata, mode='shiftlog|pearson', target_sum=50 * 10000.0, n_HVGs=2000, organism='human', no_cc=False, batch_key=None)
¶
Preprocesses the AnnData object adata using either a scanpy or a pearson residuals workflow for size normalization and highly variable genes (HVGs) selection, and calculates signature scores if necessary.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
The data matrix. |
required | |
mode |
The mode for size normalization and HVGs selection. |
'shiftlog|pearson'
|
|
target_sum |
The target total count after normalization. |
50 * 10000.0
|
|
n_HVGs |
the number of HVGs to select. |
2000
|
|
organism |
The organism of the data. It can be either 'human' or 'mouse'. |
'human'
|
|
no_cc |
Whether to remove cc-correlated genes from HVGs. |
False
|
Returns:
Name | Type | Description |
---|---|---|
adata | The preprocessed data matrix. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def preprocess(adata, mode='shiftlog|pearson', target_sum=50*1e4, n_HVGs=2000,
organism='human', no_cc=False,batch_key=None,):
"""
Preprocesses the AnnData object adata using either a scanpy or
a pearson residuals workflow for size normalization
and highly variable genes (HVGs) selection, and calculates signature scores if necessary.
Arguments:
adata: The data matrix.
mode: The mode for size normalization and HVGs selection.
It can be either 'scanpy' or 'pearson'. If 'scanpy',
performs size normalization using scanpy's normalize_total() function and selects HVGs
using pegasus' highly_variable_features() function with batch correction.
If 'pearson', selects HVGs sing scanpy's experimental.pp.highly_variable_genes() function
with pearson residuals method and performs
size normalization using scanpy's experimental.pp.normalize_pearson_residuals() function.
target_sum: The target total count after normalization.
n_HVGs: the number of HVGs to select.
organism: The organism of the data. It can be either 'human' or 'mouse'.
no_cc: Whether to remove cc-correlated genes from HVGs.
Returns:
adata: The preprocessed data matrix.
"""
# Log-normalization, HVGs identification
adata.layers['counts'] = adata.X.copy()
print('Begin robust gene identification')
identify_robust_genes(adata, percent_cells=0.05)
adata = adata[:, adata.var['robust']]
print('End of robust gene identification.')
method_list = mode.split('|')
print(f'Begin size normalization: {method_list[0]} and HVGs selection {method_list[1]}')
if settings.mode == 'cpu':
data_load_start = time.time()
if method_list[0] == 'shiftlog': # Size normalization + scanpy batch aware HVGs selection
sc.pp.normalize_total(
adata,
target_sum=target_sum,
exclude_highly_expressed=True,
max_fraction=0.2,
)
sc.pp.log1p(adata)
elif method_list[0] == 'pearson':
# Perason residuals workflow
sc.experimental.pp.normalize_pearson_residuals(adata)
if method_list[1] == 'pearson': # Size normalization + scanpy batch aware HVGs selection
sc.experimental.pp.highly_variable_genes(
adata,
flavor="pearson_residuals",
layer='counts',
n_top_genes=n_HVGs,
batch_key=batch_key,
)
if no_cc:
remove_cc_genes(adata, organism=organism, corr_threshold=0.1)
elif method_list[1] == 'seurat':
sc.pp.highly_variable_genes(
adata,
flavor="seurat_v3",
layer='counts',
n_top_genes=n_HVGs,
batch_key=batch_key,
)
if no_cc:
remove_cc_genes(adata, organism=organism, corr_threshold=0.1)
data_load_end = time.time()
print(f'Time to analyze data in cpu: {data_load_end - data_load_start} seconds.')
else:
import rapids_singlecell as rsc
data_load_start = time.time()
if method_list[0] == 'shiftlog': # Size normalization + scanpy batch aware HVGs selection
rsc.pp.normalize_total(adata, target_sum=target_sum)
rsc.pp.log1p(adata)
elif method_list[0] == 'pearson':
# Perason residuals workflow
rsc.pp.normalize_pearson_residuals(adata)
if method_list[1] == 'pearson': # Size normalization + scanpy batch aware HVGs selection
rsc.pp.highly_variable_genes(
adata,
flavor="pearson_residuals",
layer='counts',
n_top_genes=n_HVGs,
batch_key=batch_key,
)
elif method_list[1] == 'seurat':
rsc.pp.highly_variable_genes(
adata,
flavor="seurat_v3",
layer='counts',
n_top_genes=n_HVGs,
batch_key=batch_key,
)
data_load_end = time.time()
print(f'Time to analyze data in gpu: {data_load_end - data_load_start} seconds.')
adata.var = adata.var.drop(columns=['highly_variable_features'])
adata.var['highly_variable_features'] = adata.var['highly_variable']
adata.var = adata.var.drop(columns=['highly_variable'])
#adata.var = adata.var.rename(columns={'means':'mean', 'variances':'var'})
print(f'End of size normalization: {method_list[0]} and HVGs selection {method_list[1]}')
return adata
omicverse.pp.highly_variable_features(data, batch=None, flavor='pegasus', n_top=2000, span=0.02, min_disp=0.5, max_disp=np.inf, min_mean=0.0125, max_mean=7, n_jobs=-1)
¶
Highly variable features (HVF) selection. The input data should be logarithmized.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
anndata.AnnData
|
Annotated data matrix with rows for cells and columns for genes. |
required |
batch |
str
|
A key in data.obs specifying batch information. |
None
|
flavor |
str
|
The HVF selection method to use. |
'pegasus'
|
n_top |
int
|
Number of genes to be selected as HVF. if |
2000
|
span |
float
|
Only applicable when |
0.02
|
min_disp |
float
|
Minimum normalized dispersion. |
0.5
|
max_disp |
float
|
Maximum normalized dispersion. Set it to |
np.inf
|
min_mean |
float
|
Minimum mean. |
0.0125
|
max_mean |
float
|
Maximum mean. |
7
|
n_jobs |
int
|
Number of threads to be used during calculation. |
-1
|
Update adata.var
:
* highly_variable_features
: replace with Boolean type array
indicating the selected highly variable features.
Examples¶
ov.pp.highly_variable_features(data) ov.pp.highly_variable_features(data, batch="Channel")
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def highly_variable_features(
data: anndata.AnnData,
batch: str = None,
flavor: str = "pegasus",
n_top: int = 2000,
span: float = 0.02,
min_disp: float = 0.5,
max_disp: float = np.inf,
min_mean: float = 0.0125,
max_mean: float = 7,
n_jobs: int = -1,
) -> None:
""" Highly variable features (HVF) selection. The input data should be logarithmized.
Arguments:
data: Annotated data matrix with rows for cells and columns for genes.
batch: A key in data.obs specifying batch information.
If `batch` is not set, do not consider batch effects in selecting highly variable features.
Otherwise, if `data.obs[batch]` is not categorical,
`data.obs[batch]` will be automatically converted into categorical
before highly variable feature selection.
flavor: The HVF selection method to use.
Available choices are ``"pegasus"`` or ``"Seurat"``.
n_top: Number of genes to be selected as HVF. if ``None``, no gene will be selected.
span: Only applicable when ``flavor`` is ``"pegasus"``.
The smoothing factor used by *scikit-learn loess* model in pegasus HVF selection method.
min_disp: Minimum normalized dispersion.
max_disp: Maximum normalized dispersion. Set it to ``np.inf`` for infinity bound.
min_mean: Minimum mean.
max_mean: Maximum mean.
n_jobs: Number of threads to be used during calculation.
If ``-1``, all physical CPU cores will be used.
Update ``adata.var``:
* ``highly_variable_features``: replace with Boolean type array
indicating the selected highly variable features.
Examples
--------
>>> ov.pp.highly_variable_features(data)
>>> ov.pp.highly_variable_features(data, batch="Channel")
"""
if flavor == "pegasus":
select_hvf_pegasus(data, batch, n_top=n_top, span=span)
else:
assert flavor == "Seurat"
select_hvf_seurat(
data,
batch,
n_top=n_top,
min_disp=min_disp,
max_disp=max_disp,
min_mean=min_mean,
max_mean=max_mean,
n_jobs=n_jobs,
)
data.uns.pop("_tmp_fmat_highly_variable_features", None) # Pop up cached feature matrix
print(f"{data.var['highly_variable_features'].sum()} \
highly variable features have been selected.")
omicverse.pp.remove_cc_genes(adata, organism='human', corr_threshold=0.1)
¶
Update adata.var['highly_variable_features'] discarding cc correlated genes. Taken from Cospar, Wang et al., 2023.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
anndata.AnnData
|
Annotated data matrix with rows for cells and columns for genes. |
required |
organism |
str
|
Organism of the dataset. Available choices are |
'human'
|
corr_threshold |
float
|
Threshold for correlation with cc genes. |
0.1
|
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def remove_cc_genes(adata:anndata.AnnData, organism:str='human', corr_threshold:float=0.1):
"""
Update adata.var['highly_variable_features'] discarding cc correlated genes.
Taken from Cospar, Wang et al., 2023.
Arguments:
adata: Annotated data matrix with rows for cells and columns for genes.
organism: Organism of the dataset. Available choices are ``"human"`` or ``"mouse"``.
corr_threshold: Threshold for correlation with cc genes.
Genes having a correlation with cc genes > corr_threshold will be discarded.
"""
# Get cc genes
cycling_genes = load_signatures_from_file(predefined_signatures[f'cell_cycle_{organism}'])
cc_genes = list(set(cycling_genes['G1/S']) | set(cycling_genes['G2/M']))
cc_genes = [ x for x in cc_genes if x in adata.var_names ]
# Compute corr
cc_expression = adata[:, cc_genes].X.A.T
hvgs = adata.var_names[adata.var['highly_variable_features']]
hvgs_expression = adata[:, hvgs].X.A.T
cc_corr = corr2_coeff(hvgs_expression, cc_expression)
# Discard genes having the maximum correlation with one of the cc > corr_threshold
max_corr = np.max(abs(cc_corr), 1)
hvgs_no_cc = hvgs[max_corr < corr_threshold]
print(
f'Number of selected non-cycling highly variable genes: {hvgs_no_cc.size}\n'
f'{np.sum(max_corr > corr_threshold)} cell cycle correlated genes will be removed...'
)
# Update
adata.var['highly_variable_features'] = adata.var_names.isin(hvgs_no_cc)
omicverse.pp.identify_robust_genes(data, percent_cells=0.05)
¶
Identify robust genes as candidates for HVG selection and remove genes that are not expressed in any cells.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
anndata.AnnData
|
Use current selected modality in data, which should contain one RNA expression matrix. |
required |
percent_cells |
float
|
Only assign genes to be |
0.05
|
Update data.var
:
* ``n_cells``: Total number of cells in which each gene is measured.
* ``percent_cells``: Percent of cells in which each gene is measured.
* ``robust``: Boolean type indicating if a gene is robust based on the QC metrics.
* ``highly_variable_features``: Boolean type indicating if a gene
is a highly variable feature.
By default, set all robust genes as highly variable features.
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def identify_robust_genes(data: anndata.AnnData, percent_cells: float = 0.05) -> None:
"""
Identify robust genes as candidates for HVG selection and remove genes
that are not expressed in any cells.
Arguments:
data: Use current selected modality in data, which should contain one RNA expression matrix.
percent_cells: Only assign genes to be ``robust`` that are expressed in at least
``percent_cells`` % of cells.
Update ``data.var``:
* ``n_cells``: Total number of cells in which each gene is measured.
* ``percent_cells``: Percent of cells in which each gene is measured.
* ``robust``: Boolean type indicating if a gene is robust based on the QC metrics.
* ``highly_variable_features``: Boolean type indicating if a gene
is a highly variable feature.
By default, set all robust genes as highly variable features.
"""
prior_n = data.shape[1]
if issparse(data.X):
data.var["n_cells"] = data.X.getnnz(axis=0)
data._inplace_subset_var(data.var["n_cells"] > 0)
data.var["percent_cells"] = (data.var["n_cells"] / data.shape[0]) * 100
data.var["robust"] = data.var["percent_cells"] >= percent_cells
else:
data.var["robust"] = True
data.var["highly_variable_features"] = data.var["robust"]
# default all robust genes are "highly" variable
print(f"After filtration, {data.shape[1]}/{prior_n} genes are kept. \
Among {data.shape[1]} genes, {data.var['robust'].sum()} genes are robust.")
omicverse.pp.scale(adata, max_value=10, layers_add='scaled')
¶
Scale the input AnnData object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
Annotated data matrix with n_obs x n_vars shape. |
required |
Returns:
Name | Type | Description |
---|---|---|
adata | Annotated data matrix with n_obs x n_vars shape. |
|
Adds a new layer called 'scaled' that stores the expression matrix that has been scaled to unit variance and zero mean. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def scale(adata,max_value=10,layers_add='scaled'):
"""
Scale the input AnnData object.
Arguments:
adata : Annotated data matrix with n_obs x n_vars shape.
Returns:
adata : Annotated data matrix with n_obs x n_vars shape.
Adds a new layer called 'scaled' that stores
the expression matrix that has been scaled to unit variance and zero mean.
"""
if settings.mode == 'cpu':
adata_mock = sc.pp.scale(adata, copy=True,max_value=max_value)
adata.layers[layers_add] = adata_mock.X.copy()
del adata_mock
else:
import rapids_singlecell as rsc
adata.layers['scaled']=rsc.pp.scale(adata, max_value=max_value,inplace=False)
omicverse.pp.regress(adata)
¶
Regress out covariates from the input AnnData object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
Annotated data matrix with n_obs x n_vars shape. |
required |
Returns:
Name | Type | Description |
---|---|---|
adata | Annotated data matrix with n_obs x n_vars shape. |
|
Adds a new layer called 'regressed' that stores the expression matrix with covariates regressed out. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def regress(adata):
"""
Regress out covariates from the input AnnData object.
Arguments:
adata : Annotated data matrix with n_obs x n_vars shape.
Should contain columns 'mito_perc' and 'nUMIs'that represent the percentage of
mitochondrial genes and the total number of UMI counts, respectively.
Returns:
adata : Annotated data matrix with n_obs x n_vars shape.
Adds a new layer called 'regressed' that stores
the expression matrix with covariates regressed out.
"""
if settings.mode == 'cpu':
adata_mock = sc.pp.regress_out(adata, ['mito_perc', 'nUMIs'], n_jobs=8, copy=True)
adata.layers['regressed'] = adata_mock.X.copy()
del adata_mock
else:
import rapids_singlecell as rsc
adata.layers['regressed']=rsc.pp.regress_out(adata, ['mito_perc', 'nUMIs'], inplace=False)
omicverse.pp.regress_and_scale(adata)
¶
Regress out covariates from the input AnnData object and scale the resulting expression matrix.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
Annotated data matrix with n_obs x n_vars shape. |
required |
Returns:
Name | Type | Description |
---|---|---|
adata | Annotated data matrix with n_obs x n_vars shape. |
|
Adds a new layer called 'regressed_and_scaled' that stores the expression matrix with covariates regressed out and then scaled. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def regress_and_scale(adata):
"""
Regress out covariates from the input AnnData object and scale the resulting expression matrix.
Arguments:
adata : Annotated data matrix with n_obs x n_vars shape.
Should contain a layer called 'regressed'
that stores the expression matrix with covariates regressed out.
Returns:
adata : Annotated data matrix with n_obs x n_vars shape.
Adds a new layer called 'regressed_and_scaled'
that stores the expression matrix with covariates regressed out and then scaled.
"""
if 'regressed' not in adata.layers:
raise KeyError('Regress out covariates first!')
adata_mock= adata.copy()
adata_mock.X = adata_mock.layers['regressed']
scale(adata_mock)
adata.layers['regressed_and_scaled'] = adata_mock.layers['scaled']
return adata
omicverse.pp.pca(adata, n_pcs=50, layer='scaled', inplace=True, **kwargs)
¶
Performs Principal Component Analysis (PCA) on the data stored in a scanpy AnnData object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
Annotated data matrix with rows representing cells |
required | |
n_pcs |
Number of principal components to calculate. |
50
|
|
layer |
The name of the layer in |
'scaled'
|
Returns:
Name | Type | Description |
---|---|---|
adata | The original AnnData object with the calculated PCA embeddings |
|
and other information stored in its |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/pp/_preprocess.py
def pca(adata, n_pcs=50, layer='scaled',inplace=True,**kwargs):
"""
Performs Principal Component Analysis (PCA) on the data stored in a scanpy AnnData object.
Arguments:
adata : Annotated data matrix with rows representing cells
and columns representing features.
n_pcs : Number of principal components to calculate.
layer : The name of the layer in `adata` where the data to be analyzed is stored.
Defaults to the 'scaled' layer,
and falls back to 'lognorm' if that layer does not exist.
Raises a KeyError if the specified layer is not present.
Returns:
adata : The original AnnData object with the calculated PCA embeddings
and other information stored in its `obsm`, `varm`,
and `uns` fields.
"""
if 'lognorm' not in adata.layers:
adata.layers['lognorm'] = adata.X
if layer in adata.layers:
X = adata.layers[layer]
key = f'{layer}|original'
else:
raise KeyError(f'Selected layer {layer} is not present. Compute it first!')
if settings.mode == 'cpu':
if sc.__version__ <'1.10':
adata_mock=sc.AnnData(adata.layers[layer],obs=adata.obs,var=adata.var)
sc.pp.pca(adata_mock, n_comps=n_pcs)
adata.obsm[key + '|X_pca'] = adata_mock.obsm['X_pca']
adata.varm[key + '|pca_loadings'] = adata_mock.varm['PCs']
adata.uns[key + '|pca_var_ratios'] = adata_mock.uns['pca']['variance_ratio']
adata.uns[key + '|cum_sum_eigenvalues'] = adata_mock.uns['pca']['variance']
else:
sc.pp.pca(adata, layer=layer,n_comps=n_pcs,**kwargs)
adata.obsm[key + '|X_pca'] = adata.obsm['X_pca']
adata.varm[key + '|pca_loadings'] = adata.varm['PCs']
adata.uns[key + '|pca_var_ratios'] = adata.uns['pca']['variance_ratio']
adata.uns[key + '|cum_sum_eigenvalues'] = adata.uns['pca']['variance']
else:
import rapids_singlecell as rsc
rsc.pp.pca(adata, layer=layer,n_comps=n_pcs)
adata.obsm[key + '|X_pca'] = adata.obsm['X_pca']
adata.varm[key + '|pca_loadings'] = adata.varm['PCs']
adata.uns[key + '|pca_var_ratios'] = adata.uns['pca']['variance_ratio']
adata.uns[key + '|cum_sum_eigenvalues'] = adata.uns['pca']['variance']
if inplace:
return None
else:
return adata