Api cosg
omicverse.single.cosg(adata, groupby='CellTypes', groups='all', mu=1, remove_lowly_expressed=False, expressed_pct=0.1, n_genes_user=50, key_added=None, calculate_logfoldchanges=True, use_raw=True, layer=None, reference='rest', copy=False)
¶
Marker gene identification for single-cell sequencing data using COSG.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
adata |
Annotated data matrix. Note: input parameters are similar to the parameters used for scanpy's rank_genes_groups() function. |
required | |
groupby |
The key of the cell groups in .obs, the default value is set to 'CellTypes'. |
'CellTypes'
|
|
groups |
Union[Literal[all], Iterable[str]]
|
Subset of cell groups, e.g. ['g1', 'g2', 'g3'], to which comparison shall be restricted. The default value is 'all', and all groups will be compared. |
'all'
|
mu |
The penalty restricting marker genes expressing in non-target cell groups. Larger value represents more strict restrictions. mu should be >= 0, and by default, mu = 1. |
1
|
|
remove_lowly_expressed |
bool
|
If True, genes that express a percentage of target cells smaller than a specific value (expressed_pct) are not considered as marker genes for the target cells. The default value is False. |
False
|
expressed_pct |
Optional[float]
|
When remove_lowly_expressed is set to True, genes that express a percentage of target cells smaller than a specific value (expressed_pct) are not considered as marker genes for the target cells. The default value for expressed_pct is 0.1 (10%). |
0.1
|
n_genes_user |
int
|
The number of genes that appear in the returned tables. The default value is 50. |
50
|
key_added |
Optional[str]
|
The key in adata.uns information is saved to. |
None
|
calculate_logfoldchanges |
bool
|
Calculate logfoldchanges. |
True
|
use_raw |
bool
|
Use raw attribute of adata if present. |
True
|
layer |
Optional[str]
|
Key from adata.layers whose value will be used to perform tests on. |
None
|
reference |
str
|
If 'rest', compare each group to the union of the rest of the group. If a group identifier, compare with respect to this group. |
'rest'
|
Returns:
Name | Type | Description |
---|---|---|
names | structured np.ndarray (.uns['rank_genes_groups']) Structured array to be indexed by group id storing the gene names. Ordered according to scores. |
|
scores | structured np.ndarray (.uns['rank_genes_groups']) Structured array to be indexed by group id storing COSG scores for each gene for each group. Ordered according to scores. |
Notes
Contact: Min Dai, daimin@zju.edu.cn
Examples:
>>> import cosg as cosg
>>> import scanpy as sc
>>> adata = sc.datasets.pbmc68k_reduced()
>>> cosg.cosg(adata, key_added='cosg', groupby='bulk_labels')
>>> sc.pl.rank_genes_groups(adata, key='cosg')
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/single/_cosg.py
def cosg(
adata,
groupby='CellTypes',
groups: Union[Literal['all'], Iterable[str]] = 'all',
mu=1,
remove_lowly_expressed:bool=False,
expressed_pct:Optional[float] = 0.1,
n_genes_user:int =50,
key_added: Optional[str] = None,
calculate_logfoldchanges: bool = True,
use_raw: bool = True,
layer: Optional[str] = None,
reference: str = 'rest',
copy:bool=False
):
"""
Marker gene identification for single-cell sequencing data using COSG.
Arguments:
adata: Annotated data matrix. Note: input parameters are similar to the parameters used for scanpy's rank_genes_groups() function.
groupby: The key of the cell groups in .obs, the default value is set to 'CellTypes'.
groups: Subset of cell groups, e.g. ['g1', 'g2', 'g3'], to which comparison shall be restricted. The default value is 'all', and all groups will be compared.
mu: The penalty restricting marker genes expressing in non-target cell groups. Larger value represents more strict restrictions. mu should be >= 0, and by default, mu = 1.
remove_lowly_expressed: If True, genes that express a percentage of target cells smaller than a specific value (expressed_pct) are not considered as marker genes for the target cells. The default value is False.
expressed_pct: When remove_lowly_expressed is set to True, genes that express a percentage of target cells smaller than a specific value (expressed_pct) are not considered as marker genes for the target cells. The default value for expressed_pct is 0.1 (10%).
n_genes_user: The number of genes that appear in the returned tables. The default value is 50.
key_added: The key in adata.uns information is saved to.
calculate_logfoldchanges: Calculate logfoldchanges.
use_raw: Use raw attribute of adata if present.
layer: Key from adata.layers whose value will be used to perform tests on.
reference: If 'rest', compare each group to the union of the rest of the group. If a group identifier, compare with respect to this group.
Returns:
names : structured np.ndarray (.uns['rank_genes_groups'])
Structured array to be indexed by group id storing the gene names. Ordered according to scores.
scores : structured np.ndarray (.uns['rank_genes_groups'])
Structured array to be indexed by group id storing COSG scores for each gene for each group. Ordered according to scores.
Notes:
Contact: Min Dai, daimin@zju.edu.cn
Examples:
>>> import cosg as cosg
>>> import scanpy as sc
>>> adata = sc.datasets.pbmc68k_reduced()
>>> cosg.cosg(adata, key_added='cosg', groupby='bulk_labels')
>>> sc.pl.rank_genes_groups(adata, key='cosg')
"""
adata = adata.copy() if copy else adata
if layer is not None:
if use_raw:
raise ValueError("Cannot specify `layer` and have `use_raw=True`.")
cellxgene = adata.layers[layer]
else:
if use_raw and adata.raw is not None:
cellxgene = adata.raw.X
cellxgene = adata.X
### Refer to scanpy's framework
# https://github.com/theislab/scanpy/blob/5533b644e796379fd146bf8e659fd49f92f718cd/scanpy/tools/_rank_genes_groups.py#L559
if key_added is None:
key_added = 'rank_genes_groups'
adata.uns[key_added] = {}
adata.uns[key_added]['params'] = dict(
groupby=groupby,
reference=reference,
groups=groups,
method='cosg',
use_raw=use_raw,
layer=layer,
)
### Refer to: https://github.com/theislab/scanpy/blob/5533b644e796379fd146bf8e659fd49f92f718cd/scanpy/tools/_rank_genes_groups.py#L543
if groups == 'all':
### group lable for each cell
group_info=adata.obs[groupby]
elif isinstance(groups, (str, int)):
raise ValueError('Specify a sequence of groups')
else:
cells_selected=adata.obs[groupby].isin(groups)
cells_selected=cells_selected.values
if sparse.issparse(cellxgene):
cellxgene=cellxgene[cells_selected]
else:
cellxgene=cellxgene[cells_selected,:]
### group lable for each cell
group_info=adata.obs[groupby].copy()
group_info=group_info[cells_selected]
groups_order=np.unique(group_info)
n_cluster=len(groups_order)
n_cell=cellxgene.shape[0]
cluster_mat=np.zeros(shape=(n_cluster,n_cell))
### To further restrict expression in other clusters, can think about a better way, such as taking the cluster similarities into consideration
order_i=0
for group_i in groups_order:
idx_i=group_info==group_i
cluster_mat[order_i,:][idx_i]=1
order_i=order_i+1
if sparse.issparse(cellxgene):
### Convert to sparse matrix
from scipy.sparse import csr_matrix
cluster_mat=csr_matrix(cluster_mat)
from sklearn.metrics.pairwise import cosine_similarity
### the dimension is: Gene x lambda
cosine_sim=cosine_similarity(X=cellxgene.T,Y=cluster_mat,dense_output=False)
pos_nonzero=cosine_sim.nonzero()
genexlambda=cosine_sim.multiply(cosine_sim)
e_power2_sum=genexlambda.sum(axis=1)
if mu==1:
genexlambda[pos_nonzero]=genexlambda[pos_nonzero]/(np.repeat(e_power2_sum,genexlambda.shape[1],axis=1)[pos_nonzero])
else:
genexlambda[pos_nonzero]=genexlambda[pos_nonzero]/((1-mu)*genexlambda[pos_nonzero]+
mu*(
np.repeat(e_power2_sum,genexlambda.shape[1],axis=1)[pos_nonzero])
)
genexlambda[pos_nonzero]=np.multiply(genexlambda[pos_nonzero],cosine_sim[pos_nonzero])
### If the cellxgene is not a sparse matrix
else:
## Not using sparse matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim=cosine_similarity(X=cellxgene.T,Y=cluster_mat,dense_output=True)
pos_nonzero=cosine_sim!=0
e_power2=np.multiply(cosine_sim,cosine_sim)
e_power2_sum=np.sum(e_power2,axis=1)
e_power2[pos_nonzero]=np.true_divide(e_power2[pos_nonzero],(1-mu)*e_power2[pos_nonzero]+mu*(np.dot(e_power2_sum.reshape(e_power2_sum.shape[0],1),np.repeat(1,e_power2.shape[1]).reshape(1,e_power2.shape[1]))[pos_nonzero]))
e_power2[pos_nonzero]=np.multiply(e_power2[pos_nonzero],cosine_sim[pos_nonzero])
genexlambda=e_power2
### Refer to scanpy
rank_stats=None
### Whether to calculate logfoldchanges, because this is required in scanpy 1.8
if calculate_logfoldchanges:
### Calculate basic stats
### Refer to Scanpy
# for clarity, rename variable
if groups == 'all':
groups_order2 = 'all'
elif isinstance(groups, (str, int)):
raise ValueError('Specify a sequence of groups')
else:
groups_order2 = list(groups)
if isinstance(groups_order2[0], int):
groups_order2 = [str(n) for n in groups_order2]
if reference != 'rest' and reference not in set(groups_order2):
groups_order2 += [reference]
if reference != 'rest' and reference not in adata.obs[groupby].cat.categories:
cats = adata.obs[groupby].cat.categories.tolist()
raise ValueError(
f'reference = {reference} needs to be one of groupby = {cats}.'
)
pts=False
anndata_obj = _RankGenes(adata, groups_order2, groupby, reference, use_raw, layer, pts)
anndata_obj._basic_stats()
### Refer to Scanpy
# for correct getnnz calculation
### get non-zeros for columns
if sparse.issparse(cellxgene):
cellxgene.eliminate_zeros()
if sparse.issparse(cellxgene):
get_nonzeros = lambda X: X.getnnz(axis=0)
else:
get_nonzeros = lambda X: np.count_nonzero(X, axis=0)
order_i=0
for group_i in groups_order:
idx_i=group_info==group_i
### Convert to numpy array
idx_i=idx_i.values
## Compare the most ideal case to the worst case
if sparse.issparse(cellxgene):
scores=genexlambda[:,order_i].A[:,0]
else:
scores=genexlambda[:,order_i]
### Mask these genes expressed in less than 3 cells in the cluster of interest
if remove_lowly_expressed:
n_cells_expressed=get_nonzeros(cellxgene[idx_i])
n_cells_i=np.sum(idx_i)
scores[n_cells_expressed<n_cells_i*expressed_pct]= -1
global_indices = _select_top_n(scores, n_genes_user)
if rank_stats is None:
idx = pd.MultiIndex.from_tuples([(group_i,'names')])
rank_stats = pd.DataFrame(columns=idx)
rank_stats[group_i, 'names'] = adata.var_names[global_indices]
rank_stats[group_i, 'scores'] = scores[global_indices]
if calculate_logfoldchanges:
group_index=np.where(anndata_obj.groups_order==group_i)[0][0]
if anndata_obj.means is not None:
mean_group = anndata_obj.means[group_index]
if anndata_obj.ireference is None:
mean_rest = anndata_obj.means_rest[group_index]
else:
mean_rest = anndata_obj.means[anndata_obj.ireference]
foldchanges = (anndata_obj.expm1_func(mean_group) + 1e-9) / (
anndata_obj.expm1_func(mean_rest) + 1e-9
) # add small value to remove 0's
rank_stats[group_i, 'logfoldchanges'] = np.log2(
foldchanges[global_indices]
)
order_i=order_i+1
## Refer to scanpy
if calculate_logfoldchanges:
dtypes = {
'names': 'O',
'scores': 'float32',
'logfoldchanges': 'float32',
}
else:
dtypes = {
'names': 'O',
'scores': 'float32',
}
###
rank_stats.columns = rank_stats.columns.swaplevel()
for col in rank_stats.columns.levels[0]:
adata.uns[key_added][col]=rank_stats[col].to_records(
index=False, column_dtypes=dtypes[col]
)
adata.uns[key_added]['pvals']=adata.uns['rank_genes_groups']['pvals']
adata.uns[key_added]['pvals_adj']=adata.uns['rank_genes_groups']['pvals_adj']
print('**finished identifying marker genes by COSG**')
### Return the result
return adata if copy else None