API: BulkTrajBlend
omicverse.bulk2single.BulkTrajBlend
Bases: object
BulkTrajBlend: A class for bulk and single-cell data integration and trajectory inference using a beta-VAE and a GNN.
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
class BulkTrajBlend(object):
"""
BulkTrajBlend: A class for bulk and single cell data integration and trajectory inference using beta-VAE and GNN.
"""
def __init__(self,bulk_seq:pd.DataFrame,single_seq:anndata.AnnData,
celltype_key:str,bulk_group=None,max_single_cells:int=5000,
top_marker_num:int=500,ratio_num:int=1,gpu:Union[int,str]=0) -> None:
"""
Initialize the BulkTrajBlend class
Arguments:
bulk_seq: The bulk data. The index is gene name and the columns is cell name.
single_seq: The single cell data. The index is cell name and the columns is gene name.
celltype_key: The key of cell type in the single cell data.
top_marker_num: The number of top marker genes for each cell type.
ratio_num: The number of cells to be selected for each cell type.
gpu: The gpu id or 'cpu' or 'mps'.
max_single_cells: The maximum number of single cells to be used. Default is 5000.
"""
self.bulk_seq = bulk_seq.copy()
self.single_seq = single_seq.copy()
self.celltype_key=celltype_key
self.top_marker_num=top_marker_num
self.ratio_num=ratio_num
self.gpu=gpu
self.group=bulk_group
self.max_single_cells=max_single_cells
if gpu=='mps' and torch.backends.mps.is_available():
print('Note that mps may loss will be nan, used it when torch is supported')
self.used_device = torch.device("mps")
else:
self.used_device = torch.device(f"cuda:{gpu}") if gpu >= 0 and torch.cuda.is_available() else torch.device('cpu')
self.history=[]
data_dg_v=self.bulk_seq.mean(axis=1)
data_dg=pd.DataFrame(index=data_dg_v.index)
data_dg['group']=data_dg_v
self.bulk_seq_group=data_dg
pass
def bulk_preprocess_lazy(self,)->None:
"""
Preprocess the bulk data
Arguments:
group: The group of the bulk data. Default is None. It need to set to calculate the mean of each group.
"""
print("......drop duplicates index in bulk data")
self.bulk_seq=data_drop_duplicates_index(self.bulk_seq)
print("......deseq2 normalize the bulk data")
self.bulk_seq=deseq2_normalize(self.bulk_seq)
print("......log10 the bulk data")
self.bulk_seq=np.log10(self.bulk_seq+1)
print("......calculate the mean of each group")
if self.group is None:
return None
else:
data_dg_v=self.bulk_seq[self.group].mean(axis=1)
data_dg=pd.DataFrame(index=data_dg_v.index)
data_dg['group']=data_dg_v
self.bulk_seq_group=data_dg
return None
def single_preprocess_lazy(self,target_sum:int=1e4)->None:
"""
Preprocess the single data
Arguments:
target_sum: The target sum of the normalize. Default is 1e4.
"""
print("......normalize the single data")
self.single_seq.obs_names_make_unique()
self.single_seq.var_names_make_unique()
sc.pp.normalize_total(self.single_seq, target_sum=target_sum)
print("......log1p the single data")
sc.pp.log1p(self.single_seq)
return None
def vae_configure(self,cell_target_num=None,**kwargs):
"""
Configure the vae model
Arguments:
cell_target_num: The number of cell types to be generated. Default is 100.
"""
self.vae_model=Bulk2Single(bulk_data=self.bulk_seq,single_data=self.single_seq,
celltype_key=self.celltype_key,bulk_group=self.group,
max_single_cells=self.max_single_cells,
top_marker_num=self.top_marker_num,ratio_num=self.ratio_num,gpu=self.gpu)
if cell_target_num!=None:
self.vae_model.cell_target_num=dict(zip(list(set(self.single_seq.obs[self.celltype_key])),
[cell_target_num]*len(list(set(self.single_seq.obs[self.celltype_key])))))
else:
self.cellfract=self.vae_model.predicted_fraction(**kwargs)
self.sc_ref=self.vae_model.sc_ref.copy()
self.bulk_ref=self.vae_model.bulk_data.T.copy()
self.vae_model.bulk_preprocess_lazy()
self.vae_model.single_preprocess_lazy()
self.vae_model.prepare_input()
def vae_train(self,
vae_save_dir:str='save_model',
vae_save_name:str='vae',
generate_save_dir:str='output',
generate_save_name:str='output',
batch_size:int=512,
learning_rate:int=1e-4,
hidden_size:int=256,
epoch_num:int=5000,
patience:int=50,save:bool=True):
r"""
Train the VAE model of BulkTrajBlend.
Arguments:
vae_save_dir: The directory to save the trained VAE model. Default is 'save_model'.
vae_save_name: The name to save the trained VAE model. Default is 'vae'.
generate_save_dir: The directory to save the generated single-cell data. Default is 'output'.
generate_save_name: The name to save the generated single-cell data. Default is 'output'.
batch_size: The batch size for training the VAE model. Default is 512.
learning_rate: The learning rate for training the VAE model. Default is 1e-4.
hidden_size: The hidden size for the encoder and decoder networks. Default is 256.
epoch_num: The epoch number for training the VAE model. Default is 5000.
patience: The patience for training the VAE model. Default is 50.
save: Whether to save the trained VAE model. Default is True.
"""
self.vae_net=self.vae_model.train(
batch_size=batch_size,
learning_rate=learning_rate,
hidden_size=hidden_size,
epoch_num=epoch_num,
vae_save_dir=vae_save_dir,
vae_save_name=vae_save_name,
generate_save_dir=generate_save_dir,
generate_save_name=generate_save_name,
patience=patience,save=save)
def vae_load(self,vae_load_dir:str,hidden_size:int=256):
r"""
load the trained VAE model of BulkTrajBlend.
Arguments:
vae_load_dir: The directory to load the trained VAE model.
hidden_size: The hidden size for the encoder and decoder networks. Default is 256.
"""
print(f'loading model from {vae_load_dir}')
vae_net = self.vae_model.load(vae_load_dir,hidden_size=hidden_size)
self.vae_net=vae_net
def vae_generate(self,highly_variable_genes:bool=True,max_value:float=10,
n_comps:int=100,svd_solver:str='auto',leiden_size:int=50)->anndata.AnnData:
"""
Generate the single-cell data from the trained VAE model.
Arguments:
highly_variable_genes: Whether to use highly variable genes. Default is True.
max_value: The maximum value for the scaled data. Default is 10.
n_comps: The number of principal components. Default is 100.
svd_solver: The solver for the PCA. Default is 'auto'.
leiden_size: The minimum size of the leiden clusters. Default is 50.
Returns:
generate_adata: The generated single-cell data.
"""
generate_adata=self.vae_model.generate()
self.generate_adata_raw=generate_adata.copy()
generate_adata.raw = generate_adata
if highly_variable_genes:
sc.pp.highly_variable_genes(generate_adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
generate_adata = generate_adata[:, generate_adata.var.highly_variable]
sc.pp.scale(generate_adata, max_value=max_value)
sc.tl.pca(generate_adata, n_comps=n_comps, svd_solver=svd_solver)
sc.pp.neighbors(generate_adata, use_rep="X_pca")
sc.tl.leiden(generate_adata)
filter_leiden=list(generate_adata.obs['leiden'].value_counts()[generate_adata.obs['leiden'].value_counts()<leiden_size].index)
generate_adata.uns['noisy_leiden']=filter_leiden
print("The filter leiden is ",filter_leiden)
generate_adata=generate_adata[~generate_adata.obs['leiden'].isin(filter_leiden)]
self.generate_adata=generate_adata.copy()
return generate_adata.raw.to_adata()
def gnn_configure(self,use_rep='X',neighbor_rep='X_pca',
gpu=0,hidden_size:int=128,
weight_decay:int=1e-2,
dropout:float=0.5,
batch_norm:bool=True,
lr:int=1e-3,
max_epochs:int=500,
display_step:int=25,
balance_loss:bool=True,
stochastic_loss:bool=True,
batch_size:int=2000,num_workers:int=5,):
"""
Configure the GNN model of BulkTrajBlend.
Arguments:
gpu: The GPU ID for training the GNN model. Default is 0.
hidden_size: The hidden size for the GNN model. Default is 128.
weight_decay: The weight decay for the GNN model. Default is 1e-2.
dropout: The dropout for the GNN model. Default is 0.5.
batch_norm: Whether to use batch normalization for the GNN model. Default is True.
lr: The learning rate for the GNN model. Default is 1e-3.
max_epochs: The maximum epoch number for training the GNN model. Default is 500.
display_step: The display step for training the GNN model. Default is 25.
balance_loss: Whether to use the balance loss for training the GNN model. Default is True.
stochastic_loss: Whether to use the stochastic loss for training the GNN model. Default is True.
batch_size: The batch size for training the GNN model. Default is 2000.
num_workers: The number of workers for training the GNN model. Default is 5.
"""
nocd_obj=scnocd(self.generate_adata,use_rep=use_rep,
neighbor_rep=neighbor_rep,gpu=gpu)
#nocd_obj.device = torch.device(f"cuda:{gpu}") if gpu >= 0 and torch.cuda.is_available() else torch.device('cpu')
nocd_obj.matrix_transform(clustertype=self.celltype_key)
nocd_obj.matrix_normalize()
nocd_obj.GNN_configure(hidden_size=hidden_size,weight_decay=weight_decay,
dropout=dropout,batch_norm=batch_norm,lr=lr,
max_epochs=max_epochs,display_step=display_step,
balance_loss=balance_loss,stochastic_loss=stochastic_loss,
batch_size=batch_size)
nocd_obj.GNN_preprocess(num_workers=num_workers)
self.nocd_obj=nocd_obj
def gnn_train(self,thresh:float=0.5,gnn_save_dir:str='save_model',
gnn_save_name:str='gnn'):
"""
Train the GNN model of BulkTrajBlend.
Arguments:
thresh: The threshold for the GNN model. Default is 0.5.
gnn_save_dir: The directory for saving the GNN model. Default is 'save_model'.
gnn_save_name: The name for saving the GNN model. Default is 'gnn'.
"""
self.nocd_obj.GNN_model()
self.nocd_obj.GNN_result(thresh=thresh)
self.nocd_obj.cal_nocd()
self.nocd_obj.save(gnn_save_dir=gnn_save_dir,gnn_save_name=gnn_save_name)
def gnn_load(self,gnn_load_dir:str,thresh:float=0.5,):
"""
Load the GNN model of BulkTrajBlend.
Arguments:
gnn_load_dir: The directory for loading the GNN model.
thresh: The threshold for the GNN model. Default is 0.5.
"""
self.nocd_obj.load(gnn_load_dir)
self.nocd_obj.GNN_result(thresh=thresh)
self.nocd_obj.cal_nocd()
def gnn_generate(self)->pd.DataFrame:
"""
Generate the overlap cell community.
Returns:
res_pd: The overlap cell community.
"""
'''
pair_dict_r={}
for i in range(self.nocd_obj.K):
j=0
while 1:
if self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].shape[0]==0:
break
if j>=len(self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index):
pair_dict_r[str(i)]=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j-1]+'_'+str(j)
break
if self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j] not in list(pair_dict_r.values()):
pair_dict_r[str(i)]=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j]
break
else:
j+=1
pair_dict_r
'''
unique_adata=self.nocd_obj.adata[~self.nocd_obj.adata.obs['nocd_n'].str.contains('-')]
pair_dict_r={}
repeat_celltype=dict(zip(list(set(unique_adata.obs[self.celltype_key])),np.zeros(len(list(set(unique_adata.obs[self.celltype_key]))))))
for nocd_class in list(set(unique_adata.obs['nocd_n'])):
now_celltype=unique_adata[unique_adata.obs['nocd_n']==nocd_class].obs.value_counts(self.celltype_key).index[0]
if (now_celltype in pair_dict_r.values()):
#print(now_celltype)
pair_dict_r[str(nocd_class)]=now_celltype+'_'+str(int(repeat_celltype[now_celltype]))
repeat_celltype[now_celltype]+=1
else:
pair_dict_r[str(nocd_class)]=now_celltype
repeat_celltype[now_celltype]+=1
def li_range(li,max_len):
r=[0]*max_len
for i in li:
r[int(i)]=1
return r
res_li=[li_range(i.split('-'),self.nocd_obj.K) for i in self.nocd_obj.adata.obs['nocd_n']]
res_pd=pd.DataFrame(res_li,index=self.nocd_obj.adata.obs.index,columns=['nocd_'+i for i in [pair_dict_r[str(j)] for j in range(self.nocd_obj.K)]])
print("The nocd result is ",res_pd.sum(axis=0))
print("The nocd result has been added to adata.obs['nocd_']")
self.nocd_obj.adata.obs=pd.concat([self.nocd_obj.adata.obs,res_pd],axis=1)
return res_pd
def interpolation(self,celltype:str,adata:anndata.AnnData=None,)->anndata.AnnData:
"""
Interpolate the cell community to raw data.
Arguments:
celltype: The cell type for interpolation.
adata: The raw data for interpolation. If is None, will use the single_seq data. Default is None.
Returns:
adata1: The adata after interpolated .
"""
if adata is None:
adata=self.single_seq
test_adata=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_{}'.format(celltype)]==1].raw.to_adata()
if test_adata.shape[0]!=0:
adata1=anndata.concat([test_adata,
adata],merge='same')
else:
adata1=adata
print("The cell type {} is not in the nocd result".format(celltype))
return adata1
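A minimal end-to-end usage sketch is shown below. The input files, variable names, the `bulk_group` labels, and the `'Neuron'` cell type are illustrative assumptions; only the BulkTrajBlend methods documented on this page are taken from the API. The per-method snippets further down assume the `btb` instance created here.

```python
import anndata
import pandas as pd
import omicverse as ov

# Assumed inputs: a bulk expression table (genes x samples) and an annotated
# single-cell AnnData with a 'celltype' column in .obs.
bulk_df = pd.read_csv('bulk_counts.csv', index_col=0)    # hypothetical path
adata_sc = anndata.read_h5ad('single_cell.h5ad')         # hypothetical path

btb = ov.bulk2single.BulkTrajBlend(
    bulk_seq=bulk_df,
    single_seq=adata_sc,
    celltype_key='celltype',
    bulk_group=['sample_1', 'sample_2'],  # bulk columns forming one group (assumed)
    gpu=0,
)

# Preprocess both modalities, train the beta-VAE, and generate pseudo single cells.
btb.bulk_preprocess_lazy()
btb.single_preprocess_lazy()
btb.vae_configure(cell_target_num=100)
btb.vae_train(vae_save_dir='save_model', vae_save_name='vae')
generated = btb.vae_generate(leiden_size=50)

# Fit the GNN to find overlapping communities and interpolate one back.
btb.gnn_configure(max_epochs=500)
btb.gnn_train()
communities = btb.gnn_generate()
adata_interp = btb.interpolation('Neuron')
```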
__init__(bulk_seq, single_seq, celltype_key, bulk_group=None, max_single_cells=5000, top_marker_num=500, ratio_num=1, gpu=0)
Initialize the BulkTrajBlend class.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `bulk_seq` | `pd.DataFrame` | The bulk data. The index contains gene names and the columns contain sample names. | required |
| `single_seq` | `anndata.AnnData` | The single-cell data. The index contains cell names and the columns contain gene names. | required |
| `celltype_key` | `str` | The key of the cell-type annotation in the single-cell data (`.obs`). | required |
| `bulk_group` | `list`, optional | The bulk sample (column) names that form a group; used to compute the group mean expression. | `None` |
| `max_single_cells` | `int` | The maximum number of single cells to be used. | `5000` |
| `top_marker_num` | `int` | The number of top marker genes for each cell type. | `500` |
| `ratio_num` | `int` | The number of cells to be selected for each cell type. | `1` |
| `gpu` | `Union[int, str]` | The GPU id, `'cpu'`, or `'mps'`. | `0` |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def __init__(self,bulk_seq:pd.DataFrame,single_seq:anndata.AnnData,
celltype_key:str,bulk_group=None,max_single_cells:int=5000,
top_marker_num:int=500,ratio_num:int=1,gpu:Union[int,str]=0) -> None:
"""
Initialize the BulkTrajBlend class
Arguments:
bulk_seq: The bulk data. The index is gene name and the columns is cell name.
single_seq: The single cell data. The index is cell name and the columns is gene name.
celltype_key: The key of cell type in the single cell data.
top_marker_num: The number of top marker genes for each cell type.
ratio_num: The number of cells to be selected for each cell type.
gpu: The gpu id or 'cpu' or 'mps'.
max_single_cells: The maximum number of single cells to be used. Default is 5000.
"""
self.bulk_seq = bulk_seq.copy()
self.single_seq = single_seq.copy()
self.celltype_key=celltype_key
self.top_marker_num=top_marker_num
self.ratio_num=ratio_num
self.gpu=gpu
self.group=bulk_group
self.max_single_cells=max_single_cells
if gpu=='mps' and torch.backends.mps.is_available():
print('Note that mps may loss will be nan, used it when torch is supported')
self.used_device = torch.device("mps")
else:
self.used_device = torch.device(f"cuda:{gpu}") if gpu >= 0 and torch.cuda.is_available() else torch.device('cpu')
self.history=[]
data_dg_v=self.bulk_seq.mean(axis=1)
data_dg=pd.DataFrame(index=data_dg_v.index)
data_dg['group']=data_dg_v
self.bulk_seq_group=data_dg
pass
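A short sketch of the constructor's device handling, assuming `bulk_df` and `adata_sc` from the workflow sketch above:

```python
# CUDA device 0 if available, otherwise CPU.
btb = ov.bulk2single.BulkTrajBlend(bulk_df, adata_sc, celltype_key='celltype', gpu=0)

# Apple-silicon MPS backend (the implementation warns the loss may become NaN).
btb_mps = ov.bulk2single.BulkTrajBlend(bulk_df, adata_sc, celltype_key='celltype', gpu='mps')

# A negative id skips the CUDA check and falls back to CPU.
btb_cpu = ov.bulk2single.BulkTrajBlend(bulk_df, adata_sc, celltype_key='celltype', gpu=-1)
```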
bulk_preprocess_lazy()
Preprocess the bulk data: drop duplicate gene indices, apply DESeq2-style normalization, and log10-transform. If `bulk_group` was provided to the constructor, the mean expression of each group is computed and stored in `bulk_seq_group`.

This method takes no arguments.
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def bulk_preprocess_lazy(self,)->None:
"""
Preprocess the bulk data
Arguments:
group: The group of the bulk data. Default is None. It need to set to calculate the mean of each group.
"""
print("......drop duplicates index in bulk data")
self.bulk_seq=data_drop_duplicates_index(self.bulk_seq)
print("......deseq2 normalize the bulk data")
self.bulk_seq=deseq2_normalize(self.bulk_seq)
print("......log10 the bulk data")
self.bulk_seq=np.log10(self.bulk_seq+1)
print("......calculate the mean of each group")
if self.group is None:
return None
else:
data_dg_v=self.bulk_seq[self.group].mean(axis=1)
data_dg=pd.DataFrame(index=data_dg_v.index)
data_dg['group']=data_dg_v
self.bulk_seq_group=data_dg
return None
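A usage sketch, assuming the `btb` instance from the workflow above (with `bulk_group` set in the constructor):

```python
btb.bulk_preprocess_lazy()
# Per-gene mean of the normalized, log10-transformed group columns.
print(btb.bulk_seq_group.head())
```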
single_preprocess_lazy(target_sum=10000.0)
Preprocess the single-cell data: make observation and variable names unique, total-count normalize, and log1p-transform.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `target_sum` | `int` | The target sum for total-count normalization. | `1e4` |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def single_preprocess_lazy(self,target_sum:int=1e4)->None:
"""
Preprocess the single data
Arguments:
target_sum: The target sum of the normalize. Default is 1e4.
"""
print("......normalize the single data")
self.single_seq.obs_names_make_unique()
self.single_seq.var_names_make_unique()
sc.pp.normalize_total(self.single_seq, target_sum=target_sum)
print("......log1p the single data")
sc.pp.log1p(self.single_seq)
return None
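A usage sketch, assuming the `btb` instance from the workflow above:

```python
# Normalize each cell to 1e4 total counts, then log1p (applied in place to btb.single_seq).
btb.single_preprocess_lazy(target_sum=1e4)
```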
vae_configure(cell_target_num=None, **kwargs)
Configure the VAE (beta-VAE) model.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `cell_target_num` | `int`, optional | The number of cells to generate for each cell type. If `None`, the per-cell-type fractions are predicted from the bulk data instead. | `None` |
| `**kwargs` |  | Additional keyword arguments forwarded to `predicted_fraction` when `cell_target_num` is `None`. |  |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def vae_configure(self,cell_target_num=None,**kwargs):
"""
Configure the vae model
Arguments:
cell_target_num: The number of cell types to be generated. Default is 100.
"""
self.vae_model=Bulk2Single(bulk_data=self.bulk_seq,single_data=self.single_seq,
celltype_key=self.celltype_key,bulk_group=self.group,
max_single_cells=self.max_single_cells,
top_marker_num=self.top_marker_num,ratio_num=self.ratio_num,gpu=self.gpu)
if cell_target_num!=None:
self.vae_model.cell_target_num=dict(zip(list(set(self.single_seq.obs[self.celltype_key])),
[cell_target_num]*len(list(set(self.single_seq.obs[self.celltype_key])))))
else:
self.cellfract=self.vae_model.predicted_fraction(**kwargs)
self.sc_ref=self.vae_model.sc_ref.copy()
self.bulk_ref=self.vae_model.bulk_data.T.copy()
self.vae_model.bulk_preprocess_lazy()
self.vae_model.single_preprocess_lazy()
self.vae_model.prepare_input()
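A sketch of the two configuration modes, assuming the `btb` instance from the workflow above:

```python
# Fixed mode: generate the same number of cells for every cell type.
btb.vae_configure(cell_target_num=200)

# Alternatively, predict per-cell-type fractions from the bulk data;
# extra keyword arguments are forwarded to predicted_fraction().
# btb.vae_configure(cell_target_num=None)
```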
vae_train(vae_save_dir='save_model', vae_save_name='vae', generate_save_dir='output', generate_save_name='output', batch_size=512, learning_rate=0.0001, hidden_size=256, epoch_num=5000, patience=50, save=True)
Train the VAE model of BulkTrajBlend.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `vae_save_dir` | `str` | The directory to save the trained VAE model. | `'save_model'` |
| `vae_save_name` | `str` | The file name to save the trained VAE model under. | `'vae'` |
| `generate_save_dir` | `str` | The directory to save the generated single-cell data. | `'output'` |
| `generate_save_name` | `str` | The file name to save the generated single-cell data under. | `'output'` |
| `batch_size` | `int` | The batch size for training the VAE model. | `512` |
| `learning_rate` | `float` | The learning rate for training the VAE model. | `1e-4` |
| `hidden_size` | `int` | The hidden size of the encoder and decoder networks. | `256` |
| `epoch_num` | `int` | The number of epochs for training the VAE model. | `5000` |
| `patience` | `int` | The early-stopping patience for training the VAE model. | `50` |
| `save` | `bool` | Whether to save the trained VAE model. | `True` |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def vae_train(self,
vae_save_dir:str='save_model',
vae_save_name:str='vae',
generate_save_dir:str='output',
generate_save_name:str='output',
batch_size:int=512,
learning_rate:int=1e-4,
hidden_size:int=256,
epoch_num:int=5000,
patience:int=50,save:bool=True):
r"""
Train the VAE model of BulkTrajBlend.
Arguments:
vae_save_dir: The directory to save the trained VAE model. Default is 'save_model'.
vae_save_name: The name to save the trained VAE model. Default is 'vae'.
generate_save_dir: The directory to save the generated single-cell data. Default is 'output'.
generate_save_name: The name to save the generated single-cell data. Default is 'output'.
batch_size: The batch size for training the VAE model. Default is 512.
learning_rate: The learning rate for training the VAE model. Default is 1e-4.
hidden_size: The hidden size for the encoder and decoder networks. Default is 256.
epoch_num: The epoch number for training the VAE model. Default is 5000.
patience: The patience for training the VAE model. Default is 50.
save: Whether to save the trained VAE model. Default is True.
"""
self.vae_net=self.vae_model.train(
batch_size=batch_size,
learning_rate=learning_rate,
hidden_size=hidden_size,
epoch_num=epoch_num,
vae_save_dir=vae_save_dir,
vae_save_name=vae_save_name,
generate_save_dir=generate_save_dir,
generate_save_name=generate_save_name,
patience=patience,save=save)
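A usage sketch with the default hyperparameters spelled out; the save locations are assumptions:

```python
btb.vae_train(
    batch_size=512,
    learning_rate=1e-4,
    hidden_size=256,
    epoch_num=5000,
    patience=50,
    vae_save_dir='save_model',
    vae_save_name='vae',
    generate_save_dir='output',
    generate_save_name='output',
    save=True,
)
```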
vae_load(vae_load_dir, hidden_size=256)
Load a trained VAE model of BulkTrajBlend.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `vae_load_dir` | `str` | The directory to load the trained VAE model from. | required |
| `hidden_size` | `int` | The hidden size of the encoder and decoder networks. | `256` |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def vae_load(self,vae_load_dir:str,hidden_size:int=256):
r"""
load the trained VAE model of BulkTrajBlend.
Arguments:
vae_load_dir: The directory to load the trained VAE model.
hidden_size: The hidden size for the encoder and decoder networks. Default is 256.
"""
print(f'loading model from {vae_load_dir}')
vae_net = self.vae_model.load(vae_load_dir,hidden_size=hidden_size)
self.vae_net=vae_net
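A usage sketch for reusing a previously trained VAE instead of retraining; the path is an assumption, and `hidden_size` must match the value used during training:

```python
btb.vae_load('save_model/vae.pth', hidden_size=256)  # hypothetical path
```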
vae_generate(highly_variable_genes=True, max_value=10, n_comps=100, svd_solver='auto', leiden_size=50)
Generate single-cell data from the trained VAE model.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `highly_variable_genes` | `bool` | Whether to subset to highly variable genes before scaling and clustering. | `True` |
| `max_value` | `float` | The maximum value for the scaled data. | `10` |
| `n_comps` | `int` | The number of principal components. | `100` |
| `svd_solver` | `str` | The SVD solver used for PCA. | `'auto'` |
| `leiden_size` | `int` | The minimum size of a Leiden cluster; smaller clusters are treated as noise and removed. | `50` |

Returns:

| Name | Type | Description |
|---|---|---|
| `generate_adata` | `anndata.AnnData` | The generated single-cell data. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def vae_generate(self,highly_variable_genes:bool=True,max_value:float=10,
n_comps:int=100,svd_solver:str='auto',leiden_size:int=50)->anndata.AnnData:
"""
Generate the single-cell data from the trained VAE model.
Arguments:
highly_variable_genes: Whether to use highly variable genes. Default is True.
max_value: The maximum value for the scaled data. Default is 10.
n_comps: The number of principal components. Default is 100.
svd_solver: The solver for the PCA. Default is 'auto'.
leiden_size: The minimum size of the leiden clusters. Default is 50.
Returns:
generate_adata: The generated single-cell data.
"""
generate_adata=self.vae_model.generate()
self.generate_adata_raw=generate_adata.copy()
generate_adata.raw = generate_adata
if highly_variable_genes:
sc.pp.highly_variable_genes(generate_adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
generate_adata = generate_adata[:, generate_adata.var.highly_variable]
sc.pp.scale(generate_adata, max_value=max_value)
sc.tl.pca(generate_adata, n_comps=n_comps, svd_solver=svd_solver)
sc.pp.neighbors(generate_adata, use_rep="X_pca")
sc.tl.leiden(generate_adata)
filter_leiden=list(generate_adata.obs['leiden'].value_counts()[generate_adata.obs['leiden'].value_counts()<leiden_size].index)
generate_adata.uns['noisy_leiden']=filter_leiden
print("The filter leiden is ",filter_leiden)
generate_adata=generate_adata[~generate_adata.obs['leiden'].isin(filter_leiden)]
self.generate_adata=generate_adata.copy()
return generate_adata.raw.to_adata()
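A usage sketch, assuming the `btb` instance from the workflow above:

```python
generated = btb.vae_generate(
    highly_variable_genes=True,
    n_comps=100,
    leiden_size=50,  # Leiden clusters smaller than this are dropped as noise
)
# The labels of the dropped clusters are kept in btb.generate_adata.uns['noisy_leiden'].
```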
gnn_configure(use_rep='X', neighbor_rep='X_pca', gpu=0, hidden_size=128, weight_decay=0.01, dropout=0.5, batch_norm=True, lr=0.001, max_epochs=500, display_step=25, balance_loss=True, stochastic_loss=True, batch_size=2000, num_workers=5)
Configure the GNN model of BulkTrajBlend.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `use_rep` | `str` | The representation used as input features for the GNN. | `'X'` |
| `neighbor_rep` | `str` | The representation used to build the neighborhood graph. | `'X_pca'` |
| `gpu` | `int` | The GPU id for training the GNN model. | `0` |
| `hidden_size` | `int` | The hidden size of the GNN model. | `128` |
| `weight_decay` | `float` | The weight decay for the GNN model. | `1e-2` |
| `dropout` | `float` | The dropout rate for the GNN model. | `0.5` |
| `batch_norm` | `bool` | Whether to use batch normalization in the GNN model. | `True` |
| `lr` | `float` | The learning rate for the GNN model. | `1e-3` |
| `max_epochs` | `int` | The maximum number of epochs for training the GNN model. | `500` |
| `display_step` | `int` | The display step (in epochs) for training the GNN model. | `25` |
| `balance_loss` | `bool` | Whether to use the balance loss for training the GNN model. | `True` |
| `stochastic_loss` | `bool` | Whether to use the stochastic loss for training the GNN model. | `True` |
| `batch_size` | `int` | The batch size for training the GNN model. | `2000` |
| `num_workers` | `int` | The number of data-loader workers for training the GNN model. | `5` |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def gnn_configure(self,use_rep='X',neighbor_rep='X_pca',
gpu=0,hidden_size:int=128,
weight_decay:int=1e-2,
dropout:float=0.5,
batch_norm:bool=True,
lr:int=1e-3,
max_epochs:int=500,
display_step:int=25,
balance_loss:bool=True,
stochastic_loss:bool=True,
batch_size:int=2000,num_workers:int=5,):
"""
Configure the GNN model of BulkTrajBlend.
Arguments:
gpu: The GPU ID for training the GNN model. Default is 0.
hidden_size: The hidden size for the GNN model. Default is 128.
weight_decay: The weight decay for the GNN model. Default is 1e-2.
dropout: The dropout for the GNN model. Default is 0.5.
batch_norm: Whether to use batch normalization for the GNN model. Default is True.
lr: The learning rate for the GNN model. Default is 1e-3.
max_epochs: The maximum epoch number for training the GNN model. Default is 500.
display_step: The display step for training the GNN model. Default is 25.
balance_loss: Whether to use the balance loss for training the GNN model. Default is True.
stochastic_loss: Whether to use the stochastic loss for training the GNN model. Default is True.
batch_size: The batch size for training the GNN model. Default is 2000.
num_workers: The number of workers for training the GNN model. Default is 5.
"""
nocd_obj=scnocd(self.generate_adata,use_rep=use_rep,
neighbor_rep=neighbor_rep,gpu=gpu)
#nocd_obj.device = torch.device(f"cuda:{gpu}") if gpu >= 0 and torch.cuda.is_available() else torch.device('cpu')
nocd_obj.matrix_transform(clustertype=self.celltype_key)
nocd_obj.matrix_normalize()
nocd_obj.GNN_configure(hidden_size=hidden_size,weight_decay=weight_decay,
dropout=dropout,batch_norm=batch_norm,lr=lr,
max_epochs=max_epochs,display_step=display_step,
balance_loss=balance_loss,stochastic_loss=stochastic_loss,
batch_size=batch_size)
nocd_obj.GNN_preprocess(num_workers=num_workers)
self.nocd_obj=nocd_obj
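A usage sketch, assuming the `btb` instance from the workflow above and that `vae_generate()` has already been called:

```python
btb.gnn_configure(
    use_rep='X',           # GNN input features
    neighbor_rep='X_pca',  # representation for the neighbor graph
    hidden_size=128,
    max_epochs=500,
    batch_size=2000,
)
```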
gnn_train(thresh=0.5, gnn_save_dir='save_model', gnn_save_name='gnn')
Train the GNN model of BulkTrajBlend.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `thresh` | `float` | The threshold for assigning cells to communities from the GNN output. | `0.5` |
| `gnn_save_dir` | `str` | The directory for saving the GNN model. | `'save_model'` |
| `gnn_save_name` | `str` | The file name for saving the GNN model under. | `'gnn'` |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def gnn_train(self,thresh:float=0.5,gnn_save_dir:str='save_model',
gnn_save_name:str='gnn'):
"""
Train the GNN model of BulkTrajBlend.
Arguments:
thresh: The threshold for the GNN model. Default is 0.5.
gnn_save_dir: The directory for saving the GNN model. Default is 'save_model'.
gnn_save_name: The name for saving the GNN model. Default is 'gnn'.
"""
self.nocd_obj.GNN_model()
self.nocd_obj.GNN_result(thresh=thresh)
self.nocd_obj.cal_nocd()
self.nocd_obj.save(gnn_save_dir=gnn_save_dir,gnn_save_name=gnn_save_name)
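A usage sketch; the save location is an assumption:

```python
btb.gnn_train(thresh=0.5, gnn_save_dir='save_model', gnn_save_name='gnn')
```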
gnn_load(gnn_load_dir, thresh=0.5)
Load a trained GNN model of BulkTrajBlend.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `gnn_load_dir` | `str` | The directory to load the GNN model from. | required |
| `thresh` | `float` | The threshold for assigning cells to communities from the GNN output. | `0.5` |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def gnn_load(self,gnn_load_dir:str,thresh:float=0.5,):
"""
Load the GNN model of BulkTrajBlend.
Arguments:
gnn_load_dir: The directory for loading the GNN model.
thresh: The threshold for the GNN model. Default is 0.5.
"""
self.nocd_obj.load(gnn_load_dir)
self.nocd_obj.GNN_result(thresh=thresh)
self.nocd_obj.cal_nocd()
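A usage sketch for restoring a saved GNN instead of retraining; `gnn_configure()` must have been called first, and the path is an assumption:

```python
btb.gnn_load('save_model/gnn.pth', thresh=0.5)  # hypothetical path
```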
gnn_generate()
Generate the overlapping cell communities.

Returns:

| Name | Type | Description |
|---|---|---|
| `res_pd` | `pd.DataFrame` | The overlapping cell-community membership matrix (cells x communities, one-hot encoded). |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def gnn_generate(self)->pd.DataFrame:
"""
Generate the overlap cell community.
Returns:
res_pd: The overlap cell community.
"""
'''
pair_dict_r={}
for i in range(self.nocd_obj.K):
j=0
while 1:
if self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].shape[0]==0:
break
if j>=len(self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index):
pair_dict_r[str(i)]=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j-1]+'_'+str(j)
break
if self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j] not in list(pair_dict_r.values()):
pair_dict_r[str(i)]=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j]
break
else:
j+=1
pair_dict_r
'''
unique_adata=self.nocd_obj.adata[~self.nocd_obj.adata.obs['nocd_n'].str.contains('-')]
pair_dict_r={}
repeat_celltype=dict(zip(list(set(unique_adata.obs[self.celltype_key])),np.zeros(len(list(set(unique_adata.obs[self.celltype_key]))))))
for nocd_class in list(set(unique_adata.obs['nocd_n'])):
now_celltype=unique_adata[unique_adata.obs['nocd_n']==nocd_class].obs.value_counts(self.celltype_key).index[0]
if (now_celltype in pair_dict_r.values()):
#print(now_celltype)
pair_dict_r[str(nocd_class)]=now_celltype+'_'+str(int(repeat_celltype[now_celltype]))
repeat_celltype[now_celltype]+=1
else:
pair_dict_r[str(nocd_class)]=now_celltype
repeat_celltype[now_celltype]+=1
def li_range(li,max_len):
r=[0]*max_len
for i in li:
r[int(i)]=1
return r
res_li=[li_range(i.split('-'),self.nocd_obj.K) for i in self.nocd_obj.adata.obs['nocd_n']]
res_pd=pd.DataFrame(res_li,index=self.nocd_obj.adata.obs.index,columns=['nocd_'+i for i in [pair_dict_r[str(j)] for j in range(self.nocd_obj.K)]])
print("The nocd result is ",res_pd.sum(axis=0))
print("The nocd result has been added to adata.obs['nocd_']")
self.nocd_obj.adata.obs=pd.concat([self.nocd_obj.adata.obs,res_pd],axis=1)
return res_pd
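A usage sketch, assuming the `btb` instance from the workflow above:

```python
communities = btb.gnn_generate()
# One column per community, named 'nocd_<cell type>'; 1 marks membership.
print(communities.sum(axis=0))
```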
interpolation(celltype, adata=None)
Interpolate the cells of a generated community back into the raw data.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `celltype` | `str` | The cell type (community) to interpolate. | required |
| `adata` | `anndata.AnnData` | The raw data to interpolate into. If `None`, the `single_seq` data is used. | `None` |

Returns:

| Name | Type | Description |
|---|---|---|
| `adata1` | `anndata.AnnData` | The concatenated AnnData after interpolation. |
Source code in /Users/fernandozeng/miniforge3/envs/scbasset/lib/python3.8/site-packages/omicverse/bulk2single/_bulktrajblend.py
def interpolation(self,celltype:str,adata:anndata.AnnData=None,)->anndata.AnnData:
"""
Interpolate the cell community to raw data.
Arguments:
celltype: The cell type for interpolation.
adata: The raw data for interpolation. If is None, will use the single_seq data. Default is None.
Returns:
adata1: The adata after interpolated .
"""
if adata is None:
adata=self.single_seq
test_adata=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_{}'.format(celltype)]==1].raw.to_adata()
if test_adata.shape[0]!=0:
adata1=anndata.concat([test_adata,
adata],merge='same')
else:
adata1=adata
print("The cell type {} is not in the nocd result".format(celltype))
return adata1
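A usage sketch; `'Neuron'` is an assumed cell-type label present under `celltype_key`:

```python
# Inject the generated cells of one community back into the original single-cell data.
adata_interp = btb.interpolation('Neuron')

# Or interpolate into an explicit AnnData instead of btb.single_seq.
adata_interp = btb.interpolation('Neuron', adata=adata_sc)
```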