Skip to content

Api bulktrajblend

omicverse.bulk2single.BulkTrajBlend

Bases: object

Bulk-to-single-cell trajectory blending using VAE and GNN integration.

This class implements a comprehensive workflow for integrating bulk RNA-seq data with single-cell data to infer cellular trajectories and transition states. The method combines: - VAE-based bulk-to-single-cell deconvolution - Graph Neural Network (GNN) analysis for trajectory inference - Non-overlapping cell-type decomposition (NOCD) for transition states

The workflow enables identification of intermediate cell states and trajectory dynamics that bridge bulk expression profiles with single-cell resolution.

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
class BulkTrajBlend(object):
    r"""
    Bulk-to-single-cell trajectory blending using VAE and GNN integration.

    This class implements a comprehensive workflow for integrating bulk RNA-seq data
    with single-cell data to infer cellular trajectories and transition states.
    The method combines:
    - VAE-based bulk-to-single-cell deconvolution
    - Graph Neural Network (GNN) analysis for trajectory inference
    - Non-overlapping cell-type decomposition (NOCD) for transition states

    The workflow enables identification of intermediate cell states and trajectory
    dynamics that bridge bulk expression profiles with single-cell resolution.
    """

    def __init__(self,bulk_seq:pd.DataFrame,single_seq:anndata.AnnData,
                 celltype_key:str,bulk_group=None,max_single_cells:int=5000,
                 top_marker_num:int=500,ratio_num:int=1,gpu:Union[int,str]=0) -> None:
        r"""
        Initialize BulkTrajBlend for trajectory inference and cell blending.

        Arguments:
            bulk_seq: Bulk RNA-seq data with genes as rows and samples as columns
            single_seq: Single-cell RNA-seq reference data as AnnData object
            celltype_key: Column name in single_seq.obs containing cell type annotations
            bulk_group: Column names in bulk_seq for sample grouping (None)
            max_single_cells: Maximum number of single cells to use (5000)
            top_marker_num: Number of top marker genes per cell type (500)
            ratio_num: Cell selection ratio for each cell type (1)
            gpu: GPU device ID; -1 for CPU, 'mps' for Apple Silicon (0)

        Returns:
            None
        """

        self.bulk_seq = bulk_seq.copy()
        self.single_seq = single_seq.copy()
        self.celltype_key=celltype_key
        self.top_marker_num=top_marker_num
        self.ratio_num=ratio_num
        self.gpu=gpu
        self.group=bulk_group
        self.max_single_cells=max_single_cells
        if gpu=='mps' and torch.backends.mps.is_available():
            print('Note that mps may loss will be nan, used it when torch is supported')
            self.used_device = torch.device("mps")
        else:
            self.used_device = torch.device(f"cuda:{gpu}") if gpu >= 0 and torch.cuda.is_available() else torch.device('cpu')
        self.history=[]
        data_dg_v=self.bulk_seq.mean(axis=1)
        data_dg=pd.DataFrame(index=data_dg_v.index)
        data_dg['group']=data_dg_v
        self.bulk_seq_group=data_dg
        pass

    def bulk_preprocess_lazy(self,)->None:
        r"""
        Preprocess bulk RNA-seq data for trajectory analysis.

        Performs normalization, log transformation, and optional group averaging
        of bulk data for downstream trajectory inference.

        Arguments:
            None

        Returns:
            None: Updates self.bulk_seq and self.bulk_seq_group in place
        """

        print("......drop duplicates index in bulk data")
        self.bulk_seq=data_drop_duplicates_index(self.bulk_seq)
        print("......deseq2 normalize the bulk data")
        self.bulk_seq=deseq2_normalize(self.bulk_seq)
        print("......log10 the bulk data")
        self.bulk_seq=np.log10(self.bulk_seq+1)
        print("......calculate the mean of each group")
        if self.group is None:
            return None
        else:
            data_dg_v=self.bulk_seq[self.group].mean(axis=1)
            data_dg=pd.DataFrame(index=data_dg_v.index)
            data_dg['group']=data_dg_v
            self.bulk_seq_group=data_dg
        return None

    def single_preprocess_lazy(self,target_sum:int=1e4)->None:
        r"""
        Preprocess single-cell reference data for trajectory analysis.

        Normalizes single-cell data and makes cell/gene names unique for
        consistent integration with bulk data.

        Arguments:
            target_sum: Target sum for total count normalization (10000)

        Returns:
            None: Updates self.single_seq in place
        """

        print("......normalize the single data")
        self.single_seq.obs_names_make_unique()
        self.single_seq.var_names_make_unique()
        sc.pp.normalize_total(self.single_seq, target_sum=target_sum)
        print("......log1p the single data")
        sc.pp.log1p(self.single_seq)
        return None

    def vae_configure(self,cell_target_num=None,**kwargs):
        r"""
        Configure the VAE model for bulk-to-single-cell generation.

        Sets up the Bulk2Single model with cell-type target numbers either from
        deconvolution prediction or manual specification.

        Arguments:
            cell_target_num: Number of cells per type to generate (None for auto-prediction)
            **kwargs: Additional arguments passed to predicted_fraction method

        Returns:
            None: Initializes self.vae_model and preprocessing steps
        """
        self.vae_model=Bulk2Single(bulk_data=self.bulk_seq,single_data=self.single_seq,
                                   celltype_key=self.celltype_key,bulk_group=self.group,
                                      max_single_cells=self.max_single_cells,
                 top_marker_num=self.top_marker_num,ratio_num=self.ratio_num,gpu=self.gpu)
        if cell_target_num!=None:
            self.vae_model.cell_target_num=dict(zip(list(set(self.single_seq.obs[self.celltype_key])),
                                                [cell_target_num]*len(list(set(self.single_seq.obs[self.celltype_key])))))
        else:
            self.cellfract=self.vae_model.predicted_fraction(**kwargs)

        self.sc_ref=self.vae_model.sc_ref.copy()
        self.bulk_ref=self.vae_model.bulk_data.T.copy()

        self.vae_model.bulk_preprocess_lazy()
        self.vae_model.single_preprocess_lazy()
        self.vae_model.prepare_input()



    def vae_train(self,
                  vae_save_dir:str='save_model',
            vae_save_name:str='vae',
            generate_save_dir:str='output',
            generate_save_name:str='output',
            batch_size:int=512,
            learning_rate:int=1e-4,
            hidden_size:int=256,
            epoch_num:int=5000,
            patience:int=50,save:bool=True):
        r"""
        Train the VAE model for trajectory-aware single-cell generation.

        Trains the underlying Bulk2Single VAE model to generate synthetic single
        cells that preserve trajectory information from bulk data.

        Arguments:
            vae_save_dir: Directory to save trained VAE model ('save_model')
            vae_save_name: Filename for saved VAE model ('vae')
            generate_save_dir: Directory for generated data output ('output')
            generate_save_name: Filename for generated data ('output')
            batch_size: Training batch size (512)
            learning_rate: Optimizer learning rate (1e-4)
            hidden_size: Hidden layer dimensions (256)
            epoch_num: Maximum training epochs (5000)
            patience: Early stopping patience (50)
            save: Whether to save trained model (True)

        Returns:
            None: Updates self.vae_net with trained model
        """

        self.vae_net=self.vae_model.train(
                batch_size=batch_size,
                learning_rate=learning_rate,
                hidden_size=hidden_size,
                epoch_num=epoch_num,
                vae_save_dir=vae_save_dir,
                vae_save_name=vae_save_name,
                generate_save_dir=generate_save_dir,
                generate_save_name=generate_save_name,
                patience=patience,save=save)


    def vae_load(self,vae_load_dir:str,hidden_size:int=256):
        r"""
        Load a pre-trained VAE model for trajectory analysis.

        Loads a previously trained VAE model for generating trajectory-aware
        single-cell data.

        Arguments:
            vae_load_dir: Directory containing the trained VAE model
            hidden_size: Hidden layer dimensions matching training (256)

        Returns:
            None: Updates self.vae_net with loaded model
        """

        print(f'loading model from {vae_load_dir}')
        vae_net = self.vae_model.load(vae_load_dir,hidden_size=hidden_size)
        self.vae_net=vae_net

    def vae_generate(self,highly_variable_genes:bool=True,max_value:float=10,
                     n_comps:int=100,svd_solver:str='auto',leiden_size:int=50)->anndata.AnnData:
        r"""
        Generate trajectory-aware single-cell data with quality filtering.

        Uses the trained VAE to generate synthetic single cells and applies
        quality control filtering to remove noisy clusters.

        Arguments:
            highly_variable_genes: Whether to select highly variable genes (True)
            max_value: Maximum value for data scaling (10)
            n_comps: Number of principal components for PCA (100)
            svd_solver: SVD solver for PCA ('auto')
            leiden_size: Minimum cluster size threshold for filtering (50)

        Returns:
            anndata.AnnData: Generated and filtered single-cell data
        """

        generate_adata=self.vae_model.generate()
        self.generate_adata_raw=generate_adata.copy()
        generate_adata.raw = generate_adata
        if highly_variable_genes:
            sc.pp.highly_variable_genes(generate_adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
            generate_adata = generate_adata[:, generate_adata.var.highly_variable]
        sc.pp.scale(generate_adata, max_value=max_value)
        sc.tl.pca(generate_adata, n_comps=n_comps, svd_solver=svd_solver)
        sc.pp.neighbors(generate_adata, use_rep="X_pca")
        sc.tl.leiden(generate_adata)
        filter_leiden=list(generate_adata.obs['leiden'].value_counts()[generate_adata.obs['leiden'].value_counts()<leiden_size].index)
        generate_adata.uns['noisy_leiden']=filter_leiden
        print("The filter leiden is ",filter_leiden)
        generate_adata=generate_adata[~generate_adata.obs['leiden'].isin(filter_leiden)]
        self.generate_adata=generate_adata.copy()
        return generate_adata.raw.to_adata()

    def gnn_configure(self,use_rep='X',neighbor_rep='X_pca',
                      gpu=0,hidden_size:int=128,
                     weight_decay:int=1e-2,
                     dropout:float=0.5,
                     batch_norm:bool=True,
                     lr:int=1e-3,
                     max_epochs:int=500,
                     display_step:int=25,
                     balance_loss:bool=True,
                     stochastic_loss:bool=True,
                     batch_size:int=2000,num_workers:int=5,):
        r"""
        Configure Graph Neural Network for trajectory and transition state analysis.

        Sets up the NOCD (Non-Overlapping Cell-type Decomposition) GNN model
        for identifying cellular trajectories and intermediate states.

        Arguments:
            use_rep: Representation to use for GNN input ('X')
            neighbor_rep: Representation for neighbor graph construction ('X_pca')
            gpu: GPU device ID for training (0)
            hidden_size: Hidden layer dimensions in GNN (128)
            weight_decay: L2 regularization strength (1e-2)
            dropout: Dropout probability (0.5)
            batch_norm: Whether to use batch normalization (True)
            lr: Learning rate for GNN training (1e-3)
            max_epochs: Maximum training epochs (500)
            display_step: Frequency of progress updates (25)
            balance_loss: Whether to use balanced loss function (True)
            stochastic_loss: Whether to use stochastic loss (True)
            batch_size: Training batch size (2000)
            num_workers: Number of data loading workers (5)

        Returns:
            None: Initializes self.nocd_obj with configured GNN model
        """
        nocd_obj=scnocd(self.generate_adata,use_rep=use_rep,
                        neighbor_rep=neighbor_rep,gpu=gpu)
        #nocd_obj.device = torch.device(f"cuda:{gpu}") if gpu >= 0 and torch.cuda.is_available() else torch.device('cpu')
        nocd_obj.matrix_transform(clustertype=self.celltype_key)
        nocd_obj.matrix_normalize()
        nocd_obj.GNN_configure(hidden_size=hidden_size,weight_decay=weight_decay,
                             dropout=dropout,batch_norm=batch_norm,lr=lr,
                             max_epochs=max_epochs,display_step=display_step,
                             balance_loss=balance_loss,stochastic_loss=stochastic_loss,
                             batch_size=batch_size)
        nocd_obj.GNN_preprocess(num_workers=num_workers)

        self.nocd_obj=nocd_obj

    def gnn_train(self,thresh:float=0.5,gnn_save_dir:str='save_model',
            gnn_save_name:str='gnn'):
        r"""
        Train the GNN model for trajectory and transition state inference.

        Trains the NOCD model to identify cellular trajectories and overlapping
        cell communities representing transition states.

        Arguments:
            thresh: Threshold for community assignment (0.5)
            gnn_save_dir: Directory to save trained GNN model ('save_model')
            gnn_save_name: Filename for saved GNN model ('gnn')

        Returns:
            None: Trains model and computes trajectory results
        """
        self.nocd_obj.GNN_model()
        self.nocd_obj.GNN_result(thresh=thresh)
        self.nocd_obj.cal_nocd()
        self.nocd_obj.save(gnn_save_dir=gnn_save_dir,gnn_save_name=gnn_save_name)

    def gnn_load(self,gnn_load_dir:str,thresh:float=0.5,):
        r"""
        Load a pre-trained GNN model for trajectory analysis.

        Loads a previously trained NOCD model and computes trajectory results.

        Arguments:
            gnn_load_dir: Directory containing the trained GNN model
            thresh: Threshold for community assignment (0.5)

        Returns:
            None: Loads model and computes trajectory results
        """
        self.nocd_obj.load(gnn_load_dir)
        self.nocd_obj.GNN_result(thresh=thresh)
        self.nocd_obj.cal_nocd()

    def gnn_generate(self)->pd.DataFrame:
        r"""
        Generate overlapping cell communities representing transition states.

        Identifies and names cell communities based on trajectory analysis,
        creating binary matrix indicating community membership for each cell.

        Arguments:
            None

        Returns:
            pd.DataFrame: Binary matrix of cell community assignments with community names
        """
        '''
        pair_dict_r={}
        for i in range(self.nocd_obj.K):
            j=0
            while 1:
                if self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].shape[0]==0:
                    break
                if j>=len(self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index):
                    pair_dict_r[str(i)]=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j-1]+'_'+str(j)
                    break
                if self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j] not in list(pair_dict_r.values()):
                    pair_dict_r[str(i)]=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j]
                    break
                else:
                    j+=1
        pair_dict_r
        '''
        unique_adata=self.nocd_obj.adata[~self.nocd_obj.adata.obs['nocd_n'].str.contains('-')]
        pair_dict_r={}
        repeat_celltype=dict(zip(list(set(unique_adata.obs[self.celltype_key])),np.zeros(len(list(set(unique_adata.obs[self.celltype_key]))))))
        for nocd_class in list(set(unique_adata.obs['nocd_n'])):
            now_celltype=unique_adata[unique_adata.obs['nocd_n']==nocd_class].obs.value_counts(self.celltype_key).index[0]
            if (now_celltype in pair_dict_r.values()):
                #print(now_celltype)
                pair_dict_r[str(nocd_class)]=now_celltype+'_'+str(int(repeat_celltype[now_celltype]))
                repeat_celltype[now_celltype]+=1
            else:
                pair_dict_r[str(nocd_class)]=now_celltype
                repeat_celltype[now_celltype]+=1

        def li_range(li,max_len):
            r=[0]*max_len   
            for i in li:
                r[int(i)]=1
            return r

        res_li=[li_range(i.split('-'),self.nocd_obj.K) for i in self.nocd_obj.adata.obs['nocd_n']]
        res_pd=pd.DataFrame(res_li,index=self.nocd_obj.adata.obs.index,columns=['nocd_'+i for i in [pair_dict_r[str(j)] for j in range(self.nocd_obj.K)]])
        print("The nocd result is ",res_pd.sum(axis=0))
        print("The nocd result has been added to adata.obs['nocd_']")
        self.nocd_obj.adata.obs=pd.concat([self.nocd_obj.adata.obs,res_pd],axis=1)
        return res_pd 

    def interpolation(self,celltype:str,adata:anndata.AnnData=None,)->anndata.AnnData:
        r"""
        Interpolate trajectory communities back to original data space.

        Integrates identified cell communities from generated data back with
        original single-cell reference data for downstream analysis.

        Arguments:
            celltype: Cell type or community name to interpolate
            adata: Original data for interpolation; uses self.single_seq if None (None)

        Returns:
            anndata.AnnData: Combined data with interpolated community information
        """
        if adata is None:
            adata=self.single_seq
        test_adata=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_{}'.format(celltype)]==1].raw.to_adata()
        if test_adata.shape[0]!=0:
            adata1=anndata.concat([test_adata,
                        adata],merge='same')
        else:
            adata1=adata 
            print("The cell type {} is not in the nocd result".format(celltype))
        return adata1

__init__(bulk_seq, single_seq, celltype_key, bulk_group=None, max_single_cells=5000, top_marker_num=500, ratio_num=1, gpu=0)

Initialize BulkTrajBlend for trajectory inference and cell blending.

Parameters:

Name Type Description Default
bulk_seq pd.DataFrame

Bulk RNA-seq data with genes as rows and samples as columns

required
single_seq anndata.AnnData

Single-cell RNA-seq reference data as AnnData object

required
celltype_key str

Column name in single_seq.obs containing cell type annotations

required
bulk_group

Column names in bulk_seq for sample grouping (None)

None
max_single_cells int

Maximum number of single cells to use (5000)

5000
top_marker_num int

Number of top marker genes per cell type (500)

500
ratio_num int

Cell selection ratio for each cell type (1)

1
gpu Union[int, str]

GPU device ID; -1 for CPU, 'mps' for Apple Silicon (0)

0

Returns:

Type Description
None

None

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def __init__(self,bulk_seq:pd.DataFrame,single_seq:anndata.AnnData,
             celltype_key:str,bulk_group=None,max_single_cells:int=5000,
             top_marker_num:int=500,ratio_num:int=1,gpu:Union[int,str]=0) -> None:
    r"""
    Initialize BulkTrajBlend for trajectory inference and cell blending.

    Arguments:
        bulk_seq: Bulk RNA-seq data with genes as rows and samples as columns
        single_seq: Single-cell RNA-seq reference data as AnnData object
        celltype_key: Column name in single_seq.obs containing cell type annotations
        bulk_group: Column names in bulk_seq for sample grouping (None)
        max_single_cells: Maximum number of single cells to use (5000)
        top_marker_num: Number of top marker genes per cell type (500)
        ratio_num: Cell selection ratio for each cell type (1)
        gpu: GPU device ID; -1 for CPU, 'mps' for Apple Silicon (0)

    Returns:
        None
    """

    self.bulk_seq = bulk_seq.copy()
    self.single_seq = single_seq.copy()
    self.celltype_key=celltype_key
    self.top_marker_num=top_marker_num
    self.ratio_num=ratio_num
    self.gpu=gpu
    self.group=bulk_group
    self.max_single_cells=max_single_cells
    if gpu=='mps' and torch.backends.mps.is_available():
        print('Note that mps may loss will be nan, used it when torch is supported')
        self.used_device = torch.device("mps")
    else:
        self.used_device = torch.device(f"cuda:{gpu}") if gpu >= 0 and torch.cuda.is_available() else torch.device('cpu')
    self.history=[]
    data_dg_v=self.bulk_seq.mean(axis=1)
    data_dg=pd.DataFrame(index=data_dg_v.index)
    data_dg['group']=data_dg_v
    self.bulk_seq_group=data_dg
    pass

bulk_preprocess_lazy()

Preprocess bulk RNA-seq data for trajectory analysis.

Performs normalization, log transformation, and optional group averaging of bulk data for downstream trajectory inference.

Returns:

Name Type Description
None None

Updates self.bulk_seq and self.bulk_seq_group in place

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def bulk_preprocess_lazy(self,)->None:
    r"""
    Preprocess bulk RNA-seq data for trajectory analysis.

    Performs normalization, log transformation, and optional group averaging
    of bulk data for downstream trajectory inference.

    Arguments:
        None

    Returns:
        None: Updates self.bulk_seq and self.bulk_seq_group in place
    """

    print("......drop duplicates index in bulk data")
    self.bulk_seq=data_drop_duplicates_index(self.bulk_seq)
    print("......deseq2 normalize the bulk data")
    self.bulk_seq=deseq2_normalize(self.bulk_seq)
    print("......log10 the bulk data")
    self.bulk_seq=np.log10(self.bulk_seq+1)
    print("......calculate the mean of each group")
    if self.group is None:
        return None
    else:
        data_dg_v=self.bulk_seq[self.group].mean(axis=1)
        data_dg=pd.DataFrame(index=data_dg_v.index)
        data_dg['group']=data_dg_v
        self.bulk_seq_group=data_dg
    return None

single_preprocess_lazy(target_sum=10000.0)

Preprocess single-cell reference data for trajectory analysis.

Normalizes single-cell data and makes cell/gene names unique for consistent integration with bulk data.

Parameters:

Name Type Description Default
target_sum int

Target sum for total count normalization (10000)

10000.0

Returns:

Name Type Description
None None

Updates self.single_seq in place

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def single_preprocess_lazy(self,target_sum:int=1e4)->None:
    r"""
    Preprocess single-cell reference data for trajectory analysis.

    Normalizes single-cell data and makes cell/gene names unique for
    consistent integration with bulk data.

    Arguments:
        target_sum: Target sum for total count normalization (10000)

    Returns:
        None: Updates self.single_seq in place
    """

    print("......normalize the single data")
    self.single_seq.obs_names_make_unique()
    self.single_seq.var_names_make_unique()
    sc.pp.normalize_total(self.single_seq, target_sum=target_sum)
    print("......log1p the single data")
    sc.pp.log1p(self.single_seq)
    return None

vae_configure(cell_target_num=None, **kwargs)

Configure the VAE model for bulk-to-single-cell generation.

Sets up the Bulk2Single model with cell-type target numbers either from deconvolution prediction or manual specification.

Parameters:

Name Type Description Default
cell_target_num

Number of cells per type to generate (None for auto-prediction)

None
**kwargs

Additional arguments passed to predicted_fraction method

{}

Returns:

Name Type Description
None

Initializes self.vae_model and preprocessing steps

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def vae_configure(self,cell_target_num=None,**kwargs):
    r"""
    Configure the VAE model for bulk-to-single-cell generation.

    Sets up the Bulk2Single model with cell-type target numbers either from
    deconvolution prediction or manual specification.

    Arguments:
        cell_target_num: Number of cells per type to generate (None for auto-prediction)
        **kwargs: Additional arguments passed to predicted_fraction method

    Returns:
        None: Initializes self.vae_model and preprocessing steps
    """
    self.vae_model=Bulk2Single(bulk_data=self.bulk_seq,single_data=self.single_seq,
                               celltype_key=self.celltype_key,bulk_group=self.group,
                                  max_single_cells=self.max_single_cells,
             top_marker_num=self.top_marker_num,ratio_num=self.ratio_num,gpu=self.gpu)
    if cell_target_num!=None:
        self.vae_model.cell_target_num=dict(zip(list(set(self.single_seq.obs[self.celltype_key])),
                                            [cell_target_num]*len(list(set(self.single_seq.obs[self.celltype_key])))))
    else:
        self.cellfract=self.vae_model.predicted_fraction(**kwargs)

    self.sc_ref=self.vae_model.sc_ref.copy()
    self.bulk_ref=self.vae_model.bulk_data.T.copy()

    self.vae_model.bulk_preprocess_lazy()
    self.vae_model.single_preprocess_lazy()
    self.vae_model.prepare_input()

vae_train(vae_save_dir='save_model', vae_save_name='vae', generate_save_dir='output', generate_save_name='output', batch_size=512, learning_rate=0.0001, hidden_size=256, epoch_num=5000, patience=50, save=True)

Train the VAE model for trajectory-aware single-cell generation.

Trains the underlying Bulk2Single VAE model to generate synthetic single cells that preserve trajectory information from bulk data.

Parameters:

Name Type Description Default
vae_save_dir str

Directory to save trained VAE model ('save_model')

'save_model'
vae_save_name str

Filename for saved VAE model ('vae')

'vae'
generate_save_dir str

Directory for generated data output ('output')

'output'
generate_save_name str

Filename for generated data ('output')

'output'
batch_size int

Training batch size (512)

512
learning_rate int

Optimizer learning rate (1e-4)

0.0001
hidden_size int

Hidden layer dimensions (256)

256
epoch_num int

Maximum training epochs (5000)

5000
patience int

Early stopping patience (50)

50
save bool

Whether to save trained model (True)

True

Returns:

Name Type Description
None

Updates self.vae_net with trained model

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def vae_train(self,
              vae_save_dir:str='save_model',
        vae_save_name:str='vae',
        generate_save_dir:str='output',
        generate_save_name:str='output',
        batch_size:int=512,
        learning_rate:int=1e-4,
        hidden_size:int=256,
        epoch_num:int=5000,
        patience:int=50,save:bool=True):
    r"""
    Train the VAE model for trajectory-aware single-cell generation.

    Trains the underlying Bulk2Single VAE model to generate synthetic single
    cells that preserve trajectory information from bulk data.

    Arguments:
        vae_save_dir: Directory to save trained VAE model ('save_model')
        vae_save_name: Filename for saved VAE model ('vae')
        generate_save_dir: Directory for generated data output ('output')
        generate_save_name: Filename for generated data ('output')
        batch_size: Training batch size (512)
        learning_rate: Optimizer learning rate (1e-4)
        hidden_size: Hidden layer dimensions (256)
        epoch_num: Maximum training epochs (5000)
        patience: Early stopping patience (50)
        save: Whether to save trained model (True)

    Returns:
        None: Updates self.vae_net with trained model
    """

    self.vae_net=self.vae_model.train(
            batch_size=batch_size,
            learning_rate=learning_rate,
            hidden_size=hidden_size,
            epoch_num=epoch_num,
            vae_save_dir=vae_save_dir,
            vae_save_name=vae_save_name,
            generate_save_dir=generate_save_dir,
            generate_save_name=generate_save_name,
            patience=patience,save=save)

vae_load(vae_load_dir, hidden_size=256)

Load a pre-trained VAE model for trajectory analysis.

Loads a previously trained VAE model for generating trajectory-aware single-cell data.

Parameters:

Name Type Description Default
vae_load_dir str

Directory containing the trained VAE model

required
hidden_size int

Hidden layer dimensions matching training (256)

256

Returns:

Name Type Description
None

Updates self.vae_net with loaded model

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def vae_load(self,vae_load_dir:str,hidden_size:int=256):
    r"""
    Load a pre-trained VAE model for trajectory analysis.

    Loads a previously trained VAE model for generating trajectory-aware
    single-cell data.

    Arguments:
        vae_load_dir: Directory containing the trained VAE model
        hidden_size: Hidden layer dimensions matching training (256)

    Returns:
        None: Updates self.vae_net with loaded model
    """

    print(f'loading model from {vae_load_dir}')
    vae_net = self.vae_model.load(vae_load_dir,hidden_size=hidden_size)
    self.vae_net=vae_net

vae_generate(highly_variable_genes=True, max_value=10, n_comps=100, svd_solver='auto', leiden_size=50)

Generate trajectory-aware single-cell data with quality filtering.

Uses the trained VAE to generate synthetic single cells and applies quality control filtering to remove noisy clusters.

Parameters:

Name Type Description Default
highly_variable_genes bool

Whether to select highly variable genes (True)

True
max_value float

Maximum value for data scaling (10)

10
n_comps int

Number of principal components for PCA (100)

100
svd_solver str

SVD solver for PCA ('auto')

'auto'
leiden_size int

Minimum cluster size threshold for filtering (50)

50

Returns:

Type Description
anndata.AnnData

anndata.AnnData: Generated and filtered single-cell data

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def vae_generate(self,highly_variable_genes:bool=True,max_value:float=10,
                 n_comps:int=100,svd_solver:str='auto',leiden_size:int=50)->anndata.AnnData:
    r"""
    Generate trajectory-aware single-cell data with quality filtering.

    Uses the trained VAE to generate synthetic single cells and applies
    quality control filtering to remove noisy clusters.

    Arguments:
        highly_variable_genes: Whether to select highly variable genes (True)
        max_value: Maximum value for data scaling (10)
        n_comps: Number of principal components for PCA (100)
        svd_solver: SVD solver for PCA ('auto')
        leiden_size: Minimum cluster size threshold for filtering (50)

    Returns:
        anndata.AnnData: Generated and filtered single-cell data
    """

    generate_adata=self.vae_model.generate()
    self.generate_adata_raw=generate_adata.copy()
    generate_adata.raw = generate_adata
    if highly_variable_genes:
        sc.pp.highly_variable_genes(generate_adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
        generate_adata = generate_adata[:, generate_adata.var.highly_variable]
    sc.pp.scale(generate_adata, max_value=max_value)
    sc.tl.pca(generate_adata, n_comps=n_comps, svd_solver=svd_solver)
    sc.pp.neighbors(generate_adata, use_rep="X_pca")
    sc.tl.leiden(generate_adata)
    filter_leiden=list(generate_adata.obs['leiden'].value_counts()[generate_adata.obs['leiden'].value_counts()<leiden_size].index)
    generate_adata.uns['noisy_leiden']=filter_leiden
    print("The filter leiden is ",filter_leiden)
    generate_adata=generate_adata[~generate_adata.obs['leiden'].isin(filter_leiden)]
    self.generate_adata=generate_adata.copy()
    return generate_adata.raw.to_adata()

gnn_configure(use_rep='X', neighbor_rep='X_pca', gpu=0, hidden_size=128, weight_decay=0.01, dropout=0.5, batch_norm=True, lr=0.001, max_epochs=500, display_step=25, balance_loss=True, stochastic_loss=True, batch_size=2000, num_workers=5)

Configure Graph Neural Network for trajectory and transition state analysis.

Sets up the NOCD (Non-Overlapping Cell-type Decomposition) GNN model for identifying cellular trajectories and intermediate states.

Parameters:

Name Type Description Default
use_rep

Representation to use for GNN input ('X')

'X'
neighbor_rep

Representation for neighbor graph construction ('X_pca')

'X_pca'
gpu

GPU device ID for training (0)

0
hidden_size int

Hidden layer dimensions in GNN (128)

128
weight_decay int

L2 regularization strength (1e-2)

0.01
dropout float

Dropout probability (0.5)

0.5
batch_norm bool

Whether to use batch normalization (True)

True
lr int

Learning rate for GNN training (1e-3)

0.001
max_epochs int

Maximum training epochs (500)

500
display_step int

Frequency of progress updates (25)

25
balance_loss bool

Whether to use balanced loss function (True)

True
stochastic_loss bool

Whether to use stochastic loss (True)

True
batch_size int

Training batch size (2000)

2000
num_workers int

Number of data loading workers (5)

5

Returns:

Name Type Description
None

Initializes self.nocd_obj with configured GNN model

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def gnn_configure(self,use_rep='X',neighbor_rep='X_pca',
                  gpu=0,hidden_size:int=128,
                 weight_decay:int=1e-2,
                 dropout:float=0.5,
                 batch_norm:bool=True,
                 lr:int=1e-3,
                 max_epochs:int=500,
                 display_step:int=25,
                 balance_loss:bool=True,
                 stochastic_loss:bool=True,
                 batch_size:int=2000,num_workers:int=5,):
    r"""
    Configure Graph Neural Network for trajectory and transition state analysis.

    Sets up the NOCD (Non-Overlapping Cell-type Decomposition) GNN model
    for identifying cellular trajectories and intermediate states.

    Arguments:
        use_rep: Representation to use for GNN input ('X')
        neighbor_rep: Representation for neighbor graph construction ('X_pca')
        gpu: GPU device ID for training (0)
        hidden_size: Hidden layer dimensions in GNN (128)
        weight_decay: L2 regularization strength (1e-2)
        dropout: Dropout probability (0.5)
        batch_norm: Whether to use batch normalization (True)
        lr: Learning rate for GNN training (1e-3)
        max_epochs: Maximum training epochs (500)
        display_step: Frequency of progress updates (25)
        balance_loss: Whether to use balanced loss function (True)
        stochastic_loss: Whether to use stochastic loss (True)
        batch_size: Training batch size (2000)
        num_workers: Number of data loading workers (5)

    Returns:
        None: Initializes self.nocd_obj with configured GNN model
    """
    nocd_obj=scnocd(self.generate_adata,use_rep=use_rep,
                    neighbor_rep=neighbor_rep,gpu=gpu)
    #nocd_obj.device = torch.device(f"cuda:{gpu}") if gpu >= 0 and torch.cuda.is_available() else torch.device('cpu')
    nocd_obj.matrix_transform(clustertype=self.celltype_key)
    nocd_obj.matrix_normalize()
    nocd_obj.GNN_configure(hidden_size=hidden_size,weight_decay=weight_decay,
                         dropout=dropout,batch_norm=batch_norm,lr=lr,
                         max_epochs=max_epochs,display_step=display_step,
                         balance_loss=balance_loss,stochastic_loss=stochastic_loss,
                         batch_size=batch_size)
    nocd_obj.GNN_preprocess(num_workers=num_workers)

    self.nocd_obj=nocd_obj

gnn_train(thresh=0.5, gnn_save_dir='save_model', gnn_save_name='gnn')

Train the GNN model for trajectory and transition state inference.

Trains the NOCD model to identify cellular trajectories and overlapping cell communities representing transition states.

Parameters:

Name Type Description Default
thresh float

Threshold for community assignment (0.5)

0.5
gnn_save_dir str

Directory to save trained GNN model ('save_model')

'save_model'
gnn_save_name str

Filename for saved GNN model ('gnn')

'gnn'

Returns:

Name Type Description
None

Trains model and computes trajectory results

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def gnn_train(self,thresh:float=0.5,gnn_save_dir:str='save_model',
        gnn_save_name:str='gnn'):
    r"""
    Train the GNN model for trajectory and transition state inference.

    Trains the NOCD model to identify cellular trajectories and overlapping
    cell communities representing transition states.

    Arguments:
        thresh: Threshold for community assignment (0.5)
        gnn_save_dir: Directory to save trained GNN model ('save_model')
        gnn_save_name: Filename for saved GNN model ('gnn')

    Returns:
        None: Trains model and computes trajectory results
    """
    self.nocd_obj.GNN_model()
    self.nocd_obj.GNN_result(thresh=thresh)
    self.nocd_obj.cal_nocd()
    self.nocd_obj.save(gnn_save_dir=gnn_save_dir,gnn_save_name=gnn_save_name)

gnn_load(gnn_load_dir, thresh=0.5)

Load a pre-trained GNN model for trajectory analysis.

Loads a previously trained NOCD model and computes trajectory results.

Parameters:

Name Type Description Default
gnn_load_dir str

Directory containing the trained GNN model

required
thresh float

Threshold for community assignment (0.5)

0.5

Returns:

Name Type Description
None

Loads model and computes trajectory results

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def gnn_load(self,gnn_load_dir:str,thresh:float=0.5,):
    r"""
    Load a pre-trained GNN model for trajectory analysis.

    Loads a previously trained NOCD model and computes trajectory results.

    Arguments:
        gnn_load_dir: Directory containing the trained GNN model
        thresh: Threshold for community assignment (0.5)

    Returns:
        None: Loads model and computes trajectory results
    """
    self.nocd_obj.load(gnn_load_dir)
    self.nocd_obj.GNN_result(thresh=thresh)
    self.nocd_obj.cal_nocd()

gnn_generate()

Generate overlapping cell communities representing transition states.

Identifies and names cell communities based on trajectory analysis, creating binary matrix indicating community membership for each cell.

Returns:

Type Description
pd.DataFrame

pd.DataFrame: Binary matrix of cell community assignments with community names

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def gnn_generate(self)->pd.DataFrame:
    r"""
    Generate overlapping cell communities representing transition states.

    Identifies and names cell communities based on trajectory analysis,
    creating binary matrix indicating community membership for each cell.

    Arguments:
        None

    Returns:
        pd.DataFrame: Binary matrix of cell community assignments with community names
    """
    '''
    pair_dict_r={}
    for i in range(self.nocd_obj.K):
        j=0
        while 1:
            if self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].shape[0]==0:
                break
            if j>=len(self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index):
                pair_dict_r[str(i)]=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j-1]+'_'+str(j)
                break
            if self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j] not in list(pair_dict_r.values()):
                pair_dict_r[str(i)]=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_n']==str(i)].obs.value_counts(self.celltype_key).index[j]
                break
            else:
                j+=1
    pair_dict_r
    '''
    unique_adata=self.nocd_obj.adata[~self.nocd_obj.adata.obs['nocd_n'].str.contains('-')]
    pair_dict_r={}
    repeat_celltype=dict(zip(list(set(unique_adata.obs[self.celltype_key])),np.zeros(len(list(set(unique_adata.obs[self.celltype_key]))))))
    for nocd_class in list(set(unique_adata.obs['nocd_n'])):
        now_celltype=unique_adata[unique_adata.obs['nocd_n']==nocd_class].obs.value_counts(self.celltype_key).index[0]
        if (now_celltype in pair_dict_r.values()):
            #print(now_celltype)
            pair_dict_r[str(nocd_class)]=now_celltype+'_'+str(int(repeat_celltype[now_celltype]))
            repeat_celltype[now_celltype]+=1
        else:
            pair_dict_r[str(nocd_class)]=now_celltype
            repeat_celltype[now_celltype]+=1

    def li_range(li,max_len):
        r=[0]*max_len   
        for i in li:
            r[int(i)]=1
        return r

    res_li=[li_range(i.split('-'),self.nocd_obj.K) for i in self.nocd_obj.adata.obs['nocd_n']]
    res_pd=pd.DataFrame(res_li,index=self.nocd_obj.adata.obs.index,columns=['nocd_'+i for i in [pair_dict_r[str(j)] for j in range(self.nocd_obj.K)]])
    print("The nocd result is ",res_pd.sum(axis=0))
    print("The nocd result has been added to adata.obs['nocd_']")
    self.nocd_obj.adata.obs=pd.concat([self.nocd_obj.adata.obs,res_pd],axis=1)
    return res_pd 

interpolation(celltype, adata=None)

Interpolate trajectory communities back to original data space.

Integrates identified cell communities from generated data back with original single-cell reference data for downstream analysis.

Parameters:

Name Type Description Default
celltype str

Cell type or community name to interpolate

required
adata anndata.AnnData

Original data for interpolation; uses self.single_seq if None (None)

None

Returns:

Type Description
anndata.AnnData

anndata.AnnData: Combined data with interpolated community information

Source code in /Users/fernandozeng/miniforge3/envs/space/lib/python3.10/site-packages/omicverse/bulk2single/_bulktrajblend.py
def interpolation(self,celltype:str,adata:anndata.AnnData=None,)->anndata.AnnData:
    r"""
    Interpolate trajectory communities back to original data space.

    Integrates identified cell communities from generated data back with
    original single-cell reference data for downstream analysis.

    Arguments:
        celltype: Cell type or community name to interpolate
        adata: Original data for interpolation; uses self.single_seq if None (None)

    Returns:
        anndata.AnnData: Combined data with interpolated community information
    """
    if adata is None:
        adata=self.single_seq
    test_adata=self.nocd_obj.adata[self.nocd_obj.adata.obs['nocd_{}'.format(celltype)]==1].raw.to_adata()
    if test_adata.shape[0]!=0:
        adata1=anndata.concat([test_adata,
                    adata],merge='same')
    else:
        adata1=adata 
        print("The cell type {} is not in the nocd result".format(celltype))
    return adata1