单细胞样本对齐: 数据预处理
1. 数据获取
在本研究中,我们使用了2021年7月在Nature genetic上发表的一篇文章,该文章同时使用了snATAC-seq与snRNA-seq两种不同的技术进行测量阿尔茨海默症晚期病人的皮层。
wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE174nnn/GSE174367/suppl/GSE174367_snATAC-seq_cell_meta.csv.gz
wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE174nnn/GSE174367/suppl/GSE174367_snATAC-seq_filtered_peak_bc_matrix.h5
wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE174nnn/GSE174367/suppl/GSE174367_snRNA-seq_cell_meta.csv.gz
wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE174nnn/GSE174367/suppl/GSE174367_snRNA-seq_filtered_feature_bc_matrix.h5
2. RNA-seq数据预处理
2.1 导入包
import anndata
import networkx as nx
import scanpy as sc
import pandas as pd
import numpy as np
2.2 导入数据
adata=sc.read_10x_h5('/content/GSE174367_snRNA-seq_filtered_feature_bc_matrix.h5')
adata.var_names_make_unique()
adata.obs.index.name, adata.var.index.name = "cells", "genes"
2.3 设置meta
meta=pd.read_csv('/content/GSE174367_snRNA-seq_cell_meta.csv.gz')
meta.set_index(meta.columns[0],inplace=True)
2.4 去除cell_type未知的细胞
xl=[]
for i in adata.obs.index:
if(i in meta.index):
xl.append(meta.loc[i]['Cell.Type'])
else:
xl.append('Nan')
adata.obs["cell_type"] = xl
adata=adata[adata.obs['cell_type']!='Nan']
rna=adata
2.4 单细胞obs类型设置
Cell=[]
Sample=[]
Batch=[]
Age=[]
Sex=[]
PMI=[]
Tangle=[]
Plaque=[]
Diagnosis=[]
RIN=[]
for i in rna.obs.index:
if(i in meta.index):
test=meta.loc[i]
Cell.append(test['Cell.Type'])
Sample.append(test['SampleID'])
Batch.append(test['Batch'])
Age.append(test['Age'])
Sex.append(test['Sex'])
PMI.append(test['PMI'])
Tangle.append(test['Tangle.Stage'])
Plaque.append(test['Plaque.Stage'])
Diagnosis.append(test['Diagnosis'])
RIN.append(test['RIN'])
else:
Cell.append('Nan')
Sample.append('Nan')
Batch.append('Nan')
Age.append('Nan')
Sex.append('Nan')
PMI.append('Nan')
Tangle.append('Nan')
Plaque.append('Nan')
Diagnosis.append('Nan')
RIN.append('Nan')
rna.obs["cell_type"] = Cell
rna.obs["Sample.ID"] = Sample
rna.obs["Batch"] = Batch
rna.obs["Age"] = Age
rna.obs["Sex"] = Sex
rna.obs["PMI"] = PMI
rna.obs["Tangle.Stage"] = Tangle
rna.obs["Plaque.Stage"] = Plaque
rna.obs["Diagnosis"] = Diagnosis
rna.obs["RIN"] = RIN
rna.obs.astype({
'cell_type':'category',
'Sample.ID':'category',
'Batch':'category',
'Age':'category',
'Sex':'category',
'PMI':'category',
'Tangle.Stage':'category',
'Plague.Stage':'category',
'Diagnosis':'category',
'RIN':'category',
})
2.5 保存数据
rna.write_h5ad('GSE174367rna_61472.h5ad',compression="gzip")
3. ATAC-seq数据预处理
我们得到的ATAC-seq矩阵较为复杂,所以我们需要对数据进行一定的预处理
3.1 导入包
import anndata
import networkx as nx
import scanpy as sc
import pandas as pd
import numpy as np
3.2 导入数据
atacdata=sc.read_10x_h5('/content/GSE174367_snATAC-seq_filtered_peak_bc_matrix.h5',gex_only=False)
atacdata.obs.index.name, atacdata.var.index.name = "cells", "peaks"
3.3 染色体位置拆分
atacdata.var["chrom"] = np.vectorize(lambda x: x.split(":")[0])(atacdata.var["gene_ids"])
atacdata.var["chromStart"] = np.vectorize(lambda x: int(x.split(":")[1].split("-")[0]))(atacdata.var["gene_ids"])
atacdata.var["chromEnd"] = np.vectorize(lambda x: int(x.split("-")[1]))(atacdata.var["gene_ids"])
del atacdata.var["gene_ids"]
atacdata.var.head()
3.4 过滤表达为0的peak
sc.pp.filter_genes(atacdata, min_counts=1)
atacdata
AnnData object with n_obs × n_vars = 143401 × 217707 var: 'feature_types', 'genome', 'chrom', 'chromStart', 'chromEnd', 'n_counts'
3.5 设置细胞meta
#导入meta
meta1=pd.read_csv('/content/GSE174367_snATAC-seq_cell_meta.csv.gz')
#设置barcode为index
meta1.set_index(meta1.columns[11],inplace=True)
#设置细胞的meta
Cell=[]
Sample=[]
Batch=[]
Age=[]
Sex=[]
PMI=[]
Tangle=[]
Plaque=[]
Diagnosis=[]
RIN=[]
for i in atacdata.obs.index:
if(i in meta1.index):
test=meta1.loc[i]
Cell.append(test['Cell.Type'])
Sample.append(test['Sample.ID'])
Batch.append(test['Batch'])
Age.append(test['Age'])
Sex.append(test['Sex'])
PMI.append(test['PMI'])
Tangle.append(test['Tangle.Stage'])
Plaque.append(test['Plaque.Stage'])
Diagnosis.append(test['Diagnosis'])
RIN.append(test['RIN'])
else:
Cell.append('Nan')
Sample.append('Nan')
Batch.append('Nan')
Age.append('Nan')
Sex.append('Nan')
PMI.append('Nan')
Tangle.append('Nan')
Plaque.append('Nan')
Diagnosis.append('Nan')
RIN.append('Nan')
atacdata.obs["cell_type"] = Cell
atacdata.obs["Sample.ID"] = Sample
atacdata.obs["Batch"] = Batch
atacdata.obs["Age"] = Age
atacdata.obs["Sex"] = Sex
atacdata.obs["PMI"] = PMI
atacdata.obs["Tangle.Stage"] = Tangle
atacdata.obs["Plaque.Stage"] = Plaque
atacdata.obs["Diagnosis"] = Diagnosis
atacdata.obs["RIN"] = RIN
atacdata.obs.head()
atacdata.obs.astype({
'cell_type':'category',
'Sample.ID':'category',
'Batch':'category',
'Age':'category',
'Sex':'category',
'PMI':'category',
'Tangle.Stage':'category',
'Plague.Stage':'category',
'Diagnosis':'category',
'RIN':'category',
})
3.6 过滤细胞
由于ATAC-seq有130418个细胞,而RNA-seq只有61472个细胞,于是我们就需要随机去除ATAC-seq里面多的细胞,同时要保证细胞类型的比例不变,所以我们按类随机去除细胞
atacdata.obs['ran']=np.zeros(len(atacdata.obs))
cell_type=list(set(rna.obs['cell_type']))
for i in cell_type:
cell_len=len(rna[rna.obs['cell_type']==i])
#random select
a1=atacdata.obs[atacdata.obs['cell_type']==i].sample(n=cell_len).index
atacdata.obs.loc[a1,'ran']=1
atacdata1=atacdata[atacdata.obs['ran']==1]
atacdata1
View of AnnData object with n_obs × n_vars = 61472 × 217707 obs: 'cell_type', 'ran', 'Sample.ID', 'Batch', 'Age', 'Sex', 'PMI', 'Tangle.Stage', 'Plaque.Stage', 'Diagnosis', 'RIN' var: 'feature_types', 'genome', 'chrom', 'chromStart', 'chromEnd', 'n_counts'
3.7 保存数据
atacdata1.write_h5ad('GSE174367atac_61472.h5ad',compression="gzip")