a b/RNA-seq/AnalysisPipeline/1.preProcessData.R
1
#' @description: proProcess, cluster and remove the batch effect
2
3
# load package
4
library(Seurat)
5
library(harmony)
6
library(clustree)
7
library(ggpubr)
8
library(dplyr)
9
library(patchwork)
10
library(ComplexHeatmap)
11
library(circlize)
12
library(vegan)
13
set.seed(101)
14
library(future)
15
plan("multiprocess", workers = 10) 
16
options(future.globals.maxSize = 100000 * 1024^2) # set 50G RAM
17
setwd(dir = "/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA")
18
source(file = "/home/longzhilin/Analysis_Code/Visualization/colorPalettes.R")
19
20
#### four samples
21
T1.data <- Read10X(data.dir = "/data/raw_data/lzl/RenalTumor-20200713/cellRanger_result/scRNA-V5.0/T1/outs/filtered_feature_bc_matrix")
22
colnames(T1.data) <- paste0("T1_", colnames(T1.data))
23
#36601*8849
24
T2.data <- Read10X(data.dir = "/data/raw_data/lzl/RenalTumor-20200713/cellRanger_result/scRNA-V5.0/T2/outs/filtered_feature_bc_matrix")
25
colnames(T2.data) <- paste0("T2_", colnames(T2.data))
26
#36601*15440
27
T3.data <- Read10X(data.dir = "/data/raw_data/lzl/RenalTumor-20200713/cellRanger_result/scRNA-V5.0/T3/outs/filtered_feature_bc_matrix")
28
colnames(T3.data) <- paste0("T3_", colnames(T3.data))
29
#36601*14644
30
T4.data <- Read10X(data.dir = "/data/raw_data/lzl/RenalTumor-20200713/cellRanger_result/scRNA-V5.0/T4/outs/filtered_feature_bc_matrix")
31
colnames(T4.data) <- paste0("T4_", colnames(T4.data))
32
#36601*11016
33
34
#### Preliminary filtration
35
#min.cell >3 & min.features >200                        
36
T1 <- CreateSeuratObject(counts = T1.data,
37
                         project = "T1",
38
                         min.cells = 3,
39
                         min.features = 200)
40
#23137*8823                         
41
T2 <- CreateSeuratObject(counts = T2.data,
42
                         project = "T2",
43
                         min.cells = 3,
44
                         min.features = 200) 
45
#23487*15437                         
46
T3 <- CreateSeuratObject(counts = T3.data,
47
                         project = "T3",
48
                         min.cells = 3,
49
                         min.features = 200)
50
#24106*14602                         
51
T4 <- CreateSeuratObject(counts = T4.data,
52
                         project = "T4",
53
                         min.cells = 3,
54
                         min.features = 200) 
55
#23065*10792
56
57
#### Mitochondrial gene ratio
58
T1[["percent.mt"]] <- PercentageFeatureSet(T1, pattern = "^MT-")
59
T2[["percent.mt"]] <- PercentageFeatureSet(T2, pattern = "^MT-")
60
T3[["percent.mt"]] <- PercentageFeatureSet(T3, pattern = "^MT-")
61
T4[["percent.mt"]] <- PercentageFeatureSet(T4, pattern = "^MT-")
62
63
#### Draw a statistical graph of the number of genes/count number/proportion of mitochondrial genes
64
pdf(file = "1.QualityControl/count.feature.mt.pdf")
65
VlnPlot(T1, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
66
plot1 <- FeatureScatter(T1, feature1 = "nCount_RNA", feature2 = "percent.mt")
67
ploT1 <- FeatureScatter(T1, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
68
plot1 + ploT1
69
VlnPlot(T2, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
70
plot1 <- FeatureScatter(T2, feature1 = "nCount_RNA", feature2 = "percent.mt")
71
ploT1 <- FeatureScatter(T2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
72
plot1 + ploT1
73
VlnPlot(T3, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
74
plot1 <- FeatureScatter(T3, feature1 = "nCount_RNA", feature2 = "percent.mt")
75
ploT1 <- FeatureScatter(T3, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
76
plot1 + ploT1
77
VlnPlot(T4, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
78
plot1 <- FeatureScatter(T4, feature1 = "nCount_RNA", feature2 = "percent.mt")
79
ploT1 <- FeatureScatter(T4, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
80
plot1 + ploT1
81
82
ggdensity(T1@meta.data, x = "nCount_RNA", title = "T1")
83
ggdensity(T1@meta.data, x = "nFeature_RNA", title = "T1")
84
ggdensity(T1@meta.data, x = "percent.mt", title = "T1")
85
ggdensity(T2@meta.data, x = "nCount_RNA", title = "T2")
86
ggdensity(T2@meta.data, x = "nFeature_RNA", title = "T2")
87
ggdensity(T2@meta.data, x = "percent.mt", title = "T2")
88
ggdensity(T3@meta.data, x = "nCount_RNA", title = "T3")
89
ggdensity(T3@meta.data, x = "nFeature_RNA", title = "T3")
90
ggdensity(T3@meta.data, x = "percent.mt", title = "T3")
91
ggdensity(T4@meta.data, x = "nCount_RNA", title = "T4")
92
ggdensity(T4@meta.data, x = "nFeature_RNA", title = "T4")
93
ggdensity(T4@meta.data, x = "percent.mt", title = "T4")
94
dev.off()
95
96
######################### Detect the resolution parameters of each sample cluster. After the parameters are determined, you can block them without executing [test]
97
set.resolutions <- seq(0.5, 2, by = 0.1)
98
pdf(file = "1.QualityControl/PCA-test.pdf")
99
100
#### T1
101
T1.pro <- subset(T1, subset = nFeature_RNA > 200 & percent.mt < 10 & nCount_RNA > 1000 & nFeature_RNA < 6000) 
102
T1.pro <- SCTransform(T1.pro, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = FALSE)
103
T1.pro <- RunPCA(T1.pro, npcs = 100, verbose = FALSE)
104
ElbowPlot(object = T1.pro, ndims = 100)
105
T1.pro  <- FindNeighbors(object = T1.pro , dims = 1:50, verbose = FALSE)
106
T1.pro  <- FindClusters(object = T1.pro , resolution = set.resolutions, verbose = FALSE) 
107
clustree(T1.pro)
108
T1.pro  <- RunUMAP(T1.pro , dims = 1:50)
109
T1.res <- sapply(set.resolutions, function(x){
110
    p <- DimPlot(object = T1.pro, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x))
111
    print(p)
112
})
113
114
#### T2
115
T2.pro <- subset(T2, subset = nFeature_RNA > 200 & percent.mt < 10 & nCount_RNA > 1000 & nFeature_RNA < 6000)
116
T2.pro <- SCTransform(T2.pro, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = FALSE)
117
T2.pro <- RunPCA(T2.pro, npcs = 100, verbose = FALSE)
118
ElbowPlot(object = T2.pro, ndims = 100)
119
T2.pro  <- FindNeighbors(object = T2.pro , dims = 1:50, verbose = FALSE)
120
T2.pro  <- FindClusters(object = T2.pro , resolution = set.resolutions, verbose = FALSE) 
121
clustree(T2.pro)
122
T2.pro  <- RunUMAP(T2.pro , dims = 1:50)
123
T2.res <- sapply(set.resolutions, function(x){
124
    p <- DimPlot(object = T2.pro, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x))
125
    print(p)
126
})
127
128
#### T3
129
T3.pro <- subset(T3, subset = nFeature_RNA > 200 & percent.mt < 10 & nCount_RNA > 1000 & nFeature_RNA < 6000) 
130
T3.pro <- SCTransform(T3.pro, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = FALSE)
131
T3.pro <- RunPCA(T3.pro, npcs = 100, verbose = FALSE)
132
ElbowPlot(object = T3.pro, ndims = 100)
133
T3.pro  <- FindNeighbors(object = T3.pro , dims = 1:50, verbose = FALSE)
134
T3.pro  <- FindClusters(object = T3.pro , resolution = set.resolutions, verbose = FALSE) 
135
clustree(T3.pro)
136
T3.pro  <- RunUMAP(T3.pro , dims = 1:50)
137
T3.res <- sapply(set.resolutions, function(x){
138
    p <- DimPlot(object = T3.pro, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x))
139
    print(p)
140
})
141
142
#### T4
143
T4.pro <- subset(T4, subset = nFeature_RNA > 200 & percent.mt < 10 & nCount_RNA > 1000 & nFeature_RNA < 6000) 
144
T4.pro <- SCTransform(T4.pro, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = FALSE)
145
T4.pro <- RunPCA(T4.pro, npcs = 100, verbose = FALSE)
146
ElbowPlot(object = T4.pro, ndims = 100)
147
T4.pro  <- FindNeighbors(object = T4.pro , dims = 1:50, verbose = FALSE)
148
T4.pro  <- FindClusters(object = T4.pro , resolution = set.resolutions, verbose = FALSE) 
149
clustree(T4.pro)
150
T4.pro  <- RunUMAP(T4.pro , dims = 1:50)
151
T4.res <- sapply(set.resolutions, function(x){
152
    p <- DimPlot(object = T4.pro, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x))
153
    print(p)
154
})
155
dev.off()
156
157
#### remove doublet
158
library(DoubletFinder) # Require cleanup of low-quality cells in advance
159
source(file = "/home/longzhilin/Analysis_Code/SingleCell/doubletDetect.R")
160
pdf("1.QualityControl/doublet.pdf")
161
162
T1.pro1 <- doubletDetect(Seurat.object = T1.pro, PCs = 1:50, doublet.rate = 0.061, annotation = "SCT_snn_res.0.7", sct = T) #7893 ~8000
163
T2.pro1 <- doubletDetect(Seurat.object = T2.pro, PCs = 1:50, doublet.rate = 0.106, annotation = "SCT_snn_res.1.7", sct = T) #13992 ~14000
164
T3.pro1 <- doubletDetect(Seurat.object = T3.pro, PCs = 1:50, doublet.rate = 0.091, annotation = "SCT_snn_res.0.7", sct = T) #11973 ~12000
165
T4.pro1 <- doubletDetect(Seurat.object = T4.pro, PCs = 1:50, doublet.rate = 0.061, annotation = "SCT_snn_res.0.5", sct = T) #8054 ~8000
166
dev.off()
167
saveRDS(T1.pro1, file = "T1.pro1.rds")
168
saveRDS(T2.pro1, file = "T2.pro1.rds")
169
saveRDS(T3.pro1, file = "T3.pro1.rds")
170
saveRDS(T4.pro1, file = "T4.pro1.rds")
171
172
pdf("1.QualityControl/doublet.cell.pdf")
173
DimPlot(object = T1.pro1, reduction = 'umap', group.by = "Doublet")
174
DimPlot(object = T2.pro1, reduction = 'umap', group.by = "Doublet")
175
DimPlot(object = T3.pro1, reduction = 'umap', group.by = "Doublet")
176
DimPlot(object = T4.pro1, reduction = 'umap', group.by = "Doublet")
177
dev.off()
178
179
T1.pro2 <- subset(T1.pro1, subset = Doublet == "Singlet") #21247*7449
180
T2.pro2 <- subset(T2.pro1, subset = Doublet == "Singlet") #21605*12574
181
T3.pro2 <- subset(T3.pro1, subset = Doublet == "Singlet") #21947*10937
182
T4.pro2 <- subset(T4.pro1, subset = Doublet == "Singlet") #20459*7605
183
saveRDS(T1.pro2, file = "T1.pro2.rds")
184
saveRDS(T2.pro2, file = "T2.pro2.rds")
185
saveRDS(T3.pro2, file = "T3.pro2.rds")
186
saveRDS(T4.pro2, file = "T4.pro2.rds")
187
188
############################################## merge data and correct the batch effect
189
DefaultAssay(T1.pro2) <- "RNA"
190
DefaultAssay(T2.pro2) <- "RNA"
191
DefaultAssay(T3.pro2) <- "RNA"
192
DefaultAssay(T4.pro2) <- "RNA"
193
194
source(file = "/home/longzhilin/Analysis_Code/SingleCell/variableFeatureSelection.R")
195
Renal.list <- list(T1 = T1.pro2, T2 = T2.pro2, T3 = T3.pro2, T4 = T4.pro2)
196
Renal.list.Stardard <- variableFeatureSelection(seurat.lists = Renal.list, method = "Stardard", nfeatures = 3000)
197
saveRDS(Renal.list.Stardard, file = "Renal.list.Stardard.3000.rds")
198
199
Renal.list.SCT <- variableFeatureSelection(seurat.lists = Renal.list, method = "SCT", nfeatures = 3000)
200
saveRDS(Renal.list.SCT, file = "Renal.list.SCT.3000.rds")
201
202
#### 
203
# assay=SCT
204
data.merge <- merge(Renal.list.SCT[[1]], y = Renal.list.SCT[2:length(Renal.list.SCT)], project = "Renal")
205
DefaultAssay(data.merge) <- "SCT"
206
seurat.features.SCT <- SelectIntegrationFeatures(object.list = Renal.list.SCT, nfeatures = 3000)
207
VariableFeatures(data.merge) <- seurat.features.SCT
208
# Remove previous clustering results
209
index <- match(paste0("SCT_snn_res.", seq(0.5, 2, by=0.1)), colnames(data.merge@meta.data))
210
data.merge@meta.data <- data.merge@meta.data[,-index]
211
# assay=RNA
212
seurat.features.RNA <- SelectIntegrationFeatures(object.list = Renal.list.Stardard, nfeatures = 3000)
213
DefaultAssay(data.merge) <- "RNA"
214
VariableFeatures(data.merge) <- seurat.features.RNA
215
data.merge <- NormalizeData(data.merge, verbose = FALSE)
216
data.merge <- ScaleData(data.merge, verbose = FALSE, vars.to.regress = c("nCount_RNA", "percent.mt"), features = rownames(data.merge@assays$RNA@data))
217
DefaultAssay(data.merge) <- "SCT"
218
saveRDS(data.merge, file = "data.merge.rds")
219
220
##############################################2.Evaluation of cellcycle and patient bias
221
DefaultAssay(data.merge) <- "SCT"
222
pdf("1.QualityControl/filtered.statistics.pdf")
223
VlnPlot(object = data.merge, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "orig.ident", cols = Palettes$group_pal[1:length(unique(data.merge@meta.data$orig.ident))], pt.size = 0)
224
FeatureScatter(object = data.merge, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident")
225
FeatureScatter(object = data.merge, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "orig.ident")
226
dev.off()
227
# Draw the distribution of the number of samples
228
cell.number <- as.data.frame(table(data.merge$orig.ident))
229
pdf("1.QualityControl/highQuality.pdf")
230
ggbarplot(cell.number, x="Var1", y="Freq", fill = "Var1", color = "Var1", palette = Palettes$group_pal[1:length(unique(data.merge@meta.data$orig.ident))],
231
          sort.by.groups=FALSE, #不按组排序
232
          label = T, xlab = "", ylab = "Cell Number") + theme(legend.position="none") 
233
dev.off()
234
235
#### Assess the cell cycle effect
236
s.genes <- cc.genes$s.genes
237
g2m.genes <- cc.genes$g2m.genes
238
data.merge <- CellCycleScoring(data.merge, s.features = s.genes, g2m.features = g2m.genes, set.ident = F)
239
data.merge <- RunPCA(data.merge, features = c(s.genes, g2m.genes))
240
pdf("1.QualityControl/cellCycle.afterMerge.pdf")
241
DimPlot(data.merge, dims = c(1, 2), reduction = "pca", group.by = "Phase")
242
DimPlot(data.merge, dims = c(1, 3), reduction = "pca", group.by = "Phase")
243
DimPlot(data.merge, dims = c(2, 3), reduction = "pca", group.by = "Phase")
244
dev.off()
245
246
set.resolutions <- seq(0.2, 1.2, by = 0.1)
247
#### First observe whether the clustering effect will depend on the sample
248
a <- data.merge
249
a <- RunPCA(a, npcs = 100, verbose = T)
250
pdf("1.QualityControl/merge.observe.batch.pdf")
251
ElbowPlot(object = a, ndims = 100)
252
a <- FindNeighbors(a, dims = 1:50, verbose = T)
253
a <- FindClusters(object = a, resolution = set.resolutions, verbose = T) 
254
clustree(a)
255
a <- RunUMAP(a, dims = 1:50)
256
DimPlot(object = a, reduction = 'umap',label = TRUE, group.by = "orig.ident")
257
DimPlot(object = a, reduction = 'umap',label = TRUE, group.by = "Phase")
258
merge.res <- sapply(set.resolutions, function(x){
259
    p <- DimPlot(object = a, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x))
260
    print(p)
261
})
262
dev.off()
263
264
##############################################3.correct batch effect
265
source(file = "/home/longzhilin/Analysis_Code/SingleCell/scRNA.Integrate.multipleSample.R")
266
pdf("2.Cluster/SCT.Harmony.Integration.PC40.feature3000.pdf")
267
data.merge.harmony.PC40.SCT <- Harmony.integration.reduceDimension(seurat.object = data.merge, assay = "SCT", set.resolutions = seq(0.2, 1.2, by = 0.1), PC = 40, nfeatures = 3000, npcs = 50)
268
dev.off()
269
saveRDS(data.merge.harmony.PC40.SCT, file = "data.merge.harmony.PC40.SCT.feature3000.rds")