--- a +++ b/RNA-seq/AnalysisPipeline/1.preProcessData.R @@ -0,0 +1,269 @@ +#' @description: proProcess, cluster and remove the batch effect + +# load package +library(Seurat) +library(harmony) +library(clustree) +library(ggpubr) +library(dplyr) +library(patchwork) +library(ComplexHeatmap) +library(circlize) +library(vegan) +set.seed(101) +library(future) +plan("multiprocess", workers = 10) +options(future.globals.maxSize = 100000 * 1024^2) # set 50G RAM +setwd(dir = "/data/active_data/lzl/RenalTumor-20200713/DataAnalysis-20210803/scRNA") +source(file = "/home/longzhilin/Analysis_Code/Visualization/colorPalettes.R") + +#### four samples +T1.data <- Read10X(data.dir = "/data/raw_data/lzl/RenalTumor-20200713/cellRanger_result/scRNA-V5.0/T1/outs/filtered_feature_bc_matrix") +colnames(T1.data) <- paste0("T1_", colnames(T1.data)) +#36601*8849 +T2.data <- Read10X(data.dir = "/data/raw_data/lzl/RenalTumor-20200713/cellRanger_result/scRNA-V5.0/T2/outs/filtered_feature_bc_matrix") +colnames(T2.data) <- paste0("T2_", colnames(T2.data)) +#36601*15440 +T3.data <- Read10X(data.dir = "/data/raw_data/lzl/RenalTumor-20200713/cellRanger_result/scRNA-V5.0/T3/outs/filtered_feature_bc_matrix") +colnames(T3.data) <- paste0("T3_", colnames(T3.data)) +#36601*14644 +T4.data <- Read10X(data.dir = "/data/raw_data/lzl/RenalTumor-20200713/cellRanger_result/scRNA-V5.0/T4/outs/filtered_feature_bc_matrix") +colnames(T4.data) <- paste0("T4_", colnames(T4.data)) +#36601*11016 + +#### Preliminary filtration +#min.cell >3 & min.features >200 +T1 <- CreateSeuratObject(counts = T1.data, + project = "T1", + min.cells = 3, + min.features = 200) +#23137*8823 +T2 <- CreateSeuratObject(counts = T2.data, + project = "T2", + min.cells = 3, + min.features = 200) +#23487*15437 +T3 <- CreateSeuratObject(counts = T3.data, + project = "T3", + min.cells = 3, + min.features = 200) +#24106*14602 +T4 <- CreateSeuratObject(counts = T4.data, + project = "T4", + min.cells = 3, + min.features = 200) +#23065*10792 + +#### Mitochondrial gene ratio +T1[["percent.mt"]] <- PercentageFeatureSet(T1, pattern = "^MT-") +T2[["percent.mt"]] <- PercentageFeatureSet(T2, pattern = "^MT-") +T3[["percent.mt"]] <- PercentageFeatureSet(T3, pattern = "^MT-") +T4[["percent.mt"]] <- PercentageFeatureSet(T4, pattern = "^MT-") + +#### Draw a statistical graph of the number of genes/count number/proportion of mitochondrial genes +pdf(file = "1.QualityControl/count.feature.mt.pdf") +VlnPlot(T1, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) +plot1 <- FeatureScatter(T1, feature1 = "nCount_RNA", feature2 = "percent.mt") +ploT1 <- FeatureScatter(T1, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +plot1 + ploT1 +VlnPlot(T2, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) +plot1 <- FeatureScatter(T2, feature1 = "nCount_RNA", feature2 = "percent.mt") +ploT1 <- FeatureScatter(T2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +plot1 + ploT1 +VlnPlot(T3, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) +plot1 <- FeatureScatter(T3, feature1 = "nCount_RNA", feature2 = "percent.mt") +ploT1 <- FeatureScatter(T3, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +plot1 + ploT1 +VlnPlot(T4, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) +plot1 <- FeatureScatter(T4, feature1 = "nCount_RNA", feature2 = "percent.mt") +ploT1 <- FeatureScatter(T4, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +plot1 + ploT1 + +ggdensity(T1@meta.data, x = "nCount_RNA", title = "T1") +ggdensity(T1@meta.data, x = "nFeature_RNA", title = "T1") +ggdensity(T1@meta.data, x = "percent.mt", title = "T1") +ggdensity(T2@meta.data, x = "nCount_RNA", title = "T2") +ggdensity(T2@meta.data, x = "nFeature_RNA", title = "T2") +ggdensity(T2@meta.data, x = "percent.mt", title = "T2") +ggdensity(T3@meta.data, x = "nCount_RNA", title = "T3") +ggdensity(T3@meta.data, x = "nFeature_RNA", title = "T3") +ggdensity(T3@meta.data, x = "percent.mt", title = "T3") +ggdensity(T4@meta.data, x = "nCount_RNA", title = "T4") +ggdensity(T4@meta.data, x = "nFeature_RNA", title = "T4") +ggdensity(T4@meta.data, x = "percent.mt", title = "T4") +dev.off() + +######################### Detect the resolution parameters of each sample cluster. After the parameters are determined, you can block them without executing [test] +set.resolutions <- seq(0.5, 2, by = 0.1) +pdf(file = "1.QualityControl/PCA-test.pdf") + +#### T1 +T1.pro <- subset(T1, subset = nFeature_RNA > 200 & percent.mt < 10 & nCount_RNA > 1000 & nFeature_RNA < 6000) +T1.pro <- SCTransform(T1.pro, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = FALSE) +T1.pro <- RunPCA(T1.pro, npcs = 100, verbose = FALSE) +ElbowPlot(object = T1.pro, ndims = 100) +T1.pro <- FindNeighbors(object = T1.pro , dims = 1:50, verbose = FALSE) +T1.pro <- FindClusters(object = T1.pro , resolution = set.resolutions, verbose = FALSE) +clustree(T1.pro) +T1.pro <- RunUMAP(T1.pro , dims = 1:50) +T1.res <- sapply(set.resolutions, function(x){ + p <- DimPlot(object = T1.pro, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x)) + print(p) +}) + +#### T2 +T2.pro <- subset(T2, subset = nFeature_RNA > 200 & percent.mt < 10 & nCount_RNA > 1000 & nFeature_RNA < 6000) +T2.pro <- SCTransform(T2.pro, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = FALSE) +T2.pro <- RunPCA(T2.pro, npcs = 100, verbose = FALSE) +ElbowPlot(object = T2.pro, ndims = 100) +T2.pro <- FindNeighbors(object = T2.pro , dims = 1:50, verbose = FALSE) +T2.pro <- FindClusters(object = T2.pro , resolution = set.resolutions, verbose = FALSE) +clustree(T2.pro) +T2.pro <- RunUMAP(T2.pro , dims = 1:50) +T2.res <- sapply(set.resolutions, function(x){ + p <- DimPlot(object = T2.pro, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x)) + print(p) +}) + +#### T3 +T3.pro <- subset(T3, subset = nFeature_RNA > 200 & percent.mt < 10 & nCount_RNA > 1000 & nFeature_RNA < 6000) +T3.pro <- SCTransform(T3.pro, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = FALSE) +T3.pro <- RunPCA(T3.pro, npcs = 100, verbose = FALSE) +ElbowPlot(object = T3.pro, ndims = 100) +T3.pro <- FindNeighbors(object = T3.pro , dims = 1:50, verbose = FALSE) +T3.pro <- FindClusters(object = T3.pro , resolution = set.resolutions, verbose = FALSE) +clustree(T3.pro) +T3.pro <- RunUMAP(T3.pro , dims = 1:50) +T3.res <- sapply(set.resolutions, function(x){ + p <- DimPlot(object = T3.pro, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x)) + print(p) +}) + +#### T4 +T4.pro <- subset(T4, subset = nFeature_RNA > 200 & percent.mt < 10 & nCount_RNA > 1000 & nFeature_RNA < 6000) +T4.pro <- SCTransform(T4.pro, vars.to.regress = c("nCount_RNA", "percent.mt"), verbose = FALSE) +T4.pro <- RunPCA(T4.pro, npcs = 100, verbose = FALSE) +ElbowPlot(object = T4.pro, ndims = 100) +T4.pro <- FindNeighbors(object = T4.pro , dims = 1:50, verbose = FALSE) +T4.pro <- FindClusters(object = T4.pro , resolution = set.resolutions, verbose = FALSE) +clustree(T4.pro) +T4.pro <- RunUMAP(T4.pro , dims = 1:50) +T4.res <- sapply(set.resolutions, function(x){ + p <- DimPlot(object = T4.pro, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x)) + print(p) +}) +dev.off() + +#### remove doublet +library(DoubletFinder) # Require cleanup of low-quality cells in advance +source(file = "/home/longzhilin/Analysis_Code/SingleCell/doubletDetect.R") +pdf("1.QualityControl/doublet.pdf") + +T1.pro1 <- doubletDetect(Seurat.object = T1.pro, PCs = 1:50, doublet.rate = 0.061, annotation = "SCT_snn_res.0.7", sct = T) #7893 ~8000 +T2.pro1 <- doubletDetect(Seurat.object = T2.pro, PCs = 1:50, doublet.rate = 0.106, annotation = "SCT_snn_res.1.7", sct = T) #13992 ~14000 +T3.pro1 <- doubletDetect(Seurat.object = T3.pro, PCs = 1:50, doublet.rate = 0.091, annotation = "SCT_snn_res.0.7", sct = T) #11973 ~12000 +T4.pro1 <- doubletDetect(Seurat.object = T4.pro, PCs = 1:50, doublet.rate = 0.061, annotation = "SCT_snn_res.0.5", sct = T) #8054 ~8000 +dev.off() +saveRDS(T1.pro1, file = "T1.pro1.rds") +saveRDS(T2.pro1, file = "T2.pro1.rds") +saveRDS(T3.pro1, file = "T3.pro1.rds") +saveRDS(T4.pro1, file = "T4.pro1.rds") + +pdf("1.QualityControl/doublet.cell.pdf") +DimPlot(object = T1.pro1, reduction = 'umap', group.by = "Doublet") +DimPlot(object = T2.pro1, reduction = 'umap', group.by = "Doublet") +DimPlot(object = T3.pro1, reduction = 'umap', group.by = "Doublet") +DimPlot(object = T4.pro1, reduction = 'umap', group.by = "Doublet") +dev.off() + +T1.pro2 <- subset(T1.pro1, subset = Doublet == "Singlet") #21247*7449 +T2.pro2 <- subset(T2.pro1, subset = Doublet == "Singlet") #21605*12574 +T3.pro2 <- subset(T3.pro1, subset = Doublet == "Singlet") #21947*10937 +T4.pro2 <- subset(T4.pro1, subset = Doublet == "Singlet") #20459*7605 +saveRDS(T1.pro2, file = "T1.pro2.rds") +saveRDS(T2.pro2, file = "T2.pro2.rds") +saveRDS(T3.pro2, file = "T3.pro2.rds") +saveRDS(T4.pro2, file = "T4.pro2.rds") + +############################################## merge data and correct the batch effect +DefaultAssay(T1.pro2) <- "RNA" +DefaultAssay(T2.pro2) <- "RNA" +DefaultAssay(T3.pro2) <- "RNA" +DefaultAssay(T4.pro2) <- "RNA" + +source(file = "/home/longzhilin/Analysis_Code/SingleCell/variableFeatureSelection.R") +Renal.list <- list(T1 = T1.pro2, T2 = T2.pro2, T3 = T3.pro2, T4 = T4.pro2) +Renal.list.Stardard <- variableFeatureSelection(seurat.lists = Renal.list, method = "Stardard", nfeatures = 3000) +saveRDS(Renal.list.Stardard, file = "Renal.list.Stardard.3000.rds") + +Renal.list.SCT <- variableFeatureSelection(seurat.lists = Renal.list, method = "SCT", nfeatures = 3000) +saveRDS(Renal.list.SCT, file = "Renal.list.SCT.3000.rds") + +#### +# assay=SCT +data.merge <- merge(Renal.list.SCT[[1]], y = Renal.list.SCT[2:length(Renal.list.SCT)], project = "Renal") +DefaultAssay(data.merge) <- "SCT" +seurat.features.SCT <- SelectIntegrationFeatures(object.list = Renal.list.SCT, nfeatures = 3000) +VariableFeatures(data.merge) <- seurat.features.SCT +# Remove previous clustering results +index <- match(paste0("SCT_snn_res.", seq(0.5, 2, by=0.1)), colnames(data.merge@meta.data)) +data.merge@meta.data <- data.merge@meta.data[,-index] +# assay=RNA +seurat.features.RNA <- SelectIntegrationFeatures(object.list = Renal.list.Stardard, nfeatures = 3000) +DefaultAssay(data.merge) <- "RNA" +VariableFeatures(data.merge) <- seurat.features.RNA +data.merge <- NormalizeData(data.merge, verbose = FALSE) +data.merge <- ScaleData(data.merge, verbose = FALSE, vars.to.regress = c("nCount_RNA", "percent.mt"), features = rownames(data.merge@assays$RNA@data)) +DefaultAssay(data.merge) <- "SCT" +saveRDS(data.merge, file = "data.merge.rds") + +##############################################2.Evaluation of cellcycle and patient bias +DefaultAssay(data.merge) <- "SCT" +pdf("1.QualityControl/filtered.statistics.pdf") +VlnPlot(object = data.merge, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "orig.ident", cols = Palettes$group_pal[1:length(unique(data.merge@meta.data$orig.ident))], pt.size = 0) +FeatureScatter(object = data.merge, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident") +FeatureScatter(object = data.merge, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "orig.ident") +dev.off() +# Draw the distribution of the number of samples +cell.number <- as.data.frame(table(data.merge$orig.ident)) +pdf("1.QualityControl/highQuality.pdf") +ggbarplot(cell.number, x="Var1", y="Freq", fill = "Var1", color = "Var1", palette = Palettes$group_pal[1:length(unique(data.merge@meta.data$orig.ident))], + sort.by.groups=FALSE, #不按组排序 + label = T, xlab = "", ylab = "Cell Number") + theme(legend.position="none") +dev.off() + +#### Assess the cell cycle effect +s.genes <- cc.genes$s.genes +g2m.genes <- cc.genes$g2m.genes +data.merge <- CellCycleScoring(data.merge, s.features = s.genes, g2m.features = g2m.genes, set.ident = F) +data.merge <- RunPCA(data.merge, features = c(s.genes, g2m.genes)) +pdf("1.QualityControl/cellCycle.afterMerge.pdf") +DimPlot(data.merge, dims = c(1, 2), reduction = "pca", group.by = "Phase") +DimPlot(data.merge, dims = c(1, 3), reduction = "pca", group.by = "Phase") +DimPlot(data.merge, dims = c(2, 3), reduction = "pca", group.by = "Phase") +dev.off() + +set.resolutions <- seq(0.2, 1.2, by = 0.1) +#### First observe whether the clustering effect will depend on the sample +a <- data.merge +a <- RunPCA(a, npcs = 100, verbose = T) +pdf("1.QualityControl/merge.observe.batch.pdf") +ElbowPlot(object = a, ndims = 100) +a <- FindNeighbors(a, dims = 1:50, verbose = T) +a <- FindClusters(object = a, resolution = set.resolutions, verbose = T) +clustree(a) +a <- RunUMAP(a, dims = 1:50) +DimPlot(object = a, reduction = 'umap',label = TRUE, group.by = "orig.ident") +DimPlot(object = a, reduction = 'umap',label = TRUE, group.by = "Phase") +merge.res <- sapply(set.resolutions, function(x){ + p <- DimPlot(object = a, reduction = 'umap',label = TRUE, group.by = paste0("SCT_snn_res.", x)) + print(p) +}) +dev.off() + +##############################################3.correct batch effect +source(file = "/home/longzhilin/Analysis_Code/SingleCell/scRNA.Integrate.multipleSample.R") +pdf("2.Cluster/SCT.Harmony.Integration.PC40.feature3000.pdf") +data.merge.harmony.PC40.SCT <- Harmony.integration.reduceDimension(seurat.object = data.merge, assay = "SCT", set.resolutions = seq(0.2, 1.2, by = 0.1), PC = 40, nfeatures = 3000, npcs = 50) +dev.off() +saveRDS(data.merge.harmony.PC40.SCT, file = "data.merge.harmony.PC40.SCT.feature3000.rds")