Switch to unified view

a b/preprocessing/Preprocessing_scRNA_HCA.R
1
# # HumanCellAtlas data
2
# file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HCA.h5ad"
3
# library(Seurat)
4
# hca <- ReadH5AD(file)
5
# Idents(hca) <- "louvain"
6
# save(hca, file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA.Rdata")
7
8
# file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HCA_raw.h5ad"
9
# 
10
# counts=t(data.table::fread("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/data/raw/X_allgenes.csv.gz", data.table = F))
11
# rownames(counts)=scan("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/data/raw/var_allgenes.txt","genes")
12
# colnames(counts)=scan("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/data/raw/obs_allgenes.txt","sample")
13
# save(counts, file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA_rawCounts.Rdata")
14
15
#******************************************* all cells *********************************************
16
17
library(Matrix)
18
library(Seurat)
19
library(dplyr)
20
source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R")
21
22
load("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA_rawCounts.Rdata")
23
load("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA.Rdata")
24
25
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/")
26
27
name="HumanCellAtlas"
28
29
clusters=Idents(hca)
30
hca[["rnaClusterID"]] <- clusters
31
32
#
33
library("HGNChelper")
34
update=checkGeneSymbols(rownames(data))
35
genelist1=update[,1]
36
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
37
rownames(data)=genelist1
38
39
# try own processing:
40
batch=gsub("_HiSeq_.*.", "", colnames(scmat))
41
name=paste0(colnames(counts), "-", gsub("_HiSeq_.*.", "", colnames(counts)))
42
43
test1=sc.data.analysis(scmat = counts[rowSums(counts)>1000,name%in%colnames(scmat)], regress.cell.label = batch, batch.correction.method = "MNNcorrect", name="HCA", nr.pcs = 50, check.pcs=F, plot.umap = T, cores = 10, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000)
44
45
46
# sbatch
47
library(Matrix)
48
library(Seurat)
49
library(dplyr)
50
source("functions.scRNA.analysis.R")
51
52
load("HumanCellAtlas_scRNA_rawCounts.Rdata")
53
load("HumanCellAtlas_scRNA.Rdata")
54
55
name="HumanCellAtlas"
56
57
clusters=Idents(hca)
58
hca[["rnaClusterID"]] <- clusters
59
60
#
61
library("HGNChelper")
62
update=checkGeneSymbols(rownames(data))
63
genelist1=update[,1]
64
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
65
rownames(data)=genelist1
66
67
# try own processing:
68
batch=gsub("_HiSeq_.*.", "", colnames(scmat))
69
name=paste0(colnames(counts), "-", gsub("_HiSeq_.*.", "", colnames(counts)))
70
71
test1=sc.data.analysis(scmat = counts[,name%in%colnames(scmat)], regress.cell.label = batch, batch.correction.method = "MNNcorrect", name="HCA", nr.pcs = 50, check.pcs=F, plot.umap = T, cores = 10, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000)
72
73
74
#******************************************* only TNK cells *********************************************
75
library(Matrix)
76
library(Seurat)
77
library(dplyr)
78
source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R")
79
80
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/")
81
82
name="HumanCellAtlas_TNK"
83
84
load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/myrun/HCA_scRNA.Rdata")
85
scmat=scmat[,grepl("Treg|NK|CD", scmat[["SingleR.label"]][,1])]
86
87
data=GetAssayData(scmat, assay = "RNA", slot = "counts")
88
89
library("HGNChelper")
90
update=checkGeneSymbols(rownames(data))
91
genelist1=update[,1]
92
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
93
rownames(data)=genelist1
94
95
# try own processing:
96
batch=gsub("_HiSeq_.*.", "", colnames(scmat))
97
98
test1=sc.data.analysis(scmat = data, regress.cell.label = batch, batch.correction.method = "MNNcorrect", name=name, nr.pcs = 50, check.pcs=F, plot.umap = T, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000, cores = 4, resolution = 1)
99
#************************************************************************************************************
100
101
102
#******************************************* only NK cells *********************************************
103
library(Matrix)
104
library(Seurat)
105
library(dplyr)
106
source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R")
107
108
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/")
109
110
name="HumanCellAtlas_NK"
111
112
load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/myrun/HCA_scRNA.Rdata")
113
scmat=scmat[,grepl("NK", scmat[["SingleR.label"]][,1])]
114
115
# try own processing:
116
batch=gsub("_HiSeq_.*.", "", colnames(scmat))
117
118
data=GetAssayData(scmat, assay = "RNA", slot = "counts")
119
120
library("HGNChelper")
121
update=checkGeneSymbols(rownames(data))
122
genelist1=update[,1]
123
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
124
rownames(data)=genelist1
125
126
test1=sc.data.analysis(scmat = data, regress.cell.label = batch, batch.correction.method = "MNNcorrect", name=name, nr.pcs = 50, check.pcs=F, plot.umap = T,  percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000, cores = 4, resolution = 1)
127
#************************************************************************************************************
128
129
130
#******************************************* only T cells *********************************************
131
library(Matrix)
132
library(Seurat)
133
library(dplyr)
134
source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R")
135
136
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/")
137
138
name="HumanCellAtlas_Tcells"
139
140
load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/myrun/HCA_scRNA.Rdata")
141
scmat=scmat[,grepl("Treg|CD", scmat[["SingleR.label"]][,1])]
142
143
data=GetAssayData(scmat, assay = "RNA", slot = "counts")
144
145
library("HGNChelper")
146
update=checkGeneSymbols(rownames(data))
147
genelist1=update[,1]
148
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
149
rownames(data)=genelist1
150
151
# try own processing:
152
batch=gsub("_HiSeq_.*.", "", colnames(scmat))
153
154
test1=sc.data.analysis(scmat = data, regress.cell.label = batch, batch.correction.method = "MNNcorrect", name=name, nr.pcs = 50, check.pcs=F, plot.umap = T, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000, cores = 4, resolution = 1)
155
#************************************************************************************************************
156
157
158
159
# add names and RNA cluster
160
clusters=Idents(hca)
161
hca[["rnaClusterID"]] <- clusters
162
163
# take 50% of cells from each cluster, and preserve proportions
164
filt=unlist(lapply(unique(clusters), function(clust){
165
  set.seed(1)
166
  ind=which(clusters%in%clust)
167
  nr.samples=table(clusters)/2
168
  sample(colnames(hca)[ind], nr.samples[names(nr.samples)%in%clust])
169
}))
170
171
scmat=subset(hca, cells=filt)
172
173
# old:
174
library("SingleR")
175
cores=8
176
counts <- data.matrix(GetAssayData(scmat, assay = "RNA"))
177
178
# singler = CreateSinglerObject(counts, annot = NULL, clusters=Idents(scmat), ref.list = list(blueprint_encode), project.name=name, fine.tune = T,do.signatures = T, numCores=cores)
179
# singler2 = CreateSinglerObject(counts, annot = NULL, clusters=Idents(scmat), ref.list = list(hpca), project.name=name, fine.tune = T,do.signatures = T, numCores=cores)
180
181
# save(singler, file=paste0(name, "_singler_object_blueprint_encode.Rdata"))
182
# save(singler2, file=paste0(name, "_singler_object_hpca.Rdata"))
183
184
load(paste0(name, "_singler_object_blueprint_encode.Rdata"))
185
load(paste0(name, "_singler_object_hpca.Rdata"))
186
187
# add original identifiers
188
singler$meta.data$orig.ident = scmat@meta.data$orig.ident # the original identities, if not supplied in 'annot'
189
190
## if using Seurat v3.0 and over use:
191
singler$meta.data$xy = scmat@reductions$umap@cell.embeddings # the tSNE coordinates
192
singler$meta.data$clusters = scmat@active.ident # the Seurat clusters (if 'clusters' not provided)
193
194
# these names are not intuitive, replace them, mentioned that these are actually naive http://comphealth.ucsf.edu/SingleR/SupplementaryInformation2.html:
195
singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%"CD4+ T-cells"]="CD4+ Tn"
196
singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%"CD8+ T-cells"]="CD8+ Tn"
197
singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%"CD4+ T-cells"]="CD4+ Tn"
198
singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%"CD8+ T-cells"]="CD8+ Tn"
199
200
# change B-cells
201
singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%c("B-cells","Class-switched memory B-cells", "naive B-cells", "Memory B-cells")]="B-cells"
202
singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%c("B-cells","Class-switched memory B-cells", "naive B-cells", "Memory B-cells")]="B-cells"
203
204
scmat[["SingleR.label.main"]]=singler$singler[[1]]$SingleR.single.main$labels
205
scmat[["SingleR.label"]]=singler$singler[[1]]$SingleR.single$labels
206
scmat[["SingleR.label.cluster"]]=paste(singler$singler[[1]]$SingleR.single$labels, Idents(scmat))
207
208
# can be added as cluster id
209
cluster=singler$singler[[1]]$SingleR.clusters$labels
210
cluster.main=singler$singler[[1]]$SingleR.clusters.main$labels
211
212
# order based on cell type and add celltype to cluster:
213
lineage=c("HSC", "MPP","GMP","CLP","CMP","MEP","Monocytes","DC","Macrophages","Macrophages M1","Macrophages M2","CD4+ Tn","CD4+ T-cells", "CD4+ Tcm", "CD4+ Tem","CD8+ Tn","CD8+ T-cells","CD8+ Tcm","CD8+ Tem","NK cells","Tregs", "Pre-B_cell", "Pro-B_cell", "naive B-cells", "Memory B-cells","Class-switched memory B-cells","Plasma cells","Endothelial cells","Neutrophils","Eosinophils","Fibroblasts","Smooth muscle","Erythrocytes","Megakaryocytes")
214
lineage=unique(c(lineage,blueprint_encode$types))
215
lineage=lineage[lineage%in%cluster]
216
217
identityVector.samples=as.character(Idents(scmat))
218
clusters.samples=Idents(scmat)
219
220
for(j in seq(cluster)){
221
  identityVector.samples[clusters.samples%in%rownames(cluster)[j]]=paste(cluster[j], rownames(cluster)[j])
222
}
223
224
# order and cluster identity;
225
cluster=cluster[order(match(cluster[,1],lineage)),,drop=F]
226
cat(paste(levels(Idents(scmat)), collapse=","), paste(cluster, collapse=","), sep="\t\t")   
227
228
# order seurat clusters too:
229
Idents(scmat)=factor(identityVector.samples, levels=paste(cluster, rownames(cluster)))
230
231
r4=DimPlot(object = scmat, group.by = "SingleR.label", reduction = 'umap' , label = TRUE,  pt.size = 0.5, repel = T)  + NoLegend() + NoAxes()
232
ggplot2::ggsave(r4, filename = paste0(name, "_UMAP_singleR.pdf"), width = 6, height = 6)
233
234
235
save(singler, file=paste0(name, "_singler_object.Rdata"))
236
cat("\nSingleR done", sep="\n\n")
237
238
# add immunoscores to FM format data matrix
239
fm.f <- data.matrix(GetAssayData(scmat, assay = "RNA"))
240
241
# add gm based score:
242
add.scores=list(HLAIScore=c("B2M", "HLA-A", "HLA-B","HLA-C"), HLAIIScore=c("HLA-DMA","HLA-DMB","HLA-DPA1","HLA-DPB1","HLA-DRA","HLA-DRB1"), CytolyticScore=c("GZMA", "PRF1", "GNLY", "GZMH", "GZMM"))
243
244
gm.objects=do.call(rbind, lapply(seq(add.scores), function(i){
245
  dat3=fm.f[rownames(fm.f)%in%add.scores[[i]],]
246
  gm=t(apply(dat3, 2, gm_mean)) # done to normalized values
247
  rownames(gm)=names(add.scores)[i]
248
  return(gm)
249
}))
250
251
# also add to seurat object:
252
for(i in seq(add.scores)){
253
  scmat[[names(add.scores)[i]]] <- gm.objects[i,]
254
}
255
256
# gene expression and scores to fm:
257
rownames(fm.f)=paste0("N:GEXP:", rownames(fm.f))
258
rownames(gm.objects)=paste0("N:SAMP:", rownames(gm.objects))
259
fm=rbind(gm.objects, fm.f)
260
261
# DE analysis:
262
# markers.all=FindAllMarkers(scmat, only.pos = T)#, ...)
263
264
save(list=c("fm", "scmat"), file=paste0(name, "_scRNA.Rdata"))
265
# save(list=c("fm", "scmat", "markers.all"), file=paste0(name, "_scRNA.Rdata"))
266
267
268
r5=DimPlot(object = scmat, reduction = 'umap', label=T)
269
ggplot2::ggsave(r5, filename = paste0(name, "_UMAP_clusters.pdf"))