|
a |
|
b/preprocessing/Preprocessing_scRNA_HCA.R |
|
|
1 |
# # HumanCellAtlas data |
|
|
2 |
# file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HCA.h5ad" |
|
|
3 |
# library(Seurat) |
|
|
4 |
# hca <- ReadH5AD(file) |
|
|
5 |
# Idents(hca) <- "louvain" |
|
|
6 |
# save(hca, file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA.Rdata") |
|
|
7 |
|
|
|
8 |
# file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HCA_raw.h5ad" |
|
|
9 |
# |
|
|
10 |
# counts=t(data.table::fread("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/data/raw/X_allgenes.csv.gz", data.table = F)) |
|
|
11 |
# rownames(counts)=scan("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/data/raw/var_allgenes.txt","genes") |
|
|
12 |
# colnames(counts)=scan("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/data/raw/obs_allgenes.txt","sample") |
|
|
13 |
# save(counts, file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA_rawCounts.Rdata") |
|
|
14 |
|
|
|
15 |
#******************************************* all cells ********************************************* |
|
|
16 |
|
|
|
17 |
library(Matrix) |
|
|
18 |
library(Seurat) |
|
|
19 |
library(dplyr) |
|
|
20 |
source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R") |
|
|
21 |
|
|
|
22 |
load("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA_rawCounts.Rdata") |
|
|
23 |
load("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA.Rdata") |
|
|
24 |
|
|
|
25 |
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/") |
|
|
26 |
|
|
|
27 |
name="HumanCellAtlas" |
|
|
28 |
|
|
|
29 |
clusters=Idents(hca) |
|
|
30 |
hca[["rnaClusterID"]] <- clusters |
|
|
31 |
|
|
|
32 |
# |
|
|
33 |
library("HGNChelper") |
|
|
34 |
update=checkGeneSymbols(rownames(data)) |
|
|
35 |
genelist1=update[,1] |
|
|
36 |
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3] |
|
|
37 |
rownames(data)=genelist1 |
|
|
38 |
|
|
|
39 |
# try own processing: |
|
|
40 |
batch=gsub("_HiSeq_.*.", "", colnames(scmat)) |
|
|
41 |
name=paste0(colnames(counts), "-", gsub("_HiSeq_.*.", "", colnames(counts))) |
|
|
42 |
|
|
|
43 |
test1=sc.data.analysis(scmat = counts[rowSums(counts)>1000,name%in%colnames(scmat)], regress.cell.label = batch, batch.correction.method = "MNNcorrect", name="HCA", nr.pcs = 50, check.pcs=F, plot.umap = T, cores = 10, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000) |
|
|
44 |
|
|
|
45 |
|
|
|
46 |
# sbatch |
|
|
47 |
library(Matrix) |
|
|
48 |
library(Seurat) |
|
|
49 |
library(dplyr) |
|
|
50 |
source("functions.scRNA.analysis.R") |
|
|
51 |
|
|
|
52 |
load("HumanCellAtlas_scRNA_rawCounts.Rdata") |
|
|
53 |
load("HumanCellAtlas_scRNA.Rdata") |
|
|
54 |
|
|
|
55 |
name="HumanCellAtlas" |
|
|
56 |
|
|
|
57 |
clusters=Idents(hca) |
|
|
58 |
hca[["rnaClusterID"]] <- clusters |
|
|
59 |
|
|
|
60 |
# |
|
|
61 |
library("HGNChelper") |
|
|
62 |
update=checkGeneSymbols(rownames(data)) |
|
|
63 |
genelist1=update[,1] |
|
|
64 |
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3] |
|
|
65 |
rownames(data)=genelist1 |
|
|
66 |
|
|
|
67 |
# try own processing: |
|
|
68 |
batch=gsub("_HiSeq_.*.", "", colnames(scmat)) |
|
|
69 |
name=paste0(colnames(counts), "-", gsub("_HiSeq_.*.", "", colnames(counts))) |
|
|
70 |
|
|
|
71 |
test1=sc.data.analysis(scmat = counts[,name%in%colnames(scmat)], regress.cell.label = batch, batch.correction.method = "MNNcorrect", name="HCA", nr.pcs = 50, check.pcs=F, plot.umap = T, cores = 10, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000) |
|
|
72 |
|
|
|
73 |
|
|
|
74 |
#******************************************* only TNK cells ********************************************* |
|
|
75 |
library(Matrix) |
|
|
76 |
library(Seurat) |
|
|
77 |
library(dplyr) |
|
|
78 |
source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R") |
|
|
79 |
|
|
|
80 |
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/") |
|
|
81 |
|
|
|
82 |
name="HumanCellAtlas_TNK" |
|
|
83 |
|
|
|
84 |
load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/myrun/HCA_scRNA.Rdata") |
|
|
85 |
scmat=scmat[,grepl("Treg|NK|CD", scmat[["SingleR.label"]][,1])] |
|
|
86 |
|
|
|
87 |
data=GetAssayData(scmat, assay = "RNA", slot = "counts") |
|
|
88 |
|
|
|
89 |
library("HGNChelper") |
|
|
90 |
update=checkGeneSymbols(rownames(data)) |
|
|
91 |
genelist1=update[,1] |
|
|
92 |
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3] |
|
|
93 |
rownames(data)=genelist1 |
|
|
94 |
|
|
|
95 |
# try own processing: |
|
|
96 |
batch=gsub("_HiSeq_.*.", "", colnames(scmat)) |
|
|
97 |
|
|
|
98 |
test1=sc.data.analysis(scmat = data, regress.cell.label = batch, batch.correction.method = "MNNcorrect", name=name, nr.pcs = 50, check.pcs=F, plot.umap = T, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000, cores = 4, resolution = 1) |
|
|
99 |
#************************************************************************************************************ |
|
|
100 |
|
|
|
101 |
|
|
|
102 |
#******************************************* only NK cells ********************************************* |
|
|
103 |
library(Matrix) |
|
|
104 |
library(Seurat) |
|
|
105 |
library(dplyr) |
|
|
106 |
source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R") |
|
|
107 |
|
|
|
108 |
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/") |
|
|
109 |
|
|
|
110 |
name="HumanCellAtlas_NK" |
|
|
111 |
|
|
|
112 |
load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/myrun/HCA_scRNA.Rdata") |
|
|
113 |
scmat=scmat[,grepl("NK", scmat[["SingleR.label"]][,1])] |
|
|
114 |
|
|
|
115 |
# try own processing: |
|
|
116 |
batch=gsub("_HiSeq_.*.", "", colnames(scmat)) |
|
|
117 |
|
|
|
118 |
data=GetAssayData(scmat, assay = "RNA", slot = "counts") |
|
|
119 |
|
|
|
120 |
library("HGNChelper") |
|
|
121 |
update=checkGeneSymbols(rownames(data)) |
|
|
122 |
genelist1=update[,1] |
|
|
123 |
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3] |
|
|
124 |
rownames(data)=genelist1 |
|
|
125 |
|
|
|
126 |
test1=sc.data.analysis(scmat = data, regress.cell.label = batch, batch.correction.method = "MNNcorrect", name=name, nr.pcs = 50, check.pcs=F, plot.umap = T, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000, cores = 4, resolution = 1) |
|
|
127 |
#************************************************************************************************************ |
|
|
128 |
|
|
|
129 |
|
|
|
130 |
#******************************************* only T cells ********************************************* |
|
|
131 |
library(Matrix) |
|
|
132 |
library(Seurat) |
|
|
133 |
library(dplyr) |
|
|
134 |
source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R") |
|
|
135 |
|
|
|
136 |
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/") |
|
|
137 |
|
|
|
138 |
name="HumanCellAtlas_Tcells" |
|
|
139 |
|
|
|
140 |
load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/myrun/HCA_scRNA.Rdata") |
|
|
141 |
scmat=scmat[,grepl("Treg|CD", scmat[["SingleR.label"]][,1])] |
|
|
142 |
|
|
|
143 |
data=GetAssayData(scmat, assay = "RNA", slot = "counts") |
|
|
144 |
|
|
|
145 |
library("HGNChelper") |
|
|
146 |
update=checkGeneSymbols(rownames(data)) |
|
|
147 |
genelist1=update[,1] |
|
|
148 |
genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3] |
|
|
149 |
rownames(data)=genelist1 |
|
|
150 |
|
|
|
151 |
# try own processing: |
|
|
152 |
batch=gsub("_HiSeq_.*.", "", colnames(scmat)) |
|
|
153 |
|
|
|
154 |
test1=sc.data.analysis(scmat = data, regress.cell.label = batch, batch.correction.method = "MNNcorrect", name=name, nr.pcs = 50, check.pcs=F, plot.umap = T, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000, cores = 4, resolution = 1) |
|
|
155 |
#************************************************************************************************************ |
|
|
156 |
|
|
|
157 |
|
|
|
158 |
|
|
|
159 |
# add names and RNA cluster |
|
|
160 |
clusters=Idents(hca) |
|
|
161 |
hca[["rnaClusterID"]] <- clusters |
|
|
162 |
|
|
|
163 |
# take 50% of cells from each cluster, and preserve proportions |
|
|
164 |
filt=unlist(lapply(unique(clusters), function(clust){ |
|
|
165 |
set.seed(1) |
|
|
166 |
ind=which(clusters%in%clust) |
|
|
167 |
nr.samples=table(clusters)/2 |
|
|
168 |
sample(colnames(hca)[ind], nr.samples[names(nr.samples)%in%clust]) |
|
|
169 |
})) |
|
|
170 |
|
|
|
171 |
scmat=subset(hca, cells=filt) |
|
|
172 |
|
|
|
173 |
# old: |
|
|
174 |
library("SingleR") |
|
|
175 |
cores=8 |
|
|
176 |
counts <- data.matrix(GetAssayData(scmat, assay = "RNA")) |
|
|
177 |
|
|
|
178 |
# singler = CreateSinglerObject(counts, annot = NULL, clusters=Idents(scmat), ref.list = list(blueprint_encode), project.name=name, fine.tune = T,do.signatures = T, numCores=cores) |
|
|
179 |
# singler2 = CreateSinglerObject(counts, annot = NULL, clusters=Idents(scmat), ref.list = list(hpca), project.name=name, fine.tune = T,do.signatures = T, numCores=cores) |
|
|
180 |
|
|
|
181 |
# save(singler, file=paste0(name, "_singler_object_blueprint_encode.Rdata")) |
|
|
182 |
# save(singler2, file=paste0(name, "_singler_object_hpca.Rdata")) |
|
|
183 |
|
|
|
184 |
load(paste0(name, "_singler_object_blueprint_encode.Rdata")) |
|
|
185 |
load(paste0(name, "_singler_object_hpca.Rdata")) |
|
|
186 |
|
|
|
187 |
# add original identifiers |
|
|
188 |
singler$meta.data$orig.ident = scmat@meta.data$orig.ident # the original identities, if not supplied in 'annot' |
|
|
189 |
|
|
|
190 |
## if using Seurat v3.0 and over use: |
|
|
191 |
singler$meta.data$xy = scmat@reductions$umap@cell.embeddings # the tSNE coordinates |
|
|
192 |
singler$meta.data$clusters = scmat@active.ident # the Seurat clusters (if 'clusters' not provided) |
|
|
193 |
|
|
|
194 |
# these names are not intuitive, replace them, mentioned that these are actually naive http://comphealth.ucsf.edu/SingleR/SupplementaryInformation2.html: |
|
|
195 |
singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%"CD4+ T-cells"]="CD4+ Tn" |
|
|
196 |
singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%"CD8+ T-cells"]="CD8+ Tn" |
|
|
197 |
singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%"CD4+ T-cells"]="CD4+ Tn" |
|
|
198 |
singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%"CD8+ T-cells"]="CD8+ Tn" |
|
|
199 |
|
|
|
200 |
# change B-cells |
|
|
201 |
singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%c("B-cells","Class-switched memory B-cells", "naive B-cells", "Memory B-cells")]="B-cells" |
|
|
202 |
singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%c("B-cells","Class-switched memory B-cells", "naive B-cells", "Memory B-cells")]="B-cells" |
|
|
203 |
|
|
|
204 |
scmat[["SingleR.label.main"]]=singler$singler[[1]]$SingleR.single.main$labels |
|
|
205 |
scmat[["SingleR.label"]]=singler$singler[[1]]$SingleR.single$labels |
|
|
206 |
scmat[["SingleR.label.cluster"]]=paste(singler$singler[[1]]$SingleR.single$labels, Idents(scmat)) |
|
|
207 |
|
|
|
208 |
# can be added as cluster id |
|
|
209 |
cluster=singler$singler[[1]]$SingleR.clusters$labels |
|
|
210 |
cluster.main=singler$singler[[1]]$SingleR.clusters.main$labels |
|
|
211 |
|
|
|
212 |
# order based on cell type and add celltype to cluster: |
|
|
213 |
lineage=c("HSC", "MPP","GMP","CLP","CMP","MEP","Monocytes","DC","Macrophages","Macrophages M1","Macrophages M2","CD4+ Tn","CD4+ T-cells", "CD4+ Tcm", "CD4+ Tem","CD8+ Tn","CD8+ T-cells","CD8+ Tcm","CD8+ Tem","NK cells","Tregs", "Pre-B_cell", "Pro-B_cell", "naive B-cells", "Memory B-cells","Class-switched memory B-cells","Plasma cells","Endothelial cells","Neutrophils","Eosinophils","Fibroblasts","Smooth muscle","Erythrocytes","Megakaryocytes") |
|
|
214 |
lineage=unique(c(lineage,blueprint_encode$types)) |
|
|
215 |
lineage=lineage[lineage%in%cluster] |
|
|
216 |
|
|
|
217 |
identityVector.samples=as.character(Idents(scmat)) |
|
|
218 |
clusters.samples=Idents(scmat) |
|
|
219 |
|
|
|
220 |
for(j in seq(cluster)){ |
|
|
221 |
identityVector.samples[clusters.samples%in%rownames(cluster)[j]]=paste(cluster[j], rownames(cluster)[j]) |
|
|
222 |
} |
|
|
223 |
|
|
|
224 |
# order and cluster identity; |
|
|
225 |
cluster=cluster[order(match(cluster[,1],lineage)),,drop=F] |
|
|
226 |
cat(paste(levels(Idents(scmat)), collapse=","), paste(cluster, collapse=","), sep="\t\t") |
|
|
227 |
|
|
|
228 |
# order seurat clusters too: |
|
|
229 |
Idents(scmat)=factor(identityVector.samples, levels=paste(cluster, rownames(cluster))) |
|
|
230 |
|
|
|
231 |
r4=DimPlot(object = scmat, group.by = "SingleR.label", reduction = 'umap' , label = TRUE, pt.size = 0.5, repel = T) + NoLegend() + NoAxes() |
|
|
232 |
ggplot2::ggsave(r4, filename = paste0(name, "_UMAP_singleR.pdf"), width = 6, height = 6) |
|
|
233 |
|
|
|
234 |
|
|
|
235 |
save(singler, file=paste0(name, "_singler_object.Rdata")) |
|
|
236 |
cat("\nSingleR done", sep="\n\n") |
|
|
237 |
|
|
|
238 |
# add immunoscores to FM format data matrix |
|
|
239 |
fm.f <- data.matrix(GetAssayData(scmat, assay = "RNA")) |
|
|
240 |
|
|
|
241 |
# add gm based score: |
|
|
242 |
add.scores=list(HLAIScore=c("B2M", "HLA-A", "HLA-B","HLA-C"), HLAIIScore=c("HLA-DMA","HLA-DMB","HLA-DPA1","HLA-DPB1","HLA-DRA","HLA-DRB1"), CytolyticScore=c("GZMA", "PRF1", "GNLY", "GZMH", "GZMM")) |
|
|
243 |
|
|
|
244 |
gm.objects=do.call(rbind, lapply(seq(add.scores), function(i){ |
|
|
245 |
dat3=fm.f[rownames(fm.f)%in%add.scores[[i]],] |
|
|
246 |
gm=t(apply(dat3, 2, gm_mean)) # done to normalized values |
|
|
247 |
rownames(gm)=names(add.scores)[i] |
|
|
248 |
return(gm) |
|
|
249 |
})) |
|
|
250 |
|
|
|
251 |
# also add to seurat object: |
|
|
252 |
for(i in seq(add.scores)){ |
|
|
253 |
scmat[[names(add.scores)[i]]] <- gm.objects[i,] |
|
|
254 |
} |
|
|
255 |
|
|
|
256 |
# gene expression and scores to fm: |
|
|
257 |
rownames(fm.f)=paste0("N:GEXP:", rownames(fm.f)) |
|
|
258 |
rownames(gm.objects)=paste0("N:SAMP:", rownames(gm.objects)) |
|
|
259 |
fm=rbind(gm.objects, fm.f) |
|
|
260 |
|
|
|
261 |
# DE analysis: |
|
|
262 |
# markers.all=FindAllMarkers(scmat, only.pos = T)#, ...) |
|
|
263 |
|
|
|
264 |
save(list=c("fm", "scmat"), file=paste0(name, "_scRNA.Rdata")) |
|
|
265 |
# save(list=c("fm", "scmat", "markers.all"), file=paste0(name, "_scRNA.Rdata")) |
|
|
266 |
|
|
|
267 |
|
|
|
268 |
r5=DimPlot(object = scmat, reduction = 'umap', label=T) |
|
|
269 |
ggplot2::ggsave(r5, filename = paste0(name, "_UMAP_clusters.pdf")) |