Switch to unified view

a b/preprocessing/Preprocessing_PanALL.R
1
# PECAN ALL data processing:
2
# https://pecan.stjude.cloud/proteinpaint/study/PanALL
3
files=list.files("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/Pecan_ALL", "HTSeq$", full.names = T)
4
5
m=do.call(cbind, parallel::mclapply(files, read.delim, header=F, row.names=1, mc.cores=8))
6
7
colnames(m)=gsub(".HTSeq|_.*.", "", list.files("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/Pecan_ALL", "HTSeq$", full.names = F))
8
9
save(m, file="PECAN_ALL_counts.Rdata")
10
11
12
# load data:
13
load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/Pecan_ALL/PECAN_ALL_counts.Rdata")
14
15
# convert ENSAMBL to symbol:
16
17
# filter:
18
keep <- rowSums(edgeR::cpm(m) > 1) >= ceiling(dim(m)[2]*0.025)
19
20
m=m[keep,]
21
22
# map IDs:
23
library('biomaRt')
24
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl", host="useast.ensembl.org"))
25
genes <- rownames(m)
26
G_list <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol"),values=genes,mart= mart)
27
28
G_list=G_list[!(G_list$hgnc_symbol%in%""|duplicated(G_list$hgnc_symbol)),]
29
m=m[match(G_list$ensembl_gene_id, rownames(m)),]
30
31
rownames(m)=G_list$hgnc_symbol
32
33
# load existing subtype coords:
34
annot=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/Pecan_ALL/clinical_pecan.txt", data.table = F,dec = ",")
35
annot=annot[match(gsub("_.*.", "", colnames(gexp)), annot$patient),]
36
37
# normalize library size
38
DGE <- edgeR::DGEList(m)
39
DGE <- edgeR::calcNormFactors(DGE,method =c("TMM"))
40
41
# voom:
42
DGE.voom=limma::voom(DGE, plot=T)$E
43
44
# run combat:
45
# library(sva)
46
# annot$batch = annot$`RNA-seq library`
47
# modcombat = model.matrix(~1, data=annot)
48
# gexp = ComBat(dat=as.matrix(DGE.voom), batch=annot$batch, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)
49
50
gexp=limma::removeBatchEffect(x = DGE.voom, batch = annot$`RNA-seq library`)
51
52
# write data out
53
coordinates.subtype=CancerMap(data = t(as.matrix(gexp)), name = "Pecan_pre-B-ALL", VAR = 10, BW = 1.75, perplexity = 30, PATH_OUTPUT = "/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/")
54
coordinates.subtype$subtype=annot$`primary subtype`
55
56
save(list = c("gexp", "coordinates.subtype", "annot"), file="/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/PecanALL_subtypes.Rdata")
57
58
59
coord_pecan=CancerMap(data = t(as.matrix(gexp)), name = "Pecan_pre-B-ALL", VAR = 10, BW = 1.75, perplexity = 30, PATH_OUTPUT = "/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/")
60
plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group =  coordinates.subtype$`primary subtype`, namev = coordinates.subtype$`primary subtype`, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5)
61
62
plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group =  annot$`RNA-seq library`, namev = annot$`RNA-seq library`, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5)
63
plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group =  annot$institute, namev = annot$institute, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5)
64
65
mod=names(table(annot$protocol))[table(annot$protocol)<3]
66
annot$protocol[annot$protocol%in%mod]="other"
67
plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group =  annot$protocol, namev = annot$protocol, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5)
68
69
mod=names(table(annot$protocol))[table(annot$protocol)<3]
70
annot$protocol[annot$protocol%in%mod]="other"
71
72
plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group =  batch, namev =batch, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5)