--- a +++ b/preprocessing/Preprocessing_PanALL.R @@ -0,0 +1,72 @@ +# PECAN ALL data processing: +# https://pecan.stjude.cloud/proteinpaint/study/PanALL +files=list.files("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/Pecan_ALL", "HTSeq$", full.names = T) + +m=do.call(cbind, parallel::mclapply(files, read.delim, header=F, row.names=1, mc.cores=8)) + +colnames(m)=gsub(".HTSeq|_.*.", "", list.files("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/Pecan_ALL", "HTSeq$", full.names = F)) + +save(m, file="PECAN_ALL_counts.Rdata") + + +# load data: +load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/Pecan_ALL/PECAN_ALL_counts.Rdata") + +# convert ENSAMBL to symbol: + +# filter: +keep <- rowSums(edgeR::cpm(m) > 1) >= ceiling(dim(m)[2]*0.025) + +m=m[keep,] + +# map IDs: +library('biomaRt') +mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl", host="useast.ensembl.org")) +genes <- rownames(m) +G_list <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol"),values=genes,mart= mart) + +G_list=G_list[!(G_list$hgnc_symbol%in%""|duplicated(G_list$hgnc_symbol)),] +m=m[match(G_list$ensembl_gene_id, rownames(m)),] + +rownames(m)=G_list$hgnc_symbol + +# load existing subtype coords: +annot=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/Pecan_ALL/clinical_pecan.txt", data.table = F,dec = ",") +annot=annot[match(gsub("_.*.", "", colnames(gexp)), annot$patient),] + +# normalize library size +DGE <- edgeR::DGEList(m) +DGE <- edgeR::calcNormFactors(DGE,method =c("TMM")) + +# voom: +DGE.voom=limma::voom(DGE, plot=T)$E + +# run combat: +# library(sva) +# annot$batch = annot$`RNA-seq library` +# modcombat = model.matrix(~1, data=annot) +# gexp = ComBat(dat=as.matrix(DGE.voom), batch=annot$batch, mod=modcombat, par.prior=TRUE, prior.plots=FALSE) + +gexp=limma::removeBatchEffect(x = DGE.voom, batch = annot$`RNA-seq library`) + +# write data out +coordinates.subtype=CancerMap(data = t(as.matrix(gexp)), name = "Pecan_pre-B-ALL", VAR = 10, BW = 1.75, perplexity = 30, PATH_OUTPUT = "/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/") +coordinates.subtype$subtype=annot$`primary subtype` + +save(list = c("gexp", "coordinates.subtype", "annot"), file="/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/PecanALL_subtypes.Rdata") + + +coord_pecan=CancerMap(data = t(as.matrix(gexp)), name = "Pecan_pre-B-ALL", VAR = 10, BW = 1.75, perplexity = 30, PATH_OUTPUT = "/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/") +plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group = coordinates.subtype$`primary subtype`, namev = coordinates.subtype$`primary subtype`, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5) + +plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group = annot$`RNA-seq library`, namev = annot$`RNA-seq library`, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5) +plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group = annot$institute, namev = annot$institute, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5) + +mod=names(table(annot$protocol))[table(annot$protocol)<3] +annot$protocol[annot$protocol%in%mod]="other" +plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group = annot$protocol, namev = annot$protocol, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5) + +mod=names(table(annot$protocol))[table(annot$protocol)<3] +annot$protocol[annot$protocol%in%mod]="other" + +plot.scatter(x=coord_pecan$x, y = coord_pecan$y, group = batch, namev =batch, main = "Pecan ALL", rasterize = F, width = 70*2, height = 74*2, SIZE = 0.5) \ No newline at end of file