Switch to unified view

a b/preprocessing/Preprocessing_coMMpass_featurematrix_generation.R
1
# data downloaded 2019 from:
2
# https://gdc.cancer.gov/about-data/publications/panimmune
3
# https://gdc.cancer.gov/node/905/
4
# TCGA pancancer FM
5
6
GIT_HOME="/research/users/ppolonen/git_home/common_scripts"
7
source(file.path(GIT_HOME, "visualisation/plotting_functions.R"))
8
source(file.path(GIT_HOME, "featurematrix/functions_generate_fm.R"))
9
source(file.path(GIT_HOME, "featurematrix/compute.pairwise.R"))
10
source(file.path(GIT_HOME, "statistics/functions_statistics.R"))
11
source(file.path(GIT_HOME, "statistics/useful_functions.R"))
12
library(parallel)
13
14
# annotations
15
clin=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_PER_PATIENT.csv", data.table=F)
16
surv=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_SURVIVAL.csv", data.table=F)
17
18
# OMICS data
19
rna=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/MMRF_CoMMpass_IA13a_E74GTF_HtSeq_Gene_Counts.txt", data.table=F)
20
rna2=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/MMRF_CoMMpass_IA13a_E74GTF_Cufflinks_Gene_FPKM.txt", data.table=F)
21
rna2=rna2[,-2]
22
23
# Mutation data
24
mut=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/MMRF_CoMMpass_IA13a_All_Canonical_Variants_ENSG_Mutation_Counts.txt", data.table=F)
25
maf=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/MMRF_CoMMpass_IA13a_All_Canonical_NS_Variants.txt", data.table=F)
26
# maf=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/MMRF_CoMMpass_IA13a_All_Canonical_Variants.txt", data.table=F)
27
28
# CNV
29
cnv.long=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/MMRF_CoMMpass_IA13a_CNA_LongInsert_FISH_CN_All_Specimens.txt", data.table=F)
30
31
# fusion
32
fusion=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/MMRF_CoMMpass_IA13a_TophatFusion_Results.txt", data.table=F)
33
ig=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/MMRF_CoMMpass_IA13a_LongInsert_Canonical_Ig_Translocations.txt", data.table=F)
34
rnaig=data.table::fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/MMRF_CoMMpass_IA13a_RNAseq_Canonical_Ig_Translocations.txt", data.table=F)
35
36
#*************************************** filter and process each data type ******************************************
37
38
clin.filt=clin[,!colnames(clin)%in%c("PUBLIC_ID")]
39
rownames(clin.filt)=clin$PUBLIC_ID
40
clin.filt=data.frame(clin.filt, stringsAsFactors = F)
41
42
clin.filt$R_ISS=as.character(clin.filt$R_ISS)
43
clin.filt$IMWG_Risk_Class=as.character(clin.filt$IMWG_Risk_Class)
44
45
surv.filt=data.frame("STATUS"=0, "OS"=surv$lstalive)
46
surv.filt$STATUS[!is.na(surv$deathdy)]=1
47
surv.filt$OS[!is.na(surv$deathdy)]=surv$deathdy[!is.na(surv$deathdy)]
48
rownames(surv.filt)=surv$public_id
49
50
#******************************************************************************************************************
51
52
mut.m=data.matrix((table(maf$`ANN[*].GENE`, maf$Sample)>0)*1)
53
colnames(mut.m)=gsub("_._BM|_._PB", "", colnames(mut.m))
54
mut.m=mut.m[,!duplicated(colnames(mut.m))]
55
56
# # all mutations, is mutated or not:
57
# mut.m=data.matrix(mut[,-1]>0)*1
58
# match.gene=intersect(genes$gene_id, mut[,1])
59
# genes2=genes[match(match.gene, genes$gene_id),]
60
# mut.m=mut.m[match(match.gene, mut[,1]),]
61
# mut=mut[match(match.gene, mut[,1]),]
62
# rownames(mut.m)=genes2$gene_name[match(match.gene, mut[,1])]
63
64
# convert to symbol:
65
library(data.table)
66
# http://ftp.ensembl.org/pub/release-74/gtf/homo_sapiens/Homo_sapiens.GRCh37.74.gtf.gz same file was used in the analysis, so genes will match
67
genes <- fread("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/Homo_sapiens.GRCh37.74.gtf")
68
setnames(genes, names(genes), c("chr","source","type","start","end","score","strand","phase","attributes") )
69
70
# the problem is the attributes column that tends to be a collection
71
# of the bits of information you're actually interested in
72
# in order to pull out just the information I want based on the 
73
# tag name, e.g. "gene_id", I have the following function:
74
extract_attributes <- function(gtf_attributes, att_of_interest){
75
  att <- strsplit(gtf_attributes, "; ")
76
  att <- gsub("\"","",unlist(att))
77
  if(!is.null(unlist(strsplit(att[grep(att_of_interest, att)], " ")))){
78
    return( unlist(strsplit(att[grep(att_of_interest, att)], " "))[2])
79
  }else{
80
    return(NA)}
81
}
82
83
# this is how to, for example, extract the values for the attributes of interest (here: "gene_id")
84
genes$gene_id <- unlist(mclapply(genes$attributes, extract_attributes, "gene_id", mc.cores=10))
85
genes$gene_name <- unlist(mclapply(genes$attributes, extract_attributes, "gene_name", mc.cores=10))
86
genes=unique(genes[,10:11])
87
genes=genes[!is.na(genes$gene_name),]
88
89
#******************************************************************************************************************
90
91
# fusions:
92
rnaig_filt=rnaig[,grepl("Call", colnames(rnaig))]
93
colnames(rnaig_filt)=gsub("_Call", "_Ig_translocation", colnames(rnaig_filt))
94
95
name=gsub("_._BM|_._PB", "", rnaig$Specimen_ID)
96
97
rnaig_m=do.call(rbind, lapply(unique(name), function(n){
98
  (colSums(rnaig_filt[name%in%n,])>0)*1
99
}))
100
101
rownames(rnaig_m)=unique(name)
102
103
104
ig.filt=ig[,grepl("CALL", colnames(ig))]
105
colnames(ig.filt)=gsub("_CALL", "_Ig_translocation", colnames(ig.filt))
106
name=gsub("_._BM|_._PB", "", ig$Study_Visit_iD)
107
           
108
ig_m=do.call(rbind, lapply(unique(name), function(n){
109
  (colSums(ig.filt[name%in%n,])>0)*1
110
}))
111
112
rownames(ig_m)=unique(name)
113
114
a=fusion[gsub("_._BM|_._PB", "", fusion$ID)%in%coord$ID[coord$cluster=="5"],]
115
b=(table(paste(a$left.Gene, a$right.Gene), gsub("_._BM|_._PB", "", a$ID))>0)*1
116
117
sort(rowSums(b))
118
119
b2=(table(paste(fusion$left.Gene, fusion$right.Gene), gsub("_._BM|_._PB", "", fusion$ID))>0)*1
120
b3=b2==1
121
122
test=do.call(rbind, apply(b3, 1, function(lv2)fisher.2x2(lv1 = colnames(b2)%in%coord$ID[coord$cluster=="5"], lv2, alternative = "greater")))
123
124
#******************************************************************************************************************
125
126
# GEXP
127
rna.filt=rna[,-1]
128
rna.filt2=rna2[,-1]
129
rna.filt=rna.filt[,grepl("1_BM", colnames(rna.filt))]
130
rna.filt2=rna.filt2[,grepl("1_BM", colnames(rna.filt2))]
131
132
match.gene=intersect(genes$gene_id, rna[,1])
133
genes2=genes[match(match.gene, genes$gene_id),]
134
rna.filt=rna.filt[match(match.gene, rna[,1]),]
135
rna=rna[match(match.gene, rna[,1]),]
136
rownames(rna.filt)=make.unique(genes2$gene_name[match(match.gene, rna[,1])])
137
138
match.gene=intersect(genes$gene_id, rna2[,1])
139
genes2=genes[match(match.gene, genes$gene_id),]
140
rna.filt2=rna.filt2[match(match.gene, rna2[,1]),]
141
rna2=rna2[match(match.gene, rna2[,1]),]
142
rownames(rna.filt2)=make.unique(genes2$gene_name[match(match.gene, rna2[,1])])
143
rna.filt2=rna.filt2[,match(colnames(rna.filt), colnames(rna.filt2))]
144
145
# filter:
146
filt=rowSums(edgeR::cpm(rna.filt)>1)>dim(rna.filt)[2]*0.025
147
148
# normalize library size
149
DGE <- edgeR::DGEList(rna.filt[filt,])
150
DGE <- edgeR::calcNormFactors(DGE)
151
152
# voom transform
153
gexp=limma::voom(DGE, plot=T)$E
154
colnames(gexp)=gsub("_._BM|_._PB", "", colnames(gexp))
155
156
#******************************************************************************************************************
157
cnv.long.filt=cnv.long[,-1]
158
rownames(cnv.long.filt)=cnv.long[,1]
159
cnv.long.filt=cnv.long.filt[,!grepl("percent", colnames(cnv.long.filt))]
160
colnames(cnv.long.filt)=gsub("SeqWGS_Cp_", "", colnames(cnv.long.filt))
161
cnv.long.filt=cnv.long.filt[grepl("1_BM", rownames(cnv.long.filt)),]
162
rownames(cnv.long.filt)=gsub("_._BM|_._PB", "", rownames(cnv.long.filt))
163
164
#******************************************************************************************************************
165
# immunoscores:
166
library(circlize)
167
168
dat_a3=gexp[rownames(gexp)%in%c("B2M",
169
                                "HLA-A",
170
                                "HLA-B",
171
                                "HLA-C"),]
172
173
dat3=2^dat_a3+0.01
174
gm1=log2(t(apply(dat3, 2, gm_mean)))
175
rownames(gm1)="HLAIScore"
176
177
dat_a3=gexp[rownames(gexp)%in%c("HLA-DMA",
178
                                "HLA-DMB",
179
                                "HLA-DPA1",
180
                                "HLA-DPB1",
181
                                "HLA-DRA",
182
                                "HLA-DRB1"),]
183
184
dat3=2^dat_a3+0.01
185
gm2=log2(t(apply(dat3, 2, gm_mean)))
186
rownames(gm2)="HLAIIScore"
187
188
dat_a3=gexp[rownames(gexp)%in%c("GZMA", "PRF1", "GNLY", "GZMH", "GZMM"),]
189
190
dat3=2^dat_a3+0.01
191
gm3=log2(t(apply(dat3, 2, gm_mean)))
192
rownames(gm3)="CytolyticScore"
193
194
classification1=data.frame(t(rep("medium", length(gm3))))
195
zscore=as.numeric(scale(t(gm3)))
196
classification1[zscore>=1]="high"
197
classification1[zscore<=(-1)]="low"
198
rownames(classification1)="CytolyticScore" 
199
colnames(classification1)=colnames(gexp)
200
201
classification2=data.frame(t(rep("medium", length(gm1))))
202
zscore=as.numeric(scale(t(gm1)))
203
classification2[zscore>=1]="high"
204
classification2[zscore<=(-1)]="low"
205
rownames(classification2)="HLAIScore" 
206
colnames(classification2)=colnames(gexp)
207
208
classification3=data.frame(t(rep("medium", length(gm2))))
209
zscore=as.numeric(scale(t(gm2)))
210
classification3[zscore>=1]="high"
211
classification3[zscore<=(-1)]="low"
212
rownames(classification3)="HLAIIScore" 
213
colnames(classification3)=colnames(gexp)
214
215
classification=data.frame(t(rbind(classification1,classification2,classification3)), stringsAsFactors = F)
216
217
# CGA:
218
t.df = read.delim("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/t.antigen_df.txt", stringsAsFactors=F, header=T)
219
genelist=c(unique(t.df[,1]))
220
221
# rank patients by number of testis antigens expressed
222
expressed_testis_num=data.frame(t(colSums(gexp[rownames(gexp)%in%genelist,]>3))) # good cutoff point in fpkm and cpm, a lot of noise otherwise
223
# expressed_testis_num=data.frame(t(colSums(log2(rna.filt2[rownames(rna.filt2)%in%genelist,])>3)))
224
225
colnames(expressed_testis_num)=colnames(gexp)
226
rownames(expressed_testis_num)="nCGA"
227
228
feat_class=expressed_testis_num
229
feat_class[expressed_testis_num==0]="0_Antigens"
230
feat_class[expressed_testis_num>=1&expressed_testis_num<=2]="1to2_Antigens"
231
feat_class[expressed_testis_num>=3&expressed_testis_num<=4]="3to4_Antigens"
232
feat_class[expressed_testis_num>=5&expressed_testis_num<=6]="5to6_Antigens"
233
feat_class[expressed_testis_num>=7]="over7_Antigens"
234
235
rownames(feat_class)="catCGA"
236
237
immunoscores=as.data.frame(t(rbind(gm1, gm2,expressed_testis_num)),stringsAsFactors=F)
238
immunoscores$catCGA=as.character(feat_class)
239
240
#****************************************************************************************************************************
241
#*************************************** Make a feature matrix from each data type ******************************************
242
#****************************************************************************************************************************
243
# clustering:
244
coord=read.delim("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/cancermap_CoMMpass_12.5pct_genes_BH-SNE_mean-shift_BW1.txt", header=T, stringsAsFactors = F)
245
peaks=read.delim("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/cancermap_CoMMpass_12.5pct_genes_BH-SNE_mean-shift_BW1_cluster_centroids.txt", header=T, stringsAsFactors = F)
246
247
coord$cluster[coord$cluster%in%c(21)]="CGA_Prolif"
248
249
clust=make.features(data.frame("cancermap_cluster"=as.character(coord$cluster), stringsAsFactors = F), datatype="SAMP", make.pairwise = F)
250
colnames(clust)=coord$ID
251
252
# clustering larger:
253
coord.2=read.delim("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/cancermap_CoMMpass_12.5pct_genes_BH-SNE_mean-shift_BW2.txt", header=T, stringsAsFactors = F)
254
peaks.2=read.delim("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/MM_compass/cancermap_CoMMpass_12.5pct_genes_BH-SNE_mean-shift_BW2_cluster_centroids.txt", header=T, stringsAsFactors = F)
255
256
coord2=coord.2
257
258
# checked the genetics and combined the subtypes as larger subtypes:
259
coord2$cluster[coord.2$cluster%in%c(6)]="MAF_Ig"
260
coord2$cluster[coord.2$cluster%in%c(3)]="WHSC1_FGFR3_Ig"
261
coord2$cluster[coord.2$cluster%in%c(1,2)]="CCND1_Ig"
262
coord2$cluster[coord.2$cluster%in%c(4)]="Hyperdiploid"
263
coord2$cluster[coord.2$cluster%in%c(5)]="Hyperdiploid_amp1q"
264
coord2$cluster[coord.2$cluster%in%c(7)]="TRAF3_Aberrated"
265
266
clust2=make.features(data.frame("cancermap_subtypes_"=as.character(coord2$cluster), stringsAsFactors = F), datatype="SAMP")
267
colnames(clust2)=coord$ID
268
269
270
# clinical and surv
271
clindatfm=make.features(df = clin.filt, datatype="CLIN", prefix="", make.pairwise = F)
272
survdatfm=make.features(df = surv.filt, datatype="CLIN", prefix="", make.pairwise = F)
273
274
# OMICS
275
gexpfm=make.features(df = data.frame(t(gexp), check.names = F), datatype="GEXP", prefix="", make.pairwise = F)
276
colnames(gexpfm)=gsub("_._BM|_._PB", "", colnames(gexp))
277
278
# Genetics
279
cnvfm=make.features(df = data.frame(cnv.long.filt, check.names = F), datatype="CNVR", prefix="", make.pairwise = F)
280
colnames(cnvfm)=gsub("_._BM|_._PB", "", rownames(cnv.long.filt))
281
282
mutfm=make.features(as.data.frame(t(mut.m), check.names = F), datatype="GNAB", prefix="")
283
colnames(mutfm)=gsub("_._BM|_._PB", "", colnames(mut.m))
284
285
# immunology
286
immunoscoresfm=make.features(immunoscores, datatype="SAMP", prefix="")
287
colnames(immunoscoresfm)=gsub("_._BM|_._PB", "", colnames(gexp))
288
289
immunoscoresfm2=make.features(classification, datatype="SAMP", prefix="")
290
colnames(immunoscoresfm2)=gsub("_._BM|_._PB", "", colnames(gexp))
291
292
rnaig_fm=make.features(data.frame(rnaig_m), datatype="CNVR", prefix="")
293
ig_fm=make.features(data.frame(ig_m), datatype="CNVR", prefix="")
294
295
296
annot=cbind(clin.filt,surv.filt[match(rownames(clin.filt), rownames(surv.filt)),], "cluster"=coord$cluster[match(rownames(clin.filt),coord$ID)], "subtype"=coord2[match(rownames(clin.filt),coord2$ID),])
297
l.fm=list(clust,clust2, clindatfm, survdatfm, gexpfm, cnvfm, mutfm, immunoscoresfm, immunoscoresfm2, rnaig_fm, ig_fm)
298
299
library(data.table)
300
fm=rbindlist(l.fm, use.names=T, fill=T)
301
302
fm=data.frame(fm, stringsAsFactors=F, check.names = F)
303
rownames(fm)=unlist(lapply(l.fm, rownames))
304
305
save(gexp, file="/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/MM_COMPASS/MM_COMPASS_GEXP.Rdata")
306
save(fm, file="/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/MM_COMPASS/MM_COMPASS_FM.Rdata")
307
save(annot, file="/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/MM_COMPASS/MM_COMPASS_ANNOT.Rdata")