Diff of /R/TCGA_Data_Preparation.R [000000] .. [c3b4f8]

Switch to unified view

a b/R/TCGA_Data_Preparation.R
1
# TCGA_Data_Preparation.R
2
3
getGDCprojects()
4
5
# Download_GDC.R
6
7
# ==== Preperation ====
8
if (!require(SummarizedExperiment)) {
9
  install.packages("SummarizedExperiment")
10
  library(SummarizedExperiment)
11
}
12
if (!require(parallel)) {
13
  install.packages("parallel")
14
  library(parallel)
15
}
16
if (!require(data.table)) {
17
  install.packages("data.table")
18
  library(data.table)
19
}
20
# Note that on Linux, libmariadb-client-lgpl-dev must be installed
21
# This allows for configuration of RMySQL
22
if (!require(TCGAbiolinks)) {
23
  if (!requireNamespace("BiocManager", quietly = TRUE))
24
    install.packages("BiocManager")
25
  BiocManager::install("TCGAbiolinks")
26
  require(TCGAbiolinks)
27
  ??TCGAbiolinks
28
  
29
  library(TCGAbiolinks)
30
}
31
32
getGDCInfo()
33
getGDCprojects()
34
getProjectSummary("TCGA-ACC")
35
# Data Release 28, February 02, 2021
36
# Move all data to a single directory later on
37
dir.create(("Data/TCGA"))
38
dir.create("Data/TCGA/SummarizedExperiments")
39
dir.create("Data/TCGA/SummarizedExperiments/Mut")
40
dir.create("Data/TCGA/SummarizedExperiments/CNV")
41
dir.create("Data/TCGA/SummarizedExperiments/Exp")
42
43
projects <- TCGAbiolinks:::getGDCprojects()$project_id
44
projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
45
46
47
# ==== Download Mut data ====
48
plyr::alply(sort(projects), 1, function(cur_project) {
49
  tryCatch(
50
    query <- GDCquery(
51
      project = cur_project,
52
      data.category = "Simple Nucleotide Variation",
53
      data.type = "Masked Somatic Mutation",
54
      workflow.type = 'VarScan2 Variant Aggregation and Masking'
55
    )
56
  )
57
  query <- GDCquery(
58
    project = cur_project,
59
    data.category = "Simple Nucleotide Variation",
60
    data.type = "Masked Somatic Mutation",
61
    workflow.type = 'VarScan2 Variant Aggregation and Masking'
62
  )
63
  
64
  GDCdownload(query, files.per.chunk = 10, method = "client")
65
  GDCprepare(query,
66
    save = T,
67
    save.filename = paste0("Data/TCGA/SummarizedExperiments/Mut/", cur_project,
68
                           "_VarScan2_MAF.rdata")
69
  )
70
})
71
72
# ==== Download CNV data ====
73
plyr::alply(sort(projects)[-(1:14)], 1, function(cur_project) {
74
  tryCatch(
75
    query <- GDCquery(
76
      project = cur_project,
77
      data.category = "Copy Number Variation",
78
      data.type = "Masked Copy Number Segment", 
79
    )
80
  )
81
  query <- GDCquery(
82
    project = cur_project,
83
    data.category = "Copy Number Variation",
84
    data.type = "Masked Copy Number Segment",
85
  )
86
  GDCdownload(query, files.per.chunk = 10)
87
  GDCprepare(query,
88
             save = T,
89
             save.filename = paste0("Data/TCGA/SummarizedExperiments/CNV/", cur_project,
90
                                    "_CopyNumber.rdata"))
91
  # cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/CNV/", cur_project, "_CopyNumber.rdata")))
92
  
93
})
94
95
# ==== Download clinical CNV data ====
96
97
dir.create("Data/TCGA/Clinical")
98
dir.create("Data/TCGA/Clinical/CNV")
99
plyr::alply(sort(projects)[-(1:14)], 1, function(cur_project) {
100
  tryCatch(
101
    query.biospecimen <- GDCquery(project = cur_project, 
102
                                  data.category = "Biospecimen",
103
                                  data.type = "Biospecimen Supplement", 
104
                                  data.format = "BCR Biotab")
105
  )
106
  query.biospecimen <- GDCquery(project = cur_project, 
107
                                data.category = "Biospecimen",
108
                                data.type = "Biospecimen Supplement", 
109
                                data.format = "BCR Biotab")
110
  GDCdownload(query.biospecimen, files.per.chunk = 10)
111
  GDCprepare(query.biospecimen,
112
             save = T,
113
             save.filename = paste0("Data/TCGA/Clinical/CNV/", cur_project,
114
                                    "_CopyNumber_Clinical.rdata"))
115
  
116
})
117
118
# ==== Download gene expression data ====
119
plyr::alply(projects, 1, function(cur_project) {
120
  tryCatch(
121
      query <- GDCquery(
122
          project = cur_project,
123
          data.category = "Transcriptome Profiling",
124
          data.type = "Gene Expression Quantification",
125
          workflow.type = "HTSeq - Counts"
126
      )
127
  )
128
  query <- GDCquery(
129
    project = cur_project,
130
    data.category = "Transcriptome Profiling",
131
    data.type = "Gene Expression Quantification",
132
    workflow.type = "HTSeq - Counts"
133
  )
134
  GDCdownload(query, files.per.chunk = 10, method = "client")
135
  GDCprepare(
136
    query,
137
    save = T,
138
    save.filename = paste0("Data/TCGA/SummarizedExperiments/Exp/", cur_project, "_GeneExpression.rdata")
139
  )
140
  
141
})
142
143
144
# =====================================
145
# For each data type, combine all projects into a single file
146
require(data.table)
147
require(TCGAbiolinks)
148
require(SummarizedExperiment)
149
require(GeoTcgaData)
150
require(stringr)
151
require(biomaRt)
152
require(curl)
153
154
# Get clinical info from Gene Expression data ====
155
all_proj_info <- vector(mode = "list", length = length(projects))
156
i <- 1
157
for (cur_project in projects) {
158
  cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/Exp/", cur_project, "_GeneExpression.rdata")))
159
  cur_proj_assay <- assays(cur_proj_data)[[1]]
160
  cur_proj_info <- colData(cur_proj_data)
161
  all_proj_info[[i]] <- as.data.table(cur_proj_info[, c("barcode", "name")])
162
  i <- i + 1
163
}
164
165
all_proj_info <- rbindlist(all_proj_info)
166
# Save
167
fwrite(all_proj_info, "Data/TCGA/All_Sample_Info.csv")
168
169
170
# ==== Merge Exp Data ====
171
require(data.table)
172
require(biomaRt)
173
dir.create("Data/TCGA/Exp")
174
projects <- TCGAbiolinks:::getGDCprojects()$project_id
175
projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
176
# gene_dict <- fread("Data/gene_dictionary.txt")
177
depmap_exp <- fread("Data/DepMap/20Q2/CCLE_expression.csv")
178
depmap_ensg_numbers <- str_replace(colnames(depmap_exp), ".+\\((.+)\\)", "\\1")
179
depmap_hgnc <- str_replace(colnames(depmap_exp), "(.+)\\s\\(.+\\)", "\\1")
180
depmap_exp_dict <- data.table(HGNC = depmap_hgnc, ENSG = depmap_ensg_numbers)
181
182
rm(depmap_exp)
183
184
ensembl = useMart(
185
  biomart = "ENSEMBL_MART_ENSEMBL",
186
  # host = "feb2014.archive.ensembl.org",
187
  path = "/biomart/martservice",
188
  dataset = "hsapiens_gene_ensembl"
189
)
190
atts <- listAttributes(ensembl, what = "name")
191
atts[grep(pattern = "gene", x = atts, ignore.case = T)]
192
193
cur_project = projects[1]
194
195
for (cur_project in projects) {
196
  cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/Exp/", cur_project, "_GeneExpression.rdata")))
197
  cur_proj_assay <- assays(cur_proj_data)[[1]]
198
  cur_proj_info <- colData(cur_proj_data)
199
  cur_proj_tpm <- GeoTcgaData::fpkmToTpm_matrix(cur_proj_assay)
200
  
201
  # log2 transform with a pseudo-count of 1
202
  cur_proj_tpm <- cur_proj_tpm+1
203
  cur_proj_tpm <- log2(cur_proj_tpm)
204
  
205
  # Convert ENSG ID's to numbers
206
  cur_ensg_numbers <- str_replace(rownames(cur_proj_tpm), "ENSG0*", "")
207
  cur_proj_tpm_dt <- as.data.table(cur_proj_tpm, keep.rownames = T)
208
  cur_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id",
209
                                            "external_gene_name"),
210
                         filters = "ensembl_gene_id", values = cur_proj_tpm_dt$rn,
211
                         mart = ensembl)
212
  cur_dict <- as.data.table(cur_dict)
213
  temp <- merge(cur_proj_tpm_dt, cur_dict, by.x = "rn", by.y = "ensembl_gene_id")
214
  # Find overlap with HGNC symbols in DepMap 
215
  cur_reduced_dt <- temp[external_gene_name %in% depmap_exp_dict$HGNC]
216
  
217
  dup_genes <- cur_reduced_dt[duplicated(cur_reduced_dt$external_gene_name)]$external_gene_name
218
  # Duplicated genes may represent transcripts from the same gene, must consolidate by adding TPM values
219
  cur_reduced_dt$rn <- NULL
220
  cur_reduced_dt <- cur_reduced_dt[, lapply(.SD, sum), by = external_gene_name]
221
  final_dt <- transpose(cur_reduced_dt, keep.names = "external_gene_name")
222
  
223
  colnames(final_dt) <- unname(unlist(final_dt[1,]))
224
  final_dt <- final_dt[!(TSPAN6 == "TSPAN6")]
225
  # Add cancer type
226
  final_dt <- cbind(cancer_type = cur_proj_info$name, final_dt)
227
  colnames(final_dt)[2] <- "tcga_sample_id"
228
  # Sort all the other columns
229
  new_order <- c("tcga_sample_id", "cancer_type", sort(colnames(final_dt)[-c(1:2)]))
230
  setcolorder(final_dt, new_order)
231
  
232
  fwrite(final_dt, paste0("Data/TCGA/Exp/", cur_project, "_Exp.csv"))
233
}
234
235
# Now merge all the data.tables
236
exp_file_paths <- list.files("Data/TCGA/Exp/", full.names = T)
237
exp_dt_list <- vector(mode = "list", length = length(exp_file_paths))
238
239
for (i in 1:length(exp_file_paths)) {
240
  exp_dt_list[[i]] <- fread(exp_file_paths[i])
241
}
242
243
all_exp <- rbindlist(exp_dt_list)
244
all_exp[1:5, 1:5]
245
dim(all_exp)
246
fwrite(all_exp, "Data/TCGA/TCGA_All_Exp_Data.csv")
247
rm(exp_dt_list)
248
rm(all_exp)
249
250
# all_exp <- fread("Data/TCGA/TCGA_All_Exp_Data.csv")
251
# ==== Merge CNV Data ====
252
require(GenomicRanges)
253
# Must find the genomic ranges of genes in DepMap CNV data (ensure reference is the same), then merge using the GRanges package (map onto)
254
dir.create("Data/TCGA/CNV")
255
projects <- TCGAbiolinks:::getGDCprojects()$project_id
256
projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
257
# gene_dict <- fread("Data/gene_dictionary.txt")
258
depmap_cnv <- fread("Data/DepMap/20Q2/CCLE_gene_copy_number.csv")
259
head(colnames(depmap_cnv))
260
depmap_ensg_numbers <- str_replace(colnames(depmap_cnv), ".+\\((.+)\\)", "\\1")
261
depmap_hgnc <- str_replace(colnames(depmap_cnv), "(.+)\\s\\(.+\\)", "\\1")
262
depmap_cnv_dict <- data.table(HGNC = depmap_hgnc, ENSG = depmap_ensg_numbers)
263
264
rm(depmap_cnv)
265
266
ensembl = useMart(
267
  biomart = "ENSEMBL_MART_ENSEMBL",
268
  # host = "feb2014.archive.ensembl.org",
269
  path = "/biomart/martservice",
270
  dataset = "hsapiens_gene_ensembl"
271
)
272
273
cur_translation <- biomaRt::getBM(attributes = c("ensembl_gene_id", "wikigene_name",
274
                                                 "external_gene_name", 'chromosome_name',
275
                                                 'start_position', 'end_position', 'strand'),
276
                           filters = "external_gene_name", values = depmap_cnv_dict$HGNC, uniqueRows = T,
277
                           mart = ensembl)
278
cur_translation <- as.data.table(cur_translation)
279
cur_translation <- cur_translation[!duplicated(cur_translation[,"wikigene_name"]), ]
280
cur_translation <- cur_translation[!duplicated(cur_translation[,"external_gene_name"]), ]
281
282
depmap_cnv_dict_1 <- merge(depmap_cnv_dict, cur_translation, by.x = "HGNC", by.y = "external_gene_name")
283
depmap_cnv_dict_1$wikigene_name <- NULL
284
# unfound <- depmap_cnv_dict$HGNC[!(depmap_cnv_dict$HGNC %in% cur_translation$wikigene_name)]
285
unfound <- depmap_cnv_dict$HGNC[!(depmap_cnv_dict$HGNC %in% cur_translation$external_gene_name)]
286
# Search for the unfound in wikigene info
287
unfound_translation <- biomaRt::getBM(attributes = c("ensembl_gene_id", "wikigene_name",
288
                                                 "external_gene_name", 'chromosome_name',
289
                                                 'start_position', 'end_position', 'strand'),
290
                                  filters = "wikigene_name", values = unfound, uniqueRows = T,
291
                                  mart = ensembl)
292
unfound_translation <- as.data.table(unfound_translation)
293
depmap_cnv_dict_2 <- merge(depmap_cnv_dict, unfound_translation, by.x = "HGNC", by.y = "wikigene_name")
294
depmap_cnv_dict_2$external_gene_name <- NULL
295
296
# Add together
297
# all_translation <- rbindlist(list(cur_translation, unfound_translation))
298
depmap_cnv_dict <- rbindlist(list(depmap_cnv_dict_1, depmap_cnv_dict_2))
299
depmap_cnv_dict$strand[depmap_cnv_dict$strand == -1] <- "-"
300
depmap_cnv_dict$strand[depmap_cnv_dict$strand == 1] <- "+"
301
302
# Convert to GRanges
303
depmap_cnv_granges <- makeGRangesFromDataFrame(df = depmap_cnv_dict, keep.extra.columns = T, seqnames.field = "chromosome_name",
304
                         start.field = "start_position", end.field = "end_position", strand.field = "strand")
305
306
atts <- listAttributes(ensembl, what = "name")
307
atts[grep(pattern = "gene", x = atts, ignore.case = T)]
308
atts[grep(pattern = "chromosome", x = atts, ignore.case = T)]
309
310
cur_project = projects[1]
311
312
rm(list = c("depmap_cnv_dict_1", "depmap_cnv_dict_2"))
313
314
315
for (cur_project in projects) {
316
  cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/CNV/", cur_project, "_CopyNumber.rdata")))
317
  # Convert to GRanges
318
  cur_proj_granges <- makeGRangesFromDataFrame(df = cur_proj_data, keep.extra.columns = T, start.field = "Start",
319
                                               end.field = "End", seqnames.field = "Chromosome")
320
  # Merge with DepMap data
321
  cur_proj_merge <- mergeByOverlaps(query = cur_proj_granges, subject = depmap_cnv_granges)
322
  cur_proj_merge <- cur_proj_merge[, c("Sample", "HGNC", "Segment_Mean")]
323
  cur_proj_merge <- as.data.table(cur_proj_merge)
324
  cur_proj_merge <- unique(cur_proj_merge)
325
  
326
  cur_wide_data <- dcast(cur_proj_merge, Sample ~ HGNC, value.var = "Segment_Mean", fun.aggregate = sum)
327
  
328
  colnames(cur_wide_data)[1] <- "tcga_sample_id"
329
  # Sort all the other columns
330
  new_order <- c("tcga_sample_id", sort(colnames(cur_wide_data)[-1]))
331
  setcolorder(cur_wide_data, new_order)
332
333
  fwrite(cur_wide_data, paste0("Data/TCGA/CNV/", cur_project, "_CNV.csv"))
334
}
335
336
# Now merge all the data.tables
337
cnv_file_paths <- list.files("Data/TCGA/CNV/", full.names = T)
338
cnv_dt_list <- vector(mode = "list", length = length(cnv_file_paths))
339
340
for (i in 1:length(cnv_file_paths)) {
341
  cnv_dt_list[[i]] <- fread(cnv_file_paths[i])
342
}
343
344
all_cnv <- rbindlist(cnv_dt_list)
345
all_cnv[1:5, 1:5]
346
fwrite(all_cnv, "Data/TCGA/TCGA_All_CNV_Data.csv")
347
348
# The first 3 ID's should be enough to indicate tumor type
349
require(stringr)
350
all_proj_info <- fread("Data/TCGA/All_Sample_Info.csv")
351
temp <- str_split_fixed(all_proj_info$barcode, "-", n = 7)[, 1:3]
352
all_proj_info$sub_id <- paste(temp[,1], temp[,2], temp[,3], sep = '-')
353
354
all_cnv <- fread("Data/TCGA/TCGA_All_CNV_Data.csv")
355
temp2 <- str_split_fixed(all_cnv$tcga_sample_id, "-", n = 7)[, 1:4]
356
sample_type <- str_split_fixed(temp2[, 4], "", n=3)[, 1:2]
357
sample_type <- as.integer(paste0(sample_type[,1], sample_type[,2]))
358
all_cnv$sub_id <- paste(temp2[,1], temp2[,2], temp2[,3], sep = '-')
359
360
all_cnv$sample_type <- ifelse(sample_type < 10, yes = "Tumor", no = "Normal")
361
table(all_cnv$sample_type)
362
363
# Subset to Tumor
364
all_cnv <- all_cnv[sample_type == "Tumor"]
365
# add cancer type
366
all_cnv <- merge(all_cnv, all_proj_info[, c("name", "sub_id")], by = "sub_id")
367
table(all_cnv$name)
368
dim(all_cnv)
369
# remove sub_id, sample_type
370
all_cnv$sub_id <- NULL
371
all_cnv$sample_type <- NULL
372
# reorder
373
setcolorder(all_cnv, "name")
374
colnames(all_cnv)[1] <- "cancer_type"
375
all_cnv[1:5, 1:5]
376
377
# Save
378
fwrite(all_cnv, "Data/TCGA/TCGA_All_CNV_Data.csv")
379
380
dim(all_cnv)
381
382
rm(cnv_dt_list)
383
rm(all_cnv)
384
385
386
# ==== Merge Mut Data ====
387
# dir.create("Data/TCGA/Mut")
388
# projects <- TCGAbiolinks:::getGDCprojects()$project_id
389
# projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
390
# # gene_dict <- fread("Data/gene_dictionary.txt")
391
# depmap_mut <- fread("Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv")
392
# depmap_ensg_numbers <- str_replace(colnames(depmap_mut), ".+\\((.+)\\)", "\\1")
393
# depmap_hgnc <- str_replace(colnames(depmap_mut), "(.+)\\s\\(.+\\)", "\\1")
394
# depmap_mut_dict <- data.table(HGNC = depmap_hgnc, ENSG = depmap_ensg_numbers)
395
# 
396
# rm(depmap_mut)
397
# 
398
# ensembl = useMart(
399
#   biomart = "ENSEMBL_MART_ENSEMBL",
400
#   # host = "feb2014.archive.ensembl.org",
401
#   path = "/biomart/martservice",
402
#   dataset = "hsapiens_gene_ensembl"
403
# )
404
# atts <- listAttributes(ensembl, what = "name")
405
# 
406
# # cur_project = projects[1]
407
# 
408
# for (cur_project in projects) {
409
#   cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/Mut/", cur_project, "_VarScan2_MAF.rdata")))
410
#   cur_sub <- cur_proj_data[, c("Hugo_Symbol", "Variant_Classification", "Variant_Type", "Tumor_Sample_Barcode")]
411
#   unique(cur_sub$Hugo_Symbol)
412
#   # Find overlap with DepMap
413
#   sum(depmap_mut_dict$HGNC %in% unique(cur_sub$Hugo_Symbol))
414
#   
415
#   
416
#   cur_proj_assay <- assays(cur_proj_data)[[1]]
417
#   # cur_proj_assay[1:5, 1:5]
418
#   # class(cur_proj_assay)
419
#   cur_proj_tpm <- GeoTcgaData::fpkmToTpm_matrix(cur_proj_assay)
420
#   # Convert ENSG ID's to numbers
421
#   cur_ensg_numbers <- str_replace(rownames(cur_proj_tpm), "ENSG0*", "")
422
#   cur_proj_tpm_dt <- as.data.table(cur_proj_tpm, keep.rownames = T)
423
#   # cur_proj_tpm_dt[1:5, 1:5]
424
#   # cur_proj_tpm_dt$rn <- str_replace(cur_proj_tpm_dt$rn, "ENSG0*", "")
425
#   cur_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id",
426
#                                             "external_gene_name"),
427
#                              filters = "ensembl_gene_id", values = cur_proj_tpm_dt$rn,
428
#                              mart = ensembl)
429
#   cur_dict <- as.data.table(cur_dict)
430
#   temp <- merge(cur_proj_tpm_dt, cur_dict, by.x = "rn", by.y = "ensembl_gene_id")
431
#   # dim(temp)
432
#   # Find overlap with HGNC symbols in DepMap 
433
#   cur_reduced_dt <- temp[external_gene_name %in% depmap_exp_dict$HGNC]
434
#   # dim(cur_reduced_dt)
435
#   
436
#   # sum(duplicated(cur_reduced_dt$external_gene_name))
437
#   dup_genes <- cur_reduced_dt[duplicated(cur_reduced_dt$external_gene_name)]$external_gene_name
438
#   # Duplicated genes may represent transcripts from the same gene, must consolidate by adding TPM values
439
#   cur_reduced_dt$rn <- NULL
440
#   cur_reduced_dt <- cur_reduced_dt[, lapply(.SD, sum), by = external_gene_name]
441
#   final_dt <- transpose(cur_reduced_dt, keep.names = "external_gene_name")
442
#   colnames(final_dt) <- unname(unlist(final_dt[1,]))
443
#   # final_dt[1:5, 1:5]
444
#   final_dt <- final_dt[!(TSPAN6 == "TSPAN6")]
445
#   # final_dt[1:5, 1:5]
446
#   # dim(final_dt)
447
#   
448
#   # temp1[external_gene_name %in% dup_genes][,c("external_gene_name", "TCGA-FI-A2D5-01A-11R-A17B-07")]
449
#   fwrite(final_dt, paste0("Data/TCGA/Exp/", cur_project, "_Exp.csv"))
450
# }
451
# 
452
# # Now merge all the data.tables
453
# exp_file_paths <- list.files("Data/TCGA/Exp/", full.names = T)
454
# exp_dt_list <- vector(mode = "list", length = length(exp_file_paths))
455
# 
456
# for (i in 1:length(exp_file_paths)) {
457
#   exp_dt_list[[i]] <- fread(exp_file_paths[i])
458
# }
459
# 
460
# all_exp <- rbindlist(exp_dt_list)
461
# fwrite(all_exp, "Data/TCGA/TCGA_All_Exp_Data.csv")
462
# 
463
464
465
# ==== Sort and Clean Data ====
466
467
# ==== Match TCGA and DepMap Exp Data ====
468
require(data.table)
469
exp_tcga <- fread("Data/TCGA/TCGA_All_Exp_Data.csv")
470
exp_tcga[1:5, 1:5]
471
exp_depmap <- fread("Data/DRP_Training_Data/DepMap_21Q2_Expression.csv")
472
exp_depmap[1:5, 1:5]
473
dim(exp_tcga)
474
dim(exp_depmap)
475
476
# Find shared columns:
477
shared_exp <- intersect(colnames(exp_depmap), colnames(exp_tcga))
478
# Sort columns
479
shared_exp <- sort(shared_exp)
480
shared_exp <- c("tcga_sample_id", "cancer_type", shared_exp)
481
exp_tcga <- exp_tcga[, ..shared_exp]
482
colnames(exp_tcga)[2] <- "primary_disease"
483
484
exp_tcga[1:5, 1:5]
485
dim(exp_tcga)
486
487
# Save TCGA pre-training data
488
fwrite(exp_tcga, "Data/DRP_Training_Data/TCGA_PreTraining_Expression.csv")
489
490
# Subset DepMap data to what's available in TCGA
491
shared_exp <- unique(c("stripped_cell_line_name", "primary_disease", sort(intersect(colnames(exp_depmap), colnames(exp_tcga)))))
492
exp_depmap <- exp_depmap[, ..shared_exp]
493
exp_depmap[1:5, 1:5]
494
dim(exp_depmap)
495
# Save DepMap training data
496
fwrite(exp_depmap, "Data/DRP_Training_Data/DepMap_21Q2_Training_Expression.csv")
497
498
rm(exp_depmap)
499
rm(exp_tcga)
500
# exp_tcga <- fread("Data/DRP_Training_Data/TCGA_PreTraining_Expression.csv")
501
# exp_depmap <- fread("Data/DRP_Training_Data/DepMap_20Q2_Training_Expression.csv")
502
# exp_tcga[1:5, 1:5]
503
# exp_depmap[1:5, 1:5]
504
505
# ==== Match TCGA and DepMap CNV Data ====
506
cnv_tcga <- fread("Data/TCGA/TCGA_All_CNV_Data.csv")
507
cnv_tcga[1:5, 1:5]
508
cnv_depmap <- fread("Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv")
509
cnv_depmap[1:5, 1:5]
510
dim(cnv_depmap)
511
dim(cnv_tcga)
512
513
shared_cnv <- intersect(colnames(cnv_depmap), colnames(cnv_tcga))
514
# Sort columns
515
shared_cnv <- sort(shared_cnv)
516
shared_cnv <- c("tcga_sample_id", "cancer_type", shared_cnv)
517
cnv_tcga <- cnv_tcga[, ..shared_cnv]
518
colnames(cnv_tcga)[2] <- "primary_disease"
519
520
cnv_tcga[1:5, 1:5]
521
522
# Must add pseudocounts to log2 ratios of TCGA data
523
temp <- cnv_tcga[, -c(1:2)][, lapply(.SD, function(x) {log2(2**x + 1)})]
524
temp[1:5, 1:5]
525
temp$tcga_sample_id <- cnv_tcga$tcga_sample_id
526
temp$primary_disease <- cnv_tcga$primary_disease
527
setcolorder(temp, c("tcga_sample_id", "primary_disease"))
528
temp[1:5, 1:5]
529
dim(temp)
530
531
fwrite(temp, "Data/DRP_Training_Data/TCGA_PreTraining_CopyNumber.csv")
532
533
534
# Subset DepMap data to what's available in TCGA
535
cnv_depmap[1:5, 1:5]
536
shared_cnv <- unique(c("stripped_cell_line_name", "primary_disease", sort(intersect(colnames(cnv_depmap), colnames(cnv_tcga)))))
537
cnv_depmap <- cnv_depmap[, ..shared_cnv]
538
539
cnv_depmap[1:5, 1:5]
540
dim(cnv_depmap)
541
# Save DepMap training data
542
fwrite(cnv_depmap, "Data/DRP_Training_Data/DepMap_21Q2_Training_CopyNumber.csv")
543
544
rm(cnv_depmap)
545
rm(cnv_tcga)
546
gc()