|
a |
|
b/R/TCGA_Data_Preparation.R |
|
|
1 |
# TCGA_Data_Preparation.R |
|
|
2 |
|
|
|
3 |
getGDCprojects() |
|
|
4 |
|
|
|
5 |
# Download_GDC.R |
|
|
6 |
|
|
|
7 |
# ==== Preperation ==== |
|
|
8 |
if (!require(SummarizedExperiment)) { |
|
|
9 |
install.packages("SummarizedExperiment") |
|
|
10 |
library(SummarizedExperiment) |
|
|
11 |
} |
|
|
12 |
if (!require(parallel)) { |
|
|
13 |
install.packages("parallel") |
|
|
14 |
library(parallel) |
|
|
15 |
} |
|
|
16 |
if (!require(data.table)) { |
|
|
17 |
install.packages("data.table") |
|
|
18 |
library(data.table) |
|
|
19 |
} |
|
|
20 |
# Note that on Linux, libmariadb-client-lgpl-dev must be installed |
|
|
21 |
# This allows for configuration of RMySQL |
|
|
22 |
if (!require(TCGAbiolinks)) { |
|
|
23 |
if (!requireNamespace("BiocManager", quietly = TRUE)) |
|
|
24 |
install.packages("BiocManager") |
|
|
25 |
BiocManager::install("TCGAbiolinks") |
|
|
26 |
require(TCGAbiolinks) |
|
|
27 |
??TCGAbiolinks |
|
|
28 |
|
|
|
29 |
library(TCGAbiolinks) |
|
|
30 |
} |
|
|
31 |
|
|
|
32 |
getGDCInfo() |
|
|
33 |
getGDCprojects() |
|
|
34 |
getProjectSummary("TCGA-ACC") |
|
|
35 |
# Data Release 28, February 02, 2021 |
|
|
36 |
# Move all data to a single directory later on |
|
|
37 |
dir.create(("Data/TCGA")) |
|
|
38 |
dir.create("Data/TCGA/SummarizedExperiments") |
|
|
39 |
dir.create("Data/TCGA/SummarizedExperiments/Mut") |
|
|
40 |
dir.create("Data/TCGA/SummarizedExperiments/CNV") |
|
|
41 |
dir.create("Data/TCGA/SummarizedExperiments/Exp") |
|
|
42 |
|
|
|
43 |
projects <- TCGAbiolinks:::getGDCprojects()$project_id |
|
|
44 |
projects <- projects[grepl('^TCGA', projects, perl=TRUE)] |
|
|
45 |
|
|
|
46 |
|
|
|
47 |
# ==== Download Mut data ==== |
|
|
48 |
plyr::alply(sort(projects), 1, function(cur_project) { |
|
|
49 |
tryCatch( |
|
|
50 |
query <- GDCquery( |
|
|
51 |
project = cur_project, |
|
|
52 |
data.category = "Simple Nucleotide Variation", |
|
|
53 |
data.type = "Masked Somatic Mutation", |
|
|
54 |
workflow.type = 'VarScan2 Variant Aggregation and Masking' |
|
|
55 |
) |
|
|
56 |
) |
|
|
57 |
query <- GDCquery( |
|
|
58 |
project = cur_project, |
|
|
59 |
data.category = "Simple Nucleotide Variation", |
|
|
60 |
data.type = "Masked Somatic Mutation", |
|
|
61 |
workflow.type = 'VarScan2 Variant Aggregation and Masking' |
|
|
62 |
) |
|
|
63 |
|
|
|
64 |
GDCdownload(query, files.per.chunk = 10, method = "client") |
|
|
65 |
GDCprepare(query, |
|
|
66 |
save = T, |
|
|
67 |
save.filename = paste0("Data/TCGA/SummarizedExperiments/Mut/", cur_project, |
|
|
68 |
"_VarScan2_MAF.rdata") |
|
|
69 |
) |
|
|
70 |
}) |
|
|
71 |
|
|
|
72 |
# ==== Download CNV data ==== |
|
|
73 |
plyr::alply(sort(projects)[-(1:14)], 1, function(cur_project) { |
|
|
74 |
tryCatch( |
|
|
75 |
query <- GDCquery( |
|
|
76 |
project = cur_project, |
|
|
77 |
data.category = "Copy Number Variation", |
|
|
78 |
data.type = "Masked Copy Number Segment", |
|
|
79 |
) |
|
|
80 |
) |
|
|
81 |
query <- GDCquery( |
|
|
82 |
project = cur_project, |
|
|
83 |
data.category = "Copy Number Variation", |
|
|
84 |
data.type = "Masked Copy Number Segment", |
|
|
85 |
) |
|
|
86 |
GDCdownload(query, files.per.chunk = 10) |
|
|
87 |
GDCprepare(query, |
|
|
88 |
save = T, |
|
|
89 |
save.filename = paste0("Data/TCGA/SummarizedExperiments/CNV/", cur_project, |
|
|
90 |
"_CopyNumber.rdata")) |
|
|
91 |
# cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/CNV/", cur_project, "_CopyNumber.rdata"))) |
|
|
92 |
|
|
|
93 |
}) |
|
|
94 |
|
|
|
95 |
# ==== Download clinical CNV data ==== |
|
|
96 |
|
|
|
97 |
dir.create("Data/TCGA/Clinical") |
|
|
98 |
dir.create("Data/TCGA/Clinical/CNV") |
|
|
99 |
plyr::alply(sort(projects)[-(1:14)], 1, function(cur_project) { |
|
|
100 |
tryCatch( |
|
|
101 |
query.biospecimen <- GDCquery(project = cur_project, |
|
|
102 |
data.category = "Biospecimen", |
|
|
103 |
data.type = "Biospecimen Supplement", |
|
|
104 |
data.format = "BCR Biotab") |
|
|
105 |
) |
|
|
106 |
query.biospecimen <- GDCquery(project = cur_project, |
|
|
107 |
data.category = "Biospecimen", |
|
|
108 |
data.type = "Biospecimen Supplement", |
|
|
109 |
data.format = "BCR Biotab") |
|
|
110 |
GDCdownload(query.biospecimen, files.per.chunk = 10) |
|
|
111 |
GDCprepare(query.biospecimen, |
|
|
112 |
save = T, |
|
|
113 |
save.filename = paste0("Data/TCGA/Clinical/CNV/", cur_project, |
|
|
114 |
"_CopyNumber_Clinical.rdata")) |
|
|
115 |
|
|
|
116 |
}) |
|
|
117 |
|
|
|
118 |
# ==== Download gene expression data ==== |
|
|
119 |
plyr::alply(projects, 1, function(cur_project) { |
|
|
120 |
tryCatch( |
|
|
121 |
query <- GDCquery( |
|
|
122 |
project = cur_project, |
|
|
123 |
data.category = "Transcriptome Profiling", |
|
|
124 |
data.type = "Gene Expression Quantification", |
|
|
125 |
workflow.type = "HTSeq - Counts" |
|
|
126 |
) |
|
|
127 |
) |
|
|
128 |
query <- GDCquery( |
|
|
129 |
project = cur_project, |
|
|
130 |
data.category = "Transcriptome Profiling", |
|
|
131 |
data.type = "Gene Expression Quantification", |
|
|
132 |
workflow.type = "HTSeq - Counts" |
|
|
133 |
) |
|
|
134 |
GDCdownload(query, files.per.chunk = 10, method = "client") |
|
|
135 |
GDCprepare( |
|
|
136 |
query, |
|
|
137 |
save = T, |
|
|
138 |
save.filename = paste0("Data/TCGA/SummarizedExperiments/Exp/", cur_project, "_GeneExpression.rdata") |
|
|
139 |
) |
|
|
140 |
|
|
|
141 |
}) |
|
|
142 |
|
|
|
143 |
|
|
|
144 |
# ===================================== |
|
|
145 |
# For each data type, combine all projects into a single file |
|
|
146 |
require(data.table) |
|
|
147 |
require(TCGAbiolinks) |
|
|
148 |
require(SummarizedExperiment) |
|
|
149 |
require(GeoTcgaData) |
|
|
150 |
require(stringr) |
|
|
151 |
require(biomaRt) |
|
|
152 |
require(curl) |
|
|
153 |
|
|
|
154 |
# Get clinical info from Gene Expression data ==== |
|
|
155 |
all_proj_info <- vector(mode = "list", length = length(projects)) |
|
|
156 |
i <- 1 |
|
|
157 |
for (cur_project in projects) { |
|
|
158 |
cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/Exp/", cur_project, "_GeneExpression.rdata"))) |
|
|
159 |
cur_proj_assay <- assays(cur_proj_data)[[1]] |
|
|
160 |
cur_proj_info <- colData(cur_proj_data) |
|
|
161 |
all_proj_info[[i]] <- as.data.table(cur_proj_info[, c("barcode", "name")]) |
|
|
162 |
i <- i + 1 |
|
|
163 |
} |
|
|
164 |
|
|
|
165 |
all_proj_info <- rbindlist(all_proj_info) |
|
|
166 |
# Save |
|
|
167 |
fwrite(all_proj_info, "Data/TCGA/All_Sample_Info.csv") |
|
|
168 |
|
|
|
169 |
|
|
|
170 |
# ==== Merge Exp Data ==== |
|
|
171 |
require(data.table) |
|
|
172 |
require(biomaRt) |
|
|
173 |
dir.create("Data/TCGA/Exp") |
|
|
174 |
projects <- TCGAbiolinks:::getGDCprojects()$project_id |
|
|
175 |
projects <- projects[grepl('^TCGA', projects, perl=TRUE)] |
|
|
176 |
# gene_dict <- fread("Data/gene_dictionary.txt") |
|
|
177 |
depmap_exp <- fread("Data/DepMap/20Q2/CCLE_expression.csv") |
|
|
178 |
depmap_ensg_numbers <- str_replace(colnames(depmap_exp), ".+\\((.+)\\)", "\\1") |
|
|
179 |
depmap_hgnc <- str_replace(colnames(depmap_exp), "(.+)\\s\\(.+\\)", "\\1") |
|
|
180 |
depmap_exp_dict <- data.table(HGNC = depmap_hgnc, ENSG = depmap_ensg_numbers) |
|
|
181 |
|
|
|
182 |
rm(depmap_exp) |
|
|
183 |
|
|
|
184 |
ensembl = useMart( |
|
|
185 |
biomart = "ENSEMBL_MART_ENSEMBL", |
|
|
186 |
# host = "feb2014.archive.ensembl.org", |
|
|
187 |
path = "/biomart/martservice", |
|
|
188 |
dataset = "hsapiens_gene_ensembl" |
|
|
189 |
) |
|
|
190 |
atts <- listAttributes(ensembl, what = "name") |
|
|
191 |
atts[grep(pattern = "gene", x = atts, ignore.case = T)] |
|
|
192 |
|
|
|
193 |
cur_project = projects[1] |
|
|
194 |
|
|
|
195 |
for (cur_project in projects) { |
|
|
196 |
cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/Exp/", cur_project, "_GeneExpression.rdata"))) |
|
|
197 |
cur_proj_assay <- assays(cur_proj_data)[[1]] |
|
|
198 |
cur_proj_info <- colData(cur_proj_data) |
|
|
199 |
cur_proj_tpm <- GeoTcgaData::fpkmToTpm_matrix(cur_proj_assay) |
|
|
200 |
|
|
|
201 |
# log2 transform with a pseudo-count of 1 |
|
|
202 |
cur_proj_tpm <- cur_proj_tpm+1 |
|
|
203 |
cur_proj_tpm <- log2(cur_proj_tpm) |
|
|
204 |
|
|
|
205 |
# Convert ENSG ID's to numbers |
|
|
206 |
cur_ensg_numbers <- str_replace(rownames(cur_proj_tpm), "ENSG0*", "") |
|
|
207 |
cur_proj_tpm_dt <- as.data.table(cur_proj_tpm, keep.rownames = T) |
|
|
208 |
cur_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id", |
|
|
209 |
"external_gene_name"), |
|
|
210 |
filters = "ensembl_gene_id", values = cur_proj_tpm_dt$rn, |
|
|
211 |
mart = ensembl) |
|
|
212 |
cur_dict <- as.data.table(cur_dict) |
|
|
213 |
temp <- merge(cur_proj_tpm_dt, cur_dict, by.x = "rn", by.y = "ensembl_gene_id") |
|
|
214 |
# Find overlap with HGNC symbols in DepMap |
|
|
215 |
cur_reduced_dt <- temp[external_gene_name %in% depmap_exp_dict$HGNC] |
|
|
216 |
|
|
|
217 |
dup_genes <- cur_reduced_dt[duplicated(cur_reduced_dt$external_gene_name)]$external_gene_name |
|
|
218 |
# Duplicated genes may represent transcripts from the same gene, must consolidate by adding TPM values |
|
|
219 |
cur_reduced_dt$rn <- NULL |
|
|
220 |
cur_reduced_dt <- cur_reduced_dt[, lapply(.SD, sum), by = external_gene_name] |
|
|
221 |
final_dt <- transpose(cur_reduced_dt, keep.names = "external_gene_name") |
|
|
222 |
|
|
|
223 |
colnames(final_dt) <- unname(unlist(final_dt[1,])) |
|
|
224 |
final_dt <- final_dt[!(TSPAN6 == "TSPAN6")] |
|
|
225 |
# Add cancer type |
|
|
226 |
final_dt <- cbind(cancer_type = cur_proj_info$name, final_dt) |
|
|
227 |
colnames(final_dt)[2] <- "tcga_sample_id" |
|
|
228 |
# Sort all the other columns |
|
|
229 |
new_order <- c("tcga_sample_id", "cancer_type", sort(colnames(final_dt)[-c(1:2)])) |
|
|
230 |
setcolorder(final_dt, new_order) |
|
|
231 |
|
|
|
232 |
fwrite(final_dt, paste0("Data/TCGA/Exp/", cur_project, "_Exp.csv")) |
|
|
233 |
} |
|
|
234 |
|
|
|
235 |
# Now merge all the data.tables |
|
|
236 |
exp_file_paths <- list.files("Data/TCGA/Exp/", full.names = T) |
|
|
237 |
exp_dt_list <- vector(mode = "list", length = length(exp_file_paths)) |
|
|
238 |
|
|
|
239 |
for (i in 1:length(exp_file_paths)) { |
|
|
240 |
exp_dt_list[[i]] <- fread(exp_file_paths[i]) |
|
|
241 |
} |
|
|
242 |
|
|
|
243 |
all_exp <- rbindlist(exp_dt_list) |
|
|
244 |
all_exp[1:5, 1:5] |
|
|
245 |
dim(all_exp) |
|
|
246 |
fwrite(all_exp, "Data/TCGA/TCGA_All_Exp_Data.csv") |
|
|
247 |
rm(exp_dt_list) |
|
|
248 |
rm(all_exp) |
|
|
249 |
|
|
|
250 |
# all_exp <- fread("Data/TCGA/TCGA_All_Exp_Data.csv") |
|
|
251 |
# ==== Merge CNV Data ==== |
|
|
252 |
require(GenomicRanges) |
|
|
253 |
# Must find the genomic ranges of genes in DepMap CNV data (ensure reference is the same), then merge using the GRanges package (map onto) |
|
|
254 |
dir.create("Data/TCGA/CNV") |
|
|
255 |
projects <- TCGAbiolinks:::getGDCprojects()$project_id |
|
|
256 |
projects <- projects[grepl('^TCGA', projects, perl=TRUE)] |
|
|
257 |
# gene_dict <- fread("Data/gene_dictionary.txt") |
|
|
258 |
depmap_cnv <- fread("Data/DepMap/20Q2/CCLE_gene_copy_number.csv") |
|
|
259 |
head(colnames(depmap_cnv)) |
|
|
260 |
depmap_ensg_numbers <- str_replace(colnames(depmap_cnv), ".+\\((.+)\\)", "\\1") |
|
|
261 |
depmap_hgnc <- str_replace(colnames(depmap_cnv), "(.+)\\s\\(.+\\)", "\\1") |
|
|
262 |
depmap_cnv_dict <- data.table(HGNC = depmap_hgnc, ENSG = depmap_ensg_numbers) |
|
|
263 |
|
|
|
264 |
rm(depmap_cnv) |
|
|
265 |
|
|
|
266 |
ensembl = useMart( |
|
|
267 |
biomart = "ENSEMBL_MART_ENSEMBL", |
|
|
268 |
# host = "feb2014.archive.ensembl.org", |
|
|
269 |
path = "/biomart/martservice", |
|
|
270 |
dataset = "hsapiens_gene_ensembl" |
|
|
271 |
) |
|
|
272 |
|
|
|
273 |
cur_translation <- biomaRt::getBM(attributes = c("ensembl_gene_id", "wikigene_name", |
|
|
274 |
"external_gene_name", 'chromosome_name', |
|
|
275 |
'start_position', 'end_position', 'strand'), |
|
|
276 |
filters = "external_gene_name", values = depmap_cnv_dict$HGNC, uniqueRows = T, |
|
|
277 |
mart = ensembl) |
|
|
278 |
cur_translation <- as.data.table(cur_translation) |
|
|
279 |
cur_translation <- cur_translation[!duplicated(cur_translation[,"wikigene_name"]), ] |
|
|
280 |
cur_translation <- cur_translation[!duplicated(cur_translation[,"external_gene_name"]), ] |
|
|
281 |
|
|
|
282 |
depmap_cnv_dict_1 <- merge(depmap_cnv_dict, cur_translation, by.x = "HGNC", by.y = "external_gene_name") |
|
|
283 |
depmap_cnv_dict_1$wikigene_name <- NULL |
|
|
284 |
# unfound <- depmap_cnv_dict$HGNC[!(depmap_cnv_dict$HGNC %in% cur_translation$wikigene_name)] |
|
|
285 |
unfound <- depmap_cnv_dict$HGNC[!(depmap_cnv_dict$HGNC %in% cur_translation$external_gene_name)] |
|
|
286 |
# Search for the unfound in wikigene info |
|
|
287 |
unfound_translation <- biomaRt::getBM(attributes = c("ensembl_gene_id", "wikigene_name", |
|
|
288 |
"external_gene_name", 'chromosome_name', |
|
|
289 |
'start_position', 'end_position', 'strand'), |
|
|
290 |
filters = "wikigene_name", values = unfound, uniqueRows = T, |
|
|
291 |
mart = ensembl) |
|
|
292 |
unfound_translation <- as.data.table(unfound_translation) |
|
|
293 |
depmap_cnv_dict_2 <- merge(depmap_cnv_dict, unfound_translation, by.x = "HGNC", by.y = "wikigene_name") |
|
|
294 |
depmap_cnv_dict_2$external_gene_name <- NULL |
|
|
295 |
|
|
|
296 |
# Add together |
|
|
297 |
# all_translation <- rbindlist(list(cur_translation, unfound_translation)) |
|
|
298 |
depmap_cnv_dict <- rbindlist(list(depmap_cnv_dict_1, depmap_cnv_dict_2)) |
|
|
299 |
depmap_cnv_dict$strand[depmap_cnv_dict$strand == -1] <- "-" |
|
|
300 |
depmap_cnv_dict$strand[depmap_cnv_dict$strand == 1] <- "+" |
|
|
301 |
|
|
|
302 |
# Convert to GRanges |
|
|
303 |
depmap_cnv_granges <- makeGRangesFromDataFrame(df = depmap_cnv_dict, keep.extra.columns = T, seqnames.field = "chromosome_name", |
|
|
304 |
start.field = "start_position", end.field = "end_position", strand.field = "strand") |
|
|
305 |
|
|
|
306 |
atts <- listAttributes(ensembl, what = "name") |
|
|
307 |
atts[grep(pattern = "gene", x = atts, ignore.case = T)] |
|
|
308 |
atts[grep(pattern = "chromosome", x = atts, ignore.case = T)] |
|
|
309 |
|
|
|
310 |
cur_project = projects[1] |
|
|
311 |
|
|
|
312 |
rm(list = c("depmap_cnv_dict_1", "depmap_cnv_dict_2")) |
|
|
313 |
|
|
|
314 |
|
|
|
315 |
for (cur_project in projects) { |
|
|
316 |
cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/CNV/", cur_project, "_CopyNumber.rdata"))) |
|
|
317 |
# Convert to GRanges |
|
|
318 |
cur_proj_granges <- makeGRangesFromDataFrame(df = cur_proj_data, keep.extra.columns = T, start.field = "Start", |
|
|
319 |
end.field = "End", seqnames.field = "Chromosome") |
|
|
320 |
# Merge with DepMap data |
|
|
321 |
cur_proj_merge <- mergeByOverlaps(query = cur_proj_granges, subject = depmap_cnv_granges) |
|
|
322 |
cur_proj_merge <- cur_proj_merge[, c("Sample", "HGNC", "Segment_Mean")] |
|
|
323 |
cur_proj_merge <- as.data.table(cur_proj_merge) |
|
|
324 |
cur_proj_merge <- unique(cur_proj_merge) |
|
|
325 |
|
|
|
326 |
cur_wide_data <- dcast(cur_proj_merge, Sample ~ HGNC, value.var = "Segment_Mean", fun.aggregate = sum) |
|
|
327 |
|
|
|
328 |
colnames(cur_wide_data)[1] <- "tcga_sample_id" |
|
|
329 |
# Sort all the other columns |
|
|
330 |
new_order <- c("tcga_sample_id", sort(colnames(cur_wide_data)[-1])) |
|
|
331 |
setcolorder(cur_wide_data, new_order) |
|
|
332 |
|
|
|
333 |
fwrite(cur_wide_data, paste0("Data/TCGA/CNV/", cur_project, "_CNV.csv")) |
|
|
334 |
} |
|
|
335 |
|
|
|
336 |
# Now merge all the data.tables |
|
|
337 |
cnv_file_paths <- list.files("Data/TCGA/CNV/", full.names = T) |
|
|
338 |
cnv_dt_list <- vector(mode = "list", length = length(cnv_file_paths)) |
|
|
339 |
|
|
|
340 |
for (i in 1:length(cnv_file_paths)) { |
|
|
341 |
cnv_dt_list[[i]] <- fread(cnv_file_paths[i]) |
|
|
342 |
} |
|
|
343 |
|
|
|
344 |
all_cnv <- rbindlist(cnv_dt_list) |
|
|
345 |
all_cnv[1:5, 1:5] |
|
|
346 |
fwrite(all_cnv, "Data/TCGA/TCGA_All_CNV_Data.csv") |
|
|
347 |
|
|
|
348 |
# The first 3 ID's should be enough to indicate tumor type |
|
|
349 |
require(stringr) |
|
|
350 |
all_proj_info <- fread("Data/TCGA/All_Sample_Info.csv") |
|
|
351 |
temp <- str_split_fixed(all_proj_info$barcode, "-", n = 7)[, 1:3] |
|
|
352 |
all_proj_info$sub_id <- paste(temp[,1], temp[,2], temp[,3], sep = '-') |
|
|
353 |
|
|
|
354 |
all_cnv <- fread("Data/TCGA/TCGA_All_CNV_Data.csv") |
|
|
355 |
temp2 <- str_split_fixed(all_cnv$tcga_sample_id, "-", n = 7)[, 1:4] |
|
|
356 |
sample_type <- str_split_fixed(temp2[, 4], "", n=3)[, 1:2] |
|
|
357 |
sample_type <- as.integer(paste0(sample_type[,1], sample_type[,2])) |
|
|
358 |
all_cnv$sub_id <- paste(temp2[,1], temp2[,2], temp2[,3], sep = '-') |
|
|
359 |
|
|
|
360 |
all_cnv$sample_type <- ifelse(sample_type < 10, yes = "Tumor", no = "Normal") |
|
|
361 |
table(all_cnv$sample_type) |
|
|
362 |
|
|
|
363 |
# Subset to Tumor |
|
|
364 |
all_cnv <- all_cnv[sample_type == "Tumor"] |
|
|
365 |
# add cancer type |
|
|
366 |
all_cnv <- merge(all_cnv, all_proj_info[, c("name", "sub_id")], by = "sub_id") |
|
|
367 |
table(all_cnv$name) |
|
|
368 |
dim(all_cnv) |
|
|
369 |
# remove sub_id, sample_type |
|
|
370 |
all_cnv$sub_id <- NULL |
|
|
371 |
all_cnv$sample_type <- NULL |
|
|
372 |
# reorder |
|
|
373 |
setcolorder(all_cnv, "name") |
|
|
374 |
colnames(all_cnv)[1] <- "cancer_type" |
|
|
375 |
all_cnv[1:5, 1:5] |
|
|
376 |
|
|
|
377 |
# Save |
|
|
378 |
fwrite(all_cnv, "Data/TCGA/TCGA_All_CNV_Data.csv") |
|
|
379 |
|
|
|
380 |
dim(all_cnv) |
|
|
381 |
|
|
|
382 |
rm(cnv_dt_list) |
|
|
383 |
rm(all_cnv) |
|
|
384 |
|
|
|
385 |
|
|
|
386 |
# ==== Merge Mut Data ==== |
|
|
387 |
# dir.create("Data/TCGA/Mut") |
|
|
388 |
# projects <- TCGAbiolinks:::getGDCprojects()$project_id |
|
|
389 |
# projects <- projects[grepl('^TCGA', projects, perl=TRUE)] |
|
|
390 |
# # gene_dict <- fread("Data/gene_dictionary.txt") |
|
|
391 |
# depmap_mut <- fread("Data/DRP_Training_Data/DepMap_20Q2_CGC_Mutations_by_Cell.csv") |
|
|
392 |
# depmap_ensg_numbers <- str_replace(colnames(depmap_mut), ".+\\((.+)\\)", "\\1") |
|
|
393 |
# depmap_hgnc <- str_replace(colnames(depmap_mut), "(.+)\\s\\(.+\\)", "\\1") |
|
|
394 |
# depmap_mut_dict <- data.table(HGNC = depmap_hgnc, ENSG = depmap_ensg_numbers) |
|
|
395 |
# |
|
|
396 |
# rm(depmap_mut) |
|
|
397 |
# |
|
|
398 |
# ensembl = useMart( |
|
|
399 |
# biomart = "ENSEMBL_MART_ENSEMBL", |
|
|
400 |
# # host = "feb2014.archive.ensembl.org", |
|
|
401 |
# path = "/biomart/martservice", |
|
|
402 |
# dataset = "hsapiens_gene_ensembl" |
|
|
403 |
# ) |
|
|
404 |
# atts <- listAttributes(ensembl, what = "name") |
|
|
405 |
# |
|
|
406 |
# # cur_project = projects[1] |
|
|
407 |
# |
|
|
408 |
# for (cur_project in projects) { |
|
|
409 |
# cur_proj_data <- get(load(paste0("Data/TCGA/SummarizedExperiments/Mut/", cur_project, "_VarScan2_MAF.rdata"))) |
|
|
410 |
# cur_sub <- cur_proj_data[, c("Hugo_Symbol", "Variant_Classification", "Variant_Type", "Tumor_Sample_Barcode")] |
|
|
411 |
# unique(cur_sub$Hugo_Symbol) |
|
|
412 |
# # Find overlap with DepMap |
|
|
413 |
# sum(depmap_mut_dict$HGNC %in% unique(cur_sub$Hugo_Symbol)) |
|
|
414 |
# |
|
|
415 |
# |
|
|
416 |
# cur_proj_assay <- assays(cur_proj_data)[[1]] |
|
|
417 |
# # cur_proj_assay[1:5, 1:5] |
|
|
418 |
# # class(cur_proj_assay) |
|
|
419 |
# cur_proj_tpm <- GeoTcgaData::fpkmToTpm_matrix(cur_proj_assay) |
|
|
420 |
# # Convert ENSG ID's to numbers |
|
|
421 |
# cur_ensg_numbers <- str_replace(rownames(cur_proj_tpm), "ENSG0*", "") |
|
|
422 |
# cur_proj_tpm_dt <- as.data.table(cur_proj_tpm, keep.rownames = T) |
|
|
423 |
# # cur_proj_tpm_dt[1:5, 1:5] |
|
|
424 |
# # cur_proj_tpm_dt$rn <- str_replace(cur_proj_tpm_dt$rn, "ENSG0*", "") |
|
|
425 |
# cur_dict <- biomaRt::getBM(attributes = c("ensembl_gene_id", |
|
|
426 |
# "external_gene_name"), |
|
|
427 |
# filters = "ensembl_gene_id", values = cur_proj_tpm_dt$rn, |
|
|
428 |
# mart = ensembl) |
|
|
429 |
# cur_dict <- as.data.table(cur_dict) |
|
|
430 |
# temp <- merge(cur_proj_tpm_dt, cur_dict, by.x = "rn", by.y = "ensembl_gene_id") |
|
|
431 |
# # dim(temp) |
|
|
432 |
# # Find overlap with HGNC symbols in DepMap |
|
|
433 |
# cur_reduced_dt <- temp[external_gene_name %in% depmap_exp_dict$HGNC] |
|
|
434 |
# # dim(cur_reduced_dt) |
|
|
435 |
# |
|
|
436 |
# # sum(duplicated(cur_reduced_dt$external_gene_name)) |
|
|
437 |
# dup_genes <- cur_reduced_dt[duplicated(cur_reduced_dt$external_gene_name)]$external_gene_name |
|
|
438 |
# # Duplicated genes may represent transcripts from the same gene, must consolidate by adding TPM values |
|
|
439 |
# cur_reduced_dt$rn <- NULL |
|
|
440 |
# cur_reduced_dt <- cur_reduced_dt[, lapply(.SD, sum), by = external_gene_name] |
|
|
441 |
# final_dt <- transpose(cur_reduced_dt, keep.names = "external_gene_name") |
|
|
442 |
# colnames(final_dt) <- unname(unlist(final_dt[1,])) |
|
|
443 |
# # final_dt[1:5, 1:5] |
|
|
444 |
# final_dt <- final_dt[!(TSPAN6 == "TSPAN6")] |
|
|
445 |
# # final_dt[1:5, 1:5] |
|
|
446 |
# # dim(final_dt) |
|
|
447 |
# |
|
|
448 |
# # temp1[external_gene_name %in% dup_genes][,c("external_gene_name", "TCGA-FI-A2D5-01A-11R-A17B-07")] |
|
|
449 |
# fwrite(final_dt, paste0("Data/TCGA/Exp/", cur_project, "_Exp.csv")) |
|
|
450 |
# } |
|
|
451 |
# |
|
|
452 |
# # Now merge all the data.tables |
|
|
453 |
# exp_file_paths <- list.files("Data/TCGA/Exp/", full.names = T) |
|
|
454 |
# exp_dt_list <- vector(mode = "list", length = length(exp_file_paths)) |
|
|
455 |
# |
|
|
456 |
# for (i in 1:length(exp_file_paths)) { |
|
|
457 |
# exp_dt_list[[i]] <- fread(exp_file_paths[i]) |
|
|
458 |
# } |
|
|
459 |
# |
|
|
460 |
# all_exp <- rbindlist(exp_dt_list) |
|
|
461 |
# fwrite(all_exp, "Data/TCGA/TCGA_All_Exp_Data.csv") |
|
|
462 |
# |
|
|
463 |
|
|
|
464 |
|
|
|
465 |
# ==== Sort and Clean Data ==== |
|
|
466 |
|
|
|
467 |
# ==== Match TCGA and DepMap Exp Data ==== |
|
|
468 |
require(data.table) |
|
|
469 |
exp_tcga <- fread("Data/TCGA/TCGA_All_Exp_Data.csv") |
|
|
470 |
exp_tcga[1:5, 1:5] |
|
|
471 |
exp_depmap <- fread("Data/DRP_Training_Data/DepMap_21Q2_Expression.csv") |
|
|
472 |
exp_depmap[1:5, 1:5] |
|
|
473 |
dim(exp_tcga) |
|
|
474 |
dim(exp_depmap) |
|
|
475 |
|
|
|
476 |
# Find shared columns: |
|
|
477 |
shared_exp <- intersect(colnames(exp_depmap), colnames(exp_tcga)) |
|
|
478 |
# Sort columns |
|
|
479 |
shared_exp <- sort(shared_exp) |
|
|
480 |
shared_exp <- c("tcga_sample_id", "cancer_type", shared_exp) |
|
|
481 |
exp_tcga <- exp_tcga[, ..shared_exp] |
|
|
482 |
colnames(exp_tcga)[2] <- "primary_disease" |
|
|
483 |
|
|
|
484 |
exp_tcga[1:5, 1:5] |
|
|
485 |
dim(exp_tcga) |
|
|
486 |
|
|
|
487 |
# Save TCGA pre-training data |
|
|
488 |
fwrite(exp_tcga, "Data/DRP_Training_Data/TCGA_PreTraining_Expression.csv") |
|
|
489 |
|
|
|
490 |
# Subset DepMap data to what's available in TCGA |
|
|
491 |
shared_exp <- unique(c("stripped_cell_line_name", "primary_disease", sort(intersect(colnames(exp_depmap), colnames(exp_tcga))))) |
|
|
492 |
exp_depmap <- exp_depmap[, ..shared_exp] |
|
|
493 |
exp_depmap[1:5, 1:5] |
|
|
494 |
dim(exp_depmap) |
|
|
495 |
# Save DepMap training data |
|
|
496 |
fwrite(exp_depmap, "Data/DRP_Training_Data/DepMap_21Q2_Training_Expression.csv") |
|
|
497 |
|
|
|
498 |
rm(exp_depmap) |
|
|
499 |
rm(exp_tcga) |
|
|
500 |
# exp_tcga <- fread("Data/DRP_Training_Data/TCGA_PreTraining_Expression.csv") |
|
|
501 |
# exp_depmap <- fread("Data/DRP_Training_Data/DepMap_20Q2_Training_Expression.csv") |
|
|
502 |
# exp_tcga[1:5, 1:5] |
|
|
503 |
# exp_depmap[1:5, 1:5] |
|
|
504 |
|
|
|
505 |
# ==== Match TCGA and DepMap CNV Data ==== |
|
|
506 |
cnv_tcga <- fread("Data/TCGA/TCGA_All_CNV_Data.csv") |
|
|
507 |
cnv_tcga[1:5, 1:5] |
|
|
508 |
cnv_depmap <- fread("Data/DRP_Training_Data/DepMap_21Q2_CopyNumber.csv") |
|
|
509 |
cnv_depmap[1:5, 1:5] |
|
|
510 |
dim(cnv_depmap) |
|
|
511 |
dim(cnv_tcga) |
|
|
512 |
|
|
|
513 |
shared_cnv <- intersect(colnames(cnv_depmap), colnames(cnv_tcga)) |
|
|
514 |
# Sort columns |
|
|
515 |
shared_cnv <- sort(shared_cnv) |
|
|
516 |
shared_cnv <- c("tcga_sample_id", "cancer_type", shared_cnv) |
|
|
517 |
cnv_tcga <- cnv_tcga[, ..shared_cnv] |
|
|
518 |
colnames(cnv_tcga)[2] <- "primary_disease" |
|
|
519 |
|
|
|
520 |
cnv_tcga[1:5, 1:5] |
|
|
521 |
|
|
|
522 |
# Must add pseudocounts to log2 ratios of TCGA data |
|
|
523 |
temp <- cnv_tcga[, -c(1:2)][, lapply(.SD, function(x) {log2(2**x + 1)})] |
|
|
524 |
temp[1:5, 1:5] |
|
|
525 |
temp$tcga_sample_id <- cnv_tcga$tcga_sample_id |
|
|
526 |
temp$primary_disease <- cnv_tcga$primary_disease |
|
|
527 |
setcolorder(temp, c("tcga_sample_id", "primary_disease")) |
|
|
528 |
temp[1:5, 1:5] |
|
|
529 |
dim(temp) |
|
|
530 |
|
|
|
531 |
fwrite(temp, "Data/DRP_Training_Data/TCGA_PreTraining_CopyNumber.csv") |
|
|
532 |
|
|
|
533 |
|
|
|
534 |
# Subset DepMap data to what's available in TCGA |
|
|
535 |
cnv_depmap[1:5, 1:5] |
|
|
536 |
shared_cnv <- unique(c("stripped_cell_line_name", "primary_disease", sort(intersect(colnames(cnv_depmap), colnames(cnv_tcga))))) |
|
|
537 |
cnv_depmap <- cnv_depmap[, ..shared_cnv] |
|
|
538 |
|
|
|
539 |
cnv_depmap[1:5, 1:5] |
|
|
540 |
dim(cnv_depmap) |
|
|
541 |
# Save DepMap training data |
|
|
542 |
fwrite(cnv_depmap, "Data/DRP_Training_Data/DepMap_21Q2_Training_CopyNumber.csv") |
|
|
543 |
|
|
|
544 |
rm(cnv_depmap) |
|
|
545 |
rm(cnv_tcga) |
|
|
546 |
gc() |