--- a +++ b/R/Read_LINCS.R @@ -0,0 +1,132 @@ +# Read_LINCS.R +# library(devtools) +# install_github("cmap/cmapR") +library(cmapR) +# LINCSDataPortal::download_dataset() + +library(data.table) + +# ds_path = "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level3_INF_mlr12k_n345976x12328_2017-03-06.gctx" +ds_path = "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017-03-06.gctx" +# read the column annotations as a data.frame +# col_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_inst_info_2017-03-06.txt" +col_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_info_2017-03-06.txt" +col_meta <- fread(col_meta_path) +head(unique(col_meta$pert_iname)) # Drugs and shRNAs +unique(col_meta$pert_type) + + +cell_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_cell_info_2017-04-28.txt" +cell_meta <- fread(cell_meta_path) + +gene_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_gene_info_2017-03-06.txt" +gene_meta <- fread(gene_meta_path) + +sig_metrics <- fread("Data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_metrics_2017-03-06.txt") +sig_metrics[pert_type == "trt_xpr"] + +pert_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_pert_info_2017-03-06.txt" +pert_meta <- fread(pert_meta_path) +unique(pert_meta$pert_type) +pert_meta[pert_type == "trt_cp"] +pert_meta[pert_type == "ctl_vehicle"] + +# figure out which signatures correspond to vorinostat by searching the 'pert_iname' column +col_meta +col_meta[pert_iname == "vorinostat" & cell_id == "A375"] +idx <- which(col_meta$pert_iname=="vorinostat") + +# and get the corresponding sig_ids +sig_ids <- col_meta$sig_id[idx] + +# read only those columns from the GCTX file by using the 'cid' parameter +cur_cols <- read.gctx.ids(ds_path, dimension = "col") + +sum(grepl(pattern = "REP", x = cur_cols)) +cur_cols <- gsub(pattern = "REP\\.", replacement = "", x = cur_cols) +sum(cur_cols %in% col_meta$sig_id) +vorinostat_ds <- parse.gctx(ds_path, cid=sig_ids) + +# simply typing the variable name of a GCT object will display the object structure, similar to calling 'str' +vorinostat_ds + + +# ==== Get all untreated cell line expression data ==== +untreated_ids <- col_meta[pert_iname == "DMSO", sig_id] +untreated_ds <- parse.gctx(ds_path, cid = untreated_ids) +untreated_dt <- as.data.table(untreated_ds@mat) +dim(untreated_dt) +# Get cell IDs, maintaining column order (%in% discards order!) +col_meta[sig_id %in% c("LJP005_A375_24H:A07", "LJP005_A375_24H:A03"), pert_id] +cur_cells <- col_meta[match(colnames(untreated_dt), col_meta$sig_id), cell_id] +cell_meta[match(cur_cells, cell_meta$cell_id), primary_site] +y = cell_meta[match(cur_cells, cell_meta$cell_id), subtype] +length(unique(y)) # 17 classes/subtypes + +# Transpose the gene expression matrix +untreated_dt <- transpose(untreated_dt) +dim(untreated_dt) +colnames(untreated_dt) +untreated_dt <- cbind(untreated_dt, y) + +# Remove samples with no subtypes indicated +untreated_dt <- untreated_dt[y != "-666"] +dim(untreated_dt) +untreated_dt[1:5, 1:5] + +# Stratify into train and test data +# install.packages("caret") +library(caret) + +temp <- caret::createDataPartition(y = as.factor(untreated_dt$y), times = 1, p = 0.8) +temp$Resample1 +train_data <- untreated_dt[temp$Resample1,] +test_data <- untreated_dt[-temp$Resample1] +dim(train_data) +dim(test_data) + +# Save +fwrite(train_data, "Data/LINCS/GSE70138/DMSO_celllines_subtypes_TRAIN.txt", sep = "\t") +fwrite(test_data, "Data/LINCS/GSE70138/DMSO_celllines_subtypes_TEST.txt", sep = "\t") + +# ==== Match (compound) treatment and cell line responses ==== +# Will have before (X) and after (y) treatment responses +untreated_ids <- col_meta[pert_id == "DMSO", sig_id] +ctrl <- parse.gctx(fname = ds_path, cid = untreated_ids) +gene_ids <- ctrl@rid +ctrl <- as.data.table(ctrl@mat) + +# Divide training data by cell lines +all_lines <- unique(col_meta$cell_id) +cell = all_lines[1] +for (cell in all_lines) { + cur_ids <- col_meta[cell_id == cell & pert_id != "DMSO" & pert_type == "trt_cp", sig_id] + cur_data <- parse.gctx(fname = ds_path, cid = cur_ids, matrix_only = T) + cur_gene_ids <- cur_data@rid + # all(cur_gene_ids == gene_ids) + cur_data <- as.data.table(cur_data@mat) + cur_perts <- col_meta[match(colnames(cur_data), col_meta$sig_id), pert_id] + # Get the SMILES for current perturbations, in order + cur_smiles <- pert_meta[match(cur_perts, pert_meta$pert_id), canonical_smiles] + # Match current cell line controls, duplicate as necessary (sample with replacement) + # TODO: How to augment the data? + cur_ctrl <- ctrl[, col_meta[cell_id == cell & pert_id == "DMSO", sig_id], with = F] + cur_ctrl <- cur_ctrl[, sample(x = 1:ncol(cur_ctrl), size = ncol(cur_data), replace = T), with = F] + # dim(cur_ctrl) + # dim(cur_data) + # length(cur_smiles) + + # Save Chem X + fwrite(as.data.table(cur_smiles), paste0(cell, "_SMILES_data.txt")) + # Save Cell X + fwrite(transpose(cur_ctrl), paste0(cell, "_cellline_ctrl_input_data.txt")) + # Save Cell y + fwrite(transpose(cur_data), paste0(cell, "_cellline_trt_label_data.txt")) +} + + +all <- parse.gctx(fname = ds_path, matrix_only = T) +dim(ctrl) + + +