|
a |
|
b/R/Read_LINCS.R |
|
|
1 |
# Read_LINCS.R |
|
|
2 |
# library(devtools) |
|
|
3 |
# install_github("cmap/cmapR") |
|
|
4 |
library(cmapR) |
|
|
5 |
# LINCSDataPortal::download_dataset() |
|
|
6 |
|
|
|
7 |
library(data.table) |
|
|
8 |
|
|
|
9 |
# ds_path = "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level3_INF_mlr12k_n345976x12328_2017-03-06.gctx" |
|
|
10 |
ds_path = "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017-03-06.gctx" |
|
|
11 |
# read the column annotations as a data.frame |
|
|
12 |
# col_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_inst_info_2017-03-06.txt" |
|
|
13 |
col_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_info_2017-03-06.txt" |
|
|
14 |
col_meta <- fread(col_meta_path) |
|
|
15 |
head(unique(col_meta$pert_iname)) # Drugs and shRNAs |
|
|
16 |
unique(col_meta$pert_type) |
|
|
17 |
|
|
|
18 |
|
|
|
19 |
cell_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_cell_info_2017-04-28.txt" |
|
|
20 |
cell_meta <- fread(cell_meta_path) |
|
|
21 |
|
|
|
22 |
gene_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_gene_info_2017-03-06.txt" |
|
|
23 |
gene_meta <- fread(gene_meta_path) |
|
|
24 |
|
|
|
25 |
sig_metrics <- fread("Data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_metrics_2017-03-06.txt") |
|
|
26 |
sig_metrics[pert_type == "trt_xpr"] |
|
|
27 |
|
|
|
28 |
pert_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_pert_info_2017-03-06.txt" |
|
|
29 |
pert_meta <- fread(pert_meta_path) |
|
|
30 |
unique(pert_meta$pert_type) |
|
|
31 |
pert_meta[pert_type == "trt_cp"] |
|
|
32 |
pert_meta[pert_type == "ctl_vehicle"] |
|
|
33 |
|
|
|
34 |
# figure out which signatures correspond to vorinostat by searching the 'pert_iname' column |
|
|
35 |
col_meta |
|
|
36 |
col_meta[pert_iname == "vorinostat" & cell_id == "A375"] |
|
|
37 |
idx <- which(col_meta$pert_iname=="vorinostat") |
|
|
38 |
|
|
|
39 |
# and get the corresponding sig_ids |
|
|
40 |
sig_ids <- col_meta$sig_id[idx] |
|
|
41 |
|
|
|
42 |
# read only those columns from the GCTX file by using the 'cid' parameter |
|
|
43 |
cur_cols <- read.gctx.ids(ds_path, dimension = "col") |
|
|
44 |
|
|
|
45 |
sum(grepl(pattern = "REP", x = cur_cols)) |
|
|
46 |
cur_cols <- gsub(pattern = "REP\\.", replacement = "", x = cur_cols) |
|
|
47 |
sum(cur_cols %in% col_meta$sig_id) |
|
|
48 |
vorinostat_ds <- parse.gctx(ds_path, cid=sig_ids) |
|
|
49 |
|
|
|
50 |
# simply typing the variable name of a GCT object will display the object structure, similar to calling 'str' |
|
|
51 |
vorinostat_ds |
|
|
52 |
|
|
|
53 |
|
|
|
54 |
# ==== Get all untreated cell line expression data ==== |
|
|
55 |
untreated_ids <- col_meta[pert_iname == "DMSO", sig_id] |
|
|
56 |
untreated_ds <- parse.gctx(ds_path, cid = untreated_ids) |
|
|
57 |
untreated_dt <- as.data.table(untreated_ds@mat) |
|
|
58 |
dim(untreated_dt) |
|
|
59 |
# Get cell IDs, maintaining column order (%in% discards order!) |
|
|
60 |
col_meta[sig_id %in% c("LJP005_A375_24H:A07", "LJP005_A375_24H:A03"), pert_id] |
|
|
61 |
cur_cells <- col_meta[match(colnames(untreated_dt), col_meta$sig_id), cell_id] |
|
|
62 |
cell_meta[match(cur_cells, cell_meta$cell_id), primary_site] |
|
|
63 |
y = cell_meta[match(cur_cells, cell_meta$cell_id), subtype] |
|
|
64 |
length(unique(y)) # 17 classes/subtypes |
|
|
65 |
|
|
|
66 |
# Transpose the gene expression matrix |
|
|
67 |
untreated_dt <- transpose(untreated_dt) |
|
|
68 |
dim(untreated_dt) |
|
|
69 |
colnames(untreated_dt) |
|
|
70 |
untreated_dt <- cbind(untreated_dt, y) |
|
|
71 |
|
|
|
72 |
# Remove samples with no subtypes indicated |
|
|
73 |
untreated_dt <- untreated_dt[y != "-666"] |
|
|
74 |
dim(untreated_dt) |
|
|
75 |
untreated_dt[1:5, 1:5] |
|
|
76 |
|
|
|
77 |
# Stratify into train and test data |
|
|
78 |
# install.packages("caret") |
|
|
79 |
library(caret) |
|
|
80 |
|
|
|
81 |
temp <- caret::createDataPartition(y = as.factor(untreated_dt$y), times = 1, p = 0.8) |
|
|
82 |
temp$Resample1 |
|
|
83 |
train_data <- untreated_dt[temp$Resample1,] |
|
|
84 |
test_data <- untreated_dt[-temp$Resample1] |
|
|
85 |
dim(train_data) |
|
|
86 |
dim(test_data) |
|
|
87 |
|
|
|
88 |
# Save |
|
|
89 |
fwrite(train_data, "Data/LINCS/GSE70138/DMSO_celllines_subtypes_TRAIN.txt", sep = "\t") |
|
|
90 |
fwrite(test_data, "Data/LINCS/GSE70138/DMSO_celllines_subtypes_TEST.txt", sep = "\t") |
|
|
91 |
|
|
|
92 |
# ==== Match (compound) treatment and cell line responses ==== |
|
|
93 |
# Will have before (X) and after (y) treatment responses |
|
|
94 |
untreated_ids <- col_meta[pert_id == "DMSO", sig_id] |
|
|
95 |
ctrl <- parse.gctx(fname = ds_path, cid = untreated_ids) |
|
|
96 |
gene_ids <- ctrl@rid |
|
|
97 |
ctrl <- as.data.table(ctrl@mat) |
|
|
98 |
|
|
|
99 |
# Divide training data by cell lines |
|
|
100 |
all_lines <- unique(col_meta$cell_id) |
|
|
101 |
cell = all_lines[1] |
|
|
102 |
for (cell in all_lines) { |
|
|
103 |
cur_ids <- col_meta[cell_id == cell & pert_id != "DMSO" & pert_type == "trt_cp", sig_id] |
|
|
104 |
cur_data <- parse.gctx(fname = ds_path, cid = cur_ids, matrix_only = T) |
|
|
105 |
cur_gene_ids <- cur_data@rid |
|
|
106 |
# all(cur_gene_ids == gene_ids) |
|
|
107 |
cur_data <- as.data.table(cur_data@mat) |
|
|
108 |
cur_perts <- col_meta[match(colnames(cur_data), col_meta$sig_id), pert_id] |
|
|
109 |
# Get the SMILES for current perturbations, in order |
|
|
110 |
cur_smiles <- pert_meta[match(cur_perts, pert_meta$pert_id), canonical_smiles] |
|
|
111 |
# Match current cell line controls, duplicate as necessary (sample with replacement) |
|
|
112 |
# TODO: How to augment the data? |
|
|
113 |
cur_ctrl <- ctrl[, col_meta[cell_id == cell & pert_id == "DMSO", sig_id], with = F] |
|
|
114 |
cur_ctrl <- cur_ctrl[, sample(x = 1:ncol(cur_ctrl), size = ncol(cur_data), replace = T), with = F] |
|
|
115 |
# dim(cur_ctrl) |
|
|
116 |
# dim(cur_data) |
|
|
117 |
# length(cur_smiles) |
|
|
118 |
|
|
|
119 |
# Save Chem X |
|
|
120 |
fwrite(as.data.table(cur_smiles), paste0(cell, "_SMILES_data.txt")) |
|
|
121 |
# Save Cell X |
|
|
122 |
fwrite(transpose(cur_ctrl), paste0(cell, "_cellline_ctrl_input_data.txt")) |
|
|
123 |
# Save Cell y |
|
|
124 |
fwrite(transpose(cur_data), paste0(cell, "_cellline_trt_label_data.txt")) |
|
|
125 |
} |
|
|
126 |
|
|
|
127 |
|
|
|
128 |
all <- parse.gctx(fname = ds_path, matrix_only = T) |
|
|
129 |
dim(ctrl) |
|
|
130 |
|
|
|
131 |
|
|
|
132 |
|