Diff of /R/Read_LINCS.R [000000] .. [c3b4f8]

Switch to unified view

a b/R/Read_LINCS.R
1
# Read_LINCS.R
2
# library(devtools)
3
# install_github("cmap/cmapR")
4
library(cmapR)
5
# LINCSDataPortal::download_dataset()
6
7
library(data.table)
8
9
# ds_path = "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level3_INF_mlr12k_n345976x12328_2017-03-06.gctx"
10
ds_path = "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017-03-06.gctx"
11
# read the column annotations as a data.frame
12
# col_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_inst_info_2017-03-06.txt"
13
col_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_info_2017-03-06.txt"
14
col_meta <- fread(col_meta_path)
15
head(unique(col_meta$pert_iname)) # Drugs and shRNAs
16
unique(col_meta$pert_type)
17
18
19
cell_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_cell_info_2017-04-28.txt"
20
cell_meta <- fread(cell_meta_path)
21
22
gene_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_gene_info_2017-03-06.txt"
23
gene_meta <- fread(gene_meta_path)
24
25
sig_metrics <- fread("Data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_metrics_2017-03-06.txt")
26
sig_metrics[pert_type == "trt_xpr"]
27
28
pert_meta_path <- "Data/LINCS/GSE70138/GSE70138_Broad_LINCS_pert_info_2017-03-06.txt"
29
pert_meta <- fread(pert_meta_path)
30
unique(pert_meta$pert_type)
31
pert_meta[pert_type == "trt_cp"]
32
pert_meta[pert_type == "ctl_vehicle"]
33
34
# figure out which signatures correspond to vorinostat by searching the 'pert_iname' column
35
col_meta
36
col_meta[pert_iname == "vorinostat" & cell_id == "A375"]
37
idx <- which(col_meta$pert_iname=="vorinostat")
38
39
# and get the corresponding sig_ids
40
sig_ids <- col_meta$sig_id[idx]
41
42
# read only those columns from the GCTX file by using the 'cid' parameter
43
cur_cols <- read.gctx.ids(ds_path, dimension = "col")
44
45
sum(grepl(pattern = "REP", x = cur_cols))
46
cur_cols <- gsub(pattern = "REP\\.", replacement = "", x = cur_cols)
47
sum(cur_cols %in% col_meta$sig_id)
48
vorinostat_ds <- parse.gctx(ds_path, cid=sig_ids)
49
50
# simply typing the variable name of a GCT object will display the object structure, similar to calling 'str'
51
vorinostat_ds
52
53
54
# ==== Get all untreated cell line expression data ====
55
untreated_ids <- col_meta[pert_iname == "DMSO", sig_id]
56
untreated_ds <- parse.gctx(ds_path, cid = untreated_ids)
57
untreated_dt <- as.data.table(untreated_ds@mat)
58
dim(untreated_dt)
59
# Get cell IDs, maintaining column order (%in% discards order!)
60
col_meta[sig_id %in% c("LJP005_A375_24H:A07", "LJP005_A375_24H:A03"), pert_id]
61
cur_cells <- col_meta[match(colnames(untreated_dt), col_meta$sig_id), cell_id]
62
cell_meta[match(cur_cells, cell_meta$cell_id), primary_site]
63
y = cell_meta[match(cur_cells, cell_meta$cell_id), subtype]
64
length(unique(y)) # 17 classes/subtypes
65
66
# Transpose the gene expression matrix
67
untreated_dt <- transpose(untreated_dt)
68
dim(untreated_dt)
69
colnames(untreated_dt)
70
untreated_dt <- cbind(untreated_dt, y)
71
72
# Remove samples with no subtypes indicated
73
untreated_dt <- untreated_dt[y != "-666"]
74
dim(untreated_dt)
75
untreated_dt[1:5, 1:5]
76
77
# Stratify into train and test data
78
# install.packages("caret")
79
library(caret)
80
81
temp <- caret::createDataPartition(y = as.factor(untreated_dt$y), times = 1, p = 0.8)
82
temp$Resample1
83
train_data <- untreated_dt[temp$Resample1,]
84
test_data <- untreated_dt[-temp$Resample1]
85
dim(train_data)
86
dim(test_data)
87
88
# Save
89
fwrite(train_data, "Data/LINCS/GSE70138/DMSO_celllines_subtypes_TRAIN.txt", sep = "\t")
90
fwrite(test_data, "Data/LINCS/GSE70138/DMSO_celllines_subtypes_TEST.txt", sep = "\t")
91
92
# ==== Match (compound) treatment and cell line responses ====
93
# Will have before (X) and after (y) treatment responses
94
untreated_ids <- col_meta[pert_id == "DMSO", sig_id]
95
ctrl <- parse.gctx(fname = ds_path, cid = untreated_ids)
96
gene_ids <- ctrl@rid
97
ctrl <- as.data.table(ctrl@mat)
98
99
# Divide training data by cell lines
100
all_lines <- unique(col_meta$cell_id)
101
cell = all_lines[1]
102
for (cell in all_lines) {
103
  cur_ids <- col_meta[cell_id == cell & pert_id != "DMSO" & pert_type == "trt_cp", sig_id]
104
  cur_data <- parse.gctx(fname = ds_path, cid = cur_ids, matrix_only = T)
105
  cur_gene_ids <- cur_data@rid
106
  # all(cur_gene_ids == gene_ids)
107
  cur_data <- as.data.table(cur_data@mat)
108
  cur_perts <- col_meta[match(colnames(cur_data), col_meta$sig_id), pert_id]
109
  # Get the SMILES for current perturbations, in order
110
  cur_smiles <- pert_meta[match(cur_perts, pert_meta$pert_id), canonical_smiles]
111
  # Match current cell line controls, duplicate as necessary (sample with replacement)
112
  # TODO: How to augment the data?
113
  cur_ctrl <- ctrl[, col_meta[cell_id == cell & pert_id == "DMSO", sig_id], with = F]
114
  cur_ctrl <- cur_ctrl[, sample(x = 1:ncol(cur_ctrl), size = ncol(cur_data), replace = T), with = F]
115
  # dim(cur_ctrl)
116
  # dim(cur_data)
117
  # length(cur_smiles)
118
  
119
  # Save Chem X
120
  fwrite(as.data.table(cur_smiles), paste0(cell, "_SMILES_data.txt"))
121
  # Save Cell X
122
  fwrite(transpose(cur_ctrl), paste0(cell, "_cellline_ctrl_input_data.txt"))
123
  # Save Cell y
124
  fwrite(transpose(cur_data), paste0(cell, "_cellline_trt_label_data.txt"))
125
}
126
127
128
all <- parse.gctx(fname = ds_path, matrix_only = T)
129
dim(ctrl)
130
131
132