Diff of /R/Fig.1a_Upset_Plot.R [000000] .. [c3b4f8]

Switch to unified view

a b/R/Fig.1a_Upset_Plot.R
1
# upset_plots.R
2
3
# install.packages("UpSetR")
4
# install.packages("ggupset")
5
require(UpSetR)
6
# require(ggupset)
7
require(data.table)
8
require(stringr)
9
require(ggplot2)
10
require(patchwork)
11
12
# ---- Load inputs ----
# Every table lives in the same training-data directory; the helper only
# prepends that directory to the file name before handing it to fread().
read_training_table <- function(file_name) {
  fread(file.path("Data/DRP_Training_Data", file_name))
}

# Cell-line annotation and the two drug-response tables.
line_info <- read_training_table("DepMap_21Q2_Line_Info.csv")
ctrp <- read_training_table("CTRP_AAC_SMILES.txt")
# gdsc1 <- read_training_table("GDSC1_AAC_SMILES.txt")
gdsc2 <- read_training_table("GDSC2_AAC_SMILES.txt")

# Omics tables (21Q2 / 20Q2 releases).
exp <- read_training_table("DepMap_21Q2_Expression.csv")
mut <- read_training_table("DepMap_21Q2_Mutations_by_Cell.csv")
cnv <- read_training_table("DepMap_21Q2_CopyNumber.csv")
prot <- read_training_table("DepMap_20Q2_No_NA_ProteinQuant.csv")

# Omics tables (2019 releases).
mirna <- read_training_table("DepMap_2019_miRNA.csv")
hist <- read_training_table("DepMap_2019_ChromatinProfiling.csv")
metab <- read_training_table("DepMap_2019_Metabolomics.csv")
rppa <- read_training_table("DepMap_2019_RPPA.csv")

# ---- Quick size check ----
# Rows x columns of each omics table (auto-printed when run interactively).
dim(rppa)
dim(metab)
dim(hist)
dim(mirna)
dim(prot)
dim(exp)
dim(cnv)
dim(mut)

# Number of distinct cell-line names in each drug-response table.
uniqueN(gdsc2$ccl_name)
uniqueN(ctrp$ccl_name)
37
# ctrp$ccl_name = str_replace(toupper(ctrp$ccl_name), "-", "")
38
# 
39
# exp_ccl_names = exp$stripped_cell_line_name
40
# exp_ccl_names = str_replace(toupper(exp_ccl_names), "-", "")
41
# 
42
# mut_ccl_names = mut$stripped_cell_line_name
43
# mut_ccl_names = str_replace(toupper(mut_ccl_names), "-", "")
44
# 
45
# cnv_ccl_names = cnv$stripped_cell_line_name
46
# cnv_ccl_names = str_replace(toupper(cnv_ccl_names), "-", "")
47
# ---- Harmonise cell-line names ----
# Upper-case a vector of cell-line names and drop EVERY hyphen so that names
# from the different sources can be matched against each other.
# str_replace() would only remove the first hyphen (e.g. "MDA-MB-231" ->
# "MDAMB-231"), hence str_replace_all() with a fixed (non-regex) pattern.
#
# x: character vector of cell-line names.
# Returns a character vector of the same length; NA stays NA.
normalise_ccl_name <- function(x) {
  str_replace_all(toupper(x), fixed("-"), "")
}

# Omics tables key their rows by `stripped_cell_line_name`.
mut$stripped_cell_line_name <- normalise_ccl_name(mut$stripped_cell_line_name)
cnv$stripped_cell_line_name <- normalise_ccl_name(cnv$stripped_cell_line_name)
exp$stripped_cell_line_name <- normalise_ccl_name(exp$stripped_cell_line_name)
prot$stripped_cell_line_name <- normalise_ccl_name(prot$stripped_cell_line_name)

mirna$stripped_cell_line_name <- normalise_ccl_name(mirna$stripped_cell_line_name)
hist$stripped_cell_line_name <- normalise_ccl_name(hist$stripped_cell_line_name)
metab$stripped_cell_line_name <- normalise_ccl_name(metab$stripped_cell_line_name)
rppa$stripped_cell_line_name <- normalise_ccl_name(rppa$stripped_cell_line_name)

# Drug-response tables key their rows by `ccl_name`.
ctrp$ccl_name <- normalise_ccl_name(ctrp$ccl_name)
gdsc2$ccl_name <- normalise_ccl_name(gdsc2$ccl_name)
59
60
# ---- Per-data-type cell-line annotation ----
# Returns the rows of `line_info` whose stripped_cell_line_name occurs in
# `ccl_names`, reduced to the name and primary-disease columns and tagged
# with a constant `data_type` label.
#
# ccl_names: character vector of cell-line names present in one data set.
# label:     value written into the new `data_type` column.
subset_line_info <- function(ccl_names, label) {
  wanted <- unique(ccl_names)
  annotated <- line_info[stripped_cell_line_name %in% wanted]
  annotated <- annotated[, c("stripped_cell_line_name", "primary_disease")]
  annotated$data_type <- label
  annotated
}

mut_line_info <- subset_line_info(mut$stripped_cell_line_name, "Mutational")
cnv_line_info <- subset_line_info(
  cnv$stripped_cell_line_name, "Copy Number Variation"
)
exp_line_info <- subset_line_info(
  exp$stripped_cell_line_name, "Gene Expression"
)
prot_line_info <- subset_line_info(
  prot$stripped_cell_line_name, "Protein Quantification"
)

mirna_line_info <- subset_line_info(
  mirna$stripped_cell_line_name, "microRNA Expression"
)
hist_line_info <- subset_line_info(
  hist$stripped_cell_line_name, "Histone Modification"
)
metab_line_info <- subset_line_info(
  metab$stripped_cell_line_name, "Metabolomics"
)
rppa_line_info <- subset_line_info(
  rppa$stripped_cell_line_name, "Reverse-Phase Protein Array"
)

# Both drug-response sources share the same label.
ctrp_line_info <- subset_line_info(ctrp$ccl_name, "Dose-Response")
gdsc2_line_info <- subset_line_info(gdsc2$ccl_name, "Dose-Response")
96
97
# Named list of cell-line name vectors, one entry per set shown in the UpSet
# plot. Omics sets use the names that matched `line_info`; the two
# dose-response sets use the distinct names found in the response tables.
list_input <- list(
  "Mutational" = mut_line_info$stripped_cell_line_name,
  "Copy Number Variation" = cnv_line_info$stripped_cell_line_name,
  "Gene Expression" = exp_line_info$stripped_cell_line_name,
  "Protein Quantification" = prot_line_info$stripped_cell_line_name,

  "microRNA Expression" = mirna_line_info$stripped_cell_line_name,
  "Histone Modification" = hist_line_info$stripped_cell_line_name,
  "Metabolomics" = metab_line_info$stripped_cell_line_name,
  "Reverse-Phase Protein Array" = rppa_line_info$stripped_cell_line_name,

  "CTRPv2 Dose-Response" = unique(ctrp$ccl_name),
  "GDSC2 Dose-Response" = unique(gdsc2$ccl_name)
)
111
112
# Enumerate every non-empty combination of the elements of `set`, each
# combination rendered as its members joined by "-".
#
# set: vector of labels (coerced to character).
# Returns a character vector ordered by combination size (all singles, then
# all pairs, ...), or NULL when `set` is empty.
make_all_combinations <- function(set) {
  # combn() expands a single positive whole number n into seq_len(n);
  # coercing to character first keeps a length-one numeric `set` as one label.
  set <- as.character(set)
  unlist(lapply(seq_along(set), function(size) {
    # combn() returns a size x choose(n, size) matrix: one combination per
    # column, so collapse column-wise.
    combos <- utils::combn(set, size)
    apply(combos, 2, paste0, collapse = "-")
  }))
}
117
118
119
# ---- UpSet plot of cell-line overlap across data types ----
# `sets` together with keep.order = TRUE fixes which sets are drawn and in
# which order; intersections are sorted by size (order.by = "freq").
# (The interactive `?upset` help lookup was dropped: it has no place in a
# script that may run non-interactively.)
p <- upset(
  fromList(list_input),
  sets = c(
    "CTRPv2 Dose-Response", "GDSC2 Dose-Response",
    "Mutational", "Copy Number Variation", "Gene Expression",
    "Protein Quantification",
    "microRNA Expression", "Histone Modification", "Metabolomics",
    "Reverse-Phase Protein Array"
  ),
  keep.order = TRUE,
  # sets = c("Dose-Response"),
  mainbar.y.label = "Data Intersection Size",
  # mainbar.y.max = 30,
  sets.x.label = "Cell Lines per Data Type",
  # group.by = "sets",
  order.by = "freq",
  scale.sets = "identity",
  # set_size.angles = 45,
  text.scale = c(1.3, 1.3, 1, 1, 1, 0.75)
)
# Explicit print(): a bare `p` is only auto-printed at the interactive top
# level, so under source() nothing would be drawn.
print(p)

# Write the same plot to PDF; print() is what actually renders onto the
# open device, dev.off() flushes and closes the file.
pdf(
  file = "Plots/Dataset_Exploration/UpSetR_Overlap_Plot_CTRPv2.pdf",
  width = 10, height = 5
)
print(p)
dev.off()
142
143
# ggsave(filename = "Plots/Dataset_Exploration/UpSetR_Overlap_Plot_CTRPv2.pdf")
144
145
# all_cells <- rbindlist(list(
146
#   mut_line_info,
147
#   cnv_line_info,
148
#   exp_line_info,
149
#   prot_line_info,
150
#   mirna_line_info,
151
#   hist_line_info,
152
#   metab_line_info,
153
#   rppa_line_info)
154
# )
155
# all_cells <- all_cells[, -2]
156
# 
157
# # install.packages("tidyverse")
158
# require(tidyverse)
159
# all_cells <- tidyr::as_tibble(all_cells[, 1:2])
160
# simple_groups_df <- all_cells %>%
161
#   group_by(stripped_cell_line_name) %>%
162
#   summarize(groups = list(data_type))
163
# 
164
# extended_groups_df <- simple_groups_df %>%
165
#   mutate(groups = lapply(groups, make_all_combinations)) %>%
166
#   unnest()
167
# 
168
# unique(extended_groups_df)
169
# ggplot(extended_groups_df, aes(x=groups)) +
170
#   geom_bar() +
171
#   axis_combmatrix(sep = "-", )
172
# 
173
# all_cells[, extended_groups := lapply(data_type, make_all_combinations)]
174
# lapply(all_cells, make_all_combinations)
175
176
# ==== Bimodal Intersections Counts ====
177
require(ggplot2)
178
require(data.table)
179
require(flextable)
180
require(magrittr)
181
require(scales)
182
require(officer)
183
184
# Re-read CTRP so the sample counts below are taken on the full table.
ctrp <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
# The fresh copy carries raw names again; apply the same upper-case /
# hyphen-free convention that was in force when the *_line_info tables were
# matched, otherwise raw names may fail to match the `%in%` lookups below.
ctrp$ccl_name <- gsub("-", "", toupper(ctrp$ccl_name), fixed = TRUE)

# Swap the long data-type labels for the short abbreviations used as keys.
mut_line_info$data_type <- "MUT"
cnv_line_info$data_type <- "CNV"
exp_line_info$data_type <- "EXP"
prot_line_info$data_type <- "PROT"

mirna_line_info$data_type <- "MIRNA"
hist_line_info$data_type <- "HIST"
metab_line_info$data_type <- "METAB"
rppa_line_info$data_type <- "RPPA"

ctrp_line_info$data_type <- "CTRP"

# Long table: one row per (cell line, omics data type) pair.
all_cells <- rbindlist(list(
  mut_line_info, cnv_line_info, exp_line_info, prot_line_info,
  mirna_line_info, metab_line_info, hist_line_info, rppa_line_info
))
all_cells <- unique(all_cells)

ctrp_cells <- unique(ctrp_line_info$stripped_cell_line_name)

# Result table: one row per omics data type; counts are filled in below.
all_omics <- data.table(
  `Data Type(s)` = c(
    "Mutational", "Copy Number", "Gene Expression", "Protein Quantification",
    "microRNA Expression", "Metabolomics", "Histone Modification", "RPPA"
  ),
  `Abbreviation` = c(
    "MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"
  ),
  `Number of Samples` = integer(8)
)

# seq_len() is safe for an empty table, unlike 1:nrow().
for (i in seq_len(nrow(all_omics))) {
  # Compare against a plain character scalar rather than a 1x1 data.table.
  first_cells <-
    all_cells[data_type == all_omics$Abbreviation[i]]$stripped_cell_line_name
  # second_cells <- all_cells[data_type == all_omics[i, 2]]$stripped_cell_line_name
  # cell_overlap <- Reduce(intersect, list(first_cells, second_cells, ctrp_cells))
  # uniqueN() on a data.table counts distinct ROWS of the CTRP subset
  # (presumably one row per cell line / compound measurement -- verify).
  ctrp_overlap <- uniqueN(ctrp[ccl_name %in% first_cells])
  set(all_omics, i = i, j = "Number of Samples", value = ctrp_overlap)
}
215
216
# Session-wide flextable look: 10 pt font, vanilla theme, light-grey fill.
set_flextable_defaults(
  font.size = 10,
  theme_fun = theme_vanilla,
  padding = 6,
  background.color = "#EFEFEF"
)

# Colour ramp for the count column: smallest count -> red, largest -> white.
colourer <- col_numeric(
  palette = c("red", "white"),
  domain = range(all_omics$`Number of Samples`)
)

# ==== bimodal
# Build the table step by step: merge repeated cells, draw grey borders,
# centre the header of columns 2-3, then shade the count column.
ft <- flextable(all_omics)
final_ft <- merge_v(ft, j = c("Data Type(s)", "Number of Samples"))
final_ft <- border_inner(
  final_ft,
  border = fp_border(color = "gray", width = 1)
)
final_ft <- border_outer(
  final_ft,
  part = "all",
  border = fp_border(color = "gray", width = 2)
)
final_ft <- align(final_ft, align = "center", j = c(2, 3), part = "header")
final_ft <- bg(
  final_ft,
  bg = colourer,
  j = "Number of Samples",
  part = "body"
)

final_ft <- autofit(final_ft)

# Export: drop the table into an empty Word document and write it to disk.
print(
  body_add_flextable(read_docx(), value = final_ft),
  target = "Plots/Dataset_Exploration/bimodal_samples_per_data_type_combo.docx"
)
244
245
# ==== trimodal
246
# Every unordered pair of omics data types; together with the CTRP response
# data each pair forms one "trimodal" combination. combn() yields a 2 x 28
# matrix, transposed so that each row is one pair.
all_tri_omic_combos_el <- utils::combn(
  c("MUT", "CNV", "EXP", "PROT", "MIRNA", "METAB", "HIST", "RPPA"),
  2,
  simplify = TRUE
)
all_tri_omic_combos_el <- as.data.table(t(all_tri_omic_combos_el))

# Make sure CTRP names follow the upper-case / hyphen-free convention used
# when `ctrp_line_info` was matched; the transformation is idempotent, so it
# is harmless if `ctrp` has already been normalised.
ctrp$ccl_name <- gsub("-", "", toupper(ctrp$ccl_name), fixed = TRUE)

ctrp_cells <- unique(ctrp_line_info$stripped_cell_line_name)

# Compute all counts first and attach them as one full-length column,
# instead of creating a zero-length placeholder column and filling it row by
# row. seq_len() also copes with an empty table, unlike 1:nrow().
all_tri_omic_combos_el$sample_counts <- vapply(
  seq_len(nrow(all_tri_omic_combos_el)),
  function(k) {
    # Plain character scalars for the comparison (not 1x1 data.tables).
    first_type <- all_tri_omic_combos_el[[1L]][k]
    second_type <- all_tri_omic_combos_el[[2L]][k]
    first_cells <- all_cells[data_type == first_type]$stripped_cell_line_name
    second_cells <- all_cells[data_type == second_type]$stripped_cell_line_name
    # Cell lines profiled in both omics types AND present in CTRP.
    cell_overlap <- Reduce(
      intersect,
      list(first_cells, second_cells, ctrp_cells)
    )
    # Distinct CTRP rows belonging to those cell lines.
    uniqueN(ctrp[ccl_name %in% cell_overlap])
  },
  integer(1)
)
colnames(all_tri_omic_combos_el) <- c(
  "Data Type 1", "Data Type 2", "Number of Samples"
)
261
262
# Colour ramp for the count column: smallest count -> red, largest -> white.
colourer <- col_numeric(
  palette = c("red", "white"),
  domain = range(all_tri_omic_combos_el$`Number of Samples`)
)

# Build the table step by step: merge repeated cells, draw grey borders,
# centre all three columns, then shade the count column.
ft <- flextable(all_tri_omic_combos_el)
final_ft <- merge_v(
  ft,
  j = c("Data Type 1", "Data Type 2", "Number of Samples")
)
final_ft <- border_inner(
  final_ft,
  border = fp_border(color = "gray", width = 1)
)
final_ft <- border_outer(
  final_ft,
  part = "all",
  border = fp_border(color = "gray", width = 2)
)
final_ft <- align(final_ft, align = "center", j = 1:3, part = "all")
final_ft <- bg(
  final_ft,
  bg = colourer,
  j = "Number of Samples",
  part = "body"
)

final_ft <- autofit(final_ft)

# Export: drop the table into an empty Word document and write it to disk.
print(
  body_add_flextable(read_docx(), value = final_ft),
  target = "Plots/Dataset_Exploration/trimodal_samples_per_data_type_combo.docx"
)