Switch to unified view

a b/R/01_Dose-Response_Data_Preparation.R
1
# PharmacoGx.R
2
# BiocManager::install("PharmacoGx")
3
require(PharmacoGx)
4
require(data.table)
5
path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/"
6
dir.create(paste0(path, "Data/DRP_Training_Data"))
7
8
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv"))
9
10
depmap_samples$stripped_cell_line_name
11
PharmacoGx::availablePSets()
12
13
# CTRPv2 ====
14
??summarizeSensitivityProfiles
15
require(PharmacoGx)
16
17
ctrp2 <- downloadPSet("CTRPv2_2015")
18
19
ctrp2_aac <- summarizeSensitivityProfiles(object = ctrp2, sensitivity.measure = "aac_recomputed", summary.stat = "median", fill.missing = T,)
20
21
fwrite(as.data.table(ctrp2@drug, keep.rownames = T), "Data/DRP_Training_Data/CTRP_DRUG_INFO.csv")
22
drug_info <- ctrp2@drug[, c("gene_symbol_of_protein_target", "target_or_activity_of_compound", "cpd_status")]
23
drug_info[1:5,]
24
drug_info <- as.data.table(drug_info, keep.rownames = T)
25
26
drug_info[rn %like% "Paclitaxel"]
27
drug_info[gene_symbol_of_protein_target != "" & cpd_status == "clinical"]$rn
28
# ctrp2@drug[1:20, c("drugid", "cpd_name")]
29
# ctrp2_ic50 <- PharmacoGx::summarizeSensitivityProfiles(object = ctrp2, sensitivity.measure = "ic50_recomputed", summary.stat = "median", fill.missing = T)
30
# Convert to data.table
31
ctrp2_aac <- as.data.table(ctrp2_aac, keep.rownames = T)
32
# ctrp2_ic50 <- as.data.table(ctrp2_ic50, keep.rownames = T)
33
dim(ctrp2_aac)
34
# dim(ctrp2_ic50)
35
# Convert to long format
36
ctrp2_aac <- melt.data.table(data = ctrp2_aac)
37
# ctrp2_ic50 <- melt.data.table(data = ctrp2_ic50)
38
# Remove NAs
39
ctrp2_aac <- ctrp2_aac[!is.na(value)]
40
# ctrp2_ic50 <- ctrp2_ic50[!is.na(value)]
41
colnames(ctrp2_aac) <- c("cpd_name", "ccl_name", "area_above_curve")
42
# colnames(ctrp2_ic50) <- c("cpd_name", "ccl_name", "ic50")
43
44
# Add smiles data
45
dim(ctrp2_aac)  ### 363,634 dose-response curves
46
47
dim(ctrp2_ic50)
48
dim(ctrp2)
49
ctrp2_smiles <- data.table(drugid = ctrp2@drug$drugid, cpd_smiles = ctrp2@drug$cpd_smiles)
50
51
sum(unique(ctrp2@drug$drugid) %in% unique(ctrp2_aac$cpd_name)) / length(unique(ctrp2_aac$cpd_name))
52
53
54
length(unique(ctrp2_aac$cpd_name))
55
length(unique(ctrp2_ic50$cpd_name))
56
length(unique(ctrp2_smiles$drugid))
57
58
sum(unique(ctrp2_smiles$drugid) %in% unique(ctrp2_aac$cpd_name)) / length(unique(ctrp2_aac$cpd_name))
59
sum(unique(ctrp2_smiles$drugid) == unique(ctrp2_ic50$cpd_name))
60
61
sum(tolower(unique(ctrp2_smiles$drugid)) == tolower(unique(ctrp2_aac$cpd_name)))
62
sum(tolower(unique(ctrp2_smiles$drugid)) == tolower(unique(ctrp2_ic50$cpd_name)))
63
changed = !(tolower(unique(ctrp2_smiles$drugid)) == tolower(unique(ctrp2_aac$cpd_name)))
64
changed = !tolower(unique(ctrp2_smiles$drugid)) == tolower(unique(ctrp2_ic50$cpd_name))
65
66
temp = data.table(before = unique(ctrp2_smiles$drugid)[changed], after = unique(ctrp2_aac$cpd_name)[changed])
67
temp = data.table(before = unique(ctrp2_smiles$drugid)[changed], after = unique(ctrp2_ic50$cpd_name)[changed])
68
View(temp)
69
70
ctrp2_smiles
71
ctrp2_aac
72
73
unique(ctrp2_smiles$drugid)[!(unique(ctrp2_smiles$drugid) == unique(ctrp2_aac$cpd_name))]
74
unique(ctrp2_aac$cpd_name)[!(unique(ctrp2_smiles$cpd_name) == unique(ctrp2_aac$cpd_name))]
75
76
drug_table <- data.table(canonical_name = rownames(drugInfo(ctrp2)),
77
                         common_name = drugInfo(ctrp2)[, c("cpd_name")],
78
                         cpd_smiles = drugInfo(ctrp2)[, c("cpd_smiles")])
79
final_ctrpv2_aac <- merge(ctrp2_aac, drug_table[, c("canonical_name", "cpd_smiles")], by.x = "cpd_name", by.y = "canonical_name")
80
81
final_ctrpv2_ic50 <- merge(ctrp2_ic50, drug_table[, c("canonical_name", "cpd_smiles")], by.x = "drugid", by.y = "canonical_name")
82
83
# Save
84
fwrite(final_ctrpv2_aac, paste0(path, "Data/DRP_Training_Data/CTRP_AAC_SMILES.txt"))
85
fwrite(final_ctrpv2_ic50, paste0(path, "Data/DRP_Training_Data/CTRP_IC50_SMILES.txt"))
86
final_ctrpv2_aac <- fread(paste0(path, "Data/DRP_Training_Data/CTRP_AAC_SMILES.txt"))
87
final_ctrpv2_ic50 <- fread(paste0(path, "Data/DRP_Training_Data/CTRP_IC50_SMILES.txt"))
88
89
# Add disease information
90
require(stringr)
91
line_info <- fread("Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv")
92
final_ctrpv2_aac <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
93
94
# Remove all hyphens, convert to upper case
95
line_info$other_ccl_name <- str_replace_all(toupper(line_info$stripped_cell_line_name), "-", "")
96
final_ctrpv2_aac$other_ccl_name <- str_replace_all(toupper(final_ctrpv2_aac$ccl_name), "-", "")
97
# Remove all spaces
98
line_info$other_ccl_name <- str_replace_all(toupper(line_info$other_ccl_name), " ", "")
99
final_ctrpv2_aac$other_ccl_name <- str_replace_all(toupper(final_ctrpv2_aac$other_ccl_name), " ", "")
100
101
sum(unique(final_ctrpv2_aac$other_ccl_name) %in% unique(line_info$other_ccl_name)) / length(unique(final_ctrpv2_aac$other_ccl_name))  ### 0.89177!!!
102
# A quarter of the data cannot be paired...
103
104
# Find what cannot be paired
105
final_ctrpv2_aac[other_ccl_name %in% unique(final_ctrpv2_aac$other_ccl_name)[!(unique(final_ctrpv2_aac$other_ccl_name) %in% unique(line_info$other_ccl_name))]]
106
107
final_ctrpv2_aac <- merge(final_ctrpv2_aac, line_info[, c("other_ccl_name", "primary_disease")], by = "other_ccl_name")
108
final_ctrpv2_aac$other_ccl_name <- NULL
109
setcolorder(final_ctrpv2_aac, neworder = c("cpd_name", "ccl_name", "primary_disease", "area_above_curve", "cpd_smiles"))
110
111
fwrite(final_ctrpv2_aac, "Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
112
113
# ==== Update August 2021 =====
114
# Fix Tipifarnbi SMILES
115
require(data.table)
116
ctrp_aac <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
117
ctrp_aac[cpd_name == "Tipifarnib"]$cpd_smiles <- "CN1C=NC=C1C(C2=CC=C(C=C2)Cl)(C3=CC4=C(C=C3)N(C(=O)C=C4C5=CC(=CC=C5)Cl)C)N"
118
fwrite(ctrp_aac, "Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
119
120
# ctrp_aac <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")
121
# ==== Update September 2021 ====
122
# Add canonical SMILES for some drugs
123
require(data.table)
124
gdsc1_aac <- fread("Data/DRP_Training_Data/GDSC1_AAC_SMILES.txt")
125
gdsc1_aac[cpd_name == "Tamoxifen"]$cpd_smiles <- "CCC(=C(C1=CC=CC=C1)C2=CC=C(C=C2)OCCN(C)C)C3=CC=CC=C3"
126
gdsc1_aac[cpd_name == "JW-7-52-1"]$cpd_smiles <- "CCC(=O)N1CCN(CC1)C2=C(C=C(C=C2)N3C(=O)C=CC4=CN=C5C=CC(=CC5=C43)C6=CC7=CC=CC=C7N=C6)C(F)(F)F"
127
gdsc1_aac[cpd_name == "KIN001-135"]$cpd_smiles <- "COC1=C(C=C2C(=C1)N=CN2C3=CC(=C(S3)C#N)OCC4=CC=CC=C4S(=O)(=O)C)OC"
128
fwrite(gdsc1_aac, "Data/DRP_Training_Data/GDSC1_AAC_SMILES.txt")
129
130
gdsc2_aac <- fread("Data/DRP_Training_Data/GDSC2_AAC_SMILES.txt")
131
gdsc2_aac[cpd_name == "Tamoxifen"]$cpd_smiles <- "CCC(=C(C1=CC=CC=C1)C2=CC=C(C=C2)OCCN(C)C)C3=CC=CC=C3"
132
fwrite(gdsc2_aac, "Data/DRP_Training_Data/GDSC2_AAC_SMILES.txt")
133
134
# CCLE ====
135
# ccle <- downloadPSet("CCLE")
136
# ccle_auc <- summarizeSensitivityProfiles(pSet = ccle, sensitivity.measure = "auc_recomputed", summary.stat = "median", fill.missing = T)
137
# # Convert to data.table
138
# ccle_auc <- as.data.table(ccle_auc, keep.rownames = T)
139
# dim(ccle_auc)
140
# # Convert to long format
141
# ccle_auc <- melt.data.table(data = ccle_auc)
142
# # Remove NAs
143
# ccle_auc <- ccle_auc[!is.na(value)]
144
# colnames(ccle_auc) <- c("cpd_name", "ccl_name", "area_under_curve")
145
# drug_table <- data.table(canonical_name = rownames(drugInfo(ccle)), common_name = drugInfo(ccle)[, c("drug.name")], pubchem = drugInfo(ccle)[, "PubCHEM"])
146
# ccle_smiles <- webchem::pc_prop(cid = drug_table$pubchem, properties = "CanonicalSMILES")
147
# # Remove drugs without pubchem entries
148
# sum(is.na(drug_table$pubchem))  # 15
149
# drug_table <- drug_table[!is.na(pubchem)]
150
# drug_table <- merge(drug_table, ccle_smiles, by.x = "pubchem", by.y = "CID")
151
# 
152
# 
153
# ccle_final <- merge(ccle_auc, drug_table[, c("canonical_name", "CanonicalSMILES")], by.x = "cpd_name", by.y = "canonical_name")
154
# # Save
155
# fwrite(ccle_final, paste0(path, "Data/DRP_Training_Data/CCLE_AUC_SMILES.txt"))
156
157
158
# GDSC1 ====
159
gdsc2 <- downloadPSet("GDSC")
160
gdsc2_auc <- summarizeSensitivityProfiles(pSet = gdsc2, sensitivity.measure = "auc_recomputed", summary.stat = "median", fill.missing = T)
161
# Convert to data.table
162
gdsc2_auc <- as.data.table(gdsc2_auc, keep.rownames = T)
163
dim(gdsc2_auc)
164
# Convert to long format
165
gdsc2_auc <- melt.data.table(data = gdsc2_auc)
166
# Remove NAs
167
gdsc2_auc <- gdsc2_auc[!is.na(value)]
168
colnames(gdsc2_auc) <- c("cpd_name", "ccl_name", "area_under_curve")
169
drug_table <- data.table(canonical_name = rownames(drugInfo(gdsc2)), common_name = drugInfo(gdsc2)[, c("drug.name")], pubchem = drugInfo(gdsc2)[, "PubCHEM"])
170
gdsc2_smiles <- webchem::pc_prop(cid = drug_table$pubchem, properties = "CanonicalSMILES")
171
# Remove drugs without pubchem entries
172
sum(is.na(drug_table$pubchem))  # 15
173
drug_table <- drug_table[!is.na(pubchem)]
174
drug_table <- merge(drug_table, gdsc2_smiles, by.x = "pubchem", by.y = "CID")
175
176
177
gdsc2_final <- merge(gdsc2_auc, drug_table[, c("canonical_name", "CanonicalSMILES")], by.x = "cpd_name", by.y = "canonical_name")
178
# Save
179
fwrite(gdsc2_final, paste0(path, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt"))
180
181
gdsc2_final <- fread(paste0(path, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt"))
182
colnames(gdsc2_final)[4] <- "cpd_smiles"