|
a |
|
b/R/01_Dose-Response_Data_Preparation.R |
|
|
1 |
# PharmacoGx.R |
|
|
2 |
# BiocManager::install("PharmacoGx") |
|
|
3 |
require(PharmacoGx) |
|
|
4 |
require(data.table) |
|
|
5 |
path = "/Users/ftaj/OneDrive - University of Toronto/Drug_Response/" |
|
|
6 |
dir.create(paste0(path, "Data/DRP_Training_Data")) |
|
|
7 |
|
|
|
8 |
depmap_samples <- fread(paste0(path, "Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv")) |
|
|
9 |
|
|
|
10 |
depmap_samples$stripped_cell_line_name |
|
|
11 |
PharmacoGx::availablePSets() |
|
|
12 |
|
|
|
13 |
# CTRPv2 ==== |
|
|
14 |
??summarizeSensitivityProfiles |
|
|
15 |
require(PharmacoGx) |
|
|
16 |
|
|
|
17 |
ctrp2 <- downloadPSet("CTRPv2_2015") |
|
|
18 |
|
|
|
19 |
ctrp2_aac <- summarizeSensitivityProfiles(object = ctrp2, sensitivity.measure = "aac_recomputed", summary.stat = "median", fill.missing = T,) |
|
|
20 |
|
|
|
21 |
fwrite(as.data.table(ctrp2@drug, keep.rownames = T), "Data/DRP_Training_Data/CTRP_DRUG_INFO.csv") |
|
|
22 |
drug_info <- ctrp2@drug[, c("gene_symbol_of_protein_target", "target_or_activity_of_compound", "cpd_status")] |
|
|
23 |
drug_info[1:5,] |
|
|
24 |
drug_info <- as.data.table(drug_info, keep.rownames = T) |
|
|
25 |
|
|
|
26 |
drug_info[rn %like% "Paclitaxel"] |
|
|
27 |
drug_info[gene_symbol_of_protein_target != "" & cpd_status == "clinical"]$rn |
|
|
28 |
# ctrp2@drug[1:20, c("drugid", "cpd_name")] |
|
|
29 |
# ctrp2_ic50 <- PharmacoGx::summarizeSensitivityProfiles(object = ctrp2, sensitivity.measure = "ic50_recomputed", summary.stat = "median", fill.missing = T) |
|
|
30 |
# Convert to data.table |
|
|
31 |
ctrp2_aac <- as.data.table(ctrp2_aac, keep.rownames = T) |
|
|
32 |
# ctrp2_ic50 <- as.data.table(ctrp2_ic50, keep.rownames = T) |
|
|
33 |
dim(ctrp2_aac) |
|
|
34 |
# dim(ctrp2_ic50) |
|
|
35 |
# Convert to long format |
|
|
36 |
ctrp2_aac <- melt.data.table(data = ctrp2_aac) |
|
|
37 |
# ctrp2_ic50 <- melt.data.table(data = ctrp2_ic50) |
|
|
38 |
# Remove NAs |
|
|
39 |
ctrp2_aac <- ctrp2_aac[!is.na(value)] |
|
|
40 |
# ctrp2_ic50 <- ctrp2_ic50[!is.na(value)] |
|
|
41 |
colnames(ctrp2_aac) <- c("cpd_name", "ccl_name", "area_above_curve") |
|
|
42 |
# colnames(ctrp2_ic50) <- c("cpd_name", "ccl_name", "ic50") |
|
|
43 |
|
|
|
44 |
# Add smiles data |
|
|
45 |
dim(ctrp2_aac) ### 363,634 dose-response curves |
|
|
46 |
|
|
|
47 |
dim(ctrp2_ic50) |
|
|
48 |
dim(ctrp2) |
|
|
49 |
ctrp2_smiles <- data.table(drugid = ctrp2@drug$drugid, cpd_smiles = ctrp2@drug$cpd_smiles) |
|
|
50 |
|
|
|
51 |
sum(unique(ctrp2@drug$drugid) %in% unique(ctrp2_aac$cpd_name)) / length(unique(ctrp2_aac$cpd_name)) |
|
|
52 |
|
|
|
53 |
|
|
|
54 |
length(unique(ctrp2_aac$cpd_name)) |
|
|
55 |
length(unique(ctrp2_ic50$cpd_name)) |
|
|
56 |
length(unique(ctrp2_smiles$drugid)) |
|
|
57 |
|
|
|
58 |
sum(unique(ctrp2_smiles$drugid) %in% unique(ctrp2_aac$cpd_name)) / length(unique(ctrp2_aac$cpd_name)) |
|
|
59 |
sum(unique(ctrp2_smiles$drugid) == unique(ctrp2_ic50$cpd_name)) |
|
|
60 |
|
|
|
61 |
sum(tolower(unique(ctrp2_smiles$drugid)) == tolower(unique(ctrp2_aac$cpd_name))) |
|
|
62 |
sum(tolower(unique(ctrp2_smiles$drugid)) == tolower(unique(ctrp2_ic50$cpd_name))) |
|
|
63 |
changed = !(tolower(unique(ctrp2_smiles$drugid)) == tolower(unique(ctrp2_aac$cpd_name))) |
|
|
64 |
changed = !tolower(unique(ctrp2_smiles$drugid)) == tolower(unique(ctrp2_ic50$cpd_name)) |
|
|
65 |
|
|
|
66 |
temp = data.table(before = unique(ctrp2_smiles$drugid)[changed], after = unique(ctrp2_aac$cpd_name)[changed]) |
|
|
67 |
temp = data.table(before = unique(ctrp2_smiles$drugid)[changed], after = unique(ctrp2_ic50$cpd_name)[changed]) |
|
|
68 |
View(temp) |
|
|
69 |
|
|
|
70 |
ctrp2_smiles |
|
|
71 |
ctrp2_aac |
|
|
72 |
|
|
|
73 |
unique(ctrp2_smiles$drugid)[!(unique(ctrp2_smiles$drugid) == unique(ctrp2_aac$cpd_name))] |
|
|
74 |
unique(ctrp2_aac$cpd_name)[!(unique(ctrp2_smiles$cpd_name) == unique(ctrp2_aac$cpd_name))] |
|
|
75 |
|
|
|
76 |
drug_table <- data.table(canonical_name = rownames(drugInfo(ctrp2)), |
|
|
77 |
common_name = drugInfo(ctrp2)[, c("cpd_name")], |
|
|
78 |
cpd_smiles = drugInfo(ctrp2)[, c("cpd_smiles")]) |
|
|
79 |
final_ctrpv2_aac <- merge(ctrp2_aac, drug_table[, c("canonical_name", "cpd_smiles")], by.x = "cpd_name", by.y = "canonical_name") |
|
|
80 |
|
|
|
81 |
final_ctrpv2_ic50 <- merge(ctrp2_ic50, drug_table[, c("canonical_name", "cpd_smiles")], by.x = "drugid", by.y = "canonical_name") |
|
|
82 |
|
|
|
83 |
# Save |
|
|
84 |
fwrite(final_ctrpv2_aac, paste0(path, "Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")) |
|
|
85 |
fwrite(final_ctrpv2_ic50, paste0(path, "Data/DRP_Training_Data/CTRP_IC50_SMILES.txt")) |
|
|
86 |
final_ctrpv2_aac <- fread(paste0(path, "Data/DRP_Training_Data/CTRP_AAC_SMILES.txt")) |
|
|
87 |
final_ctrpv2_ic50 <- fread(paste0(path, "Data/DRP_Training_Data/CTRP_IC50_SMILES.txt")) |
|
|
88 |
|
|
|
89 |
# Add disease information |
|
|
90 |
require(stringr) |
|
|
91 |
line_info <- fread("Data/DRP_Training_Data/DepMap_20Q2_Line_Info.csv") |
|
|
92 |
final_ctrpv2_aac <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt") |
|
|
93 |
|
|
|
94 |
# Remove all hyphens, convert to upper case |
|
|
95 |
line_info$other_ccl_name <- str_replace_all(toupper(line_info$stripped_cell_line_name), "-", "") |
|
|
96 |
final_ctrpv2_aac$other_ccl_name <- str_replace_all(toupper(final_ctrpv2_aac$ccl_name), "-", "") |
|
|
97 |
# Remove all spaces |
|
|
98 |
line_info$other_ccl_name <- str_replace_all(toupper(line_info$other_ccl_name), " ", "") |
|
|
99 |
final_ctrpv2_aac$other_ccl_name <- str_replace_all(toupper(final_ctrpv2_aac$other_ccl_name), " ", "") |
|
|
100 |
|
|
|
101 |
sum(unique(final_ctrpv2_aac$other_ccl_name) %in% unique(line_info$other_ccl_name)) / length(unique(final_ctrpv2_aac$other_ccl_name)) ### 0.89177!!! |
|
|
102 |
# A quarter of the data cannot be paired... |
|
|
103 |
|
|
|
104 |
# Find what cannot be paired |
|
|
105 |
final_ctrpv2_aac[other_ccl_name %in% unique(final_ctrpv2_aac$other_ccl_name)[!(unique(final_ctrpv2_aac$other_ccl_name) %in% unique(line_info$other_ccl_name))]] |
|
|
106 |
|
|
|
107 |
final_ctrpv2_aac <- merge(final_ctrpv2_aac, line_info[, c("other_ccl_name", "primary_disease")], by = "other_ccl_name") |
|
|
108 |
final_ctrpv2_aac$other_ccl_name <- NULL |
|
|
109 |
setcolorder(final_ctrpv2_aac, neworder = c("cpd_name", "ccl_name", "primary_disease", "area_above_curve", "cpd_smiles")) |
|
|
110 |
|
|
|
111 |
fwrite(final_ctrpv2_aac, "Data/DRP_Training_Data/CTRP_AAC_SMILES.txt") |
|
|
112 |
|
|
|
113 |
# ==== Update August 2021 ===== |
|
|
114 |
# Fix Tipifarnbi SMILES |
|
|
115 |
require(data.table) |
|
|
116 |
ctrp_aac <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt") |
|
|
117 |
ctrp_aac[cpd_name == "Tipifarnib"]$cpd_smiles <- "CN1C=NC=C1C(C2=CC=C(C=C2)Cl)(C3=CC4=C(C=C3)N(C(=O)C=C4C5=CC(=CC=C5)Cl)C)N" |
|
|
118 |
fwrite(ctrp_aac, "Data/DRP_Training_Data/CTRP_AAC_SMILES.txt") |
|
|
119 |
|
|
|
120 |
# ctrp_aac <- fread("Data/DRP_Training_Data/CTRP_AAC_SMILES.txt") |
|
|
121 |
# ==== Update September 2021 ==== |
|
|
122 |
# Add canonical SMILES for some drugs |
|
|
123 |
require(data.table) |
|
|
124 |
gdsc1_aac <- fread("Data/DRP_Training_Data/GDSC1_AAC_SMILES.txt") |
|
|
125 |
gdsc1_aac[cpd_name == "Tamoxifen"]$cpd_smiles <- "CCC(=C(C1=CC=CC=C1)C2=CC=C(C=C2)OCCN(C)C)C3=CC=CC=C3" |
|
|
126 |
gdsc1_aac[cpd_name == "JW-7-52-1"]$cpd_smiles <- "CCC(=O)N1CCN(CC1)C2=C(C=C(C=C2)N3C(=O)C=CC4=CN=C5C=CC(=CC5=C43)C6=CC7=CC=CC=C7N=C6)C(F)(F)F" |
|
|
127 |
gdsc1_aac[cpd_name == "KIN001-135"]$cpd_smiles <- "COC1=C(C=C2C(=C1)N=CN2C3=CC(=C(S3)C#N)OCC4=CC=CC=C4S(=O)(=O)C)OC" |
|
|
128 |
fwrite(gdsc1_aac, "Data/DRP_Training_Data/GDSC1_AAC_SMILES.txt") |
|
|
129 |
|
|
|
130 |
gdsc2_aac <- fread("Data/DRP_Training_Data/GDSC2_AAC_SMILES.txt") |
|
|
131 |
gdsc2_aac[cpd_name == "Tamoxifen"]$cpd_smiles <- "CCC(=C(C1=CC=CC=C1)C2=CC=C(C=C2)OCCN(C)C)C3=CC=CC=C3" |
|
|
132 |
fwrite(gdsc2_aac, "Data/DRP_Training_Data/GDSC2_AAC_SMILES.txt") |
|
|
133 |
|
|
|
134 |
# CCLE ==== |
|
|
135 |
# ccle <- downloadPSet("CCLE") |
|
|
136 |
# ccle_auc <- summarizeSensitivityProfiles(pSet = ccle, sensitivity.measure = "auc_recomputed", summary.stat = "median", fill.missing = T) |
|
|
137 |
# # Convert to data.table |
|
|
138 |
# ccle_auc <- as.data.table(ccle_auc, keep.rownames = T) |
|
|
139 |
# dim(ccle_auc) |
|
|
140 |
# # Convert to long format |
|
|
141 |
# ccle_auc <- melt.data.table(data = ccle_auc) |
|
|
142 |
# # Remove NAs |
|
|
143 |
# ccle_auc <- ccle_auc[!is.na(value)] |
|
|
144 |
# colnames(ccle_auc) <- c("cpd_name", "ccl_name", "area_under_curve") |
|
|
145 |
# drug_table <- data.table(canonical_name = rownames(drugInfo(ccle)), common_name = drugInfo(ccle)[, c("drug.name")], pubchem = drugInfo(ccle)[, "PubCHEM"]) |
|
|
146 |
# ccle_smiles <- webchem::pc_prop(cid = drug_table$pubchem, properties = "CanonicalSMILES") |
|
|
147 |
# # Remove drugs without pubchem entries |
|
|
148 |
# sum(is.na(drug_table$pubchem)) # 15 |
|
|
149 |
# drug_table <- drug_table[!is.na(pubchem)] |
|
|
150 |
# drug_table <- merge(drug_table, ccle_smiles, by.x = "pubchem", by.y = "CID") |
|
|
151 |
# |
|
|
152 |
# |
|
|
153 |
# ccle_final <- merge(ccle_auc, drug_table[, c("canonical_name", "CanonicalSMILES")], by.x = "cpd_name", by.y = "canonical_name") |
|
|
154 |
# # Save |
|
|
155 |
# fwrite(ccle_final, paste0(path, "Data/DRP_Training_Data/CCLE_AUC_SMILES.txt")) |
|
|
156 |
|
|
|
157 |
|
|
|
158 |
# GDSC1 ==== |
|
|
159 |
gdsc2 <- downloadPSet("GDSC") |
|
|
160 |
gdsc2_auc <- summarizeSensitivityProfiles(pSet = gdsc2, sensitivity.measure = "auc_recomputed", summary.stat = "median", fill.missing = T) |
|
|
161 |
# Convert to data.table |
|
|
162 |
gdsc2_auc <- as.data.table(gdsc2_auc, keep.rownames = T) |
|
|
163 |
dim(gdsc2_auc) |
|
|
164 |
# Convert to long format |
|
|
165 |
gdsc2_auc <- melt.data.table(data = gdsc2_auc) |
|
|
166 |
# Remove NAs |
|
|
167 |
gdsc2_auc <- gdsc2_auc[!is.na(value)] |
|
|
168 |
colnames(gdsc2_auc) <- c("cpd_name", "ccl_name", "area_under_curve") |
|
|
169 |
drug_table <- data.table(canonical_name = rownames(drugInfo(gdsc2)), common_name = drugInfo(gdsc2)[, c("drug.name")], pubchem = drugInfo(gdsc2)[, "PubCHEM"]) |
|
|
170 |
gdsc2_smiles <- webchem::pc_prop(cid = drug_table$pubchem, properties = "CanonicalSMILES") |
|
|
171 |
# Remove drugs without pubchem entries |
|
|
172 |
sum(is.na(drug_table$pubchem)) # 15 |
|
|
173 |
drug_table <- drug_table[!is.na(pubchem)] |
|
|
174 |
drug_table <- merge(drug_table, gdsc2_smiles, by.x = "pubchem", by.y = "CID") |
|
|
175 |
|
|
|
176 |
|
|
|
177 |
gdsc2_final <- merge(gdsc2_auc, drug_table[, c("canonical_name", "CanonicalSMILES")], by.x = "cpd_name", by.y = "canonical_name") |
|
|
178 |
# Save |
|
|
179 |
fwrite(gdsc2_final, paste0(path, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")) |
|
|
180 |
|
|
|
181 |
gdsc2_final <- fread(paste0(path, "Data/DRP_Training_Data/GDSC2_AUC_SMILES.txt")) |
|
|
182 |
colnames(gdsc2_final)[4] <- "cpd_smiles" |