Diff of /R/Proteomic_EDA.R [000000] .. [c3b4f8]

Switch to side-by-side view

--- a
+++ b/R/Proteomic_EDA.R
@@ -0,0 +1,158 @@
+# Proteomic_EDA.R
+
+require(data.table)
+require(ggplot2)
+require(stringr)
+
+# Cell lines getting engineered for DCAF1 (March 2020)
+# NCIH-2170/NCIH-1915/NCIH-1703/NCI-H1373/A549A/NCI-H647/NCIH520
+
+# ==== Create Long CCLE Proteomics Data with Cell Line Info ====
+# Read CCLE proteomics data 
+ccle_prot <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_normalized_protein_expression.csv")
+ccle_nonorm <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_summed_sn_non_normalized.csv")
+colnames(ccle_prot)
+
+
+# Find extrema
+min(ccle_prot[,-c(1:48)], na.rm = T)
+min(ccle_nonorm[,-c(1:48)], na.rm = T)
+max(ccle_prot[,-c(1:48)], na.rm = T)
+max(ccle_nonorm[,-c(1:48)], na.rm = T)
+
+# Extract cell line names and replace with column names
+ccle_prot_lines <- gsub("\\_Ten.+", "", colnames(ccle_prot)[-c(1:48)])
+colnames(ccle_prot)[-c(1:48)] <- ccle_prot_lines
+
+### Attach tissue information for each cell line:
+ccle_line_info <- fread("Data/DepMap/20Q1/sample_info.csv")
+sum(ccle_prot_lines %in% ccle_line_info$CCLE_Name) / length(ccle_prot_lines)  # All cell line info is available
+
+# Convert data to long format
+long_ccle <- melt.data.table(ccle_prot[, c(2, 6, 49:ncol(ccle_prot)), with = F],
+                             id.vars = colnames(ccle_prot)[c(2,6)],
+                             variable.name = "line", value.name = "norm_quant")
+
+# Divide cell line into ID and tissue
+long_ccle$cell_line <- gsub("\\_.+", "", long_ccle$line)
+long_ccle$tissue <- gsub(".*?\\_(.+)", "\\1", long_ccle$line)  # '?' means greedy, so the least '.' is used
+# long_ccle$line <- NULL
+
+# Merge with cell line info
+long_ccle <- merge(long_ccle, ccle_line_info[, c("CCLE_Name", "lineage", "lineage_subtype",
+                                                "lineage_sub_subtype", "sex", "disease",
+                                                "disease_subtype", "age", "additional_info")],
+                   by.x = "line", by.y = "CCLE_Name")
+
+# Save
+fwrite(long_ccle, "Data/DepMap/20Q1/long_ccle_prot_data.txt", sep = '\t')
+
+# ==== Source Tissue Statistics ====
+t1 <- unique(long_ccle[, c("line", "cell_line", "tissue")])
+ggplot(t1) +
+  geom_bar(aes(x = tissue)) +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+ggsave("Plots/All_CCLE/CCLE_Line_Counts_per_Tissue.png")
+
+
+
+# ==== Basal Levels of Proteins of Interest ====
+require(data.table)
+require(ggplot2)
+require(ggridges)
+dir.create("Plots")
+
+# Some UniProt IDs
+# DCAF1: Q9Y4B6
+# COPB2: P35606
+# FBXW11: Q9UKB1
+
+long_ccle <- fread("Data/DepMap/20Q1/long_ccle_prot_data.txt")
+
+long_ccle[Uniprot_Acc == "Q9Y4B6"]
+
+# DCAF1
+dcaf1_all <- ggplot(data = long_ccle[Uniprot_Acc == "Q9Y4B6"]) + 
+  geom_jitter(aes(x = tissue, y = norm_quant), stat = "identity") +
+  geom_jitter(data = long_ccle[Uniprot_Acc == "Q9Y4B6" & cell_line %in% c("SW1573", "NCIH460", "NCIH358")], 
+              aes(x = tissue, y = norm_quant), colour = "red", size = 2) +
+  geom_text(data = long_ccle[Uniprot_Acc == "Q9Y4B6" & cell_line %in% c("SW1573", "NCIH460", "NCIH358")],
+            mapping = aes(x = tissue, y = norm_quant, label = cell_line), colour = "red", nudge_x = 2) +
+  geom_hline(yintercept = 0, colour = "red") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+dcaf1_all
+dir.create("Plots/All_CCLE")
+dir.create("Plots/All_CCLE/DCAF1")
+ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_jitter.png", dcaf1_all)
+
+dcaf1_ridge <- ggplot(data = long_ccle[Uniprot_Acc == "Q9Y4B6"]) + 
+  geom_jitter(aes(x = tissue, y = norm_quant), stat = "identity") +
+  geom_jitter(data = long_ccle[Uniprot_Acc == "Q9Y4B6" & cell_line %in% c("SW1573", "NCIH460", "NCIH358")], 
+              aes(x = tissue, y = norm_quant), colour = "red", size = 2) +
+  geom_text(data = long_ccle[Uniprot_Acc == "Q9Y4B6" & cell_line %in% c("SW1573", "NCIH460", "NCIH358")],
+            mapping = aes(x = tissue, y = norm_quant, label = cell_line), colour = "red", nudge_x = 2) +
+  geom_hline(yintercept = 0, colour = "red") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+require(viridis)
+require(hrbrthemes)
+
+dcaf1_ridge <- ggplot(long_ccle[Uniprot_Acc == "Q9Y4B6"],
+       aes(x = norm_quant, y = tissue, fill = ..x..)) +
+  geom_density_ridges_gradient(scale = 2, rel_min_height = 0.01) +
+  scale_fill_viridis(name = "norm_quant", option = "C") +
+  labs(title = 'Protein Expression of DCAF1 in CCLE Tissues') +
+  theme_ipsum() +
+  theme(
+    legend.position="none",
+    panel.spacing = unit(0.1, "lines"),
+    strip.text.x = element_text(size = 8)
+  )
+
+dcaf1_ridge
+ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge.png", dcaf1_ridge, width = 10)
+
+
+dcaf1_ridge_hist <- ggplot(long_ccle[Uniprot_Acc == "Q9Y4B6"],
+                      aes(x = norm_quant, y = tissue, fill = tissue)) +
+  geom_density_ridges(alpha=0.6, stat="binline", bins=20) +
+  theme_ridges() +
+  labs(title = 'Protein Expression of DCAF1 in CCLE Tissues') +
+  theme(
+    legend.position="none",
+    panel.spacing = unit(0.1, "lines"),
+    strip.text.x = element_text(size = 8)
+  )
+
+dcaf1_ridge_hist
+ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge_hist.png", dcaf1_ridge_hist, width = 10)
+
+
+# COPB2
+ggplot(data = long_ccle[Uniprot_Acc == "P35606"]) + 
+  geom_jitter(aes(x = tissue, y = norm_quant), stat = "identity") +
+  geom_jitter(data = long_ccle[Uniprot_Acc == "P35606" & cell_line %in% c("PC9")], 
+              aes(x = tissue, y = norm_quant), colour = "red", size = 2) +
+  geom_text(data = long_ccle[Uniprot_Acc == "P35606" & cell_line %in% c("PC9")],
+            mapping = aes(x = tissue, y = norm_quant, label = cell_line), colour = "red", nudge_x = 2) +
+  geom_hline(yintercept = 0, colour = "red") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+
+ccle_line_info[CCLE_Name %like% "PC9"]
+unique(long_ccle[tissue == "LUNG"]$cell_line)
+
+
+# Cell lines we already tested
+ccle_prot[Uniprot_Acc == "Q9Y4B6"]$SW1573_LUNG_TenPx33
+ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH358_LUNG_TenPx06
+ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH460_LUNG_TenPx22
+
+# Cell lines to be tested (and available in CCLE)
+ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH2170_LUNG_TenPx12
+ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH1703_LUNG_TenPx19
+ccle_prot[Uniprot_Acc == "Q9Y4B6"]$A549_LUNG_TenPx12
+ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH520_LUNG_TenPx10
+
+ccle_prot[Uniprot_Acc == "Q9Y4B6", -c(1:48)]