--- a +++ b/R/Proteomic_EDA.R @@ -0,0 +1,158 @@ +# Proteomic_EDA.R + +require(data.table) +require(ggplot2) +require(stringr) + +# Cell lines getting engineered for DCAF1 (March 2020) +# NCIH-2170/NCIH-1915/NCIH-1703/NCI-H1373/A549A/NCI-H647/NCIH520 + +# ==== Create Long CCLE Proteomics Data with Cell Line Info ==== +# Read CCLE proteomics data +ccle_prot <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_normalized_protein_expression.csv") +ccle_nonorm <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_summed_sn_non_normalized.csv") +colnames(ccle_prot) + + +# Find extrema +min(ccle_prot[,-c(1:48)], na.rm = T) +min(ccle_nonorm[,-c(1:48)], na.rm = T) +max(ccle_prot[,-c(1:48)], na.rm = T) +max(ccle_nonorm[,-c(1:48)], na.rm = T) + +# Extract cell line names and replace with column names +ccle_prot_lines <- gsub("\\_Ten.+", "", colnames(ccle_prot)[-c(1:48)]) +colnames(ccle_prot)[-c(1:48)] <- ccle_prot_lines + +### Attach tissue information for each cell line: +ccle_line_info <- fread("Data/DepMap/20Q1/sample_info.csv") +sum(ccle_prot_lines %in% ccle_line_info$CCLE_Name) / length(ccle_prot_lines) # All cell line info is available + +# Convert data to long format +long_ccle <- melt.data.table(ccle_prot[, c(2, 6, 49:ncol(ccle_prot)), with = F], + id.vars = colnames(ccle_prot)[c(2,6)], + variable.name = "line", value.name = "norm_quant") + +# Divide cell line into ID and tissue +long_ccle$cell_line <- gsub("\\_.+", "", long_ccle$line) +long_ccle$tissue <- gsub(".*?\\_(.+)", "\\1", long_ccle$line) # '?' means greedy, so the least '.' is used +# long_ccle$line <- NULL + +# Merge with cell line info +long_ccle <- merge(long_ccle, ccle_line_info[, c("CCLE_Name", "lineage", "lineage_subtype", + "lineage_sub_subtype", "sex", "disease", + "disease_subtype", "age", "additional_info")], + by.x = "line", by.y = "CCLE_Name") + +# Save +fwrite(long_ccle, "Data/DepMap/20Q1/long_ccle_prot_data.txt", sep = '\t') + +# ==== Source Tissue Statistics ==== +t1 <- unique(long_ccle[, c("line", "cell_line", "tissue")]) +ggplot(t1) + + geom_bar(aes(x = tissue)) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +ggsave("Plots/All_CCLE/CCLE_Line_Counts_per_Tissue.png") + + + +# ==== Basal Levels of Proteins of Interest ==== +require(data.table) +require(ggplot2) +require(ggridges) +dir.create("Plots") + +# Some UniProt IDs +# DCAF1: Q9Y4B6 +# COPB2: P35606 +# FBXW11: Q9UKB1 + +long_ccle <- fread("Data/DepMap/20Q1/long_ccle_prot_data.txt") + +long_ccle[Uniprot_Acc == "Q9Y4B6"] + +# DCAF1 +dcaf1_all <- ggplot(data = long_ccle[Uniprot_Acc == "Q9Y4B6"]) + + geom_jitter(aes(x = tissue, y = norm_quant), stat = "identity") + + geom_jitter(data = long_ccle[Uniprot_Acc == "Q9Y4B6" & cell_line %in% c("SW1573", "NCIH460", "NCIH358")], + aes(x = tissue, y = norm_quant), colour = "red", size = 2) + + geom_text(data = long_ccle[Uniprot_Acc == "Q9Y4B6" & cell_line %in% c("SW1573", "NCIH460", "NCIH358")], + mapping = aes(x = tissue, y = norm_quant, label = cell_line), colour = "red", nudge_x = 2) + + geom_hline(yintercept = 0, colour = "red") + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +dcaf1_all +dir.create("Plots/All_CCLE") +dir.create("Plots/All_CCLE/DCAF1") +ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_jitter.png", dcaf1_all) + +dcaf1_ridge <- ggplot(data = long_ccle[Uniprot_Acc == "Q9Y4B6"]) + + geom_jitter(aes(x = tissue, y = norm_quant), stat = "identity") + + geom_jitter(data = long_ccle[Uniprot_Acc == "Q9Y4B6" & cell_line %in% c("SW1573", "NCIH460", "NCIH358")], + aes(x = tissue, y = norm_quant), colour = "red", size = 2) + + geom_text(data = long_ccle[Uniprot_Acc == "Q9Y4B6" & cell_line %in% c("SW1573", "NCIH460", "NCIH358")], + mapping = aes(x = tissue, y = norm_quant, label = cell_line), colour = "red", nudge_x = 2) + + geom_hline(yintercept = 0, colour = "red") + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +require(viridis) +require(hrbrthemes) + +dcaf1_ridge <- ggplot(long_ccle[Uniprot_Acc == "Q9Y4B6"], + aes(x = norm_quant, y = tissue, fill = ..x..)) + + geom_density_ridges_gradient(scale = 2, rel_min_height = 0.01) + + scale_fill_viridis(name = "norm_quant", option = "C") + + labs(title = 'Protein Expression of DCAF1 in CCLE Tissues') + + theme_ipsum() + + theme( + legend.position="none", + panel.spacing = unit(0.1, "lines"), + strip.text.x = element_text(size = 8) + ) + +dcaf1_ridge +ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge.png", dcaf1_ridge, width = 10) + + +dcaf1_ridge_hist <- ggplot(long_ccle[Uniprot_Acc == "Q9Y4B6"], + aes(x = norm_quant, y = tissue, fill = tissue)) + + geom_density_ridges(alpha=0.6, stat="binline", bins=20) + + theme_ridges() + + labs(title = 'Protein Expression of DCAF1 in CCLE Tissues') + + theme( + legend.position="none", + panel.spacing = unit(0.1, "lines"), + strip.text.x = element_text(size = 8) + ) + +dcaf1_ridge_hist +ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge_hist.png", dcaf1_ridge_hist, width = 10) + + +# COPB2 +ggplot(data = long_ccle[Uniprot_Acc == "P35606"]) + + geom_jitter(aes(x = tissue, y = norm_quant), stat = "identity") + + geom_jitter(data = long_ccle[Uniprot_Acc == "P35606" & cell_line %in% c("PC9")], + aes(x = tissue, y = norm_quant), colour = "red", size = 2) + + geom_text(data = long_ccle[Uniprot_Acc == "P35606" & cell_line %in% c("PC9")], + mapping = aes(x = tissue, y = norm_quant, label = cell_line), colour = "red", nudge_x = 2) + + geom_hline(yintercept = 0, colour = "red") + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + +ccle_line_info[CCLE_Name %like% "PC9"] +unique(long_ccle[tissue == "LUNG"]$cell_line) + + +# Cell lines we already tested +ccle_prot[Uniprot_Acc == "Q9Y4B6"]$SW1573_LUNG_TenPx33 +ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH358_LUNG_TenPx06 +ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH460_LUNG_TenPx22 + +# Cell lines to be tested (and available in CCLE) +ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH2170_LUNG_TenPx12 +ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH1703_LUNG_TenPx19 +ccle_prot[Uniprot_Acc == "Q9Y4B6"]$A549_LUNG_TenPx12 +ccle_prot[Uniprot_Acc == "Q9Y4B6"]$NCIH520_LUNG_TenPx10 + +ccle_prot[Uniprot_Acc == "Q9Y4B6", -c(1:48)]