|
a |
|
b/R/Proteomic_EDA.R |
|
|
1 |
# Proteomic_EDA.R
# Exploratory analysis of CCLE proteomics data (DepMap 20Q1 files).

# library() stops immediately when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with an unrelated error.
library(data.table)
library(ggplot2)
library(stringr)

# Cell lines getting engineered for DCAF1 (March 2020)
# NCIH-2170/NCIH-1915/NCIH-1703/NCI-H1373/A549A/NCI-H647/NCIH520
|
|
9 |
|
|
|
10 |
# ==== Create Long CCLE Proteomics Data with Cell Line Info ====
# Reshapes the normalized CCLE proteomics table into long format (one row per
# input row and sample column), annotates each sample with DepMap cell line
# information and writes the result to a tab-separated file.

# Read CCLE proteomics data
ccle_prot <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_normalized_protein_expression.csv")
ccle_nonorm <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_summed_sn_non_normalized.csv")
colnames(ccle_prot)

# Find extrema
# Columns 1-48 are skipped throughout this section; all remaining columns are
# treated as per-sample quantification columns.
min(ccle_prot[, -c(1:48)], na.rm = TRUE)
min(ccle_nonorm[, -c(1:48)], na.rm = TRUE)
max(ccle_prot[, -c(1:48)], na.rm = TRUE)
max(ccle_nonorm[, -c(1:48)], na.rm = TRUE)

# Extract cell line names and replace the sample column names with them:
# everything from "_Ten" onwards is dropped from each sample column name.
# NOTE: this renames the columns of `ccle_prot` in place, so the original
# "..._TenPxNN" names are no longer available afterwards.
ccle_prot_lines <- gsub("\\_Ten.+", "", colnames(ccle_prot)[-c(1:48)])
colnames(ccle_prot)[-c(1:48)] <- ccle_prot_lines

### Attach tissue information for each cell line:
ccle_line_info <- fread("Data/DepMap/20Q1/sample_info.csv")
# Fraction of proteomics cell lines that have an entry in the sample info
# (1 means all cell line info is available)
mean(ccle_prot_lines %in% ccle_line_info$CCLE_Name)

# Convert data to long format
# Columns 2 and 6 are kept as identifier columns.
# NOTE(review): presumably gene symbol and UniProt accession (`Uniprot_Acc` is
# used for filtering further down) -- confirm against colnames(ccle_prot).
id_col_idx <- c(2, 6)
long_ccle <- melt(
  ccle_prot[, c(id_col_idx, 49:ncol(ccle_prot)), with = FALSE],
  id.vars = colnames(ccle_prot)[id_col_idx],
  variable.name = "line",
  value.name = "norm_quant"
)

# Divide cell line into ID and tissue
# cell_line: everything before the first underscore.
# tissue:    everything after the first underscore; the '?' makes '.*' lazy
#            (non-greedy), so it stops at the first '_' instead of the last.
long_ccle[, cell_line := gsub("\\_.+", "", line)]
long_ccle[, tissue := gsub(".*?\\_(.+)", "\\1", line)]
# long_ccle$line <- NULL

# Merge with cell line info
# Default merge() is an inner join: samples whose name has no match in
# `CCLE_Name` are dropped from the result.
line_info_cols <- c(
  "CCLE_Name", "lineage", "lineage_subtype",
  "lineage_sub_subtype", "sex", "disease",
  "disease_subtype", "age", "additional_info"
)
long_ccle <- merge(
  long_ccle,
  ccle_line_info[, line_info_cols, with = FALSE],
  by.x = "line", by.y = "CCLE_Name"
)

# Save
fwrite(long_ccle, "Data/DepMap/20Q1/long_ccle_prot_data.txt", sep = "\t")
|
|
49 |
|
|
|
50 |
# ==== Source Tissue Statistics ====
# Bar chart of the number of distinct profiled samples per tissue.

# One row per sample: collapse the long table down to its sample identifiers
t1 <- unique(long_ccle[, c("line", "cell_line", "tissue")])

tissue_count_plot <- ggplot(t1) +
  geom_bar(aes(x = tissue)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
tissue_count_plot

# The output directory is only created further down in this script, so make
# sure it exists here; otherwise the save fails on a fresh checkout.
dir.create("Plots/All_CCLE", recursive = TRUE, showWarnings = FALSE)
# Pass the plot explicitly instead of relying on ggplot2's last_plot()
ggsave("Plots/All_CCLE/CCLE_Line_Counts_per_Tissue.png", tissue_count_plot)
|
|
56 |
|
|
|
57 |
|
|
|
58 |
|
|
|
59 |
# ==== Basal Levels of Proteins of Interest ====
# This section can be run on its own: it loads its packages and re-reads the
# long-format table written by the data preparation section.
library(data.table)
library(ggplot2)
library(ggridges)
# showWarnings = FALSE: do not warn when the directory already exists
dir.create("Plots", showWarnings = FALSE)

# Some UniProt IDs
# DCAF1: Q9Y4B6
# COPB2: P35606
# FBXW11: Q9UKB1

long_ccle <- fread("Data/DepMap/20Q1/long_ccle_prot_data.txt")

# Quick look at all DCAF1 rows
long_ccle[Uniprot_Acc == "Q9Y4B6"]
|
|
73 |
|
|
|
74 |
# DCAF1
# Jitter plot of DCAF1 normalized quantification per tissue, with three cell
# lines of interest highlighted and labelled in red.

# Subset once instead of re-filtering the full table for every layer
dcaf1_quant <- long_ccle[Uniprot_Acc == "Q9Y4B6"]
dcaf1_highlight <- dcaf1_quant[cell_line %in% c("SW1573", "NCIH460", "NCIH358")]

# NOTE(review): the red highlight layer is jittered independently of the base
# layer and of the text labels, so red points, their black counterparts and
# their labels do not coincide exactly.
dcaf1_all <- ggplot(data = dcaf1_quant) +
  geom_jitter(aes(x = tissue, y = norm_quant), stat = "identity") +
  geom_jitter(
    data = dcaf1_highlight,
    aes(x = tissue, y = norm_quant), colour = "red", size = 2
  ) +
  geom_text(
    data = dcaf1_highlight,
    mapping = aes(x = tissue, y = norm_quant, label = cell_line),
    colour = "red", nudge_x = 2
  ) +
  geom_hline(yintercept = 0, colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

dcaf1_all

# recursive = TRUE creates "Plots/All_CCLE" and "Plots/All_CCLE/DCAF1" in one
# call even if the parents are missing; showWarnings = FALSE keeps reruns quiet.
dir.create("Plots/All_CCLE/DCAF1", recursive = TRUE, showWarnings = FALSE)
ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_jitter.png", dcaf1_all)
|
|
88 |
|
|
|
89 |
# Ridgeline density plot of DCAF1 normalized quantification per tissue.
# (A previous first assignment to `dcaf1_ridge` duplicated the jitter plot and
# was overwritten before ever being used; it has been removed.)
library(viridis)
library(hrbrthemes)

# The fill follows the x value (norm_quant), giving a colour gradient along
# each ridge.
# NOTE(review): ggplot2 >= 3.4.0 deprecates the `..x..` notation in favour of
# `after_stat(x)`; the old form is kept for compatibility with older installs.
dcaf1_ridge <- ggplot(
  long_ccle[Uniprot_Acc == "Q9Y4B6"],
  aes(x = norm_quant, y = tissue, fill = ..x..)
) +
  geom_density_ridges_gradient(scale = 2, rel_min_height = 0.01) +
  scale_fill_viridis(name = "norm_quant", option = "C") +
  labs(title = "Protein Expression of DCAF1 in CCLE Tissues") +
  theme_ipsum() +
  theme(
    legend.position = "none",
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 8)
  )

dcaf1_ridge
ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge.png", dcaf1_ridge, width = 10)
|
|
115 |
|
|
|
116 |
|
|
|
117 |
# Binned (histogram-style) ridgeline plot of DCAF1 normalized quantification,
# one ridge per tissue, each filled by tissue.
dcaf1_hist_data <- long_ccle[Uniprot_Acc == "Q9Y4B6"]

# Theme overrides applied on top of theme_ridges()
dcaf1_hist_theme <- theme(
  legend.position = "none",
  panel.spacing = unit(0.1, "lines"),
  strip.text.x = element_text(size = 8)
)

dcaf1_ridge_hist <- ggplot(
  data = dcaf1_hist_data,
  mapping = aes(x = norm_quant, y = tissue, fill = tissue)
) +
  geom_density_ridges(stat = "binline", bins = 20, alpha = 0.6) +
  labs(title = "Protein Expression of DCAF1 in CCLE Tissues") +
  theme_ridges() +
  dcaf1_hist_theme

dcaf1_ridge_hist
ggsave(
  filename = "Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge_hist.png",
  plot = dcaf1_ridge_hist,
  width = 10
)
|
|
130 |
|
|
|
131 |
|
|
|
132 |
# COPB2
# Jitter plot of COPB2 normalized quantification per tissue with the PC9 cell
# line highlighted and labelled in red; the plot is only displayed, not saved.
copb2_quant <- long_ccle[Uniprot_Acc == "P35606"]
copb2_pc9 <- copb2_quant[cell_line %in% c("PC9")]

ggplot(data = copb2_quant) +
  geom_jitter(mapping = aes(x = tissue, y = norm_quant), stat = "identity") +
  geom_jitter(
    data = copb2_pc9,
    mapping = aes(x = tissue, y = norm_quant),
    colour = "red",
    size = 2
  ) +
  geom_text(
    data = copb2_pc9,
    mapping = aes(x = tissue, y = norm_quant, label = cell_line),
    colour = "red",
    nudge_x = 2
  ) +
  geom_hline(yintercept = 0, colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Sample info entries whose CCLE name contains "PC9"
ccle_line_info[CCLE_Name %like% "PC9"]
# Distinct cell line IDs among the lung samples
long_ccle[tissue == "LUNG", unique(cell_line)]
|
|
145 |
|
|
|
146 |
|
|
|
147 |
# DCAF1 quantification for individual cell lines, read from the wide table.
#
# An earlier step of this script strips the "_TenPx.." suffix from the sample
# column names of `ccle_prot`, after which `$SW1573_LUNG_TenPx33` and friends
# silently return NULL. Columns are therefore matched on their
# "<CELLLINE>_<TISSUE>" name with an optional "_Ten..." suffix, which works
# both before and after the renaming.
dcaf1_prot_row <- ccle_prot[Uniprot_Acc == "Q9Y4B6"]

# Return the columns of `prot_row` belonging to the given sample names.
#   prot_row: data.table holding the row(s) of the protein of interest
#   line_ids: character vector of "<CELLLINE>_<TISSUE>" names (no suffix)
# Warns about names that match no column instead of returning NULL silently.
select_line_quant <- function(prot_row, line_ids) {
  all_cols <- colnames(prot_row)
  hits <- lapply(line_ids, function(id) {
    grep(paste0("^", id, "(_Ten.+)?$"), all_cols)
  })
  unmatched <- line_ids[lengths(hits) == 0L]
  if (length(unmatched) > 0L) {
    warning(
      "No column found for: ", paste(unmatched, collapse = ", "),
      call. = FALSE
    )
  }
  # Select by position so duplicated column names cannot cause mix-ups
  prot_row[, as.integer(unlist(hits)), with = FALSE]
}

# Cell lines we already tested
select_line_quant(dcaf1_prot_row, c("SW1573_LUNG", "NCIH358_LUNG", "NCIH460_LUNG"))

# Cell lines to be tested (and available in CCLE)
select_line_quant(
  dcaf1_prot_row,
  c("NCIH2170_LUNG", "NCIH1703_LUNG", "A549_LUNG", "NCIH520_LUNG")
)

# DCAF1 values across all sample columns (columns 1-48 are skipped)
ccle_prot[Uniprot_Acc == "Q9Y4B6", -c(1:48)]