Diff of /R/Proteomic_EDA.R [000000] .. [c3b4f8]

Switch to unified view

a b/R/Proteomic_EDA.R
1
# Proteomic_EDA.R
2
3
require(data.table)
4
require(ggplot2)
5
require(stringr)
6
7
# Cell lines getting engineered for DCAF1 (March 2020)
8
# NCIH-2170/NCIH-1915/NCIH-1703/NCI-H1373/A549A/NCI-H647/NCIH520
9
10
# ==== Create Long CCLE Proteomics Data with Cell Line Info ====
# Reads the CCLE protein quantification tables, strips the ten-plex suffix from
# the per-sample column names, reshapes the normalized table to long format,
# joins DepMap sample information onto every row and writes the result to disk.

# Read CCLE proteomics data
ccle_prot <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_normalized_protein_expression.csv")
ccle_nonorm <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_summed_sn_non_normalized.csv")
colnames(ccle_prot)

# Columns 1-48 are excluded from every numeric summary below and all remaining
# columns are handled as one column per sample.
# NOTE(review): presumably the first 48 columns are protein annotation -- confirm
# against the downloaded file before reusing this with another release.
n_annot_cols <- 48L
annot_idx <- seq_len(n_annot_cols)

# Find extrema
min(ccle_prot[, -annot_idx, with = FALSE], na.rm = TRUE)
min(ccle_nonorm[, -annot_idx, with = FALSE], na.rm = TRUE)
max(ccle_prot[, -annot_idx, with = FALSE], na.rm = TRUE)
max(ccle_nonorm[, -annot_idx, with = FALSE], na.rm = TRUE)

# Extract cell line names (drop everything from "_Ten" onwards) and use them as
# the new sample column names. ccle_prot keeps these shortened names for the
# rest of the script.
ccle_prot_lines <- gsub("\\_Ten.+", "", colnames(ccle_prot)[-annot_idx])
if (anyDuplicated(ccle_prot_lines) > 0L) {
  # Two columns that differ only in their suffix collapse onto the same name.
  warning(
    "Duplicated cell line names after stripping the suffix: ",
    paste(unique(ccle_prot_lines[duplicated(ccle_prot_lines)]), collapse = ", "),
    call. = FALSE
  )
}
colnames(ccle_prot)[-annot_idx] <- ccle_prot_lines

### Attach tissue information for each cell line:
ccle_line_info <- fread("Data/DepMap/20Q1/sample_info.csv")
# Fraction of proteomics cell lines that have an entry in the sample info table
# (the original author noted that all cell line info is available).
matched_fraction <- mean(ccle_prot_lines %in% ccle_line_info$CCLE_Name)
matched_fraction
if (isTRUE(matched_fraction < 1)) {
  # The merge below is an inner join, so unmatched lines would vanish silently.
  warning(
    "Cell lines without sample info will be dropped by the merge: ",
    paste(setdiff(ccle_prot_lines, ccle_line_info$CCLE_Name), collapse = ", "),
    call. = FALSE
  )
}

# Convert data to long format: columns 2 and 6 stay as identifiers, every
# sample column becomes a (line, norm_quant) pair.
id_cols <- colnames(ccle_prot)[c(2, 6)]
long_ccle <- melt.data.table(
  ccle_prot[, c(2, 6, (n_annot_cols + 1L):ncol(ccle_prot)), with = FALSE],
  id.vars = id_cols,
  variable.name = "line",
  value.name = "norm_quant"
)

# Divide cell line into ID and tissue.
# cell_line: everything before the first underscore.
# tissue: '.*?' is lazy (non-greedy), so it stops at the first underscore and
# the capture group keeps everything after it.
long_ccle[, c("cell_line", "tissue") := list(
  gsub("\\_.+", "", line),
  gsub(".*?\\_(.+)", "\\1", line)
)]
# long_ccle$line <- NULL

# Merge with cell line info
long_ccle <- merge(
  long_ccle,
  ccle_line_info[, c("CCLE_Name", "lineage", "lineage_subtype",
                     "lineage_sub_subtype", "sex", "disease",
                     "disease_subtype", "age", "additional_info")],
  by.x = "line", by.y = "CCLE_Name"
)

# Save
fwrite(long_ccle, "Data/DepMap/20Q1/long_ccle_prot_data.txt", sep = '\t')
49
50
# ==== Source Tissue Statistics ====
# Bar chart counting the distinct (line, cell_line, tissue) combinations per
# tissue label, saved as a PNG.
t1 <- unique(long_ccle[, c("line", "cell_line", "tissue")])
tissue_counts_plot <- ggplot(t1) +
  geom_bar(aes(x = tissue)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
tissue_counts_plot

# Make sure the output folder exists before saving; nothing earlier in the
# script creates it. recursive = TRUE also creates the parent "Plots" folder.
dir.create("Plots/All_CCLE", recursive = TRUE, showWarnings = FALSE)
# Pass the plot explicitly instead of relying on the last displayed plot.
ggsave("Plots/All_CCLE/CCLE_Line_Counts_per_Tissue.png", tissue_counts_plot)
56
57
58
59
# ==== Basal Levels of Proteins of Interest ====
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first missing function.
library(data.table)
library(ggplot2)
library(ggridges)
# Re-running the script must not warn when the folder is already there.
dir.create("Plots", showWarnings = FALSE)

# Some UniProt IDs
# DCAF1: Q9Y4B6
# COPB2: P35606
# FBXW11: Q9UKB1

# Reload the long-format table written by the previous section.
long_ccle <- fread("Data/DepMap/20Q1/long_ccle_prot_data.txt")

# Quick look at the DCAF1 rows
long_ccle[Uniprot_Acc == "Q9Y4B6"]
73
74
# DCAF1
# Jitter plot of DCAF1 (Q9Y4B6) normalized quantification per tissue. The lines
# SW1573, NCIH460 and NCIH358 are drawn again in red and labelled by name.
dcaf1_quant <- long_ccle[Uniprot_Acc == "Q9Y4B6"]
dcaf1_marked <- dcaf1_quant[cell_line %in% c("SW1573", "NCIH460", "NCIH358")]

# NOTE(review): the red layer is jittered independently of the black layer and
# the labels are nudged but not jittered, so the three markers for one cell
# line do not coincide exactly.
dcaf1_all <- ggplot(data = dcaf1_quant) +
  geom_jitter(aes(x = tissue, y = norm_quant)) +
  geom_jitter(data = dcaf1_marked,
              aes(x = tissue, y = norm_quant), colour = "red", size = 2) +
  geom_text(data = dcaf1_marked,
            mapping = aes(x = tissue, y = norm_quant, label = cell_line),
            colour = "red", nudge_x = 2) +
  geom_hline(yintercept = 0, colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

dcaf1_all
# One recursive call creates Plots/, Plots/All_CCLE/ and the DCAF1 folder, and
# stays silent when they already exist.
dir.create("Plots/All_CCLE/DCAF1", recursive = TRUE, showWarnings = FALSE)
ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_jitter.png", dcaf1_all)
88
89
# library() fails immediately when a package is missing.
library(viridis)
library(hrbrthemes)

# Ridgeline density of DCAF1 (Q9Y4B6) normalized quantification, one ridge per
# tissue, filled by the x value with the viridis "C" palette.
dcaf1_ridge <- ggplot(long_ccle[Uniprot_Acc == "Q9Y4B6"],
                      # after_stat(x) replaces the deprecated ..x.. notation
                      aes(x = norm_quant, y = tissue, fill = after_stat(x))) +
  geom_density_ridges_gradient(scale = 2, rel_min_height = 0.01) +
  scale_fill_viridis(name = "norm_quant", option = "C") +
  labs(title = 'Protein Expression of DCAF1 in CCLE Tissues') +
  theme_ipsum() +
  theme(
    legend.position = "none",
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 8)
  )

dcaf1_ridge
ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge.png", dcaf1_ridge, width = 10)
115
116
117
# Binned (histogram-style) ridgeline plot of DCAF1 (Q9Y4B6) normalized
# quantification, one ridge per tissue and coloured by tissue.
dcaf1_hist_rows <- long_ccle[Uniprot_Acc == "Q9Y4B6"]

# Theme tweaks applied on top of theme_ridges()
dcaf1_hist_tweaks <- theme(
  legend.position = "none",
  panel.spacing = unit(0.1, "lines"),
  strip.text.x = element_text(size = 8)
)

dcaf1_ridge_hist <- ggplot(
  dcaf1_hist_rows,
  aes(x = norm_quant, y = tissue, fill = tissue)
)
dcaf1_ridge_hist <- dcaf1_ridge_hist +
  geom_density_ridges(alpha = 0.6, stat = "binline", bins = 20) +
  theme_ridges() +
  labs(title = 'Protein Expression of DCAF1 in CCLE Tissues') +
  dcaf1_hist_tweaks

dcaf1_ridge_hist
ggsave(
  "Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge_hist.png",
  dcaf1_ridge_hist,
  width = 10
)
130
131
132
# COPB2
# Jitter plot of COPB2 (P35606) normalized quantification per tissue with the
# PC9 line drawn again in red and labelled. The plot is displayed, not saved.
copb2_rows <- long_ccle[Uniprot_Acc == "P35606"]
copb2_marked <- copb2_rows[cell_line %in% c("PC9")]

ggplot(data = copb2_rows) +
  geom_jitter(aes(x = tissue, y = norm_quant), stat = "identity") +
  geom_jitter(
    data = copb2_marked,
    aes(x = tissue, y = norm_quant),
    colour = "red", size = 2
  ) +
  geom_text(
    data = copb2_marked,
    mapping = aes(x = tissue, y = norm_quant, label = cell_line),
    colour = "red", nudge_x = 2
  ) +
  geom_hline(yintercept = 0, colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Look for PC9 in the sample info and list the lung lines present in the data
ccle_line_info[CCLE_Name %like% "PC9"]
unique(long_ccle[tissue == "LUNG", cell_line])
145
146
147
# Print the DCAF1 (Q9Y4B6) value(s) of one cell line from the wide table.
#
# The sample columns of ccle_prot were renamed earlier in this script: the
# "_TenPx.." suffix was stripped. Looking columns up by their suffixed name
# with `$` therefore returns NULL. This helper matches the line name with or
# without the suffix, so it works on both the renamed and a freshly read table,
# and it returns every matching column if a line occurs more than once.
#
# line_name: cell line and tissue without suffix, e.g. "SW1573_LUNG".
# Returns a data.table with one column per matching sample column.
show_dcaf1 <- function(line_name) {
  hits <- grep(paste0("^", line_name, "(_Ten.+)?$"), colnames(ccle_prot))
  if (length(hits) == 0L) {
    warning("No column found for cell line: ", line_name, call. = FALSE)
  }
  ccle_prot[Uniprot_Acc == "Q9Y4B6", hits, with = FALSE]
}

# Cell lines we already tested
show_dcaf1("SW1573_LUNG")
show_dcaf1("NCIH358_LUNG")
show_dcaf1("NCIH460_LUNG")

# Cell lines to be tested (and available in CCLE)
show_dcaf1("NCIH2170_LUNG")
show_dcaf1("NCIH1703_LUNG")
show_dcaf1("A549_LUNG")
show_dcaf1("NCIH520_LUNG")

# DCAF1 across all sample columns
ccle_prot[Uniprot_Acc == "Q9Y4B6", -c(1:48)]