[c3b4f8]: / R / Proteomic_EDA.R

Download this file

159 lines (123 with data), 6.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Proteomic_EDA.R
require(data.table)
require(ggplot2)
require(stringr)
# Cell lines getting engineered for DCAF1 (March 2020)
# NCIH-2170/NCIH-1915/NCIH-1703/NCI-H1373/A549A/NCI-H647/NCIH520
# ==== Create Long CCLE Proteomics Data with Cell Line Info ====
# Reshapes the wide CCLE proteomics table into one row per (protein, sample),
# attaches cell line metadata from sample_info.csv and writes the result to
# Data/DepMap/20Q1/long_ccle_prot_data.txt.
# Read CCLE proteomics data
ccle_prot <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_normalized_protein_expression.csv")
ccle_nonorm <- fread("Data/DepMap/20Q1/Cellular Models/CCLE_summed_sn_non_normalized.csv")
colnames(ccle_prot)
# Find extrema of the sample columns. The first 48 columns are skipped
# everywhere in this script -- presumably they hold annotation rather than
# per-sample quantification; verify against the colnames() output above.
min(ccle_prot[, -c(1:48)], na.rm = TRUE)
min(ccle_nonorm[, -c(1:48)], na.rm = TRUE)
max(ccle_prot[, -c(1:48)], na.rm = TRUE)
max(ccle_nonorm[, -c(1:48)], na.rm = TRUE)
# Extract cell line names (drop the trailing "_Ten..." suffix) and use them as
# the sample column names
ccle_prot_lines <- gsub("\\_Ten.+", "", colnames(ccle_prot)[-c(1:48)])
# Dropping the suffix can make two samples of the same line indistinguishable;
# surface that instead of letting them merge silently downstream.
dup_lines <- unique(ccle_prot_lines[duplicated(ccle_prot_lines)])
if (length(dup_lines) > 0) {
  warning("Stripping the TenPx suffix produced duplicated sample names: ",
          paste(dup_lines, collapse = ", "), call. = FALSE)
}
colnames(ccle_prot)[-c(1:48)] <- ccle_prot_lines
### Attach tissue information for each cell line:
ccle_line_info <- fread("Data/DepMap/20Q1/sample_info.csv")
# Fraction of profiled lines found in sample_info
# (author's note: all cell line info is available)
mean(ccle_prot_lines %in% ccle_line_info$CCLE_Name)
# Convert data to long format, keeping columns 2 and 6 as identifiers
long_ccle <- melt(ccle_prot[, c(2, 6, 49:ncol(ccle_prot)), with = FALSE],
                  id.vars = colnames(ccle_prot)[c(2, 6)],
                  variable.name = "line", value.name = "norm_quant")
# Divide cell line into ID and tissue
# Everything before the first underscore is the cell line ID
long_ccle[, cell_line := gsub("\\_.+", "", line)]
# '?' makes '.*' lazy (non-greedy), so it stops at the first underscore and the
# capture group keeps everything after it as the tissue
long_ccle[, tissue := gsub(".*?\\_(.+)", "\\1", line)]
# long_ccle$line <- NULL
# Merge with cell line info (inner join on the CCLE name)
long_ccle <- merge(long_ccle,
                   ccle_line_info[, c("CCLE_Name", "lineage", "lineage_subtype",
                                      "lineage_sub_subtype", "sex", "disease",
                                      "disease_subtype", "age", "additional_info")],
                   by.x = "line", by.y = "CCLE_Name")
# Save
fwrite(long_ccle, "Data/DepMap/20Q1/long_ccle_prot_data.txt", sep = '\t')
# ==== Source Tissue Statistics ====
# Bar chart of how many profiled samples come from each tissue.
# unique() keeps one row per sample so a line is not counted once per protein.
t1 <- unique(long_ccle[, c("line", "cell_line", "tissue")])
tissue_counts <- ggplot(t1) +
  geom_bar(aes(x = tissue)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
tissue_counts
# The output directory is only created later in this script, so make sure it
# exists before saving; recursive/showWarnings keep re-runs quiet.
dir.create("Plots/All_CCLE", recursive = TRUE, showWarnings = FALSE)
# Pass the plot explicitly rather than relying on last_plot()
ggsave("Plots/All_CCLE/CCLE_Line_Counts_per_Tissue.png", tissue_counts)
# ==== Basal Levels of Proteins of Interest ====
# library() fails loudly if a package is missing (require() only returns FALSE)
library(data.table)
library(ggplot2)
library(ggridges)
# Create the whole output tree in one call; showWarnings = FALSE keeps re-runs
# from warning that the directories already exist.
dir.create("Plots/All_CCLE/DCAF1", recursive = TRUE, showWarnings = FALSE)
# Some UniProt IDs
# DCAF1: Q9Y4B6
# COPB2: P35606
# FBXW11: Q9UKB1
long_ccle <- fread("Data/DepMap/20Q1/long_ccle_prot_data.txt")
long_ccle[Uniprot_Acc == "Q9Y4B6"]
# DCAF1
# All DCAF1 measurements, plus the subset of lines to highlight in red
dcaf1_rows <- long_ccle[Uniprot_Acc == "Q9Y4B6"]
dcaf1_highlight <- dcaf1_rows[cell_line %in% c("SW1573", "NCIH460", "NCIH358")]
# Note: the highlighted lines are drawn by both jitter layers, each with its
# own random offset, and the labels are shifted two tissue positions right.
dcaf1_all <- ggplot(data = dcaf1_rows) +
  geom_jitter(aes(x = tissue, y = norm_quant)) +
  geom_jitter(data = dcaf1_highlight,
              aes(x = tissue, y = norm_quant), colour = "red", size = 2) +
  geom_text(data = dcaf1_highlight,
            mapping = aes(x = tissue, y = norm_quant, label = cell_line),
            colour = "red", nudge_x = 2) +
  geom_hline(yintercept = 0, colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
dcaf1_all
ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_jitter.png", dcaf1_all)
# DCAF1 ridgeline densities: one ridge per tissue, filled by a viridis
# gradient along norm_quant.
# (A copy of the jitter plot used to be assigned to dcaf1_ridge here and was
# overwritten straight away; that dead assignment has been removed.)
library(viridis)
library(hrbrthemes)
dcaf1_ridge <- ggplot(long_ccle[Uniprot_Acc == "Q9Y4B6"],
                      # ..x.. is the legacy spelling of after_stat(x): the
                      # fill follows the x value computed by the density stat
                      aes(x = norm_quant, y = tissue, fill = ..x..)) +
  geom_density_ridges_gradient(scale = 2, rel_min_height = 0.01) +
  scale_fill_viridis(name = "norm_quant", option = "C") +
  labs(title = 'Protein Expression of DCAF1 in CCLE Tissues') +
  theme_ipsum() +
  theme(
    legend.position = "none",
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 8)
  )
dcaf1_ridge
ggsave("Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge.png", dcaf1_ridge, width = 10)
# Binned (histogram-style) ridgelines of DCAF1 abundance, one ridge per tissue
# and coloured by tissue; the legend is hidden.
dcaf1_ridge_hist <- ggplot(
  data = long_ccle[Uniprot_Acc == "Q9Y4B6"],
  mapping = aes(x = norm_quant, y = tissue, fill = tissue)
)
# Add the layer, theme and title in a second step
dcaf1_ridge_hist <- dcaf1_ridge_hist +
  geom_density_ridges(stat = "binline", bins = 20, alpha = 0.6) +
  theme_ridges() +
  theme(
    strip.text.x = element_text(size = 8),
    panel.spacing = unit(0.1, "lines"),
    legend.position = "none"
  ) +
  labs(title = 'Protein Expression of DCAF1 in CCLE Tissues')
dcaf1_ridge_hist
ggsave(
  filename = "Plots/All_CCLE/DCAF1/DCAF1_All_CCLE_ridge_hist.png",
  plot = dcaf1_ridge_hist,
  width = 10
)
# COPB2
# Per-tissue jitter of COPB2 (P35606) abundance with PC9 marked and labelled
# in red. The x/y mapping is declared once and inherited by the point and
# text layers.
ggplot(long_ccle[Uniprot_Acc == "P35606"],
       aes(x = tissue, y = norm_quant)) +
  geom_jitter(stat = "identity") +
  geom_jitter(
    data = long_ccle[Uniprot_Acc == "P35606" & cell_line %in% "PC9"],
    colour = "red", size = 2
  ) +
  geom_text(
    data = long_ccle[Uniprot_Acc == "P35606" & cell_line %in% "PC9"],
    mapping = aes(label = cell_line),
    colour = "red", nudge_x = 2
  ) +
  geom_hline(yintercept = 0, colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Look up PC9 in the cell line metadata
ccle_line_info[like(CCLE_Name, "PC9")]
# Distinct cell line IDs among the lung samples
long_ccle[tissue == "LUNG", unique(cell_line)]
# DCAF1 (Q9Y4B6) quantification for specific lung lines.
# The sample columns of ccle_prot are renamed earlier in this script to drop
# the "_TenPx.." suffix, so `$<line>_TenPxNN` lookups return NULL. Columns are
# matched on the suffix-free name instead, which works whether or not the
# rename has been applied to ccle_prot.
dcaf1_prot <- ccle_prot[Uniprot_Acc == "Q9Y4B6"]
sample_names <- gsub("\\_Ten.+", "", colnames(dcaf1_prot))
# Cell lines we already tested
tested_cols <- which(sample_names %in% c("SW1573_LUNG", "NCIH358_LUNG", "NCIH460_LUNG"))
dcaf1_prot[, tested_cols, with = FALSE]
# Cell lines to be tested (and available in CCLE)
planned_cols <- which(sample_names %in% c("NCIH2170_LUNG", "NCIH1703_LUNG",
                                          "A549_LUNG", "NCIH520_LUNG"))
dcaf1_prot[, planned_cols, with = FALSE]
# DCAF1 across every sample column
ccle_prot[Uniprot_Acc == "Q9Y4B6", -c(1:48)]