[bd22c4]: / eda / EAT / GC_metabolomics_heatmap.R

Download this file

461 lines (359 with data), 19.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
#### 01_EAT_exploring_KAO_04_GCdata.R #####
## Access DB SQlite where GC data lives and use in pheatmap.
## Append patient meta information
## The RSQLite code was taken from KAO.
#BiocManager::install("DelayedMatrixStats")
library(DBI)
library(RSQLite)
library(pheatmap)
library(plyr)
library(RColorBrewer)
library(DelayedMatrixStats)
library(randomcoloR)
library(cluster)
##### Establish a connection with DB #####
con <- dbConnect(SQLite(), dbname = "P:/All_20200428_COVID_plasma_multiomics/SQLite Database/Covid-19 Study DB.sqlite")
##### Pull data from DB #####
dbListTables(con)
# [1] "biomolecules" "deidentified_patient_metadata" "lipidomics_measurements" "lipidomics_runs"
# [5] "metabolomics_measurements" "metabolomics_runs" "metadata" "omes"
# [9] "patient_metadata" "patient_samples" "proteomics_measurements" "proteomics_runs"
# [13] "rawfiles" "sqlite_sequence"
# Extract tables from SQLite database. The square prakets provide the first row or column names of table.
dbReadTable(con, "biomolecules")[0,]
dbReadTable(con, "metadata")[1:10,]
dbReadTable(con, "metabolomics_measurements")[0,]
dbReadTable(con, "metabolomics_runs")[0,]
dbReadTable(con, "biomolecules")[0,]
dbReadTable(con, "deidentified_patient_metadata")[0,]
##### Knit tables into a long dataframe #####
## df is a dataframe of normalized metabolite abundances, the rawfile where metabolite was extracted from, and batch info
## deiidentified_patient_meta is a dataframe that contains metat data for the patients
## metabolite_class is a dataframe of the metabolite standarized names and metadata associated to metabolite class
# Select unique_identifier from metabolomics_runs
# Select normalized_abundance from from metabolomics_measurements
# Select biomolecule.id from metabolomic_measurements
# Select batch from rawfiles
df<- dbGetQuery(con, "SELECT unique_identifier, normalized_abundance, metabolomics_measurements.biomolecule_id, batch
FROM metabolomics_measurements
INNER JOIN metabolomics_runs ON metabolomics_runs.replicate_id = metabolomics_measurements.replicate_id
INNER JOIN rawfiles ON rawfiles.rawfile_id = metabolomics_runs.rawfile_id
INNER JOIN biomolecules on biomolecules.biomolecule_id = metabolomics_measurements.biomolecule_id
WHERE rawfiles.keep = 1
AND ome_id = 3
AND biomolecules.keep = '1'
")
# Select Sample_label, Gender, ICU_1, Mech_Ventilation from deidentified_patient_metadata
deidentified_patient_meta <- dbGetQuery(con, "SELECT Sample_label, Gender, ICU_1, Mech_Ventilation
FROM deidentified_patient_metadata
")
# Select Metabolite standardized_name in biomolecules
# Select biomolecule_id in metadatat
# Select metadata_type in metadata
# Select metadata_value in metadata
metabolite_class <- dbGetQuery(con, "SELECT standardized_name, metadata.biomolecule_id, metadata_type, metadata_value
FROM metadata
INNER JOIN biomolecules ON biomolecules.biomolecule_id = metadata.biomolecule_id
WHERE biomolecules.omics_id = 3
AND biomolecules.keep = 1
AND metadata.metadata_type = 'Class'
")
dbDisconnect(con)
##### Data formatting #####
# Reshape data into wide format with first two columns containing meta data
# Column names is "normalized_abundance.X" where X is the biomolecule.id
df_wide <- reshape(df, timevar = "biomolecule_id", v.names = "normalized_abundance",
idvar = "unique_identifier", direction = "wide" )
##### Meta formatting #####
## meta contains the relevant information provided in the rawfile name (aka unique_identifier)
# Subset meta data and break up string by "_"
meta <- as.data.frame(stringr::str_split_fixed(df_wide$unique_identifier,"_",5))
# Fix typo in Disease state. Update factors and eleves
meta$V3[which(meta$V3 == "Conrol")] <- "Control"
meta$V3 <- factor(meta$V3,levels = c("Control","COVID","NONCOVID"))
# Pad single digit numbers with a zero ex. 2 -> '02'
samples <- vector()
for (i in 1:nrow(meta)) {
if(nchar(as.character(meta$V4[i]),type = "char")==1){
one_sample <- paste0("0",meta$V4[i])
samples <- append(samples,one_sample)
}else{
one_sample <- as.character(meta$V4[i])
samples <- append(samples,one_sample)
}
}
#overwrite split1:25 with sample_number padded with a zero if single digit
meta$V5 <- samples
meta$V6 <- paste0(meta$V3,"_",meta$V5)
#add batch info column
meta$V7 <- df_wide$batch
#rename meta columns
colnames(meta) <- c("Rawfile.Date.Stamp", "Sample.Type",
"Disease.State", "Patient.Number",
"Padded.Patient.Number", "Sample_Label",
"Batch")
# The patient id's are used more than once.
# To make all id's unique, the make.unique function will add numbers to each duplicated entry
meta$Sample_Lable.unique <- make.unique(as.character(meta$Sample_Label))
##### Prepare for heatmap ######
# Subset dataframe to include abunance values only
metabolomics <- df_wide[,c(3:ncol(df_wide))]
# rename row names to Patient unique Id. (non-duplicated)
rownames(metabolomics) <- meta$Sample_Lable.unique
# rename columns to only the metabolite unique identifier
colnames(metabolomics) <- sub(".*abundance.","",colnames(metabolomics))
# Annotations for patients
patient_annotation <- meta[,c(1,3,7)]
patient_annotation$Batch <- factor(patient_annotation$Batch, levels = seq(1:7))
rownames(patient_annotation) <- meta$meta$Sample_Lable.unique
#patient_annotation$Date <- as.Date(patient_annotation$Date)
my_colour = list(
Rawfile.Date.Stamp = c(`20200427` = "#F4F4F9", `20200428` = "#E1FBFB", `20200429` = "#B8DBD9", `20200430` = "#86D6E0",
`20200506` = "#3C99B2", `20200507` = "#368BA4", `20200508` = "#18475C" ),
Disease.State = c(`Control` = "#F7F7F7", `COVID` = "#D0D3CA", `NONCOVID` = "#A3A79D"),
Batch = c(`1` = "#BCF8EC", `2` = "#AED9E0", `3` = "#9FA0C3", `4` = "#8B687F", `5` = "#7B435B", `6` = "#5C3344", `7` = "#1C093D"))
##### Plot Heatmap #####
scaleRYG <- colorRampPalette(c("#3C99B2","#E8CB2E","#EF2D00"), space = "rgb")(20)
pheatmap(t(metabolomics),
color = scaleRYG,
annotation_colors = my_colour,
annotation_col = patient_annotation,
cluster_rows = T,
#scale = "column",
show_colnames = F,
show_rownames = F,
main = "GC-MS Small Molecule COVID-19 HeatMap")
#### Remove Controls ####
## Create a heatmap without Controls and append more patient metadata
##### Patient Meta formatting ####
# Remove controls from meta
meta_patientONLY <- meta[-grep("Control",meta$Sample_Lable.unique),]
# What patient meta information is missing?
PatientSample_Missing <- meta_patientONLY[which(!meta_patientONLY$Sample_Label %in% deidentified_patient_meta$Sample_label),]
# match sample labels in meta and deidentified_patient_meta
meta_patientONLY <- meta_patientONLY[which(meta_patientONLY$Sample_Lable.unique %in% deidentified_patient_meta$Sample_label),]
# reorder deidentified_patient_meta data to match sample label in meta_patientONLY
deidentified_patient_meta <- deidentified_patient_meta[order(match(deidentified_patient_meta$Sample_label,meta_patientONLY$Sample_Label)),]
# Merge deidentified_patient_meta with meta_patientONLY by sample label
meta_patientONLY_merged <- merge(meta_patientONLY,deidentified_patient_meta, by.x = "Sample_Label", by.y = "Sample_label" )
##### Data without Controls #####
# Remove rows that are connected with Control Data
metabolomics_noControls <- metabolomics[which(rownames(metabolomics) %in% meta_patientONLY_merged$Sample_Label),]
# For metaboanalyst
Disease.state <- sub("_.*", "",rownames(metabolomics_noControls))
metaboanalyst_df <- cbind(Disease.state,metabolomics_noControls)
metaboanalyst_df <- t(metaboanalyst_df)
rownames(metaboanalyst_df) <- c("Disease.state",metabolite_class$standardized_name)
#write.csv(metaboanalyst_df, file = "H:/Projects/COVID19/SmallMolecule/Files/metaboanalyst_GCMetabolomics_normalized.csv")
# Annotations for patients
#patient_annotation <- meta_patientONLY_merged[,c(2,4,7,9,10,11)]
patient_annotation <- meta_patientONLY_merged[,c(4,9,10,11)]
#patient_annotation$Batch <- factor(patient_annotation$Batch, levels = seq(1:7))
patient_annotation$Gender <- factor(patient_annotation$Gender, levels = c("F","M"))
patient_annotation$ICU_1 <- factor(patient_annotation$ICU_1, levels = c("0","1"))
patient_annotation$Mech_Ventilation <- factor(as.character(patient_annotation$Mech_Ventilation), levels = c("0","1"))
rownames(patient_annotation) <- meta_patientONLY_merged$Sample_Label
#patient_annotation$Date <- as.Date(patient_annotation$Date)
# Annotations for molecules
metabolite_annotation <- metabolite_class[which(metabolite_class$biomolecule_id %in% colnames(metabolomics)),]
rownames(metabolite_annotation) <- metabolite_annotation$biomolecule_id
metabolite_class_extended <- as.data.frame(stringr::str_split_fixed(metabolite_class$metadata_value,";",2))
metabolite_class_extended$V1 <- as.character(metabolite_class_extended$V1)
metabolite_class_extended$V2 <- trimws(as.character(metabolite_class_extended$V2))
# Simplify classes
simplified_class <- vector()
for (i in 1:nrow(metabolite_class_extended)) {
if(nchar(as.character(metabolite_class_extended$V2[i]), type = "char") == 0){
metabolite_class_extended$V2[i] <- metabolite_class_extended$V1[i]
}
if(length(grep("Amino",metabolite_class_extended$V2[i])) != 0){
simplified_class <- append(simplified_class,"Amino acids")
}else{
simplified_class <- append(simplified_class,metabolite_class_extended$V2[i])
}
}
metabolite_class_extended$V3 <- simplified_class
metabolite_annotation <- as.data.frame(metabolite_class_extended$V3)
colnames(metabolite_annotation) <- "Metabolite.Class"
rownames(metabolite_annotation) <- metabolite_class$standardized_name
metabolite_annotation$Metabolite.Class <- as.factor(metabolite_annotation$Metabolite.Class)
my_colour = list(
#Rawfile.Date.Stamp = c(`20200427` = "#F4F4F9", `20200428` = "#E1FBFB", `20200429` = "#B8DBD9", `20200430` = "#86D6E0",
#`20200506` = "#3C99B2", `20200507` = "#368BA4", `20200508` = "#18475C" ),
Disease.State = c( `COVID` = "#D0D3CA", `NONCOVID` = "#A3A79D"),
#Batch = c(`1` = "#FDF0FE", `2` = "#E8C5E9", `3` = "#9FA0C3", `4` = "#8B687F", `5` = "#7B435B", `6` = "#5C3344", `7` = "#1C093D"),
Gender = c(`F` = "#FB3640", `M` = "#1D3461"))
#ICU_1 = c(`0` = "#E1FBFB ", `1` = "#368BA4"))
#Mech_Ventilation = c(`0` = "#D9FF29", `1` = "#1B2D2A"))
#scaleRYG <- colorRampPalette(c("#3C99B2","#E8CB2E","#EF2D00"), space = "rgb")(20)
#scaleRYG <- colorRampPalette(c("#3C99B2","#ffffff","#EF2D00"), space = "rgb")(20)
pheatmap(t(metabolomics_noControls),
color = scaleRYG,
annotation_colors = my_colour,
annotation_col = patient_annotation,
annotation_row = metabolite_annotation,
cluster_cols = T,
#scale = "column",
show_colnames = F,
show_rownames = F,
#scale = "row",
main = "GC-MS Small Molecule COVID-19 HeatMap")
##### Exploratory Figures #####
metabolites_Known_Unknown <- t(metabolomics_noControls)
rownames(metabolites_Known_Unknown) <- metabolite_class$standardized_name
metabolites_Unknown <- metabolites_Known_Unknown[grep("unknown",rownames(metabolites_Known_Unknown)),]
metabolites_Known <- metabolites_Known_Unknown[-grep("unknown",rownames(metabolites_Known_Unknown)),]
# Boxplot of unknown metabolites
boxplot(t(metabolites_Unknown),
main = "Boxplot of unknown GC-MS metabolites",
ylab = "Log2(Intensity)")
par(mar = c(22.1, 4.1, 4.1, 4.1) # change the margins bottow, left, top, and right
#lwd = 2, # increase the line thickness
#cex.axis = 1.2 # increase default axis label size
)
# Boxplot of known metabolites
boxplot(t(metabolites_Known),
main = "Boxplot of known GC-MS metabolites",
ylab = "Log2(Intensity)",
las = 1,
xaxt = "n")
text(x = 1:length(rownames(metabolites_Known)),
y = par("usr")[3] - 0.30, #subtract from y axis to push labels down
labels = sub(" RT.*","",rownames(metabolites_Known)),
xpd = NA, #print below axis
srt = 90,
adj = 1)
# Histogram of Unknowns and Knowns
par(mar = c(6.1, 4.1, 4.1, 4.1) # change the margins bottow, left, top, and right
#lwd = 2, # increase the line thickness
#cex.axis = 1.2 # increase default axis label size
)
hist(t(metabolites_Unknown),
main = "Histogram of GC-MS Uknown features Log2(Intensity)",
xlab = "Log2(Intensity)",
xlim = c(0,30),
las = 1,
breaks = 20)
hist(t(metabolites_Known),
main = "Histogram of GC-MS Known metabolites Log2(Intensity)",
xlab = "Log2(Intensity)",
xlim = c(0,35),
las = 1,
breaks = 20)
##### Fold Change to Median NONCOVID #####
metabolomics_COVID <- metabolomics_noControls[-grep("NONCOVID",rownames(metabolomics_noControls)),]
#write.csv(metabolomics_COVID, file = "H:/Projects/COVID19/SmallMolecule/Files/COVID_GCMetabolomics_normalized.csv")
metabolomics_NONCOVID <- metabolomics_noControls[grep("NONCOVID",rownames(metabolomics_noControls)),]
#write.csv(metabolomics_NONCOVID, file = "H:/Projects/COVID19/SmallMolecule/Files/NONCOVID_GCMetabolomics_normalized.csv")
# Boxplot of NON-COVID patient's dynamic range
par(mar = c(6.1, 4.1, 4.1, 4.1))
boxplot(t(metabolomics_NONCOVID),
main = "Boxplot of dynamic range for each Non-COVID patient",
ylab = "Log2(Intensity",
las=1,
xaxt= "n")
text(x = 1:length(rownames(metabolomics_NONCOVID)),
y = par("usr")[3] - 0.45,
labels = rownames(metabolomics_NONCOVID),
xpd = NA,
srt = 35,
adj = 1)
# Calculate the median for each column = metabolite/feature
NONCOVID_Median.MetaboliteIntensity <- colMedians(as.matrix(metabolomics_NONCOVID))
# Foldchange function -> take every row and divide it by the median vector
fold_change.FUNC <- function(x) x-NONCOVID_Median.MetaboliteIntensity
df_fc <- apply(metabolomics_COVID,1, fold_change.FUNC)
rownames(df_fc) <- metabolite_class$standardized_name
#n < -40
#palette <- distinctColorPalette(n)
# New color palette patients
my_colour = list(
Gender = c(`F` = "#C0B9DD", `M` = "#234947"),
ICU_1 = c(`No` = "#D0D3CA", `Yes` = "#368BA4"),
Mech_Ventilation = c(`No` = "#D0D3CA", `Yes` = "#463F3A"))
# Substitute all 0 -> NO and 1 -> Yes in ICU_1
patient_annotation$ICU_1 <- as.character(patient_annotation$ICU_1)
patient_annotation$ICU_1 <- sub("0","No",patient_annotation$ICU_1)
patient_annotation$ICU_1 <- sub("1","Yes",patient_annotation$ICU_1)
patient_annotation$ICU_1 <- factor(patient_annotation$ICU_1, levels = c("No","Yes"))
# Substitute all 1 -> NO and 1 -> Yes in ICU_1
patient_annotation$Mech_Ventilation <- as.character(patient_annotation$Mech_Ventilation)
patient_annotation$Mech_Ventilation <- sub("0","No",patient_annotation$Mech_Ventilation)
patient_annotation$Mech_Ventilation <- sub("1","Yes",patient_annotation$Mech_Ventilation)
patient_annotation$Mech_Ventilation <- factor(patient_annotation$Mech_Ventilation, levels = c("No","Yes"))
# Heatmap of the Fold Change calculated from the median in NONCOVID cohort
par(mar = c(20, 4.1, 4.1, 4.1))
scaleRYG <- colorRampPalette(c("#3C99B2","#ffffff","#EF2D00"), space = "rgb")(20)
pheatmap(df_fc,
color = scaleRYG,
annotation_colors = my_colour,
annotation_col = patient_annotation[-grep("NONCOVID",rownames(patient_annotation)),-c(1)],
#annotation_row = metabolite_annotation,
cluster_cols = T,
#scale = "column",
show_colnames = F,
show_rownames = F,
#scale = "row",
main = "Heatmap Fold Change COVID/NONCOVID(median)")
# Histogram of Fold Changes
par(mar = c(6.1, 4.1, 4.1, 4.1))
hist(df_fc,
main = "Histogram of Fold Changes COVID relative to NONCOVID",
xlab = "Fold Change of Log2",
xlim = c(-15,15),
breaks = 50)
# subset features with fold change greater than 1.2 to cluster
important_features_foldchange <- df_fc[rowSums(abs(df_fc)>4.2)>0,]
out <- pheatmap(important_features_foldchange,
color = scaleRYG,
annotation_colors = my_colour,
annotation_col = patient_annotation[-grep("NONCOVID",rownames(patient_annotation)),-c(1)],
annotation_row = metabolite_annotation,
cluster_cols = T,
#scale = "column",
show_colnames = F,
show_rownames = F,
#scale = "row",
main = "Heatmap Fold Change COVID/NONCOVID(median)\n Subset Features that have abs(FC) > 4.2 in at least one patient")
# Protein groups with fold change greater than 1.2
table(rowSums(abs(df_fc)>1.2)>0)
# Protein groups with fold change greater than 2
table(rowSums(abs(df_fc)>2)>0)
# Protein groups with fold change greater than 4
table(rowSums(abs(df_fc)>4)>0)
# Protein groups with fold change greater than 8
table(rowSums(abs(df_fc)>8)>0)
# Re-0rder original data (metabolites) to match ordering in heatmap (top-to-bottom)
rownames(important_features_foldchange[out$tree_row[["order"]],])
# Re-order original data (patients) to match ordering in heatmap(left-to-right)
colnames(important_features_foldchange[,out$tree_col[["order"]]])
# metabolite-to-cluster assignments
sort(cutree(out$tree_row, k=10))
# Plot extracted gene-t0-cluster assignments
plot(out$tree_row)
##### K means clustering #####
fold_hclust <- hclust(dist(important_features_foldchange, method = "euclidean"))
clusters <- cutree(fold_hclust, k = 10) #k = 20)
clusplot(important_features_foldchange,
clusters,
lines = 0,# no distance lines will appear,
main = "ClusPlot of subset features\n witth abs(log2(FC)) > 4.2 in at least one patient")
dev.off()
plot(fold_hclust, labels = FALSE)
rect.hclust(fold_hclust, k=10, border = "red")
##### Compress Fold Change ####
# bin protein groups so that we describe the changes within fold changes of -2 and 2
important_features_foldchange_compress <- important_features_foldchange
important_features_foldchange_compress[important_features_foldchange_compress < -2] = -2.1
important_features_foldchange_compress[important_features_foldchange_compress > 2] = 2.1
pheatmap(important_features_foldchange_compress,
color = scaleRYG,
annotation_colors = my_colour,
annotation_col = patient_annotation[-grep("NONCOVID",rownames(patient_annotation)),-c(1)],
annotation_row = metabolite_annotation,
cluster_cols = T,
#scale = "column",
show_colnames = F,
show_rownames = F,
#scale = "row",
main = "Heatmap Fold Change COVID/NONCOVID(median)")