Diff of /EDA.Rmd [000000] .. [c9fae8]

Switch to unified view

a b/EDA.Rmd
1
---
2
title: "R Notebook"
3
output: html_notebook
4
editor_options: 
5
  markdown: 
6
    wrap: 72
7
---
8
9
# Exploratory Data Analysis and Feature Extraction
10
11
In this markdown, exploratory data analysis is performed on sample
12
151507, with the goal of discovering genes whose expression values are
13
correlated with simple histology image features (RGB/Grayscale color
14
space). Sample 151507 includes 4136 spots and 21131 genes after
15
performing QC on spots and removing genes with an expression level of 0
16
for all spots.
17
18
First, RGB/grayscale color channels are extracted from the image. All
19
images are in lowres, meaning that they each have size 600 x 600. Then,
20
the 4136 x 9 dataframe 'data' is created. Each row of data represents a
21
barcode/spot, and the following column data is represented:
22
23
"pxl_col_in_fullres": column number of barcode in fullres image
24
25
"pxl_row_in_fullres": row number of barcode in fullres image
26
27
"barcode": string containing barcode
28
29
"scaled_pxl_col_in_lowres": column number of barcode in lowres image
30
31
"scaled_pxl_row_in_lowres": row number of barcode in lowres image
32
33
"red": red channel value of barcode
34
35
"green": green channel value of barcode
36
37
"blue": blue channel value of barcode
38
39
"grey": grey channel value of barcode
40
41
Then, correlation + the removal of outliers is performed in two ways,
42
with the second being optimal. 1. Correlation between gene expression
43
values and RGB/Grayscale pixel values is performed; then, outlier pixel
44
and gene expression values are removed. 2. Outliers are removed first,
45
then correlation is performed.
46
47
## Imports
48
49
```{r Import, message=FALSE}
50
library(spatialLIBD)
51
library(scuttle)
52
library(dplyr)
53
library(ggcorrplot)
54
library(ggplot2)
55
library(EBImage)
56
library(ggpubr)
57
library(foreach)
58
library(doParallel)
59
library(plotwidgets)
60
library(Dict)
61
library(lme4)
62
registerDoParallel(4)
63
library(reticulate)
64
65
py_install("tensorflow")
66
py_install("keras")
67
68
layers <- import("tensorflow.keras.layers")
69
models <- import("tensorflow.keras.models")
70
optimizers <- import("tensorflow.keras.optimizers")
71
```
72
73
## Finding Features of Interest From Color Spaces
74
75
The first features we want to extract from the sample are from the
76
RGBGrayscale + HSL color spaces. We can first scale the pxl values in
77
the SpatialExperiment object to fit the lowres image, and then determine
78
the feature values for each pixel and add them to the SpatialExperiment
79
object.
80
81
```{r Helper}
82
### Helper Functions for Finding Features of Interest
83
84
get_img <- function(sample_id) {
85
  png_link <- paste('https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/',sample_id,'_tissue_lowres_image.png',sep='')
86
  img <- readImage(png_link)
87
  img
88
}
89
90
get_rgbg <- function(img) {
91
  # create 600x600 color channel matrices for each channel
92
  red <- imageData(EBImage::channel(img, 'red'))
93
  green <- imageData(EBImage::channel(img, 'green'))
94
  blue <- imageData(EBImage::channel(img, 'blue'))
95
  grey <- imageData(EBImage::channel(img, 'grey'))
96
  list('red'=red, 'green'=green, 'blue'=blue, 'grey'=grey)
97
}
98
99
visualize_color_channels <- function(rgbg) {
100
  hist(rgbg$red)
101
  hist(rgbg$green)
102
  hist(rgbg$blue)
103
  hist(rgbg$grey)
104
}
105
106
get_hsl <- function(rgbg) {
107
  rgb_mat <- t(data.frame(c(rgbg$red), c(rgbg$green), c(rgbg$blue)))
108
  hsl_mat <- rgb2hsl(rgb_mat)
109
  
110
  # HSL color space
111
  hue <- matrix(hsl_mat[1,], nrow=600, ncol=600)
112
  saturation <- matrix(hsl_mat[2,], nrow=600, ncol=600)
113
  lightness <- matrix(hsl_mat[3,], nrow=600, ncol=600)
114
  
115
  list("hue"=hue, "saturation"=saturation, "lightness"=lightness)
116
}
117
118
add_scaled_pxls_lowres <- function(spe_sub) {
119
  scaling <- SpatialExperiment::scaleFactors(spe_sub)
120
  pxl_col_in_lowres <-round(spatialCoords(spe_sub)[,'pxl_col_in_fullres'] * scaling)
121
  pxl_row_in_lowres <-round(spatialCoords(spe_sub)[,'pxl_row_in_fullres'] * scaling)
122
  spatialCoords(spe_sub) <- cbind(spatialCoords(spe_sub), pxl_col_in_lowres, pxl_row_in_lowres)
123
  spe_sub
124
}
125
126
match_all_color_spaces <- function(color, pxl_indices) {
127
  apply(pxl_indices, MARGIN=1, match_each_color_space, color=color)
128
}
129
130
match_each_color_space <- function(pxl_indices, color) {
131
  color[pxl_indices[1], pxl_indices[2]]
132
}
133
134
add_color_features <- function(spe_sub, colors) {
135
  pxl_indices <- spatialCoords(spe_sub)[,c('pxl_row_in_lowres','pxl_col_in_lowres')]
136
  feature_df <- lapply(c(rgbg, hsl), match_all_color_spaces, pxl_indices=pxl_indices)
137
  spatialCoords(spe_sub) <- cbind(spatialCoords(spe_sub), 
138
                                feature_df$red,
139
                                feature_df$green,
140
                                feature_df$blue,
141
                                feature_df$grey,
142
                                feature_df$hue,
143
                                feature_df$saturation,
144
                                feature_df$lightness)
145
  colnames(spatialCoords(spe_sub)) <- c('pxl_col_in_fullres',
146
                                        'pxl_row_in_fullres',
147
                                        'pxl_col_in_lowres',
148
                                        'pxl_row_in_lowres',
149
                                        'red', 'green', 'blue', 'grey', 
150
                                        'hue', 'saturation', 'lightness')
151
  spe_sub
152
}
153
```
154
155
```{r}
156
# Get image and RGBG/HSL feature spaces
157
img <- get_img("151673")
158
rgbg <- get_rgbg(img)
159
hsl <- get_hsl(rgbg)
160
161
# Add low resolution pixel values to SpatialExperiment object
162
spe_sub <- add_scaled_pxls_lowres(spe_sub)
163
164
# Match color space values to pixel row/column location, and add to SpatialExperiment object
165
spe_sub <- add_color_features(spe_sub, c(rgbg, hsl))
166
```
167
168
## Finding Features of Interest From Autoencoder Latent Space
169
170
Another approach to finding features of interest is by using an encoder
171
model to capture the latent space between an encoder/decoder autoencoder
172
model. Then, we can use these low-dimensional features to perform
173
classification.
174
175
To build this model, we first need to split out input image into image
176
'patches' centered at each spot. The size of these patches was chosen to
177
be 14x14 pixels. Then, for each spot, the corresponding image patch will
178
be encoded into a feature vector.
179
180
To simplify this process, a 'Spot' class has been created to store the
181
current pixel values of the spot and features from the latent space.
182
183
```{r}
184
create_spot_instance <- function(rowname, img, spe_sub) {
185
  coords = spatialCoords(spe_sub)[rowname,]
186
  row = coords['pxl_row_in_lowres']
187
  col = coords['pxl_col_in_lowres']
188
  
189
  patch = imageData(img[(row - 6): (row + 6), (col - 6): (col + 6),])
190
  
191
  new("Spot", 
192
      sampleId=imgData(spe_sub)$sample_id, 
193
      barcode=rowname,
194
      imagePatch=patch,
195
      spatialCoords=coords)
196
}
197
198
setClass("Spot", 
199
         slots=list(sampleId="character", 
200
                    barcode="character", 
201
                    imagePatch="array", 
202
                    spatialCoords="numeric", 
203
                    latentSpace="array"))
204
205
vec <- lapply(rownames(spatialCoords(spe_sub)), create_spot_instance, img=img, spe_sub=spe_sub)
206
```
207
208
```{r}
209
encoder_input = layers$Input(shape=as.integer(507), name='encoder_input')
210
encoder_dense_layer1 = layers$Dense(units=as.integer(300), name='encoder_dense_1')(encoder_input)
211
encoder_activ_layer1 = layers$LeakyReLU(name="encoder_leakyrelu_1")(encoder_dense_layer1)
212
encoder_dense_layer2 = layers$Dense(units=as.integer(2), name='encoder_dense_2')(encoder_activ_layer1)
213
encoder_output = layers$LeakyReLU(name='encoder_leakyrelu_2')(encoder_dense_layer2)
214
encoder = models$Model(encoder_input, encoder_output, name="encoder_model")
215
216
decoder_input = layers$Input(shape=as.integer(2), name='decoder_input')
217
decoder_dense_layer1 = layers$Dense(units=as.integer(300), name='decoder_dense_1')(decoder_input)
218
decoder_activ_layer1 = layers$LeakyReLU(name="decoder_leakyrelu_1")(decoder_dense_layer1)
219
decoder_dense_layer2 = layers$Dense(units=as.integer(507), name='decoder_dense_2')(decoder_activ_layer1)
220
decoder_output = layers$LeakyReLU(name='decoder_leakyrelu_2')(decoder_dense_layer2)
221
decoder = models$Model(decoder_input, decoder_output, name="decoder_model")
222
223
ae_input = layers$Input(shape=as.integer(507), name="AE_input")
224
ae_encoder_output = encoder(ae_input)
225
ae_decoder_output = decoder(ae_encoder_output)
226
```
227
228
```{r}
229
get_encoded_features <- function(spot) {
230
  temp <- c(spot@imagePatch)
231
  dim(temp) <- c(1, length(temp))
232
  
233
  ae = models$Model(ae_input, ae_decoder_output, name="AE")
234
  ae$compile(loss="mse", optimizer=optimizers$Adam(learning_rate=0.0005))
235
  ae$fit(temp, temp, epochs=as.integer(50), shuffle=TRUE, verbose=as.integer(0))
236
  
237
  encoding = encoder$predict(temp, verbose=as.integer(0))
238
  spot@latentSpace <- encoding
239
  spot
240
}
241
242
extract_encodings <- function(spot) {
243
  c(spot@latentSpace, spot@spatialCoords)
244
}
245
246
extract_barcodes <- function(spot) {
247
  spot@barcode
248
}
249
250
add_gene_col <- function(g_name, features, logcounts, spe_sub) {
251
  if (!(g_name %in% colnames(features))) {
252
    features[,ncol(features) + 1] <- data.frame(logcounts[,get_gene_id(g_name, spe_sub)])
253
    colnames(features)[ncol(features)] <- g_name
254
  }
255
  features
256
}
257
258
temp_vec_1 <- lapply(vec[1:500], get_encoded_features)
259
temp_vec_2 <- lapply(vec[501:1000], get_encoded_features)
260
temp_vec_3 <- lapply(vec[1001:1500], get_encoded_features)
261
temp_vec_4 <- lapply(vec[1501:2000], get_encoded_features)
262
temp_vec_5 <- lapply(vec[2001:2500], get_encoded_features)
263
temp_vec_6 <- lapply(vec[2501:3000], get_encoded_features)
264
temp_vec_7 <- lapply(vec[3001:3590], get_encoded_features)
265
266
vec <- c(temp_vec_1, temp_vec_2, temp_vec_3, temp_vec_4, temp_vec_5, temp_vec_6, temp_vec_7)
267
features <- t(sapply(vec, extract_encodings))
268
rownames(features) <- sapply(vec, extract_barcodes)
269
colnames(features)[1:2] <- c('latent_space_1', 'latent_space_2')
270
271
features <- add_gene_col('MOBP', features, logcounts, spe_sub)
272
features <- add_gene_col('SNAP25', features, logcounts, spe_sub)
273
features <- add_gene_col('PCP4', features, logcounts, spe_sub)
274
```
275
276
```{r}
277
# plot MOBP, SNAP25, PCP4 in 2D feature space
278
print(ggplot(features, aes(x=features[,'latent_space_1'], y = features[,'latent_space_2'])) + geom_point(aes(color=features[,'MOBP'])) + theme(aspect.ratio = 1) + scale_color_gradient(low='pink', high='purple') + ggtitle(paste("MOBP Expression Level in 2D Latent Space")))
279
280
print(ggplot(features, aes(x=features[,'latent_space_1'], y = features[,'latent_space_2'])) + geom_point(aes(color=features[,'SNAP25'])) + theme(aspect.ratio = 1) + scale_color_gradient(low='pink', high='purple') + ggtitle(paste("SNAP25 Expression Level in 2D Latent Space")))
281
282
print(ggplot(features, aes(x=features[,'latent_space_1'], y = features[,'latent_space_2'])) + geom_point(aes(color=features[,'PCP4'])) + theme(aspect.ratio = 1) + scale_color_gradient(low='pink', high='purple') + ggtitle(paste("PCP4 Expression Level in 2D Latent Space")))
283
```
284
285
```{r}
286
# Plot two feature latent space
287
features <- data.frame(features)
288
# Spot Plots for Autoencoder Features
289
print(ggplot(features, aes(x = features[,'pxl_row_in_lowres'], y=600 - features[,'pxl_col_in_lowres'])) + geom_point(aes(color=features[,'latent_space_1'])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",'latent_space_1')))
290
291
print(ggplot(features, aes(x = features[,'pxl_row_in_lowres'], y=600 - features[,'pxl_col_in_lowres'])) + geom_point(aes(color=features[,'latent_space_2'])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",'latent_space_2')))
292
```
293
294
```{r}
295
mat <- compute_corr_matrix(logcounts, features[,1:13])
296
297
top_corr <- get_top_k_corr(mat, 10)
298
299
plot_corr_matrix(data.frame(top_corr))
300
```
301
302
## Finding Genes of Interest
303
304
An overall goal of this exploratory data analysis is to identify genes
305
of interest so that we can build predictive models in the future. To
306
accomplish this, a correlation matrix between spots and image features
307
(color spaces) is created. Then, the genes exhibiting the highest
308
correlation to image features are selected and plotted.
309
310
```{r}
311
get_gene_name <- function(x, spe_sub) {
312
  rowData(spe_sub)[rowData(spe_sub)$gene_id == x,]$gene_name
313
}
314
315
get_gene_id <- function(x, spe_sub) {
316
  rowData(spe_sub)[rowData(spe_sub)$gene_name == x ,]$gene_id
317
}
318
319
create_id_name_dict <- function(logcounts, spe_sub) {
320
  id_to_name <- mclapply(colnames(logcounts), get_gene_name, spe_sub=spe_sub)
321
  names(id_to_name) <- colnames(logcounts)
322
  id_to_name
323
}
324
325
# compute correlation matrix in parallel
326
compute_corr_matrix <- function(expression_arr, color_arr, round=4) {
327
  foreach(i = 1:ncol(expression_arr),
328
  .combine = rbind,
329
  .packages = c('data.table', 'doParallel')) %dopar% {
330
    colName <- colnames(expression_arr)[i]
331
    df <- data.frame(round(cor(expression_arr[,i], color_arr, method = 'pearson', use="complete.obs"), round))
332
    rownames(df) <- colName
333
    df
334
  }
335
}
336
337
get_top_k_corr <- function(mat, k) {
338
  temp_ind <- c()
339
  for (i in 1:ncol(mat)) {
340
    temp_ind <- c(temp_ind, order(abs(mat[,i]), decreasing=T)[1:k])
341
  }
342
  top_corr_ind <- unique(temp_ind)
343
  # temp_ind <- c(temp_ind, order(abs(as.numeric(unlist(mat))), decreasing=T)[1:k])
344
  # temp_ind <- temp_ind %% 600
345
  top_corr <- rename_gene_ids(data.frame(mat[top_corr_ind,]))
346
  top_corr
347
}
348
349
# Replace gene IDs with gene names
350
rename_gene_ids <- function(correlation_matrix) {
351
  for (i in 1:nrow(correlation_matrix)) {
352
    g_name <- rownames(correlation_matrix)[i]
353
    rownames(correlation_matrix)[i] <- rowData(spe_sub[g_name])$gene_name
354
  }
355
  correlation_matrix
356
}
357
358
# Plot correlation matrix
359
plot_corr_matrix <- function(correlation_matrix) {
360
  ggcorrplot(correlation_matrix, sig.level=0.01, lab_size = 4.5, p.mat = NULL,
361
           insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex =1,
362
           tl.cex = 7) +
363
  theme(axis.text.x = element_text(margin=margin(-2,0,0,0)),
364
        axis.text.y = element_text(margin=margin(0,-2,0,0)),
365
        panel.grid.minor = element_line(size=7)) + 
366
  geom_tile(fill="white") +
367
  geom_tile(height=1, width=1)
368
}
369
370
get_gene_id <- function(g_name, spe) {
371
  gene_id = rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
372
  gene_id
373
}
374
```
375
376
```{r}
377
gene_id <- get_gene_id('TERB1', spe_sub)
378
mat[gene_id,]
379
380
```
381
382
```{r}
383
logcounts <- t(logcounts(spe_sub))
384
385
id_to_name <- create_id_name_dict(logcounts, spe_sub)
386
387
mat <- compute_corr_matrix(logcounts, 
388
                           spatialCoords(spe_sub)[,5:ncol(spatialCoords(spe_sub))])
389
390
top_corr <- get_top_k_corr(mat, 20)
391
392
plot_corr_matrix(data.frame(top_corr))
393
```
394
395
```{r}
396
mat[get_gene_id('SNAP25', spe_sub),]
397
top_corr
398
```
399
400
## Spot Plots of Features
401
402
```{r}
403
temp_df <- data.frame(spatialCoords(spe_sub))
404
for (g_name in colnames(temp_df[,5:11])) {
405
  # print(paste(" Number of expressed barcodes for ", g_name, ": ",sum(data[,g_name] > 0), "/", length(data[,g_name])))
406
  print(ggplot(temp_df, aes(x = temp_df[,'pxl_row_in_lowres'], y=600 - temp_df[,'pxl_col_in_lowres'])) + geom_point(aes(color=temp_df[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name)))
407
}
408
```
409
410
```{r}
411
# Add genes
412
```
413
414
```{r}
415
temp_df <- data.frame(spatialCoords(spe_sub))
416
for (g_name in colnames(temp_df[,5:11])) {
417
  # print(paste(" Number of expressed barcodes for ", g_name, ": ",sum(data[,g_name] > 0), "/", length(data[,g_name])))
418
  print(ggplot(temp_df, aes(x = temp_df[,'pxl_row_in_lowres'], y=600 - temp_df[,'pxl_col_in_lowres'])) + geom_point(aes(color=temp_df[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name)))
419
}
420
```
421
422
### Create RGB and HSL color channels and plot histograms of distribution
423
424
```{r, message=FALSE}
425
png_link <- paste('https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/',sample_id,'_tissue_lowres_image.png',sep='')
426
img <- readImage(png_link)
427
428
# create 600x600 color channel matrices for each channel
429
red <- imageData(EBImage::channel(img, 'red'))
430
green <- imageData(EBImage::channel(img, 'green'))
431
blue <- imageData(EBImage::channel(img, 'blue'))
432
grey <- imageData(EBImage::channel(img, 'grey'))
433
434
# visualize histograms of each color channel
435
hist(red)
436
hist(green)
437
hist(blue)
438
hist(grey)
439
440
rgb_mat <- t(data.frame(c(red), c(green), c(blue)))
441
hsl_mat <- rgb2hsl(rgb_mat)
442
443
# HSL color space
444
hue <- Matrix(hsl_mat[1,], nrow=600, ncol=600)
445
saturation <- Matrix(hsl_mat[2,], nrow=600, ncol=600)
446
lightness <- Matrix(hsl_mat[3,], nrow=600, ncol=600)
447
448
# get scale factor
449
scaling <- SpatialExperiment::scaleFactors(spe_sub)
450
451
# store spatial coords information (pxl col and row numbers in fullres image for each barcode)
452
spatial_coords <- spatialCoords(spe_sub)
453
454
# create data frame to hold pixel locations and pixel color channel values
455
data <- data.frame(spatial_coords, row.names=NULL)
456
data['barcode'] <- rownames(spatial_coords)
457
458
# add scaled pixel columns
459
new <- ceiling(data[["pxl_col_in_fullres"]] * scaling)
460
data[ , ncol(data) + 1] <- new 
461
colnames(data)[ncol(data)] <- "scaled_pxl_col_in_lowres"
462
463
# add scaled pixel rows
464
new <- ceiling(data[["pxl_row_in_fullres"]] * scaling)
465
data[ , ncol(data) + 1] <- new 
466
colnames(data)[ncol(data)] <- "scaled_pxl_row_in_lowres"
467
468
# create vectors to be added to dataframe
469
X_r <- vector(mode="double", length = dim(data)[1])
470
X_g <- vector(mode="double", length = dim(data)[1])
471
X_b <- vector(mode="double", length = dim(data)[1])
472
X_grey <- vector(mode="double", length = dim(data)[1])
473
X_h <- vector(mode="double", length = dim(data)[1])
474
X_s <- vector(mode="double", length = dim(data)[1])
475
X_l <- vector(mode="double", length = dim(data)[1])
476
477
# iterate over dataframe, add rgb values
478
for (i in 1:nrow(data)) {
479
    col_index <- data[i,]$scaled_pxl_col_in_lowres
480
    row_index <- data[i,]$scaled_pxl_row_in_lowres
481
    X_r[i] <- red[row_index, col_index]
482
    X_g[i] <- green[row_index, col_index]
483
    X_b[i] <- blue[row_index, col_index]
484
    X_grey[i] <- grey[row_index, col_index]
485
    X_h[i] <- hue[row_index, col_index]
486
    X_s[i] <- saturation[row_index, col_index]
487
    X_l[i] <- lightness[row_index, col_index]
488
}
489
490
data[ , ncol(data) + 1] <- X_r
491
colnames(data)[ncol(data)] <- "red"
492
data[ , ncol(data) + 1] <- X_g
493
colnames(data)[ncol(data)] <- "green"
494
data[ , ncol(data) + 1] <- X_b
495
colnames(data)[ncol(data)] <- "blue"
496
data[ , ncol(data) + 1] <- X_grey
497
colnames(data)[ncol(data)] <- "grey"
498
data[ , ncol(data) + 1] <- X_h
499
colnames(data)[ncol(data)] <- "hue"
500
data[ , ncol(data) + 1] <- X_s
501
colnames(data)[ncol(data)] <- "saturation"
502
data[ , ncol(data) + 1] <- X_l
503
colnames(data)[ncol(data)] <- "lightness"
504
```
505
506
### Functions for high res images, where there exist more than one pixel per spot
507
508
```{r, message=FALSE}
509
# Mode Function
510
getmode <- function(v) {
511
   uniqv <- unique(v)
512
   uniqv[which.max(tabulate(match(v, uniqv)))]
513
}
514
515
# For a spot 'i', plot histogram of pixel color values given by 'color'
516
plot_pixels_per_spot <- function(ind_barcodes, data, color, i) {
517
  # histogram of color channel values in one spot
518
  barcode = rownames(ind_barcodes)[i]
519
  x <- data[data$barcode == barcode, color]
520
  hist(x, main=paste("Histogram of ",  color, " Color Channel Values For a Single Barcode", sep = ''),breaks=100)
521
}
522
523
# Create a df where rows are barcodes and columns are mean, median, and mode values for each RGB/Grayscale category
524
populate_ind_barcodes <- function(i, ind_barcodes) {
525
  colors <- c("red", "green", "blue", "grey")
526
  for (j in 1:length(colors)) {
527
    color <- colors[j]
528
    vals <- data[data$barcode == row.names(ind_barcodes)[i],][,color]
529
    ind_barcodes[i, paste("mean_", color, sep="")] <- mean(vals)
530
    ind_barcodes[i, paste("median_", color, sep="")] <- median(vals)
531
    ind_barcodes[i, paste("mode_", color, sep="")] <- getmode(vals) 
532
  }
533
  ind_barcodes
534
}
535
536
# create empty df
537
ind_barcodes <- data.frame(matrix(ncol = 12, nrow = length(unique(data[,'barcode']))))
538
colnames(ind_barcodes) <- c('mean_red', 'median_red', 'mode_red', 'mean_green', 'median_green', 'mode_green', 'mean_blue', 'median_blue', 'mode_blue', 'mean_grey', 'median_grey', 'mode_grey')
539
rownames(ind_barcodes) <- unique(data[, 'barcode'])
540
541
# add mean, median, mode data
542
for (i in 1:nrow(ind_barcodes)) {
543
 ind_barcodes <- populate_ind_barcodes(i, ind_barcodes)
544
}
545
ind_barcodes
546
```
547
548
### Analysis 1: Compute Correlation First, Compute Outliers After
549
550
In the following blocks, the correlation between all genes + pixel color
551
channel values is calculated. Then, for the top performing genes, a
552
plotting function is called in which spots corresponding to outlier gene
553
expression values + spots corresponding to outlier pixel color channel
554
values are removed.
555
556
```{r, message=FALSE}
557
# compute correlation matrix in parallel
558
compute_corr_matrix <- function(expression_arr, color_arr, round=4) {
559
  foreach(i = 1:ncol(expression_arr),
560
  .combine = rbind,
561
  .packages = c('data.table', 'doParallel')) %dopar% {
562
    colName <- colnames(expression_arr)[i]
563
    df <- data.frame(round(cor(expression_arr[,i], color_arr, method = 'pearson', use="complete.obs"), round))
564
    rownames(df) <- colName
565
    df
566
  }
567
}
568
569
# Filter correlation matrix based on a threshold and replace gene IDs with gene names
570
filter_corr_matrix <- function(correlation_matrix, threshold) {
571
  filter <- as.data.frame(apply(abs(correlation_matrix) >= threshold, 1, any))
572
  correlation_matrix_filtered <- correlation_matrix[filter[,1],]
573
  correlation_matrix_filtered <- rename_gene_ids(correlation_matrix_filtered)
574
  correlation_matrix_filtered
575
}
576
577
# Replace gene IDs with gene names
578
rename_gene_ids <- function(correlation_matrix) {
579
  for (i in 1:nrow(correlation_matrix)) {
580
    g_name <- rownames(correlation_matrix)[i]
581
    rownames(correlation_matrix)[i] <- rowData(spe_sub[g_name])$gene_name
582
  }
583
  correlation_matrix
584
}
585
586
# Plot correlation matrix
587
plot_corr_matrix <- function(correlation_matrix) {
588
  ggcorrplot(correlation_matrix, sig.level=0.01, lab_size = 4.5, p.mat = NULL,
589
           insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex =1,
590
           tl.cex = 7) +
591
  theme(axis.text.x = element_text(margin=margin(-2,0,0,0)),
592
        axis.text.y = element_text(margin=margin(0,-2,0,0)),
593
        panel.grid.minor = element_line(size=7)) + 
594
  geom_tile(fill="white") +
595
  geom_tile(height=1, width=1)
596
}
597
598
# function to calculate regression equation. Used as a sanity check
599
lm_eqn <- function(vec1, vec2){
600
    m <- lm(vec1 ~ vec2, data.frame(vec1, vec2));
601
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
602
         list(a = format(unname(coef(m)[1]), digits = 2),
603
              b = format(unname(coef(m)[2]), digits = 2),
604
             r2 = format(summary(m)$r.squared, digits = 3)))
605
    as.character(as.expression(eq));
606
}
607
```
608
609
```{r, message=FALSE}
610
color_arr <- data[,6:ncol(data)]
611
correlation_matrix <- compute_corr_matrix(logcounts_151507_filtered, color_arr)
612
correlation_matrix_filtered <- filter_corr_matrix(correlation_matrix, 0.3)
613
plot_corr_matrix(correlation_matrix_filtered)
614
```
615
616
```{r, message=FALSE}
617
# plot color channel value vs gene expression. If remove_outliers == TRUE, outlier gene expression values + outlier pixel color values are removed.
618
# ASSUMES THAT EACH BARCODE HAS MULTIPLE PIXELS
619
plot_channel_highres <- function(g_name, gene_id, color, stat, log_arr, color_arr, remove_outliers) {
620
  stat_type <- paste(color, stat, sep="_")
621
  x_axis <- log_arr[,gene_id]
622
  y_axis <- color_arr[,stat_type] * 255
623
  df <- data.frame(x_axis, y_axis)
624
  if (remove_outliers) {
625
    barcode_names <- rownames(log_arr)
626
    overlap <- union(rownames(log_arr)[isOutlier(log_arr[,gene_id])], rownames(color_arr)[isOutlier(color_arr[,stat_type])])
627
    rownames(df) <- barcode_names
628
    df <- df[-which(rownames(df) %in% overlap),]
629
    title <- paste(stat_type, " Channel Values vs ", g_name, " Gene Expression with Outliers Removed", sep="")
630
  } else {
631
    title <- paste(stat_type, " Channel Values vs ", g_name, " Gene Expression without Outliers Removed", sep="")
632
  }
633
  ggplot(df, aes(x=df[,1], y=df[,2])) + xlab(g_name) + ylab(stat_type) + geom_point(color=color) +
634
  geom_smooth(method='lm', color='black') + stat_regline_equation(label.y = 190, aes(label = ..eq.label..)) +
635
  stat_regline_equation(label.y = 180, aes(label = ..rr.label..)) + ggtitle(title) 
636
}
637
638
# call plot_channel function for all color and mean/median/mode combinations
639
# ASSUMES THAT EACH BARCODE HAS MULTIPLE PIXELS
640
plot_all_channels_highres <- function(g_name, log_arr, color_arr, remove_outliers) {
641
  gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
642
  colors <- c('red', 'green', 'blue', 'grey')
643
  stats <- c('mean', 'median', 'mode')
644
645
  for (color in colors) {
646
    for (stat in stats) {
647
      print(plot_channel_highres(g_name, gene_id, color, stat, log_arr, color_arr, remove_outliers)) 
648
    }
649
  }
650
}
651
652
# plot color channel value vs gene expression. If remove_outliers == TRUE, outlier gene expression values + outlier pixel color values are removed.
653
# ASSUMES THAT EACH BARCODE HAS ONE PIXEL
654
plot_channel <- function(g_name, gene_id, color, log_arr, color_arr, remove_outliers) {
655
  x_axis <- log_arr[,gene_id]
656
  y_axis <- color_arr[,color] * 255
657
  df <- data.frame(x_axis, y_axis)
658
  if (remove_outliers) {
659
    barcode_names <- rownames(log_arr)
660
    overlap <- union(rownames(log_arr)[isOutlier(log_arr[,gene_id])], rownames(color_arr)[isOutlier(color_arr[,color])])
661
    rownames(df) <- barcode_names
662
    df <- df[-which(rownames(df) %in% overlap),]
663
    title <- paste(color, " Channel Values vs ", g_name, " Gene Expression with Outliers Removed", sep="")
664
  } else {
665
    title <- paste(color, " Channel Values vs ", g_name, " Gene Expression without Outliers Removed", sep="")
666
  }
667
  ggplot(df, aes(x=df[,1], y=df[,2])) + xlab(g_name) + ylab(color) + geom_point(color=color) +
668
  geom_smooth(method='lm', color='black') + stat_regline_equation(label.y = 190, aes(label = ..eq.label..)) +
669
  stat_regline_equation(label.y = 180, aes(label = ..rr.label..)) + ggtitle(title) 
670
}
671
672
# call plot_channel function for all colors
673
# ASSUMES THAT EACH BARCODE HAS ONE PIXEL
674
plot_all_channels <- function(g_name, log_arr, color_arr, remove_outliers) {
675
  gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
676
  colors <- c('red', 'green', 'blue', 'grey')
677
678
  for (color in colors) {
679
    print(plot_channel(g_name, gene_id, color, log_arr, color_arr, remove_outliers))
680
  }
681
}
682
```
683
684
```{r, message=FALSE}
685
plot_all_channels('MT-ND1', logcounts_151507_filtered, color_arr, remove_outliers=FALSE)
686
plot_all_channels('MT-ND1', logcounts_151507_filtered, color_arr, remove_outliers=TRUE)
687
```
688
689
```{r, message=FALSE}
690
plot_all_channels('MT-CO2', logcounts_151507_filtered, color_arr, remove_outliers=FALSE)
691
plot_all_channels('MT-CO2', logcounts_151507_filtered, color_arr, remove_outliers=TRUE)
692
```
693
694
```{r, message=FALSE}
695
plot_all_channels('MT-ATP6', logcounts_151507_filtered, color_arr, remove_outliers=FALSE)
696
plot_all_channels('MT-ATP6', logcounts_151507_filtered, color_arr, remove_outliers=TRUE)
697
```
698
699
```{r, message=FALSE}
700
plot_all_channels('MT-CYB', logcounts_151507_filtered, color_arr, remove_outliers=FALSE)
701
plot_all_channels('MT-CYB', logcounts_151507_filtered, color_arr, remove_outliers=TRUE)
702
```
703
704
## Redo Analysis Removing Outliers Before Correlation Computation
705
706
In the following blocks, analysis is redone by removing outliers before
707
correlation computation based on the isOutlier() function. If the number
708
of non-NA expression values \< num = 100, the gene is removed from
709
consideration. Then, correlation between the RGB/Grayscale values and
710
gene expression values is computed.
711
712
```{r, message=FALSE}
713
# Remove outlier gene expression values. If median gene expression value is 0, replace complement of outliers with NA. Else replace outliers with NA. If remaining non-NA expression values < num, replace entire column with NA.
714
remove_outliers <- function(arr, num) {
715
  foreach(i = 1:ncol(arr),
716
  .combine = cbind,
717
  .packages = c('data.table', 'doParallel')) %dopar% {
718
    if (median(arr[,i]) == 0) {
719
      arr[,i][!isOutlier(arr[,i])] <- NA
720
    } else {
721
      arr[,i][isOutlier(arr[,i])] <- NA
722
    }
723
    if (matrixStats::count(!is.na(arr[,i])) < num) {
724
      arr[,i] <- rep(NA, nrow(arr))
725
    }
726
    arr[,i]
727
  }
728
}
729
730
get_top_k_corr <- function(mat, k) {
731
  temp_ind <- c()
732
  for (i in 1:ncol(mat)) {
733
    temp_ind <- c(temp_ind, order(abs(mat[,i]), decreasing=T)[1:k])
734
  }
735
  top_corr_ind <- unique(temp_ind)
736
  top_corr <- rename_gene_ids(data.frame(mat[top_corr_ind,]))
737
  top_corr
738
}
739
740
get_gene_id <- function(g_name, spe) {
741
  gene_id = rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
742
  gene_id
743
}
744
745
add_gene_col_data_df <- function(gene_id, g_name, data) {
746
  if (!(g_name %in% colnames(data))) {
747
    data[,ncol(data) + 1] <- data.frame(logcounts_151673_filtered[,gene_id])
748
    colnames(data)[ncol(data)] <- g_name
749
  }
750
  data
751
}
752
753
drop_cols_data_df <- function(drop, data) {
754
  data <- data[,!(names(data) %in% drop)]
755
  data
756
}
757
758
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
759
```
760
761
```{r}
762
g_name <- 'MOBP'
763
gene_id <- get_gene_id(g_name, spe)
764
data <- add_gene_col_data_df(gene_id, g_name, data)
765
```
766
767
```{r}
768
#temp <- data.frame(spatialCoords(spe_sub), vec1)
769
g_name='vec1'
770
temp <- data.frame(spatialCoords(spe_sub), vec1)
771
print(ggplot(temp, aes(x = temp[,'pxl_row_in_lowres'], y=600 - temp[,'pxl_col_in_lowres'])) + geom_point(aes(color=temp[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name)))
772
```
773
774
```{r}
775
for (g_name in colnames(data[6:12])) {
776
  print(ggplot(data, aes(x = data[,'scaled_pxl_row_in_lowres'], y=600 - data[,'scaled_pxl_col_in_lowres'])) + geom_point(aes(color=data[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name)))
777
}
778
```
779
780
```{r}
781
top_corr
782
```
783
784
```{r}
785
mat <- compute_corr_matrix(logcounts_151673_filtered, data[6:12])
786
top_corr <- get_top_k_corr(mat, 10)
787
plot_corr_matrix(data.frame(top_corr))
788
```
789
790
```{r}
791
for (g_name in rownames(top_corr)) {
792
  gene_id <- get_gene_id(g_name, spe)
793
  data <- add_gene_col_data_df(gene_id, g_name, data)
794
}
795
data
796
```
797
798
```{r}
799
for (g_name in colnames(data[13:ncol(data)])) {
800
  print(paste(" Number of expressed barcodes for ", g_name, ": ",sum(data[,g_name] > 0), "/", length(data[,g_name])))
801
  print(ggplot(data, aes(x = data[,'scaled_pxl_row_in_lowres'], y=600 - data[,'scaled_pxl_col_in_lowres'])) + geom_point(aes(color=data[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name)))
802
}
803
```
804
805
```{r}
806
logcounts_151673[,i] > 0
807
sum(logcounts_151673_filtered[,gene_id] > 0)
808
sum(!is.na(logcounts_151673_filtered[,gene_id]))
809
810
compute_corr_matrix(data.frame(logcounts_151673_filtered), data[,6:12])
811
compute_corr_matrix(data.frame(logcounts_151673_filtered_no_outliers[,gene_id]), temp_data[,6:12])
812
```
813
814
```{r}
815
g_name = 'SNAP25'
816
gene_id = rowData(spe)[rowData(spe)$gene_name == 'SNAP25',]$gene_id
817
print(sum(logcounts_151507[,gene_id] != 0))
818
print(sum(logcounts_151507_filtered[,gene_id] != 0))
819
print(sum(!is.na(logcounts_151507_filtered_no_outliers[,gene_id])))
820
821
mat_with_outliers <- compute_corr_matrix(data.frame(logcounts_151507_filtered[,gene_id]), data[,4:5])
822
mat_with_outliers
823
824
mat_outliers <- compute_corr_matrix(data.frame(logcounts_151507_filtered_no_outliers[,gene_id]), data[,4:5])
825
mat_outliers
826
```
827
828
```{r, message=FALSE}
829
# Call remove outlier function
830
logcounts_temp <- as.matrix(logcounts_151673_filtered)
831
col_names <- colnames(logcounts_temp)
832
logcounts_151673_filtered_no_outliers <- remove_outliers(logcounts_temp, 100)
833
colnames(logcounts_151673_filtered_no_outliers) <- col_names
834
835
# Remove columns that are completely NA
836
not_all_na <- function(x) any(!is.na(x))
837
logcounts_151673_filtered_no_outliers <- data.frame(logcounts_151673_filtered_no_outliers) %>% select(where(not_all_na))
838
839
# compute correlation matrix
840
mat <- compute_corr_matrix(logcounts_151673_filtered_no_outliers, data[6:12])
841
```
842
843
```{r, message=FALSE}
844
top_corr <- get_top_k_corr(mat, 10)
845
plot_corr_matrix(top_corr)
846
```
847
848
```{r}
849
# MOBP, SNAP25, plot 
850
# Linear regression model
851
# Where do MOBP/SNAP25 fall in my analysis
852
# When did it get removed, spot plots of these genes
853
# spot plots of top genes in correlation analysis
854
gene_id = rowData(spe)[rowData(spe)$gene_name == 'LFNG',]$gene_id
855
sum(!is.na(logcounts_151507_filtered_no_outliers[,gene_id]))
856
```
857
858
```{r, message=FALSE}
859
plot_all_channels('LFNG', logcounts_151507_filtered_no_outliers, color_arr, remove_outliers=FALSE)
860
```
861
862
```{r, message=FALSE}
863
plot_all_channels('ELK4', logcounts_151507_filtered_no_outliers, color_arr, remove_outliers=FALSE)
864
```
865
866
```{r, message=FALSE}
867
plot_all_channels('PLIN4', logcounts_151507_filtered_no_outliers, color_arr, remove_outliers=FALSE)
868
```