|
a |
|
b/EDA.Rmd |
|
|
1 |
--- |
|
|
2 |
title: "R Notebook" |
|
|
3 |
output: html_notebook |
|
|
4 |
editor_options: |
|
|
5 |
markdown: |
|
|
6 |
wrap: 72 |
|
|
7 |
--- |
|
|
8 |
|
|
|
9 |
# Exploratory Data Analysis and Feature Extraction |
|
|
10 |
|
|
|
11 |
In this markdown, exploratory data analysis is performed on sample |
|
|
12 |
151507, with the goal of discovering genes whose expression values are |
|
|
13 |
correlated with simple histology image features (RGB/Grayscale color |
|
|
14 |
space). Sample 151507 includes 4136 spots and 21131 genes after |
|
|
15 |
performing QC on spots and removing genes with an expression level of 0 |
|
|
16 |
for all spots. |
|
|
17 |
|
|
|
18 |
First, RGB/grayscale color channels are extracted from the image. All |
|
|
19 |
images are in lowres, meaning that they each have size 600 x 600. Then, |
|
|
20 |
the 4136 x 9 dataframe 'data' is created. Each row of data represents a |
|
|
21 |
barcode/spot, and the following column data is represented: |
|
|
22 |
|
|
|
23 |
"pxl_col_in_fullres": column number of barcode in fullres image |
|
|
24 |
|
|
|
25 |
"pxl_row_in_fullres": row number of barcode in fullres image |
|
|
26 |
|
|
|
27 |
"barcode": string containing barcode |
|
|
28 |
|
|
|
29 |
"scaled_pxl_col_in_lowres": column number of barcode in lowres image |
|
|
30 |
|
|
|
31 |
"scaled_pxl_row_in_lowres": row number of barcode in lowres image |
|
|
32 |
|
|
|
33 |
"red": red channel value of barcode |
|
|
34 |
|
|
|
35 |
"green": green channel value of barcode |
|
|
36 |
|
|
|
37 |
"blue": blue channel value of barcode |
|
|
38 |
|
|
|
39 |
"grey": grey channel value of barcode |
|
|
40 |
|
|
|
41 |
Then, correlation + the removal of outliers is performed in two ways, |
|
|
42 |
with the second being optimal. 1. Correlation between gene expression |
|
|
43 |
values and RGB/Grayscale pixel values is performed; then, outlier pixel |
|
|
44 |
and gene expression values are removed. 2. Outliers are removed first, |
|
|
45 |
then correlation is performed. |
|
|
46 |
|
|
|
47 |
## Imports |
|
|
48 |
|
|
|
49 |
```{r Import, message=FALSE} |
|
|
50 |
library(spatialLIBD) |
|
|
51 |
library(scuttle) |
|
|
52 |
library(dplyr) |
|
|
53 |
library(ggcorrplot) |
|
|
54 |
library(ggplot2) |
|
|
55 |
library(EBImage) |
|
|
56 |
library(ggpubr) |
|
|
57 |
library(foreach) |
|
|
58 |
library(doParallel) |
|
|
59 |
library(plotwidgets) |
|
|
60 |
library(Dict) |
|
|
61 |
library(lme4) |
|
|
62 |
registerDoParallel(4) |
|
|
63 |
library(reticulate) |
|
|
64 |
|
|
|
65 |
py_install("tensorflow") |
|
|
66 |
py_install("keras") |
|
|
67 |
|
|
|
68 |
layers <- import("tensorflow.keras.layers") |
|
|
69 |
models <- import("tensorflow.keras.models") |
|
|
70 |
optimizers <- import("tensorflow.keras.optimizers") |
|
|
71 |
``` |
|
|
72 |
|
|
|
73 |
## Finding Features of Interest From Color Spaces |
|
|
74 |
|
|
|
75 |
The first features we want to extract from the sample are from the |
|
|
76 |
RGBGrayscale + HSL color spaces. We can first scale the pxl values in |
|
|
77 |
the SpatialExperiment object to fit the lowres image, and then determine |
|
|
78 |
the feature values for each pixel and add them to the SpatialExperiment |
|
|
79 |
object. |
|
|
80 |
|
|
|
81 |
```{r Helper} |
|
|
82 |
### Helper Functions for Finding Features of Interest |
|
|
83 |
|
|
|
84 |
get_img <- function(sample_id) { |
|
|
85 |
png_link <- paste('https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/',sample_id,'_tissue_lowres_image.png',sep='') |
|
|
86 |
img <- readImage(png_link) |
|
|
87 |
img |
|
|
88 |
} |
|
|
89 |
|
|
|
90 |
get_rgbg <- function(img) { |
|
|
91 |
# create 600x600 color channel matrices for each channel |
|
|
92 |
red <- imageData(EBImage::channel(img, 'red')) |
|
|
93 |
green <- imageData(EBImage::channel(img, 'green')) |
|
|
94 |
blue <- imageData(EBImage::channel(img, 'blue')) |
|
|
95 |
grey <- imageData(EBImage::channel(img, 'grey')) |
|
|
96 |
list('red'=red, 'green'=green, 'blue'=blue, 'grey'=grey) |
|
|
97 |
} |
|
|
98 |
|
|
|
99 |
visualize_color_channels <- function(rgbg) { |
|
|
100 |
hist(rgbg$red) |
|
|
101 |
hist(rgbg$green) |
|
|
102 |
hist(rgbg$blue) |
|
|
103 |
hist(rgbg$grey) |
|
|
104 |
} |
|
|
105 |
|
|
|
106 |
get_hsl <- function(rgbg) { |
|
|
107 |
rgb_mat <- t(data.frame(c(rgbg$red), c(rgbg$green), c(rgbg$blue))) |
|
|
108 |
hsl_mat <- rgb2hsl(rgb_mat) |
|
|
109 |
|
|
|
110 |
# HSL color space |
|
|
111 |
hue <- matrix(hsl_mat[1,], nrow=600, ncol=600) |
|
|
112 |
saturation <- matrix(hsl_mat[2,], nrow=600, ncol=600) |
|
|
113 |
lightness <- matrix(hsl_mat[3,], nrow=600, ncol=600) |
|
|
114 |
|
|
|
115 |
list("hue"=hue, "saturation"=saturation, "lightness"=lightness) |
|
|
116 |
} |
|
|
117 |
|
|
|
118 |
add_scaled_pxls_lowres <- function(spe_sub) { |
|
|
119 |
scaling <- SpatialExperiment::scaleFactors(spe_sub) |
|
|
120 |
pxl_col_in_lowres <-round(spatialCoords(spe_sub)[,'pxl_col_in_fullres'] * scaling) |
|
|
121 |
pxl_row_in_lowres <-round(spatialCoords(spe_sub)[,'pxl_row_in_fullres'] * scaling) |
|
|
122 |
spatialCoords(spe_sub) <- cbind(spatialCoords(spe_sub), pxl_col_in_lowres, pxl_row_in_lowres) |
|
|
123 |
spe_sub |
|
|
124 |
} |
|
|
125 |
|
|
|
126 |
match_all_color_spaces <- function(color, pxl_indices) { |
|
|
127 |
apply(pxl_indices, MARGIN=1, match_each_color_space, color=color) |
|
|
128 |
} |
|
|
129 |
|
|
|
130 |
match_each_color_space <- function(pxl_indices, color) { |
|
|
131 |
color[pxl_indices[1], pxl_indices[2]] |
|
|
132 |
} |
|
|
133 |
|
|
|
134 |
add_color_features <- function(spe_sub, colors) { |
|
|
135 |
pxl_indices <- spatialCoords(spe_sub)[,c('pxl_row_in_lowres','pxl_col_in_lowres')] |
|
|
136 |
feature_df <- lapply(c(rgbg, hsl), match_all_color_spaces, pxl_indices=pxl_indices) |
|
|
137 |
spatialCoords(spe_sub) <- cbind(spatialCoords(spe_sub), |
|
|
138 |
feature_df$red, |
|
|
139 |
feature_df$green, |
|
|
140 |
feature_df$blue, |
|
|
141 |
feature_df$grey, |
|
|
142 |
feature_df$hue, |
|
|
143 |
feature_df$saturation, |
|
|
144 |
feature_df$lightness) |
|
|
145 |
colnames(spatialCoords(spe_sub)) <- c('pxl_col_in_fullres', |
|
|
146 |
'pxl_row_in_fullres', |
|
|
147 |
'pxl_col_in_lowres', |
|
|
148 |
'pxl_row_in_lowres', |
|
|
149 |
'red', 'green', 'blue', 'grey', |
|
|
150 |
'hue', 'saturation', 'lightness') |
|
|
151 |
spe_sub |
|
|
152 |
} |
|
|
153 |
``` |
|
|
154 |
|
|
|
155 |
```{r} |
|
|
156 |
# Get image and RGBG/HSL feature spaces |
|
|
157 |
img <- get_img("151673") |
|
|
158 |
rgbg <- get_rgbg(img) |
|
|
159 |
hsl <- get_hsl(rgbg) |
|
|
160 |
|
|
|
161 |
# Add low resolution pixel values to SpatialExperiment object |
|
|
162 |
spe_sub <- add_scaled_pxls_lowres(spe_sub) |
|
|
163 |
|
|
|
164 |
# Match color space values to pixel row/column location, and add to SpatialExperiment object |
|
|
165 |
spe_sub <- add_color_features(spe_sub, c(rgbg, hsl)) |
|
|
166 |
``` |
|
|
167 |
|
|
|
168 |
## Finding Features of Interest From Autoencoder Latent Space |
|
|
169 |
|
|
|
170 |
Another approach to finding features of interest is by using an encoder |
|
|
171 |
model to capture the latent space between an encoder/decoder autoencoder |
|
|
172 |
model. Then, we can use these low-dimensional features to perform |
|
|
173 |
classification. |
|
|
174 |
|
|
|
175 |
To build this model, we first need to split out input image into image |
|
|
176 |
'patches' centered at each spot. The size of these patches was chosen to |
|
|
177 |
be 14x14 pixels. Then, for each spot, the corresponding image patch will |
|
|
178 |
be encoded into a feature vector. |
|
|
179 |
|
|
|
180 |
To simplify this process, a 'Spot' class has been created to store the |
|
|
181 |
current pixel values of the spot and features from the latent space. |
|
|
182 |
|
|
|
183 |
```{r} |
|
|
184 |
create_spot_instance <- function(rowname, img, spe_sub) { |
|
|
185 |
coords = spatialCoords(spe_sub)[rowname,] |
|
|
186 |
row = coords['pxl_row_in_lowres'] |
|
|
187 |
col = coords['pxl_col_in_lowres'] |
|
|
188 |
|
|
|
189 |
patch = imageData(img[(row - 6): (row + 6), (col - 6): (col + 6),]) |
|
|
190 |
|
|
|
191 |
new("Spot", |
|
|
192 |
sampleId=imgData(spe_sub)$sample_id, |
|
|
193 |
barcode=rowname, |
|
|
194 |
imagePatch=patch, |
|
|
195 |
spatialCoords=coords) |
|
|
196 |
} |
|
|
197 |
|
|
|
198 |
setClass("Spot", |
|
|
199 |
slots=list(sampleId="character", |
|
|
200 |
barcode="character", |
|
|
201 |
imagePatch="array", |
|
|
202 |
spatialCoords="numeric", |
|
|
203 |
latentSpace="array")) |
|
|
204 |
|
|
|
205 |
vec <- lapply(rownames(spatialCoords(spe_sub)), create_spot_instance, img=img, spe_sub=spe_sub) |
|
|
206 |
``` |
|
|
207 |
|
|
|
208 |
```{r} |
|
|
209 |
encoder_input = layers$Input(shape=as.integer(507), name='encoder_input') |
|
|
210 |
encoder_dense_layer1 = layers$Dense(units=as.integer(300), name='encoder_dense_1')(encoder_input) |
|
|
211 |
encoder_activ_layer1 = layers$LeakyReLU(name="encoder_leakyrelu_1")(encoder_dense_layer1) |
|
|
212 |
encoder_dense_layer2 = layers$Dense(units=as.integer(2), name='encoder_dense_2')(encoder_activ_layer1) |
|
|
213 |
encoder_output = layers$LeakyReLU(name='encoder_leakyrelu_2')(encoder_dense_layer2) |
|
|
214 |
encoder = models$Model(encoder_input, encoder_output, name="encoder_model") |
|
|
215 |
|
|
|
216 |
decoder_input = layers$Input(shape=as.integer(2), name='decoder_input') |
|
|
217 |
decoder_dense_layer1 = layers$Dense(units=as.integer(300), name='decoder_dense_1')(decoder_input) |
|
|
218 |
decoder_activ_layer1 = layers$LeakyReLU(name="decoder_leakyrelu_1")(decoder_dense_layer1) |
|
|
219 |
decoder_dense_layer2 = layers$Dense(units=as.integer(507), name='decoder_dense_2')(decoder_activ_layer1) |
|
|
220 |
decoder_output = layers$LeakyReLU(name='decoder_leakyrelu_2')(decoder_dense_layer2) |
|
|
221 |
decoder = models$Model(decoder_input, decoder_output, name="decoder_model") |
|
|
222 |
|
|
|
223 |
ae_input = layers$Input(shape=as.integer(507), name="AE_input") |
|
|
224 |
ae_encoder_output = encoder(ae_input) |
|
|
225 |
ae_decoder_output = decoder(ae_encoder_output) |
|
|
226 |
``` |
|
|
227 |
|
|
|
228 |
```{r} |
|
|
229 |
get_encoded_features <- function(spot) { |
|
|
230 |
temp <- c(spot@imagePatch) |
|
|
231 |
dim(temp) <- c(1, length(temp)) |
|
|
232 |
|
|
|
233 |
ae = models$Model(ae_input, ae_decoder_output, name="AE") |
|
|
234 |
ae$compile(loss="mse", optimizer=optimizers$Adam(learning_rate=0.0005)) |
|
|
235 |
ae$fit(temp, temp, epochs=as.integer(50), shuffle=TRUE, verbose=as.integer(0)) |
|
|
236 |
|
|
|
237 |
encoding = encoder$predict(temp, verbose=as.integer(0)) |
|
|
238 |
spot@latentSpace <- encoding |
|
|
239 |
spot |
|
|
240 |
} |
|
|
241 |
|
|
|
242 |
extract_encodings <- function(spot) { |
|
|
243 |
c(spot@latentSpace, spot@spatialCoords) |
|
|
244 |
} |
|
|
245 |
|
|
|
246 |
extract_barcodes <- function(spot) { |
|
|
247 |
spot@barcode |
|
|
248 |
} |
|
|
249 |
|
|
|
250 |
add_gene_col <- function(g_name, features, logcounts, spe_sub) { |
|
|
251 |
if (!(g_name %in% colnames(features))) { |
|
|
252 |
features[,ncol(features) + 1] <- data.frame(logcounts[,get_gene_id(g_name, spe_sub)]) |
|
|
253 |
colnames(features)[ncol(features)] <- g_name |
|
|
254 |
} |
|
|
255 |
features |
|
|
256 |
} |
|
|
257 |
|
|
|
258 |
temp_vec_1 <- lapply(vec[1:500], get_encoded_features) |
|
|
259 |
temp_vec_2 <- lapply(vec[501:1000], get_encoded_features) |
|
|
260 |
temp_vec_3 <- lapply(vec[1001:1500], get_encoded_features) |
|
|
261 |
temp_vec_4 <- lapply(vec[1501:2000], get_encoded_features) |
|
|
262 |
temp_vec_5 <- lapply(vec[2001:2500], get_encoded_features) |
|
|
263 |
temp_vec_6 <- lapply(vec[2501:3000], get_encoded_features) |
|
|
264 |
temp_vec_7 <- lapply(vec[3001:3590], get_encoded_features) |
|
|
265 |
|
|
|
266 |
vec <- c(temp_vec_1, temp_vec_2, temp_vec_3, temp_vec_4, temp_vec_5, temp_vec_6, temp_vec_7) |
|
|
267 |
features <- t(sapply(vec, extract_encodings)) |
|
|
268 |
rownames(features) <- sapply(vec, extract_barcodes) |
|
|
269 |
colnames(features)[1:2] <- c('latent_space_1', 'latent_space_2') |
|
|
270 |
|
|
|
271 |
features <- add_gene_col('MOBP', features, logcounts, spe_sub) |
|
|
272 |
features <- add_gene_col('SNAP25', features, logcounts, spe_sub) |
|
|
273 |
features <- add_gene_col('PCP4', features, logcounts, spe_sub) |
|
|
274 |
``` |
|
|
275 |
|
|
|
276 |
```{r} |
|
|
277 |
# plot MOBP, SNAP25, PCP4 in 2D feature space |
|
|
278 |
print(ggplot(features, aes(x=features[,'latent_space_1'], y = features[,'latent_space_2'])) + geom_point(aes(color=features[,'MOBP'])) + theme(aspect.ratio = 1) + scale_color_gradient(low='pink', high='purple') + ggtitle(paste("MOBP Expression Level in 2D Latent Space"))) |
|
|
279 |
|
|
|
280 |
print(ggplot(features, aes(x=features[,'latent_space_1'], y = features[,'latent_space_2'])) + geom_point(aes(color=features[,'SNAP25'])) + theme(aspect.ratio = 1) + scale_color_gradient(low='pink', high='purple') + ggtitle(paste("SNAP25 Expression Level in 2D Latent Space"))) |
|
|
281 |
|
|
|
282 |
print(ggplot(features, aes(x=features[,'latent_space_1'], y = features[,'latent_space_2'])) + geom_point(aes(color=features[,'PCP4'])) + theme(aspect.ratio = 1) + scale_color_gradient(low='pink', high='purple') + ggtitle(paste("PCP4 Expression Level in 2D Latent Space"))) |
|
|
283 |
``` |
|
|
284 |
|
|
|
285 |
```{r} |
|
|
286 |
# Plot two feature latent space |
|
|
287 |
features <- data.frame(features) |
|
|
288 |
# Spot Plots for Autoencoder Features |
|
|
289 |
print(ggplot(features, aes(x = features[,'pxl_row_in_lowres'], y=600 - features[,'pxl_col_in_lowres'])) + geom_point(aes(color=features[,'latent_space_1'])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",'latent_space_1'))) |
|
|
290 |
|
|
|
291 |
print(ggplot(features, aes(x = features[,'pxl_row_in_lowres'], y=600 - features[,'pxl_col_in_lowres'])) + geom_point(aes(color=features[,'latent_space_2'])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",'latent_space_2'))) |
|
|
292 |
``` |
|
|
293 |
|
|
|
294 |
```{r} |
|
|
295 |
mat <- compute_corr_matrix(logcounts, features[,1:13]) |
|
|
296 |
|
|
|
297 |
top_corr <- get_top_k_corr(mat, 10) |
|
|
298 |
|
|
|
299 |
plot_corr_matrix(data.frame(top_corr)) |
|
|
300 |
``` |
|
|
301 |
|
|
|
302 |
## Finding Genes of Interest |
|
|
303 |
|
|
|
304 |
An overall goal of this exploratory data analysis is to identify genes |
|
|
305 |
of interest so that we can build predictive models in the future. To |
|
|
306 |
accomplish this, a correlation matrix between spots and image features |
|
|
307 |
(color spaces) is created. Then, the genes exhibiting the highest |
|
|
308 |
correlation to image features are selected and plotted. |
|
|
309 |
|
|
|
310 |
```{r} |
|
|
311 |
get_gene_name <- function(x, spe_sub) { |
|
|
312 |
rowData(spe_sub)[rowData(spe_sub)$gene_id == x,]$gene_name |
|
|
313 |
} |
|
|
314 |
|
|
|
315 |
get_gene_id <- function(x, spe_sub) { |
|
|
316 |
rowData(spe_sub)[rowData(spe_sub)$gene_name == x ,]$gene_id |
|
|
317 |
} |
|
|
318 |
|
|
|
319 |
create_id_name_dict <- function(logcounts, spe_sub) { |
|
|
320 |
id_to_name <- mclapply(colnames(logcounts), get_gene_name, spe_sub=spe_sub) |
|
|
321 |
names(id_to_name) <- colnames(logcounts) |
|
|
322 |
id_to_name |
|
|
323 |
} |
|
|
324 |
|
|
|
325 |
# compute correlation matrix in parallel |
|
|
326 |
compute_corr_matrix <- function(expression_arr, color_arr, round=4) { |
|
|
327 |
foreach(i = 1:ncol(expression_arr), |
|
|
328 |
.combine = rbind, |
|
|
329 |
.packages = c('data.table', 'doParallel')) %dopar% { |
|
|
330 |
colName <- colnames(expression_arr)[i] |
|
|
331 |
df <- data.frame(round(cor(expression_arr[,i], color_arr, method = 'pearson', use="complete.obs"), round)) |
|
|
332 |
rownames(df) <- colName |
|
|
333 |
df |
|
|
334 |
} |
|
|
335 |
} |
|
|
336 |
|
|
|
337 |
get_top_k_corr <- function(mat, k) { |
|
|
338 |
temp_ind <- c() |
|
|
339 |
for (i in 1:ncol(mat)) { |
|
|
340 |
temp_ind <- c(temp_ind, order(abs(mat[,i]), decreasing=T)[1:k]) |
|
|
341 |
} |
|
|
342 |
top_corr_ind <- unique(temp_ind) |
|
|
343 |
# temp_ind <- c(temp_ind, order(abs(as.numeric(unlist(mat))), decreasing=T)[1:k]) |
|
|
344 |
# temp_ind <- temp_ind %% 600 |
|
|
345 |
top_corr <- rename_gene_ids(data.frame(mat[top_corr_ind,])) |
|
|
346 |
top_corr |
|
|
347 |
} |
|
|
348 |
|
|
|
349 |
# Replace gene IDs with gene names |
|
|
350 |
rename_gene_ids <- function(correlation_matrix) { |
|
|
351 |
for (i in 1:nrow(correlation_matrix)) { |
|
|
352 |
g_name <- rownames(correlation_matrix)[i] |
|
|
353 |
rownames(correlation_matrix)[i] <- rowData(spe_sub[g_name])$gene_name |
|
|
354 |
} |
|
|
355 |
correlation_matrix |
|
|
356 |
} |
|
|
357 |
|
|
|
358 |
# Plot correlation matrix |
|
|
359 |
plot_corr_matrix <- function(correlation_matrix) { |
|
|
360 |
ggcorrplot(correlation_matrix, sig.level=0.01, lab_size = 4.5, p.mat = NULL, |
|
|
361 |
insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex =1, |
|
|
362 |
tl.cex = 7) + |
|
|
363 |
theme(axis.text.x = element_text(margin=margin(-2,0,0,0)), |
|
|
364 |
axis.text.y = element_text(margin=margin(0,-2,0,0)), |
|
|
365 |
panel.grid.minor = element_line(size=7)) + |
|
|
366 |
geom_tile(fill="white") + |
|
|
367 |
geom_tile(height=1, width=1) |
|
|
368 |
} |
|
|
369 |
|
|
|
370 |
get_gene_id <- function(g_name, spe) { |
|
|
371 |
gene_id = rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id |
|
|
372 |
gene_id |
|
|
373 |
} |
|
|
374 |
``` |
|
|
375 |
|
|
|
376 |
```{r} |
|
|
377 |
gene_id <- get_gene_id('TERB1', spe_sub) |
|
|
378 |
mat[gene_id,] |
|
|
379 |
|
|
|
380 |
``` |
|
|
381 |
|
|
|
382 |
```{r} |
|
|
383 |
logcounts <- t(logcounts(spe_sub)) |
|
|
384 |
|
|
|
385 |
id_to_name <- create_id_name_dict(logcounts, spe_sub) |
|
|
386 |
|
|
|
387 |
mat <- compute_corr_matrix(logcounts, |
|
|
388 |
spatialCoords(spe_sub)[,5:ncol(spatialCoords(spe_sub))]) |
|
|
389 |
|
|
|
390 |
top_corr <- get_top_k_corr(mat, 20) |
|
|
391 |
|
|
|
392 |
plot_corr_matrix(data.frame(top_corr)) |
|
|
393 |
``` |
|
|
394 |
|
|
|
395 |
```{r} |
|
|
396 |
mat[get_gene_id('SNAP25', spe_sub),] |
|
|
397 |
top_corr |
|
|
398 |
``` |
|
|
399 |
|
|
|
400 |
## Spot Plots of Features |
|
|
401 |
|
|
|
402 |
```{r} |
|
|
403 |
temp_df <- data.frame(spatialCoords(spe_sub)) |
|
|
404 |
for (g_name in colnames(temp_df[,5:11])) { |
|
|
405 |
# print(paste(" Number of expressed barcodes for ", g_name, ": ",sum(data[,g_name] > 0), "/", length(data[,g_name]))) |
|
|
406 |
print(ggplot(temp_df, aes(x = temp_df[,'pxl_row_in_lowres'], y=600 - temp_df[,'pxl_col_in_lowres'])) + geom_point(aes(color=temp_df[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name))) |
|
|
407 |
} |
|
|
408 |
``` |
|
|
409 |
|
|
|
410 |
```{r} |
|
|
411 |
# Add genes |
|
|
412 |
``` |
|
|
413 |
|
|
|
414 |
```{r} |
|
|
415 |
temp_df <- data.frame(spatialCoords(spe_sub)) |
|
|
416 |
for (g_name in colnames(temp_df[,5:11])) { |
|
|
417 |
# print(paste(" Number of expressed barcodes for ", g_name, ": ",sum(data[,g_name] > 0), "/", length(data[,g_name]))) |
|
|
418 |
print(ggplot(temp_df, aes(x = temp_df[,'pxl_row_in_lowres'], y=600 - temp_df[,'pxl_col_in_lowres'])) + geom_point(aes(color=temp_df[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name))) |
|
|
419 |
} |
|
|
420 |
``` |
|
|
421 |
|
|
|
422 |
### Create RGB and HSL color channels and plot histograms of distribution |
|
|
423 |
|
|
|
424 |
```{r, message=FALSE} |
|
|
425 |
png_link <- paste('https://spatial-dlpfc.s3.us-east-2.amazonaws.com/images/',sample_id,'_tissue_lowres_image.png',sep='') |
|
|
426 |
img <- readImage(png_link) |
|
|
427 |
|
|
|
428 |
# create 600x600 color channel matrices for each channel |
|
|
429 |
red <- imageData(EBImage::channel(img, 'red')) |
|
|
430 |
green <- imageData(EBImage::channel(img, 'green')) |
|
|
431 |
blue <- imageData(EBImage::channel(img, 'blue')) |
|
|
432 |
grey <- imageData(EBImage::channel(img, 'grey')) |
|
|
433 |
|
|
|
434 |
# visualize histograms of each color channel |
|
|
435 |
hist(red) |
|
|
436 |
hist(green) |
|
|
437 |
hist(blue) |
|
|
438 |
hist(grey) |
|
|
439 |
|
|
|
440 |
rgb_mat <- t(data.frame(c(red), c(green), c(blue))) |
|
|
441 |
hsl_mat <- rgb2hsl(rgb_mat) |
|
|
442 |
|
|
|
443 |
# HSL color space |
|
|
444 |
hue <- Matrix(hsl_mat[1,], nrow=600, ncol=600) |
|
|
445 |
saturation <- Matrix(hsl_mat[2,], nrow=600, ncol=600) |
|
|
446 |
lightness <- Matrix(hsl_mat[3,], nrow=600, ncol=600) |
|
|
447 |
|
|
|
448 |
# get scale factor |
|
|
449 |
scaling <- SpatialExperiment::scaleFactors(spe_sub) |
|
|
450 |
|
|
|
451 |
# store spatial coords information (pxl col and row numbers in fullres image for each barcode) |
|
|
452 |
spatial_coords <- spatialCoords(spe_sub) |
|
|
453 |
|
|
|
454 |
# create data frame to hold pixel locations and pixel color channel values |
|
|
455 |
data <- data.frame(spatial_coords, row.names=NULL) |
|
|
456 |
data['barcode'] <- rownames(spatial_coords) |
|
|
457 |
|
|
|
458 |
# add scaled pixel columns |
|
|
459 |
new <- ceiling(data[["pxl_col_in_fullres"]] * scaling) |
|
|
460 |
data[ , ncol(data) + 1] <- new |
|
|
461 |
colnames(data)[ncol(data)] <- "scaled_pxl_col_in_lowres" |
|
|
462 |
|
|
|
463 |
# add scaled pixel rows |
|
|
464 |
new <- ceiling(data[["pxl_row_in_fullres"]] * scaling) |
|
|
465 |
data[ , ncol(data) + 1] <- new |
|
|
466 |
colnames(data)[ncol(data)] <- "scaled_pxl_row_in_lowres" |
|
|
467 |
|
|
|
468 |
# create vectors to be added to dataframe |
|
|
469 |
X_r <- vector(mode="double", length = dim(data)[1]) |
|
|
470 |
X_g <- vector(mode="double", length = dim(data)[1]) |
|
|
471 |
X_b <- vector(mode="double", length = dim(data)[1]) |
|
|
472 |
X_grey <- vector(mode="double", length = dim(data)[1]) |
|
|
473 |
X_h <- vector(mode="double", length = dim(data)[1]) |
|
|
474 |
X_s <- vector(mode="double", length = dim(data)[1]) |
|
|
475 |
X_l <- vector(mode="double", length = dim(data)[1]) |
|
|
476 |
|
|
|
477 |
# iterate over dataframe, add rgb values |
|
|
478 |
for (i in 1:nrow(data)) { |
|
|
479 |
col_index <- data[i,]$scaled_pxl_col_in_lowres |
|
|
480 |
row_index <- data[i,]$scaled_pxl_row_in_lowres |
|
|
481 |
X_r[i] <- red[row_index, col_index] |
|
|
482 |
X_g[i] <- green[row_index, col_index] |
|
|
483 |
X_b[i] <- blue[row_index, col_index] |
|
|
484 |
X_grey[i] <- grey[row_index, col_index] |
|
|
485 |
X_h[i] <- hue[row_index, col_index] |
|
|
486 |
X_s[i] <- saturation[row_index, col_index] |
|
|
487 |
X_l[i] <- lightness[row_index, col_index] |
|
|
488 |
} |
|
|
489 |
|
|
|
490 |
data[ , ncol(data) + 1] <- X_r |
|
|
491 |
colnames(data)[ncol(data)] <- "red" |
|
|
492 |
data[ , ncol(data) + 1] <- X_g |
|
|
493 |
colnames(data)[ncol(data)] <- "green" |
|
|
494 |
data[ , ncol(data) + 1] <- X_b |
|
|
495 |
colnames(data)[ncol(data)] <- "blue" |
|
|
496 |
data[ , ncol(data) + 1] <- X_grey |
|
|
497 |
colnames(data)[ncol(data)] <- "grey" |
|
|
498 |
data[ , ncol(data) + 1] <- X_h |
|
|
499 |
colnames(data)[ncol(data)] <- "hue" |
|
|
500 |
data[ , ncol(data) + 1] <- X_s |
|
|
501 |
colnames(data)[ncol(data)] <- "saturation" |
|
|
502 |
data[ , ncol(data) + 1] <- X_l |
|
|
503 |
colnames(data)[ncol(data)] <- "lightness" |
|
|
504 |
``` |
|
|
505 |
|
|
|
506 |
### Functions for high res images, where there exist more than one pixel per spot |
|
|
507 |
|
|
|
508 |
```{r, message=FALSE} |
|
|
509 |
# Mode Function |
|
|
510 |
getmode <- function(v) { |
|
|
511 |
uniqv <- unique(v) |
|
|
512 |
uniqv[which.max(tabulate(match(v, uniqv)))] |
|
|
513 |
} |
|
|
514 |
|
|
|
515 |
# For a spot 'i', plot histogram of pixel color values given by 'color' |
|
|
516 |
plot_pixels_per_spot <- function(ind_barcodes, data, color, i) { |
|
|
517 |
# histogram of color channel values in one spot |
|
|
518 |
barcode = rownames(ind_barcodes)[i] |
|
|
519 |
x <- data[data$barcode == barcode, color] |
|
|
520 |
hist(x, main=paste("Histogram of ", color, " Color Channel Values For a Single Barcode", sep = ''),breaks=100) |
|
|
521 |
} |
|
|
522 |
|
|
|
523 |
# Create a df where rows are barcodes and columns are mean, median, and mode values for each RGB/Grayscale category |
|
|
524 |
populate_ind_barcodes <- function(i, ind_barcodes) { |
|
|
525 |
colors <- c("red", "green", "blue", "grey") |
|
|
526 |
for (j in 1:length(colors)) { |
|
|
527 |
color <- colors[j] |
|
|
528 |
vals <- data[data$barcode == row.names(ind_barcodes)[i],][,color] |
|
|
529 |
ind_barcodes[i, paste("mean_", color, sep="")] <- mean(vals) |
|
|
530 |
ind_barcodes[i, paste("median_", color, sep="")] <- median(vals) |
|
|
531 |
ind_barcodes[i, paste("mode_", color, sep="")] <- getmode(vals) |
|
|
532 |
} |
|
|
533 |
ind_barcodes |
|
|
534 |
} |
|
|
535 |
|
|
|
536 |
# create empty df |
|
|
537 |
ind_barcodes <- data.frame(matrix(ncol = 12, nrow = length(unique(data[,'barcode'])))) |
|
|
538 |
colnames(ind_barcodes) <- c('mean_red', 'median_red', 'mode_red', 'mean_green', 'median_green', 'mode_green', 'mean_blue', 'median_blue', 'mode_blue', 'mean_grey', 'median_grey', 'mode_grey') |
|
|
539 |
rownames(ind_barcodes) <- unique(data[, 'barcode']) |
|
|
540 |
|
|
|
541 |
# add mean, median, mode data |
|
|
542 |
for (i in 1:nrow(ind_barcodes)) { |
|
|
543 |
ind_barcodes <- populate_ind_barcodes(i, ind_barcodes) |
|
|
544 |
} |
|
|
545 |
ind_barcodes |
|
|
546 |
``` |
|
|
547 |
|
|
|
548 |
### Analysis 1: Compute Correlation First, Compute Outliers After |
|
|
549 |
|
|
|
550 |
In the following blocks, the correlation between all genes + pixel color |
|
|
551 |
channel values is calculated. Then, for the top performing genes, a |
|
|
552 |
plotting function is called in which spots corresponding to outlier gene |
|
|
553 |
expression values + spots corresponding to outlier pixel color channel |
|
|
554 |
values are removed. |
|
|
555 |
|
|
|
556 |
```{r, message=FALSE} |
|
|
557 |
# compute correlation matrix in parallel |
|
|
558 |
compute_corr_matrix <- function(expression_arr, color_arr, round=4) { |
|
|
559 |
foreach(i = 1:ncol(expression_arr), |
|
|
560 |
.combine = rbind, |
|
|
561 |
.packages = c('data.table', 'doParallel')) %dopar% { |
|
|
562 |
colName <- colnames(expression_arr)[i] |
|
|
563 |
df <- data.frame(round(cor(expression_arr[,i], color_arr, method = 'pearson', use="complete.obs"), round)) |
|
|
564 |
rownames(df) <- colName |
|
|
565 |
df |
|
|
566 |
} |
|
|
567 |
} |
|
|
568 |
|
|
|
569 |
# Filter correlation matrix based on a threshold and replace gene IDs with gene names |
|
|
570 |
filter_corr_matrix <- function(correlation_matrix, threshold) { |
|
|
571 |
filter <- as.data.frame(apply(abs(correlation_matrix) >= threshold, 1, any)) |
|
|
572 |
correlation_matrix_filtered <- correlation_matrix[filter[,1],] |
|
|
573 |
correlation_matrix_filtered <- rename_gene_ids(correlation_matrix_filtered) |
|
|
574 |
correlation_matrix_filtered |
|
|
575 |
} |
|
|
576 |
|
|
|
577 |
# Replace gene IDs with gene names |
|
|
578 |
rename_gene_ids <- function(correlation_matrix) { |
|
|
579 |
for (i in 1:nrow(correlation_matrix)) { |
|
|
580 |
g_name <- rownames(correlation_matrix)[i] |
|
|
581 |
rownames(correlation_matrix)[i] <- rowData(spe_sub[g_name])$gene_name |
|
|
582 |
} |
|
|
583 |
correlation_matrix |
|
|
584 |
} |
|
|
585 |
|
|
|
586 |
# Plot correlation matrix |
|
|
587 |
plot_corr_matrix <- function(correlation_matrix) { |
|
|
588 |
ggcorrplot(correlation_matrix, sig.level=0.01, lab_size = 4.5, p.mat = NULL, |
|
|
589 |
insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex =1, |
|
|
590 |
tl.cex = 7) + |
|
|
591 |
theme(axis.text.x = element_text(margin=margin(-2,0,0,0)), |
|
|
592 |
axis.text.y = element_text(margin=margin(0,-2,0,0)), |
|
|
593 |
panel.grid.minor = element_line(size=7)) + |
|
|
594 |
geom_tile(fill="white") + |
|
|
595 |
geom_tile(height=1, width=1) |
|
|
596 |
} |
|
|
597 |
|
|
|
598 |
# function to calculate regression equation. Used as a sanity check |
|
|
599 |
lm_eqn <- function(vec1, vec2){ |
|
|
600 |
m <- lm(vec1 ~ vec2, data.frame(vec1, vec2)); |
|
|
601 |
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, |
|
|
602 |
list(a = format(unname(coef(m)[1]), digits = 2), |
|
|
603 |
b = format(unname(coef(m)[2]), digits = 2), |
|
|
604 |
r2 = format(summary(m)$r.squared, digits = 3))) |
|
|
605 |
as.character(as.expression(eq)); |
|
|
606 |
} |
|
|
607 |
``` |
|
|
608 |
|
|
|
609 |
```{r, message=FALSE} |
|
|
610 |
color_arr <- data[,6:ncol(data)] |
|
|
611 |
correlation_matrix <- compute_corr_matrix(logcounts_151507_filtered, color_arr) |
|
|
612 |
correlation_matrix_filtered <- filter_corr_matrix(correlation_matrix, 0.3) |
|
|
613 |
plot_corr_matrix(correlation_matrix_filtered) |
|
|
614 |
``` |
|
|
615 |
|
|
|
616 |
```{r, message=FALSE} |
|
|
617 |
# plot color channel value vs gene expression. If remove_outliers == TRUE, outlier gene expression values + outlier pixel color values are removed. |
|
|
618 |
# ASSUMES THAT EACH BARCODE HAS MULTIPLE PIXELS |
|
|
619 |
plot_channel_highres <- function(g_name, gene_id, color, stat, log_arr, color_arr, remove_outliers) { |
|
|
620 |
stat_type <- paste(color, stat, sep="_") |
|
|
621 |
x_axis <- log_arr[,gene_id] |
|
|
622 |
y_axis <- color_arr[,stat_type] * 255 |
|
|
623 |
df <- data.frame(x_axis, y_axis) |
|
|
624 |
if (remove_outliers) { |
|
|
625 |
barcode_names <- rownames(log_arr) |
|
|
626 |
overlap <- union(rownames(log_arr)[isOutlier(log_arr[,gene_id])], rownames(color_arr)[isOutlier(color_arr[,stat_type])]) |
|
|
627 |
rownames(df) <- barcode_names |
|
|
628 |
df <- df[-which(rownames(df) %in% overlap),] |
|
|
629 |
title <- paste(stat_type, " Channel Values vs ", g_name, " Gene Expression with Outliers Removed", sep="") |
|
|
630 |
} else { |
|
|
631 |
title <- paste(stat_type, " Channel Values vs ", g_name, " Gene Expression without Outliers Removed", sep="") |
|
|
632 |
} |
|
|
633 |
ggplot(df, aes(x=df[,1], y=df[,2])) + xlab(g_name) + ylab(stat_type) + geom_point(color=color) + |
|
|
634 |
geom_smooth(method='lm', color='black') + stat_regline_equation(label.y = 190, aes(label = ..eq.label..)) + |
|
|
635 |
stat_regline_equation(label.y = 180, aes(label = ..rr.label..)) + ggtitle(title) |
|
|
636 |
} |
|
|
637 |
|
|
|
638 |
# call plot_channel function for all color and mean/median/mode combinations |
|
|
639 |
# ASSUMES THAT EACH BARCODE HAS MULTIPLE PIXELS |
|
|
640 |
plot_all_channels_highres <- function(g_name, log_arr, color_arr, remove_outliers) { |
|
|
641 |
gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id |
|
|
642 |
colors <- c('red', 'green', 'blue', 'grey') |
|
|
643 |
stats <- c('mean', 'median', 'mode') |
|
|
644 |
|
|
|
645 |
for (color in colors) { |
|
|
646 |
for (stat in stats) { |
|
|
647 |
print(plot_channel_highres(g_name, gene_id, color, stat, log_arr, color_arr, remove_outliers)) |
|
|
648 |
} |
|
|
649 |
} |
|
|
650 |
} |
|
|
651 |
|
|
|
652 |
# plot color channel value vs gene expression. If remove_outliers == TRUE, outlier gene expression values + outlier pixel color values are removed. |
|
|
653 |
# ASSUMES THAT EACH BARCODE HAS ONE PIXEL |
|
|
654 |
plot_channel <- function(g_name, gene_id, color, log_arr, color_arr, remove_outliers) { |
|
|
655 |
x_axis <- log_arr[,gene_id] |
|
|
656 |
y_axis <- color_arr[,color] * 255 |
|
|
657 |
df <- data.frame(x_axis, y_axis) |
|
|
658 |
if (remove_outliers) { |
|
|
659 |
barcode_names <- rownames(log_arr) |
|
|
660 |
overlap <- union(rownames(log_arr)[isOutlier(log_arr[,gene_id])], rownames(color_arr)[isOutlier(color_arr[,color])]) |
|
|
661 |
rownames(df) <- barcode_names |
|
|
662 |
df <- df[-which(rownames(df) %in% overlap),] |
|
|
663 |
title <- paste(color, " Channel Values vs ", g_name, " Gene Expression with Outliers Removed", sep="") |
|
|
664 |
} else { |
|
|
665 |
title <- paste(color, " Channel Values vs ", g_name, " Gene Expression without Outliers Removed", sep="") |
|
|
666 |
} |
|
|
667 |
ggplot(df, aes(x=df[,1], y=df[,2])) + xlab(g_name) + ylab(color) + geom_point(color=color) + |
|
|
668 |
geom_smooth(method='lm', color='black') + stat_regline_equation(label.y = 190, aes(label = ..eq.label..)) + |
|
|
669 |
stat_regline_equation(label.y = 180, aes(label = ..rr.label..)) + ggtitle(title) |
|
|
670 |
} |
|
|
671 |
|
|
|
672 |
# call plot_channel function for all colors |
|
|
673 |
# ASSUMES THAT EACH BARCODE HAS ONE PIXEL |
|
|
674 |
plot_all_channels <- function(g_name, log_arr, color_arr, remove_outliers) { |
|
|
675 |
gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id |
|
|
676 |
colors <- c('red', 'green', 'blue', 'grey') |
|
|
677 |
|
|
|
678 |
for (color in colors) { |
|
|
679 |
print(plot_channel(g_name, gene_id, color, log_arr, color_arr, remove_outliers)) |
|
|
680 |
} |
|
|
681 |
} |
|
|
682 |
``` |
|
|
683 |
|
|
|
684 |
```{r, message=FALSE} |
|
|
685 |
plot_all_channels('MT-ND1', logcounts_151507_filtered, color_arr, remove_outliers=FALSE) |
|
|
686 |
plot_all_channels('MT-ND1', logcounts_151507_filtered, color_arr, remove_outliers=TRUE) |
|
|
687 |
``` |
|
|
688 |
|
|
|
689 |
```{r, message=FALSE} |
|
|
690 |
plot_all_channels('MT-CO2', logcounts_151507_filtered, color_arr, remove_outliers=FALSE) |
|
|
691 |
plot_all_channels('MT-CO2', logcounts_151507_filtered, color_arr, remove_outliers=TRUE) |
|
|
692 |
``` |
|
|
693 |
|
|
|
694 |
```{r, message=FALSE} |
|
|
695 |
plot_all_channels('MT-ATP6', logcounts_151507_filtered, color_arr, remove_outliers=FALSE) |
|
|
696 |
plot_all_channels('MT-ATP6', logcounts_151507_filtered, color_arr, remove_outliers=TRUE) |
|
|
697 |
``` |
|
|
698 |
|
|
|
699 |
```{r, message=FALSE} |
|
|
700 |
plot_all_channels('MT-CYB', logcounts_151507_filtered, color_arr, remove_outliers=FALSE) |
|
|
701 |
plot_all_channels('MT-CYB', logcounts_151507_filtered, color_arr, remove_outliers=TRUE) |
|
|
702 |
``` |
|
|
703 |
|
|
|
704 |
## Redo Analysis Removing Outliers Before Correlation Computation |
|
|
705 |
|
|
|
706 |
In the following blocks, analysis is redone by removing outliers before |
|
|
707 |
correlation computation based on the isOutlier() function. If the number |
|
|
708 |
of non-NA expression values \< num = 100, the gene is removed from |
|
|
709 |
consideration. Then, correlation between the RGB/Grayscale values and |
|
|
710 |
gene expression values is computed. |
|
|
711 |
|
|
|
712 |
```{r, message=FALSE} |
|
|
713 |
# Remove outlier gene expression values. If median gene expression value is 0, replace complement of outliers with NA. Else replace outliers with NA. If remaining non-NA expression values < num, replace entire column with NA. |
|
|
714 |
remove_outliers <- function(arr, num) { |
|
|
715 |
foreach(i = 1:ncol(arr), |
|
|
716 |
.combine = cbind, |
|
|
717 |
.packages = c('data.table', 'doParallel')) %dopar% { |
|
|
718 |
if (median(arr[,i]) == 0) { |
|
|
719 |
arr[,i][!isOutlier(arr[,i])] <- NA |
|
|
720 |
} else { |
|
|
721 |
arr[,i][isOutlier(arr[,i])] <- NA |
|
|
722 |
} |
|
|
723 |
if (matrixStats::count(!is.na(arr[,i])) < num) { |
|
|
724 |
arr[,i] <- rep(NA, nrow(arr)) |
|
|
725 |
} |
|
|
726 |
arr[,i] |
|
|
727 |
} |
|
|
728 |
} |
|
|
729 |
|
|
|
730 |
get_top_k_corr <- function(mat, k) { |
|
|
731 |
temp_ind <- c() |
|
|
732 |
for (i in 1:ncol(mat)) { |
|
|
733 |
temp_ind <- c(temp_ind, order(abs(mat[,i]), decreasing=T)[1:k]) |
|
|
734 |
} |
|
|
735 |
top_corr_ind <- unique(temp_ind) |
|
|
736 |
top_corr <- rename_gene_ids(data.frame(mat[top_corr_ind,])) |
|
|
737 |
top_corr |
|
|
738 |
} |
|
|
739 |
|
|
|
740 |
get_gene_id <- function(g_name, spe) { |
|
|
741 |
gene_id = rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id |
|
|
742 |
gene_id |
|
|
743 |
} |
|
|
744 |
|
|
|
745 |
add_gene_col_data_df <- function(gene_id, g_name, data) { |
|
|
746 |
if (!(g_name %in% colnames(data))) { |
|
|
747 |
data[,ncol(data) + 1] <- data.frame(logcounts_151673_filtered[,gene_id]) |
|
|
748 |
colnames(data)[ncol(data)] <- g_name |
|
|
749 |
} |
|
|
750 |
data |
|
|
751 |
} |
|
|
752 |
|
|
|
753 |
drop_cols_data_df <- function(drop, data) { |
|
|
754 |
data <- data[,!(names(data) %in% drop)] |
|
|
755 |
data |
|
|
756 |
} |
|
|
757 |
|
|
|
758 |
range01 <- function(x){(x-min(x))/(max(x)-min(x))} |
|
|
759 |
``` |
|
|
760 |
|
|
|
761 |
```{r} |
|
|
762 |
g_name <- 'MOBP' |
|
|
763 |
gene_id <- get_gene_id(g_name, spe) |
|
|
764 |
data <- add_gene_col_data_df(gene_id, g_name, data) |
|
|
765 |
``` |
|
|
766 |
|
|
|
767 |
```{r} |
|
|
768 |
#temp <- data.frame(spatialCoords(spe_sub), vec1) |
|
|
769 |
g_name='vec1' |
|
|
770 |
temp <- data.frame(spatialCoords(spe_sub), vec1) |
|
|
771 |
print(ggplot(temp, aes(x = temp[,'pxl_row_in_lowres'], y=600 - temp[,'pxl_col_in_lowres'])) + geom_point(aes(color=temp[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name))) |
|
|
772 |
``` |
|
|
773 |
|
|
|
774 |
```{r} |
|
|
775 |
for (g_name in colnames(data[6:12])) { |
|
|
776 |
print(ggplot(data, aes(x = data[,'scaled_pxl_row_in_lowres'], y=600 - data[,'scaled_pxl_col_in_lowres'])) + geom_point(aes(color=data[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name))) |
|
|
777 |
} |
|
|
778 |
``` |
|
|
779 |
|
|
|
780 |
```{r} |
|
|
781 |
top_corr |
|
|
782 |
``` |
|
|
783 |
|
|
|
784 |
```{r} |
|
|
785 |
mat <- compute_corr_matrix(logcounts_151673_filtered, data[6:12]) |
|
|
786 |
top_corr <- get_top_k_corr(mat, 10) |
|
|
787 |
plot_corr_matrix(data.frame(top_corr)) |
|
|
788 |
``` |
|
|
789 |
|
|
|
790 |
```{r} |
|
|
791 |
for (g_name in rownames(top_corr)) { |
|
|
792 |
gene_id <- get_gene_id(g_name, spe) |
|
|
793 |
data <- add_gene_col_data_df(gene_id, g_name, data) |
|
|
794 |
} |
|
|
795 |
data |
|
|
796 |
``` |
|
|
797 |
|
|
|
798 |
```{r} |
|
|
799 |
for (g_name in colnames(data[13:ncol(data)])) { |
|
|
800 |
print(paste(" Number of expressed barcodes for ", g_name, ": ",sum(data[,g_name] > 0), "/", length(data[,g_name]))) |
|
|
801 |
print(ggplot(data, aes(x = data[,'scaled_pxl_row_in_lowres'], y=600 - data[,'scaled_pxl_col_in_lowres'])) + geom_point(aes(color=data[,g_name])) + theme(aspect.ratio = 1) + scale_color_gradient(low="pink", high="purple") + ggtitle(paste("Spot Plot ",g_name))) |
|
|
802 |
} |
|
|
803 |
``` |
|
|
804 |
|
|
|
805 |
```{r} |
|
|
806 |
logcounts_151673[,i] > 0 |
|
|
807 |
sum(logcounts_151673_filtered[,gene_id] > 0) |
|
|
808 |
sum(!is.na(logcounts_151673_filtered[,gene_id])) |
|
|
809 |
|
|
|
810 |
compute_corr_matrix(data.frame(logcounts_151673_filtered), data[,6:12]) |
|
|
811 |
compute_corr_matrix(data.frame(logcounts_151673_filtered_no_outliers[,gene_id]), temp_data[,6:12]) |
|
|
812 |
``` |
|
|
813 |
|
|
|
814 |
```{r} |
|
|
815 |
g_name = 'SNAP25' |
|
|
816 |
gene_id = rowData(spe)[rowData(spe)$gene_name == 'SNAP25',]$gene_id |
|
|
817 |
print(sum(logcounts_151507[,gene_id] != 0)) |
|
|
818 |
print(sum(logcounts_151507_filtered[,gene_id] != 0)) |
|
|
819 |
print(sum(!is.na(logcounts_151507_filtered_no_outliers[,gene_id]))) |
|
|
820 |
|
|
|
821 |
mat_with_outliers <- compute_corr_matrix(data.frame(logcounts_151507_filtered[,gene_id]), data[,4:5]) |
|
|
822 |
mat_with_outliers |
|
|
823 |
|
|
|
824 |
mat_outliers <- compute_corr_matrix(data.frame(logcounts_151507_filtered_no_outliers[,gene_id]), data[,4:5]) |
|
|
825 |
mat_outliers |
|
|
826 |
``` |
|
|
827 |
|
|
|
828 |
```{r, message=FALSE} |
|
|
829 |
# Call remove outlier function |
|
|
830 |
logcounts_temp <- as.matrix(logcounts_151673_filtered) |
|
|
831 |
col_names <- colnames(logcounts_temp) |
|
|
832 |
logcounts_151673_filtered_no_outliers <- remove_outliers(logcounts_temp, 100) |
|
|
833 |
colnames(logcounts_151673_filtered_no_outliers) <- col_names |
|
|
834 |
|
|
|
835 |
# Remove columns that are completely NA |
|
|
836 |
not_all_na <- function(x) any(!is.na(x)) |
|
|
837 |
logcounts_151673_filtered_no_outliers <- data.frame(logcounts_151673_filtered_no_outliers) %>% select(where(not_all_na)) |
|
|
838 |
|
|
|
839 |
# compute correlation matrix |
|
|
840 |
mat <- compute_corr_matrix(logcounts_151673_filtered_no_outliers, data[6:12]) |
|
|
841 |
``` |
|
|
842 |
|
|
|
843 |
```{r, message=FALSE} |
|
|
844 |
top_corr <- get_top_k_corr(mat, 10) |
|
|
845 |
plot_corr_matrix(top_corr) |
|
|
846 |
``` |
|
|
847 |
|
|
|
848 |
```{r} |
|
|
849 |
# MOBP, SNAP25, plot |
|
|
850 |
# Linear regression model |
|
|
851 |
# Where do MOBP/SNAP25 fall in my analysis |
|
|
852 |
# When did it get removed, spot plots of these genes |
|
|
853 |
# spot plots of top genes in correlation analysis |
|
|
854 |
gene_id = rowData(spe)[rowData(spe)$gene_name == 'LFNG',]$gene_id |
|
|
855 |
sum(!is.na(logcounts_151507_filtered_no_outliers[,gene_id])) |
|
|
856 |
``` |
|
|
857 |
|
|
|
858 |
```{r, message=FALSE} |
|
|
859 |
plot_all_channels('LFNG', logcounts_151507_filtered_no_outliers, color_arr, remove_outliers=FALSE) |
|
|
860 |
``` |
|
|
861 |
|
|
|
862 |
```{r, message=FALSE} |
|
|
863 |
plot_all_channels('ELK4', logcounts_151507_filtered_no_outliers, color_arr, remove_outliers=FALSE) |
|
|
864 |
``` |
|
|
865 |
|
|
|
866 |
```{r, message=FALSE} |
|
|
867 |
plot_all_channels('PLIN4', logcounts_151507_filtered_no_outliers, color_arr, remove_outliers=FALSE) |
|
|
868 |
``` |