|
a |
|
b/Models.Rmd |
|
|
1 |
--- |
|
|
2 |
title: "R Notebook" |
|
|
3 |
output: html_notebook |
|
|
4 |
--- |
|
|
5 |
|
|
|
6 |
This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. |
|
|
7 |
|
|
|
8 |
Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. |
|
|
9 |
|
|
|
10 |
```{r} |
|
|
11 |
library(reticulate) |
|
|
12 |
library('caret') |
|
|
13 |
library(spatialLIBD) |
|
|
14 |
library(MLmetrics) |
|
|
15 |
|
|
|
16 |
py_install("pandas") |
|
|
17 |
py_install("numpy") |
|
|
18 |
py_install("scikit-learn") |
|
|
19 |
py_install("matplotlib") |
|
|
20 |
|
|
|
21 |
sk <- import("sklearn") |
|
|
22 |
pd <- import("pandas") |
|
|
23 |
np <- import("numpy") |
|
|
24 |
plt <- import("matplotlib") |
|
|
25 |
sk_model_selection <- import("sklearn.model_selection") |
|
|
26 |
sk_linear_model <- import("sklearn.linear_model") |
|
|
27 |
sk_preprocessing <- import("sklearn.preprocessing") |
|
|
28 |
pyplot <- import("matplotlib.pyplot") |
|
|
29 |
``` |
|
|
30 |
|
|
|
31 |
## Logistic Regression Classification from RGBG/HSL Image Spaces |
|
|
32 |
```{r} |
|
|
33 |
y <- logcounts_151673_filtered |
|
|
34 |
sum(y[y > 0]) |
|
|
35 |
y[!(y > 0)] <- 1 |
|
|
36 |
``` |
|
|
37 |
|
|
|
38 |
|
|
|
39 |
```{r} |
|
|
40 |
split_data <- function(spe, g_name, logcounts, color_arr, isBinary) { |
|
|
41 |
gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id |
|
|
42 |
y <- logcounts[,gene_id] |
|
|
43 |
if (isBinary) { |
|
|
44 |
y[y > 0] <- 1 |
|
|
45 |
} |
|
|
46 |
x <- color_arr |
|
|
47 |
|
|
|
48 |
temp <- sk_model_selection$train_test_split(x, y, test_size=0.1) |
|
|
49 |
x_train <- temp[1] |
|
|
50 |
x_test <- temp[2] |
|
|
51 |
y_train <- temp[3] |
|
|
52 |
y_test <- temp[4] |
|
|
53 |
list(x_train, x_test, y_train, y_test) |
|
|
54 |
} |
|
|
55 |
|
|
|
56 |
evaluate_model <- function(y_predict, y_true, g_name) { |
|
|
57 |
precision <- Precision(y_true, y_predict) |
|
|
58 |
f1 <- F1_Score(y_true, y_predict) |
|
|
59 |
recall <- Recall(y_true, y_predict) |
|
|
60 |
accuracy <- Accuracy(y_predict, y_true) |
|
|
61 |
print(cat(paste(g_name, paste("Accuracy: ", accuracy), paste('Recall: ', recall), paste('Precision: ', precision), paste('F1: ', f1), sep="\n"))) |
|
|
62 |
|
|
|
63 |
} |
|
|
64 |
|
|
|
65 |
train <- function(spe, g_name, logcounts, color_arr, isBinary) { |
|
|
66 |
split_data <- split_data(spe, g_name, logcounts, color_arr, isBinary) |
|
|
67 |
|
|
|
68 |
x_train <- split_data[1][[1]][[1]] |
|
|
69 |
x_test <- split_data[2][[1]][[1]] |
|
|
70 |
y_train <- split_data[3][[1]][[1]] |
|
|
71 |
y_test <- np$ravel(split_data[4][[1]][[1]]) |
|
|
72 |
|
|
|
73 |
scaler <- sk_preprocessing$StandardScaler() |
|
|
74 |
features_standardized <- scaler$fit_transform(x_train) |
|
|
75 |
test_standardized <- scaler$transform(x_test) |
|
|
76 |
|
|
|
77 |
# Logistic regression for binary classification |
|
|
78 |
clf <- sk_linear_model$LogisticRegression(random_state=as.integer(100)) |
|
|
79 |
clf$fit(features_standardized, np$ravel(y_train)) |
|
|
80 |
y_predict <- clf$predict(test_standardized) |
|
|
81 |
|
|
|
82 |
# Metrics for binary classification |
|
|
83 |
evaluate_model(y_predict, y_test, g_name) |
|
|
84 |
|
|
|
85 |
# Find the top 10 most entries most likely to be + gene expression value and observe their ground truth |
|
|
86 |
y_predict_proba <- clf$predict_proba(test_standardized) |
|
|
87 |
temp <- order(y_predict_proba[,2],decreasing=T)[1:10] |
|
|
88 |
print(paste("Accuracy of top 10 barcodes likely to have positive gene expression: ", sum(y_test[temp] == 1) / 10)) |
|
|
89 |
cat('\n') |
|
|
90 |
} |
|
|
91 |
``` |
|
|
92 |
|
|
|
93 |
```{r} |
|
|
94 |
g_name <- "MOBP" |
|
|
95 |
logcounts <- logcounts_151673_filtered |
|
|
96 |
gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id |
|
|
97 |
y <- logcounts[,gene_id] |
|
|
98 |
#y[y > 0] <- 1 |
|
|
99 |
x <- data.frame(data[10], data[11]) |
|
|
100 |
scaler <- sk_preprocessing$StandardScaler() |
|
|
101 |
features_standardized <- scaler$fit_transform(x) |
|
|
102 |
test_standardized <- scaler$transform(x) |
|
|
103 |
temp_df <- data.frame(test_standardized,y) |
|
|
104 |
ggplot(temp_df, aes(x = temp_df[,'X1'], y=temp_df[,'X2'])) + geom_point(aes(color=temp_df[,'y'])) + theme(aspect.ratio = 1) + ggtitle(paste("Green and Saturation Features For ",g_name)) |
|
|
105 |
``` |
|
|
106 |
|
|
|
107 |
```{r} |
|
|
108 |
# Get list of genes with strongest correlations |
|
|
109 |
top_genes <- rownames(top_corr) |
|
|
110 |
|
|
|
111 |
for (g_name in c("MOBP", "CD74")) { |
|
|
112 |
train(spe, g_name, logcounts_151673_filtered, data.frame(data[7], data[11]), TRUE) |
|
|
113 |
} |
|
|
114 |
``` |
|
|
115 |
```{r} |
|
|
116 |
|
|
|
117 |
``` |
|
|
118 |
|
|
|
119 |
|