120 lines (96 with data), 3.7 kB
---
title: "R Notebook"
output: html_notebook
---
This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*.
```{r}
library(reticulate)
library('caret')
library(spatialLIBD)
library(MLmetrics)
py_install("pandas")
py_install("numpy")
py_install("scikit-learn")
py_install("matplotlib")
sk <- import("sklearn")
pd <- import("pandas")
np <- import("numpy")
plt <- import("matplotlib")
sk_model_selection <- import("sklearn.model_selection")
sk_linear_model <- import("sklearn.linear_model")
sk_preprocessing <- import("sklearn.preprocessing")
pyplot <- import("matplotlib.pyplot")
```
## Logistic Regression Classification from RGBG/HSL Image Spaces
```{r}
y <- logcounts_151673_filtered
sum(y[y > 0])
y[!(y > 0)] <- 1
```
```{r}
split_data <- function(spe, g_name, logcounts, color_arr, isBinary) {
gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
y <- logcounts[,gene_id]
if (isBinary) {
y[y > 0] <- 1
}
x <- color_arr
temp <- sk_model_selection$train_test_split(x, y, test_size=0.1)
x_train <- temp[1]
x_test <- temp[2]
y_train <- temp[3]
y_test <- temp[4]
list(x_train, x_test, y_train, y_test)
}
evaluate_model <- function(y_predict, y_true, g_name) {
precision <- Precision(y_true, y_predict)
f1 <- F1_Score(y_true, y_predict)
recall <- Recall(y_true, y_predict)
accuracy <- Accuracy(y_predict, y_true)
print(cat(paste(g_name, paste("Accuracy: ", accuracy), paste('Recall: ', recall), paste('Precision: ', precision), paste('F1: ', f1), sep="\n")))
}
train <- function(spe, g_name, logcounts, color_arr, isBinary) {
split_data <- split_data(spe, g_name, logcounts, color_arr, isBinary)
x_train <- split_data[1][[1]][[1]]
x_test <- split_data[2][[1]][[1]]
y_train <- split_data[3][[1]][[1]]
y_test <- np$ravel(split_data[4][[1]][[1]])
scaler <- sk_preprocessing$StandardScaler()
features_standardized <- scaler$fit_transform(x_train)
test_standardized <- scaler$transform(x_test)
# Logistic regression for binary classification
clf <- sk_linear_model$LogisticRegression(random_state=as.integer(100))
clf$fit(features_standardized, np$ravel(y_train))
y_predict <- clf$predict(test_standardized)
# Metrics for binary classification
evaluate_model(y_predict, y_test, g_name)
# Find the top 10 most entries most likely to be + gene expression value and observe their ground truth
y_predict_proba <- clf$predict_proba(test_standardized)
temp <- order(y_predict_proba[,2],decreasing=T)[1:10]
print(paste("Accuracy of top 10 barcodes likely to have positive gene expression: ", sum(y_test[temp] == 1) / 10))
cat('\n')
}
```
```{r}
g_name <- "MOBP"
logcounts <- logcounts_151673_filtered
gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
y <- logcounts[,gene_id]
#y[y > 0] <- 1
x <- data.frame(data[10], data[11])
scaler <- sk_preprocessing$StandardScaler()
features_standardized <- scaler$fit_transform(x)
test_standardized <- scaler$transform(x)
temp_df <- data.frame(test_standardized,y)
ggplot(temp_df, aes(x = temp_df[,'X1'], y=temp_df[,'X2'])) + geom_point(aes(color=temp_df[,'y'])) + theme(aspect.ratio = 1) + ggtitle(paste("Green and Saturation Features For ",g_name))
```
```{r}
# Get list of genes with strongest correlations
top_genes <- rownames(top_corr)
for (g_name in c("MOBP", "CD74")) {
train(spe, g_name, logcounts_151673_filtered, data.frame(data[7], data[11]), TRUE)
}
```
```{r}
```