st-histology-ml / Git / Diff of /Models.Rmd

Models:
DanielG/
st-histology-ml
Downloads: 1
Diff of /Models.Rmd [000000] .. [c9fae8]
Switch to side-by-side view

--- a
+++ b/Models.Rmd
@@ -0,0 +1,119 @@
+---
+title: "R Notebook"
+output: html_notebook
+---
+
+This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 
+
+Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 
+
+```{r}
+library(reticulate)
+library('caret')
+library(spatialLIBD)
+library(MLmetrics)
+
+py_install("pandas")
+py_install("numpy")
+py_install("scikit-learn")
+py_install("matplotlib")
+
+sk <- import("sklearn")
+pd <- import("pandas")
+np <- import("numpy")
+plt <- import("matplotlib")
+sk_model_selection <- import("sklearn.model_selection")
+sk_linear_model <- import("sklearn.linear_model")
+sk_preprocessing <- import("sklearn.preprocessing")
+pyplot <- import("matplotlib.pyplot")
+```
+
+## Logistic Regression Classification from RGBG/HSL Image Spaces
+```{r}
+y <- logcounts_151673_filtered
+sum(y[y > 0])
+y[!(y > 0)] <- 1
+```
+
+
+```{r}
+split_data <- function(spe, g_name, logcounts, color_arr, isBinary) {
+  gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
+  y <- logcounts[,gene_id]
+  if (isBinary) {
+    y[y > 0] <- 1
+  }
+  x <- color_arr
+  
+  temp <- sk_model_selection$train_test_split(x, y, test_size=0.1)
+  x_train <- temp[1]
+  x_test <- temp[2]
+  y_train <- temp[3]
+  y_test <- temp[4]
+  list(x_train, x_test, y_train, y_test)
+}
+
+evaluate_model <- function(y_predict, y_true, g_name) {
+  precision <- Precision(y_true, y_predict)
+  f1 <- F1_Score(y_true, y_predict)
+  recall <- Recall(y_true, y_predict)
+  accuracy <- Accuracy(y_predict, y_true)
+  print(cat(paste(g_name, paste("Accuracy: ", accuracy), paste('Recall: ', recall), paste('Precision: ', precision), paste('F1: ', f1), sep="\n")))
+
+}
+
+train <- function(spe, g_name, logcounts, color_arr, isBinary) {
+  split_data <- split_data(spe, g_name, logcounts, color_arr, isBinary)
+
+  x_train <- split_data[1][[1]][[1]]
+  x_test <- split_data[2][[1]][[1]]
+  y_train <- split_data[3][[1]][[1]]
+  y_test <- np$ravel(split_data[4][[1]][[1]])
+  
+  scaler <- sk_preprocessing$StandardScaler()
+  features_standardized <- scaler$fit_transform(x_train)
+  test_standardized <- scaler$transform(x_test)
+  
+  # Logistic regression for binary classification
+  clf <- sk_linear_model$LogisticRegression(random_state=as.integer(100))
+  clf$fit(features_standardized, np$ravel(y_train))
+  y_predict <- clf$predict(test_standardized)
+  
+  # Metrics for binary classification
+  evaluate_model(y_predict, y_test, g_name)
+  
+  # Find the top 10 most entries most likely to be + gene expression value and observe their ground truth
+  y_predict_proba <- clf$predict_proba(test_standardized)
+  temp <- order(y_predict_proba[,2],decreasing=T)[1:10]
+  print(paste("Accuracy of top 10 barcodes likely to have positive gene expression: ", sum(y_test[temp] == 1) / 10))
+  cat('\n')
+}
+```
+
+```{r}
+g_name <- "MOBP"
+logcounts <- logcounts_151673_filtered
+gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
+y <- logcounts[,gene_id]
+#y[y > 0] <- 1
+x <- data.frame(data[10], data[11])
+scaler <- sk_preprocessing$StandardScaler()
+features_standardized <- scaler$fit_transform(x)
+test_standardized <- scaler$transform(x)
+temp_df <- data.frame(test_standardized,y)
+ggplot(temp_df, aes(x = temp_df[,'X1'], y=temp_df[,'X2'])) + geom_point(aes(color=temp_df[,'y'])) + theme(aspect.ratio = 1) + ggtitle(paste("Green and Saturation Features For ",g_name))
+```
+
+```{r}
+# Get list of genes with strongest correlations
+top_genes <- rownames(top_corr)
+
+for (g_name in c("MOBP", "CD74")) {
+  train(spe, g_name, logcounts_151673_filtered, data.frame(data[7], data[11]), TRUE)
+}
+```
+```{r}
+
+```
+
+