st-histology-ml / Git / [c9fae8] /Models.Rmd

Models:
DanielG/
st-histology-ml
Downloads: 1
[c9fae8]: / Models.Rmd
History
Download this file
120 lines (96 with data), 3.7 kB

---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
library(reticulate)
library('caret')
library(spatialLIBD)
library(MLmetrics)

py_install("pandas")
py_install("numpy")
py_install("scikit-learn")
py_install("matplotlib")

sk <- import("sklearn")
pd <- import("pandas")
np <- import("numpy")
plt <- import("matplotlib")
sk_model_selection <- import("sklearn.model_selection")
sk_linear_model <- import("sklearn.linear_model")
sk_preprocessing <- import("sklearn.preprocessing")
pyplot <- import("matplotlib.pyplot")
```

## Logistic Regression Classification from RGBG/HSL Image Spaces
```{r}
y <- logcounts_151673_filtered
sum(y[y > 0])
y[!(y > 0)] <- 1
```


```{r}
split_data <- function(spe, g_name, logcounts, color_arr, isBinary) {
  gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
  y <- logcounts[,gene_id]
  if (isBinary) {
    y[y > 0] <- 1
  }
  x <- color_arr
  
  temp <- sk_model_selection$train_test_split(x, y, test_size=0.1)
  x_train <- temp[1]
  x_test <- temp[2]
  y_train <- temp[3]
  y_test <- temp[4]
  list(x_train, x_test, y_train, y_test)
}

evaluate_model <- function(y_predict, y_true, g_name) {
  precision <- Precision(y_true, y_predict)
  f1 <- F1_Score(y_true, y_predict)
  recall <- Recall(y_true, y_predict)
  accuracy <- Accuracy(y_predict, y_true)
  print(cat(paste(g_name, paste("Accuracy: ", accuracy), paste('Recall: ', recall), paste('Precision: ', precision), paste('F1: ', f1), sep="\n")))

}

train <- function(spe, g_name, logcounts, color_arr, isBinary) {
  split_data <- split_data(spe, g_name, logcounts, color_arr, isBinary)

  x_train <- split_data[1][[1]][[1]]
  x_test <- split_data[2][[1]][[1]]
  y_train <- split_data[3][[1]][[1]]
  y_test <- np$ravel(split_data[4][[1]][[1]])
  
  scaler <- sk_preprocessing$StandardScaler()
  features_standardized <- scaler$fit_transform(x_train)
  test_standardized <- scaler$transform(x_test)
  
  # Logistic regression for binary classification
  clf <- sk_linear_model$LogisticRegression(random_state=as.integer(100))
  clf$fit(features_standardized, np$ravel(y_train))
  y_predict <- clf$predict(test_standardized)
  
  # Metrics for binary classification
  evaluate_model(y_predict, y_test, g_name)
  
  # Find the top 10 most entries most likely to be + gene expression value and observe their ground truth
  y_predict_proba <- clf$predict_proba(test_standardized)
  temp <- order(y_predict_proba[,2],decreasing=T)[1:10]
  print(paste("Accuracy of top 10 barcodes likely to have positive gene expression: ", sum(y_test[temp] == 1) / 10))
  cat('\n')
}
```

```{r}
g_name <- "MOBP"
logcounts <- logcounts_151673_filtered
gene_id <- rowData(spe)[rowData(spe)$gene_name == g_name,]$gene_id
y <- logcounts[,gene_id]
#y[y > 0] <- 1
x <- data.frame(data[10], data[11])
scaler <- sk_preprocessing$StandardScaler()
features_standardized <- scaler$fit_transform(x)
test_standardized <- scaler$transform(x)
temp_df <- data.frame(test_standardized,y)
ggplot(temp_df, aes(x = temp_df[,'X1'], y=temp_df[,'X2'])) + geom_point(aes(color=temp_df[,'y'])) + theme(aspect.ratio = 1) + ggtitle(paste("Green and Saturation Features For ",g_name))
```

```{r}
# Get list of genes with strongest correlations
top_genes <- rownames(top_corr)

for (g_name in c("MOBP", "CD74")) {
  train(spe, g_name, logcounts_151673_filtered, data.frame(data[7], data[11]), TRUE)
}
```
```{r}

```