lung-cancer-metastasis-pr / Git / Diff of /bin/utils.r

Models:

RichardZick/

lung-cancer-metastasis-pr

Downloads: 1

Diff of /bin/utils.r [000000] .. [868c5d]

Switch to unified view

 b/bin/utils.r
+# Session-wide setting: keep character columns as strings instead of turning
+# them into factors when data frames are built. This is already the default
+# from R 4.0.0 on, so the call only matters on older R versions.
+options(stringsAsFactors = FALSE)
+# Format a number as text with two digits after the decimal point.
+# The value is rounded to two decimals first; `nsmall = 2` then keeps the
+# trailing zeros (for example 3 becomes "3.00").
+dec_two <- function(x) {
+  rounded <- round(x, 2)
+  format(rounded, nsmall = 2)
+}
+# Format a single number with an explicit sign and two decimals,
+# for example 1.5 -> "+1.50" and -1.5 -> "-1.50". Zero is shown with "+".
+# `x` must be a single non-missing number because it is tested in `if`.
+signed_dec_two <- function(x) {
+  sign_char <- if (x < 0) "-" else "+"
+  # Format the magnitude only: formatting the signed value would add a
+  # second minus sign for negative input ("--1.50").
+  paste0(sign_char, format(round(abs(x), 2), nsmall = 2))
+}
+# Print the size of a dataset: the number of rows (data instances) and the
+# number of columns (features), as reported by dim().
+dataset_dim_retriever <- function(thisDataset) {
+  cat("[Dataset size]\n")
+  dataset_shape <- dim(thisDataset)
+  cat("number of data instances (rows) =", dataset_shape[1], "\n")
+  cat("number of features (columns) =", dataset_shape[2], "\n")
+}
+# Print the class imbalance of a binary vector: for each of the two classes,
+# its label, its number of elements and its percentage of the vector.
+# If the vector does not hold exactly two distinct values, a message is
+# printed and nothing else is reported. Returns NULL invisibly.
+imbalance_retriever <- function(thisVector) {
+  class_counts <- table(thisVector)
+  if (length(class_counts) != 2) {
+    print("This vector is not binary. The imbalance_retriever() function will stop here")
+    # `return` has to be called: a bare `return` only names the function and
+    # lets execution fall through to the report below.
+    return(invisible(NULL))
+  }
+  class_labels <- names(class_counts)
+  class_sizes <- as.vector(class_counts)
+  # Percentages are relative to the full vector length, while table() leaves
+  # NA values out of the counts by default.
+  total <- length(thisVector)
+  cat("\n[Imbalance of this dataset]\n")
+  for (i in 1:2) {
+    cat("[class: ", class_labels[i], "  #elements = ", class_sizes[i], "]\n", sep = "")
+    cat(dec_two(class_sizes[i] * 100 / total), "%\n", sep = "")
+  }
+  cat("\n")
+  invisible(NULL)
+}
+# Build a training set with a chosen share of positive instances, and a test
+# set made of the remaining rows.
+#
+# Arguments:
+#   thisDataset        data frame (or matrix) with one data instance per row
+#   target_index       column index of the binary target (1 = positive,
+#                      0 = negative)
+#   training_set_perc  percentage (0-100) of rows that go to the training set
+#   INPUT_PERC_POS     percentage (0-100) of the training set that must be
+#                      positive
+#   balancedFlag       if TRUE, both classes are first truncated to the size
+#                      of the smaller one
+#
+# Returns an unnamed list: [[1]] is the shuffled training set, [[2]] the
+# shuffled test set. Rows whose target is neither 0 nor 1 end up in neither.
+# Progress information is printed with cat(). Rows are permuted with
+# sample.int(), so call set.seed() beforehand for a reproducible split.
+train_data_balancer <- function(thisDataset, target_index, training_set_perc, INPUT_PERC_POS, balancedFlag) {
+  if (training_set_perc < 0 || training_set_perc > 100) {
+    stop("training_set_perc must be between 0 and 100", call. = FALSE)
+  }
+  if (INPUT_PERC_POS < 0 || INPUT_PERC_POS > 100) {
+    stop("INPUT_PERC_POS must be between 0 and 100", call. = FALSE)
+  }
+  cat("\ntrain_data_balancer() function\n")
+  thisDatasetSize <- nrow(thisDataset)
+  training_set_numb_of_ele <- round(training_set_perc * thisDatasetSize / 100, 0)
+  cat("\nThe training set will contain ", training_set_numb_of_ele, " items (", training_set_perc, "%) of the data instances\n", sep = "")
+  test_set_perc <- 100 - training_set_perc
+  test_set_numb_of_ele <- thisDatasetSize - training_set_numb_of_ele
+  cat("The test set will contain ", test_set_numb_of_ele, " items (", test_set_perc, "%) of the data instances\n", sep = "")
+  # Split into positive and negative rows. `drop = FALSE` keeps the
+  # two-dimensional shape even when a single row or column is selected.
+  target_values <- thisDataset[, target_index]
+  positive_subset <- thisDataset[target_values %in% 1, , drop = FALSE]
+  negative_subset <- thisDataset[target_values %in% 0, , drop = FALSE]
+  # Shuffle each class so the rows taken below are a random sample.
+  positive_subset <- positive_subset[sample.int(nrow(positive_subset)), , drop = FALSE]
+  negative_subset <- negative_subset[sample.int(nrow(negative_subset)), , drop = FALSE]
+  positiveSetSize <- nrow(positive_subset)
+  negativeSetSize <- nrow(negative_subset)
+  cat("\noriginal \n", sep = "")
+  cat("positiveSetSize = ", positiveSetSize, "\n", sep = "")
+  cat("negativeSetSize = ", negativeSetSize, "\n", sep = "")
+  # With balancedFlag, keep as many rows of each class as the smaller class has.
+  if (balancedFlag == TRUE) {
+    minorClassSize <- min(positiveSetSize, negativeSetSize)
+    # seq_len() selects nothing when the smaller class is empty; 1:0 would
+    # select row 1.
+    positive_subset <- positive_subset[seq_len(minorClassSize), , drop = FALSE]
+    negative_subset <- negative_subset[seq_len(minorClassSize), , drop = FALSE]
+    positiveSetSize <- nrow(positive_subset)
+    negativeSetSize <- nrow(negative_subset)
+    cat("\n(balancedFlag == TRUE) \n", sep = "")
+    cat("positiveSetSize = ", positiveSetSize, "\n", sep = "")
+    cat("negativeSetSize = ", negativeSetSize, "\n", sep = "")
+    training_set_numb_of_ele <- round((positiveSetSize + negativeSetSize) * training_set_perc / 100, 0)
+    test_set_numb_of_ele <- (positiveSetSize + negativeSetSize) - training_set_numb_of_ele
+    cat("\nThe training set will contain ", training_set_numb_of_ele, " items (", training_set_perc, "%) of the data instances\n", sep = "")
+    cat("The test set will contain ", test_set_numb_of_ele, " items (", test_set_perc, "%) of the data instances\n", sep = "")
+  }
+  train_set_num_of_positives <- round(training_set_numb_of_ele * (INPUT_PERC_POS / 100), 0)
+  train_set_num_of_negatives <- training_set_numb_of_ele - train_set_num_of_positives
+  # A class cannot contribute more rows than it has; indexing past the end
+  # would create rows filled with NA.
+  if (train_set_num_of_positives > positiveSetSize) {
+    warning("train_data_balancer(): only ", positiveSetSize, " positive rows available, ",
+            train_set_num_of_positives, " requested for the training set", call. = FALSE)
+    train_set_num_of_positives <- positiveSetSize
+  }
+  if (train_set_num_of_negatives > negativeSetSize) {
+    warning("train_data_balancer(): only ", negativeSetSize, " negative rows available, ",
+            train_set_num_of_negatives, " requested for the training set", call. = FALSE)
+    train_set_num_of_negatives <- negativeSetSize
+  }
+  # The first rows of each shuffled class go to the training set, the rest
+  # to the test set. seq_len()/setdiff() stay correct when a count is zero
+  # or equals the class size, where 1:n and (n + 1):size run backwards.
+  train_pos_rows <- seq_len(train_set_num_of_positives)
+  train_neg_rows <- seq_len(train_set_num_of_negatives)
+  test_pos_rows <- setdiff(seq_len(positiveSetSize), train_pos_rows)
+  test_neg_rows <- setdiff(seq_len(negativeSetSize), train_neg_rows)
+  trainPosComponent <- positive_subset[train_pos_rows, , drop = FALSE]
+  trainNegComponent <- negative_subset[train_neg_rows, , drop = FALSE]
+  newTrainingSetTemp <- rbind(trainPosComponent, trainNegComponent)
+  newTrainingSet <- newTrainingSetTemp[sample.int(nrow(newTrainingSetTemp)), , drop = FALSE]
+  testPosComponent <- positive_subset[test_pos_rows, , drop = FALSE]
+  testNegComponent <- negative_subset[test_neg_rows, , drop = FALSE]
+  newTestSetTemp <- rbind(testPosComponent, testNegComponent)
+  newTestSet <- newTestSetTemp[sample.int(nrow(newTestSetTemp)), , drop = FALSE]
+  list(newTrainingSet, newTestSet)
+}