# Utility helpers: two-decimal number formatting, dataset size / class
# imbalance reports, and a train/test splitter that controls the share of
# positive instances in the training set.

options(stringsAsFactors = FALSE)

# Format a number with two decimals.
#
# x: numeric value to round to two decimals and format.
# Returns the formatted character value, e.g. dec_two(3.14159) gives "3.14".
dec_two <- function(x) {
  format(round(x, 2), nsmall = 2)
}

# Format a single number with two decimals and an explicit leading sign.
#
# x: a single numeric value (the `if` below needs a length-one condition).
# Returns a string such as "+2.00" or "-1.50".
signed_dec_two <- function(x) {
  sign <- "+"
  if (x < 0) {
    sign <- "-"
  }

  # Format the magnitude only: formatting x itself would already include the
  # minus sign and produce a doubled sign ("--1.50") for negative input.
  paste0(sign, format(round(abs(x), 2), nsmall = 2))
}

# Print the number of rows and columns of a dataset.
#
# thisDataset: an object with dimensions (data frame or matrix).
dataset_dim_retriever <- function(thisDataset) {
  cat("[Dataset size]\n")
  cat("number of data instances (rows) =", nrow(thisDataset), "\n")
  cat("number of features (columns) =", ncol(thisDataset), "\n")
}

# Print, for a two-class vector, the size and percentage of each class.
#
# thisVector: vector expected to contain exactly two distinct values.
# If the vector does not contain exactly two distinct values a notice is
# printed and the function returns early (invisibly, NULL) without printing
# any statistics.
imbalance_retriever <- function(thisVector) {
  class_counts <- table(thisVector)

  if (length(class_counts) != 2) {
    print("This vector is not binary. The imbalance_retriever() function will stop here")
    # return() needs the parentheses: a bare `return` is just a reference to
    # the function object and does not leave the function.
    return(invisible(NULL))
  }

  cat("\n[Imbalance of this dataset]\n")

  # Percentages are relative to length(thisVector), as in the report header.
  for (class_index in 1:2) {
    cat("[class: ", names(class_counts)[class_index],
        " #elements = ", unname(class_counts[class_index]), "]\n", sep = "")
    cat(dec_two(unname(class_counts)[class_index] * 100 / length(thisVector)),
        "%\n", sep = "")
  }

  cat("\n")
}

# Split a dataset into a training set with a chosen share of positives and a
# test set made of all remaining rows.
#
# thisDataset:       data frame (or matrix) whose target column holds 1 for
#                    positive and 0 for negative instances; rows with any
#                    other target value are left out of both sets.
# target_index:      index of the target column.
# training_set_perc: percentage (0-100) of rows that go to the training set.
# INPUT_PERC_POS:    percentage (0-100) of the training set that must be
#                    positive instances.
# balancedFlag:      when TRUE, the larger class is first cut down to the size
#                    of the smaller one, so both classes have equal size
#                    before the split.
#
# Returns an unnamed list: [[1]] the shuffled training set, [[2]] the shuffled
# test set. Progress information is written with cat(). Row order depends on
# the random number generator (call set.seed() beforehand to reproduce it).
train_data_balancer <- function(thisDataset, target_index, training_set_perc,
                                INPUT_PERC_POS, balancedFlag) {

  # First n rows of `data`. seq_len() makes n == 0 select nothing (1:0 would
  # select row 1). Asking for more rows than exist takes all rows and warns
  # instead of silently creating all-NA rows.
  first_rows <- function(data, n, label) {
    if (n > nrow(data)) {
      warning("train_data_balancer(): requested ", n, " ", label,
              " training rows but only ", nrow(data), " are available",
              call. = FALSE)
      n <- nrow(data)
    }
    data[seq_len(n), , drop = FALSE]
  }

  # Every row after the first n of `data`; zero rows when n >= nrow(data)
  # ((n + 1):nrow(data) would count backwards in that case).
  remaining_rows <- function(data, n) {
    data[seq_len(nrow(data)) > n, , drop = FALSE]
  }

  # Rows of `data` in random order.
  shuffle_rows <- function(data) {
    data[sample(nrow(data)), , drop = FALSE]
  }

  cat("\ntrain_data_balancer() function\n")

  thisDatasetSize <- nrow(thisDataset)

  training_set_numb_of_ele <- round(training_set_perc * thisDatasetSize / 100, 0)
  cat("\nThe training set will contain ", training_set_numb_of_ele,
      " items (", training_set_perc, "%) of the data instances\n", sep = "")

  test_set_perc <- 100 - training_set_perc
  test_set_numb_of_ele <- thisDatasetSize - training_set_numb_of_ele
  cat("The test set will contain ", test_set_numb_of_ele,
      " items (", test_set_perc, "%) of the data instances\n", sep = "")

  # Split positive subset and negative subset; drop = FALSE keeps the
  # two-dimensional shape even for a single column or a single matching row.
  target_values <- thisDataset[, target_index]
  positive_subset <- thisDataset[is.element(target_values, 1), , drop = FALSE]
  negative_subset <- thisDataset[is.element(target_values, 0), , drop = FALSE]

  # Shuffle each class so the rows taken for training are a random pick
  positive_subset <- shuffle_rows(positive_subset)
  negative_subset <- shuffle_rows(negative_subset)

  positiveSetSize <- nrow(positive_subset)
  negativeSetSize <- nrow(negative_subset)

  cat("\noriginal \n", sep = "")
  cat("positiveSetSize = ", positiveSetSize, "\n", sep = "")
  cat("negativeSetSize = ", negativeSetSize, "\n", sep = "")

  # If balancedFlag then 50% positives and 50% negatives.
  # `== TRUE` is kept on purpose so that values such as 1 or "TRUE" passed by
  # existing callers keep enabling the balanced mode.
  if (balancedFlag == TRUE) {

    minorClassSize <- min(positiveSetSize, negativeSetSize)

    positive_subset <- positive_subset[seq_len(minorClassSize), , drop = FALSE]
    negative_subset <- negative_subset[seq_len(minorClassSize), , drop = FALSE]

    positiveSetSize <- nrow(positive_subset)
    negativeSetSize <- nrow(negative_subset)
    cat("\n(balancedFlag == TRUE) \n", sep = "")

    cat("positiveSetSize = ", positiveSetSize, "\n", sep = "")
    cat("negativeSetSize = ", negativeSetSize, "\n", sep = "")

    # The set sizes are recomputed on the reduced, balanced data
    balanced_size <- positiveSetSize + negativeSetSize
    training_set_numb_of_ele <- round(balanced_size * training_set_perc / 100, 0)
    test_set_numb_of_ele <- balanced_size - training_set_numb_of_ele

    cat("\nThe training set will contain ", training_set_numb_of_ele,
        " items (", training_set_perc, "%) of the data instances\n", sep = "")
    cat("The test set will contain ", test_set_numb_of_ele,
        " items (", test_set_perc, "%) of the data instances\n", sep = "")
  }

  # Training set: the first rows of each (shuffled) class, INPUT_PERC_POS % of
  # them positive and the rest negative.
  train_set_num_of_positives <- round(training_set_numb_of_ele * (INPUT_PERC_POS / 100), 0)
  train_set_num_of_negatives <- round(training_set_numb_of_ele - train_set_num_of_positives, 0)

  trainPosComponent <- first_rows(positive_subset, train_set_num_of_positives, "positive")
  trainNegComponent <- first_rows(negative_subset, train_set_num_of_negatives, "negative")
  newTrainingSet <- shuffle_rows(rbind(trainPosComponent, trainNegComponent))

  # Test set: all the rest, i.e. the rows of each class not used for training.
  testPosComponent <- remaining_rows(positive_subset, train_set_num_of_positives)
  testNegComponent <- remaining_rows(negative_subset, train_set_num_of_negatives)
  newTestSet <- shuffle_rows(rbind(testPosComponent, testNegComponent))

  return(list(newTrainingSet, newTestSet))
}