[868c5d]: / bin / utils.r

Download this file

161 lines (114 with data), 6.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Session-wide setting applied when this file is sourced: keep character
# columns as plain strings instead of converting them to factors.
# (This is already the default since R 4.0; the explicit call matters on older R.)
options(stringsAsFactors = FALSE)
# Round a number to two decimals and format it as text, padding with zeros so
# that at least two decimal digits are shown (e.g. 2 -> "2.00").
# The formatted character value is returned; nothing is printed here.
dec_two <- function(x) {
  rounded <- round(x, digits = 2)
  format(rounded, nsmall = 2)
}
# Format a number with two decimals and an explicit leading sign:
# "+" for zero and positive values, "-" for negative values
# (e.g. 2 -> "+2.00", -1.5 -> "-1.50").
#
# x: numeric value; vectors are handled element by element.
#
# Returns a character vector of the same length as x. NA inputs give NA.
signed_dec_two <- function(x) {
  rounded <- round(x, digits = 2)
  sign_chr <- ifelse(x < 0, "-", "+")
  # Format the absolute value: format() already writes a "-" for negative
  # numbers, so formatting the signed value would produce a doubled sign
  # such as "--1.50". trim = TRUE avoids padding between sign and digits
  # when several values are formatted together.
  magnitude <- format(abs(rounded), nsmall = 2, trim = TRUE)
  out <- paste0(sign_chr, magnitude)
  out[is.na(x)] <- NA_character_
  out
}
# Print the size of a dataset as reported by dim(): first the number of rows
# (data instances), then the number of columns (features).
# Called for its printed output only.
dataset_dim_retriever <- function(thisDataset) {
  dims <- dim(thisDataset)
  cat("[Dataset size]\n")
  cat("number of data instances (rows) =", dims[1], "\n")
  cat("number of features (columns) =", dims[2], "\n")
}
# Print the class imbalance of a binary vector: for each of its two classes,
# the class label, the number of elements and the percentage of the vector.
#
# thisVector: vector expected to hold exactly two distinct values.
#
# If the vector does not contain exactly two classes, a notice is printed and
# the function returns right away without printing any rates.
# Returns NULL invisibly; the function is called for its printed output.
imbalance_retriever <- function(thisVector) {
  class_counts <- table(thisVector)
  if (length(class_counts) != 2) {
    print("This vector is not binary. The imbalance_retriever() function will stop here");
    # A bare `return` only names the function and does not leave the body;
    # it must be called to actually stop here.
    return(invisible(NULL))
  }
  class_labels <- names(class_counts)
  class_sizes <- as.vector(class_counts)
  # Percentages are taken over the full vector length. table() leaves NA
  # values out of the counts, so with NAs present the two rates sum to < 100.
  total <- length(thisVector)
  cat("\n[Imbalance of this dataset]\n")
  for (k in seq_along(class_sizes)) {
    cat("[class: ", class_labels[k], " #elements = ", class_sizes[k], "]\n", sep = "")
    cat(dec_two(class_sizes[k] * 100 / total), "%\n", sep = "")
  }
  cat("\n")
}
# Split a dataset with a 0/1 target column into a training set containing a
# chosen share of positive instances and a test set holding the remaining rows.
#
# Arguments:
#   thisDataset        data frame (or matrix) with one row per data instance
#   target_index       column holding the target; rows whose target is 1 are
#                      the positives, rows whose target is 0 the negatives,
#                      rows with any other value are left out of both sets
#   training_set_perc  percentage (0-100) of the instances for the training set
#   INPUT_PERC_POS     percentage (0-100) of the training set made of positives
#   balancedFlag       if TRUE, the larger class is first cut down to the size
#                      of the smaller one, and the set sizes are recomputed
#
# Returns an unnamed list: [[1]] the training set, [[2]] the test set, both
# with their rows in random order. Sizes are reported with cat().
# Rows are shuffled with sample(), so set a seed beforehand for a
# reproducible split.
#
# Stops with an error when the requested training set would need more
# positives or negatives than the data contains.
train_data_balancer <- function(thisDataset, target_index, training_set_perc, INPUT_PERC_POS, balancedFlag) {
  stopifnot(
    is.numeric(training_set_perc), length(training_set_perc) == 1,
    training_set_perc >= 0, training_set_perc <= 100,
    is.numeric(INPUT_PERC_POS), length(INPUT_PERC_POS) == 1,
    INPUT_PERC_POS >= 0, INPUT_PERC_POS <= 100
  )

  # Row helpers. seq_len() is used instead of 1:k / (k + 1):n because those
  # count backwards when k is 0 or k equals n and would pick unwanted rows.
  # drop = FALSE keeps the tabular shape even for a single row.
  shuffle_rows <- function(d) d[sample(nrow(d)), , drop = FALSE]
  first_rows <- function(d, k) d[seq_len(k), , drop = FALSE]
  rows_after <- function(d, k) d[k + seq_len(nrow(d) - k), , drop = FALSE]

  cat("\ntrain_data_balancer() function\n")

  thisDatasetSize <- nrow(thisDataset)
  training_set_numb_of_ele <- round(training_set_perc * thisDatasetSize / 100, 0)
  cat("\nThe training set will contain ", training_set_numb_of_ele, " items (", training_set_perc, "%) of the data instances\n", sep = "")

  test_set_perc <- 100 - training_set_perc
  test_set_numb_of_ele <- thisDatasetSize - training_set_numb_of_ele
  cat("The test set will contain ", test_set_numb_of_ele, " items (", test_set_perc, "%) of the data instances\n", sep = "")

  # Split into positive and negative subsets, each in random row order
  target_values <- thisDataset[, target_index]
  positive_subset <- shuffle_rows(thisDataset[target_values %in% 1, , drop = FALSE])
  negative_subset <- shuffle_rows(thisDataset[target_values %in% 0, , drop = FALSE])

  positiveSetSize <- nrow(positive_subset)
  negativeSetSize <- nrow(negative_subset)

  cat("\noriginal \n", sep = "")
  cat("positiveSetSize = ", positiveSetSize, "\n", sep = "")
  cat("negativeSetSize = ", negativeSetSize, "\n", sep = "")

  # With balancedFlag, keep as many rows of each class as the smaller class has
  # and recompute the set sizes on the reduced data. as.logical() keeps
  # accepting 1 as TRUE, like the `== TRUE` comparison did.
  if (isTRUE(as.logical(balancedFlag))) {
    minorClassSize <- min(positiveSetSize, negativeSetSize)
    positive_subset <- first_rows(positive_subset, minorClassSize)
    negative_subset <- first_rows(negative_subset, minorClassSize)

    positiveSetSize <- nrow(positive_subset)
    negativeSetSize <- nrow(negative_subset)

    cat("\n(balancedFlag == TRUE) \n", sep = "")
    cat("positiveSetSize = ", positiveSetSize, "\n", sep = "")
    cat("negativeSetSize = ", negativeSetSize, "\n", sep = "")

    training_set_numb_of_ele <- round((positiveSetSize + negativeSetSize) * training_set_perc / 100, 0)
    test_set_numb_of_ele <- (positiveSetSize + negativeSetSize) - training_set_numb_of_ele

    cat("\nThe training set will contain ", training_set_numb_of_ele, " items (", training_set_perc, "%) of the data instances\n", sep = "")
    cat("The test set will contain ", test_set_numb_of_ele, " items (", test_set_perc, "%) of the data instances\n", sep = "")
  }

  # Number of positives / negatives that go into the training set
  train_set_num_of_positives <- round(training_set_numb_of_ele * (INPUT_PERC_POS / 100), 0)
  train_set_num_of_negatives <- training_set_numb_of_ele - train_set_num_of_positives

  # Indexing past the end of a subset would silently add all-NA rows, so an
  # unachievable composition is reported instead.
  if (train_set_num_of_positives > positiveSetSize ||
      train_set_num_of_negatives > negativeSetSize) {
    stop(
      "train_data_balancer(): the training set needs ",
      train_set_num_of_positives, " positives and ",
      train_set_num_of_negatives, " negatives, but only ",
      positiveSetSize, " positives and ",
      negativeSetSize, " negatives are available",
      call. = FALSE
    )
  }

  # Training set: the first rows of each (already shuffled) subset, mixed
  newTrainingSet <- shuffle_rows(rbind(
    first_rows(positive_subset, train_set_num_of_positives),
    first_rows(negative_subset, train_set_num_of_negatives)
  ))

  # Test set: all the remaining rows of each subset, mixed
  newTestSet <- shuffle_rows(rbind(
    rows_after(positive_subset, train_set_num_of_positives),
    rows_after(negative_subset, train_set_num_of_negatives)
  ))

  return(list(newTrainingSet, newTestSet))
}