Diff of /bin/utils.r [000000] .. [868c5d]

Switch to unified view

a b/bin/utils.r
1
# Global session option: keep strings as character vectors instead of factors
# when data frames are built. This is already the default since R 4.0, so the
# call mainly matters when the script runs on older R versions.
options(stringsAsFactors = FALSE)
2
3
# function that prints two decimals of a number
4
# Format a number with exactly two decimal digits.
#
# x: numeric value to format.
# Returns the character representation of `x` rounded to two decimals,
# padded with trailing zeros when needed (e.g. 2 -> "2.00").
dec_two <- function(x) {
  rounded_value <- round(x, digits = 2)
  format(rounded_value, nsmall = 2)
}
7
8
# function that prints two decimals of a number with sign
9
# Format a single number with two decimal digits and an explicit sign.
#
# x: a single numeric value.
# Returns a character string such as "+2.00" or "-1.50". Zero is reported
# with a "+" sign.
signed_dec_two <- function(x) {
  sign <- if (x < 0) "-" else "+"

  # Format the magnitude only: format() already emits "-" for negative
  # numbers, so formatting `x` itself would produce a doubled sign ("--1.50").
  paste0(sign, format(round(abs(x), 2), nsmall = 2))
}
16
17
18
19
# Function that reads in a dataset and prints its number of rows and columns
20
# Print the size of a dataset.
#
# thisDataset: a two-dimensional object (data frame or matrix).
# Writes the number of rows and the number of columns to standard output.
# Returns NULL invisibly.
#
# The opening brace is kept on the header line so the body cannot be
# detached from the function definition.
dataset_dim_retriever <- function(thisDataset) {
  cat("[Dataset size]\n")
  cat("number of data instances (rows) =", nrow(thisDataset), "\n")
  cat("number of features (columns) =", ncol(thisDataset), "\n")
  invisible(NULL)
}
26
27
# Function that reads in a vector made of binary values and prints the imbalance rates
28
# Print the class imbalance of a binary vector.
#
# thisVector: a vector expected to contain exactly two distinct values.
# For each of the two classes, writes its label, its number of elements and
# its percentage of length(thisVector) to standard output.
# If the vector does not contain exactly two distinct values, a message is
# printed and the function returns without reporting anything else.
# Returns NULL invisibly.
#
# NOTE(review): table() ignores NA values while length() counts them, so the
# two percentages do not sum to 100 when the vector contains NAs.
imbalance_retriever <- function(thisVector) {
  class_table <- table(thisVector)

  if (length(class_table) != 2) {
    print("This vector is not binary. The imbalance_retriever() function will stop here")
    # return must be *called*; a bare `return` is just a reference to the
    # function object and does not leave the function.
    return(invisible(NULL))
  }

  class_names <- names(class_table)
  class_counts <- as.vector(class_table)
  total_elements <- length(thisVector)

  cat("\n[Imbalance of this dataset]\n")
  for (i in seq_along(class_counts)) {
    cat("[class: ", class_names[i], "  #elements = ", class_counts[i], "]\n", sep = "")
    cat(dec_two(class_counts[i] * 100 / total_elements), "%\n", sep = "")
    # The original layout separates the two class reports with a blank-ish line
    # only after the first class.
    if (i == 1) {
      cat("")
    }
  }

  cat("\n")
  invisible(NULL)
}
52
53
54
# Function that returns a more balanced training set
55
# Split a dataset into a training set with a chosen share of positives and a
# test set containing all remaining rows.
#
# thisDataset:       data frame (or matrix) whose target column holds 1 for
#                    positive and 0 for negative instances. Rows with any other
#                    target value are not placed in either output set.
# target_index:      column index (or name) of the target column.
# training_set_perc: percentage (0-100) of the instances to put in the
#                    training set.
# INPUT_PERC_POS:    percentage (0-100) of the training set that must be made
#                    of positive instances.
# balancedFlag:      if TRUE, both classes are first truncated to the size of
#                    the smaller class, and the training/test sizes are
#                    recomputed on that balanced pool.
#
# Progress information is written to standard output.
# Returns an unnamed list: [[1]] the shuffled training set, [[2]] the shuffled
# test set. Stops with an error when the requested number of training
# positives or negatives is negative or exceeds the rows available.
#
# NOTE(review): when balancedFlag is not TRUE, the sizes printed at the start
# are computed on nrow(thisDataset), which also counts rows whose target is
# neither 0 nor 1.
train_data_balancer <- function(thisDataset, target_index, training_set_perc, INPUT_PERC_POS, balancedFlag) {

  cat("\ntrain_data_balancer() function\n")

  thisDatasetSize <- nrow(thisDataset)

  training_set_numb_of_ele <- round(training_set_perc * thisDatasetSize / 100, 0)
  cat("\nThe training set will contain ", training_set_numb_of_ele, " items (", training_set_perc, "%) of the data instances\n", sep = "")

  test_set_perc <- 100 - training_set_perc
  test_set_numb_of_ele <- thisDatasetSize - training_set_numb_of_ele
  cat("The test set will contain ", test_set_numb_of_ele, " items (", test_set_perc, "%) of the data instances\n", sep = "")

  # Split negative subset and positive subset
  target_values <- thisDataset[, target_index]
  positive_subset <- thisDataset[target_values %in% 1, , drop = FALSE]
  negative_subset <- thisDataset[target_values %in% 0, , drop = FALSE]

  # Shuffle each class. The order of the random draws (positives, negatives,
  # training set, test set) is kept fixed so seeded runs stay reproducible.
  positive_subset <- positive_subset[sample.int(nrow(positive_subset)), , drop = FALSE]
  negative_subset <- negative_subset[sample.int(nrow(negative_subset)), , drop = FALSE]

  positiveSetSize <- nrow(positive_subset)
  negativeSetSize <- nrow(negative_subset)

  cat("\noriginal \n", sep = "")
  cat("positiveSetSize = ", positiveSetSize, "\n", sep = "")
  cat("negativeSetSize = ", negativeSetSize, "\n", sep = "")

  # if balancedFlag then 50% positives and 50% negatives
  if (isTRUE(as.logical(balancedFlag))) {

    minorClassSize <- min(positiveSetSize, negativeSetSize)

    # seq_len() instead of 1:n so an empty class yields zero rows, not row 1.
    positive_subset <- positive_subset[seq_len(minorClassSize), , drop = FALSE]
    negative_subset <- negative_subset[seq_len(minorClassSize), , drop = FALSE]

    positiveSetSize <- nrow(positive_subset)
    negativeSetSize <- nrow(negative_subset)
    cat("\n(balancedFlag == TRUE) \n", sep = "")

    cat("positiveSetSize = ", positiveSetSize, "\n", sep = "")
    cat("negativeSetSize = ", negativeSetSize, "\n", sep = "")

    training_set_numb_of_ele <- round((positiveSetSize + negativeSetSize) * training_set_perc / 100, 0)
    test_set_numb_of_ele <- (positiveSetSize + negativeSetSize) - training_set_numb_of_ele

    cat("\nThe training set will contain ", training_set_numb_of_ele, " items (", training_set_perc, "%) of the data instances\n", sep = "")
    cat("The test set will contain ", test_set_numb_of_ele, " items (", test_set_perc, "%) of the data instances\n", sep = "")
  }

  # Number of positive and negative rows that go into the training set.
  train_set_num_of_positives <- round(training_set_numb_of_ele * (INPUT_PERC_POS / 100), 0)
  train_set_num_of_negatives <- round(training_set_numb_of_ele - train_set_num_of_positives, 0)

  # Indexing past the end of a class would silently create rows full of NA,
  # so refuse requests that cannot be satisfied.
  if (train_set_num_of_positives < 0 || train_set_num_of_negatives < 0) {
    stop("train_data_balancer(): the requested training set composition is ",
         "negative; check training_set_perc and INPUT_PERC_POS", call. = FALSE)
  }
  if (train_set_num_of_positives > positiveSetSize) {
    stop("train_data_balancer(): ", train_set_num_of_positives,
         " positive training instances requested but only ", positiveSetSize,
         " are available", call. = FALSE)
  }
  if (train_set_num_of_negatives > negativeSetSize) {
    stop("train_data_balancer(): ", train_set_num_of_negatives,
         " negative training instances requested but only ", negativeSetSize,
         " are available", call. = FALSE)
  }

  # Training set: the first rows of each shuffled class.
  trainPosComponent <- positive_subset[seq_len(train_set_num_of_positives), , drop = FALSE]
  trainNegComponent <- negative_subset[seq_len(train_set_num_of_negatives), , drop = FALSE]
  newTrainingSetTemp <- rbind(trainPosComponent, trainNegComponent)
  newTrainingSet <- newTrainingSetTemp[sample.int(nrow(newTrainingSetTemp)), , drop = FALSE]

  # Test set: all the remaining rows of each class. `k + seq_len(n - k)` is
  # empty when the whole class was used for training, whereas `(k + 1):n`
  # would count backwards and leak a training row into the test set.
  test_pos_rows <- train_set_num_of_positives + seq_len(positiveSetSize - train_set_num_of_positives)
  test_neg_rows <- train_set_num_of_negatives + seq_len(negativeSetSize - train_set_num_of_negatives)
  testPosComponent <- positive_subset[test_pos_rows, , drop = FALSE]
  testNegComponent <- negative_subset[test_neg_rows, , drop = FALSE]

  newTestSetTemp <- rbind(testPosComponent, testNegComponent)
  newTestSet <- newTestSetTemp[sample.int(nrow(newTestSetTemp)), , drop = FALSE]

  return(list(newTrainingSet, newTestSet))
}
160