Diff of /bin/normalization.r [000000] .. [868c5d]

Switch to unified view

a b/bin/normalization.r
1
# install.packages("class")
2
# install.packages("gmodels")
3
4
# Min-max normalization: linearly rescales a numeric vector so that its
# smallest value maps to 0 and its largest value maps to 1.
#
# x: numeric vector.
# Returns a numeric vector of the same length as x.
#   - empty input gives numeric(0) (no min()/max() warnings);
#   - a constant vector (max == min) gives all zeros instead of NaN;
#   - if x contains NA, min()/max() are NA and every element of the result
#     is NA.
normalize <- function(x) {
  if (length(x) == 0) {
    return(numeric(0))
  }

  lowest <- min(x)
  value_range <- max(x) - lowest

  # Zero range would mean dividing 0 by 0 (NaN) for every element.
  # x - lowest is already all zeros here and keeps any names x carries.
  if (!is.na(value_range) && value_range == 0) {
    return(x - lowest)
  }

  (x - lowest) / value_range
}
8
  
9
10
# Encodes the values of a vector (typically strings) as numeric category
# codes: each distinct value gets the 1-based position of its level in the
# factor built from the input, and NA stays NA.
# Approach taken from https://stats.stackexchange.com/a/17995
#
# x_array: vector of values to encode.
# Returns a numeric vector of the same length as x_array.
fromStringToNumeric <- function(x_array) {
  # as.numeric() on a factor yields its integer codes, which do not depend on
  # how the levels are labelled, so no relabelling step is required.
  as.numeric(as.factor(x_array))
}
20
21
22
# Load the raw records; text columns are kept as character vectors.
cat("[Reading the data file]\n")
lung_cancer_data <- read.csv(
  file = "../data/LungCancerDataset_AllRecords.csv",
  stringsAsFactors = FALSE
)

# Untouched copy of the table exactly as it was read.
lung_cancer_data_original <- lung_cancer_data

# Size of the raw table; all columns but one are counted as features.
num_of_instances <- nrow(lung_cancer_data)
num_of_columns_original <- ncol(lung_cancer_data)
num_of_features_original <- num_of_columns_original - 1

# Copy column M into a column called Metastasis, then drop M.
# A column that does not exist yet is appended after the existing ones.
lung_cancer_data$Metastasis <- lung_cancer_data$M
lung_cancer_data$M <- NULL

print("M0 = 0 = tumor NOT spread to distant organs")
print("M1 = 1 = tumor spread to distant organs")

# Number of patients per metastasis label (auto-printed at top level).
table(lung_cancer_data$Metastasis)

# Recode the labels as numbers: "M0" -> 0, "M1" -> 1.
# Any value other than "M0"/"M1" is not a declared level and becomes NA.
lung_cancer_data$Metastasis <- as.numeric(
  factor(lung_cancer_data$Metastasis, levels = c("M0", "M1"), labels = c("0", "1"))
) - 1

colnames(lung_cancer_data)

# Working copy that the following steps turn into an all-numeric table.
lung_cancer_data_num <- lung_cancer_data
47
48
# Let's remove every feature that has only one value
#lung_cancer_data$SiterecwithKaposiandmesothelioma <- NULL 

# Flag each column of lung_cancer_data whose frequency table has a single
# entry that accounts for every row. table() ignores NA by default, so a
# column containing any NA is never flagged.
is_constant_column <- logical(num_of_columns_original)
for (i in seq_len(num_of_columns_original)) {
  value_counts <- table(lung_cancer_data[[i]])

  # Test the number of distinct values first. Comparing the whole table with
  # num_of_instances yields one logical per distinct value, and if() rejects
  # a condition that is not of length one (an error since R 4.2.0; an all-NA
  # column would give a zero-length condition, which is also an error).
  if (length(value_counts) == 1 && value_counts[[1]] == num_of_instances) {
    cat("The column ", colnames(lung_cancer_data[i]), "[", i, "] has only one value so will be deleted\n")
    is_constant_column[i] <- TRUE
  }
}

# lung_cancer_data_num is a copy of lung_cancer_data at this point, so the
# flagged positions refer to the same columns in both tables.
lung_cancer_data_num <- lung_cancer_data_num[!is_constant_column]
62
63
# Drop the Stage column from the numeric working table.
# NOTE(review): presumably removed because staging already encodes the
# metastasis status - confirm with the dataset description.
lung_cancer_data_num$Stage <- NULL

# Every remaining column except one (the Metastasis target) is a feature.
num_of_columns <- ncol(lung_cancer_data_num)
num_of_features <- num_of_columns - 1

# Exact name comparison: a regular-expression search would also return the
# index of any other column whose name merely contains "Metastasis".
target_column_index <- which(colnames(lung_cancer_data_num) == "Metastasis")

cat("num_of_features = ", num_of_features, "\n")
cat("the target is lung_cancer_data_num$Metastasis, column index =", target_column_index, "\n")

# Replace each feature column by numeric category codes. The first
# num_of_features columns are treated as the features, which relies on the
# target being the last column. seq_len() makes this loop a no-op when there
# are no features (1:0 would iterate over 1 and 0).
for (i in seq_len(num_of_features)) {
  lung_cancer_data_num[[i]] <- fromStringToNumeric(lung_cancer_data_num[[i]])
}

# Put the 0/1 target back from lung_cancer_data.
lung_cancer_data_num$Metastasis <- lung_cancer_data$Metastasis
# lung_cancer_data_num <- lung_cancer_data_num[sample(nrow(lung_cancer_data_num)),] # shuffle the rows
79
80
81
# Share of each target class as a percentage, rounded to one decimal place
# (auto-printed at top level).
round(
  100 * prop.table(table(lung_cancer_data_num[["Metastasis"]])),
  digits = 1
)

cat("[Normalizing the values of the data file (except the Metastasis target column)]\n")

# Apply normalize() to each of the first num_of_features columns, then attach
# the target column without rescaling it.
lung_cancer_data_norm <- as.data.frame(
  lapply(lung_cancer_data_num[1:num_of_features], normalize)
)
lung_cancer_data_norm[["Metastasis"]] <- lung_cancer_data_num[["Metastasis"]]

colnames(lung_cancer_data_norm)

# Save the normalized table as comma-separated text with a header row,
# no row names, and NA written as an empty field.
write.table(
  lung_cancer_data_norm,
  file = "../data/LungCancerDataset_AllRecords_NORM.csv",
  sep = ",",
  na = "",
  row.names = FALSE,
  col.names = TRUE
)
90