# bin/normalization.r

# install.packages("class")
# install.packages("gmodels")

# Min-max rescaling of a numeric vector onto [0, 1].
#
# x: numeric vector.
# Returns a vector of the same length as x in which the smallest non-missing
# value maps to 0 and the largest to 1. Missing values stay missing instead
# of turning the whole result into NA. When all non-missing values are equal
# the range is zero, so those values map to 0 rather than to NaN (0 / 0).
# Empty and all-NA input is returned unchanged.
normalize <- function(x) {
  if (length(x) == 0L || all(is.na(x))) {
    return(x)
  }

  bounds <- range(x, na.rm = TRUE)
  span <- bounds[2] - bounds[1]

  # Exact comparison is intended: only a true zero range divides by zero.
  # isTRUE() keeps a non-finite span (infinite input) from breaking `if`.
  if (isTRUE(span == 0)) {
    return(ifelse(is.na(x), NA_real_, 0))
  }

  (x - bounds[1]) / span
}


# Encode a vector of labels as numeric category codes.
# Approach taken from https://stats.stackexchange.com/a/17995
#
# x_array: vector of values (typically character strings).
# Returns a double vector of the same length as x_array: each element is the
# 1-based position of its value among the levels that as.factor() derives
# from x_array. NA stays NA.
fromStringToNumeric <- function(x_array) {
  # A factor already stores these integer codes, so relabelling its levels
  # as 1..k beforehand is not needed to obtain them.
  category_codes <- as.integer(as.factor(x_array))
  as.numeric(category_codes)
}


cat("[Reading the data file]\n")
lung_cancer_data <- read.csv(
  "../data/LungCancerDataset_AllRecords.csv",
  stringsAsFactors = FALSE
)

# Shape of the raw table; every column but one is counted as a feature.
num_of_instances <- nrow(lung_cancer_data)
num_of_columns_original <- ncol(lung_cancer_data)
num_of_features_original <- num_of_columns_original - 1

# Untouched copy of the data as read from disk.
lung_cancer_data_original <- lung_cancer_data

# Re-create column M under the name Metastasis and drop the original M.
# The new column is appended after the existing ones (assuming no column is
# already called Metastasis).
lung_cancer_data$Metastasis <- lung_cancer_data$M
lung_cancer_data$M <- NULL

print("M0 = 0 = tumor NOT spread to distant organs")
print("M1 = 1 = tumor spread to distant organs")

# Number of patients per Metastasis label.
table(lung_cancer_data$Metastasis)

# Encode the labels as numbers: "M0" -> 0, "M1" -> 1, anything else -> NA.
lung_cancer_data$Metastasis <-
  match(as.character(lung_cancer_data$Metastasis), c("M0", "M1")) - 1

colnames(lung_cancer_data)

# Working copy from which columns are removed and recoded further down.
lung_cancer_data_num <- lung_cancer_data

# Drop every column that holds one single value for all records (the
# original script singled out SiterecwithKaposiandmesothelioma as such a
# column).
#
# The test used to be `table(column) == num_of_instances`, which yields one
# logical per distinct value; `if` rejects a condition of length > 1 with an
# error since R 4.2.0, and an all-NA column produced a zero-length condition.
# Counting distinct values gives exactly one TRUE/FALSE per column. A column
# that mixes one value with NAs still counts as non-constant, as before.
has_single_value <- vapply(
  lung_cancer_data,
  function(column) length(unique(column)) <= 1L,
  logical(1)
)

for (i in which(has_single_value)) {
  cat("The column ", colnames(lung_cancer_data)[i], "[", i, "] has only one value so will be deleted\n")
}

# Remove by name instead of by a hand-maintained running index.
single_value_columns <- colnames(lung_cancer_data)[has_single_value]
lung_cancer_data_num <-
  lung_cancer_data_num[!(colnames(lung_cancer_data_num) %in% single_value_columns)]

# The Stage column is removed from the working copy as well.
lung_cancer_data_num$Stage <- NULL

# Column bookkeeping after the removals: every column except one (the
# target) is counted as a feature.
num_of_columns <- ncol(lung_cancer_data_num)
num_of_features <- num_of_columns - 1

# Exact name comparison: grep() matches substrings, so it would also report
# any other column whose name merely contains "Metastasis".
target_column_index <- which(colnames(lung_cancer_data_num) == "Metastasis")

cat("num_of_features = ", num_of_features, "\n")
cat("the target is lung_cancer_data_num$Metastasis, column index =", target_column_index, "\n")

# Recode the first num_of_features columns (all but the last one) as numeric
# category codes.
# seq_len() yields an empty sequence when there are no features, whereas
# 1:0 would iterate over 1 and 0.
# NOTE(review): this assumes the target is the last column, and columns that
# are already numeric get recoded too (they become the position of each
# distinct value among the sorted values) - confirm both are intended.
for (i in seq_len(max(num_of_features, 0))) {
  lung_cancer_data_num[, i] <- fromStringToNumeric(lung_cancer_data_num[, i])
}

# Copy the target column over from lung_cancer_data.
lung_cancer_data_num$Metastasis <- lung_cancer_data$Metastasis

# lung_cancer_data_num <- lung_cancer_data_num[sample(nrow(lung_cancer_data_num)),] # shuffle the rows

# Percentage of records per Metastasis value, rounded to one decimal place
# (hence digits = 1).
round(prop.table(table(lung_cancer_data_num$Metastasis)) * 100, digits = 1)

cat("[Normalizing the values of the data file (except the Metastasis target column)]\n")

# Rescale each of the first num_of_features columns with normalize(), then
# attach the target column as it is, without rescaling.
lung_cancer_data_norm <- as.data.frame(
  lapply(lung_cancer_data_num[, 1:num_of_features, drop = FALSE], normalize)
)
lung_cancer_data_norm[["Metastasis"]] <- lung_cancer_data_num[["Metastasis"]]

names(lung_cancer_data_norm)

# Comma-separated output with a header row, no row names, and missing values
# written as empty fields.
write.table(
  lung_cancer_data_norm,
  file = "../data/LungCancerDataset_AllRecords_NORM.csv",
  sep = ",",
  na = "",
  row.names = FALSE,
  col.names = TRUE
)