|
a |
|
b/bin/knn.r |
|
|
1 |
# ---- Script setup ----------------------------------------------------------
# The relative paths used by this script ("./confusion_matrix_rates.r" and the
# "../data/..." CSV) are resolved against the current working directory, so
# the script has to be launched from the directory that contains it.
# The former `setwd(".")` call was a no-op (it re-selected the directory that
# was already current) and has been removed.

# Keep character columns as plain character vectors in data frames.
options(stringsAsFactors = FALSE)

# library("clusterSim")
library("PRROC")
library("e1071")

# Project-local helper file. Presumably it provides the evaluation helpers that
# this script calls later on (dataset_dim_retriever, imbalance_retriever, mcc,
# confusion_matrix_rates) -- TODO confirm against that file.
source("./confusion_matrix_rates.r")

# Decision threshold. It is not referenced in the visible part of this script;
# it is kept because code outside this view (e.g. the sourced helpers) may read
# it as a global.
threshold <- 0.5
|
|
9 |
|
|
|
10 |
# ---- Load, shuffle and subsample the dataset -------------------------------
dataFileName <- "../data/LungCancerDataset_AllRecords_NORM_27reduced_features.csv"
cat("dataFileName = ", dataFileName, "\n", sep="")

# `header` is spelled out in full instead of relying on partial argument
# matching (`head=`).
cancer_data_norm <- read.csv(
  file = dataFileName,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Shuffle the rows. No random seed is set in this script, so the row order
# (and therefore the subset and the splits derived from it) differs from run
# to run.
cancer_data_norm <- cancer_data_norm[sample(nrow(cancer_data_norm)), ]

totalElements <- nrow(cancer_data_norm)

# Work on at most 4000 rows. The size is capped at the number of available
# rows: indexing with 1:4000 on a smaller data frame would silently append
# all-NA rows to the dataset.
subsets_size <- min(4000, totalElements)

if (subsets_size != totalElements) {
  cat("!!! ATTENTION: We are running the method on a subset of the original dataset, \n", sep="")
  cat("!!! containing only ", subsets_size, " elements \n", sep="")
  cat("!!! instead of ", totalElements, " elements \n", sep="")
}

# seq_len() is safe even when subsets_size is 0 (1:0 would yield c(1, 0)).
cancer_data_norm <- cancer_data_norm[seq_len(subsets_size), ]

# Helpers that are not defined in this script (presumably they come from the
# sourced confusion_matrix_rates.r and report the dataset dimensions and the
# class imbalance of the `Metastasis` column -- TODO confirm).
dataset_dim_retriever(cancer_data_norm)
imbalance_retriever(cancer_data_norm$Metastasis)
|
|
30 |
|
|
|
31 |
|
|
|
32 |
# ---- Split the data into training / validation / test partitions -----------
# The label is read from the last column; every column before it is used as a
# feature.
split_total_rows <- nrow(cancer_data_norm)
target_index <- ncol(cancer_data_norm)

# Partition sizes, as percentages of the (already shuffled) dataset.
training_set_perce <- 60
cat("training_set_perce = ", training_set_perce, "% \n", sep="")
validation_set_perce <- 20
cat("validation_set_perce = ", validation_set_perce, "% \n", sep="")
test_set_perce <- 100 - training_set_perce - validation_set_perce
cat("test_set_perce = ", test_set_perce, "% \n", sep="")

# Row positions where one partition ends and the next one begins.
split_train_end <- round(split_total_rows * training_set_perce / 100)
split_validation_end <- round(
  split_total_rows * (training_set_perce + validation_set_perce) / 100
)

# Training set: the first 60% of the rows.
training_set_first_index <- 1
training_set_last_index <- split_train_end

# Validation set: the following 20% of the rows.
validation_set_first_index <- split_train_end + 1
validation_set_last_index <- split_validation_end

# Test set: the remaining rows.
test_set_first_index <- split_validation_end + 1
test_set_last_index <- split_total_rows

# Row and column selectors shared by the feature and the label subsets.
split_train_rows <- training_set_first_index:training_set_last_index
split_validation_rows <- validation_set_first_index:validation_set_last_index
split_test_rows <- test_set_first_index:test_set_last_index
split_feature_cols <- 1:(target_index - 1)

cat("[Creating the subsets for the values]\n")
cancer_data_train <- cancer_data_norm[split_train_rows, split_feature_cols]
cancer_data_validation <- cancer_data_norm[split_validation_rows, split_feature_cols]
cancer_data_test <- cancer_data_norm[split_test_rows, split_feature_cols]

cat("[Creating the subsets for the labels \"1\"-\"0\"]\n")
cancer_data_train_labels <- cancer_data_norm[split_train_rows, target_index]
cancer_data_validation_labels <- cancer_data_norm[split_validation_rows, target_index]
cancer_data_test_labels <- cancer_data_norm[split_test_rows, target_index]
|
|
62 |
|
|
|
63 |
|
|
|
64 |
library(class)
library(gmodels)

# ---- Hyper-parameter optimization: choose k on the validation set ----------
# k cannot usefully exceed the number of training points (class::knn only
# warns and clamps k in that case), so the search range is capped at the
# training-set size.
maxK <- min(100, length(cancer_data_train_labels))

# One MCC score per candidate k. This must be a *numeric* vector: storing the
# scores in a character vector makes max() compare them as strings, which
# ranks e.g. "NaN" or "5e-04" above "0.45" and selects the wrong k.
mcc_array <- rep(NA_real_, maxK)

cat("\n[Optimization of the hyper-parameter k start]\n")

# Optimization loop: for every candidate k, fit k-NN on the training set,
# predict the validation set and record the resulting MCC.
for (thisK in seq_len(maxK)) {
  cat("\n[Training the kNN model (with k=",thisK,") on training set & applying the kNN model to validation set]\n", sep="")

  cancer_data_validation_pred <- knn(
    train = cancer_data_train,
    test = cancer_data_validation,
    cl = cancer_data_train_labels,
    k = thisK
  )

  # knn() returns a factor. Convert through the level *labels* rather than the
  # internal level codes: `as.numeric(factor) - 1` only yields the right 0/1
  # values when both classes occur in the training labels.
  cancer_data_validation_pred_binary <- as.numeric(as.character(cancer_data_validation_pred))

  # mcc() is not defined in this script (presumably it comes from the sourced
  # confusion_matrix_rates.r and returns a single numeric value -- TODO confirm).
  mcc_outcome <- mcc(cancer_data_validation_labels, cancer_data_validation_pred_binary)
  cat("When k=",thisK,", the MCC value is ",mcc_outcome, "\t (worst possible: -1; best possible: +1)\n", sep="")

  mcc_array[thisK] <- mcc_outcome
}

# Select the k with the highest MCC. which.max() returns the first maximum and
# skips NA/NaN scores; it returns a zero-length result when nothing is
# comparable, which is reported explicitly instead of silently picking a k.
bestK <- which.max(mcc_array)
if (length(bestK) == 0) {
  stop("No valid MCC value was obtained for any k; cannot select the best k.",
       call. = FALSE)
}
bestMCC <- mcc_array[bestK]
cat("\nThe best k value is ", bestK,", corresponding to MCC=", mcc_array[bestK],"\n", sep="")

cat("[Optimization end]\n\n")
|
|
100 |
|
|
|
101 |
cat("\n @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ \n")

# ---- Final evaluation: apply k-NN with the selected k to the test set ------
cat("[Training the kNN model (with the OPTIMIZED hyper-parameter k=",bestK,") on training set & applying the kNN to the test set]\n", sep="")
cancer_data_test_pred <- knn(
  train = cancer_data_train,
  test = cancer_data_test,
  cl = cancer_data_train_labels,
  k = bestK
)

# knn() returns a factor. Convert through the level *labels* rather than the
# internal level codes: `as.numeric(factor) - 1` only reproduces the 0/1
# labels when both classes occur in the training labels.
cancer_data_test_pred <- as.numeric(as.character(cancer_data_test_pred))

# confusion_matrix_rates() is not defined in this script (presumably it comes
# from the sourced confusion_matrix_rates.r -- TODO confirm). It is called with
# the true labels, the predicted labels and a title string.
confusion_matrix_rates(cancer_data_test_labels, cancer_data_test_pred, "@@@ Test set @@@")
|
|
112 |
|