# bin/svm.r — recovered from a web diff-viewer export ([000000] .. [868c5d]).
# Viewer chrome ("Switch to unified view", "a b/bin/svm.r") converted to this
# comment so the file parses as plain R.
1
setwd(".")
2
options(stringsAsFactors = FALSE)
3
#library("clusterSim")
4
library("e1071")
5
library("PRROC")
6
tau = 0.5
7
8
source("./confusion_matrix_rates.r")
9
10
dataFileName <- "../data/LungCancerDataset_AllRecords_NORM_27reduced_features.csv"
11
cat("dataFileName = ", dataFileName, "\n", sep="")
12
13
cancer_data_norm <- read.csv(file=dataFileName,head=TRUE,sep=",",stringsAsFactors=FALSE)
14
cancer_data_norm <- cancer_data_norm[sample(nrow(cancer_data_norm)),] # shuffle the rows
15
16
totalElements <- dim(cancer_data_norm)[1]
17
18
subsets_size <- 4000
19
20
if (subsets_size != totalElements) {
21
    cat("!!! ATTENTION: We are running the method on a subset of the original dataset, \n", sep="")
22
    cat("!!! containing only ", subsets_size, " elements \n", sep="")
23
    cat("!!! instead of ", totalElements, " elements \n", sep="")
24
}
25
26
cancer_data_norm <- cancer_data_norm[1:subsets_size, ]
27
28
# Dataset overview (helpers sourced from confusion_matrix_rates.r).
dataset_dim_retriever(cancer_data_norm)
imbalance_retriever(cancer_data_norm$Metastasis)

# The target label ("Metastasis") is stored in the last column.
target_index <- ncol(cancer_data_norm)

training_set_perce <- 60
cat("training_set_perce = ", training_set_perce, "% \n", sep="")
validation_set_perce <- 20
cat("validation_set_perce = ", validation_set_perce, "% \n", sep="")
test_set_perce <- 100 - training_set_perce - validation_set_perce
cat("test_set_perce = ", test_set_perce, "% \n", sep="")

# Hoist the row count and split boundaries: the original recomputed
# round(dim(...)[1]*perce/100) in several places with duplicated expressions
# that could silently diverge if one copy were edited.
num_rows <- nrow(cancer_data_norm)

# the training set is the first 60% of the whole dataset
training_set_first_index <- 1
training_set_last_index <- round(num_rows * training_set_perce / 100)

# the validation set is the following 20% of the whole dataset
validation_set_first_index <- training_set_last_index + 1
validation_set_last_index <- round(num_rows * (training_set_perce + validation_set_perce) / 100)

# the test set is the last 20% of the whole dataset
test_set_first_index <- validation_set_last_index + 1
test_set_last_index <- num_rows

cat("[Creating the subsets for the values]\n")
feature_cols <- 1:(target_index - 1)  # all columns except the target label
cancer_data_train <- cancer_data_norm[training_set_first_index:training_set_last_index, feature_cols]
cancer_data_validation <- cancer_data_norm[validation_set_first_index:validation_set_last_index, feature_cols]
cancer_data_test <- cancer_data_norm[test_set_first_index:test_set_last_index, feature_cols]

cat("[Creating the subsets for the labels \"1\"-\"0\"]\n")
cancer_data_train_labels <- cancer_data_norm[training_set_first_index:training_set_last_index, target_index]
cancer_data_validation_labels <- cancer_data_norm[validation_set_first_index:validation_set_last_index, target_index]
cancer_data_test_labels <- cancer_data_norm[test_set_first_index:test_set_last_index, target_index]
library(class)
library(gmodels)

# The k value must be lower than the size of the training set
# (leftover from the earlier k-NN version of this script; kept for reference).
maxK <- 10

# Candidate values for the SVM cost hyper-parameter C.
c_array <- c(0.001, 0.01, 0.1, 1, 10)

# BUG FIX: the original allocated character(length(maxK)), i.e. a *length-1
# character* vector (length(maxK) is 1). Numeric MCCs stored in it were
# coerced to strings, so the later max() compared them lexicographically.
# Allocate one numeric slot per candidate C instead.
mcc_array <- numeric(length(c_array))
mccCounter <- 1
cat("\n[Optimization of the hyper-parameter C start]\n")
# Optimization loop: for each candidate C, train an SVM on the training set,
# predict the validation set, binarize at tau, and record the MCC.
# Re-initialized here (not only at declaration) so this section is
# self-contained: the original pre-allocated a character(1) store whose
# string contents made max() compare lexicographically.
mcc_array <- numeric(length(c_array))
mccCounter <- 1

for (thisC in c_array) {
  cat("[Training the SVM model (with C=", thisC, ") on training set & applying the SVM model to validation set]\n", sep="")

  # NOTE(review): "method" is not an argument of e1071::svm() (the real
  # argument is "type"); it is silently absorbed by "...". Because the labels
  # are numeric, svm() performs eps-regression here, which is why the
  # continuous predictions are thresholded at tau below. Left unchanged to
  # preserve behavior — confirm this is intended.
  svm_model <- svm(cancer_data_train_labels ~ ., cost = thisC,
                   data = cancer_data_train,
                   method = "C-classification", kernel = "linear")

  cancer_data_validation_PRED <- predict(svm_model, cancer_data_validation)

  # Binarize the continuous regression output with the tau threshold.
  cancer_data_validation_pred_binary <- as.numeric(cancer_data_validation_PRED)
  cancer_data_validation_pred_binary[cancer_data_validation_pred_binary >= tau] <- 1
  cancer_data_validation_pred_binary[cancer_data_validation_pred_binary < tau] <- 0

  #  # Optional PR/ROC curves on the validation set (PRROC), kept disabled:
  #  fg_test <- cancer_data_validation_PRED[cancer_data_validation_labels==1]
  #  bg_test <- cancer_data_validation_PRED[cancer_data_validation_labels==0]
  #  pr_curve_val <- pr.curve(scores.class0 = fg_test, scores.class1 = bg_test, curve = F)
  #  roc_curve_val <- roc.curve(scores.class0 = fg_test, scores.class1 = bg_test, curve = F)

  mcc_outcome <- mcc(cancer_data_validation_labels, cancer_data_validation_pred_binary)

  cat("When C=", thisC, ", the MCC value is ", mcc_outcome, "\t (worst possible: -1; best possible: +1)\n", sep="")

  mcc_array[mccCounter] <- mcc_outcome
  mccCounter <- mccCounter + 1
}

# Select the C with the highest MCC. which.max() compares numerically and
# returns the first maximising index directly, avoiding both the original's
# lexicographic max() on strings and the float-equality lookup of match().
bestCindex <- which.max(mcc_array)
bestMCC <- mcc_array[bestCindex]
cat("\nThe best C value is ", c_array[bestCindex], ", corresponding to MCC=", mcc_array[bestCindex], "\n", sep="")
cat("[Optimization end]\n\n")

cat("\n @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ \n")

# Retrain with the optimized C on the training set, then evaluate on the
# held-out test set.
cat("\n[Training the SVM model (with the OPTIMIZED hyper-parameter C=", c_array[bestCindex], ") on training set & applying the SVM to the test set]\n", sep="")
#cancer_data_test_pred <- knn(train = cancer_data_train, test = cancer_data_test, cl = cancer_data_train_labels, k=bestK)

svm_model_new <- svm(cancer_data_train_labels ~ ., cost = c_array[bestCindex],
                     data = cancer_data_train,
                     method = "C-classification", kernel = "linear")
cancer_data_test_pred <- predict(svm_model_new, cancer_data_test)

# BUG FIX: binarize the test-set predictions with the same tau threshold used
# for the validation set. The original passed the raw continuous regression
# scores straight into confusion_matrix_rates(), inconsistent with how the
# validation predictions were handled.
cancer_data_test_pred_binary <- as.numeric(cancer_data_test_pred)
cancer_data_test_pred_binary[cancer_data_test_pred_binary >= tau] <- 1
cancer_data_test_pred_binary[cancer_data_test_pred_binary < tau] <- 0

confusion_matrix_rates(cancer_data_test_labels, cancer_data_test_pred_binary, "@@@ Test set @@@")