Diff of /bin/lin_reg.r [000000] .. [868c5d]

Switch to unified view

a b/bin/lin_reg.r
1
#!/usr/bin/env Rscript
2
3
4
# Keep character columns as plain strings instead of converting them to
# factors. This is already the default from R 4.0 on; it is set explicitly so
# the script behaves the same on older R versions.
# The former `setwd(".")` call was dropped: changing to the current directory
# is a no-op. Relative paths used by this script are still resolved against
# the directory the script is launched from.
options(stringsAsFactors = FALSE)
6
# library("clusterSim")
7
library("e1071")
8
library("PRROC")
9
# Decision threshold used to binarise the regression output:
# predictions >= tau are classified as 1, everything below as 0.
tau <- 0.5
10
11
source("./confusion_matrix_rates.r")
12
13
14
# Load the dataset from CSV. The first line of the file is the header row.
# (`header` is spelled out in full; the abbreviated `head` only worked through
# partial argument matching.)
# NOTE(review): the file name suggests the features are already normalised --
# confirm against the data preparation step.
cancer_data_norm <- read.csv(
  file = "../data/LungCancerDataset_AllRecords_NORM_27reduced_features.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Shuffle the rows so the positional train/test split below is random.
# NOTE(review): no seed is set in the visible code, so the split (and the
# reported metrics) change from run to run; call set.seed() beforehand if
# reproducible results are needed.
cancer_data_norm <- cancer_data_norm[sample(nrow(cancer_data_norm)), ]
17
18
# Column position of the label: the last column of the dataset.
target_index <- ncol(cancer_data_norm)

# Share of the rows, in percent, assigned to the training set.
training_set_perce <- 80

cat("training_set_perce = ", training_set_perce, "%\n", sep = "")
23
24
n_records <- nrow(cancer_data_norm)

# Training set: the first training_set_perce% of the rows.
training_set_first_index <- 1
training_set_last_index <- round(n_records * training_set_perce / 100)

# Test set: every remaining row, i.e. from the row right after the training
# set up to the end of the dataset.
test_set_first_index <- training_set_last_index + 1
test_set_last_index <- n_records
31
32
# Ascending index vector first..last that is empty when the range is empty.
# A plain `first:last` counts backwards in that case (e.g. 101:100 yields
# c(101, 100)), which would silently select the wrong rows whenever one of
# the subsets has no rows.
index_range <- function(first, last) {
  if (first > last) integer(0) else first:last
}

train_rows <- index_range(training_set_first_index, training_set_last_index)
test_rows <- index_range(test_set_first_index, test_set_last_index)

# Every column before target_index is used as a feature.
feature_cols <- seq_len(target_index - 1)

cat("[Creating the subsets for the values]\n")
# drop = FALSE keeps a data frame even when there is only one feature column.
prc_data_train <- cancer_data_norm[train_rows, feature_cols, drop = FALSE]
prc_data_test <- cancer_data_norm[test_rows, feature_cols, drop = FALSE]

cat("[Creating the subsets for the labels \"1\"-\"0\"]\n")
# Labels are extracted as plain vectors from the target column.
prc_data_train_labels <- cancer_data_norm[train_rows, target_index]
prc_data_test_labels <- cancer_data_norm[test_rows, target_index]
40
41
library(class)
42
library(gmodels)
43
44
# fit a linear regression on the training set and apply it to the test set
45
46
cat("\n[Training the linear regression model on training set & applying the linear regression to test set]\n", sep = "")

# Least-squares fit of the training labels on the training features.
# `.` stands for all columns of prc_data_train; the response is the separate
# label vector rather than a column of that data frame.
lin_reg_model_new <- lm(
  formula = prc_data_train_labels ~ .,
  data = prc_data_train
)

# Continuous (not yet thresholded) predictions for the test rows.
prc_data_test_pred <- predict(lin_reg_model_new, newdata = prc_data_test)
50
51
# Binarise the continuous predictions in one vectorised comparison:
# 1 where the prediction is >= tau, 0 otherwise (NA stays NA).
# The previous two-step in-place overwrite (>= tau -> 1, then < tau -> 0)
# re-tested the freshly written 1s in its second step, so any tau > 1 turned
# every prediction into 0.
prc_data_test_pred_bin <- as.numeric(as.numeric(prc_data_test_pred) >= tau)

# Compare the true test labels with the binarised predictions.
# NOTE(review): confusion_matrix_rates() is not defined in this file
# (presumably it comes from the sourced confusion_matrix_rates.r); the
# argument order (actual, predicted, title) is assumed -- confirm there.
confusion_matrix_rates(prc_data_test_labels, prc_data_test_pred_bin, "@@@ Test set @@@")
56
57