## Data_prep.R — data loading, cleaning, splitting and transformation demo
## Data loading and first exploration -------------------------------------
## Please set your working directory before running the code, e.g.:
## setwd("path/to/your/data")
## (The original called setwd("...") with a literal placeholder, which
## errors on every run; it is left commented out here.)
getwd()

## Read the raw data from a CSV file and assign it to a variable.
raw_data <- read.csv("BBB_raw_data.csv")

## Check dimensions and print the variable to visually confirm that the
## data loaded correctly.
dim(raw_data)
raw_data

## Single-cell access: [row, column].
raw_data[1, 1]
raw_data[209, 135]

## A whole row.
raw_data[1, ]

## Options to print values from a specific column - in our case column nr 1.
raw_data[, 1]
raw_data$tpsa
raw_data[, "tpsa"]

## Single-bracket indexing with one number returns a one-column data frame.
raw_data[1]

## Logical (row-wise) filtering: rows where logBBB < -1.5.
raw_data[raw_data$logBBB < (-1.5), ]
raw_data[raw_data[, 135] < (-1.5), ]

## Class of every column. (The original repeated this call twice;
## the duplicate was removed.)
sapply(raw_data, class)
## Missing values ----------------------------------------------------------
## Search for missing values in our dataset.
is.na(raw_data)

## Visual scanning is neither easy nor efficient, so use arithmetic:
## FALSE is 0 whereas TRUE is 1, so the sum counts the NAs.
sum(is.na(raw_data))

## Rows whose response value (logBBB) is missing.
raw_data[is.na(raw_data[, "logBBB"]), ]

## NA count per column.
colSums(is.na(raw_data))

## Names of the columns that contain at least one NA.
colsNA <- colnames(raw_data)[colSums(is.na(raw_data)) > 0]
colsNA

## Row *names* with at least one NA. These are character strings, not
## numeric indices, so they would need as.numeric() before use as indices.
rowsNA <- rownames(raw_data)[rowSums(is.na(raw_data)) > 0]
rowsNA

## which() gives numeric row indices directly.
rowsNA <- which(rowSums(is.na(raw_data)) > 0)
rowsNA

## Keep only complete rows. NOTE: logical indexing is used instead of the
## original raw_data[-rowsNA, ], because negative indexing with an empty
## index vector (a dataset with no NAs) would drop EVERY row.
data <- raw_data[rowSums(is.na(raw_data)) == 0, ]

## Check that no row still contains a missing value.
rownames(data)[rowSums(is.na(data)) > 0]

## Check that no column still contains a missing value.
colnames(data)[colSums(is.na(data)) > 0]

## Check the dimensions of the newly created data.
dim(data)
## Let's make it wrong -----------------------------------------------------
## Read a TAB-separated file with read.csv's default comma separator.
data_bad <- read.csv("BBB_raw_data_tabSep.txt")

## Print our data - something is definitely not right.
data_bad

## What are the dimensions of our variable?
dim(data_bad)

## What?? How to correct the error: sep = "" treats any run of
## whitespace (including tabs) as the field separator.
data_correct <- read.csv("BBB_raw_data_tabSep.txt", sep = "")
data_correct

## What else is important? Check whether your dataset has the
## variable/column names included.
data_bad2 <- read.csv("BBB_raw_data_tabSep.txt", sep = "", header = FALSE)
data_bad2

sapply(data_bad2, class)

## Now we see how such a simple setting affects our work - with the header
## row read as data the column is not numeric, so this just does not work...
sum(data_bad2$V1)

## Of course we expect to have one more row (the header read as data).
dim(data_bad2)
## Train/test split with caret ---------------------------------------------
library(caret)

## Fix the RNG so the split is reproducible.
## Please check what will happen if we do not set the seed.
set.seed(1234)

## Get "random" row numbers for the training data.
trainIndex <- createDataPartition(data$logBBB, p = .8, list=TRUE)
trainIndex

## Having the row numbers we can split the dataset into a training part
## and a testing part.
train_rows <- trainIndex$Resample1
train_data <- data[train_rows, ]
test_data <- data[-train_rows, ]

dim(train_data)
dim(test_data)

## Why not save our datasets into files for future use?
write.csv(train_data, file = "BBB_data_no1.csv", row.names = FALSE)
write.csv(test_data, file = "t-BBB_data_no1.csv", row.names = FALSE)
## 10-fold cross-validation splits -----------------------------------------
set.seed(1234) ##Again...

## returnTrain = FALSE: each fold holds the TEST row indices.
foldIndex <- createFolds(data$logBBB, k = 10, returnTrain = FALSE)
foldIndex

## Did we get what was expected? Check it: sorted, the indices should
## cover every row exactly once.
sort(unlist(foldIndex))
## Seems to be right.

## Write one train/test file pair per fold. seq_along() replaces the
## manually incremented `loop` counter of the original, so the files
## cannot be overwritten by a forgotten increment.
for (loop in seq_along(foldIndex)) {
  test_rows <- foldIndex[[loop]]
  print(test_rows)

  ## Create the training subset (everything outside the fold),
  ## a name for the file, and the file itself.
  train_data <- data[-test_rows, ]
  name <- paste0("cv_BBB_data_no", loop, ".csv")
  write.csv(train_data, name, row.names = FALSE)

  ## Create the test subset (the fold itself), a name, and the file.
  t_name <- paste0("t-cv_BBB_data_no", loop, ".csv")
  test_data <- data[test_rows, ]
  write.csv(test_data, t_name, row.names = FALSE)
}

## It is also good to know what is inside the files, so save the row
## numbers for every fold. as.matrix() on a named list already uses the
## element names as row names; the original passed a row.names argument
## that as.matrix() silently ignores, so it was dropped.
track_record <- as.matrix(foldIndex)
write.table(track_record, file="track_record.txt", row.names=TRUE)

## That's all... in case of data splitting.
track_record
## Standardization ----------------------------------------------------------
## Take 2 variables with a distribution different from normal as an example.
summary(data[, c("wpsa1", "wpsa2")])

## preProcess() from caret computes the transformation parameters
## (here: per-column mean and sd) without applying them yet.
pp_settings <- preProcess(data[,c("wpsa1","wpsa2")], method=c("center", "scale"))

## Use the stored settings to transform the given data.
standardized_data <- predict(pp_settings, data[,c("wpsa1","wpsa2")])
summary(standardized_data)

hist(standardized_data$wpsa2)

## The SHAPE of the distribution did not change - only location and scale!
hist(data$wpsa2)

## Again prepare an object with scaling settings - min-max to [0, 1].
pp_settings2 <- preProcess(data[,c("wpsa1","wpsa2")], method=c("range"))

scaled_data <- predict(pp_settings2, data[,c("wpsa1","wpsa2")])

summary(scaled_data)

## Again the distribution shape did not change.
hist(scaled_data$wpsa2)

## Another transformation used quite frequently is the logarithm.
## (Fixed: the original used `=` for assignment; idiomatic R uses `<-`.)
log_data <- log(data[, c("wpsa1", "wpsa2")])

summary(log_data)

## This time the variable distribution DID change!
hist(log_data$wpsa2)

## Do we have a normal distribution now?
shapiro.test(log_data$wpsa2)

## Did we have a normal distribution before?
shapiro.test(data$wpsa2)
188
data[1]
189
poly_x <- poly(data[[1]], degree = 3)
190
poly_x
191
plot(y=poly_x[,1], x=data[[1]])
192
plot(y=poly_x[,2], x=data[[1]])
193
plot(y=poly_x[,3], x=data[[1]])
194
195
196
## Add noise to data ---------------------------------------------------------
## Append 5 noisy copies of every row: each copy multiplies a row
## elementwise by random factors drawn from N(mean = 1, sd = 0.01),
## i.e. roughly 1% multiplicative noise.
## NOTE(review): assumes every column of `data` is numeric - confirm.
##
## Fixed: the original grew noised_data with rbind() inside a double loop,
## which copies the whole accumulated frame on every iteration (O(n^2)).
## Rows are now collected in pre-allocated lists and bound once at the end.
## The rnorm() draw order is unchanged, so with the same RNG state the
## result is identical to the original.
n_rows <- nrow(data)
n_cols <- ncol(data)
noised_copies <- vector("list", 5L)
for (l in 1:5) {
  rows <- vector("list", n_rows)
  for (i in seq_len(n_rows)) {
    ## Fresh noise vector per row.
    r <- rnorm(n_cols, mean = 1, sd = 0.01)
    rows[[i]] <- data[i, ] * r
  }
  noised_copies[[l]] <- do.call(rbind, rows)
}
## Original data first, then the 5 noised replicates - same row order
## as the incremental rbind() of the original code.
noised_data <- do.call(rbind, c(list(data), noised_copies))