##Data_prep.R

##Please set your working directory before running the code
getwd()
setwd("...")
##Let's read the data from a CSV file and assign it to a variable
raw_data<-read.csv("BBB_raw_data.csv")
##It's good to know whether we succeeded - we should check the dimensions and print the variable
##to visually verify that the data loaded correctly
dim(raw_data)
raw_data
raw_data[1,1]
raw_data[209,135]
raw_data[1,]
##Options to print/show just the values from a specific column - in our case column no. 1
raw_data[,1]
raw_data$tpsa
raw_data[,"tpsa"]
raw_data[1]
raw_data[raw_data$logBBB<(-1.5),]
raw_data[raw_data[,135]<(-1.5),]
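##The same filter can also be written with subset(), which many find more readable (a sketch;
##note that subset() silently drops rows where the condition is NA):
subset(raw_data, logBBB < -1.5)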
sapply(raw_data, class)
##Let's search for missing values in our dataset
is.na(raw_data)
##As searching visually is neither easy nor efficient, let's try arithmetic: FALSE is 0 whereas TRUE is 1, so...
sum(is.na(raw_data))
raw_data[is.na(raw_data[,"logBBB"]),]
colSums(is.na(raw_data))
colsNA<-colnames(raw_data)[colSums(is.na(raw_data)) > 0]
colsNA
rowsNA<-rownames(raw_data)[rowSums(is.na(raw_data)) > 0]
##These are character row names, not numeric indices, so they must be converted - as.numeric() is an option here, but which() returns integer positions directly
rowsNA
rowsNA<-which(rowSums(is.na(raw_data)) > 0)
rowsNA
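##For comparison, the as.numeric() route mentioned above (a sketch; it assumes the default
##numeric-like row names, and rowsNA_num is just an illustrative name):
rowsNA_num<-as.numeric(rownames(raw_data)[rowSums(is.na(raw_data)) > 0])
rowsNA_num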
data<-raw_data[-rowsNA,]
##Check if any row contains missing value
rownames(data)[rowSums(is.na(data)) > 0]
##Check if any column contains missing value
colnames(data)[colSums(is.na(data)) > 0]
##Check dimensions of newly created data
dim(data)
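##A base-R shortcut with the same effect (a sketch): na.omit() drops every row containing an NA
data_alt<-na.omit(raw_data)
dim(data_alt)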
##Let's make it wrong
data_bad<-read.csv("BBB_raw_data_tabSep.txt")
##Let's print our data - something is definitely not right
data_bad
##What are the dimensions of our variable?
dim(data_bad)
##What?? How do we correct the error? Specify the separator - sep="" treats any whitespace (including tabs) as a delimiter
data_correct<-read.csv("BBB_raw_data_tabSep.txt", sep="")
data_correct
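##An explicit tab separator also works for this file (a sketch; data_correct2 is just an illustrative name):
data_correct2<-read.csv("BBB_raw_data_tabSep.txt", sep="\t")
dim(data_correct2)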
##What else is important? Check whether your dataset includes variable/column names (a header row)
data_bad2<-read.csv("BBB_raw_data_tabSep.txt", sep="", header = FALSE)
data_bad2
sapply(data_bad2, class)
##And now we see how such simple settings affect our work - it just doesn't work: the header row was read as data, so the column is not numeric...
sum(data_bad2$V1)
##Of course, we expect one more row (the header row was counted as data)
dim(data_bad2)
##Load library
library(caret)
set.seed(1234) ##Please check what will happen if we will not set seed
##Let's get "random" row numbers for trainig data
trainIndex <- createDataPartition(data$logBBB, p = .8, list=TRUE)
trainIndex
##Having the row numbers, we can split the dataset into a training part and a testing part
train_data<-data[trainIndex$Resample1,]
test_data<-data[-trainIndex$Resample1,]
dim(train_data)
dim(test_data)
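##A quick check (a sketch): the response distribution should look similar in both parts, since
##createDataPartition() samples within quantile groups of the outcome
summary(train_data$logBBB)
summary(test_data$logBBB)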
##Why not save our datasets to files for future use?
write.csv(train_data, file="BBB_data_no1.csv", row.names=FALSE)
write.csv(test_data, file="t-BBB_data_no1.csv", row.names=FALSE)
set.seed(1234) ##Again...
foldIndex <- createFolds(data$logBBB, k = 10, returnTrain = FALSE)
foldIndex
##Did we get what we expected? Let's check.
sort(unlist(foldIndex))
##Seems to be right
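##We can also confirm that the folds are roughly equal in size (a sketch):
sapply(foldIndex, length)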
##Let's split the dataset into a training/test pair for every fold and save them to files
loop<-1
for(i in foldIndex){
  print(i)
  ##Create the training subset, a name for the file, and the file itself
  train_data<-data[-i,]
  name<-paste("cv_BBB_data_no", loop, ".csv", sep="")
  write.csv(train_data, name, row.names=FALSE)
  ##Create the test subset, a name for the file, and the file itself
  t_name<-paste("t-cv_BBB_data_no", loop, ".csv", sep="")
  test_data<-data[i,]
  write.csv(test_data, t_name, row.names=FALSE)
  ##Increment the loop counter so the files just created are not overwritten
  loop<-loop+1
}
##It's also good to know what is inside the files, so save the row numbers for every fold
track_record<-as.matrix(foldIndex) ##the list names become row names automatically
write.table(track_record, file="track_record.txt", row.names=TRUE)
##That's all... as far as data splitting is concerned
track_record
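##Side note (a sketch): if you later fit models with caret's train(), you can reuse the folds
##directly instead of reading the files back - returnTrain=TRUE yields the training indices that
##trainControl() expects (train_folds and ctrl are illustrative names)
set.seed(1234)
train_folds<-createFolds(data$logBBB, k = 10, returnTrain = TRUE)
ctrl<-trainControl(method = "cv", index = train_folds)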
##Standardization
##Let's take 2 variables with clearly non-normal distributions as an example
summary(data[,c("wpsa1","wpsa2")])
##We use the preProcess function implemented in the caret package.
pp_settings <- preProcess(data[,c("wpsa1","wpsa2")], method=c("center", "scale"))
##Use the specified settings to transform the given data
standardized_data <- predict(pp_settings, data[,c("wpsa1","wpsa2")])
summary(standardized_data)
hist(standardized_data$wpsa2)
##The distribution did not change!!! Centering and scaling is a linear transformation, so the shape is preserved
hist(data$wpsa2)
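##A quick verification (a sketch): after centering and scaling, every column should have mean ~0 and sd ~1
sapply(standardized_data, mean)
sapply(standardized_data, sd)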
##Again prepare object with scaling settings
pp_settings2 <- preProcess(data[,c("wpsa1","wpsa2")], method=c("range"))
scaled_data <- predict(pp_settings2, data[,c("wpsa1","wpsa2")])
summary(scaled_data)
##Again, the distribution did not change - range scaling is linear too
hist(scaled_data$wpsa2)
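##A quick verification (a sketch): after range scaling, every column should span [0, 1]
sapply(scaled_data, range)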
##Another frequently used transformation is the logarithm
log_data<-log(data[,c("wpsa1","wpsa2")])
summary(log_data)
##This time the variable's distribution changed!!! The logarithm is a nonlinear transformation
hist(log_data$wpsa2)
##Do we have a normal distribution?
shapiro.test(log_data$wpsa2)
##Did we have a normal distribution before?
shapiro.test(data$wpsa2)
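##Caution (a sketch): log() is undefined for zero and negative values; for variables that can be
##zero, log1p(), i.e. log(1+x), is a common alternative (log1p_data is an illustrative name)
log1p_data<-log1p(data[,c("wpsa1","wpsa2")])
summary(log1p_data)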
##Polynomial features: poly() expands a single variable into orthogonal polynomial terms
data[1]
poly_x <- poly(data[[1]], degree = 3)
poly_x
plot(y=poly_x[,1], x=data[[1]])
plot(y=poly_x[,2], x=data[[1]])
plot(y=poly_x[,3], x=data[[1]])
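##Note (a sketch): poly() returns orthogonal polynomials by default; the plain powers x, x^2, x^3
##are available with raw = TRUE (poly_raw is an illustrative name)
poly_raw<-poly(data[[1]], degree = 3, raw = TRUE)
plot(y=poly_raw[,2], x=data[[1]])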
##Add noise to the data: append five noisy copies of every row, each multiplied element-wise
##by random factors drawn from N(1, 0.01)
noised_data<-data
for(l in 1:5){
  for(i in 1:dim(data)[1]){
    ##One multiplicative noise factor per column
    r<-rnorm(dim(data)[2], mean = 1, sd = 0.01)
    new_row<-data[i,]*r
    noised_data<-rbind(noised_data, new_row)
  }
}
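##Sanity check (a sketch): five noisy copies were appended, so the augmented set should have
##six times the original number of rows
dim(noised_data)
dim(data)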