## Data_prep.R
|
|
## Set your working directory before running this script.
## Check where R is currently pointed ...
getwd()
## ... and change it to the folder holding the data files
## (replace "..." with your actual path).
setwd("...")
## Read the dataset from a CSV file and keep it in a variable.
raw_data <- read.csv("BBB_raw_data.csv")

## Sanity-check the load: inspect the dimensions and print the table
## to visually confirm the data came in correctly.
dim(raw_data)
raw_data
## Single-cell access: the first cell, and the cell at row 209 / column 135.
raw_data[1, 1]
raw_data[209, 135]

## An entire row.
raw_data[1, ]

## Several equivalent ways to pull out the values of column 1 ("tpsa").
raw_data[, 1]
raw_data$tpsa
raw_data[, "tpsa"]

## Note: single-bracket with ONE index selects a one-column data frame,
## not a plain vector.
raw_data[1]
## Row filtering: keep only compounds with logBBB below -1.5.
## The two forms are shown as equivalent, so column 135 is presumably
## the logBBB column — verify against the file if you rely on the index.
raw_data[raw_data$logBBB < (-1.5), ]
raw_data[raw_data[, 135] < (-1.5), ]

## Inspect the class of every column.
## (The original script ran this identical call twice in a row; once is
## enough.)
sapply(raw_data, class)
## Search for missing values in the dataset.
is.na(raw_data)

## Scanning visually is inefficient, so count instead:
## FALSE is 0 whereas TRUE is 1, hence sum() counts the NAs.
sum(is.na(raw_data))

## Rows where the response variable itself is missing.
raw_data[is.na(raw_data[, "logBBB"]), ]

## Per-column NA counts.
colSums(is.na(raw_data))

## Names of the columns containing at least one NA.
colsNA <- colnames(raw_data)[colSums(is.na(raw_data)) > 0]
colsNA

## Row NAMES with at least one NA. These are character strings, so they
## cannot be used for negative indexing ...
rowsNA <- rownames(raw_data)[rowSums(is.na(raw_data)) > 0]
rowsNA

## ... which is why we convert to numeric row positions instead.
rowsNA <- which(rowSums(is.na(raw_data)) > 0)
rowsNA

## Drop the incomplete rows.
## BUG FIX: `raw_data[-rowsNA, ]` returns an EMPTY data frame when
## rowsNA has length 0 (negative indexing with integer(0) selects zero
## rows), so guard the no-NA case explicitly.
data <- if (length(rowsNA) > 0) raw_data[-rowsNA, ] else raw_data

## Check that no row still contains a missing value.
rownames(data)[rowSums(is.na(data)) > 0]

## Check that no column still contains a missing value.
colnames(data)[colSums(is.na(data)) > 0]

## Dimensions of the newly created, cleaned dataset.
dim(data)
## Let's deliberately do it wrong: the file is tab/whitespace separated,
## but read.csv() expects commas.
data_bad <- read.csv("BBB_raw_data_tabSep.txt")

## Printing shows something is definitely not right ...
data_bad

## ... and the dimensions confirm it.
dim(data_bad)

## The fix: sep = "" makes any run of whitespace act as the separator.
data_correct <- read.csv("BBB_raw_data_tabSep.txt", sep = "")
data_correct

## Another important setting: does your file include column names?
## Reading with header = FALSE turns the header line into a data row.
data_bad2 <- read.csv("BBB_raw_data_tabSep.txt", sep = "", header = FALSE)
data_bad2

## The header text mixed into each column forces them to a non-numeric
## class ...
sapply(data_bad2, class)

## ... so simple numeric operations just do not work.
sum(data_bad2$V1)

## And, of course, the table has one extra row (the misread header).
dim(data_bad2)
## Load the caret library (data-splitting utilities).
library(caret)

## Fix the RNG seed so the "random" split is reproducible.
## (Check what happens if you do not set the seed: every run gives a
## different split.)
set.seed(1234)

## Draw the row numbers for the training part (80% of the data).
## list = FALSE returns a plain index matrix — simpler and less fragile
## than the default list, which requires reaching into $Resample1.
## The selected rows are identical for the same seed.
trainIndex <- createDataPartition(data$logBBB, p = .8, list = FALSE)

trainIndex

## Having the row numbers, split the dataset into a training part and a
## testing part.
train_data <- data[trainIndex, ]
test_data <- data[-trainIndex, ]

dim(train_data)
dim(test_data)

## Save both datasets into files for future use.
write.csv(train_data, file = "BBB_data_no1.csv", row.names = FALSE)
write.csv(test_data, file = "t-BBB_data_no1.csv", row.names = FALSE)
## Reset the seed again so the fold assignment is reproducible too.
set.seed(1234)

## Assign every row to one of 10 cross-validation folds.
## returnTrain = FALSE: each list element holds the held-out test indices.
foldIndex <- createFolds(data$logBBB, k = 10, returnTrain = FALSE)

foldIndex

## Did we get what was expected? Every row number should appear exactly
## once across the folds.
sort(unlist(foldIndex))
## Seems to be right.

## Write one train/test file pair per fold. A seq_along() loop replaces
## the original manually-incremented counter, so the file numbering can
## never drift out of sync with the fold being processed.
for (fold in seq_along(foldIndex)) {
  test_rows <- foldIndex[[fold]]
  print(test_rows)

  ## Training subset: everything except the held-out rows of this fold.
  train_data <- data[-test_rows, ]
  write.csv(train_data,
            paste0("cv_BBB_data_no", fold, ".csv"),
            row.names = FALSE)

  ## Test subset: the held-out rows of this fold.
  test_data <- data[test_rows, ]
  write.csv(test_data,
            paste0("t-cv_BBB_data_no", fold, ".csv"),
            row.names = FALSE)
}

## It's also good to know what ended up in each file, so save the row
## numbers of every fold. as.matrix() on a named list already uses the
## element names as row names (a row.names= argument would be ignored).
track_record <- as.matrix(foldIndex)
write.table(track_record, file = "track_record.txt", row.names = TRUE)

## That's all for data splitting.
track_record
## Standardization ------------------------------------------------------
## Take two variables with distributions far from normal as an example.
summary(data[, c("wpsa1", "wpsa2")])

## caret::preProcess learns the transformation parameters from the data
## (here: center to mean 0, scale to sd 1) ...
pp_settings <- preProcess(data[, c("wpsa1", "wpsa2")],
                          method = c("center", "scale"))

## ... and predict() applies those settings to the given data.
standardized_data <- predict(pp_settings, data[, c("wpsa1", "wpsa2")])
summary(standardized_data)

hist(standardized_data$wpsa2)

## The SHAPE of the distribution did not change!
hist(data$wpsa2)

## Same idea, but rescaling to the [0, 1] range this time.
pp_settings2 <- preProcess(data[, c("wpsa1", "wpsa2")],
                           method = c("range"))

scaled_data <- predict(pp_settings2, data[, c("wpsa1", "wpsa2")])

summary(scaled_data)

## Again, the distribution shape is unchanged.
hist(scaled_data$wpsa2)

## A transformation that is used quite frequently and DOES change the
## shape: the logarithm.
log_data <- log(data[, c("wpsa1", "wpsa2")])

summary(log_data)

## This time the variable's distribution changed!
hist(log_data$wpsa2)

## Do we have a normal distribution now?
shapiro.test(log_data$wpsa2)

## Did we have a normal distribution before?
shapiro.test(data$wpsa2)
## Polynomial features: expand the first column into orthogonal
## polynomial terms up to degree 3 (poly() defaults to orthogonal).
data[1]
poly_x <- poly(data[[1]], degree = 3)
poly_x

## Visualize each polynomial term against the original variable.
plot(y = poly_x[, 1], x = data[[1]])
plot(y = poly_x[, 2], x = data[[1]])
plot(y = poly_x[, 3], x = data[[1]])
## Data augmentation: append 5 noisy copies of the dataset.
## Each copy multiplies every cell by an independent factor drawn from
## N(mean = 1, sd = 0.01) — i.e. roughly ±1% multiplicative noise.
##
## PERFORMANCE FIX: the original rbind-ed one row at a time inside a
## nested per-row loop, copying the growing data frame on every
## iteration (O(n^2)). Generating one noise matrix per replicate and
## binding a whole copy at once is equivalent (the noise factors are
## i.i.d., so their layout does not matter) and only does 5 rbinds.
noised_data <- data
n_rows <- nrow(data)
n_cols <- ncol(data)
for (l in 1:5) {
  noise <- matrix(rnorm(n_rows * n_cols, mean = 1, sd = 0.01),
                  nrow = n_rows, ncol = n_cols)
  noised_data <- rbind(noised_data, data * noise)
}