## Data_prep.R
|
|
## Set your working directory before running this script.
## Check where R is currently pointed ...
getwd()
## ... and change it to the folder holding the data files
## (replace "..." with your actual path).
setwd("...")
## Read the dataset from a CSV file and keep it in a variable.
raw_data <- read.csv("BBB_raw_data.csv")

## Sanity-check the load: inspect the dimensions and print the table
## to visually confirm the data came in correctly.
dim(raw_data)
raw_data
## Single-cell access: the first cell, and the cell at row 209 / column 135.
raw_data[1, 1]
raw_data[209, 135]

## An entire row.
raw_data[1, ]

## Several equivalent ways to pull out the values of column 1 ("tpsa").
raw_data[, 1]
raw_data$tpsa
raw_data[, "tpsa"]

## Note: single-bracket with ONE index selects a one-column data frame,
## not a plain vector.
raw_data[1]
## Row filtering: keep only compounds with logBBB below -1.5.
## The two forms are shown as equivalent, so column 135 is presumably
## the logBBB column — verify against the file if you rely on the index.
raw_data[raw_data$logBBB < (-1.5), ]
raw_data[raw_data[, 135] < (-1.5), ]

## Inspect the class of every column.
## (The original script ran this identical call twice in a row; once is
## enough.)
sapply(raw_data, class)
## Search for missing values in the dataset.
is.na(raw_data)

## Scanning visually is inefficient, so count instead:
## FALSE is 0 whereas TRUE is 1, hence sum() counts the NAs.
sum(is.na(raw_data))

## Rows where the response variable itself is missing.
raw_data[is.na(raw_data[, "logBBB"]), ]

## Per-column NA counts.
colSums(is.na(raw_data))

## Names of the columns containing at least one NA.
colsNA <- colnames(raw_data)[colSums(is.na(raw_data)) > 0]
colsNA

## Row NAMES with at least one NA. These are character strings, so they
## cannot be used for negative indexing ...
rowsNA <- rownames(raw_data)[rowSums(is.na(raw_data)) > 0]
rowsNA

## ... which is why we convert to numeric row positions instead.
rowsNA <- which(rowSums(is.na(raw_data)) > 0)
rowsNA

## Drop the incomplete rows.
## BUG FIX: `raw_data[-rowsNA, ]` returns an EMPTY data frame when
## rowsNA has length 0 (negative indexing with integer(0) selects zero
## rows), so guard the no-NA case explicitly.
data <- if (length(rowsNA) > 0) raw_data[-rowsNA, ] else raw_data

## Check that no row still contains a missing value.
rownames(data)[rowSums(is.na(data)) > 0]

## Check that no column still contains a missing value.
colnames(data)[colSums(is.na(data)) > 0]

## Dimensions of the newly created, cleaned dataset.
dim(data)
## Let's deliberately do it wrong: the file is tab/whitespace separated,
## but read.csv() expects commas.
data_bad <- read.csv("BBB_raw_data_tabSep.txt")

## Printing shows something is definitely not right ...
data_bad

## ... and the dimensions confirm it.
dim(data_bad)

## The fix: sep = "" makes any run of whitespace act as the separator.
data_correct <- read.csv("BBB_raw_data_tabSep.txt", sep = "")
data_correct

## Another important setting: does your file include column names?
## Reading with header = FALSE turns the header line into a data row.
data_bad2 <- read.csv("BBB_raw_data_tabSep.txt", sep = "", header = FALSE)
data_bad2

## The header text mixed into each column forces them to a non-numeric
## class ...
sapply(data_bad2, class)

## ... so simple numeric operations just do not work.
sum(data_bad2$V1)

## And, of course, the table has one extra row (the misread header).
dim(data_bad2)
## Load the caret library (data-splitting utilities).
library(caret)

## Fix the RNG seed so the "random" split is reproducible.
## (Check what happens if you do not set the seed: every run gives a
## different split.)
set.seed(1234)

## Draw the row numbers for the training part (80% of the data).
## list = FALSE returns a plain index matrix — simpler and less fragile
## than the default list, which requires reaching into $Resample1.
## The selected rows are identical for the same seed.
trainIndex <- createDataPartition(data$logBBB, p = .8, list = FALSE)

trainIndex

## Having the row numbers, split the dataset into a training part and a
## testing part.
train_data <- data[trainIndex, ]
test_data <- data[-trainIndex, ]

dim(train_data)
dim(test_data)

## Save both datasets into files for future use.
write.csv(train_data, file = "BBB_data_no1.csv", row.names = FALSE)
write.csv(test_data, file = "t-BBB_data_no1.csv", row.names = FALSE)
## Reset the seed again so the fold assignment is reproducible too.
set.seed(1234)

## Assign every row to one of 10 cross-validation folds.
## returnTrain = FALSE: each list element holds the held-out test indices.
foldIndex <- createFolds(data$logBBB, k = 10, returnTrain = FALSE)

foldIndex

## Did we get what was expected? Every row number should appear exactly
## once across the folds.
sort(unlist(foldIndex))
## Seems to be right.

## Write one train/test file pair per fold. A seq_along() loop replaces
## the original manually-incremented counter, so the file numbering can
## never drift out of sync with the fold being processed.
for (fold in seq_along(foldIndex)) {
  test_rows <- foldIndex[[fold]]
  print(test_rows)

  ## Training subset: everything except the held-out rows of this fold.
  train_data <- data[-test_rows, ]
  write.csv(train_data,
            paste0("cv_BBB_data_no", fold, ".csv"),
            row.names = FALSE)

  ## Test subset: the held-out rows of this fold.
  test_data <- data[test_rows, ]
  write.csv(test_data,
            paste0("t-cv_BBB_data_no", fold, ".csv"),
            row.names = FALSE)
}

## It's also good to know what ended up in each file, so save the row
## numbers of every fold. as.matrix() on a named list already uses the
## element names as row names (a row.names= argument would be ignored).
track_record <- as.matrix(foldIndex)
write.table(track_record, file = "track_record.txt", row.names = TRUE)

## That's all for data splitting.
track_record
## Standardization ------------------------------------------------------
## Take two variables with distributions far from normal as an example.
summary(data[, c("wpsa1", "wpsa2")])

## caret::preProcess learns the transformation parameters from the data
## (here: center to mean 0, scale to sd 1) ...
pp_settings <- preProcess(data[, c("wpsa1", "wpsa2")],
                          method = c("center", "scale"))

## ... and predict() applies those settings to the given data.
standardized_data <- predict(pp_settings, data[, c("wpsa1", "wpsa2")])
summary(standardized_data)

hist(standardized_data$wpsa2)

## The SHAPE of the distribution did not change!
hist(data$wpsa2)

## Same idea, but rescaling to the [0, 1] range this time.
pp_settings2 <- preProcess(data[, c("wpsa1", "wpsa2")],
                           method = c("range"))

scaled_data <- predict(pp_settings2, data[, c("wpsa1", "wpsa2")])

summary(scaled_data)

## Again, the distribution shape is unchanged.
hist(scaled_data$wpsa2)

## A transformation that is used quite frequently and DOES change the
## shape: the logarithm.
log_data <- log(data[, c("wpsa1", "wpsa2")])

summary(log_data)

## This time the variable's distribution changed!
hist(log_data$wpsa2)

## Do we have a normal distribution now?
shapiro.test(log_data$wpsa2)

## Did we have a normal distribution before?
shapiro.test(data$wpsa2)
## Polynomial features: expand the first column into orthogonal
## polynomial terms up to degree 3 (poly() defaults to orthogonal).
data[1]
poly_x <- poly(data[[1]], degree = 3)
poly_x

## Visualize each polynomial term against the original variable.
plot(y = poly_x[, 1], x = data[[1]])
plot(y = poly_x[, 2], x = data[[1]])
plot(y = poly_x[, 3], x = data[[1]])
## Data augmentation: append 5 noisy copies of the dataset.
## Each copy multiplies every cell by an independent factor drawn from
## N(mean = 1, sd = 0.01) — i.e. roughly ±1% multiplicative noise.
##
## PERFORMANCE FIX: the original rbind-ed one row at a time inside a
## nested per-row loop, copying the growing data frame on every
## iteration (O(n^2)). Generating one noise matrix per replicate and
## binding a whole copy at once is equivalent (the noise factors are
## i.i.d., so their layout does not matter) and only does 5 rbinds.
noised_data <- data
n_rows <- nrow(data)
n_cols <- ncol(data)
for (l in 1:5) {
  noise <- matrix(rnorm(n_rows * n_cols, mean = 1, sd = 0.01),
                  nrow = n_rows, ncol = n_cols)
  noised_data <- rbind(noised_data, data * noise)
}