ClinicalTrialsElig / Git / [c09aa8] /scripts/template.R

Models:
joseph-gordon/
ClinicalTrialsElig
Downloads: 1
[c09aa8]: / scripts / template.R
History
Download this file
404 lines (336 with data), 11.5 kB

setwd("./templates")
namefile <- "2.txt"
namestr <- paste("./../clustall9k/", namefile, sep = "")
b <- read.csv(namestr, header = F, row.names = NULL, stringsAsFactors = F)

for (i in 1:ncol(b)) { 
  for (j in 1:nrow(b)) {
    if (i > 1) {
      b[j,1] <- paste(b[j,1], b[j, i], sep = "")
    }
  }
}

b <- as.data.frame(b[[1]])


b[] <- lapply(b, as.character)
b[] <- lapply(b, tolower)




cbindPad <- function(...){
  args <- list(...)
  n <- sapply(args,nrow)
  mx <- max(n)
  pad <- function(x, mx){
    if (nrow(x) < mx){
      nms <- colnames(x)
      padTemp <- matrix(NA, mx - nrow(x), ncol(x))
      colnames(padTemp) <- nms
      if (ncol(x)==0) {
        return(padTemp)
      } else {
        return(rbind(x,padTemp))
      }
    }
    else{
      return(x)
    }
  }
  rs <- lapply(args,pad,mx)
  return(do.call(cbind,rs))
}



##Platelet Change To Rules

b[] <- lapply(b[], function(x) gsub("Platelet","Platelets",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("Plateletss","Platelets",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("Platelet count","Platelets",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("Platelets:","Platelets",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("\\(plts\\)","",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("\\(plt\\)","",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub(">",">=",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("greater than or equal to",">=",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub(">==",">=",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub(" cell/mm\\^3","/mcL",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub(" */ *mm\\^3","/mcL",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("/microliters","mcL",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub(" *cells/mcL","/mcL",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub(" mm\\^3","/mcL",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub(" *x *10\\^9/L","000/mcL",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("100000","100 000",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("ul","mcL",x, ignore.case = T))
b[] <- lapply(b[], function(x) gsub("mcl","mcL",x, ignore.case = T))








l <- as.data.frame(matrix(0, ncol = 1, nrow = (nrow(b))))


for (i in 1:nrow(b)) {
  if (length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))) > ncol(l)) {
    l <- as.data.frame(matrix(0, ncol = length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))), nrow = (nrow(b))))
  }
  
}
n <- 1
for (i in 1:nrow(b)) {
  
  n <- as.numeric(as.numeric(ncol(l)) - as.numeric(length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE)))))
  l[i,] <- c( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE)), rep("IGNORE", times = as.numeric(n)))
}

l <- cbind(l, rep(0, times = nrow(l)))
names(l)[ncol(l)] <- "wc"
for (i in 1:nrow(l)) {
  l[i,ncol(l)] <- length(grep("IGNORE", l[i,]))
}


row1 <- unlist(strsplit(unlist(strsplit(b[which.max(l$wc),1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))

wc_df <- as.data.frame(l$wc) 
for (i in 1:nrow(wc_df)) {
  wc_df[i,1] <- ncol(l) - wc_df[i,1] - 1
  wc_df[i,2] <- 0
}


l <- l[1:(ncol(l)-1)]




base_str <- row1
text_df <- l
text_df_2 <- text_df
loc_df <- as.data.frame(matrix(0, ncol = 2*length(base_str), nrow = nrow(text_df)), stringsAsFactors = F)
rem_df <- as.data.frame(matrix('f', ncol = 1, nrow = nrow(text_df)), stringsAsFactors = F)
fillins_df <- as.data.frame(matrix(NA, ncol = length(base_str), nrow = nrow(text_df)), stringsAsFactors = F)
temp_df <- as.data.frame(matrix(NA, ncol = length(base_str) + 1, nrow = 1), stringsAsFactors = F)

i <- 1
j <- 1
diff <- 0
count <- 0
threshold <- 0.1
temp_loc <- 1
fillins_loc <- 1
loc <- 1
blank_check <- F
match_check <- F

while (j <= length(base_str)) {
  n <- 1
  k <- 1
  while (n <= nrow(text_df)) {
    #print("n:")
    #print(n)
    while (k <= (ncol(text_df)-diff)) {
      #print("k:")
      #print(k)
      if ( ((paste(text_df[n, k:(k+diff)], collapse = ' ')) == (paste(base_str[i:j], collapse = ' '))) ) {
        if (loc == 1) {
          match_check <- T
          count <- count + 1
          loc_df[n,loc] <- k
          loc_df[n,loc+1] <- k + diff
          k <- ncol(text_df)-diff+1
        }
        
        else if (diff == 0) {
          if (k > loc_df[n, loc-1]) {
          
            match_check <- T
            count <- count + 1
            loc_df[n,loc] <- k
            loc_df[n,loc+1] <- k + diff
            k <- ncol(text_df)-diff+1
          }
        } else {
            
            if (k == (loc_df[n,loc-2])) {
              match_check <- T
              count <- count + 1
              loc_df[n,loc] <- k
              loc_df[n,loc+1] <- k + diff
              k <- ncol(text_df)-diff+1
            }	
          }
        }
      k <- k + 1
    }
    if (match_check == F) {
      rem_df[n,1] <- 't'
    }else {
      print("YOU HIT A MATCH")
    }
    n <- n + 1
    k <- 1
    match_check <- F
  }
  if (count >= (nrow(text_df)*threshold)) {
    if (count < nrow(text_df)) {
      threshold <- ((threshold)*nrow(text_df_2))/(sum(rem_df$V1 == 'f'))
    }
    text_df <- text_df[which(rem_df$V1 == 'f'),]
    loc_df <- loc_df[which(rem_df$V1 == 'f'),]
    fillins_df <- fillins_df[which(rem_df$V1 == 'f'),]
    wc_df <- wc_df[which(rem_df$V1 == 'f'),]
    rem_df <- subset(rem_df, V1 == 'f')
    if (loc == 1) {
      for (n in 1:nrow(loc_df)) {
        if (loc_df[n, loc] != 1) {
          temp_df[1,temp_loc]  <- "BLANK"
          temp_loc <- temp_loc + 1
          blank_check <- T
          fillins_loc <- fillins_loc + 1
          break
        }
      }
    }else {
        for (n in 1:nrow(loc_df)) {
          if ((loc_df[n,loc]) - (loc_df[n,loc-1]) > 1) {
            temp_df[1,temp_loc]  <- "BLANK"
            temp_loc <- temp_loc + 1
            blank_check <- T
            fillins_loc <- fillins_loc + 1
            break
          }
        }
    }
    
    
    if (diff > 0) {
      temp_loc <- temp_loc - 1
      for (x in 1:nrow(loc_df)) {
        loc_df[x, loc - 2] <- loc_df[x, loc]
        loc_df[x, loc - 1] <- loc_df[x, loc + 1]
        loc_df[x, loc] <- 0 
        loc_df[x, loc + 1] <- 0 
      }
      loc <- loc - 2
    }
    if (blank_check == T) {
      print ("BLANK_CHECK")
      print (loc)
      print("161")
      for (x in 1:nrow(fillins_df)) {
        print("163")
        if (loc == 1) {
          print("165")
          if (loc_df[x, loc] != 1) {
            print("167")
            fillins_df[x, fillins_loc] <- paste(text_df[x, 1:(loc_df[x,loc]-1)], collapse = ' ')
          }
        }
        else {
          print("172")
          if ((loc_df[x,loc]) - (loc_df[x,loc-1]) > 1) {
            print("174")
            fillins_df[x, fillins_loc] <- paste(text_df[x, (loc_df[x,loc-1] + 1):(loc_df[x,loc]-1)], collapse = ' ')
          }
        }
      }
    }
    print("180")
    temp_df[temp_loc] <- paste(base_str[i:j], collapse = ' ')
    temp_loc <- temp_loc + 1
    loc <- loc + 2
    j <- j + 1
    #threshold <- ((threshold)*nrow(text_df_2))/(sum(rem_df$V1 == 'f'))
    #text_df <- text_df[which(rem_df$V1 == 'f'),]
    #loc_df <- loc_df[which(rem_df$V1 == 'f'),]
    #fillins_df <- fillins_df[which(rem_df$V1 == 'f'),]
    #wc_df <- wc_df[which(rem_df$V1 == 'f'),]
    #rem_df <- subset(rem_df, V1 == 'f')
  }else {
    #print("YOU HIT ELSE")
    if (diff > 0) {
      i <- j
      j <- i
    }else {
      i <- i + 1
      j <- i
    }
    
    loc_df[loc] <- 0
    loc_df[loc+1] <- 0
    rem_df$V1 <- 'f'
    
  }
  
  print("this is i: ")
  print (i)
  count <- 0
  diff <- j - i
  print("this is diff:")
  print(diff)
}


##This part could be cleaner
loc_df <- loc_df[, colSums(loc_df != 0) > 0]

blank_check_end <- F

for (n in 1:nrow(loc_df)) {
  if (loc_df[n, ncol(loc_df)] < wc_df[n, 1]) {
    blank_check_end <- T
  }
}

if (blank_check_end == T) {
  fillins_loc = fillins_loc + 1
  temp_df[temp_loc] <- "BLANK"
  for (x in 1:nrow(fillins_df)) {
    fillins_df[x, fillins_loc] <- paste(text_df[x, (loc_df[x,ncol(loc_df)] + 1):ncol(text_df)], collapse = ' ')
  }
}

for (x in 1:nrow(fillins_df)) {
  fillins_df[x, fillins_loc] <- gsub("IGNORE", "", fillins_df[x, fillins_loc])
}


##Part where we do the counting
fillins_df <- fillins_df[, colSums(is.na(fillins_df)) != nrow(fillins_df)]


for (i in 1:ncol(fillins_df)) {
  names(fillins_df)[i] <- paste("Blank", as.character(i), collapse = ' ')
}

fillins_df <- sapply(fillins_df, as.character)
fillins_df[is.na(fillins_df)] <- ""


i <- 1
while (i <=  ncol(fillins_df)) {
  fillins_df <- as.data.frame(cbind (fillins_df[,c(1:i)], rep(1, times = nrow(fillins_df)), fillins_df[,-c(1:i)]))
  i <- i+ 2
}

for (i in 1:ncol(fillins_df)) {
  if (i %% 2 == 0) {
    colnames(fillins_df)[i] <- "Frequency"
    fillins_df[,i] <- sapply(fillins_df[,i], as.character)
    fillins_df[,i] <- sapply(fillins_df[,i], as.numeric)
  }
}

for (i in 1:ncol(fillins_df)) {
  if (i %% 2 == 1) {
    colnames(fillins_df)[i] <- paste("Blank", (i+1)/2, collapse ="")
    fillins_df[,i] <- sapply(fillins_df[,i], as.character)
  }
}

for (j in 1:ncol(fillins_df)) {
  for (i in 1:nrow(fillins_df)) {
    if (j %% 2 == 1) {
      for (k in 1:i) {
        if (k < i) {
          if (trimws(fillins_df[i, j], which = "both") == trimws(fillins_df[k,j], which = "both")) {
            fillins_df[k, j+1] <- fillins_df[k, j+1] + 1
            fillins_df[i, j] <- paste("IGNORE", i, collapse ="")
            fillins_df[i, j + 1] <- 0 
          }
        }
      }
    }
  }
}

for (j in 1:ncol(fillins_df)) {
  for (i in 1:nrow(fillins_df)) {
    if (j %% 2 == 0) {
      if (fillins_df[i, j] == 0) {
        fillins_df[i, j] <- ""  
      }
    } else {
      fillins_df[i, j] <- gsub("IGNORE \\d+", "", fillins_df[i, j])
    }
  }
}

test2 <- as.data.frame(matrix(0, ncol = 0, nrow = (nrow(fillins_df))))
for (j in 1:ncol(fillins_df)) {
  if (j %% 2 == 1) {
    test <- fillins_df[, c(j, j+1)]
    test[,2] <- as.numeric(test[,2])
    test <- test[order(-test$Frequency),]
    test2 <- cbind(test2, test)
  }
}
fillins_df <- test2[rowSums(is.na(test2))!=(ncol(test2)/2),]


#for (j in 1:ncol(fillins_df)) {
#  for (i in 1:nrow(fillins_df)) {
#    if (j %% 2 == 1) {
#      while (fillins_df[i, j + 1] == "") {
#        for (l in i:nrow(fillins_df)) {
#          if (fillins_df[l, j + 1] != "") {
#            fillins_df[i, j] <- fillins_df[l, j] 
#            fillins_df[i, j + 1] <- fillins_df[l, j + 1] 
#          }
#        }
#        break
#      }
#      fillins_df[l, j] <- ""
#      fillins_df[l, j + 1] <- ""
#    } 
#  }
#}


#temp_df <- cbindPad(temp_df, b)
#fillins_df <- cbindPad(fillins_df, b)
#names(temp_df)[ncol(temp_df)] <- "Cluster Text"
#names(fillins_df)[ncol(fillins_df)] <- "Cluster Text"
write.csv(temp_df, file = paste0(namefile, "TEMPLATE.csv"), row.names = F)
write.csv(fillins_df, file = paste0(namefile, "FILLINS.csv"), col.names = F)