Diff of /scripts/templatefinal.R [000000] .. [c09aa8]

Switch to unified view

a b/scripts/templatefinal.R
1
setwd("./templates")
2
namestr <- "./../plateletall.csv"
3
b <- read.csv(namestr, header = F, row.names = NULL, stringsAsFactors = F)
4
5
for (i in 1:ncol(b)) { 
6
  for (j in 1:nrow(b)) {
7
    if (i > 1) {
8
      b[j,1] <- paste(b[j,1], b[j, i], sep = "")
9
    }
10
  }
11
}
12
13
b <- as.data.frame(b[[1]])
14
15
16
b[] <- lapply(b, as.character)
17
b[] <- lapply(b, tolower)
18
19
20
21
22
cbindPad <- function(...){
23
  args <- list(...)
24
  n <- sapply(args,nrow)
25
  mx <- max(n)
26
  pad <- function(x, mx){
27
    if (nrow(x) < mx){
28
      nms <- colnames(x)
29
      padTemp <- matrix(NA, mx - nrow(x), ncol(x))
30
      colnames(padTemp) <- nms
31
      if (ncol(x)==0) {
32
        return(padTemp)
33
      } else {
34
        return(rbind(x,padTemp))
35
      }
36
    }
37
    else{
38
      return(x)
39
    }
40
  }
41
  rs <- lapply(args,pad,mx)
42
  return(do.call(cbind,rs))
43
}
44
45
46
47
##Platelet Change To Rules
48
49
b[] <- lapply(b[], function(x) gsub("Platelet","Platelets",x, ignore.case = T))
50
b[] <- lapply(b[], function(x) gsub("Plateletss","Platelets",x, ignore.case = T))
51
b[] <- lapply(b[], function(x) gsub("Platelet count","Platelets",x, ignore.case = T))
52
b[] <- lapply(b[], function(x) gsub("Platelets:","Platelets",x, ignore.case = T))
53
b[] <- lapply(b[], function(x) gsub("\\(plts\\)","",x, ignore.case = T))
54
b[] <- lapply(b[], function(x) gsub("\\(plt\\)","",x, ignore.case = T))
55
b[] <- lapply(b[], function(x) gsub(">",">=",x, ignore.case = T))
56
b[] <- lapply(b[], function(x) gsub("greater than or equal to",">=",x, ignore.case = T))
57
b[] <- lapply(b[], function(x) gsub(">==",">=",x, ignore.case = T))
58
b[] <- lapply(b[], function(x) gsub(" cell/mm\\^3","/mcL",x, ignore.case = T))
59
b[] <- lapply(b[], function(x) gsub(" */ *mm\\^3","/mcL",x, ignore.case = T))
60
b[] <- lapply(b[], function(x) gsub("/microliters","mcL",x, ignore.case = T))
61
b[] <- lapply(b[], function(x) gsub(" *cells/mcL","/mcL",x, ignore.case = T))
62
b[] <- lapply(b[], function(x) gsub(" mm\\^3","/mcL",x, ignore.case = T))
63
b[] <- lapply(b[], function(x) gsub(" *x *10\\^9/L","000/mcL",x, ignore.case = T))
64
b[] <- lapply(b[], function(x) gsub("100000","100 000",x, ignore.case = T))
65
b[] <- lapply(b[], function(x) gsub("ul","mcL",x, ignore.case = T))
66
b[] <- lapply(b[], function(x) gsub("mcl","mcL",x, ignore.case = T))
67
68
69
70
71
72
73
74
75
l <- as.data.frame(matrix(0, ncol = 1, nrow = (nrow(b))))
76
77
78
for (i in 1:nrow(b)) {
79
  if (length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))) > ncol(l)) {
80
    l <- as.data.frame(matrix(0, ncol = length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))), nrow = (nrow(b))))
81
  }
82
  
83
}
84
n <- 1
85
for (i in 1:nrow(b)) {
86
  
87
  n <- as.numeric(as.numeric(ncol(l)) - as.numeric(length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE)))))
88
  l[i,] <- c( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE)), rep("IGNORE", times = as.numeric(n)))
89
}
90
91
l <- cbind(l, rep(0, times = nrow(l)))
92
names(l)[ncol(l)] <- "wc"
93
for (i in 1:nrow(l)) {
94
  l[i,ncol(l)] <- length(grep("IGNORE", l[i,]))
95
}
96
97
98
row1 <- unlist(strsplit(unlist(strsplit(b[which.max(l$wc),1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))
99
100
wc_df <- as.data.frame(l$wc) 
101
for (i in 1:nrow(wc_df)) {
102
  wc_df[i,1] <- ncol(l) - wc_df[i,1] - 1
103
  wc_df[i,2] <- 0
104
}
105
106
107
l <- l[1:(ncol(l)-1)]
108
109
110
111
112
base_str <- row1
113
text_df <- l
114
text_df_2 <- text_df
115
loc_df <- as.data.frame(matrix(0, ncol = 2*length(base_str), nrow = nrow(text_df)), stringsAsFactors = F)
116
rem_df <- as.data.frame(matrix('f', ncol = 1, nrow = nrow(text_df)), stringsAsFactors = F)
117
fillins_df <- as.data.frame(matrix(NA, ncol = length(base_str), nrow = nrow(text_df)), stringsAsFactors = F)
118
temp_df <- as.data.frame(matrix(NA, ncol = length(base_str) + 1, nrow = 1), stringsAsFactors = F)
119
120
i <- 1
121
j <- 1
122
diff <- 0
123
count <- 0
124
threshold <- 0.8
125
temp_loc <- 1
126
fillins_loc <- 1
127
loc <- 1
128
blank_check <- F
129
match_check <- F
130
131
while (j <= length(base_str)) {
132
  n <- 1
133
  k <- 1
134
  while (n <= nrow(text_df)) {
135
    #print("n:")
136
    #print(n)
137
    while (k <= (ncol(text_df)-diff)) {
138
      #print("k:")
139
      #print(k)
140
      if ( ((paste(text_df[n, k:(k+diff)], collapse = ' ')) == (paste(base_str[i:j], collapse = ' '))) ) {
141
        if (loc == 1) {
142
          match_check <- T
143
          count <- count + 1
144
          loc_df[n,loc] <- k
145
          loc_df[n,loc+1] <- k + diff
146
          k <- ncol(text_df)-diff+1
147
        }
148
        
149
        else if (diff == 0) {
150
          if (k > loc_df[n, loc-1]) {
151
          
152
            match_check <- T
153
            count <- count + 1
154
            loc_df[n,loc] <- k
155
            loc_df[n,loc+1] <- k + diff
156
            k <- ncol(text_df)-diff+1
157
          }
158
        } else {
159
            
160
            if (k == (loc_df[n,loc-2])) {
161
              match_check <- T
162
              count <- count + 1
163
              loc_df[n,loc] <- k
164
              loc_df[n,loc+1] <- k + diff
165
              k <- ncol(text_df)-diff+1
166
            }   
167
          }
168
        }
169
      k <- k + 1
170
    }
171
    if (match_check == F) {
172
      rem_df[n,1] <- 't'
173
    }else {
174
      print("YOU HIT A MATCH")
175
    }
176
    n <- n + 1
177
    k <- 1
178
    match_check <- F
179
  }
180
  if (count >= (nrow(text_df)*threshold)) {
181
    if (count < nrow(text_df)) {
182
      threshold <- ((threshold)*nrow(text_df_2))/(sum(rem_df$V1 == 'f'))
183
    }
184
    text_df <- text_df[which(rem_df$V1 == 'f'),]
185
    loc_df <- loc_df[which(rem_df$V1 == 'f'),]
186
    fillins_df <- fillins_df[which(rem_df$V1 == 'f'),]
187
    wc_df <- wc_df[which(rem_df$V1 == 'f'),]
188
    rem_df <- subset(rem_df, V1 == 'f')
189
    if (loc == 1) {
190
      for (n in 1:nrow(loc_df)) {
191
        if (loc_df[n, loc] != 1) {
192
          temp_df[1,temp_loc]  <- "BLANK"
193
          temp_loc <- temp_loc + 1
194
          blank_check <- T
195
          fillins_loc <- fillins_loc + 1
196
          break
197
        }
198
      }
199
    }else {
200
        for (n in 1:nrow(loc_df)) {
201
          if ((loc_df[n,loc]) - (loc_df[n,loc-1]) > 1) {
202
            temp_df[1,temp_loc]  <- "BLANK"
203
            temp_loc <- temp_loc + 1
204
            blank_check <- T
205
            fillins_loc <- fillins_loc + 1
206
            break
207
          }
208
        }
209
    }
210
    
211
    
212
    if (diff > 0) {
213
      temp_loc <- temp_loc - 1
214
      for (x in 1:nrow(loc_df)) {
215
        loc_df[x, loc - 2] <- loc_df[x, loc]
216
        loc_df[x, loc - 1] <- loc_df[x, loc + 1]
217
        loc_df[x, loc] <- 0 
218
        loc_df[x, loc + 1] <- 0 
219
      }
220
      loc <- loc - 2
221
    }
222
    if (blank_check == T) {
223
      print ("BLANK_CHECK")
224
      print (loc)
225
      print("161")
226
      for (x in 1:nrow(fillins_df)) {
227
        print("163")
228
        if (loc == 1) {
229
          print("165")
230
          if (loc_df[x, loc] != 1) {
231
            print("167")
232
            fillins_df[x, fillins_loc] <- paste(text_df[x, 1:(loc_df[x,loc]-1)], collapse = ' ')
233
          }
234
        }
235
        else {
236
          print("172")
237
          if ((loc_df[x,loc]) - (loc_df[x,loc-1]) > 1) {
238
            print("174")
239
            fillins_df[x, fillins_loc] <- paste(text_df[x, (loc_df[x,loc-1] + 1):(loc_df[x,loc]-1)], collapse = ' ')
240
          }
241
        }
242
      }
243
    }
244
    print("180")
245
    temp_df[temp_loc] <- paste(base_str[i:j], collapse = ' ')
246
    temp_loc <- temp_loc + 1
247
    loc <- loc + 2
248
    j <- j + 1
249
    #threshold <- ((threshold)*nrow(text_df_2))/(sum(rem_df$V1 == 'f'))
250
    #text_df <- text_df[which(rem_df$V1 == 'f'),]
251
    #loc_df <- loc_df[which(rem_df$V1 == 'f'),]
252
    #fillins_df <- fillins_df[which(rem_df$V1 == 'f'),]
253
    #wc_df <- wc_df[which(rem_df$V1 == 'f'),]
254
    #rem_df <- subset(rem_df, V1 == 'f')
255
  }else {
256
    #print("YOU HIT ELSE")
257
    if (diff > 0) {
258
      i <- j
259
      j <- i
260
    }else {
261
      i <- i + 1
262
      j <- i
263
    }
264
    
265
    loc_df[loc] <- 0
266
    loc_df[loc+1] <- 0
267
    rem_df$V1 <- 'f'
268
    
269
  }
270
  
271
  print("this is i: ")
272
  print (i)
273
  count <- 0
274
  diff <- j - i
275
  print("this is diff:")
276
  print(diff)
277
}
278
279
280
##This part could be cleaner
281
loc_df <- loc_df[, colSums(loc_df != 0) > 0]
282
283
blank_check_end <- F
284
285
for (n in 1:nrow(loc_df)) {
286
  if (loc_df[n, ncol(loc_df)] < wc_df[n, 1]) {
287
    blank_check_end <- T
288
  }
289
}
290
291
if (blank_check_end == T) {
292
  fillins_loc = fillins_loc + 1
293
  temp_df[temp_loc] <- "BLANK"
294
  for (x in 1:nrow(fillins_df)) {
295
    fillins_df[x, fillins_loc] <- paste(text_df[x, (loc_df[x,ncol(loc_df)] + 1):ncol(text_df)], collapse = ' ')
296
  }
297
}
298
299
for (x in 1:nrow(fillins_df)) {
300
  fillins_df[x, fillins_loc] <- gsub("IGNORE", "", fillins_df[x, fillins_loc])
301
}
302
303
304
##Part where we do the counting
305
fillins_df <- fillins_df[, colSums(is.na(fillins_df)) != nrow(fillins_df)]
306
307
308
for (i in 1:ncol(fillins_df)) {
309
  names(fillins_df)[i] <- paste("Blank", as.character(i), collapse = ' ')
310
}
311
312
fillins_df <- sapply(fillins_df, as.character)
313
fillins_df[is.na(fillins_df)] <- ""
314
315
316
i <- 1
317
while (i <=  ncol(fillins_df)) {
318
  fillins_df <- as.data.frame(cbind (fillins_df[,c(1:i)], rep(1, times = nrow(fillins_df)), fillins_df[,-c(1:i)]))
319
  i <- i+ 2
320
}
321
322
for (i in 1:ncol(fillins_df)) {
323
  if (i %% 2 == 0) {
324
    colnames(fillins_df)[i] <- "Frequency"
325
    fillins_df[,i] <- sapply(fillins_df[,i], as.character)
326
    fillins_df[,i] <- sapply(fillins_df[,i], as.numeric)
327
  }
328
}
329
330
for (i in 1:ncol(fillins_df)) {
331
  if (i %% 2 == 1) {
332
    colnames(fillins_df)[i] <- paste("Blank", (i+1)/2, collapse ="")
333
    fillins_df[,i] <- sapply(fillins_df[,i], as.character)
334
  }
335
}
336
337
for (j in 1:ncol(fillins_df)) {
338
  for (i in 1:nrow(fillins_df)) {
339
    if (j %% 2 == 1) {
340
      for (k in 1:i) {
341
        if (k < i) {
342
          if (trimws(fillins_df[i, j], which = "both") == trimws(fillins_df[k,j], which = "both")) {
343
            fillins_df[k, j+1] <- fillins_df[k, j+1] + 1
344
            fillins_df[i, j] <- paste("IGNORE", i, collapse ="")
345
            fillins_df[i, j + 1] <- 0 
346
          }
347
        }
348
      }
349
    }
350
  }
351
}
352
353
for (j in 1:ncol(fillins_df)) {
354
  for (i in 1:nrow(fillins_df)) {
355
    if (j %% 2 == 0) {
356
      if (fillins_df[i, j] == 0) {
357
        fillins_df[i, j] <- ""  
358
      }
359
    } else {
360
      fillins_df[i, j] <- gsub("IGNORE \\d+", "", fillins_df[i, j])
361
    }
362
  }
363
}
364
365
test2 <- as.data.frame(matrix(0, ncol = 0, nrow = (nrow(fillins_df))))
366
for (j in 1:ncol(fillins_df)) {
367
  if (j %% 2 == 1) {
368
    test <- fillins_df[, c(j, j+1)]
369
    test[,2] <- as.numeric(test[,2])
370
    test <- test[order(-test$Frequency),]
371
    test2 <- cbind(test2, test)
372
  }
373
}
374
fillins_df <- test2[rowSums(is.na(test2))!=(ncol(test2)/2),]
375
376
377
#for (j in 1:ncol(fillins_df)) {
378
#  for (i in 1:nrow(fillins_df)) {
379
#    if (j %% 2 == 1) {
380
#      while (fillins_df[i, j + 1] == "") {
381
#        for (l in i:nrow(fillins_df)) {
382
#          if (fillins_df[l, j + 1] != "") {
383
#            fillins_df[i, j] <- fillins_df[l, j] 
384
#            fillins_df[i, j + 1] <- fillins_df[l, j + 1] 
385
#          }
386
#        }
387
#        break
388
#      }
389
#      fillins_df[l, j] <- ""
390
#      fillins_df[l, j + 1] <- ""
391
#    } 
392
#  }
393
#}
394
395
396
#temp_df <- cbindPad(temp_df, b)
397
#fillins_df <- cbindPad(fillins_df, b)
398
#names(temp_df)[ncol(temp_df)] <- "Cluster Text"
399
#names(fillins_df)[ncol(fillins_df)] <- "Cluster Text"
400
write.csv(temp_df, file = paste0(namestr, "TEMPLATE.csv"), row.names = F)
401
write.csv(fillins_df, file = paste0(namestr, "FILLINS.csv"), col.names = F)
402