Diff of /scripts/template.R [000000] .. [c09aa8]

Switch to unified view

a b/scripts/template.R
1
setwd("./templates")
2
namefile <- "2.txt"
3
namestr <- paste("./../clustall9k/", namefile, sep = "")
4
b <- read.csv(namestr, header = F, row.names = NULL, stringsAsFactors = F)
5
6
for (i in 1:ncol(b)) { 
7
  for (j in 1:nrow(b)) {
8
    if (i > 1) {
9
      b[j,1] <- paste(b[j,1], b[j, i], sep = "")
10
    }
11
  }
12
}
13
14
b <- as.data.frame(b[[1]])
15
16
17
b[] <- lapply(b, as.character)
18
b[] <- lapply(b, tolower)
19
20
21
22
23
cbindPad <- function(...){
24
  args <- list(...)
25
  n <- sapply(args,nrow)
26
  mx <- max(n)
27
  pad <- function(x, mx){
28
    if (nrow(x) < mx){
29
      nms <- colnames(x)
30
      padTemp <- matrix(NA, mx - nrow(x), ncol(x))
31
      colnames(padTemp) <- nms
32
      if (ncol(x)==0) {
33
        return(padTemp)
34
      } else {
35
        return(rbind(x,padTemp))
36
      }
37
    }
38
    else{
39
      return(x)
40
    }
41
  }
42
  rs <- lapply(args,pad,mx)
43
  return(do.call(cbind,rs))
44
}
45
46
47
48
##Platelet Change To Rules
49
50
b[] <- lapply(b[], function(x) gsub("Platelet","Platelets",x, ignore.case = T))
51
b[] <- lapply(b[], function(x) gsub("Plateletss","Platelets",x, ignore.case = T))
52
b[] <- lapply(b[], function(x) gsub("Platelet count","Platelets",x, ignore.case = T))
53
b[] <- lapply(b[], function(x) gsub("Platelets:","Platelets",x, ignore.case = T))
54
b[] <- lapply(b[], function(x) gsub("\\(plts\\)","",x, ignore.case = T))
55
b[] <- lapply(b[], function(x) gsub("\\(plt\\)","",x, ignore.case = T))
56
b[] <- lapply(b[], function(x) gsub(">",">=",x, ignore.case = T))
57
b[] <- lapply(b[], function(x) gsub("greater than or equal to",">=",x, ignore.case = T))
58
b[] <- lapply(b[], function(x) gsub(">==",">=",x, ignore.case = T))
59
b[] <- lapply(b[], function(x) gsub(" cell/mm\\^3","/mcL",x, ignore.case = T))
60
b[] <- lapply(b[], function(x) gsub(" */ *mm\\^3","/mcL",x, ignore.case = T))
61
b[] <- lapply(b[], function(x) gsub("/microliters","mcL",x, ignore.case = T))
62
b[] <- lapply(b[], function(x) gsub(" *cells/mcL","/mcL",x, ignore.case = T))
63
b[] <- lapply(b[], function(x) gsub(" mm\\^3","/mcL",x, ignore.case = T))
64
b[] <- lapply(b[], function(x) gsub(" *x *10\\^9/L","000/mcL",x, ignore.case = T))
65
b[] <- lapply(b[], function(x) gsub("100000","100 000",x, ignore.case = T))
66
b[] <- lapply(b[], function(x) gsub("ul","mcL",x, ignore.case = T))
67
b[] <- lapply(b[], function(x) gsub("mcl","mcL",x, ignore.case = T))
68
69
70
71
72
73
74
75
76
l <- as.data.frame(matrix(0, ncol = 1, nrow = (nrow(b))))
77
78
79
for (i in 1:nrow(b)) {
80
  if (length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))) > ncol(l)) {
81
    l <- as.data.frame(matrix(0, ncol = length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))), nrow = (nrow(b))))
82
  }
83
  
84
}
85
n <- 1
86
for (i in 1:nrow(b)) {
87
  
88
  n <- as.numeric(as.numeric(ncol(l)) - as.numeric(length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE)))))
89
  l[i,] <- c( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE)), rep("IGNORE", times = as.numeric(n)))
90
}
91
92
l <- cbind(l, rep(0, times = nrow(l)))
93
names(l)[ncol(l)] <- "wc"
94
for (i in 1:nrow(l)) {
95
  l[i,ncol(l)] <- length(grep("IGNORE", l[i,]))
96
}
97
98
99
row1 <- unlist(strsplit(unlist(strsplit(b[which.max(l$wc),1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))
100
101
wc_df <- as.data.frame(l$wc) 
102
for (i in 1:nrow(wc_df)) {
103
  wc_df[i,1] <- ncol(l) - wc_df[i,1] - 1
104
  wc_df[i,2] <- 0
105
}
106
107
108
l <- l[1:(ncol(l)-1)]
109
110
111
112
113
base_str <- row1
114
text_df <- l
115
text_df_2 <- text_df
116
loc_df <- as.data.frame(matrix(0, ncol = 2*length(base_str), nrow = nrow(text_df)), stringsAsFactors = F)
117
rem_df <- as.data.frame(matrix('f', ncol = 1, nrow = nrow(text_df)), stringsAsFactors = F)
118
fillins_df <- as.data.frame(matrix(NA, ncol = length(base_str), nrow = nrow(text_df)), stringsAsFactors = F)
119
temp_df <- as.data.frame(matrix(NA, ncol = length(base_str) + 1, nrow = 1), stringsAsFactors = F)
120
121
i <- 1
122
j <- 1
123
diff <- 0
124
count <- 0
125
threshold <- 0.1
126
temp_loc <- 1
127
fillins_loc <- 1
128
loc <- 1
129
blank_check <- F
130
match_check <- F
131
132
while (j <= length(base_str)) {
133
  n <- 1
134
  k <- 1
135
  while (n <= nrow(text_df)) {
136
    #print("n:")
137
    #print(n)
138
    while (k <= (ncol(text_df)-diff)) {
139
      #print("k:")
140
      #print(k)
141
      if ( ((paste(text_df[n, k:(k+diff)], collapse = ' ')) == (paste(base_str[i:j], collapse = ' '))) ) {
142
        if (loc == 1) {
143
          match_check <- T
144
          count <- count + 1
145
          loc_df[n,loc] <- k
146
          loc_df[n,loc+1] <- k + diff
147
          k <- ncol(text_df)-diff+1
148
        }
149
        
150
        else if (diff == 0) {
151
          if (k > loc_df[n, loc-1]) {
152
          
153
            match_check <- T
154
            count <- count + 1
155
            loc_df[n,loc] <- k
156
            loc_df[n,loc+1] <- k + diff
157
            k <- ncol(text_df)-diff+1
158
          }
159
        } else {
160
            
161
            if (k == (loc_df[n,loc-2])) {
162
              match_check <- T
163
              count <- count + 1
164
              loc_df[n,loc] <- k
165
              loc_df[n,loc+1] <- k + diff
166
              k <- ncol(text_df)-diff+1
167
            }   
168
          }
169
        }
170
      k <- k + 1
171
    }
172
    if (match_check == F) {
173
      rem_df[n,1] <- 't'
174
    }else {
175
      print("YOU HIT A MATCH")
176
    }
177
    n <- n + 1
178
    k <- 1
179
    match_check <- F
180
  }
181
  if (count >= (nrow(text_df)*threshold)) {
182
    if (count < nrow(text_df)) {
183
      threshold <- ((threshold)*nrow(text_df_2))/(sum(rem_df$V1 == 'f'))
184
    }
185
    text_df <- text_df[which(rem_df$V1 == 'f'),]
186
    loc_df <- loc_df[which(rem_df$V1 == 'f'),]
187
    fillins_df <- fillins_df[which(rem_df$V1 == 'f'),]
188
    wc_df <- wc_df[which(rem_df$V1 == 'f'),]
189
    rem_df <- subset(rem_df, V1 == 'f')
190
    if (loc == 1) {
191
      for (n in 1:nrow(loc_df)) {
192
        if (loc_df[n, loc] != 1) {
193
          temp_df[1,temp_loc]  <- "BLANK"
194
          temp_loc <- temp_loc + 1
195
          blank_check <- T
196
          fillins_loc <- fillins_loc + 1
197
          break
198
        }
199
      }
200
    }else {
201
        for (n in 1:nrow(loc_df)) {
202
          if ((loc_df[n,loc]) - (loc_df[n,loc-1]) > 1) {
203
            temp_df[1,temp_loc]  <- "BLANK"
204
            temp_loc <- temp_loc + 1
205
            blank_check <- T
206
            fillins_loc <- fillins_loc + 1
207
            break
208
          }
209
        }
210
    }
211
    
212
    
213
    if (diff > 0) {
214
      temp_loc <- temp_loc - 1
215
      for (x in 1:nrow(loc_df)) {
216
        loc_df[x, loc - 2] <- loc_df[x, loc]
217
        loc_df[x, loc - 1] <- loc_df[x, loc + 1]
218
        loc_df[x, loc] <- 0 
219
        loc_df[x, loc + 1] <- 0 
220
      }
221
      loc <- loc - 2
222
    }
223
    if (blank_check == T) {
224
      print ("BLANK_CHECK")
225
      print (loc)
226
      print("161")
227
      for (x in 1:nrow(fillins_df)) {
228
        print("163")
229
        if (loc == 1) {
230
          print("165")
231
          if (loc_df[x, loc] != 1) {
232
            print("167")
233
            fillins_df[x, fillins_loc] <- paste(text_df[x, 1:(loc_df[x,loc]-1)], collapse = ' ')
234
          }
235
        }
236
        else {
237
          print("172")
238
          if ((loc_df[x,loc]) - (loc_df[x,loc-1]) > 1) {
239
            print("174")
240
            fillins_df[x, fillins_loc] <- paste(text_df[x, (loc_df[x,loc-1] + 1):(loc_df[x,loc]-1)], collapse = ' ')
241
          }
242
        }
243
      }
244
    }
245
    print("180")
246
    temp_df[temp_loc] <- paste(base_str[i:j], collapse = ' ')
247
    temp_loc <- temp_loc + 1
248
    loc <- loc + 2
249
    j <- j + 1
250
    #threshold <- ((threshold)*nrow(text_df_2))/(sum(rem_df$V1 == 'f'))
251
    #text_df <- text_df[which(rem_df$V1 == 'f'),]
252
    #loc_df <- loc_df[which(rem_df$V1 == 'f'),]
253
    #fillins_df <- fillins_df[which(rem_df$V1 == 'f'),]
254
    #wc_df <- wc_df[which(rem_df$V1 == 'f'),]
255
    #rem_df <- subset(rem_df, V1 == 'f')
256
  }else {
257
    #print("YOU HIT ELSE")
258
    if (diff > 0) {
259
      i <- j
260
      j <- i
261
    }else {
262
      i <- i + 1
263
      j <- i
264
    }
265
    
266
    loc_df[loc] <- 0
267
    loc_df[loc+1] <- 0
268
    rem_df$V1 <- 'f'
269
    
270
  }
271
  
272
  print("this is i: ")
273
  print (i)
274
  count <- 0
275
  diff <- j - i
276
  print("this is diff:")
277
  print(diff)
278
}
279
280
281
##This part could be cleaner
282
loc_df <- loc_df[, colSums(loc_df != 0) > 0]
283
284
blank_check_end <- F
285
286
for (n in 1:nrow(loc_df)) {
287
  if (loc_df[n, ncol(loc_df)] < wc_df[n, 1]) {
288
    blank_check_end <- T
289
  }
290
}
291
292
if (blank_check_end == T) {
293
  fillins_loc = fillins_loc + 1
294
  temp_df[temp_loc] <- "BLANK"
295
  for (x in 1:nrow(fillins_df)) {
296
    fillins_df[x, fillins_loc] <- paste(text_df[x, (loc_df[x,ncol(loc_df)] + 1):ncol(text_df)], collapse = ' ')
297
  }
298
}
299
300
for (x in 1:nrow(fillins_df)) {
301
  fillins_df[x, fillins_loc] <- gsub("IGNORE", "", fillins_df[x, fillins_loc])
302
}
303
304
305
##Part where we do the counting
306
fillins_df <- fillins_df[, colSums(is.na(fillins_df)) != nrow(fillins_df)]
307
308
309
for (i in 1:ncol(fillins_df)) {
310
  names(fillins_df)[i] <- paste("Blank", as.character(i), collapse = ' ')
311
}
312
313
fillins_df <- sapply(fillins_df, as.character)
314
fillins_df[is.na(fillins_df)] <- ""
315
316
317
i <- 1
318
while (i <=  ncol(fillins_df)) {
319
  fillins_df <- as.data.frame(cbind (fillins_df[,c(1:i)], rep(1, times = nrow(fillins_df)), fillins_df[,-c(1:i)]))
320
  i <- i+ 2
321
}
322
323
for (i in 1:ncol(fillins_df)) {
324
  if (i %% 2 == 0) {
325
    colnames(fillins_df)[i] <- "Frequency"
326
    fillins_df[,i] <- sapply(fillins_df[,i], as.character)
327
    fillins_df[,i] <- sapply(fillins_df[,i], as.numeric)
328
  }
329
}
330
331
for (i in 1:ncol(fillins_df)) {
332
  if (i %% 2 == 1) {
333
    colnames(fillins_df)[i] <- paste("Blank", (i+1)/2, collapse ="")
334
    fillins_df[,i] <- sapply(fillins_df[,i], as.character)
335
  }
336
}
337
338
for (j in 1:ncol(fillins_df)) {
339
  for (i in 1:nrow(fillins_df)) {
340
    if (j %% 2 == 1) {
341
      for (k in 1:i) {
342
        if (k < i) {
343
          if (trimws(fillins_df[i, j], which = "both") == trimws(fillins_df[k,j], which = "both")) {
344
            fillins_df[k, j+1] <- fillins_df[k, j+1] + 1
345
            fillins_df[i, j] <- paste("IGNORE", i, collapse ="")
346
            fillins_df[i, j + 1] <- 0 
347
          }
348
        }
349
      }
350
    }
351
  }
352
}
353
354
for (j in 1:ncol(fillins_df)) {
355
  for (i in 1:nrow(fillins_df)) {
356
    if (j %% 2 == 0) {
357
      if (fillins_df[i, j] == 0) {
358
        fillins_df[i, j] <- ""  
359
      }
360
    } else {
361
      fillins_df[i, j] <- gsub("IGNORE \\d+", "", fillins_df[i, j])
362
    }
363
  }
364
}
365
366
test2 <- as.data.frame(matrix(0, ncol = 0, nrow = (nrow(fillins_df))))
367
for (j in 1:ncol(fillins_df)) {
368
  if (j %% 2 == 1) {
369
    test <- fillins_df[, c(j, j+1)]
370
    test[,2] <- as.numeric(test[,2])
371
    test <- test[order(-test$Frequency),]
372
    test2 <- cbind(test2, test)
373
  }
374
}
375
fillins_df <- test2[rowSums(is.na(test2))!=(ncol(test2)/2),]
376
377
378
#for (j in 1:ncol(fillins_df)) {
379
#  for (i in 1:nrow(fillins_df)) {
380
#    if (j %% 2 == 1) {
381
#      while (fillins_df[i, j + 1] == "") {
382
#        for (l in i:nrow(fillins_df)) {
383
#          if (fillins_df[l, j + 1] != "") {
384
#            fillins_df[i, j] <- fillins_df[l, j] 
385
#            fillins_df[i, j + 1] <- fillins_df[l, j + 1] 
386
#          }
387
#        }
388
#        break
389
#      }
390
#      fillins_df[l, j] <- ""
391
#      fillins_df[l, j + 1] <- ""
392
#    } 
393
#  }
394
#}
395
396
397
#temp_df <- cbindPad(temp_df, b)
398
#fillins_df <- cbindPad(fillins_df, b)
399
#names(temp_df)[ncol(temp_df)] <- "Cluster Text"
400
#names(fillins_df)[ncol(fillins_df)] <- "Cluster Text"
401
write.csv(temp_df, file = paste0(namefile, "TEMPLATE.csv"), row.names = F)
402
write.csv(fillins_df, file = paste0(namefile, "FILLINS.csv"), col.names = F)
403