Diff of /scripts/FullPipeline [000000] .. [c09aa8]

Switch to unified view

a b/scripts/FullPipeline
1
2
library("tm")
3
library("SnowballC")
4
library("ggplot2")
5
library("wordcloud")
6
library("cluster")
7
require("stats")
8
setwd("C:/Users/cloud/Downloads")
9
10
11
ctcl <- read.csv("ctsampleinds_cleaned (2).csv", header = F)
12
13
ctclwtri <- read.csv("ctsampleinds_cleaned.csv", header = F)
14
15
accruals <- read.csv("ctsampledata (1).csv", header = F)
16
17
18
setwd("C:/Users/cloud/Downloads/ClusterSentCritFinalRun3")
19
20
21
22
###NOTE: IF the words "inclusion criterion" or "exclusion criterion" actually appear in your data file,
23
#then pick different words to use for the below loops
24
25
26
27
n <- 1
28
s <- seq(1, ncol(ctcl), by = 2)
29
for (i in 1:nrow(ctcl)) {
30
  for (j in s) {
31
    if (!is.na(ctcl[i,j])) {
32
      if (ctcl[i,j] != "") {
33
        n <- as.character(n)
34
        str <- paste("s", n, ".txt", sep = "")
35
        incexc <- "UNKNOWN WHETHER INCLUSION CRITERION OR EXCLUSION CRITERION"
36
        if (!is.na(ctcl[i,j+1])) {
37
          if (ctcl[i,j+1] != "") {
38
            if (ctcl[i,j+1] == "TRUE") {
39
              incexc <- "INCLUSION CRITERION"
40
            }
41
            if (ctcl[i,j+1] == "FALSE") {
42
              incexc <- "EXCLUSION CRITERION"
43
            }
44
          }
45
        }
46
        trial <- ctclwtri[i,1]
47
        accrual <- "No accrual data found"
48
        for (k in 1:nrow(accruals)) {
49
          if (trial == accruals[k,1]) {
50
            accrual <- accruals[k,2]
51
          }
52
        }
53
        write.table(as.data.frame(paste(ctcl[i,j], incexc, "Accruals:", accrual, sep = " ")),file = str, row.names = F, col.names = F)
54
        n <- as.numeric(n)
55
        n <- n + 1
56
      }
57
    }
58
  }
59
}
60
61
sentdocs <- Corpus(DirSource("C:/Users/cloud/Downloads/ClusterSentCritFinalRun3"))
62
writeLines(as.character(sentdocs[[9770]]))
63
64
sentdocs <- tm_map(sentdocs,content_transformer(tolower))
65
toSpace <- content_transformer(function(x, pattern) { return (gsub(pattern, " ", x))})
66
sentdocs <- tm_map(sentdocs, toSpace, '"')
67
sentdocs <- tm_map(sentdocs, toSpace, 'inclusion criterion')
68
sentdocs <- tm_map(sentdocs, toSpace, 'exclusion criterion')
69
sentdocs <- tm_map(sentdocs, toSpace, 'accruals: \\d+')
70
sentdocs <- tm_map(sentdocs, toSpace, 'no accrual data found')
71
sentdocs <- tm_map(sentdocs, removeWords, stopwords("english"))
72
sentdocs <- tm_map(sentdocs, stripWhitespace)
73
sentdocs <- tm_map(sentdocs,stemDocument)
74
sentdtm <- DocumentTermMatrix(sentdocs)
75
sentm <- as.matrix(sentdtm)
76
77
78
#Clust number selection
79
library("NbClust")
80
sentmdf <- as.data.frame(sentm)
81
82
minnum <- 10
83
maxnum <- 50
84
85
nb1 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
86
               method = "ward.D", index = "kl", alphaBeale = 0.1)
87
88
nb2 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
89
               method = "ward.D", index = "ch", alphaBeale = 0.1)
90
91
nb3 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
92
               method = "ward.D", index = "hartigan", alphaBeale = 0.1)
93
94
nb4 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
95
               method = "ward.D", index = "ccc", alphaBeale = 0.1)
96
97
nb5 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
98
               method = "ward.D", index = "scott", alphaBeale = 0.1)
99
100
nb6 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
101
               method = "ward.D", index = "marriot", alphaBeale = 0.1)
102
103
nb7 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
104
               method = "ward.D", index = "trcovw", alphaBeale = 0.1)
105
106
nb8 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
107
               method = "ward.D", index = "tracew", alphaBeale = 0.1)
108
109
nb9 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
110
               method = "ward.D", index = "friedman", alphaBeale = 0.1)
111
112
nb10 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
113
                method = "ward.D", index = "rubin", alphaBeale = 0.1)
114
115
nb11 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
116
                method = "ward.D", index = "cindex", alphaBeale = 0.1)
117
118
nb12 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
119
                method = "ward.D", index = "db", alphaBeale = 0.1)
120
121
nb13 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
122
                method = "ward.D", index = "silhouette", alphaBeale = 0.1)
123
124
nb14 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
125
                method = "ward.D", index = "duda", alphaBeale = 0.1)
126
127
nb15 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
128
                method = "ward.D", index = "pseudot2", alphaBeale = 0.1)
129
130
nb16 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
131
                method = "ward.D", index = "beale", alphaBeale = 0.1)
132
133
nb17 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
134
                method = "ward.D", index = "ratkowsky", alphaBeale = 0.1)
135
136
nb18 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
137
                method = "ward.D", index = "ball", alphaBeale = 0.1)
138
139
nb19 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
140
                method = "ward.D", index = "ptbiserial", alphaBeale = 0.1)
141
142
nb20 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
143
                method = "ward.D", index = "gap", alphaBeale = 0.1)
144
145
nb21 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
146
                method = "ward.D", index = "frey", alphaBeale = 0.1)
147
148
nb22 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
149
                method = "ward.D", index = "mcclain", alphaBeale = 0.1)
150
151
nb23 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
152
                method = "ward.D", index = "gamma", alphaBeale = 0.1)
153
154
nb24 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
155
                method = "ward.D", index = "gplus", alphaBeale = 0.1)
156
157
nb25 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
158
                method = "ward.D", index = "tau", alphaBeale = 0.1)
159
160
nb26 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
161
                method = "ward.D", index = "dunn", alphaBeale = 0.1)
162
163
nb27 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
164
                method = "ward.D", index = "hubert", alphaBeale = 0.1)
165
166
nb28 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
167
                method = "ward.D", index = "sdindex", alphaBeale = 0.1)
168
169
nb29 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
170
                method = "ward.D", index = "dindex", alphaBeale = 0.1)
171
172
nb30 <- NbClust(data = sentmdf, diss = NULL, distance = "euclidean", min.nc = minnum, max.nc = maxnum, 
173
                method = "ward.D", index = "sdbw", alphaBeale = 0.1)
174
175
176
177
list1 <- unlist(list(if(exists("nb1")) { if(!is.null(nb1$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb1$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb1$Best.nc[1]), "Number_clusters")))}}},
178
                     if(exists("nb2")) { if(!is.null(nb2$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb2$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb2$Best.nc[1]), "Number_clusters")))}}},
179
                     if(exists("nb3")) { if(!is.null(nb3$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb3$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb3$Best.nc[1]), "Number_clusters")))}}},
180
                     if(exists("nb4")) { if(!is.null(nb4$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb4$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb4$Best.nc[1]), "Number_clusters")))}}},
181
                     if(exists("nb5")) { if(!is.null(nb5$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb5$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb5$Best.nc[1]), "Number_clusters")))}}},
182
                     if(exists("nb6")) { if(!is.null(nb6$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb6$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb6$Best.nc[1]), "Number_clusters")))}}},
183
                     if(exists("nb7")) { if(!is.null(nb7$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb7$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb7$Best.nc[1]), "Number_clusters")))}}},
184
                     if(exists("nb8")) { if(!is.null(nb8$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb8$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb8$Best.nc[1]), "Number_clusters")))}}},
185
                     if(exists("nb9")) { if(!is.null(nb9$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb9$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb9$Best.nc[1]), "Number_clusters")))}}},
186
                     if(exists("nb10")) { if(!is.null(nb10$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb10$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb10$Best.nc[1]), "Number_clusters")))}}},
187
                     if(exists("nb11")) { if(!is.null(nb11$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb11$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb11$Best.nc[1]), "Number_clusters")))}}},
188
                     if(exists("nb12")) { if(!is.null(nb12$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb12$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb12$Best.nc[1]), "Number_clusters")))}}},
189
                     if(exists("nb13")) { if(!is.null(nb13$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb13$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb13$Best.nc[1]), "Number_clusters")))}}},
190
                     if(exists("nb14")) { if(!is.null(nb14$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb14$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb14$Best.nc[1]), "Number_clusters")))}}},
191
                     if(exists("nb15")) { if(!is.null(nb15$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb15$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb15$Best.nc[1]), "Number_clusters")))}}},
192
                     if(exists("nb16")) { if(!is.null(nb16$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb16$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb16$Best.nc[1]), "Number_clusters")))}}},
193
                     if(exists("nb17")) { if(!is.null(nb17$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb17$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb17$Best.nc[1]), "Number_clusters")))}}},
194
                     if(exists("nb18")) { if(!is.null(nb18$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb18$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb18$Best.nc[1]), "Number_clusters")))}}},
195
                     if(exists("nb19")) { if(!is.null(nb19$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb19$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb19$Best.nc[1]), "Number_clusters")))}}},
196
                     if(exists("nb20")) { if(!is.null(nb20$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb20$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb20$Best.nc[1]), "Number_clusters")))}}},
197
                     if(exists("nb21")) { if(!is.null(nb21$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb21$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb21$Best.nc[1]), "Number_clusters")))}}},
198
                     if(exists("nb22")) { if(!is.null(nb22$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb22$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb22$Best.nc[1]), "Number_clusters")))}}},
199
                     if(exists("nb23")) { if(!is.null(nb23$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb23$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb23$Best.nc[1]), "Number_clusters")))}}},
200
                     if(exists("nb24")) { if(!is.null(nb24$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb24$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb24$Best.nc[1]), "Number_clusters")))}}},
201
                     if(exists("nb25")) { if(!is.null(nb25$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb25$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb25$Best.nc[1]), "Number_clusters")))}}},
202
                     if(exists("nb26")) { if(!is.null(nb26$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb26$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb26$Best.nc[1]), "Number_clusters")))}}},
203
                     if(exists("nb27")) { if(!is.null(nb27$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb27$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb27$Best.nc[1]), "Number_clusters")))}}},
204
                     if(exists("nb28")) { if(!is.null(nb28$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb28$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb28$Best.nc[1]), "Number_clusters")))}}},
205
                     if(exists("nb29")) { if(!is.null(nb29$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb29$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb29$Best.nc[1]), "Number_clusters")))}}},
206
                     if(exists("nb30")) { if(!is.null(nb30$Best.nc)) { if ( !is.infinite(as.numeric(unlist(strsplit(as.character(nb30$Best.nc[1]), "Number_clusters"))))) {as.numeric(unlist(strsplit(as.character(nb30$Best.nc[1]), "Number_clusters")))}}}))
207
208
round2 = function(x, n) {
209
  posneg = sign(x)
210
  z = abs(x)*10^n
211
  z = z + 0.5
212
  z = trunc(z)
213
  z = z/10^n
214
  z*posneg
215
}
216
217
round2(median(list1), 0)
218
round2(mean(list1), 0)
219
220
cutnum <- round2(mean(list1), 0)
221
222
223
numbo <- 1
224
225
gsubin <- function(x){gsub("INCLUSION CRITERION", "", x)}
226
gsubex <- function(x){gsub("EXCLUSION CRITERION", "", x)}
227
gsubacc <- function(x){gsub("Accruals: \\d+", "", x)}
228
gsubacc2 <- function(x){gsub("No accrual data found", "", x)}
229
230
groups <- cutree(hclust(dist(sentm),method="ward.D"), k=cutnum)
231
clust_list <- lapply(sort(unique(groups)), function(x) which(groups==x))
232
hc1 <- hclust(dist(sentm), method = "ward.D")
233
id1 <- idendro(hc1)
234
for (numbo in 1:cutnum) {
235
  
236
cdf <- as.data.frame(clust_list[[numbo]])
237
238
gsubsen <- function(x) {gsub("s\\d+ .. ", "", x)}
239
row.names(cdf) <- lapply(row.names(cdf), gsubsen)
240
241
cdf[] <- lapply(cdf, as.character)
242
243
cdf[,1] <- row.names(cdf)
244
245
setwd("C:/Users/cloud/Downloads/ClusterSentCritFinalRun3")
246
247
require(data.table)
248
249
for (i in 1:nrow(cdf)) {
250
  if ((file.info(paste(cdf[i,1],"",sep=""))$size) > 0) {
251
    if (as.character(read.table(paste(cdf[i,1],"",sep=""), header = F)[1,1]) %like% "INCLUSION CRITERION") {
252
      incexc2 <- "Inclusion criterion"
253
    }
254
    if (as.character(read.table(paste(cdf[i,1],"",sep=""), header = F)[1,1]) %like% "EXCLUSION CRITERION") {
255
      incexc2 <- "Exclusion criterion"
256
    }
257
    cdf[i,2] <- as.character(read.table(paste(cdf[i,1],"",sep=""), header = F)[1,1])  
258
    cdf[i,2] <- gsubin(cdf[i,2])
259
    cdf[i,2] <- gsubex(cdf[i,2])
260
    cdf[i,3] <- incexc2
261
  }
262
} 
263
264
265
266
cdf <- cdf[,c(1,2,3)]
267
268
setwd("C:/Users/cloud/Downloads/NewClusterRun5")
269
270
if (nrow(cdf)>1) {
271
  
272
n <- as.character(numbo)
273
str <- paste("cluster", n, ".txt", sep = "")
274
write.table(cdf,file = str, row.names = F, col.names = F)
275
276
277
}else {
278
279
  write.table(cdf,file = "clustersingletons.txt", row.names = F, col.names = F, append = TRUE)
280
}
281
282
}
283
284
cbindPad <- function(...){
285
  args <- list(...)
286
  n <- sapply(args,nrow)
287
  mx <- max(n)
288
  pad <- function(x, mx){
289
    if (nrow(x) < mx){
290
      nms <- colnames(x)
291
      padTemp <- matrix(NA, mx - nrow(x), ncol(x))
292
      colnames(padTemp) <- nms
293
      if (ncol(x)==0) {
294
        return(padTemp)
295
      } else {
296
        return(rbind(x,padTemp))
297
      }
298
    }
299
    else{
300
      return(x)
301
    }
302
  }
303
  rs <- lapply(args,pad,mx)
304
  return(do.call(cbind,rs))
305
}
306
307
numbo <- 1
308
309
310
while (numbo <= (cutnum +1)) {
311
  
312
  if (numbo <= cutnum) {
313
  str2 <- paste("cluster", numbo, ".txt", sep = "")
314
  }else {
315
    str2 <- "clustersingletons.txt"
316
  }
317
  setwd("C:/Users/cloud/Downloads/NewClusterRun5")
318
  
319
  res <- try(read.table(str2),silent = TRUE)
320
  
321
  if (!inherits(res, 'try-error')) {
322
    b <- read.table(str2)
323
    b <- b[,2, drop = F] 
324
    b[] <- lapply(b, as.character)
325
    b[] <- lapply(b, tolower)
326
    
327
    l <- as.data.frame(matrix(0, ncol = 1, nrow = (nrow(b))))
328
    
329
    
330
    for (i in 1:nrow(b)) {
331
      if (length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))) > ncol(l)) {
332
        l <- as.data.frame(matrix(0, ncol = length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))), nrow = (nrow(b))))
333
      }
334
      
335
    }
336
    n <- 1
337
    for (i in 1:nrow(b)) {
338
      
339
      n <- as.numeric(as.numeric(ncol(l)) - as.numeric(length( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE)))))
340
      l[i,] <- c( unlist(strsplit(unlist(strsplit (b[i,1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE)), rep("IGNORE", times = as.numeric(n)))
341
    }
342
    
343
    l <- cbind(l, rep(0, times = nrow(l)))
344
    names(l)[ncol(l)] <- "wc"
345
    for (i in 1:nrow(l)) {
346
      l[i,ncol(l)] <- length(grep("IGNORE", l[i,]))
347
    }
348
    
349
    
350
    row1 <- unlist(strsplit(unlist(strsplit(b[which.max(l$wc),1], "[^[:alnum:]<>=./]")), "(?=[<>=./])", perl = TRUE))
351
    
352
    l <- l[1:(ncol(l)-1)]
353
    d <- as.data.frame(matrix(.Machine$double.xmax, ncol = ncol(l), nrow = nrow(l)), stringsAsFactors = F)
354
    listo <- rep(.Machine$double.xmax, times = nrow(l))
355
    
356
    results <- rep('IGNORE', times = ncol(l))
357
    n <- 1
358
    k <- 1
359
    g <- 1
360
    p <- 0
361
    s <- 1
362
    check <- 0
363
    ignorecheck <- 0
364
    switch01 <- 0
365
    while (n <= length(row1)) {
366
      while (k <= length(row1)) {
367
        p <- 0 
368
        if (n > 0) {
369
          if (k > 0) {
370
            for (i in 1:nrow(l)) {
371
              if (i > 0) {
372
                for (v in 1:ncol(l)) {
373
                  for (w in 1:ncol(l)) {  
374
                    if (s == 1) {  
375
                      if ( ((paste(l[i, v:w], collapse = ' ')) == (paste(row1[n:k], collapse = ' '))) ){
376
                        if (w < listo[i]) {  
377
                          p <- p + 1
378
                          listo[i] <- w
379
                          if (n == 1 & v > 1) {
380
                            check <- check + 1
381
                          }
382
                        }
383
                      }
384
                    }else {
385
                      if ( ((paste(l[i, v:w], collapse = ' ')) == (paste(row1[n:k], collapse = ' '))) ){
386
                        if (v > d[i,s-1]) {
387
                          if (w < listo[i]) {
388
                            p <- p + 1
389
                            listo[i] <- w
390
                          }
391
                        }
392
                      }
393
                    } 
394
                  } 
395
                }
396
              }    
397
            }
398
          }
399
        }
400
        if (p >= nrow(b)) {
401
          results[g] <- paste(row1[n:k], collapse = ' ')
402
          if (k == length(row1)) {
403
            n <- k
404
          }
405
          if (k < length(row1)) {
406
            k <- k + 1
407
          }
408
          d[,s] <- listo
409
          listo <- rep(.Machine$double.xmax, times = nrow(l))
410
          ignorecheck <- ignorecheck + 1
411
          switch01 <- 1
412
        }
413
        if ((p < nrow(b))) {
414
          if (n < length(row1)) {
415
            if (switch01 == 0) {
416
              n <- k + 1
417
              k <- n
418
            }
419
            if (switch01 == 1) {
420
              n <- k
421
            }
422
            g <- g + 2
423
            if (d[1,s] != .Machine$double.xmax) {
424
              s <- s + 1
425
            }
426
            listo <- rep(.Machine$double.xmax, times = nrow(l))
427
            switch01 <- 0 
428
          }
429
        }
430
        if (n == length(row1)) {
431
          n <- n + 1
432
          k <- k + 1
433
        }
434
      }
435
    }
436
    if (ignorecheck > 0) {
437
      results <- unlist(strsplit(results, " "))
438
      
439
      results <- as.data.frame((results))
440
      results <- cbind(results, rep("f", times = nrow(results)))
441
      colnames(results)[ncol(results)] <- "tf"
442
      results[] <- lapply(results, as.character)
443
      results <- results[complete.cases(results),]
444
      
445
      for (i in 2:nrow(results)) {
446
        if (results[i,1] == "IGNORE") {
447
          if (results[i,1] == results[(i-1),1]) {
448
            results[i,2] <- "t"
449
          }
450
        }
451
      }
452
      
453
      igcheck <- 0
454
      results <- subset(results, tf == "f")  
455
      results <- as.data.frame(results[,-(ncol(results))])
456
      colnames(results)[1] <- "Template"
457
      results <- as.character(results[,1])
458
      if (length(results) < ncol(l)) {
459
        for (i in (length(results)+1):(ncol(l))) {
460
          results[i] <- "IGNORE"
461
          igcheck <- 1
462
        }
463
      }
464
      
465
      if (check > 0) {
466
        results <- c("IGNORE", results)
467
      }
468
      
469
      seq <- as.data.frame(t(as.data.frame(results)))
470
      seq <- seq[,(1:(ncol(l)))]
471
      seq [] <- lapply(seq, as.character)
472
      l[] <- lapply(l, as.character)
473
      
474
      p <- list(as.character(print(seq[1,])))
475
      p <- as.data.frame(p)
476
      colnames(p)[1] <- "col1"
477
      p <- cbind(p, rep("f", times =nrow(p)))
478
      colnames(p)[2] <- "tf"
479
      p[,2] <- sapply(p[,2], as.character)
480
      for (i in 2:nrow(p)) {
481
        if ((trimws(p[i,1], which = "both")) == "IGNORE") {
482
          if (p[i,1] == p[(i-1),1])
483
            p[i,2] <- "t"
484
        }
485
      }
486
      
487
      
488
      p <- subset(p, tf == "f")
489
      p <- as.character(print(p[,1]))
490
      p <- paste(p,collapse=" ")
491
      w <- strsplit(p, "IGNORE")
492
      w <- as.data.frame(w[[1]])
493
      colnames(w)[1] <- "col1"
494
      w <- subset(w, col1 != " ")
495
      w <- subset(w, col1 != "")
496
      w <- as.character(w[,1])
497
      w <- gsub(" ","", w)
498
      
499
      
500
      non2 <- as.data.frame(matrix(0, ncol = ncol(l), nrow = nrow(l)), stringsAsFactors = F)
501
      d2 <- as.data.frame(matrix("f", ncol = length(w), nrow = nrow(l)), stringsAsFactors = F)
502
      
503
      non2[] <- lapply(non2, as.numeric)
504
      
505
      g <- 1 
506
      
507
      for (j in 1:nrow(l)) {
508
        if (j > 0) {
509
          for (i in 1:length(w)) {
510
            if (i > 0) {
511
              for (k in 1:ncol(l)) {
512
                for (m in 1:ncol(l)) {
513
                  if ( (gsub(" ","",paste(l[j,k:m],collapse=" ")) == w[i]) & (d2[j,i] == "f") ){
514
                    d2[j,i] <- "t"
515
                    if (g == 1) {
516
                      non2[j, g] <- k
517
                      non2[j, g+1] <- m
518
                      g <- g + 2
519
                    }else if ((k > as.numeric(non2[j, (g-1)])) & (m > as.numeric(non2[j, (g-1)]))) {
520
                      non2[j, g] <- k
521
                      non2[j, g+1] <- m
522
                      g <- g + 2
523
                    }
524
                  }
525
                }
526
              }
527
            }
528
          }
529
        }
530
        g <- 1
531
      }
532
      
533
      non3 <- as.data.frame(matrix("IGNORE", ncol = ncol(l), nrow = nrow(l)), stringsAsFactors = F)
534
      
535
      g <- rep(1, times = nrow(non2))
536
      f <- 1
537
      
538
      
539
      for (m in 1:nrow(non2)) {
540
        q <- as.character(print(non2[m,]))
541
        q <- paste(q,collapse=" ")
542
        q <- unlist(strsplit(q, " "))
543
        x <- 1
544
        while (x <= length(q)) {
545
          if (q[x] == "0") {
546
            q <- q[-x]
547
          }else {
548
            x <- x + 1
549
          }
550
        }
551
        q <- list(q)
552
        q <- as.data.frame(q[[1]])
553
        colnames(q)[1] <- "col1"
554
        q <- subset(q, col1 != "")
555
        q[,1] <- sapply(q[,1], as.character)
556
        q[,1] <- sapply(q[,1], as.numeric)
557
        i <- 1
558
        while (i <= nrow(q)) {
559
          if (i == 1) { 
560
            if (as.numeric(q[i,1]-1) >=1) {
561
              non3[m,g[m]] <- paste(as.character(l[m,1:(as.numeric(q[i,1]-1))]), collapse = " ")
562
              g[m] <- g[m] + 1
563
            }else {
564
              g[m] <- 2
565
            }
566
          }
567
          if ((i %% 2 == 0) & (i != nrow(q)) & ((as.numeric(q[i+1,1]-1)) >= (as.numeric(q[i,1]+1)))) {
568
            non3[m,g[m]] <- paste(as.character(l[m,(as.numeric(q[i,1]+1)):(as.numeric(q[i+1,1]-1))]), collapse = " ")
569
            g[m] <- g[m] + 1
570
          }
571
          if ((i %% 2 == 0) & (i != nrow(q)) & ((as.numeric(q[i+1,1]-1)) < (as.numeric(q[i,1]+1)))) {
572
            g[m] <- g[m] + 1
573
          }
574
          if ((i %% 2 == 0) & (i == nrow(q)) & (as.numeric(q[i,1]+1) <= as.numeric(ncol(l)))) {
575
            non3[m,g[m]] <- paste(as.character(l[m,(as.numeric(q[i,1]+1)):(as.numeric(ncol(l)))]), collapse = " ")
576
            g[m] <- g[m] + 1
577
          }
578
          if (i < nrow(q)) {
579
            i <- i + 1
580
          }else {
581
            break
582
          }
583
        }
584
      }
585
      
586
      
587
      for (i in 1:ncol(seq)) {
588
        if (seq[1,i] == "IGNORE") {
589
          seq[1,i] <- "BLANK" 
590
        }
591
      }
592
      
593
      non3 <- as.data.frame(t(non3))
594
      non3 <- cbind(non3, rep("f", times = nrow(non3)))
595
      colnames(non3)[ncol(non3)] <- "tf"
596
      non3[] <- lapply(non3, as.character)
597
      
598
      for (i in 1:nrow(non3)) {
599
        if (paste(as.character(print(non3[i,1:(ncol(non3)-1)])), collapse = " ") == paste(as.character(rep("IGNORE", times = ncol(non3)-1)), collapse = " ")) {
600
          non3[i,ncol(non3)] <- "t" 
601
        }
602
      }
603
      
604
      non3 <- subset(non3, tf == "f")  
605
      non3 <- non3[,-(ncol(non3))]
606
      non3 <- as.data.frame(t(non3))
607
      
608
      seq <- as.data.frame(t(seq))
609
      seq <- cbind(seq, rep("f", times = nrow(seq)))
610
      colnames(seq)[ncol(seq)] <- "tf"
611
      seq[] <- lapply(seq, as.character)
612
      
613
      for (i in 2:nrow(seq)) {
614
        if (seq[i,1] == "BLANK") {
615
          if (seq[i,1] == seq[(i-1),1]) {
616
            seq[i,2] <- "t"
617
          }
618
        }
619
      }
620
      
621
      if (seq[nrow(seq),1] == "BLANK") {
622
        if (igcheck == 1) {
623
          seq[nrow(seq),2] <- "t" 
624
        }
625
      }
626
      
627
      seq <- subset(seq, tf == "f")  
628
      seq <- as.data.frame(seq[,-(ncol(seq))])
629
      
630
      
631
      
632
      
633
      colnames(seq)[1] <- "Template"
634
      seq <- as.data.frame(t(seq))
635
      
636
      
637
      template <- seq
638
      
639
      
640
      
641
      fillins <- non3
642
      
643
      if (ncol(non3) > 0) {
644
        
645
        for (j in 1:ncol(fillins)) {
646
          fillinstest <- as.character(fillins[,j])
647
          for (i in 1:length(fillinstest)) {
648
            fillinstest[i] <- gsub("IGNORE", "", fillinstest[i])
649
          }
650
          fillins[,j] <- fillinstest
651
        }
652
        
653
        
654
        
655
        i <- 1
656
        while (i <=  ncol(fillins)) {
657
          fillins <- as.data.frame(cbind (fillins[,c(1:i)], rep(1, times = nrow(fillins)), fillins[,-c(1:i)]))
658
          i <- i+ 2
659
        }
660
        
661
        for (i in 1:ncol(fillins)) {
662
          if (i %% 2 == 0) {
663
            colnames(fillins)[i] <- "Frequency"
664
            fillins[,i] <- sapply(fillins[,i], as.character)
665
            fillins[,i] <- sapply(fillins[,i], as.numeric)
666
          }
667
        }
668
        for (i in 1:ncol(fillins)) {
669
          if (i %% 2 == 1) {
670
            colnames(fillins)[i] <- paste("Blank", (i+1)/2, collapse ="")
671
            fillins[,i] <- sapply(fillins[,i], as.character)
672
          }
673
        }
674
        for (j in 1:ncol(fillins)) {
675
          for (i in 1:nrow(fillins)) {
676
            if (j %% 2 == 1) {
677
              for (k in 1:i) {
678
                if (k < i) {
679
                  if (trimws(fillins[i, j], which = "both") == trimws(fillins[k,j], which = "both")) {
680
                    fillins[k, j+1] <- fillins[k, j+1] + 1
681
                    fillins[i, j] <- paste("IGNORE", i, collapse ="")
682
                    fillins[i, j + 1] <- 0 
683
                  }
684
                }
685
              }
686
            }
687
          }
688
        }
689
        
690
      }
691
      
692
    } else {
693
      template <- fillins <- as.data.frame(matrix("", ncol = 1, nrow = 1))
694
    }
695
    
696
697
    template <- cbindPad(template, b)
698
    fillins <- cbindPad(fillins, b)
699
    names(template)[ncol(template)] <- "Cluster Text"
700
    names(fillins)[ncol(fillins)] <- "Cluster Text"
701
    setwd("C:/Users/cloud/Downloads/PipelineTestRun5") 
702
    numbochar <- as.character(numbo)
703
    numbotempstr <- paste("template", numbochar, ".csv", sep = "")
704
    numbofillinsstr <- paste("fillins", numbochar, ".csv", sep = "")
705
    write.csv(template, file = numbotempstr, row.names = F)
706
    write.csv(fillins, file = numbofillinsstr, col.names = F)
707
    
708
  }
709
  numbo <- numbo + 1
710
  
711
}