Switch to unified view

a b/scripts/AllClinTrialRun(NOTWORKING).R
1
library("tm")
2
library("SnowballC")
3
library("ggplot2")
4
library("wordcloud")
5
library("stringi")
6
7
# ---- Export every eligibility sentence to its own text file ----
# Reads column 4 of the bullet-point CSV, transliterates it to ASCII, and
# writes each usable entry to allsent<k>.txt (k = 1, 2, ...) in the
# ClusterSentAllCTFinal2 directory.
setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads")

thingcsv <- read.csv("allInclusionExclusionBulletPoint.csv", encoding = "latin1")
thingcsv <- as.data.frame(thingcsv[, 4])
thingcsv[] <- lapply(thingcsv, as.character)
# stri_trans_general() is vectorised, so the whole column is converted in one
# call instead of one row at a time.
thingcsv[[1]] <- stringi::stri_trans_general(thingcsv[[1]], "latin-ascii")

setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads/ClusterSentAllCTFinal2")

# Skip missing cells, empty strings and the placeholder values "#NAME?" / "All".
sentences <- thingcsv[[1]]
sentences <- sentences[!is.na(sentences) & !(sentences %in% c("", "#NAME?", "All"))]

# n is the running file number. It is kept as an integer and formatted with
# "%d": converting a double counter with as.character() yields "1e+05" for
# 100000, which produced a file called allsent1e+05.txt.
n <- 1L
for (sentence in sentences) {
  str <- sprintf("allsent%d.txt", n)
  write.table(
    data.frame(sentence, stringsAsFactors = FALSE),
    file = str,
    row.names = FALSE,
    col.names = FALSE
  )
  n <- n + 1L
}
28
29
30
# Load every file in the sentence directory as one tm corpus.
allsentdocs <- Corpus(DirSource("/afs/athena.mit.edu/user/w/i/williame/Downloads/ClusterSentAllCTFinal2"))

# Spot check: print document 25. Guarded so that a corpus with fewer than 25
# documents does not abort the script with a subscript error.
if (length(allsentdocs) >= 25L) {
  writeLines(as.character(allsentdocs[[25]]))
}
32
33
# ---- Text preprocessing and document-term matrix ----

# Content transformer: replaces every match of `pattern` in a document's text
# with a single space.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Cleaning steps, applied in this order: lower-case, turn double-quote
# characters into spaces, drop English stop words, collapse whitespace, stem.
allsentdocs <- tm_map(allsentdocs, content_transformer(tolower))
allsentdocs <- tm_map(allsentdocs, toSpace, "\"")
allsentdocs <- tm_map(allsentdocs, removeWords, stopwords("english"))
allsentdocs <- tm_map(allsentdocs, stripWhitespace)
allsentdocs <- tm_map(allsentdocs, stemDocument)

# Term counts per document, then the same data as a dense matrix.
allsentdtm <- DocumentTermMatrix(allsentdocs)
allsentm <- as.matrix(allsentdtm)
41
# ---- Row labels, distance matrix ----
# Relabel each row as "<first three characters> .. <name without its 4-character
# extension>", e.g. "allsent100.txt" -> "all .. allsent100".
# The stem is taken from position 1: starting at nchar - 12 cut off the front
# of the stem as soon as the file number had three or more digits
# ("allsent100.txt" became "llsent100").
doc_ids <- rownames(allsentm)
rownames(allsentm) <- paste(
  substring(doc_ids, 1, 3),
  rep("..", nrow(allsentm)),
  substring(doc_ids, 1, nchar(doc_ids) - 4)
)

# Pairwise distances between documents (dist() defaults), saved as a CSV.
allsentd <- dist(allsentm)
setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads")
write.csv(as.matrix(allsentd), file = "distmatclusterallsentfinal2.csv")
45
# ---- Hierarchical clustering and base-graphics dendrograms ----
allsentgroups <- hclust(allsentd, method = "ward.D")
allsentgroupsplot <- as.dendrogram(allsentgroups)

# Full tree with all leaves hanging from the baseline.
plot(allsentgroups, hang = -1)

# Height at which the dendrogram is cut. The plot titles are built from this
# value so they can no longer disagree with the cut that was actually made
# (they previously said h=1000 and h=3000 while the cut used h=4000).
cut_height <- 4000

# Three stacked panels; the previous layout is restored afterwards.
old_par <- par(mfrow = c(3, 1))

plot(allsentgroupsplot, main = "Main")

# Cut once and reuse the result for both panels.
tree_parts <- cut(allsentgroupsplot, h = cut_height)
plot(tree_parts$upper,
     main = paste0("Upper tree of cut at h=", cut_height))
# The second lower branch only exists when the cut splits the tree.
if (length(tree_parts$lower) >= 2L) {
  plot(tree_parts$lower[[2]],
       main = paste0("Second branch of lower tree with cut at h=", cut_height))
}

par(old_par)
56
library("ggplot2")
57
library("ggdendro")
58
59
# ---- ggdendro view and label table ----
# Draw the clustering with ggdendro and extract the dendrogram's plotting data.
ggdendrogram(allsentgroups)
allsentdata <- dendro_data(allsentgroupsplot, type = "rectangle")
head(allsentdata$labels)

# Leaf labels taken from the dendrogram data.
allsentdatalabel <- allsentdata[["labels"]][["label"]]

# Two-column frame: the label and an "n/a" placeholder for the sentence text.
# Built directly with named character columns; as.data.frame(cbind(...)) gave
# the placeholder column a name deparsed from the rep(...) expression.
allsentdatalabeldf <- data.frame(
  allsentdatalabel = as.character(allsentdatalabel),
  text = rep("n/a", times = length(allsentdatalabel)),
  stringsAsFactors = FALSE
)
67
# Remove the literal prefix "all .. " from the start of each element of x.
# Vectorised over x; elements without the prefix are returned unchanged.
# The pattern is anchored and its dots are escaped: the previous unanchored
# regex "all .. " let each dot match any character and also deleted matches
# in the middle of a string.
gsuballsen <- function(x) {
  sub("^all \\.\\. ", "", x)
}
68
# ---- Attach the original sentence text to every dendrogram label ----
# Strip the label prefix in one vectorised call so the column stays a plain
# vector rather than becoming a list.
allsentdatalabeldf$allsentdatalabel <- gsuballsen(allsentdatalabeldf$allsentdatalabel)

# Read the sentence files from the directory they were written to. This
# previously pointed at "C:/Users/AI/Downloads/ClusterSentAllCTFinal2", a path
# on a different machine than every other path in the script.
setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads/ClusterSentAllCTFinal2")

colnames(allsentdatalabeldf)[c(1, 2)] <- c("file", "text")
allsentdatalabeldf[] <- lapply(allsentdatalabeldf, as.character)

# For each label, read "<file>.txt" and keep the first field of its first row.
# seq_len() makes the loop a no-op when the table is empty.
for (i in seq_len(nrow(allsentdatalabeldf))) {
  sentence_file <- paste0(allsentdatalabeldf[i, 1], ".txt")
  allsentdatalabeldf[i, 2] <- as.character(
    read.table(sentence_file, header = FALSE)[1, 1]
  )
}
76
77
# ---- Write the results ----
# Both output files are written relative to the Downloads directory.
setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads")

# Label/text table, one row per dendrogram leaf.
write.csv(x = allsentdatalabeldf, file = "allsentclusterresultsclean.csv")

# Full dendrogram on a 40 x 15 page.
pdf(file = "pdfgraphallsentfinal2.pdf", width = 40, height = 15)
plot(x = allsentgroups, hang = -1)
dev.off()
85