# scripts/SentClusterScript.R
1
########FINAL STUFF############
2
3
library("tm")
4
library("SnowballC")
5
library("ggplot2")
6
library("wordcloud")
7
8
9
10
11
###SENTENCE###
12
13
# ---- Split the cleaned sample into one text file per sentence ----
# Every non-missing, non-empty cell of the CSV is written to its own
# "sent<k>.txt" file. Files are numbered in row-major order: all columns of
# row 1, then all columns of row 2, and so on.
setwd("C:/Users/AI/Downloads")
ctclnb <- read.csv("ctsampleinds_cleaned_nobool.csv", header = FALSE)

setwd("C:/Users/AI/Downloads/ClusterSentFinal1")

# The counter is an integer formatted with "%d". A double counter converted
# with as.character() turns 100000 into "1e+05", which would produce a file
# called "sent1e+05.txt".
n <- 1L
# seq_len() yields an empty sequence for zero rows/columns, whereas 1:0
# would iterate over c(1, 0).
for (i in seq_len(nrow(ctclnb))) {
  for (j in seq_len(ncol(ctclnb))) {
    cell <- ctclnb[i, j]
    # is.na() is tested first because comparing NA with "" yields NA.
    if (!is.na(cell) && cell != "") {
      write.table(
        as.data.frame(cell),
        file = sprintf("sent%d.txt", n),
        row.names = FALSE,
        col.names = FALSE
      )
      n <- n + 1L
    }
  }
}
33
34
35
# ---- Build and normalise the sentence corpus ----
# Each file in the sentence directory becomes one document of the corpus.
sentdocs <- Corpus(DirSource("C:/Users/AI/Downloads/ClusterSentFinal1"))

# Spot-check of one raw document. Guarded so that a corpus with fewer than
# 25 documents does not stop the script with a subscript error.
if (length(sentdocs) >= 25) {
  writeLines(as.character(sentdocs[[25]]))
}

# Content transformer that replaces every match of `pattern` with a space.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Normalisation steps, applied in this order: lower-case, double quotes to
# spaces, English stop-word removal, whitespace collapsing, stemming.
sentdocs <- tm_map(sentdocs, content_transformer(tolower))
sentdocs <- tm_map(sentdocs, toSpace, '"')
sentdocs <- tm_map(sentdocs, removeWords, stopwords("english"))
sentdocs <- tm_map(sentdocs, stripWhitespace)
sentdocs <- tm_map(sentdocs, stemDocument)

# Documents x terms count matrix, converted to a dense matrix for dist().
sentdtm <- DocumentTermMatrix(sentdocs)
sentm <- as.matrix(sentdtm)

# Row labels have the form "<first 3 chars> .. <file name without .txt>",
# e.g. "sen .. sent12". The extension is stripped with a pattern instead of
# fixed character offsets: offsets of nchar - 12 .. nchar - 4 cut leading
# characters off any file name longer than 13 characters (for example
# "sent100000.txt" became "ent100000"), which breaks the later lookup of the
# file by its label.
doc_files <- rownames(sentm)
rownames(sentm) <- sprintf(
  "%s .. %s",
  substring(doc_files, 1, 3),
  sub("\\.txt$", "", doc_files)
)

# Pairwise distances between documents (dist() default method).
sentd <- dist(sentm)
48
# ---- Hierarchical clustering and base-graphics dendrograms ----
setwd("C:/Users/AI/Downloads")
# Persist the full distance matrix for inspection outside R.
write.csv(as.matrix(sentd), file = "distmatclustersentfinal1.csv")

sentgroups <- hclust(sentd, method = "ward.D")
sentgroupsplot <- as.dendrogram(sentgroups)

# Full tree with all leaves hanging from the baseline.
plot(sentgroups, hang = -1)

# Height at which the dendrogram is cut. The plot titles are derived from
# this value so they cannot disagree with the cut that is actually drawn.
cut_height <- 4000
sentcut <- cut(sentgroupsplot, h = cut_height)

# Three stacked panels; the previous layout is restored afterwards so later
# plots on this device are not squeezed into a 3 x 1 grid.
old_par <- par(mfrow = c(3, 1))

plot(sentgroupsplot, main = "Main")
plot(sentcut$upper, main = paste0("Upper tree of cut at h=", cut_height))
# A cut can leave fewer than two lower branches; indexing [[2]] would then
# fail with a subscript error.
if (length(sentcut$lower) >= 2) {
  plot(
    sentcut$lower[[2]],
    main = paste0("Second branch of lower tree with cut at h=", cut_height)
  )
} else {
  message(
    "Cut at h=", cut_height,
    " produced fewer than two lower branches; skipping branch plot."
  )
}

par(old_par)
61
library("ggplot2")
62
library("ggdendro")
63
64
# ---- ggdendro view and table of sentences in dendrogram order ----
ggdendrogram(sentgroups)
sentdata <- dendro_data(sentgroupsplot, type = "rectangle")
head(sentdata$labels)

# Leaf labels as stored in the dendrogram data, e.g. "sen .. sent12".
sentdatalabel <- sentdata[["labels"]][["label"]]

# Removes the "sen .. " prefix from a label, leaving the sentence file stem.
# fixed = TRUE makes the dots literal instead of regex wildcards.
gsubsen <- function(x) {
  gsub("sen .. ", "", x, fixed = TRUE)
}

setwd("C:/Users/AI/Downloads/ClusterSentFinal1")

# Reads the first field of "<file_id>.txt" back as a character string.
read_sentence <- function(file_id) {
  as.character(read.table(paste0(file_id, ".txt"), header = FALSE)[1, 1])
}

# gsub() is vectorised, so the whole label vector is cleaned in one call and
# the column stays a plain character vector (no list column to repair).
sentfiles <- gsubsen(as.character(sentdatalabel))

# One row per leaf: `file` is the file stem, `text` the sentence stored in
# that file. vapply() also copes with an empty label vector, where a
# 1:nrow() loop would try to read rows 1 and 0.
sentdatalabeldf <- data.frame(
  file = sentfiles,
  text = vapply(sentfiles, read_sentence, character(1), USE.NAMES = FALSE),
  stringsAsFactors = FALSE
)
82
83
# ---- Export the ordered sentence table and a wide PDF dendrogram ----
setwd("C:/Users/AI/Downloads")

write.csv(sentdatalabeldf, file = "sentclusterresultsclean.csv")

# 40 x 15 inch page for the full dendrogram.
pdf("pdfgraphsentfinal2.pdf", width = 40, height = 15)
pdf_device <- dev.cur()
# The device is closed in `finally` so a failing plot() cannot leave the PDF
# open (and the file locked or truncated) for the rest of the session.
invisible(tryCatch(
  plot(sentgroups, hang = -1),
  finally = dev.off(pdf_device)
))