# scripts/MetClusterScript.R
1
library(data.table)
2
library(tm)
3
library(SnowballC)
4
library(ggplot2)
5
library(wordcloud)
6
# Load the raw export. It is read without a header row, and string columns
# are kept as character vectors instead of being converted to factors.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads")
mt <- read.csv("sample_output_full (1).csv", header = FALSE,
               stringsAsFactors = FALSE)
# Relative paths used after this point resolve inside this directory until
# the working directory is changed again.
setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads/ClusterMetFinal2")
# Append a marker column (it becomes the last column of mt): "t" for every
# row in which any cell contains the text "Processing text", "f" otherwise.
# Every column is converted to character first.
mt[] <- lapply(mt, as.character)
# grepl() works on a whole column at once and returns FALSE for NA cells, so
# one pass per column replaces the cell-by-cell double loop. It also copes
# with an empty table, where 1:nrow(mt) would iterate over c(1, 0).
has_marker <- Reduce(
  `|`,
  lapply(mt, function(column) grepl("Processing text", column)),
  rep(FALSE, nrow(mt))
)
mtcol <- rep("f", times = nrow(mt))
mtcol[has_marker] <- "t"
mt <- cbind(mt, mtcol, stringsAsFactors = FALSE)
# Split mt at the marker rows (last column != "f") and write the rows
# strictly between two consecutive markers to mt1.txt, mt2.txt, ... in the
# current working directory. The marker row itself is not included.
#
# NOTE(review): rows after the last marker are never written by this loop,
# so the final document is dropped -- confirm this is intended.
# NOTE(review): row 1 is never part of a body file because the first segment
# starts at row 2; presumably row 1 is always a marker row -- verify.
marker_flag <- mt[[ncol(mt)]]
marker_rows <- which(!is.na(marker_flag) & marker_flag != "f")
# A marker in row 1 only opens the first segment; it closes nothing.
marker_rows <- marker_rows[marker_rows > 1]
n <- 1
k <- 1
# Integer counter: a double would be formatted as "1e+05" at 100000 and
# produce a file name the later "mt\\d+" pattern cannot match.
p <- 1L
for (i in marker_rows) {
  k <- i - 1
  str <- paste0("mt", p, ".txt")
  # seq_len() yields no rows when two markers are adjacent (k == n). The old
  # (n + 1):k counted backwards in that case and wrote two marker rows; an
  # empty file is written instead, which keeps the file numbering aligned.
  body_rows <- n + seq_len(k - n)
  write.table(as.data.frame(mt[body_rows, ]), file = str,
              row.names = FALSE, col.names = FALSE)
  n <- k + 1
  p <- p + 1L
}

# Interactive sanity check: TRUE when the first row is itself a marker row.
mt[1, ncol(mt)] != "f"

# Build a tm corpus with one document per file in the ClusterMetFinal2
# directory, normalise the text, and convert it to a dense document-term
# matrix (metm).
metdocs <- Corpus(DirSource("/afs/athena.mit.edu/user/w/i/williame/Downloads/ClusterMetFinal2"))
# Spot-check of one document. Guarded so the script does not abort with a
# subscript error when the corpus holds fewer than 25 documents.
if (length(metdocs) >= 25) {
  writeLines(as.character(metdocs[[25]]))
}

metdocs <- tm_map(metdocs, content_transformer(tolower))
# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})
# Double quotes are replaced by spaces before stop-word removal.
metdocs <- tm_map(metdocs, toSpace, '"')
metdocs <- tm_map(metdocs, removeWords, stopwords("english"))
metdocs <- tm_map(metdocs, stripWhitespace)
metdocs <- tm_map(metdocs, stemDocument)
metdtm <- DocumentTermMatrix(metdocs)
metm <- as.matrix(metdtm)
# Shorten the row labels to "<first 3 chars> .. <chars from 12 before the end
# up to 4 before the end>" of the original row name, joined by single spaces.
doc_names <- rownames(metm)
name_len <- nchar(doc_names)
rownames(metm) <- paste(
  substring(doc_names, 1, 3),
  rep("..", nrow(metm)),
  substring(doc_names, name_len - 12, name_len - 4)
)

# Pairwise distances between the rows of the document-term matrix, saved as
# a full square matrix in the Downloads directory.
metd <- dist(metm)
setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads")
dist_square <- as.matrix(metd)
write.csv(dist_square, file = "distmatclustermetfinal2.csv")

# Hierarchical clustering with the "ward.D" method; the dendrogram form is
# kept for the cut and ggdendro steps further down.
metgroups <- hclust(metd, method = "ward.D")
metgroupsplot <- as.dendrogram(metgroups)
plot(metgroups, hang = -1)
# Three stacked panels: the full dendrogram, the upper tree after cutting at
# cut_height, and the second branch of the lower part of that cut.
# The titles are built from cut_height; they previously said h=1000 while
# the cut was actually made at h=100.
cut_height <- 100
# par() returns the previous settings so the layout can be restored below.
old_par <- par(mfrow = c(3, 1))

# Cut once and reuse the result for both panels.
metcut <- cut(metgroupsplot, h = cut_height)

plot(metgroupsplot, main = "Main")
plot(metcut$upper,
     main = paste0("Upper tree of cut at h=", cut_height))
# NOTE(review): this assumes the cut yields at least two lower branches.
plot(metcut$lower[[2]],
     main = paste0("Second branch of lower tree with cut at h=", cut_height))
par(old_par)
library("ggplot2")
66
library("ggdendro")
67
68
# Draw the clustering with ggdendro, then extract the leaf labels from the
# dendrogram data.
ggdendrogram(metgroups)
metdata <- dendro_data(metgroupsplot, type = "rectangle")
head(metdata$labels)

metdatalabel <- metdata[["labels"]][["label"]]

# Two-column table: the leaf label plus a placeholder "n/a" column that is
# filled in later.
metdatalabeldf <- as.data.frame(metdatalabel)
metdatalabeldf <- as.data.frame(cbind(metdatalabeldf, rep("n/a", times = nrow(metdatalabeldf))))
# Removes the "mt<digits> .. " prefix from a label.
gsubmet <- function(x) {
  gsub("mt\\d+ .. ", "", x)
}
# gsub() is already vectorised; calling it on the whole column keeps it an
# atomic character vector instead of the list column lapply() would create.
metdatalabeldf$metdatalabel <- gsubmet(metdatalabeldf$metdatalabel)
# Second pass over mt: write each segment, this time including the marker
# row that opens it, to mt1text.txt, mt2text.txt, ... in ClusterMetFinal3Text.
#
# NOTE(review): as in the first split, rows after the last marker are never
# written -- confirm this is intended.
setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads/ClusterMetFinal3Text")
marker_flag <- mt[[ncol(mt)]]
marker_rows <- which(!is.na(marker_flag) & marker_flag != "f")
# A marker in row 1 only opens the first segment; it closes nothing.
marker_rows <- marker_rows[marker_rows > 1]
n <- 1
k <- 1
# Integer counter: a double would be formatted as "1e+05" at 100000.
p <- 1L
for (i in marker_rows) {
  k <- i - 1
  str <- paste0("mt", p, "text", ".txt")
  # k >= n always holds here, so n:k never counts backwards.
  write.table(as.data.frame(mt[n:k, ]), file = str,
              row.names = FALSE, col.names = FALSE)
  n <- k + 1
  p <- p + 1L
}
# Name the two columns and make sure both are plain character vectors.
colnames(metdatalabeldf)[c(1, 2)] <- c("file", "text")
metdatalabeldf[] <- lapply(metdatalabeldf, as.character)

# Removes the "'Processing text_000N_<digits>.tx.1:" prefix from a string.
gsubproc <- function(x) {
  gsub("\'Processing text_000N_\\d+.tx.1:", "", x)
}

# Reads "<label>text.txt" from the working directory and returns the cleaned
# value found in row 1, column 2. fill = TRUE lets files with rows of
# unequal length be read.
read_marker_text <- function(label) {
  segment <- read.table(paste0(label, "text", ".txt"),
                        header = FALSE, fill = TRUE)
  gsubproc(as.character(segment[1, 2]))
}

# Spot-check on the first label. It now uses the same reader as the main
# pass, so it no longer fails on a ragged file the main pass would accept.
test1 <- if (nrow(metdatalabeldf) > 0) {
  read_marker_text(metdatalabeldf[1, 1])
} else {
  NA_character_
}

# vapply() replaces the 1:nrow() loop, which would iterate over c(1, 0) on an
# empty table, and guarantees exactly one string per label.
metdatalabeldf[["text"]] <- vapply(
  metdatalabeldf[["file"]],
  read_marker_text,
  character(1),
  USE.NAMES = FALSE
)

# Save the label/text table and a wide PDF of the dendrogram in Downloads.
setwd("/afs/athena.mit.edu/user/w/i/williame/Downloads")

write.csv(metdatalabeldf, file = "metclusterresultsclean3.csv")

pdf("pdfgraphsmetfinal1.pdf", width = 40, height = 15)
# `finally` closes the PDF device even if plot() fails, so a failed run does
# not leave a device open.
invisible(tryCatch(
  plot(metgroups, hang = -1),
  finally = dev.off()
))