[c09aa8]: / clusters / scripts / commonwords.py

Download this file

32 lines (32 with data), 1.2 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from nltk import PorterStemmer
# Number of cluster files to process: clust_0.txt .. clust_299.txt.
CLUSTER_COUNT = 300

# Characters that may appear inside a word. Everything else (whitespace,
# digits, punctuation, non-ASCII letters) acts as a separator. The hyphen is
# included, so hyphenated terms are kept as a single word.
WORD_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz-")


def _split_words(line):
    """Yield each maximal run of WORD_CHARS found in the lower-cased *line*.

    Empty runs are never yielded, including at the end of the line, so a
    trailing newline does not produce a spurious empty word.
    """
    current = []
    for char in line.lower():
        if char in WORD_CHARS:
            current.append(char)
        elif current:
            yield "".join(current)
            current = []
    # A line that does not end in a separator still has a pending word.
    if current:
        yield "".join(current)


def _count_stems(lines, stem):
    """Count how often each stem occurs in *lines*.

    *stem* is a callable mapping a word to its stem. Returns a pair
    ``(counts, surface)``: ``counts`` maps stem -> number of occurrences and
    ``surface`` maps stem -> the most recently seen word with that stem
    (later occurrences overwrite earlier ones).
    """
    counts = {}
    surface = {}
    for line in lines:
        for word in _split_words(line):
            key = stem(word)
            surface[key] = word
            counts[key] = counts.get(key, 0) + 1
    return counts, surface


# One stemmer is enough for the whole run; no need to rebuild it per word.
_stemmer = PorterStemmer()

for cluster_index in range(CLUSTER_COUNT):
    in_path = "C:/primes/data/unpackedclusters/clust_" + str(cluster_index) + ".txt"
    out_path = "C:/primes/data/commonwords/words_" + str(cluster_index) + ".txt"
    # 'with' guarantees both files are closed even if reading, stemming or
    # writing raises part-way through a cluster.
    with open(in_path, 'r') as fin, open(out_path, 'w') as fout:
        counts, surface = _count_stems(fin, _stemmer.stem)
        # One "<word> <count>" line per distinct stem, written using a
        # representative surface form rather than the raw stem.
        for key in counts:
            fout.write(str(surface[key]) + " " + str(counts[key]) + "\n")