--- a +++ b/clusters/scripts/commonwords.py @@ -0,0 +1,32 @@ +from nltk import PorterStemmer +for x in range(300) : + fin = open("C:/primes/data/unpackedclusters/clust_" + str(x) + ".txt", 'r') + fout = open("C:/primes/data/commonwords/words_" + str(x) + ".txt", 'w+') + lines = fin.readlines() + wordmap = {} + stemtoword = {} + letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','-'] + for line in lines : + l = line.lower() + word = "" + for x in l : + if x in letters : + word = word + x + elif (not (word == "")) and (not (word == " ")): + stem = PorterStemmer().stem(word) + stemtoword[stem] = word + if stem in wordmap : + wordmap[stem] = wordmap[stem] + 1 + else : + wordmap[stem] = 1 + word = "" + stem = PorterStemmer().stem(word) + stemtoword[stem] = word + if stem in wordmap : + wordmap[stem] = wordmap[stem] + 1 + else : + wordmap[stem] = 1 + for x in wordmap : + fout.write(str(stemtoword[x]) + " " + str(wordmap[x]) + "\n") + fin.close() + fout.close() \ No newline at end of file