ClinicalTrialsElig / Git / [c09aa8] /clusters/scripts/commonwords.py

Models:

joseph-gordon/

ClinicalTrialsElig

Downloads: 1

[c09aa8]: / clusters / scripts / commonwords.py

History

Download this file

32 lines (32 with data), 1.2 kB

from nltk import PorterStemmer
for x in range(300) :
    fin = open("C:/primes/data/unpackedclusters/clust_" + str(x) + ".txt", 'r')
    fout = open("C:/primes/data/commonwords/words_" + str(x) + ".txt", 'w+')
    lines = fin.readlines()
    wordmap = {}
    stemtoword = {}
    letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','-']
    for line in lines :
        l = line.lower()
        word = ""
        for x in l :
            if x in letters :
                word = word + x
            elif (not (word == "")) and (not (word == " ")):
                stem = PorterStemmer().stem(word)
                stemtoword[stem] = word
                if stem in wordmap :
                    wordmap[stem] = wordmap[stem] + 1
                else :
                    wordmap[stem] = 1
                word = ""
        stem = PorterStemmer().stem(word)
        stemtoword[stem] = word
        if stem in wordmap :
            wordmap[stem] = wordmap[stem] + 1
        else :
            wordmap[stem] = 1
    for x in wordmap :
        fout.write(str(stemtoword[x]) + " " + str(wordmap[x]) + "\n")
    fin.close()
    fout.close()