# ngrams/parseInputSentence.py
1
import sys
2
#from nltk.stem import WordNetLemmatizer
3
#import nltk
4
#from nltk.corpus import stopwords
5
from nltk.tokenize import word_tokenize
6
7
# Module-level state shared by the rest of the script.
# Set of n-grams (tuples of tokens) extracted from the user's sentence.
userInput_ngram = set()
# Maps an n-gram (tuple of tokens) to its integer score.
dict_ngrams_score = {}
# Maps a shortened n-gram (first or last token dropped) to the longer
# n-gram it was derived from.
sub_ngrams_link = {}
10
11
12
# Load the scored n-gram table.  Each line is tab-separated: the integer
# score comes first, followed by the tokens of the n-gram.
with open("./HIV_ngrams.tsv", encoding="utf8", errors='ignore') as in_file1:
    for line in in_file1:
        stripped = line.strip()
        if stripped == "":
            # Reading stops at the first blank line; anything after it is
            # ignored.  NOTE(review): presumably the blank line marks the end
            # of the data -- confirm this should not be `continue` instead.
            break
        fields = stripped.split("\t")
        ngram = tuple(fields[1:])
        dict_ngrams_score[ngram] = int(fields[0])
        if len(ngram) > 2:
            # Link both shortened forms (first token dropped / last token
            # dropped) to the full n-gram.  The value is stored as a flat
            # tuple of tokens -- the same form used as the key of
            # dict_ngrams_score -- so it can be looked up there directly.
            # (Previously the token list was wrapped in a 1-tuple,
            # `(['a', 'b', 'c'],)`, which is unhashable and cannot be used
            # as a key.)  A later line with the same shortened form
            # overwrites the earlier link.
            sub_ngrams_link[ngram[1:]] = ngram
            sub_ngrams_link[ngram[:-1]] = ngram
21
22
# The sentence to score is the first command-line argument.
if len(sys.argv) < 2:
    # Without this guard a missing argument surfaced as an IndexError
    # traceback; fail with a short usage message on stderr instead
    # (sys.exit with a string still yields a non-zero exit status).
    sys.exit("usage: %s SENTENCE" % sys.argv[0])
if sys.argv[1].strip() == "":
    # Nothing to score: exit quietly without printing anything.
    # sys.exit() is used rather than the interactive helper exit(), which
    # is only installed by the `site` module.
    sys.exit()
# Tokenize the trimmed, lower-cased sentence.
# NOTE(review): presumably the n-gram table is lower-cased as well -- confirm.
words = word_tokenize(sys.argv[1].strip().lower())
25
# Collect every contiguous n-gram of 2 to 7 tokens from the input sentence
# into userInput_ngram (as tuples of tokens).
# The previous unrolled version added `gram5` a second time in the 6-gram
# branch, so 6-grams were never collected; looping over the sizes handles
# every length uniformly.
max_len = len(words)
for size in range(2, 8):
    # A window of `size` tokens fits at start offsets 0 .. max_len - size;
    # the range is empty when the sentence has fewer than `size` tokens.
    for start in range(max_len - size + 1):
        userInput_ngram.add(tuple(words[start:start + size]))
45
46
# Keep only the input n-grams that have an entry in the score table.
scroredSentence = {gram for gram in userInput_ngram if gram in dict_ngrams_score}
#print (userInput_ngram)
#print (dict_ngrams_score.keys())
# The sentence score is the sum of the scores of all matched n-grams
# (0 when nothing matched).
score = sum(dict_ngrams_score[gram] for gram in scroredSentence)

# The score is the script's only output on stdout.
print(score)
54
55
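# Disabled debug output: for each input n-gram that is also a key of
# sub_ngrams_link, print the n-gram, its linked longer n-gram and that
# n-gram's score.
# NOTE(review): this code was never enabled in this form -- verify the shape
# of the sub_ngrams_link values before turning it back on.
#plus1gram=userInput_ngram.intersection(sub_ngrams_link.keys())
#for gram in plus1gram:
#   print ("\t".join(gram)+"\t:\t"+"\t".join(sub_ngrams_link[gram])+"\t:\t"+str(dict_ngrams_score[sub_ngrams_link[gram]]))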