|
a |
|
b/ngrams/parseInputSentence.py |
|
|
1 |
import sys |
|
|
2 |
#from nltk.stem import WordNetLemmatizer |
|
|
3 |
#import nltk |
|
|
4 |
#from nltk.corpus import stopwords |
|
|
5 |
from nltk.tokenize import word_tokenize |
|
|
6 |
|
|
|
7 |
# Tab-separated n-gram table: "<score>\t<token 1>\t<token 2>...".
NGRAM_FILE = "./HIV_ngrams.tsv"

# Inclusive range of n-gram lengths extracted from the user's sentence.
MIN_NGRAM_LEN = 2
MAX_NGRAM_LEN = 7


def parse_ngram_lines(lines):
    """Parse scored n-gram records from an iterable of text lines.

    Each record is ``<score>\\t<token 1>\\t<token 2>...``; the score must be
    an integer.  Reading stops at the first blank line, which is treated as
    the end of the data (later lines are ignored).

    Returns a pair ``(scores, parents)``:

    * ``scores`` maps each n-gram (a tuple of tokens) to its integer score.
    * ``parents`` maps the leading and trailing (n-1)-gram of every n-gram
      with more than two tokens to that longer n-gram.  When several n-grams
      share a sub-gram, the last one read wins.

    Raises ``ValueError`` if a score field is not an integer.
    """
    scores = {}
    parents = {}
    for line in lines:
        record = line.strip()
        if not record:
            # A blank line terminates the table.
            break
        fields = record.split("\t")
        ngram = tuple(fields[1:])
        scores[ngram] = int(fields[0])
        if len(ngram) > 2:
            # Store the parent as a hashable tuple so it can be used
            # directly as a key into ``scores``.
            parents[ngram[1:]] = ngram
            parents[ngram[:-1]] = ngram
    return scores, parents


def load_ngrams(path=NGRAM_FILE):
    """Read the n-gram table at *path* and return ``(scores, parents)``.

    Undecodable bytes are dropped rather than raising, so a partly corrupt
    file still loads.  See :func:`parse_ngram_lines` for the record format.
    """
    with open(path, encoding="utf8", errors="ignore") as in_file:
        return parse_ngram_lines(in_file)


def build_ngrams(words, min_len=MIN_NGRAM_LEN, max_len=MAX_NGRAM_LEN):
    """Return the set of contiguous n-grams of *words*.

    Every run of ``min_len`` to ``max_len`` (inclusive) consecutive tokens is
    returned as a tuple.  Sequences shorter than ``min_len`` yield an empty
    set.
    """
    tokens = list(words)
    ngrams = set()
    for size in range(min_len, max_len + 1):
        # The range is empty when the sentence is shorter than ``size``.
        for start in range(len(tokens) - size + 1):
            ngrams.add(tuple(tokens[start:start + size]))
    return ngrams


def score_ngrams(ngrams, scores):
    """Sum the scores of the distinct *ngrams* that appear in *scores*.

    Each distinct n-gram contributes once, however often it occurs in the
    sentence; n-grams missing from *scores* contribute nothing.
    """
    return sum(scores[ngram] for ngram in set(ngrams) if ngram in scores)


def main(argv=None):
    """Score the sentence given as the first command-line argument.

    Prints the total score of all known 2- to 7-grams found in the
    lower-cased, tokenized sentence.  An empty sentence produces no output;
    a missing argument exits with a usage message on stderr.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: python parseInputSentence.py "<sentence>"')

    sentence = argv[1].strip()
    if not sentence:
        # Nothing to score: exit quietly without loading the table.
        return

    # The sub-gram -> parent map is loaded but not used for scoring; it is
    # available for reporting longer known n-grams that extend an input
    # n-gram.
    scores, _parents = load_ngrams()

    words = word_tokenize(sentence.lower())
    print(score_ngrams(build_ngrams(words), scores))


if __name__ == "__main__":
    main()