|
a |
|
b/ngrams/generateNgramsIndications.py |
|
|
1 |
import sys |
|
|
2 |
#from nltk.stem import WordNetLemmatizer |
|
|
3 |
#import nltk |
|
|
4 |
#from nltk.corpus import stopwords |
|
|
5 |
from nltk.tokenize import word_tokenize |
|
|
6 |
import math |
|
|
7 |
|
|
|
8 |
#stop_words = set(stopwords.words('english'))

# ---------------------------------------------------------------------------
# Module-level state shared by the counting loop and the report below.
# ---------------------------------------------------------------------------

# id -> set of unique tokenized words for that id (populated per line below;
# NOTE(review): the .add() call that would fill these sets is commented out,
# so every set stays empty — confirm whether that is intentional).
id_uniqueWords = dict()
# n-gram tuple (2..8 words) -> number of occurrences across the whole input.
ngram_counts = dict()

# Total number of n-grams seen, per length n. Only the commented-out
# per-length normalisation at the bottom of the file reads these, but the
# names are kept for it. gram9_count / gram10_count are never incremented.
gram2_count = 0
gram3_count = 0
gram4_count = 0
gram5_count = 0
gram6_count = 0
gram7_count = 0
# BUG FIX: this line was a duplicate `gram7_count=0`, which left gram8_count
# undefined and made the later `gram8_count += 1` raise NameError on the
# first 8-gram encountered.
gram8_count = 0
gram9_count = 0
gram10_count = 0

# Number of non-empty input lines (one id per line) processed.
id_count = 0
|
|
28 |
|
|
|
29 |
|
|
|
30 |
|
|
|
31 |
# ---------------------------------------------------------------------------
# Read the tab-separated input file named by argv[1]. Each line is
# "<id>\t<field>\t<field>..."; every field is tokenized and all word
# n-grams with 2 <= n <= MAX_N are tallied into ngram_counts.
#
# BUG FIXES versus the original hand-unrolled branches:
#   * the 7-gram branch tested `if gram6 in ngram_counts` instead of gram7,
#     which always took the increment path (gram6 had just been inserted)
#     and raised KeyError on the first occurrence of any 7-gram;
#   * gram8_count was never initialised, so the 8-gram branch raised
#     NameError (fixed here by assigning all totals at the end);
#   * `f.lstrip("True:")` strips the character SET {T,r,u,e,:}, not the
#     prefix, mangling e.g. "True:urgent" into "gent" — replaced with real
#     prefix removal.
# ---------------------------------------------------------------------------
MAX_N = 8
_PREFIXES = ("True:", "False:")
gram_len_totals = dict.fromkeys(range(2, MAX_N + 1), 0)

with open(sys.argv[1], encoding="utf8", errors='ignore') as in_file:
    for line in in_file:
        # NOTE(review): a blank line stops processing entirely (original
        # behaviour) — confirm `continue` was not intended instead.
        if line.strip() == "":
            break
        id_count += 1
        id_key = line.split("\t")[0]
        fields = line.split("\t")[1:]
        id_uniqueWords[id_key] = set([])
        for f in fields:
            # Drop an optional "True:"/"False:" label, then tokenize the
            # lower-cased remainder.
            cleaned = f
            for prefix in _PREFIXES:
                if cleaned.startswith(prefix):
                    cleaned = cleaned[len(prefix):]
                    break
            words = word_tokenize(cleaned.lower())
            max_len = len(words)
            for i, w in enumerate(words):
                # Count every n-gram starting at position i; once one
                # length no longer fits, no longer length can.
                for n in range(2, MAX_N + 1):
                    if i + n > max_len:
                        break
                    gram = tuple(words[i:i + n])
                    gram_len_totals[n] += 1
                    ngram_counts[gram] = ngram_counts.get(gram, 0) + 1

# Expose the per-length totals under the original module-level names so the
# (currently commented-out) per-length normalisation still finds them.
gram2_count = gram_len_totals[2]
gram3_count = gram_len_totals[3]
gram4_count = gram_len_totals[4]
gram5_count = gram_len_totals[5]
gram6_count = gram_len_totals[6]
gram7_count = gram_len_totals[7]
gram8_count = gram_len_totals[8]
|
|
93 |
|
|
|
94 |
# Emit one line per repeated n-gram: the per-id ceiling frequency, then the
# n-gram's words, all tab-separated. Singleton n-grams are skipped.
for gram, occurrences in ngram_counts.items():
    if occurrences <= 1:
        continue
    freq_per_id = math.ceil(float(occurrences) / id_count)
    print("\t".join([str(freq_per_id), *gram]))
|
|
99 |
#if len(k)==2: |
|
|
100 |
#print ("\t".join(k)+"\t"+str(math.ceil(float(v)/gram2_count))) |
|
|
101 |
#print (str(math.ceil(float(v)/gram2_count))+"\t" +"\t".join(k)) |
|
|
102 |
#elif len(k)==3: |
|
|
103 |
#print ("\t".join(k)+"\t"+str(math.ceil(float(v)/gram3_count))) |
|
|
104 |
#print (str(math.ceil(float(v)/gram2_count))+"\t" +"\t".join(k)) |
|
|
105 |
#elif len(k)==4: |
|
|
106 |
#print ("\t".join(k)+"\t"+str(math.ceil(float(v)/gram4_count))) |
|
|
107 |
#print (str(math.ceil(float(v)/gram2_count))+"\t" +"\t".join(k)) |
|
|
108 |
#elif len(k)==5: |
|
|
109 |
#print ("\t".join(k)+"\t"+str(math.ceil(float(v)/gram5_count))) |
|
|
110 |
#print (str(math.ceil(float(v)/gram2_count))+"\t" +"\t".join(k)) |
|
|
111 |
#elif len(k)==6: |
|
|
112 |
#print ("\t".join(k)+"\t"+str(math.ceil(float(v)/gram6_count))) |
|
|
113 |
#print (str(math.ceil(float(v)/gram2_count))+"\t" +"\t".join(k)) |
|
|
114 |
|
|
|
115 |
|
|
|
116 |
|
|
|
117 |
# NOTE(review): exit() makes everything below dead code — the pairwise
# word-overlap matrix is never printed. exit() is kept so the script's
# observable behaviour is unchanged; the fixes below matter only if it is
# ever removed.
exit()
# Header row: "id" followed by every id.
print("id\t" + "\t".join(id_uniqueWords.keys()))
for k1, v1 in id_uniqueWords.items():
    avg_list = []
    for k2, v2 in id_uniqueWords.items():
        avg = (len(v1) + len(v2)) / 2.0
        # Percentage word overlap, normalised by the mean set size.
        # BUG FIX: guard avg == 0 (two empty sets) against ZeroDivisionError;
        # with the current pipeline every set IS empty, since the .add()
        # that would populate id_uniqueWords is commented out above.
        if avg == 0:
            overlap = 0
        else:
            overlap = int(100 * (len(v1.intersection(v2)) / avg))
        avg_list.append(str(overlap))
    # BUG FIX: the original printed k2 (the *inner* loop key, i.e. whatever
    # id was iterated last) with no tab before the values; the row label
    # must be the outer id k1, separated from the values by a tab.
    print(k1 + "\t" + "\t".join(avg_list))