# ngrams/generateNgramsIndications.py  (diff-viewer header removed)
import sys
import math

# Third-party tokenizer used for splitting indication text into words.
from nltk.tokenize import word_tokenize
# Earlier lemmatization/stopword experiments, kept disabled for reference:
#from nltk.stem import WordNetLemmatizer
#import nltk
#from nltk.corpus import stopwords

#stop_words = set(stopwords.words('english'))

# Disabled experiment state, kept for reference:
#raw_word_count=dict()
#lemma_word_count=dict()
#lemma_raw_connect=dict()
# Shared state for the n-gram scan below.
id_uniqueWords = dict()  # record id -> set of its unique words (used by the disabled similarity code)
ngram_counts = dict()    # n-gram tuple -> number of occurrences across all records

# Per-order n-gram totals; the scan below fills orders 2 through 8.
gram2_count = 0
gram3_count = 0
gram4_count = 0
gram5_count = 0
gram6_count = 0
gram7_count = 0
# BUG FIX: this line previously re-initialized gram7_count a second time,
# leaving gram8_count undefined and crashing with NameError on the first
# 8-gram encountered (`gram8_count += 1` in the scan loop).
gram8_count = 0
gram9_count = 0
gram10_count = 0

# Number of non-blank input records read.
id_count = 0
# Scan the input file (path in argv[1]); each line is <id>\t<field>\t<field>...
# Tokenizes each field and counts every n-gram of order 2-8 into ngram_counts,
# plus per-order totals. Stops at the first blank line.
order_totals = dict((n, 0) for n in range(2, 9))  # order -> total n-grams seen

with open(sys.argv[1], encoding="utf8", errors='ignore') as in_file:
    for line in in_file:
        if line.strip() == "":
            break  # first blank line terminates the record section
        id_count += 1
        parts = line.split("\t")
        id_key = parts[0]
        fields = parts[1:]
        id_uniqueWords[id_key] = set([])
        for f in fields:
            # BUG FIX: the original used f.lstrip("True:").lstrip("False:"),
            # but str.lstrip treats its argument as a *character set*, not a
            # prefix — e.g. "False:eruption" lost the leading "e" and "r" of
            # "eruption". Strip the exact label prefix instead.
            text = f
            for prefix in ("True:", "False:"):
                if text.startswith(prefix):
                    text = text[len(prefix):]
            words = word_tokenize(text.lower())
            max_len = len(words)
            for i in range(max_len):
                # Count every n-gram (orders 2-8) starting at position i.
                # BUG FIX: the original's hand-unrolled 7-gram branch tested
                # `gram6 in ngram_counts` but then updated gram7, raising
                # KeyError on the first occurrence of any 7-gram.
                for n in range(2, 9):
                    if i + n > max_len:
                        break  # no longer n-gram fits at this position
                    gram = tuple(words[i:i + n])
                    order_totals[n] += 1
                    ngram_counts[gram] = ngram_counts.get(gram, 0) + 1

# Publish the per-order totals under their historical module-level names
# (referenced by the disabled reporting variants further down).
gram2_count = order_totals[2]
gram3_count = order_totals[3]
gram4_count = order_totals[4]
gram5_count = order_totals[5]
gram6_count = order_totals[6]
gram7_count = order_totals[7]
gram8_count = order_totals[8]
# Report every n-gram that occurred more than once: a weight of
# ceil(occurrences / number of records), then the n-gram's words,
# all tab-separated on one line.
for gram, occurrences in ngram_counts.items():
    if occurrences <= 1:
        continue  # singletons are noise; skip them
    weight = math.ceil(float(occurrences) / id_count)
    print("\t".join([str(weight)] + list(gram)))
# The script's job ends with the n-gram report above; the pairwise
# similarity matrix below is intentionally disabled by this early exit.
exit()

# Pairwise word-overlap similarity between records: percentage of shared
# unique words relative to the average vocabulary size of the two records.
print("id\t" + "\t".join(id_uniqueWords.keys()))
for k1, v1 in id_uniqueWords.items():
    avg_list = []
    for k2, v2 in id_uniqueWords.items():
        # NOTE(review): raises ZeroDivisionError if both word sets are empty
        # — confirm every record contributes at least one word.
        avg = (len(v1) + len(v2)) / 2.0
        avg_list.append(str(int(100 * (len(v1.intersection(v2)) / avg))))
    # BUG FIX: the original printed k2 (the last key left over from the
    # *inner* loop) and omitted the tab separator; the row label must be the
    # outer key k1 to line up with the header row printed above.
    print(k1 + "\t" + "\t".join(avg_list))