--- /dev/null
+++ b/ngrams/generateNgramsIndications.py
@@ -0,0 +1,61 @@
+import sys
+import math
+from nltk.tokenize import word_tokenize
+
+# Counts word n-grams (n = 2..8) across a tab-separated input file.
+# Each input line: an id in the first column, then one text field per
+# remaining column, each optionally prefixed with "True:" or "False:".
+
+MIN_N = 2
+MAX_N = 8
+
+id_uniqueWords = dict()  # id -> set of unique tokens in that record
+ngram_counts = dict()    # n-gram tuple -> total occurrences over all records
+id_count = 0             # number of records read
+
+
+def strip_label(text):
+    # Remove a leading "True:" or "False:" label if present.
+    # (lstrip("True:") would strip characters, not the prefix as a whole.)
+    for prefix in ("True:", "False:"):
+        if text.startswith(prefix):
+            return text[len(prefix):]
+    return text
+
+
+with open(sys.argv[1], encoding="utf8", errors='ignore') as in_file:
+    for line in in_file:
+        if line.strip() == "":
+            break  # stop at the first blank line
+        id_count += 1
+        fields = line.split("\t")
+        id_key = fields[0]
+        id_uniqueWords[id_key] = set()
+        for f in fields[1:]:
+            words = word_tokenize(strip_label(f).lower())
+            id_uniqueWords[id_key].update(words)
+            # Count every n-gram of length MIN_N..MAX_N starting at each position.
+            for i in range(len(words)):
+                for n in range(MIN_N, MAX_N + 1):
+                    if i + n > len(words):
+                        break
+                    gram = tuple(words[i:i + n])
+                    ngram_counts[gram] = ngram_counts.get(gram, 0) + 1
+
+# Print every n-gram seen more than once: its count averaged over the
+# number of records (rounded up), followed by the n-gram tokens.
+for k, v in ngram_counts.items():
+    if v > 1:
+        print(str(math.ceil(float(v) / id_count)) + "\t" + "\t".join(k))
+
+exit()
+
+# Unreachable: pairwise vocabulary overlap between records, kept from an
+# earlier version of the script.
+print("id\t" + "\t".join(id_uniqueWords.keys()))
+for k1, v1 in id_uniqueWords.items():
+    avg_list = []
+    for k2, v2 in id_uniqueWords.items():
+        avg = (len(v1) + len(v2)) / 2.0
+        avg_list.append(str(int(100 * (len(v1.intersection(v2)) / avg))))
+    print(k1 + "\t" + "\t".join(avg_list))
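
Usage sketch, inferred from the parsing code above (the file name and the sample row are illustrative, not part of the patch):

    python ngrams/generateNgramsIndications.py indications.tsv

where each line of the input looks roughly like

    ID001<TAB>True:first indication text<TAB>False:second snippet

The script prints one line per n-gram (n = 2..8) that occurs more than once: its occurrence count divided by the number of records and rounded up, then the n-gram's tokens, all tab-separated. Depending on the NLTK version, the punkt tokenizer data may need to be downloaded first (nltk.download('punkt')) for word_tokenize to work.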