[1bd6b5]: helpers/n_grams.py

Download this file

24 lines (18 with data), 714 Bytes

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from pandas import Series
from sklearn.feature_extraction.text import CountVectorizer
def find_longest_common_n_grams(data, min_words, max_words, min_count, min_frequency):
    """Return the longest word n-grams that recur across the documents in *data*.

    Counts every word n-gram of length ``min_words``..``max_words`` (inclusive)
    over all documents, keeps those that occur at least ``min_count`` times in
    total AND more than ``min_frequency * len(data)`` times, then discards any
    surviving n-gram that is a prefix or suffix of a different surviving
    n-gram, so only the longest variants remain.

    Parameters
    ----------
    data : sized iterable of str
        Documents to scan (must support ``len``; presumably raw text strings —
        whatever ``CountVectorizer.fit_transform`` accepts).
    min_words, max_words : int
        Inclusive bounds on n-gram length, in words.
    min_count : int
        Minimum total occurrence count for an n-gram to be kept.
    min_frequency : float
        An n-gram is kept only if its total count strictly exceeds
        ``min_frequency * len(data)``.

    Returns
    -------
    list of str
        The surviving n-grams, in vectorizer feature order.
    """
    if not len(data):
        # No documents: nothing to count, and fit_transform would raise.
        return []

    vectorizer = CountVectorizer(ngram_range=(min_words, max_words), analyzer='word')
    # Sum the document-term matrix down to one total count per n-gram.
    totals = vectorizer.fit_transform(data).sum(0).tolist()[0]

    # get_feature_names() was removed in scikit-learn 1.2 in favor of
    # get_feature_names_out(); support both so the helper works either way.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    counts = Series(totals, index=feature_names, name='counts')

    # BUG FIX: the count threshold was hard-coded to 3, leaving the
    # `min_count` parameter silently unused; honor the caller's value.
    frequent = counts[(counts >= min_count) & (counts > min_frequency * len(data))]

    hits = Series(frequent.index)
    # Keep only maximal hits: drop any hit that is a prefix or suffix of a
    # *different* hit (i.e. it is subsumed by a longer surviving n-gram).
    return [
        hit
        for hit in hits
        if not ((hits.str.startswith(hit) | hits.str.endswith(hit)) & (hits != hit)).any()
    ]