[1bd6b5]: helpers/n_grams.py

Download this file

24 lines (18 with data), 714 Bytes

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from pandas import Series
from sklearn.feature_extraction.text import CountVectorizer
def find_longest_common_n_grams(data, min_words, max_words, min_count, min_frequency):
    """Return the longest word n-grams that recur across the documents in *data*.

    Counts every word n-gram of length ``min_words``..``max_words`` (inclusive)
    over all documents, keeps those that occur at least ``min_count`` times in
    total AND more than ``min_frequency * len(data)`` times, then discards any
    surviving n-gram that is a prefix or suffix of a different surviving
    n-gram, so only the longest variants remain.

    Parameters
    ----------
    data : sized iterable of str
        Documents to scan (must support ``len``; presumably raw text strings —
        whatever ``CountVectorizer.fit_transform`` accepts).
    min_words, max_words : int
        Inclusive bounds on n-gram length, in words.
    min_count : int
        Minimum total occurrence count for an n-gram to be kept.
    min_frequency : float
        An n-gram is kept only if its total count strictly exceeds
        ``min_frequency * len(data)``.

    Returns
    -------
    list of str
        The surviving n-grams, in vectorizer feature order.
    """
    if not len(data):
        # No documents: nothing to count, and fit_transform would raise.
        return []

    vectorizer = CountVectorizer(ngram_range=(min_words, max_words), analyzer='word')
    # Sum the document-term matrix down to one total count per n-gram.
    totals = vectorizer.fit_transform(data).sum(0).tolist()[0]

    # get_feature_names() was removed in scikit-learn 1.2 in favor of
    # get_feature_names_out(); support both so the helper works either way.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    counts = Series(totals, index=feature_names, name='counts')

    # BUG FIX: the count threshold was hard-coded to 3, leaving the
    # `min_count` parameter silently unused; honor the caller's value.
    frequent = counts[(counts >= min_count) & (counts > min_frequency * len(data))]

    hits = Series(frequent.index)
    # Keep only maximal hits: drop any hit that is a prefix or suffix of a
    # *different* hit (i.e. it is subsumed by a longer surviving n-gram).
    return [
        hit
        for hit in hits
        if not ((hits.str.startswith(hit) | hits.str.endswith(hit)) & (hits != hit)).any()
    ]