# Base Dependencies
# -----------------
import numpy
from typing import List

# Local Dependencies
# ------------------
from models import RelationCollection

# 3rd-Party Dependencies
# ----------------------
from sklearn.feature_extraction.text import CountVectorizer

# Constants
# ---------
CV_CONFIG = {
    # "min_df": 0.1,
    # "max_df": 0.9,
    "max_features": 50,
}


class BagOfWordsFeature:
    """
    Bag of Words

    All words within a relation's middle context.
    """

    def __init__(self):
        self.cv = CountVectorizer(**CV_CONFIG)

    def get_feature_names(self, input_features=None) -> List[str]:
        """Returns the learned vocabulary terms prefixed with 'word_'."""
        names = []
        for f in self.cv.get_feature_names_out():
            names.append("word_{}".format(f))
        return names

    def get_text(self, collection: RelationCollection) -> List[str]:
        """Joins the lemmas of each relation's middle-context tokens into one string per relation."""
        contexts = []
        for doc in collection.middle_tokens:
            tokens = " ".join([t.lemma_ for t in doc])
            contexts.append(tokens)
        return contexts

    def fit(self, x: RelationCollection, y=None):
        """Fits the vectorizer's vocabulary on the collection's middle contexts."""
        texts = self.get_text(x)
        self.cv = self.cv.fit(texts)
        return self

    def transform(self, x: RelationCollection, y=None) -> numpy.ndarray:
        """Transforms the collection into a dense matrix of word counts."""
        texts = self.get_text(x)
        X = self.cv.transform(texts)
        X = X.toarray()
        # X = list(X)
        # X /= numpy.max(numpy.abs(X), axis=0)
        return X

    def fit_transform(self, x: RelationCollection, y=None) -> numpy.ndarray:
        """Fits the vectorizer and transforms the collection in a single step."""
        texts = self.get_text(x)
        X = self.cv.fit_transform(texts)
        X = X.toarray()
        # X = list(X)
        # X /= numpy.max(numpy.abs(X), axis=0)
        return X
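

# Usage sketch (illustrative, not part of the module's API): constructing a
# RelationCollection depends on the local `models` package, so the
# `train_collection` / `test_collection` names below are assumed to already
# exist, with `middle_tokens` yielding spaCy-style Doc objects.
#
#     bow = BagOfWordsFeature()
#     X_train = bow.fit_transform(train_collection)  # dense (n_relations, <=50) count matrix
#     X_test = bow.transform(test_collection)
#     names = bow.get_feature_names()                # ["word_<term>", ...]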