[735bb5]: / src / features / bag_of_words_feature.py

Download this file

67 lines (53 with data), 1.7 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Base Dependencies
# ----------------
import numpy
from typing import List
# Local Dependencies
# ------------------
from models import RelationCollection
# 3rd-Party Dependencies
# ----------------------
from sklearn.feature_extraction.text import CountVectorizer
# Constants
# ---------
# Keyword arguments unpacked into the CountVectorizer constructor.
# Only the vocabulary-size cap is active; the document-frequency cut-offs
# (min_df=0.1, max_df=0.9) are not applied.
CV_CONFIG = dict(max_features=50)
class BagOfWordsFeature:
    """
    Bag of Words

    All words within a relation's middle context.

    The lemmas of every relation's middle tokens are joined into a single
    whitespace-separated string and counted with a scikit-learn
    ``CountVectorizer`` built from ``CV_CONFIG``. The class exposes the usual
    ``fit`` / ``transform`` / ``fit_transform`` trio.
    """

    def __init__(self):
        """Create the (still unfitted) underlying ``CountVectorizer``."""
        self.cv = CountVectorizer(**CV_CONFIG)

    def get_feature_names(self, input_features=None):
        """Return the vocabulary words, each prefixed with ``word_``.

        Args:
            input_features: Accepted but not used by this method.
                NOTE(review): presumably present for compatibility with
                scikit-learn's feature-name API -- confirm against callers.

        Returns:
            A list of strings, one per column produced by ``transform``.
        """
        return ["word_{}".format(f) for f in self.cv.get_feature_names_out()]

    def get_text(self, collection: RelationCollection) -> List[str]:
        """Build one text per entry of ``collection.middle_tokens``.

        Args:
            collection: Object whose ``middle_tokens`` attribute yields
                iterables of tokens exposing a ``lemma_`` attribute
                (presumably spaCy tokens -- TODO confirm).

        Returns:
            A list with the space-joined lemmas of each middle context.
        """
        return [
            " ".join(token.lemma_ for token in doc)
            for doc in collection.middle_tokens
        ]

    def fit(self, x: RelationCollection, y=None):
        """Learn the vocabulary from the middle contexts of ``x``.

        Args:
            x: Collection of relations to learn the vocabulary from.
            y: Ignored.

        Returns:
            This instance, so calls can be chained.
        """
        # CountVectorizer.fit returns the estimator itself, so there is no
        # need to rebind ``self.cv``.
        self.cv.fit(self.get_text(x))
        return self

    def transform(self, x: RelationCollection, y=None) -> numpy.ndarray:
        """Count the learned vocabulary words in the middle contexts of ``x``.

        Args:
            x: Collection of relations to encode.
            y: Ignored.

        Returns:
            A dense array of word counts, one row per middle context.
        """
        # The vectorizer produces a sparse matrix; callers get a dense array.
        return self.cv.transform(self.get_text(x)).toarray()

    def fit_transform(self, x: RelationCollection, y=None) -> numpy.ndarray:
        """Learn the vocabulary from ``x`` and encode it in a single pass.

        Args:
            x: Collection of relations to learn from and encode.
            y: Ignored.

        Returns:
            A dense array of word counts, one row per middle context.
        """
        # The vectorizer produces a sparse matrix; callers get a dense array.
        return self.cv.fit_transform(self.get_text(x)).toarray()