Diff of /src/features/__init__.py [000000] .. [735bb5]

Switch to side-by-side view

--- a
+++ b/src/features/__init__.py
@@ -0,0 +1,319 @@
+# Base Dependencies
+# -----------------
+import numpy
+from typing import Dict
+
+# Package Dependencies
+# -----------------
+# distance features
+from .token_distance_feature import TokenDistanceFeature
+from .char_distance_feature import CharDistanceFeature
+from .punct_distance_feature import PunctuationFeature
+from .position_feature import PositionFeature
+from .relative_distance_feature import RelativeDistanceFeature
+
+# word-based features
+from .bag_of_entities_feature import BagOfEntitiesFeature
+from .bag_of_words_feature import BagOfWordsFeature
+from .bag_of_verbs_feature import BagOfVerbsFeature
+
+# text representations
+from .wei_text_feature import WeiTextFeature
+
+# embeddings
+from .iob_feature import IOBFeature
+from .word_to_index import WordToIndex
+from .entity_embedding import EntityEmbedding
+from .sentence_embedding import SentenceEmbedding
+
+# semantic
+from .pos_feature import POSFeature
+from .dep_feature import DEPFeature
+from .negation_feature import NegationFeature
+from .negated_entities_feature import NegatedEntitiesFeature
+from .dependency_tree import DependencyTree
+from .dep_adjancency_matrix import DependencyAdjacencyMatrix
+from .sent_has_but_feature import SentHasButFeature
+
+# others
+from .character_length_feature import CharacterLengthFeature
+from .token_length_feature import TokenLengthFeature
+
+# Local Dependencies
+# -------------------
+from models.relation_collection import RelationCollection
+from vocabulary import Vocabulary
+
+# 3rd-Party Dependencies
+# ----------------------
+from sklearn.base import BaseEstimator
+
+
+# RandomForestFeatures
+# --------------------
+class RandomForestFeatures(BaseEstimator):
+    """Random Forest Features
+
+    Generates the features for the Random Forest model. This features are a subset
+    of those used in `Alimova and Tutubalina (2020) - Multiple features for clinical
+    relation extraction: A machine learning approach`
+
+    """
+
+    def __init__(self, dataset: str):
+
+        # distance
+        self.token_distance_feature = TokenDistanceFeature()
+        self.char_distance_feature = CharDistanceFeature()
+        self.punctuation_feature = PunctuationFeature()
+        self.position_feature = PositionFeature(dataset=dataset)
+
+        # word-base
+        self.bag_of_entities_feature = BagOfEntitiesFeature(dataset=dataset)
+        self.bag_of_words_feature = BagOfWordsFeature()
+
+    def get_feature_names(self, input_features=None):
+        names = []
+        names = names + self.token_distance_feature.get_feature_names()
+        names = names + self.char_distance_feature.get_feature_names()
+        names = names + self.punctuation_feature.get_feature_names()
+        names = names + self.position_feature.get_feature_names()
+        names = names + self.bag_of_entities_feature.get_feature_names()
+        names = names + self.bag_of_words_feature.get_feature_names()
+        return names 
+    
+    def fit(self, x: RelationCollection, y=None):
+        # distance
+        self.token_distance_feature = self.token_distance_feature.fit(x)
+        self.char_distance_feature = self.char_distance_feature.fit(x)
+        self.punctuation_feature = self.punctuation_feature.fit(x)
+        self.position_feature = self.position_feature.fit(x)
+
+        # word-base
+        self.bag_of_entities_feature = self.bag_of_entities_feature.fit(x)
+        self.bag_of_words_feature = self.bag_of_words_feature.fit(x)
+
+        return self
+
+    def transform(self, x: RelationCollection) -> numpy.array:
+        # distance
+        token_distance_feature = self.token_distance_feature.transform(x)
+        char_distance_feature = self.char_distance_feature.transform(x)
+        punctuation_feature = self.punctuation_feature.transform(x)
+        position_feature = self.position_feature.transform(x)
+
+        # word-base
+        bag_of_entities_feature = self.bag_of_entities_feature.transform(x)
+        bag_of_words_feature = self.bag_of_words_feature.transform(x)
+
+        features = numpy.concatenate(
+            (
+                token_distance_feature,
+                char_distance_feature,
+                punctuation_feature,
+                position_feature,
+                bag_of_entities_feature,
+                bag_of_words_feature,
+            ),
+            axis=1,
+        )
+
+        assert features.shape[0] == len(x)
+
+        return features
+
+    def fit_transform(self, x: RelationCollection, y=None) -> numpy.array:
+        # distance
+        token_distance_feature = self.token_distance_feature.fit_transform(x)
+        char_distance_feature = self.char_distance_feature.fit_transform(x)
+        punctuation_feature = self.punctuation_feature.fit_transform(x)
+        position_feature = self.position_feature.fit_transform(x)
+
+        # word-base
+        bag_of_entities_feature = self.bag_of_entities_feature.fit_transform(x)
+        bag_of_words_feature = self.bag_of_words_feature.fit_transform(x)
+
+        features = numpy.concatenate(
+            (
+                token_distance_feature,
+                char_distance_feature,
+                punctuation_feature,
+                position_feature,
+                bag_of_entities_feature,
+                bag_of_words_feature,
+            ),
+            axis=1,
+        )
+
+        assert features.shape[0] == len(x)
+
+        return features
+
+
+class RandomForestFeaturesNegation(RandomForestFeatures):
+    """Random Forest Features with Negation"""
+
+    def __init__(self, dataset: str):
+        super().__init__(dataset)
+
+        # negation
+        # self.negation_feature = NegationFeature()
+        self.negated_entities = NegatedEntitiesFeature()
+        self.has_but = SentHasButFeature()
+
+    def get_feature_names(self, input_features=None):
+        names = super().get_feature_names()
+        # names = names + self.negation_feature.get_feature_names()
+        names = names + self.negated_entities.get_feature_names()
+        names = names + self.has_but.get_feature_names()
+        return names
+    
+    def fit(self, x: RelationCollection, y=None):
+        super().fit(x)
+
+        # negation
+        # self.negation_feature = self.negation_feature.fit(x)
+        self.negated_entities = self.negated_entities.fit(x)
+        self.has_but = self.has_but.fit(x)
+
+        return self
+
+    def transform(self, x: RelationCollection):
+        features = super().transform(x)
+
+        # negation
+        # negation_feature = self.negation_feature.transform(x)
+        negated_entities = self.negated_entities.transform(x)
+        has_but = self.has_but.transform(x)
+
+        features = numpy.concatenate(
+            (features, negated_entities, has_but),  # negation_feature,
+            axis=1,
+        )
+
+        return features
+
+    def fit_transform(self, x: RelationCollection):
+        features = super().fit_transform(x)
+
+        # negation
+        # negation_feature = self.negation_feature.fit_transform(x)
+        negated_entities = self.negated_entities.fit_transform(x)
+        has_but = self.has_but.fit_transform(x)
+
+        features = numpy.concatenate(
+            (features, negated_entities, has_but),  # negation_feature,
+            axis=1,
+        )
+
+        return features
+
+
+# BilstmFeatures
+# --------------
+class BilstmFeatures(BaseEstimator):
+    """BiLSTM Features
+
+    Generates the feautes for the BiLSTM model. These features correspond to
+    the ones used in `Hasan et al. - Integrating Text Embedding with Traditional NLP
+    Features for Clinical Relation Extraction`
+    """
+
+    def __init__(self, dataset: str, vocab: Vocabulary):
+        self.dataset = dataset
+        self.vocab = vocab
+
+        self.relative_distance = RelativeDistanceFeature()
+        self.iob = IOBFeature(dataset, vocab.pad_index)
+        self.pos = POSFeature(vocab.pad_index)
+        self.dep = DEPFeature(vocab.pad_index)
+
+        self.word2index = WordToIndex(vocab)
+        self.char_length = CharacterLengthFeature()
+
+    def fit(self, x: RelationCollection, y=None):
+
+        self.relative_distance = self.relative_distance.fit(x)
+        self.iob = self.iob.fit(x)
+        self.pos = self.pos.fit(x)
+        self.dep = self.dep.fit(x)
+        self.word2index = self.word2index.fit(x)
+
+        return self
+
+    def transform(self, x: RelationCollection) -> Dict:
+        rd1, rd2 = self.relative_distance.transform(x)
+        iob = self.iob.transform(x)
+        pos = self.pos.transform(x)
+        dep = self.dep.transform(x)
+        e1, e2, sent = self.word2index.transform(x)
+        seq_length = [len(s) for s in sent]
+        char_length = self.char_length.transform(x)
+
+        return {
+            "rd1": rd1,
+            "rd2": rd2,
+            "iob": iob,
+            "pos": pos,
+            "dep": dep,
+            "e1": e1,
+            "e2": e2,
+            "sent": sent,
+            "seq_length": seq_length,
+            "char_length": char_length,
+        }
+
+    def fit_transform(self, x: RelationCollection, y=None) -> Dict:
+        rd1, rd2 = self.relative_distance.fit_transform(x)
+        iob = self.iob.fit_transform(x)
+        pos = self.pos.fit_transform(x)
+        dep = self.dep.fit_transform(x)
+        e1, e2, sent = self.word2index.fit_transform(x)
+        seq_length = numpy.array([len(s) for s in sent])
+        char_length = self.char_length.fit_transform(x)
+
+        return {
+            "rd1": rd1,
+            "rd2": rd2,
+            "iob": iob,
+            "pos": pos,
+            "dep": dep,
+            "e1": e1,
+            "e2": e2,
+            "sent": sent,
+            "seq_length": seq_length,
+            "char_length": char_length,
+        }
+
+
+# BertFeatures
+# --------------
+class BertFeatures(BaseEstimator):
+    """BERT Features
+
+    Generates the features for the Bert model.
+    """
+
+    def __init__(self):
+        self.char_length = CharacterLengthFeature()
+        self.token_length = TokenLengthFeature()
+        self.wei_text = WeiTextFeature()
+
+    def fit(self, x: RelationCollection, y=None):
+        return self
+
+    def transform(self, x: RelationCollection) -> Dict:
+        return {
+            "sentence": self.wei_text.transform(x),
+            "text": [r.text for r in x.relations],
+            "char_length": self.char_length.transform(x),
+            "seq_length": self.token_length.transform(x),
+        }
+
+    def fit_transform(self, x: RelationCollection, y=None) -> Dict:
+        return {
+            "sentence": self.wei_text.fit_transform(x),
+            "text": [r.text for r in x.relations],
+            "char_length": self.char_length.fit_transform(x),
+            "seq_length": self.token_length.fit_transform(x),
+        }