a b/src/training/rf.py
1
# Base Dependencies
2
# -----------------
3
import numpy as np
4
import time
5
from typing import Optional
6
from pathlib import Path
7
from os.path import join
8
from joblib import dump, load
9
10
# Package Dependencies
11
# --------------------
12
from .base import BaseTrainer
13
from .config import ALExperimentConfig, PLExperimentConfig
14
from .utils import compute_metrics, random_sampling
15
16
# Local Dependencies
17
# -------------------
18
from features import RandomForestFeaturesNegation
19
from models.relation_collection import RelationCollection
20
from ml_models.rf import (
21
    RandomForestClassifierOneStage,
22
)
23
24
# 3rd-Party Dependencies
25
# --------------------
26
import neptune
27
from modAL.models import ActiveLearner
28
from modAL.uncertainty import uncertainty_sampling
29
from modAL.batch import uncertainty_batch_sampling
30
31
# Constants
32
# ---------
33
from constants import RFQueryStrategy
34
from config import NEPTUNE_API_TOKEN, NEPTUNE_PROJECT
35
36
37
# Auxiliary Functions
# -------------------
39
def _get_query_strategy(q: RFQueryStrategy):
    """Return the query function associated with a query strategy.

    Args:
        q (RFQueryStrategy): strategy selected for the experiment.

    Returns:
        the sampling callable matching `q`.

    Raises:
        ValueError: if `q` is none of the supported strategies.
    """
    # ordered (strategy, sampler) pairs, compared with `==` one after another
    supported = (
        (RFQueryStrategy.RANDOM, random_sampling),
        (RFQueryStrategy.LC, uncertainty_sampling),
        (RFQueryStrategy.BATCH_LC, uncertainty_batch_sampling),
    )
    for strategy, sampler in supported:
        if q == strategy:
            return sampler

    raise ValueError("Query strategy not supported")
48
49
50
# Trainer Class
51
# -------------
52
class RandomForestTrainer(BaseTrainer):
    """RandomForestTrainer

    Trainer for the Random Forest model. Provides a passive learning routine
    (a single fit on the whole train split) and a pool-based active learning
    routine built on modAL's `ActiveLearner`.
    """

    def __init__(
        self,
        dataset: str,
        train_dataset: RelationCollection,
        test_dataset: RelationCollection,
        relation_type: Optional[str] = None,
    ):
        """
        Args:
            dataset (str): name of the dataset, e.g., "n2c2".
            train_dataset (RelationCollection): train split of the dataset.
            test_dataset (RelationCollection): test split of the dataset.
            relation_type (str, optional): relation type. Defaults to None.

        Raises:
            ValueError: if the name dataset provided is not supported
                (presumably raised by the base class; not visible here).
        """
        super().__init__(dataset, train_dataset, test_dataset, relation_type)

        # feature encoder
        self.f_encoder = RandomForestFeaturesNegation(self.dataset)

    def _init_model(self):
        """Creates a fresh, unfitted Random Forest classifier for the dataset."""
        return RandomForestClassifierOneStage(self.dataset)

    @property
    def method_name(self) -> str:
        """Short identifier of the method."""
        return "rf"

    @property
    def method_name_pretty(self) -> str:
        """Human-readable name of the method."""
        return "Random Forest"

    def train_passive_learning(
        self, config: PLExperimentConfig, logging: bool = True, save_model: bool = False
    ) -> RandomForestClassifierOneStage:
        """Trains the RF model using passive learning

        Args:
            config (PLExperimentConfig): configuration of the experiment. It is
                accepted for interface compatibility but not read by this method.
            logging (bool, optional): determines if logging should be done. Defaults to True.
            save_model (bool, optional): determines if the model should be saved. Defaults to False.

        Returns:
            RandomForestClassifierOneStage: trained model
        """
        run = None
        if logging:
            # Connect to Neptune and create a run
            run = neptune.init_run(project=NEPTUNE_PROJECT, api_token=NEPTUNE_API_TOKEN)

        try:
            # print info
            self.print_info_passive_learning()

            # init model
            model = self._init_model()

            # fit model
            X_train = self.f_encoder.fit_transform(self.train_dataset)
            y_train = self.train_dataset.labels
            model = model.fit(X_train, y_train)

            # predict
            X_test = self.f_encoder.transform(self.test_dataset)
            y_test = self.test_dataset.labels
            y_pred_train = model.predict(X_train)
            y_pred = model.predict(X_test)

            # compute metrics
            train_metrics = self.compute_metrics(y_true=y_train, y_pred=y_pred_train)
            test_metrics = self.compute_metrics(y_true=y_test, y_pred=y_pred)

            self.print_train_metrics(train_metrics)
            self.print_test_metrics(test_metrics)

            # save model
            if save_model:
                dump(model, Path(self.pl_checkpoint_path) / "model.joblib")

            if run is not None:
                run["method"] = self.method_name
                run["dataset"] = self.dataset
                run["relation"] = self.relation_type
                run["strategy"] = "passive learning"

                for key, value in train_metrics.items():
                    run["train/" + key] = value

                for key, value in test_metrics.items():
                    run["test/" + key] = value

                run["model/parameters"] = model.get_params()
        finally:
            # close the Neptune run even if training or evaluation failed
            if run is not None:
                run.stop()

        return model

    def train_active_learning(
        self,
        query_strategy: RFQueryStrategy,
        config: ALExperimentConfig,
        save_models: bool = False,
        verbose: bool = True,
        logging: bool = True,
    ):
        """Trains the RF model using active learning

        Args:
            query_strategy (RFQueryStrategy): strategy used to query the most informative instances.
            config (ALExperimentConfig): configuration of the AL experiment.
            save_models (bool, optional): determines if the initial model and the model
                of every step should be saved. Defaults to False.
            verbose (bool, optional): determines if the experiment info and the metrics
                of every step should be printed. Defaults to True.
            logging (bool, optional): determines if the experiment should be logged
                to Neptune. Defaults to True.

        Raises:
            ValueError: if `query_strategy` not supported
        """
        # resolve the strategy first so that an unsupported value fails
        # before a Neptune run is opened
        f_query_strategy = _get_query_strategy(query_strategy)

        run = None
        if logging:
            run = neptune.init_run(project=NEPTUNE_PROJECT, api_token=NEPTUNE_API_TOKEN)

        try:
            self._run_active_learning(
                f_query_strategy=f_query_strategy,
                query_strategy=query_strategy,
                config=config,
                save_models=save_models,
                verbose=verbose,
                run=run,
            )
        finally:
            # close the Neptune run even if the experiment failed midway
            if run is not None:
                run.stop()

    def _run_active_learning(
        self, f_query_strategy, query_strategy, config, save_models, verbose, run
    ):
        """Runs the active learning experiment; `run` is the Neptune run or None."""
        # setup
        init_query_size = self.compute_init_q_size(config)
        query_size = self.compute_q_size(config)
        al_steps = self.compute_al_steps(config)

        if verbose:
            self.print_info_active_learning(
                q_strategy=query_strategy.value,
                pool_size=self.n_instances,
                init_q_size=init_query_size,
                q_size=query_size,
            )

        # Isolate training examples for labelled dataset. Sampling is done
        # without replacement: duplicated indices would put the same instance
        # several times in the labelled set and shrink its effective size.
        init_query_indices = np.random.choice(
            self.n_instances,
            size=min(init_query_size, self.n_instances),
            replace=False,
        )
        active_collection = self.train_dataset[init_query_indices]
        X_active = self.f_encoder.fit_transform(active_collection)
        y_active = active_collection.labels

        # Isolate the non-training examples we'll be querying.
        pool_indices = np.delete(
            np.arange(self.n_instances), init_query_indices, axis=0
        )
        pool_collection = self.train_dataset[pool_indices]
        X_pool = self.f_encoder.transform(pool_collection)
        y_pool = pool_collection.labels

        # Specify the core estimator along with its active learning model
        learner = ActiveLearner(
            estimator=self._init_model(),
            X_training=X_active,
            y_training=y_active,
            query_strategy=f_query_strategy,
        )

        if save_models:
            self._save_al_checkpoint(learner, X_active, y_active, "model_init.joblib")

        # evaluate init model
        X_test = self.f_encoder.transform(self.test_dataset)
        y_test = self.test_dataset.labels
        y_pred = learner.predict(X_test)

        # NOTE(review): the initial evaluation uses the module-level
        # `compute_metrics` while the loop uses `self.compute_metrics`;
        # confirm both produce the same metric keys.
        init_metrics = compute_metrics(
            y_true=y_test, y_pred=y_pred, average=self.metrics_average
        )

        if verbose:
            self.print_al_iteration_metrics(step=0, metrics=init_metrics)

        if run is not None:
            run["method"] = self.method_name
            run["dataset"] = self.dataset
            run["relation"] = self.relation_type
            run["strategy"] = query_strategy.value
            for key, value in init_metrics.items():
                run["test/" + key].append(value)

            self._log_annotation_progress(run, active_collection)

        # Active Learning Loop
        for index in range(al_steps):
            init_step_time = time.time()

            # query most informative examples
            init_query_time = time.time()
            n_instances = min(query_size, X_pool.shape[0])
            query_index, _ = learner.query(X_pool, n_instances=n_instances)
            X_query = X_pool[query_index]
            y_query = y_pool[query_index]
            query_time = time.time() - init_query_time

            # compute accuracy on query
            y_query_pred = learner.predict(X_query)
            step_acc = self.compute_step_accuracy(y_true=y_query, y_pred=y_query_pred)

            # compute average prediction score for true label on query
            query_probs = learner.estimator.predict_proba(X_query)
            step_score = self._mean_true_label_score(query_probs, y_query)

            # move queried instances from pool to training
            active_collection = active_collection + pool_collection[query_index]

            # train model on new training data (the encoder is refitted too)
            init_train_time = time.time()
            X_active = self.f_encoder.fit_transform(active_collection)
            y_active = active_collection.labels
            learner.fit(X=X_active, y=y_active)
            train_time = time.time() - init_train_time

            if save_models:
                self._save_al_checkpoint(
                    learner, X_active, y_active, "model_{}.joblib".format(index)
                )

            # remove the queried instances from the unlabeled pool.
            remaining = np.delete(
                np.arange(len(pool_collection)), query_index, axis=0
            )
            pool_exhausted = remaining.size == 0
            if not pool_exhausted:
                pool_collection = pool_collection[remaining]
                # the encoder was refitted, so the pool must be re-encoded
                X_pool = self.f_encoder.transform(pool_collection)
                # keep the labels aligned with the shrunk pool; otherwise the
                # next query would index the labels of different instances
                y_pool = pool_collection.labels

            # calculate and report our model's precision, recall and f1-score.
            X_test = self.f_encoder.transform(self.test_dataset)
            y_pred = learner.predict(X_test)

            # compute metrics
            step_metrics = self.compute_metrics(y_true=y_test, y_pred=y_pred)

            step_time = time.time() - init_step_time

            if verbose:
                self.print_al_iteration_metrics(step=index + 1, metrics=step_metrics)

            if run is not None:
                run["model/parameters"].append(learner.estimator.get_params())

                for key, value in step_metrics.items():
                    run["test/" + key].append(value)

                run["times/step_time"].append(step_time)
                run["times/train_time"].append(train_time)
                run["times/query_time"].append(query_time)

                run["train/step_acc"].append(step_acc)
                run["train/step_score"].append(step_score)

                self._log_annotation_progress(run, active_collection)

            # stop only after the last trained model has been evaluated and logged
            if pool_exhausted:
                break
        # end of active learning loop

    def _save_al_checkpoint(self, learner, X_active, y_active, filename: str):
        """Dumps the estimator, the encoder and the labelled data into the AL checkpoint folder."""
        dump(
            {
                "model": learner.estimator,
                "f_encoder": self.f_encoder,
                "X_active": X_active,
                "y_active": y_active,
            },
            Path(self.al_checkpoint_path) / filename,
        )

    def _log_annotation_progress(self, run, active_collection):
        """Logs the labelled fraction of instances, tokens and characters of the train split."""
        run["annotation/instance_ann"].append(
            active_collection.n_instances / self.n_instances
        )
        run["annotation/token_ann"].append(active_collection.n_tokens / self.n_tokens)
        run["annotation/char_ann"].append(
            active_collection.n_characters / self.n_characters
        )

    @staticmethod
    def _mean_true_label_score(query_probs, y_query):
        """Averages the predicted probability found at the true label's column."""
        # NOTE(review): the label value is used directly as a column index of
        # `predict_proba`; this assumes labels match the estimator's class
        # order - confirm. Labels beyond the available columns count as 0.0.
        scores = []
        for probs, label in zip(query_probs, y_query):
            try:
                scores.append(probs[label])
            except IndexError:
                scores.append(0.0)
        return np.mean(scores)