[735bb5]: /src/ml_models/bilstm/model.py

# Base Dependencies
# -----------------
from typing import Dict

# Package Dependencies
# --------------------
from .embeddings import Embeddings, RDEmbeddings
from .encoders import RecurrentEncoder
from .config import EmbeddingConfig, RDEmbeddingConfig, LSTMConfig

# Local Dependencies
# ------------------
from vocabulary import Vocabulary

# PyTorch Dependencies
# --------------------
from torch import nn, Tensor, concat, mean


# Model
# -----
class HasanModel(nn.Module):
    """
    Implementation of the BiLSTM model described in `Hasan et al. (2020) - Integrating
    Text Embedding with Traditional NLP Features for Clinical Relation Extraction`
    """

    def __init__(
        self,
        vocab: Vocabulary,
        lstm_config: LSTMConfig,
        bioword2vec_config: EmbeddingConfig,
        rd_config: RDEmbeddingConfig,
        pos_config: EmbeddingConfig,
        dep_config: EmbeddingConfig,
        iob_config: EmbeddingConfig,
        num_classes: int = 2,
        clf_dropout: float = 0.25,
    ):
        """Initializes the model

        Args:
            vocab (Vocabulary): vocabulary object
            lstm_config (LSTMConfig): configuration for the LSTM encoder
            bioword2vec_config (EmbeddingConfig): configuration for the BioWord2Vec embedding
            rd_config (RDEmbeddingConfig): configuration for the Relative Distance embedding
            pos_config (EmbeddingConfig): configuration for the POS embedding
            dep_config (EmbeddingConfig): configuration for the DEP embedding
            iob_config (EmbeddingConfig): configuration for the IOB embedding
            num_classes (int, optional): number of output classes. Defaults to 2.
            clf_dropout (float, optional): dropout rate of the classifier. Defaults to 0.25.
        """
        super(HasanModel, self).__init__()

        # attributes
        self.vocab = vocab
        self.lstm_config = lstm_config
        self.bioword2vec_config = bioword2vec_config
        self.rd_config = rd_config
        self.pos_config = pos_config
        self.dep_config = dep_config
        self.iob_config = iob_config
        self.num_classes = num_classes
        self.num_directions = 2 if self.lstm_config.bidirectional else 1
        self.clf_hidden_dim = 64
        self.clf_dropout = clf_dropout

        # embedding layers
        self.wv_embedding = Embeddings(**self.bioword2vec_config.__dict__)
        self.rd_embedding = RDEmbeddings(**self.rd_config.__dict__)
        self.pos_embedding = Embeddings(**self.pos_config.__dict__)
        self.dep_embedding = Embeddings(**self.dep_config.__dict__)
        self.iob_embedding = Embeddings(**self.iob_config.__dict__)

        # BiLSTM encoder
        self.lstm = RecurrentEncoder(rnn_type="lstm", **self.lstm_config.__dict__)

        # classifier
        self.fc = nn.Sequential(
            nn.Dropout(p=self.clf_dropout),
            nn.Linear(self.clf_input_dim, self.clf_hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=self.clf_dropout),
            nn.Linear(self.clf_hidden_dim, self.num_classes),
            nn.ReLU(),
            nn.Sigmoid(),
        )

        # load pretrained embeddings
        self.wv_embedding.load_from_file(self.bioword2vec_config.emb_path, self.vocab)

    @property
    def clf_input_dim(self) -> int:
        """Input dimensions of the classifier"""
        return (self.num_directions * self.lstm_config.hidden_size) + (
            2 * self.wv_embedding.embedding_dim
        )

    def forward(self, inputs: Dict[str, Tensor]) -> Tensor:
        """Forward pass of the model

        Args:
            inputs (Dict[str, Tensor]): input tensors

        Returns:
            Tensor: output tensor
        """
        e1: Tensor = inputs["e1"]  # [batch_size, max_len_e1]
        e2: Tensor = inputs["e2"]  # [batch_size, max_len_e2]
        sent: Tensor = inputs["sent"]  # [batch_size, max_len_seq]
        rd1: Tensor = inputs["rd1"]  # [batch_size, max_len_seq]
        rd2: Tensor = inputs["rd2"]  # [batch_size, max_len_seq]
        pos: Tensor = inputs["pos"]  # [batch_size, max_len_seq]
        dep: Tensor = inputs["dep"]  # [batch_size, max_len_seq]
        iob: Tensor = inputs["iob"]  # [batch_size, max_len_seq]
        seq_length: Tensor = inputs["seq_length"]  # [batch_size]

        assert len(e1.shape) == 2
        assert len(e2.shape) == 2
        assert len(sent.shape) == 2
        assert len(rd1.shape) == 2
        assert len(rd2.shape) == 2
        assert len(pos.shape) == 2
        assert len(dep.shape) == 2
        assert len(iob.shape) == 2
        assert len(seq_length.shape) == 1

        # embedded inputs
        e1_emb = mean(self.wv_embedding(e1), axis=1)  # [batch_size, wv_emb_dim]
        e2_emb = mean(self.wv_embedding(e2), axis=1)  # [batch_size, wv_emb_dim]
        sent_emb = self.wv_embedding(sent)  # [batch_size, seq_length, wv_emb_dim]
        rd1_emb = self.rd_embedding(rd1)  # [batch_size, seq_length, rd_emb_dim]
        rd2_emb = self.rd_embedding(rd2)  # [batch_size, seq_length, rd_emb_dim]
        pos_emb = self.pos_embedding(pos)  # [batch_size, seq_length, pos_emb_dim]
        dep_emb = self.dep_embedding(dep)  # [batch_size, seq_length, dep_emb_dim]
        iob_emb = self.iob_embedding(iob)  # [batch_size, seq_length, iob_emb_dim]

        # encode
        inputs_emb = concat((sent_emb, rd1_emb, rd2_emb, pos_emb, dep_emb, iob_emb), axis=2)
        outputs_emb, hidden_concat = self.lstm(inputs_emb, seq_length)
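        # final representation: mean-pooled entity embeddings concatenated with the
        # encoder's last hidden state (hidden_concat is assumed to have shape
        # [batch_size, num_directions * hidden_size], matching `clf_input_dim`)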
        outputs = concat((e1_emb, e2_emb, hidden_concat), axis=1)

        # classify
        outputs = self.fc(outputs)
        return outputs
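

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): how a forward pass is wired,
# assuming an already-constructed `model: HasanModel` and a data pipeline that
# produces padded index tensors. Only the dict keys and shapes documented in
# `forward` are taken from this file; the tensor names below are hypothetical.
#
#     batch = {
#         "e1": e1_ids,                  # [batch_size, max_len_e1]
#         "e2": e2_ids,                  # [batch_size, max_len_e2]
#         "sent": sent_ids,              # [batch_size, max_len_seq]
#         "rd1": rd1_ids,                # [batch_size, max_len_seq]
#         "rd2": rd2_ids,                # [batch_size, max_len_seq]
#         "pos": pos_ids,                # [batch_size, max_len_seq]
#         "dep": dep_ids,                # [batch_size, max_len_seq]
#         "iob": iob_ids,                # [batch_size, max_len_seq]
#         "seq_length": lengths,         # [batch_size]
#     }
#     scores = model(batch)              # [batch_size, num_classes]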