[71ad2f]: src/rnn/rnn_utils.py

import numpy as np
from collections import Counter
import pandas as pd
import re
import nltk
import string
import torch
import torch.nn as nn
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')).union(set(string.punctuation))
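
# Note (not in the original file): word_tokenize, WordNetLemmatizer and the
# stopwords list rely on NLTK corpora that are assumed to have been downloaded
# once, e.g. nltk.download('punkt'), nltk.download('stopwords') and
# nltk.download('wordnet'); otherwise the calls below raise LookupError.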


def preprocessing_rnn(text):
    words = word_tokenize(text)
    filtered_sentence = []
    # remove stopwords and punctuation
    for word in words:
        if word not in stop_words:
            filtered_sentence.append(word)
    # lemmatize each remaining token as a noun, then a verb, then an adjective
    lemma_word = []
    wordnet_lemmatizer = WordNetLemmatizer()
    for w in filtered_sentence:
        word1 = wordnet_lemmatizer.lemmatize(w, pos="n")
        word2 = wordnet_lemmatizer.lemmatize(word1, pos="v")
        word3 = wordnet_lemmatizer.lemmatize(word2, pos="a")
        lemma_word.append(word3)
    return lemma_word
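
# Illustrative usage (a sketch, not from the original file; exact tokens depend
# on the installed NLTK/WordNet data). Stopwords and punctuation are dropped
# and each remaining token is lemmatized, e.g.:
#   >>> preprocessing_rnn("Patient was admitted with acute renal failure")
#   ['Patient', 'admit', 'acute', 'renal', 'failure']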


def count_vocab_index(train_df, test_df):
    df = pd.concat([train_df, test_df]).sample(frac=1).reset_index(drop=True)
    counts = Counter()
    for _, row in df.iterrows():
        counts.update(preprocessing_rnn(row['discharge_diagnosis']))
    # remove words that occur fewer than 2 times
    for word in list(counts):
        if counts[word] < 2:
            del counts[word]
    # index 0 is reserved for padding, index 1 for unknown words
    vocab2index = {"": 0, "UNKNOWN": 1}
    words = ["", "UNKNOWN"]
    for word in counts:
        vocab2index[word] = len(words)
        words.append(word)
    return counts, vocab2index
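
# Illustrative usage (a sketch, assuming train_df and test_df are pandas
# DataFrames with a 'discharge_diagnosis' text column, as the loop above expects):
#   >>> counts, vocab2index = count_vocab_index(train_df, test_df)
#   >>> len(vocab2index) == len(counts) + 2   # "" (padding) and "UNKNOWN"
#   True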


def encode_sentence(text, vocab2index, N=50):
    tokenized = preprocessing_rnn(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNKNOWN"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded
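
# Illustrative usage (a sketch): sentences shorter than N are right-padded with
# index 0 (the padding token), longer ones are truncated to N, and tokens
# missing from vocab2index map to index 1 ("UNKNOWN"):
#   >>> encode_sentence("acute renal failure", vocab2index, N=5)
#   array([12, 47,  9,  0,  0])   # example indices only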


def get_emb_matrix(w2vmodel, word_counts):
    """Creates embedding matrix from word vectors."""
    vocab_size = len(word_counts) + 2
    emb_size = w2vmodel.vector_size
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32')  # vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)  # vector for unknown words
    i = 2
    for word in word_counts:
        if word in w2vmodel.wv:
            W[i] = w2vmodel.wv[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        i += 1
    return W
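
# Illustrative usage (a sketch, assuming w2vmodel is a trained gensim Word2Vec
# model, which is what the .wv and .vector_size accesses above imply, and that
# tokenized_corpus is a hypothetical list of token lists):
#   >>> from gensim.models import Word2Vec
#   >>> w2v = Word2Vec(sentences=tokenized_corpus, vector_size=100, min_count=1)
#   >>> W = get_emb_matrix(w2v, counts)
#   >>> W.shape   # (len(counts) + 2, 100)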


def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
    emb_layer.load_state_dict({'weight': torch.from_numpy(weights_matrix)})
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim
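

# Minimal end-to-end sketch (not part of the original module): load a random
# embedding matrix into the layer and embed one padded sequence of indices.
if __name__ == "__main__":
    demo_weights = np.random.uniform(-0.25, 0.25, (10, 8)).astype("float32")  # illustrative sizes
    emb_layer, num_embeddings, embedding_dim = create_emb_layer(demo_weights, non_trainable=True)
    demo_indices = torch.tensor([[3, 7, 1, 0, 0]])  # batch of one padded sentence
    print(emb_layer(demo_indices).shape)  # torch.Size([1, 5, 8])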