# src/rnn/rnn_utils.py

import string
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Requires the NLTK data packages 'punkt', 'wordnet', and 'stopwords'
# (e.g. nltk.download('punkt')) to be available.
stop_words = set(stopwords.words('english')).union(set(string.punctuation))


def preprocessing_rnn(text):
    """Tokenize text, drop stopwords/punctuation, and lemmatize."""
    words = word_tokenize(text)

    # Remove stopwords and punctuation. Note: tokens are not lowercased,
    # so capitalized stopwords such as "The" pass through unchanged.
    filtered_sentence = [word for word in words if word not in stop_words]

    # Lemmatize each surviving token, chaining noun -> verb -> adjective
    # passes so that plural nouns and inflected verbs are both reduced.
    wordnet_lemmatizer = WordNetLemmatizer()
    lemma_words = []
    for w in filtered_sentence:
        lemma = wordnet_lemmatizer.lemmatize(w, pos="n")
        lemma = wordnet_lemmatizer.lemmatize(lemma, pos="v")
        lemma = wordnet_lemmatizer.lemmatize(lemma, pos="a")
        lemma_words.append(lemma)
    return lemma_words
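
# Example (illustrative; exact output depends on the installed NLTK data):
#   preprocessing_rnn("the patient was admitted with acute chest pains")
#   -> ['patient', 'admit', 'acute', 'chest', 'pain']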


def count_vocab_index(train_df, test_df):
    """Build token counts and a word-to-index mapping over both splits.

    Expects dataframes with a 'discharge_diagnosis' text column.
    """
    # The shuffle (sample(frac=1)) does not affect the counts; it only
    # randomizes the order in which rows are visited.
    df = pd.concat([train_df, test_df]).sample(frac=1).reset_index(drop=True)
    counts = Counter()
    for _, row in df.iterrows():
        counts.update(preprocessing_rnn(row['discharge_diagnosis']))

    # Drop words that occur fewer than 2 times.
    for word in list(counts):
        if counts[word] < 2:
            del counts[word]

    # Reserve index 0 for padding and index 1 for unknown words.
    vocab2index = {"": 0, "UNKNOWN": 1}
    words = ["", "UNKNOWN"]
    for word in counts:
        vocab2index[word] = len(words)
        words.append(word)

    return counts, vocab2index
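
# Example (hypothetical dataframes, each holding a 'discharge_diagnosis'
# text column):
#   counts, vocab2index = count_vocab_index(train_df, test_df)
#   # counts: Counter of lemmatized tokens seen at least twice
#   # vocab2index: token -> index, with 0 = padding and 1 = "UNKNOWN"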


def encode_sentence(text, vocab2index, N=50):
    """Encode text as a fixed-length vector of N vocabulary indices.

    Longer sequences are truncated; shorter ones are zero-padded
    (index 0 is the padding token, index 1 the unknown token).
    """
    tokenized = preprocessing_rnn(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNKNOWN"])
                     for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded
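
# Example (illustrative indices): with vocab2index as built above,
#   encode_sentence("acute chest pain", vocab2index, N=5)
# returns something like array([12, 7, 31, 0, 0]): known tokens map to
# their indices, unseen tokens to 1 ("UNKNOWN"), and the tail is padded.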


def get_emb_matrix(w2vmodel, word_counts):
    """Create an embedding matrix from pretrained word vectors."""
    # Rows 0 and 1 mirror the padding/UNKNOWN entries in vocab2index.
    vocab_size = len(word_counts) + 2
    emb_size = w2vmodel.vector_size

    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype="float32")  # padding vector
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)  # unknown-word vector

    # Remaining rows follow the same insertion order as vocab2index;
    # words missing from the w2v vocabulary get random vectors.
    for i, word in enumerate(word_counts, start=2):
        if word in w2vmodel.wv:
            W[i] = w2vmodel.wv[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
    return W
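
# Example (assumes `w2v` is a trained gensim Word2Vec model; the .wv and
# .vector_size attributes used above match that API):
#   W = get_emb_matrix(w2v, counts)
#   W.shape  -> (len(counts) + 2, w2v.vector_size)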


def create_emb_layer(weights_matrix, non_trainable=False):
    """Build an nn.Embedding layer initialized from weights_matrix."""
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
    emb_layer.load_state_dict({'weight': torch.from_numpy(weights_matrix)})
    if non_trainable:
        # Freeze the pretrained vectors.
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim
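

if __name__ == "__main__":
    # Minimal smoke test on toy data (an illustrative sketch, not the
    # project's real pipeline; the random weights matrix stands in for
    # the output of get_emb_matrix, which needs a trained w2v model).
    toy = pd.DataFrame({"discharge_diagnosis": [
        "acute chest pain", "chest pain", "pneumonia", "pneumonia"]})
    counts, vocab2index = count_vocab_index(toy, toy)

    W = np.random.uniform(-0.25, 0.25, (len(counts) + 2, 8)).astype("float32")
    W[0] = 0.0  # padding row
    emb, num_emb, emb_dim = create_emb_layer(W, non_trainable=True)

    x = torch.from_numpy(encode_sentence("acute chest pain", vocab2index, N=10))
    print(emb(x.unsqueeze(0)).shape)  # expected: torch.Size([1, 10, 8])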