src/utils.py
import numpy as np
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score, classification_report
from torch.utils.data import SubsetRandomSampler, DataLoader
import re
import nltk
import string


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# stopwords + punctuation
stop_words = set(stopwords.words('english')).union(set(string.punctuation))


################## preprocessing text #########################
def preprocess(text):
  """Tokenize text and drop stopwords/punctuation; return the filtered text."""
  words = word_tokenize(text)
  filtered_sentence = []
  # remove stopwords and punctuation
  for word in words:
    if word not in stop_words:
      filtered_sentence.append(word)
  text = ' '.join(filtered_sentence)
  # lemmatize (currently disabled)
  # wordnet_lemmatizer = WordNetLemmatizer()
  # lemma_word = []
  # for w in filtered_sentence:
  #   word1 = wordnet_lemmatizer.lemmatize(w, pos="n")
  #   word2 = wordnet_lemmatizer.lemmatize(word1, pos="v")
  #   word3 = wordnet_lemmatizer.lemmatize(word2, pos="a")
  #   lemma_word.append(word3)
  return text

#######################################################################

###################### Calculation of Metrics #########################
def calculate_metrics(pred, target, threshold=0.5):
  # binarize predicted probabilities at the given threshold
  pred = np.array(pred > threshold, dtype="float32")

  return {'micro/precision': precision_score(y_true=target, y_pred=pred, average='micro'),
          'micro/recall': recall_score(y_true=target, y_pred=pred, average='micro'),
          'micro/f1': f1_score(y_true=target, y_pred=pred, average='micro'),
          'hammingloss': hamming_loss(target, pred)
          }
#########################################################################

################### Label One-hot Encodings ########################
def labeltarget(x, frequent_list):
  # multi-hot vector: 1 for every code in frequent_list that appears in x
  target = np.zeros(len(frequent_list), dtype="float32")
  for index, code in enumerate(frequent_list):
    if code in x:
      target[index] = 1
  return target
#####################################################################

#######################################################################################
def split_indices(dataset, validation_split, shuffle_dataset=True, random_seed=2021):
  # return (train_indices, val_indices) for a random train/validation split
  dataset_size = len(dataset)
  indices = list(range(dataset_size))
  split = int(np.floor(validation_split * dataset_size))
  if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
  return indices[split:], indices[:split]
#########################################################################################

#########################################################################################
def dataloader(train_dataset, test_dataset, batch_size, val_split):
  # build train/val loaders over disjoint subsets of train_dataset, plus a test loader
  train_indices, val_indices = split_indices(train_dataset, val_split)
  train_sampler = SubsetRandomSampler(train_indices)
  val_sampler = SubsetRandomSampler(val_indices)
  train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
  val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler)
  test_loader = DataLoader(test_dataset, batch_size=batch_size)
  return train_loader, val_loader, test_loader
#########################################################################################

#########################################################################################
def train_metric(y_pred, y_test, threshold=0.5):
  num_classes = y_pred.shape[1]
  # binarize predictions at the given threshold
  y_pred_tags = (y_pred > threshold).float()

  # subset (exact-match) accuracy: a sample counts only if all its labels are correct
  correct_pred = (y_pred_tags == y_test).float()
  accuracy = (correct_pred.sum(dim=1) == num_classes).float().sum() / len(correct_pred)

  hammingloss = hamming_loss(y_test.cpu().numpy(), y_pred_tags.cpu().numpy())

  f1score = f1_score(y_true=y_test.cpu().numpy(), y_pred=y_pred_tags.cpu().numpy(), average='micro')
  return accuracy, hammingloss, f1score

#################################################################################################
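
################### Usage Sketch (illustrative) ########################
# Illustrative sketch of how these helpers fit together: the label list and the
# prediction/target arrays below are hypothetical stand-ins, not values from the
# project; real datasets and model outputs are assumed to be supplied elsewhere.
if __name__ == "__main__":
  frequent_codes = ['codeA', 'codeB', 'codeC']            # hypothetical label set
  print(preprocess("This is a short example sentence, with stopwords."))
  print(labeltarget(['codeB'], frequent_codes))           # -> [0. 1. 0.]
  preds = np.array([[0.9, 0.2, 0.7], [0.1, 0.8, 0.3]], dtype="float32")
  targets = np.array([[1, 0, 1], [0, 1, 1]], dtype="float32")
  print(calculate_metrics(preds, targets, threshold=0.5))
#########################################################################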