[71ad2f]: / src / utils.py

Download this file

104 lines (80 with data), 4.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
import numpy as np
from sklearn.metrics import accuracy_score,hamming_loss,precision_score,recall_score,f1_score,classification_report
from torch.utils.data import SubsetRandomSampler, DataLoader
import re
import nltk
import string
# Fetch the NLTK corpora required at import time (no-op when already cached):
# 'punkt' backs word_tokenize, 'stopwords' backs the filter set built below,
# and 'wordnet' backs the WordNetLemmatizer import (lemmatization itself is
# currently disabled in preprocess()).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Tokens dropped during preprocessing: English stopwords plus every
# single-character ASCII punctuation mark.
stop_words = set(stopwords.words('english')).union(set(string.punctuation))
################## preprocessing text #########################
def preprocess(text):
    """Strip stopwords and punctuation tokens from raw text.

    Parameters
    ----------
    text : str
        Raw input string; tokenized with NLTK's ``word_tokenize``.

    Returns
    -------
    str
        The surviving tokens re-joined with single spaces.
    """
    # word_tokenize emits words AND punctuation as separate tokens, so a
    # single membership test against the combined stopword/punctuation set
    # filters both at once. (A previously commented-out WordNet
    # lemmatization pass was removed as dead code.)
    kept = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(kept)
#######################################################################
###################### Calculation of Metrics #########################
def calculate_metrics(pred, target, threshold=0.5):
    """Binarize raw predictions and compute micro-averaged multi-label metrics.

    Parameters
    ----------
    pred : numpy.ndarray
        Predicted scores/probabilities, same shape as ``target``.
    target : array-like
        Binary ground-truth label matrix.
    threshold : float, optional
        Scores strictly above this value become positive labels.

    Returns
    -------
    dict
        Micro precision, recall, F1, and the Hamming loss.
    """
    binarized = np.array(pred > threshold, dtype="float32")
    metrics = {
        'micro/precision': precision_score(y_true=target, y_pred=binarized, average='micro'),
        'micro/recall': recall_score(y_true=target, y_pred=binarized, average='micro'),
        'micro/f1': f1_score(y_true=target, y_pred=binarized, average='micro'),
        'hammingloss': hamming_loss(target, binarized),
    }
    return metrics
#########################################################################
################### Label One-hot Encodings ########################
def labeltarget(x, frequent_list):
    """One-hot encode the codes present in *x* against ``frequent_list``.

    Parameters
    ----------
    x : container
        Collection of label codes assigned to one sample; membership is
        tested with ``in``.
    frequent_list : sequence
        Ordered list of the tracked label codes; position i in the output
        corresponds to ``frequent_list[i]``.

    Returns
    -------
    numpy.ndarray
        float32 indicator vector of length ``len(frequent_list)``.
    """
    # Generalized: the vector length follows the label list instead of the
    # original hard-coded 10, so any number of frequent codes works
    # (identical output for the existing 10-code list).
    target = np.zeros(len(frequent_list), dtype="float32")
    for index, code in enumerate(frequent_list):
        if code in x:
            target[index] = 1
    return target
#####################################################################
#######################################################################################
def split_indices(dataset, validation_split, shuffle_dataset=True, random_seed=2021):
    """Partition dataset indices into (train, validation) index lists.

    Parameters
    ----------
    dataset : sized
        Anything supporting ``len()``.
    validation_split : float
        Fraction of indices routed to the validation list.
    shuffle_dataset : bool, optional
        Shuffle before splitting (seeded for reproducibility).
    random_seed : int, optional
        Seed applied to numpy's global RNG when shuffling.

    Returns
    -------
    tuple[list[int], list[int]]
        ``(train_indices, val_indices)``.
    """
    total = len(dataset)
    indices = list(range(total))
    cut = int(np.floor(validation_split * total))
    if shuffle_dataset:
        # NOTE: seeds numpy's *global* RNG — every call reshuffles identically.
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_part, val_part = indices[cut:], indices[:cut]
    return train_part, val_part
#########################################################################################
#########################################################################################
def dataloader(train_dataset, test_dataset, batch_size, val_split):
    """Construct train/validation/test DataLoaders.

    The training dataset is split into train and validation subsets via
    ``split_indices``; both subsets are served from ``train_dataset`` through
    ``SubsetRandomSampler``s, while the test set is loaded sequentially.

    Parameters
    ----------
    train_dataset, test_dataset : torch Dataset
        Datasets to wrap.
    batch_size : int
        Batch size for all three loaders.
    val_split : float
        Fraction of the training data held out for validation.

    Returns
    -------
    tuple
        ``(train_loader, val_loader, test_loader)``.
    """
    train_idx, val_idx = split_indices(train_dataset, val_split)
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_idx)
    )
    val_loader = DataLoader(
        train_dataset, batch_size=batch_size, sampler=SubsetRandomSampler(val_idx)
    )
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, val_loader, test_loader
#########################################################################################
#########################################################################################
def train_metric(y_pred, y_test, threshold=0.5):
    """Compute subset accuracy, Hamming loss, and micro-F1 for one batch.

    Parameters
    ----------
    y_pred : torch.Tensor
        ``(batch, num_classes)`` predicted scores/probabilities.
    y_test : torch.Tensor
        ``(batch, num_classes)`` binary ground-truth labels.
    threshold : float, optional
        Decision threshold applied to ``y_pred``.

    Returns
    -------
    tuple
        ``(accuracy, hammingloss, f1score)`` where accuracy is a torch
        scalar and the other two are Python floats from sklearn.
    """
    num_classes = y_pred.shape[1]
    # BUG FIX: the original hard-coded 0.5 here, silently ignoring the
    # `threshold` parameter.
    y_pred_tags = (y_pred > threshold).float()
    correct_pred = (y_pred_tags == y_test).float()
    # Subset (exact-match) accuracy: a sample counts only when every one of
    # its labels is predicted correctly.
    accuracy = (correct_pred.sum(dim=1) == num_classes).float().sum() / len(correct_pred)
    # Move to CPU/numpy once and reuse for both sklearn metrics.
    y_true_np = y_test.cpu().numpy()
    y_pred_np = y_pred_tags.cpu().numpy()
    hammingloss = hamming_loss(y_true_np, y_pred_np)
    f1score = f1_score(y_true=y_true_np, y_pred=y_pred_np, average='micro')
    return accuracy, hammingloss, f1score
#################################################################################################