|
a |
|
b/src/utils.py |
|
|
1 |
import numpy as np |
|
|
2 |
from sklearn.metrics import accuracy_score,hamming_loss,precision_score,recall_score,f1_score,classification_report |
|
|
3 |
from torch.utils.data import SubsetRandomSampler, DataLoader |
|
|
4 |
import re |
|
|
5 |
import nltk |
|
|
6 |
import string |
|
|
7 |
|
|
|
8 |
|
|
|
9 |
# Download the NLTK resources at import time, in a fixed order.
# Presumably these back word_tokenize, stopwords and WordNetLemmatizer,
# which are imported just below -- confirm before removing any of them.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)
|
|
12 |
from nltk import sent_tokenize, word_tokenize |
|
|
13 |
from nltk.stem import WordNetLemmatizer |
|
|
14 |
from nltk.corpus import stopwords |
|
|
15 |
|
|
|
16 |
|
|
|
17 |
|
|
|
18 |
# Tokens removed by preprocess(): NLTK's English stopword list plus every
# character of string.punctuation.
stop_words = set(stopwords.words('english')) | set(string.punctuation)
|
|
20 |
|
|
|
21 |
|
|
|
22 |
|
|
|
23 |
|
|
|
24 |
################## preprocessing text ######################### |
|
|
25 |
def preprocess(text):
    """Tokenize ``text`` and drop every token found in ``stop_words``.

    The text is split with ``word_tokenize``; tokens that are members of the
    module-level ``stop_words`` set (English stopwords plus punctuation) are
    discarded. Membership is an exact match, so it is case-sensitive.
    No lemmatization is applied.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined back together with single spaces.
    """
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(kept_tokens)
|
|
43 |
|
|
|
44 |
####################################################################### |
|
|
45 |
|
|
|
46 |
|
|
|
47 |
###################### Calculation of Metrics ######################### |
|
|
48 |
def calculate_metrics(pred, target, threshold=0.5):
    """Binarize ``pred`` at ``threshold`` and score it against ``target``.

    Args:
        pred: Predicted scores. Must support ``pred > threshold``
            (presumably a NumPy array of per-label scores -- confirm
            against callers).
        target: Ground-truth labels, handed to the scikit-learn metrics
            as ``y_true``.
        threshold: Scores strictly greater than this value count as
            positive predictions.

    Returns:
        A dict with the keys ``'micro/precision'``, ``'micro/recall'``,
        ``'micro/f1'`` (all micro-averaged) and ``'hammingloss'``.
    """
    binarized = np.array(pred > threshold, dtype="float32")

    # All three micro-averaged scores share the same keyword arguments.
    micro_kwargs = {'y_true': target, 'y_pred': binarized, 'average': 'micro'}
    precision = precision_score(**micro_kwargs)
    recall = recall_score(**micro_kwargs)
    f1 = f1_score(**micro_kwargs)
    hamming = hamming_loss(target, binarized)

    return {
        'micro/precision': precision,
        'micro/recall': recall,
        'micro/f1': f1,
        'hammingloss': hamming,
    }
|
|
56 |
######################################################################### |
|
|
57 |
|
|
|
58 |
################### Label One-hot Encodings ######################## |
|
|
59 |
def labeltarget(x, frequent_list, num_labels=10):
    """Build a multi-hot float32 target vector for one sample.

    Position ``i`` of the result is set to 1 when ``frequent_list[i]`` is
    contained in ``x`` (tested with ``code in x``); every other position
    stays 0.

    Args:
        x: Container holding the codes of one sample. Membership uses
            ``in``, so a plain string would match substrings -- presumably
            this is a list or set of codes; confirm against callers.
        frequent_list: Ordered codes; each code's index is its slot in the
            returned vector.
        num_labels: Length of the returned vector. Defaults to 10, the size
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        A ``numpy.ndarray`` of shape ``(num_labels,)`` and dtype float32.

    Raises:
        IndexError: If a code that is present in ``x`` sits at an index
            greater than or equal to ``num_labels`` in ``frequent_list``.
    """
    target = np.zeros(num_labels, dtype="float32")
    for index, code in enumerate(frequent_list):
        if code in x:
            target[index] = 1
    return target
|
|
65 |
##################################################################### |
|
|
66 |
|
|
|
67 |
####################################################################################### |
|
|
68 |
def split_indices(dataset, validation_split, shuffle_dataset = True, random_seed = 2021):
    """Split the index range of ``dataset`` into train and validation parts.

    Args:
        dataset: Any object supporting ``len()``.
        validation_split: Fraction of the indices assigned to validation;
            the count is ``floor(validation_split * len(dataset))``.
        shuffle_dataset: When true, the indices are shuffled before being
            split.
        random_seed: Seed passed to ``np.random.seed`` before shuffling.
            Note that this reseeds NumPy's global random state.

    Returns:
        A ``(train_indices, val_indices)`` tuple of lists. The validation
        indices are the leading ``floor(validation_split * len(dataset))``
        entries of the (possibly shuffled) order; the rest are training.
    """
    n_samples = len(dataset)
    order = [*range(n_samples)]
    n_val = int(np.floor(validation_split * n_samples))

    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(order)

    val_part = order[:n_val]
    train_part = order[n_val:]
    return train_part, val_part
|
|
76 |
######################################################################################### |
|
|
77 |
|
|
|
78 |
######################################################################################### |
|
|
79 |
def dataloader(train_dataset, test_dataset, batch_size, val_split):
    """Create train, validation and test ``DataLoader`` objects.

    ``train_dataset`` is divided into train and validation index subsets by
    ``split_indices``; each subset is wrapped in a ``SubsetRandomSampler``
    over ``train_dataset``. The test loader reads ``test_dataset`` with no
    sampler.

    Args:
        train_dataset: Dataset backing both the train and validation loaders.
        test_dataset: Dataset backing the test loader.
        batch_size: Batch size used by all three loaders.
        val_split: Fraction of ``train_dataset`` forwarded to
            ``split_indices`` as the validation share.

    Returns:
        A ``(train_loader, val_loader, test_loader)`` tuple.
    """
    train_idx, val_idx = split_indices(train_dataset, val_split)

    def _subset_loader(subset_indices):
        # Both subset loaders read from train_dataset; only the sampled
        # indices differ.
        sampler = SubsetRandomSampler(subset_indices)
        return DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)

    train_loader = _subset_loader(train_idx)
    val_loader = _subset_loader(val_idx)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, val_loader, test_loader
|
|
87 |
|
|
|
88 |
|
|
|
89 |
######################################################################################### |
|
|
90 |
|
|
|
91 |
######################################################################################### |
|
|
92 |
def train_metric(y_pred, y_test, threshold=0.5):
    """Compute exact-match accuracy, Hamming loss and micro-F1 for a batch.

    Args:
        y_pred: Predicted scores as a tensor with at least two dimensions;
            ``y_pred.shape[1]`` is taken as the number of classes.
        y_test: Ground-truth tensor compared element-wise with the
            binarized predictions.
        threshold: Scores strictly greater than this value are treated as
            positive. This was previously ignored in favour of a hard-coded
            0.5; the default keeps the old behaviour.

    Returns:
        A ``(accuracy, hammingloss, f1score)`` tuple. ``accuracy`` is a
        tensor holding the fraction of rows in which every class matches;
        ``hammingloss`` and ``f1score`` are the values returned by
        scikit-learn's ``hamming_loss`` and micro-averaged ``f1_score``.
    """
    num_classes = y_pred.shape[1]
    y_pred_tags = (y_pred > threshold).float()

    # A row counts as correct only when all of its class positions match.
    correct_pred = (y_pred_tags == y_test).float()
    accuracy = (correct_pred.sum(dim=1) == num_classes).float().sum() / len(correct_pred)

    # Convert once and reuse for both scikit-learn metrics.
    y_true_np = y_test.cpu().numpy()
    y_pred_np = y_pred_tags.cpu().numpy()

    hammingloss = hamming_loss(y_true_np, y_pred_np)
    f1score = f1_score(y_true=y_true_np, y_pred=y_pred_np, average='micro')
    return accuracy, hammingloss, f1score
|
|
103 |
|
|
|
104 |
################################################################################################# |