import numpy as np
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score, classification_report
from torch.utils.data import SubsetRandomSampler, DataLoader
import re
import nltk
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# stopwords + punctuation
stop_words = set(stopwords.words('english')).union(set(string.punctuation))
################## preprocessing text #########################
def preprocess(text):
    """Tokenize `text`, then drop English stopwords and punctuation."""
    words = word_tokenize(text)
    # remove stopwords and punctuation
    filtered_sentence = [word for word in words if word not in stop_words]
    text = ' '.join(filtered_sentence)
    # lemmatization (currently disabled):
    # wordnet_lemmatizer = WordNetLemmatizer()
    # lemma_word = []
    # for w in filtered_sentence:
    #     word1 = wordnet_lemmatizer.lemmatize(w, pos="n")
    #     word2 = wordnet_lemmatizer.lemmatize(word1, pos="v")
    #     word3 = wordnet_lemmatizer.lemmatize(word2, pos="a")
    #     lemma_word.append(word3)
    return text
#######################################################################
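# Quick illustration of preprocess(); a minimal sketch, and the sample
# sentence below is invented for demonstration, not from the original project.
if __name__ == "__main__":
    sample = "the cats are sitting on the mat ."
    print(preprocess(sample))  # -> "cats sitting mat"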
###################### Calculation of Metrics #########################
def calculate_metrics(pred, target, threshold=0.5):
    """Binarize probabilities at `threshold`, then compute micro-averaged metrics."""
    pred = np.array(pred > threshold, dtype="float32")
    return {'micro/precision': precision_score(y_true=target, y_pred=pred, average='micro'),
            'micro/recall': recall_score(y_true=target, y_pred=pred, average='micro'),
            'micro/f1': f1_score(y_true=target, y_pred=pred, average='micro'),
            'hammingloss': hamming_loss(y_true=target, y_pred=pred)}
#########################################################################
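# A minimal sketch of calculate_metrics() on toy multi-label data; the arrays
# below are fabricated for illustration. Shapes are (n_samples, n_labels).
if __name__ == "__main__":
    toy_pred = np.array([[0.9, 0.2, 0.7],
                         [0.1, 0.8, 0.4]])
    toy_target = np.array([[1, 0, 1],
                           [0, 1, 1]], dtype="float32")
    print(calculate_metrics(toy_pred, toy_target))
    # micro/precision = 1.0, micro/recall = 0.75, hammingloss ~ 0.167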
################### Label One-hot Encodings ########################
def labeltarget(x, frequent_list):
    """One-hot encode the codes in `x` against the frequent-label list.
    The vector length follows len(frequent_list) (was hard-coded to 10)."""
    target = np.zeros(len(frequent_list), dtype="float32")
    for index, code in enumerate(frequent_list):
        if code in x:
            target[index] = 1
    return target
#####################################################################
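# Example call to labeltarget(); the code strings here are hypothetical
# placeholders for the project's real frequent-label list.
if __name__ == "__main__":
    frequent = ["401.9", "38.93", "428.0"]  # hypothetical frequent codes
    print(labeltarget(["428.0", "401.9"], frequent))  # -> [1. 0. 1.]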
#######################################################################################
def split_indices(dataset, validation_split, shuffle_dataset=True, random_seed=2021):
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    return indices[split:], indices[:split]
#########################################################################################
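# Sanity check for split_indices(): with a 10-element dataset and a 0.2
# split, we expect 8 training and 2 validation indices. The list stands in
# for any object with a len().
if __name__ == "__main__":
    tr_idx, va_idx = split_indices(list(range(10)), validation_split=0.2)
    print(len(tr_idx), len(va_idx))  # -> 8 2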
#########################################################################################
def dataloader(train_dataset, test_dataset, batch_size, val_split):
    """Build train/val/test DataLoaders. Train and val draw from the same
    dataset but use disjoint index sets via SubsetRandomSampler."""
    train_indices, val_indices = split_indices(train_dataset, val_split)
    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
    val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, val_loader, test_loader
#########################################################################################
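# Illustrative wiring of dataloader() with random tensors; TensorDataset is
# used here only as a stand-in for the project's real datasets, and all
# shapes are invented.
if __name__ == "__main__":
    import torch
    from torch.utils.data import TensorDataset
    fake_train = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100, 10)).float())
    fake_test = TensorDataset(torch.randn(20, 8), torch.randint(0, 2, (20, 10)).float())
    tr_loader, va_loader, te_loader = dataloader(fake_train, fake_test, batch_size=16, val_split=0.1)
    xb, yb = next(iter(tr_loader))
    print(xb.shape, yb.shape)  # -> torch.Size([16, 8]) torch.Size([16, 10])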
#########################################################################################
def train_metric(y_pred, y_test, threshold=0.5):
    """Subset accuracy (all labels correct), Hamming loss, and micro-F1
    for a batch of multi-label predictions."""
    num_classes = y_pred.shape[1]
    y_pred_tags = (y_pred > threshold).float()  # was hard-coded to 0.5; now uses the threshold argument
    correct_pred = (y_pred_tags == y_test).float()
    # a sample counts as correct only if every one of its labels matches (exact match)
    accuracy = (correct_pred.sum(dim=1) == num_classes).float().sum() / len(correct_pred)
    hammingloss = hamming_loss(y_test.cpu().numpy(), y_pred_tags.cpu().numpy())
    f1score = f1_score(y_true=y_test.cpu().numpy(), y_pred=y_pred_tags.cpu().numpy(), average='micro')
    return accuracy, hammingloss, f1score
#################################################################################################
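# Toy call to train_metric() with random logits pushed through a sigmoid;
# every tensor here is fabricated for illustration.
if __name__ == "__main__":
    import torch
    probs = torch.sigmoid(torch.randn(4, 10))
    targets = torch.randint(0, 2, (4, 10)).float()
    acc, hl, f1 = train_metric(probs, targets)
    print(f"subset accuracy={float(acc):.3f}  hamming={hl:.3f}  micro-F1={f1:.3f}")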