[d129b2]: / medicalbert / classifiers / standard / classifier.py

Download this file

132 lines (104 with data), 5.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import gcsfs,logging, os, torch
import pandas as pd
from statistics import mean
from tqdm import trange, tqdm
###
# Base class for Bert classifiers.
###
class Classifier:
    """Base class for BERT classifiers: training loop plus checkpoint I/O.

    The class defines no ``__init__``. Its methods read these attributes,
    which must be set before use (presumably by a subclass -- confirm
    against the concrete classifiers):

    * ``self.config`` -- mapping. Keys read here: ``device``, ``epochs``,
      ``gradient_accumulation_steps``, ``output_dir``, ``experiment_name``
      and, optionally, ``load_from_checkpoint``.
    * ``self.model`` -- called as ``model(input_ids, labels=label_ids)``;
      element 0 of the result is used as the loss.
    * ``self.optimizer`` -- stepped and zeroed by ``train``.
    * ``self.epochs`` -- int; index of the next epoch to run.

    Artifacts are written under ``<output_dir>/<experiment_name>/``. Paths
    that start with ``gs://`` are handled through ``gcsfs``.
    """

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _is_gcs_path(path):
        """Return True when *path* is a Google Cloud Storage URL.

        The scheme is matched in full so that local directories whose name
        merely begins with "gs" (for example ``gs_runs/``) stay local.
        """
        return str(path).startswith("gs://")

    def _experiment_dir(self):
        """Return ``<output_dir>/<experiment_name>`` for this run."""
        return os.path.join(self.config['output_dir'],
                            self.config['experiment_name'])

    def _checkpoint_dir(self):
        """Return the directory that holds this run's checkpoints."""
        return os.path.join(self._experiment_dir(), "checkpoints")

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def train(self, datareader):
        """Train ``self.model`` from epoch ``self.epochs`` up to ``config['epochs']``.

        ``datareader.get_train()`` must yield batches of four tensors
        ``(input_ids, input_mask, segment_ids, label_ids)``. A checkpoint is
        saved after every epoch, and the recorded losses are written to
        ``batch_loss.csv`` once all epochs have finished.
        """
        device = torch.device(self.config['device'])
        accumulation_steps = self.config['gradient_accumulation_steps']

        self.model.train()
        self.model.to(device)

        batch_losses = []
        for _ in trange(self.epochs, int(self.config['epochs']), desc="Epoch"):
            # Unscaled loss of every step seen so far in this epoch.
            epoch_losses = []
            with tqdm(datareader.get_train(), desc="Iteration") as progress:
                for step, batch in enumerate(progress):
                    batch = tuple(tensor.to(device) for tensor in batch)
                    # Only the ids and labels are passed to the model; the
                    # mask and segment ids are unpacked but not used.
                    input_ids, _input_mask, _segment_ids, label_ids = batch

                    loss = self.model(input_ids, labels=label_ids)[0]
                    epoch_losses.append(loss.item())

                    # Scale so the accumulated gradient is an average over
                    # the accumulation window rather than a sum.
                    loss = loss / accumulation_steps
                    loss.backward()

                    if (step + 1) % accumulation_steps == 0:
                        # One row per optimizer step: the running mean of
                        # this epoch's losses so far.
                        # NOTE(review): the list is never reset inside the
                        # epoch, so this is not a per-window mean -- confirm
                        # that is the intended statistic.
                        batch_losses.append(mean(epoch_losses))
                        # Gradient clipping is intentionally not applied.
                        self.optimizer.step()
                        self.optimizer.zero_grad()
                # NOTE(review): if the number of steps is not a multiple of
                # accumulation_steps, the leftover gradients are neither
                # applied nor cleared here and carry into the next epoch.

            # Checkpoint first, then advance: the file is named after the
            # epoch just finished and stores the next epoch index.
            self.save()
            self.epochs = self.epochs + 1

        self.save_batch_losses(pd.DataFrame(batch_losses))

    def save_batch_losses(self, losses):
        """Write *losses* (a DataFrame) to ``<experiment dir>/batch_loss.csv``.

        The local directory is created when missing; for ``gs://`` targets no
        directory is created.
        """
        experiment_dir = self._experiment_dir()
        if not self._is_gcs_path(experiment_dir):
            os.makedirs(experiment_dir, exist_ok=True)
        losses.to_csv(os.path.join(experiment_dir, "batch_loss.csv"))

    def set_eval_mode(self):
        """Switch ``self.model`` to evaluation mode."""
        self.model.eval()

    # ------------------------------------------------------------------
    # Checkpoint loading
    # ------------------------------------------------------------------
    def load_object_from_location(self, checkpoint_file):
        """Return the object ``torch.load`` reads from *checkpoint_file*.

        ``gs://`` paths are opened through ``gcsfs``; anything else is
        treated as a local file path.
        """
        if not self._is_gcs_path(checkpoint_file):
            return torch.load(checkpoint_file)

        fs = gcsfs.GCSFileSystem()
        with fs.open(checkpoint_file, mode='rb') as f:
            return torch.load(f)

    def load_from_checkpoint(self, checkpoint_file=None):
        """Restore epoch counter, model weights and optimizer state.

        Args:
            checkpoint_file: File name inside the run's ``checkpoints``
                directory. When omitted, ``config['load_from_checkpoint']``
                is used; if that key is absent too, nothing is loaded.
        """
        if checkpoint_file is None:
            if 'load_from_checkpoint' not in self.config:
                return
            checkpoint_file = self.config['load_from_checkpoint']

        # Checkpoints are named after the epoch number, so accept an int too.
        file_path = os.path.join(self._checkpoint_dir(), str(checkpoint_file))
        checkpoint = self.load_object_from_location(file_path)

        self.epochs = checkpoint['epoch']
        self.model.load_state_dict(checkpoint['bert_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])

        # Work around https://github.com/pytorch/pytorch/issues/2830:
        # make sure the restored optimizer state lives on the device that
        # training will use. The target is the same string ``train`` hands to
        # ``torch.device``, so "cuda", "cuda:0" and "cpu" are all handled.
        device = torch.device(self.config['device'])
        for state in self.optimizer.state.values():
            for key, value in state.items():
                if isinstance(value, torch.Tensor):
                    state[key] = value.to(device)

    # ------------------------------------------------------------------
    # Checkpoint saving
    # ------------------------------------------------------------------
    def save_object_to_location(self, object):  # noqa: A002 - public parameter name
        """Serialize *object* with ``torch.save`` to this epoch's checkpoint.

        The target is ``<experiment dir>/checkpoints/<self.epochs>``. Local
        directories are created on demand; ``gs://`` targets are written
        through ``gcsfs``.
        """
        checkpoint_dir = self._checkpoint_dir()
        file_name = os.path.join(checkpoint_dir, str(self.epochs))

        if not self._is_gcs_path(checkpoint_dir):
            os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(object, file_name)
            return

        fs = gcsfs.GCSFileSystem()
        with fs.open(file_name, mode='wb') as f:
            torch.save(object, f)

    def save(self):
        """Checkpoint the model and optimizer for the epoch just finished.

        The stored ``'epoch'`` is ``self.epochs + 1``, i.e. the epoch that
        training should resume from after loading this checkpoint.
        """
        checkpoint = {
            'epoch': self.epochs + 1,
            'bert_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
        }
        self.save_object_to_location(checkpoint)
        logging.info("Saved model")