# Base Dependencies
# -----------------
from pathlib import Path
from os.path import join

# Local Dependencies
# ------------------
from ml_models.bert import ClinicalBERTTokenizer

# 3rd-Party Dependencies
# ----------------------
import torch
from datasets import load_from_disk
from transformers import BertForSequenceClassification, AutoConfig
from bertviz import head_view, model_view

# Constants
# ---------
from constants import CHECKPOINTS_CACHE_DIR, DDI_HF_TEST_PATH

N_CHANGES = 50

def run():
    """Stores the attention maps of the Clinical BERT model for the first
    N_CHANGES prediction changes in the DDI test set.

    A "change" is a test example that the initial model misclassifies but
    the final model classifies correctly.
    """
    init_model_path = Path(
        join(CHECKPOINTS_CACHE_DIR, "al", "bert", "ddi", "model_5.ck")
    )
    end_model_path = Path(
        join(CHECKPOINTS_CACHE_DIR, "al", "bert", "ddi", "model_6.ck")
    )
    head_views_output_folder = Path(
        join("results", "ddi", "bert", "interpretability", "head_views")
    )
    model_views_output_folder = Path(
        join("results", "ddi", "bert", "interpretability", "model_views")
    )

    # ensure the output folders exist before writing any files
    head_views_output_folder.mkdir(parents=True, exist_ok=True)
    model_views_output_folder.mkdir(parents=True, exist_ok=True)

    # load dataset and tokenizer
    tokenizer = ClinicalBERTTokenizer()
    test_dataset = load_from_disk(Path(join(DDI_HF_TEST_PATH, "bert")))
    sentences = test_dataset["sentence"]
    labels = test_dataset["label"]

    # load the two BERT checkpoints with attention outputs enabled
    init_config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=init_model_path
    )
    init_config.output_attentions = True
    end_config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=end_model_path
    )
    end_config.output_attentions = True
    init_model = BertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=init_model_path, config=init_config
    )
    end_model = BertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=end_model_path, config=end_config
    )

    changes = []
    for index, (sentence, label) in enumerate(zip(sentences, labels)):
        if label > 0:
            inputs = tokenizer.encode(sentence, return_tensors="pt")
            # inference only: no gradients needed
            with torch.no_grad():
                init_outputs = init_model(inputs)
                end_outputs = end_model(inputs)
            init_y_pred = torch.argmax(init_outputs["logits"]).item()
            end_y_pred = torch.argmax(end_outputs["logits"]).item()
            # keep only examples the initial model got wrong and the final model got right
            if end_y_pred == label and init_y_pred != label:
                tokens = tokenizer.convert_ids_to_tokens(inputs[0])
                init_head_view = head_view(
                    init_outputs["attentions"], tokens, html_action="return"
                )
                init_model_view = model_view(
                    init_outputs["attentions"], tokens, html_action="return"
                )
                end_head_view = head_view(
                    end_outputs["attentions"], tokens, html_action="return"
                )
                end_model_view = model_view(
                    end_outputs["attentions"], tokens, html_action="return"
                )

                # save the HTML objects to file
                with open(head_views_output_folder / f"{index}_init.html", "w") as f:
                    f.write(init_head_view.data)
                with open(head_views_output_folder / f"{index}_end.html", "w") as f:
                    f.write(end_head_view.data)
                with open(model_views_output_folder / f"{index}_init.html", "w") as f:
                    f.write(init_model_view.data)
                with open(model_views_output_folder / f"{index}_end.html", "w") as f:
                    f.write(end_model_view.data)

                changes.append(
                    f"Index: {index} Initial prediction: {init_y_pred} Final prediction: {end_y_pred}"
                )
                if len(changes) == N_CHANGES:
                    break

    # save list of changes to file
    with open(
        join("results", "ddi", "bert", "interpretability", "changes.txt"), "w"
    ) as f:
        for item in changes:
            f.write(f"{item}\n")
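

# Entry point: the original file does not show how run() is invoked, so this
# standard guard is an assumption added so the script can be executed directly
# (e.g. `python store_attention_maps.py`).
if __name__ == "__main__":
    run()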