# -*- coding: utf-8 -*-
"""fineTunning_ClinicalBERT.ipynb"""

"""### Fine Tuning"""

!pip install transformers[torch]
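
# The cells below reference `haa_trainChronologies` and `haa_develAdmittimes`,
# which are built elsewhere in this project from MIMIC-derived exports. A minimal,
# hypothetical stand-in is sketched here so the later cells can be exercised end
# to end; the column names mirror how the dataframes are used below, but the
# values and row count are assumptions, not the real data. Replace this with the
# real dataframes before training.
import pandas as pd

haa_develAdmittimes = pd.DataFrame({
    'subject_id': [str(i) for i in range(16)],
    'hadm_id': [str(100 + i) for i in range(16)],
    'admittime': ['2104-08-05'] * 16,
    'observations': ['Patient exhibits symptoms of flu.'] * 16,
})
haa_trainChronologies = haa_develAdmittimes[['subject_id', 'hadm_id', 'observations']].copy()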

# Render the chronologies dataframe as a single string for inspection
haa_trainChronologies_string = haa_trainChronologies.to_string()
print(haa_trainChronologies_string)

example = haa_trainChronologies_string

from datasets import Dataset

# Build Hugging Face datasets from the admissions dataframe
hf_dataset = Dataset.from_pandas(haa_develAdmittimes)

hf_haa_develAdmittimes = Dataset.from_pandas(haa_develAdmittimes)

hf_dataset

# First draft of the tokenization function. It is superseded by the version
# below, which also includes the observations field and a longer max_length.
def tokenize_data(example):
    combined_text = f"Subject ID: {example['subject_id']} Hospital Admission ID: {example['hadm_id']} Admittime: {example['admittime']}"

    # Tokenize the text and handle padding directly, ensuring output is suitable for processing
    tokenized_output = tokenizer(combined_text, truncation=True, padding='max_length', max_length=16)

    # Return the tokenizer output (a dict of input_ids, attention_mask, etc.)
    return tokenized_output

from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

def tokenize_data(example):
    # Create a single text string from the dataset fields
    text_to_tokenize = f"Subject ID: {example['subject_id']} Hospital Admission ID: {example['hadm_id']} Admittime: {example['admittime']} Observations: {example.get('observations', '')}"

    # Tokenize the combined text with consistent padding and truncation
    return tokenizer(
        text_to_tokenize,
        padding="max_length",   # Ensures all outputs have the same length
        truncation=True,        # Ensures no output exceeds max_length
        max_length=512          # Sets the maximum length of a sequence
    )

# Apply the tokenizer to the Hugging Face dataset with map.
# tokenize_data builds one text per example, so map row by row (batched=False);
# with batched=True the function would receive lists and return a single encoding.
tokenized_dataset = hf_haa_develAdmittimes.map(
    tokenize_data,
    batched=False,
    remove_columns=hf_haa_develAdmittimes.column_names
)

from transformers import DataCollatorWithPadding

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Initialize a data collator that dynamically pads the batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)  # None means it pads to the longest sequence in the batch

# Apply tokenization to the dataset (row by row, as above)
tokenized_dataset = hf_haa_develAdmittimes.map(
    tokenize_data,
    batched=False,
    remove_columns=hf_haa_develAdmittimes.column_names
)
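
# Illustration (not part of the original notebook): DataCollatorWithPadding pads
# each batch to the length of its longest sequence when the inputs are not
# already padded, which is why it is paired with the tokenizer above.
_unpadded = [tokenizer("short note"), tokenizer("a somewhat longer clinical note about the patient")]
_demo_batch = data_collator(_unpadded)
print(_demo_batch['input_ids'].shape)  # both rows padded to the longer encoding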

tokenized_dataset

train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
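
# The tokenized splits above have no 'labels' column, so Trainer.train() below
# would have nothing to compute a loss from. As a hypothetical placeholder so the
# training cell can run, attach a constant float label (float32 to match the
# model's logits dtype); swap in the real task labels here.
from datasets import Value

train_dataset = train_dataset.map(lambda ex: {'labels': 0.0}).cast_column('labels', Value('float32'))
eval_dataset = eval_dataset.map(lambda ex: {'labels': 0.0}).cast_column('labels', Value('float32'))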

!pip install transformers[torch] --upgrade

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=1  # a single output score; with one label the head is trained with an MSE (regression-style) loss
)
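
# Quick sanity check (an illustration, not from the original notebook): with
# num_labels=1 the classification head emits one score per sequence, so the
# logits have shape (batch_size, 1).
_enc = tokenizer("Subject ID: 1 Hospital Admission ID: 100", return_tensors="pt")
print(model(**_enc).logits.shape)  # torch.Size([1, 1])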

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# The collator expects a list of per-example feature dicts,
# so build the list first and collate it once
sample_batch = [tokenized_dataset[i] for i in range(8)]
print(sample_batch)
collated_batch = data_collator(sample_batch)
print(collated_batch)

# Diagnostic to check input shapes
def check_input_shapes(data):
    print("Shapes of input tensors:")
    print("Input IDs:", data['input_ids'].shape)
    print("Attention Mask:", data['attention_mask'].shape)
    if 'token_type_ids' in data:
        print("Token Type IDs:", data['token_type_ids'].shape)

# Apply this diagnostic function to a batch from the training dataloader
# (this requires the `trainer` instance created further below)
sample_batch = next(iter(trainer.get_train_dataloader()))
check_input_shapes(sample_batch)

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Test the data collator on a small batch manually extracted from the dataset
example_batch = [tokenized_dataset[i] for i in range(8)]  # Adjust range as necessary
collated_batch = data_collator(example_batch)
print({k: v.shape for k, v in collated_batch.items()})

# Example of inspecting the output of one tokenized example
example = {'subject_id': '1', 'hadm_id': '100', 'admittime': '2020-01-01', 'observations': 'Patient exhibits symptoms of flu.'}
tokenized_example = tokenize_data(example)
print(tokenized_example)

# Collate a batch of examples from 'tokenized_dataset'
sample_batch = [tokenized_dataset[i] for i in range(8)]
collated_batch = data_collator(sample_batch)
print({k: v.shape for k, v in collated_batch.items()})

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run a single forward pass on the collated batch from above
outputs = model(input_ids=collated_batch['input_ids'], attention_mask=collated_batch['attention_mask'])

# Pull one batch from the DataLoader `loader` created further below
for batch in loader:
    outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    print(outputs)
    break

# Manually create a batch from the tokenized dataset
sample_batch = [train_dataset[i] for i in range(8)]
collated_batch = data_collator(sample_batch)

# Print the shapes of each component
print("Collated batch shapes:")
for key, tensor in collated_batch.items():
    print(f"{key}: {tensor.shape}")

# Assuming a correct initialization of your data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Manually collate a sample batch
sample_batch = [train_dataset[i] for i in range(8)]
collated_batch = data_collator(sample_batch)

# Print the structure and content of the collated batch to diagnose
print("Collated batch input_ids shape and content:", collated_batch['input_ids'].shape, collated_batch['input_ids'])

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding, AutoTokenizer

# Assuming you have initialized your tokenizer already
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create a DataLoader to automatically batch and collate samples
loader = DataLoader(train_dataset, batch_size=8, collate_fn=data_collator)

# Check the first batch
#for batch in loader:
#    print("Batch 'input_ids' shape:", batch['input_ids'].shape)

print("Collated input_ids shape:", collated_batch['input_ids'].shape)

# Assuming your data loader setup from previous snippets
loader = DataLoader(train_dataset, batch_size=8, collate_fn=data_collator)

# Print the detailed structure of the first batch
for batch in loader:
    if isinstance(batch, dict):
        for key, value in batch.items():
            print(f"{key}: {value}")
            if hasattr(value, 'shape'):
                print(f"Shape of {key}: {value.shape}")
    else:
        print("Batch data type:", type(batch))
        print(batch)
    break

# Check the first few items in the dataset
for i in range(3):
    print(train_dataset[i])

from transformers import DataCollatorWithPadding

# Assuming you have a tokenizer loaded as follows
# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Simulate a batch with several examples from 'tokenized_dataset'
sample_batch = [tokenized_dataset[i] for i in range(8)]  # Collect 8 examples to form a batch
collated_batch = data_collator(sample_batch)  # Apply the data collator

# Print out the shapes of the tensors in the collated batch to verify
print({k: v.shape for k, v in collated_batch.items()})

from transformers import DataCollatorWithPadding, AutoTokenizer

# Initialize the tokenizer and the data collator
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)

# Assuming you have a list of dictionaries from tokenized datasets
# Here we simulate tokenized data for demonstration
tokenized_datasets = [{
    'input_ids': tokenizer.encode("Sample text here", add_special_tokens=True),
    'token_type_ids': [0] * len(tokenizer.encode("Sample text here", add_special_tokens=True)),
    'attention_mask': [1] * len(tokenizer.encode("Sample text here", add_special_tokens=True))
} for _ in range(8)]

# Use the data collator to turn these into a batch
collated_batch = data_collator(tokenized_datasets)
print({k: v.shape for k, v in collated_batch.items()})

print(collated_batch)

print(tokenized_datasets[0])

from transformers import Trainer, TrainingArguments

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # where to save the model files
    num_train_epochs=1,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    evaluation_strategy='steps',     # evaluation is done (and the model saved) every eval_steps
    eval_steps=500,                  # number of steps between evaluations
    save_steps=500,                  # number of steps between checkpoints
    warmup_steps=500,                # number of steps for the warmup phase
    weight_decay=0.01                # strength of weight decay
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=eval_dataset,        # evaluation dataset
    data_collator=data_collator       # our data collator
)

# Start training
trainer.train()
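
# Illustration (not from the original notebook): persist the fine-tuned weights
# and tokenizer so they can be reloaded later; the output path is hypothetical.
trainer.save_model('./results/clinicalbert-finetuned')
tokenizer.save_pretrained('./results/clinicalbert-finetuned')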

from rouge_score import rouge_scorer

def rouge_scores(references, predictions):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Store the scores in a list
    scores = []

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        scores.append(score)

    return scores
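
# Illustration (not from the original notebook): each entry returned above is a
# dict mapping 'rouge1'/'rouge2'/'rougeL' to a Score tuple with precision,
# recall and fmeasure fields, e.g.:
demo = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True).score(
    "admitted on 2104-08-05", "admitted on 2104-08-07")
print(demo['rouge1'].fmeasure)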

references = [
    """The timestamps for observations containing "C0392747" are as follows:

- 2104-08-05
- 2104-08-07
- 2104-08-08
- 2104-08-08
- 2104-08-09
- ...
- 2194-10-01
- 2165-04-30
- 2165-04-30
- 2165-05-02
- 2165-05-09"""
]

predictions = [
    """
- 2104-08-08
- 2104-08-07
- 2104-08-08
- ...
- 2194-10-01
- 2165-04-30
- 2165-04-30
- 2165-05-02
- 2165-05-09
"""
]

# Calculate ROUGE scores (stored under a new name so the function is not shadowed)
scores = rouge_scores(references, predictions)

# Print the scores
for score in scores:
    print(score)