# -*- coding: utf-8 -*-
"""fineTunning_ClinicalBERT.ipynb — fine-tune ClinicalBERT on admission data.

NOTE(review): this is a notebook export. Install dependencies outside the
script (the original used `!pip install transformers[torch]`, which is IPython
shell magic, not Python):

    pip install "transformers[torch]" datasets
"""

from datasets import Dataset
from transformers import AutoTokenizer

# haa_trainChronologies_string / haa_develAdmittimes come from earlier
# notebook cells — presumably pandas objects with subject_id, hadm_id,
# admittime and observations fields; TODO confirm against the loading cell.
print(haa_trainChronologies_string)

# Build a Hugging Face dataset from the pandas DataFrame.
# (The original called `.from_pandas` twice, once on an instance — one
# classmethod call is enough.)
hf_haa_develAdmittimes = Dataset.from_pandas(haa_develAdmittimes)

# Load the tokenizer BEFORE it is used by tokenize_data (the original
# defined a tokenize_data that referenced `tokenizer` prior to loading it).
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


def tokenize_data(example):
    """Tokenize one admission record into fixed-length model inputs.

    Combines the identifying fields and the free-text observations into a
    single string, then tokenizes with padding/truncation to max_length=512
    so every example has the same shape.

    Args:
        example: mapping with 'subject_id', 'hadm_id', 'admittime' and
            optionally 'observations' (missing observations become "").

    Returns:
        The tokenizer's encoding dict (input_ids, attention_mask, ...).
    """
    text_to_tokenize = (
        f"Subject ID: {example['subject_id']} "
        f"Hospital Admission ID: {example['hadm_id']} "
        f"Admittime: {example['admittime']} "
        f"Observations: {example.get('observations', '')}"
    )
    return tokenizer(
        text_to_tokenize,
        padding="max_length",   # uniform length across all examples
        truncation=True,        # never exceed max_length
        max_length=512,
    )


# batched=False: tokenize_data formats SCALAR fields into one string; with
# batched=True (as in the original) each field arrives as a list and the
# f-string would stringify whole lists, producing one encoding per batch
# and a length mismatch inside datasets.map.
tokenized_dataset = hf_haa_develAdmittimes.map(
    tokenize_data,
    batched=False,
    remove_columns=hf_haa_develAdmittimes.column_names,
)
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)

# Dynamically pad each batch to its longest sequence (pad_to_multiple_of=None
# means "pad to the batch maximum, no rounding").
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)

# Hold out 10% of the tokenized data for evaluation.
# (The original re-ran `pip install` as bare statements here — that is a
# SyntaxError in a script; install dependencies outside the script.)
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]

# Single-logit head: num_labels=1 gives a regression-style output.
# TODO(review): confirm 1 label is intended; binary classification would
# normally use num_labels=2.
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=1,
)


def check_input_shapes(data):
    """Print the tensor shapes of a collated batch (debug helper).

    Args:
        data: dict of tensors as produced by DataCollatorWithPadding.
    """
    print("Shapes of input tensors:")
    print("Input IDs:", data["input_ids"].shape)
    print("Attention Mask:", data["attention_mask"].shape)
    if "token_type_ids" in data:
        print("Token Type IDs:", data["token_type_ids"].shape)


# Sanity-check one manually collated batch. The original additionally
# re-applied the collator to an already-collated dict (a TypeError) and
# called Trainer.get_train_dataloader as an unbound method on an undefined
# `trainer` — both removed.
sample_batch = data_collator([tokenized_dataset[i] for i in range(8)])
check_input_shapes(sample_batch)
# --- Collator sanity checks -------------------------------------------------

# Collate a small hand-built batch and verify uniform shapes.
example_batch = [tokenized_dataset[i] for i in range(8)]
collated_batch = data_collator(example_batch)
print({k: v.shape for k, v in collated_batch.items()})

# Inspect the tokenization of a single synthetic example.
# The original called `tokenize_function`, which is never defined anywhere
# in the file — the defined helper is `tokenize_data`.
example = {
    "subject_id": "1",
    "hadm_id": "100",
    "admittime": "2020-01-01",
    "observations": "Patient exhibits symptoms of flu.",
}
tokenized_example = tokenize_data(example)
print(tokenized_example)

# Collate a batch drawn from the training split and report shapes.
# (The original also ran `model(input_ids=input_ids_np.squeeze(0), ...)` and
# iterated `loader` — both names undefined at this point — removed.)
sample_batch = [train_dataset[i] for i in range(8)]
collated_batch = data_collator(sample_batch)
print("Collated batch shapes:")
for key, tensor in collated_batch.items():
    print(f"{key}: {tensor.shape}")
from torch.utils.data import DataLoader

# --- DataLoader inspection --------------------------------------------------
# Batch and dynamically pad training samples via a PyTorch DataLoader; the
# collator turns lists of per-example dicts into padded tensor batches.
loader = DataLoader(train_dataset, batch_size=8, collate_fn=data_collator)

# Print the structure of the first batch, then stop.
# (The original printed `collated_batch['input_ids'].shape` here before any
# `collated_batch` existed in this section — removed.)
for batch in loader:
    if isinstance(batch, dict):
        for key, value in batch.items():
            if hasattr(value, "shape"):
                print(f"Shape of {key}: {value.shape}")
            else:
                print(f"{key}: {value}")
    else:
        print("Batch data type:", type(batch))
        print(batch)
    break

# Peek at the first few raw (uncollated) items.
for i in range(3):
    print(train_dataset[i])
from transformers import Trainer, TrainingArguments
from rouge_score import rouge_scorer

# --- Simulated batch check --------------------------------------------------
# Build 8 identical fake tokenized examples and confirm the collator
# produces uniformly shaped tensors. Encode once and reuse (the original
# re-encoded the same text three times per example).
sample_ids = tokenizer.encode("Sample text here", add_special_tokens=True)
simulated_examples = [
    {
        "input_ids": sample_ids,
        "token_type_ids": [0] * len(sample_ids),
        "attention_mask": [1] * len(sample_ids),
    }
    for _ in range(8)
]
collated_batch = data_collator(simulated_examples)
print({k: v.shape for k, v in collated_batch.items()})

# --- Training ---------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./results",          # where model checkpoints are saved
    num_train_epochs=1,
    per_device_train_batch_size=8,
    evaluation_strategy="steps",     # evaluate every eval_steps
    eval_steps=500,
    save_steps=500,
    warmup_steps=500,                # LR warmup phase
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

trainer.train()


# --- ROUGE evaluation -------------------------------------------------------
def rouge_scores(references, predictions):
    """Score each (reference, prediction) pair with ROUGE-1/2/L.

    Args:
        references: list of reference strings.
        predictions: list of predicted strings, paired positionally.

    Returns:
        List of per-pair score dicts from rouge_scorer.RougeScorer.score.
    """
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"], use_stemmer=True
    )
    return [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]


# The original literals were invalid Python: unescaped inner double quotes
# and raw multi-line strings without triple quoting. Rebuilt as explicit
# newline-joined strings with the same text content.
references = [
    'The timestamps for observations containing "C0392747" are as follows:\n\n'
    "- 2104-08-05\n"
    "- 2104-08-07\n"
    "- 2104-08-08\n"
    "- 2104-08-08\n"
    "- 2104-08-09\n"
    "- ...\n"
    "- 2194-10-01\n"
    "- 2165-04-30\n"
    "- 2165-04-30\n"
    "- 2165-05-02\n"
    "- 2165-05-09"
]
predictions = [
    "- 2104-08-08\n"
    "- 2104-08-07\n"
    "- 2104-08-08\n"
    "- ...\n"
    "- 2194-10-01\n"
    "- 2165-04-30\n"
    "- 2165-04-30\n"
    "- 2165-05-02\n"
    "- 2165-05-09"
]

# Distinct result name — the original did `rouge_scores = rouge_scores(...)`,
# shadowing the function with its own return value.
score_list = rouge_scores(references, predictions)
for score in score_list:
    print(score)