RAG-On-Clinical-Data / Git / [0037e2] /finetunning

Models:
RaymondKing/
RAG-On-Clinical-Data
Downloads: 1
[0037e2]: / finetunning_clinicalbert.py
History
Download this file
336 lines (230 with data), 10.6 kB

# -*- coding: utf-8 -*-
"""fineTunning_ClinicalBERT.ipynb




"""### Fine Tunning"""

# Notebook-only setup step. `!pip ...` is IPython shell syntax and a
# SyntaxError in a plain .py file, so it is kept as a comment; run it in a
# terminal (or in a notebook cell) before executing this script:
#   pip install "transformers[torch]"

# NOTE(review): `haa_trainChronologies_string` is never assigned in this file.
# The only candidate is the commented-out line below, which would also store
# the bound `to_string` method rather than its result. Presumably the name was
# created in an earlier notebook cell - confirm before running as a script.
#haa_trainChronologies_string = haa_trainChronologies.to_string

print(haa_trainChronologies_string)

example = haa_trainChronologies_string

from datasets import Dataset

# NOTE(review): `haa_develAdmittimes` is not defined in this file; it is
# presumably a pandas DataFrame produced by an earlier notebook cell - confirm.
hf_dataset = Dataset.from_pandas(haa_develAdmittimes)

# Alias instead of converting the same DataFrame a second time; the rest of
# the script refers to the dataset by this name.
hf_haa_develAdmittimes = hf_dataset

# A bare expression only displays in a notebook; print works in a script too.
print(hf_dataset)







def tokenize_data(example):
    """Tokenize one admission record rendered as a short descriptive text.

    Args:
        example: Mapping providing ``subject_id``, ``hadm_id`` and
            ``admittime`` entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the rendered text,
        truncated and padded to 16 tokens.
    """
    subject = example['subject_id']
    admission = example['hadm_id']
    admitted_at = example['admittime']
    sentence = f"Subject ID: {subject} Hospital Admission ID: {admission} Admittime: {admitted_at}"

    # Fixed-length output: anything longer than 16 tokens is cut, anything
    # shorter is padded up to 16.
    return tokenizer(sentence, truncation=True, padding='max_length', max_length=16)

from transformers import AutoTokenizer

# Load the tokenizer published with the Bio_ClinicalBERT checkpoint.
# `tokenize_data` reads this module-level `tokenizer` each time it is called.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

def tokenize_data(example):
    """Tokenize admission records rendered as descriptive text.

    Supports both calling conventions of ``Dataset.map``:

    * ``batched=False`` - ``example`` maps column names to scalar values and
      a single text is tokenized.
    * ``batched=True`` - ``example`` maps column names to lists of values.
      One text is built per row and the list of texts is tokenized, so the
      output has one entry per input row. Formatting the whole lists into a
      single string would collapse the batch into one sequence.

    Args:
        example: Mapping with ``subject_id``, ``hadm_id`` and ``admittime``
            entries and an optional ``observations`` entry (treated as an
            empty string when absent).

    Returns:
        The output of the module-level ``tokenizer``, padded and truncated to
        512 tokens per text.
    """
    def _render(subject_id, hadm_id, admittime, observations):
        # Single source of truth for the text layout of one record.
        return (
            f"Subject ID: {subject_id} Hospital Admission ID: {hadm_id} "
            f"Admittime: {admittime} Observations: {observations}"
        )

    subject_ids = example['subject_id']

    # Batched calls deliver every column as a list (assumes the dataset's
    # default Python format - TODO confirm if a numpy/torch format is set).
    if isinstance(subject_ids, (list, tuple)):
        observations = example.get('observations')
        if observations is None:
            observations = [''] * len(subject_ids)
        text_to_tokenize = [
            _render(*row)
            for row in zip(subject_ids, example['hadm_id'], example['admittime'], observations)
        ]
    else:
        text_to_tokenize = _render(
            subject_ids,
            example['hadm_id'],
            example['admittime'],
            example.get('observations', ''),
        )

    # Tokenize with consistent padding and truncation.
    return tokenizer(
        text_to_tokenize,
        padding="max_length",   # Ensures all outputs have the same length
        truncation=True,        # Ensures no output exceeds max_length
        max_length=512          # Sets the maximum length of a sequence
    )

from transformers import DataCollatorWithPadding

# Collator used when batches are assembled later in the script.
# pad_to_multiple_of=None means the padded length is not rounded up to a
# multiple of some value. Because `tokenize_data` already pads every text to
# its max_length, the rows arrive at the collator with equal lengths.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)

# Tokenize the dataset once with the module-level `tokenizer`, dropping the
# original columns so that only the tokenizer output remains.
# NOTE(review): with batched=True the mapped function receives lists of column
# values - confirm `tokenize_data` produces one text per row.
tokenized_dataset = hf_haa_develAdmittimes.map(
    tokenize_data,
    batched=True,
    batch_size=8,
    remove_columns=hf_haa_develAdmittimes.column_names
)

# A bare expression only displays in a notebook; print works in a script too.
print(tokenized_dataset)



# Hold out 10% of the tokenized rows for evaluation; the remaining rows are
# used for training.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

# Notebook-only shell step: a bare `pip install ...` line is not valid Python
# and stops the whole file from being parsed. Run it once in a terminal (or as
# `%pip install ...` in a notebook cell) before executing this script:
#   pip install "transformers[torch]" --upgrade



from transformers import AutoModelForSequenceClassification

# Load the Bio_ClinicalBERT checkpoint with a sequence-classification head.
# NOTE(review): num_labels=1 creates a single output unit, which Transformers
# documents as a regression setup rather than classification - confirm the
# intended number of classes.
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=1  # Specify the number of labels in your classification task
)

from transformers import DataCollatorWithPadding

# One collator is enough: leaving pad_to_multiple_of unset is the same as
# passing None explicitly.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Collate the first eight tokenized rows into one batch and show it.
sample_batch = data_collator([tokenized_dataset[i] for i in range(8)])
print(sample_batch)

# NOTE(review): `sample_batch` is already collated at this point, so this is a
# second pass over the same batch - confirm it is intentional.
collated_batch = data_collator(sample_batch)
print(collated_batch)

# Diagnostic to check input shapes
def check_input_shapes(data):
    """Print the ``.shape`` of each model input found in a batch.

    Args:
        data: Mapping with ``input_ids`` and ``attention_mask`` entries and an
            optional ``token_type_ids`` entry; each value must expose a
            ``shape`` attribute.

    Returns:
        None. The report is written to standard output.
    """
    print("Shapes of input tensors:")

    # Required inputs first; the optional one is reported only when present.
    labelled_keys = [("Input IDs", 'input_ids'), ("Attention Mask", 'attention_mask')]
    if 'token_type_ids' in data:
        labelled_keys.append(("Token Type IDs", 'token_type_ids'))

    for label, key in labelled_keys:
        print(f"{label}:", data[key].shape)

# Apply this diagnostic function to a batch from the training dataset.
# NOTE(review): `Trainer` is imported and `trainer` is created further down in
# this file, so a top-to-bottom run raises NameError here. Presumably this
# notebook cell was executed after the training-setup cell - confirm the
# intended order.
sample_batch = next(iter(Trainer.get_train_dataloader(trainer)))
check_input_shapes(sample_batch)

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Test the data collator on a small batch manually extracted from the dataset
# and report the shape of every collated tensor.
example_batch = [tokenized_dataset[i] for i in range(8)]  # Adjust range as necessary
collated_batch = data_collator(example_batch)
print({k: v.shape for k, v in collated_batch.items()})

# Inspect the tokenizer output for one hand-written record. `tokenize_data` is
# the tokenization function defined in this file and reads exactly these keys.
example = {'subject_id': '1', 'hadm_id': '100', 'admittime': '2020-01-01', 'observations': 'Patient exhibits symptoms of flu.'}
tokenized_example = tokenize_data(example)
print(tokenized_example)

# Keep `sample_batch` bound to the same eight rows for code that follows.
sample_batch = example_batch

# Drop the leading axis of a single encoded input and run one forward pass.
# NOTE(review): `input_ids_np` and `attention_mask` are not defined anywhere
# in this file - presumably left over from an earlier notebook session;
# confirm where they come from or remove these two lines.
input_ids = input_ids_np.squeeze(0)
outputs = model(input_ids=input_ids,attention_mask=attention_mask)

# Run the model on the first batch only and print the raw outputs.
# NOTE(review): `loader` is first assigned further down in this file, so this
# loop raises NameError in a top-to-bottom run - confirm the intended order.
for batch in loader:
    outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    print(outputs)
    break

from collections.abc import Mapping

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding, AutoTokenizer

# Reuses the module-level `tokenizer`; building the collator once is enough
# for every check below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Manually create a batch from the training split and collate it.
sample_batch = [train_dataset[i] for i in range(8)]
collated_batch = data_collator(sample_batch)

# Print the shapes of each component
print("Collated batch shapes:")
for key, tensor in collated_batch.items():
    print(f"{key}: {tensor.shape}")

# Print the structure and content of collated batch to diagnose
print("Collated batch input_ids shape and content:", collated_batch['input_ids'].shape, collated_batch['input_ids'])

print("Collated input_ids shape:", collated_batch['input_ids'].shape)

# Create a DataLoader to automatically batch and collate samples
loader = DataLoader(train_dataset, batch_size=8, collate_fn=data_collator)

# Print detailed structure of the first batch.
for batch in loader:
    # Check for Mapping rather than dict: the collator returns a dict-like
    # batch container that does not subclass dict, so a plain dict check
    # would always fall through to the else branch.
    if isinstance(batch, Mapping):
        for key, value in batch.items():
            print(f"{key}: {value}")
            if hasattr(value, 'shape'):
                print(f"Shape of {key}: {value.shape}")
    else:
        print("Batch data type:", type(batch))
        print(batch)
    break

# Check the first few items in the dataset
for i in range(3):
    print(train_dataset[i])





from transformers import DataCollatorWithPadding, AutoTokenizer

# Reuses the module-level `tokenizer`; one collator serves both demos below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)

# Demo 1: collate eight rows of the real tokenized dataset and print the
# shapes of the resulting tensors to verify them.
sample_batch = [tokenized_dataset[i] for i in range(8)]  # Collect 8 examples to form a batch
collated_batch = data_collator(sample_batch)  # Apply the data collator
print({k: v.shape for k, v in collated_batch.items()})

# Demo 2: collate simulated tokenized rows. The sample text is encoded once
# and reused; every row gets its own copy of the id list so rows stay
# independent of each other.
sample_ids = tokenizer.encode("Sample text here", add_special_tokens=True)
tokenized_datasets = [{
    'input_ids': list(sample_ids),
    'token_type_ids': [0] * len(sample_ids),
    'attention_mask': [1] * len(sample_ids),
} for _ in range(8)]

# Use the data collator to turn these into a batch
collated_batch = data_collator(tokenized_datasets)
print({k: v.shape for k, v in collated_batch.items()})

print(collated_batch)

print(tokenized_datasets[0])

from transformers import Trainer, TrainingArguments

# Set up training arguments
# NOTE(review): newer Transformers releases name the evaluation-schedule
# argument `eval_strategy`; confirm `evaluation_strategy` is accepted by the
# installed version.
training_args = TrainingArguments(
    output_dir='./results',          # where to save the model files
    num_train_epochs=1,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    evaluation_strategy='steps',     # evaluate on a step schedule (see eval_steps)
    eval_steps=500,                  # number of steps between evaluations
    save_steps=500,                  # number of steps between checkpoint saves
    warmup_steps=500,                # number of steps for the warmup phase
    weight_decay=0.01                # strength of weight decay
)

# Initialize the trainer
# NOTE(review): the tokenized dataset is built with every original column
# removed and `tokenize_data` returns only tokenizer output, so no label
# column is visible in this file - confirm how labels reach the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=eval_dataset,        # evaluation dataset
    data_collator=data_collator       # our data collator
)

# Start training
trainer.train()


from rouge_score import rouge_scorer

def rouge_scores(references, predictions):
    """Score each prediction against its reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        references: Iterable of reference texts.
        predictions: Iterable of predicted texts, paired with ``references``
            by position; pairing stops at the shorter of the two.

    Returns:
        A list with one ``scorer.score(reference, prediction)`` result per pair.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    return [
        scorer.score(reference, prediction)
        for reference, prediction in zip(references, predictions)
    ]


# Reference and predicted answers for one question. Triple-quoted strings are
# required here: the texts span several lines and the reference contains
# double quotes.
references = [
    """The timestamps for observations containing "C0392747" are as follows:

- 2104-08-05
- 2104-08-07
- 2104-08-08
- 2104-08-08
- 2104-08-09
- ...
- 2194-10-01
- 2165-04-30
- 2165-04-30
- 2165-05-02
- 2165-05-09"""
]
predictions = [
    """
- 2104-08-08
- 2104-08-07
- 2104-08-08
- ...
- 2194-10-01
- 2165-04-30
- 2165-04-30
- 2165-05-02
- 2165-05-09"""
]

# Calculate ROUGE scores. The result gets its own name so the `rouge_scores`
# function stays callable afterwards.
computed_scores = rouge_scores(references, predictions)

# Print the scores
for score in computed_scores:
    print(score)