
--- a
+++ b/finetunning_clinicalbert.py
@@ -0,0 +1,335 @@
+# -*- coding: utf-8 -*-
+"""fineTunning_ClinicalBERT.ipynb
+
+
+
+
+"""### Fine Tunning"""
+
+!pip install transformers[torch]
+
+
+
+# Convert the chronologies DataFrame to a single string (.to_string() must be called,
+# otherwise the bound method itself is assigned)
+haa_trainChronologies_string = haa_trainChronologies.to_string()
+
+print(haa_trainChronologies_string)
+
+example = haa_trainChronologies_string
+
+from datasets import Dataset
+
+
+# Convert the pandas DataFrame into a Hugging Face Dataset
+hf_haa_develAdmittimes = Dataset.from_pandas(haa_develAdmittimes)
+
+hf_haa_develAdmittimes
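+
+# Optional sanity check: inspect the converted dataset's columns and first row
+print(hf_haa_develAdmittimes.column_names)
+print(hf_haa_develAdmittimes[0])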
+
+
+
+
+
+
+
+from transformers import AutoTokenizer
+
+# Load the ClinicalBERT tokenizer before defining the tokenization function
+tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+
+def tokenize_data(example):
+    # Combine the admission fields into a single text string
+    text_to_tokenize = f"Subject ID: {example['subject_id']} Hospital Admission ID: {example['hadm_id']} Admittime: {example['admittime']} Observations: {example.get('observations', '')}"
+
+    # Tokenize the combined text with consistent padding and truncation
+    return tokenizer(
+        text_to_tokenize,
+        padding="max_length",   # Ensures all outputs have the same length
+        truncation=True,        # Ensures no output exceeds max_length
+        max_length=512          # Sets the maximum length of a sequence
+    )
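+
+# An alternative sketch, assuming batched mapping is preferred: with batched=True,
+# datasets.map passes a dict of lists, so build one string per row before
+# tokenizing (the tokenizer accepts a list of strings directly).
+def tokenize_data_batched(batch):
+    texts = [
+        f"Subject ID: {sid} Hospital Admission ID: {hid} Admittime: {adm} Observations: {obs}"
+        for sid, hid, adm, obs in zip(
+            batch['subject_id'], batch['hadm_id'], batch['admittime'],
+            batch.get('observations', [''] * len(batch['subject_id'])),
+        )
+    ]
+    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)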
+
+# Apply the tokenizer across the Hugging Face dataset. tokenize_data handles one
+# row at a time, so map runs unbatched here (see the batched variant above).
+tokenized_dataset = hf_haa_develAdmittimes.map(
+    tokenize_data,
+    remove_columns=hf_haa_develAdmittimes.column_names
+)
+
+from transformers import DataCollatorWithPadding
+
+# Initialize a data collator that dynamically pads each batch to its longest sequence
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)  # None pads dynamically to the longest in the batch
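+
+# A quick toy illustration (strings assumed for demonstration) of what dynamic
+# padding means: the collator pads each batch only to its longest member, not to
+# a global max_length, which keeps batches of short inputs small.
+toy_examples = [tokenizer("short note"), tokenizer("a considerably longer clinical note included purely for comparison")]
+toy_batch = data_collator(toy_examples)
+print(toy_batch['input_ids'].shape)  # padded to the longer of the two sequences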
+
+tokenized_dataset
+
+
+
+train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
+train_dataset = train_test_split['train']
+eval_dataset = train_test_split['test']
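+
+# NOTE (assumption): every original column was removed during tokenization, so the
+# dataset carries no target. The Trainer below needs a 'labels' field to compute a
+# loss; attach an illustrative dummy float target per row here, to be replaced
+# with the real supervision signal for the task.
+train_dataset = train_dataset.add_column("labels", [0.0] * len(train_dataset))
+eval_dataset = eval_dataset.add_column("labels", [0.0] * len(eval_dataset))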
+
+!pip install transformers[torch] --upgrade
+
+
+
+from transformers import AutoModelForSequenceClassification
+
+model = AutoModelForSequenceClassification.from_pretrained(
+    "emilyalsentzer/Bio_ClinicalBERT",
+    num_labels=1  # num_labels=1 yields a single-logit regression head (MSE loss); use 2 or more for classification
+)
+
+from transformers import DataCollatorWithPadding
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)
+
+# Collate a small manual batch; note the collator expects a list of examples,
+# not an already-collated batch, so it is applied only once
+sample_batch = [tokenized_dataset[i] for i in range(8)]
+collated_batch = data_collator(sample_batch)
+print(collated_batch)
+
+# Diagnostic to check input shapes
+def check_input_shapes(data):
+    print("Shapes of input tensors:")
+    print("Input IDs:", data['input_ids'].shape)
+    print("Attention Mask:", data['attention_mask'].shape)
+    if 'token_type_ids' in data:
+        print("Token Type IDs:", data['token_type_ids'].shape)
+
+# Apply this diagnostic function to a batch from the training dataloader
+# (requires the `trainer` initialized further below)
+sample_batch = next(iter(trainer.get_train_dataloader()))
+check_input_shapes(sample_batch)
+
+from transformers import DataCollatorWithPadding
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+# Test the data collator on a small batch manually extracted from the dataset
+example_batch = [tokenized_dataset[i] for i in range(8)]  # Adjust range as necessary
+collated_batch = data_collator(example_batch)
+print({k: v.shape for k, v in collated_batch.items()})
+
+
+# Example of inspecting the output of one tokenized example
+example = {'subject_id': '1', 'hadm_id': '100', 'admittime': '2020-01-01', 'observations': 'Patient exhibits symptoms of flu.'}
+tokenized_example = tokenize_data(example)
+print(tokenized_example)
+
+
+# Run a quick forward pass on the manually collated batch to sanity-check the model
+import torch
+
+with torch.no_grad():
+    outputs = model(input_ids=collated_batch['input_ids'], attention_mask=collated_batch['attention_mask'])
+print(outputs.logits.shape)
+
+# Manually create a batch from the tokenized dataset
+sample_batch = [train_dataset[i] for i in range(8)]
+collated_batch = data_collator(sample_batch)
+
+# Print the shapes of each component
+print("Collated batch shapes:")
+for key, tensor in collated_batch.items():
+    print(f"{key}: {tensor.shape}")
+
+
+from torch.utils.data import DataLoader
+from transformers import DataCollatorWithPadding, AutoTokenizer
+
+# Assuming you have initialized your tokenizer already
+tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+# Create a DataLoader to automatically batch and collate samples
+loader = DataLoader(train_dataset, batch_size=8, collate_fn=data_collator)
+
+# Check the first batch
+for batch in loader:
+    print("Batch 'input_ids' shape:", batch['input_ids'].shape)
+    break
+
+print("Collated input_ids shape:", collated_batch['input_ids'].shape)
+
+# Assuming your data loader setup from previous snippets
+loader = DataLoader(train_dataset, batch_size=8, collate_fn=data_collator)
+
+# Print detailed structure of the first few batches
+for batch in loader:
+    if isinstance(batch, dict):
+        for key, value in batch.items():
+            print(f"{key}: {value}")
+            if hasattr(value, 'shape'):
+                print(f"Shape of {key}: {value.shape}")
+    else:
+        print("Batch data type:", type(batch))
+        print(batch)
+    break
+
+# Check the first few items in the dataset
+for i in range(3):
+    print(train_dataset[i])
+
+
+
+
+
+
+from transformers import DataCollatorWithPadding, AutoTokenizer
+
+# Initialize the tokenizer and the data collator
+tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=None)
+
+# Simulate a batch of tokenized examples for demonstration, encoding the text once
+encoded = tokenizer.encode("Sample text here", add_special_tokens=True)
+tokenized_datasets = [{
+    'input_ids': encoded,
+    'token_type_ids': [0] * len(encoded),
+    'attention_mask': [1] * len(encoded)
+} for _ in range(8)]
+
+# Use the data collator to turn these into a batch
+collated_batch = data_collator(tokenized_datasets)
+print({k: v.shape for k, v in collated_batch.items()})
+
+print(collated_batch)
+
+print(tokenized_datasets[0])
+
+from transformers import Trainer, TrainingArguments
+
+# Set up training arguments
+training_args = TrainingArguments(
+    output_dir='./results',          # where to save the model files
+    num_train_epochs=1,              # number of training epochs
+    per_device_train_batch_size=8,   # batch size per device during training
+    evaluation_strategy='steps',     # evaluation is done (and model saved) every eval_steps
+    eval_steps=500,                  # number of steps to run evaluation
+    save_steps=500,                  # number of steps to save the model
+    warmup_steps=500,                # number of steps for the warmup phase
+    weight_decay=0.01                # strength of weight decay
+)
+
+# Initialize the trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,               # training arguments, defined above
+    train_dataset=train_dataset,      # training dataset
+    eval_dataset=eval_dataset,        # evaluation dataset
+    data_collator=data_collator       # our data collator
+)
+
+# Start training
+trainer.train()
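+
+# A minimal follow-up sketch (not in the original notebook): run a final
+# evaluation pass and persist the fine-tuned model and tokenizer for reuse.
+# The output path below is a hypothetical choice.
+metrics = trainer.evaluate()
+print(metrics)
+trainer.save_model('./results/clinicalbert-finetuned')
+tokenizer.save_pretrained('./results/clinicalbert-finetuned')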
+
+
+from rouge_score import rouge_scorer
+
+def rouge_scores(references, predictions):
+    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
+
+    # Store the scores in a list
+    scores = []
+
+    for ref, pred in zip(references, predictions):
+        score = scorer.score(ref, pred)
+        scores.append(score)
+
+    return scores
+
+
+references = [
+    """The timestamps for observations containing "C0392747" are as follows:
+
+- 2104-08-05
+- 2104-08-07
+- 2104-08-08
+- 2104-08-08
+- 2104-08-09
+- ...
+- 2194-10-01
+- 2165-04-30
+- 2165-04-30
+- 2165-05-02
+- 2165-05-09"""
+]
+predictions = [
+    """
+- 2104-08-08
+- 2104-08-07
+- 2104-08-08
+- ...
+- 2194-10-01
+- 2165-04-30
+- 2165-04-30
+- 2165-05-02
+- 2165-05-09"""
+]
+
+# Calculate ROUGE scores (use a new name so the rouge_scores function is not shadowed)
+scores = rouge_scores(references, predictions)
+
+# Print the scores
+for score in scores:
+    print(score)
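+
+# A small follow-up sketch: aggregate the per-pair results into a mean ROUGE-1 F1.
+# Each entry from rouge_scorer is a dict of Score tuples with precision, recall,
+# and fmeasure fields.
+if scores:
+    mean_rouge1_f1 = sum(s['rouge1'].fmeasure for s in scores) / len(scores)
+    print(f"Mean ROUGE-1 F1: {mean_rouge1_f1:.4f}")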
+
+
+
+