In [None]:
# Install the notebook's dependencies into the *kernel's* environment (%pip, not !pip).
# `datasets` is capped below 3.0 because this notebook imports `load_metric`,
# which was removed from the package in the 3.0 release.
# NOTE(review): transformers and gdown are still unpinned -- consider pinning them
# to the versions this notebook was last run with for reproducibility.
%pip install -q "datasets<3" transformers gdown



In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
from datasets import load_dataset, load_metric

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM. The relative `drive/MyDrive/...` paths used
# later presumably resolve against /content, the directory the download cell writes to.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Download the zipped MLM-pretrained ALBERT checkpoint from Google Drive.
# The URL is quoted because `?` is a shell glob character.
!gdown "https://drive.google.com/uc?id=10663oMohQRDuNCQ0REMIeY8IAKoCiK3A"

Downloading...
From: https://drive.google.com/uc?id=10663oMohQRDuNCQ0REMIeY8IAKoCiK3A
To: /content/checkpoint-final-mlm-albert.zip
42.2MB [00:00, 134MB/s] 


In [None]:
# Extract the checkpoint archive. `-n` never overwrites files that already exist,
# so a re-run does not stop at unzip's interactive "replace?" prompt
# (it is the non-interactive equivalent of answering "[N]one").
!unzip -n checkpoint-final-mlm-albert.zip

Archive:  checkpoint-final-mlm-albert.zip
replace drive/MyDrive/albert-mlm/checkpoint-final-mlm-albert/config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
# Sanity check: list the extracted checkpoint directory (the same path that is
# assigned to `model_checkpoint` below).
!ls drive/MyDrive/albert-mlm/checkpoint-final-mlm-albert

all_results.json   special_tokens_map.json  training_args.bin
config.json	   tokenizer_config.json    train_results.json
eval_results.json  tokenizer.json
pytorch_model.bin  trainer_state.json


In [None]:
# Tokenization window settings, passed to the tokenizer in prepare_train_features:
#   max_length -> `max_length=` (size every feature is truncated/padded to)
#   doc_stride -> `stride=`     (overlap between consecutive windows of a long context)
max_length, doc_stride = 384, 128

# Directory of the checkpoint that both the tokenizer and the model are loaded from.
model_checkpoint = "drive/MyDrive/albert-mlm/checkpoint-final-mlm-albert"

In [None]:
# Load the SQuAD v2 dataset.
datasets = load_dataset("squad_v2")

# Tokenizer saved alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# prepare_train_features asks the tokenizer for offset mappings, which only the
# "fast" tokenizers provide; fail here with a clear message instead of
# part-way through datasets.map().
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast), (
    "A fast tokenizer is required: prepare_train_features uses return_offsets_mapping"
)
# True when the tokenizer pads on the right, i.e. the question is passed first
# and the context second (see prepare_train_features).
pad_on_right = tokenizer.padding_side == "right"

# Load the checkpoint into a question-answering model. The load warning in the cell
# output reports that the checkpoint's MLM-head weights are not used by this class.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


Some weights of the model checkpoint at drive/MyDrive/albert-mlm/checkpoint-final-mlm-albert were not used when initializing AlbertForQuestionAnswering: ['predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at drive/MyDrive/albert-mlm/checkpoint-final-mlm-albert and are newly initi

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label each feature with answer token positions.

    Long contexts are split into several overlapping features (``stride=doc_stride``),
    so one example can yield more than one feature. For every feature the answer's
    start and end token indices are computed from the character offsets. Features
    whose example has no answer, or whose window does not fully contain the answer,
    are labelled with the index of the CLS token for both positions.

    Reads the module-level ``tokenizer``, ``pad_on_right``, ``max_length`` and
    ``doc_stride``.

    Args:
        examples: batch mapping with ``"question"``, ``"context"`` and ``"answers"``
            columns; each answers entry has ``"answer_start"`` and ``"text"`` lists.
            ``examples["question"]`` is replaced in place by its left-stripped version.
            Only the first answer of each example is used.

    Returns:
        The tokenizer output (without ``overflow_to_sample_mapping`` and
        ``offset_mapping``) extended with ``"start_positions"`` and
        ``"end_positions"``, one entry per feature.
    """
    # Leading whitespace in a question carries no information but can make
    # truncation of the context fail, so drop it.
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Only the context is ever truncated; which argument slot it occupies depends
    # on the tokenizer's padding side.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per feature: (start_char, end_char) of every token in its source text.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Sequence id that marks context tokens in `sequence_ids`.
    context_id = 1 if pad_on_right else 0

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        answers = examples["answers"][sample_mapping[i]]

        # Example without an answer: point both labels at CLS.
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of the context inside this feature.
        context_start = 0
        while sequence_ids[context_start] != context_id:
            context_start += 1
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != context_id:
            context_end -= 1

        # Answer not fully inside this window: label with CLS.
        if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk inwards from both ends of the context to the answer boundaries.
        # Both scans are confined to [context_start, context_end]: special and
        # padding tokens outside the context are reported with (0, 0) offsets by
        # Hugging Face fast tokenizers, so an unbounded scan would run straight
        # through them when the answer touches the edge of the context and would
        # label a [SEP]/padding position instead of the real answer token.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [None]:
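# Uncomment to delete the previous fine-tuning output and start from scratch.
# Destructive: this removes every checkpoint saved under albert-qa.
# !rm -rf drive/MyDrive/albert-qa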

In [None]:
# Make sure the directory used as the Trainer's output_dir exists
# (-p: no error if it is already there).
!mkdir -p drive/MyDrive/albert-qa

In [None]:
# Sanity check: list the Drive root to confirm the albert-mlm and albert-qa folders exist.
!ls drive/MyDrive/

 albert-mlm			     MirasText.zip
 albert-qa			     model
 all_sentences.txt		     qa.json
 bert-base-finetuned		     qa_static.csv
 bert-large-finetuned		     runs
 cached_train_checkpoint-final_384   titles_word2vec.model
'Colab Notebooks'		     transformers
 documents_word2vec.model	    'Untitled document.gdoc'
 drug_dataset.csv		     تایم‌شیت.gsheet


In [None]:
# Last path component of the checkpoint directory.
# NOTE(review): not referenced by any cell visible in this notebook.
model_name = model_checkpoint.split("/")[-1]

# Hyper-parameters for fine-tuning. Checkpoints are written to Google Drive so they
# outlive the Colab VM; at most `save_total_limit` of them are kept.
args = TrainingArguments(
    output_dir='drive/MyDrive/albert-qa',
    do_train=True,
    # evaluation_strategy="epoch" switches evaluation on regardless of do_eval,
    # so state the effective value instead of a contradictory False.
    do_eval=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=3,
    # NOTE: Trainer itself does not act on this field -- it is only a flag for
    # user code. Resuming actually requires passing a checkpoint to
    # trainer.train(resume_from_checkpoint=...).
    resume_from_checkpoint=True,
)

In [None]:
# Tokenize every split in batches. The original columns are dropped because one
# example can expand into several features, so the row counts no longer line up.
raw_column_names = datasets["train"].column_names
tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=raw_column_names,
)

  0%|          | 0/131 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
# Every feature is already padded to max_length during tokenization
# (padding="max_length" in prepare_train_features), so the library's default
# collator is used rather than a padding-aware one.
data_collator = default_data_collator

In [None]:
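# Optional (disabled): smaller subsets for a quick trial run.
# train_test_split(0.9) puts 90% of the rows in 'test', so ['train'] keeps a random 10% of each split.
# part_train_data = tokenized_datasets["train"].train_test_split(0.9)['train']
# part_validation_data = tokenized_datasets["validation"].train_test_split(0.9)['train']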

In [None]:
# Bundle the model, hyper-parameters, tokenized splits and collator into a Trainer.
# The tokenizer is handed over as well; the training log shows its files being
# written into every saved checkpoint.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Show host RAM usage (human-readable units) before training starts.
!free -h

              total        used        free      shared  buff/cache   available
Mem:            12G        2.2G        5.5G        4.2M        5.0G         10G
Swap:            0B          0B          0B


In [None]:
# Show which GPU is attached to the runtime and how much of its memory is in use.
!nvidia-smi

Sun Sep 12 22:02:37 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    76W / 149W |    504MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# `resume_from_checkpoint` on TrainingArguments is only a flag for user code; Trainer
# does not act on it. Honour it here: when it is set, continue from the newest
# `checkpoint-*` folder in output_dir. get_last_checkpoint returns None when there is
# no checkpoint yet, in which case training simply starts from step 0.
last_checkpoint = None
if args.resume_from_checkpoint:
    last_checkpoint = transformers.trainer_utils.get_last_checkpoint(args.output_dir)

trainer.train(resume_from_checkpoint=last_checkpoint)

***** Running training *****
  Num examples = 131958
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 16496


Epoch,Training Loss,Validation Loss


Saving model checkpoint to drive/MyDrive/albert-qa/checkpoint-500
Configuration saved in drive/MyDrive/albert-qa/checkpoint-500/config.json
Model weights saved in drive/MyDrive/albert-qa/checkpoint-500/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/albert-qa/checkpoint-500/tokenizer_config.json
Special tokens file saved in drive/MyDrive/albert-qa/checkpoint-500/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/albert-qa/checkpoint-1000
Configuration saved in drive/MyDrive/albert-qa/checkpoint-1000/config.json
Model weights saved in drive/MyDrive/albert-qa/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/albert-qa/checkpoint-1000/tokenizer_config.json
Special tokens file saved in drive/MyDrive/albert-qa/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/albert-qa/checkpoint-1500
Configuration saved in drive/MyDrive/albert-qa/checkpoint-1500/config.json
Model weights saved in drive/MyDrive/albert-q

In [None]:
# Save the fine-tuned model (typo fixed: `rainer` raised NameError).
# NOTE(review): this is a relative path, so presumably it lands on the Colab VM's
# local disk rather than on Drive -- copy it to Drive if it must outlive the session.
trainer.save_model("albert-base-squad-trained")