src/extensions/baal/dataset.py

# Base Dependencies
# -----------------
import numpy as np
from typing import List, Union, Any

# 3rd-Party Dependencies
# ----------------------
import torch
from baal.active import ActiveLearningDataset
from baal.active.dataset.base import Dataset
from datasets import Dataset as HFDataset


# Auxiliary Functions
# -------------------
def my_active_huggingface_dataset(
    dataset,
    tokenizer=None,
    target_key: str = "label",
    input_key: str = "sentence",
    max_seq_len: int = 128,
    **kwargs
):
    """
    Wraps a huggingface.datasets dataset with baal.active.ActiveLearningDataset.

    Args:
        dataset (torch.utils.data.Dataset): a dataset provided by huggingface.
        tokenizer (transformers.PreTrainedTokenizer): a tokenizer provided by huggingface.
        target_key (str): target key used in the dataset's dictionary.
        input_key (str): input key used in the dataset's dictionary.
        max_seq_len (int): max length of a sequence, used to pad shorter sequences.
        kwargs (Dict): parameters forwarded to `ActiveLearningDataset`.

    Returns:
        A baal.active.ActiveLearningDataset object.
    """
    return MyActiveLearningDatasetBert(
        MyHuggingFaceDatasets(dataset, tokenizer, target_key, input_key, max_seq_len),
        **kwargs
    )
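
# Usage sketch (illustrative, not part of this module): the dataset and model
# names below are assumptions; any huggingface dataset with a text column and
# an integer label column should work.
#
#   from datasets import load_dataset
#   from transformers import AutoTokenizer
#
#   raw = load_dataset("glue", "sst2", split="train")  # has "sentence"/"label"
#   tok = AutoTokenizer.from_pretrained("bert-base-uncased")
#   al_dataset = my_active_huggingface_dataset(raw, tokenizer=tok)
#   al_dataset.label_randomly(100)  # seed the initial labelled pool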

# Datasets
# --------
class MyHuggingFaceDatasets(Dataset):
    """
    Support for `huggingface.datasets` (https://github.com/huggingface/datasets).

    The purpose of this wrapper is to separate the labels from the rest of the
    sample information and make the dataset ready to be used by
    `baal.active.ActiveLearningDataset`.

    Args:
        dataset (Dataset): a dataset provided by huggingface.
        tokenizer (transformers.PreTrainedTokenizer): a tokenizer provided by huggingface.
        target_key (str): target key used in the dataset's dictionary.
        input_key (str): input key used in the dataset's dictionary.
        max_seq_len (int): max length of a sequence, used to pad shorter sequences.
    """

    def __init__(
        self,
        dataset: HFDataset,
        tokenizer=None,
        target_key: str = "label",
        input_key: str = "sentence",
        max_seq_len: int = 128,
    ):
        self.dataset = dataset
        self.targets, self.texts = self.dataset[target_key], self.dataset[input_key]
        self.targets_list: List = np.unique(self.targets).tolist()
        if tokenizer:
            # Tokenize eagerly so that __getitem__ is a cheap lookup.
            self.input_ids, self.attention_masks = self._tokenize(tokenizer, max_seq_len)
        else:
            # No tokenizer given: assume the dataset was tokenized upstream.
            self.input_ids = self.dataset["input_ids"]
            self.attention_masks = self.dataset["attention_mask"]

    @property
    def num_classes(self):
        return len(self.targets_list)

    def _tokenize(self, tokenizer, max_seq_len):
        # For speed, a fast tokenizer should be used here, but that is up to the caller.
        tokenized = tokenizer(
            self.texts,
            add_special_tokens=True,
            max_length=max_seq_len,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True,
        )
        return tokenized["input_ids"], tokenized["attention_mask"]

    def label(self, idx: int, value: int):
        """Label the item.

        Args:
            idx: index of the item to label.
            value: value to label the item with.
        """
        self.targets[idx] = value

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Map the raw target value to its position in the sorted class list.
        target = self.targets_list.index(self.targets[idx])
        return {
            "input_ids": self.input_ids[idx].flatten()
            if len(self.input_ids) > 0
            else None,
            "inputs": self.texts[idx],
            "attention_mask": self.attention_masks[idx].flatten()
            if len(self.attention_masks) > 0
            else None,
            "label": torch.tensor(target, dtype=torch.long),
        }
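
# Item layout sketch (assuming a tokenizer was supplied and max_seq_len=128;
# `raw` and `tok` as in the usage sketch above):
#
#   ds = MyHuggingFaceDatasets(raw, tokenizer=tok)
#   sample = ds[0]
#   sample["input_ids"].shape       # torch.Size([128])
#   sample["attention_mask"].shape  # torch.Size([128])
#   sample["inputs"]                # the raw input text
#   sample["label"]                 # tensor(<class index>, dtype=torch.long)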


class MyActiveLearningDatasetBert(ActiveLearningDataset):
    """
    MyActiveLearningDatasetBert

    Modification of ActiveLearningDataset that allows indexing with a
    list of integers.
    """

    @property
    def labels(self) -> List[int]:
        return self._dataset[self.get_indices_for_active_step()]["label"]

    @property
    def n_labelled_tokens(self) -> int:
        return (
            self._dataset.dataset[self.get_indices_for_active_step()]["seq_length"]
            .sum()
            .item()
        )

    @property
    def n_labelled_chars(self) -> int:
        return (
            self._dataset.dataset[self.get_indices_for_active_step()]["char_length"]
            .sum()
            .item()
        )

    def __getitem__(self, index: Union[int, List[int]]) -> Any:
        """Return items from the original dataset based on the labelled indices."""
        _index = np.array(self.get_indices_for_active_step())[index]
        return self._dataset[_index]
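
# Indexing sketch: unlike the stock ActiveLearningDataset, this subclass is
# meant to accept a list of positions into the labelled pool as well as a
# single int (names from the usage sketch above):
#
#   al_dataset[0]          # one labelled item
#   al_dataset[[0, 3, 7]]  # items 0, 3 and 7 of the labelled pool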


class MyActiveLearningDatasetBilstm(ActiveLearningDataset):
    """
    MyActiveLearningDatasetBilstm

    Modification of ActiveLearningDataset that allows indexing with a
    list of integers. Unlike the Bert variant, the length statistics are
    read from the wrapped dataset directly rather than from its inner
    huggingface dataset.
    """

    @property
    def labels(self) -> List[int]:
        return self._dataset[self.get_indices_for_active_step()]["label"]

    @property
    def n_labelled_tokens(self) -> int:
        return (
            self._dataset[self.get_indices_for_active_step()]["seq_length"].sum().item()
        )

    @property
    def n_labelled_chars(self) -> int:
        return (
            self._dataset[self.get_indices_for_active_step()]["char_length"]
            .sum()
            .item()
        )

    def __getitem__(self, index: Union[int, List[int]]) -> Any:
        """Return items from the original dataset based on the labelled indices."""
        _index = np.array(self.get_indices_for_active_step())[index]
        return self._dataset[_index]
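
# Note: the n_labelled_tokens / n_labelled_chars properties assume the wrapped
# dataset carries precomputed `seq_length` and `char_length` columns (and that
# indexing returns something with .sum(), e.g. a torch-formatted dataset). A
# hypothetical way to add such columns before wrapping (the "sentence" key is
# an assumption):
#
#   raw = raw.map(lambda ex: {"seq_length": len(ex["sentence"].split()),
#                             "char_length": len(ex["sentence"])})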