import logging
import os

import pandas as pd
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm

from datareader.abstract_data_reader import AbstractDataReader, InputExample


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class StandardDataReader(AbstractDataReader):
    """Reads a CSV dataset and converts each row into BERT-style input features."""

    def __init__(self, config, tokenizer):
        self.tokenizer = tokenizer
        self.max_sequence_length = config['max_sequence_length']
        self.config = config
        # Dataset splits, populated by the surrounding pipeline.
        self.train = None
        self.valid = None
        self.test = None

    def build_fresh_dataset(self, dataset):
        """Read the CSV file `dataset` from the configured data directory and
        return a TensorDataset of input ids, masks, segment ids, and labels."""
        logging.info("Building fresh dataset...")
        df = pd.read_csv(os.path.join(self.config['data_dir'], dataset))
        input_features = []
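        # Normalize the raw text before tokenization: collapse tabs and
        # newlines into spaces and lowercase everything.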
        df['text'] = df['text'].str.replace(r'\t', ' ', regex=True)
        df['text'] = df['text'].str.replace(r'\n', ' ', regex=True)
        df['text'] = df['text'].str.lower()
        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            text = row['text']
            lbl = row[self.config['target']]
            # Single-sentence task: text_b is None.
            input_example = InputExample(None, text, None, lbl)
            feature = self.convert_example_to_feature(input_example, lbl)
            input_features.append(feature)
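        # Stack the per-example features into tensors and wrap them in a TensorDataset.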
        all_input_ids = torch.tensor([f.input_ids for f in input_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in input_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in input_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in input_features], dtype=torch.long)
        td = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        return td

    def convert_example_to_feature(self, example, lbl):
        """Convert a single `InputExample` and its label into an `InputFeatures` object."""
        # Tokenize the first text.
        tokens_a = self.tokenizer.tokenize(example.text_a)
        # If it's a sentence-pair task, tokenize the second text as well.
        tokens_b = None
        if example.text_b:
            tokens_b = self.tokenizer.tokenize(example.text_b)
        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3".
            AbstractDataReader.truncate_seq_pair(tokens_a, tokens_b, self.max_sequence_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2": keep only the last
            # (max_sequence_length - 2) tokens of an over-long text.
            if len(tokens_a) > (self.max_sequence_length - 2):
                tokens_a = tokens_a[-(self.max_sequence_length - 2):]
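        # Assemble the final sequence: [CLS] tokens_a [SEP], then tokens_b [SEP] for
        # sentence pairs. segment_ids are 0 for the first segment and 1 for the second.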
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        # Zero-pad up to the sequence length.
        while len(input_ids) < self.max_sequence_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        assert len(input_ids) == self.max_sequence_length
        assert len(input_mask) == self.max_sequence_length
        assert len(segment_ids) == self.max_sequence_length
        return InputFeatures(input_ids, input_mask, segment_ids, lbl)
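

if __name__ == "__main__":
    # Minimal usage sketch. Assumptions (not part of this module): a Hugging Face
    # BERT tokenizer, a config dict with 'data_dir', 'max_sequence_length' and
    # 'target' keys, and a 'train.csv' file containing 'text' and label columns.
    from torch.utils.data import DataLoader
    from transformers import BertTokenizer

    logging.basicConfig(level=logging.INFO)
    config = {'data_dir': 'data', 'max_sequence_length': 128, 'target': 'label'}
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    reader = StandardDataReader(config, tokenizer)
    train_dataset = reader.build_fresh_dataset('train.csv')
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    logging.info("Built %d examples", len(train_dataset))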