utils/datacollator.py
import random
import warnings
from collections.abc import Mapping
from dataclasses import dataclass
from random import randint
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

import numpy as np

from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy


@dataclass
class MyDataCollatorForSeq2Seq:
    """
    Data collator that will dynamically pad the inputs received, as well as the labels.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]):
            The model that is being trained. If set and it has the *prepare_decoder_input_ids_from_labels* method,
            it is used to prepare the *decoder_input_ids*.

            This is useful when using *label_smoothing* to avoid calculating loss twice.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a
              single sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set, will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
            >= 7.5 (Volta).
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
        return_tensors (`str`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: PreTrainedTokenizerBase
    model: Optional[Any] = None
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"

    def __call__(self, features, return_tensors=None):
        if return_tensors is None:
            return_tensors = self.return_tensors
        labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
        # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
        # same length to return tensors.
        if labels is not None:
            max_label_length = max(len(l) for l in labels)
            if self.pad_to_multiple_of is not None:
                # Round max_label_length up to the next multiple (e.g. 10 -> 16 when pad_to_multiple_of=8).
                max_label_length = (
                    (max_label_length + self.pad_to_multiple_of - 1)
                    // self.pad_to_multiple_of
                    * self.pad_to_multiple_of
                )

            padding_side = self.tokenizer.padding_side
            for feature in features:
                remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
                if isinstance(feature["labels"], list):
                    feature["labels"] = (
                        feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
                    )
                elif padding_side == "right":
                    feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
                else:
                    feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)

        # Drop fields that `tokenizer.pad` cannot convert to tensors (raw text and metadata);
        # "dicom" is added back to the padded batch below.
        feature_pt = [{k: v for k, v in d.items() if k not in ['input', 'dicom', 'instruction', 'output']} for d in features]
        feature_pt = self.tokenizer.pad(
            feature_pt,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=return_tensors,
        )

        # Add "dicom" back into the batch as a plain list (one entry per example).
        feature_pt.data['dicom'] = [elem['dicom'] for elem in features]

        features = feature_pt

        # Prepare decoder_input_ids from the padded labels if the model supports it.
        if (
            labels is not None
            and self.model is not None
            and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
        ):
            decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
            features["decoder_input_ids"] = decoder_input_ids

        return features
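

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): it shows how the
# collator could be called on a small batch of pre-tokenized features. The
# "t5-small" checkpoint and the toy feature dicts below are illustrative
# assumptions, not values taken from this repository; any seq2seq tokenizer
# and real dataset rows would work the same way.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("t5-small")  # assumed checkpoint
    collator = MyDataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, pad_to_multiple_of=8)

    # Two toy examples of different lengths; "dicom" is a per-example metadata
    # field that the collator carries through without padding.
    features = [
        {"input_ids": [100, 200, 300], "attention_mask": [1, 1, 1], "labels": [5, 6], "dicom": "study_001"},
        {"input_ids": [100, 200], "attention_mask": [1, 1], "labels": [5, 6, 7, 8], "dicom": "study_002"},
    ]

    batch = collator(features)
    # input_ids / attention_mask / labels come back as padded tensors (labels
    # padded with -100), while batch["dicom"] is still the list of raw ids.
    print({k: getattr(v, "shape", v) for k, v in batch.items()})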