--- a
+++ b/mmaction/datasets/pipelines/formatting.py
@@ -0,0 +1,490 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections.abc import Sequence
+
+import mmcv
+import numpy as np
+import torch
+from mmcv.parallel import DataContainer as DC
+
+from ..builder import PIPELINES
+
+
+def to_tensor(data):
+    """Convert objects of various python types to :obj:`torch.Tensor`.
+
+    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+    :class:`Sequence`, :class:`int` and :class:`float`.
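+
+    Example (a minimal sketch with made-up values):
+        >>> import numpy as np
+        >>> to_tensor(np.array([1, 2]))
+        tensor([1, 2])
+        >>> to_tensor(3)
+        tensor([3])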
+    """
+    if isinstance(data, torch.Tensor):
+        return data
+    if isinstance(data, np.ndarray):
+        return torch.from_numpy(data)
+    if isinstance(data, Sequence) and not mmcv.is_str(data):
+        return torch.tensor(data)
+    if isinstance(data, int):
+        return torch.LongTensor([data])
+    if isinstance(data, float):
+        return torch.FloatTensor([data])
+    raise TypeError(f'type {type(data)} cannot be converted to tensor.')
+
+
+@PIPELINES.register_module()
+class ToTensor:
+    """Convert some values in results dict to `torch.Tensor` type in data
+    loader pipeline.
+
+    Args:
+        keys (Sequence[str]): Required keys to be converted.
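+
+    Example (a minimal sketch; the key name and values are placeholders):
+        >>> import numpy as np
+        >>> results = dict(label=np.array([1, 0, 1]))
+        >>> results = ToTensor(keys=['label'])(results)
+        >>> results['label']
+        tensor([1, 0, 1])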
+    """
+
+    def __init__(self, keys):
+        self.keys = keys
+
+    def __call__(self, results):
+        """Performs the ToTensor formatting.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in the pipeline.
+        """
+        for key in self.keys:
+            results[key] = to_tensor(results[key])
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(keys={self.keys})'
+
+
+@PIPELINES.register_module()
+class Rename:
+    """Rename the key in results.
+
+    Args:
+        mapping (dict): The keys in results that need to be renamed. The key
+            of the dict is the original name, while the value is the new name.
+            If the original name is not found in results, do nothing.
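+
+    Example (a minimal sketch; the key names are hypothetical):
+        >>> results = dict(gt_label=3)
+        >>> results = Rename(mapping=dict(gt_label='label'))(results)
+        >>> results
+        {'label': 3}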
+    """
+
+    def __init__(self, mapping):
+        self.mapping = mapping
+
+    def __call__(self, results):
+        for key, value in self.mapping.items():
+            if key in results:
+                assert isinstance(key, str) and isinstance(value, str)
+                assert value not in results, ('the new name already exists in '
+                                              'results')
+                results[value] = results[key]
+                results.pop(key)
+        return results
+
+
+@PIPELINES.register_module()
+class ToDataContainer:
+    """Convert the data to DataContainer.
+
+    Args:
+        fields (Sequence[dict]): Required fields to be converted
+            with keys and attributes. E.g.
+            fields=(dict(key='gt_bbox', stack=False),).
+            Note that key can also be a list of keys; if so, every tensor in
+            the list will be converted to DataContainer.
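+
+    Example (a minimal sketch; the key and dummy boxes are made up):
+        >>> import numpy as np
+        >>> from mmcv.parallel import DataContainer
+        >>> results = dict(gt_bbox=np.zeros((2, 4)))
+        >>> pipeline = ToDataContainer(fields=(dict(key='gt_bbox',
+        ...                                         stack=False), ))
+        >>> results = pipeline(results)
+        >>> isinstance(results['gt_bbox'], DataContainer)
+        True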
+    """
+
+    def __init__(self, fields):
+        self.fields = fields
+
+    def __call__(self, results):
+        """Performs the ToDataContainer formatting.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in the pipeline.
+        """
+        for field in self.fields:
+            _field = field.copy()
+            key = _field.pop('key')
+            if isinstance(key, list):
+                for item in key:
+                    results[item] = DC(results[item], **_field)
+            else:
+                results[key] = DC(results[key], **_field)
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + f'(fields={self.fields})'
+
+
+@PIPELINES.register_module()
+class ImageToTensor:
+    """Convert image type to `torch.Tensor` type.
+
+    Args:
+        keys (Sequence[str]): Required keys to be converted.
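+
+    Example (a minimal sketch with a dummy image of arbitrary size):
+        >>> import numpy as np
+        >>> results = dict(img=np.random.rand(64, 64, 3))  # H x W x C
+        >>> results = ImageToTensor(keys=['img'])(results)
+        >>> results['img'].shape  # C x H x W
+        torch.Size([3, 64, 64])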
+    """
+
+    def __init__(self, keys):
+        self.keys = keys
+
+    def __call__(self, results):
+        """Performs the ImageToTensor formatting.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in the pipeline.
+        """
+        for key in self.keys:
+            results[key] = to_tensor(results[key].transpose(2, 0, 1))
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(keys={self.keys})'
+
+
+@PIPELINES.register_module()
+class Transpose:
+    """Transpose image channels to a given order.
+
+    Args:
+        keys (Sequence[str]): Required keys to be converted.
+        order (Sequence[int]): Image channel order.
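+
+    Example (a minimal sketch; the key and shape below are arbitrary):
+        >>> import numpy as np
+        >>> results = dict(imgs=np.zeros((8, 64, 64, 3)))  # N x H x W x C
+        >>> results = Transpose(keys=['imgs'], order=(0, 3, 1, 2))(results)
+        >>> results['imgs'].shape  # N x C x H x W
+        (8, 3, 64, 64)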
+    """
+
+    def __init__(self, keys, order):
+        self.keys = keys
+        self.order = order
+
+    def __call__(self, results):
+        """Performs the Transpose formatting.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in the pipeline.
+        """
+        for key in self.keys:
+            results[key] = results[key].transpose(self.order)
+        return results
+
+    def __repr__(self):
+        return (f'{self.__class__.__name__}('
+                f'keys={self.keys}, order={self.order})')
+
+
+@PIPELINES.register_module()
+class Collect:
+    """Collect data from the loader relevant to the specific task.
+
+    This keeps the items in ``keys`` as they are, and collects items in
+    ``meta_keys`` into a meta item called ``meta_name``. This is usually
+    the last stage of the data loader pipeline.
+    For example, when keys='imgs', meta_keys=('filename', 'label',
+    'original_shape'), meta_name='img_metas', the results will be a dict with
+    keys 'imgs' and 'img_metas', where 'img_metas' is a DataContainer of
+    another dict with keys 'filename', 'label', 'original_shape'.
+
+    Args:
+        keys (Sequence[str]): Required keys to be collected.
+        meta_name (str): The name of the key that contains meta information.
+            This key is always populated. Default: "img_metas".
+        meta_keys (Sequence[str]): Keys that are collected under meta_name.
+            The contents of the ``meta_name`` dictionary depend on
+            ``meta_keys``.
+            By default this includes:
+
+            - "filename": path to the image file
+            - "label": label of the image file
+            - "original_shape": original shape of the image as a tuple
+                (h, w, c)
+            - "img_shape": shape of the image input to the network as a tuple
+                (h, w, c).  Note that images may be zero padded on the
+                bottom/right, if the batch tensor is larger than this shape.
+            - "pad_shape": image shape after padding
+            - "flip_direction": a str in ("horiziontal", "vertival") to
+                indicate if the image is fliped horizontally or vertically.
+            - "img_norm_cfg": a dict of normalization information:
+                - mean - per channel mean subtraction
+                - std - per channel std divisor
+                - to_rgb - bool indicating if bgr was converted to rgb
+        nested (bool): If set to True, will apply data[x] = [data[x]] to all
+            items in data. This arg is added for compatibility. Default: False.
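+
+    Example (a minimal sketch; all values are placeholders):
+        >>> import numpy as np
+        >>> results = dict(
+        ...     imgs=np.zeros((4, 3, 32, 32)),
+        ...     label=1,
+        ...     filename='demo.mp4',
+        ...     original_shape=(240, 320, 3))
+        >>> collect = Collect(keys=['imgs', 'label'],
+        ...                   meta_keys=('filename', 'original_shape'))
+        >>> data = collect(results)
+        >>> sorted(data.keys())
+        ['img_metas', 'imgs', 'label']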
+    """
+
+    def __init__(self,
+                 keys,
+                 meta_keys=('filename', 'label', 'original_shape', 'img_shape',
+                            'pad_shape', 'flip_direction', 'img_norm_cfg'),
+                 meta_name='img_metas',
+                 nested=False):
+        self.keys = keys
+        self.meta_keys = meta_keys
+        self.meta_name = meta_name
+        self.nested = nested
+
+    def __call__(self, results):
+        """Performs the Collect formatting.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in the pipeline.
+        """
+        data = {}
+        for key in self.keys:
+            data[key] = results[key]
+
+        if len(self.meta_keys) != 0:
+            meta = {}
+            for key in self.meta_keys:
+                meta[key] = results[key]
+            data[self.meta_name] = DC(meta, cpu_only=True)
+        if self.nested:
+            for k in data:
+                data[k] = [data[k]]
+
+        return data
+
+    def __repr__(self):
+        return (f'{self.__class__.__name__}('
+                f'keys={self.keys}, meta_keys={self.meta_keys}, '
+                f'nested={self.nested})')
+
+
+@PIPELINES.register_module()
+class FormatShape:
+    """Format final imgs shape to the given input_format.
+
+    Required keys are "imgs", "num_clips" and "clip_len", added or modified
+    keys are "imgs" and "input_shape".
+
+    Args:
+        input_format (str): Define the final imgs format.
+        collapse (bool): Whether to collapse input_format N... to ... (e.g.,
+            NCTHW to CTHW) if N is 1. Should be set to True when training and
+            testing detectors. Default: False.
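+
+    Example (a sketch of the 'NCTHW' branch; all sizes are made up):
+        >>> import numpy as np
+        >>> results = dict(
+        ...     imgs=np.zeros((16, 32, 32, 3)),  # M x H x W x C
+        ...     num_clips=2,
+        ...     clip_len=8)
+        >>> results = FormatShape(input_format='NCTHW')(results)
+        >>> results['input_shape']  # (N_crops * N_clips) x C x L x H x W
+        (2, 3, 8, 32, 32)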
+    """
+
+    def __init__(self, input_format, collapse=False):
+        self.input_format = input_format
+        self.collapse = collapse
+        if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']:
+            raise ValueError(
+                f'The input format {self.input_format} is invalid.')
+
+    def __call__(self, results):
+        """Performs the FormatShape formatting.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in the pipeline.
+        """
+        if not isinstance(results['imgs'], np.ndarray):
+            results['imgs'] = np.array(results['imgs'])
+        imgs = results['imgs']
+        # [M x H x W x C]
+        # M = 1 * N_crops * N_clips * L
+        if self.collapse:
+            assert results['num_clips'] == 1
+
+        if self.input_format == 'NCTHW':
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+
+            imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
+            # N_crops x N_clips x L x H x W x C
+            imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))
+            # N_crops x N_clips x C x L x H x W
+            imgs = imgs.reshape((-1, ) + imgs.shape[2:])
+            # M' x C x L x H x W
+            # M' = N_crops x N_clips
+        elif self.input_format == 'NCHW':
+            imgs = np.transpose(imgs, (0, 3, 1, 2))
+            # M x C x H x W
+        elif self.input_format == 'NCHW_Flow':
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+            imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
+            # N_crops x N_clips x L x H x W x C
+            imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4))
+            # N_crops x N_clips x L x C x H x W
+            imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) +
+                                imgs.shape[4:])
+            # M' x C' x H x W
+            # M' = N_crops x N_clips
+            # C' = L x C
+        elif self.input_format == 'NPTCHW':
+            num_proposals = results['num_proposals']
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+            imgs = imgs.reshape((num_proposals, num_clips * clip_len) +
+                                imgs.shape[1:])
+            # P x M x H x W x C
+            # M = N_clips x L
+            imgs = np.transpose(imgs, (0, 1, 4, 2, 3))
+            # P x M x C x H x W
+
+        if self.collapse:
+            assert imgs.shape[0] == 1
+            imgs = imgs.squeeze(0)
+
+        results['imgs'] = imgs
+        results['input_shape'] = imgs.shape
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f"(input_format='{self.input_format}')"
+        return repr_str
+
+
+@PIPELINES.register_module()
+class FormatAudioShape:
+    """Format final audio shape to the given input_format.
+
+    Required keys are "imgs", "num_clips" and "clip_len", added or modified
+    keys are "imgs" and "input_shape".
+
+    Args:
+        input_format (str): Define the final imgs format.
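+
+    Example (a minimal sketch; the spectrogram sizes are arbitrary):
+        >>> import numpy as np
+        >>> results = dict(audios=np.zeros((3, 128, 80)))  # clip x sample x freq
+        >>> results = FormatAudioShape(input_format='NCTF')(results)
+        >>> results['input_shape']  # clip x channel x sample x freq
+        (3, 1, 128, 80)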
+    """
+
+    def __init__(self, input_format):
+        self.input_format = input_format
+        if self.input_format not in ['NCTF']:
+            raise ValueError(
+                f'The input format {self.input_format} is invalid.')
+
+    def __call__(self, results):
+        """Performs the FormatShape formatting.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in the pipeline.
+        """
+        audios = results['audios']
+        # clip x sample x freq -> clip x channel x sample x freq
+        clip, sample, freq = audios.shape
+        audios = audios.reshape(clip, 1, sample, freq)
+        results['audios'] = audios
+        results['input_shape'] = audios.shape
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f"(input_format='{self.input_format}')"
+        return repr_str
+
+
+@PIPELINES.register_module()
+class JointToBone:
+    """Convert the joint information to bone information.
+
+    Required keys are "keypoint" ,
+    added or modified keys are "keypoint".
+
+    Args:
+        dataset (str): Define the type of dataset: 'nturgb+d', 'openpose',
+            'coco'. Default: 'nturgb+d'.
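+
+    Example (a minimal sketch with random COCO-style 2D keypoints):
+        >>> import numpy as np
+        >>> results = dict(keypoint=np.random.rand(1, 4, 17, 2))  # M x T x V x C
+        >>> results = JointToBone(dataset='coco')(results)
+        >>> results['keypoint'].shape
+        (1, 4, 17, 2)
+        >>> bool(np.all(results['keypoint'][..., 0, :] == 0))  # root joint
+        True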
+    """
+
+    def __init__(self, dataset='nturgb+d'):
+        self.dataset = dataset
+        if self.dataset not in ['nturgb+d', 'openpose', 'coco']:
+            raise ValueError(
+                f'The dataset type {self.dataset} is not supported')
+        if self.dataset == 'nturgb+d':
+            self.pairs = [(0, 1), (1, 20), (2, 20), (3, 2), (4, 20), (5, 4),
+                          (6, 5), (7, 6), (8, 20), (9, 8), (10, 9), (11, 10),
+                          (12, 0), (13, 12), (14, 13), (15, 14), (16, 0),
+                          (17, 16), (18, 17), (19, 18), (21, 22), (20, 20),
+                          (22, 7), (23, 24), (24, 11)]
+        elif self.dataset == 'openpose':
+            self.pairs = ((0, 0), (1, 0), (2, 1), (3, 2), (4, 3), (5, 1),
+                          (6, 5), (7, 6), (8, 2), (9, 8), (10, 9), (11, 5),
+                          (12, 11), (13, 12), (14, 0), (15, 0), (16, 14),
+                          (17, 15))
+        elif self.dataset == 'coco':
+            self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 0),
+                          (6, 0), (7, 5), (8, 6), (9, 7), (10, 8), (11, 0),
+                          (12, 0), (13, 11), (14, 12), (15, 13), (16, 14))
+
+    def __call__(self, results):
+        """Performs the Bone formatting.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in the pipeline.
+        """
+        keypoint = results['keypoint']
+        M, T, V, C = keypoint.shape
+        bone = np.zeros((M, T, V, C), dtype=np.float32)
+
+        assert C in [2, 3]
+        for v1, v2 in self.pairs:
+            bone[..., v1, :] = keypoint[..., v1, :] - keypoint[..., v2, :]
+            if C == 3 and self.dataset in ['openpose', 'coco']:
+                score = (keypoint[..., v1, 2] + keypoint[..., v2, 2]) / 2
+                bone[..., v1, 2] = score
+
+        results['keypoint'] = bone
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f"(dataset_type='{self.dataset}')"
+        return repr_str
+
+
+@PIPELINES.register_module()
+class FormatGCNInput:
+    """Format final skeleton shape to the given input_format.
+
+    Required keys are "keypoint" and "keypoint_score"(optional),
+    added or modified keys are "keypoint" and "input_shape".
+
+    Args:
+        input_format (str): Define the final skeleton format.
+        num_person (int): The maximum number of persons to keep. Skeletons are
+            zero-padded if there are fewer persons, and truncated if there are
+            more. Default: 2.
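+
+    Example (a minimal sketch with a single hypothetical skeleton):
+        >>> import numpy as np
+        >>> results = dict(
+        ...     keypoint=np.zeros((1, 30, 17, 2)),    # M x T x V x C
+        ...     keypoint_score=np.ones((1, 30, 17)))
+        >>> results = FormatGCNInput(input_format='NCTVM', num_person=2)(results)
+        >>> results['input_shape']  # C x T x V x M
+        (3, 30, 17, 2)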
+    """
+
+    def __init__(self, input_format, num_person=2):
+        self.input_format = input_format
+        if self.input_format not in ['NCTVM']:
+            raise ValueError(
+                f'The input format {self.input_format} is invalid.')
+        self.num_person = num_person
+
+    def __call__(self, results):
+        """Performs the FormatShape formatting.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in the pipeline.
+        """
+        keypoint = results['keypoint']
+
+        if 'keypoint_score' in results:
+            keypoint_confidence = results['keypoint_score']
+            keypoint_confidence = np.expand_dims(keypoint_confidence, -1)
+            keypoint_3d = np.concatenate((keypoint, keypoint_confidence),
+                                         axis=-1)
+        else:
+            keypoint_3d = keypoint
+
+        keypoint_3d = np.transpose(keypoint_3d,
+                                   (3, 1, 2, 0))  # M T V C -> C T V M
+
+        if keypoint_3d.shape[-1] < self.num_person:
+            pad_dim = self.num_person - keypoint_3d.shape[-1]
+            pad = np.zeros(
+                keypoint_3d.shape[:-1] + (pad_dim, ), dtype=keypoint_3d.dtype)
+            keypoint_3d = np.concatenate((keypoint_3d, pad), axis=-1)
+        elif keypoint_3d.shape[-1] > self.num_person:
+            keypoint_3d = keypoint_3d[:, :, :, :self.num_person]
+
+        results['keypoint'] = keypoint_3d
+        results['input_shape'] = keypoint_3d.shape
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f"(input_format='{self.input_format}')"
+        return repr_str