--- /dev/null
+++ b/mmaction/datasets/pipelines/loading.py
@@ -0,0 +1,1850 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy as cp
+import io
+import os
+import os.path as osp
+import shutil
+import warnings
+
+import mmcv
+import numpy as np
+import torch
+from mmcv.fileio import FileClient
+from torch.nn.modules.utils import _pair
+
+from ...utils import get_random_string, get_shm_dir, get_thread_id
+from ..builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class LoadHVULabel:
+    """Convert the HVU label from dictionaries to torch tensors.
+
+    Required keys are "label", "categories", "category_nums", added or
+    modified keys are "label", "mask" and "category_mask".
+    """
+
+    def __init__(self, **kwargs):
+        self.hvu_initialized = False
+        self.kwargs = kwargs
+
+    def init_hvu_info(self, categories, category_nums):
+        assert len(categories) == len(category_nums)
+        self.categories = categories
+        self.category_nums = category_nums
+        self.num_categories = len(self.categories)
+        self.num_tags = sum(self.category_nums)
+        self.category2num = dict(zip(categories, category_nums))
+        self.start_idx = [0]
+        for i in range(self.num_categories - 1):
+            self.start_idx.append(self.start_idx[-1] + self.category_nums[i])
+        self.category2startidx = dict(zip(categories, self.start_idx))
+        self.hvu_initialized = True
+
+    def __call__(self, results):
+        """Convert the label dictionary to 3 tensors: "label", "mask" and
+        "category_mask".
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in pipeline.
+        """
+
+        if not self.hvu_initialized:
+            self.init_hvu_info(results['categories'],
+                               results['category_nums'])
+
+        onehot = torch.zeros(self.num_tags)
+        onehot_mask = torch.zeros(self.num_tags)
+        category_mask = torch.zeros(self.num_categories)
+
+        for category, tags in results['label'].items():
+            # skip if not training on this category
+            if category not in self.categories:
+                continue
+            category_mask[self.categories.index(category)] = 1.
+            start_idx = self.category2startidx[category]
+            category_num = self.category2num[category]
+            tags = [idx + start_idx for idx in tags]
+            onehot[tags] = 1.
+            onehot_mask[start_idx:category_num + start_idx] = 1.
+
+        results['label'] = onehot
+        results['mask'] = onehot_mask
+        results['category_mask'] = category_mask
+        return results
+
+    def __repr__(self):
+        repr_str = (f'{self.__class__.__name__}('
+                    f'hvu_initialized={self.hvu_initialized})')
+        return repr_str
+
+
+@PIPELINES.register_module()
+class SampleFrames:
+    """Sample frames from the video.
+
+    Required keys are "total_frames", "start_index", added or modified keys
+    are "frame_inds", "frame_interval" and "num_clips".
+
+    Args:
+        clip_len (int): Frames of each sampled output clip.
+        frame_interval (int): Temporal interval of adjacent sampled frames.
+            Default: 1.
+        num_clips (int): Number of clips to be sampled. Default: 1.
+        temporal_jitter (bool): Whether to apply temporal jittering.
+            Default: False.
+        twice_sample (bool): Whether to use twice sample when testing.
+            If set to True, it will sample frames with and without fixed
+            shift, which is commonly used for testing in the TSM model.
+            Default: False.
+        out_of_bound_opt (str): The way to deal with out of bounds frame
+            indexes. Available options are 'loop', 'repeat_last'.
+            Default: 'loop'.
+        test_mode (bool): Store True when building test or validation
+            dataset. Default: False.
+        start_index (None): This argument is deprecated and moved to the
+            dataset class (``BaseDataset``, ``VideoDataset``,
+            ``RawframeDataset``, etc), see this pr:
+            https://github.com/open-mmlab/mmaction2/pull/89.
+        keep_tail_frames (bool): Whether to keep tail frames when sampling.
+            Default: False.
+    """
+
+    def __init__(self,
+                 clip_len,
+                 frame_interval=1,
+                 num_clips=1,
+                 temporal_jitter=False,
+                 twice_sample=False,
+                 out_of_bound_opt='loop',
+                 test_mode=False,
+                 start_index=None,
+                 keep_tail_frames=False):
+
+        self.clip_len = clip_len
+        self.frame_interval = frame_interval
+        self.num_clips = num_clips
+        self.temporal_jitter = temporal_jitter
+        self.twice_sample = twice_sample
+        self.out_of_bound_opt = out_of_bound_opt
+        self.test_mode = test_mode
+        self.keep_tail_frames = keep_tail_frames
+        assert self.out_of_bound_opt in ['loop', 'repeat_last']
+
+        if start_index is not None:
+            warnings.warn('No longer support "start_index" in '
+                          '"SampleFrames", it should be set in the dataset '
+                          'class, see this pr: '
+                          'https://github.com/open-mmlab/mmaction2/pull/89')
+
+    def _get_train_clips(self, num_frames):
+        """Get clip offsets in train mode.
+
+        It will calculate the average interval for selected frames,
+        and randomly shift them within offsets between [0, avg_interval].
+        If the total number of frames is smaller than the number of clips
+        or the original clip length, it will return all zero indices.
+
+        Args:
+            num_frames (int): Total number of frames in the video.
+
+        Returns:
+            np.ndarray: Sampled frame indices in train mode.
+        """
+        ori_clip_len = self.clip_len * self.frame_interval
+
+        if self.keep_tail_frames:
+            avg_interval = (num_frames - ori_clip_len + 1) / float(
+                self.num_clips)
+            if num_frames > ori_clip_len - 1:
+                base_offsets = np.arange(self.num_clips) * avg_interval
+                clip_offsets = (base_offsets + np.random.uniform(
+                    0, avg_interval, self.num_clips)).astype(int)
+            else:
+                clip_offsets = np.zeros((self.num_clips, ), dtype=int)
+        else:
+            avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips
+
+            if avg_interval > 0:
+                base_offsets = np.arange(self.num_clips) * avg_interval
+                clip_offsets = base_offsets + np.random.randint(
+                    avg_interval, size=self.num_clips)
+            elif num_frames > max(self.num_clips, ori_clip_len):
+                clip_offsets = np.sort(
+                    np.random.randint(
+                        num_frames - ori_clip_len + 1, size=self.num_clips))
+            elif avg_interval == 0:
+                ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips
+                clip_offsets = np.around(np.arange(self.num_clips) * ratio)
+            else:
+                clip_offsets = np.zeros((self.num_clips, ), dtype=int)
+
+        return clip_offsets
+
+    def _get_test_clips(self, num_frames):
+        """Get clip offsets in test mode.
+
+        It will calculate the average interval for selected frames, and
+        shift them by a fixed offset of avg_interval / 2. If twice_sample is
+        set to True, it will additionally sample frames without the fixed
+        shift. If the total number of frames is not enough, it will return
+        all zero indices.
+
+        Args:
+            num_frames (int): Total number of frames in the video.
+
+        Returns:
+            np.ndarray: Sampled frame indices in test mode.
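+
+        Examples:
+            A worked example (numbers are illustrative only): with
+            ``num_frames=30``, ``clip_len=4``, ``frame_interval=2`` and
+            ``num_clips=3``, ``ori_clip_len`` is 8, ``avg_interval`` is
+            ``(30 - 8 + 1) / 3 = 7.67``, and the fixed-shift offsets are
+            ``(np.arange(3) * 7.67 + 3.83).astype(int) -> [3, 11, 19]``.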
+ """ + ori_clip_len = self.clip_len * self.frame_interval + avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips) + if num_frames > ori_clip_len - 1: + base_offsets = np.arange(self.num_clips) * avg_interval + clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int) + if self.twice_sample: + clip_offsets = np.concatenate([clip_offsets, base_offsets]) + else: + clip_offsets = np.zeros((self.num_clips, ), dtype=np.int) + return clip_offsets + + def _sample_clips(self, num_frames): + """Choose clip offsets for the video in a given mode. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices. + """ + if self.test_mode: + clip_offsets = self._get_test_clips(num_frames) + else: + clip_offsets = self._get_train_clips(num_frames) + + return clip_offsets + + def __call__(self, results): + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + total_frames = results['total_frames'] + + clip_offsets = self._sample_clips(total_frames) + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + + frame_inds = frame_inds.reshape((-1, self.clip_len)) + if self.out_of_bound_opt == 'loop': + frame_inds = np.mod(frame_inds, total_frames) + elif self.out_of_bound_opt == 'repeat_last': + safe_inds = frame_inds < total_frames + unsafe_inds = 1 - safe_inds + last_ind = np.max(safe_inds * frame_inds, axis=1) + new_inds = (safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) + frame_inds = new_inds + else: + raise ValueError('Illegal out_of_bound option.') + + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index + results['frame_inds'] = frame_inds.astype(np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = self.num_clips + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'num_clips={self.num_clips}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'twice_sample={self.twice_sample}, ' + f'out_of_bound_opt={self.out_of_bound_opt}, ' + f'test_mode={self.test_mode})') + return repr_str + + +@PIPELINES.register_module() +class UntrimmedSampleFrames: + """Sample frames from the untrimmed video. + + Required keys are "filename", "total_frames", added or modified keys are + "frame_inds", "frame_interval" and "num_clips". + + Args: + clip_len (int): The length of sampled clips. Default: 1. + frame_interval (int): Temporal interval of adjacent sampled frames. + Default: 16. + start_index (None): This argument is deprecated and moved to dataset + class (``BaseDataset``, ``VideoDatset``, ``RawframeDataset``, etc), + see this: https://github.com/open-mmlab/mmaction2/pull/89. + """ + + def __init__(self, clip_len=1, frame_interval=16, start_index=None): + + self.clip_len = clip_len + self.frame_interval = frame_interval + + if start_index is not None: + warnings.warn('No longer support "start_index" in "SampleFrames", ' + 'it should be set in dataset class, see this pr: ' + 'https://github.com/open-mmlab/mmaction2/pull/89') + + def __call__(self, results): + """Perform the SampleFrames loading. 
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in pipeline.
+        """
+        total_frames = results['total_frames']
+        start_index = results['start_index']
+
+        clip_centers = np.arange(self.frame_interval // 2, total_frames,
+                                 self.frame_interval)
+        num_clips = clip_centers.shape[0]
+        frame_inds = clip_centers[:, None] + np.arange(
+            -(self.clip_len // 2), self.clip_len -
+            (self.clip_len // 2))[None, :]
+        # clip frame_inds to legal range
+        frame_inds = np.clip(frame_inds, 0, total_frames - 1)
+
+        frame_inds = np.concatenate(frame_inds) + start_index
+        results['frame_inds'] = frame_inds.astype(int)
+        results['clip_len'] = self.clip_len
+        results['frame_interval'] = self.frame_interval
+        results['num_clips'] = num_clips
+        return results
+
+    def __repr__(self):
+        repr_str = (f'{self.__class__.__name__}('
+                    f'clip_len={self.clip_len}, '
+                    f'frame_interval={self.frame_interval})')
+        return repr_str
+
+
+@PIPELINES.register_module()
+class DenseSampleFrames(SampleFrames):
+    """Select frames from the video by dense sample strategy.
+
+    Required keys are "filename", added or modified keys are "total_frames",
+    "frame_inds", "frame_interval" and "num_clips".
+
+    Args:
+        clip_len (int): Frames of each sampled output clip.
+        frame_interval (int): Temporal interval of adjacent sampled frames.
+            Default: 1.
+        num_clips (int): Number of clips to be sampled. Default: 1.
+        sample_range (int): Total sample range for dense sample.
+            Default: 64.
+        num_sample_positions (int): Number of sample start positions, which
+            is only used in test mode. Default: 10. That is to say, by
+            default, there are at least 10 clips for one input sample in
+            test mode.
+        temporal_jitter (bool): Whether to apply temporal jittering.
+            Default: False.
+        test_mode (bool): Store True when building test or validation
+            dataset. Default: False.
+    """
+
+    def __init__(self,
+                 *args,
+                 sample_range=64,
+                 num_sample_positions=10,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.sample_range = sample_range
+        self.num_sample_positions = num_sample_positions
+
+    def _get_train_clips(self, num_frames):
+        """Get clip offsets by dense sample strategy in train mode.
+
+        It will calculate a sample position and sample interval and set
+        start index 0 when sample_pos == 1 or randomly choose from
+        [0, sample_pos - 1]. Then it will shift the start index by each
+        base offset.
+
+        Args:
+            num_frames (int): Total number of frames in the video.
+
+        Returns:
+            np.ndarray: Sampled frame indices in train mode.
+        """
+        sample_position = max(1, 1 + num_frames - self.sample_range)
+        interval = self.sample_range // self.num_clips
+        start_idx = 0 if sample_position == 1 else np.random.randint(
+            0, sample_position - 1)
+        base_offsets = np.arange(self.num_clips) * interval
+        clip_offsets = (base_offsets + start_idx) % num_frames
+        return clip_offsets
+
+    def _get_test_clips(self, num_frames):
+        """Get clip offsets by dense sample strategy in test mode.
+
+        It will calculate a sample position and sample interval and evenly
+        sample several start indexes as start positions between
+        [0, sample_position-1]. Then it will shift each start index by the
+        base offsets.
+
+        Args:
+            num_frames (int): Total number of frames in the video.
+
+        Returns:
+            np.ndarray: Sampled frame indices in test mode.
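+
+        Examples:
+            An illustrative run: with ``num_frames=100``, ``sample_range=64``,
+            ``num_clips=3`` and ``num_sample_positions=2``, the sample
+            position is ``max(1, 1 + 100 - 64) = 37``, the interval is
+            ``64 // 3 = 21``, the start indices are ``[0, 36]``, and the
+            resulting offsets are ``[0, 21, 42, 36, 57, 78]``.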
+ """ + sample_position = max(1, 1 + num_frames - self.sample_range) + interval = self.sample_range // self.num_clips + start_list = np.linspace( + 0, sample_position - 1, num=self.num_sample_positions, dtype=int) + base_offsets = np.arange(self.num_clips) * interval + clip_offsets = list() + for start_idx in start_list: + clip_offsets.extend((base_offsets + start_idx) % num_frames) + clip_offsets = np.array(clip_offsets) + return clip_offsets + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'num_clips={self.num_clips}, ' + f'sample_range={self.sample_range}, ' + f'num_sample_positions={self.num_sample_positions}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'out_of_bound_opt={self.out_of_bound_opt}, ' + f'test_mode={self.test_mode})') + return repr_str + + +@PIPELINES.register_module() +class SampleAVAFrames(SampleFrames): + + def __init__(self, clip_len, frame_interval=2, test_mode=False): + + super().__init__(clip_len, frame_interval, test_mode=test_mode) + + def _get_clips(self, center_index, skip_offsets, shot_info): + start = center_index - (self.clip_len // 2) * self.frame_interval + end = center_index + ((self.clip_len + 1) // 2) * self.frame_interval + frame_inds = list(range(start, end, self.frame_interval)) + if not self.test_mode: + frame_inds = frame_inds + skip_offsets + frame_inds = np.clip(frame_inds, shot_info[0], shot_info[1] - 1) + return frame_inds + + def __call__(self, results): + fps = results['fps'] + timestamp = results['timestamp'] + timestamp_start = results['timestamp_start'] + shot_info = results['shot_info'] + + center_index = fps * (timestamp - timestamp_start) + 1 + + skip_offsets = np.random.randint( + -self.frame_interval // 2, (self.frame_interval + 1) // 2, + size=self.clip_len) + frame_inds = self._get_clips(center_index, skip_offsets, shot_info) + start_index = results.get('start_index', 0) + + frame_inds = np.array(frame_inds, dtype=np.int) + start_index + results['frame_inds'] = frame_inds + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = 1 + results['crop_quadruple'] = np.array([0, 0, 1, 1], dtype=np.float32) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'frame_interval={self.frame_interval}, ' + f'test_mode={self.test_mode})') + return repr_str + + +@PIPELINES.register_module() +class SampleProposalFrames(SampleFrames): + """Sample frames from proposals in the video. + + Required keys are "total_frames" and "out_proposals", added or + modified keys are "frame_inds", "frame_interval", "num_clips", + 'clip_len' and 'num_proposals'. + + Args: + clip_len (int): Frames of each sampled output clip. + body_segments (int): Number of segments in course period. + aug_segments (list[int]): Number of segments in starting and + ending period. + aug_ratio (int | float | tuple[int | float]): The ratio + of the length of augmentation to that of the proposal. + frame_interval (int): Temporal interval of adjacent sampled frames. + Default: 1. + test_interval (int): Temporal interval of adjacent sampled frames + in test mode. Default: 6. + temporal_jitter (bool): Whether to apply temporal jittering. + Default: False. + mode (str): Choose 'train', 'val' or 'test' mode. + Default: 'train'. 
+ """ + + def __init__(self, + clip_len, + body_segments, + aug_segments, + aug_ratio, + frame_interval=1, + test_interval=6, + temporal_jitter=False, + mode='train'): + super().__init__( + clip_len, + frame_interval=frame_interval, + temporal_jitter=temporal_jitter) + self.body_segments = body_segments + self.aug_segments = aug_segments + self.aug_ratio = _pair(aug_ratio) + if not mmcv.is_tuple_of(self.aug_ratio, (int, float)): + raise TypeError(f'aug_ratio should be int, float' + f'or tuple of int and float, ' + f'but got {type(aug_ratio)}') + assert len(self.aug_ratio) == 2 + assert mode in ['train', 'val', 'test'] + self.mode = mode + self.test_interval = test_interval + + @staticmethod + def _get_train_indices(valid_length, num_segments): + """Get indices of different stages of proposals in train mode. + + It will calculate the average interval for each segment, + and randomly shift them within offsets between [0, average_duration]. + If the total number of frames is smaller than num segments, it will + return all zero indices. + + Args: + valid_length (int): The length of the starting point's + valid interval. + num_segments (int): Total number of segments. + + Returns: + np.ndarray: Sampled frame indices in train mode. + """ + avg_interval = (valid_length + 1) // num_segments + if avg_interval > 0: + base_offsets = np.arange(num_segments) * avg_interval + offsets = base_offsets + np.random.randint( + avg_interval, size=num_segments) + else: + offsets = np.zeros((num_segments, ), dtype=np.int) + + return offsets + + @staticmethod + def _get_val_indices(valid_length, num_segments): + """Get indices of different stages of proposals in validation mode. + + It will calculate the average interval for each segment. + If the total number of valid length is smaller than num segments, + it will return all zero indices. + + Args: + valid_length (int): The length of the starting point's + valid interval. + num_segments (int): Total number of segments. + + Returns: + np.ndarray: Sampled frame indices in validation mode. + """ + if valid_length >= num_segments: + avg_interval = valid_length / float(num_segments) + base_offsets = np.arange(num_segments) * avg_interval + offsets = (base_offsets + avg_interval / 2.0).astype(np.int) + else: + offsets = np.zeros((num_segments, ), dtype=np.int) + + return offsets + + def _get_proposal_clips(self, proposal, num_frames): + """Get clip offsets in train mode. + + It will calculate sampled frame indices in the proposal's three + stages: starting, course and ending stage. + + Args: + proposal (obj): The proposal object. + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices in train mode. 
+ """ + # proposal interval: [start_frame, end_frame) + start_frame = proposal.start_frame + end_frame = proposal.end_frame + ori_clip_len = self.clip_len * self.frame_interval + + duration = end_frame - start_frame + assert duration != 0 + valid_length = duration - ori_clip_len + + valid_starting = max(0, + start_frame - int(duration * self.aug_ratio[0])) + valid_ending = min(num_frames - ori_clip_len + 1, + end_frame - 1 + int(duration * self.aug_ratio[1])) + + valid_starting_length = start_frame - valid_starting - ori_clip_len + valid_ending_length = (valid_ending - end_frame + 1) - ori_clip_len + + if self.mode == 'train': + starting_offsets = self._get_train_indices(valid_starting_length, + self.aug_segments[0]) + course_offsets = self._get_train_indices(valid_length, + self.body_segments) + ending_offsets = self._get_train_indices(valid_ending_length, + self.aug_segments[1]) + elif self.mode == 'val': + starting_offsets = self._get_val_indices(valid_starting_length, + self.aug_segments[0]) + course_offsets = self._get_val_indices(valid_length, + self.body_segments) + ending_offsets = self._get_val_indices(valid_ending_length, + self.aug_segments[1]) + starting_offsets += valid_starting + course_offsets += start_frame + ending_offsets += end_frame + + offsets = np.concatenate( + (starting_offsets, course_offsets, ending_offsets)) + return offsets + + def _get_train_clips(self, num_frames, proposals): + """Get clip offsets in train mode. + + It will calculate sampled frame indices of each proposal, and then + assemble them. + + Args: + num_frames (int): Total number of frame in the video. + proposals (list): Proposals fetched. + + Returns: + np.ndarray: Sampled frame indices in train mode. + """ + clip_offsets = [] + for proposal in proposals: + proposal_clip_offsets = self._get_proposal_clips( + proposal[0][1], num_frames) + clip_offsets = np.concatenate( + [clip_offsets, proposal_clip_offsets]) + + return clip_offsets + + def _get_test_clips(self, num_frames): + """Get clip offsets in test mode. + + It will calculate sampled frame indices based on test interval. + + Args: + num_frames (int): Total number of frame in the video. + + Returns: + np.ndarray: Sampled frame indices in test mode. + """ + ori_clip_len = self.clip_len * self.frame_interval + return np.arange( + 0, num_frames - ori_clip_len, self.test_interval, dtype=np.int) + + def _sample_clips(self, num_frames, proposals): + """Choose clip offsets for the video in a given mode. + + Args: + num_frames (int): Total number of frame in the video. + proposals (list | None): Proposals fetched. + It is set to None in test mode. + + Returns: + np.ndarray: Sampled frame indices. + """ + if self.mode == 'test': + clip_offsets = self._get_test_clips(num_frames) + else: + assert proposals is not None + clip_offsets = self._get_train_clips(num_frames, proposals) + + return clip_offsets + + def __call__(self, results): + """Perform the SampleFrames loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + total_frames = results['total_frames'] + + out_proposals = results.get('out_proposals', None) + clip_offsets = self._sample_clips(total_frames, out_proposals) + frame_inds = clip_offsets[:, None] + np.arange( + self.clip_len)[None, :] * self.frame_interval + frame_inds = np.concatenate(frame_inds) + + if self.temporal_jitter: + perframe_offsets = np.random.randint( + self.frame_interval, size=len(frame_inds)) + frame_inds += perframe_offsets + + start_index = results['start_index'] + frame_inds = np.mod(frame_inds, total_frames) + start_index + + results['frame_inds'] = np.array(frame_inds).astype(np.int) + results['clip_len'] = self.clip_len + results['frame_interval'] = self.frame_interval + results['num_clips'] = ( + self.body_segments + self.aug_segments[0] + self.aug_segments[1]) + if self.mode in ['train', 'val']: + results['num_proposals'] = len(results['out_proposals']) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'clip_len={self.clip_len}, ' + f'body_segments={self.body_segments}, ' + f'aug_segments={self.aug_segments}, ' + f'aug_ratio={self.aug_ratio}, ' + f'frame_interval={self.frame_interval}, ' + f'test_interval={self.test_interval}, ' + f'temporal_jitter={self.temporal_jitter}, ' + f'mode={self.mode})') + return repr_str + + +@PIPELINES.register_module() +class PyAVInit: + """Using pyav to initialize the video. + + PyAV: https://github.com/mikeboers/PyAV + + Required keys are "filename", + added or modified keys are "video_reader", and "total_frames". + + Args: + io_backend (str): io backend where frames are store. + Default: 'disk'. + kwargs (dict): Args for file client. + """ + + def __init__(self, io_backend='disk', **kwargs): + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + + def __call__(self, results): + """Perform the PyAV initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + try: + import av + except ImportError: + raise ImportError('Please run "conda install av -c conda-forge" ' + 'or "pip install av" to install PyAV first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + file_obj = io.BytesIO(self.file_client.get(results['filename'])) + container = av.open(file_obj) + + results['video_reader'] = container + results['total_frames'] = container.streams.video[0].frames + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(io_backend={self.io_backend})' + return repr_str + + +@PIPELINES.register_module() +class PyAVDecode: + """Using PyAV to decode the video. + + PyAV: https://github.com/mikeboers/PyAV + + Required keys are "video_reader" and "frame_inds", + added or modified keys are "imgs", "img_shape" and "original_shape". + + Args: + multi_thread (bool): If set to True, it will apply multi + thread processing. Default: False. + mode (str): Decoding mode. Options are 'accurate' and 'efficient'. + If set to 'accurate', it will decode videos into accurate frames. + If set to 'efficient', it will adopt fast seeking but only return + the nearest key frames, which may be duplicated and inaccurate, + and more suitable for large scene-based video datasets. + Default: 'accurate'. 
+ """ + + def __init__(self, multi_thread=False, mode='accurate'): + self.multi_thread = multi_thread + self.mode = mode + assert mode in ['accurate', 'efficient'] + + @staticmethod + def frame_generator(container, stream): + """Frame generator for PyAV.""" + for packet in container.demux(stream): + for frame in packet.decode(): + if frame: + return frame.to_rgb().to_ndarray() + + def __call__(self, results): + """Perform the PyAV decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + container = results['video_reader'] + imgs = list() + + if self.multi_thread: + container.streams.video[0].thread_type = 'AUTO' + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + if self.mode == 'accurate': + # set max indice to make early stop + max_inds = max(results['frame_inds']) + i = 0 + for frame in container.decode(video=0): + if i > max_inds + 1: + break + imgs.append(frame.to_rgb().to_ndarray()) + i += 1 + + # the available frame in pyav may be less than its length, + # which may raise error + results['imgs'] = [ + imgs[i % len(imgs)] for i in results['frame_inds'] + ] + elif self.mode == 'efficient': + for frame in container.decode(video=0): + backup_frame = frame + break + stream = container.streams.video[0] + for idx in results['frame_inds']: + pts_scale = stream.average_rate * stream.time_base + frame_pts = int(idx / pts_scale) + container.seek( + frame_pts, any_frame=False, backward=True, stream=stream) + frame = self.frame_generator(container, stream) + if frame is not None: + imgs.append(frame) + backup_frame = frame + else: + imgs.append(backup_frame) + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + results['video_reader'] = None + del container + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(multi_thread={self.multi_thread}, mode={self.mode})' + return repr_str + + +@PIPELINES.register_module() +class PIMSInit: + """Use PIMS to initialize the video. + + PIMS: https://github.com/soft-matter/pims + + Args: + io_backend (str): io backend where frames are store. + Default: 'disk'. + mode (str): Decoding mode. Options are 'accurate' and 'efficient'. + If set to 'accurate', it will always use ``pims.PyAVReaderIndexed`` + to decode videos into accurate frames. If set to 'efficient', it + will adopt fast seeking by using ``pims.PyAVReaderTimed``. + Both will return the accurate frames in most cases. + Default: 'accurate'. + kwargs (dict): Args for file client. 
+ """ + + def __init__(self, io_backend='disk', mode='accurate', **kwargs): + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + self.mode = mode + assert mode in ['accurate', 'efficient'] + + def __call__(self, results): + try: + import pims + except ImportError: + raise ImportError('Please run "conda install pims -c conda-forge" ' + 'or "pip install pims" to install pims first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + file_obj = io.BytesIO(self.file_client.get(results['filename'])) + if self.mode == 'accurate': + container = pims.PyAVReaderIndexed(file_obj) + else: + container = pims.PyAVReaderTimed(file_obj) + + results['video_reader'] = container + results['total_frames'] = len(container) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(io_backend={self.io_backend}, ' + f'mode={self.mode})') + return repr_str + + +@PIPELINES.register_module() +class PIMSDecode: + """Using PIMS to decode the videos. + + PIMS: https://github.com/soft-matter/pims + + Required keys are "video_reader" and "frame_inds", + added or modified keys are "imgs", "img_shape" and "original_shape". + """ + + def __call__(self, results): + container = results['video_reader'] + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + frame_inds = results['frame_inds'] + imgs = [container[idx] for idx in frame_inds] + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + return results + + +@PIPELINES.register_module() +class PyAVDecodeMotionVector(PyAVDecode): + """Using pyav to decode the motion vectors from video. + + Reference: https://github.com/PyAV-Org/PyAV/ + blob/main/tests/test_decode.py + + Required keys are "video_reader" and "frame_inds", + added or modified keys are "motion_vectors", "frame_inds". + """ + + @staticmethod + def _parse_vectors(mv, vectors, height, width): + """Parse the returned vectors.""" + (w, h, src_x, src_y, dst_x, + dst_y) = (vectors['w'], vectors['h'], vectors['src_x'], + vectors['src_y'], vectors['dst_x'], vectors['dst_y']) + val_x = dst_x - src_x + val_y = dst_y - src_y + start_x = dst_x - w // 2 + start_y = dst_y - h // 2 + end_x = start_x + w + end_y = start_y + h + for sx, ex, sy, ey, vx, vy in zip(start_x, end_x, start_y, end_y, + val_x, val_y): + if (sx >= 0 and ex < width and sy >= 0 and ey < height): + mv[sy:ey, sx:ex] = (vx, vy) + + return mv + + def __call__(self, results): + """Perform the PyAV motion vector decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + container = results['video_reader'] + imgs = list() + + if self.multi_thread: + container.streams.video[0].thread_type = 'AUTO' + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + # set max index to make early stop + max_idx = max(results['frame_inds']) + i = 0 + stream = container.streams.video[0] + codec_context = stream.codec_context + codec_context.options = {'flags2': '+export_mvs'} + for packet in container.demux(stream): + for frame in packet.decode(): + if i > max_idx + 1: + break + i += 1 + height = frame.height + width = frame.width + mv = np.zeros((height, width, 2), dtype=np.int8) + vectors = frame.side_data.get('MOTION_VECTORS') + if frame.key_frame: + # Key frame don't have motion vectors + assert vectors is None + if vectors is not None and len(vectors) > 0: + mv = self._parse_vectors(mv, vectors.to_ndarray(), height, + width) + imgs.append(mv) + + results['video_reader'] = None + del container + + # the available frame in pyav may be less than its length, + # which may raise error + results['motion_vectors'] = np.array( + [imgs[i % len(imgs)] for i in results['frame_inds']]) + return results + + +@PIPELINES.register_module() +class DecordInit: + """Using decord to initialize the video_reader. + + Decord: https://github.com/dmlc/decord + + Required keys are "filename", + added or modified keys are "video_reader" and "total_frames". + + Args: + io_backend (str): io backend where frames are store. + Default: 'disk'. + num_threads (int): Number of thread to decode the video. Default: 1. + kwargs (dict): Args for file client. + """ + + def __init__(self, io_backend='disk', num_threads=1, **kwargs): + self.io_backend = io_backend + self.num_threads = num_threads + self.kwargs = kwargs + self.file_client = None + + def __call__(self, results): + """Perform the Decord initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + try: + import decord + except ImportError: + raise ImportError( + 'Please run "pip install decord" to install Decord first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + file_obj = io.BytesIO(self.file_client.get(results['filename'])) + container = decord.VideoReader(file_obj, num_threads=self.num_threads) + results['video_reader'] = container + results['total_frames'] = len(container) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend}, ' + f'num_threads={self.num_threads})') + return repr_str + + +@PIPELINES.register_module() +class DecordDecode: + """Using decord to decode the video. + + Decord: https://github.com/dmlc/decord + + Required keys are "video_reader", "filename" and "frame_inds", + added or modified keys are "imgs" and "original_shape". + + Args: + mode (str): Decoding mode. Options are 'accurate' and 'efficient'. + If set to 'accurate', it will decode videos into accurate frames. + If set to 'efficient', it will adopt fast seeking but only return + key frames, which may be duplicated and inaccurate, and more + suitable for large scene-based video datasets. Default: 'accurate'. + """ + + def __init__(self, mode='accurate'): + self.mode = mode + assert mode in ['accurate', 'efficient'] + + def __call__(self, results): + """Perform the Decord decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + container = results['video_reader'] + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + frame_inds = results['frame_inds'] + + if self.mode == 'accurate': + imgs = container.get_batch(frame_inds).asnumpy() + imgs = list(imgs) + elif self.mode == 'efficient': + # This mode is faster, however it always returns I-FRAME + container.seek(0) + imgs = list() + for idx in frame_inds: + container.seek(idx) + frame = container.next() + imgs.append(frame.asnumpy()) + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + return results + + def __repr__(self): + repr_str = f'{self.__class__.__name__}(mode={self.mode})' + return repr_str + + +@PIPELINES.register_module() +class OpenCVInit: + """Using OpenCV to initialize the video_reader. + + Required keys are "filename", added or modified keys are "new_path", + "video_reader" and "total_frames". + + Args: + io_backend (str): io backend where frames are store. + Default: 'disk'. + kwargs (dict): Args for file client. + """ + + def __init__(self, io_backend='disk', **kwargs): + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + self.tmp_folder = None + if self.io_backend != 'disk': + random_string = get_random_string() + thread_id = get_thread_id() + self.tmp_folder = osp.join(get_shm_dir(), + f'{random_string}_{thread_id}') + os.mkdir(self.tmp_folder) + + def __call__(self, results): + """Perform the OpenCV initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + if self.io_backend == 'disk': + new_path = results['filename'] + else: + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + thread_id = get_thread_id() + # save the file of same thread at the same place + new_path = osp.join(self.tmp_folder, f'tmp_{thread_id}.mp4') + with open(new_path, 'wb') as f: + f.write(self.file_client.get(results['filename'])) + + container = mmcv.VideoReader(new_path) + results['new_path'] = new_path + results['video_reader'] = container + results['total_frames'] = len(container) + + return results + + def __del__(self): + if self.tmp_folder and osp.exists(self.tmp_folder): + shutil.rmtree(self.tmp_folder) + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend})') + return repr_str + + +@PIPELINES.register_module() +class OpenCVDecode: + """Using OpenCV to decode the video. + + Required keys are "video_reader", "filename" and "frame_inds", added or + modified keys are "imgs", "img_shape" and "original_shape". + """ + + def __call__(self, results): + """Perform the OpenCV decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + container = results['video_reader'] + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + for frame_ind in results['frame_inds']: + cur_frame = container[frame_ind] + # last frame may be None in OpenCV + while isinstance(cur_frame, type(None)): + frame_ind -= 1 + cur_frame = container[frame_ind] + imgs.append(cur_frame) + + results['video_reader'] = None + del container + + imgs = np.array(imgs) + # The default channel order of OpenCV is BGR, thus we change it to RGB + imgs = imgs[:, :, :, ::-1] + results['imgs'] = list(imgs) + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + + return results + + +@PIPELINES.register_module() +class RawFrameDecode: + """Load and decode frames with given indices. + + Required keys are "frame_dir", "filename_tmpl" and "frame_inds", + added or modified keys are "imgs", "img_shape" and "original_shape". + + Args: + io_backend (str): IO backend where frames are stored. Default: 'disk'. + decoding_backend (str): Backend used for image decoding. + Default: 'cv2'. + kwargs (dict, optional): Arguments for FileClient. + """ + + def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs): + self.io_backend = io_backend + self.decoding_backend = decoding_backend + self.kwargs = kwargs + self.file_client = None + + def __call__(self, results): + """Perform the ``RawFrameDecode`` to pick frames given indices. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + mmcv.use_backend(self.decoding_backend) + + directory = results['frame_dir'] + filename_tmpl = results['filename_tmpl'] + modality = results['modality'] + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + imgs = list() + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + offset = results.get('offset', 0) + + cache = {} + for i, frame_idx in enumerate(results['frame_inds']): + # Avoid loading duplicated frames + if frame_idx in cache: + if modality == 'RGB': + imgs.append(cp.deepcopy(imgs[cache[frame_idx]])) + else: + imgs.append(cp.deepcopy(imgs[2 * cache[frame_idx]])) + imgs.append(cp.deepcopy(imgs[2 * cache[frame_idx] + 1])) + continue + else: + cache[frame_idx] = i + + frame_idx += offset + if modality == 'RGB': + filepath = osp.join(directory, filename_tmpl.format(frame_idx)) + img_bytes = self.file_client.get(filepath) + # Get frame with channel order RGB directly. 
+                cur_frame = mmcv.imfrombytes(img_bytes, channel_order='rgb')
+                imgs.append(cur_frame)
+            elif modality == 'Flow':
+                x_filepath = osp.join(directory,
+                                      filename_tmpl.format('x', frame_idx))
+                y_filepath = osp.join(directory,
+                                      filename_tmpl.format('y', frame_idx))
+                x_img_bytes = self.file_client.get(x_filepath)
+                x_frame = mmcv.imfrombytes(x_img_bytes, flag='grayscale')
+                y_img_bytes = self.file_client.get(y_filepath)
+                y_frame = mmcv.imfrombytes(y_img_bytes, flag='grayscale')
+                imgs.extend([x_frame, y_frame])
+            else:
+                raise NotImplementedError
+
+        results['imgs'] = imgs
+        results['original_shape'] = imgs[0].shape[:2]
+        results['img_shape'] = imgs[0].shape[:2]
+
+        # we resize the gt_bboxes and proposals to their real scale
+        if 'gt_bboxes' in results:
+            h, w = results['img_shape']
+            scale_factor = np.array([w, h, w, h])
+            gt_bboxes = results['gt_bboxes']
+            gt_bboxes = (gt_bboxes * scale_factor).astype(np.float32)
+            results['gt_bboxes'] = gt_bboxes
+            if 'proposals' in results and results['proposals'] is not None:
+                proposals = results['proposals']
+                proposals = (proposals * scale_factor).astype(np.float32)
+                results['proposals'] = proposals
+
+        return results
+
+    def __repr__(self):
+        repr_str = (f'{self.__class__.__name__}('
+                    f'io_backend={self.io_backend}, '
+                    f'decoding_backend={self.decoding_backend})')
+        return repr_str
+
+
+@PIPELINES.register_module()
+class ArrayDecode:
+    """Load and decode frames with given indices from a 4D array.
+
+    Required keys are "array" and "frame_inds", added or modified keys are
+    "imgs", "img_shape" and "original_shape".
+    """
+
+    def __call__(self, results):
+        """Perform the ``ArrayDecode`` to pick frames given indices.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in pipeline.
+        """
+
+        modality = results['modality']
+        array = results['array']
+
+        imgs = list()
+
+        if results['frame_inds'].ndim != 1:
+            results['frame_inds'] = np.squeeze(results['frame_inds'])
+
+        offset = results.get('offset', 0)
+
+        for frame_idx in results['frame_inds']:
+
+            frame_idx += offset
+            if modality == 'RGB':
+                imgs.append(array[frame_idx])
+            elif modality == 'Flow':
+                imgs.extend(
+                    [array[frame_idx, ..., 0], array[frame_idx, ..., 1]])
+            else:
+                raise NotImplementedError
+
+        results['imgs'] = imgs
+        results['original_shape'] = imgs[0].shape[:2]
+        results['img_shape'] = imgs[0].shape[:2]
+
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}()'
+
+
+@PIPELINES.register_module()
+class ImageDecode:
+    """Load and decode images.
+
+    Required key is "filename", added or modified keys are "imgs", "img_shape"
+    and "original_shape".
+
+    Args:
+        io_backend (str): IO backend where frames are stored. Default: 'disk'.
+        decoding_backend (str): Backend used for image decoding.
+            Default: 'cv2'.
+        kwargs (dict, optional): Arguments for FileClient.
+    """
+
+    def __init__(self, io_backend='disk', decoding_backend='cv2', **kwargs):
+        self.io_backend = io_backend
+        self.decoding_backend = decoding_backend
+        self.kwargs = kwargs
+        self.file_client = None
+
+    def __call__(self, results):
+        """Perform the ``ImageDecode`` to load image given the file path.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in pipeline.
+ """ + mmcv.use_backend(self.decoding_backend) + + filename = results['filename'] + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + + imgs = list() + img_bytes = self.file_client.get(filename) + + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + imgs.append(img) + + results['imgs'] = imgs + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] + return results + + +@PIPELINES.register_module() +class AudioDecodeInit: + """Using librosa to initialize the audio reader. + + Required keys are "audio_path", added or modified keys are "length", + "sample_rate", "audios". + + Args: + io_backend (str): io backend where frames are store. + Default: 'disk'. + sample_rate (int): Audio sampling times per second. Default: 16000. + """ + + def __init__(self, + io_backend='disk', + sample_rate=16000, + pad_method='zero', + **kwargs): + self.io_backend = io_backend + self.sample_rate = sample_rate + if pad_method in ['random', 'zero']: + self.pad_method = pad_method + else: + raise NotImplementedError + self.kwargs = kwargs + self.file_client = None + + @staticmethod + def _zero_pad(shape): + return np.zeros(shape, dtype=np.float32) + + @staticmethod + def _random_pad(shape): + # librosa load raw audio file into a distribution of -1~+1 + return np.random.rand(shape).astype(np.float32) * 2 - 1 + + def __call__(self, results): + """Perform the librosa initialization. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + try: + import librosa + except ImportError: + raise ImportError('Please install librosa first.') + + if self.file_client is None: + self.file_client = FileClient(self.io_backend, **self.kwargs) + if osp.exists(results['audio_path']): + file_obj = io.BytesIO(self.file_client.get(results['audio_path'])) + y, sr = librosa.load(file_obj, sr=self.sample_rate) + else: + # Generate a random dummy 10s input + pad_func = getattr(self, f'_{self.pad_method}_pad') + y = pad_func(int(round(10.0 * self.sample_rate))) + sr = self.sample_rate + + results['length'] = y.shape[0] + results['sample_rate'] = sr + results['audios'] = y + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend}, ' + f'sample_rate={self.sample_rate}, ' + f'pad_method={self.pad_method})') + return repr_str + + +@PIPELINES.register_module() +class LoadAudioFeature: + """Load offline extracted audio features. + + Required keys are "audio_path", added or modified keys are "length", + audios". + """ + + def __init__(self, pad_method='zero'): + if pad_method not in ['zero', 'random']: + raise NotImplementedError + self.pad_method = pad_method + + @staticmethod + def _zero_pad(shape): + return np.zeros(shape, dtype=np.float32) + + @staticmethod + def _random_pad(shape): + # spectrogram is normalized into a distribution of 0~1 + return np.random.rand(shape).astype(np.float32) + + def __call__(self, results): + """Perform the numpy loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + if osp.exists(results['audio_path']): + feature_map = np.load(results['audio_path']) + else: + # Generate a random dummy 10s input + # Some videos do not have audio stream + pad_func = getattr(self, f'_{self.pad_method}_pad') + feature_map = pad_func((640, 80)) + + results['length'] = feature_map.shape[0] + results['audios'] = feature_map + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'pad_method={self.pad_method})') + return repr_str + + +@PIPELINES.register_module() +class AudioDecode: + """Sample the audio w.r.t. the frames selected. + + Args: + fixed_length (int): As the audio clip selected by frames sampled may + not be exactly the same, `fixed_length` will truncate or pad them + into the same size. Default: 32000. + + Required keys are "frame_inds", "num_clips", "total_frames", "length", + added or modified keys are "audios", "audios_shape". + """ + + def __init__(self, fixed_length=32000): + self.fixed_length = fixed_length + + def __call__(self, results): + """Perform the ``AudioDecode`` to pick audio clips.""" + audio = results['audios'] + frame_inds = results['frame_inds'] + num_clips = results['num_clips'] + resampled_clips = list() + frame_inds = frame_inds.reshape(num_clips, -1) + for clip_idx in range(num_clips): + clip_frame_inds = frame_inds[clip_idx] + start_idx = max( + 0, + int( + round((clip_frame_inds[0] + 1) / results['total_frames'] * + results['length']))) + end_idx = min( + results['length'], + int( + round((clip_frame_inds[-1] + 1) / results['total_frames'] * + results['length']))) + cropped_audio = audio[start_idx:end_idx] + if cropped_audio.shape[0] >= self.fixed_length: + truncated_audio = cropped_audio[:self.fixed_length] + else: + truncated_audio = np.pad( + cropped_audio, + ((0, self.fixed_length - cropped_audio.shape[0])), + mode='constant') + + resampled_clips.append(truncated_audio) + + results['audios'] = np.array(resampled_clips) + results['audios_shape'] = results['audios'].shape + return results + + +@PIPELINES.register_module() +class BuildPseudoClip: + """Build pseudo clips with one single image by repeating it n times. + + Required key is "imgs", added or modified key is "imgs", "num_clips", + "clip_len". + + Args: + clip_len (int): Frames of the generated pseudo clips. + """ + + def __init__(self, clip_len): + self.clip_len = clip_len + + def __call__(self, results): + # the input should be one single image + assert len(results['imgs']) == 1 + im = results['imgs'][0] + for _ in range(1, self.clip_len): + results['imgs'].append(np.copy(im)) + results['clip_len'] = self.clip_len + results['num_clips'] = 1 + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'fix_length={self.fixed_length})') + return repr_str + + +@PIPELINES.register_module() +class AudioFeatureSelector: + """Sample the audio feature w.r.t. the frames selected. + + Required keys are "audios", "frame_inds", "num_clips", "length", + "total_frames", added or modified keys are "audios", "audios_shape". + + Args: + fixed_length (int): As the features selected by frames sampled may + not be exactly the same, `fixed_length` will truncate or pad them + into the same size. Default: 128. + """ + + def __init__(self, fixed_length=128): + self.fixed_length = fixed_length + + def __call__(self, results): + """Perform the ``AudioFeatureSelector`` to pick audio feature clips. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. 
+ """ + audio = results['audios'] + frame_inds = results['frame_inds'] + num_clips = results['num_clips'] + resampled_clips = list() + + frame_inds = frame_inds.reshape(num_clips, -1) + for clip_idx in range(num_clips): + clip_frame_inds = frame_inds[clip_idx] + start_idx = max( + 0, + int( + round((clip_frame_inds[0] + 1) / results['total_frames'] * + results['length']))) + end_idx = min( + results['length'], + int( + round((clip_frame_inds[-1] + 1) / results['total_frames'] * + results['length']))) + cropped_audio = audio[start_idx:end_idx, :] + if cropped_audio.shape[0] >= self.fixed_length: + truncated_audio = cropped_audio[:self.fixed_length, :] + else: + truncated_audio = np.pad( + cropped_audio, + ((0, self.fixed_length - cropped_audio.shape[0]), (0, 0)), + mode='constant') + + resampled_clips.append(truncated_audio) + results['audios'] = np.array(resampled_clips) + results['audios_shape'] = results['audios'].shape + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'fix_length={self.fixed_length})') + return repr_str + + +@PIPELINES.register_module() +class LoadLocalizationFeature: + """Load Video features for localizer with given video_name list. + + Required keys are "video_name" and "data_prefix", added or modified keys + are "raw_feature". + + Args: + raw_feature_ext (str): Raw feature file extension. Default: '.csv'. + """ + + def __init__(self, raw_feature_ext='.csv'): + valid_raw_feature_ext = ('.csv', ) + if raw_feature_ext not in valid_raw_feature_ext: + raise NotImplementedError + self.raw_feature_ext = raw_feature_ext + + def __call__(self, results): + """Perform the LoadLocalizationFeature loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + video_name = results['video_name'] + data_prefix = results['data_prefix'] + + data_path = osp.join(data_prefix, video_name + self.raw_feature_ext) + raw_feature = np.loadtxt( + data_path, dtype=np.float32, delimiter=',', skiprows=1) + + results['raw_feature'] = np.transpose(raw_feature, (1, 0)) + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'raw_feature_ext={self.raw_feature_ext})') + return repr_str + + +@PIPELINES.register_module() +class GenerateLocalizationLabels: + """Load video label for localizer with given video_name list. + + Required keys are "duration_frame", "duration_second", "feature_frame", + "annotations", added or modified keys are "gt_bbox". + """ + + def __call__(self, results): + """Perform the GenerateLocalizationLabels loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + video_frame = results['duration_frame'] + video_second = results['duration_second'] + feature_frame = results['feature_frame'] + corrected_second = float(feature_frame) / video_frame * video_second + annotations = results['annotations'] + + gt_bbox = [] + + for annotation in annotations: + current_start = max( + min(1, annotation['segment'][0] / corrected_second), 0) + current_end = max( + min(1, annotation['segment'][1] / corrected_second), 0) + gt_bbox.append([current_start, current_end]) + + gt_bbox = np.array(gt_bbox) + results['gt_bbox'] = gt_bbox + return results + + +@PIPELINES.register_module() +class LoadProposals: + """Loading proposals with given proposal results. 
+ + Required keys are "video_name", added or modified keys are 'bsp_feature', + 'tmin', 'tmax', 'tmin_score', 'tmax_score' and 'reference_temporal_iou'. + + Args: + top_k (int): The top k proposals to be loaded. + pgm_proposals_dir (str): Directory to load proposals. + pgm_features_dir (str): Directory to load proposal features. + proposal_ext (str): Proposal file extension. Default: '.csv'. + feature_ext (str): Feature file extension. Default: '.npy'. + """ + + def __init__(self, + top_k, + pgm_proposals_dir, + pgm_features_dir, + proposal_ext='.csv', + feature_ext='.npy'): + self.top_k = top_k + self.pgm_proposals_dir = pgm_proposals_dir + self.pgm_features_dir = pgm_features_dir + valid_proposal_ext = ('.csv', ) + if proposal_ext not in valid_proposal_ext: + raise NotImplementedError + self.proposal_ext = proposal_ext + valid_feature_ext = ('.npy', ) + if feature_ext not in valid_feature_ext: + raise NotImplementedError + self.feature_ext = feature_ext + + def __call__(self, results): + """Perform the LoadProposals loading. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + video_name = results['video_name'] + proposal_path = osp.join(self.pgm_proposals_dir, + video_name + self.proposal_ext) + if self.proposal_ext == '.csv': + pgm_proposals = np.loadtxt( + proposal_path, dtype=np.float32, delimiter=',', skiprows=1) + + pgm_proposals = np.array(pgm_proposals[:self.top_k]) + tmin = pgm_proposals[:, 0] + tmax = pgm_proposals[:, 1] + tmin_score = pgm_proposals[:, 2] + tmax_score = pgm_proposals[:, 3] + reference_temporal_iou = pgm_proposals[:, 5] + + feature_path = osp.join(self.pgm_features_dir, + video_name + self.feature_ext) + if self.feature_ext == '.npy': + bsp_feature = np.load(feature_path).astype(np.float32) + + bsp_feature = bsp_feature[:self.top_k, :] + + results['bsp_feature'] = bsp_feature + results['tmin'] = tmin + results['tmax'] = tmax + results['tmin_score'] = tmin_score + results['tmax_score'] = tmax_score + results['reference_temporal_iou'] = reference_temporal_iou + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'top_k={self.top_k}, ' + f'pgm_proposals_dir={self.pgm_proposals_dir}, ' + f'pgm_features_dir={self.pgm_features_dir}, ' + f'proposal_ext={self.proposal_ext}, ' + f'feature_ext={self.feature_ext})') + return repr_str