--- /dev/null
+++ b/mmaction/datasets/ssn_dataset.py
@@ -0,0 +1,882 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+import warnings
+from collections import OrderedDict
+
+import mmcv
+import numpy as np
+from torch.nn.modules.utils import _pair
+
+from ..core import softmax
+from ..localization import (eval_ap, load_localize_proposal_file,
+                            perform_regression, temporal_iou, temporal_nms)
+from ..utils import get_root_logger
+from .base import BaseDataset
+from .builder import DATASETS
+
+
+class SSNInstance:
+    """Proposal instance of SSN.
+
+    Args:
+        start_frame (int): Index of the proposal's start frame.
+        end_frame (int): Index of the proposal's end frame.
+        num_video_frames (int): Total frames of the video.
+        label (int | None): The category label of the proposal. Default: None.
+        best_iou (float): The highest IOU with the groundtruth instance.
+            Default: 0.
+        overlap_self (float): Percent of the proposal's own span contained
+            in a groundtruth instance. Default: 0.
+    """
+
+    def __init__(self,
+                 start_frame,
+                 end_frame,
+                 num_video_frames,
+                 label=None,
+                 best_iou=0,
+                 overlap_self=0):
+        self.start_frame = start_frame
+        self.end_frame = min(end_frame, num_video_frames)
+        self.num_video_frames = num_video_frames
+        self.label = label if label is not None else -1
+        self.coverage = (end_frame - start_frame) / num_video_frames
+        self.best_iou = best_iou
+        self.overlap_self = overlap_self
+        self.loc_reg = None
+        self.size_reg = None
+        self.regression_targets = [0., 0.]
+
+    def compute_regression_targets(self, gt_list):
+        """Compute regression targets of positive proposals.
+
+        Args:
+            gt_list (list): The list of groundtruth instances.
+        """
+        # Find the groundtruth instance with the highest IOU.
+        ious = [
+            temporal_iou(self.start_frame, self.end_frame, gt.start_frame,
+                         gt.end_frame) for gt in gt_list
+        ]
+        best_gt = gt_list[np.argmax(ious)]
+
+        # interval: [start_frame, end_frame)
+        proposal_center = (self.start_frame + self.end_frame - 1) / 2
+        gt_center = (best_gt.start_frame + best_gt.end_frame - 1) / 2
+        proposal_size = self.end_frame - self.start_frame
+        gt_size = best_gt.end_frame - best_gt.start_frame
+
+        # Get regression targets:
+        # (1). Localization regression target:
+        #     center shift proportional to the proposal duration
+        # (2). Duration/Size regression target:
+        #     logarithm of the groundtruth duration over proposal duration
+
+        self.loc_reg = (gt_center - proposal_center) / proposal_size
+        self.size_reg = np.log(gt_size / proposal_size)
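+        # Worked example: for a proposal [100, 200) whose best-matched gt is
+        # [120, 240): the centers are 149.5 and 179.5, so
+        # loc_reg = (179.5 - 149.5) / 100 = 0.3 and
+        # size_reg = log(120 / 100) ≈ 0.18.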
+        self.regression_targets = [self.loc_reg, self.size_reg]
+
+
+@DATASETS.register_module()
+class SSNDataset(BaseDataset):
+    """Proposal frame dataset for Structured Segment Networks.
+
+    Based on proposal information, the dataset loads raw frames and applies
+    specified transforms to return a dict containing the frame tensors and
+    other information.
+
+    The ann_file is a text file with multiple lines, and each
+    video's information takes up several lines. The file can be a normalized
+    file, in which locations are given as percentages, or a standard file
+    with absolute frame indexes. A normalized file will be converted into a
+    standard file first.
+
+    Template information of a video in a standard file:
+
+    .. code-block:: txt
+
+        # index
+        video_id
+        num_frames
+        fps
+        num_gts
+        label, start_frame, end_frame
+        label, start_frame, end_frame
+        ...
+        num_proposals
+        label, best_iou, overlap_self, start_frame, end_frame
+        label, best_iou, overlap_self, start_frame, end_frame
+        ...
+
+    Example of a standard annotation file:
+
+    .. code-block:: txt
+
+        # 0
+        video_validation_0000202
+        5666
+        1
+        3
+        8 130 185
+        8 832 1136
+        8 1303 1381
+        5
+        8 0.0620 0.0620 790 5671
+        8 0.1656 0.1656 790 2619
+        8 0.0833 0.0833 3945 5671
+        8 0.0960 0.0960 4173 5671
+        8 0.0614 0.0614 3327 5671
+
+    Args:
+        ann_file (str): Path to the annotation file.
+        pipeline (list[dict | callable]): A sequence of data transforms.
+        train_cfg (dict): Config for training.
+        test_cfg (dict): Config for testing.
+        data_prefix (str): Path to a directory where videos are held.
+        test_mode (bool): Store True when building test or validation dataset.
+            Default: False.
+        filename_tmpl (str): Template for each filename.
+            Default: 'img_{:05}.jpg'.
+        start_index (int): Specify a start index for frames to accommodate
+            different filename formats. Default: 1.
+        modality (str): Modality of data. Support 'RGB', 'Flow'.
+            Default: 'RGB'.
+        video_centric (bool): Whether to sample proposals just from
+            this video or sample proposals randomly from the entire dataset.
+            Default: True.
+        reg_normalize_constants (list): Regression target normalization
+            constants, i.e. the means and standard deviations of the location
+            and duration regression targets. Computed from the dataset if
+            None. Default: None.
+        body_segments (int): Number of segments in course period.
+            Default: 5.
+        aug_segments (list[int]): Number of segments in starting and
+            ending period. Default: (2, 2).
+        aug_ratio (int | float | tuple[int | float]): The ratio of the length
+            of augmentation to that of the proposal. Default: (0.5, 0.5).
+        clip_len (int): Frames of each sampled output clip.
+            Default: 1.
+        frame_interval (int): Temporal interval of adjacent sampled frames.
+            Default: 1.
+        filter_gt (bool): Whether to filter videos with no annotation
+            during training. Default: True.
+        use_regression (bool): Whether to perform regression. Default: True.
+        verbose (bool): Whether to print full information or not.
+            Default: False.
+    """
+
+    def __init__(self,
+                 ann_file,
+                 pipeline,
+                 train_cfg,
+                 test_cfg,
+                 data_prefix,
+                 test_mode=False,
+                 filename_tmpl='img_{:05d}.jpg',
+                 start_index=1,
+                 modality='RGB',
+                 video_centric=True,
+                 reg_normalize_constants=None,
+                 body_segments=5,
+                 aug_segments=(2, 2),
+                 aug_ratio=(0.5, 0.5),
+                 clip_len=1,
+                 frame_interval=1,
+                 filter_gt=True,
+                 use_regression=True,
+                 verbose=False):
+        self.logger = get_root_logger()
+        super().__init__(
+            ann_file,
+            pipeline,
+            data_prefix=data_prefix,
+            test_mode=test_mode,
+            start_index=start_index,
+            modality=modality)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.assigner = train_cfg.ssn.assigner
+        self.sampler = train_cfg.ssn.sampler
+        self.evaluater = test_cfg.ssn.evaluater
+        self.verbose = verbose
+        self.filename_tmpl = filename_tmpl
+
+        if filter_gt or not test_mode:
+            valid_inds = [
+                i for i, video_info in enumerate(self.video_infos)
+                if len(video_info['gts']) > 0
+            ]
+            self.logger.info(
+                f'{len(valid_inds)} out of {len(self.video_infos)} '
+                f'videos are valid.')
+            self.video_infos = [self.video_infos[i] for i in valid_inds]
+
+        # construct three pools:
+        # 1. Positive(Foreground)
+        # 2. Background
+        # 3. Incomplete
+        self.positive_pool = []
+        self.background_pool = []
+        self.incomplete_pool = []
+        self.construct_proposal_pools()
+
+        if reg_normalize_constants is None:
+            self.reg_norm_consts = self._compute_reg_normalize_constants()
+        else:
+            self.reg_norm_consts = reg_normalize_constants
+        self.video_centric = video_centric
+        self.body_segments = body_segments
+        self.aug_segments = aug_segments
+        self.aug_ratio = _pair(aug_ratio)
+        if not mmcv.is_tuple_of(self.aug_ratio, (int, float)):
+            raise TypeError(f'aug_ratio should be int, float '
+                            f'or tuple of int and float, '
+                            f'but got {type(aug_ratio)}')
+        assert len(self.aug_ratio) == 2
+
+        total_ratio = (
+            self.sampler.positive_ratio + self.sampler.background_ratio +
+            self.sampler.incomplete_ratio)
+        self.positive_per_video = int(
+            self.sampler.num_per_video *
+            (self.sampler.positive_ratio / total_ratio))
+        self.background_per_video = int(
+            self.sampler.num_per_video *
+            (self.sampler.background_ratio / total_ratio))
+        self.incomplete_per_video = (
+            self.sampler.num_per_video - self.positive_per_video -
+            self.background_per_video)
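+        # e.g. with num_per_video=8 and positive/background/incomplete
+        # ratios of 1:1:6 (a typical SSN setting), this yields 1 positive,
+        # 1 background and 6 incomplete proposals per video.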
+
+        self.test_interval = self.test_cfg.ssn.sampler.test_interval
+        # number of consecutive frames
+        self.clip_len = clip_len
+        # number of steps (sparse sampling for efficiency of io)
+        self.frame_interval = frame_interval
+
+        # gt filtering, location regression and test mode flags
+        self.filter_gt = filter_gt
+        self.use_regression = use_regression
+        self.test_mode = test_mode
+
+        # yapf: disable
+        if self.verbose:
+            self.logger.info(f"""
+            SSNDataset: proposal file {self.proposal_file} parsed.
+
+            There are {len(self.positive_pool) + len(self.background_pool) +
+                len(self.incomplete_pool)} usable proposals from {len(self.video_infos)} videos.
+            {len(self.positive_pool)} positive proposals
+            {len(self.incomplete_pool)} incomplete proposals
+            {len(self.background_pool)} background proposals
+
+            Sample config:
+            FG/BG/INCOMP: {self.positive_per_video}/{self.background_per_video}/{self.incomplete_per_video}  # noqa:E501
+            Video Centric: {self.video_centric}
+
+            Regression Normalization Constants:
+            Location: mean {self.reg_norm_consts[0][0]:.05f} std {self.reg_norm_consts[1][0]:.05f} # noqa: E501
+            Duration: mean {self.reg_norm_consts[0][1]:.05f} std {self.reg_norm_consts[1][1]:.05f} # noqa: E501
+            """)
+        # yapf: enable
+        else:
+            self.logger.info(
+                f'SSNDataset: proposal file {self.proposal_file} parsed.')
+
+    def load_annotations(self):
+        """Load annotation file to get video information."""
+        video_infos = []
+        if 'normalized_' in self.ann_file:
+            self.proposal_file = self.ann_file.replace('normalized_', '')
+            if not osp.exists(self.proposal_file):
+                raise Exception(f'Please refer to `$MMACTION2/tools/data` to '
+                                f'denormalize {self.ann_file}.')
+        else:
+            self.proposal_file = self.ann_file
+        proposal_infos = load_localize_proposal_file(self.proposal_file)
+        # proposal_info:[video_id, num_frames, gt_list, proposal_list]
+        # gt_list member: [label, start_frame, end_frame]
+        # proposal_list member: [label, best_iou, overlap_self,
+        #                        start_frame, end_frame]
+        for proposal_info in proposal_infos:
+            frame_dir = proposal_info[0]
+            if self.data_prefix is not None:
+                frame_dir = osp.join(self.data_prefix, frame_dir)
+            num_frames = int(proposal_info[1])
+            # gts:start, end, num_frames, class_label, tIoU=1
+            gts = []
+            for x in proposal_info[2]:
+                if int(x[2]) > int(x[1]) and int(x[1]) < num_frames:
+                    ssn_instance = SSNInstance(
+                        int(x[1]),
+                        int(x[2]),
+                        num_frames,
+                        label=int(x[0]),
+                        best_iou=1.0)
+                    gts.append(ssn_instance)
+            # proposals:start, end, num_frames, class_label
+            # tIoU=best_iou, overlap_self
+            proposals = []
+            for x in proposal_info[3]:
+                if int(x[4]) > int(x[3]) and int(x[3]) < num_frames:
+                    ssn_instance = SSNInstance(
+                        int(x[3]),
+                        int(x[4]),
+                        num_frames,
+                        label=int(x[0]),
+                        best_iou=float(x[1]),
+                        overlap_self=float(x[2]))
+                    proposals.append(ssn_instance)
+            video_infos.append(
+                dict(
+                    frame_dir=frame_dir,
+                    video_id=proposal_info[0],
+                    total_frames=num_frames,
+                    gts=gts,
+                    proposals=proposals))
+        return video_infos
+
+    def results_to_detections(self, results, top_k=2000, **kwargs):
+        """Convert prediction results into detections.
+
+        Args:
+            results (list): Prediction results.
+            top_k (int): Number of top-scoring (proposal, class) pairs to
+                keep per video. Default: 2000.
+
+        Returns:
+            list: Detection results.
+        """
+        num_classes = results[0]['activity_scores'].shape[1] - 1
+        detections = [dict() for _ in range(num_classes)]
+
+        for idx in range(len(self)):
+            video_id = self.video_infos[idx]['video_id']
+            relative_proposals = results[idx]['relative_proposal_list']
+            if len(relative_proposals[0].shape) == 3:
+                relative_proposals = np.squeeze(relative_proposals, 0)
+
+            activity_scores = results[idx]['activity_scores']
+            completeness_scores = results[idx]['completeness_scores']
+            regression_scores = results[idx]['bbox_preds']
+            if regression_scores is None:
+                regression_scores = np.zeros(
+                    (len(relative_proposals), num_classes, 2),
+                    dtype=np.float32)
+            regression_scores = regression_scores.reshape((-1, num_classes, 2))
+
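+            # Fuse the two SSN heads into one detection score: a softmax
+            # over the real action classes (the background column is
+            # dropped) weighted by the exponentiated completeness score.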
+            combined_scores = (
+                softmax(activity_scores[:, 1:], dim=1) *
+                np.exp(completeness_scores))
+            if top_k <= 0:
+                for i in range(num_classes):
+                    center_scores = regression_scores[:, i, 0][:, None]
+                    duration_scores = regression_scores[:, i, 1][:, None]
+                    detections[i][video_id] = np.concatenate(
+                        (relative_proposals, combined_scores[:, i][:, None],
+                         center_scores, duration_scores),
+                        axis=1)
+            else:
+                keep_idx = np.argsort(combined_scores.ravel())[-top_k:]
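+                # combined_scores has shape (num_proposals, num_classes), so
+                # in the flattened row-major array an index k decomposes into
+                # proposal k // num_classes and class k % num_classes.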
+                for k in keep_idx:
+                    class_idx = k % num_classes
+                    proposal_idx = k // num_classes
+                    new_item = [
+                        relative_proposals[proposal_idx, 0],
+                        relative_proposals[proposal_idx, 1],
+                        combined_scores[proposal_idx, class_idx],
+                        regression_scores[proposal_idx, class_idx, 0],
+                        regression_scores[proposal_idx, class_idx, 1]
+                    ]
+                    if video_id not in detections[class_idx]:
+                        detections[class_idx][video_id] = np.array([new_item])
+                    else:
+                        detections[class_idx][video_id] = np.vstack(
+                            [detections[class_idx][video_id], new_item])
+
+        return detections
+
+    def evaluate(self,
+                 results,
+                 metrics='mAP',
+                 metric_options=dict(mAP=dict(eval_dataset='thumos14')),
+                 logger=None,
+                 **deprecated_kwargs):
+        """Evaluation in SSN proposal dataset.
+
+        Args:
+            results (list[dict]): Output results.
+            metrics (str | sequence[str]): Metrics to be evaluated.
+                Default: 'mAP'.
+            metric_options (dict): Dict for metric options. Options are
+                ``eval_dataset`` for ``mAP``.
+                Default: ``dict(mAP=dict(eval_dataset='thumos14'))``.
+            logger (logging.Logger | None): Logger for recording.
+                Default: None.
+            deprecated_kwargs (dict): Used for containing deprecated arguments.
+                See 'https://github.com/open-mmlab/mmaction2/pull/286'.
+
+        Returns:
+            dict: Evaluation results for evaluation metrics.
+        """
+        # Protect ``metric_options`` since it uses mutable value as default
+        metric_options = copy.deepcopy(metric_options)
+
+        if deprecated_kwargs != {}:
+            warnings.warn(
+                'Option arguments for metrics have been changed to '
+                "`metric_options`. See 'https://github.com/open-mmlab/mmaction2/pull/286' "  # noqa: E501
+                'for more details')
+            metric_options['mAP'] = dict(metric_options['mAP'],
+                                         **deprecated_kwargs)
+
+        if not isinstance(results, list):
+            raise TypeError(f'results must be a list, but got {type(results)}')
+        assert len(results) == len(self), (
+            f'The length of results is not equal to the dataset len: '
+            f'{len(results)} != {len(self)}')
+
+        metrics = metrics if isinstance(metrics, (list, tuple)) else [metrics]
+        allowed_metrics = ['mAP']
+        for metric in metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(f'metric {metric} is not supported')
+
+        detections = self.results_to_detections(results, **self.evaluater)
+
+        if self.use_regression:
+            self.logger.info('Performing location regression')
+            for class_idx, _ in enumerate(detections):
+                detections[class_idx] = {
+                    k: perform_regression(v)
+                    for k, v in detections[class_idx].items()
+                }
+            self.logger.info('Regression finished')
+
+        self.logger.info('Performing NMS')
+        for class_idx, _ in enumerate(detections):
+            detections[class_idx] = {
+                k: temporal_nms(v, self.evaluater.nms)
+                for k, v in detections[class_idx].items()
+            }
+        self.logger.info('NMS finished')
+
+        # get gts
+        all_gts = self.get_all_gts()
+        for class_idx, _ in enumerate(detections):
+            if class_idx not in all_gts:
+                all_gts[class_idx] = dict()
+
+        # get predictions
+        plain_detections = {}
+        for class_idx, _ in enumerate(detections):
+            detection_list = []
+            for video, dets in detections[class_idx].items():
+                detection_list.extend([[video, class_idx] + x[:3]
+                                       for x in dets.tolist()])
+            plain_detections[class_idx] = detection_list
+
+        eval_results = OrderedDict()
+        for metric in metrics:
+            if metric == 'mAP':
+                eval_dataset = metric_options.setdefault('mAP', {}).setdefault(
+                    'eval_dataset', 'thumos14')
+                if eval_dataset == 'thumos14':
+                    iou_range = np.arange(0.1, 1.0, .1)
+                    ap_values = eval_ap(plain_detections, all_gts, iou_range)
+                    map_ious = ap_values.mean(axis=0)
+                    self.logger.info('Evaluation finished')
+
+                    for iou, map_iou in zip(iou_range, map_ious):
+                        eval_results[f'mAP@{iou:.02f}'] = map_iou
+
+        return eval_results
+
+    def construct_proposal_pools(self):
+        """Construct positive proposal pool, incomplete proposal pool and
+        background proposal pool of the entire dataset."""
+        for video_info in self.video_infos:
+            positives = self.get_positives(
+                video_info['gts'], video_info['proposals'],
+                self.assigner.positive_iou_threshold,
+                self.sampler.add_gt_as_proposals)
+            self.positive_pool.extend([(video_info['video_id'], proposal)
+                                       for proposal in positives])
+
+            incompletes, backgrounds = self.get_negatives(
+                video_info['proposals'],
+                self.assigner.incomplete_iou_threshold,
+                self.assigner.background_iou_threshold,
+                self.assigner.background_coverage_threshold,
+                self.assigner.incomplete_overlap_threshold)
+            self.incomplete_pool.extend([(video_info['video_id'], proposal)
+                                         for proposal in incompletes])
+            self.background_pool.extend([(video_info['video_id'], proposal)
+                                         for proposal in backgrounds])
+
+    def get_all_gts(self):
+        """Fetch groundtruth instances of the entire dataset."""
+        gts = {}
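+        # Layout: {class_idx: {video_id: [[relative_start, relative_end],
+        #                                 ...]}}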
+        for video_info in self.video_infos:
+            video = video_info['video_id']
+            for gt in video_info['gts']:
+                class_idx = gt.label - 1
+                # gt_info: [relative_start, relative_end]
+                gt_info = [
+                    gt.start_frame / video_info['total_frames'],
+                    gt.end_frame / video_info['total_frames']
+                ]
+                gts.setdefault(class_idx, {}).setdefault(video,
+                                                         []).append(gt_info)
+
+        return gts
+
+    @staticmethod
+    def get_positives(gts, proposals, positive_threshold, with_gt=True):
+        """Get positive/foreground proposals.
+
+        Args:
+            gts (list): List of groundtruth instances(:obj:`SSNInstance`).
+            proposals (list): List of proposal instances(:obj:`SSNInstance`).
+            positive_threshold (float): Minimum threshold of overlap of
+                positive/foreground proposals and groundtruths.
+            with_gt (bool): Whether to include groundtruth instances in
+                positive proposals. Default: True.
+
+        Returns:
+            list[:obj:`SSNInstance`]: List of positive proposal instances,
+                with regression targets computed against the groundtruths.
+        """
+        positives = [
+            proposal for proposal in proposals
+            if proposal.best_iou > positive_threshold
+        ]
+
+        if with_gt:
+            positives.extend(gts)
+
+        for proposal in positives:
+            proposal.compute_regression_targets(gts)
+
+        return positives
+
+    @staticmethod
+    def get_negatives(proposals,
+                      incomplete_iou_threshold,
+                      background_iou_threshold,
+                      background_coverage_threshold=0.01,
+                      incomplete_overlap_threshold=0.7):
+        """Get negative proposals, including incomplete proposals and
+        background proposals.
+
+        Args:
+            proposals (list): List of proposal instances(:obj:`SSNInstance`).
+            incomplete_iou_threshold (float): Maximum threshold of overlap
+                of incomplete proposals and groundtruths.
+            background_iou_threshold (float): Maximum threshold of overlap
+                of background proposals and groundtruths.
+            background_coverage_threshold (float): Minimum coverage
+                of background proposals in video duration. Default: 0.01.
+            incomplete_overlap_threshold (float): Minimum percent of incomplete
+                proposals' own span contained in a groundtruth instance.
+                Default: 0.7.
+
+        Returns:
+            tuple[list, list]: (incompletes, backgrounds), where incompletes
+                and backgrounds are lists of incomplete proposal instances
+                and background proposal instances respectively.
+        """
+        incompletes = []
+        backgrounds = []
+
+        for proposal in proposals:
+            if (proposal.best_iou < incomplete_iou_threshold
+                    and proposal.overlap_self > incomplete_overlap_threshold):
+                incompletes.append(proposal)
+            elif (proposal.best_iou < background_iou_threshold
+                  and proposal.coverage > background_coverage_threshold):
+                backgrounds.append(proposal)
+
+        return incompletes, backgrounds
+
+    def _video_centric_sampling(self, record):
+        """Sample proposals from the this video instance.
+
+        Args:
+            record (dict): Information of the video instance(video_info[idx]).
+                key: frame_dir, video_id, total_frames,
+                gts: List of groundtruth instances(:obj:`SSNInstance`).
+                proposals: List of proposal instances(:obj:`SSNInstance`).
+        """
+        positives = self.get_positives(record['gts'], record['proposals'],
+                                       self.assigner.positive_iou_threshold,
+                                       self.sampler.add_gt_as_proposals)
+        incompletes, backgrounds = self.get_negatives(
+            record['proposals'], self.assigner.incomplete_iou_threshold,
+            self.assigner.background_iou_threshold,
+            self.assigner.background_coverage_threshold,
+            self.assigner.incomplete_overlap_threshold)
+
+        def sample_video_proposals(proposal_type, video_id, video_pool,
+                                   num_requested_proposals, dataset_pool):
+            """This method will sample proposals from the this video pool. If
+            the video pool is empty, it will fetch from the dataset pool
+            (collect proposal of the entire dataset).
+
+            Args:
+                proposal_type (int): Type id of proposal.
+                    Positive/Foreground: 0
+                    Negative:
+                        Incomplete: 1
+                        Background: 2
+                video_id (str): Name of the video.
+                video_pool (list): Pool comprised of proposals in this video.
+                num_requested_proposals (int): Number of proposals
+                    to be sampled.
+                dataset_pool (list): Proposals of the entire dataset.
+
+            Returns:
+                list[tuple]: List of ((video_id, :obj:`SSNInstance`),
+                    proposal_type) tuples, where video_id (str) is the name
+                    of the video and proposal_type (int) is the type of the
+                    proposal.
+            """
+
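+            # Fall back to the dataset-level pool when this video has no
+            # proposals of the requested type; its entries already carry
+            # their own (video_id, proposal) pairing.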
+            if len(video_pool) == 0:
+                idx = np.random.choice(
+                    len(dataset_pool), num_requested_proposals, replace=False)
+                return [(dataset_pool[x], proposal_type) for x in idx]
+
+            replicate = len(video_pool) < num_requested_proposals
+            idx = np.random.choice(
+                len(video_pool), num_requested_proposals, replace=replicate)
+            return [((video_id, video_pool[x]), proposal_type) for x in idx]
+
+        out_proposals = []
+        out_proposals.extend(
+            sample_video_proposals(0, record['video_id'], positives,
+                                   self.positive_per_video,
+                                   self.positive_pool))
+        out_proposals.extend(
+            sample_video_proposals(1, record['video_id'], incompletes,
+                                   self.incomplete_per_video,
+                                   self.incomplete_pool))
+        out_proposals.extend(
+            sample_video_proposals(2, record['video_id'], backgrounds,
+                                   self.background_per_video,
+                                   self.background_pool))
+
+        return out_proposals
+
+    def _random_sampling(self):
+        """Randomly sample proposals from the entire dataset."""
+        out_proposals = []
+
+        positive_idx = np.random.choice(
+            len(self.positive_pool),
+            self.positive_per_video,
+            replace=len(self.positive_pool) < self.positive_per_video)
+        out_proposals.extend([(self.positive_pool[x], 0)
+                              for x in positive_idx])
+        incomplete_idx = np.random.choice(
+            len(self.incomplete_pool),
+            self.incomplete_per_video,
+            replace=len(self.incomplete_pool) < self.incomplete_per_video)
+        out_proposals.extend([(self.incomplete_pool[x], 1)
+                              for x in incomplete_idx])
+        background_idx = np.random.choice(
+            len(self.background_pool),
+            self.background_per_video,
+            replace=len(self.background_pool) < self.background_per_video)
+        out_proposals.extend([(self.background_pool[x], 2)
+                              for x in background_idx])
+
+        return out_proposals
+
+    def _get_stage(self, proposal, num_frames):
+        """Fetch the scale factor of starting and ending stage and get the
+        stage split.
+
+        Args:
+            proposal (:obj:`SSNInstance`): Proposal instance.
+            num_frames (int): Total frames of the video.
+
+        Returns:
+            tuple[float, float, list]: (starting_scale_factor,
+                ending_scale_factor, stage_split), where
+                starting_scale_factor is the ratio of the effective sampling
+                length to the augment length in the starting stage,
+                ending_scale_factor is the same ratio for the ending stage,
+                and stage_split holds the ending segment ids of the
+                starting, course and ending stages.
+        """
+        # proposal interval: [start_frame, end_frame)
+        start_frame = proposal.start_frame
+        end_frame = proposal.end_frame
+        ori_clip_len = self.clip_len * self.frame_interval
+
+        duration = end_frame - start_frame
+        assert duration != 0
+
+        valid_starting = max(0,
+                             start_frame - int(duration * self.aug_ratio[0]))
+        valid_ending = min(num_frames - ori_clip_len + 1,
+                           end_frame - 1 + int(duration * self.aug_ratio[1]))
+
+        valid_starting_length = start_frame - valid_starting - ori_clip_len
+        valid_ending_length = (valid_ending - end_frame + 1) - ori_clip_len
+
+        starting_scale_factor = ((valid_starting_length + ori_clip_len + 1) /
+                                 (duration * self.aug_ratio[0]))
+        ending_scale_factor = (valid_ending_length + ori_clip_len + 1) / (
+            duration * self.aug_ratio[1])
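+        # Both scale factors are ~1 when the augmented region fits inside
+        # the video and shrink below 1 when the starting/ending extensions
+        # are clipped by the video boundary.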
+
+        aug_start, aug_end = self.aug_segments
+        stage_split = [
+            aug_start, aug_start + self.body_segments,
+            aug_start + self.body_segments + aug_end
+        ]
+
+        return starting_scale_factor, ending_scale_factor, stage_split
+
+    def _compute_reg_normalize_constants(self):
+        """Compute regression target normalized constants."""
+        if self.verbose:
+            self.logger.info('Compute regression target normalized constants')
+        targets = []
+        for video_info in self.video_infos:
+            positives = self.get_positives(
+                video_info['gts'], video_info['proposals'],
+                self.assigner.positive_iou_threshold, False)
+            for positive in positives:
+                targets.append(list(positive.regression_targets))
+
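+        # reg_norm_consts[0] holds the means and reg_norm_consts[1] the
+        # standard deviations of (loc_reg, size_reg) over all positive
+        # proposals.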
+        return np.array((np.mean(targets, axis=0), np.std(targets, axis=0)))
+
+    def prepare_train_frames(self, idx):
+        """Prepare the frames for training given the index."""
+        results = copy.deepcopy(self.video_infos[idx])
+        results['filename_tmpl'] = self.filename_tmpl
+        results['modality'] = self.modality
+        results['start_index'] = self.start_index
+
+        if self.video_centric:
+            # yapf: disable
+            results['out_proposals'] = self._video_centric_sampling(self.video_infos[idx])  # noqa: E501
+            # yapf: enable
+        else:
+            results['out_proposals'] = self._random_sampling()
+
+        out_proposal_scale_factor = []
+        out_proposal_type = []
+        out_proposal_labels = []
+        out_proposal_reg_targets = []
+
+        for proposal in results['out_proposals']:
+            # proposal: [(video_id, SSNInstance), proposal_type]
+            num_frames = proposal[0][1].num_video_frames
+
+            (starting_scale_factor, ending_scale_factor,
+             _) = self._get_stage(proposal[0][1], num_frames)
+
+            # proposal[1]: Type id of proposal.
+            # Positive/Foreground: 0
+            # Negative:
+            #   Incomplete: 1
+            #   Background: 2
+
+            # Positive/Foreground proposal
+            if proposal[1] == 0:
+                label = proposal[0][1].label
+            # Incomplete proposal
+            elif proposal[1] == 1:
+                label = proposal[0][1].label
+            # Background proposal
+            elif proposal[1] == 2:
+                label = 0
+            else:
+                raise ValueError(f'Proposal type should be 0, 1, or 2, '
+                                 f'but got {proposal[1]}')
+            out_proposal_scale_factor.append(
+                [starting_scale_factor, ending_scale_factor])
+            if not isinstance(label, int):
+                raise TypeError(f'proposal_label must be an int, '
+                                f'but got {type(label)}')
+            out_proposal_labels.append(label)
+            out_proposal_type.append(proposal[1])
+
+            reg_targets = proposal[0][1].regression_targets
+            if proposal[1] == 0:
+                # Normalize regression targets of positive proposals.
+                reg_targets = ((reg_targets[0] - self.reg_norm_consts[0][0]) /
+                               self.reg_norm_consts[1][0],
+                               (reg_targets[1] - self.reg_norm_consts[0][1]) /
+                               self.reg_norm_consts[1][1])
+            out_proposal_reg_targets.append(reg_targets)
+
+        results['reg_targets'] = np.array(
+            out_proposal_reg_targets, dtype=np.float32)
+        results['proposal_scale_factor'] = np.array(
+            out_proposal_scale_factor, dtype=np.float32)
+        results['proposal_labels'] = np.array(out_proposal_labels)
+        results['proposal_type'] = np.array(out_proposal_type)
+
+        return self.pipeline(results)
+
+    def prepare_test_frames(self, idx):
+        """Prepare the frames for testing given the index."""
+        results = copy.deepcopy(self.video_infos[idx])
+        results['filename_tmpl'] = self.filename_tmpl
+        results['modality'] = self.modality
+        results['start_index'] = self.start_index
+
+        proposals = results['proposals']
+        num_frames = results['total_frames']
+        ori_clip_len = self.clip_len * self.frame_interval
+        frame_ticks = np.arange(
+            0, num_frames - ori_clip_len, self.test_interval, dtype=int) + 1
+
+        num_sampled_frames = len(frame_ticks)
+
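+        # If the video comes with no proposals, fall back to a single
+        # whole-video proposal so the pipeline still produces an output.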
+        if len(proposals) == 0:
+            proposals.append(SSNInstance(0, num_frames - 1, num_frames))
+
+        relative_proposal_list = []
+        proposal_tick_list = []
+        scale_factor_list = []
+
+        for proposal in proposals:
+            relative_proposal = (proposal.start_frame / num_frames,
+                                 proposal.end_frame / num_frames)
+            relative_duration = relative_proposal[1] - relative_proposal[0]
+            relative_starting_duration = relative_duration * self.aug_ratio[0]
+            relative_ending_duration = relative_duration * self.aug_ratio[1]
+            relative_starting = (
+                relative_proposal[0] - relative_starting_duration)
+            relative_ending = relative_proposal[1] + relative_ending_duration
+
+            real_relative_starting = max(0.0, relative_starting)
+            real_relative_ending = min(1.0, relative_ending)
+
+            starting_scale_factor = (
+                (relative_proposal[0] - real_relative_starting) /
+                relative_starting_duration)
+            ending_scale_factor = (
+                (real_relative_ending - relative_proposal[1]) /
+                relative_ending_duration)
+
+            proposal_ranges = (real_relative_starting, *relative_proposal,
+                               real_relative_ending)
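+            # proposal_ranges holds relative positions in [0, 1]; scaling by
+            # num_sampled_frames converts them to indices into frame_ticks.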
+            proposal_ticks = (np.array(proposal_ranges) *
+                              num_sampled_frames).astype(np.int32)
+
+            relative_proposal_list.append(relative_proposal)
+            proposal_tick_list.append(proposal_ticks)
+            scale_factor_list.append(
+                (starting_scale_factor, ending_scale_factor))
+
+        results['relative_proposal_list'] = np.array(
+            relative_proposal_list, dtype=np.float32)
+        results['scale_factor_list'] = np.array(
+            scale_factor_list, dtype=np.float32)
+        results['proposal_tick_list'] = np.array(
+            proposal_tick_list, dtype=np.int32)
+        results['reg_norm_consts'] = self.reg_norm_consts
+
+        return self.pipeline(results)