# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os.path as osp
from collections import OrderedDict

import mmcv
import numpy as np
from mmcv.utils import print_log

from ..core import mean_average_precision
from .base import BaseDataset
from .builder import DATASETS


@DATASETS.register_module()
class HVUDataset(BaseDataset):
"""HVU dataset, which supports the recognition tags of multiple categories.
Accept both video annotation files or rawframe annotation files.
The dataset loads videos or raw frames and applies specified transforms to
return a dict containing the frame tensors and other information.
The ann_file is a json file with multiple dictionaries, and each dictionary
indicates a sample video with the filename and tags, the tags are organized
as different categories. Example of a video dictionary:
    .. code-block:: txt

        {
            'filename': 'gD_G1b0wV5I_001015_001035.mp4',
            'label': {
                'concept': [250, 131, 42, 51, 57, 155, 122],
                'object': [1570, 508],
                'event': [16],
                'action': [180],
                'scene': [206]
            }
        }

    Example of a rawframe dictionary:

    .. code-block:: txt

        {
            'frame_dir': 'gD_G1b0wV5I_001015_001035',
            'total_frames': 61,
            'label': {
                'concept': [250, 131, 42, 51, 57, 155, 122],
                'object': [1570, 508],
                'event': [16],
                'action': [180],
                'scene': [206]
            }
        }

    Args:
        ann_file (str): Path to the annotation file, should be a json file.
        pipeline (list[dict | callable]): A sequence of data transforms.
        tag_categories (list[str]): List of category names of tags.
        tag_category_nums (list[int]): List of numbers of tags in each
            category.
        filename_tmpl (str | None): Template for each filename. If set to
            None, a video dataset is assumed. Default: None.
        **kwargs: Keyword arguments for ``BaseDataset``.
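
    Example of building the dataset (illustrative only; the paths and
    category sizes below are placeholders, not real HVU statistics):

    .. code-block:: python

        dataset = HVUDataset(
            ann_file='data/hvu/hvu_train.json',
            pipeline=[],
            tag_categories=['action', 'scene'],
            tag_category_nums=[739, 248],
            data_prefix='data/hvu/videos')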
"""

    def __init__(self,
ann_file,
pipeline,
tag_categories,
tag_category_nums,
filename_tmpl=None,
**kwargs):
assert len(tag_categories) == len(tag_category_nums)
self.tag_categories = tag_categories
self.tag_category_nums = tag_category_nums
self.filename_tmpl = filename_tmpl
self.num_categories = len(self.tag_categories)
self.num_tags = sum(self.tag_category_nums)
self.category2num = dict(zip(tag_categories, tag_category_nums))
self.start_idx = [0]
for i in range(self.num_categories - 1):
self.start_idx.append(self.start_idx[-1] +
self.tag_category_nums[i])
self.category2startidx = dict(zip(tag_categories, self.start_idx))
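        # Illustrative example: tag_category_nums = [3, 5, 2] gives
        # start_idx = [0, 3, 8], i.e. category i occupies the slice
        # [start_idx[i], start_idx[i] + tag_category_nums[i]) of the flat
        # num_tags-dim label/prediction vector.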
self.start_index = kwargs.pop('start_index', 0)
self.dataset_type = None
super().__init__(
ann_file, pipeline, start_index=self.start_index, **kwargs)

    def load_annotations(self):
"""Load annotation file to get video information."""
assert self.ann_file.endswith('.json')
return self.load_json_annotations()

    def load_json_annotations(self):
        """Load json annotation file to get video information."""
video_infos = mmcv.load(self.ann_file)
num_videos = len(video_infos)
video_info0 = video_infos[0]
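        # Each sample must carry exactly one of 'filename' (video dataset)
        # or 'frame_dir' (rawframe dataset); the XOR-style assert below
        # checks this on the first sample and determines the dataset type.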
assert ('filename' in video_info0) != ('frame_dir' in video_info0)
path_key = 'filename' if 'filename' in video_info0 else 'frame_dir'
self.dataset_type = 'video' if path_key == 'filename' else 'rawframe'
if self.dataset_type == 'rawframe':
assert self.filename_tmpl is not None
for i in range(num_videos):
path_value = video_infos[i][path_key]
if self.data_prefix is not None:
path_value = osp.join(self.data_prefix, path_value)
video_infos[i][path_key] = path_value
# We will convert label to torch tensors in the pipeline
video_infos[i]['categories'] = self.tag_categories
video_infos[i]['category_nums'] = self.tag_category_nums
if self.dataset_type == 'rawframe':
video_infos[i]['filename_tmpl'] = self.filename_tmpl
video_infos[i]['start_index'] = self.start_index
video_infos[i]['modality'] = self.modality
return video_infos

    @staticmethod
def label2array(num, label):
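        """Convert a list of tag indices to a one-hot style float array.

        For example, ``label2array(5, [1, 3])`` returns
        ``array([0., 1., 0., 1., 0.], dtype=float32)``.
        """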
arr = np.zeros(num, dtype=np.float32)
arr[label] = 1.
return arr

    def evaluate(self,
results,
metrics='mean_average_precision',
metric_options=None,
logger=None):
"""Evaluation in HVU Video Dataset. We only support evaluating mAP for
each tag categories. Since some tag categories are missing for some
videos, we can not evaluate mAP for all tags.
Args:
results (list): Output results.
metrics (str | sequence[str]): Metrics to be performed.
Defaults: 'mean_average_precision'.
metric_options (dict | None): Dict for metric options.
Default: None.
logger (logging.Logger | None): Logger for recording.
Default: None.
Returns:
dict: Evaluation results dict.
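
        Example of the returned dict (illustrative values; one
        ``'<category>_mAP'`` key per tag category):

        .. code-block:: python

            OrderedDict([('action_mAP', 0.55), ('scene_mAP', 0.60)])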
"""
        # Deep-copy ``metric_options`` so that the caller's dict cannot be
        # modified in place during evaluation
metric_options = copy.deepcopy(metric_options)
if not isinstance(results, list):
raise TypeError(f'results must be a list, but got {type(results)}')
assert len(results) == len(self), (
f'The length of results is not equal to the dataset len: '
f'{len(results)} != {len(self)}')
metrics = metrics if isinstance(metrics, (list, tuple)) else [metrics]
# There should be only one metric in the metrics list:
# 'mean_average_precision'
assert len(metrics) == 1
metric = metrics[0]
assert metric == 'mean_average_precision'
gt_labels = [ann['label'] for ann in self.video_infos]
eval_results = OrderedDict()
for category in self.tag_categories:
start_idx = self.category2startidx[category]
num = self.category2num[category]
preds = [
result[start_idx:start_idx + num]
for video_idx, result in enumerate(results)
if category in gt_labels[video_idx]
]
gts = [
gt_label[category] for gt_label in gt_labels
if category in gt_label
]
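            # ``preds`` and ``gts`` stay aligned: both list comprehensions
            # keep only the videos annotated with this category, in the
            # same order.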
gts = [self.label2array(num, item) for item in gts]
mAP = mean_average_precision(preds, gts)
eval_results[f'{category}_mAP'] = mAP
log_msg = f'\n{category}_mAP\t{mAP:.4f}'
print_log(log_msg, logger=logger)
return eval_results