--- a +++ b/tools/analysis/check_videos.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import warnings +from functools import partial +from multiprocessing import Manager, Pool, cpu_count + +import mmcv +import numpy as np +from mmcv import Config, DictAction + +from mmaction.datasets import PIPELINES, build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 check datasets') + parser.add_argument('config', help='test config file path') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + default={}, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function (deprecate), ' + 'change to --eval-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--output-file', + default='invalid-video.txt', + help='Output file path which keeps corrupted/missing video file paths') + parser.add_argument( + '--split', + default='train', + choices=['train', 'val', 'test'], + help='Dataset split') + parser.add_argument( + '--decoder', + default='decord', + choices=['decord', 'opencv', 'pyav'], + help='Video decoder type, should be one of [decord, opencv, pyav]') + parser.add_argument( + '--num-processes', + type=int, + default=(cpu_count() - 1 or 1), + help='Number of processes to check videos') + parser.add_argument( + '--remove-corrupted-videos', + action='store_true', + help='Whether to delete all corrupted videos') + args = parser.parse_args() + + if args.options and args.eval_options: + raise ValueError( + '--options and --eval-options cannot be both ' + 'specified, --options is deprecated in favor of --eval-options') + if args.options: + warnings.warn('--options is deprecated in favor of --eval-options') + args.eval_options = args.options + return args + + +@PIPELINES.register_module() +class RandomSampleFrames: + + def __call__(self, results): + """Select frames to verify. + + Select the first, last and three random frames, Required key is + "total_frames", added or modified key is "frame_inds". + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + assert results['total_frames'] > 0 + + # first and last frames + results['frame_inds'] = np.array([0, results['total_frames'] - 1]) + + # choose 3 random frames + if results['total_frames'] > 2: + results['frame_inds'] = np.concatenate([ + results['frame_inds'], + np.random.randint(1, results['total_frames'] - 1, 3) + ]) + + return results + + +def _do_check_videos(lock, dataset, output_file, idx): + try: + dataset[idx] + except: # noqa + # save invalid video path to output file + lock.acquire() + with open(output_file, 'a') as f: + f.write(dataset.video_infos[idx]['filename'] + '\n') + lock.release() + + +if __name__ == '__main__': + args = parse_args() + + decoder_to_pipeline_prefix = dict( + decord='Decord', opencv='OpenCV', pyav='PyAV') + + # read config file + cfg = Config.fromfile(args.config) + cfg.merge_from_dict(args.cfg_options) + + # build dataset + dataset_type = cfg.data[args.split].type + assert dataset_type == 'VideoDataset' + cfg.data[args.split].pipeline = [ + dict(type=decoder_to_pipeline_prefix[args.decoder] + 'Init'), + dict(type='RandomSampleFrames'), + dict(type=decoder_to_pipeline_prefix[args.decoder] + 'Decode') + ] + dataset = build_dataset(cfg.data[args.split], + dict(test_mode=(args.split != 'train'))) + + # prepare for checking + if os.path.exists(args.output_file): + # remove existing output file + os.remove(args.output_file) + pool = Pool(args.num_processes) + lock = Manager().Lock() + worker_fn = partial(_do_check_videos, lock, dataset, args.output_file) + ids = range(len(dataset)) + + # start checking + prog_bar = mmcv.ProgressBar(len(dataset)) + for _ in pool.imap_unordered(worker_fn, ids): + prog_bar.update() + pool.close() + pool.join() + + if os.path.exists(args.output_file): + num_lines = sum(1 for _ in open(args.output_file)) + print(f'Checked {len(dataset)} videos, ' + f'{num_lines} are corrupted/missing.') + if args.remove_corrupted_videos: + print('Start deleting corrupted videos') + cnt = 0 + with open(args.output_file, 'r') as f: + for line in f: + if os.path.exists(line.strip()): + os.remove(line.strip()) + cnt += 1 + print(f'Deleted {cnt} corrupted videos.') + else: + print(f'Checked {len(dataset)} videos, none are corrupted/missing')