--- /dev/null
+++ b/tools/analysis/check_videos.py
@@ -0,0 +1,179 @@
+# Copyright (c) OpenMMLab. All rights reserved.
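+# Example usage (${CONFIG_FILE} is a placeholder for a VideoDataset config):
+#   python tools/analysis/check_videos.py ${CONFIG_FILE} --split train \
+#       --decoder decord --num-processes 8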
+import argparse
+import os
+import warnings
+from functools import partial
+from multiprocessing import Manager, Pool, cpu_count
+
+import mmcv
+import numpy as np
+from mmcv import Config, DictAction
+
+from mmaction.datasets import PIPELINES, build_dataset
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMAction2 check datasets')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument(
+        '--options',
+        nargs='+',
+        action=DictAction,
+        default={},
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for the dataset.evaluate() function '
+        '(deprecated), change to --eval-options instead.')
+    parser.add_argument(
+        '--eval-options',
+        nargs='+',
+        action=DictAction,
+        default={},
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for the dataset.evaluate() function')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        default={},
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into the config file. For example, '
+        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
+    parser.add_argument(
+        '--output-file',
+        default='invalid-video.txt',
+        help='Output file path for recording corrupted/missing video paths')
+    parser.add_argument(
+        '--split',
+        default='train',
+        choices=['train', 'val', 'test'],
+        help='Dataset split')
+    parser.add_argument(
+        '--decoder',
+        default='decord',
+        choices=['decord', 'opencv', 'pyav'],
+        help='Video decoder type, one of [decord, opencv, pyav]')
+    parser.add_argument(
+        '--num-processes',
+        type=int,
+        default=(cpu_count() - 1 or 1),
+        help='Number of processes to check videos')
+    parser.add_argument(
+        '--remove-corrupted-videos',
+        action='store_true',
+        help='Whether to delete all corrupted videos')
+    args = parser.parse_args()
+
+    if args.options and args.eval_options:
+        raise ValueError(
+            '--options and --eval-options cannot both be specified, '
+            '--options is deprecated in favor of --eval-options')
+    if args.options:
+        warnings.warn('--options is deprecated in favor of --eval-options')
+        args.eval_options = args.options
+    return args
+
+
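+# Registered as a pipeline transform so it can be referenced by name
+# ('RandomSampleFrames') in the check pipeline assembled below.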
+@PIPELINES.register_module()
+class RandomSampleFrames:
+
+    def __call__(self, results):
+        """Select frames to verify.
+
+        Select the first, last and three random frames, Required key is
+        "total_frames", added or modified key is "frame_inds".
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in pipeline.
+        """
+        assert results['total_frames'] > 0
+
+        # first and last frames
+        results['frame_inds'] = np.array([0, results['total_frames'] - 1])
+
+        # choose 3 random frames
+        if results['total_frames'] > 2:
+            results['frame_inds'] = np.concatenate([
+                results['frame_inds'],
+                np.random.randint(1, results['total_frames'] - 1, 3)
+            ])
+
+        return results
+
+
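+# Indexing the dataset runs the whole pipeline (init -> sample -> decode),
+# so any exception raised here indicates a corrupted or missing video file.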
+def _do_check_videos(lock, dataset, output_file, idx):
+    try:
+        dataset[idx]
+    except Exception:
+        # save the invalid video path to the output file
+        with lock, open(output_file, 'a') as f:
+            f.write(dataset.video_infos[idx]['filename'] + '\n')
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
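+    # Map each decoder choice to the prefix of its pipeline transforms,
+    # e.g. 'decord' -> 'DecordInit' / 'DecordDecode'.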
+    decoder_to_pipeline_prefix = dict(
+        decord='Decord', opencv='OpenCV', pyav='PyAV')
+
+    # read config file
+    cfg = Config.fromfile(args.config)
+    cfg.merge_from_dict(args.cfg_options)
+
+    # build dataset
+    dataset_type = cfg.data[args.split].type
+    assert dataset_type == 'VideoDataset', (
+        f'This tool only supports VideoDataset, got {dataset_type}')
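+    # Replace the configured pipeline with a minimal init -> sample -> decode
+    # chain so that checking only exercises video decoding.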
+    cfg.data[args.split].pipeline = [
+        dict(type=decoder_to_pipeline_prefix[args.decoder] + 'Init'),
+        dict(type='RandomSampleFrames'),
+        dict(type=decoder_to_pipeline_prefix[args.decoder] + 'Decode')
+    ]
+    dataset = build_dataset(cfg.data[args.split],
+                            dict(test_mode=(args.split != 'train')))
+
+    # prepare for checking
+    if os.path.exists(args.output_file):
+        # remove existing output file
+        os.remove(args.output_file)
+    pool = Pool(args.num_processes)
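+    # a Manager-backed lock can be shared with pool workers (a plain
+    # multiprocessing.Lock cannot be passed to pool tasks directly)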
+    lock = Manager().Lock()
+    worker_fn = partial(_do_check_videos, lock, dataset, args.output_file)
+    ids = range(len(dataset))
+
+    # start checking
+    prog_bar = mmcv.ProgressBar(len(dataset))
+    for _ in pool.imap_unordered(worker_fn, ids):
+        prog_bar.update()
+    pool.close()
+    pool.join()
+
+    if os.path.exists(args.output_file):
+        with open(args.output_file) as f:
+            num_lines = sum(1 for _ in f)
+        print(f'Checked {len(dataset)} videos, '
+              f'{num_lines} are corrupted/missing.')
+        if args.remove_corrupted_videos:
+            print('Start deleting corrupted videos')
+            cnt = 0
+            with open(args.output_file, 'r') as f:
+                for line in f:
+                    if os.path.exists(line.strip()):
+                        os.remove(line.strip())
+                        cnt += 1
+            print(f'Deleted {cnt} corrupted videos.')
+    else:
+        print(f'Checked {len(dataset)} videos, none are corrupted/missing.')