--- a
+++ b/tests/test_models/test_gradcam.py
@@ -0,0 +1,243 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmaction.models import build_recognizer
+from mmaction.utils.gradcam_utils import GradCAM
+from .base import generate_gradcam_inputs, get_recognizer_cfg
+
+
+def _get_target_shapes(input_shape, num_classes=400, model_type='2D'):
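+    """Compute the expected GradCAM output shapes for a given input shape.
+
+    Returns the target shapes of the blended images and of the class
+    predictions for a '2D' or '3D' recognizer."""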
+    if model_type not in ['2D', '3D']:
+        raise ValueError(f'Model type {model_type} is not supported')
+
+    preds_target_shape = (input_shape[0], num_classes)
+    if model_type == '3D':
+        # input shape (batch_size, num_crops*num_clips, C, clip_len, H, W)
+        # target shape (batch_size*num_crops*num_clips, clip_len, H, W, C)
+        blended_imgs_target_shape = (input_shape[0] * input_shape[1],
+                                     input_shape[3], input_shape[4],
+                                     input_shape[5], input_shape[2])
+    else:
+        # input shape (batch_size, num_segments, C, H, W)
+        # target shape (batch_size, num_segments, H, W, C)
+        blended_imgs_target_shape = (input_shape[0], input_shape[1],
+                                     input_shape[3], input_shape[4],
+                                     input_shape[2])
+
+    return blended_imgs_target_shape, preds_target_shape
+
+
+def _do_test_2D_models(recognizer,
+                       target_layer_name,
+                       input_shape,
+                       num_classes=400,
+                       device='cpu'):
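+    """Run GradCAM on a 2D recognizer and check its output shapes.
+
+    GradCAM is called twice, with default arguments and with the second
+    positional argument set to ``True``; in both cases the blended images
+    and class predictions must match the expected shapes."""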
+    demo_inputs = generate_gradcam_inputs(input_shape)
+    demo_inputs['imgs'] = demo_inputs['imgs'].to(device)
+    demo_inputs['label'] = demo_inputs['label'].to(device)
+
+    recognizer = recognizer.to(device)
+    gradcam = GradCAM(recognizer, target_layer_name)
+
+    blended_imgs_target_shape, preds_target_shape = _get_target_shapes(
+        input_shape, num_classes=num_classes, model_type='2D')
+
+    blended_imgs, preds = gradcam(demo_inputs)
+    assert blended_imgs.size() == blended_imgs_target_shape
+    assert preds.size() == preds_target_shape
+
+    blended_imgs, preds = gradcam(demo_inputs, True)
+    assert blended_imgs.size() == blended_imgs_target_shape
+    assert preds.size() == preds_target_shape
+
+
+def _do_test_3D_models(recognizer,
+                       target_layer_name,
+                       input_shape,
+                       num_classes=400):
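+    """Run GradCAM on a 3D recognizer and check its output shapes.
+
+    On the parrots build the check only runs when CUDA is available,
+    since parrots implements 3D conv on GPU only."""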
+    blended_imgs_target_shape, preds_target_shape = _get_target_shapes(
+        input_shape, num_classes=num_classes, model_type='3D')
+    demo_inputs = generate_gradcam_inputs(input_shape, '3D')
+
+    # parrots 3dconv is only implemented on gpu
+    if torch.__version__ == 'parrots':
+        if torch.cuda.is_available():
+            recognizer = recognizer.cuda()
+            demo_inputs['imgs'] = demo_inputs['imgs'].cuda()
+            demo_inputs['label'] = demo_inputs['label'].cuda()
+            gradcam = GradCAM(recognizer, target_layer_name)
+
+            blended_imgs, preds = gradcam(demo_inputs)
+            assert blended_imgs.size() == blended_imgs_target_shape
+            assert preds.size() == preds_target_shape
+
+            blended_imgs, preds = gradcam(demo_inputs, True)
+            assert blended_imgs.size() == blended_imgs_target_shape
+            assert preds.size() == preds_target_shape
+    else:
+        gradcam = GradCAM(recognizer, target_layer_name)
+
+        blended_imgs, preds = gradcam(demo_inputs)
+        assert blended_imgs.size() == blended_imgs_target_shape
+        assert preds.size() == preds_target_shape
+
+        blended_imgs, preds = gradcam(demo_inputs, True)
+        assert blended_imgs.size() == blended_imgs_target_shape
+        assert preds.size() == preds_target_shape
+
+
+def test_tsn():
+    config = get_recognizer_cfg('tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py')
+    config.model['backbone']['pretrained'] = None
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+
+    input_shape = (1, 25, 3, 32, 32)
+    target_layer_name = 'backbone/layer4/1/relu'
+
+    _do_test_2D_models(recognizer, target_layer_name, input_shape)
+
+
+def test_i3d():
+    config = get_recognizer_cfg('i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py')
+    config.model['backbone']['pretrained2d'] = False
+    config.model['backbone']['pretrained'] = None
+
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+
+    input_shape = (1, 1, 3, 32, 32, 32)
+    target_layer_name = 'backbone/layer4/1/relu'
+
+    _do_test_3D_models(recognizer, target_layer_name, input_shape)
+
+
+def test_r2plus1d():
+    config = get_recognizer_cfg(
+        'r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py')
+    config.model['backbone']['pretrained2d'] = False
+    config.model['backbone']['pretrained'] = None
+    config.model['backbone']['norm_cfg'] = dict(type='BN3d')
+
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+
+    input_shape = (1, 3, 3, 8, 32, 32)
+    target_layer_name = 'backbone/layer4/1/relu'
+
+    _do_test_3D_models(recognizer, target_layer_name, input_shape)
+
+
+def test_slowfast():
+    config = get_recognizer_cfg(
+        'slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py')
+
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+
+    input_shape = (1, 1, 3, 32, 32, 32)
+    target_layer_name = 'backbone/slow_path/layer4/1/relu'
+
+    _do_test_3D_models(recognizer, target_layer_name, input_shape)
+
+
+def test_tsm():
+    config = get_recognizer_cfg('tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py')
+    config.model['backbone']['pretrained'] = None
+    target_layer_name = 'backbone/layer4/1/relu'
+
+    # base config
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+    input_shape = (1, 8, 3, 32, 32)
+    _do_test_2D_models(recognizer, target_layer_name, input_shape)
+
+    # test twice sample + 3 crops: 2 clips * 3 crops * 8 segments = 48
+    config.model.test_cfg = dict(average_clips='prob')
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+    input_shape = (1, 48, 3, 32, 32)
+    _do_test_2D_models(recognizer, target_layer_name, input_shape)
+
+
+def test_csn():
+    config = get_recognizer_cfg(
+        'csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py')
+    config.model['backbone']['pretrained2d'] = False
+    config.model['backbone']['pretrained'] = None
+
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+    input_shape = (1, 1, 3, 32, 32, 32)
+    target_layer_name = 'backbone/layer4/1/relu'
+
+    _do_test_3D_models(recognizer, target_layer_name, input_shape)
+
+
+def test_tpn():
+    target_layer_name = 'backbone/layer4/1/relu'
+
+    config = get_recognizer_cfg('tpn/tpn_tsm_r50_1x1x8_150e_sthv1_rgb.py')
+    config.model['backbone']['pretrained'] = None
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+
+    input_shape = (1, 8, 3, 32, 32)
+    _do_test_2D_models(recognizer, target_layer_name, input_shape, 174)
+
+    config = get_recognizer_cfg(
+        'tpn/tpn_slowonly_r50_8x8x1_150e_kinetics_rgb.py')
+    config.model['backbone']['pretrained'] = None
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+    input_shape = (1, 3, 3, 8, 32, 32)
+    _do_test_3D_models(recognizer, target_layer_name, input_shape)
+
+
+def test_c3d():
+    config = get_recognizer_cfg('c3d/c3d_sports1m_16x1x1_45e_ucf101_rgb.py')
+    config.model['backbone']['pretrained'] = None
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+    input_shape = (1, 1, 3, 16, 112, 112)
+    target_layer_name = 'backbone/conv5a/activate'
+    _do_test_3D_models(recognizer, target_layer_name, input_shape, 101)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available(), reason='requires CUDA support')
+def test_tin():
+    config = get_recognizer_cfg(
+        'tin/tin_tsm_finetune_r50_1x1x8_50e_kinetics400_rgb.py')
+    config.model['backbone']['pretrained'] = None
+    target_layer_name = 'backbone/layer4/1/relu'
+
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+    input_shape = (1, 8, 3, 64, 64)
+    _do_test_2D_models(
+        recognizer, target_layer_name, input_shape, device='cuda:0')
+
+
+def test_x3d():
+    config = get_recognizer_cfg('x3d/x3d_s_13x6x1_facebook_kinetics400_rgb.py')
+    config.model['backbone']['pretrained'] = None
+    recognizer = build_recognizer(config.model)
+    recognizer.cfg = config
+    input_shape = (1, 1, 3, 13, 32, 32)
+    target_layer_name = 'backbone/layer4/1/relu'
+    _do_test_3D_models(recognizer, target_layer_name, input_shape)