--- /dev/null
+++ b/tests/test_metrics.py
@@ -0,0 +1,350 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+from mmseg.core.evaluation import (eval_metrics, mean_dice, mean_fscore,
+                                   mean_iou)
+from mmseg.core.evaluation.metrics import f_score
+
+
+def get_confusion_matrix(pred_label, label, num_classes, ignore_index):
+    """Intersection over Union
+       Args:
+           pred_label (np.ndarray): 2D predict map
+           label (np.ndarray): label 2D label map
+           num_classes (int): number of categories
+           ignore_index (int): index ignore in evaluation
+       """
+
+    mask = (label != ignore_index)
+    pred_label = pred_label[mask]
+    label = label[mask]
+
+    n = num_classes
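+    # Encode each (gt, pred) pair as a unique flat index: gt * n + pred.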
+    inds = n * label + pred_label
+
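+    # Count the occurrences of each flat index and reshape into an n x n
+    # confusion matrix: rows are ground-truth classes, columns predictions.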
+    mat = np.bincount(inds, minlength=n**2).reshape(n, n)
+
+    return mat
+
+
+# Legacy reference implementation; deprecated because it is not memory
+# efficient.
+def legacy_mean_iou(results, gt_seg_maps, num_classes, ignore_index):
+    num_imgs = len(results)
+    assert len(gt_seg_maps) == num_imgs
+    total_mat = np.zeros((num_classes, num_classes), dtype=np.float64)
+    for i in range(num_imgs):
+        mat = get_confusion_matrix(
+            results[i], gt_seg_maps[i], num_classes, ignore_index=ignore_index)
+        total_mat += mat
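+    # The diagonal holds per-class true positives; row sums are ground-truth
+    # pixel counts and column sums are predicted pixel counts, so
+    # IoU = TP / (GT + pred - TP).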
+    all_acc = np.diag(total_mat).sum() / total_mat.sum()
+    acc = np.diag(total_mat) / total_mat.sum(axis=1)
+    iou = np.diag(total_mat) / (
+        total_mat.sum(axis=1) + total_mat.sum(axis=0) - np.diag(total_mat))
+
+    return all_acc, acc, iou
+
+
+# Legacy reference implementation; deprecated because it is not memory
+# efficient.
+def legacy_mean_dice(results, gt_seg_maps, num_classes, ignore_index):
+    num_imgs = len(results)
+    assert len(gt_seg_maps) == num_imgs
+    total_mat = np.zeros((num_classes, num_classes), dtype=np.float64)
+    for i in range(num_imgs):
+        mat = get_confusion_matrix(
+            results[i], gt_seg_maps[i], num_classes, ignore_index=ignore_index)
+        total_mat += mat
+    all_acc = np.diag(total_mat).sum() / total_mat.sum()
+    acc = np.diag(total_mat) / total_mat.sum(axis=1)
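+    # Dice = 2 * TP / (GT + pred): twice the intersection over the sum of
+    # the ground-truth and predicted class sizes.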
+    dice = 2 * np.diag(total_mat) / (
+        total_mat.sum(axis=1) + total_mat.sum(axis=0))
+
+    return all_acc, acc, dice
+
+
+# Legacy reference implementation; deprecated because it is not memory
+# efficient.
+def legacy_mean_fscore(results,
+                       gt_seg_maps,
+                       num_classes,
+                       ignore_index,
+                       beta=1):
+    num_imgs = len(results)
+    assert len(gt_seg_maps) == num_imgs
+    total_mat = np.zeros((num_classes, num_classes), dtype=np.float64)
+    for i in range(num_imgs):
+        mat = get_confusion_matrix(
+            results[i], gt_seg_maps[i], num_classes, ignore_index=ignore_index)
+        total_mat += mat
+    all_acc = np.diag(total_mat).sum() / total_mat.sum()
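+    # Recall = TP / GT (row sums); precision = TP / pred (column sums).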
+    recall = np.diag(total_mat) / total_mat.sum(axis=1)
+    precision = np.diag(total_mat) / total_mat.sum(axis=0)
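+    # f_score computes the standard F-beta score:
+    # (1 + beta^2) * precision * recall / (beta^2 * precision + recall)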
+    fv = np.vectorize(f_score)
+    fscore = fv(precision, recall, beta=beta)
+
+    return all_acc, recall, precision, fscore
+
+
+def test_metrics():
+    pred_size = (10, 30, 30)
+    num_classes = 19
+    ignore_index = 255
+    results = np.random.randint(0, num_classes, size=pred_size)
+    label = np.random.randint(0, num_classes, size=pred_size)
+
+    # Set some pixels to ignore_index to test that it is respected.
+    label[:, 2, 5:10] = ignore_index
+
+    # Test the correctness of the implementation of mIoU calculation.
+    ret_metrics = eval_metrics(
+        results, label, num_classes, ignore_index, metrics='mIoU')
+    all_acc, acc, iou = ret_metrics['aAcc'], ret_metrics['Acc'], ret_metrics[
+        'IoU']
+    all_acc_l, acc_l, iou_l = legacy_mean_iou(results, label, num_classes,
+                                              ignore_index)
+    assert all_acc == all_acc_l
+    assert np.allclose(acc, acc_l)
+    assert np.allclose(iou, iou_l)
+    # Test the correctness of the implementation of mDice calculation.
+    ret_metrics = eval_metrics(
+        results, label, num_classes, ignore_index, metrics='mDice')
+    all_acc, acc, dice = ret_metrics['aAcc'], ret_metrics['Acc'], ret_metrics[
+        'Dice']
+    all_acc_l, acc_l, dice_l = legacy_mean_dice(results, label, num_classes,
+                                                ignore_index)
+    assert all_acc == all_acc_l
+    assert np.allclose(acc, acc_l)
+    assert np.allclose(dice, dice_l)
+    # Test the correctness of the implementation of mFscore calculation.
+    ret_metrics = eval_metrics(
+        results, label, num_classes, ignore_index, metrics='mFscore')
+    all_acc, recall, precision, fscore = ret_metrics['aAcc'], ret_metrics[
+        'Recall'], ret_metrics['Precision'], ret_metrics['Fscore']
+    all_acc_l, recall_l, precision_l, fscore_l = legacy_mean_fscore(
+        results, label, num_classes, ignore_index)
+    assert all_acc == all_acc_l
+    assert np.allclose(recall, recall_l)
+    assert np.allclose(precision, precision_l)
+    assert np.allclose(fscore, fscore_l)
+    # Test the correctness of the implementation of joint calculation.
+    ret_metrics = eval_metrics(
+        results,
+        label,
+        num_classes,
+        ignore_index,
+        metrics=['mIoU', 'mDice', 'mFscore'])
+    all_acc, acc, iou, dice, precision, recall, fscore = ret_metrics[
+        'aAcc'], ret_metrics['Acc'], ret_metrics['IoU'], ret_metrics[
+            'Dice'], ret_metrics['Precision'], ret_metrics[
+                'Recall'], ret_metrics['Fscore']
+    assert all_acc == all_acc_l
+    assert np.allclose(acc, acc_l)
+    assert np.allclose(iou, iou_l)
+    assert np.allclose(dice, dice_l)
+    assert np.allclose(precision, precision_l)
+    assert np.allclose(recall, recall_l)
+    assert np.allclose(fscore, fscore_l)
+
+    # Test the calculation when num_classes is larger than the maximum
+    # class index in the input maps: classes that never appear have nan
+    # statistics, which nan_to_num maps to -1.
+    results = np.random.randint(0, 5, size=pred_size)
+    label = np.random.randint(0, 4, size=pred_size)
+    ret_metrics = eval_metrics(
+        results,
+        label,
+        num_classes,
+        ignore_index=255,
+        metrics='mIoU',
+        nan_to_num=-1)
+    all_acc, acc, iou = ret_metrics['aAcc'], ret_metrics['Acc'], ret_metrics[
+        'IoU']
+    assert acc[-1] == -1
+    assert iou[-1] == -1
+
+    ret_metrics = eval_metrics(
+        results,
+        label,
+        num_classes,
+        ignore_index=255,
+        metrics='mDice',
+        nan_to_num=-1)
+    all_acc, acc, dice = ret_metrics['aAcc'], ret_metrics['Acc'], ret_metrics[
+        'Dice']
+    assert acc[-1] == -1
+    assert dice[-1] == -1
+
+    ret_metrics = eval_metrics(
+        results,
+        label,
+        num_classes,
+        ignore_index=255,
+        metrics='mFscore',
+        nan_to_num=-1)
+    all_acc, precision, recall, fscore = ret_metrics['aAcc'], ret_metrics[
+        'Precision'], ret_metrics['Recall'], ret_metrics['Fscore']
+    assert precision[-1] == -1
+    assert recall[-1] == -1
+    assert fscore[-1] == -1
+
+    ret_metrics = eval_metrics(
+        results,
+        label,
+        num_classes,
+        ignore_index=255,
+        metrics=['mDice', 'mIoU', 'mFscore'],
+        nan_to_num=-1)
+    all_acc, acc, iou, dice, precision, recall, fscore = ret_metrics[
+        'aAcc'], ret_metrics['Acc'], ret_metrics['IoU'], ret_metrics[
+            'Dice'], ret_metrics['Precision'], ret_metrics[
+                'Recall'], ret_metrics['Fscore']
+    assert acc[-1] == -1
+    assert dice[-1] == -1
+    assert iou[-1] == -1
+    assert precision[-1] == -1
+    assert recall[-1] == -1
+    assert fscore[-1] == -1
+
+    # Test the bug caused by torch.histc.
+    # torch.histc: https://pytorch.org/docs/stable/generated/torch.histc.html
+    # When the `bins` argument is set equal to `max`, some entries of the
+    # IoU may be nan.
+    results = np.array([np.repeat(31, 59)])
+    label = np.array([np.arange(59)])
+    num_classes = 59
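+    # Every prediction is class 31 while the labels cover all 59 classes,
+    # which used to trigger nan entries in the IoU.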
+    ret_metrics = eval_metrics(
+        results, label, num_classes, ignore_index=255, metrics='mIoU')
+    all_acc, acc, iou = ret_metrics['aAcc'], ret_metrics['Acc'], ret_metrics[
+        'IoU']
+    assert not np.any(np.isnan(iou))
+
+
+def test_mean_iou():
+    pred_size = (10, 30, 30)
+    num_classes = 19
+    ignore_index = 255
+    results = np.random.randint(0, num_classes, size=pred_size)
+    label = np.random.randint(0, num_classes, size=pred_size)
+    label[:, 2, 5:10] = ignore_index
+    ret_metrics = mean_iou(results, label, num_classes, ignore_index)
+    all_acc, acc, iou = ret_metrics['aAcc'], ret_metrics['Acc'], ret_metrics[
+        'IoU']
+    all_acc_l, acc_l, iou_l = legacy_mean_iou(results, label, num_classes,
+                                              ignore_index)
+    assert all_acc == all_acc_l
+    assert np.allclose(acc, acc_l)
+    assert np.allclose(iou, iou_l)
+
+    results = np.random.randint(0, 5, size=pred_size)
+    label = np.random.randint(0, 4, size=pred_size)
+    ret_metrics = mean_iou(
+        results, label, num_classes, ignore_index=255, nan_to_num=-1)
+    all_acc, acc, iou = ret_metrics['aAcc'], ret_metrics['Acc'], ret_metrics[
+        'IoU']
+    assert acc[-1] == -1
+    assert iou[-1] == -1
+
+
+def test_mean_dice():
+    pred_size = (10, 30, 30)
+    num_classes = 19
+    ignore_index = 255
+    results = np.random.randint(0, num_classes, size=pred_size)
+    label = np.random.randint(0, num_classes, size=pred_size)
+    label[:, 2, 5:10] = ignore_index
+    ret_metrics = mean_dice(results, label, num_classes, ignore_index)
+    all_acc, acc, dice = ret_metrics['aAcc'], ret_metrics['Acc'], ret_metrics[
+        'Dice']
+    all_acc_l, acc_l, dice_l = legacy_mean_dice(results, label, num_classes,
+                                                ignore_index)
+    assert all_acc == all_acc_l
+    assert np.allclose(acc, acc_l)
+    assert np.allclose(dice, dice_l)
+
+    results = np.random.randint(0, 5, size=pred_size)
+    label = np.random.randint(0, 4, size=pred_size)
+    ret_metrics = mean_dice(
+        results, label, num_classes, ignore_index=255, nan_to_num=-1)
+    all_acc, acc, dice = ret_metrics['aAcc'], ret_metrics['Acc'], ret_metrics[
+        'Dice']
+    assert acc[-1] == -1
+    assert dice[-1] == -1
+
+
+def test_mean_fscore():
+    pred_size = (10, 30, 30)
+    num_classes = 19
+    ignore_index = 255
+    results = np.random.randint(0, num_classes, size=pred_size)
+    label = np.random.randint(0, num_classes, size=pred_size)
+    label[:, 2, 5:10] = ignore_index
+    ret_metrics = mean_fscore(results, label, num_classes, ignore_index)
+    all_acc, recall, precision, fscore = ret_metrics['aAcc'], ret_metrics[
+        'Recall'], ret_metrics['Precision'], ret_metrics['Fscore']
+    all_acc_l, recall_l, precision_l, fscore_l = legacy_mean_fscore(
+        results, label, num_classes, ignore_index)
+    assert all_acc == all_acc_l
+    assert np.allclose(recall, recall_l)
+    assert np.allclose(precision, precision_l)
+    assert np.allclose(fscore, fscore_l)
+
+    ret_metrics = mean_fscore(
+        results, label, num_classes, ignore_index, beta=2)
+    all_acc, recall, precision, fscore = ret_metrics['aAcc'], ret_metrics[
+        'Recall'], ret_metrics['Precision'], ret_metrics['Fscore']
+    all_acc_l, recall_l, precision_l, fscore_l = legacy_mean_fscore(
+        results, label, num_classes, ignore_index, beta=2)
+    assert all_acc == all_acc_l
+    assert np.allclose(recall, recall_l)
+    assert np.allclose(precision, precision_l)
+    assert np.allclose(fscore, fscore_l)
+
+    results = np.random.randint(0, 5, size=pred_size)
+    label = np.random.randint(0, 4, size=pred_size)
+    ret_metrics = mean_fscore(
+        results, label, num_classes, ignore_index=255, nan_to_num=-1)
+    all_acc, recall, precision, fscore = ret_metrics['aAcc'], ret_metrics[
+        'Recall'], ret_metrics['Precision'], ret_metrics['Fscore']
+    assert recall[-1] == -1
+    assert precision[-1] == -1
+    assert fscore[-1] == -1
+
+
+def test_filename_inputs():
+    import tempfile
+
+    import cv2
+
+    def save_arr(input_arrays: list, title: str, is_image: bool,
+                 save_dir: str):
+        filenames = []
+        SUFFIX = '.png' if is_image else '.npy'
+        for idx, arr in enumerate(input_arrays):
+            filename = '{}/{}-{}{}'.format(save_dir, title, idx, SUFFIX)
+            if is_image:
+                cv2.imwrite(filename, arr)
+            else:
+                np.save(filename, arr)
+            filenames.append(filename)
+        return filenames
+
+    pred_size = (10, 30, 30)
+    num_classes = 19
+    ignore_index = 255
+    results = np.random.randint(0, num_classes, size=pred_size)
+    labels = np.random.randint(0, num_classes, size=pred_size)
+    labels[:, 2, 5:10] = ignore_index
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+
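+        # Save predictions as .npy arrays and labels as .png images to
+        # exercise both file-input paths of eval_metrics.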
+        result_files = save_arr(results, 'pred', False, temp_dir)
+        label_files = save_arr(labels, 'label', True, temp_dir)
+
+        ret_metrics = eval_metrics(
+            result_files,
+            label_files,
+            num_classes,
+            ignore_index,
+            metrics='mIoU')
+        all_acc, acc, iou = ret_metrics['aAcc'], ret_metrics[
+            'Acc'], ret_metrics['IoU']
+        all_acc_l, acc_l, iou_l = legacy_mean_iou(results, labels, num_classes,
+                                                  ignore_index)
+        assert all_acc == all_acc_l
+        assert np.allclose(acc, acc_l)
+        assert np.allclose(iou, iou_l)