In [42]:
import json
import numpy as np
import os
import sys
sys.path.append("../../../ecg")

import load
import util

def fleiss_kappa(ratings):
    """
    Chance-corrected agreement among a fixed number of reviewers.

    Args:
        ratings: An N x R numpy array, with one row per sample and
            one column per reviewer. Entry (n, r) holds the
            category reviewer r assigned to sample n.
    Returns:
        Fleiss' kappa, (P_bar - P_e) / (1 - P_e), where P_bar is the
        mean per-sample agreement and P_e the agreement expected by
        chance. If every rating is the same category then P_e is 1
        and the ratio evaluates 0 / 0.
    https://en.wikipedia.org/wiki/Fleiss%27_kappa
    """
    n_samples, n_raters = ratings.shape
    n_ratings = n_samples * n_raters

    # Each sample starts at -R so that, once the squared per-category
    # counts are added, it holds sum_j(n_ij ** 2) - R.
    agreement = -np.full(n_samples, n_raters)
    chance = 0.0
    for label in set(ratings.ravel().tolist()):
        # Number of reviewers who chose `label`, per sample.
        votes = np.sum(ratings == label, axis=1)
        agreement += votes ** 2
        # Overall share of ratings that went to `label`.
        share = np.sum(votes) / float(n_ratings)
        chance += share ** 2

    observed = np.sum(agreement) / float(n_ratings * (n_raters - 1))
    return (observed - chance) / (1 - chance)

def average_pairwise_agreement(revs):
    """
    Fraction of pairwise comparisons on which two reviewers agree.

    Here, we use the same method as the diabetic retinopathy paper:
    the number of pair-wise agreements over the total number of
    pairwise comparisons, pooled over every unordered pair of
    reviewers.

    Args:
        revs: A sequence of array-likes (numpy arrays or nested
            lists), one per reviewer, all of the same shape. Each
            holds the categories that reviewer assigned.
    Returns:
        The agreement rate as a float between 0 and 1.
    Raises:
        ValueError: If fewer than two reviewers are given, if the
            reviewers' ratings differ in shape, or if there are no
            ratings to compare.
    """
    # Coerce up front so plain lists are compared element-wise rather
    # than as whole objects.
    ratings = [np.asarray(r) for r in revs]
    n_revs = len(ratings)
    if n_revs < 2:
        raise ValueError(
            "need at least two reviewers, got {}".format(n_revs))

    # Reject mismatched shapes explicitly: broadcasting would otherwise
    # compare them silently and miscount the number of comparisons.
    shape = ratings[0].shape
    for idx, r in enumerate(ratings):
        if r.shape != shape:
            raise ValueError(
                "reviewer {} has shape {}, expected {}".format(
                    idx, r.shape, shape))
    if ratings[0].size == 0:
        raise ValueError("no ratings to compare")

    corr = 0
    tot = 0
    for i in range(n_revs):
        for j in range(i + 1, n_revs):
            corr += np.sum(ratings[i] == ratings[j])
            tot += ratings[i].size
    return corr / float(tot)


In [10]:
# NOTE(review): absolute, machine-specific path; only its directory is
# used below, to load the saved preprocessor.
model_path = "/deep/group/awni/ecg_models/default/1527627404-9/0.337-0.880-012-0.255-0.906.hdf5"
preproc = util.load(os.path.dirname(model_path))

# Read the six reviewer files. Every line is a JSON object and we keep
# its 'labels' entry, giving one list of label sequences per reviewer.
raw_labels = []
for rev_id in range(6):
    rev_file = "../test_rev{}.json".format(rev_id)
    with open(rev_file, 'r') as fid:
        raw_labels.append([json.loads(line)['labels'] for line in fid])

# Encode each reviewer's labels with the preprocessor and take the
# argmax over axis 2.
# presumably process_y returns a one-hot (example, step, class) array,
# so each entry of revs holds class indices -- TODO confirm
revs = [np.argmax(preproc.process_y(labels), axis=2)
        for labels in raw_labels]


### Sequence-level agreements

In [66]:
# Agreement over every individual label position, first one-vs-rest for
# each class and then over all classes at once.
# print(...) with a single argument behaves the same on Python 2 and 3.
row_fmt = "{:<10}   {:.3f} \t\t {:.3f}"
print("\t     Fleiss' kappa  \t Avg Pairwise")
for class_idx, class_name in enumerate(preproc.classes):
    # True wherever a reviewer chose this class, flattened to 1-D.
    binary_revs = [np.reshape(r == class_idx, -1) for r in revs]
    print(row_fmt.format(
        class_name, fleiss_kappa(np.stack(binary_revs, axis=1)),
        average_pairwise_agreement(binary_revs)))
print("")
# One column per reviewer, one row per label position.
ratings = np.hstack([r.reshape(-1, 1) for r in revs])
print(row_fmt.format(
    "All", fleiss_kappa(ratings),
    average_pairwise_agreement(revs)))

	     Fleiss' kappa  	 Avg Pairwise
AF           0.613 		 0.904
AVB          0.707 		 0.950
BIGEMINY     0.796 		 0.989
EAR          0.407 		 0.977
IVR          0.535 		 0.978
JUNCTIONAL   0.610 		 0.956
NOISE        0.729 		 0.962
SINUS        0.678 		 0.841
SVT          0.398 		 0.961
TRIGEMINY    0.783 		 0.987
VT           0.500 		 0.992
WENCKEBACH   0.496 		 0.962

All          0.645 		 0.730


### Set-level agreements (can only be computed one rhythm at a time)

In [71]:
# Agreement on whether each class appears anywhere in a record,
# computed one-vs-rest for each class.
# print(...) with a single argument behaves the same on Python 2 and 3.
row_fmt = "{:<10}   {:.3f} \t\t {:.3f}"
print("\t     Fleiss' kappa  \t Avg Pairwise")
for class_idx, class_name in enumerate(preproc.classes):
    # True for a row if the reviewer used this class anywhere along axis 1.
    binary_revs = [np.any(r == class_idx, axis=1) for r in revs]
    print(row_fmt.format(
        class_name, fleiss_kappa(np.stack(binary_revs, axis=1)),
        average_pairwise_agreement(binary_revs)))

	     Fleiss' kappa  	 Avg Pairwise
AF           0.591 		 0.873
AVB          0.703 		 0.929
BIGEMINY     0.791 		 0.976
EAR          0.415 		 0.950
IVR          0.645 		 0.944
JUNCTIONAL   0.607 		 0.928
NOISE        0.625 		 0.924
SINUS        0.666 		 0.858
SVT          0.485 		 0.921
TRIGEMINY    0.732 		 0.971
VT           0.677 		 0.967
WENCKEBACH   0.609 		 0.947


### Confusions between "AFIB" and "AFL", and between "AVB type 2 second degree" and "AVB third degree"