utils/pycocoevalcap/rouge/rouge.py
#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam <vrama91@vt.edu>

import numpy as np
def my_lcs(string, sub):
    """
    Calculates the length of the longest common subsequence for a pair of tokenized strings
    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : shorter string, also split using whitespace
    :returns: length (int): length of the longest common subsequence between the two strings

    Note: my_lcs only gives the length of the longest common subsequence, not the actual LCS
    """
    # Make sure `string` is the longer of the two token lists
    if len(string) < len(sub):
        sub, string = string, sub

    # lengths[i][j] holds the LCS length of string[:i] and sub[:j]
    lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)]

    for j in range(1, len(sub) + 1):
        for i in range(1, len(string) + 1):
            if string[i - 1] == sub[j - 1]:
                lengths[i][j] = lengths[i - 1][j - 1] + 1
            else:
                lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])

    return lengths[len(string)][len(sub)]
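
# Illustrative usage (added for clarity, not part of the original module):
# my_lcs operates on token lists, so the whitespace-split sentences below
# share the two-token subsequence ["the", "cat"]:
#
#   >>> my_lcs("the cat sat on the mat".split(" "), "the cat ran".split(" "))
#   2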

class Rouge():
    '''
    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
    '''
    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovy
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image
        :param candidate: list of str : single-element list containing the candidate sentence to be evaluated
        :param refs: list of str : COCO reference sentences for the particular image to be evaluated
        :returns score: float (ROUGE-L score for the candidate evaluated against references)
        """
        assert(len(candidate) == 1)
        assert(len(refs) > 0)
        prec = []
        rec = []

        # split into tokens
        token_c = candidate[0].split(" ")

        for reference in refs:
            # split into tokens
            token_r = reference.split(" ")
            # compute the longest common subsequence
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs / float(len(token_c)))
            rec.append(lcs / float(len(token_r)))

        # take the best precision and recall over all references
        prec_max = max(prec)
        rec_max = max(rec)

        if prec_max != 0 and rec_max != 0:
            score = ((1 + self.beta**2) * prec_max * rec_max) / float(rec_max + self.beta**2 * prec_max)
        else:
            score = 0.0
        return score
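
    # Worked example (added for clarity, not part of the original module):
    # with beta = 1.2 the score is the weighted harmonic mean
    #     F = ((1 + beta**2) * P * R) / (R + beta**2 * P),
    # which weights recall more heavily than precision. For instance,
    # P = 0.5 and R = 0.75 give
    #     F = (2.44 * 0.5 * 0.75) / (0.75 + 1.44 * 0.5) = 0.915 / 1.47 ≈ 0.622.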

    def compute_score(self, gts, res):
        """
        Computes ROUGE-L score given a set of reference and candidate sentences for the dataset
        Invoked by evaluate_captions.py
        :param gts: dict : reference MS-COCO sentences, keyed by image id, each value a list of tokenized sentences
        :param res: dict : candidate / test sentences, keyed by image id, each value a single-element list of tokenized sentences
        :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
        """
        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        score = []
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check the inputs before scoring.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) > 0)

            score.append(self.calc_score(hypo, ref))

        average_score = np.mean(np.array(score))
        return average_score, np.array(score)

    def method(self):
        return "Rouge"
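

# Minimal usage sketch (added for illustration; the toy captions below are
# hypothetical, not from MS COCO). Both dicts are keyed by image id: `res`
# maps each id to a single-element list holding the candidate caption, and
# `gts` maps each id to the list of reference captions.
if __name__ == "__main__":
    gts = {"1": ["the cat sat on the mat", "a cat is sitting on a mat"]}
    res = {"1": ["the cat sat on a mat"]}
    rouge = Rouge()
    average_score, scores = rouge.compute_score(gts, res)
    print("ROUGE-L: %0.3f" % average_score)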