utils/pycocoevalcap/rouge/rouge.py
|
#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam <vrama91@vt.edu>

import numpy as np


def my_lcs(string, sub):
    """
    Calculates the longest common subsequence for a pair of tokenized strings
    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : shorter string, also split using whitespace
    :returns: length (int): length of the longest common subsequence between the two strings

    Note: my_lcs only gives the length of the longest common subsequence, not the actual LCS
    """
    if len(string) < len(sub):
        sub, string = string, sub

    # dynamic-programming table: lengths[i][j] holds the LCS length of
    # string[:i] and sub[:j]
    lengths = [[0 for _ in range(len(sub) + 1)] for _ in range(len(string) + 1)]

    for j in range(1, len(sub) + 1):
        for i in range(1, len(string) + 1):
            if string[i - 1] == sub[j - 1]:
                lengths[i][j] = lengths[i - 1][j - 1] + 1
            else:
                lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])

    return lengths[len(string)][len(sub)]
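
# Worked example (illustrative only, not part of the original file):
#   my_lcs("the cat sat on the mat".split(), "the cat is on a mat".split())
#   returns 4, matching the common subsequence ["the", "cat", "on", "mat"].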
|
|
class Rouge:
    '''
    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
    '''
    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovy
        self.beta = 1.2
|
    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image
        :param candidate: list of str : single-element list containing the candidate sentence to be evaluated
        :param refs: list of str : COCO reference sentences for the particular image to be evaluated
        :returns score: float (ROUGE-L score for the candidate evaluated against references)
        """
        assert len(candidate) == 1
        assert len(refs) > 0
        prec = []
        rec = []
|
|
        # split into tokens
        token_c = candidate[0].split(" ")

        for reference in refs:
            # split into tokens
            token_r = reference.split(" ")
            # compute the longest common subsequence
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs / float(len(token_c)))
            rec.append(lcs / float(len(token_r)))
|
|
        prec_max = max(prec)
        rec_max = max(rec)

        if prec_max != 0 and rec_max != 0:
            # F-measure with recall weighted by beta:
            # ((1 + beta^2) * P * R) / (R + beta^2 * P)
            score = ((1 + self.beta**2) * prec_max * rec_max) / float(rec_max + self.beta**2 * prec_max)
        else:
            score = 0.0
        return score
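
    # With beta = 1.2 the F-measure above favors recall: e.g. for
    # prec_max = 0.5 and rec_max = 1.0 it yields (2.44 * 0.5) / (1.0 + 0.72)
    # ~= 0.709, versus 0.667 for the unweighted harmonic mean (illustrative
    # numbers, not part of the original file).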
|
|
    def compute_score(self, gts, res):
        """
        Computes ROUGE-L score given a set of reference and candidate sentences for the dataset
        Invoked by evaluate_captions.py
        :param gts: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
        :param res: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
        :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
        """
        assert gts.keys() == res.keys()
        imgIds = gts.keys()

        score = []
        for img_id in imgIds:
            hypo = res[img_id]
            ref = gts[img_id]

            # Sanity check.
            assert type(hypo) is list
            assert len(hypo) == 1
            assert type(ref) is list
            assert len(ref) > 0

            score.append(self.calc_score(hypo, ref))

        average_score = np.mean(np.array(score))
        return average_score, np.array(score)

    def method(self):
        return "Rouge"