[1180c1]: / llava / eval / eval_metrics / evaluate_metrics.py

Download this file

149 lines (117 with data), 4.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import math
from .utils import *
from .glossary import *
def bleu(candidate, references, n, weights):
    """Return the BLEU score of ``candidate`` rendered as a string.

    The brevity penalty and the modified precisions for n-gram orders
    ``1..n`` are obtained from the ``brevity_penalty`` and
    ``modified_precision`` helpers; the final score is produced by
    ``calculate_bleu``.

    When ``len(weights)`` does not equal the number of precisions, the
    weights are either cut down to the first entries or padded with zeros,
    and a warning is appended to the returned string.

    Args:
        candidate: Candidate text, handed to the helpers unchanged.
        references: Reference text(s), handed to the helpers unchanged.
        n: Highest n-gram order to include.
        weights: Indexable, sized collection of per-order weights.

    Returns:
        ``str(score)``, followed by a parenthesised warning when the number
        of weights did not match the number of precisions.
    """
    penalty = brevity_penalty(candidate, references)
    precisions = [
        modified_precision(candidate, references, order)
        for order in range(1, n + 1)
    ]

    given = len(weights)
    wanted = len(precisions)

    if given == wanted:
        return str(calculate_bleu(weights, precisions, n, penalty))

    if given > wanted:
        # Too many weights: keep only the leading ones.
        usable = [weights[i] for i in range(wanted)]
        note = " (warning: the length of weights is bigger than n)"
    else:
        # Too few weights: the missing orders get a weight of zero.
        usable = [weights[i] for i in range(given)] + [0] * (wanted - given)
        note = " (warning: the length of weights is smaller than n)"

    return str(calculate_bleu(usable, precisions, n, penalty)) + note
#BLEU
def calculate_bleu(weights, pn, n, bp):
    """Combine per-order precisions into a single BLEU value.

    Evaluates ``bp * exp(sum(weights[i] * log(pn[i])))`` over the first
    ``n`` entries. Entries whose precision equals zero are left out of the
    sum, so ``log(0)`` is never evaluated.

    Args:
        weights: Indexable per-order weights; each is converted with
            ``float`` before use.
        pn: Indexable per-order precisions.
        n: Number of leading entries of ``weights`` / ``pn`` to use.
        bp: Brevity penalty multiplied into the result.

    Returns:
        The product of ``bp`` and the exponentiated weighted log sum.
    """
    log_total = 0
    for order in range(n):
        precision = pn[order]
        if precision == 0:
            # A zero precision contributes nothing instead of -infinity.
            continue
        log_total += float(weights[order]) * math.log(precision)
    return bp * math.exp(log_total)
#Exact match
def calculate_exactmatch(candidate, reference):
    """Score how many reference words show up in the candidate.

    Both inputs are passed through ``normalize_word`` and then
    ``split_sentence(text, 1)``. The numerator is the number of entries of
    the reference split that are also present in the candidate split; the
    denominator is the sum of ``candidate_words[word]`` over the candidate
    split.

    NOTE(review): the split result is indexed by word, so it is presumably
    a word -> count mapping; confirm against ``split_sentence``.

    Args:
        candidate: Predicted text.
        reference: Ground-truth text.

    Returns:
        ``matched / total``, or the integer ``0`` when the candidate total
        is zero.
    """
    candidate, reference = normalize_word(candidate), normalize_word(reference)
    candidate_words, reference_words = (
        split_sentence(candidate, 1),
        split_sentence(reference, 1),
    )

    matched = sum(1 for word in reference_words if word in candidate_words)

    total = 0
    for word in candidate_words:
        total += candidate_words[word]

    # An empty candidate would otherwise divide by zero.
    return matched / total if total != 0 else 0
#Exact match with normalization
def similarity_candidate_prediction(candidate_answer, prediction):
    """Return the share of a candidate answer's words found in ``prediction``.

    ``candidate_answer`` is tokenised with ``split_sentence(text, 1)``. Each
    item yielded by iterating ``prediction`` that is present in that split
    counts as one hit, and the hit count is divided by the size of the
    split.

    Args:
        candidate_answer: Candidate answer text.
        prediction: Iterable of already-split prediction words.

    Returns:
        ``hits / size`` as a float, or ``0.0`` when the split is empty.
    """
    answer_words = split_sentence(candidate_answer, 1)
    hits = sum(1 for word in prediction if word in answer_words)
    size = len(answer_words)
    # An empty candidate answer has no overlap by definition.
    return hits / size if size != 0 else 0.0
def argmax(lst):
    """Return the index of the first largest item in ``lst``.

    Raises:
        ValueError: If ``lst`` is empty.
    """
    # max() keeps the first of several equal maxima, so ties resolve to the
    # lowest index.
    return max(range(len(lst)), key=lst.__getitem__)
def calculate_appearance_with_normalization(prediction, reference, candidate_set):
    """Check whether the candidate closest to ``prediction`` is ``reference``.

    Every entry of ``candidate_set['0']`` is normalised with
    ``normalize_word`` (``int`` entries are converted to ``str`` first) and
    scored against the split prediction with
    ``similarity_candidate_prediction``. The highest-scoring candidate, with
    ties going to the earliest one, is compared to the normalised reference.

    Args:
        prediction: Predicted text.
        reference: Ground-truth text.
        candidate_set: Mapping whose ``'0'`` entry holds the candidate
            answers.

    Returns:
        ``1.0`` when the selected candidate equals the normalised reference,
        otherwise ``0.0``.

    Raises:
        ValueError: If ``candidate_set['0']`` is empty (raised by ``argmax``).
    """
    prediction, reference = normalize_word(prediction), normalize_word(reference)
    prediction_words = split_sentence(prediction, 1)
    # The reference is split as well, but the result is not used below.
    split_sentence(reference, 1)

    normalized_answers = []
    scores = []
    for raw_answer in candidate_set['0']:
        answer = normalize_word(
            str(raw_answer) if isinstance(raw_answer, int) else raw_answer
        )
        normalized_answers.append(answer)
        scores.append(similarity_candidate_prediction(answer, prediction_words))

    best_answer = normalized_answers[argmax(scores)]
    return 1.0 if best_answer == reference else 0.0
#F1
def calculate_f1score(candidate, reference):
    """Compute word-level F1, precision and recall for a candidate.

    Both inputs go through ``normalize_word`` and ``split_sentence(text, 1)``.
    For every word seen in either split:

    * present in both: the candidate's value for the word adds to ``tp``;
    * only in the candidate: the candidate's value adds to ``fp``;
    * only in the reference: the reference's value adds to ``fn``.

    NOTE(review): the splits are indexed by word, so they are presumably
    word -> count mappings; confirm against ``split_sentence``.

    Args:
        candidate: Predicted text.
        reference: Ground-truth text.

    Returns:
        A ``(f1, precision, recall)`` tuple. ``(0, 0, 0)`` is returned when
        either split is empty or when no word is shared.
    """
    candidate, reference = normalize_word(candidate), normalize_word(reference)
    candidate_words, reference_words = (
        split_sentence(candidate, 1),
        split_sentence(reference, 1),
    )

    vocabulary = set(candidate_words) | set(reference_words)

    tp = fp = fn = 0
    for word in vocabulary:
        in_candidate = word in candidate_words
        in_reference = word in reference_words
        if in_candidate and in_reference:
            tp += candidate_words[word]
        elif in_candidate:
            fp += candidate_words[word]
        elif in_reference:
            fn += reference_words[word]

    if len(candidate_words) == 0 or len(reference_words) == 0:
        return 0, 0, 0

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if tp == 0:
        # No shared word: the harmonic mean below would divide by zero.
        return 0, 0, 0

    f1 = 2 * precision * recall / (precision + recall)
    return f1, precision, recall