tools/data/activitynet/convert_proposal_format.py

# Copyright (c) OpenMMLab. All rights reserved.
"""Convert the output proposal file of a proposal generator (BSN, BMN) into
the input proposal file of an action classifier.

Currently SSN and P-GCN are supported; TSN, I3D, etc. are not.
"""
import argparse

import mmcv
import numpy as np

from mmaction.core import pairwise_temporal_iou


def load_annotations(ann_file):
    """Load the annotation according to ann_file into video_infos."""
    video_infos = []
    anno_database = mmcv.load(ann_file)
    for video_name in anno_database:
        video_info = anno_database[video_name]
        video_info['video_name'] = video_name
        video_infos.append(video_info)
    return video_infos
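

# Illustrative sketch of the annotation structure the loader above expects,
# inferred from the keys accessed in this file ('duration_frame', 'fps',
# 'annotations', 'segment', 'label'); the video name, label, and values
# below are made up:
#
#     {
#         "v_example00": {
#             "duration_frame": 824,
#             "fps": 24.0,
#             "annotations": [
#                 {"segment": [2.5, 24.1], "label": "SomeAction"}
#             ]
#         }
#     }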


def import_ground_truth(video_infos, activity_index):
    """Read ground truth data from video_infos."""
    ground_truth = {}
    for video_info in video_infos:
        # Strip the 'v_' prefix that ActivityNet prepends to video names.
        video_id = video_info['video_name'][2:]
        this_video_ground_truths = []
        for ann in video_info['annotations']:
            t_start, t_end = ann['segment']
            label = activity_index[ann['label']]
            this_video_ground_truths.append([t_start, t_end, label])
        ground_truth[video_id] = np.array(this_video_ground_truths)
    return ground_truth
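

# The activity_index mapping consumed above is built in __main__ from a
# plain-text file with one label per line; line order defines the class
# index. A made-up example:
#
#     SomeAction        -> activity_index['SomeAction'] == 0
#     AnotherAction     -> activity_index['AnotherAction'] == 1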


def import_proposals(result_dict):
    """Read predictions from result dict."""
    proposals = {}
    num_proposals = 0
    for video_id in result_dict:
        result = result_dict[video_id]
        this_video_proposals = []
        for proposal in result:
            t_start, t_end = proposal['segment']
            score = proposal['score']
            this_video_proposals.append([t_start, t_end, score])
            num_proposals += 1
        proposals[video_id] = np.array(this_video_proposals)
    return proposals, num_proposals
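

# Illustrative sketch of the result dict consumed above. __main__ passes in
# mmcv.load(args.proposal_file)['results'], so the top-level 'results' key is
# already stripped; the video ids here carry no 'v_' prefix and all values
# are made up:
#
#     {
#         "example00": [
#             {"segment": [1.2, 15.7], "score": 0.93},
#             {"segment": [0.0, 30.4], "score": 0.41}
#         ]
#     }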


def dump_formatted_proposal(video_idx, video_id, num_frames, fps, gts,
                            proposals, tiou, t_overlap_self,
                            formatted_proposal_file):
    """Dump the formatted proposal file, which is the input proposal file of
    the action classifier (e.g. SSN).

    Args:
        video_idx (int): Index of video.
        video_id (str): ID of video.
        num_frames (int): Total frames of the video.
        fps (float): Fps of the video.
        gts (np.ndarray[float]): t_start, t_end and label of ground truths.
        proposals (np.ndarray[float]): t_start, t_end and score of proposals.
        tiou (np.ndarray[float]): 2-dim array with IoU ratio.
        t_overlap_self (np.ndarray[float]): 2-dim array with overlap_self
            (intersection / self_len) ratio.
        formatted_proposal_file (open file object): Open file object of
            formatted_proposal_file.
    """
    formatted_proposal_file.write(
        f'#{video_idx}\n{video_id}\n{num_frames}\n{fps}\n{gts.shape[0]}\n')
    for gt in gts:
        formatted_proposal_file.write(f'{int(gt[2])} {gt[0]} {gt[1]}\n')
    formatted_proposal_file.write(f'{proposals.shape[0]}\n')

    # For each proposal, find the ground truth with the best temporal IoU
    # and the one with the best self-overlap.
    best_iou = np.amax(tiou, axis=0)
    best_iou_index = np.argmax(tiou, axis=0)
    best_overlap = np.amax(t_overlap_self, axis=0)
    best_overlap_index = np.argmax(t_overlap_self, axis=0)

    for i in range(proposals.shape[0]):
        index_iou = best_iou_index[i]
        index_overlap = best_overlap_index[i]
        label_iou = gts[index_iou][2]
        label_overlap = gts[index_overlap][2]
        if label_iou != label_overlap:
            # Prefer a non-background label when the two criteria disagree.
            label = label_iou if label_iou != 0 else label_overlap
        else:
            label = label_iou
        if best_iou[i] == 0 and best_overlap[i] == 0:
            formatted_proposal_file.write(
                f'0 0 0 {proposals[i][0]} {proposals[i][1]}\n')
        else:
            formatted_proposal_file.write(
                f'{int(label)} {best_iou[i]} {best_overlap[i]} '
                f'{proposals[i][0]} {proposals[i][1]}\n')
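

# The writes above emit one block per video. A hypothetical block with one
# ground truth and two proposals (all numbers made up) looks like:
#
#     #0                      <- '#' + video index
#     example00               <- video id
#     824                     <- number of frames
#     24.0                    <- fps
#     1                       <- number of ground truths
#     7 2.5 24.1              <- label, t_start, t_end
#     2                       <- number of proposals
#     7 0.87 0.91 1.2 15.7    <- label, best IoU, best overlap, t_start, t_end
#     0 0 0 40.0 55.3         <- proposal matching no ground truth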


def parse_args():
    parser = argparse.ArgumentParser(description='convert proposal format')
    parser.add_argument(
        '--ann-file',
        type=str,
        default='../../../data/ActivityNet/anet_anno_val.json',
        help='name of annotation file')
    parser.add_argument(
        '--activity-index-file',
        type=str,
        default='../../../data/ActivityNet/anet_activity_indexes_val.txt',
        help='name of activity index file')
    parser.add_argument(
        '--proposal-file',
        type=str,
        default='../../../results.json',
        help='name of proposal file, which is the '
        'output of the proposal generator (BMN)')
    parser.add_argument(
        '--formatted-proposal-file',
        type=str,
        default='../../../anet_val_formatted_proposal.txt',
        help='name of formatted proposal file, which is the '
        'input of the action classifier (SSN)')
    args = parser.parse_args()
    return args
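

# Illustrative invocation, run from tools/data/activitynet/ so the default
# relative paths above resolve (all flags are the ones defined in
# parse_args):
#
#     python convert_proposal_format.py \
#         --ann-file ../../../data/ActivityNet/anet_anno_val.json \
#         --proposal-file ../../../results.json \
#         --formatted-proposal-file ../../../anet_val_formatted_proposal.txt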


if __name__ == '__main__':
    args = parse_args()
    formatted_proposal_file = open(args.formatted_proposal_file, 'w')

    # The activity index file is constructed according to
    # 'https://github.com/activitynet/ActivityNet/blob/master/Evaluation/eval_classification.py'
    activity_index, class_idx = {}, 0
    for line in open(args.activity_index_file).readlines():
        activity_index[line.strip()] = class_idx
        class_idx += 1

    video_infos = load_annotations(args.ann_file)
    ground_truth = import_ground_truth(video_infos, activity_index)
    proposal, num_proposals = import_proposals(
        mmcv.load(args.proposal_file)['results'])

    for video_idx, video_info in enumerate(video_infos):
        video_id = video_info['video_name'][2:]
        num_frames = video_info['duration_frame']
        fps = video_info['fps']
        tiou, t_overlap = pairwise_temporal_iou(
            proposal[video_id][:, :2].astype(float),
            ground_truth[video_id][:, :2].astype(float),
            calculate_overlap_self=True)
        dump_formatted_proposal(video_idx, video_id, num_frames, fps,
                                ground_truth[video_id], proposal[video_id],
                                tiou, t_overlap, formatted_proposal_file)

    formatted_proposal_file.close()