[6c353a]: / medacy / tools / calculators / annotation_overlap.py

Download this file

69 lines (47 with data), 1.7 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import argparse
from collections import Counter
from itertools import product
from pprint import pprint
from medacy.data.annotations import Annotations
from medacy.data.dataset import Dataset
def _overlap_kind(a_start, a_end, b_start, b_end):
    """Classify how span A overlaps span B.

    :param a_start: start offset of span A
    :param a_end: end offset of span A
    :param b_start: start offset of span B
    :param b_end: end offset of span B
    :return: one of "Leftside cutoff", "Rightside cutoff", "A inside B",
        "B inside A", or None when the spans share no characters
    """
    # Spans that merely touch (one ends exactly where the other starts) do
    # not overlap. NOTE(review): this assumes end-exclusive offsets, as in
    # brat standoff files -- confirm against the Annotations loader.
    if max(a_start, b_start) >= min(a_end, b_end):
        return None
    if a_start < b_start and a_end < b_end:
        return "Leftside cutoff"
    if b_start < a_start and b_end < a_end:
        return "Rightside cutoff"
    # Containment is deliberately non-strict so that nested annotations
    # sharing a boundary (e.g. 0-10 and 0-5) are reported; identical spans
    # are reported as "A inside B".
    if b_start <= a_start and a_end <= b_end:
        return "A inside B"
    return "B inside A"


def calculate_document_overlap(data_file):
    """Print and count the overlapping annotation pairs of one document.

    Prints the document's text path, one line per overlapping pair, and
    finally the counter of overlaps.

    :param data_file: object exposing ``txt_path`` and ``ann_path``
        attributes; ``ann_path`` is handed to ``Annotations``
    :return: Counter mapping ``(a_tag, b_tag)`` to the number of overlapping
        pairs found with those tags
    """
    print(data_file.txt_path)
    ann = Annotations(data_file.ann_path)
    counts = Counter()

    # Unordered pairs already examined. A set of frozensets gives O(1)
    # membership tests; a list of sets would be rescanned for every pair.
    seen_pairs = set()

    for a, b in product(ann, ann):
        if a is b:
            continue
        pair = frozenset((a, b))
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)

        # Each annotation unpacks as (tag, start, end, text).
        a_tag, a_start, a_end, _a_text = a
        b_tag, b_start, b_end, _b_text = b

        kind = _overlap_kind(a_start, a_end, b_start, b_end)
        if kind is None:
            continue

        print(f"{kind}: {a}, {b}")
        counts[(a_tag, b_tag)] += 1

    print(counts)
    return counts
def calculate_dataset_overlap(dataset):
    """Print the overlap counts of every document in *dataset*, then the total.

    :param dataset: iterable of data files, each passed in turn to
        ``calculate_document_overlap``
    """
    # Fold the per-document counters together, starting from an empty one.
    grand_total = sum(map(calculate_document_overlap, dataset), Counter())
    print("Total overlaps:")
    pprint(grand_total)
def main():
    """Command-line entry point: report annotation overlaps for a dataset directory."""
    cli = argparse.ArgumentParser(description="Display which annotations in a dataset overlap")
    cli.add_argument("dataset", help="Directory of the dataset")
    dataset_dir = cli.parse_args().dataset
    calculate_dataset_overlap(Dataset(dataset_dir))
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()