utils/pycocoevalcap/tokenizer/ptbtokenizer.py
#!/usr/bin/env python
#
# File Name : ptbtokenizer.py
#
# Description : Perform PTB tokenization and remove punctuation.
#
# Creation Date : 29-12-2014
# Last Modified : Thu Mar 19 09:53:35 2015
# Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>

import os
import subprocess
import tempfile

# Last modified : Wed 22 May 2019 08:10:00 PM EDT
# By Sabarish Sivanath
# To support Python 3

# name of the Stanford CoreNLP jar, expected in this file's directory
STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'

# punctuation tokens to be removed from the tokenized sentences
PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-",
                ".", "?", "!", ",", ":", "-", "--", "...", ";"]
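
# Hypothetical helper, not in the original file: it applies the same filter
# that tokenize() applies below, so the effect of PUNCTUATIONS on one line of
# PTBTokenizer output can be checked in isolation, without running Java.
def _strip_punctuation(tokenized_line):
    # e.g. "a man , riding a horse ." -> "a man riding a horse"
    return ' '.join(w for w in tokenized_line.rstrip().split(' ')
                    if w not in PUNCTUATIONS)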

class PTBTokenizer:
    """Python wrapper of Stanford PTBTokenizer"""

    def tokenize(self, captions_for_image):
        # captions_for_image maps an image id to a list of
        # {'caption': <str>} dicts
        cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR,
               'edu.stanford.nlp.process.PTBTokenizer',
               '-preserveLines', '-lowerCase']

        # ======================================================
        # prepare data for PTB Tokenizer
        # ======================================================
        final_tokenized_captions_for_image = {}
        # repeat each image id once per caption so ids stay aligned with
        # the tokenizer's one-sentence-per-line output
        image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
        # one caption per line; replace embedded newlines so the line
        # count matches the number of captions
        sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v])

        # ======================================================
        # save sentences to temporary file
        # ======================================================
        path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__))
        tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
        tmp_file.write(sentences.encode('utf-8'))
        tmp_file.close()

        # ======================================================
        # tokenize sentence
        # ======================================================
        cmd.append(os.path.basename(tmp_file.name))
        p_tokenizer = subprocess.Popen(cmd,
                                       cwd=path_to_jar_dirname,
                                       stdout=subprocess.PIPE,
                                       universal_newlines=True,
                                       bufsize=1)
        # the tokenizer reads the temp file named on the command line and
        # writes one tokenized, lower-cased sentence per line to stdout
        token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0]
        lines = token_lines.split('\n')
        # remove temp file
        os.remove(tmp_file.name)

        # ======================================================
        # create dictionary for tokenized captions
        # ======================================================
        for k, line in zip(image_id, lines):
            if k not in final_tokenized_captions_for_image:
                final_tokenized_captions_for_image[k] = []
            # drop standalone punctuation tokens and rejoin the rest
            tokenized_caption = ' '.join([w for w in line.rstrip().split(' ')
                                          if w not in PUNCTUATIONS])
            final_tokenized_captions_for_image[k].append(tokenized_caption)

        return final_tokenized_captions_for_image
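

# Minimal usage sketch (an illustration, not part of the original module).
# Assumptions: stanford-corenlp-3.4.1.jar sits in this file's directory,
# `java` is on the PATH, and the image ids and captions are made up.
if __name__ == '__main__':
    captions_for_image = {
        '1': [{'caption': 'A man is riding a horse.'},
              {'caption': 'Someone rides a brown horse!'}],
        '2': [{'caption': 'Two dogs -- playing in the park.'}],
    }
    tokenized = PTBTokenizer().tokenize(captions_for_image)
    # expected shape: {'1': ['a man is riding a horse', ...], '2': [...]}
    for img_id, caps in tokenized.items():
        print(img_id, caps)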