
extractiveSummarization/generate_summaries.py
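# Generate extractive summaries with LexRank: build a LexRank model from the
# documents under --train, then write one "<stem>.sum" file per document into
# --saveto (for the --test documents if given, otherwise for the training
# documents themselves).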
import sys
import os
import glob
import itertools
import shutil
import time
import argparse
from pathlib import Path
from nltk import sent_tokenize  # needs NLTK's 'punkt' tokenizer data (nltk.download('punkt'))
from summarizers import Lexrank

parser = argparse.ArgumentParser()

parser.add_argument('--train', action='store', metavar='path', type=str, required=True, help='Directory path containing training documents. If no --test path is given, summaries are produced for these documents as well.')
parser.add_argument('--saveto', action='store', metavar='path', type=str, required=True, help='Directory path for saving the produced summaries.')
parser.add_argument('--test', action='store', metavar='path', type=str, help='Directory path containing testing documents.')
parser.add_argument('--ntrain', action='store', type=int, metavar='n', help='Train on only the first n documents.')
parser.add_argument('--ntest', action='store', type=int, metavar='n', help='Produce summaries for only the first n documents.')
parser.add_argument('--threshold', action='store', type=float, help='LexRank similarity threshold (default: 0.03).')
parser.add_argument('--size', action='store', type=int, help='Summary size in number of sentences (default: 1).')

args = parser.parse_args()
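# Example invocation (illustrative paths; adjust to your own layout):
#   python generate_summaries.py --train data/train --test data/test \
#       --saveto summaries --threshold 0.03 --size 2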
train_dir_path = args.train
test_dir_path = args.test
saveto_dir_path = args.saveto
threshold = args.threshold or 0.03
summary_size = args.size or 1

if not os.path.isdir(train_dir_path):
    print('The train path specified does not exist')
    sys.exit(1)

if test_dir_path and not os.path.isdir(test_dir_path):
    print('The test path specified does not exist')
    sys.exit(1)

if not os.path.isdir(saveto_dir_path):
    print('The saveto path specified does not exist')
    sys.exit(1)

if args.ntest and args.ntest < 1:
    print('ntest should be greater than 0')
    sys.exit(1)

if args.ntrain and args.ntrain < 1:
    print('ntrain should be greater than 0')
    sys.exit(1)
start = time.time()

train_documents = {}
test_documents = {}

# Read each document as a list of sentences, keyed by its filename stem.
for i, filepath in enumerate(glob.glob(os.path.join(train_dir_path, '*'))):
    with open(filepath) as fp:
        fname = Path(filepath).stem
        sentences = []
        for line in fp:
            sentences.extend(sent_tokenize(line))
        train_documents[fname] = sentences
        if args.ntrain and i == args.ntrain - 1:
            break

if test_dir_path:
    for i, filepath in enumerate(glob.glob(os.path.join(test_dir_path, '*'))):
        with open(filepath) as fp:
            fname = Path(filepath).stem
            sentences = []
            for line in fp:
                sentences.extend(sent_tokenize(line))
            test_documents[fname] = sentences
        if args.ntest and i == args.ntest - 1:
            break
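# Build the LexRank model from the training corpus. In the standard LexRank
# formulation, `threshold` is the minimum cosine similarity for connecting two
# sentences in the similarity graph; the exact behaviour here depends on the
# summarizers module.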
lxr = Lexrank(train_documents.values(), threshold=threshold)

# Summarize the test documents if a test directory was given, otherwise the
# training documents.
if test_dir_path:
    documents = test_documents
else:
    documents = train_documents

for i, fname in enumerate(documents):
    summary = lxr.get_summary(documents[fname], summary_size=summary_size)
    joined_summary = " ".join(summary)
    # Each summary is written to <saveto>/<document stem>.sum
    summary_path = os.path.join(saveto_dir_path, fname + ".sum")
    with open(summary_path, 'w') as summary_file:
        summary_file.write(joined_summary)
    if args.ntest and i == args.ntest - 1:
        break

end = time.time()
print(f"Runtime: {end - start:.2f}s")