|
a |
|
b/extractiveSummarization/generate_summaries.py |
|
|
1 |
import sys, os |
|
|
2 |
import glob |
|
|
3 |
import itertools |
|
|
4 |
import shutil |
|
|
5 |
import time |
|
|
6 |
from pathlib import Path |
|
|
7 |
from nltk import sent_tokenize |
|
|
8 |
from summarizers import Lexrank |
|
|
9 |
|
|
|
10 |
import argparse

# Command-line interface: input/output directories plus optional document
# limits and Lexrank tuning knobs.
parser = argparse.ArgumentParser()

parser.add_argument('--train', action='store', metavar='path', type=str, required=True, help='Directory path containing training documents. If test path not specified, also treated as testing documents.')
parser.add_argument('--saveto', action='store', metavar='path', type=str, required=True, help='directory path for saving summaries produced')
parser.add_argument('--test', action='store', metavar='path', type=str, help='directory path containing testing documents')
parser.add_argument('--ntrain', action='store', type=int, metavar='n', help='First n number of documents to train on')
parser.add_argument('--ntest', action='store', type=int, metavar='n', help='First n number of documents to produce summaries for')
# Defaults live in argparse itself: the previous `args.threshold or 0.03`
# pattern silently replaced an explicit `--threshold 0.0` (falsy) with 0.03.
parser.add_argument('--threshold', action='store', type=float, default=0.03, help="default 0.03")
parser.add_argument('--size', action='store', type=int, default=1, help='summary size. default 1')

args = parser.parse_args()

train_dir_path = args.train
test_dir_path = args.test
saveto_dir_path = args.saveto
threshold = args.threshold
summary_size = args.size
|
|
28 |
|
|
|
29 |
# Validate all user-supplied paths and limits before doing any work.
# Exit with a nonzero status on failure so callers/scripts can detect it
# (bare sys.exit() previously returned 0, signalling success on error).
if not os.path.isdir(train_dir_path):
    print('The train path specified does not exist')
    sys.exit(1)

if test_dir_path and not os.path.isdir(test_dir_path):
    print('The test path specified does not exist')
    sys.exit(1)

if not os.path.isdir(saveto_dir_path):
    print('The save to path specified does not exist')
    sys.exit(1)

# Enforce the limits instead of only warning: previously a non-positive
# --ntest/--ntrain was printed about but then silently ignored downstream.
if args.ntest and args.ntest < 1:
    print('ntest should be greater than 0')
    sys.exit(1)

if args.ntrain and args.ntrain < 1:
    print('ntrain should be greater than 0')
    sys.exit(1)
|
|
46 |
|
|
|
47 |
start = time.time()


def _load_documents(dir_path, limit=None):
    """Read every file in *dir_path* into a {filename-stem: sentences} dict.

    Each line of each file is split into sentences with NLTK's
    sent_tokenize; a document's value is the flat list of its sentences.
    If *limit* is a positive int, only the first *limit* files (in glob
    order) are read; any other value (None, 0, negative) reads all files,
    matching the original loop's truthiness-based break condition.
    """
    documents = {}
    paths = glob.glob(os.path.join(dir_path, '*'))
    if limit and limit > 0:
        paths = paths[:limit]
    for filepath in paths:
        sentences = []
        with open(filepath) as fp:
            # Iterate the file directly instead of materializing it
            # with readlines().
            for line in fp:
                sentences.extend(sent_tokenize(line))
        documents[Path(filepath).stem] = sentences
    return documents


train_documents = _load_documents(train_dir_path, args.ntrain)
test_documents = _load_documents(test_dir_path, args.ntest) if test_dir_path else {}
|
|
72 |
|
|
|
73 |
# Fit Lexrank on the training corpus (IDF statistics etc. come from here).
lxr = Lexrank(train_documents.values(), threshold=threshold)

# Summarize the test documents when a test dir was given; otherwise the
# training documents double as the documents to summarize.
if test_dir_path:
    documents = test_documents
else:
    documents = train_documents

for i, fname in enumerate(documents):
    summary = lxr.get_summary(documents[fname], summary_size=summary_size)
    joined_summary = " ".join(summary)
    summary_path = os.path.join(saveto_dir_path, fname + ".sum")
    # Renamed the file handle from `sum`, which shadowed the builtin.
    with open(summary_path, 'w') as out_fp:
        out_fp.write(joined_summary)
    # --ntest also caps how many summaries are written (first n documents).
    if args.ntest and i == args.ntest - 1:
        break

end = time.time()
print("Runtime " + str(end - start))