--- a
+++ b/extractiveSummarization/generate_summaries.py
@@ -0,0 +1,83 @@
+import argparse
+import glob
+import os
+import sys
+import time
+from pathlib import Path
+
+from nltk import sent_tokenize
+from summarizers import Lexrank
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--train', action='store', metavar='path', type=str, required=True, help='directory path containing training documents; if no test path is specified, these are also treated as testing documents')
+parser.add_argument('--saveto', action='store', metavar='path', type=str, required=True, help='directory path for saving the summaries produced')
+parser.add_argument('--test', action='store', metavar='path', type=str, help='directory path containing testing documents')
+parser.add_argument('--ntrain', action='store', type=int, metavar='n', help='train on only the first n documents')
+parser.add_argument('--ntest', action='store', type=int, metavar='n', help='produce summaries for only the first n documents')
+parser.add_argument('--threshold', action='store', type=float, default=0.03, help='LexRank similarity threshold (default: 0.03)')
+parser.add_argument('--size', action='store', type=int, default=1, help='number of sentences per summary (default: 1)')
+
+args = parser.parse_args()
+
+train_dir_path = args.train
+test_dir_path = args.test
+saveto_dir_path = args.saveto
+threshold = args.threshold
+summary_size = args.size
+
+if not os.path.isdir(train_dir_path):
+    sys.exit('The train path specified does not exist')
+
+if test_dir_path and not os.path.isdir(test_dir_path):
+    sys.exit('The test path specified does not exist')
+
+if not os.path.isdir(saveto_dir_path):
+    sys.exit('The saveto path specified does not exist')
+
+if args.ntrain is not None and args.ntrain < 1:
+    sys.exit('ntrain should be greater than 0')
+
+if args.ntest is not None and args.ntest < 1:
+    sys.exit('ntest should be greater than 0')
+
+start = time.time()
+
+
+def load_documents(dir_path, limit=None):
+    """Read each file in dir_path and split it into sentences.
+
+    Returns a dict mapping each file's stem to its list of sentences.
+    Paths are sorted so that a limit selects a deterministic "first n".
+    """
+    documents = {}
+    for i, filepath in enumerate(sorted(glob.glob(os.path.join(dir_path, '*')))):
+        with open(filepath) as fp:
+            sentences = []
+            for line in fp:
+                sentences.extend(sent_tokenize(line))
+            documents[Path(filepath).stem] = sentences
+        if limit and i == limit - 1:
+            break
+    return documents
+
+
+train_documents = load_documents(train_dir_path, args.ntrain)
+test_documents = load_documents(test_dir_path, args.ntest) if test_dir_path else {}
+
+# Build the LexRank model from the training documents.
+lxr = Lexrank(train_documents.values(), threshold=threshold)
+
+# Summarize the test documents, falling back to the training documents
+# when no separate test directory was given.
+documents = test_documents if test_dir_path else train_documents
+
+for i, (fname, sentences) in enumerate(documents.items()):
+    summary = lxr.get_summary(sentences, summary_size=summary_size)
+    summary_path = os.path.join(saveto_dir_path, fname + '.sum')
+    with open(summary_path, 'w') as out:
+        out.write(' '.join(summary))
+    if args.ntest and i == args.ntest - 1:
+        break
+
+end = time.time()
+print('Runtime: {:.2f}s'.format(end - start))
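For reference, a minimal invocation might look like the following (the directory names are hypothetical placeholders, not paths from this repository):

    python extractiveSummarization/generate_summaries.py --train data/train --test data/test --saveto summaries --size 2

One summary is written per input document, as <stem>.sum under the --saveto directory. Note that sent_tokenize requires NLTK's punkt tokenizer data to be installed first (e.g. via nltk.download('punkt')).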