"""Tests for extractive (LexRank) summarization over the PubMed corpus.

t1 reports corpus statistics, t2 builds LexRank summaries for the first
NUM_DOCS source documents, and t3 scores those summaries with ROUGE.
(Reconstructed from a unified diff adding
``extractiveSummarization/summarization_tests.py``; patch header and ``+``
prefixes dropped.)
"""
import unittest
import random
import sys, os
import glob
import itertools
import shutil
from pathlib import Path
from nltk import sent_tokenize

#from ehrkit.summarizers import Lexrank
#from ehrkit.summarizers.evaluate import folder2rouge

from summarizers import Lexrank
from summarizers.evaluate import folder2rouge

import files2rouge

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Number of documents loaded into the LexRank corpus in t2.setUp.
NUM_DOCS = 20


def _avg_sentence_count(directory, pattern):
    """Return the rounded mean sentence count over files matching *pattern*.

    Each matching file is tokenized line by line with NLTK's sent_tokenize.
    Returns 0 when no files match (the original copy-pasted loops would
    raise ZeroDivisionError in that case).
    """
    counts = []
    for filepath in glob.glob(os.path.join(directory, pattern)):
        with open(filepath) as fp:
            counts.append(sum(len(sent_tokenize(line)) for line in fp))
    if not counts:
        return 0
    return round(sum(counts) / len(counts))


class tests(unittest.TestCase):
    """Base fixture: shared corpus paths for all test classes."""

    def setUp(self):
        self.source_dir = '/data/lily/jmg277/nc_text/source'
        self.source_dir_body = '/data/lily/jmg277/nc_text_body/source'
        self.target_dir = '/data/lily/jmg277/nc_text/target'


class t1(tests):
    """Corpus statistics: average sentence counts per section."""

    def test1_1(self):
        print("test 1.1 Avg sentences in introduction source")
        avg = _avg_sentence_count(self.source_dir, '*.src')
        print("Mean number of sentences in source (introduction section) text:", avg)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test1_2(self):
        print("test 1.2 Avg sentences in entire body source")
        avg = _avg_sentence_count(self.source_dir_body, '*.src')
        print("Mean number of sentences in source (entire body) text:", avg)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test1_3(self):
        print("test 1.3 Avg sentences in abstracts")
        avg = _avg_sentence_count(self.target_dir, '*.tgt')
        print("Mean number of sentences in abstract text:", avg)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)


class t2(tests):
    """LexRank summarization over the first NUM_DOCS source documents."""

    def setUp(self):
        # Reuse the shared corpus paths instead of re-hardcoding them here.
        super().setUp()
        self.target_dir_body = '/data/lily/jmg277/nc_text_body/target'
        self.saveto_dir = '/data/lily/sn482/pubmed_summaries'

        if not os.path.exists(self.saveto_dir):
            os.mkdir(self.saveto_dir)

        # Map file stem -> list of sentences for the first NUM_DOCS documents.
        documents = {}
        for i, filepath in enumerate(glob.glob(os.path.join(self.source_dir, '*.src'))):
            with open(filepath) as fp:
                fname = Path(filepath).stem
                sentences = []
                for line in fp:
                    sentences.extend(sent_tokenize(line))
                documents[fname] = sentences
            if i == NUM_DOCS - 1:
                break
        self.documents = documents

    def test2_1(self):
        print("test 2.1 idf scores")
        num_testdocs = 3

        test_docs = dict(itertools.islice(self.documents.items(), num_testdocs))
        lxr = Lexrank(test_docs.values(), threshold=.1)
        print(lxr.idf_score)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test2_2(self):
        print("test 2.2 intro lexrank summaries trained on intro text")
        new_dir = "lexrank_summaries"
        new_dir_path = os.path.join(self.saveto_dir, new_dir)
        # Start from a clean summary directory on every run.
        if os.path.exists(new_dir_path):
            shutil.rmtree(new_dir_path)
        os.mkdir(new_dir_path)

        lxr = Lexrank(self.documents.values(), threshold=.1)
        test_docs = self.documents
        for fname in test_docs:
            summary = lxr.get_summary(test_docs[fname], summary_size=10)
            joined_summary = " ".join(summary)
            summary_path = os.path.join(new_dir_path, fname + ".sum")
            # 'out_fp', not 'sum': the original shadowed the builtin sum().
            with open(summary_path, 'w') as out_fp:
                out_fp.write(joined_summary)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)


class t3(tests):
    """ROUGE evaluation of the LexRank summaries produced by t2."""

    def setUp(self):
        self.ref_dir = '/data/lily/sn482/reference_abstracts'
        self.lxrsummaries_dir = '/data/lily/sn482/pubmed_summaries/lexrank_summaries'

    def test3_1(self):
        print("test 3.1 files2rouge lexrank summaries")

        allsummaries_path = os.path.join(self.lxrsummaries_dir, 'allsummaries.txt')
        allreferences_path = os.path.join(self.lxrsummaries_dir, 'allreferences.txt')

        # Context managers guarantee both concatenated files are closed (and
        # flushed) before files2rouge reads them; the original leaked both
        # handles if anything raised before the explicit close() calls.
        with open(allsummaries_path, 'w') as allsummaries_file, \
                open(allreferences_path, 'w') as allreferences_file:
            for filepath in glob.glob(os.path.join(self.lxrsummaries_dir, '*.sum')):
                fname = Path(filepath).stem

                # Each .sum file holds a single-line summary (see t2.test2_2).
                with open(filepath) as fs:
                    summary = fs.readline()
                allsummaries_file.write("%s\n" % summary)

                ref_path = os.path.join(self.ref_dir, fname + ".tgt")

                with open(ref_path) as fr:
                    abstract = fr.readline()
                allreferences_file.write("%s\n" % abstract)

        files2rouge.run(allsummaries_path, allreferences_path)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test3_2(self):
        print("test 3.2 folder2rouge lexrank summaries")
        # saveto_path = os.path.join(self.saveto_dir, "lxr_folder2rouge.txt")
        rouge = folder2rouge(self.lxrsummaries_dir, self.ref_dir)
        rouge.run()#saveto=saveto_path)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)


if __name__ == '__main__':
    unittest.main()