"""Tests for extractive (LexRank) summarization over the PubMed corpus.

t1 reports corpus statistics, t2 builds LexRank summaries for the first
NUM_DOCS source documents, and t3 scores those summaries with ROUGE.
(Reconstructed from a unified diff adding
``extractiveSummarization/summarization_tests.py``; patch header and ``+``
prefixes dropped.)
"""
import unittest
import random
import sys, os
import glob
import itertools
import shutil
from pathlib import Path
from nltk import sent_tokenize

#from ehrkit.summarizers import Lexrank
#from ehrkit.summarizers.evaluate import folder2rouge

from summarizers import Lexrank
from summarizers.evaluate import folder2rouge

import files2rouge

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Number of documents loaded into the LexRank corpus in t2.setUp.
NUM_DOCS = 20


def _avg_sentence_count(directory, pattern):
    """Return the rounded mean sentence count over files matching *pattern*.

    Each matching file is tokenized line by line with NLTK's sent_tokenize.
    Returns 0 when no files match (the original copy-pasted loops would
    raise ZeroDivisionError in that case).
    """
    counts = []
    for filepath in glob.glob(os.path.join(directory, pattern)):
        with open(filepath) as fp:
            counts.append(sum(len(sent_tokenize(line)) for line in fp))
    if not counts:
        return 0
    return round(sum(counts) / len(counts))


class tests(unittest.TestCase):
    """Base fixture: shared corpus paths for all test classes."""

    def setUp(self):
        self.source_dir = '/data/lily/jmg277/nc_text/source'
        self.source_dir_body = '/data/lily/jmg277/nc_text_body/source'
        self.target_dir = '/data/lily/jmg277/nc_text/target'


class t1(tests):
    """Corpus statistics: average sentence counts per section."""

    def test1_1(self):
        print("test 1.1 Avg sentences in introduction source")
        avg = _avg_sentence_count(self.source_dir, '*.src')
        print("Mean number of sentences in source (introduction section) text:", avg)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test1_2(self):
        print("test 1.2 Avg sentences in entire body source")
        avg = _avg_sentence_count(self.source_dir_body, '*.src')
        print("Mean number of sentences in source (entire body) text:", avg)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test1_3(self):
        print("test 1.3 Avg sentences in abstracts")
        avg = _avg_sentence_count(self.target_dir, '*.tgt')
        print("Mean number of sentences in abstract text:", avg)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)


class t2(tests):
    """LexRank summarization over the first NUM_DOCS source documents."""

    def setUp(self):
        # Reuse the shared corpus paths instead of re-hardcoding them here.
        super().setUp()
        self.target_dir_body = '/data/lily/jmg277/nc_text_body/target'
        self.saveto_dir = '/data/lily/sn482/pubmed_summaries'

        if not os.path.exists(self.saveto_dir):
            os.mkdir(self.saveto_dir)

        # Map file stem -> list of sentences for the first NUM_DOCS documents.
        documents = {}
        for i, filepath in enumerate(glob.glob(os.path.join(self.source_dir, '*.src'))):
            with open(filepath) as fp:
                fname = Path(filepath).stem
                sentences = []
                for line in fp:
                    sentences.extend(sent_tokenize(line))
                documents[fname] = sentences
            if i == NUM_DOCS - 1:
                break
        self.documents = documents

    def test2_1(self):
        print("test 2.1 idf scores")
        num_testdocs = 3

        test_docs = dict(itertools.islice(self.documents.items(), num_testdocs))
        lxr = Lexrank(test_docs.values(), threshold=.1)
        print(lxr.idf_score)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test2_2(self):
        print("test 2.2 intro lexrank summaries trained on intro text")
        new_dir = "lexrank_summaries"
        new_dir_path = os.path.join(self.saveto_dir, new_dir)
        # Start from a clean summary directory on every run.
        if os.path.exists(new_dir_path):
            shutil.rmtree(new_dir_path)
        os.mkdir(new_dir_path)

        lxr = Lexrank(self.documents.values(), threshold=.1)
        test_docs = self.documents
        for fname in test_docs:
            summary = lxr.get_summary(test_docs[fname], summary_size=10)
            joined_summary = " ".join(summary)
            summary_path = os.path.join(new_dir_path, fname + ".sum")
            # 'out_fp', not 'sum': the original shadowed the builtin sum().
            with open(summary_path, 'w') as out_fp:
                out_fp.write(joined_summary)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)


class t3(tests):
    """ROUGE evaluation of the LexRank summaries produced by t2."""

    def setUp(self):
        self.ref_dir = '/data/lily/sn482/reference_abstracts'
        self.lxrsummaries_dir = '/data/lily/sn482/pubmed_summaries/lexrank_summaries'

    def test3_1(self):
        print("test 3.1 files2rouge lexrank summaries")

        allsummaries_path = os.path.join(self.lxrsummaries_dir, 'allsummaries.txt')
        allreferences_path = os.path.join(self.lxrsummaries_dir, 'allreferences.txt')

        # Context managers guarantee both concatenated files are closed (and
        # flushed) before files2rouge reads them; the original leaked both
        # handles if anything raised before the explicit close() calls.
        with open(allsummaries_path, 'w') as allsummaries_file, \
                open(allreferences_path, 'w') as allreferences_file:
            for filepath in glob.glob(os.path.join(self.lxrsummaries_dir, '*.sum')):
                fname = Path(filepath).stem

                # Each .sum file holds a single-line summary (see t2.test2_2).
                with open(filepath) as fs:
                    summary = fs.readline()
                allsummaries_file.write("%s\n" % summary)

                ref_path = os.path.join(self.ref_dir, fname + ".tgt")

                with open(ref_path) as fr:
                    abstract = fr.readline()
                allreferences_file.write("%s\n" % abstract)

        files2rouge.run(allsummaries_path, allreferences_path)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test3_2(self):
        print("test 3.2 folder2rouge lexrank summaries")
        # saveto_path = os.path.join(self.saveto_dir, "lxr_folder2rouge.txt")
        rouge = folder2rouge(self.lxrsummaries_dir, self.ref_dir)
        rouge.run()#saveto=saveto_path)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)


if __name__ == '__main__':
    unittest.main()