EHRKit-2022 / Git / [2d4573] /extractiveSummarization/summarization

Models:
philipB/
EHRKit-2022
Downloads: 1
[2d4573]: / extractiveSummarization / summarization_tests.py
History
Download this file
170 lines (141 with data), 6.6 kB

import unittest
import random
import sys, os
import glob
import itertools
import shutil
from pathlib import Path
from nltk import sent_tokenize

#from ehrkit.summarizers import Lexrank
#from ehrkit.summarizers.evaluate import folder2rouge

from summarizers import Lexrank
from summarizers.evaluate import folder2rouge

import files2rouge

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Number of documents used
NUM_DOCS = 20

class tests(unittest.TestCase):
    def setUp(self):
        self.source_dir = '/data/lily/jmg277/nc_text/source'
        self.source_dir_body = '/data/lily/jmg277/nc_text_body/source'
        self.target_dir = '/data/lily/jmg277/nc_text/target'

class t1(tests):
    def test1_1(self):
        print("test 1.1 Avg sentences in introduction source")
        sentence_counts = []
        for filepath in glob.glob(os.path.join(self.source_dir, '*.src')):
            with open(filepath) as fp:
                num_sentences = 0 
                for line in fp.readlines():
                    num_sentences += len(sent_tokenize(line))
                sentence_counts.append(num_sentences)
        avg = round(sum(sentence_counts)/len(sentence_counts))
        print("Mean number of sentences in source (introduction section) text:", avg)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test1_2(self):
        print("test 1.2 Avg sentences in entire body source")
        sentence_counts = []
        for filepath in glob.glob(os.path.join(self.source_dir_body, '*.src')):
            with open(filepath) as fp:
                num_sentences = 0 
                for line in fp.readlines():
                    num_sentences += len(sent_tokenize(line))
                sentence_counts.append(num_sentences)
        avg = round(sum(sentence_counts)/len(sentence_counts))
        print("Mean number of sentences in source (entire body) text:", avg)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test1_3(self):
        print("test 1.3 Avg sentences in abstracts")
        sentence_counts = []
        for filepath in glob.glob(os.path.join(self.target_dir, '*.tgt')):
            with open(filepath) as fp:
                num_sentences = 0 
                for line in fp.readlines():
                    num_sentences += len(sent_tokenize(line))
                sentence_counts.append(num_sentences)
        avg = round(sum(sentence_counts)/len(sentence_counts))
        print("Mean number of sentences in abstract text:", avg)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

class t2(tests):
    def setUp(self):
        self.source_dir = '/data/lily/jmg277/nc_text/source'
        self.target_dir = '/data/lily/jmg277/nc_text/target'
        self.source_dir_body = '/data/lily/jmg277/nc_text_body/source'
        self.target_dir_body = '/data/lily/jmg277/nc_text_body/target'
        self.saveto_dir = '/data/lily/sn482/pubmed_summaries'
       
        documents = {} 
        if not os.path.exists(self.saveto_dir):
            os.mkdir(self.saveto_dir)

        for i, filepath in enumerate(glob.glob(os.path.join(self.source_dir, '*.src'))):
            with open(filepath) as fp:
                fname = Path(filepath).stem
                sentences = [] 
                for line in fp.readlines():
                    sentences.extend(sent_tokenize(line))
                documents[fname] = sentences
            if i == NUM_DOCS - 1:
                break
        self.documents = documents

    def test2_1(self):
        print("test 2.1 idf scores")
        num_testdocs = 3

        test_docs = dict(itertools.islice(self.documents.items(), num_testdocs))
        lxr = Lexrank(test_docs.values(), threshold=.1)
        print(lxr.idf_score)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test2_2(self):
        print("test 2.2 intro lexrank summaries trained on intro text")
        new_dir = "lexrank_summaries"
        new_dir_path = os.path.join(self.saveto_dir, new_dir)
        if os.path.exists(new_dir_path):
            shutil.rmtree(new_dir_path)
        os.mkdir(new_dir_path)

        lxr = Lexrank(self.documents.values(), threshold=.1)
        test_docs = self.documents
        #dict(itertools.islice(self.documents.items(), 3)) #documents[:3]
        for fname in test_docs:
            summary = lxr.get_summary(test_docs[fname], summary_size=10)#, threshold=.1)
            joined_summary = " ".join(summary)
            summary_path = os.path.join(new_dir_path, fname + ".sum")
            with open(summary_path, 'w') as sum:
                sum.write(joined_summary)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

class t3(tests):
    def setUp(self):
        self.ref_dir = '/data/lily/sn482/reference_abstracts'
        self.lxrsummaries_dir = '/data/lily/sn482/pubmed_summaries/lexrank_summaries'

    def test3_1(self):
        print("test 3.1 files2rouge lexrank summaries")
        
        allsummaries_path = os.path.join(self.lxrsummaries_dir, 'allsummaries.txt')
        allreferences_path = os.path.join(self.lxrsummaries_dir, 'allreferences.txt')
        #saveto_path = os.path.join(self.saveto_path, "lexrank_rouge.txt")

        allsummaries_file = open(allsummaries_path, 'w')
        allreferences_file = open(allreferences_path, 'w')

        for filepath in glob.glob(os.path.join(self.lxrsummaries_dir, '*.sum')):
            fname = Path(filepath).stem
            
            with open(filepath) as fs:
                summary = fs.readline()
                allsummaries_file.write("%s\n" % summary)

            ref_path = os.path.join(self.ref_dir, fname + ".tgt")

            with open(ref_path) as fr:
                abstract = fr.readline()
                allreferences_file.write("%s\n" % abstract)

        allsummaries_file.close()
        allreferences_file.close()

        files2rouge.run(allsummaries_path, allreferences_path)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

    def test3_2(self):
        print("test 3.2 folder2rouge lexrank summaries")
        # saveto_path = os.path.join(self.saveto_dir, "lxr_folder2rouge.txt")
        rouge = folder2rouge(self.lxrsummaries_dir, self.ref_dir)
        rouge.run()#saveto=saveto_path)
        # placeholder, this output cannot be checked easily
        self.assertEqual(1, 1)

if __name__ == '__main__':
    unittest.main()