Diff of /tests/pubmed_tests.py [000000] .. [2d4573]

Switch to side-by-side view

--- a
+++ b/tests/pubmed_tests.py
@@ -0,0 +1,126 @@
+import os
+import subprocess
+import sys
+import unittest
+
+# XML_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'pubmed', 'xml'))
+# PARSED_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'pubmed', 'parsed_articles'))
+# XML_DIR = '/data/corpora/pubmed_xml_subset'
+# PARSED_DIR = '/data/corpora/pubmed_parsed'
+
+XML_DIR = '../pubmed/xml'
+PARSED_DIR = '../pubmed/parsed_articles'
+
+if not os.path.exists(XML_DIR):
+    command = 'Error: Directory of PubMed XML files does not exist at ' + XML_DIR + '.'
+    sys.exit(command)
+if not os.path.exists(PARSED_DIR):
+    command = 'Error: Directory of parsed PubMed articles does not exist at ' + PARSED_DIR + '.'
+    sys.exit(command)
+
+
+class tests(unittest.TestCase):
+    def setUp(self):
+        self.PARSED_DIR = PARSED_DIR
+        self.XML_DIR = XML_DIR
+
+class t1(tests):
+    # Concerning number of articles in directories
+    def test1_1(self):
+        print("Number of articles whose introductions have been parsed:")
+        command = 'ls ' + os.path.join(self.PARSED_DIR, 'with_just_intros', 'body') + ' | wc -l'
+        os.system(command)
+
+    def test1_2(self):
+        print("Number of articles whose whole bodies have been parsed:")
+        command = 'ls ' + os.path.join(self.PARSED_DIR, 'with_whole_bodies', 'body') + ' | wc -l'
+        os.system(command)
+
+    @unittest.skipIf("t1.test1_3" not in sys.argv, "Test 1_3 must be run explicitly due to runtime.")
+    def test1_3(self):
+        # Takes a few minutes
+        print("Number of XML article files (takes a while to run):")
+        command = 'find ' + self.XML_DIR + ' -type f | wc -l'
+        os.system(command)
+
+
+class t2(tests):
+    # Concerning number of words in files
+    def test2_1(self, write=True):
+        counts = []
+        body_dir = os.path.join(self.PARSED_DIR, 'with_just_intros', 'body')
+        for file in os.listdir(body_dir)[:1000]:
+            if file.endswith('.src'):
+                with open(os.path.join(body_dir, file), "rt") as body_file:
+                    data = body_file.read()
+                    words = data.split()
+                    counts.append(len(words))
+        avg = round(sum(counts) / len(counts))
+        if write:
+            print("Average number of words in introductory section:", avg)
+        return avg
+
+    def test2_2(self, write=True):
+        counts = []
+        body_dir = os.path.join(self.PARSED_DIR, 'with_whole_bodies', 'body')
+        for file in os.listdir(body_dir)[:1000]:
+            if file.endswith('.src'):
+                with open(os.path.join(body_dir, file), "rt") as body_file:
+                    data = body_file.read()
+                    words = data.split()
+                    counts.append(len(words))
+        avg = round(sum(counts) / len(counts))
+        if write:
+            print("Mean number of words in whole body section:", avg)
+        return avg
+
+    def test2_3(self, write=True, whole_body=False):
+        counts = []
+        if whole_body:
+            abstract_dir = os.path.join(self.PARSED_DIR, 'with_whole_bodies', 'abstract')
+        else:
+            abstract_dir = os.path.join(self.PARSED_DIR, 'with_just_intros', 'abstract')
+        for file in os.listdir(abstract_dir)[:1000]:
+            if file.endswith('.tgt'):
+                 with open(os.path.join(abstract_dir, file), "rt") as body_file:
+                    data = body_file.read()
+                    words = data.split()
+                    counts.append(len(words))
+        avg = round(sum(counts) / len(counts))
+        if write:
+            print("Mean number of words in abstract:", avg)
+        return avg
+
+    def test2_4(self):
+        src_words = self.test2_1(write=False)
+        tgt_words = self.test2_3(write=False)
+        print("Ratio of body to abstract length (with just body intros):", round(src_words/tgt_words, 1))
+
+    def test2_5(self):
+        src_words = self.test2_2(write=False)
+        tgt_words = self.test2_3(write=False, whole_body=True)
+        print("Ratio of body to abstract length (with whole bodies):", round(src_words/tgt_words, 1))
+
+
+class t3(tests):
+    # Concerning sizes of directories
+    @unittest.skipIf("t3.test3_1" not in sys.argv, "Test 3_1 must be run explicitly due to runtime.")
+    def test3_1(self):
+        print("Size of directory of articles with just body intros (takes a while to run):")
+        command = 'du -sh ' + os.path.join(self.PARSED_DIR, 'with_just_intros')
+        os.system(command)
+
+    @unittest.skipIf("t3.test3_2" not in sys.argv, "Test 3_2 must be run explicitly due to runtime.")
+    def test3_2(self):
+        print("Size of directory of articles with whole bodies (takes a while to run):")
+        command = 'du -sh ' + os.path.join(self.PARSED_DIR, 'with_whole_bodies')
+        os.system(command)
+
+    @unittest.skipIf("t3.test3_3" not in sys.argv, "Test 3_3 must be run explicitly due to runtime.")
+    def test3_3(self):
+        print("Size of XML file directory (takes a while to run):")
+        command = 'du -sh ' + os.path.join(self.XML_DIR)
+        os.system(command)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file