a b/tests/pubmed_tests.py
1
import os
2
import sys
3
import unittest
4
5
# XML_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'pubmed', 'xml'))
6
# PARSED_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'pubmed', 'parsed_articles'))
7
# XML_DIR = '/data/corpora/pubmed_xml_subset'
8
# PARSED_DIR = '/data/corpora/pubmed_parsed'
9
10
# Corpus locations. Both paths are relative, so they are resolved against the
# current working directory of the process running the tests.
XML_DIR = '../pubmed/xml'
PARSED_DIR = '../pubmed/parsed_articles'

# Abort at import time with a readable message when a corpus directory is
# missing. isdir() (rather than exists()) also rejects a regular file sitting
# at the expected path, which the tests below could not use anyway.
if not os.path.isdir(XML_DIR):
    sys.exit('Error: Directory of PubMed XML files does not exist at ' + XML_DIR + '.')
if not os.path.isdir(PARSED_DIR):
    sys.exit('Error: Directory of parsed PubMed articles does not exist at ' + PARSED_DIR + '.')
19
20
21
class tests(unittest.TestCase):
    """Base test case that exposes the corpus directories to its subclasses."""

    def setUp(self):
        """Copy the module-level directory constants onto the instance."""
        self.PARSED_DIR = PARSED_DIR
        self.XML_DIR = XML_DIR
25
26
class t1(tests):
    """Report how many articles each corpus directory contains.

    Every test prints a heading and then the output of a shell pipeline;
    none of them asserts anything about the reported value.
    """

    @staticmethod
    def _quoted(path):
        """Return *path* escaped for safe interpolation into a shell command."""
        import shlex  # local import keeps this class self-contained
        return shlex.quote(path)

    @staticmethod
    def _report(heading, command):
        """Print *heading*, then run *command* through the shell.

        stdout is flushed first so the heading cannot end up after the
        child's output when stdout is block-buffered (file or pipe).
        """
        print(heading)
        sys.stdout.flush()
        os.system(command)

    def test1_1(self):
        """Print the number of entries in the intro-only ``body`` directory."""
        body_dir = os.path.join(self.PARSED_DIR, 'with_just_intros', 'body')
        self._report(
            "Number of articles whose introductions have been parsed:",
            'ls ' + self._quoted(body_dir) + ' | wc -l',
        )

    def test1_2(self):
        """Print the number of entries in the whole-body ``body`` directory."""
        body_dir = os.path.join(self.PARSED_DIR, 'with_whole_bodies', 'body')
        self._report(
            "Number of articles whose whole bodies have been parsed:",
            'ls ' + self._quoted(body_dir) + ' | wc -l',
        )

    @unittest.skipIf("t1.test1_3" not in sys.argv, "Test 1_3 must be run explicitly due to runtime.")
    def test1_3(self):
        """Print the number of regular files found under the XML directory.

        Skipped unless ``t1.test1_3`` appears in ``sys.argv``; the original
        author notes that the count takes a few minutes.
        """
        self._report(
            "Number of XML article files (takes a while to run):",
            'find ' + self._quoted(self.XML_DIR) + ' -type f | wc -l',
        )
44
45
46
class t2(tests):
    """Report word-count statistics for the parsed article files.

    The statistics are sampled: only the first 1000 entries returned by
    ``os.listdir`` are considered, and of those only files with the expected
    suffix are read. Words are whitespace-separated tokens.

    NOTE(review): test2_1, test2_2 and test2_3 return their result so that
    test2_4 and test2_5 can reuse it. unittest on Python 3.11+ emits a
    DeprecationWarning for test methods that return a non-None value; the
    return values are kept here for backward compatibility.
    """

    # Upper bound on the number of directory entries inspected per statistic.
    _SAMPLE_SIZE = 1000

    def _mean_word_count(self, directory, suffix):
        """Return the rounded mean word count of sampled files in *directory*.

        Args:
            directory: Directory whose first ``_SAMPLE_SIZE`` entries are sampled.
            suffix: Only file names ending in this suffix are read.

        Returns:
            The mean number of whitespace-separated words, passed through
            ``round``.

        Fails the test with an explanatory message, instead of raising
        ZeroDivisionError, when the sample contains no matching file.
        """
        counts = []
        for name in os.listdir(directory)[:self._SAMPLE_SIZE]:
            if not name.endswith(suffix):
                continue
            with open(os.path.join(directory, name), "rt") as handle:
                counts.append(len(handle.read().split()))
        if not counts:
            self.fail("No '%s' files found among the first %d entries of %s"
                      % (suffix, self._SAMPLE_SIZE, directory))
        return round(sum(counts) / len(counts))

    def _print_ratio(self, label, src_words, tgt_words):
        """Print *label* followed by ``src_words / tgt_words`` to one decimal.

        Fails the test when *tgt_words* is 0 rather than dividing by zero.
        """
        if tgt_words == 0:
            self.fail("Mean abstract length is 0 words; cannot compute a ratio.")
        print(label, round(src_words / tgt_words, 1))

    def test2_1(self, write=True):
        """Return the mean word count of intro-only bodies (``.src`` files).

        Args:
            write: When true, also print the result.
        """
        body_dir = os.path.join(self.PARSED_DIR, 'with_just_intros', 'body')
        avg = self._mean_word_count(body_dir, '.src')
        if write:
            print("Average number of words in introductory section:", avg)
        return avg

    def test2_2(self, write=True):
        """Return the mean word count of whole bodies (``.src`` files).

        Args:
            write: When true, also print the result.
        """
        body_dir = os.path.join(self.PARSED_DIR, 'with_whole_bodies', 'body')
        avg = self._mean_word_count(body_dir, '.src')
        if write:
            print("Mean number of words in whole body section:", avg)
        return avg

    def test2_3(self, write=True, whole_body=False):
        """Return the mean word count of abstracts (``.tgt`` files).

        Args:
            write: When true, also print the result.
            whole_body: Read the abstracts stored under ``with_whole_bodies``
                instead of those under ``with_just_intros``.
        """
        variant = 'with_whole_bodies' if whole_body else 'with_just_intros'
        abstract_dir = os.path.join(self.PARSED_DIR, variant, 'abstract')
        avg = self._mean_word_count(abstract_dir, '.tgt')
        if write:
            print("Mean number of words in abstract:", avg)
        return avg

    def test2_4(self):
        """Print the body-to-abstract length ratio for intro-only articles."""
        src_words = self.test2_1(write=False)
        tgt_words = self.test2_3(write=False)
        self._print_ratio(
            "Ratio of body to abstract length (with just body intros):",
            src_words,
            tgt_words,
        )

    def test2_5(self):
        """Print the body-to-abstract length ratio for whole-body articles."""
        src_words = self.test2_2(write=False)
        tgt_words = self.test2_3(write=False, whole_body=True)
        self._print_ratio(
            "Ratio of body to abstract length (with whole bodies):",
            src_words,
            tgt_words,
        )
102
103
104
class t3(tests):
    """Report the on-disk size of each corpus directory via ``du -sh``.

    Each test is skipped unless its dotted name (for example ``t3.test3_1``)
    appears in ``sys.argv``. The tests only print; they assert nothing.
    """

    @staticmethod
    def _print_size(heading, directory):
        """Print *heading*, then the ``du -sh`` summary for *directory*.

        The path is shell-quoted so directories containing spaces or shell
        metacharacters are handled, and stdout is flushed first so the heading
        cannot end up after the child's output when stdout is block-buffered.
        """
        import shlex  # local import keeps this class self-contained
        print(heading)
        sys.stdout.flush()
        os.system('du -sh ' + shlex.quote(directory))

    @unittest.skipIf("t3.test3_1" not in sys.argv, "Test 3_1 must be run explicitly due to runtime.")
    def test3_1(self):
        """Print the size of the intro-only parsed-article directory."""
        self._print_size(
            "Size of directory of articles with just body intros (takes a while to run):",
            os.path.join(self.PARSED_DIR, 'with_just_intros'),
        )

    @unittest.skipIf("t3.test3_2" not in sys.argv, "Test 3_2 must be run explicitly due to runtime.")
    def test3_2(self):
        """Print the size of the whole-body parsed-article directory."""
        self._print_size(
            "Size of directory of articles with whole bodies (takes a while to run):",
            os.path.join(self.PARSED_DIR, 'with_whole_bodies'),
        )

    @unittest.skipIf("t3.test3_3" not in sys.argv, "Test 3_3 must be run explicitly due to runtime.")
    def test3_3(self):
        """Print the size of the XML directory."""
        self._print_size(
            "Size of XML file directory (takes a while to run):",
            self.XML_DIR,
        )
123
124
125
# Run the suite when the module is executed as a script; individual tests can
# be selected by passing their dotted names (e.g. ``t1.test1_3``) as arguments.
if __name__ == '__main__':
    unittest.main()