Diff of /pubmed/parse_articles.py [000000] .. [2d4573]


b/pubmed/parse_articles.py
import os
import random
import shutil
import sys

import nltk
from bs4 import BeautifulSoup


def parse_body(soup, whole_body):
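    # Returns the article body (whole_body=True) or just the introduction
    # section as plain text, one paragraph per line; None if no matching
    # tag is found.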
    if whole_body:
        body_tags = soup.find_all('body')
    else:
        body_tags = soup.find_all('sec', {'id': 'Sec1'})
    if not body_tags:
        return None

    # Strips markup (including reference tags) and concatenates paragraphs
    body = ''
    for p in body_tags[0].find_all('p'):
        body += p.get_text() + '\n'
    return body


def parse_file(file_path, whole_body):
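    # Parses one .nxml file. Returns (abstract, body, None) when whole_body
    # is True, (abstract, None, intro) otherwise, or None when the file has
    # no <abstract> tag. A minimal sketch of the JATS-style markup this
    # parser expects (tag names taken from the lookups below):
    #   <article>
    #     <abstract><p>...</p></abstract>
    #     <body><sec id="Sec1"><p>...</p></sec></body>
    #   </article>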
    # Reads the file as a single string
    with open(file_path, 'r') as f:
        content = f.read()

    # Prepares text for parsing
    soup = BeautifulSoup(content, features='html.parser')

    # Reads the abstract as clean text
    ab_tag = soup.find('abstract')
    if not ab_tag:
        return None

    # The abstract is usually a single paragraph
    abstract = ''
    for p in ab_tag.find_all('p'):
        abstract += p.get_text() + '\n'

    body = parse_body(soup, whole_body)
    if whole_body:
        return abstract, body, None
    return abstract, None, body


def remove_short_paragraphs(text, strip_newlines):
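    # Drops paragraphs of 20 words or fewer. Returns the survivors joined
    # by spaces (strip_newlines=True) or newlines, or None if none survive.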
    if text:
        # Keeps only paragraphs longer than 20 words
        good_pars = [p for p in text.split('\n') if len(p.split()) > 20]
        if good_pars:
            # A space separator keeps words from running together when
            # paragraphs are collapsed onto one line
            if strip_newlines:
                return ' '.join(good_pars) + '\n'
            return '\n'.join(good_pars) + '\n'
    return None


def random_summary(PARSED_DIR, filename, tokenizer):
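    # Random baseline: samples sentences from the merged article until the
    # summary matches the abstract's share of the text (by character count)
    # and writes them to random_summary/<filename>.sum.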
    abstract_path = os.path.join(PARSED_DIR, 'abstract', filename + '.tgt')
    merged_path = os.path.join(PARSED_DIR, 'merged', filename + '.mgd')
    with open(abstract_path, 'r') as abstract:
        abs_text = abstract.read().replace('\n', '. ').replace('..', '. ')
    with open(merged_path, 'r') as mgd:
        mgd_text = mgd.read().replace('\n', '. ').replace('..', '. ')

    # Fraction of the article classified as summary (by character count)
    pct_sum = len(abs_text) / len(mgd_text)

    # Classifies sentences at random, preserving their original order
    mgd_sents = tokenizer.tokenize(mgd_text)
    indices = set(random.sample(range(len(mgd_sents)), round(pct_sum * len(mgd_sents))))
    rand_sum = ' '.join(sent for i, sent in enumerate(mgd_sents) if i in indices)

    summary_path = os.path.join(PARSED_DIR, 'random_summary', filename + '.sum')
    # 'sum_file' avoids shadowing the built-in sum()
    with open(summary_path, 'w') as sum_file:
        sum_file.write(rand_sum + '\n')


def run_parser():
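    # Walks XML_DIR for .nxml files and, for each parsable article, writes
    # the abstract (.tgt), the body or intro (.src), their concatenation
    # (.mgd), and a random baseline summary (.sum) under PARSED_DIR.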
    XML_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'xml'))
    PARSED_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'parsed_articles'))
    # XML_DIR = '/data/corpora/pubmed_xml_subset/'
    # PARSED_DIR = '/data/corpora/pubmed_parsed/'
    print('Path to XML files: %s' % XML_DIR)
    print('Path to parsed PubMed files: %s' % PARSED_DIR)

    body_type = input('Parse the whole body section of each article or just the body introduction? '
                      '[w=whole body, j=just intro]: ')
    if body_type == 'w':
        PARSED_DIR = os.path.join(PARSED_DIR, 'with_whole_bodies')
        whole_body = True
    elif body_type == 'j':
        PARSED_DIR = os.path.join(PARSED_DIR, 'with_just_intros')
        whole_body = False
    else:
        sys.exit("Error: Must input 'w' or 'j'.")

    os.makedirs(PARSED_DIR, exist_ok=True)
    os.makedirs(os.path.join(PARSED_DIR, 'abstract'), exist_ok=True)
    os.makedirs(os.path.join(PARSED_DIR, 'body'), exist_ok=True)
    os.makedirs(os.path.join(PARSED_DIR, 'merged'), exist_ok=True)
    os.makedirs(os.path.join(PARSED_DIR, 'random_summary'), exist_ok=True)

    n_files = input('Number of files to parse [press Enter to parse all]: ')
    n_files = -1 if n_files == '' else int(n_files)

    count = 0
    skipped_files = 0
    # Sentence tokenizer for the random baseline (requires the NLTK punkt model)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for root, dirs, files in os.walk(XML_DIR, topdown=True):
        for file in files:
            if skipped_files % 10 == 0:
                print('skipped {} files'.format(skipped_files), end='\r')
            if not file.endswith('.nxml'):
                continue

            # Skips xml files that have already been parsed
            file_path = os.path.join(root, file)
            filename = file[:-5]
            tgt_path = os.path.join(PARSED_DIR, 'abstract', filename + '.tgt')
            if os.path.exists(tgt_path):
                continue

            # Extracts text
            contents = parse_file(file_path, whole_body)
            if not contents:
                continue

            abstract, body, intro = contents
            abstract = remove_short_paragraphs(abstract, strip_newlines=True)
            body = remove_short_paragraphs(body, strip_newlines=False)
            intro = remove_short_paragraphs(intro, strip_newlines=False)
            if not abstract:
                skipped_files += 1
                continue
            if whole_body and not body:
                skipped_files += 1
                continue
            if not whole_body and not intro:
                skipped_files += 1
                continue

            # Writes the abstract, the body (or intro), and their concatenation
            src_path = os.path.join(PARSED_DIR, 'body', filename + '.src')
            mgd_path = os.path.join(PARSED_DIR, 'merged', filename + '.mgd')
            with open(tgt_path, 'w') as tgt:
                tgt.write(abstract)
            with open(src_path, 'w') as src:
                src.write(body if whole_body else intro)
            with open(mgd_path, 'w') as mgd:
                mgd.write(abstract + (body if whole_body else intro))

            random_summary(PARSED_DIR, filename, tokenizer)

            count += 1
            if count % 100 == 0:
                print('Number of files parsed: %d' % count)
            if count == n_files:
                break
        if count == n_files:
            break

    if os.path.exists('__pycache__'):
        shutil.rmtree('__pycache__')
    if count < n_files:
        print('Only %d files could be parsed.' % count)
    else:
        print('Successfully parsed %d files.' % count)


if __name__ == "__main__":
    run_parser()
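
# Usage sketch (assumes xml/ sits next to this script and that the NLTK
# punkt model has already been downloaded):
#   python -c "import nltk; nltk.download('punkt')"
#   python parse_articles.py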