pubmed/parse_articles.py

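"""Parses PubMed .nxml articles into plain-text abstract (.tgt), source
(.src), and merged (.mgd) files, and writes a random-sentence baseline
summary (.sum) for each parsed article.

Usage (assumes the .nxml files live under ./xml next to this script):
    python parse_articles.py
"""
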
import os
import shutil
import random
import sys

import nltk
from bs4 import BeautifulSoup


def parse_body(soup, whole_body):
    """Extracts the article body (or just its first section) as plain text."""
    if whole_body:
        body_tags = soup.find_all('body')
    else:
        # The first section ('Sec1') is typically the introduction
        body_tags = soup.find_all('sec', {'id': 'Sec1'})
    if not body_tags:
        return None

    soup_body = BeautifulSoup(str(body_tags[0]), 'html.parser')
    paragraphs = soup_body.find_all('p')
    body = ''
    for p in paragraphs:
        # get_text() drops inline markup such as reference tags
        body += p.get_text() + '\n'
    return body


def parse_file(file_path, whole_body):
    """Parses one .nxml file into (abstract, body, intro) text.

    Returns None if the article has no abstract; exactly one of body/intro
    is populated, depending on whole_body.
    """
    # Reads the file as a single string
    with open(file_path, 'r') as f:
        content = f.read()

    # Prepares the text for parsing
    soup = BeautifulSoup(content, features='html.parser')

    # Reads the abstract as clean text
    ab_tag = soup.find('abstract')
    if not ab_tag:
        return None

    # The abstract is usually a single paragraph
    abstract = ''
    for p in ab_tag.find_all('p'):
        abstract += p.get_text() + '\n'

    body = parse_body(soup, whole_body)
    if whole_body:
        return abstract, body, None
    else:
        return abstract, None, body


def remove_short_paragraphs(text, strip_newlines):
    """Drops paragraphs of 20 words or fewer; returns None if nothing survives."""
    if text:
        paragraphs = text.split('\n')
        good_pars = [p for p in paragraphs if len(p.split()) > 20]
        if good_pars:
            if strip_newlines:
                return ''.join(good_pars) + '\n'
            else:
                return '\n'.join(good_pars) + '\n'
    return None


def random_summary(PARSED_DIR, filename, tokenizer):
    """Writes a baseline summary built from randomly chosen sentences."""
    abstract_path = os.path.join(PARSED_DIR, 'abstract', filename + '.tgt')
    merged_path = os.path.join(PARSED_DIR, 'merged', filename + '.mgd')
    with open(abstract_path, 'r') as abstract:
        abs_text = abstract.read().replace('\n', '. ').replace('..', '. ')
    with open(merged_path, 'r') as mgd:
        mgd_text = mgd.read().replace('\n', '. ').replace('..', '. ')
    # Fraction of the merged text (by character count) taken up by the abstract
    pct_sum = len(abs_text) / len(mgd_text)

    # Picks that fraction of sentences at random, preserving document order
    mgd_sents = tokenizer.tokenize(mgd_text)
    indices = set(random.sample(range(len(mgd_sents)), round(pct_sum * len(mgd_sents))))
    rand_sum = ' '.join(s for i, s in enumerate(mgd_sents) if i in indices)

    summary_path = os.path.join(PARSED_DIR, 'random_summary', filename + '.sum')
    with open(summary_path, 'w') as sum_file:  # 'sum_file' avoids shadowing built-in sum()
        sum_file.write(rand_sum + '\n')


def run_parser():
    XML_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'xml'))
    PARSED_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'parsed_articles'))
    # XML_DIR = '/data/corpora/pubmed_xml_subset/'
    # PARSED_DIR = '/data/corpora/pubmed_parsed/'
    print('Path to XML files: %s' % XML_DIR)
    print('Path to parsed PubMed files: %s' % PARSED_DIR)

    body_type = input('Parse the whole body section of each article or just the body introduction? '
                      '[w=whole body, j=just intro]: ')
    if body_type == 'w':
        PARSED_DIR = os.path.join(PARSED_DIR, 'with_whole_bodies')
        whole_body = True
    elif body_type == 'j':
        PARSED_DIR = os.path.join(PARSED_DIR, 'with_just_intros')
        whole_body = False
    else:
        sys.exit("Error: must enter 'w' or 'j'.")

    os.makedirs(PARSED_DIR, exist_ok=True)
    os.makedirs(os.path.join(PARSED_DIR, 'abstract'), exist_ok=True)
    os.makedirs(os.path.join(PARSED_DIR, 'body'), exist_ok=True)
    os.makedirs(os.path.join(PARSED_DIR, 'merged'), exist_ok=True)
    os.makedirs(os.path.join(PARSED_DIR, 'random_summary'), exist_ok=True)

    n_files = input('Number of files to parse [press Enter to parse all]: ')
    if n_files == '':
        n_files = -1  # -1 means parse every file found
    else:
        n_files = int(n_files)
    count = 0
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    skipped_files = 0
    for root, dirs, files in os.walk(XML_DIR, topdown=True):
        for file in files:
            if skipped_files and skipped_files % 10 == 0:
                print('Skipped %d files' % skipped_files, end='\r')
            if not file.endswith('.nxml'):
                continue

            # Skips xml files that have already been parsed
            file_path = os.path.join(root, file)
            filename = file[:-5]  # strips the '.nxml' extension
            tgt_path = os.path.join(PARSED_DIR, 'abstract', filename + '.tgt')
            if os.path.exists(tgt_path):
                continue

            # Extracts the text
            contents = parse_file(file_path, whole_body)
            if not contents:
                continue

            abstract, body, intro = contents
            abstract = remove_short_paragraphs(abstract, strip_newlines=True)
            body = remove_short_paragraphs(body, strip_newlines=False)
            intro = remove_short_paragraphs(intro, strip_newlines=False)
            if not abstract:
                skipped_files += 1
                continue
            if whole_body and not body:
                skipped_files += 1
                continue
            if not whole_body and not intro:
                skipped_files += 1
                continue

            # Writes the abstract, the source text, and their concatenation
            source_text = body if whole_body else intro
            src_path = os.path.join(PARSED_DIR, 'body', filename + '.src')
            mgd_path = os.path.join(PARSED_DIR, 'merged', filename + '.mgd')
            with open(tgt_path, 'w') as tgt:
                tgt.write(abstract)
            with open(src_path, 'w') as src:
                src.write(source_text)
            with open(mgd_path, 'w') as mgd:
                mgd.write(abstract + source_text)

            random_summary(PARSED_DIR, filename, tokenizer)

            count += 1
            if count % 100 == 0:
                print('Number of files parsed: %d' % count)
            if count == n_files:
                break
        if count == n_files:
            break

    if os.path.exists('__pycache__'):
        shutil.rmtree('__pycache__')
    if count < n_files:
        print('Only %d files could be parsed.' % count)
    else:
        print('Successfully parsed %d files.' % count)


if __name__ == "__main__":
    run_parser()