gap-replay/pubmed/augment.py

import os
import argparse
import json
import requests
import time
import xml.etree.ElementTree as ET
from tqdm import tqdm


def get_mesh_tags(article):
    """
    Parse an article XML element to extract its PMID, MeSH tags, and
    Publication Types.

    Parameters
    ----------
    article: xml.etree.ElementTree.Element
        XML of the article

    Returns
    -------
    str
        PubMed id (PMID) of the article
    list
        list of MeSH tags
    list
        list of Publication Types
    """
    pm_id = next(article.iter("PMID")).text
    meshs = [mesh[0].text for mesh in article.iter("MeshHeading")]
    publication_types = [pt.text for pt in article.iter("PublicationType")]
    return pm_id, meshs, publication_types
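
# A minimal sketch of the XML shape get_mesh_tags expects, following the
# standard EFetch layout for db=pubmed (PMID, MeshHeading/DescriptorName and
# PublicationType elements). The PMID and tag values below are made up for
# illustration:
#
#   article = ET.fromstring(
#       "<PubmedArticle><MedlineCitation>"
#       "<PMID>12345</PMID>"
#       "<MeshHeadingList><MeshHeading>"
#       "<DescriptorName>Humans</DescriptorName>"
#       "</MeshHeading></MeshHeadingList>"
#       "</MedlineCitation></PubmedArticle>")
#   get_mesh_tags(article)  # -> ('12345', ['Humans'], [])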


def update_metadata_mesh(records, pm_ids):
    """
    Update records by fetching their MeSH tags and Publication Types, using the
    EFetch API of the Entrez E-utilities
    (https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch).

    Parameters
    ----------
    records: list
        list of PubMed article records (dicts), with associated information and metadata
    pm_ids: list
        list of PubMed ids

    Returns
    -------
    list
        records enriched with MeSH tags and Publication Types
    """
    api_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": pm_ids}
    max_retry = 10
    retry = 0
    while True:
        try:
            response = requests.post(api_url, params=params)
            root = ET.fromstring(response.content)
            new_records = []
            for article_xml in root:
                pm_id, meshs, publication_types = get_mesh_tags(article_xml)
                for record in records:
                    if record['externalids']['pubmed'] == pm_id:
                        record.update({"mesh": meshs, "publicationtype": publication_types})
                        new_records += [record]
            break
        except Exception as oops:
            if isinstance(oops, StopIteration):
                print(response.url)
            retry += 1
            if retry >= max_retry:
                raise RuntimeError("Error communicating with the server: %s" % oops)
            print(f"Error communicating with server ({retry}):", oops)
            time.sleep(1)
    return new_records
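
# NCBI asks clients to stay under ~3 requests per second without an API key
# (10 per second with one). If a key is available, it could be threaded into
# the EFetch call above; the NCBI_API_KEY environment variable name here is an
# assumption for illustration, not something this script defines:
#
#   params = {"db": "pubmed", "id": pm_ids}
#   api_key = os.environ.get("NCBI_API_KEY")  # hypothetical env var
#   if api_key:
#       params["api_key"] = api_key           # documented EFetch parameter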


def augment_PMC(source_path, dest_path, log_path, batch_size=100):
    """
    Augment every PMC record in `source_path` with MeSH tags and Publication
    Types, appending the enriched records to `dest_path`. The last processed
    line number is checkpointed in `log_path`, so interrupted runs can resume
    where they left off.
    """
    print('=' * 100 + f'\n4. Scraping MeSH and Publication Type tags for PMC articles in {source_path}.\n')
    if not os.path.exists(log_path):
        with open(log_path, 'w') as f:
            f.write('0')
    with open(log_path, 'r') as f_in:
        start_line = int(f_in.read())
    # Align the two files, to start from the first article not yet processed
    with open(source_path, 'r') as f_in:
        for _ in tqdm(range(start_line), desc="Skipping already processed articles..."):
            next(f_in)
        with open(dest_path, 'a') as f_out:
            count = 0
            records, pm_ids = [], []
            for i, line in tqdm(enumerate(f_in), initial=start_line, total=4900000, desc="Processing new articles..."):
                record = json.loads(line)
                pm_id = record['externalids']['pubmed']
                if pm_id is None:
                    continue
                records += [record]
                pm_ids += [pm_id]
                count += 1
                # Scrape metadata in batches
                if count % batch_size == 0:
                    new_records = update_metadata_mesh(records, pm_ids)
                    for new_record in new_records:
                        if new_record:
                            f_out.write(json.dumps(new_record) + '\n')
                    records, pm_ids = [], []
                    # Checkpoint the last processed line
                    with open(log_path, 'w') as f_log:
                        f_log.write(str(start_line + i + 1))
            # Last (partial) batch
            if count % batch_size > 0:
                new_records = update_metadata_mesh(records, pm_ids)
                for new_record in new_records:
                    f_out.write(json.dumps(new_record) + '\n')
    print(f"Finished scraping MeSH metadata for {count} articles.\n")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_path",
        type=str,
        required=True,
        help="Input PubMedCentral file, after the first metadata scraping step (i.e. after running PMC_load.py).")
    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Output PubMedCentral file, enriched with MeSH tags and Publication Types.")
    parser.add_argument(
        "--log_path",
        type=str,
        required=True,
        help="Text file storing the last processed line."
    )
    args = parser.parse_args()
    print(args)
    augment_PMC(args.input_path, args.output_path, log_path=args.log_path)


if __name__ == "__main__":
    main()
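
# Example invocation (file names are placeholders):
#
#   python augment.py \
#       --input_path pmc_loaded.jsonl \
#       --output_path pmc_augmented.jsonl \
#       --log_path augment_progress.log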