# gap-replay/pubmed/reference.py

from lxml import etree
import requests
import argparse
import json
import os
from tqdm import tqdm

# E-utilities e-search endpoint, restricted to title-field matches in PubMed
BASE_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&field=title&term="{}"'
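
# For reference, the e-search endpoint returns XML shaped roughly as below
# (abridged sketch, not the full response envelope), which is why
# request_IDs() simply collects every <Id> node:
#
#   <eSearchResult>
#     <Count>1</Count>
#     <IdList>
#       <Id>12345678</Id>
#     </IdList>
#   </eSearchResult>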


def extract_ref_author(ref_text):
    ''' Extract the first author from a reference string. '''
    author = ref_text.split('.')[0]
    if ',' in author:
        author = author.split(', ')[0]
    return author


def extract_ref_year(ref_text):
    ''' Extract the publication year (the token just before the last ";"). '''
    year = ref_text
    year = year[:year.rfind(';')]
    year = year[year.rfind(' ') + 1:]
    return year


def extract_ref_title(ref_text):
    ''' Extract the article title (the text between the first and last "."). '''
    ref_title = ref_text
    if ref_title[-1] == '.':
        ref_title = ref_title[:-1]
    ref_title = ref_title[ref_title.find('.') + 2:]
    ref_title = ref_title[:1 + ref_title.rfind('.')]
    return ref_title
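
# A minimal sketch of the Vancouver-style citation format the three extractors
# assume, on a hypothetical reference:
#
#   ref = 'Smith J, Doe A. A study of things. J Things 2020; 12:34-56.'
#   extract_ref_author(ref)  # -> 'Smith J'
#   extract_ref_year(ref)    # -> '2020'
#   extract_ref_title(ref)   # -> 'A study of things.'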


def request_IDs(query):
    ''' Fetch the list of PubMed IDs for a given e-search query. '''
    try:
        url_request = BASE_URL.format(query)
        page = requests.get(url_request).content
        content = etree.fromstring(page)
        root = content.getroottree()
        result = [item.text for item in root.xpath("//Id")]
        return result
    except Exception:
        # Treat network or parse failures as "no matches"
        return []
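
# Usage sketch (the fielded query mirrors the one built in scrape_references()
# below; the author, title and year are hypothetical):
#
#   q = ('((Smith J [Author - First]) AND (A study of things. [Title])'
#        ' AND (2020 [Date - Publication]))')
#   ids = request_IDs(q)  # e.g. ['12345678'], or [] on no match / error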


def scrape_references(ref_path, ids_path, start, end, batch_size=5, verbose=True):
    ''' Scrape PubMed IDs for UpToDate references via the E-utils API. '''
    # Read pre-scraped PubMed IDs so known IDs are not appended twice
    with open(ids_path, 'r') as f_in:
        scraped_ids = set([line.strip() for line in f_in])
    print(f'Loaded {len(scraped_ids)} scraped PubMed IDs.')
    print(f'Scraping PubMed IDs for references {start} to {end} in batches of {batch_size}...')
    with open(ref_path, 'r') as f_in, open(ids_path, 'a') as f_out:
        batch = []
        batch_idx = start // batch_size
        num_matches = 0
        for i, line in enumerate(f_in):
            # Only scrape IDs for references between the start and end indices
            if i < start:
                continue
            if end is not None and i >= end:
                break
            # Build a fielded e-search query from the reference's first author,
            # title and publication year
            try:
                ref_text = json.loads(line)['ref_text']
                if ref_text == '':
                    continue
                ref_title = extract_ref_title(ref_text)
                ref_year = extract_ref_year(ref_text)
                ref_author = extract_ref_author(ref_text)
                if ref_title == '' or ref_year == '' or ref_author == '':
                    continue
                query = f'(({ref_author} [Author - First]) AND ({ref_title} [Title]) AND ({ref_year} [Date - Publication]))'
            except Exception:
                print(f'Batch {batch_idx}: Error loading reference: {line}')
                continue
            # Accumulate queries until the batch is full
            # (NB: a trailing partial batch at the end of the slice is dropped)
            batch.append(query)
            if len(batch) < batch_size:
                continue
            # Request PubMed IDs for the whole batch in a single API call
            batch_idx += 1
            batch_query = ' OR '.join(batch)
            matching_ids = request_IDs(batch_query)
            num_found = len(matching_ids)
            num_matches += num_found
            # Keep only the IDs not seen before, then record them
            matching_ids = [id for id in matching_ids if id not in scraped_ids]
            scraped_ids = scraped_ids.union(set(matching_ids))
            if verbose:
                print(f'Batch {batch_idx}: Adding {len(matching_ids)} of {num_found} found PubMed IDs: {matching_ids}')
            if len(matching_ids) > 0:
                f_out.write('\n'.join(matching_ids) + '\n')
            batch = []
    if end:
        print(f'Finished scraping PubMed IDs for references {start} to {end}.')
        print(f'Found PubMed IDs for {num_matches} out of {end - start} articles.')
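
# With batch_size > 1, the per-reference queries are OR-ed into a single
# e-search request, e.g. (hypothetical references):
#
#   ((Smith J [Author - First]) AND (A study of things. [Title]) AND
#    (2020 [Date - Publication])) OR ((Doe A [Author - First]) AND ...)
#
# so one API call can match several references at once.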


def identify_references(ids_path, data_path, source):
    ''' Flag the PubMed articles that are referenced in UpToDate or Cochrane. '''
    # Load the reference identifiers: PubMed IDs for UpToDate, DOIs for Cochrane
    pubmed_ids = set()
    if source == "uptodate":
        with open(ids_path, 'r') as f_in:
            for line in f_in:
                pubmed_ids.add(line.strip())
    elif source == "cochrane":
        with open(ids_path, 'r') as f_in:
            data = json.load(f_in)
            pubmed_ids = {article["DOI"].lower() for article in data}
    else:
        raise ValueError("Invalid source")
    # Add a <source>_reference flag to the PubMed articles referenced in source
    # (os.path.splitext is safer than split('.') for paths containing dots)
    out_path = os.path.splitext(data_path)[0] + f'_{source}.jsonl'
    key = source + '_reference'
    with open(data_path, 'r') as f_in, open(out_path, 'w') as f_out:
        # Stream the data file line by line (it is too large to load at once)
        for line in tqdm(f_in, total=4700000):
            article = json.loads(line)
            article[key] = 0
            try:
                if source == "uptodate":
                    pm_id = article['externalids']['pubmed']
                elif source == "cochrane":
                    pm_id = article['externalids']['doi'].lower()
                else:
                    raise ValueError("Invalid source")
                if pm_id in pubmed_ids:
                    article[key] = 1
            except (KeyError, TypeError, AttributeError):
                # Missing or null external IDs: leave the flag at 0
                pass
            f_out.write(json.dumps(article) + '\n')
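
# Each line of data_path is assumed to be one JSON article record with an
# 'externalids' mapping, e.g. (hypothetical values):
#
#   {"title": "...", "externalids": {"pubmed": "12345678", "doi": "10.1000/xyz"}}
#
# After this pass, the record gains e.g. "uptodate_reference": 0 or 1.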


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--refs_path",
        type=str,
        required=False,
        help="UpToDate references file, without PubMed IDs (for mode 'scrape').")
    parser.add_argument(
        "--ids_path",
        type=str,
        required=True,
        help="PubMed IDs of UpToDate references.")
    parser.add_argument(
        "--data_path",
        type=str,
        required=True,
        help="PubMed articles file (for mode 'identify').")
    parser.add_argument(
        "--start",
        type=int,
        required=False,
        default=0,
        help="Start index of the references to scrape.")
    parser.add_argument(
        "--end",
        type=int,
        required=False,
        default=None,
        help="End index of the references to scrape.")
    parser.add_argument(
        "--batch_size",
        type=int,
        required=False,
        default=1,
        help="Batch size for E-utils API calls.")
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        help="Mode: 'scrape' or 'identify'.")
    parser.add_argument(
        "--source",
        type=str,
        required=False,
        help="Source: 'cochrane' or 'uptodate'.")
    args = parser.parse_args()
    print(args)
    # identify: flag PubMed articles referenced in the given source
    if args.mode == 'identify':
        identify_references(args.ids_path, args.data_path, args.source)
    # scrape: scrape PubMed IDs for UpToDate references
    elif args.mode == 'scrape':
        scrape_references(
            args.refs_path, args.ids_path, args.start, args.end,
            batch_size=args.batch_size, verbose=True)
    else:
        raise ValueError(f'Invalid mode: {args.mode}')


if __name__ == "__main__":
    main()

# Example (mode 'identify'; note that identify_references() requires --source):
# python3 reference.py --ids_path /pure-mlo-scratch/data/pubmed/uptodate_pubmed_ids.jsonl --data_path /pure-mlo-scratch/data/pubmed/pubmed_processed_mesh_train.jsonl --mode identify --source uptodate
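
# A hypothetical 'scrape' invocation (placeholder paths, not from the original;
# note the parser requires --data_path even though mode 'scrape' ignores it):
# python3 reference.py --refs_path refs.jsonl --ids_path uptodate_pubmed_ids.jsonl --data_path pubmed.jsonl --mode scrape --start 0 --end 1000 --batch_size 5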