from lxml import etree
import requests
import argparse
import json
import os
from tqdm import tqdm
# NCBI E-utilities ESearch endpoint, restricted to title-field matching
BASE_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&field=title&term="{}"'
def extract_ref_author(ref_text):
    ''' Extract the first author (text before the first "." and ",") from a reference. '''
    author = ref_text.split('.')[0]
    if ',' in author:
        author = author.split(', ')[0]
    return author
def extract_ref_year(ref_text):
    ''' Extract the publication year: the token just before the last ";". '''
    year = ref_text[:ref_text.rfind(';')]
    year = year[year.rfind(' ')+1:]
    return year
def extract_ref_title(ref_text):
    ''' Extract the title: the text between the author list (first ".") and
    the last "." before the journal and publication details. '''
    ref_title = ref_text
    if ref_title[-1] == '.':
        ref_title = ref_title[:-1]
    ref_title = ref_title[ref_title.find('.')+2:]
    ref_title = ref_title[:1+ref_title.rfind('.')]
    return ref_title
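# Worked example (illustrative, assuming a Vancouver-style reference string):
#   ref = 'Smith J, Doe A. A study of something. Lancet 1999; 353:123-7.'
#   extract_ref_author(ref)  ->  'Smith J'
#   extract_ref_title(ref)   ->  'A study of something.'
#   extract_ref_year(ref)    ->  '1999'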
def request_IDs(query):
    ''' Fetch the list of PubMed IDs matching a given ESearch query. '''
    try:
        url_request = BASE_URL.format(query)
        page = requests.get(url_request).content
        root = etree.fromstring(page)
        return [item.text for item in root.xpath("//Id")]
    except (requests.RequestException, etree.XMLSyntaxError):
        return []
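# Illustrative ESearch exchange: per the NCBI E-utilities response format,
# a match returns XML of the shape
#   <eSearchResult>...<IdList><Id>12345678</Id></IdList>...</eSearchResult>
# so request_IDs(...) would return ['12345678'] here, while network or
# parse failures yield an empty list instead of raising.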
def scrape_references(ref_path, ids_path, start, end, batch_size=5, verbose=True):
    ''' Scrape PubMed IDs for UpToDate references between start and end indices. '''
    # Read pre-scraped PubMed IDs so already-known IDs are not written twice
    with open(ids_path, 'r') as f_in:
        scraped_ids = set(line.strip() for line in f_in)
    print(f'Loaded {len(scraped_ids)} scraped PubMed IDs.')
    print(f'Scraping PubMed IDs for references {start} to {end} in batches of {batch_size}...')
with open(ref_path, 'r') as f_in, open(ids_path, 'a') as f_out:
batch = []
batch_idx = start // batch_size
num_matches = 0
for i, line in enumerate(f_in):
# Scrape IDs for references between start and end indices
if i < start:
continue
if end is not None and i >= end:
break
            # Parse the UpToDate reference and build its ESearch sub-query
try:
ref_text = json.loads(line)['ref_text']
if ref_text == '':
continue
ref_title = extract_ref_title(ref_text)
ref_year = extract_ref_year(ref_text)
ref_author = extract_ref_author(ref_text)
if ref_title == '' or ref_year == '' or ref_author == '':
continue
query = f'(({ref_author} [Author - First]) AND ({ref_title} [Title]) AND ({ref_year} [Date - Publication]))'
            except (json.JSONDecodeError, KeyError, TypeError, IndexError):
print(f'Batch {batch_idx}: Error loading reference: {line}')
continue
# Continue until you fill up the batch
batch.append(query)
if len(batch) < batch_size:
continue
# Request PubMed IDs for batch using first author, paper title and pub date
batch_idx += 1
batch_query = ' OR '.join(batch)
matching_ids = request_IDs(batch_query)
num_found = len(matching_ids)
num_matches += num_found
            matching_ids = [pm_id for pm_id in matching_ids if pm_id not in scraped_ids]
            scraped_ids.update(matching_ids)
if verbose:
print(f'Batch {batch_idx}: Adding {len(matching_ids)} of {num_found} found PubMed IDs: {matching_ids}')
if len(matching_ids) > 0:
f_out.write('\n'.join(matching_ids) + '\n')
            batch = []
        # Flush the last, possibly partial batch (fewer than batch_size refs)
        if batch:
            batch_idx += 1
            matching_ids = request_IDs(' OR '.join(batch))
            num_matches += len(matching_ids)
            matching_ids = [pm_id for pm_id in matching_ids if pm_id not in scraped_ids]
            if matching_ids:
                f_out.write('\n'.join(matching_ids) + '\n')
    if end is not None:
        print(f'Finished scraping PubMed IDs for references {start} to {end}.')
        print(f'Found {num_matches} PubMed IDs for the {end-start} references in range.')
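# Illustrative batch query (hypothetical references): with batch_size=2, the
# term sent to ESearch OR-joins the per-reference sub-queries, e.g.
#   ((Smith J [Author - First]) AND (A study of something. [Title]) AND (1999 [Date - Publication]))
#   OR ((Doe A [Author - First]) AND (Another study. [Title]) AND (2001 [Date - Publication]))
# so one request can match PubMed IDs for several references at once.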
def identify_references(ids_path, data_path, source):
    ''' Flag PubMed articles that are referenced in UpToDate or Cochrane. '''
    # Load the scraped reference identifiers:
    # PubMed IDs for UpToDate, lower-cased DOIs for Cochrane
    pubmed_ids = set()
    if source == "uptodate":
        with open(ids_path, 'r') as f_in:
            pubmed_ids = {line.strip() for line in f_in}
    elif source == "cochrane":
        with open(ids_path, 'r') as f_in:
            data = json.load(f_in)
        pubmed_ids = {article["DOI"].lower() for article in data}
else:
raise ValueError("Invalid source")
    # Add a {source}_reference field to each article, set to 1 if referenced
    out_path = os.path.splitext(data_path)[0] + f'_{source}.jsonl'
    key = source + '_reference'
with open(data_path, 'r') as f_in, open(out_path, 'w') as f_out:
        # Stream the data file line by line (too big to load at once);
        # total is an approximate article count, used only for the progress bar
        for line in tqdm(f_in, total=4700000):
article = json.loads(line)
article[key] = 0
try:
if source == "uptodate":
pm_id = article['externalids']['pubmed']
elif source == "cochrane":
pm_id = article['externalids']['doi'].lower()
else:
raise ValueError("Invalid source")
if pm_id in pubmed_ids:
article[key] = 1
            except (KeyError, TypeError, AttributeError):
                # Missing or null externalids / pubmed / doi: leave the flag at 0
                pass
f_out.write(json.dumps(article) + '\n')
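# Illustrative input/output (hypothetical article record): a line such as
#   {"externalids": {"pubmed": "12345678", "doi": "10.1000/xyz"}, ...}
# is re-emitted with the flag appended, e.g. "uptodate_reference": 1 if its
# PubMed ID was scraped from UpToDate, and 0 otherwise.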
def main():
parser = argparse.ArgumentParser()
    parser.add_argument(
        "--refs_path",
        type=str,
        required=False,
        help="UpToDate references file, without PubMed IDs (for mode 'scrape').")
parser.add_argument(
"--ids_path",
type=str,
required=True,
help="PubMed IDs of Uptodate References.")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="PubMed articles file (required for mode 'identify').")
parser.add_argument(
"--start",
type=int,
required=False,
default=0,
help="Start index of references to divide.")
parser.add_argument(
"--end",
type=int,
required=False,
default=None,
help="End index of references to divide.")
parser.add_argument(
"--batch_size",
type=int,
required=False,
default=1,
help="Batch size for E-Utils API calls.")
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=['scrape', 'identify'],
        help="Mode: 'scrape' or 'identify'.")
    parser.add_argument(
        "--source",
        type=str,
        required=False,
        choices=['cochrane', 'uptodate'],
        help="Source: 'cochrane' or 'uptodate'.")
args = parser.parse_args()
print(args)
    # identify: Flag PubMed articles referenced in UpToDate or Cochrane
if args.mode == 'identify':
identify_references(args.ids_path, args.data_path, args.source)
    # scrape: Scrape PubMed IDs for UpToDate references
elif args.mode == 'scrape':
scrape_references(
args.refs_path, args.ids_path, args.start, args.end,
batch_size=args.batch_size, verbose=True)
else:
raise ValueError(f'Invalid mode: {args.mode}')
if __name__ == "__main__":
main()
# Example 'identify' invocation (note: --source is required by identify_references):
# python3 reference.py --ids_path /pure-mlo-scratch/data/pubmed/uptodate_pubmed_ids.jsonl --data_path /pure-mlo-scratch/data/pubmed/pubmed_processed_mesh_train.jsonl --mode identify --source uptodate
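# Example 'scrape' invocation (illustrative refs_path; other paths as above):
# python3 reference.py --refs_path /pure-mlo-scratch/data/pubmed/uptodate_refs.jsonl --ids_path /pure-mlo-scratch/data/pubmed/uptodate_pubmed_ids.jsonl --mode scrape --start 0 --end 1000 --batch_size 5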