import os
import argparse
import json
import requests
import time
import xml.etree.ElementTree as ET
from tqdm import tqdm
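# Step 4 of the metadata pipeline: enrich PubMed Central records (produced by
# PMC_load.py) with MeSH tags and Publication Types fetched from the Entrez
# E-utilities EFetch API.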
def get_mesh_tags(article):
"""
Parse an article xml to extract all MeSH tags and Publication Types
Parameters
----------
article: xml.etree.ElementTree.Element
xml of the article
Returns
-------
list
list of MeSH tags
list
list of Publication Types
"""
pm_id = next(article.iter("PMID")).text
meshs = [mesh[0].text for mesh in article.iter("MeshHeading")]
publication_types = [pt.text for pt in article.iter("PublicationType")]
return pm_id, meshs, publication_types
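# Illustrative example (hypothetical snippet, not part of the pipeline): a
# PubmedArticle element as returned by EFetch looks roughly like
#
#   <PubmedArticle><MedlineCitation>
#     <PMID>12345</PMID>
#     <Article><PublicationTypeList>
#       <PublicationType>Journal Article</PublicationType>
#     </PublicationTypeList></Article>
#     <MeshHeadingList>
#       <MeshHeading><DescriptorName>Humans</DescriptorName></MeshHeading>
#     </MeshHeadingList>
#   </MedlineCitation></PubmedArticle>
#
# for which get_mesh_tags(ET.fromstring(...)) would return
# ("12345", ["Humans"], ["Journal Article"]).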
def update_metadata_mesh(records, pm_ids):
"""
Update records by fetching their MeSH tags and Publication Tags, using the
EFetch API of the Entrez E-utilities (https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch)
Parameters
----------
records: dict
dictionary of PubMed articles, with associated information and metadata
pm_ids: list
list of PubMed ids
"""
    api_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    # EFetch expects a comma-delimited list of UIDs; POSTing the parameters
    # keeps long id lists out of the URL
    params = {"db": "pubmed", "id": ",".join(str(i) for i in pm_ids), "retmode": "xml"}
    max_retry = 10
    retry = 0
    while True:
        try:
            response = requests.post(api_url, data=params)
            root = ET.fromstring(response.content)
            new_records = []
            for article_xml in root:
                pm_id, meshs, publication_types = get_mesh_tags(article_xml)
                # Match the fetched article back to its input record by PMID
                for record in records:
                    if record['externalids']['pubmed'] == pm_id:
                        record.update({"mesh": meshs, "publicationtype": publication_types})
                        new_records.append(record)
                        break
            # Success: exit the retry loop with the enriched records
            return new_records
        except Exception as oops:
            if isinstance(oops, StopIteration):
                # get_mesh_tags found no PMID in the response; log the query
                print(response.url)
            retry += 1
            if retry >= max_retry:
                raise RuntimeError(f"Error communicating with the server: {oops}")
            print(f"Error communicating with server ({retry}):", oops)
            time.sleep(1)
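# Illustrative usage, assuming the S2ORC-style record layout used below:
#   records = [{"externalids": {"pubmed": "12345"}, "title": "..."}]
#   update_metadata_mesh(records, ["12345"])
#   # -> [{"externalids": {...}, "title": "...", "mesh": [...], "publicationtype": [...]}]
# Note: without an API key, NCBI asks clients to stay under ~3 requests per
# second; one request per 100-article batch keeps this script well within that.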
def augment_PMC(source_path, dest_path, log_path, batch_size=100):
    """
    Enrich every PubMed Central record in source_path with MeSH tags and
    Publication Types, appending the results to dest_path. The index of the
    last processed line is checkpointed in log_path, so an interrupted run
    resumes where it left off.
    """
    print('=' * 100 + f'\n4. Scraping MeSH and Publication Type tags for PMC articles in {source_path}.\n')
if not os.path.exists(log_path):
with open(log_path, 'w') as f:
f.write('0')
with open(log_path, 'r') as f_in:
start_line = int(f_in.read())
# aligning the two files, to start from the first article not yet processed
with open(source_path, 'r') as f_in:
for _ in tqdm(range(start_line), desc="Skipping already processed articles..."):
next(f_in)
    with open(dest_path, 'a') as f_out:
        count = 0
        records, pm_ids = [], []
        # total is only a rough estimate of the dump size, for the progress bar
        for i, line in tqdm(enumerate(f_in), initial=start_line, total=4900000, desc="Processing new articles..."):
            record = json.loads(line)
            pm_id = record['externalids']['pubmed']
            if pm_id is None:
                continue
            records.append(record)
            pm_ids.append(pm_id)
            count += 1
            # Scrape metadata in batches
            if len(records) == batch_size:
                new_records = update_metadata_mesh(records, pm_ids)
                for new_record in new_records:
                    if new_record:
                        f_out.write(json.dumps(new_record) + '\n')
                records, pm_ids = [], []
                # Checkpoint only after the batch has been flushed, so a crash
                # never skips records that were read but not yet written
                with open(log_path, 'w') as f_log:
                    f_log.write(str(start_line + i + 1))
        # last (partial) batch
        if records:
            new_records = update_metadata_mesh(records, pm_ids)
            for new_record in new_records:
                if new_record:
                    f_out.write(json.dumps(new_record) + '\n')
print(f"Finished scraping MeSH metadata for {count} articles.\n")
def main():
parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_path",
        type=str,
        required=True,
        help="Input PubMed Central file, after the first metadata scraping step (i.e. after running PMC_load.py).")
    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Output PubMed Central file, enriched with MeSH tags and Publication Types.")
    parser.add_argument(
        "--log_path",
        type=str,
        required=True,
        help="Text file storing the index of the last processed line."
    )
args = parser.parse_args()
print(args)
augment_PMC(args.input_path, args.output_path, log_path=args.log_path)
if __name__ == "__main__":
main()
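# Example invocation (the script and file names below are placeholders):
#   python augment_PMC_mesh.py --input_path pmc_metadata.jsonl \
#       --output_path pmc_metadata_mesh.jsonl --log_path mesh_log.txt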