import os
import shutil
import random
import sys
import nltk
from bs4 import BeautifulSoup
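
# Parses PubMed .nxml articles into parallel files per article: the abstract
# (.tgt), the body or just its introduction (.src), the merged abstract+body
# text (.mgd), and a random-baseline summary (.sum).
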
def parse_body(soup, whole_body):
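    """Returns the text of the whole <body> (whole_body=True) or of the
    first section, <sec id="Sec1">, which is typically the introduction;
    returns None if neither is found."""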
    if whole_body:
        body_tags = soup.find_all('body')
    else:
        body_tags = soup.find_all('sec', {'id': 'Sec1'})
if not body_tags:
return None
    # Searches for paragraphs within the selected tag only
    paragraphs = body_tags[0].find_all('p')
body = ''
    # Concatenates the plain text of each paragraph (markup is stripped)
    for p in paragraphs:
        body += p.get_text() + '\n'
return body
def parse_file(file_path, whole_body):
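    """Parses one .nxml file. Returns (abstract, body, None) when whole_body
    is set, (abstract, None, intro) otherwise, or None if the article has
    no abstract."""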
# Reads file as string
    # Reads the file as a single string
    with open(file_path, 'r') as f:
        content = f.read()
# Prepares text for parsing
soup = BeautifulSoup(content, features='html.parser')
# Reads abstract as clean text
ab_tag = soup.find('abstract')
if not ab_tag:
return None
    # The abstract is usually a single paragraph
    ab_paragraphs = ab_tag.find_all('p')
    abstract = ''
    for p in ab_paragraphs:
        abstract += p.get_text() + '\n'
body = parse_body(soup, whole_body)
if whole_body:
return abstract, body, None
else:
return abstract, None, body
def remove_short_paragraphs(text, strip_newlines):
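    """Drops paragraphs of 20 words or fewer, then rejoins the rest with
    spaces (strip_newlines=True) or newlines (strip_newlines=False);
    returns None if nothing survives."""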
if text:
# Removes short paragraphs
paragraphs = text.split('\n')
good_pars = []
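        # The 20-word cutoff presumably filters headings, captions, and
        # other non-prose fragments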
for p in paragraphs:
if len(p.split()) > 20:
good_pars.append(p)
        if good_pars:
            if strip_newlines:
                # Joins with spaces so paragraph boundaries don't glue words together
                clean_text = ' '.join(good_pars) + '\n'
            else:
                clean_text = '\n'.join(good_pars) + '\n'
            return clean_text
return None
def random_summary(PARSED_DIR, filename, tokenizer):
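    """Writes a random-baseline summary: samples sentences from the merged
    file so that the summary-to-article length ratio roughly matches the
    real abstract's."""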
abstract_path = os.path.join(PARSED_DIR, 'abstract', filename + '.tgt')
merged_path = os.path.join(PARSED_DIR, 'merged', filename + '.mgd')
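    # Newlines mark paragraph breaks; converts them to sentence boundaries
    # (the '..' pass cleans up paragraphs that already end with a period)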
with open(abstract_path, "r") as abstract:
abs_text = abstract.read().replace("\n", ". ").replace("..", ". ")
with open(merged_path, "r") as mgd:
mgd_text = mgd.read().replace("\n", ". ").replace("..", ". ")
    # Fraction of the article (by character count) to select as the summary
    pct_sum = len(abs_text) / len(mgd_text)
    # Selects that fraction of sentences uniformly at random, keeping
    # them in document order and separating them with spaces
    mgd_sents = tokenizer.tokenize(mgd_text)
    k = round(pct_sum * len(mgd_sents))
    indices = set(random.sample(range(len(mgd_sents)), k))
    rand_sum = ' '.join(s for i, s in enumerate(mgd_sents) if i in indices)
summary_path = os.path.join(PARSED_DIR, 'random_summary', filename + '.sum')
    with open(summary_path, 'w') as sum_file:  # 'sum' would shadow the built-in
        sum_file.write(rand_sum + '\n')
def run_parser():
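    """Walks XML_DIR for .nxml files and writes the parsed pieces under
    PARSED_DIR, skipping articles that were already parsed or that lack
    usable text."""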
XML_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'xml'))
PARSED_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'parsed_articles'))
# XML_DIR = '/data/corpora/pubmed_xml_subset/'
# PARSED_DIR = '/data/corpora/pubmed_parsed/'
print('Path to XML files: %s' % XML_DIR)
print('Path to parsed PubMed files: %s' % PARSED_DIR)
    body_type = input('Parse the whole body section of each article or just the body introduction? '
                      '[w=whole body, j=just intro]: ')
if body_type == 'w':
PARSED_DIR = os.path.join(PARSED_DIR, 'with_whole_bodies')
whole_body = True
elif body_type == 'j':
PARSED_DIR = os.path.join(PARSED_DIR, 'with_just_intros')
whole_body = False
else:
        sys.exit("Error: must enter 'w' or 'j'.")
os.makedirs(PARSED_DIR, exist_ok=True)
os.makedirs(os.path.join(PARSED_DIR, 'abstract'), exist_ok=True)
os.makedirs(os.path.join(PARSED_DIR, 'body'), exist_ok=True)
os.makedirs(os.path.join(PARSED_DIR, 'merged'), exist_ok=True)
os.makedirs(os.path.join(PARSED_DIR, 'random_summary'), exist_ok=True)
n_files = input('Number of files to parse [press Enter to parse all]: ')
if n_files == '':
n_files = -1
else:
n_files = int(n_files)
count = 0
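    # Loads NLTK's Punkt sentence tokenizer; run nltk.download('punkt') once
    # beforehand if the model is not installed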
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
skipped_files = 0
for root, dirs, files in os.walk(XML_DIR, topdown=True):
for file in files:
            if skipped_files and skipped_files % 10 == 0:
                print('Skipped {} files'.format(skipped_files), end='\r')
if file.endswith('.nxml'):
# Check if xml file has already been parsed
file_path = os.path.join(root, file)
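                # Strips the '.nxml' extension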
filename = file[:-5]
tgt_path = os.path.join(PARSED_DIR, 'abstract', filename + '.tgt')
if os.path.exists(tgt_path):
continue
else:
# Extracts text
contents = parse_file(file_path, whole_body)
if not contents:
continue
else:
                        abstract, body, intro = contents
abstract = remove_short_paragraphs(abstract, strip_newlines=True)
body = remove_short_paragraphs(body, strip_newlines=False)
intro = remove_short_paragraphs(intro, strip_newlines=False)
if not abstract:
skipped_files += 1
continue
if whole_body and not body:
skipped_files += 1
continue
if not whole_body and not intro:
skipped_files += 1
continue
# Writes abstract, body, and both to files
src_path = os.path.join(PARSED_DIR, 'body', filename + '.src')
mgd_path = os.path.join(PARSED_DIR, 'merged', filename + '.mgd')
                        with open(tgt_path, 'w') as tgt:
                            tgt.write(abstract)
                        with open(src_path, 'w') as src:
                            src.write(body if whole_body else intro)
                        with open(mgd_path, 'w') as mgd:
                            mgd.write(abstract + (body if whole_body else intro))
random_summary(PARSED_DIR, filename, tokenizer)
count += 1
if count % 100 == 0:
print('Number of files parsed: %d' % count)
if count == n_files:
break
if count == n_files:
break
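    # Removes the bytecode cache Python leaves in the working directory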
if os.path.exists('__pycache__'):
shutil.rmtree('__pycache__')
if count < n_files:
print('Only %d files could be parsed.' % count)
else:
print('Successfully parsed %d files.' % count)
if __name__ == "__main__":
run_parser()