'''
Script for the 12 of 16 guideline scrapers that are Chrome-based (Firefox is supported experimentally).
Sources supported:
AAFP - American Academy of Family Physicians
CCO - Cancer Care Ontario
CDC - Centers for Disease Control and Prevention
CMA - Canadian Medical Association
CPS - Canadian Paediatric Society
Drugs.com
GuidelineCentral
ICRC - International Committee of the Red Cross
IDSA - Infectious Diseases Society of America
MAGIC - Making GRADE the Irresistible Choice
SPOR - Strategy for Patient-Oriented Research
WHO - World Health Organization
The other 4 of 16 scrapers are TypeScript-based (MayoClinic, NICE, RCH, WikiDoc) and can be found in scrapers/.

Important note to users:
Scraping logic will inevitably rot over time, and probably quickly.
These are best-effort contemporary (November 2023) reconstructions of our original data
collection effort, which took place some months earlier. As you can see, in places
the logic is fairly hacky.
We will support interested users in the immediate period after the code
release, but it's impossible to imagine supporting the scraping logic
beyond that.
Put brutally: if you are reading this after about January 2024, it's almost
inevitable that you'll have to patch up some of this logic.
Best Wishes,
Antoine, Alexandre, and Kyle
'''
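# Example invocations (a sketch; the script filename and paths are illustrative and may
# differ in your checkout):
#   python scrape.py --path ./guidelines --sources who cdc aafp
#   python scrape.py --sources drugs \
#       --chrome_binary_location /usr/bin/google-chrome \
#       --chrome_driver_location /usr/local/bin/chromedriver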
# ------------------- Imports ------------------- #
import functools
import subprocess
import argparse
import os
import time
import json
import hashlib
import tarfile
from tqdm import tqdm
import markdownify
import scipdf
import requests
import PyPDF2
from selenium.common import exceptions
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import NoSuchElementException
# ------------------- Selenium Setup ------------------- #
DEFAULT_DRIVER = 'chrome' # Switch to 'firefox' if you want to use Firefox instead of Chrome
def setup_firefox_driver(download_path=None, binary_location=None, **kwargs):
    '''
    Set up Firefox driver instance with download path.
    Extra keyword arguments (e.g. include_experimental, driver_location) are accepted
    for signature compatibility with setup_chrome_driver and are ignored.
    '''
    profile = FirefoxOptions()
    profile.binary_location = binary_location or "/usr/bin/firefox"
    profile.set_preference("browser.download.folderList", 2)
    if download_path:
        profile.set_preference("browser.download.dir", download_path)
    profile.set_preference("browser.download.useDownloadDir", True)
    # One comma-separated MIME list: repeated set_preference calls on the same key
    # would overwrite each other, keeping only the last value.
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                           "application/octet-stream,application/pdf,application/x-pdf")
    profile.set_preference("pdfjs.disabled", True)
    driver = webdriver.Firefox(options=profile)
    return driver
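# Minimal usage sketch for the Firefox driver (assumes Firefox at /usr/bin/firefox and a
# geckodriver discoverable by Selenium; the download path is illustrative):
#   driver = setup_firefox_driver(download_path="/tmp/guideline_pdfs")
#   driver.get("https://www.who.int")
#   driver.quit()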
def setup_chrome_driver(
include_experimental: bool = True,
download_path: str = None,
binary_location: str = None,
driver_location: str = None,
headless: bool = False,
eager_mode: bool = False
):
'''
Set up Chrome driver instance with download path.
'''
chrome_options = ChromeOptions()
if binary_location:
chrome_options.binary_location = binary_location
if eager_mode:
chrome_options.page_load_strategy = 'eager'
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--dns-prefetch-disable")
chrome_options.add_argument("--disable-gpu")
if include_experimental:
assert download_path is not None, "Download path must be provided if including experimental options."
chrome_options.add_experimental_option("prefs", {
"download.default_directory": download_path,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"safebrowsing.enabled": False,
})
chrome_params = dict(options=chrome_options)
if driver_location:
webdriver_service = Service(driver_location)
chrome_params.update(service=webdriver_service)
driver = webdriver.Chrome(**chrome_params)
return driver
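# Minimal usage sketch for the Chrome driver (paths are illustrative; with recent Selenium
# versions the driver is resolved automatically when driver_location is omitted):
#   driver = setup_chrome_driver(
#       include_experimental=True,
#       download_path="/tmp/guideline_pdfs",
#       headless=True,
#   )
#   driver.get("https://www.cdc.gov")
#   driver.quit()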
# ------------------- Helper Functions ------------------- #
@functools.lru_cache
def start_grobid():
    # Launch the GROBID server used by scipdf; lru_cache ensures it is started only once.
    process = subprocess.Popen("./serve_grobid.sh")
def pdf2text(src_path, dest_path):
'''
Convert all pdf files in src_path to text, then save as jsonl to dest_path.
'''
start_grobid()
guidelines = []
pdf_files = [file for file in os.listdir(src_path) if file.endswith(".pdf")]
for file in tqdm(pdf_files):
try:
article = scipdf.parse_pdf_to_dict(os.path.join(src_path, file))
            text = str(article["abstract"]) + "\n" + "\n".join(
                ["# " + sec['heading'] + "\n" + sec["text"] for sec in article["sections"]])
guideline = {"file_name": file,
"text": text,
"doi": article["doi"]}
guidelines.append(guideline)
except Exception as e:
print('Exception when parsing file: ', file, e)
with open(dest_path, 'w') as f:
for guideline in guidelines:
f.write(f"{json.dumps(guideline)}\n")
return guidelines
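# Usage sketch (assumes GROBID can be launched via ./serve_grobid.sh and that the source
# directory already contains downloaded PDFs; paths are illustrative):
#   guidelines = pdf2text(src_path="./who/pdfs", dest_path="./raw/who.jsonl")
#   print(f"Parsed {len(guidelines)} guidelines")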
def scrape_SPOR_links():
    '''
    Helper to scrape CCO, CPS and SPOR links from a SPOR PDF
    (SPOR_links.pdf, expected in the current working directory).
    '''
    key = '/Annots'
    uri = '/URI'
    ank = '/A'
    links = []
    with open("SPOR_links.pdf", 'rb') as PDFFile:
        PDF = PyPDF2.PdfReader(PDFFile)
        pages = len(PDF.pages)
        print(f"Extracting links from the {pages} pages of SPOR_links.pdf...")
        for page in tqdm(range(pages)):
            pageObject = PDF.pages[page].get_object()
            if key in pageObject.keys():
                for a in pageObject[key]:
                    u = a.get_object()
                    if ank in u.keys() and uri in u[ank].keys():
                        links.append(u[ank][uri])
    links = list(set(links))
    return links
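# Usage sketch (requires SPOR_links.pdf in the current working directory, as used by the
# CCO, CPS and SPOR scrapers below):
#   links = scrape_SPOR_links()
#   cco_links = [link for link in links if 'cancercareontario' in link]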
# ------------------- Scraper Classes ------------------- #
class Scraper():
    '''
    Base web scraper class to scrape clinical guidelines from a single source.
    Saves article links to {path}/{source}/links.txt, downloaded PDFs (if any) to
    {path}/{source}/pdfs/, and scraped articles to {path}/raw/{source}.jsonl.
    '''
def __init__(
self,
source: str,
path: str,
pdfs: bool = False,
chrome_binary_location: str = None,
chrome_driver_location: str = None
):
setup_driver = setup_firefox_driver if DEFAULT_DRIVER == 'firefox' else setup_chrome_driver
self.path = path
self.source = source
self.source_path = os.path.join(self.path, self.source)
os.makedirs(self.source_path, exist_ok=True)
self.pdfs = pdfs
self.pdf_path = os.path.join(self.source_path, 'pdfs')
self.links_path = os.path.join(self.source_path, 'links.txt')
articles_dir = os.path.join(self.path, 'raw')
os.makedirs(articles_dir, exist_ok=True)
self.articles_path = os.path.join(articles_dir, f'{source}.jsonl')
self.driver = setup_driver(
include_experimental=self.pdfs,
download_path=self.pdf_path if self.pdfs else None,
binary_location=chrome_binary_location,
driver_location=chrome_driver_location
)
def save_articles(self, articles, path=None):
'''
Save articles (list of dict) to self.articles_path (.jsonl)
'''
articles_path = self.articles_path if path is None else path
with open(articles_path, 'w') as f:
for article in articles:
f.write(f"{json.dumps(article)}\n")
def save_links(self, links):
'''
Save links (list of str) to self.links_path (.txt)
'''
links = list(set(links))
with open(self.links_path, 'w') as f:
for link in links:
f.write(f"{link}\n")
def reset_download_path(self):
        # Chrome-only: relies on a CDP command, so it will not work with the Firefox driver.
        self.driver.execute_cdp_cmd(
'Page.setDownloadBehavior',
{'behavior': 'allow', 'downloadPath': self.pdf_path})
def _scrape_links(self):
'''
Scrape links from source to self.links_path.
'''
raise NotImplementedError
def scrape_links(self):
'''
Scrape links from source to self.links_path.
'''
if os.path.exists(self.links_path):
with open(self.links_path, "r") as f:
return [i.strip() for i in f.readlines()]
else:
return self._scrape_links()
def scrape_articles(self, links):
'''
Scrape articles from links to self.path/{source}/{source}.jsonl.
'''
raise NotImplementedError
def scrape(self):
'''
Scrape articles to self.path/{source}/{source}.jsonl.
1 - Download links to source articles.
2 - Download articles from links.
3 - Convert PDFs to text (if necessary)
'''
print(f"Stage 1: Scraping article links from {self.source}...")
links = self.scrape_links()
unique_links = list(set(links))
print(f"Scraped {len(unique_links)} unique article links from {self.source}.")
print(f"Stage 2: Scraping articles from {self.source}...")
articles = self.scrape_articles(unique_links)
if self.pdfs:
print(f"Stage 3: Converting PDFs to text...")
articles = pdf2text(self.pdf_path, self.articles_path)
print(f"Saved {len(articles)} {self.source} articles to {self.articles_path}.")
self.driver.quit()
return articles
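# Programmatic usage sketch (the CLI at the bottom of this file wires this up automatically;
# the path is illustrative):
#   articles = AAFPScraper(path="./guidelines").scrape()
# New sources only need a subclass implementing _scrape_links() and scrape_articles(links).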
class AAFPScraper(Scraper):
'''
Source: AAFP - American Academy of Family Physicians (https://www.aafp.org/)
'''
def __init__(self, path, **kwargs):
super().__init__('aafp', path, **kwargs)
def _scrape_links(self):
self.driver.get("https://www.aafp.org/family-physician/patient-care/clinical-recommendations/recommendations-by-topic.html")
wait = WebDriverWait(self.driver, 10)
elements = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//a[(@class='adl-link') and (contains(@href, '/family-physician/patient-care/clinical-recommendations/recommendations-by-topic'))]")))
topics = list(set([href.get_attribute("href") for href in elements]))
links = []
for topic in tqdm(topics):
try:
self.driver.get(topic)
wait = WebDriverWait(self.driver, 10)
elements = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//a[(@class='adl-link') and (contains(@href, '/family-physician/patient-care/clinical-recommendations/all-clinical-recommendations'))]")))
links += [href.get_attribute("href") for href in elements]
except:
pass
links = list(set(links))
self.save_links(links)
return links
def scrape_articles(self, links):
articles = []
for link in tqdm(links):
try:
self.driver.get(link)
wait = WebDriverWait(self.driver, 10)
element = wait.until(EC.presence_of_element_located((
By.XPATH, "//div[@class='aem-Grid aem-Grid--12 aem-Grid--default--12 ']")))
content_text = element.text
if content_text == '':
continue
article = {"title": self.driver.title,
"text": markdownify.markdownify(content_text),
"url": link}
articles.append(article)
except:
pass
self.save_articles(articles)
return articles
class CCOScraper(Scraper):
'''
Source: CCO - Cancer Care Ontario
'''
def __init__(self, path, **kwargs):
super().__init__('cco', path, pdfs=True, **kwargs)
def _scrape_links(self):
links = scrape_SPOR_links()
links = [link for link in links if 'cancercareontario' in link]
self.save_links(links)
return links
def scrape_articles(self, links):
os.makedirs(self.pdf_path, exist_ok=True)
for link in tqdm(links):
try:
self.driver.get(link)
wait = WebDriverWait(self.driver, 10)
button = wait.until(EC.element_to_be_clickable((
By.XPATH, '//span[@class="file"]/a[@class="jquery-once-2-processed btn purple icon pdfBTN"]')))
self.reset_download_path()
button.click()
except:
pass
return []
class CDCScraper(Scraper):
'''
Source: CDC - Centers for Disease Control and Prevention (https://www.cdc.gov/)
'''
def __init__(self, path, **kwargs):
super().__init__('cdc', path, pdfs=True, **kwargs)
def _scrape_links(self):
url = "https://stacks.cdc.gov/cbrowse?pid=cdc%3A100&parentId=cdc%3A100&maxResults=100&start=0"
self.driver.get(url)
links = []
def next_page():
try:
continue_link = self.driver.find_element(By.PARTIAL_LINK_TEXT, 'Next')
continue_link.click()
time.sleep(5)
return True
except NoSuchElementException:
return False
search_expr = "//div[@class='search-result-row card']/div/div[1]/div/a"
wait = WebDriverWait(self.driver, 5)
hrefs = wait.until(EC.presence_of_all_elements_located((By.XPATH, search_expr)))
links.extend([a.get_attribute("href") for a in hrefs])
        while next_page():
            try:
                wait = WebDriverWait(self.driver, 5)
                hrefs = wait.until(EC.presence_of_all_elements_located((By.XPATH, search_expr)))
                links.extend([a.get_attribute("href") for a in hrefs])
            except exceptions.TimeoutException:
                # WebDriverWait raises TimeoutException (not NoSuchElementException)
                # when no result links load on the page.
                pass
self.save_links(links)
return links
def scrape_articles(self, links):
articles = []
for page in tqdm(links):
self.driver.get(page)
wait = WebDriverWait(self.driver, 5)
try:
button = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//button[@id='download-document-submit']")))
button[0].click()
except:
pass
#but = self.driver.find_element(By.XPATH, "//a[@id='not-link']")
#articles.append(but.get_attribute("href"))
self.save_articles(articles)
return articles
class CMAScraper(Scraper):
'''
Source: CMA - Canadian Medical Association (https://joulecma.ca/cpg/homepage)
'''
def __init__(self, path, **kwargs):
super().__init__('cma', path, **kwargs)
def dismiss_popup(self):
try:
dismiss_button = self.driver.find_element(By.XPATH, "//button[@class='agree-button eu-cookie-compliance-default-button btn btn-primary']")
dismiss_button.click()
time.sleep(1)
except NoSuchElementException:
pass
def _scrape_links(self):
links = []
errors = []
ids = [488, 1038, 1000, 1001, 1003, 1005, 1007, 1009, 1028, 1012, 1014,
1023, 1016, 1019, 1021, 1026, 1027, 1029, 1032, 1024, 1035, 1025,
1043, 998, 999, 1030, 1002, 1004, 1006, 1008, 1010, 1037, 1013,
1011, 1015, 1017, 1020, 1022, 1039, 1040, 1031, 1033, 1034, 1036,
1041, 1042]
for id in tqdm(ids):
try:
self.driver.get(f"https://joulecma.ca/cpg/homepage/browse-by/category/specialties/id/{id}")
self.dismiss_popup()
                wait = WebDriverWait(self.driver, 5)
                try:
                    # Categories with a paginator: visit every page and collect links.
                    wait.until(EC.presence_of_element_located((
                        By.XPATH, "//div[@class='pagination']")))
                    pages = self.driver.find_elements(By.XPATH, "//div[@class='pagination']/li/a")
                    for i, page in enumerate(pages):
                        wait = WebDriverWait(self.driver, 10)
                        self.driver.get(f"https://joulecma.ca/cpg/homepage/browse-by/category/specialties/id/{id}")
                        buttons = wait.until(EC.presence_of_all_elements_located((
                            By.XPATH, "//div[@class='pagination']/li/a")))
                        buttons[i].click()
                        time.sleep(3)
                        wait = WebDriverWait(self.driver, 10)
                        element = wait.until(EC.presence_of_all_elements_located((
                            By.XPATH, "//h5/a[@title='View fulltext guideline']")))
                        links.extend([a.get_attribute('href') for a in element])
                except exceptions.TimeoutException:
                    # Single-page categories have no paginator, which makes the wait above
                    # time out; scrape the guideline links from the page directly.
                    element = wait.until(EC.presence_of_all_elements_located((
                        By.XPATH, "//h5/a[@title='View fulltext guideline']")))
                    links.extend([a.get_attribute('href') for a in element])
except Exception as e:
errors.append(id)
pass
links = list(set(links))
self.save_links(links)
return links
def scrape_articles(self, links):
        '''
        Divides links into CMA (canada.ca) pages, CMA PDFs, and Choosing Wisely / other links;
        downloads only the CMA pages and CMA PDFs.
        '''
articles = []
links = ([link.replace("\n", "") for link in links if not ("choosing" in link.lower()) and not ("choisir" in link.lower())])
pdf_links = ([link for link in links if link.lower().endswith("pdf")])
links = ([link for link in links if not link.lower().endswith("pdf")])
cma_links = ([link for link in links if ("www.canada.ca" in link.lower())])
print('Stage 2A: Downloading CMA PDFs...')
self.driver.set_page_load_timeout(1)
pdfs = []
for pdf_link in tqdm(pdf_links):
try:
self.driver.get(pdf_link)
wait = WebDriverWait(self.driver, 10)
element = wait.until(EC.presence_of_element_located((
By.XPATH, "//div[@id='main-content']")))
inner_html = element.get_attribute('innerHTML')
pdfs.append({"title": self.driver.title,
"text": markdownify.markdownify(inner_html),
"url": pdf_link})
except:
pass
cma_pdfs_path = os.path.join(self.source_path, 'cma_pdfs.jsonl')
self.save_articles(pdfs, path=cma_pdfs_path)
articles.extend(pdfs)
print('Stage 2B: Downloading HTML CMA articles...')
cma = []
for cma_link in tqdm(cma_links):
try:
self.driver.get(cma_link)
wait = WebDriverWait(self.driver, 10)
element = wait.until(EC.presence_of_element_located((
By.XPATH, "//main[@property='mainContentOfPage']")))
inner_html = element.get_attribute('innerHTML')
cma.append({"title": self.driver.title,
"text": markdownify.markdownify(inner_html),
"url": cma_link})
except:
pass
articles.extend(cma)
self.save_articles(cma)
return articles
class CPSScraper(Scraper):
'''
Source: CPS - Canadian Paediatric Society
'''
def __init__(self, path, **kwargs):
super().__init__('cps', path, **kwargs)
def _scrape_links(self):
links = scrape_SPOR_links()
links = [link for link in links if 'cps.ca' in link]
self.save_links(links)
return links
def scrape_articles(self, links):
articles = []
for link in tqdm(links):
try:
self.driver.get(link)
wait = WebDriverWait(self.driver, 2)
button = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@class="cell main-body"]')))
article = {
'title': self.driver.title,
'text': markdownify.markdownify(button.get_attribute("innerHTML")).replace("\n\n", "\n"),
'link': link
}
articles.append(article)
except Exception as e:
pass
self.save_articles(articles)
return articles
class DrugsScraper(Scraper):
'''
Source: Drugs.com (https://www.drugs.com/)
'''
def __init__(self, path, **kwargs):
super().__init__('drugs', path, **kwargs)
def _scrape_links(self):
self.driver.get("https://www.drugs.com/dosage/")
wait = WebDriverWait(self.driver, 10)
elements = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//ul/li/a[contains(@href, '/dosage-')]")))
links = [el.get_attribute("href") for el in elements]
self.save_links(links)
return links
def scrape_articles(self, links):
articles = []
for link in tqdm(links):
try:
self.driver.get(link)
wait = WebDriverWait(self.driver, 10)
elements_inner = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//ul/li/a[contains(@href, '/dosage/')]")))
hrefs_art_clean = [el.get_attribute("href") for el in elements_inner]
for href_inner in hrefs_art_clean:
try:
self.driver.get(href_inner)
wait = WebDriverWait(self.driver, 10)
div_content = wait.until(EC.presence_of_element_located((By.ID, "content")))
content = "<div>" + div_content.get_attribute('innerHTML') + "</div>"
                        article = {
                            "title": self.driver.title,
                            "text": markdownify.markdownify(content),
                            "url": href_inner
                        }
articles.append(article)
except Exception as e:
print("Page-level: " + str(e))
except Exception as e:
print("Section-level: " + str(e))
self.save_articles(articles)
return articles
class GCScraper(Scraper):
'''
Source: GuidelineCentral (https://www.guidelinecentral.com/)
'''
def __init__(self, path, **kwargs):
super().__init__('guidelinecentral', path, **kwargs)
def _scrape_links(self):
url = "https://www.guidelinecentral.com/guidelines/?t=guideline&f=%7B%22sort%22%3A%7B%22type%22%3A%22relevance%22%2C%22order%22%3A%22desc%22%7D%2C%22range%22%3A%7B%22max%22%3A0%2C%22start%22%3A0%2C%22limit%22%3A20%7D%2C%22filters%22%3A%5B%7B%22searchType%22%3A%22guideline%22%2C%22name%22%3A%22Within%2010%20Years%20(1766)%22%2C%22type%22%3A%22within10Years%22%2C%22syntax%22%3A%22docPubDate%3A%5BNOW-10YEAR%20TO%20NOW%5D%22%7D%5D%2C%22state%22%3A%22guidelines_landing_search%22%2C%22term%22%3A%22contentType%3ADOCUMENT%22%7D"
self.driver.get(url)
links = []
count = 0
while True:
            print(f"Scraping results page {count}...")
try:
wait = WebDriverWait(self.driver, 30)
hrefs = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//div[@class='result-meta']/h3/a")))
link = [href.get_attribute("href") for href in hrefs]
links.extend(link)
next_page = wait.until(EC.presence_of_element_located((
By.XPATH, "//a[@class='page-link next-page-of-results']")))
next_page.click()
time.sleep(10)
count += 1
except:
print("No more pages")
break
links = list(set(links))
self.save_links(links)
return links
def scrape_articles(self, links):
articles = []
for link in tqdm(links):
try:
self.driver.get(link)
wait = WebDriverWait(self.driver, 10)
content = wait.until(EC.presence_of_element_located((By.XPATH, "//main[@class='site-main']")))
try:
title = content.find_element(By.XPATH, "//div[@class='guideline-title text-center']/h1").text
except NoSuchElementException:
print("No title found")
title = ""
try:
main_content = content.find_element(By.ID, "summary-nav").text
except NoSuchElementException:
print("No main content found")
main_content = ""
                if title and main_content:
                    articles.append({
                        "title": title,
                        "text": main_content,
                        "url": link
                    })
time.sleep(1)
except Exception as e:
print(e)
print("Skipping: " + link)
self.save_articles(articles)
return articles
def download_file(url, local_filename):
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
class ICRCScraper(Scraper):
'''
Source: ICRC - International Committee of the Red Cross
'''
def __init__(self, path, **kwargs):
super().__init__('icrc', path, pdfs=True, **kwargs)
def _scrape_links(self):
return []
    def scrape_articles(self, links):
        '''
        This one differs from the other scrapers:
        it downloads a tar.gz archive of ICRC PDFs and extracts it.
        '''
        assert links == [], "ICRCScraper does not scrape links."
def _check_file_on_disk(lf):
expected_hash = '4369ebcfc324bd922f973d354d1f52a6aa134b335dc858311e9c69904ba3d93b'
with open(lf, 'rb') as f:
first_four_hundred = f.read(400)
m = hashlib.sha256()
m.update(first_four_hundred)
assert m.hexdigest() == expected_hash, \
"Downloaded file hash wrong"
url = "https://www.idiap.ch/~kmatoba/icrc.tar.gz"
local_dir = os.path.join(self.source_path, "pdfs")
os.makedirs(local_dir, exist_ok=True)
local_filename = os.path.join(local_dir, "icrc.tar.gz")
if os.path.isfile(local_filename):
print(f"{local_filename} exists, skipping download")
else:
download_file(url, local_filename)
_check_file_on_disk(local_filename)
with tarfile.open(local_filename, 'r') as f:
f.extractall(path=self.pdf_path)
files = os.listdir(self.pdf_path)
return files
class IDSAScraper(Scraper):
'''
Source: IDSA - Infectious Diseases Society of America (https://www.idsociety.org/)
'''
def __init__(self, path, **kwargs):
super().__init__('idsa', path, **kwargs)
def _scrape_links(self):
self.driver.get("https://www.idsociety.org/practice-guideline/alphabetical-guidelines/")
wait = WebDriverWait(self.driver, 20)
content = wait.until(EC.presence_of_element_located((
By.CLASS_NAME, "rtln_body_content")))
wait = WebDriverWait(content, 10)
elements = wait.until(EC.presence_of_all_elements_located((
By.TAG_NAME, "a")))
links = [el.get_attribute("href") for el in elements]
        links = [link for link in links if link and 'idsociety' in link and 'pdf' not in link]
self.save_links(links)
return links
def scrape_articles(self, links):
articles = []
for link in tqdm(links):
print(link)
try:
self.driver.get(link)
wait = WebDriverWait(self.driver, 10)
content = wait.until(EC.presence_of_element_located((By.XPATH, "//article")))
inner_html = content.get_attribute('innerHTML')
text = markdownify.markdownify(inner_html)
article = {
'link': link,
'text': text,
"url": link
}
articles.append(article)
except Exception as e:
continue
self.save_articles(articles)
return articles
class MAGICScraper(Scraper):
'''
Source: MAGIC - Making GRADE the Irresistible Choice (https://app.magicapp.org/)
UNTESTED.
'''
def __init__(self, path, **kwargs):
super().__init__('magic', path, **kwargs)
def scroll_down(self):
last_height = self.driver.execute_script("return document.body.scrollHeight")
while True:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
new_height = self.driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
    def scrape(self):
        ''' MAGIC has no separate link-scraping stage. '''
        articles = self.scrape_articles()
        print(f"Saved {len(articles)} {self.source} articles to {self.articles_path}.")
        self.driver.quit()
        return articles
def scrape_articles(self):
articles = []
self.driver.get("https://app.magicapp.org/#/guidelines")
wait = WebDriverWait(self.driver, 10)
element = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//a[@class='contentItemDataTitle']")))
total = len(element)
ran = tqdm(range(total))
for i in ran:
try:
self.scroll_down()
self.driver.get(f"https://app.magicapp.org/#/guidelines")
wait = WebDriverWait(self.driver, 10)
element = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//a[@class='contentItemDataTitle']")))
element[i].click()
ran.set_description("Waiting for page click")
time.sleep(7)
try:
wait = WebDriverWait(self.driver, 10)
button_more = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//span[@class='sectionMoreLink']")))
for more in button_more:
more.click()
ran.set_description("Waiting for section click")
time.sleep(7)
except:
pass
try:
wait = WebDriverWait(self.driver, 10)
button_section = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//div[@class='ctrButtons']/button[@class='ctrBtn bkTextBtn']")))
for section in button_section:
section.click()
ran.set_description("Waiting for more click")
time.sleep(7)
except:
pass
wait = WebDriverWait(self.driver, 10)
element = wait.until(EC.presence_of_element_located((
By.XPATH, "//main[@class='centerPanel noTabs']/div[@id='mainContent']//div[@class='guidelinePanel']")))
inner_html = element.get_attribute('innerHTML')
wait = WebDriverWait(self.driver, 10)
strength = wait.until(EC.presence_of_all_elements_located((
By.XPATH, "//div[@class='recommendationWidget dojoDndItem']//div[@class='textComments']")))
articles.append({"title": self.driver.title,
"strength": [markdownify.markdownify(stren.get_attribute("innerHTML")).replace("\n\n", "\n") for stren in strength],
"content": markdownify.markdownify(inner_html).replace("\n\n", "\n")})
except:
print(f"Error with {i}")
try:
self.driver.find_element(By.XPATH, "//button[contains(text(),'OK')]").click()
except:
pass
pass
self.save_articles(articles)
return articles
class SPORScraper(Scraper):
'''
Source: SPOR - Strategy for Patient-Oriented Research
'''
def __init__(self, path, **kwargs):
super().__init__('spor', path, pdfs=True, **kwargs)
def _scrape_links(self):
links = scrape_SPOR_links()
pdf_links = [link for link in links if '.pdf' in link]
self.save_links(pdf_links)
return pdf_links
def scrape_articles(self, links):
self.driver.set_page_load_timeout(2)
fail = 0
for link in tqdm(links):
try:
self.driver.get(link)
time.sleep(1)
except:
fail += 1
pass
if fail > 0:
print(f"Failed to load {fail} links.")
return []
class WHOScraper(Scraper):
    '''
    Source: WHO - World Health Organization (https://www.who.int/publications/guidelines/en/)
    Note: as of November 2023, the WHO site is very slow. Tune the sleep between iterations:
    too long and the driver complains about timeouts; too short and links are missed
    before the page has loaded.
    '''
def __init__(self, path, **kwargs):
super().__init__('who', path, pdfs=True, **kwargs)
def _scrape_links(self):
self.driver.get("https://www.who.int/publications/i?publishingoffices=c09761c0-ab8e-4cfa-9744-99509c4d306b")
wait = WebDriverWait(self.driver, 10)
pdf_urls = []
# sleep_secs = 2.5
sleep_secs = 1.5
while True:
# time.sleep(1)
try:
all_buttons = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='download-url']")))
links = [button.get_attribute("href") for button in all_buttons]
pdf_urls.extend(links)
print(f"Found {len(pdf_urls)} PDFs [{len(set(pdf_urls))} unique]")
next_button_str_click = "//span[@class='k-icon k-i-arrow-60-right']"
next_page_button_click = self.driver.find_element(By.XPATH, next_button_str_click)
try:
next_page_button_click.click()
time.sleep(sleep_secs)
except exceptions.ElementClickInterceptedException:
break
except exceptions.StaleElementReferenceException:
print("StaleElementReferenceException - Retrying...")
continue
self.save_links(pdf_urls)
return pdf_urls
def scrape_articles(self, links):
os.makedirs(self.pdf_path, exist_ok=True)
articles = []
        for i, pdf_url in tqdm(enumerate(links), total=len(links)):
response = requests.get(pdf_url)
if response.status_code != 200:
print(f"Failed to get {pdf_url}")
continue
filename = os.path.join(self.pdf_path, f"{i}.pdf")
with open(filename, "wb") as f:
f.write(response.content)
return articles
SCRAPERS = {
'aafp': AAFPScraper,
'cco': CCOScraper,
'cdc': CDCScraper,
'cma': CMAScraper,
'cps': CPSScraper,
'drugs': DrugsScraper,
'guidelinecentral': GCScraper,
'icrc': ICRCScraper,
'idsa': IDSAScraper,
'magic': MAGICScraper,
'spor': SPORScraper,
'who': WHOScraper
}
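# The registry above maps CLI source names to scraper classes, so a single source can also
# be run directly (sketch, with an illustrative path):
#   SCRAPERS['idsa'](path='./guidelines').scrape()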
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--path",
type=str,
# default=os.path.join(os.getcwd(), 'raw'),
default=os.getcwd(),
help="Path to download scraped guidelines to.")
parser.add_argument("--sources",
nargs="+",
default=list(SCRAPERS.keys()),
help="List of sources to scrape, formatted as: --sources source1 source2 ... Default: all sources.")
parser.add_argument("--chrome_binary_location",
type=str,
default=None,
help="Path to Chrome binary.")
parser.add_argument("--chrome_driver_location",
type=str,
default=None,
help="Path to Chrome driver.")
catch_scrape_exceptions = False
# catch_scrape_exceptions = True
args = parser.parse_args()
os.makedirs(args.path, exist_ok=True)
print(f"Downloading guidelines to {args.path}.")
scrapers_dict = {k: v for k, v in SCRAPERS.items() if k in args.sources} if args.sources else SCRAPERS
print(f"Scraping {len(scrapers_dict)} sources: {list(scrapers_dict.keys())}")
for i, (source, scraper_class) in enumerate(scrapers_dict.items()):
print('\n' + '-' * 50 + f"Scraping {source} [{i+1}/{len(scrapers_dict)}]...")
try:
scrap_params = dict(path=args.path)
if args.chrome_binary_location:
scrap_params.update(chrome_binary_location=args.chrome_binary_location)
if args.chrome_driver_location:
scrap_params.update(chrome_driver_location=args.chrome_driver_location)
scraper = scraper_class(**scrap_params)
except Exception as e:
print(f"Error while initializing {source} scraper: {e}")
continue
if catch_scrape_exceptions:
try:
scraper.scrape()
except Exception as e:
print(f"Error while scraping {source}: {e}")
continue
else:
scraper.scrape()