/* ==================== IMPORTS ==================== */
import type { Page, Browser } from 'puppeteer';
import { createCursor } from 'ghost-cursor';
const puppeteer = require('puppeteer-extra');
// Stealth plugin (all tricks to hide puppeteer usage)
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
const fs = require('fs');

/* ==================== GLOBAL VARIABLES ==================== */
const TOC_URL = 'https://www.wikidoc.org/index.php/Special:AllPages';
const VERBOSE = true;
const HEADLESS = true;
const TIMEOUT = 50; // ms to wait after each navigation
const TITLE_SELECTOR: string = '#firstHeading'; // Title of the page
const NEXT_PAGE_SELECTOR: string = 'div.mw-allpages-nav a';
const TOC_SELECTOR: string = 'div.mw-allpages-body li a';
const CONTENT_SELECTOR: string = 'div.mw-parser-output h1, div.mw-parser-output h2, div.mw-parser-output h3, div.mw-parser-output h4, div.mw-parser-output p, div.mw-parser-output li'; // Text content of the page
const OUTPUT_PATH: string = 'wikidoc_articles.jsonl';
const TOC_PATH = 'toc_pages.txt';
const SCRAPED_PATH = 'scraped_articles.txt';
let SCRAPED_URLs: string[] = [];
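// Output format: one JSON object per line (JSONL). An illustrative line
// (hypothetical article, shape only):
//   {"name":"Abdominal pain","url":"https://www.wikidoc.org/index.php/Abdominal_pain","text":"Abdominal pain\n\n# Overview\n..."}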
/* ==================== CLASSES ==================== */
class Article {
    readonly name: NonNullable<string>;
    readonly url: NonNullable<string>;
    text: NonNullable<string>;

    constructor(name: string, url: string, text: string) {
        this.name = name;
        this.url = url;
        this.text = text;
    }
}
/* ==================== SCRAPER ==================== */
class PuppeteerRun {
    page: Page;
    browser: Browser;
    cursor: any;

    constructor(page: Page, browser: Browser, cursor: any) {
        this.page = page;
        this.browser = browser;
        this.cursor = cursor;
    }

    /* ==================== HELPER FUNCTIONS ==================== */
    static async setup(headless_b: boolean): Promise<PuppeteerRun> {
        const headless = headless_b ? 'new' : false; // 'new' selects Chrome's new headless mode
        const browser = await puppeteer.launch({ headless: headless });
        const page = await browser.newPage();
        await page.setViewport({ width: 800, height: 600 }); // setViewport returns a Promise, so await it
        const cursor = createCursor(page); // ghost-cursor for human-like movement (created but not used elsewhere in this script)
        await page.goto(TOC_URL);
        await page.waitForTimeout(TIMEOUT);
        if (VERBOSE) { console.log('Reached table of contents at URL: ', TOC_URL, '\n'); }
        return new PuppeteerRun(page, browser, cursor);
    }
    // Returns true if at least one element matches the selector
    async check_sel(selector: string) {
        return await this.page.$eval(selector, () => true).catch(() => false);
    }

    // Appends one article as a single JSON line (JSONL format)
    async save_article(article: Article, path: string) {
        fs.appendFileSync(path, JSON.stringify(article, null, 0) + '\n'); // appendFileSync is synchronous; no await needed
    }

    // Returns [text, href] pairs for every element matching the selector
    async get_links(selector: string) {
        return await this.page.$$eval(selector, elements => elements.map(a => [a.textContent, (a as HTMLAnchorElement).href]));
    }
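    // get_links returns [text, href] pairs, e.g. (hypothetical entries):
    //   [['Abdominal pain', 'https://www.wikidoc.org/index.php/Abdominal_pain'], ...]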
    async getAllPages(urls: string[]): Promise<string[]> {
        // Collect all table-of-contents page URLs by following the "Next page" link recursively
        if (await this.check_sel(NEXT_PAGE_SELECTOR)) {
            let next_page_links = await this.get_links(NEXT_PAGE_SELECTOR);
            next_page_links = next_page_links.filter(x => x[0]!.startsWith('Next'));
            if (next_page_links.length > 0 && next_page_links[0][1] != null) {
                const next_url = next_page_links[0][1];
                urls.push(next_url);
                await this.page.goto(next_url);
                await this.page.waitForTimeout(100);
                return await this.getAllPages(urls);
            }
        }
        return urls;
    }
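    // Note: getAllPages recurses once per TOC page, so call depth grows with the
    // number of pages. Each frame only awaits, so this works in practice, but an
    // equivalent loop would keep the stack flat, e.g. (nextPageUrl is a
    // hypothetical helper wrapping the filter logic above):
    //   let next; while ((next = await nextPageUrl()) != null) { urls.push(next); await this.page.goto(next); }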
    /* ==================== ARTICLE EXTRACTOR ==================== */
    async formatContent() {
        const elements = await this.page.$$(CONTENT_SELECTOR);
        let content = '';
        for (const el of elements) {
            const tag = await this.page.evaluate(el => el.tagName, el);
            let text = (await this.page.evaluate(el => el.textContent, el))?.trim();
            // Build the chain of ancestor tags/ids so we can skip children of the
            // table of contents (DIV.toc) or of any table
            const parent_path = await this.page.evaluate(el => {
                let path = '';
                let parent = el.parentElement;
                while (parent != null) {
                    path += parent.tagName + '.' + parent.id + ' ';
                    parent = parent.parentElement;
                }
                return path;
            }, el);
            if (parent_path.match(/DIV.toc/) || parent_path.match(/TABLE/)) {
                continue;
            }
            // Stop at the References / See also headings: everything after them is citations and links
            if (tag == 'H1' || tag == 'H2' || tag == 'H3' || tag == 'H4') {
                if (text === 'References' || text?.startsWith('See also')) {
                    break;
                }
            }
            // Map headings to Markdown. H1 and H2 both become '#' because the page
            // H1 is the article title and H2 marks the top-level sections.
            if (tag == 'H1') {
                content += '\n\n# ' + text;
            } else if (tag == 'H2') {
                content += '\n\n# ' + text;
            } else if (tag == 'H3') {
                content += '\n\n## ' + text;
            } else if (tag == 'H4') {
                content += '\n\n### ' + text;
            } else if (tag == 'P') {
                // Paragraphs nested inside list items are already captured via their LI
                if (parent_path.match(/LI/)) {
                    continue;
                }
                // Strip leftover wiki-table rows and unexpanded {{...}} template markup
                text = text?.replace(/^\|.*$/gm, '');
                text = text?.replace(/^\{\{.*$/gm, '');
                text = text?.replace(/^\}\}.*$/gm, '');
                text = text?.replace(/^\s*[\r\n]/gm, ''); // drop blank lines left behind
                if (text != null && text != '') {
                    content += '\n' + text;
                }
            } else if (tag == 'LI') {
                content += '\n- ' + text;
            }
        }
        return content.trim();
    }
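    // Illustrative formatContent output for a hypothetical article:
    //
    //   # Overview
    //   First paragraph of the overview...
    //   - a bullet point from the article
    //
    //   ## Subsection
    //   More text...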
    async scrapeArticle(url: string) {
        await this.page.goto(url);
        await this.page.waitForTimeout(TIMEOUT);
        // Skip error pages and pages with no readable title (the original
        // non-null assertion would throw here and abort the whole TOC page)
        const title = await this.page.$eval(TITLE_SELECTOR, element => element.textContent?.trim());
        if (title == null || title.startsWith('Error')) {
            console.log(`\tSKIPPED.`);
            return;
        }
        // Extract article content and prepend the title
        let content = await this.formatContent();
        if (title != '') {
            content = title + '\n\n' + content;
        }
        const article: Article = new Article(title, url, content);
        await this.save_article(article, OUTPUT_PATH);
    }
    /* ==================== SCRAPING FUNCTION ==================== */
    async scrape(toc_url: string) {
        try {
            await this.page.goto(toc_url);
            await this.page.waitForTimeout(TIMEOUT);
            const article_links = await this.get_links(TOC_SELECTOR);
            for (let i = 0; i < article_links.length; i++) {
                const [article_name, article_url] = article_links[i];
                // `in` tests array indices, not membership, so the dedup check needs includes()
                if (article_name == null || article_url == null || SCRAPED_URLs.includes(article_url)) { continue; }
                console.log(`\n\tArticle (${i + 1} / ${article_links.length}):\n\tName: ${article_name}\n\tURL: ${article_url}`);
                await this.scrapeArticle(article_url);
                SCRAPED_URLs.push(article_url);
                fs.appendFileSync(SCRAPED_PATH, article_url + '\n');
            }
        } catch (e) {
            console.error(e);
        }
    }
}
/* ==================== MAIN ==================== */
async function run() {
    const scraper = await PuppeteerRun.setup(HEADLESS); // renamed from `run` to avoid shadowing the function name
    // Get TOC pages (from the cache file if a previous run already collected them)
    let toc_urls: string[] = [];
    if (!fs.existsSync(TOC_PATH)) {
        console.log('Scraping all TOC pages...');
        toc_urls = await scraper.getAllPages([TOC_URL]); // seed with the first TOC page itself, not just the "Next" pages
        console.log('\nSaving TOC URLs to file...');
        for (const toc_url of toc_urls) {
            fs.appendFileSync(TOC_PATH, toc_url + '\n');
        }
    } else {
        // filter() drops the empty string that the trailing newline leaves behind
        toc_urls = fs.readFileSync(TOC_PATH, 'utf8').split('\n').filter((u: string) => u !== '');
        console.log(`Loaded ${toc_urls.length} TOC pages`);
    }
    // Check for already scraped articles so an interrupted run can resume
    if (fs.existsSync(SCRAPED_PATH) && fs.existsSync(OUTPUT_PATH)) {
        SCRAPED_URLs = fs.readFileSync(SCRAPED_PATH, 'utf8').split('\n').filter((u: string) => u !== '');
        console.log(`Already scraped ${SCRAPED_URLs.length} articles`);
    }
    // Scrape all remaining articles
    console.log(`Scraping ${toc_urls.length} TOC pages.`);
    for (let i = 0; i < toc_urls.length; i++) {
        console.log(`Page (${i + 1} / ${toc_urls.length}):`);
        await scraper.scrape(toc_urls[i]);
    }
    await scraper.browser.close(); // close the browser so the process can exit cleanly
}

run().then(() => console.log('Done!')).catch(x => console.error(x));