[1bd6b5]: / notebooks / Literature_data.ipynb

Download this file

2390 lines (2389 with data), 97.2 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%run notebook_setup.ipynb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define search terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from search_terms import primary_terms, secondary_terms, descriptive_terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'multi-omics': '(\"multi-omic\"[Text Words]) OR (\"multiomic\"[Text Words]) OR (\"multi omic\"[Text Words]) OR (\"multi-omics\"[Text Words]) OR (\"multiomics\"[Text Words]) OR (\"multi omics\"[Text Words])',\n",
       " 'pan-omics': '(\"pan-omic\"[Text Words]) OR (\"panomic\"[Text Words]) OR (\"pan omic\"[Text Words]) OR (\"pan-omics\"[Text Words]) OR (\"panomics\"[Text Words]) OR (\"pan omics\"[Text Words])',\n",
       " 'trans-omics': '(\"trans-omic\"[Text Words]) OR (\"transomic\"[Text Words]) OR (\"trans omic\"[Text Words]) OR (\"trans-omics\"[Text Words]) OR (\"transomics\"[Text Words]) OR (\"trans omics\"[Text Words])',\n",
       " 'poly-omics': '(\"poly-omic\"[Text Words]) OR (\"polyomic\"[Text Words]) OR (\"poly omic\"[Text Words]) OR (\"poly-omics\"[Text Words]) OR (\"polyomics\"[Text Words]) OR (\"poly omics\"[Text Words])',\n",
       " 'cross-omics': '(\"cross-omic\"[Text Words]) OR (\"crossomic\"[Text Words]) OR (\"cross omic\"[Text Words]) OR (\"cross-omics\"[Text Words]) OR (\"crossomics\"[Text Words]) OR (\"cross omics\"[Text Words])'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "primary_terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'multi-table omics': '((\"multi-table\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-table\"[Text Words]) AND (omics[Text Words])) OR ((\"multitable\"[Text Words]) AND (omic[Text Words])) OR ((\"multitable\"[Text Words]) AND (omics[Text Words])) OR ((\"multi table\"[Text Words]) AND (omic[Text Words])) OR ((\"multi table\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-tables\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-tables\"[Text Words]) AND (omics[Text Words])) OR ((\"multitables\"[Text Words]) AND (omic[Text Words])) OR ((\"multitables\"[Text Words]) AND (omics[Text Words])) OR ((\"multi tables\"[Text Words]) AND (omic[Text Words])) OR ((\"multi tables\"[Text Words]) AND (omics[Text Words]))',\n",
       " 'multi-source omics': '((\"multi-source\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-source\"[Text Words]) AND (omics[Text Words])) OR ((\"multisource\"[Text Words]) AND (omic[Text Words])) OR ((\"multisource\"[Text Words]) AND (omics[Text Words])) OR ((\"multi source\"[Text Words]) AND (omic[Text Words])) OR ((\"multi source\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-sources\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-sources\"[Text Words]) AND (omics[Text Words])) OR ((\"multisources\"[Text Words]) AND (omic[Text Words])) OR ((\"multisources\"[Text Words]) AND (omics[Text Words])) OR ((\"multi sources\"[Text Words]) AND (omic[Text Words])) OR ((\"multi sources\"[Text Words]) AND (omics[Text Words]))',\n",
       " 'multi-view omics': '((\"multi-view\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-view\"[Text Words]) AND (omics[Text Words])) OR ((\"multiview\"[Text Words]) AND (omic[Text Words])) OR ((\"multiview\"[Text Words]) AND (omics[Text Words])) OR ((\"multi view\"[Text Words]) AND (omic[Text Words])) OR ((\"multi view\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-views\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-views\"[Text Words]) AND (omics[Text Words])) OR ((\"multiviews\"[Text Words]) AND (omic[Text Words])) OR ((\"multiviews\"[Text Words]) AND (omics[Text Words])) OR ((\"multi views\"[Text Words]) AND (omic[Text Words])) OR ((\"multi views\"[Text Words]) AND (omics[Text Words]))',\n",
       " 'multi-modal omics': '((\"multi-modal\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-modal\"[Text Words]) AND (omics[Text Words])) OR ((\"multimodal\"[Text Words]) AND (omic[Text Words])) OR ((\"multimodal\"[Text Words]) AND (omics[Text Words])) OR ((\"multi modal\"[Text Words]) AND (omic[Text Words])) OR ((\"multi modal\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-modals\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-modals\"[Text Words]) AND (omics[Text Words])) OR ((\"multimodals\"[Text Words]) AND (omic[Text Words])) OR ((\"multimodals\"[Text Words]) AND (omics[Text Words])) OR ((\"multi modals\"[Text Words]) AND (omic[Text Words])) OR ((\"multi modals\"[Text Words]) AND (omics[Text Words]))',\n",
       " 'multi-block omics': '((\"multi-block\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-block\"[Text Words]) AND (omics[Text Words])) OR ((\"multiblock\"[Text Words]) AND (omic[Text Words])) OR ((\"multiblock\"[Text Words]) AND (omics[Text Words])) OR ((\"multi block\"[Text Words]) AND (omic[Text Words])) OR ((\"multi block\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-blocks\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-blocks\"[Text Words]) AND (omics[Text Words])) OR ((\"multiblocks\"[Text Words]) AND (omic[Text Words])) OR ((\"multiblocks\"[Text Words]) AND (omics[Text Words])) OR ((\"multi blocks\"[Text Words]) AND (omic[Text Words])) OR ((\"multi blocks\"[Text Words]) AND (omics[Text Words]))'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "secondary_terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'integrative omics': '\"integrative omic\"[Text Words] OR \"integrative omics\"[Text Words]',\n",
       " 'integrated omics': '\"integrated omic\"[Text Words] OR \"integrated omics\"[Text Words]',\n",
       " 'integromics': '\"integromic\"[Text Words] OR \"integromics\"[Text Words]'}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "descriptive_terms"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Perform search in PubMed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from easy_entrez import EntrezAPI\n",
    "from config import ENTREZ_API_NAME, ENTREZ_API_EMAIL\n",
    "\n",
    "entrez_api = EntrezAPI(\n",
    "    tool=ENTREZ_API_NAME,\n",
    "    email=ENTREZ_API_EMAIL,\n",
    "    minimal_interval=2\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "search_terms = {\n",
    "    **primary_terms,\n",
    "    **secondary_terms,\n",
    "    **descriptive_terms\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reusing the results from cache/search_results.pickle (last modified on 2020-07-25 06:51)\n"
     ]
    }
   ],
   "source": [
    "%%cache search_results pubmed_results\n",
    "\n",
    "pubmed_results = {}\n",
    "\n",
    "MAX_RESULTS = 10_000\n",
    "\n",
    "for term in tqdm(search_terms):\n",
    "    result = entrez_api.search(\n",
    "        search_terms[term],\n",
    "        database='pubmed',\n",
    "        max_results=MAX_RESULTS\n",
    "    )\n",
    "    esearch = result.data['esearchresult']\n",
    "    count = int(esearch['count'])\n",
    "    assert count >= 0\n",
    "    assert count < MAX_RESULTS\n",
    "\n",
    "    pubmed_results[term] = result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_papers = sorted(set(sum(\n",
    "    [\n",
    "        result.data['esearchresult']['idlist']\n",
    "        for result in pubmed_results.values()\n",
    "    ],\n",
    "    []\n",
    ")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3456"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(all_papers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reusing the results from cache/pubmed_documents_data.pickle (last modified on 2020-07-25 06:54)\n"
     ]
    }
   ],
   "source": [
    "%%cache pubmed_documents_data documents\n",
    "\n",
    "documents_by_batch = (\n",
    "    entrez_api\n",
    "    .in_batches_of(size=100)\n",
    "    .fetch(all_papers, max_results=10_000, return_type='xml')\n",
    ")\n",
    "\n",
    "documents = sum(\n",
    "    (\n",
    "        list(result.data)\n",
    "        for result in documents_by_batch.values()\n",
    "    ),\n",
    "    []\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from helpers.utils import xml_element_to_json\n",
    "documents = [xml_element_to_json(document) for document in list(documents)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert len(documents) == len(all_papers)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a data frame with PubMed documents and covariates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import Series, DataFrame, read_csv, to_datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>multi-omics</th>\n",
       "      <th>pan-omics</th>\n",
       "      <th>trans-omics</th>\n",
       "      <th>poly-omics</th>\n",
       "      <th>cross-omics</th>\n",
       "      <th>multi-table omics</th>\n",
       "      <th>multi-source omics</th>\n",
       "      <th>multi-view omics</th>\n",
       "      <th>multi-modal omics</th>\n",
       "      <th>multi-block omics</th>\n",
       "      <th>integrative omics</th>\n",
       "      <th>integrated omics</th>\n",
       "      <th>integromics</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uid</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12186644</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15687693</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15687700</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15687839</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15763567</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32697738</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32698759</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32698873</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32699215</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32700803</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3456 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          multi-omics  pan-omics  trans-omics  poly-omics  cross-omics  \\\n",
       "uid                                                                      \n",
       "12186644        False      False        False       False        False   \n",
       "15687693        False      False        False       False        False   \n",
       "15687700        False      False         True       False        False   \n",
       "15687839        False      False        False       False        False   \n",
       "15763567         True      False        False       False        False   \n",
       "...               ...        ...          ...         ...          ...   \n",
       "32697738         True      False        False       False        False   \n",
       "32698759         True      False        False       False        False   \n",
       "32698873         True      False        False       False        False   \n",
       "32699215         True      False        False       False        False   \n",
       "32700803         True      False        False       False        False   \n",
       "\n",
       "          multi-table omics  multi-source omics  multi-view omics  \\\n",
       "uid                                                                 \n",
       "12186644              False               False             False   \n",
       "15687693              False               False             False   \n",
       "15687700              False               False             False   \n",
       "15687839              False               False             False   \n",
       "15763567              False               False             False   \n",
       "...                     ...                 ...               ...   \n",
       "32697738              False               False             False   \n",
       "32698759              False               False             False   \n",
       "32698873              False               False             False   \n",
       "32699215              False               False             False   \n",
       "32700803              False               False             False   \n",
       "\n",
       "          multi-modal omics  multi-block omics  integrative omics  \\\n",
       "uid                                                                 \n",
       "12186644              False              False              False   \n",
       "15687693              False              False              False   \n",
       "15687700              False              False              False   \n",
       "15687839              False              False               True   \n",
       "15763567              False              False              False   \n",
       "...                     ...                ...                ...   \n",
       "32697738              False              False              False   \n",
       "32698759              False              False              False   \n",
       "32698873              False              False              False   \n",
       "32699215              False              False              False   \n",
       "32700803              False              False              False   \n",
       "\n",
       "          integrated omics  integromics  \n",
       "uid                                      \n",
       "12186644             False         True  \n",
       "15687693             False         True  \n",
       "15687700             False        False  \n",
       "15687839             False        False  \n",
       "15763567             False        False  \n",
       "...                    ...          ...  \n",
       "32697738             False        False  \n",
       "32698759             False        False  \n",
       "32698873             False        False  \n",
       "32699215             False        False  \n",
       "32700803             False        False  \n",
       "\n",
       "[3456 rows x 13 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a frame with 0 columns and UID of each paper on the index\n",
    "literature = Series(all_papers).to_frame('uid').set_index('uid')\n",
    "# add columns for the occurrences of the terms\n",
    "for term, result in pubmed_results.items():\n",
    "    literature[term] = False\n",
    "    for uid in result.data['esearchresult']['idlist']:\n",
    "        literature.loc[uid, term] = True\n",
    "literature"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parse the PubMed metadata of articles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reference:\n",
    "  - Medline: https://www.nlm.nih.gov/bsd/mms/medlineelements.html\n",
    "  - Publication types: https://www.nlm.nih.gov/mesh/pubtypes.html (fun fact: includes \"Wit and Humor\" type)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-17-32b97e314561>:68: UserWarning: {'JournalIssue': {'@CitedMedium': 'Internet', 'Volume': '1', 'Issue': '1', 'PubDate': {'Year': '2018'}}, 'Title': 'Journal of clinical genomics', 'ISOAbbreviation': 'J Clin Genom'} had no ISSN assigned\n",
      "  warn(f'{article[\"Journal\"]} had no ISSN assigned')\n",
      "<ipython-input-17-32b97e314561>:68: UserWarning: {'JournalIssue': {'@CitedMedium': 'Print', 'Volume': '2018', 'PubDate': {'Year': '2018', 'Month': 'Oct'}}, 'Title': 'Proceedings of the ... International Conference on Data Science and Advanced Analytics. IEEE International Conference on Data Science and Advanced Analytics', 'ISOAbbreviation': 'Proc Int Conf Data Sci Adv Anal'} had no ISSN assigned\n",
      "  warn(f'{article[\"Journal\"]} had no ISSN assigned')\n",
      "<ipython-input-17-32b97e314561>:68: UserWarning: {'JournalIssue': {'@CitedMedium': 'Print', 'Volume': '2019', 'PubDate': {'Year': '2019', 'Month': 'Sep'}}, 'Title': 'ACM-BCB ... ... : the ... ACM Conference on Bioinformatics, Computational Biology and Biomedicine. ACM Conference on Bioinformatics, Computational Biology and Biomedicine', 'ISOAbbreviation': 'ACM BCB'} had no ISSN assigned\n",
      "  warn(f'{article[\"Journal\"]} had no ISSN assigned')\n"
     ]
    }
   ],
   "source": [
    "from warnings import warn\n",
    "from helpers.parse_pubmed import listify, extract_abstract, parse_date, parse_doi\n",
    "\n",
    "missing_abstract = []\n",
    "authors = []\n",
    "affiliations = []\n",
    "\n",
    "publication_types = []\n",
    "\n",
    "for document in documents:\n",
    "\n",
    "    kind = None\n",
    "    date = None\n",
    "    doi = None\n",
    "\n",
    "    if 'PubmedBookArticle' in document:\n",
    "        kind = 'article in book'\n",
    "        book_document = document['PubmedBookArticle']['BookDocument']\n",
    "        pmid = book_document['PMID']['#text']\n",
    "\n",
    "        title = book_document['ArticleTitle']['#text']\n",
    "        abstract = extract_abstract(book_document)\n",
    "\n",
    "        # 'PublicationType' and 'KeywordList' ignored for book_document as only 2 matches (compared to 3k)\n",
    "\n",
    "    if 'PubmedArticle' in document:\n",
    "        pubmed_article = document['PubmedArticle']\n",
    "        assert not kind\n",
    "        kind = 'article'\n",
    "        medline_citation = pubmed_article['MedlineCitation']\n",
    "        pmid = medline_citation['PMID']['#text']\n",
    "        article = medline_citation['Article']\n",
    "        literature.loc[pmid, 'journal'] = article['Journal']['Title']\n",
    "\n",
    "        if 'ELocationID' in article:\n",
    "            doi = parse_doi(article['ELocationID'])\n",
    "\n",
    "        issue = article['Journal']['JournalIssue']\n",
    "        if 'PubDate' in issue:\n",
    "            date = parse_date(issue['PubDate'])\n",
    "\n",
    "        for author in listify(article['AuthorList']['Author'] if 'AuthorList' in article else None):\n",
    "            author_id = len(authors)\n",
    "            authors.append(\n",
    "                {\n",
    "                    'ID': author_id,\n",
    "                    'ForeName': author.get('ForeName'),\n",
    "                    'LastName': author.get('LastName'),\n",
    "                    'CollectiveName': author.get('CollectiveName'),\n",
    "                    'PMID': pmid\n",
    "                }\n",
    "            )\n",
    "            for affiliation in listify(author.get('AffiliationInfo')):\n",
    "                affiliations.append({\n",
    "                    'Affiliation': affiliation['Affiliation'],\n",
    "                    'PMID': pmid,\n",
    "                    'AuthorID': author_id\n",
    "                })\n",
    "\n",
    "        for publication_type in listify(article['PublicationTypeList']['PublicationType'] if 'PublicationTypeList' in article else None):\n",
    "            type_name = publication_type['#text']\n",
    "            publication_types.append(type_name)\n",
    "            literature.loc[pmid, f'Is {type_name}'] = True\n",
    "\n",
    "        try:\n",
    "            literature.loc[pmid, 'journal_issn'] = article['Journal']['ISSN']['#text']\n",
    "        except KeyError:\n",
    "            warn(f'{article[\"Journal\"]} had no ISSN assigned')\n",
    "        if 'ArticleTitle' in article:\n",
    "            title = article['ArticleTitle']\n",
    "            if isinstance(title, dict):\n",
    "                title = title['#text']\n",
    "\n",
    "        abstract = extract_abstract(article)\n",
    "\n",
    "    if not abstract:\n",
    "        missing_abstract.append(pmid)\n",
    "\n",
    "    assert kind\n",
    "\n",
    "    literature.loc[pmid, 'kind'] = kind\n",
    "    literature.loc[pmid, 'doi'] = doi\n",
    "    literature.loc[pmid, 'title'] = title\n",
    "    literature.loc[pmid, 'abstract'] = abstract\n",
    "    literature.loc[pmid, 'date'] = date\n",
    "\n",
    "publication_types = Series(publication_types)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "index\n",
       "Journal Article                             3370\n",
       "Research Support, Non-U.S. Gov't            1371\n",
       "Review                                       744\n",
       "Research Support, N.I.H., Extramural         460\n",
       "Research Support, U.S. Gov't, Non-P.H.S.     161\n",
       "Comparative Study                             61\n",
       "Editorial                                     44\n",
       "Comment                                       37\n",
       "Clinical Trial                                26\n",
       "Published Erratum                             23\n",
       "Multicenter Study                             21\n",
       "Research Support, N.I.H., Intramural          16\n",
       "Evaluation Study                              13\n",
       "Letter                                        13\n",
       "Case Reports                                   9\n",
       "Dataset                                        9\n",
       "Introductory Journal Article                   7\n",
       "Observational Study                            7\n",
       "Twin Study                                     7\n",
       "Validation Study                               7\n",
       "English Abstract                               6\n",
       "Randomized Controlled Trial                    6\n",
       "Systematic Review                              6\n",
       "Video-Audio Media                              6\n",
       "Meta-Analysis                                  5\n",
       "Research Support, U.S. Gov't, P.H.S.           5\n",
       "Congress                                       4\n",
       "Interview                                      3\n",
       "News                                           2\n",
       "Clinical Trial, Phase II                       1\n",
       "Consensus Development Conference, NIH          1\n",
       "Controlled Clinical Trial                      1\n",
       "Historical Article                             1\n",
       "Practice Guideline                             1\n",
       "Preprint                                       1\n",
       "Name: 0, dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "publication_types.sorted_value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "affiliations = DataFrame(affiliations)\n",
    "authors = DataFrame(authors)\n",
    "\n",
    "authors['JointName'] = authors['ForeName'] + ' ' + authors['LastName']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "literature['has_doi'] = ~literature.doi.isnull()\n",
    "literature.date = to_datetime(literature.date)\n",
    "literature['year'] = literature.date.dt.year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "terms = list(pubmed_results.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def which_term(term):\n",
    "    term = list(term[term].index)\n",
    "    if len(term) == 1:\n",
    "        return term[0]\n",
    "    else:\n",
    "        return 'multiple'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "literature['term'] = literature[terms].apply(which_term, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import Categorical\n",
    "literature['term'] = Categorical(literature['term'], ordered=True, categories=list(literature['term'].sorted_value_counts().index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the dot after `www` is escaped so that only a literal 'www.' (not 'www' + any character) counts as a URL\n",
    "literature['has_url_in_abstract'] = literature['abstract'].str.contains(r'(?:https?://|www\\.)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add PubmedCentral mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reusing the results from cache/pubmed_central_metadata.pickle (last modified on 2020-07-25 06:55)\n"
     ]
    }
   ],
   "source": [
    "%%cache pubmed_central_metadata pmc_metadata\n",
    "# approx 2GB in RAM, best to subset early\n",
    "pmc_metadata_all = read_csv('data/PMC-ids.csv.gz')\n",
    "pmid_of_interest = set(literature.index)\n",
    "pmc_metadata = pmc_metadata_all[pmc_metadata_all.PMID.isin(pmid_of_interest)]\n",
    "del pmc_metadata_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1951"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(pmc_metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Journal Title</th>\n",
       "      <th>ISSN</th>\n",
       "      <th>eISSN</th>\n",
       "      <th>Year</th>\n",
       "      <th>Volume</th>\n",
       "      <th>Issue</th>\n",
       "      <th>Page</th>\n",
       "      <th>DOI</th>\n",
       "      <th>PMCID</th>\n",
       "      <th>PMID</th>\n",
       "      <th>Manuscript Id</th>\n",
       "      <th>Release Date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>83696</th>\n",
       "      <td>Genome Biol</td>\n",
       "      <td>1474-7596</td>\n",
       "      <td>1474-760X</td>\n",
       "      <td>2002</td>\n",
       "      <td>3</td>\n",
       "      <td>8</td>\n",
       "      <td>reports4027.1</td>\n",
       "      <td>10.1186/gb-2002-3-8-reports4027</td>\n",
       "      <td>PMC139396</td>\n",
       "      <td>12186644.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>live</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>817169</th>\n",
       "      <td>J Virol</td>\n",
       "      <td>0022-538X</td>\n",
       "      <td>1098-5514</td>\n",
       "      <td>2006</td>\n",
       "      <td>80</td>\n",
       "      <td>9</td>\n",
       "      <td>4356</td>\n",
       "      <td>10.1128/JVI.80.9.4356-4362.2006</td>\n",
       "      <td>PMC1472023</td>\n",
       "      <td>16611894.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>live</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1155415</th>\n",
       "      <td>Proc Natl Acad Sci U S A</td>\n",
       "      <td>0027-8424</td>\n",
       "      <td>1091-6490</td>\n",
       "      <td>2007</td>\n",
       "      <td>104</td>\n",
       "      <td>15</td>\n",
       "      <td>6478</td>\n",
       "      <td>10.1073/pnas.0611629104</td>\n",
       "      <td>PMC1849962</td>\n",
       "      <td>17420480.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>live</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1212422</th>\n",
       "      <td>J Bacteriol</td>\n",
       "      <td>0021-9193</td>\n",
       "      <td>1098-5530</td>\n",
       "      <td>2007</td>\n",
       "      <td>189</td>\n",
       "      <td>13</td>\n",
       "      <td>4635</td>\n",
       "      <td>10.1128/JB.00128-07</td>\n",
       "      <td>PMC1913438</td>\n",
       "      <td>17449607.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>live</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1430120</th>\n",
       "      <td>Osteoarthritis Cartilage</td>\n",
       "      <td>1063-4584</td>\n",
       "      <td>1522-9653</td>\n",
       "      <td>2007</td>\n",
       "      <td>15</td>\n",
       "      <td>12</td>\n",
       "      <td>1367</td>\n",
       "      <td>10.1016/j.joca.2007.04.011</td>\n",
       "      <td>PMC2153443</td>\n",
       "      <td>17604656.0</td>\n",
       "      <td>NIHMS34878</td>\n",
       "      <td>live</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Journal Title       ISSN      eISSN  Year Volume Issue  \\\n",
       "83696                 Genome Biol  1474-7596  1474-760X  2002      3     8   \n",
       "817169                    J Virol  0022-538X  1098-5514  2006     80     9   \n",
       "1155415  Proc Natl Acad Sci U S A  0027-8424  1091-6490  2007    104    15   \n",
       "1212422               J Bacteriol  0021-9193  1098-5530  2007    189    13   \n",
       "1430120  Osteoarthritis Cartilage  1063-4584  1522-9653  2007     15    12   \n",
       "\n",
       "                  Page                              DOI       PMCID  \\\n",
       "83696    reports4027.1  10.1186/gb-2002-3-8-reports4027   PMC139396   \n",
       "817169            4356  10.1128/JVI.80.9.4356-4362.2006  PMC1472023   \n",
       "1155415           6478          10.1073/pnas.0611629104  PMC1849962   \n",
       "1212422           4635              10.1128/JB.00128-07  PMC1913438   \n",
       "1430120           1367       10.1016/j.joca.2007.04.011  PMC2153443   \n",
       "\n",
       "               PMID Manuscript Id Release Date  \n",
       "83696    12186644.0           NaN         live  \n",
       "817169   16611894.0           NaN         live  \n",
       "1155415  17420480.0           NaN         live  \n",
       "1212422  17449607.0           NaN         live  \n",
       "1430120  17604656.0    NIHMS34878         live  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pmc_metadata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PMIDs are stored as floats in the PMC table, hence the cast of our index for the lookup\n",
    "literature['PMC'] = pmc_metadata.set_index('PMID')['PMCID'].reindex(literature.index.astype(float))\n",
    "# the number of mapped articles has to equal the number of PMC records selected earlier\n",
    "assert literature['PMC'].notnull().sum() == len(pmc_metadata)\n",
    "\n",
    "literature['has_pmc'] = literature['PMC'].notnull()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: we can also try to find the missing PMC IDs in the summaries:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# result = entrez_api.search(primary_terms['poly-omics'], max_results=10_000)\n",
    "# summary = entrez_api.summarize(result.data['esearchresult']['idlist'][:5], max_results=10_000)\n",
    "# summary.data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download full texts as XML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['PMC139396', 'PMC1472023', 'PMC1849962', 'PMC1913438']"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pmc_ids = literature[literature['has_pmc']]['PMC'].tolist()\n",
    "pmc_ids[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reusing the results from cache/pubmed_central_xml.pickle (last modified on 2020-07-25 06:59)\n"
     ]
    }
   ],
   "source": [
    "%%cache pubmed_central_xml pmc_xmls\n",
    "pmc_full_texts = entrez_api.in_batches_of(size=100).fetch(pmc_ids, max_results=5_000, database='pmc', return_type='xml')\n",
    "\n",
    "pmc_xmls = sum(\n",
    "    [\n",
    "        list(response.data)\n",
    "        for response in pmc_full_texts.values()\n",
    "    ],\n",
    "    []\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1951"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(pmc_xmls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "ignore_text = {'xref', 'table', 'thead', 'th', 'td', 'tr', 'graphic'}\n",
    "\n",
    "\n",
    "def extract_text(body) -> str:\n",
    "    \"\"\"Collect the text of an XML element tree as newline-joined fragments.\n",
    "\n",
    "    The direct text of elements listed in `ignore_text`, and of `label` elements\n",
    "    whose text starts with 'Figure', is dropped (their descendants are still visited).\n",
    "    Text which follows a child element (its `tail`) is kept, so that sentences are\n",
    "    not truncated at inline markup such as references or italics.\n",
    "\n",
    "    Args:\n",
    "        body: ElementTree-like element (needs `tag`, `text`, `tail` and iteration over children).\n",
    "\n",
    "    Returns:\n",
    "        The retained fragments in document order, joined with newlines.\n",
    "    \"\"\"\n",
    "    fragments = []\n",
    "\n",
    "    def collect(element):\n",
    "        text = element.text\n",
    "        skipped = element.tag in ignore_text or (\n",
    "            element.tag == 'label' and bool(text) and text.startswith('Figure')\n",
    "        )\n",
    "        if text and not skipped:\n",
    "            fragments.append(text)\n",
    "        for child in element:\n",
    "            collect(child)\n",
    "            # `tail` is the text between the end of `child` and the next sibling;\n",
    "            # it belongs to `element`, so it shares the fate of `element.text`\n",
    "            if child.tail and not skipped:\n",
    "                fragments.append(child.tail)\n",
    "\n",
    "    collect(body)\n",
    "    return '\\n'.join(fragments)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "literature_subjects = literature.index.to_frame().drop(columns='uid').copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Annotate `literature` with full-text availability, the extracted text and the article type,\n",
    "# and mark the subjects of each article in `literature_subjects`.\n",
    "for xml in pmc_xmls:\n",
    "    # NOTE(review): `pmid` is presumably a str here, while the PMC mapping earlier in this notebook\n",
    "    # treats the index of `literature` as numeric; confirm that `.loc[pmid, ...]` updates existing\n",
    "    # rows rather than appending new ones.\n",
    "    pmid = xml.find('front/article-meta/article-id[@pub-id-type=\"pmid\"]').text\n",
    "    body = xml.find('body')\n",
    "    # records without a <body> element are treated as having no full text\n",
    "    has_full_text = body is not None\n",
    "\n",
    "    subjects = [subject.text for subject in xml.findall('front/article-meta//subject')]\n",
    "\n",
    "    literature.loc[pmid, 'has_full_text'] = has_full_text\n",
    "    literature.loc[pmid, 'full_text'] = extract_text(body) if has_full_text else None\n",
    "    literature.loc[pmid, 'article_type'] = xml.attrib['article-type']\n",
    "\n",
    "    # one column per subject; only matching cells are set to True here\n",
    "    for subject in subjects:\n",
    "        literature_subjects.loc[pmid, subject] = True\n",
    "\n",
    "# the cells which were never set are missing values; turn them into False\n",
    "literature_subjects = literature_subjects.fillna(False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Article                      582\n",
       "Research Article             240\n",
       "Review                       163\n",
       "Research                     139\n",
       "Genetics                     109\n",
       "Original Research            100\n",
       "Research Paper                67\n",
       "Biology and Life Sciences     61\n",
       "Biochemistry                  53\n",
       "Microbiology                  51\n",
       "dtype: int64"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "literature_subjects.sum().sort_values(ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "index\n",
       "research-article      1566\n",
       "review-article         253\n",
       "brief-report            29\n",
       "editorial               26\n",
       "correction              17\n",
       "data-paper              13\n",
       "other                    8\n",
       "article-commentary       7\n",
       "letter                   7\n",
       "discussion               5\n",
       "methods-article          5\n",
       "product-review           5\n",
       "chapter-article          3\n",
       "meeting-report           2\n",
       "protocol                 2\n",
       "abstract                 1\n",
       "addendum                 1\n",
       "systematic-review        1\n",
       "Name: article_type, dtype: int64"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "literature.article_type.sorted_value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1520"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(literature['has_full_text'] == True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "#from helpers.utils import display_xml\n",
    "#display_xml(pmc_xmls[-2].find('body'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Abstract clean-up"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Many abstracts contain section/organising headers, such as:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['BACKGROUND', 'MOTIVATION', 'OBJECTIVE', 'SCOPE']"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "['BACKGROUND', 'MOTIVATION', 'OBJECTIVE', 'SCOPE']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "By convention those are upper case in PubMed. Here we filter those out:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "from re import findall\n",
    "\n",
    "\n",
    "def extract_upper_case(abstract: str, min_len: int = 3):\n",
    "    \"\"\"Find runs of at least `min_len` consecutive upper-case ASCII letters.\n",
    "\n",
    "    Anything which is not a string (e.g. `None` or `NaN` standing for a missing\n",
    "    abstract) yields an empty list instead of raising in `findall`.\n",
    "    \"\"\"\n",
    "    if not isinstance(abstract, str):\n",
    "        return []\n",
    "    return findall('([A-Z]{' + str(min_len) + ',})', abstract)\n",
    "\n",
    "\n",
    "def count_upper_case_phrases(data: Series, min_len: int = 3) -> Series:\n",
    "    \"\"\"Tally the upper-case phrases found across all texts in `data`.\n",
    "\n",
    "    The tallying itself is delegated to the `sorted_value_counts` Series extension.\n",
    "    \"\"\"\n",
    "    phrases_per_text = data.apply(extract_upper_case, min_len=min_len)\n",
    "    # flatten with a comprehension; `sum(lists, [])` re-copies the accumulator on every addition\n",
    "    phrases = [phrase for phrases_in_text in phrases_per_text for phrase in phrases_in_text]\n",
    "    return Series(phrases).sorted_value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "index\n",
       "RNA            2421\n",
       "DNA             978\n",
       "RESULTS         460\n",
       "BACKGROUND      347\n",
       "CONCLUSIONS     289\n",
       "METHODS         240\n",
       "HCC             211\n",
       "TCGA            204\n",
       "SNP             176\n",
       "GWAS            159\n",
       "CONCLUSION      146\n",
       "CRC             138\n",
       "QTL             137\n",
       "WNT             126\n",
       "RCC             121\n",
       "GBM             116\n",
       "IBD             116\n",
       "TNBC            108\n",
       "Name: 0, dtype: int64"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "potential_headers = count_upper_case_phrases(literature['abstract'])\n",
    "potential_headers[potential_headers > 100]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are many disease abbreviations making the list too long to browse:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1223"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(potential_headers[potential_headers > 3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So we will look at longer words:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3     731\n",
       "4     310\n",
       "5      95\n",
       "6      36\n",
       "7      17\n",
       "12      9\n",
       "10      7\n",
       "8       7\n",
       "9       6\n",
       "14      2\n",
       "11      2\n",
       "13      1\n",
       "Name: index, dtype: int64"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "potential_headers[potential_headers > 3].index.map(len).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "index\n",
       "RESULTS           460\n",
       "BACKGROUND        347\n",
       "CONCLUSIONS       289\n",
       "METHODS           240\n",
       "CONCLUSION        146\n",
       "NAFLD              67\n",
       "PURPOSE            58\n",
       "CRISPR             57\n",
       "OBJECTIVE          54\n",
       "AVAILABILITY       51\n",
       "NSCLC              47\n",
       "HNSCC              46\n",
       "MOTIVATION         45\n",
       "OMICS              44\n",
       "INFORMATION        43\n",
       "SUPPLEMENTARY      42\n",
       "IMPLEMENTATION     41\n",
       "FINDINGS           38\n",
       "SIGNIFICANCE       35\n",
       "LASSO              33\n",
       "Name: 0, dtype: int64"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "potential_headers_long = count_upper_case_phrases(literature['abstract'], min_len=5)\n",
    "potential_headers_long.head(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I manually chose the headers from among the top 100 hits:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "ABSTRACT_HEADERS = [\n",
    "    # manually added to prevent hanging \"OF\"\n",
    "    'PURPOSE OF REVIEW',\n",
    "    # chosen from top 100 most frequent\n",
    "    'RESULTS',\n",
    "    'BACKGROUND',\n",
    "    'CONCLUSIONS',\n",
    "    'METHODS',\n",
    "    'CONCLUSION',\n",
    "    'PURPOSE',\n",
    "    'OBJECTIVE',\n",
    "    'AVAILABILITY',\n",
    "    'MOTIVATION',\n",
    "    'INFORMATION',\n",
    "    'SUPPLEMENTARY',\n",
    "    'FINDINGS',\n",
    "    'SIGNIFICANCE',\n",
    "    'INTRODUCTION',\n",
    "    'DESIGN',\n",
    "    'OBJECTIVES',\n",
    "    'REVIEW',\n",
    "    'SUMMARY',\n",
    "    'MATERIALS',\n",
    "    'STUDY',\n",
    "    'EXPERIMENTAL',\n",
    "    'DISCUSSION',\n",
    "    'REGISTRATION',\n",
    "    'METHOD',\n",
    "    'CONTACT',\n",
    "    'FUTURE',\n",
    "    'INTERPRETATION',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Try the longer headers first: otherwise 'OBJECTIVE' matches inside 'OBJECTIVES' and leaves a stray 'S'.\n",
    "# `regex=True` is explicit because the default of `Series.str.replace` became literal matching in pandas 2.0.\n",
    "literature['abstract_clean'] = literature['abstract'].str.replace(\n",
    "    '|'.join(sorted(ABSTRACT_HEADERS, key=len, reverse=True)),\n",
    "    '',\n",
    "    regex=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Stored `literature` (904B0F94 → 904B0F94) at Tuesday, 04. Aug 2020 17:58"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {
      "text/markdown": {
       "action": "store",
       "command": "store literature in pubmed_derived_data",
       "finished": "2020-08-04T17:58:45.587466",
       "finished_human_readable": "Tuesday, 04. Aug 2020 17:58",
       "result": [
        {
         "new_file": {
          "crc32": "904B0F94",
          "sha256": "A2EFC068A287A3B724AE4B320EE5356E1E99474BD08A2E2A3EBA34CD0194F23B"
         },
         "old_file": {
          "crc32": "904B0F94",
          "sha256": "A2EFC068A287A3B724AE4B320EE5356E1E99474BD08A2E2A3EBA34CD0194F23B"
         },
         "subject": "literature"
        }
       ],
       "started": "2020-08-04T17:58:34.355113"
      }
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%vault store literature in pubmed_derived_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Stored `literature_subjects` (98E10AF9 → 98E10AF9) at Tuesday, 04. Aug 2020 17:58"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {
      "text/markdown": {
       "action": "store",
       "command": "store literature_subjects in pubmed_derived_data",
       "finished": "2020-08-04T17:58:49.610779",
       "finished_human_readable": "Tuesday, 04. Aug 2020 17:58",
       "result": [
        {
         "new_file": {
          "crc32": "98E10AF9",
          "sha256": "DDA318B4E0EC865063AB772B31DD156B2B8E9EA5531B59A0C04B97C70CD21AA9"
         },
         "old_file": {
          "crc32": "98E10AF9",
          "sha256": "DDA318B4E0EC865063AB772B31DD156B2B8E9EA5531B59A0C04B97C70CD21AA9"
         },
         "subject": "literature_subjects"
        }
       ],
       "started": "2020-08-04T17:58:45.623299"
      }
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%vault store literature_subjects in pubmed_derived_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Stored:\n",
       "\n",
       " - `affiliations` (E06399F2 → E06399F2)\n",
       " - `authors` (DC49BC74 → DC49BC74)\n",
       " - `publication_types` (7DD4E741 → 7DD4E741)\n",
       "\n",
       "at Tuesday, 04. Aug 2020 17:58"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {
      "text/markdown": {
       "action": "store",
       "command": "store affiliations, authors, publication_types in pubmed_derived_data",
       "finished": "2020-08-04T17:58:56.744355",
       "finished_human_readable": "Tuesday, 04. Aug 2020 17:58",
       "result": [
        {
         "new_file": {
          "crc32": "E06399F2",
          "sha256": "8DD13D4B7CF3D2E314BBC4E051AEDBF21414371F42BB4D100D7721B5F4D24E60"
         },
         "old_file": {
          "crc32": "E06399F2",
          "sha256": "8DD13D4B7CF3D2E314BBC4E051AEDBF21414371F42BB4D100D7721B5F4D24E60"
         },
         "subject": "affiliations"
        },
        {
         "new_file": {
          "crc32": "DC49BC74",
          "sha256": "237BEFD0FDA68E2A155B9EC00519017B4C9BC92BD2AA3D10E058A013EC0DE1D9"
         },
         "old_file": {
          "crc32": "DC49BC74",
          "sha256": "237BEFD0FDA68E2A155B9EC00519017B4C9BC92BD2AA3D10E058A013EC0DE1D9"
         },
         "subject": "authors"
        },
        {
         "new_file": {
          "crc32": "7DD4E741",
          "sha256": "BD0EBF88B38BB9E0E44923E2CB473A532AEFBFFC6A7FCC02926290CAD2615150"
         },
         "old_file": {
          "crc32": "7DD4E741",
          "sha256": "BD0EBF88B38BB9E0E44923E2CB473A532AEFBFFC6A7FCC02926290CAD2615150"
         },
         "subject": "publication_types"
        }
       ],
       "started": "2020-08-04T17:58:49.637812"
      }
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%vault store affiliations, authors, publication_types in pubmed_derived_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas\n",
    "pandas.set_option('display.max_colwidth', 1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>term</th>\n",
       "      <th>translation_stack</th>\n",
       "      <th>query_translation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>multi-omics</td>\n",
       "      <td>\"multi-omic\"[Text]→532 \"multiomic\"[Text]→138 OR \"multi omic\"[Text]→532 OR \"multi-omics\"[Text]→1798 OR \"multiomics\"[Text]→459 OR \"multi omics\"[Text]→1798 OR</td>\n",
       "      <td>\"multi-omic\"[Text] OR \"multiomic\"[Text] OR \"multi omic\"[Text] OR \"multi-omics\"[Text] OR \"multiomics\"[Text] OR \"multi omics\"[Text]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>pan-omics</td>\n",
       "      <td>\"pan-omic\"[Text]→5 \"panomic\"[Text]→9 OR \"pan omic\"[Text]→5 OR \"pan-omics\"[Text]→15 OR \"panomics\"[Text]→42 OR \"pan omics\"[Text]→15 OR</td>\n",
       "      <td>\"pan-omic\"[Text] OR \"panomic\"[Text] OR \"pan omic\"[Text] OR \"pan-omics\"[Text] OR \"panomics\"[Text] OR \"pan omics\"[Text]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>trans-omics</td>\n",
       "      <td>\"trans-omic\"[Text]→13 \"transomic\"[Text]→6 OR \"trans omic\"[Text]→13 OR \"trans-omics\"[Text]→81 OR \"transomics\"[Text]→12 OR \"trans omics\"[Text]→81 OR</td>\n",
       "      <td>\"trans-omic\"[Text] OR \"transomic\"[Text] OR \"trans omic\"[Text] OR \"trans-omics\"[Text] OR \"transomics\"[Text] OR \"trans omics\"[Text]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>poly-omics</td>\n",
       "      <td>\"poly-omic\"[Text]→6 \"polyomic\"[Text]→7 OR \"poly omic\"[Text]→6 OR \"poly-omics\"[Text]→5 OR \"polyomics\"[Text]→6 OR \"poly omics\"[Text]→5 OR</td>\n",
       "      <td>\"poly-omic\"[Text] OR \"polyomic\"[Text] OR \"poly omic\"[Text] OR \"poly-omics\"[Text] OR \"polyomics\"[Text] OR \"poly omics\"[Text]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>cross-omics</td>\n",
       "      <td>\"cross-omic\"[Text]→2 \"cross omic\"[Text]→2 OR \"cross-omics\"[Text]→43 OR \"crossomics\"[Text]→1 OR \"cross omics\"[Text]→43 OR</td>\n",
       "      <td>\"cross-omic\"[Text] OR \"cross omic\"[Text] OR \"cross-omics\"[Text] OR \"crossomics\"[Text] OR \"cross omics\"[Text]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>multi-table omics</td>\n",
       "      <td>\"multi-table\"[Text]→13 omic[Text]→2139 AND GROUP \"multi-table\"[Text]→13 omics[Text]→11151 AND GROUP OR \"multitable\"[Text]→6 omic[Text]→2139 AND GROUP OR \"multitable\"[Text]→6 omics[Text]→11151 AND GROUP OR \"multi table\"[Text]→13 omic[Text]→2139 AND GROUP OR \"multi table\"[Text]→13 omics[Text]→11151 AND GROUP OR</td>\n",
       "      <td>(\"multi-table\"[Text] AND omic[Text]) OR (\"multi-table\"[Text] AND omics[Text]) OR (\"multitable\"[Text] AND omic[Text]) OR (\"multitable\"[Text] AND omics[Text]) OR (\"multi table\"[Text] AND omic[Text]) OR (\"multi table\"[Text] AND omics[Text])</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>multi-source omics</td>\n",
       "      <td>\"multi-source\"[Text]→859 omic[Text]→2139 AND GROUP \"multi-source\"[Text]→859 omics[Text]→11151 AND GROUP OR \"multisource\"[Text]→935 omic[Text]→2139 AND GROUP OR \"multisource\"[Text]→935 omics[Text]→11151 AND GROUP OR \"multi source\"[Text]→859 omic[Text]→2139 AND GROUP OR \"multi source\"[Text]→859 omics[Text]→11151 AND GROUP OR \"multi-sources\"[Text]→34 omic[Text]→2139 AND GROUP OR \"multi-sources\"[Text]→34 omics[Text]→11151 AND GROUP OR \"multisources\"[Text]→5 omic[Text]→2139 AND GROUP OR \"multisources\"[Text]→5 omics[Text]→11151 AND GROUP OR \"multi sources\"[Text]→34 omic[Text]→2139 AND GROUP OR \"multi sources\"[Text]→34 omics[Text]→11151 AND GROUP OR</td>\n",
       "      <td>(\"multi-source\"[Text] AND omic[Text]) OR (\"multi-source\"[Text] AND omics[Text]) OR (\"multisource\"[Text] AND omic[Text]) OR (\"multisource\"[Text] AND omics[Text]) OR (\"multi source\"[Text] AND omic[Text]) OR (\"multi source\"[Text] AND omics[Text]) OR (\"multi-sources\"[Text] AND omic[Text]) OR (\"multi-sources\"[Text] AND omics[Text]) OR (\"multisources\"[Text] AND omic[Text]) OR (\"multisources\"[Text] AND omics[Text]) OR (\"multi sources\"[Text] AND omic[Text]) OR (\"multi sources\"[Text] AND omics[Text])</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>multi-view omics</td>\n",
       "      <td>\"multi-view\"[Text]→711 omic[Text]→2139 AND GROUP \"multi-view\"[Text]→711 omics[Text]→11151 AND GROUP OR \"multiview\"[Text]→596 omic[Text]→2139 AND GROUP OR \"multiview\"[Text]→596 omics[Text]→11151 AND GROUP OR \"multi view\"[Text]→711 omic[Text]→2139 AND GROUP OR \"multi view\"[Text]→711 omics[Text]→11151 AND GROUP OR \"multi-views\"[Text]→15 omic[Text]→2139 AND GROUP OR \"multi-views\"[Text]→15 omics[Text]→11151 AND GROUP OR \"multiviews\"[Text]→5 omic[Text]→2139 AND GROUP OR \"multiviews\"[Text]→5 omics[Text]→11151 AND GROUP OR \"multi views\"[Text]→15 omic[Text]→2139 AND GROUP OR \"multi views\"[Text]→15 omics[Text]→11151 AND GROUP OR</td>\n",
       "      <td>(\"multi-view\"[Text] AND omic[Text]) OR (\"multi-view\"[Text] AND omics[Text]) OR (\"multiview\"[Text] AND omic[Text]) OR (\"multiview\"[Text] AND omics[Text]) OR (\"multi view\"[Text] AND omic[Text]) OR (\"multi view\"[Text] AND omics[Text]) OR (\"multi-views\"[Text] AND omic[Text]) OR (\"multi-views\"[Text] AND omics[Text]) OR (\"multiviews\"[Text] AND omic[Text]) OR (\"multiviews\"[Text] AND omics[Text]) OR (\"multi views\"[Text] AND omic[Text]) OR (\"multi views\"[Text] AND omics[Text])</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>multi-modal omics</td>\n",
       "      <td>\"multi-modal\"[Text]→3939 omic[Text]→2139 AND GROUP \"multi-modal\"[Text]→3939 omics[Text]→11151 AND GROUP OR \"multimodal\"[Text]→46300 omic[Text]→2139 AND GROUP OR \"multimodal\"[Text]→46300 omics[Text]→11151 AND GROUP OR \"multi modal\"[Text]→3939 omic[Text]→2139 AND GROUP OR \"multi modal\"[Text]→3939 omics[Text]→11151 AND GROUP OR</td>\n",
       "      <td>(\"multi-modal\"[Text] AND omic[Text]) OR (\"multi-modal\"[Text] AND omics[Text]) OR (\"multimodal\"[Text] AND omic[Text]) OR (\"multimodal\"[Text] AND omics[Text]) OR (\"multi modal\"[Text] AND omic[Text]) OR (\"multi modal\"[Text] AND omics[Text])</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>multi-block omics</td>\n",
       "      <td>\"multi-block\"[Text]→181 omic[Text]→2139 AND GROUP \"multi-block\"[Text]→181 omics[Text]→11151 AND GROUP OR \"multiblock\"[Text]→576 omic[Text]→2139 AND GROUP OR \"multiblock\"[Text]→576 omics[Text]→11151 AND GROUP OR \"multi block\"[Text]→181 omic[Text]→2139 AND GROUP OR \"multi block\"[Text]→181 omics[Text]→11151 AND GROUP OR \"multi-blocks\"[Text]→4 omic[Text]→2139 AND GROUP OR \"multi-blocks\"[Text]→4 omics[Text]→11151 AND GROUP OR \"multiblocks\"[Text]→9 omic[Text]→2139 AND GROUP OR \"multiblocks\"[Text]→9 omics[Text]→11151 AND GROUP OR \"multi blocks\"[Text]→4 omic[Text]→2139 AND GROUP OR \"multi blocks\"[Text]→4 omics[Text]→11151 AND GROUP OR</td>\n",
       "      <td>(\"multi-block\"[Text] AND omic[Text]) OR (\"multi-block\"[Text] AND omics[Text]) OR (\"multiblock\"[Text] AND omic[Text]) OR (\"multiblock\"[Text] AND omics[Text]) OR (\"multi block\"[Text] AND omic[Text]) OR (\"multi block\"[Text] AND omics[Text]) OR (\"multi-blocks\"[Text] AND omic[Text]) OR (\"multi-blocks\"[Text] AND omics[Text]) OR (\"multiblocks\"[Text] AND omic[Text]) OR (\"multiblocks\"[Text] AND omics[Text]) OR (\"multi blocks\"[Text] AND omic[Text]) OR (\"multi blocks\"[Text] AND omics[Text])</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>integrative omics</td>\n",
       "      <td>\"integrative omic\"[Text]→1 \"integrative omics\"[Text]→149 OR</td>\n",
       "      <td>\"integrative omic\"[Text] OR \"integrative omics\"[Text]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>integrated omics</td>\n",
       "      <td>\"integrated omic\"[Text]→1 \"integrated omics\"[Text]→272 OR</td>\n",
       "      <td>\"integrated omic\"[Text] OR \"integrated omics\"[Text]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>integromics</td>\n",
       "      <td>\"integromic\"[Text]→14 \"integromics\"[Text]→52 OR</td>\n",
       "      <td>\"integromic\"[Text] OR \"integromics\"[Text]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  term  \\\n",
       "0          multi-omics   \n",
       "1            pan-omics   \n",
       "2          trans-omics   \n",
       "3           poly-omics   \n",
       "4          cross-omics   \n",
       "5    multi-table omics   \n",
       "6   multi-source omics   \n",
       "7     multi-view omics   \n",
       "8    multi-modal omics   \n",
       "9    multi-block omics   \n",
       "10   integrative omics   \n",
       "11    integrated omics   \n",
       "12         integromics   \n",
       "\n",
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             translation_stack  \\\n",
       "0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  \"multi-omic\"[Text]→532 \"multiomic\"[Text]→138 OR \"multi omic\"[Text]→532 OR \"multi-omics\"[Text]→1798 OR \"multiomics\"[Text]→459 OR \"multi omics\"[Text]→1798 OR   \n",
       "1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         \"pan-omic\"[Text]→5 \"panomic\"[Text]→9 OR \"pan omic\"[Text]→5 OR \"pan-omics\"[Text]→15 OR \"panomics\"[Text]→42 OR \"pan omics\"[Text]→15 OR   \n",
       "2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           \"trans-omic\"[Text]→13 \"transomic\"[Text]→6 OR \"trans omic\"[Text]→13 OR \"trans-omics\"[Text]→81 OR \"transomics\"[Text]→12 OR \"trans omics\"[Text]→81 OR   \n",
       "3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      \"poly-omic\"[Text]→6 \"polyomic\"[Text]→7 OR \"poly omic\"[Text]→6 OR \"poly-omics\"[Text]→5 OR \"polyomics\"[Text]→6 OR \"poly omics\"[Text]→5 OR   \n",
       "4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     \"cross-omic\"[Text]→2 \"cross omic\"[Text]→2 OR \"cross-omics\"[Text]→43 OR \"crossomics\"[Text]→1 OR \"cross omics\"[Text]→43 OR   \n",
       "5                                                                                                                                                                                                                                                                                                                                                       \"multi-table\"[Text]→13 omic[Text]→2139 AND GROUP \"multi-table\"[Text]→13 omics[Text]→11151 AND GROUP OR \"multitable\"[Text]→6 omic[Text]→2139 AND GROUP OR \"multitable\"[Text]→6 omics[Text]→11151 AND GROUP OR \"multi table\"[Text]→13 omic[Text]→2139 AND GROUP OR \"multi table\"[Text]→13 omics[Text]→11151 AND GROUP OR   \n",
       "6   \"multi-source\"[Text]→859 omic[Text]→2139 AND GROUP \"multi-source\"[Text]→859 omics[Text]→11151 AND GROUP OR \"multisource\"[Text]→935 omic[Text]→2139 AND GROUP OR \"multisource\"[Text]→935 omics[Text]→11151 AND GROUP OR \"multi source\"[Text]→859 omic[Text]→2139 AND GROUP OR \"multi source\"[Text]→859 omics[Text]→11151 AND GROUP OR \"multi-sources\"[Text]→34 omic[Text]→2139 AND GROUP OR \"multi-sources\"[Text]→34 omics[Text]→11151 AND GROUP OR \"multisources\"[Text]→5 omic[Text]→2139 AND GROUP OR \"multisources\"[Text]→5 omics[Text]→11151 AND GROUP OR \"multi sources\"[Text]→34 omic[Text]→2139 AND GROUP OR \"multi sources\"[Text]→34 omics[Text]→11151 AND GROUP OR   \n",
       "7                           \"multi-view\"[Text]→711 omic[Text]→2139 AND GROUP \"multi-view\"[Text]→711 omics[Text]→11151 AND GROUP OR \"multiview\"[Text]→596 omic[Text]→2139 AND GROUP OR \"multiview\"[Text]→596 omics[Text]→11151 AND GROUP OR \"multi view\"[Text]→711 omic[Text]→2139 AND GROUP OR \"multi view\"[Text]→711 omics[Text]→11151 AND GROUP OR \"multi-views\"[Text]→15 omic[Text]→2139 AND GROUP OR \"multi-views\"[Text]→15 omics[Text]→11151 AND GROUP OR \"multiviews\"[Text]→5 omic[Text]→2139 AND GROUP OR \"multiviews\"[Text]→5 omics[Text]→11151 AND GROUP OR \"multi views\"[Text]→15 omic[Text]→2139 AND GROUP OR \"multi views\"[Text]→15 omics[Text]→11151 AND GROUP OR   \n",
       "8                                                                                                                                                                                                                                                                                                                                       \"multi-modal\"[Text]→3939 omic[Text]→2139 AND GROUP \"multi-modal\"[Text]→3939 omics[Text]→11151 AND GROUP OR \"multimodal\"[Text]→46300 omic[Text]→2139 AND GROUP OR \"multimodal\"[Text]→46300 omics[Text]→11151 AND GROUP OR \"multi modal\"[Text]→3939 omic[Text]→2139 AND GROUP OR \"multi modal\"[Text]→3939 omics[Text]→11151 AND GROUP OR   \n",
       "9                   \"multi-block\"[Text]→181 omic[Text]→2139 AND GROUP \"multi-block\"[Text]→181 omics[Text]→11151 AND GROUP OR \"multiblock\"[Text]→576 omic[Text]→2139 AND GROUP OR \"multiblock\"[Text]→576 omics[Text]→11151 AND GROUP OR \"multi block\"[Text]→181 omic[Text]→2139 AND GROUP OR \"multi block\"[Text]→181 omics[Text]→11151 AND GROUP OR \"multi-blocks\"[Text]→4 omic[Text]→2139 AND GROUP OR \"multi-blocks\"[Text]→4 omics[Text]→11151 AND GROUP OR \"multiblocks\"[Text]→9 omic[Text]→2139 AND GROUP OR \"multiblocks\"[Text]→9 omics[Text]→11151 AND GROUP OR \"multi blocks\"[Text]→4 omic[Text]→2139 AND GROUP OR \"multi blocks\"[Text]→4 omics[Text]→11151 AND GROUP OR   \n",
       "10                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 \"integrative omic\"[Text]→1 \"integrative omics\"[Text]→149 OR   \n",
       "11                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   \"integrated omic\"[Text]→1 \"integrated omics\"[Text]→272 OR   \n",
       "12                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             \"integromic\"[Text]→14 \"integromics\"[Text]→52 OR   \n",
       "\n",
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   query_translation  \n",
       "0                                                                                                                                                                                                                                                                                                                                                                                  \"multi-omic\"[Text] OR \"multiomic\"[Text] OR \"multi omic\"[Text] OR \"multi-omics\"[Text] OR \"multiomics\"[Text] OR \"multi omics\"[Text]  \n",
       "1                                                                                                                                                                                                                                                                                                                                                                                              \"pan-omic\"[Text] OR \"panomic\"[Text] OR \"pan omic\"[Text] OR \"pan-omics\"[Text] OR \"panomics\"[Text] OR \"pan omics\"[Text]  \n",
       "2                                                                                                                                                                                                                                                                                                                                                                                  \"trans-omic\"[Text] OR \"transomic\"[Text] OR \"trans omic\"[Text] OR \"trans-omics\"[Text] OR \"transomics\"[Text] OR \"trans omics\"[Text]  \n",
       "3                                                                                                                                                                                                                                                                                                                                                                                        \"poly-omic\"[Text] OR \"polyomic\"[Text] OR \"poly omic\"[Text] OR \"poly-omics\"[Text] OR \"polyomics\"[Text] OR \"poly omics\"[Text]  \n",
       "4                                                                                                                                                                                                                                                                                                                                                                                                       \"cross-omic\"[Text] OR \"cross omic\"[Text] OR \"cross-omics\"[Text] OR \"crossomics\"[Text] OR \"cross omics\"[Text]  \n",
       "5                                                                                                                                                                                                                                                                      (\"multi-table\"[Text] AND omic[Text]) OR (\"multi-table\"[Text] AND omics[Text]) OR (\"multitable\"[Text] AND omic[Text]) OR (\"multitable\"[Text] AND omics[Text]) OR (\"multi table\"[Text] AND omic[Text]) OR (\"multi table\"[Text] AND omics[Text])  \n",
       "6   (\"multi-source\"[Text] AND omic[Text]) OR (\"multi-source\"[Text] AND omics[Text]) OR (\"multisource\"[Text] AND omic[Text]) OR (\"multisource\"[Text] AND omics[Text]) OR (\"multi source\"[Text] AND omic[Text]) OR (\"multi source\"[Text] AND omics[Text]) OR (\"multi-sources\"[Text] AND omic[Text]) OR (\"multi-sources\"[Text] AND omics[Text]) OR (\"multisources\"[Text] AND omic[Text]) OR (\"multisources\"[Text] AND omics[Text]) OR (\"multi sources\"[Text] AND omic[Text]) OR (\"multi sources\"[Text] AND omics[Text])  \n",
       "7                           (\"multi-view\"[Text] AND omic[Text]) OR (\"multi-view\"[Text] AND omics[Text]) OR (\"multiview\"[Text] AND omic[Text]) OR (\"multiview\"[Text] AND omics[Text]) OR (\"multi view\"[Text] AND omic[Text]) OR (\"multi view\"[Text] AND omics[Text]) OR (\"multi-views\"[Text] AND omic[Text]) OR (\"multi-views\"[Text] AND omics[Text]) OR (\"multiviews\"[Text] AND omic[Text]) OR (\"multiviews\"[Text] AND omics[Text]) OR (\"multi views\"[Text] AND omic[Text]) OR (\"multi views\"[Text] AND omics[Text])  \n",
       "8                                                                                                                                                                                                                                                                      (\"multi-modal\"[Text] AND omic[Text]) OR (\"multi-modal\"[Text] AND omics[Text]) OR (\"multimodal\"[Text] AND omic[Text]) OR (\"multimodal\"[Text] AND omics[Text]) OR (\"multi modal\"[Text] AND omic[Text]) OR (\"multi modal\"[Text] AND omics[Text])  \n",
       "9               (\"multi-block\"[Text] AND omic[Text]) OR (\"multi-block\"[Text] AND omics[Text]) OR (\"multiblock\"[Text] AND omic[Text]) OR (\"multiblock\"[Text] AND omics[Text]) OR (\"multi block\"[Text] AND omic[Text]) OR (\"multi block\"[Text] AND omics[Text]) OR (\"multi-blocks\"[Text] AND omic[Text]) OR (\"multi-blocks\"[Text] AND omics[Text]) OR (\"multiblocks\"[Text] AND omic[Text]) OR (\"multiblocks\"[Text] AND omics[Text]) OR (\"multi blocks\"[Text] AND omic[Text]) OR (\"multi blocks\"[Text] AND omics[Text])  \n",
       "10                                                                                                                                                                                                                                                                                                                                                                                                                                                             \"integrative omic\"[Text] OR \"integrative omics\"[Text]  \n",
       "11                                                                                                                                                                                                                                                                                                                                                                                                                                                               \"integrated omic\"[Text] OR \"integrated omics\"[Text]  \n",
       "12                                                                                                                                                                                                                                                                                                                                                                                                                                                                         \"integromic\"[Text] OR \"integromics\"[Text]  "
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from typing import Union\n",
    "\n",
    "\n",
    "def format_token(t: Union[str, dict]) -> str:\n",
    "    \"\"\"Render a single element of an esearch `translationstack` as text.\n",
    "\n",
    "    String elements are passed through unchanged. Dict elements are rendered\n",
    "    as `<term>→<count>`; they are asserted to be non-exploded terms of the\n",
    "    `Text` field, as no other kind is handled here.\n",
    "    \"\"\"\n",
    "    if not isinstance(t, str):\n",
    "        assert t['explode'] == 'N'\n",
    "        assert t['field'] == 'Text'\n",
    "        return '→'.join([t['term'], t['count']])\n",
    "    return t\n",
    "\n",
    "\n",
    "# one row per searched term: the flattened translation stack next to the final query translation\n",
    "translation_rows = []\n",
    "for term, result in pubmed_results.items():\n",
    "    esearch = result.data['esearchresult']\n",
    "    stack_as_text = ' '.join(map(format_token, esearch['translationstack']))\n",
    "    translation_rows.append({\n",
    "        'term': term,\n",
    "        'translation_stack': stack_as_text,\n",
    "        'query_translation': esearch['querytranslation']\n",
    "    })\n",
    "pubmed_translations = DataFrame(translation_rows)\n",
    "pubmed_translations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a control of documents published in the journals with hits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[2002,\n",
       " 2004,\n",
       " 2005,\n",
       " 2006,\n",
       " 2007,\n",
       " 2008,\n",
       " 2009,\n",
       " 2010,\n",
       " 2011,\n",
       " 2012,\n",
       " 2013,\n",
       " 2014,\n",
       " 2015,\n",
       " 2016,\n",
       " 2017,\n",
       " 2018,\n",
       " 2019,\n",
       " 2020]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# distinct values of `literature.year` (missing values dropped, cast to int), ascending;\n",
    "# note: despite the name this is a sorted list, not a set\n",
    "years_set = sorted(set(literature.year.dropna().astype(int)))\n",
    "years_set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "index\n",
       "Scientific reports                                                         126\n",
       "Omics : a journal of integrative biology                                    78\n",
       "PloS one                                                                    69\n",
       "Bioinformatics (Oxford, England)                                            68\n",
       "Nature communications                                                       58\n",
       "                                                                          ... \n",
       "Zhongguo yi xue ke xue yuan xue bao. Acta Academiae Medicinae Sinicae        1\n",
       "Zhonghua nan ke xue = National journal of andrology                          1\n",
       "Zhonghua yu fang yi xue za zhi [Chinese journal of preventive medicine]      1\n",
       "Zoology (Jena, Germany)                                                      1\n",
       "mSphere                                                                      1\n",
       "Name: journal, Length: 975, dtype: int64"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# per-journal counts over `literature.journal`\n",
    "# NOTE(review): `sorted_value_counts` is not a stock pandas method - presumably registered by notebook_setup; confirm\n",
    "journal_freq = literature.journal.sorted_value_counts()\n",
    "journal_freq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.750941210541558"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# keep only the journals counted at least 3 times in `journal_freq`\n",
    "popular_journals = journal_freq[journal_freq >= 3]\n",
    "# fraction of all counted records that the retained journals account for\n",
    "popular_journals.sum() / journal_freq.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Stored `popular_journals` (0B2CABD1 → 0B2CABD1) at Tuesday, 04. Aug 2020 17:58"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {
      "text/markdown": {
       "action": "store",
       "command": "store popular_journals in pubmed_derived_data",
       "finished": "2020-08-04T17:58:58.838195",
       "finished_human_readable": "Tuesday, 04. Aug 2020 17:58",
       "result": [
        {
         "new_file": {
          "crc32": "0B2CABD1",
          "sha256": "90D36B3DA0AF97C85591B7E55E1298A1498C6504032163879A08F825EADC3164"
         },
         "old_file": {
          "crc32": "0B2CABD1",
          "sha256": "90D36B3DA0AF97C85591B7E55E1298A1498C6504032163879A08F825EADC3164"
         },
         "subject": "popular_journals"
        }
       ],
       "started": "2020-08-04T17:58:56.919579"
      }
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%vault store popular_journals in pubmed_derived_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reusing the results from cache/all_articles_by_journal_and_year.pickle (last modified on 2020-08-02 13:24)\n"
     ]
    }
   ],
   "source": [
    "%%cache all_articles_by_journal_and_year all_articles_by_journal_and_year\n",
    "\n",
    "# Control: number of PubMed records for every (popular journal, year) pair.\n",
    "journal_year_counts = []\n",
    "\n",
    "for journal in tqdm(sorted(popular_journals.index)):\n",
    "    for year in years_set:\n",
    "        query = f'(\"{journal}\"[Journal]) AND (\"{year}\"[Date - Publication])'\n",
    "        # a single record is requested; only the reported total is read below\n",
    "        response = entrez_api.search(query, database='pubmed', max_results=1)\n",
    "        count = int(response.data['esearchresult']['count'])\n",
    "        assert count >= 0\n",
    "        journal_year_counts.append(dict(count=count, year=year, journal=journal))\n",
    "\n",
    "all_articles_by_journal_and_year = DataFrame(journal_year_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Stored `all_articles_by_journal_and_year` (AB6E261E → AB6E261E) at Tuesday, 04. Aug 2020 17:59"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {
      "text/markdown": {
       "action": "store",
       "command": "store all_articles_by_journal_and_year in pubmed_derived_data",
       "finished": "2020-08-04T17:59:00.869364",
       "finished_human_readable": "Tuesday, 04. Aug 2020 17:59",
       "result": [
        {
         "new_file": {
          "crc32": "AB6E261E",
          "sha256": "343D4005442B93F41397AF04892D839174F38A2128ED5A08201A581D7FAF0201"
         },
         "old_file": {
          "crc32": "AB6E261E",
          "sha256": "343D4005442B93F41397AF04892D839174F38A2128ED5A08201A581D7FAF0201"
         },
         "subject": "all_articles_by_journal_and_year"
        }
       ],
       "started": "2020-08-04T17:58:58.895462"
      }
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%vault store all_articles_by_journal_and_year in pubmed_derived_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a control for cancer enrichment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2002.0, 2020.0)"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First and last publication year present in `literature.date`.\n",
    "# The pandas reductions skip missing dates, whereas the builtin min()/max() give\n",
    "# order-dependent results once NaN is present (every comparison with NaN is False).\n",
    "# The int() cast keeps the years as 2002 rather than 2002.0 when they are\n",
    "# interpolated into PubMed date queries.\n",
    "publication_years = literature.date.dt.year\n",
    "MIN_DATE = int(publication_years.min())\n",
    "MAX_DATE = int(publication_years.max())\n",
    "MIN_DATE, MAX_DATE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'((\"2002.0\"[Date - Publication] : \"2020.0\"[Date - Publication]))'"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# PubMed publication-date range clause spanning MIN_DATE..MAX_DATE.\n",
    "# int() renders the bounds as plain years (2002, not 2002.0) even if the year values are floats.\n",
    "SAME_PERIOD_AS_MULTI_OMICS = (\n",
    "    f'((\"{int(MIN_DATE)}\"[Date - Publication] : \"{int(MAX_DATE)}\"[Date - Publication]))'\n",
    ")\n",
    "SAME_PERIOD_AS_MULTI_OMICS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All-fields search (any indexed field of the PubMed record, not the article full text):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reusing the results from cache/cancer_articles_from_popular_journals_any_field.pickle (last modified on 2020-08-03 01:55)\n"
     ]
    }
   ],
   "source": [
    "%%cache cancer_articles_from_popular_journals_any_field cancer_articles_from_popular_journals_any_field\n",
    "\n",
    "# Per popular journal: number of PubMed records matching 'cancer' in any field,\n",
    "# restricted by SAME_PERIOD_AS_MULTI_OMICS.\n",
    "cancer_articles_by_journal = []\n",
    "\n",
    "for journal in tqdm(sorted(popular_journals.index)):\n",
    "    query = f'(\"{journal}\"[Journal]) AND (\"cancer\"[All Fields]) AND {SAME_PERIOD_AS_MULTI_OMICS}'\n",
    "    # a single record is requested; only the reported total is read below\n",
    "    response = entrez_api.search(query, database='pubmed', max_results=1)\n",
    "    count = int(response.data['esearchresult']['count'])\n",
    "    assert count >= 0\n",
    "    cancer_articles_by_journal.append(dict(count=count, journal=journal))\n",
    "\n",
    "cancer_articles_from_popular_journals_any_field = DataFrame(cancer_articles_by_journal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Stored `cancer_articles_from_popular_journals_any_field` (6931F0FF → 6931F0FF) at Tuesday, 04. Aug 2020 17:59"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {
      "text/markdown": {
       "action": "store",
       "command": "store cancer_articles_from_popular_journals_any_field in pubmed_derived_data",
       "finished": "2020-08-04T17:59:02.972861",
       "finished_human_readable": "Tuesday, 04. Aug 2020 17:59",
       "result": [
        {
         "new_file": {
          "crc32": "6931F0FF",
          "sha256": "D891354ECC232F9BDC07328CDBE8707ECE13127B0850FB3C67CA065D49D34C34"
         },
         "old_file": {
          "crc32": "6931F0FF",
          "sha256": "D891354ECC232F9BDC07328CDBE8707ECE13127B0850FB3C67CA065D49D34C34"
         },
         "subject": "cancer_articles_from_popular_journals_any_field"
        }
       ],
       "started": "2020-08-04T17:59:01.003408"
      }
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%vault store cancer_articles_from_popular_journals_any_field in pubmed_derived_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Title/abstract only:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reusing the results from cache/cancer_articles_from_popular_journals_tiab_only.pickle (last modified on 2020-08-03 02:02)\n"
     ]
    }
   ],
   "source": [
    "%%cache cancer_articles_from_popular_journals_tiab_only cancer_articles_from_popular_journals_tiab_only\n",
    "\n",
    "# Per popular journal: number of PubMed records matching 'cancer' in the TIAB field,\n",
    "# restricted by SAME_PERIOD_AS_MULTI_OMICS.\n",
    "cancer_tiab_articles_by_journal = []\n",
    "\n",
    "for journal in tqdm(sorted(popular_journals.index)):\n",
    "    query = f'(\"{journal}\"[Journal]) AND (\"cancer\"[TIAB]) AND {SAME_PERIOD_AS_MULTI_OMICS}'\n",
    "    # a single record is requested; only the reported total is read below\n",
    "    response = entrez_api.search(query, database='pubmed', max_results=1)\n",
    "    count = int(response.data['esearchresult']['count'])\n",
    "    assert count >= 0\n",
    "    cancer_tiab_articles_by_journal.append(dict(count=count, journal=journal))\n",
    "\n",
    "cancer_articles_from_popular_journals_tiab_only = DataFrame(cancer_tiab_articles_by_journal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Stored `cancer_articles_from_popular_journals_tiab_only` (C6D2493E → C6D2493E) at Tuesday, 04. Aug 2020 17:59"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {
      "text/markdown": {
       "action": "store",
       "command": "store cancer_articles_from_popular_journals_tiab_only in pubmed_derived_data",
       "finished": "2020-08-04T17:59:04.991693",
       "finished_human_readable": "Tuesday, 04. Aug 2020 17:59",
       "result": [
        {
         "new_file": {
          "crc32": "C6D2493E",
          "sha256": "F0C0D1C024BD2CED3E45832958994F88EAB809CDFFAC97C732126B08B87B2C64"
         },
         "old_file": {
          "crc32": "C6D2493E",
          "sha256": "F0C0D1C024BD2CED3E45832958994F88EAB809CDFFAC97C732126B08B87B2C64"
         },
         "subject": "cancer_articles_from_popular_journals_tiab_only"
        }
       ],
       "started": "2020-08-04T17:59:03.023870"
      }
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%vault store cancer_articles_from_popular_journals_tiab_only in pubmed_derived_data"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}