2390 lines (2389 with data), 97.2 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%run notebook_setup.ipynb"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define search terms"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from search_terms import primary_terms, secondary_terms, descriptive_terms"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'multi-omics': '(\"multi-omic\"[Text Words]) OR (\"multiomic\"[Text Words]) OR (\"multi omic\"[Text Words]) OR (\"multi-omics\"[Text Words]) OR (\"multiomics\"[Text Words]) OR (\"multi omics\"[Text Words])',\n",
" 'pan-omics': '(\"pan-omic\"[Text Words]) OR (\"panomic\"[Text Words]) OR (\"pan omic\"[Text Words]) OR (\"pan-omics\"[Text Words]) OR (\"panomics\"[Text Words]) OR (\"pan omics\"[Text Words])',\n",
" 'trans-omics': '(\"trans-omic\"[Text Words]) OR (\"transomic\"[Text Words]) OR (\"trans omic\"[Text Words]) OR (\"trans-omics\"[Text Words]) OR (\"transomics\"[Text Words]) OR (\"trans omics\"[Text Words])',\n",
" 'poly-omics': '(\"poly-omic\"[Text Words]) OR (\"polyomic\"[Text Words]) OR (\"poly omic\"[Text Words]) OR (\"poly-omics\"[Text Words]) OR (\"polyomics\"[Text Words]) OR (\"poly omics\"[Text Words])',\n",
" 'cross-omics': '(\"cross-omic\"[Text Words]) OR (\"crossomic\"[Text Words]) OR (\"cross omic\"[Text Words]) OR (\"cross-omics\"[Text Words]) OR (\"crossomics\"[Text Words]) OR (\"cross omics\"[Text Words])'}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"primary_terms"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'multi-table omics': '((\"multi-table\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-table\"[Text Words]) AND (omics[Text Words])) OR ((\"multitable\"[Text Words]) AND (omic[Text Words])) OR ((\"multitable\"[Text Words]) AND (omics[Text Words])) OR ((\"multi table\"[Text Words]) AND (omic[Text Words])) OR ((\"multi table\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-tables\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-tables\"[Text Words]) AND (omics[Text Words])) OR ((\"multitables\"[Text Words]) AND (omic[Text Words])) OR ((\"multitables\"[Text Words]) AND (omics[Text Words])) OR ((\"multi tables\"[Text Words]) AND (omic[Text Words])) OR ((\"multi tables\"[Text Words]) AND (omics[Text Words]))',\n",
" 'multi-source omics': '((\"multi-source\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-source\"[Text Words]) AND (omics[Text Words])) OR ((\"multisource\"[Text Words]) AND (omic[Text Words])) OR ((\"multisource\"[Text Words]) AND (omics[Text Words])) OR ((\"multi source\"[Text Words]) AND (omic[Text Words])) OR ((\"multi source\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-sources\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-sources\"[Text Words]) AND (omics[Text Words])) OR ((\"multisources\"[Text Words]) AND (omic[Text Words])) OR ((\"multisources\"[Text Words]) AND (omics[Text Words])) OR ((\"multi sources\"[Text Words]) AND (omic[Text Words])) OR ((\"multi sources\"[Text Words]) AND (omics[Text Words]))',\n",
" 'multi-view omics': '((\"multi-view\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-view\"[Text Words]) AND (omics[Text Words])) OR ((\"multiview\"[Text Words]) AND (omic[Text Words])) OR ((\"multiview\"[Text Words]) AND (omics[Text Words])) OR ((\"multi view\"[Text Words]) AND (omic[Text Words])) OR ((\"multi view\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-views\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-views\"[Text Words]) AND (omics[Text Words])) OR ((\"multiviews\"[Text Words]) AND (omic[Text Words])) OR ((\"multiviews\"[Text Words]) AND (omics[Text Words])) OR ((\"multi views\"[Text Words]) AND (omic[Text Words])) OR ((\"multi views\"[Text Words]) AND (omics[Text Words]))',\n",
" 'multi-modal omics': '((\"multi-modal\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-modal\"[Text Words]) AND (omics[Text Words])) OR ((\"multimodal\"[Text Words]) AND (omic[Text Words])) OR ((\"multimodal\"[Text Words]) AND (omics[Text Words])) OR ((\"multi modal\"[Text Words]) AND (omic[Text Words])) OR ((\"multi modal\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-modals\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-modals\"[Text Words]) AND (omics[Text Words])) OR ((\"multimodals\"[Text Words]) AND (omic[Text Words])) OR ((\"multimodals\"[Text Words]) AND (omics[Text Words])) OR ((\"multi modals\"[Text Words]) AND (omic[Text Words])) OR ((\"multi modals\"[Text Words]) AND (omics[Text Words]))',\n",
" 'multi-block omics': '((\"multi-block\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-block\"[Text Words]) AND (omics[Text Words])) OR ((\"multiblock\"[Text Words]) AND (omic[Text Words])) OR ((\"multiblock\"[Text Words]) AND (omics[Text Words])) OR ((\"multi block\"[Text Words]) AND (omic[Text Words])) OR ((\"multi block\"[Text Words]) AND (omics[Text Words])) OR ((\"multi-blocks\"[Text Words]) AND (omic[Text Words])) OR ((\"multi-blocks\"[Text Words]) AND (omics[Text Words])) OR ((\"multiblocks\"[Text Words]) AND (omic[Text Words])) OR ((\"multiblocks\"[Text Words]) AND (omics[Text Words])) OR ((\"multi blocks\"[Text Words]) AND (omic[Text Words])) OR ((\"multi blocks\"[Text Words]) AND (omics[Text Words]))'}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"secondary_terms"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'integrative omics': '\"integrative omic\"[Text Words] OR \"integrative omics\"[Text Words]',\n",
" 'integrated omics': '\"integrated omic\"[Text Words] OR \"integrated omics\"[Text Words]',\n",
" 'integromics': '\"integromic\"[Text Words] OR \"integromics\"[Text Words]'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"descriptive_terms"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Perform search in PubMed"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from easy_entrez import EntrezAPI\n",
"from config import ENTREZ_API_NAME, ENTREZ_API_EMAIL\n",
"\n",
"entrez_api = EntrezAPI(\n",
" tool=ENTREZ_API_NAME,\n",
" email=ENTREZ_API_EMAIL,\n",
" minimal_interval=2\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"search_terms = {\n",
" **primary_terms,\n",
" **secondary_terms,\n",
" **descriptive_terms\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reusing the results from cache/search_results.pickle (last modified on 2020-07-25 06:51)\n"
]
}
],
"source": [
"%%cache search_results pubmed_results\n",
"\n",
"pubmed_results = {}\n",
"\n",
"MAX_RESULTS = 10_000\n",
"\n",
"for term in tqdm(search_terms):\n",
" result = entrez_api.search(\n",
" search_terms[term],\n",
" database='pubmed',\n",
" max_results=MAX_RESULTS\n",
" )\n",
" esearch = result.data['esearchresult']\n",
" count = int(esearch['count'])\n",
" assert count >= 0\n",
" assert count < MAX_RESULTS\n",
"\n",
" pubmed_results[term] = result"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"all_papers = sorted(set(sum(\n",
" [\n",
" result.data['esearchresult']['idlist']\n",
" for result in pubmed_results.values()\n",
" ],\n",
" []\n",
")))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3456"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(all_papers)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reusing the results from cache/pubmed_documents_data.pickle (last modified on 2020-07-25 06:54)\n"
]
}
],
"source": [
"%%cache pubmed_documents_data documents\n",
"\n",
"documents_by_batch = (\n",
" entrez_api\n",
" .in_batches_of(size=100)\n",
" .fetch(all_papers, max_results=10_000, return_type='xml')\n",
")\n",
"\n",
"documents = sum(\n",
" (\n",
" list(result.data)\n",
" for result in documents_by_batch.values()\n",
" ),\n",
" []\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from helpers.utils import xml_element_to_json\n",
"documents = [xml_element_to_json(document) for document in list(documents)]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"assert len(documents) == len(all_papers)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a data frame with PubMed documents and covariates"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from pandas import Series, DataFrame, read_csv, to_datetime"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>multi-omics</th>\n",
" <th>pan-omics</th>\n",
" <th>trans-omics</th>\n",
" <th>poly-omics</th>\n",
" <th>cross-omics</th>\n",
" <th>multi-table omics</th>\n",
" <th>multi-source omics</th>\n",
" <th>multi-view omics</th>\n",
" <th>multi-modal omics</th>\n",
" <th>multi-block omics</th>\n",
" <th>integrative omics</th>\n",
" <th>integrated omics</th>\n",
" <th>integromics</th>\n",
" </tr>\n",
" <tr>\n",
" <th>uid</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12186644</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15687693</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15687700</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15687839</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15763567</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32697738</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32698759</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32698873</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32699215</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32700803</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3456 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" multi-omics pan-omics trans-omics poly-omics cross-omics \\\n",
"uid \n",
"12186644 False False False False False \n",
"15687693 False False False False False \n",
"15687700 False False True False False \n",
"15687839 False False False False False \n",
"15763567 True False False False False \n",
"... ... ... ... ... ... \n",
"32697738 True False False False False \n",
"32698759 True False False False False \n",
"32698873 True False False False False \n",
"32699215 True False False False False \n",
"32700803 True False False False False \n",
"\n",
" multi-table omics multi-source omics multi-view omics \\\n",
"uid \n",
"12186644 False False False \n",
"15687693 False False False \n",
"15687700 False False False \n",
"15687839 False False False \n",
"15763567 False False False \n",
"... ... ... ... \n",
"32697738 False False False \n",
"32698759 False False False \n",
"32698873 False False False \n",
"32699215 False False False \n",
"32700803 False False False \n",
"\n",
" multi-modal omics multi-block omics integrative omics \\\n",
"uid \n",
"12186644 False False False \n",
"15687693 False False False \n",
"15687700 False False False \n",
"15687839 False False True \n",
"15763567 False False False \n",
"... ... ... ... \n",
"32697738 False False False \n",
"32698759 False False False \n",
"32698873 False False False \n",
"32699215 False False False \n",
"32700803 False False False \n",
"\n",
" integrated omics integromics \n",
"uid \n",
"12186644 False True \n",
"15687693 False True \n",
"15687700 False False \n",
"15687839 False False \n",
"15763567 False False \n",
"... ... ... \n",
"32697738 False False \n",
"32698759 False False \n",
"32698873 False False \n",
"32699215 False False \n",
"32700803 False False \n",
"\n",
"[3456 rows x 13 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create a frame with 0 columns and UID of each paper on the index\n",
"literature = Series(all_papers).to_frame('uid').set_index('uid')\n",
"# add columns for the occurrences of the terms\n",
"for term, result in pubmed_results.items():\n",
" literature[term] = False\n",
" for uid in result.data['esearchresult']['idlist']:\n",
" literature.loc[uid, term] = True\n",
"literature"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parse the PubMed metadata of articles"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reference:\n",
" - Medline: https://www.nlm.nih.gov/bsd/mms/medlineelements.html\n",
" - Publication types: https://www.nlm.nih.gov/mesh/pubtypes.html (fun fact: includes \"Wit and Humor\" type)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-17-32b97e314561>:68: UserWarning: {'JournalIssue': {'@CitedMedium': 'Internet', 'Volume': '1', 'Issue': '1', 'PubDate': {'Year': '2018'}}, 'Title': 'Journal of clinical genomics', 'ISOAbbreviation': 'J Clin Genom'} had no ISSN assigned\n",
" warn(f'{article[\"Journal\"]} had no ISSN assigned')\n",
"<ipython-input-17-32b97e314561>:68: UserWarning: {'JournalIssue': {'@CitedMedium': 'Print', 'Volume': '2018', 'PubDate': {'Year': '2018', 'Month': 'Oct'}}, 'Title': 'Proceedings of the ... International Conference on Data Science and Advanced Analytics. IEEE International Conference on Data Science and Advanced Analytics', 'ISOAbbreviation': 'Proc Int Conf Data Sci Adv Anal'} had no ISSN assigned\n",
" warn(f'{article[\"Journal\"]} had no ISSN assigned')\n",
"<ipython-input-17-32b97e314561>:68: UserWarning: {'JournalIssue': {'@CitedMedium': 'Print', 'Volume': '2019', 'PubDate': {'Year': '2019', 'Month': 'Sep'}}, 'Title': 'ACM-BCB ... ... : the ... ACM Conference on Bioinformatics, Computational Biology and Biomedicine. ACM Conference on Bioinformatics, Computational Biology and Biomedicine', 'ISOAbbreviation': 'ACM BCB'} had no ISSN assigned\n",
" warn(f'{article[\"Journal\"]} had no ISSN assigned')\n"
]
}
],
"source": [
"from warnings import warn\n",
"from helpers.parse_pubmed import listify, extract_abstract, parse_date, parse_doi\n",
"\n",
"missing_abstract = []\n",
"authors = []\n",
"affiliations = []\n",
"\n",
"publication_types = []\n",
"\n",
"for document in documents:\n",
"\n",
" kind = None\n",
" date = None\n",
" doi = None\n",
"\n",
" if 'PubmedBookArticle' in document:\n",
" kind = 'article in book'\n",
" book_document = document['PubmedBookArticle']['BookDocument']\n",
" pmid = book_document['PMID']['#text']\n",
"\n",
" title = book_document['ArticleTitle']['#text']\n",
" abstract = extract_abstract(book_document)\n",
"\n",
" # 'PublicationType' and 'KeywordList' ignored for book_document as only 2 matches (compared to 3k)\n",
"\n",
" if 'PubmedArticle' in document:\n",
" pubmed_article = document['PubmedArticle']\n",
" assert not kind\n",
" kind = 'article'\n",
" medline_citation = pubmed_article['MedlineCitation']\n",
" pmid = medline_citation['PMID']['#text']\n",
" article = medline_citation['Article']\n",
" literature.loc[pmid, 'journal'] = article['Journal']['Title']\n",
"\n",
" if 'ELocationID' in article:\n",
" doi = parse_doi(article['ELocationID'])\n",
"\n",
" issue = article['Journal']['JournalIssue']\n",
" if 'PubDate' in issue:\n",
" date = parse_date(issue['PubDate'])\n",
"\n",
" for author in listify(article['AuthorList']['Author'] if 'AuthorList' in article else None):\n",
" author_id = len(authors)\n",
" authors.append(\n",
" {\n",
" 'ID': author_id,\n",
" 'ForeName': author.get('ForeName'),\n",
" 'LastName': author.get('LastName'),\n",
" 'CollectiveName': author.get('CollectiveName'),\n",
" 'PMID': pmid\n",
" }\n",
" )\n",
" for affiliation in listify(author.get('AffiliationInfo')):\n",
" affiliations.append({\n",
" 'Affiliation': affiliation['Affiliation'],\n",
" 'PMID': pmid,\n",
" 'AuthorID': author_id\n",
" })\n",
"\n",
" for publication_type in listify(article['PublicationTypeList']['PublicationType'] if 'PublicationTypeList' in article else None):\n",
" type_name = publication_type['#text']\n",
" publication_types.append(type_name)\n",
" literature.loc[pmid, f'Is {type_name}'] = True\n",
"\n",
" try:\n",
" literature.loc[pmid, 'journal_issn'] = article['Journal']['ISSN']['#text']\n",
" except KeyError:\n",
" warn(f'{article[\"Journal\"]} had no ISSN assigned')\n",
" if 'ArticleTitle' in article:\n",
" title = article['ArticleTitle']\n",
" if isinstance(title, dict):\n",
" title = title['#text']\n",
"\n",
" abstract = extract_abstract(article)\n",
"\n",
" if not abstract:\n",
" missing_abstract.append(pmid)\n",
"\n",
" assert kind\n",
"\n",
" literature.loc[pmid, 'kind'] = kind\n",
" literature.loc[pmid, 'doi'] = doi\n",
" literature.loc[pmid, 'title'] = title\n",
" literature.loc[pmid, 'abstract'] = abstract\n",
" literature.loc[pmid, 'date'] = date\n",
"\n",
"publication_types = Series(publication_types)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"index\n",
"Journal Article 3370\n",
"Research Support, Non-U.S. Gov't 1371\n",
"Review 744\n",
"Research Support, N.I.H., Extramural 460\n",
"Research Support, U.S. Gov't, Non-P.H.S. 161\n",
"Comparative Study 61\n",
"Editorial 44\n",
"Comment 37\n",
"Clinical Trial 26\n",
"Published Erratum 23\n",
"Multicenter Study 21\n",
"Research Support, N.I.H., Intramural 16\n",
"Evaluation Study 13\n",
"Letter 13\n",
"Case Reports 9\n",
"Dataset 9\n",
"Introductory Journal Article 7\n",
"Observational Study 7\n",
"Twin Study 7\n",
"Validation Study 7\n",
"English Abstract 6\n",
"Randomized Controlled Trial 6\n",
"Systematic Review 6\n",
"Video-Audio Media 6\n",
"Meta-Analysis 5\n",
"Research Support, U.S. Gov't, P.H.S. 5\n",
"Congress 4\n",
"Interview 3\n",
"News 2\n",
"Clinical Trial, Phase II 1\n",
"Consensus Development Conference, NIH 1\n",
"Controlled Clinical Trial 1\n",
"Historical Article 1\n",
"Practice Guideline 1\n",
"Preprint 1\n",
"Name: 0, dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"publication_types.sorted_value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"affiliations = DataFrame(affiliations)\n",
"authors = DataFrame(authors)\n",
"\n",
"authors['JointName'] = authors['ForeName'] + ' ' + authors['LastName']"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"literature['has_doi'] = ~literature.doi.isnull()\n",
"literature.date = to_datetime(literature.date)\n",
"literature['year'] = literature.date.dt.year"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"terms = list(pubmed_results.keys())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def which_term(term):\n",
" term = list(term[term].index)\n",
" if len(term) == 1:\n",
" return term[0]\n",
" else:\n",
" return 'multiple'"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"literature['term'] = literature[terms].apply(which_term, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from pandas import Categorical\n",
"literature['term'] = Categorical(literature['term'], ordered=True, categories=list(literature['term'].sorted_value_counts().index))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"literature['has_url_in_abstract'] = literature['abstract'].str.contains('(?:https?://|www.)')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add PubmedCentral mapping"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reusing the results from cache/pubmed_central_metadata.pickle (last modified on 2020-07-25 06:55)\n"
]
}
],
"source": [
"%%cache pubmed_central_metadata pmc_metadata\n",
"# approx 2GB in RAM, best to subset early\n",
"pmc_metadata_all = read_csv('data/PMC-ids.csv.gz')\n",
"pmid_of_interest = set(literature.index)\n",
"pmc_metadata = pmc_metadata_all[pmc_metadata_all.PMID.isin(pmid_of_interest)]\n",
"del pmc_metadata_all"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1951"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(pmc_metadata)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Journal Title</th>\n",
" <th>ISSN</th>\n",
" <th>eISSN</th>\n",
" <th>Year</th>\n",
" <th>Volume</th>\n",
" <th>Issue</th>\n",
" <th>Page</th>\n",
" <th>DOI</th>\n",
" <th>PMCID</th>\n",
" <th>PMID</th>\n",
" <th>Manuscript Id</th>\n",
" <th>Release Date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>83696</th>\n",
" <td>Genome Biol</td>\n",
" <td>1474-7596</td>\n",
" <td>1474-760X</td>\n",
" <td>2002</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>reports4027.1</td>\n",
" <td>10.1186/gb-2002-3-8-reports4027</td>\n",
" <td>PMC139396</td>\n",
" <td>12186644.0</td>\n",
" <td>NaN</td>\n",
" <td>live</td>\n",
" </tr>\n",
" <tr>\n",
" <th>817169</th>\n",
" <td>J Virol</td>\n",
" <td>0022-538X</td>\n",
" <td>1098-5514</td>\n",
" <td>2006</td>\n",
" <td>80</td>\n",
" <td>9</td>\n",
" <td>4356</td>\n",
" <td>10.1128/JVI.80.9.4356-4362.2006</td>\n",
" <td>PMC1472023</td>\n",
" <td>16611894.0</td>\n",
" <td>NaN</td>\n",
" <td>live</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1155415</th>\n",
" <td>Proc Natl Acad Sci U S A</td>\n",
" <td>0027-8424</td>\n",
" <td>1091-6490</td>\n",
" <td>2007</td>\n",
" <td>104</td>\n",
" <td>15</td>\n",
" <td>6478</td>\n",
" <td>10.1073/pnas.0611629104</td>\n",
" <td>PMC1849962</td>\n",
" <td>17420480.0</td>\n",
" <td>NaN</td>\n",
" <td>live</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1212422</th>\n",
" <td>J Bacteriol</td>\n",
" <td>0021-9193</td>\n",
" <td>1098-5530</td>\n",
" <td>2007</td>\n",
" <td>189</td>\n",
" <td>13</td>\n",
" <td>4635</td>\n",
" <td>10.1128/JB.00128-07</td>\n",
" <td>PMC1913438</td>\n",
" <td>17449607.0</td>\n",
" <td>NaN</td>\n",
" <td>live</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1430120</th>\n",
" <td>Osteoarthritis Cartilage</td>\n",
" <td>1063-4584</td>\n",
" <td>1522-9653</td>\n",
" <td>2007</td>\n",
" <td>15</td>\n",
" <td>12</td>\n",
" <td>1367</td>\n",
" <td>10.1016/j.joca.2007.04.011</td>\n",
" <td>PMC2153443</td>\n",
" <td>17604656.0</td>\n",
" <td>NIHMS34878</td>\n",
" <td>live</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Journal Title ISSN eISSN Year Volume Issue \\\n",
"83696 Genome Biol 1474-7596 1474-760X 2002 3 8 \n",
"817169 J Virol 0022-538X 1098-5514 2006 80 9 \n",
"1155415 Proc Natl Acad Sci U S A 0027-8424 1091-6490 2007 104 15 \n",
"1212422 J Bacteriol 0021-9193 1098-5530 2007 189 13 \n",
"1430120 Osteoarthritis Cartilage 1063-4584 1522-9653 2007 15 12 \n",
"\n",
" Page DOI PMCID \\\n",
"83696 reports4027.1 10.1186/gb-2002-3-8-reports4027 PMC139396 \n",
"817169 4356 10.1128/JVI.80.9.4356-4362.2006 PMC1472023 \n",
"1155415 6478 10.1073/pnas.0611629104 PMC1849962 \n",
"1212422 4635 10.1128/JB.00128-07 PMC1913438 \n",
"1430120 1367 10.1016/j.joca.2007.04.011 PMC2153443 \n",
"\n",
" PMID Manuscript Id Release Date \n",
"83696 12186644.0 NaN live \n",
"817169 16611894.0 NaN live \n",
"1155415 17420480.0 NaN live \n",
"1212422 17449607.0 NaN live \n",
"1430120 17604656.0 NIHMS34878 live "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pmc_metadata.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"literature['PMC'] = pmc_metadata.set_index('PMID').reindex(literature.index.astype(float))['PMCID']\n",
"assert len(pmc_metadata) == sum(~literature['PMC'].isnull())\n",
"\n",
"literature['has_pmc'] = (~literature['PMC'].isnull())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note can also try to find missing PMCs in the summaries:"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# result = entrez_api.search(primary_terms['poly-omics'], max_results=10_000)\n",
"# summary = entrez_api.summarize(result.data['esearchresult']['idlist'][:5], max_results=10_000)\n",
"# summary.data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download full texts as XML"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['PMC139396', 'PMC1472023', 'PMC1849962', 'PMC1913438']"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pmc_ids = literature[literature['has_pmc']]['PMC'].tolist()\n",
"pmc_ids[:4]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reusing the results from cache/pubmed_central_xml.pickle (last modified on 2020-07-25 06:59)\n"
]
}
],
"source": [
"%%cache pubmed_central_xml pmc_xmls\n",
"pmc_full_texts = entrez_api.in_batches_of(size=100).fetch(pmc_ids, max_results=5_000, database='pmc', return_type='xml')\n",
"\n",
"pmc_xmls = sum(\n",
" [\n",
" list(response.data)\n",
" for response in pmc_full_texts.values()\n",
" ],\n",
" []\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1951"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(pmc_xmls)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"ignore_text = {'xref', 'table', 'thead', 'th', 'td', 'tr', 'graphic'}\n",
"\n",
"\n",
"def extract_text(body) -> str:\n",
" fragments = []\n",
" for i in body.iter():\n",
" if i.tag in ignore_text:\n",
" continue\n",
" text = i.text\n",
" if i.tag == 'label' and text and text.startswith('Figure'):\n",
" continue\n",
" if text:\n",
" fragments.append(text)\n",
" return '\\n'.join(fragments)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"literature_subjects = literature.index.to_frame().drop(columns='uid').copy()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"for xml in pmc_xmls:\n",
" pmid = xml.find('front/article-meta/article-id[@pub-id-type=\"pmid\"]').text\n",
" body = xml.find('body')\n",
" has_full_text = body is not None\n",
"\n",
" subjects = [subject.text for subject in xml.findall('front/article-meta//subject')]\n",
"\n",
" literature.loc[pmid, 'has_full_text'] = has_full_text\n",
" literature.loc[pmid, 'full_text'] = extract_text(body) if has_full_text else None\n",
" literature.loc[pmid, 'article_type'] = xml.attrib['article-type']\n",
"\n",
" for subject in subjects:\n",
" literature_subjects.loc[pmid, subject] = True\n",
"\n",
"literature_subjects = literature_subjects.fillna(False)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Article 582\n",
"Research Article 240\n",
"Review 163\n",
"Research 139\n",
"Genetics 109\n",
"Original Research 100\n",
"Research Paper 67\n",
"Biology and Life Sciences 61\n",
"Biochemistry 53\n",
"Microbiology 51\n",
"dtype: int64"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"literature_subjects.sum().sort_values(ascending=False).head(10)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"index\n",
"research-article 1566\n",
"review-article 253\n",
"brief-report 29\n",
"editorial 26\n",
"correction 17\n",
"data-paper 13\n",
"other 8\n",
"article-commentary 7\n",
"letter 7\n",
"discussion 5\n",
"methods-article 5\n",
"product-review 5\n",
"chapter-article 3\n",
"meeting-report 2\n",
"protocol 2\n",
"abstract 1\n",
"addendum 1\n",
"systematic-review 1\n",
"Name: article_type, dtype: int64"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"literature.article_type.sorted_value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1520"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(literature['has_full_text'] == True)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"#from helpers.utils import display_xml\n",
"#display_xml(pmc_xmls[-2].find('body'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Abstract clean-up"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Many abstracts contains sections/organising headers, such as:"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['BACKGROUND', 'MOTIVATION', 'OBJECTIVE', 'SCOPE']"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"['BACKGROUND', 'MOTIVATION', 'OBJECTIVE', 'SCOPE']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"By convention those are upper case in PubMed. Here We filter those out:"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"from re import findall\n",
"\n",
"\n",
"def extract_upper_case(abstract: str, min_len: int = 3):\n",
" if abstract:\n",
" return findall('([A-Z]{' + str(min_len) + ',})', abstract)\n",
" return []\n",
"\n",
"\n",
"def count_upper_case_phrases(data: Series, min_len: int = 3) -> Series:\n",
" return Series(sum(data.apply(extract_upper_case, min_len=min_len), [])).sorted_value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"index\n",
"RNA 2421\n",
"DNA 978\n",
"RESULTS 460\n",
"BACKGROUND 347\n",
"CONCLUSIONS 289\n",
"METHODS 240\n",
"HCC 211\n",
"TCGA 204\n",
"SNP 176\n",
"GWAS 159\n",
"CONCLUSION 146\n",
"CRC 138\n",
"QTL 137\n",
"WNT 126\n",
"RCC 121\n",
"GBM 116\n",
"IBD 116\n",
"TNBC 108\n",
"Name: 0, dtype: int64"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"potential_headers = count_upper_case_phrases(literature['abstract'])\n",
"potential_headers[potential_headers > 100]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There are many disease abbreviations making the list too long to browse:"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1223"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(potential_headers[potential_headers > 3])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So we will look at longer words:"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3 731\n",
"4 310\n",
"5 95\n",
"6 36\n",
"7 17\n",
"12 9\n",
"10 7\n",
"8 7\n",
"9 6\n",
"14 2\n",
"11 2\n",
"13 1\n",
"Name: index, dtype: int64"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"potential_headers[potential_headers > 3].index.map(len).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"index\n",
"RESULTS 460\n",
"BACKGROUND 347\n",
"CONCLUSIONS 289\n",
"METHODS 240\n",
"CONCLUSION 146\n",
"NAFLD 67\n",
"PURPOSE 58\n",
"CRISPR 57\n",
"OBJECTIVE 54\n",
"AVAILABILITY 51\n",
"NSCLC 47\n",
"HNSCC 46\n",
"MOTIVATION 45\n",
"OMICS 44\n",
"INFORMATION 43\n",
"SUPPLEMENTARY 42\n",
"IMPLEMENTATION 41\n",
"FINDINGS 38\n",
"SIGNIFICANCE 35\n",
"LASSO 33\n",
"Name: 0, dtype: int64"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"potential_headers_long = count_upper_case_phrases(literature['abstract'], min_len=5)\n",
"potential_headers_long.head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I manually chosen headers from among top 100 hits:"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"ABSTRACT_HEADERS = [\n",
" # manually added to prevent hanging \"OF\"\n",
" 'PURPOSE OF REVIEW',\n",
" # chosen from top 100 most frequent\n",
" 'RESULTS',\n",
" 'BACKGROUND',\n",
" 'CONCLUSIONS',\n",
" 'METHODS',\n",
" 'CONCLUSION',\n",
" 'PURPOSE',\n",
" 'OBJECTIVE',\n",
" 'AVAILABILITY',\n",
" 'MOTIVATION',\n",
" 'INFORMATION',\n",
" 'SUPPLEMENTARY',\n",
" 'FINDINGS',\n",
" 'SIGNIFICANCE',\n",
" 'INTRODUCTION',\n",
" 'DESIGN',\n",
" 'OBJECTIVES',\n",
" 'REVIEW',\n",
" 'SUMMARY',\n",
" 'MATERIALS',\n",
" 'STUDY',\n",
" 'EXPERIMENTAL',\n",
" 'DISCUSSION',\n",
" 'REGISTRATION',\n",
" 'METHOD',\n",
" 'CONTACT',\n",
" 'FUTURE',\n",
" 'INTERPRETATION',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"literature['abstract_clean'] = literature['abstract'].str.replace('|'.join(ABSTRACT_HEADERS), '')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Stored `literature` (904B0F94 → 904B0F94) at Tuesday, 04. Aug 2020 17:58"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "store",
"command": "store literature in pubmed_derived_data",
"finished": "2020-08-04T17:58:45.587466",
"finished_human_readable": "Tuesday, 04. Aug 2020 17:58",
"result": [
{
"new_file": {
"crc32": "904B0F94",
"sha256": "A2EFC068A287A3B724AE4B320EE5356E1E99474BD08A2E2A3EBA34CD0194F23B"
},
"old_file": {
"crc32": "904B0F94",
"sha256": "A2EFC068A287A3B724AE4B320EE5356E1E99474BD08A2E2A3EBA34CD0194F23B"
},
"subject": "literature"
}
],
"started": "2020-08-04T17:58:34.355113"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault store literature in pubmed_derived_data"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Stored `literature_subjects` (98E10AF9 → 98E10AF9) at Tuesday, 04. Aug 2020 17:58"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "store",
"command": "store literature_subjects in pubmed_derived_data",
"finished": "2020-08-04T17:58:49.610779",
"finished_human_readable": "Tuesday, 04. Aug 2020 17:58",
"result": [
{
"new_file": {
"crc32": "98E10AF9",
"sha256": "DDA318B4E0EC865063AB772B31DD156B2B8E9EA5531B59A0C04B97C70CD21AA9"
},
"old_file": {
"crc32": "98E10AF9",
"sha256": "DDA318B4E0EC865063AB772B31DD156B2B8E9EA5531B59A0C04B97C70CD21AA9"
},
"subject": "literature_subjects"
}
],
"started": "2020-08-04T17:58:45.623299"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault store literature_subjects in pubmed_derived_data"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Stored:\n",
"\n",
" - `affiliations` (E06399F2 → E06399F2)\n",
" - `authors` (DC49BC74 → DC49BC74)\n",
" - `publication_types` (7DD4E741 → 7DD4E741)\n",
"\n",
"at Tuesday, 04. Aug 2020 17:58"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "store",
"command": "store affiliations, authors, publication_types in pubmed_derived_data",
"finished": "2020-08-04T17:58:56.744355",
"finished_human_readable": "Tuesday, 04. Aug 2020 17:58",
"result": [
{
"new_file": {
"crc32": "E06399F2",
"sha256": "8DD13D4B7CF3D2E314BBC4E051AEDBF21414371F42BB4D100D7721B5F4D24E60"
},
"old_file": {
"crc32": "E06399F2",
"sha256": "8DD13D4B7CF3D2E314BBC4E051AEDBF21414371F42BB4D100D7721B5F4D24E60"
},
"subject": "affiliations"
},
{
"new_file": {
"crc32": "DC49BC74",
"sha256": "237BEFD0FDA68E2A155B9EC00519017B4C9BC92BD2AA3D10E058A013EC0DE1D9"
},
"old_file": {
"crc32": "DC49BC74",
"sha256": "237BEFD0FDA68E2A155B9EC00519017B4C9BC92BD2AA3D10E058A013EC0DE1D9"
},
"subject": "authors"
},
{
"new_file": {
"crc32": "7DD4E741",
"sha256": "BD0EBF88B38BB9E0E44923E2CB473A532AEFBFFC6A7FCC02926290CAD2615150"
},
"old_file": {
"crc32": "7DD4E741",
"sha256": "BD0EBF88B38BB9E0E44923E2CB473A532AEFBFFC6A7FCC02926290CAD2615150"
},
"subject": "publication_types"
}
],
"started": "2020-08-04T17:58:49.637812"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault store affiliations, authors, publication_types in pubmed_derived_data"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"import pandas\n",
"pandas.set_option('display.max_colwidth', 1000)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>term</th>\n",
" <th>translation_stack</th>\n",
" <th>query_translation</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>multi-omics</td>\n",
" <td>\"multi-omic\"[Text]→532 \"multiomic\"[Text]→138 OR \"multi omic\"[Text]→532 OR \"multi-omics\"[Text]→1798 OR \"multiomics\"[Text]→459 OR \"multi omics\"[Text]→1798 OR</td>\n",
" <td>\"multi-omic\"[Text] OR \"multiomic\"[Text] OR \"multi omic\"[Text] OR \"multi-omics\"[Text] OR \"multiomics\"[Text] OR \"multi omics\"[Text]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>pan-omics</td>\n",
" <td>\"pan-omic\"[Text]→5 \"panomic\"[Text]→9 OR \"pan omic\"[Text]→5 OR \"pan-omics\"[Text]→15 OR \"panomics\"[Text]→42 OR \"pan omics\"[Text]→15 OR</td>\n",
" <td>\"pan-omic\"[Text] OR \"panomic\"[Text] OR \"pan omic\"[Text] OR \"pan-omics\"[Text] OR \"panomics\"[Text] OR \"pan omics\"[Text]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>trans-omics</td>\n",
" <td>\"trans-omic\"[Text]→13 \"transomic\"[Text]→6 OR \"trans omic\"[Text]→13 OR \"trans-omics\"[Text]→81 OR \"transomics\"[Text]→12 OR \"trans omics\"[Text]→81 OR</td>\n",
" <td>\"trans-omic\"[Text] OR \"transomic\"[Text] OR \"trans omic\"[Text] OR \"trans-omics\"[Text] OR \"transomics\"[Text] OR \"trans omics\"[Text]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>poly-omics</td>\n",
" <td>\"poly-omic\"[Text]→6 \"polyomic\"[Text]→7 OR \"poly omic\"[Text]→6 OR \"poly-omics\"[Text]→5 OR \"polyomics\"[Text]→6 OR \"poly omics\"[Text]→5 OR</td>\n",
" <td>\"poly-omic\"[Text] OR \"polyomic\"[Text] OR \"poly omic\"[Text] OR \"poly-omics\"[Text] OR \"polyomics\"[Text] OR \"poly omics\"[Text]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>cross-omics</td>\n",
" <td>\"cross-omic\"[Text]→2 \"cross omic\"[Text]→2 OR \"cross-omics\"[Text]→43 OR \"crossomics\"[Text]→1 OR \"cross omics\"[Text]→43 OR</td>\n",
" <td>\"cross-omic\"[Text] OR \"cross omic\"[Text] OR \"cross-omics\"[Text] OR \"crossomics\"[Text] OR \"cross omics\"[Text]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>multi-table omics</td>\n",
" <td>\"multi-table\"[Text]→13 omic[Text]→2139 AND GROUP \"multi-table\"[Text]→13 omics[Text]→11151 AND GROUP OR \"multitable\"[Text]→6 omic[Text]→2139 AND GROUP OR \"multitable\"[Text]→6 omics[Text]→11151 AND GROUP OR \"multi table\"[Text]→13 omic[Text]→2139 AND GROUP OR \"multi table\"[Text]→13 omics[Text]→11151 AND GROUP OR</td>\n",
" <td>(\"multi-table\"[Text] AND omic[Text]) OR (\"multi-table\"[Text] AND omics[Text]) OR (\"multitable\"[Text] AND omic[Text]) OR (\"multitable\"[Text] AND omics[Text]) OR (\"multi table\"[Text] AND omic[Text]) OR (\"multi table\"[Text] AND omics[Text])</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>multi-source omics</td>\n",
" <td>\"multi-source\"[Text]→859 omic[Text]→2139 AND GROUP \"multi-source\"[Text]→859 omics[Text]→11151 AND GROUP OR \"multisource\"[Text]→935 omic[Text]→2139 AND GROUP OR \"multisource\"[Text]→935 omics[Text]→11151 AND GROUP OR \"multi source\"[Text]→859 omic[Text]→2139 AND GROUP OR \"multi source\"[Text]→859 omics[Text]→11151 AND GROUP OR \"multi-sources\"[Text]→34 omic[Text]→2139 AND GROUP OR \"multi-sources\"[Text]→34 omics[Text]→11151 AND GROUP OR \"multisources\"[Text]→5 omic[Text]→2139 AND GROUP OR \"multisources\"[Text]→5 omics[Text]→11151 AND GROUP OR \"multi sources\"[Text]→34 omic[Text]→2139 AND GROUP OR \"multi sources\"[Text]→34 omics[Text]→11151 AND GROUP OR</td>\n",
" <td>(\"multi-source\"[Text] AND omic[Text]) OR (\"multi-source\"[Text] AND omics[Text]) OR (\"multisource\"[Text] AND omic[Text]) OR (\"multisource\"[Text] AND omics[Text]) OR (\"multi source\"[Text] AND omic[Text]) OR (\"multi source\"[Text] AND omics[Text]) OR (\"multi-sources\"[Text] AND omic[Text]) OR (\"multi-sources\"[Text] AND omics[Text]) OR (\"multisources\"[Text] AND omic[Text]) OR (\"multisources\"[Text] AND omics[Text]) OR (\"multi sources\"[Text] AND omic[Text]) OR (\"multi sources\"[Text] AND omics[Text])</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>multi-view omics</td>\n",
" <td>\"multi-view\"[Text]→711 omic[Text]→2139 AND GROUP \"multi-view\"[Text]→711 omics[Text]→11151 AND GROUP OR \"multiview\"[Text]→596 omic[Text]→2139 AND GROUP OR \"multiview\"[Text]→596 omics[Text]→11151 AND GROUP OR \"multi view\"[Text]→711 omic[Text]→2139 AND GROUP OR \"multi view\"[Text]→711 omics[Text]→11151 AND GROUP OR \"multi-views\"[Text]→15 omic[Text]→2139 AND GROUP OR \"multi-views\"[Text]→15 omics[Text]→11151 AND GROUP OR \"multiviews\"[Text]→5 omic[Text]→2139 AND GROUP OR \"multiviews\"[Text]→5 omics[Text]→11151 AND GROUP OR \"multi views\"[Text]→15 omic[Text]→2139 AND GROUP OR \"multi views\"[Text]→15 omics[Text]→11151 AND GROUP OR</td>\n",
" <td>(\"multi-view\"[Text] AND omic[Text]) OR (\"multi-view\"[Text] AND omics[Text]) OR (\"multiview\"[Text] AND omic[Text]) OR (\"multiview\"[Text] AND omics[Text]) OR (\"multi view\"[Text] AND omic[Text]) OR (\"multi view\"[Text] AND omics[Text]) OR (\"multi-views\"[Text] AND omic[Text]) OR (\"multi-views\"[Text] AND omics[Text]) OR (\"multiviews\"[Text] AND omic[Text]) OR (\"multiviews\"[Text] AND omics[Text]) OR (\"multi views\"[Text] AND omic[Text]) OR (\"multi views\"[Text] AND omics[Text])</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>multi-modal omics</td>\n",
" <td>\"multi-modal\"[Text]→3939 omic[Text]→2139 AND GROUP \"multi-modal\"[Text]→3939 omics[Text]→11151 AND GROUP OR \"multimodal\"[Text]→46300 omic[Text]→2139 AND GROUP OR \"multimodal\"[Text]→46300 omics[Text]→11151 AND GROUP OR \"multi modal\"[Text]→3939 omic[Text]→2139 AND GROUP OR \"multi modal\"[Text]→3939 omics[Text]→11151 AND GROUP OR</td>\n",
" <td>(\"multi-modal\"[Text] AND omic[Text]) OR (\"multi-modal\"[Text] AND omics[Text]) OR (\"multimodal\"[Text] AND omic[Text]) OR (\"multimodal\"[Text] AND omics[Text]) OR (\"multi modal\"[Text] AND omic[Text]) OR (\"multi modal\"[Text] AND omics[Text])</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>multi-block omics</td>\n",
" <td>\"multi-block\"[Text]→181 omic[Text]→2139 AND GROUP \"multi-block\"[Text]→181 omics[Text]→11151 AND GROUP OR \"multiblock\"[Text]→576 omic[Text]→2139 AND GROUP OR \"multiblock\"[Text]→576 omics[Text]→11151 AND GROUP OR \"multi block\"[Text]→181 omic[Text]→2139 AND GROUP OR \"multi block\"[Text]→181 omics[Text]→11151 AND GROUP OR \"multi-blocks\"[Text]→4 omic[Text]→2139 AND GROUP OR \"multi-blocks\"[Text]→4 omics[Text]→11151 AND GROUP OR \"multiblocks\"[Text]→9 omic[Text]→2139 AND GROUP OR \"multiblocks\"[Text]→9 omics[Text]→11151 AND GROUP OR \"multi blocks\"[Text]→4 omic[Text]→2139 AND GROUP OR \"multi blocks\"[Text]→4 omics[Text]→11151 AND GROUP OR</td>\n",
" <td>(\"multi-block\"[Text] AND omic[Text]) OR (\"multi-block\"[Text] AND omics[Text]) OR (\"multiblock\"[Text] AND omic[Text]) OR (\"multiblock\"[Text] AND omics[Text]) OR (\"multi block\"[Text] AND omic[Text]) OR (\"multi block\"[Text] AND omics[Text]) OR (\"multi-blocks\"[Text] AND omic[Text]) OR (\"multi-blocks\"[Text] AND omics[Text]) OR (\"multiblocks\"[Text] AND omic[Text]) OR (\"multiblocks\"[Text] AND omics[Text]) OR (\"multi blocks\"[Text] AND omic[Text]) OR (\"multi blocks\"[Text] AND omics[Text])</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>integrative omics</td>\n",
" <td>\"integrative omic\"[Text]→1 \"integrative omics\"[Text]→149 OR</td>\n",
" <td>\"integrative omic\"[Text] OR \"integrative omics\"[Text]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>integrated omics</td>\n",
" <td>\"integrated omic\"[Text]→1 \"integrated omics\"[Text]→272 OR</td>\n",
" <td>\"integrated omic\"[Text] OR \"integrated omics\"[Text]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>integromics</td>\n",
" <td>\"integromic\"[Text]→14 \"integromics\"[Text]→52 OR</td>\n",
" <td>\"integromic\"[Text] OR \"integromics\"[Text]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" term \\\n",
"0 multi-omics \n",
"1 pan-omics \n",
"2 trans-omics \n",
"3 poly-omics \n",
"4 cross-omics \n",
"5 multi-table omics \n",
"6 multi-source omics \n",
"7 multi-view omics \n",
"8 multi-modal omics \n",
"9 multi-block omics \n",
"10 integrative omics \n",
"11 integrated omics \n",
"12 integromics \n",
"\n",
" translation_stack \\\n",
"0 \"multi-omic\"[Text]→532 \"multiomic\"[Text]→138 OR \"multi omic\"[Text]→532 OR \"multi-omics\"[Text]→1798 OR \"multiomics\"[Text]→459 OR \"multi omics\"[Text]→1798 OR \n",
"1 \"pan-omic\"[Text]→5 \"panomic\"[Text]→9 OR \"pan omic\"[Text]→5 OR \"pan-omics\"[Text]→15 OR \"panomics\"[Text]→42 OR \"pan omics\"[Text]→15 OR \n",
"2 \"trans-omic\"[Text]→13 \"transomic\"[Text]→6 OR \"trans omic\"[Text]→13 OR \"trans-omics\"[Text]→81 OR \"transomics\"[Text]→12 OR \"trans omics\"[Text]→81 OR \n",
"3 \"poly-omic\"[Text]→6 \"polyomic\"[Text]→7 OR \"poly omic\"[Text]→6 OR \"poly-omics\"[Text]→5 OR \"polyomics\"[Text]→6 OR \"poly omics\"[Text]→5 OR \n",
"4 \"cross-omic\"[Text]→2 \"cross omic\"[Text]→2 OR \"cross-omics\"[Text]→43 OR \"crossomics\"[Text]→1 OR \"cross omics\"[Text]→43 OR \n",
"5 \"multi-table\"[Text]→13 omic[Text]→2139 AND GROUP \"multi-table\"[Text]→13 omics[Text]→11151 AND GROUP OR \"multitable\"[Text]→6 omic[Text]→2139 AND GROUP OR \"multitable\"[Text]→6 omics[Text]→11151 AND GROUP OR \"multi table\"[Text]→13 omic[Text]→2139 AND GROUP OR \"multi table\"[Text]→13 omics[Text]→11151 AND GROUP OR \n",
"6 \"multi-source\"[Text]→859 omic[Text]→2139 AND GROUP \"multi-source\"[Text]→859 omics[Text]→11151 AND GROUP OR \"multisource\"[Text]→935 omic[Text]→2139 AND GROUP OR \"multisource\"[Text]→935 omics[Text]→11151 AND GROUP OR \"multi source\"[Text]→859 omic[Text]→2139 AND GROUP OR \"multi source\"[Text]→859 omics[Text]→11151 AND GROUP OR \"multi-sources\"[Text]→34 omic[Text]→2139 AND GROUP OR \"multi-sources\"[Text]→34 omics[Text]→11151 AND GROUP OR \"multisources\"[Text]→5 omic[Text]→2139 AND GROUP OR \"multisources\"[Text]→5 omics[Text]→11151 AND GROUP OR \"multi sources\"[Text]→34 omic[Text]→2139 AND GROUP OR \"multi sources\"[Text]→34 omics[Text]→11151 AND GROUP OR \n",
"7 \"multi-view\"[Text]→711 omic[Text]→2139 AND GROUP \"multi-view\"[Text]→711 omics[Text]→11151 AND GROUP OR \"multiview\"[Text]→596 omic[Text]→2139 AND GROUP OR \"multiview\"[Text]→596 omics[Text]→11151 AND GROUP OR \"multi view\"[Text]→711 omic[Text]→2139 AND GROUP OR \"multi view\"[Text]→711 omics[Text]→11151 AND GROUP OR \"multi-views\"[Text]→15 omic[Text]→2139 AND GROUP OR \"multi-views\"[Text]→15 omics[Text]→11151 AND GROUP OR \"multiviews\"[Text]→5 omic[Text]→2139 AND GROUP OR \"multiviews\"[Text]→5 omics[Text]→11151 AND GROUP OR \"multi views\"[Text]→15 omic[Text]→2139 AND GROUP OR \"multi views\"[Text]→15 omics[Text]→11151 AND GROUP OR \n",
"8 \"multi-modal\"[Text]→3939 omic[Text]→2139 AND GROUP \"multi-modal\"[Text]→3939 omics[Text]→11151 AND GROUP OR \"multimodal\"[Text]→46300 omic[Text]→2139 AND GROUP OR \"multimodal\"[Text]→46300 omics[Text]→11151 AND GROUP OR \"multi modal\"[Text]→3939 omic[Text]→2139 AND GROUP OR \"multi modal\"[Text]→3939 omics[Text]→11151 AND GROUP OR \n",
"9 \"multi-block\"[Text]→181 omic[Text]→2139 AND GROUP \"multi-block\"[Text]→181 omics[Text]→11151 AND GROUP OR \"multiblock\"[Text]→576 omic[Text]→2139 AND GROUP OR \"multiblock\"[Text]→576 omics[Text]→11151 AND GROUP OR \"multi block\"[Text]→181 omic[Text]→2139 AND GROUP OR \"multi block\"[Text]→181 omics[Text]→11151 AND GROUP OR \"multi-blocks\"[Text]→4 omic[Text]→2139 AND GROUP OR \"multi-blocks\"[Text]→4 omics[Text]→11151 AND GROUP OR \"multiblocks\"[Text]→9 omic[Text]→2139 AND GROUP OR \"multiblocks\"[Text]→9 omics[Text]→11151 AND GROUP OR \"multi blocks\"[Text]→4 omic[Text]→2139 AND GROUP OR \"multi blocks\"[Text]→4 omics[Text]→11151 AND GROUP OR \n",
"10 \"integrative omic\"[Text]→1 \"integrative omics\"[Text]→149 OR \n",
"11 \"integrated omic\"[Text]→1 \"integrated omics\"[Text]→272 OR \n",
"12 \"integromic\"[Text]→14 \"integromics\"[Text]→52 OR \n",
"\n",
" query_translation \n",
"0 \"multi-omic\"[Text] OR \"multiomic\"[Text] OR \"multi omic\"[Text] OR \"multi-omics\"[Text] OR \"multiomics\"[Text] OR \"multi omics\"[Text] \n",
"1 \"pan-omic\"[Text] OR \"panomic\"[Text] OR \"pan omic\"[Text] OR \"pan-omics\"[Text] OR \"panomics\"[Text] OR \"pan omics\"[Text] \n",
"2 \"trans-omic\"[Text] OR \"transomic\"[Text] OR \"trans omic\"[Text] OR \"trans-omics\"[Text] OR \"transomics\"[Text] OR \"trans omics\"[Text] \n",
"3 \"poly-omic\"[Text] OR \"polyomic\"[Text] OR \"poly omic\"[Text] OR \"poly-omics\"[Text] OR \"polyomics\"[Text] OR \"poly omics\"[Text] \n",
"4 \"cross-omic\"[Text] OR \"cross omic\"[Text] OR \"cross-omics\"[Text] OR \"crossomics\"[Text] OR \"cross omics\"[Text] \n",
"5 (\"multi-table\"[Text] AND omic[Text]) OR (\"multi-table\"[Text] AND omics[Text]) OR (\"multitable\"[Text] AND omic[Text]) OR (\"multitable\"[Text] AND omics[Text]) OR (\"multi table\"[Text] AND omic[Text]) OR (\"multi table\"[Text] AND omics[Text]) \n",
"6 (\"multi-source\"[Text] AND omic[Text]) OR (\"multi-source\"[Text] AND omics[Text]) OR (\"multisource\"[Text] AND omic[Text]) OR (\"multisource\"[Text] AND omics[Text]) OR (\"multi source\"[Text] AND omic[Text]) OR (\"multi source\"[Text] AND omics[Text]) OR (\"multi-sources\"[Text] AND omic[Text]) OR (\"multi-sources\"[Text] AND omics[Text]) OR (\"multisources\"[Text] AND omic[Text]) OR (\"multisources\"[Text] AND omics[Text]) OR (\"multi sources\"[Text] AND omic[Text]) OR (\"multi sources\"[Text] AND omics[Text]) \n",
"7 (\"multi-view\"[Text] AND omic[Text]) OR (\"multi-view\"[Text] AND omics[Text]) OR (\"multiview\"[Text] AND omic[Text]) OR (\"multiview\"[Text] AND omics[Text]) OR (\"multi view\"[Text] AND omic[Text]) OR (\"multi view\"[Text] AND omics[Text]) OR (\"multi-views\"[Text] AND omic[Text]) OR (\"multi-views\"[Text] AND omics[Text]) OR (\"multiviews\"[Text] AND omic[Text]) OR (\"multiviews\"[Text] AND omics[Text]) OR (\"multi views\"[Text] AND omic[Text]) OR (\"multi views\"[Text] AND omics[Text]) \n",
"8 (\"multi-modal\"[Text] AND omic[Text]) OR (\"multi-modal\"[Text] AND omics[Text]) OR (\"multimodal\"[Text] AND omic[Text]) OR (\"multimodal\"[Text] AND omics[Text]) OR (\"multi modal\"[Text] AND omic[Text]) OR (\"multi modal\"[Text] AND omics[Text]) \n",
"9 (\"multi-block\"[Text] AND omic[Text]) OR (\"multi-block\"[Text] AND omics[Text]) OR (\"multiblock\"[Text] AND omic[Text]) OR (\"multiblock\"[Text] AND omics[Text]) OR (\"multi block\"[Text] AND omic[Text]) OR (\"multi block\"[Text] AND omics[Text]) OR (\"multi-blocks\"[Text] AND omic[Text]) OR (\"multi-blocks\"[Text] AND omics[Text]) OR (\"multiblocks\"[Text] AND omic[Text]) OR (\"multiblocks\"[Text] AND omics[Text]) OR (\"multi blocks\"[Text] AND omic[Text]) OR (\"multi blocks\"[Text] AND omics[Text]) \n",
"10 \"integrative omic\"[Text] OR \"integrative omics\"[Text] \n",
"11 \"integrated omic\"[Text] OR \"integrated omics\"[Text] \n",
"12 \"integromic\"[Text] OR \"integromics\"[Text] "
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from typing import Union\n",
"\n",
"\n",
"def format_token(t: Union[str, dict]) -> str:\n",
" if isinstance(t, str):\n",
" return t\n",
" assert t['explode'] == 'N'\n",
" assert t['field'] == 'Text'\n",
" return t['term'] + '→' + t['count'] + ''\n",
"\n",
"\n",
"pubmed_translations = []\n",
"for term, result in pubmed_results.items():\n",
" pubmed_translations.append({\n",
" 'term': term,\n",
" 'translation_stack': ' '.join([format_token(t) for t in result.data['esearchresult']['translationstack']]),\n",
" 'query_translation': result.data['esearchresult']['querytranslation']\n",
" })\n",
"pubmed_translations = DataFrame(pubmed_translations)\n",
"pubmed_translations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a control of documents published in the journals with hits"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[2002,\n",
" 2004,\n",
" 2005,\n",
" 2006,\n",
" 2007,\n",
" 2008,\n",
" 2009,\n",
" 2010,\n",
" 2011,\n",
" 2012,\n",
" 2013,\n",
" 2014,\n",
" 2015,\n",
" 2016,\n",
" 2017,\n",
" 2018,\n",
" 2019,\n",
" 2020]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"years_set = sorted(set(literature.year.dropna().astype(int)))\n",
"years_set"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"index\n",
"Scientific reports 126\n",
"Omics : a journal of integrative biology 78\n",
"PloS one 69\n",
"Bioinformatics (Oxford, England) 68\n",
"Nature communications 58\n",
" ... \n",
"Zhongguo yi xue ke xue yuan xue bao. Acta Academiae Medicinae Sinicae 1\n",
"Zhonghua nan ke xue = National journal of andrology 1\n",
"Zhonghua yu fang yi xue za zhi [Chinese journal of preventive medicine] 1\n",
"Zoology (Jena, Germany) 1\n",
"mSphere 1\n",
"Name: journal, Length: 975, dtype: int64"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"journal_freq = literature.journal.sorted_value_counts()\n",
"journal_freq"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.750941210541558"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"popular_journals = journal_freq[journal_freq >= 3]\n",
"popular_journals.sum() / journal_freq.sum()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Stored `popular_journals` (0B2CABD1 → 0B2CABD1) at Tuesday, 04. Aug 2020 17:58"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "store",
"command": "store popular_journals in pubmed_derived_data",
"finished": "2020-08-04T17:58:58.838195",
"finished_human_readable": "Tuesday, 04. Aug 2020 17:58",
"result": [
{
"new_file": {
"crc32": "0B2CABD1",
"sha256": "90D36B3DA0AF97C85591B7E55E1298A1498C6504032163879A08F825EADC3164"
},
"old_file": {
"crc32": "0B2CABD1",
"sha256": "90D36B3DA0AF97C85591B7E55E1298A1498C6504032163879A08F825EADC3164"
},
"subject": "popular_journals"
}
],
"started": "2020-08-04T17:58:56.919579"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault store popular_journals in pubmed_derived_data"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reusing the results from cache/all_articles_by_journal_and_year.pickle (last modified on 2020-08-02 13:24)\n"
]
}
],
"source": [
"%%cache all_articles_by_journal_and_year all_articles_by_journal_and_year\n",
"\n",
"all_articles_by_journal_and_year = []\n",
"\n",
"for journal in tqdm(sorted(popular_journals.index)):\n",
" for year in list(years_set):\n",
" result = entrez_api.search(\n",
" f'(\"{journal}\"[Journal]) AND (\"{year}\"[Date - Publication])',\n",
" database='pubmed',\n",
" max_results=1\n",
" )\n",
" esearch = result.data['esearchresult']\n",
" count = int(esearch['count'])\n",
" assert count >= 0\n",
" all_articles_by_journal_and_year.append({\n",
" 'count': count,\n",
" 'year': year,\n",
" 'journal': journal\n",
" })\n",
"\n",
"all_articles_by_journal_and_year = DataFrame(all_articles_by_journal_and_year)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Stored `all_articles_by_journal_and_year` (AB6E261E → AB6E261E) at Tuesday, 04. Aug 2020 17:59"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "store",
"command": "store all_articles_by_journal_and_year in pubmed_derived_data",
"finished": "2020-08-04T17:59:00.869364",
"finished_human_readable": "Tuesday, 04. Aug 2020 17:59",
"result": [
{
"new_file": {
"crc32": "AB6E261E",
"sha256": "343D4005442B93F41397AF04892D839174F38A2128ED5A08201A581D7FAF0201"
},
"old_file": {
"crc32": "AB6E261E",
"sha256": "343D4005442B93F41397AF04892D839174F38A2128ED5A08201A581D7FAF0201"
},
"subject": "all_articles_by_journal_and_year"
}
],
"started": "2020-08-04T17:58:58.895462"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault store all_articles_by_journal_and_year in pubmed_derived_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a control for cancer enrichment"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2002.0, 2020.0)"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"MIN_DATE = min(literature.date.dt.year)\n",
"MAX_DATE = max(literature.date.dt.year)\n",
"MIN_DATE, MAX_DATE"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'((\"2002.0\"[Date - Publication] : \"2020.0\"[Date - Publication]))'"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"SAME_PERIOD_AS_MULTI_OMICS = f'((\"{MIN_DATE}\"[Date - Publication] : \"{MAX_DATE}\"[Date - Publication]))'\n",
"SAME_PERIOD_AS_MULTI_OMICS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Full-text search:"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reusing the results from cache/cancer_articles_from_popular_journals_any_field.pickle (last modified on 2020-08-03 01:55)\n"
]
}
],
"source": [
"%%cache cancer_articles_from_popular_journals_any_field cancer_articles_from_popular_journals_any_field\n",
"\n",
"cancer_articles_by_journal = []\n",
"\n",
"for journal in tqdm(sorted(popular_journals.index)):\n",
" result = entrez_api.search(\n",
" f'(\"{journal}\"[Journal]) AND (\"cancer\"[All Fields]) AND {SAME_PERIOD_AS_MULTI_OMICS}',\n",
" database='pubmed',\n",
" max_results=1\n",
" )\n",
" esearch = result.data['esearchresult']\n",
" count = int(esearch['count'])\n",
" assert count >= 0\n",
" cancer_articles_by_journal.append({\n",
" 'count': count,\n",
" 'journal': journal\n",
" })\n",
"\n",
"cancer_articles_from_popular_journals_any_field = DataFrame(cancer_articles_by_journal)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Stored `cancer_articles_from_popular_journals_any_field` (6931F0FF → 6931F0FF) at Tuesday, 04. Aug 2020 17:59"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "store",
"command": "store cancer_articles_from_popular_journals_any_field in pubmed_derived_data",
"finished": "2020-08-04T17:59:02.972861",
"finished_human_readable": "Tuesday, 04. Aug 2020 17:59",
"result": [
{
"new_file": {
"crc32": "6931F0FF",
"sha256": "D891354ECC232F9BDC07328CDBE8707ECE13127B0850FB3C67CA065D49D34C34"
},
"old_file": {
"crc32": "6931F0FF",
"sha256": "D891354ECC232F9BDC07328CDBE8707ECE13127B0850FB3C67CA065D49D34C34"
},
"subject": "cancer_articles_from_popular_journals_any_field"
}
],
"started": "2020-08-04T17:59:01.003408"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault store cancer_articles_from_popular_journals_any_field in pubmed_derived_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Title/abstract only:"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reusing the results from cache/cancer_articles_from_popular_journals_tiab_only.pickle (last modified on 2020-08-03 02:02)\n"
]
}
],
"source": [
"%%cache cancer_articles_from_popular_journals_tiab_only cancer_articles_from_popular_journals_tiab_only\n",
"\n",
"# For every entry of `popular_journals.index` (in sorted order), query PubMed for the\n",
"# number of records matching \"cancer\" under the [TIAB] field tag within SAME_PERIOD_AS_MULTI_OMICS.\n",
"cancer_tiab_articles_by_journal = []\n",
"\n",
"for journal in tqdm(sorted(popular_journals.index)):\n",
"    query = f'(\"{journal}\"[Journal]) AND (\"cancer\"[TIAB]) AND {SAME_PERIOD_AS_MULTI_OMICS}'\n",
"    # Only the reported hit count is read, so a single result per query is requested.\n",
"    response = entrez_api.search(query, database='pubmed', max_results=1)\n",
"    count = int(response.data['esearchresult']['count'])\n",
"    assert count >= 0\n",
"    cancer_tiab_articles_by_journal.append({'count': count, 'journal': journal})\n",
"\n",
"# One record per journal, with the keys `count` and `journal`.\n",
"cancer_articles_from_popular_journals_tiab_only = DataFrame(cancer_tiab_articles_by_journal)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Stored `cancer_articles_from_popular_journals_tiab_only` (C6D2493E → C6D2493E) at Tuesday, 04. Aug 2020 17:59"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "store",
"command": "store cancer_articles_from_popular_journals_tiab_only in pubmed_derived_data",
"finished": "2020-08-04T17:59:04.991693",
"finished_human_readable": "Tuesday, 04. Aug 2020 17:59",
"result": [
{
"new_file": {
"crc32": "C6D2493E",
"sha256": "F0C0D1C024BD2CED3E45832958994F88EAB809CDFFAC97C732126B08B87B2C64"
},
"old_file": {
"crc32": "C6D2493E",
"sha256": "F0C0D1C024BD2CED3E45832958994F88EAB809CDFFAC97C732126B08B87B2C64"
},
"subject": "cancer_articles_from_popular_journals_tiab_only"
}
],
"started": "2020-08-04T17:59:03.023870"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault store cancer_articles_from_popular_journals_tiab_only in pubmed_derived_data"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}