In [1]:
%run notebook_setup.ipynb

## Define search terms

In [2]:
from search_terms import primary_terms, secondary_terms, descriptive_terms

In [3]:
primary_terms

{'multi-omics': '("multi-omic"[Text Words]) OR ("multiomic"[Text Words]) OR ("multi omic"[Text Words]) OR ("multi-omics"[Text Words]) OR ("multiomics"[Text Words]) OR ("multi omics"[Text Words])',
 'pan-omics': '("pan-omic"[Text Words]) OR ("panomic"[Text Words]) OR ("pan omic"[Text Words]) OR ("pan-omics"[Text Words]) OR ("panomics"[Text Words]) OR ("pan omics"[Text Words])',
 'trans-omics': '("trans-omic"[Text Words]) OR ("transomic"[Text Words]) OR ("trans omic"[Text Words]) OR ("trans-omics"[Text Words]) OR ("transomics"[Text Words]) OR ("trans omics"[Text Words])',
 'poly-omics': '("poly-omic"[Text Words]) OR ("polyomic"[Text Words]) OR ("poly omic"[Text Words]) OR ("poly-omics"[Text Words]) OR ("polyomics"[Text Words]) OR ("poly omics"[Text Words])',
 'cross-omics': '("cross-omic"[Text Words]) OR ("crossomic"[Text Words]) OR ("cross omic"[Text Words]) OR ("cross-omics"[Text Words]) OR ("crossomics"[Text Words]) OR ("cross omics"[Text Words])'}

In [4]:
secondary_terms

{'multi-table omics': '(("multi-table"[Text Words]) AND (omic[Text Words])) OR (("multi-table"[Text Words]) AND (omics[Text Words])) OR (("multitable"[Text Words]) AND (omic[Text Words])) OR (("multitable"[Text Words]) AND (omics[Text Words])) OR (("multi table"[Text Words]) AND (omic[Text Words])) OR (("multi table"[Text Words]) AND (omics[Text Words])) OR (("multi-tables"[Text Words]) AND (omic[Text Words])) OR (("multi-tables"[Text Words]) AND (omics[Text Words])) OR (("multitables"[Text Words]) AND (omic[Text Words])) OR (("multitables"[Text Words]) AND (omics[Text Words])) OR (("multi tables"[Text Words]) AND (omic[Text Words])) OR (("multi tables"[Text Words]) AND (omics[Text Words]))',
 'multi-source omics': '(("multi-source"[Text Words]) AND (omic[Text Words])) OR (("multi-source"[Text Words]) AND (omics[Text Words])) OR (("multisource"[Text Words]) AND (omic[Text Words])) OR (("multisource"[Text Words]) AND (omics[Text Words])) OR (("multi source"[Text Words]) AND (omic[Text W

In [5]:
descriptive_terms

{'integrative omics': '"integrative omic"[Text Words] OR "integrative omics"[Text Words]',
 'integrated omics': '"integrated omic"[Text Words] OR "integrated omics"[Text Words]',
 'integromics': '"integromic"[Text Words] OR "integromics"[Text Words]'}

## Perform search in PubMed

In [6]:
from easy_entrez import EntrezAPI
from config import ENTREZ_API_NAME, ENTREZ_API_EMAIL

entrez_api = EntrezAPI(
    tool=ENTREZ_API_NAME,
    email=ENTREZ_API_EMAIL,
    minimal_interval=2
)

In [7]:
search_terms = {
    **primary_terms,
    **secondary_terms,
    **descriptive_terms
}

In [8]:
from tqdm import tqdm

In [9]:
%%cache search_results pubmed_results

pubmed_results = {}

MAX_RESULTS = 10_000

for term in tqdm(search_terms):
    result = entrez_api.search(
        search_terms[term],
        database='pubmed',
        max_results=MAX_RESULTS
    )
    esearch = result.data['esearchresult']
    count = int(esearch['count'])
    assert count >= 0
    assert count < MAX_RESULTS

    pubmed_results[term] = result

Reusing the results from cache/search_results.pickle (last modified on 2020-07-25 06:51)


In [10]:
all_papers = sorted(set(sum(
    [
        result.data['esearchresult']['idlist']
        for result in pubmed_results.values()
    ],
    []
)))

In [11]:
len(all_papers)

3456

In [12]:
%%cache pubmed_documents_data documents

documents_by_batch = (
    entrez_api
    .in_batches_of(size=100)
    .fetch(all_papers, max_results=10_000, return_type='xml')
)

documents = sum(
    (
        list(result.data)
        for result in documents_by_batch.values()
    ),
    []
)

Reusing the results from cache/pubmed_documents_data.pickle (last modified on 2020-07-25 06:54)


In [13]:
from helpers.utils import xml_element_to_json
documents = [xml_element_to_json(document) for document in list(documents)]

In [14]:
assert len(documents) == len(all_papers)

## Create a data frame with PubMed documents and covariates

In [15]:
from pandas import Series, DataFrame, read_csv, to_datetime

In [16]:
# create a frame with 0 columns and UID of each paper on the index
literature = Series(all_papers).to_frame('uid').set_index('uid')
# add columns for the occurrences of the terms
for term, result in pubmed_results.items():
    literature[term] = False
    for uid in result.data['esearchresult']['idlist']:
        literature.loc[uid, term] = True
literature

Unnamed: 0_level_0,multi-omics,pan-omics,trans-omics,poly-omics,cross-omics,multi-table omics,multi-source omics,multi-view omics,multi-modal omics,multi-block omics,integrative omics,integrated omics,integromics
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
12186644,False,False,False,False,False,False,False,False,False,False,False,False,True
15687693,False,False,False,False,False,False,False,False,False,False,False,False,True
15687700,False,False,True,False,False,False,False,False,False,False,False,False,False
15687839,False,False,False,False,False,False,False,False,False,False,True,False,False
15763567,True,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32697738,True,False,False,False,False,False,False,False,False,False,False,False,False
32698759,True,False,False,False,False,False,False,False,False,False,False,False,False
32698873,True,False,False,False,False,False,False,False,False,False,False,False,False
32699215,True,False,False,False,False,False,False,False,False,False,False,False,False


## Parse the PubMed metadata of articles

Reference:
  - Medline: https://www.nlm.nih.gov/bsd/mms/medlineelements.html
  - Publication types: https://www.nlm.nih.gov/mesh/pubtypes.html (fun fact: includes "Wit and Humor" type)

In [17]:
from warnings import warn
from helpers.parse_pubmed import listify, extract_abstract, parse_date, parse_doi

missing_abstract = []
authors = []
affiliations = []

publication_types = []

for document in documents:

    kind = None
    date = None
    doi = None

    if 'PubmedBookArticle' in document:
        kind = 'article in book'
        book_document = document['PubmedBookArticle']['BookDocument']
        pmid = book_document['PMID']['#text']

        title = book_document['ArticleTitle']['#text']
        abstract = extract_abstract(book_document)

        # 'PublicationType' and 'KeywordList' ignored for book_document as only 2 matches (compared to 3k)

    if 'PubmedArticle' in document:
        pubmed_article = document['PubmedArticle']
        assert not kind
        kind = 'article'
        medline_citation = pubmed_article['MedlineCitation']
        pmid = medline_citation['PMID']['#text']
        article = medline_citation['Article']
        literature.loc[pmid, 'journal'] = article['Journal']['Title']

        if 'ELocationID' in article:
            doi = parse_doi(article['ELocationID'])

        issue = article['Journal']['JournalIssue']
        if 'PubDate' in issue:
            date = parse_date(issue['PubDate'])

        for author in listify(article['AuthorList']['Author'] if 'AuthorList' in article else None):
            author_id = len(authors)
            authors.append(
                {
                    'ID': author_id,
                    'ForeName': author.get('ForeName'),
                    'LastName': author.get('LastName'),
                    'CollectiveName': author.get('CollectiveName'),
                    'PMID': pmid
                }
            )
            for affiliation in listify(author.get('AffiliationInfo')):
                affiliations.append({
                    'Affiliation': affiliation['Affiliation'],
                    'PMID': pmid,
                    'AuthorID': author_id
                })

        for publication_type in listify(article['PublicationTypeList']['PublicationType'] if 'PublicationTypeList' in article else None):
            type_name = publication_type['#text']
            publication_types.append(type_name)
            literature.loc[pmid, f'Is {type_name}'] = True

        try:
            literature.loc[pmid, 'journal_issn'] = article['Journal']['ISSN']['#text']
        except KeyError:
            warn(f'{article["Journal"]} had no ISSN assigned')
        if 'ArticleTitle' in article:
            title = article['ArticleTitle']
            if isinstance(title, dict):
                title = title['#text']

        abstract = extract_abstract(article)

    if not abstract:
        missing_abstract.append(pmid)

    assert kind

    literature.loc[pmid, 'kind'] = kind
    literature.loc[pmid, 'doi'] = doi
    literature.loc[pmid, 'title'] = title
    literature.loc[pmid, 'abstract'] = abstract
    literature.loc[pmid, 'date'] = date

publication_types = Series(publication_types)

  warn(f'{article["Journal"]} had no ISSN assigned')
  warn(f'{article["Journal"]} had no ISSN assigned')
  warn(f'{article["Journal"]} had no ISSN assigned')


In [18]:
publication_types.sorted_value_counts()

index
Journal Article                             3370
Research Support, Non-U.S. Gov't            1371
Review                                       744
Research Support, N.I.H., Extramural         460
Research Support, U.S. Gov't, Non-P.H.S.     161
Comparative Study                             61
Editorial                                     44
Comment                                       37
Clinical Trial                                26
Published Erratum                             23
Multicenter Study                             21
Research Support, N.I.H., Intramural          16
Evaluation Study                              13
Letter                                        13
Case Reports                                   9
Dataset                                        9
Introductory Journal Article                   7
Observational Study                            7
Twin Study                                     7
Validation Study                               7
English Abstra

In [19]:
affiliations = DataFrame(affiliations)
authors = DataFrame(authors)

authors['JointName'] = authors['ForeName'] + ' ' + authors['LastName']

In [20]:
literature['has_doi'] = ~literature.doi.isnull()
literature.date = to_datetime(literature.date)
literature['year'] = literature.date.dt.year

In [21]:
terms = list(pubmed_results.keys())

In [22]:
def which_term(term):
    term = list(term[term].index)
    if len(term) == 1:
        return term[0]
    else:
        return 'multiple'

In [23]:
literature['term'] = literature[terms].apply(which_term, axis=1)

In [24]:
from pandas import Categorical
literature['term'] = Categorical(literature['term'], ordered=True, categories=list(literature['term'].sorted_value_counts().index))

In [25]:
literature['has_url_in_abstract'] = literature['abstract'].str.contains('(?:https?://|www.)')

## Add PubmedCentral mapping

In [26]:
%%cache pubmed_central_metadata pmc_metadata
# approx 2GB in RAM, best to subset early
pmc_metadata_all = read_csv('data/PMC-ids.csv.gz')
pmid_of_interest = set(literature.index)
pmc_metadata = pmc_metadata_all[pmc_metadata_all.PMID.isin(pmid_of_interest)]
del pmc_metadata_all

Reusing the results from cache/pubmed_central_metadata.pickle (last modified on 2020-07-25 06:55)


In [27]:
len(pmc_metadata)

1951

In [28]:
pmc_metadata.head()

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date
83696,Genome Biol,1474-7596,1474-760X,2002,3,8,reports4027.1,10.1186/gb-2002-3-8-reports4027,PMC139396,12186644.0,,live
817169,J Virol,0022-538X,1098-5514,2006,80,9,4356,10.1128/JVI.80.9.4356-4362.2006,PMC1472023,16611894.0,,live
1155415,Proc Natl Acad Sci U S A,0027-8424,1091-6490,2007,104,15,6478,10.1073/pnas.0611629104,PMC1849962,17420480.0,,live
1212422,J Bacteriol,0021-9193,1098-5530,2007,189,13,4635,10.1128/JB.00128-07,PMC1913438,17449607.0,,live
1430120,Osteoarthritis Cartilage,1063-4584,1522-9653,2007,15,12,1367,10.1016/j.joca.2007.04.011,PMC2153443,17604656.0,NIHMS34878,live


In [29]:
literature['PMC'] = pmc_metadata.set_index('PMID').reindex(literature.index.astype(float))['PMCID']
assert len(pmc_metadata) == sum(~literature['PMC'].isnull())

literature['has_pmc'] = (~literature['PMC'].isnull())

Note can also try to find missing PMCs in the summaries:

In [30]:
# result = entrez_api.search(primary_terms['poly-omics'], max_results=10_000)
# summary = entrez_api.summarize(result.data['esearchresult']['idlist'][:5], max_results=10_000)
# summary.data

### Download full texts as XML

In [31]:
pmc_ids = literature[literature['has_pmc']]['PMC'].tolist()
pmc_ids[:4]

['PMC139396', 'PMC1472023', 'PMC1849962', 'PMC1913438']

In [32]:
%%cache pubmed_central_xml pmc_xmls
pmc_full_texts = entrez_api.in_batches_of(size=100).fetch(pmc_ids, max_results=5_000, database='pmc', return_type='xml')

pmc_xmls = sum(
    [
        list(response.data)
        for response in pmc_full_texts.values()
    ],
    []
)

Reusing the results from cache/pubmed_central_xml.pickle (last modified on 2020-07-25 06:59)


In [33]:
len(pmc_xmls)

1951

In [34]:
ignore_text = {'xref', 'table', 'thead', 'th', 'td', 'tr', 'graphic'}


def extract_text(body) -> str:
    fragments = []
    for i in body.iter():
        if i.tag in ignore_text:
            continue
        text = i.text
        if i.tag == 'label' and text and text.startswith('Figure'):
            continue
        if text:
            fragments.append(text)
    return '\n'.join(fragments)

In [35]:
literature_subjects = literature.index.to_frame().drop(columns='uid').copy()

In [36]:
for xml in pmc_xmls:
    pmid = xml.find('front/article-meta/article-id[@pub-id-type="pmid"]').text
    body = xml.find('body')
    has_full_text = body is not None

    subjects = [subject.text for subject in xml.findall('front/article-meta//subject')]

    literature.loc[pmid, 'has_full_text'] = has_full_text
    literature.loc[pmid, 'full_text'] = extract_text(body) if has_full_text else None
    literature.loc[pmid, 'article_type'] = xml.attrib['article-type']

    for subject in subjects:
        literature_subjects.loc[pmid, subject] = True

literature_subjects = literature_subjects.fillna(False)

In [37]:
literature_subjects.sum().sort_values(ascending=False).head(10)

Article                      582
Research Article             240
Review                       163
Research                     139
Genetics                     109
Original Research            100
Research Paper                67
Biology and Life Sciences     61
Biochemistry                  53
Microbiology                  51
dtype: int64

In [38]:
literature.article_type.sorted_value_counts()

index
research-article      1566
review-article         253
brief-report            29
editorial               26
correction              17
data-paper              13
other                    8
article-commentary       7
letter                   7
discussion               5
methods-article          5
product-review           5
chapter-article          3
meeting-report           2
protocol                 2
abstract                 1
addendum                 1
systematic-review        1
Name: article_type, dtype: int64

In [39]:
sum(literature['has_full_text'] == True)

1520

In [40]:
#from helpers.utils import display_xml
#display_xml(pmc_xmls[-2].find('body'))

## Abstract clean-up

Many abstracts contains sections/organising headers, such as:

In [41]:
['BACKGROUND', 'MOTIVATION', 'OBJECTIVE', 'SCOPE']

['BACKGROUND', 'MOTIVATION', 'OBJECTIVE', 'SCOPE']

By convention those are upper case in PubMed. Here We filter those out:

In [42]:
from re import findall


def extract_upper_case(abstract: str, min_len: int = 3):
    if abstract:
        return findall('([A-Z]{' + str(min_len) + ',})', abstract)
    return []


def count_upper_case_phrases(data: Series, min_len: int = 3) -> Series:
    return Series(sum(data.apply(extract_upper_case, min_len=min_len), [])).sorted_value_counts()

In [43]:
potential_headers = count_upper_case_phrases(literature['abstract'])
potential_headers[potential_headers > 100]

index
RNA            2421
DNA             978
RESULTS         460
BACKGROUND      347
CONCLUSIONS     289
METHODS         240
HCC             211
TCGA            204
SNP             176
GWAS            159
CONCLUSION      146
CRC             138
QTL             137
WNT             126
RCC             121
GBM             116
IBD             116
TNBC            108
Name: 0, dtype: int64

There are many disease abbreviations making the list too long to browse:

In [44]:
len(potential_headers[potential_headers > 3])

1223

So we will look at longer words:

In [45]:
potential_headers[potential_headers > 3].index.map(len).value_counts()

3     731
4     310
5      95
6      36
7      17
12      9
10      7
8       7
9       6
14      2
11      2
13      1
Name: index, dtype: int64

In [46]:
potential_headers_long = count_upper_case_phrases(literature['abstract'], min_len=5)
potential_headers_long.head(20)

index
RESULTS           460
BACKGROUND        347
CONCLUSIONS       289
METHODS           240
CONCLUSION        146
NAFLD              67
PURPOSE            58
CRISPR             57
OBJECTIVE          54
AVAILABILITY       51
NSCLC              47
HNSCC              46
MOTIVATION         45
OMICS              44
INFORMATION        43
SUPPLEMENTARY      42
IMPLEMENTATION     41
FINDINGS           38
SIGNIFICANCE       35
LASSO              33
Name: 0, dtype: int64

I manually chosen headers from among top 100 hits:

In [47]:
ABSTRACT_HEADERS = [
    # manually added to prevent hanging "OF"
    'PURPOSE OF REVIEW',
    # chosen from top 100 most frequent
    'RESULTS',
    'BACKGROUND',
    'CONCLUSIONS',
    'METHODS',
    'CONCLUSION',
    'PURPOSE',
    'OBJECTIVE',
    'AVAILABILITY',
    'MOTIVATION',
    'INFORMATION',
    'SUPPLEMENTARY',
    'FINDINGS',
    'SIGNIFICANCE',
    'INTRODUCTION',
    'DESIGN',
    'OBJECTIVES',
    'REVIEW',
    'SUMMARY',
    'MATERIALS',
    'STUDY',
    'EXPERIMENTAL',
    'DISCUSSION',
    'REGISTRATION',
    'METHOD',
    'CONTACT',
    'FUTURE',
    'INTERPRETATION',
]

In [48]:
literature['abstract_clean'] = literature['abstract'].str.replace('|'.join(ABSTRACT_HEADERS), '')

In [49]:
%vault store literature in pubmed_derived_data

Stored `literature` (904B0F94 → 904B0F94) at Tuesday, 04. Aug 2020 17:58

In [50]:
%vault store literature_subjects in pubmed_derived_data

Stored `literature_subjects` (98E10AF9 → 98E10AF9) at Tuesday, 04. Aug 2020 17:58

In [51]:
%vault store affiliations, authors, publication_types in pubmed_derived_data

Stored:

 - `affiliations` (E06399F2 → E06399F2)
 - `authors` (DC49BC74 → DC49BC74)
 - `publication_types` (7DD4E741 → 7DD4E741)

at Tuesday, 04. Aug 2020 17:58

In [52]:
import pandas
pandas.set_option('display.max_colwidth', 1000)

In [53]:
from typing import Union


def format_token(t: Union[str, dict]) -> str:
    if isinstance(t, str):
        return t
    assert t['explode'] == 'N'
    assert t['field'] == 'Text'
    return t['term'] + '→' + t['count'] + ''


pubmed_translations = []
for term, result in pubmed_results.items():
    pubmed_translations.append({
        'term': term,
        'translation_stack': ' '.join([format_token(t) for t in result.data['esearchresult']['translationstack']]),
        'query_translation': result.data['esearchresult']['querytranslation']
    })
pubmed_translations = DataFrame(pubmed_translations)
pubmed_translations

Unnamed: 0,term,translation_stack,query_translation
0,multi-omics,"""multi-omic""[Text]→532 ""multiomic""[Text]→138 OR ""multi omic""[Text]→532 OR ""multi-omics""[Text]→1798 OR ""multiomics""[Text]→459 OR ""multi omics""[Text]→1798 OR","""multi-omic""[Text] OR ""multiomic""[Text] OR ""multi omic""[Text] OR ""multi-omics""[Text] OR ""multiomics""[Text] OR ""multi omics""[Text]"
1,pan-omics,"""pan-omic""[Text]→5 ""panomic""[Text]→9 OR ""pan omic""[Text]→5 OR ""pan-omics""[Text]→15 OR ""panomics""[Text]→42 OR ""pan omics""[Text]→15 OR","""pan-omic""[Text] OR ""panomic""[Text] OR ""pan omic""[Text] OR ""pan-omics""[Text] OR ""panomics""[Text] OR ""pan omics""[Text]"
2,trans-omics,"""trans-omic""[Text]→13 ""transomic""[Text]→6 OR ""trans omic""[Text]→13 OR ""trans-omics""[Text]→81 OR ""transomics""[Text]→12 OR ""trans omics""[Text]→81 OR","""trans-omic""[Text] OR ""transomic""[Text] OR ""trans omic""[Text] OR ""trans-omics""[Text] OR ""transomics""[Text] OR ""trans omics""[Text]"
3,poly-omics,"""poly-omic""[Text]→6 ""polyomic""[Text]→7 OR ""poly omic""[Text]→6 OR ""poly-omics""[Text]→5 OR ""polyomics""[Text]→6 OR ""poly omics""[Text]→5 OR","""poly-omic""[Text] OR ""polyomic""[Text] OR ""poly omic""[Text] OR ""poly-omics""[Text] OR ""polyomics""[Text] OR ""poly omics""[Text]"
4,cross-omics,"""cross-omic""[Text]→2 ""cross omic""[Text]→2 OR ""cross-omics""[Text]→43 OR ""crossomics""[Text]→1 OR ""cross omics""[Text]→43 OR","""cross-omic""[Text] OR ""cross omic""[Text] OR ""cross-omics""[Text] OR ""crossomics""[Text] OR ""cross omics""[Text]"
5,multi-table omics,"""multi-table""[Text]→13 omic[Text]→2139 AND GROUP ""multi-table""[Text]→13 omics[Text]→11151 AND GROUP OR ""multitable""[Text]→6 omic[Text]→2139 AND GROUP OR ""multitable""[Text]→6 omics[Text]→11151 AND GROUP OR ""multi table""[Text]→13 omic[Text]→2139 AND GROUP OR ""multi table""[Text]→13 omics[Text]→11151 AND GROUP OR","(""multi-table""[Text] AND omic[Text]) OR (""multi-table""[Text] AND omics[Text]) OR (""multitable""[Text] AND omic[Text]) OR (""multitable""[Text] AND omics[Text]) OR (""multi table""[Text] AND omic[Text]) OR (""multi table""[Text] AND omics[Text])"
6,multi-source omics,"""multi-source""[Text]→859 omic[Text]→2139 AND GROUP ""multi-source""[Text]→859 omics[Text]→11151 AND GROUP OR ""multisource""[Text]→935 omic[Text]→2139 AND GROUP OR ""multisource""[Text]→935 omics[Text]→11151 AND GROUP OR ""multi source""[Text]→859 omic[Text]→2139 AND GROUP OR ""multi source""[Text]→859 omics[Text]→11151 AND GROUP OR ""multi-sources""[Text]→34 omic[Text]→2139 AND GROUP OR ""multi-sources""[Text]→34 omics[Text]→11151 AND GROUP OR ""multisources""[Text]→5 omic[Text]→2139 AND GROUP OR ""multisources""[Text]→5 omics[Text]→11151 AND GROUP OR ""multi sources""[Text]→34 omic[Text]→2139 AND GROUP OR ""multi sources""[Text]→34 omics[Text]→11151 AND GROUP OR","(""multi-source""[Text] AND omic[Text]) OR (""multi-source""[Text] AND omics[Text]) OR (""multisource""[Text] AND omic[Text]) OR (""multisource""[Text] AND omics[Text]) OR (""multi source""[Text] AND omic[Text]) OR (""multi source""[Text] AND omics[Text]) OR (""multi-sources""[Text] AND omic[Text]) OR (""multi-sources""[Text] AND omics[Text]) OR (""multisources""[Text] AND omic[Text]) OR (""multisources""[Text] AND omics[Text]) OR (""multi sources""[Text] AND omic[Text]) OR (""multi sources""[Text] AND omics[Text])"
7,multi-view omics,"""multi-view""[Text]→711 omic[Text]→2139 AND GROUP ""multi-view""[Text]→711 omics[Text]→11151 AND GROUP OR ""multiview""[Text]→596 omic[Text]→2139 AND GROUP OR ""multiview""[Text]→596 omics[Text]→11151 AND GROUP OR ""multi view""[Text]→711 omic[Text]→2139 AND GROUP OR ""multi view""[Text]→711 omics[Text]→11151 AND GROUP OR ""multi-views""[Text]→15 omic[Text]→2139 AND GROUP OR ""multi-views""[Text]→15 omics[Text]→11151 AND GROUP OR ""multiviews""[Text]→5 omic[Text]→2139 AND GROUP OR ""multiviews""[Text]→5 omics[Text]→11151 AND GROUP OR ""multi views""[Text]→15 omic[Text]→2139 AND GROUP OR ""multi views""[Text]→15 omics[Text]→11151 AND GROUP OR","(""multi-view""[Text] AND omic[Text]) OR (""multi-view""[Text] AND omics[Text]) OR (""multiview""[Text] AND omic[Text]) OR (""multiview""[Text] AND omics[Text]) OR (""multi view""[Text] AND omic[Text]) OR (""multi view""[Text] AND omics[Text]) OR (""multi-views""[Text] AND omic[Text]) OR (""multi-views""[Text] AND omics[Text]) OR (""multiviews""[Text] AND omic[Text]) OR (""multiviews""[Text] AND omics[Text]) OR (""multi views""[Text] AND omic[Text]) OR (""multi views""[Text] AND omics[Text])"
8,multi-modal omics,"""multi-modal""[Text]→3939 omic[Text]→2139 AND GROUP ""multi-modal""[Text]→3939 omics[Text]→11151 AND GROUP OR ""multimodal""[Text]→46300 omic[Text]→2139 AND GROUP OR ""multimodal""[Text]→46300 omics[Text]→11151 AND GROUP OR ""multi modal""[Text]→3939 omic[Text]→2139 AND GROUP OR ""multi modal""[Text]→3939 omics[Text]→11151 AND GROUP OR","(""multi-modal""[Text] AND omic[Text]) OR (""multi-modal""[Text] AND omics[Text]) OR (""multimodal""[Text] AND omic[Text]) OR (""multimodal""[Text] AND omics[Text]) OR (""multi modal""[Text] AND omic[Text]) OR (""multi modal""[Text] AND omics[Text])"
9,multi-block omics,"""multi-block""[Text]→181 omic[Text]→2139 AND GROUP ""multi-block""[Text]→181 omics[Text]→11151 AND GROUP OR ""multiblock""[Text]→576 omic[Text]→2139 AND GROUP OR ""multiblock""[Text]→576 omics[Text]→11151 AND GROUP OR ""multi block""[Text]→181 omic[Text]→2139 AND GROUP OR ""multi block""[Text]→181 omics[Text]→11151 AND GROUP OR ""multi-blocks""[Text]→4 omic[Text]→2139 AND GROUP OR ""multi-blocks""[Text]→4 omics[Text]→11151 AND GROUP OR ""multiblocks""[Text]→9 omic[Text]→2139 AND GROUP OR ""multiblocks""[Text]→9 omics[Text]→11151 AND GROUP OR ""multi blocks""[Text]→4 omic[Text]→2139 AND GROUP OR ""multi blocks""[Text]→4 omics[Text]→11151 AND GROUP OR","(""multi-block""[Text] AND omic[Text]) OR (""multi-block""[Text] AND omics[Text]) OR (""multiblock""[Text] AND omic[Text]) OR (""multiblock""[Text] AND omics[Text]) OR (""multi block""[Text] AND omic[Text]) OR (""multi block""[Text] AND omics[Text]) OR (""multi-blocks""[Text] AND omic[Text]) OR (""multi-blocks""[Text] AND omics[Text]) OR (""multiblocks""[Text] AND omic[Text]) OR (""multiblocks""[Text] AND omics[Text]) OR (""multi blocks""[Text] AND omic[Text]) OR (""multi blocks""[Text] AND omics[Text])"


## Create a control of documents published in the journals with hits

In [54]:
years_set = sorted(set(literature.year.dropna().astype(int)))
years_set

[2002,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020]

In [55]:
journal_freq = literature.journal.sorted_value_counts()
journal_freq

index
Scientific reports                                                         126
Omics : a journal of integrative biology                                    78
PloS one                                                                    69
Bioinformatics (Oxford, England)                                            68
Nature communications                                                       58
                                                                          ... 
Zhongguo yi xue ke xue yuan xue bao. Acta Academiae Medicinae Sinicae        1
Zhonghua nan ke xue = National journal of andrology                          1
Zhonghua yu fang yi xue za zhi [Chinese journal of preventive medicine]      1
Zoology (Jena, Germany)                                                      1
mSphere                                                                      1
Name: journal, Length: 975, dtype: int64

In [56]:
popular_journals = journal_freq[journal_freq >= 3]
popular_journals.sum() / journal_freq.sum()

0.750941210541558

In [57]:
%vault store popular_journals in pubmed_derived_data

Stored `popular_journals` (0B2CABD1 → 0B2CABD1) at Tuesday, 04. Aug 2020 17:58

In [58]:
%%cache all_articles_by_journal_and_year all_articles_by_journal_and_year

all_articles_by_journal_and_year = []

for journal in tqdm(sorted(popular_journals.index)):
    for year in list(years_set):
        result = entrez_api.search(
            f'("{journal}"[Journal]) AND ("{year}"[Date - Publication])',
            database='pubmed',
            max_results=1
        )
        esearch = result.data['esearchresult']
        count = int(esearch['count'])
        assert count >= 0
        all_articles_by_journal_and_year.append({
            'count': count,
            'year': year,
            'journal': journal
        })

all_articles_by_journal_and_year = DataFrame(all_articles_by_journal_and_year)

Reusing the results from cache/all_articles_by_journal_and_year.pickle (last modified on 2020-08-02 13:24)


In [59]:
%vault store all_articles_by_journal_and_year in pubmed_derived_data

Stored `all_articles_by_journal_and_year` (AB6E261E → AB6E261E) at Tuesday, 04. Aug 2020 17:59

## Create a control for cancer enrichment

In [60]:
MIN_DATE = min(literature.date.dt.year)
MAX_DATE = max(literature.date.dt.year)
MIN_DATE, MAX_DATE

(2002.0, 2020.0)

In [61]:
SAME_PERIOD_AS_MULTI_OMICS = f'(("{MIN_DATE}"[Date - Publication] : "{MAX_DATE}"[Date - Publication]))'
SAME_PERIOD_AS_MULTI_OMICS

'(("2002.0"[Date - Publication] : "2020.0"[Date - Publication]))'

Full-text search:

In [62]:
%%cache cancer_articles_from_popular_journals_any_field cancer_articles_from_popular_journals_any_field

cancer_articles_by_journal = []

for journal in tqdm(sorted(popular_journals.index)):
    result = entrez_api.search(
        f'("{journal}"[Journal]) AND ("cancer"[All Fields]) AND {SAME_PERIOD_AS_MULTI_OMICS}',
        database='pubmed',
        max_results=1
    )
    esearch = result.data['esearchresult']
    count = int(esearch['count'])
    assert count >= 0
    cancer_articles_by_journal.append({
        'count': count,
        'journal': journal
    })

cancer_articles_from_popular_journals_any_field = DataFrame(cancer_articles_by_journal)

Reusing the results from cache/cancer_articles_from_popular_journals_any_field.pickle (last modified on 2020-08-03 01:55)


In [63]:
%vault store cancer_articles_from_popular_journals_any_field in pubmed_derived_data

Stored `cancer_articles_from_popular_journals_any_field` (6931F0FF → 6931F0FF) at Tuesday, 04. Aug 2020 17:59

Title/abstract only:

In [64]:
%%cache cancer_articles_from_popular_journals_tiab_only cancer_articles_from_popular_journals_tiab_only

cancer_tiab_articles_by_journal = []

for journal in tqdm(sorted(popular_journals.index)):
    result = entrez_api.search(
        f'("{journal}"[Journal]) AND ("cancer"[TIAB]) AND {SAME_PERIOD_AS_MULTI_OMICS}',
        database='pubmed',
        max_results=1
    )
    esearch = result.data['esearchresult']
    count = int(esearch['count'])
    assert count >= 0
    cancer_tiab_articles_by_journal.append({
        'count': count,
        'journal': journal
    })

cancer_articles_from_popular_journals_tiab_only = DataFrame(cancer_tiab_articles_by_journal)

Reusing the results from cache/cancer_articles_from_popular_journals_tiab_only.pickle (last modified on 2020-08-03 02:02)


In [65]:
%vault store cancer_articles_from_popular_journals_tiab_only in pubmed_derived_data

Stored `cancer_articles_from_popular_journals_tiab_only` (C6D2493E → C6D2493E) at Tuesday, 04. Aug 2020 17:59