In [1]:
%run notebook_setup.ipynb

In [2]:
%vault from pubmed_derived_data import literature, reliable_article_types, code_repositories, domain_features

Imported:

 - `literature` (904B0F94)
 - `reliable_article_types` (5D584CB5)
 - `code_repositories` (5FF4AA2D)
 - `domain_features` (9CBD2CED)

at Wednesday, 05. Aug 2020 14:17

In [3]:
from repository_detection import (
    source_code_platforms, mixed_publication_platforms, data_only_platforms,
    all_platforms as platforms
)

In [4]:
from helpers.features import number_of_articles_mentioning_feature, eval_features_frame

In [5]:
# Convert the imported feature frame with the project helper `eval_features_frame`.
# NOTE(review): the cells below call len() and set-difference on these columns, so the
# helper presumably turns serialised collections into real Python sets/lists - confirm
# in helpers.features.
domain_features_py = eval_features_frame(domain_features)

Articles mentioning a disease or clinical finding:

In [6]:
# Boolean mask: the article mentions at least one disease term other than the
# bare word 'disease', which is discarded before counting.
mention_a_disease = (
    domain_features_py.mentioned_diseases_set
    .map(lambda terms: len(terms - {'disease'}))
    .ne(0)
)
# Boolean mask: the article mentions at least one clinical finding.
mention_a_finding = domain_features_py.mentioned_clinical_findings.map(len).ne(0)

In [7]:
# Number of articles that mention a disease, a clinical finding, or both.
int((mention_a_disease | mention_a_finding).sum())

1719

In [8]:
# Number of articles with a non-empty collection of mentioned species.
articles_mentioning_species = int(
    domain_features_py.mentioned_species
    .map(len)
    .ne(0)
    .sum()
)
articles_mentioning_species

1565

In [9]:
# Number of distinct species terms mentioned across all articles.
# NOTE(review): "redundant" presumably means synonyms of one species are counted
# separately - confirm against the species extraction step.
#
# The union is built in one pass instead of via `Series.sum()`: summing an object
# column concatenates the per-article collections pairwise (quadratic copying) and
# yields 0 for an empty column, which would make `set(...)` raise a TypeError.
species_terms_redundant_count = len(set().union(*domain_features_py.mentioned_species))
species_terms_redundant_count

830

In [10]:
%vault from pubmed_derived_data import omics_features

Imported `omics_features` (32CBB0C4) at Wednesday, 05. Aug 2020 14:17

In [11]:
from helpers.text_processing import prefix_remover

In [12]:
omics_columns = omics_features.columns

# Keep only the columns carrying the 'ome_or_omic_' prefix, then strip that prefix
# from the column names using the project's `prefix_remover` helper.
ome_or_omic_columns = omics_columns[omics_columns.str.startswith('ome_or_omic_')]
omes_or_omics = omics_features[ome_or_omic_columns].rename(
    columns=prefix_remover('ome_or_omic_')
)

In [13]:
from pandas import Series

In [14]:
# Grand total over every 'mentions_<platform>' column: column-wise sums first,
# then the sum of those totals.
code_or_data_links = (
    code_repositories[['mentions_' + platform for platform in platforms]]
    .sum()
    .sum()
)
code_or_data_links

594

In [15]:
# Number of rows (articles) where at least one 'mentions_<platform>' column is truthy.
articles_with_code_or_data_link = (
    code_repositories[['mentions_' + platform for platform in platforms]]
    .any(axis='columns')
    .sum()
)
articles_with_code_or_data_link

444

In [16]:
from helpers import mermaid

In [17]:
# Counters interpolated into the overview diagram in the next cell.

# Domain features: articles flagged by the boolean masks computed earlier.
mention_a_disease_count = int(mention_a_disease.sum())
mention_a_finding_count = int(mention_a_finding.sum())

# Omics: number of tracked terms (columns) and articles with at least one of them.
omics_count = omes_or_omics.shape[1]
articles_with_omics = omes_or_omics.any(axis='columns').sum()

# Literature availability. The explicit comparison with True is kept so that any
# value other than True (e.g. a missing flag) counts as "no full text".
pubmed_matches_count = len(literature)
in_pmc_count = sum(literature.has_pmc)
full_text_count = sum(literature.has_full_text.eq(True))
abstract_only_count = sum(
    literature.abstract.notnull() & ~literature.has_full_text.eq(True)
)

article_types_count = len(reliable_article_types)

In [18]:
%%mermaid
graph TD
    %% Overview flowchart of the literature dataset and the analyses derived from it.
    %% NOTE(review): names in single curly braces are presumably filled in from notebook
    %% variables by the mermaid cell magic, and the quadrupled curly braces presumably
    %% come out of that templating as the doubled braces of a hexagon node.
    %% Confirm in helpers.mermaid.
    %% Node styles: data sources, integration step, analysis outputs.
    classDef data fill:#ffe49f,stroke:#333,stroke-width:1px;
    classDef integration fill:#b6d7ab,stroke:#333,stroke-width:1px;
    classDef analysis fill:#93c482,stroke:#333,stroke-width:1px,padding:0;
    %% Data acquisition: search results, the PubMed Central subset, full texts, abstracts.
    SEARCH[fa:fa-search {pubmed_matches_count} PubMed results]
    SEARCH:::data-->PMC[fa:fa-file-text {in_pmc_count} in PubMed Central]:::data
    PMC:::data-->FULL_TEXT[fa:fa-align-justify {full_text_count} articles with full-text]:::data
    SEARCH-->ABSTRACTS[fa:fa-font {abstract_only_count} with abstracts only]:::data
    %% Full texts and abstracts feed the combined dataset.
    FULL_TEXT-->COMBINED[fa:fa-plus-circle Combined dataset]:::integration
    ABSTRACTS-->COMBINED
    %% Analyses attached to the abstracts node.
    ABSTRACTS-->SPECIES{{{{fa:fa-paw {articles_mentioning_species} mention a species}}}}:::analysis
    ABSTRACTS-->DISEASE{{{{fa:fa-procedures {mention_a_disease_count} with a disease}}}}:::analysis
    ABSTRACTS-->FINDING{{{{fa:fa-stethoscope {mention_a_finding_count} with a clinical finding}}}}:::analysis
    %% Analyses attached to the combined dataset node.
    COMBINED-->TYPES{{{{fa:fa-shapes {article_types_count} articles with determined type}}}}:::analysis
    COMBINED-->REPOS{{{{fa:fa-code {code_or_data_links} code and data links}}}}:::analysis
    COMBINED-->OMICS{{{{fa:fa-dna {articles_with_omics} with >=1 of {omics_count} omics}}}}:::analysis
    COMBINED-->TRENDS{{{{fa:fa-calendar-alt phrase trends}}}}:::analysis