426 lines (425 with data), 11.8 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%run notebook_setup.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Imported:\n",
"\n",
" - `literature` (904B0F94)\n",
" - `reliable_article_types` (5D584CB5)\n",
" - `code_repositories` (5FF4AA2D)\n",
" - `domain_features` (9CBD2CED)\n",
"\n",
"at Wednesday, 05. Aug 2020 14:17"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "import",
"command": "from pubmed_derived_data import literature, reliable_article_types, code_repositories, domain_features",
"finished": "2020-08-05T14:17:26.121521",
"finished_human_readable": "Wednesday, 05. Aug 2020 14:17",
"result": [
{
"new_file": {
"crc32": "904B0F94",
"sha256": "A2EFC068A287A3B724AE4B320EE5356E1E99474BD08A2E2A3EBA34CD0194F23B"
},
"subject": "literature"
},
{
"new_file": {
"crc32": "5D584CB5",
"sha256": "585366F3E5A11FC007CC4DFF5AF9C7AFBCBEBA3A15B65333657C632F2218A1AC"
},
"subject": "reliable_article_types"
},
{
"new_file": {
"crc32": "5FF4AA2D",
"sha256": "92B28FE95EA205C4311BD4E9D6360D87087D0C5D452CCF9567829CFFD27EE1E5"
},
"subject": "code_repositories"
},
{
"new_file": {
"crc32": "9CBD2CED",
"sha256": "69E41B5E85F3320A8BED275B947ECA40F456F11EC6734F3E3BCDE4BD64EA9255"
},
"subject": "domain_features"
}
],
"started": "2020-08-05T14:17:22.674214"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault from pubmed_derived_data import literature, reliable_article_types, code_repositories, domain_features"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from repository_detection import (\n",
" source_code_platforms, mixed_publication_platforms, data_only_platforms,\n",
" all_platforms as platforms\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from helpers.features import number_of_articles_mentioning_feature, eval_features_frame"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"domain_features_py = eval_features_frame(domain_features)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Articles mentioning a disease or clinical finding:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# An article counts as mentioning a disease only when its set of disease terms\n",
"# still has members after the bare word 'disease' is taken out.\n",
"mention_a_disease = (\n",
"    domain_features_py['mentioned_diseases_set']\n",
"    .map(lambda diseases: len(diseases - {'disease'}))\n",
"    .ne(0)\n",
")\n",
"# A clinical finding is mentioned whenever the per-article collection is non-empty.\n",
"mention_a_finding = domain_features_py['mentioned_clinical_findings'].map(len).ne(0)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1719"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Articles with at least one disease mention or at least one clinical finding mention.\n",
"int((mention_a_disease | mention_a_finding).sum())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1565"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of articles whose mentioned_species entry is non-empty.\n",
"articles_mentioning_species = int(\n",
"    domain_features_py['mentioned_species'].map(len).ne(0).sum()\n",
")\n",
"articles_mentioning_species"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"830"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct species terms found across all articles.\n",
"# set().union(*column) merges the per-article collections in a single pass. Summing the column\n",
"# instead concatenates the lists pairwise (quadratic in the number of terms) and returns 0,\n",
"# which is not iterable, when the column is empty.\n",
"species_terms_redundant_count = len(set().union(*domain_features_py.mentioned_species))\n",
"species_terms_redundant_count"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Imported `omics_features` (32CBB0C4) at Wednesday, 05. Aug 2020 14:17"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "import",
"command": "from pubmed_derived_data import omics_features",
"finished": "2020-08-05T14:17:27.162765",
"finished_human_readable": "Wednesday, 05. Aug 2020 14:17",
"result": [
{
"new_file": {
"crc32": "32CBB0C4",
"sha256": "5341315C160BE59D98DC80F7B2F5F2FB982F24900AC029B6301F92EE639DECB8"
},
"subject": "omics_features"
}
],
"started": "2020-08-05T14:17:26.472027"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault from pubmed_derived_data import omics_features"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from helpers.text_processing import prefix_remover"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"omics_columns = omics_features.columns\n",
"\n",
"# Select the columns labelled ome_or_omic_<term> and relabel them via prefix_remover\n",
"# (project helper; presumably it strips that prefix from each label - TODO confirm).\n",
"omes_or_omics = omics_features.loc[\n",
"    :, omics_columns[omics_columns.str.startswith('ome_or_omic_')]\n",
"].rename(columns=prefix_remover('ome_or_omic_'))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from pandas import Series"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"594"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Grand total over every mentions_<platform> column, one column per entry of all_platforms.\n",
"code_or_data_links = (\n",
"    code_repositories[['mentions_' + platform for platform in platforms]]\n",
"    .sum(axis='index')\n",
"    .sum()\n",
")\n",
"code_or_data_links"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"444"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows with a truthy value in at least one mentions_<platform> column.\n",
"articles_with_code_or_data_link = (\n",
"    code_repositories[['mentions_' + platform for platform in platforms]]\n",
"    .any(axis='columns')\n",
"    .sum()\n",
")\n",
"articles_with_code_or_data_link"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<script src=\"https://cdnjs.cloudflare.com/ajax/libs/mermaid/8.6.4/mermaid.min.js\"></script>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from helpers import mermaid"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Figures interpolated into the %%mermaid flowchart in the next cell.\n",
"\n",
"# Search results and availability of text\n",
"pubmed_matches_count = len(literature.index)\n",
"in_pmc_count = sum(literature['has_pmc'])\n",
"full_text_count = int(literature['has_full_text'].eq(True).sum())\n",
"# Abstract present, but the full-text flag is anything other than True (False or missing).\n",
"abstract_only_count = int(\n",
"    (literature['abstract'].notnull() & literature['has_full_text'].ne(True)).sum()\n",
")\n",
"\n",
"# Domain feature mentions\n",
"mention_a_disease_count = int(mention_a_disease.sum())\n",
"mention_a_finding_count = int(mention_a_finding.sum())\n",
"\n",
"# -omics terms: number of term columns, and rows flagged for at least one of them\n",
"omics_count = omes_or_omics.shape[1]\n",
"articles_with_omics = omes_or_omics.any(axis='columns').sum()\n",
"\n",
"article_types_count = len(reliable_article_types)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<div id=\"mermaid_1\">\n",
"graph TD\n",
" classDef data fill:#ffe49f,stroke:#333,stroke-width:1px;\n",
" classDef integration fill:#b6d7ab,stroke:#333,stroke-width:1px;\n",
" classDef analysis fill:#93c482,stroke:#333,stroke-width:1px,padding:0;\n",
" SEARCH[fa:fa-search 3456 PubMed results]\n",
" SEARCH:::data-->PMC[fa:fa-file-text 1951 in PubMed Central]:::data\n",
" PMC:::data-->FULL_TEXT[fa:fa-align-justify 1520 articles with full-text]:::data\n",
" SEARCH-->ABSTRACTS[fa:fa-font 1857 with abstracts only]:::data\n",
" FULL_TEXT-->COMBINED[fa:fa-plus-circle Combined dataset]:::integration\n",
" ABSTRACTS-->COMBINED\n",
" ABSTRACTS-->SPECIES{{fa:fa-paw 1565 mention a species}}:::analysis\n",
" ABSTRACTS-->DISEASE{{fa:fa-procedures 1518 with a disease}}:::analysis\n",
" ABSTRACTS-->FINDING{{fa:fa-stethoscope 703 with a clinical finding}}:::analysis\n",
" COMBINED-->TYPES{{fa:fa-shapes 1982 articles with determined type}}:::analysis\n",
" COMBINED-->REPOS{{fa:fa-code 594 code and data links}}:::analysis\n",
" COMBINED-->OMICS{{fa:fa-dna 2860 with >=1 of 46 omics}}:::analysis\n",
" COMBINED-->TRENDS{{fa:fa-calendar-alt phrase trends}}:::analysis\n",
"\n",
"</div>\n",
"\n",
"<script>mermaid.initialize({})</script>\n",
"<script>mermaid.init({}, \"#mermaid_1\");</script>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%mermaid\n",
"graph TD\n",
" classDef data fill:#ffe49f,stroke:#333,stroke-width:1px;\n",
" classDef integration fill:#b6d7ab,stroke:#333,stroke-width:1px;\n",
" classDef analysis fill:#93c482,stroke:#333,stroke-width:1px,padding:0;\n",
" SEARCH[fa:fa-search {pubmed_matches_count} PubMed results]\n",
" SEARCH:::data-->PMC[fa:fa-file-text {in_pmc_count} in PubMed Central]:::data\n",
" PMC:::data-->FULL_TEXT[fa:fa-align-justify {full_text_count} articles with full-text]:::data\n",
" SEARCH-->ABSTRACTS[fa:fa-font {abstract_only_count} with abstracts only]:::data\n",
" FULL_TEXT-->COMBINED[fa:fa-plus-circle Combined dataset]:::integration\n",
" ABSTRACTS-->COMBINED\n",
" ABSTRACTS-->SPECIES{{{{fa:fa-paw {articles_mentioning_species} mention a species}}}}:::analysis\n",
" ABSTRACTS-->DISEASE{{{{fa:fa-procedures {mention_a_disease_count} with a disease}}}}:::analysis\n",
" ABSTRACTS-->FINDING{{{{fa:fa-stethoscope {mention_a_finding_count} with a clinical finding}}}}:::analysis\n",
" COMBINED-->TYPES{{{{fa:fa-shapes {article_types_count} articles with determined type}}}}:::analysis\n",
" COMBINED-->REPOS{{{{fa:fa-code {code_or_data_links} code and data links}}}}:::analysis\n",
" COMBINED-->OMICS{{{{fa:fa-dna {articles_with_omics} with >=1 of {omics_count} omics}}}}:::analysis\n",
" COMBINED-->TRENDS{{{{fa:fa-calendar-alt phrase trends}}}}:::analysis"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}