[1bd6b5]: / notebooks / Flowchart.ipynb

Download this file

426 lines (425 with data), 11.8 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%run notebook_setup.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Imported:\n",
       "\n",
       " - `literature` (904B0F94)\n",
       " - `reliable_article_types` (5D584CB5)\n",
       " - `code_repositories` (5FF4AA2D)\n",
       " - `domain_features` (9CBD2CED)\n",
       "\n",
       "at Wednesday, 05. Aug 2020 14:17"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {
      "text/markdown": {
       "action": "import",
       "command": "from pubmed_derived_data import literature, reliable_article_types, code_repositories, domain_features",
       "finished": "2020-08-05T14:17:26.121521",
       "finished_human_readable": "Wednesday, 05. Aug 2020 14:17",
       "result": [
        {
         "new_file": {
          "crc32": "904B0F94",
          "sha256": "A2EFC068A287A3B724AE4B320EE5356E1E99474BD08A2E2A3EBA34CD0194F23B"
         },
         "subject": "literature"
        },
        {
         "new_file": {
          "crc32": "5D584CB5",
          "sha256": "585366F3E5A11FC007CC4DFF5AF9C7AFBCBEBA3A15B65333657C632F2218A1AC"
         },
         "subject": "reliable_article_types"
        },
        {
         "new_file": {
          "crc32": "5FF4AA2D",
          "sha256": "92B28FE95EA205C4311BD4E9D6360D87087D0C5D452CCF9567829CFFD27EE1E5"
         },
         "subject": "code_repositories"
        },
        {
         "new_file": {
          "crc32": "9CBD2CED",
          "sha256": "69E41B5E85F3320A8BED275B947ECA40F456F11EC6734F3E3BCDE4BD64EA9255"
         },
         "subject": "domain_features"
        }
       ],
       "started": "2020-08-05T14:17:22.674214"
      }
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%vault from pubmed_derived_data import literature, reliable_article_types, code_repositories, domain_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from repository_detection import (\n",
    "    source_code_platforms, mixed_publication_platforms, data_only_platforms,\n",
    "    all_platforms as platforms\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from helpers.features import number_of_articles_mentioning_feature, eval_features_frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the frame that the feature-counting cells below work on: they call len() and\n",
    "# set difference on its columns, so those columns must hold Python collections.\n",
    "# NOTE(review): eval_features_frame comes from helpers.features; it presumably turns\n",
    "# serialized feature columns into Python objects - confirm there.\n",
    "domain_features_py = eval_features_frame(domain_features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Articles mentioning a disease or clinical finding:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# An article counts as mentioning a disease only when its set of mentioned diseases\n",
    "# contains something other than the bare term 'disease'.\n",
    "mention_a_disease = (\n",
    "    domain_features_py.mentioned_diseases_set\n",
    "    .apply(lambda diseases: len(diseases - {'disease'}))\n",
    "    .ne(0)\n",
    ")\n",
    "# Any non-empty collection of clinical findings counts as a mention.\n",
    "mention_a_finding = domain_features_py.mentioned_clinical_findings.apply(len).ne(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1719"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of articles flagged by either mask (a disease, a clinical finding, or both).\n",
    "int((mention_a_disease | mention_a_finding).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1565"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Articles whose collection of mentioned species is non-empty.\n",
    "articles_mentioning_species = int(\n",
    "    domain_features_py.mentioned_species.apply(len).ne(0).sum()\n",
    ")\n",
    "articles_mentioning_species"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "830"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct species terms over all articles.\n",
    "# The per-article collections are merged with a single set union rather than\n",
    "# `Series.sum()`: summing concatenates them one pair at a time (quadratic in the\n",
    "# total number of terms) and yields 0 for an empty Series, which `set(...)` rejects.\n",
    "species_terms_redundant_count = len(set().union(*domain_features_py.mentioned_species))\n",
    "species_terms_redundant_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "Imported `omics_features` (32CBB0C4) at Wednesday, 05. Aug 2020 14:17"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {
      "text/markdown": {
       "action": "import",
       "command": "from pubmed_derived_data import omics_features",
       "finished": "2020-08-05T14:17:27.162765",
       "finished_human_readable": "Wednesday, 05. Aug 2020 14:17",
       "result": [
        {
         "new_file": {
          "crc32": "32CBB0C4",
          "sha256": "5341315C160BE59D98DC80F7B2F5F2FB982F24900AC029B6301F92EE639DECB8"
         },
         "subject": "omics_features"
        }
       ],
       "started": "2020-08-05T14:17:26.472027"
      }
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%vault from pubmed_derived_data import omics_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from helpers.text_processing import prefix_remover"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the columns whose name starts with 'ome_or_omic_' and rename them with\n",
    "# prefix_remover (presumably dropping that prefix - confirm in helpers.text_processing).\n",
    "omics_columns = omics_features.columns\n",
    "ome_or_omic_columns = omics_columns[omics_columns.str.startswith('ome_or_omic_')]\n",
    "\n",
    "omes_or_omics = omics_features[ome_or_omic_columns].rename(\n",
    "    columns=prefix_remover('ome_or_omic_')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import Series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "594"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sum every `mentions_<platform>` column, then add the per-column totals together.\n",
    "code_or_data_links = (\n",
    "    code_repositories[['mentions_' + platform for platform in platforms]]\n",
    "    .sum()\n",
    "    .sum()\n",
    ")\n",
    "code_or_data_links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "444"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows (articles) with a truthy value in at least one `mentions_<platform>` column.\n",
    "articles_with_code_or_data_link = (\n",
    "    code_repositories[['mentions_' + platform for platform in platforms]]\n",
    "    .any(axis=1)\n",
    "    .sum()\n",
    ")\n",
    "articles_with_code_or_data_link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<script src=\"https://cdnjs.cloudflare.com/ajax/libs/mermaid/8.6.4/mermaid.min.js\"></script>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from helpers import mermaid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Numbers interpolated into the flowchart template in the next cell.\n",
    "\n",
    "# Literature corpus\n",
    "pubmed_matches_count = len(literature)\n",
    "in_pmc_count = sum(literature.has_pmc)\n",
    "# Only rows where has_full_text is exactly True are counted as full text.\n",
    "full_text_count = int((literature.has_full_text == True).sum())\n",
    "# Rows that have an abstract and whose has_full_text is anything other than True.\n",
    "abstract_only_count = int(\n",
    "    (literature.abstract.notnull() & (literature.has_full_text != True)).sum()\n",
    ")\n",
    "\n",
    "# Domain features\n",
    "mention_a_disease_count = int(mention_a_disease.sum())\n",
    "mention_a_finding_count = int(mention_a_finding.sum())\n",
    "\n",
    "# Omics and article types\n",
    "omics_count = omes_or_omics.shape[1]\n",
    "articles_with_omics = omes_or_omics.any(axis=1).sum()\n",
    "article_types_count = len(reliable_article_types)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<div id=\"mermaid_1\">\n",
       "graph TD\n",
       "    classDef data fill:#ffe49f,stroke:#333,stroke-width:1px;\n",
       "    classDef integration fill:#b6d7ab,stroke:#333,stroke-width:1px;\n",
       "    classDef analysis fill:#93c482,stroke:#333,stroke-width:1px,padding:0;\n",
       "    SEARCH[fa:fa-search 3456 PubMed results]\n",
       "    SEARCH:::data-->PMC[fa:fa-file-text 1951 in PubMed Central]:::data\n",
       "    PMC:::data-->FULL_TEXT[fa:fa-align-justify 1520 articles with full-text]:::data\n",
       "    SEARCH-->ABSTRACTS[fa:fa-font 1857 with abstracts only]:::data\n",
       "    FULL_TEXT-->COMBINED[fa:fa-plus-circle Combined dataset]:::integration\n",
       "    ABSTRACTS-->COMBINED\n",
       "    ABSTRACTS-->SPECIES{{fa:fa-paw 1565 mention a species}}:::analysis\n",
       "    ABSTRACTS-->DISEASE{{fa:fa-procedures 1518 with a disease}}:::analysis\n",
       "    ABSTRACTS-->FINDING{{fa:fa-stethoscope 703 with a clinical finding}}:::analysis\n",
       "    COMBINED-->TYPES{{fa:fa-shapes 1982 articles with determined type}}:::analysis\n",
       "    COMBINED-->REPOS{{fa:fa-code 594 code and data links}}:::analysis\n",
       "    COMBINED-->OMICS{{fa:fa-dna 2860 with >=1 of 46 omics}}:::analysis\n",
       "    COMBINED-->TRENDS{{fa:fa-calendar-alt phrase trends}}:::analysis\n",
       "\n",
       "</div>\n",
       "\n",
       "<script>mermaid.initialize({})</script>\n",
       "<script>mermaid.init({}, \"#mermaid_1\");</script>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%mermaid\n",
    "graph TD\n",
    "    classDef data fill:#ffe49f,stroke:#333,stroke-width:1px;\n",
    "    classDef integration fill:#b6d7ab,stroke:#333,stroke-width:1px;\n",
    "    classDef analysis fill:#93c482,stroke:#333,stroke-width:1px,padding:0;\n",
    "    SEARCH[fa:fa-search {pubmed_matches_count} PubMed results]\n",
    "    SEARCH:::data-->PMC[fa:fa-file-text {in_pmc_count} in PubMed Central]:::data\n",
    "    PMC:::data-->FULL_TEXT[fa:fa-align-justify {full_text_count} articles with full-text]:::data\n",
    "    SEARCH-->ABSTRACTS[fa:fa-font {abstract_only_count} with abstracts only]:::data\n",
    "    FULL_TEXT-->COMBINED[fa:fa-plus-circle Combined dataset]:::integration\n",
    "    ABSTRACTS-->COMBINED\n",
    "    ABSTRACTS-->SPECIES{{{{fa:fa-paw {articles_mentioning_species} mention a species}}}}:::analysis\n",
    "    ABSTRACTS-->DISEASE{{{{fa:fa-procedures {mention_a_disease_count} with a disease}}}}:::analysis\n",
    "    ABSTRACTS-->FINDING{{{{fa:fa-stethoscope {mention_a_finding_count} with a clinical finding}}}}:::analysis\n",
    "    COMBINED-->TYPES{{{{fa:fa-shapes {article_types_count} articles with determined type}}}}:::analysis\n",
    "    COMBINED-->REPOS{{{{fa:fa-code {code_or_data_links} code and data links}}}}:::analysis\n",
    "    COMBINED-->OMICS{{{{fa:fa-dna {articles_with_omics} with >=1 of {omics_count} omics}}}}:::analysis\n",
    "    COMBINED-->TRENDS{{{{fa:fa-calendar-alt phrase trends}}}}:::analysis"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}