2635 lines (2634 with data), 101.7 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%run notebook_setup.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas\n",
"pandas.set_option('display.max_colwidth', 100)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Imported `literature` (904B0F94) at Monday, 03. Aug 2020 01:39"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "import",
"command": "from pubmed_derived_data import literature",
"finished": "2020-08-03T01:39:14.793354",
"finished_human_readable": "Monday, 03. Aug 2020 01:39",
"result": [
{
"new_file": {
"crc32": "904B0F94",
"sha256": "A2EFC068A287A3B724AE4B320EE5356E1E99474BD08A2E2A3EBA34CD0194F23B"
},
"subject": "literature"
}
],
"started": "2020-08-03T01:39:12.693141"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault from pubmed_derived_data import literature"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"has_abstract = ~literature['abstract'].isnull()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO: use title if not abstract, add keywords, use full text if available**"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"literature['abstract_clean'] = literature['abstract_clean'].fillna('')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"domain_features = literature.index.to_frame().copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Domain knowledge / semantic features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Spacy corpus and models are trained on generic web-based texts, not on scientific or biomedical research. In order to make use of named entities it is worth including a minimum of domain-specific terminology:\n",
"\n",
"- [ ] chromosomes\n",
" - [ ] locus\n",
"- [ ] mutations\n",
" - [ ] in proteins\n",
" - [ ] in genes\n",
"- [ ] gene & protein names\n",
" - [x] Ensebml identifiers regular expression\n",
" - [ ] refseq regular expression\n",
" - Note: Entrez ids are just numbers - would require a lot of manual\n",
" - [ ] MANE\n",
" - [ ] gene names\n",
" - [ ] protein names\n",
"- [ ] metabolite names\n",
" - [ ] HMDB - would bias for human research\n",
"- [x] diseases\n",
" - [ ] OMIM\n",
" - [x] ClinVar\n",
" - [ ] an ontology?\n",
"- [ ] drug names\n",
" - [ ] DrugBank\n",
"- [x] species\n",
" - [x] NCBI"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from pandas import read_table"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from helpers.text_processing import (\n",
" check_usage,\n",
" highlight_first,\n",
" matches_n_consecutive_words\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from functools import partial\n",
"\n",
"check_usage_in_abstracts = partial(check_usage, data=literature, column='abstract_clean')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/krassowski/.pyenv/versions/3.8.3/envs/multi-omics/lib/python3.8/site-packages/tqdm/std.py:668: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
" from pandas import Panel\n"
]
}
],
"source": [
"from tqdm import tqdm\n",
"tqdm.pandas()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Gene"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ensembl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> ENS(species)(object type)(identifier).(version)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from pandas import read_html"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"ens_object_types, ens_species_prefixes = read_html('data/ensembl_prefixes.html')\n",
"ens_object_types.columns = ['code', 'object_type']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>code</th>\n",
" <th>object_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>E</td>\n",
" <td>exon</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>FM</td>\n",
" <td>Ensembl protein family</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>G</td>\n",
" <td>gene</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>GT</td>\n",
" <td>gene tree</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>P</td>\n",
" <td>protein</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>R</td>\n",
" <td>regulatory feature</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>T</td>\n",
" <td>transcript</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" code object_type\n",
"0 E exon\n",
"1 FM Ensembl protein family\n",
"2 G gene\n",
"3 GT gene tree\n",
"4 P protein\n",
"5 R regulatory feature\n",
"6 T transcript"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ens_object_types "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Prefix</th>\n",
" <th>Species name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ENSCPO</td>\n",
" <td>Cavia porcellus (Guinea Pig)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ENSZAL</td>\n",
" <td>Zonotrichia albicollis (White-throated sparrow)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ENSMUN</td>\n",
" <td>Melopsittacus undulatus (Budgerigar)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ENSRRO</td>\n",
" <td>Rhinopithecus roxellana (Golden snub-nosed monkey)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ENSCGR</td>\n",
" <td>Cricetulus griseus (Chinese hamster CHOK1GS)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>279</th>\n",
" <td>ENSHHU</td>\n",
" <td>Hucho hucho (Huchen)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>280</th>\n",
" <td>MGP_129S1SvImJ_</td>\n",
" <td>Mus musculus (Mouse 129S1/SvImJ)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>281</th>\n",
" <td>ENSSPU</td>\n",
" <td>Sphenodon punctatus (Tuatara)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>282</th>\n",
" <td>ENSBTA</td>\n",
" <td>Bos taurus (Cow)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>283</th>\n",
" <td>ENSPCA</td>\n",
" <td>Procavia capensis (Hyrax)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>284 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" Prefix Species name\n",
"0 ENSCPO Cavia porcellus (Guinea Pig)\n",
"1 ENSZAL Zonotrichia albicollis (White-throated sparrow)\n",
"2 ENSMUN Melopsittacus undulatus (Budgerigar)\n",
"3 ENSRRO Rhinopithecus roxellana (Golden snub-nosed monkey)\n",
"4 ENSCGR Cricetulus griseus (Chinese hamster CHOK1GS)\n",
".. ... ...\n",
"279 ENSHHU Hucho hucho (Huchen)\n",
"280 MGP_129S1SvImJ_ Mus musculus (Mouse 129S1/SvImJ)\n",
"281 ENSSPU Sphenodon punctatus (Tuatara)\n",
"282 ENSBTA Bos taurus (Cow)\n",
"283 ENSPCA Procavia capensis (Hyrax)\n",
"\n",
"[284 rows x 2 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ens_species_prefixes"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from re import escape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"ens_prefix_re = '|'.join(ens_species_prefixes.Prefix.apply(escape))\n",
"ens_object_types_re = '|'.join(ens_object_types .code.apply(escape))\n",
"ensembl_expression = rf'(?P<species_preifx>{ens_prefix_re})(?P<object_type>{ens_object_types_re})(?P<identifier>\\d+)(?P<version_suffix>\\.\\d+)?'"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from re import match"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'species_preifx': 'ENS',\n",
" 'object_type': 'G',\n",
" 'identifier': '00000010404',\n",
" 'version_suffix': None}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"match(ensembl_expression, 'ENSG00000010404').groupdict()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/krassowski/.pyenv/versions/3.8.3/envs/multi-omics/lib/python3.8/site-packages/pandas/core/strings.py:1954: UserWarning: This pattern has match groups. To actually get the groups, use str.extract.\n",
" return func(self, *args, **kwargs)\n"
]
}
],
"source": [
"has_ensembl_id = literature['abstract_clean'].str.contains(ensembl_expression) != False"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(has_ensembl_id)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>abstract_clean</th>\n",
" </tr>\n",
" <tr>\n",
" <th>uid</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>28243742</th>\n",
" <td>Integrative multi-omics analysis revealed SNP-lncRNA-mRNA (SLM) networks in human peripheral blo...</td>\n",
" <td>Long non-coding RNAs (lncRNAs) serve as important controller of cellular functions via regulatin...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32163894</th>\n",
" <td>Inferences of Individual Drug Response-Related Long Non-coding RNAs Based on Integrating Multi-o...</td>\n",
" <td>Differences in individual drug responses are obstacles in breast cancer (BRCA) treatment, so pre...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title \\\n",
"uid \n",
"28243742 Integrative multi-omics analysis revealed SNP-lncRNA-mRNA (SLM) networks in human peripheral blo... \n",
"32163894 Inferences of Individual Drug Response-Related Long Non-coding RNAs Based on Integrating Multi-o... \n",
"\n",
" abstract_clean \n",
"uid \n",
"28243742 Long non-coding RNAs (lncRNAs) serve as important controller of cellular functions via regulatin... \n",
"32163894 Differences in individual drug responses are obstacles in breast cancer (BRCA) treatment, so pre... "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"literature[has_ensembl_id][['title', 'abstract_clean']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Not worth further consideration (at least on abstract levels)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diseases"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#DiseaseName</th>\n",
" <th>SourceName</th>\n",
" <th>ConceptID</th>\n",
" <th>SourceID</th>\n",
" <th>DiseaseMIM</th>\n",
" <th>LastModified</th>\n",
" <th>Category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1-2 finger syndactyly</td>\n",
" <td>Human Phenotype Ontology</td>\n",
" <td>C4023732</td>\n",
" <td>HP:0010704</td>\n",
" <td>NaN</td>\n",
" <td>16 Feb 2016</td>\n",
" <td>Finding</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1-2 toe complete cutaneous syndactyly</td>\n",
" <td>Human Phenotype Ontology</td>\n",
" <td>C4025140</td>\n",
" <td>HP:0005767</td>\n",
" <td>NaN</td>\n",
" <td>16 Feb 2016</td>\n",
" <td>Finding</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1-2 toe syndactyly</td>\n",
" <td>Human Phenotype Ontology</td>\n",
" <td>C4023726</td>\n",
" <td>HP:0010711</td>\n",
" <td>NaN</td>\n",
" <td>16 Feb 2016</td>\n",
" <td>Finding</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1-3 finger syndactyly</td>\n",
" <td>Human Phenotype Ontology</td>\n",
" <td>C4023730</td>\n",
" <td>HP:0010706</td>\n",
" <td>NaN</td>\n",
" <td>16 Feb 2016</td>\n",
" <td>Finding</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1-3 toe syndactyly</td>\n",
" <td>Human Phenotype Ontology</td>\n",
" <td>C4025774</td>\n",
" <td>HP:0001459</td>\n",
" <td>NaN</td>\n",
" <td>16 Feb 2016</td>\n",
" <td>Finding</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43749</th>\n",
" <td>Zygodactyly type 3</td>\n",
" <td>MONDO</td>\n",
" <td>CN203276</td>\n",
" <td>MONDO:0017544</td>\n",
" <td>NaN</td>\n",
" <td>17 Apr 2020</td>\n",
" <td>Disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43750</th>\n",
" <td>Zygodactyly type 4</td>\n",
" <td>MONDO</td>\n",
" <td>CN203277</td>\n",
" <td>MONDO:0017545</td>\n",
" <td>NaN</td>\n",
" <td>17 Apr 2020</td>\n",
" <td>Disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43751</th>\n",
" <td>Zygomycosis</td>\n",
" <td>NaN</td>\n",
" <td>C0043541</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>16 Feb 2016</td>\n",
" <td>Infectious disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43752</th>\n",
" <td>Keratosis pilaris atrophicans</td>\n",
" <td>MONDO</td>\n",
" <td>C4310982</td>\n",
" <td>MONDO:0018855</td>\n",
" <td>NaN</td>\n",
" <td>28 Apr 2020</td>\n",
" <td>Disease</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43753</th>\n",
" <td>Keratosis pilaris atrophicans</td>\n",
" <td>Orphanet</td>\n",
" <td>C4310982</td>\n",
" <td>ORPHA498</td>\n",
" <td>NaN</td>\n",
" <td>28 Apr 2020</td>\n",
" <td>Disease</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>43754 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" #DiseaseName SourceName \\\n",
"0 1-2 finger syndactyly Human Phenotype Ontology \n",
"1 1-2 toe complete cutaneous syndactyly Human Phenotype Ontology \n",
"2 1-2 toe syndactyly Human Phenotype Ontology \n",
"3 1-3 finger syndactyly Human Phenotype Ontology \n",
"4 1-3 toe syndactyly Human Phenotype Ontology \n",
"... ... ... \n",
"43749 Zygodactyly type 3 MONDO \n",
"43750 Zygodactyly type 4 MONDO \n",
"43751 Zygomycosis NaN \n",
"43752 Keratosis pilaris atrophicans MONDO \n",
"43753 Keratosis pilaris atrophicans Orphanet \n",
"\n",
" ConceptID SourceID DiseaseMIM LastModified Category \n",
"0 C4023732 HP:0010704 NaN 16 Feb 2016 Finding \n",
"1 C4025140 HP:0005767 NaN 16 Feb 2016 Finding \n",
"2 C4023726 HP:0010711 NaN 16 Feb 2016 Finding \n",
"3 C4023730 HP:0010706 NaN 16 Feb 2016 Finding \n",
"4 C4025774 HP:0001459 NaN 16 Feb 2016 Finding \n",
"... ... ... ... ... ... \n",
"43749 CN203276 MONDO:0017544 NaN 17 Apr 2020 Disease \n",
"43750 CN203277 MONDO:0017545 NaN 17 Apr 2020 Disease \n",
"43751 C0043541 NaN NaN 16 Feb 2016 Infectious disease \n",
"43752 C4310982 MONDO:0018855 NaN 28 Apr 2020 Disease \n",
"43753 C4310982 ORPHA498 NaN 28 Apr 2020 Disease \n",
"\n",
"[43754 rows x 7 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"disease_names_df = read_table('data/disease_names')\n",
"disease_names_df"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Disease 26046\n",
"Finding 15029\n",
"Named protein variant 1117\n",
"Infectious disease 951\n",
"Pharmacological response 547\n",
"Blood group 63\n",
"phenotype instruction 1\n",
"Name: Category, dtype: int64"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"disease_names_df.Category.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"25922"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"disease_names = disease_names_df[disease_names_df.Category == 'Disease']['#DiseaseName'].str.lower()\n",
"len(disease_names.unique())"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAASiklEQVR4nO3dbYxc53ne8f9VMXIUr2NJprEQSBZka8IFbTaOupBUOAi2VitRUhAqgGPIEGLKZcsCpVOnJZBQ6QcWdgTIqBXFRmMXbMSaDlzLquJERKWEJmQNnACRLL8Ieo2rhUxHJCgxMSU5azd217n7YR82k30xOTPLnd2d/w9YzDn3ec6ZZ5494MXzMmdTVUiSRtvfG3YHJEnDZxhIkgwDSZJhIEnCMJAkAeuG3YF+rV+/vt785jfz+te/fthdWVG++93vOiZzOCYLc1zmW+tjsn79eo4ePXq0qnbMXbZqw2Dz5s189KMfZXJycthdWVE6nY5jModjsjDHZb5RGJMk6xeqe5pIkmQYSJIMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEms4m8gD2Lz/geH8r7H77xpKO8rSefikYEkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkcR5hkORQktNJnu6q/eckf5bkySS/n+TSrmW3J5lK8o0k13fVd7TaVJL9XfUtSR5r9c8luXgpP6Ak6dzO58jgU8DcP558DHh7Vf1j4H8DtwMk2QbcArytrfOJJBcluQj4beAGYBvw3tYW4CPA3VX1FuAVYPdAn0iS1LNzhkFVfQk4M6f2haqaabOPAhvb9E7g3qr6flV9E5gCrmo/U1X1QlX9ALgX2JkkwLuA+9v6h4GbB/xMkqQeLcWzif4l8Lk2vYHZcDjrRKsBvDinfjXwJuDVrmDpbj9Pkj3AHoDx8XGmp6fpdDo9d3jf9plzN7oA+ulrr/odk7XMMVmY4zLfKI/JQGGQ5D8CM8BnlqY7P1pVHQQOAkxMTNTY2BiTk5M9b+e2YT2o7tbJC/4enU6nrzFZyxyThTku843ymPQdBkluA34OuLaqqpVPApu6mm1sNRapfxu4NMm6dnTQ3V6StEz6urU0yQ7gV4Gfr6rvdS06AtyS5HVJtgBbgS8DjwNb251DFzN7kflIC5FHgHe39XcBD/T3USRJ/TqfW0s/C/wp8NYkJ5LsBv4L8AbgWJInkvxXgKp6BrgPeBb4I2BvVf2w/a//A8BR4DngvtYW4NeA/5BkitlrCPcs6SeUJJ3TOU8TVdV7Fygv+g92Vd0B3LFA/SHgoQXqLzB7t5EkaUj8BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgSeI8wiDJoSSnkzzdVbs8ybEkz7fXy1o9ST6eZCrJk0mu7FpnV2v/fJJdXfV/kuSpts7Hk2SpP6Qk6Uc7nyODTwE75tT2Aw9X1Vbg4TYPcAOwtf3sAT4Js+EBHACuBq4CDpwNkNbmX3etN/e9JEkX2DnDoKq+BJyZU94JHG7Th4Gbu+qfrlmPApcmuQK4HjhWVWeq6hXgGLCjLfvJqnq0qgr4dNe2JEnLpN9rBuNVdapNvwSMt+kNwItd7U602o+qn1igLklaRusG3UBVVZJais6cS5I9zJ5+Ynx8nOnpaTqdTs/b2bd9Zol7dn766Wuv+h2TtcwxWZjjMt8oj0m/YfBykiuq6lQ71XO61U8Cm7rabWy1k8DknHqn1Tcu0H5BVXUQOAgwMTFRY2NjTE5OLtZ8Ubftf7DndZbC8VsnL/h7dDqdvsZkLXNMFua4zDfKY9LvaaIjwNk7gnYBD3TV39fuKroGeK2dTjoKXJfksnbh+DrgaFv2nSTXtLuI3te1LUnSMjnnkUGSzzL7v/r1SU
4we1fQncB9SXYD3wLe05o/BNwITAHfA94PUFVnknwYeLy1+1BVnb0o/W+ZvWPpEuAP248kaRmdMwyq6r2LLLp2gbYF7F1kO4eAQwvUvwK8/Vz9kCRdOH4DWZJkGEiSDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgSWIJ/uylzt/mZfgLa/u2zyz4l9yO33nTBX9vSauXRwaSJMNAkmQYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgSWLAMEjy75M8k+TpJJ9N8uNJtiR5LMlUks8lubi1fV2bn2rLN3dt5/ZW/0aS6wf7SJKkXvUdBkk2AP8OmKiqtwMXAbcAHwHurqq3AK8Au9squ4FXWv3u1o4k29p6bwN2AJ9IclG//ZIk9W7Q00TrgEuSrAN+AjgFvAu4vy0/DNzcpne2edrya5Ok1e+tqu9X1TeBKeCqAfslSepB3w+qq6qTST4K/Dnwf4AvAF8FXq2qmdbsBLChTW8AXmzrziR5DXhTqz/atenudf6OJHuAPQDj4+NMT0/T6XR67vu+7TPnbrRKjV+y8OfrZ5zWin73k7XOcZlvlMek7zBIchmz/6vfArwK/E9mT/NcMFV1EDgIMDExUWNjY0xOTva8nYWe6rlW7Ns+w11Pzf+1Hr91cvk7s0J0Op2+9pO1znGZb5THZJDTRP8c+GZV/UVV/V/g88A7gUvbaSOAjcDJNn0S2ATQlr8R+HZ3fYF1JEnLYJAw+HPgmiQ/0c79Xws8CzwCvLu12QU80KaPtHna8i9WVbX6Le1uoy3AVuDLA/RLktSjQa4ZPJbkfuBrwAzwdWZP4TwI3JvkN1rtnrbKPcDvJpkCzjB7BxFV9UyS+5gNkhlgb1X9sN9+SZJ6N9BfOquqA8CBOeUXWOBuoKr6a+AXF9nOHcAdg/RFktQ/v4EsSTIMJEmGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIYMAySXJrk/iR/luS5JP80yeVJjiV5vr1e1tomyceTTCV5MsmVXdvZ1do/n2TXoB9KktSbQY8MPgb8UVX9I+CngOeA/cDDVbUVeLjNA9wAbG0/e4BPAiS5HDgAXA1cBRw4GyCSpOXRdxgkeSPws8A9AFX1g6p6FdgJHG7NDgM3t+mdwKdr1qPApUmuAK4HjlXVmap6BTgG7Oi3X5Kk3q0bYN0twF8A/z3JTwFfBT4IjFfVqdbmJWC8TW8AXuxa/0SrLVafJ8keZo8qGB8fZ3p6mk6n03PH922f6Xmd1WL8koU/Xz/jtFb0u5+sdY7LfKM8JoOEwTrgSuCXq+qxJB/jb08JAVBVlaQG6eCc7R0EDgJMTEzU2NgYk5OTPW/ntv0PLlWXVpx922e466n5v9bjt04uf2dWiE6n09d+stY5LvON8pgMcs3gBHCiqh5r8/czGw4vt9M/tNfTbflJYFPX+htbbbG6JGmZ9B0GVfUS8GKSt7bStcCzwBHg7B1Bu4AH2vQR4H3trqJrgNfa6aSjwHVJLmsXjq9rNUnSMhnkNBHALwOfSXIx8ALwfmYD5r4ku4FvAe9pbR8CbgSmgO+1tlTVmSQfBh5v7T5UVWcG7JckqQcDhUFVPQFMLLDo2gXaFrB3ke0cAg4N0hdJUv/8BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCSxBGGQ5KIkX0/yv9r8liSPJZlK8rkkF7f669r8VFu+uWsbt7f6N5JcP2ifJEm9WYojgw8Cz3XNfwS4u6reArwC7G713cArrX53a0eSbcAtwNuAHcAnkly0BP2SJJ2ngcIgyUbgJuB32nyAdwH3tyaHgZ
vb9M42T1t+bWu/E7i3qr5fVd8EpoCrBumXJKk36wZc/7eAXwXe0ObfBLxaVTNt/gSwoU1vAF4EqKqZJK+19huAR7u22b3O35FkD7AHYHx8nOnpaTqdTs+d3rd95tyNVqnxSxb+fP2M01rR736y1jku843ymPQdBkl+DjhdVV9NMrl0XVpcVR0EDgJMTEzU2NgYk5O9v/Vt+x9c4p6tHPu2z3DXU/N/rcdvnVz+zqwQnU6nr/1krXNc5hvlMRnkyOCdwM8nuRH4ceAngY8BlyZZ144ONgInW/uTwCbgRJJ1wBuBb3fVz+peR5K0DPq+ZlBVt1fVxqrazOwF4C9W1a3AI8C7W7NdwANt+kibpy3/YlVVq9/S7jbaAmwFvtxvvyRJvRv0msFCfg24N8lvAF8H7mn1e4DfTTIFnGE2QKiqZ5LcBzwLzAB7q+qHF6BfkqRFLEkYVFUH6LTpF1jgbqCq+mvgFxdZ/w7gjqXoiySpd34DWZJkGEiSDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CSxIV5hLVWoM1D+utux++8aSjvK6k3HhlIkgwDSZJhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJIkBwiDJpiSPJHk2yTNJPtjqlyc5luT59npZqyfJx5NMJXkyyZVd29rV2j+fZNfgH0uS1ItBjgxmgH1VtQ24BtibZBuwH3i4qrYCD7d5gBuAre1nD/BJmA0P4ABwNXAVcOBsgEiSlkffYVBVp6rqa236r4DngA3ATuBwa3YYuLlN7wQ+XbMeBS5NcgVwPXCsqs5U1SvAMWBHv/2SJPVuSR5Ul2Qz8NPAY8B4VZ1qi14Cxtv0BuDFrtVOtNpi9YXeZw+zRxWMj48zPT1Np9Ppub/7ts/0vM5qMX7Jyvp8/fx+llq/+8la57jMN8pjMnAYJBkDfg/4lar6TpL/v6yqKkkN+h5d2zsIHASYmJiosbExJicne97ObUN6gudy2Ld9hrueWjkPoz1+6+Swu0Cn0+lrP1nrHJf5RnlMBrqbKMmPMRsEn6mqz7fyy+30D+31dKufBDZ1rb6x1RarS5KWySB3EwW4B3iuqn6za9ER4OwdQbuAB7rq72t3FV0DvNZOJx0FrktyWbtwfF2rSZKWySDnE94J/BLwVJInWu3XgTuB+5LsBr4FvKctewi4EZgCvge8H6CqziT5MPB4a/ehqjozQL8kST3qOwyq6k+ALLL42gXaF7B3kW0dAg712xdJ0mD8BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkliiR1hLi9k8xCfEHr/zpqG9t7TaeGQgSTIMJEmGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCZ9NpDXs7HOR9m2f4bZlfEaSz0TSauSRgSTJMJAkraAwSLIjyTeSTCXZP+z+SNIoWRHXDJJcBPw28C+AE8DjSY5U1bPD7ZnUO/+Gg1ajlXJkcBUwVVUvVNUPgHuBnUPukySNjFTVsPtAkncDO6rqX7X5XwKurqoPzGm3B9jTZt8KfBv4y+Xs6yqwHsdkLsdkYY7LfGt9TP4SoKp2zF2wIk4Tna+qOggcPDuf5CtVNTHELq04jsl8jsnCHJf5RnlMVsppopPApq75ja0mSVoGKyUMHge2JtmS5GLgFuDIkPskSSNjRZwmqqqZJB8AjgIXAYeq6pnzWPXguZuMHMdkPsdkYY7LfCM7JiviArIkabhWymkiSdIQGQaSpNUbBj6+Yr4kx5M8leSJJF8Zdn+GIcmhJKeTPN1VuzzJsSTPt9fLhtnH5bbImPynJCfbvvJEkhuH2cfllmRTkkeSPJvkmSQfbPWR3VdWZRh0Pb7iBmAb8N4k24bbqxXjn1XVO0b1XmngU8DcL9TsBx6uqq3Aw21+lHyK+WMCcHfbV95RVQ8tc5+GbQbYV1XbgGuAve3fkJHdV1ZlGODjK7SIqvoScG
ZOeSdwuE0fBm5e1k4N2SJjMtKq6lRVfa1N/xXwHLCBEd5XVmsYbABe7Jo/0WqjroAvJPlqe3SHZo1X1ak2/RIwPszOrCAfSPJkO400MqdD5kqyGfhp4DFGeF9ZrWGghf1MVV3J7OmzvUl+dtgdWmlq9l5q76eGTwL/EHgHcAq4a7jdGY4kY8DvAb9SVd/pXjZq+8pqDQMfX7GAqjrZXk8Dv8/s6TTBy0muAGivp4fcn6Grqper6odV9TfAf2ME95UkP8ZsEHymqj7fyiO7r6zWMPDxFXMkeX2SN5ydBq4Dnv7Ra42MI8CuNr0LeGCIfVkRzv6D1/wCI7avJAlwD/BcVf1m16KR3VdW7TeQ261wv8XfPr7ijiF3aaiS/ANmjwZg9jEj/2MUxyTJZ4FJZh9F/DJwAPgD4D7g7wPfAt5TVSNzQXWRMZlk9hRRAceBf9N1rnzNS/IzwB8DTwF/08q/zux1g5HcV1ZtGEiSls5qPU0kSVpChoEkyTCQJBkGkiQMA0kShoEkCcNAkgT8P56L13aQhVPEAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"disease_words_count = disease_names.str.split(' ').apply(len)\n",
"disease_words_count.hist();"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13.0"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"disease_words_count.quantile(0.999)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"ignored_disease_terms = {\n",
" 'rare',\n",
" 'c',\n",
" 'acquired',\n",
" 'localized'\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3456/3456 [00:05<00:00, 635.44it/s]\n"
]
}
],
"source": [
"mentioned_diseases = literature['abstract_clean'].str.lower().progress_apply(\n",
" matches_n_consecutive_words,\n",
" database=set(disease_names) - ignored_disease_terms,\n",
" consecutive_n=int(disease_words_count.quantile(0.999))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"domain_features['mentioned_diseases'] = mentioned_diseases\n",
"domain_features['mentioned_diseases_set'] = mentioned_diseases.apply(set)\n",
"domain_features['mentions_n_diseases'] = mentioned_diseases.apply(len)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"cancer 786\n",
"disease 722\n",
"carcinoma 132\n",
"inflammation 77\n",
"cardiovascular 68\n",
"diabetes 60\n",
"colorectal cancer 59\n",
"adenocarcinoma 53\n",
"hepatocellular carcinoma 47\n",
"glioblastoma 42\n",
"lung cancer 41\n",
"aging 37\n",
"injury 34\n",
"obesity 33\n",
"squamous cell carcinoma 31\n",
"ovarian cancer 28\n",
"lung adenocarcinoma 27\n",
"melanoma 26\n",
"gbm 24\n",
"inflammatory bowel disease 21\n",
"fatty liver disease 20\n",
"kidney disease 19\n",
"diabetes mellitus 19\n",
"chronic obstructive pulmonary disease 19\n",
"glioma 19\n",
"fibrosis 18\n",
"transplantation 18\n",
"glioblastoma multiforme 17\n",
"stroke 16\n",
"inherited 16\n",
"pregnancy 16\n",
"schizophrenia 14\n",
"rheumatoid arthritis 13\n",
"non-small cell lung cancer 12\n",
"non-alcoholic fatty liver disease 12\n",
"myeloid leukemia 12\n",
"lymphoma 10\n",
"acute myeloid leukemia 10\n",
"amyotrophic lateral sclerosis 10\n",
"lateral sclerosis 10\n",
"insulin resistance 10\n",
"pancreatic ductal adenocarcinoma 9\n",
"cardiovascular diseases 9\n",
"coronary artery disease 9\n",
"disease susceptibility 9\n",
"myocardial infarction 8\n",
"osteoarthritis 8\n",
"ulcerative colitis 8\n",
"inborn errors of metabolism 7\n",
"autism spectrum disorder 7\n",
"dtype: int64"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"detected_disease_occurrences = Series(domain_features['mentioned_diseases_set'].apply(list).sum())\n",
"detected_disease_occurrences.value_counts().head(50)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"15014"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clinical_findings = disease_names_df[disease_names_df.Category == 'Finding']['#DiseaseName'].str.lower()\n",
"len(clinical_findings.unique())"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD8CAYAAACVZ8iyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAVRklEQVR4nO3df6zddZ3n8edLOijqSIvO3iVts+2ujQZldPAGME4mV9mFgsbyh2MwZCluM/1jmBlnl0TLTHbJqiSYnQwj2ZFNVzrCxFgZxlkaYGS6lRuzyYCgID9F7mCVNmDVFtyuO85W3vvH+XTmeDnl3HvObS/n6/ORnNzv9/39fM/5vMOxr/P9cY6pKiRJv9hesdwTkCQtP8NAkmQYSJIMA0kShoEkCcNAksQCwiDJjiQHkjwyYNuVSSrJG9p6klyfZC7JQ0nO6hu7OcmT7bG5r/6OJA+3fa5PkqVqTpK0MAs5MvgcsHF+Mcla4Hzge33lC4EN7bEVuKGNPQ24GjgHOBu4Osmqts8NwG/17fei15IkHV9Dw6CqvgocHLDpOuCjQP+31jYBN1fPPcDKJKcDFwC7q+pgVR0CdgMb27bXVdU91fv2283AxeO1JElarBWj7JRkE7C/qr4576zOauDpvvV9rfZS9X0D6sd63a30jjg45ZRT3rF27dqhc33hhRd4xSu6c2mka/1A93rqWj/QvZ661g8svKdvf/vbP6yqX5lfX3QYJHk18Af0ThGdUFW1HdgOMD09Xffff//QfWZnZ5mZmTnOMztxutYPdK+nrvUD3eupa/3AwntK8t1B9VGi8V8B64FvJtkLrAG+keSfA/uB/o/ra1rtpeprBtQlSSfQosOgqh6uqn9WVeuqah29UztnVdWzwC7gsnZX0bnA81X1DHAXcH6SVe3C8fnAXW3bj5Oc2+4iugy4bYl6kyQt0EJuLf0C8LfAm5LsS7LlJYbfCTwFzAH/HfhtgKo6CHwCuK89Pt5qtDGfbfv8HfDXo7UiSRrV0GsGVfWhIdvX9S0XcMUxxu0Adgyo3w+8ddg8JEnHT7cup0uSRmIYSJIMA0mSYSBJwjCQJDHiz1FMunXb7liW19177XuX5XUlaRiPDCRJhoEkyTCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkiQWEQZIdSQ4keaSv9l+SfCvJQ0n+KsnKvm1XJZlL8kSSC/rqG1ttLsm2vvr6JPe2+heTnLyUDUqShlvIkcHngI3zaruBt1bVrwLfBq4CSHIGcAnwlrbPZ5KclOQk4E+BC4EzgA+1sQCfAq6rqjcCh4AtY3UkSVq0oWFQVV8FDs6r/U1VHWmr9wBr2vImYGdV/bSqvgPMAWe3x1xVPVVV/wDsBDYlCfAe4Na2/03AxWP2JElapKX4/0D+d8AX2/JqeuFw1L5WA3h6Xv0c4PXAc33B0j/+RZJsBbYCTE1NMTs7O3Ryhw8fftG4K888MnjwcbaQ+Q4zqJ9J17WeutYPdK+nrvUD4/c0Vhgk+UPgCPD5cZ5noapqO7AdYHp6umZmZobuMzs7y/xxl2+74zjMbri9l84MHTPMoH4mXdd66lo/0L2eutYPjN/TyGGQ5HLgfcB5VVWtvB9Y2zdsTatxjPqPgJVJVrSjg/7xkqQTZKRbS5NsBD4KvL+qftK3aRdwSZJXJlkPbAC+BtwHbGh3Dp1M7yLzrhYidwMfaPtvBm4brRVJ0qgWcmvpF4C/Bd6UZF+SLcB/BX4Z2J3kwST/DaCqHgVuAR4DvgxcUVU/a5/6fwe4C3gcuKWNBfgY8B+SzNG7hnDjknYoSRpq6GmiqvrQgPIx/8GuqmuAawbU7wTuHFB/it7dRpKkZeI3kCVJhoEkyTCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0
kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkFhAGSXYkOZDkkb7aaUl2J3my/V3V6klyfZK5JA8lOatvn81t/JNJNvfV35Hk4bbP9Umy1E1Kkl7aQo4MPgdsnFfbBuypqg3AnrYOcCGwoT22AjdALzyAq4FzgLOBq48GSBvzW337zX8tSdJxNjQMquqrwMF55U3ATW35JuDivvrN1XMPsDLJ6cAFwO6qOlhVh4DdwMa27XVVdU9VFXBz33NJkk6QUa8ZTFXVM235WWCqLa8Gnu4bt6/VXqq+b0BdknQCrRj3CaqqktRSTGaYJFvpnX5iamqK2dnZofscPnz4ReOuPPPIcZjdcAuZ7zCD+pl0Xeupa/1A93rqWj8wfk+jhsH3k5xeVc+0Uz0HWn0/sLZv3JpW2w/MzKvPtvqaAeMHqqrtwHaA6enpmpmZOdbQfzQ7O8v8cZdvu2PofsfD3ktnho4ZZlA/k65rPXWtH+heT13rB8bvadTTRLuAo3cEbQZu66tf1u4qOhd4vp1Ougs4P8mqduH4fOCutu3HSc5tdxFd1vdckqQTZOiRQZIv0PtU/4Yk++jdFXQtcEuSLcB3gQ+24XcCFwFzwE+ADwNU1cEknwDua+M+XlVHL0r/Nr07lk4B/ro9JEkn0NAwqKoPHWPTeQPGFnDFMZ5nB7BjQP1+4K3D5iFJOn78BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CSxJhhkOTfJ3k0ySNJvpDkVUnWJ7k3yVySLyY5uY19ZVufa9vX9T3PVa3+RJILxmtJkrRYI4dBktXA7wHTVfVW4CTgEuBTwHVV9UbgELCl7bIFONTq17VxJDmj7fcWYCPwmSQnjTovSdLijXuaaAVwSpIVwKuBZ4D3ALe27TcBF7flTW2dtv28JGn1nVX106r6DjAHnD3mvCRJizByGFTVfuCPgO/RC4Hnga8Dz1XVkTZsH7C6La8Gnm77HmnjX99fH7CPJOkEWDHqjklW0ftUvx54DvgLeqd5jpskW4GtAFNTU8zOzg7d5/Dhwy8ad+WZRwYPPs4WMt9hBvUz6brWU9f6ge711LV+YPyeRg4D4F8D36mqHwAk+RLwLmBlkhXt0/8aYH8bvx9YC+xrp5VOBX7UVz+qf5+fU1Xbge0A09PTNTMzM3SSs7OzzB93+bY7FtTgUtt76czQMcMM6mfSda2nrvUD3eupa/3A+D2Nc83ge8C5SV7dzv2fBzwG3A18oI3ZDNzWlne1ddr2r1RVtfol7W6j9cAG4GtjzEuStEgjHxlU1b1JbgW+ARwBHqD3qf0OYGeST7bajW2XG4E/TzIHHKR3BxFV9WiSW+gFyRHgiqr62ajzkiQt3jiniaiqq4Gr55WfYsDdQFX198BvHuN5rgGuGWcukqTR+Q1kSZJhIEkyDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJIkxwyDJyiS3JvlWkseTvDPJaUl2J3my/V3VxibJ9UnmkjyU5Ky+59ncxj+ZZPO4TUmSFmfcI4NPA1+uqjcDbwMeB7YBe6pqA7CnrQNcCGxoj63ADQBJTgOuBs4BzgauPhogkqQTY+QwSHIq8BvAjQBV9Q9V9RywCbipDbsJuLgtbwJurp57gJVJTgcuAHZX1cGqOgTsBjaOOi9J0uKNc2SwHvgB8GdJHkjy2SSvAaaq6pk25llgqi2vBp7u239fqx2rLkk6QVaMue9ZwO9W1b1JPs0/nRICoKoqSY0zwX5JttI7xcTU1BSzs7ND9zl8+PCLxl155pGlmtKiLGS+wwzqZ9J1raeu9QPd66lr/cD4PY0TBvuAfVV1b1u/lV4YfD/J6VX1TDsNdKBt3w
+s7dt/TavtB2bm1WcHvWBVbQe2A0xPT9fMzMygYT9ndnaW+eMu33bH0P2Oh72XzgwdM8ygfiZd13rqWj/QvZ661g+M39PIp4mq6lng6SRvaqXzgMeAXcDRO4I2A7e15V3AZe2uonOB59vppLuA85OsaheOz281SdIJMs6RAcDvAp9PcjLwFPBhegFzS5ItwHeBD7axdwIXAXPAT9pYqupgkk8A97VxH6+qg2POS5K0CGOFQVU9CEwP2HTegLEFXHGM59kB7BhnLpKk0fkNZEmSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSSJJQiDJCcleSDJ7W19fZJ7k8wl+WKSk1v9lW19rm1f1/ccV7X6E0kuGHdOkqTFWYojg48Aj/etfwq4rqreCBwCtrT6FuBQq1/XxpHkDOAS4C3ARuAzSU5agnlJkhZorDBIsgZ4L/DZth7gPcCtbchNwMVteVNbp20/r43fBOysqp9W1XeAOeDsceYlSVqccY8M/gT4KPBCW3898FxVHWnr+4DVbXk18DRA2/58G/+P9QH7SJJOgBWj7pjkfcCBqvp6kpmlm9JLvuZWYCvA1NQUs7OzQ/c5fPjwi8ZdeeaRwYOPs4XMd5hB/Uy6rvXUtX6gez11rR8Yv6eRwwB4F/D+JBcBrwJeB3waWJlkRfv0vwbY38bvB9YC+5KsAE4FftRXP6p/n59TVduB7QDT09M1MzMzdJKzs7PMH3f5tjsW1OBS23vpzNAxwwzqZ9J1raeu9QPd66lr/cD4PY18mqiqrqqqNVW1jt4F4K9U1aXA3cAH2rDNwG1teVdbp23/SlVVq1/S7jZaD2wAvjbqvCRJizfOkcGxfAzYmeSTwAPAja1+I/DnSeaAg/QChKp6NMktwGPAEeCKqvrZcZiXJOkYliQMqmoWmG3LTzHgbqCq+nvgN4+x/zXANUsxF0nS4vkNZEmSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CSxBhhkGRtkruTPJbk0SQfafXTkuxO8mT7u6rVk+T6JHNJHkpyVt9zbW7jn0yyefy2JEmLMc6RwRHgyqo6AzgXuCLJGcA2YE9VbQD2tHWAC4EN7bEVuAF64QFcDZwDnA1cfTRAJEknxshhUFXPVNU32vL/Bh4HVgObgJvasJuAi9vyJuDm6rkHWJnkdOACYHdVHayqQ8BuYOOo85IkLV6qavwnSdYBXwXeCnyvqla2eoBDVbUyye3AtVX1v9q2PcDHgBngVVX1yVb/j8D/rao/GvA6W+kdVTA1NfWOnTt3Dp3b4cOHee1rX/tztYf3Pz9Sn+M6c/WpYz/HoH4mXdd66lo/0L2eutYPLLynd7/73V+vqun59RXjTiDJa4G/BH6/qn7c+/e/p6oqyfhp80/Ptx3YDjA9PV0zMzND95mdnWX+uMu33bFUU1qUvZfODB0zzKB+Jl3XeupaP9C9nrrWD4zf01h3EyX5JXpB8Pmq+lIrf7+d/qH9PdDq+4G1fbuvabVj1SVJJ8g4dxMFuBF4vKr+uG/TLuDoHUGbgdv66pe1u4rOBZ6vqmeAu4Dzk6xqF47PbzVJ0gkyzmmidwH/Fng4yYOt9gfAtcAtSbYA3wU+2LbdCVwEzAE/AT4MUFUHk3wCuK+N+3hVHRxjXpKkRRo5DNqF4Bxj83kDxhdwxTGeawewY9S5TIp1S3Ct4sozj4x0zWPvte8d+7UldZffQJYkGQaSJMNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSeBmFQZKNSZ5IMpdk23LPR5
J+kbwswiDJScCfAhcCZwAfSnLG8s5Kkn5xrFjuCTRnA3NV9RRAkp3AJuCxZZ1Vh6zbdseyvO7ea9+7LK8raXFeLmGwGni6b30fcM78QUm2Alvb6uEkTyzgud8A/HDsGb5M/N6E9ZNPLWjYRPW0AF3rB7rXU9f6gYX39C8GFV8uYbAgVbUd2L6YfZLcX1XTx2lKJ1zX+oHu9dS1fqB7PXWtHxi/p5fFNQNgP7C2b31Nq0mSToCXSxjcB2xIsj7JycAlwK5lnpMk/cJ4WZwmqqojSX4HuAs4CdhRVY8u0dMv6rTSBOhaP9C9nrrWD3Svp671A2P2lKpaqolIkibUy+U0kSRpGRkGkqTuhkEXft4iyY4kB5I80lc7LcnuJE+2v6uWc46LkWRtkruTPJbk0SQfafVJ7ulVSb6W5Jutp//c6uuT3Nvef19sN0ZMjCQnJXkgye1tfdL72Zvk4SQPJrm/1Sb5fbcyya1JvpXk8STvHLefToZBh37e4nPAxnm1bcCeqtoA7Gnrk+IIcGVVnQGcC1zR/rtMck8/Bd5TVW8D3g5sTHIu8Cnguqp6I3AI2LKMcxzFR4DH+9YnvR+Ad1fV2/vuxZ/k992ngS9X1ZuBt9H7bzVeP1XVuQfwTuCuvvWrgKuWe14j9rIOeKRv/Qng9LZ8OvDEcs9xjN5uA/5NV3oCXg18g963538IrGj1n3s/vtwf9L7nswd4D3A7kEnup815L/CGebWJfN8BpwLfod0AtFT9dPLIgME/b7F6meay1Kaq6pm2/CwwtZyTGVWSdcCvAfcy4T21UyoPAgeA3cDfAc9V1ZE2ZNLef38CfBR4oa2/nsnuB6CAv0ny9fazNjC577v1wA+AP2un8j6b5DWM2U9Xw+AXQvU+AkzcvcFJXgv8JfD7VfXj/m2T2FNV/ayq3k7vE/XZwJuXeUojS/I+4EBVfX2557LEfr2qzqJ36viKJL/Rv3HC3ncrgLOAG6rq14D/w7xTQqP009Uw6PLPW3w/yekA7e+BZZ7PoiT5JXpB8Pmq+lIrT3RPR1XVc8Dd9E6jrExy9Eudk/T+exfw/iR7gZ30ThV9msntB4Cq2t/+HgD+il5oT+r7bh+wr6rubeu30guHsfrpahh0+ectdgGb2/JmeufdJ0KSADcCj1fVH/dtmuSefiXJyrZ8Cr1rII/TC4UPtGET01NVXVVVa6pqHb3/3Xylqi5lQvsBSPKaJL98dBk4H3iECX3fVdWzwNNJ3tRK59H7uf/x+lnuiyHH8SLLRcC36Z2//cPlns+IPXwBeAb4f/Q+DWyhd/52D/Ak8D+B05Z7novo59fpHbo+BDzYHhdNeE+/CjzQenoE+E+t/i+BrwFzwF8Ar1zuuY7Q2wxw+6T30+b+zfZ49Oi/BxP+vns7cH973/0PYNW4/fhzFJKkzp4mkiQtgmEgSTIMJEmGgSQJw0CShGEgScIwkCQB/x/uAjas9eGwgAAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Number of space-separated words in each clinical finding term; a high quantile of this\n",
"# distribution is used further down as the word-window size when matching terms in abstracts.\n",
"clinical_findings_words_count = clinical_findings.str.split(' ').map(len)\n",
"clinical_findings_words_count.hist();"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"ignored_findings_terms = {\n",
" 'none',\n",
" 'add',\n",
" '5)',\n",
" 'id',\n",
" 'delay',\n",
" 'diagnostic',\n",
" 'healthy',\n",
" 'affected',\n",
" 'array',\n",
" 'falls',\n",
" 'obligate',\n",
" # quantifiers\n",
" 'all',\n",
" 'mild',\n",
" 'frequent',\n",
" 'progressive',\n",
" 'frequency',\n",
" 'mitochondrial',\n",
" 'symptomatic'\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3456/3456 [00:11<00:00, 308.02it/s]\n"
]
}
],
"source": [
"mentioned_findings = literature['abstract_clean'].str.lower().progress_apply(\n",
" matches_n_consecutive_words,\n",
" database=set(clinical_findings) - ignored_findings_terms,\n",
" consecutive_n=int(clinical_findings_words_count.quantile(0.999))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"breast cancer 118\n",
"asthma 30\n",
"infections 27\n",
"leukemia 27\n",
"prostate cancer 27\n",
"psychiatric 19\n",
"plethora 19\n",
"poor prognosis 17\n",
"colitis 17\n",
"dissection 15\n",
"arthritis 15\n",
"colon cancer 15\n",
"hypoxia 14\n",
"shock 14\n",
"atherosclerosis 13\n",
"bladder cancer 13\n",
"asymptomatic 12\n",
"triple-negative breast cancer 12\n",
"heart failure 11\n",
"hepatitis 11\n",
"autism spectrum 11\n",
"autism 11\n",
"streptococcus 11\n",
"crohn's disease 11\n",
"cholangiocarcinoma 8\n",
"sepsis 8\n",
"pain 8\n",
"heart disease 8\n",
"allergy 7\n",
"chronic kidney disease 7\n",
"osteoporosis 7\n",
"dementia 7\n",
"weight loss 6\n",
"other cancer 6\n",
"anxiety 6\n",
"bipolar 6\n",
"neurodegeneration 5\n",
"nephropathy 5\n",
"immunodeficiency 5\n",
"periodontitis 5\n",
"asd 5\n",
"unaffected 5\n",
"neurodegenerative disorder 5\n",
"metabolic phenotype 4\n",
"overweight 4\n",
"diarrhea 4\n",
"renal disease 4\n",
"disease-free survival 4\n",
"retinopathy 3\n",
"chronic hepatitis 3\n",
"dtype: int64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Store the raw matches, their de-duplicated form and the number of (raw) matches per article.\n",
"domain_features['mentioned_clinical_findings'] = mentioned_findings\n",
"domain_features['mentioned_clinical_findings_set'] = mentioned_findings.apply(set)\n",
"domain_features['mentions_n_clinical_findings'] = mentioned_findings.apply(len)\n",
"# One entry per (article, distinct finding) pair. explode() flattens in linear time, whereas\n",
"# summing the per-article lists re-copies the accumulated list for every article (quadratic).\n",
"mentioned_findings_occurrences = (\n",
"    domain_features['mentioned_clinical_findings_set']\n",
"    .apply(list)\n",
"    .explode()\n",
"    .dropna()  # an article without any match explodes into a single NaN row\n",
"    .reset_index(drop=True)  # same fresh 0..n-1 index as the previous Series(list) construction\n",
")\n",
"# Number of articles mentioning each finding (top 50)\n",
"mentioned_findings_occurrences.value_counts().head(50)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Species"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>unique_name</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>root</td>\n",
" <td>NaN</td>\n",
" <td>scientific name</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Bacteria</td>\n",
" <td>Bacteria <bacteria></td>\n",
" <td>scientific name</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>bacteria</td>\n",
" <td>NaN</td>\n",
" <td>blast name</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>eubacteria</td>\n",
" <td>NaN</td>\n",
" <td>genbank common name</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Monera</td>\n",
" <td>Monera <bacteria></td>\n",
" <td>in-part</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3234121</th>\n",
" <td>Lebrunia neglecta</td>\n",
" <td>NaN</td>\n",
" <td>synonym</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3234122</th>\n",
" <td>Lilyopsis medusa (Metschnikoff & Metschnikoff, 1871)</td>\n",
" <td>NaN</td>\n",
" <td>authority</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3234123</th>\n",
" <td>Lilyopsis medusa</td>\n",
" <td>NaN</td>\n",
" <td>scientific name</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3234124</th>\n",
" <td>Lilyopsis rosea</td>\n",
" <td>NaN</td>\n",
" <td>synonym</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3234125</th>\n",
" <td>Hydrothelphusinae</td>\n",
" <td>NaN</td>\n",
" <td>scientific name</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3234126 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" name \\\n",
"0 root \n",
"1 Bacteria \n",
"2 bacteria \n",
"3 eubacteria \n",
"4 Monera \n",
"... ... \n",
"3234121 Lebrunia neglecta \n",
"3234122 Lilyopsis medusa (Metschnikoff & Metschnikoff, 1871) \n",
"3234123 Lilyopsis medusa \n",
"3234124 Lilyopsis rosea \n",
"3234125 Hydrothelphusinae \n",
"\n",
" unique_name type \n",
"0 NaN scientific name \n",
"1 Bacteria <bacteria> scientific name \n",
"2 NaN blast name \n",
"3 NaN genbank common name \n",
"4 Monera <bacteria> in-part \n",
"... ... ... \n",
"3234121 NaN synonym \n",
"3234122 NaN authority \n",
"3234123 NaN scientific name \n",
"3234124 NaN synonym \n",
"3234125 NaN scientific name \n",
"\n",
"[3234126 rows x 3 columns]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"species = read_table('data/taxdump/names.dmp', usecols=[2, 4, 6])\n",
"species.columns = ['name', 'unique_name', 'type']\n",
"species"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"index\n",
"scientific name 2260403\n",
"authority 503986\n",
"synonym 169589\n",
"type material 149465\n",
"includes 53463\n",
"equivalent name 49097\n",
"genbank common name 29623\n",
"common name 14493\n",
"acronym 1197\n",
"genbank synonym 1110\n",
"in-part 535\n",
"genbank acronym 483\n",
"anamorph 284\n",
"blast name 229\n",
"teleomorph 169\n",
"Name: type, dtype: int64"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"species.type.sorted_value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>unique_name</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>ATCC 39565</td>\n",
" <td>ATCC 39565 <type strain></td>\n",
" <td>type material</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>ATCC BAA-642 [[Shewanella affinis Ivanova et al. 2004]]</td>\n",
" <td>ATCC BAA-642 [[Shewanella affinis Ivanova et al. 2004]] <type strain></td>\n",
" <td>type material</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>CIP 107703 [[Shewanella affinis Ivanova et al. 2004]]</td>\n",
" <td>CIP 107703 [[Shewanella affinis Ivanova et al. 2004]] <type strain></td>\n",
" <td>type material</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>KMM 3587 [[Shewanella affinis Ivanova et al. 2004]]</td>\n",
" <td>KMM 3587 [[Shewanella affinis Ivanova et al. 2004]] <type strain></td>\n",
" <td>type material</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>strain LST-W</td>\n",
" <td>strain LST-W <type strain></td>\n",
" <td>type material</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name \\\n",
"89 ATCC 39565 \n",
"90 ATCC BAA-642 [[Shewanella affinis Ivanova et al. 2004]] \n",
"91 CIP 107703 [[Shewanella affinis Ivanova et al. 2004]] \n",
"92 KMM 3587 [[Shewanella affinis Ivanova et al. 2004]] \n",
"97 strain LST-W \n",
"\n",
" unique_name \\\n",
"89 ATCC 39565 <type strain> \n",
"90 ATCC BAA-642 [[Shewanella affinis Ivanova et al. 2004]] <type strain> \n",
"91 CIP 107703 [[Shewanella affinis Ivanova et al. 2004]] <type strain> \n",
"92 KMM 3587 [[Shewanella affinis Ivanova et al. 2004]] <type strain> \n",
"97 strain LST-W <type strain> \n",
"\n",
" type \n",
"89 type material \n",
"90 type material \n",
"91 type material \n",
"92 type material \n",
"97 type material "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at the last few type-material records among the first 100 rows\n",
"head = species.head(100)\n",
"head[head['type'] == 'type material'].tail(5)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2572241"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Lower-cased names for case-insensitive matching; the 'root' entry as well as the\n",
"# 'authority' and 'type material' records are left out.\n",
"species_names = set(\n",
"    species.loc[\n",
"        species['name'].ne('root') & ~species['type'].isin(['authority', 'type material']),\n",
"        'name'\n",
"    ].str.lower()\n",
")\n",
"len(species_names)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"148755"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Type material identifiers, kept in their original case (no lower-casing here)\n",
"species_type_materials = set(\n",
"    species.loc[species['type'] == 'type material', 'name']\n",
"    # keep only the identifier that precedes the ' [[citation]]' suffix\n",
"    .str.split(r' \\[\\[').str[0]\n",
")\n",
"len(species_type_materials)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: both a common and a scientific name survive the filtering and lower-casing\n",
"assert all(expected in species_names for expected in ('human', 'homo sapiens'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Manual species curation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: type materials have to be matched case-sensitively; for example, the type material \"AND\" would otherwise match the word \"and\":"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>unique_name</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1871628</th>\n",
" <td>AND</td>\n",
" <td>AND <type material></td>\n",
" <td>type material</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name unique_name type\n",
"1871628 AND AND <type material> type material"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"species[species.name.str.lower() == 'and']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Cancer is a proper genus, but we do not want to match it as it would match cancer as a disease:"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>unique_name</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>31637</th>\n",
" <td>Cancer</td>\n",
" <td>NaN</td>\n",
" <td>scientific name</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name unique_name type\n",
"31637 Cancer NaN scientific name"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"species[species.name.str.lower() == 'cancer']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"None of the articles appears to refer to a fish when they use \"spot\" either:"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>unique_name</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>123457</th>\n",
" <td>spot</td>\n",
" <td>NaN</td>\n",
" <td>common name</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name unique_name type\n",
"123457 spot NaN common name"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"species[species.name.str.lower() == 'spot']"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"uid\n",
"21831800 genomics level, exome sequencing has been the hot spot of the recent research. However, the pred...\n",
"25345010 riptomics and proteomics, is becoming the new hot spot of life science. Although the fast output...\n",
"31162831 Target spot is a newly emerging citrus disease caused by \n",
"31616468 ean or median of the pixel intensities within the spot and then subjected to a within-slide norm...\n",
"Name: abstract_clean, dtype: object"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check_usage_in_abstracts('spot')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Python is used as a programming language:"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"uid\n",
"23927696 available software.\\n\\n\\nHere, we describe COBRA for Python (COBRApy), a Python package that pro...\n",
"25637560 _pipe). Omics Pipe is distributed as a standalone Python package for installation (https://pypi....\n",
"26072472 D IMPLEMENTATION\\nMSProGene is written in Java and Python. It is open source and available at ht...\n",
"27986083 lated within a user-friendly implementation using Python and Docker. IMP is available at http://...\n",
"28640810 nted as a general-purpose toolbox using the PyGMO Python package to make the most of multicore c...\n",
"29746212 umber of examined species were identified using a Python script. Multiple genomic alignments of ...\n",
"30221093 lack of adequate data analysis training (e.g., R, Python) as a main challenge, in addition to no...\n",
"30596886 http://b2slab.upc.edu/software-and-tutorials/ and Python package: https://pypi.python.org/pypi/m...\n",
"30668675 //rgv.genouest.org. The website is implemented in Python, JavaScript and MongoDB, and is compati...\n",
"30825303 esults and a reference implementation of dgMDL in Python is available on https://github.com/luop...\n",
"30944779 ps://github.com/Candihub/pixel). It is written in Python using the Django framework and stores a...\n",
"31092193 ogical abstractions.\\n\\n\\nHere, we present PathMe, a Python package that transforms pathway know...\n",
"31152171 confounders. DMRs were identified using comb-p in Python. Results were validated in the Rotterda...\n",
"32070398 e analysis software is available as the protaccel Python package.\n",
"Name: abstract_clean, dtype: object"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check_usage_in_abstracts('Python')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"California is used as a place name:"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"uid\n",
"21194970 troprusside (SNP). QuantiGene (Panomics, Fremont, California) branched DNA (bDNA) assay was used...\n",
"28443069 . At \"Long-term Soil Productivity Study\" sites in California dominated by Ponderosa pine, we tes...\n",
"29339647 NN) held its 11th annual Congress in Los Angeles, California, between September 16 and 19, 2017....\n",
"29688803 dosis related omics evidence in the University of California Santa Cruz (UCSC) Genome Browser. F...\n",
"31299210 nd PCBs, in a women cohort with sera collected in California in the 1960s. Strikingly, this anal...\n",
"31825540 n,\" was held 20 to 23 June, 2019, in Los Angeles, California.\\n\\n\\nThe CHPCA Meeting is an annua...\n",
"Name: abstract_clean, dtype: object"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check_usage_in_abstracts('California')"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"uid\n",
"16077939 Rat Sfrp1 (314 aa) consisted of a signal peptide (codon 1-31), Frizzled domain with ten conserve...\n",
"17611704 lineage specific T to G nucleotide change at stop codon of chimpanzee, rat, and mouse HES3 ortho...\n",
"26482106 ion to a changing environment: pathway structure, codon usage, metabolism. To measure adaptabili...\n",
"26850284 s data and subsets thereof to establish reference codon usage biases for codon optimization in s...\n",
"27633273 gnments, conservation and variation, CpG islands, codon context, usage bias and phylogenetic inf...\n",
"29222764 mplemented in MATLAB. METRADE uses microarray and codon usage data to model bacterial metabolic ...\n",
"30066640 collection of gene expression and sequence data, codon usage and protein abundances) to analyse...\n",
"31046701 p.Arg5688*), predicted to create a premature stop codon near the N-terminus of ADGRV1. Ophthalmo...\n",
"31071195 t Saccharomycopsis yeasts have reassigned the CTG codon and translate CTG into serine instead of...\n",
"32041497 417C→A variant introduces a premature termination codon (Y139X). We found reduced CHK2 protein e...\n",
"32694125 of protein-coding genes, which included 44 start codon changes and 42 added proteins. We develo...\n",
"Name: abstract_clean, dtype: object"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check_usage_in_abstracts('codon')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"uid\n",
"23883549 y butanol treatment. Notably, 3-phosphoglycerate, glycine, serine and urea related to general st...\n",
"27228119 esults suggested reprogrammed pathways related to glycine metabolism, fatty acids metabolism and...\n",
"27589727 in \"phosphoglycerate mutase 2\" and the metabolite glycine. We hypothesize that association studi...\n",
"27879288 RJ, CD44, and C4A, as well as increased levels of glycine and asparagine, and decreased levels o...\n",
"29325019 G sites, i.e. of glycerophospholipid PC(O-36: 5), glycine and a very low-density lipoprotein (VL...\n",
"29859941 fermentation pathways of the amino acids proline, glycine and leucine. But also a far-reaching r...\n",
"29930756 gnificantly altered glutamine, glucose and serine/glycine metabolism. The predominant upregulate...\n",
"30086113 tes (uridine, C-glycosyl tryptophan, and N-acetyl glycine) were statistically independent and th...\n",
"31029960 red metabolites, the metabolic pathway containing glycine, serine and threonine was the most sig...\n",
"31104335 ar metabolites. Follow-up analyses suggested that glycine mediates the relationship between carb...\n",
"31717805 o amino acid metabolism, particularly alanine and glycine metabolism, were affected in the liver...\n",
"31733966 al changes including aminoacyl-tRNA biosynthesis, glycine, serine and threonine metabolism, nitr...\n",
"32010956 e acids, together with the respective taurine and glycine conjugates, were quantified through ul...\n",
"32245432 e predicted four amino acids (alanine, glutamate, glycine and aspartate) as the limited precurso...\n",
"32277923 in particular, a significantly increased level of glycine N-methyltransferase and increased leve...\n",
"32398126 t the host's metabolism of amino acids (including glycine, serine, threonine, alanine, aspartate...\n",
"32413334 val. We identified a potential involvement of the glycine-serine-threonine metabolic axis in lon...\n",
"32566075 mino acid, aromatic amino acid, beta-alanine, and glycine, serine and threonine metabolism, lipi...\n",
"32616808 s that are uniquely associated with dry, wet, and glycine-amended conditions. A subsequent gene ...\n",
"32655010 igher concentrations of serum amyloid A (SAA) and glycine, and lower concentrations of sphingomy...\n",
"Name: abstract_clean, dtype: object"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check_usage_in_abstracts('glycine')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"uid\n",
"22960126 ta thalassemia (Het-βThal). Het-βThal causes mild anemia and is known to determine a pro-oxidant...\n",
"26948278 oorly understood. Moreover, current management of anemia in ESRD is controversial due to the \"an...\n",
"27136057 at risk for a drug side effect (ribavirin-induced anemia) and how genetic variation (inosine tri...\n",
"29590102 cal phenotypes such as red blood cell parameters, anemia, and chronic kidney disease (CKD). In a...\n",
"Name: abstract_clean, dtype: object"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check_usage_in_abstracts('anemia')"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"uid\n",
"16142419 placenta, and uterus. FGF22 mRNA was expressed in hippocampus and ovarian fibrotheoma. FGF7 prom...\n",
"27731396 files of PLF and PSF groups were acquired for the hippocampus and plasma to identify molecular p...\n",
"28300637 tabolite assignments from three ROIs (cerebellum, hippocampus and midbrain/hypothalamus) in posi...\n",
"30291623 filed the transcriptome and proteome of the mouse hippocampus during early stages of disease dev...\n",
"31942070 The hippocampus is an important part of the limbic sys\n",
"32170004 essed in liver, brain cerebellum, frontal cortex, hippocampus and pituitary. Our studies provide...\n",
"32234596 significantly increased the levels of BDNF in the hippocampus. Cecum contents metabolomics revea...\n",
"Name: abstract_clean, dtype: object"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check_usage_in_abstracts('hippocampus')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Mixed usage: either referring to species or used with another meaning:"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"uid\n",
"26421150 with asthma. The outcome of interest was a binary indicator of asthma control defined by the use...\n",
"26444894 occurring within the warfighter as a preclinical indicator. Current methods for determining tox...\n",
"26870025 rRNA gene amplicon sequencing and methods such as indicator species analysis, the Kolmogorov-Smi...\n",
"28335448 . The multi-omics data, in combination with fecal indicator bacteria (FIB) counts, trace metal c...\n",
"29222764 sion, metabolism is increasingly being used as an indicator of the phenotypic outcome for drugs ...\n",
"29844878 non-coding RNA, which is known to be a prognostic indicator for breast cancer and stimulated by ...\n",
"30189879 e the ability to passively monitor behavior as an indicator of socialization and mood; accumulat...\n",
"31054440 Ambient fine particle is a crucial indicator of air pollution brought into the air by\n",
"31258819 sing its human development index (HDI): a summary indicator of health, education, and income. Th...\n",
"32156745 dies. sCDH3 is a survival predictor and real-time indicator of treatment efficacy in patients wi...\n",
"32368297 prognostic signature was an effective prognostic indicator in 9,122 patients across 30 types of...\n",
"Name: abstract_clean, dtype: object"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"check_usage_in_abstracts('indicator')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"However, the majority of \"indicator\" matches would be false positives, so the term is excluded; \"fecal indicator bacteria\" will match on \"bacteria\" anyway."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Numeric-only records are too vague and will be excluded to avoid false positives:"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>unique_name</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>905354</th>\n",
" <td>382</td>\n",
" <td>382 <type material></td>\n",
" <td>type material</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name unique_name type\n",
"905354 382 382 <type material> type material"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"species[species.name == '382']"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"146"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Names made up of numeric characters only (such as '382'), collected for exclusion\n",
"numeric_species_terms = set(filter(str.isnumeric, species['name']))\n",
"len(numeric_species_terms)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some other species names are excluded as well:"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>unique_name</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2217657</th>\n",
" <td>collection</td>\n",
" <td>NaN</td>\n",
" <td>scientific name</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name unique_name type\n",
"2217657 collection NaN scientific name"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"species[species.name.str.lower() == 'collection']"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"ignored_species_terms = {\n",
" 'cancer',\n",
" 'other',\n",
" 'collection',\n",
" 'synthetic',\n",
" 'artificial',\n",
" 'unknown',\n",
" 'none',\n",
" 'unidentified',\n",
" 'hybrid',\n",
" 'clinical samples',\n",
" 'vectors',\n",
" 'spot',\n",
" 'this',\n",
" 'data',\n",
" 'major',\n",
" 'colon',\n",
" 'cnv', # copy number variation\n",
" 'goes',\n",
" 'automate',\n",
" 'axis',\n",
" 'idea',\n",
" 'laser',\n",
" 'beta',\n",
" 'areas',\n",
" 'electron',\n",
" 'bias',\n",
" 'python',\n",
" 'cis', # chemistry prefix\n",
" 'insertion',\n",
" 'aa', # amino acid\n",
" 'california',\n",
" 'codon', # in sequence\n",
" 'glycine', # amino acid\n",
" 'anemia', # disease\n",
" 'hippocampus',\n",
" 'meta',\n",
" 'indicator'\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some names are just long:"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4788 cloning vector pb-degron-tvmvs-mcp-cnot7-e2a-mcp-cnot7-tevs-degron-p2a-ntevp-yap-t2a-14-3-3sigma...\n",
"20015 cloning vector pb-6-xnor-degron-tvmvs-mcp-tevs-cnot7-e2a-mcp-tvmvs-cnot7-tevs-degron-f2a-ntevp-f...\n",
"175405 cloning vector pb-5-xor-degron-tvmvs-mcp-cnot7-tevs-degron-e2a-mcp-tevs-tvmvs-cnot7-f2a-ntevp-fr...\n",
"202108 (megalobrama amblycephala x parabramis pekinensis) x (megalobrama amblycephala x parabramis peki...\n",
"256597 metschnikowia pimensis (s.o. suh, c.m. gibson & m. blackwell) c.p. kurtzman, c.j. robnett & e. b...\n",
"dtype: object"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# All (lower-cased) species names as a Series, to allow vectorised length checks\n",
"x = Series(list(species_names))\n",
"x[x.str.len().gt(100)].head()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEDCAYAAAAlRP8qAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAQtklEQVR4nO3cf6zdd13H8efLrUOzOzaxeFm2YQGHhGw61+tAIeSWBDMGYWom2TKBEbBImIBhhsIfgCTEYSIoGbJUmWMGdkGHWLdFXKDNhhFcOwvdD4EKNaxMChvruEDAyts/znd40p57z7nt7bnnfPZ8JCf3ez7fzznn1W/aV7/3e77fb6oKSdL0+4m1DiBJWh0WuiQ1wkKXpEZY6JLUCAtdkhphoUtSI9a00JNcl+RAkrtHnP/SJPcmuSfJR453PkmaJlnL89CTPA9YBG6oqnOGzD0b+Bjw/Kr6dpKfraoD48gpSdNgTffQq+p24KH+sSRPS/JPSXYluSPJM7pVvwu8v6q+3b3WMpekPpN4DH0r8PtVtRG4CviLbvzpwNOT/EuSzya5cM0SStIEOnGtA/RLMgP8GvC3SR4dflz380TgbGAeOBO4Pcm5VfXwuHNK0iSaqEKn9xvDw1V13oB19wOfq6r/Ab6a5Ev0Cv7OcQaUpEk1UYdcquoRemX92wDp+aVu9Sfo7Z2TZD29QzBfWYuckjSJ1vq0xRuBfwV+Icn9SV4FXA68KsnngXuAi7vpnwQeTHIvsB34w6p6cC1yS9IkWtPTFiVJq2eiDrlIko7emn0pun79+tqwYcMR49/97nc5+eSTxx/oGJl7/KY1u7nHq7Xcu3bt+lZVPXHgi6pqTR4bN26sQbZv3z5wfNKZe/ymNbu5x6u13MDOWqJXhx5ySXJWku1991B5w4A580kOJtndPd52FP8hSZKOwSiHXA4Bb6qqu5KcAuxKcltV3XvYvDuq6sWrH1GSNIqhe+hV9UBV3dUtfwe4DzjjeAeTJK3Mik5bTLIBuB04p3oXAT06Pg/cRO9qzq8DV1XVPQNevxnYDDA7O7txYWHhiM9YXFxkZmZmJX+GiWDu8ZvW7OYer9Zyb9q0aVdVzQ180VIH1w9/ADPALuC3Bqx7PDDTLV8EfHnY+/ml6GSY1txV05vd3OPVWm6O5UtRgCTr6O2Bf7iqPj7gP4VHqmqxW74VWNddni9JGpNRznIJ8EHgvqp6zxJzntTNI8kF3ft6Wb4kjdEoZ7k8B3gZsCfJ7m7srcCTAarqWuAS4LVJDgHfBy7tfjWQJI3J0EKvqs8AGTLnGuCa1QolSVq5Sbsf+kg2bLllzT5739UvWrPPlqTleHMuSWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqxNBCT3JWku1J7k1yT5I3DJiTJO9LsjfJF5Kcf3ziSpKWcuIIcw4Bb6qqu5KcAuxKcltV3ds354XA2d3jWcAHup+SpDEZuodeVQ9U1V3d8neA+4AzDpt2MXBD9XwWOC3J6aueVpK0pFTV6JOTDcDtwDlV9Ujf+M3A1VX1me75p4A3V9XOw16/GdgMMDs7u3FhYeGIz1hcXGRmZmbZHHv2Hxw582o794xTB46PknsSTWtumN7s5h6v1nJv2rRpV1XNDXrNKIdcAEgyA9wEvLG/zFeiqrYCWwHm5uZqfn7+iDk7duxg0Hi/K7bccjQfvyr2XT4/cHyU3JNoWnPD9GY393g9lnKPdJZLknX0yvzDVfXxAVP2A2f1PT+zG5MkjckoZ7kE+CBwX1W9Z4lp24CXd2e7PBs4WFUPrGJOSd
IQoxxyeQ7wMmBPkt3d2FuBJwNU1bXArcBFwF7ge8ArVz+qJGk5Qwu9+6IzQ+YU8LrVCiVJWjmvFJWkRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEYMLfQk1yU5kOTuJdbPJzmYZHf3eNvqx5QkDXPiCHOuB64Bblhmzh1V9eJVSSRJOipD99Cr6nbgoTFkkSQdg1TV8EnJBuDmqjpnwLp54CbgfuDrwFVVdc8S77MZ2AwwOzu7cWFh4Yg5i4uLzMzMLJtnz/6DQzMfL+eecerA8VFyT6JpzQ3Tm93c49Va7k2bNu2qqrlBr1mNQn888KOqWkxyEfDnVXX2sPecm5urnTt3HjG+Y8cO5ufnl33thi23DM18vOy7+kUDx0fJPYmmNTdMb3Zzj1druZMsWejHfJZLVT1SVYvd8q3AuiTrj/V9JUkrc8yFnuRJSdItX9C954PH+r6SpJUZepZLkhuBeWB9kvuBtwPrAKrqWuAS4LVJDgHfBy6tUY7jSJJW1dBCr6rLhqy/ht5pjZKkNeSVopLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaMbTQk1yX5ECSu5dYnyTvS7I3yReSnL/6MSVJw4yyh349cOEy618InN09NgMfOPZYkqSVGlroVXU78NAyUy4GbqiezwKnJTl9tQJKkkaTqho+KdkA3FxV5wxYdzNwdVV9pnv+KeDNVbVzwNzN9PbimZ2d3biwsHDEZy0uLjIzM7Nsnj37Dw7NfLyce8apA8dHyT2JpjU3TG92c49Xa7k3bdq0q6rmBr3mxOOeqk9VbQW2AszNzdX8/PwRc3bs2MGg8X5XbLnlOKQbzb7L5weOj5J7Ek1rbpje7OYer8dS7tU4y2U/cFbf8zO7MUnSGK1GoW8DXt6d7fJs4GBVPbAK7ytJWoGhh1yS3AjMA+uT3A+8HVgHUFXXArcCFwF7ge8BrzxeYSVJSxta6FV12ZD1Bbxu1RJJko6KV4pKUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjRir0JBcm+WKSvUm2DFh/RZJvJtndPV69+lElScs5cdiEJCcA7wdeANwP3JlkW1Xde9jUj1bVlcchoyRpBKPsoV8A7K2qr1TVD4EF4OLjG0uStFKpquUnJJcAF1bVq7vnLwOe1b83nuQK4I+BbwJfAv6gqr424L02A5sBZmdnNy4sLBzxeYuLi8zMzCybac/+g8uuP57OPePUgeOj5J5E05obpje7ucertdybNm3aVVVzg14z9JDLiP4RuLGqfpDkNcCHgOcfPqmqtgJbAebm5mp+fv6IN9qxYweDxvtdseWWY098lPZdPj9wfJTck2hac8P0Zjf3eD2Wco9yyGU/cFbf8zO7sR+rqger6gfd078CNq4ohSTpmI1S6HcCZyd5SpKTgEuBbf0Tkpze9/QlwH2rF1GSNIqhh1yq6lCSK4FPAicA11XVPUneCeysqm3A65O8BDgEPARccRwzS5IGGOkYel
XdCtx62Njb+pbfArxldaNJklbCK0UlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ14sS1DjBtNmy5ZeD4m849xBVLrFsN+65+0XF7b0ltcA9dkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGjFToSS5M8sUke5NsGbD+cUk+2q3/XJINqx1UkrS8oYWe5ATg/cALgWcClyV55mHTXgV8u6p+Hngv8O7VDipJWt4oV4peAOytqq8AJFkALgbu7ZtzMfCObvnvgGuSpKpqFbM+pi11heqxGuUKV69SlabDKIV+BvC1vuf3A89aak5VHUpyEPgZ4Fv9k5JsBjZ3TxeTfHHA560//HXT4PUN587k/r41ldscc49ba7l/bqkXjPVeLlW1Fdi63JwkO6tqbkyRVo25x29as5t7vB5LuUf5UnQ/cFbf8zO7sYFzkpwInAo8uJIgkqRjM0qh3wmcneQpSU4CLgW2HTZnG/CKbvkS4NMeP5ek8Rp6yKU7Jn4l8EngBOC6qronyTuBnVW1Dfgg8DdJ9gIP0Sv9o7XsIZkJZu7xm9bs5h6vx0zuuCMtSW3wSlFJaoSFLkmNmKhCH3aLgUmVZF+SPUl2J9m51nmWkuS6JAeS3N039oQktyX5cvfzp9cy4yBL5H5Hkv3dNt+d5KK1zDhIkrOSbE9yb5J7kryhG5/obb5M7one5kl+Msm/Jfl8l/uPuvGndLck2dvdouSktc7ab5nc1yf5at/2Pm/om1XVRDzofeH6n8BTgZOAzwPPXOtcI2bfB6xf6xwj5HwecD5wd9/YnwBbuuUtwLvXOueIud8BXLXW2YbkPh04v1s+BfgSvdtnTPQ2Xyb3RG9zIMBMt7wO+BzwbOBjwKXd+LXAa9c664i5rwcuWcl7TdIe+o9vMVBVPwQevcWAVklV3U7vLKR+FwMf6pY/BPzGWEONYIncE6+qHqiqu7rl7wD30buqeqK3+TK5J1r1LHZP13WPAp5P75YkMJnbe6ncKzZJhT7oFgMT/5eoU8A/J9nV3d5gmsxW1QPd8n8Ds2sZZoWuTPKF7pDMRB22OFx3B9Jfprf3NTXb/LDcMOHbPMkJSXYDB4Db6P3W/3BVHeqmTGSvHJ67qh7d3u/qtvd7kzxu2PtMUqFPs+dW1fn07kj5uiTPW+tAR6N6v/NNy3msHwCeBpwHPAD86drGWVqSGeAm4I1V9Uj/ukne5gNyT/w2r6r/rarz6F3RfgHwjDWONJLDcyc5B3gLvfy/AjwBePOw95mkQh/lFgMTqar2dz8PAH9P7y/StPhGktMBup8H1jjPSKrqG90/gh8Bf8mEbvMk6+iV4oer6uPd8MRv80G5p2WbA1TVw8B24FeB07pbksCE90pf7gu7Q19VVT8A/poRtvckFfootxiYOElOTnLKo8vArwN3L/+qidJ/24ZXAP+whllG9mghdn6TCdzmSULvKur7quo9fasmepsvlXvSt3mSJyY5rVv+KeAF9I7/b6d3SxKYzO09KPd/9P2nH3rH/Ydu74m6UrQ7DerP+P9bDLxrjSMNleSp9PbKoXcrhY9Mau4kNwLz9G7L+Q3g7cAn6J0F8GTgv4CXVtVEfQG5RO55er/6F72zjF7Td1x6IiR5LnAHsAf4UTf8VnrHoyd2my+T+zImeJsn+UV6X3qeQG9n9WNV9c7u3+gCvcMW/w78TrfXOxGWyf1p4In0zoLZDfxe35eng99rkgpdknT0JumQiyTpGFjoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqRH/B3QjSg5+zrP7AAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"species_words_count_distribution = x.str.split(' ').apply(len)\n",
"species_words_count_distribution.hist();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For sanity, we will only check for species names up to 8 words long (which captures 99.9% of the terms):"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8.0"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 99.9th percentile of the per-term word counts\n",
"species_words_count_distribution.quantile(0.999)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3456/3456 [00:03<00:00, 961.69it/s] \n"
]
}
],
"source": [
"mentioned_species_type_material = literature['abstract_clean'].str.lower().progress_apply(\n",
" matches_n_consecutive_words,\n",
" database=species_type_materials - numeric_species_terms,\n",
" consecutive_n=int(species_words_count_distribution.quantile(0.999))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[] 3456\n",
"Name: abstract_clean, dtype: int64"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tally how often each distinct match result occurs across the abstracts\n",
"mentioned_species_type_material.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"assert (mentioned_species_type_material.apply(len) == 0).all()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3456/3456 [00:04<00:00, 859.64it/s]\n"
]
}
],
"source": [
"mentioned_species = literature['abstract_clean'].str.lower().progress_apply(\n",
" matches_n_consecutive_words,\n",
" database=species_names - ignored_species_terms - numeric_species_terms,\n",
" consecutive_n=int(species_words_count_distribution.quantile(0.999))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"domain_features['mentioned_species'] = mentioned_species\n",
"domain_features['mentions_n_species'] = mentioned_species.isnull()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"terms_to_merge = {\n",
" 'escherichia': ['e. coli', 'escherichia', 'escherichia coli'],\n",
" 'rat': ['rat', 'rats'],\n",
" 'mice': ['mice', 'mouse'],\n",
" 'human': ['humans', 'human'],\n",
" 'bacteria': ['bacteria', 'bacterium'],\n",
" 'saccharomyces': ['saccharomyces', 'saccharomyces cerevisiae', 's. cerevisiae'],\n",
" 'corn': ['maize', 'corn'],\n",
" 'cattle': ['cow', 'cattle', 'bovine'],\n",
" 'nematode': ['nematodes', 'nematode']\n",
"}\n",
"\n",
"merge_terms = {\n",
" value: label\n",
" for label, values in terms_to_merge.items()\n",
" for value in values\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"mentioned_species = mentioned_species.apply(lambda species: [merge_terms.get(s, s) for s in species])"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"human 697\n",
"mice 222\n",
"microbiota 129\n",
"bacteria 95\n",
"rat 67\n",
"gut microbiome 62\n",
"plants 61\n",
"escherichia 42\n",
"animals 31\n",
"cattle 23\n",
"saccharomyces 22\n",
"arabidopsis 20\n",
"rice 20\n",
"corn 19\n",
"fungi 19\n",
"zebrafish 18\n",
"fish 16\n",
"metagenome 15\n",
"human microbiome 12\n",
"viruses 12\n",
"rodent 12\n",
"soybean 12\n",
"streptococcus 11\n",
"chicken 10\n",
"chimpanzee 10\n",
"nematode 10\n",
"pigs 10\n",
"caenorhabditis 9\n",
"bacteroidetes 9\n",
"clostridium 9\n",
"pseudomonas 9\n",
"firmicutes 9\n",
"wheat 9\n",
"eukaryotes 9\n",
"sepsis 8\n",
"mycobacterium 8\n",
"chinese hamster 8\n",
"drosophila 8\n",
"tobacco 8\n",
"algae 7\n",
"bacillus 7\n",
"aspergillus 7\n",
"plasmodium 7\n",
"proteobacteria 7\n",
"phyla 7\n",
"archaea 7\n",
"human gut microbiota 7\n",
"niger 7\n",
"streptomyces 6\n",
"cyanobacteria 6\n",
"dtype: int64"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"domain_features['mentioned_species'] = mentioned_species\n",
"domain_features['mentioned_species_set'] = mentioned_species.apply(set)\n",
"domain_features['mentions_n_species'] = mentioned_findings.apply(len)\n",
"mentioned_species_occurrences = Series(domain_features['mentioned_species_set'].apply(list).sum())\n",
"mentioned_species_occurrences.value_counts().head(50)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"papillomavirus 6\n",
"chlamydomonas 6\n",
"human papillomavirus 6\n",
"primate 6\n",
"soil microbiome 5\n",
"hepatitis b virus 5\n",
"lactobacillus 5\n",
"apple 5\n",
"cotton 5\n",
"salmonella 5\n",
"coronavirus 5\n",
"cyanobacterium 5\n",
"b virus 5\n",
"chlamydomonas reinhardtii 5\n",
"tomato 5\n",
"yersinia 5\n",
"metagenomes 5\n",
"arabidopsis thaliana 5\n",
"pig 5\n",
"candida 4\n",
"dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Continue the ranking: the last 20 of the 70 most frequent labels\n",
"mentioned_species_occurrences.value_counts().head(70).tail(20)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"830"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct values in `mentioned_species_occurrences`\n",
"len(set(mentioned_species_occurrences))"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"microbiota_terms = [\n",
" 'microbiota', 'gut microbiome', 'human gut microbiota'\n",
" # maybe - needs context\n",
" 'metagenome',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.7965856481481481, 0.8195891634415005)"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(domain_features['mentions_n_species'] == 0).mean(), (domain_features['mentions_n_species'] == 0).sum() / has_abstract.sum()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.40335648148148145, 0.4150044656147663)"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"domain_features['mentions_n_species'].mean(), domain_features['mentions_n_species'].sum() / has_abstract.sum()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Stored `domain_features` (02FA7AED → 9CBD2CED) at Monday, 03. Aug 2020 01:40"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {
"text/markdown": {
"action": "store",
"command": "store domain_features in pubmed_derived_data",
"finished": "2020-08-03T01:40:10.452984",
"finished_human_readable": "Monday, 03. Aug 2020 01:40",
"result": [
{
"new_file": {
"crc32": "9CBD2CED",
"sha256": "69E41B5E85F3320A8BED275B947ECA40F456F11EC6734F3E3BCDE4BD64EA9255"
},
"old_file": {
"crc32": "02FA7AED",
"sha256": "FBE800B8847F4D6E26AEF9343BAD9CCEF7C1314121C3658EFF07EDD89EDDE491"
},
"subject": "domain_features"
}
],
"started": "2020-08-03T01:40:09.310819"
}
},
"output_type": "display_data"
}
],
"source": [
"%vault store domain_features in pubmed_derived_data"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}