{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Build phenome training data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use the %pip magic so the install targets the running kernel's environment\n",
"# (a plain !python3 shell call may hit a different interpreter).\n",
"%pip install -U pandas numpy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pathlib\n",
"import pandas as pd\n",
"\n",
"# Locate the repo root: strip a trailing '/synthetics' component from the\n",
"# working directory (no-op when absent), then point at the mice data dir.\n",
"cwd = os.getcwd()\n",
"base_path = pathlib.Path(cwd.replace(\"/synthetics\", \"\"))\n",
"data_path = base_path / 'mice_data_set' / 'data'\n",
"data_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify the phenotypes and their covariates you plan to analyze\n",
"* We will be synthesizing all phenotypes but only analyzing the ones you specify here\n",
"* While you can run multiple phenotypes at the same time, you might get better performance by running them individually"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Phenotype(s) selected for downstream analysis; uncomment an alternative\n",
"# line to switch selections.\n",
"pheno_analysis = [\"abBMD\"]\n",
"#pheno_analysis = [\"abBMD\",\"soleus\"]\n",
"#pheno_analysis = [\"soleus\"]\n",
"\n",
"pd.DataFrame({\"pheno\": pheno_analysis}).to_csv(\n",
"    data_path / 'pheno_analysis.csv', index=False, header=True)\n",
"\n",
"# The analyzed phenotypes together with their covariate columns.\n",
"pheno_plus_covariates = [\"abBMD\", \"SW16\"]\n",
"#pheno_plus_covariates = [\"abBMD\", \"SW16\", \"soleus\", \"tibia\"]\n",
"#pheno_plus_covariates = [\"soleus\", \"SW16\", \"tibia\"]\n",
"pd.DataFrame({\"pheno_and_cov\": pheno_plus_covariates}).to_csv(\n",
"    data_path / 'pheno_and_covariates.csv', index=False, header=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Batch the phenotypes into functional groups"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# MUSCLE AND BONE TRAITS AND COVARIATES\n",
"# ----------------------\n",
"# For all five muscle weights (TA, EDL, soleus, plantaris and\n",
"# gastrocnemius), we map QTLs conditioning on tibia length\n",
"# (\"tibia\"). For tibia length, we map QTLs conditioned on body weight.\n",
"#\n",
"# Tibia length explains 12-18% of variance in the muscle weights. The\n",
"# rationale for including tibia length as a covariate is bone length\n",
"# may somehow regulate muscle weight as well, and we would like to\n",
"# isolate the genetic factors that directly regulate development of\n",
"# the muscle tissues.\n",
"# \n",
"# For bone-mineral density (BMD), we created a binary trait that\n",
"# signals \"abnormal\" BMD. We do not include any covariates when\n",
"# mapping QTLs for these traits. Note that body weight is also\n",
"# uncorrelated with BMD.\n",
"# \n",
"# For all muscle and bone traits, we include a binary indicator for\n",
"# round SW16 as a covariate because the mice from this round showed\n",
"# substantial deviation in these traits compared to the rest of the\n",
"# mice.\n",
"#\n",
"# Columns below mix trait measurements with covariate columns; the\n",
"# 'SW*' columns are binary per-round indicators (see notes above).\n",
"# 'sacweight' also appears in the physiological-traits batch.\n",
"bone_muscle_pheno = [\n",
" 'TA',\n",
" 'SW16',\n",
" 'tibia',\n",
" 'EDL',\n",
" 'soleus',\n",
" 'plantaris',\n",
" 'gastroc',\n",
" 'SW6',\n",
" 'sacweight',\n",
" 'BMD',\n",
" 'abBMD']\n",
"\n",
"# OTHER PHYSIOLOGICAL TRAITS AND COVARIATES\n",
"# --------------------------\n",
"# Body weights bw1, bw2 and bw3 were measured on subsequent days of\n",
"# the methamphetamine sensitivity tests, and are highly correlated\n",
"# with each other (r^2 = 98%), so it is only necessary to map QTLs for\n",
"# one of them. The body weight measurements after sacrifice\n",
"# (\"sacweight\") show a considerable departure in Round SW17, so we\n",
"# include a binary indicator for this round as a covariate for\n",
"# sacweight. We include age as a covariate for the \"bw0\" body weight\n",
"# because it was measured while the mouse was still growing.\n",
"#\n",
"# Fasting glucose levels are explained partially by body weight (PVE =\n",
"# 6%), so we include body weight as a covariate for fasting glucose\n",
"# levels. Rounds SW1 and SW11 showed a considerable departure in\n",
"# fasting glucose levels from the other rounds, so we included binary\n",
"# indicators for these two rounds as covariates for fasting glucose\n",
"# levels.\n",
"#\n",
"# NOTE(review): 'sacweight' is also listed in the bone/muscle batch;\n",
"# overlap across batches is expected since each batch file is written\n",
"# out independently.\n",
"other_physio_traits_pheno = [\n",
" 'bw0',\n",
" 'glucoseage',\n",
" 'bw1',\n",
" 'methage',\n",
" 'SW17',\n",
" 'PPIweight',\n",
" 'sacweight',\n",
" 'fastglucose',\n",
" 'SW1',\n",
" 'SW11',\n",
" 'taillength',\n",
" 'SW3',\n",
" 'SW4',\n",
" 'SW19',\n",
" 'SW20',\n",
" 'SW22',\n",
" 'SW24',\n",
" 'testisweight']\n",
"\n",
"# FEAR CONDITIONING TRAITS AND COVARIATES\n",
"# ------------------------\n",
"# For all fear conditioning traits, the cage used for testing appears\n",
"# to have an effect on the phenotype, so we include binary indicators\n",
"# for cage as covariates for all FC phenotypes. Further, the FC\n",
"# phenotype measurements in Round SW17 show a noticeably different\n",
"# distribution in the FC phenotypes from the other rounds, so we\n",
"# include a binary indicator for round SW17 as a covariate in all FC\n",
"# traits.\n",
"#\n",
"# These analyses control for proportion of freezing on day 1 during\n",
"# exposure to the tone (\"AvToneD1\"). AvToneD1 explains 11-25% of the\n",
"# variance in the Day 2 and Day 3 freezing measures. Note that here we\n",
"# can map QTLs for freezing to the altered context on Day 3\n",
"# (\"AvAltContextD3\") as a quantitative trait after conditioning on\n",
"# AvToneD1 because the distribution for this trait is no longer quite\n",
"# so bimodal, and looks fairly \"normal\". So there is no need to map\n",
"# QTLs for the binary version of this trait.\n",
"#\n",
"# PreTrainD1 is a very ugly trait with massive box effects and a lot\n",
"# of low values, which might have to be removed as outliers. It is\n",
"# quite likely that these outliers represent the \"deaf\" mice that\n",
"# might be skewing the whole results. These outliers are present in\n",
"# every box, so not a box-specific effect.\n",
"#\n",
"# NOTE(review): column-name casing below is inconsistent ('AVToneD1'\n",
"# vs 'AvToneD3'); verify each name matches the pheno_new.csv header\n",
"# exactly, because DataFrame.filter silently drops names that do not\n",
"# match any column.\n",
"fear_cond_traits_pheno = [\n",
" 'AVContextD2',\n",
" 'AVToneD1',\n",
" 'FCbox1',\n",
" 'FCbox2',\n",
" 'FCbox3',\n",
" 'SW17',\n",
" 'AVAltContextD3',\n",
" 'AvToneD3',\n",
" 'PreTrainD1',\n",
" 'SW10',\n",
" 'SW16',\n",
" 'SW20',\n",
" 'SW7',\n",
" 'SW14']\n",
"\n",
"# METHAMPHETAMINE SENSITIVITY, LOCOMOTOR ACTIVITY AND ANXIETY-LIKE BEHAVIOR AND COVARIATES\n",
"# -------------------------------------------------------------------------\n",
"# We checked all the cages used in these tests to see whether the\n",
"# phenotypes measured using any given cage departed noticeably from\n",
"# the other cages. Cage #7 consistently has a large effect.\n",
"#\n",
"# The three day batches (D1/D2/D3) have identical structure, so they are\n",
"# built with a helper rather than maintained as three copy-pasted lists\n",
"# that could silently drift apart.\n",
"\n",
"# Cage indicator covariates shared by all three day batches.\n",
"meth_cage_covariates = ['methcage' + str(c) for c in range(7, 13)]\n",
"\n",
"def build_meth_day_batch(day):\n",
"    \"\"\"Return the locomotor/meth phenotype column list for test day 1, 2 or 3.\"\"\"\n",
"    d = 'D' + str(day)\n",
"    return ([d + 'totaldist0to15', d + 'totaldist0to30'] +\n",
"            [d + 'TOTDIST' + str(t) for t in (5, 10, 15, 20, 25, 30)] +\n",
"            [d + 'ctrtime0to15', d + 'ctrtime0to30',\n",
"             d + 'hact0to15', d + 'hact0to30',\n",
"             d + 'vact0to15', d + 'vact0to30'] +\n",
"            meth_cage_covariates)\n",
"\n",
"meth_loco_anxiety_pheno1 = build_meth_day_batch(1)\n",
"meth_loco_anxiety_pheno2 = build_meth_day_batch(2)\n",
"meth_loco_anxiety_pheno3 = build_meth_day_batch(3)\n",
"\n",
"# PREPULSE INHIBITION (PPI) PHENOTYPES AND COVARIATES\n",
"# ------------------------------------\n",
"# All boxes appear to have some effect on some of the PPI phenotypes,\n",
"# with Box #3 having a particularly large effect on some phenotypes,\n",
"# so we include all PPI box indicators as covariates in analysis of the\n",
"# PPI phenotypes.\n",
"#\n",
"# We also map QTLs for habituation to pulses by analyzing the startle\n",
"# response during the fourth block of pulse-alone trials against the\n",
"# startle response during the first block of pulse-alone trials.\n",
"#\n",
"# 'PPIbox*' are binary box indicators; 'p120b1'/'p120b4' are the startle\n",
"# responses during the first/fourth block of pulse-alone trials.\n",
"ppi_pheno = [\n",
" 'pp3PPIavg',\n",
" 'pp6PPIavg',\n",
" 'pp12PPIavg',\n",
" 'PPIavg',\n",
" 'startle',\n",
" 'p120b4',\n",
" 'PPIbox1',\n",
" 'PPIbox2',\n",
" 'PPIbox3',\n",
" 'PPIbox4',\n",
" 'p120b1']\n",
"\n",
"\n",
"# Ordered list of all phenotype batches; the position of a batch in this\n",
"# list determines its index in the output file names.\n",
"pheno_batches = [\n",
"    bone_muscle_pheno,\n",
"    other_physio_traits_pheno,\n",
"    fear_cond_traits_pheno,\n",
"    meth_loco_anxiety_pheno1,\n",
"    meth_loco_anxiety_pheno2,\n",
"    meth_loco_anxiety_pheno3,\n",
"    ppi_pheno,\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read in the pheno data saved from the map notebook and discard rows with\n",
"# no cage id (those rows carry no usable data). pandas is already imported\n",
"# in the setup cell, so the duplicate mid-notebook import was removed.\n",
"phenofile = data_path / \"pheno_new.csv\"\n",
"pheno_data = pd.read_csv(phenofile)\n",
"pheno_data = pheno_data[pheno_data[\"cageid\"].notnull()]\n",
"pheno_data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create synthetic training files for each phenome batch: one copy with \"id\"\n",
"# (for joining with the genome data) and one without \"id\" (for phenome\n",
"# training).\n",
"\n",
"for i, batch in enumerate(pheno_batches):\n",
"    # Build the column list as a NEW list. The previous code used\n",
"    # batch.append(\"id\"), which mutated the shared batch lists: 'id'\n",
"    # leaked into the combined-dataset cell below and accumulated on\n",
"    # every re-run of this cell.\n",
"    columns_use = batch + [\"id\"]\n",
"    # Non-inplace dropna avoids SettingWithCopyWarning on the filter() result.\n",
"    pheno_batch = pheno_data.filter(columns_use).dropna().round(4)\n",
"    filename = data_path / (\"pheno_batch\" + str(i) + \"_withID.csv\")\n",
"    pheno_batch.to_csv(filename, header=True, index=False)\n",
"    filename = data_path / (\"pheno_batch\" + str(i) + \".csv\")\n",
"    pheno_batch.drop(['id'], axis=1).to_csv(filename, header=True, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Make one big phenome dataset using all the relevant columns\n",
"\n",
"columns_use = []\n",
"for next_batch in pheno_batches:\n",
"    columns_use = columns_use + next_batch\n",
"\n",
"# Remove duplicates while preserving first-seen order. (The previous\n",
"# list(set(...)) produced a column order that changed from run to run\n",
"# because Python string hashing is randomized per process, making the\n",
"# output CSV non-reproducible.)\n",
"columns_uniq = list(dict.fromkeys(columns_use))\n",
"\n",
"# Filter down to just these columns\n",
"pheno_alldata = pheno_data.filter(columns_uniq)\n",
"\n",
"# Save data out for later comparison with synthetic data\n",
"phenofile = data_path / \"phenome_alldata.csv\"\n",
"pheno_alldata.to_csv(phenofile, index=False, header=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}