synthetic-data-genomics / Git / [befbfc] /synthetics/01_build_phenome_training

Models:
MarcoTheBlack/
synthetic-data-genomics
Downloads: 1
[befbfc]: / synthetics / 01_build_phenome_training_data.ipynb
History
Download this file
382 lines (381 with data), 13.2 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build phenome training data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python3 -m pip install -U pandas numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pathlib\n",
    "import pandas as pd\n",
    "\n",
    "base_path = pathlib.Path(os.getcwd().replace(\"/synthetics\", \"\"))\n",
    "data_path = base_path / 'mice_data_set' / 'data'\n",
    "data_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Specify the phennotypes and their covariates you plan to analyze\n",
    "* We will be synthesizig all phenotypes but only analyzing the ones you specify here\n",
    "* While you can run multiple phenotypes at the same time, you might get better peformance by running them individually"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pheno_analysis = [\"abBMD\"]\n",
    "#pheno_analysis = [\"abBMD\",\"soleus\"]\n",
    "#pheno_analysis = [\"soleus\"]\n",
    "\n",
    "pheno_analysis_df = pd.DataFrame({\"pheno\": pheno_analysis})\n",
    "filename = data_path / 'pheno_analysis.csv'\n",
    "pheno_analysis_df.to_csv(filename, index=False, header=True)\n",
    "\n",
    "pheno_posby_covariates = [\"abBMD\", \"SW16\"]\n",
    "#pheno_posby_covariates = [\"abBMD\", \"SW16\", \"soleus\", \"tibia\"]\n",
    "#pheno_posby_covariates = [\"soleus\", \"SW16\", \"tibia\"]\n",
    "pheno_and_cov_df = pd.DataFrame({\"pheno_and_cov\": pheno_posby_covariates})\n",
    "filename = data_path / 'pheno_and_covariates.csv'\n",
    "pheno_and_cov_df.to_csv(filename, index=False, header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Batch the phenotypes into functional groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# MUSCLE AND BONE TRAITS AND COVARIATES\n",
    "# ----------------------\n",
    "# For all five muscle weights (TA, EDL, soleus, plantaris and\n",
    "# gastrocnemius), we map QTLs conditioning on tibia length\n",
    "# (\"tibia\"). For tibia length, we map QTLs conditioned on body weight.\n",
    "#\n",
    "# Tibia length explains 12-18% of variance in the muscle weights. The\n",
    "# rationale for including tibia length as a covariate is bone length\n",
    "# may somehow regulate muscle weight as well, and we would like to\n",
    "# isolate the genetic factors that directly regulate development of\n",
    "# the muscle tissues.\n",
    "#  \n",
    "# For bone-mineral density (BMD), we created a binary trait that\n",
    "# signals \"abnormal\" BMD. We do not include any covariates when\n",
    "# mapping QTLs for these traits. Note that body weight is also\n",
    "# uncorrelated with BMD.\n",
    "# \n",
    "# For all muscle and bone traits, we include a binary indicator for\n",
    "# round SW16 as a covariate because the mice from this round showed\n",
    "# substantial deviation in these traits compared to the rest of the\n",
    "# mice.\n",
    "bone_muscle_pheno = [\n",
    "    'TA',\n",
    "    'SW16',\n",
    "    'tibia',\n",
    "    'EDL',\n",
    "    'soleus',\n",
    "    'plantaris',\n",
    "    'gastroc',\n",
    "    'SW6',\n",
    "    'sacweight',\n",
    "    'BMD',\n",
    "    'abBMD']\n",
    "\n",
    "# OTHER PHYSIOLOGICAL TRAITS AND COVARIATES\n",
    "# --------------------------\n",
    "# Body weights bw1, bw2 and bw3 were measured on subsequent days of\n",
    "# the methamphetamine sensitivity tests, and are highly correlated\n",
    "# with each other (r^2 = 98%), so it is only necessary to map QTLs for\n",
    "# one of them. The body weight measurements after sacrifice\n",
    "# (\"sacweight\") show a considerable departure in Round SW17, so we\n",
    "# include a binary indicator for this round as a covariate for\n",
    "# sacweight. We include age as a covariate for the \"bw0\" body weight\n",
    "# because it was measured while the mouse was still growing.\n",
    "#\n",
    "# Fasting glucose levels are explained partially by body weight (PVE =\n",
    "# 6%), so we include body weight as a covariate for fasting glucose\n",
    "# levels. Rounds SW1 and SW11 showed a considerable departure in\n",
    "# fasting glucose levels from the other rounds, so we included binary\n",
    "# indicators for these two rounds as covariates for fasting glucose\n",
    "# levels.\n",
    "other_physio_traits_pheno = [\n",
    "    'bw0',\n",
    "    'glucoseage',\n",
    "    'bw1',\n",
    "    'methage',\n",
    "    'SW17',\n",
    "    'PPIweight',\n",
    "    'sacweight',\n",
    "    'fastglucose',\n",
    "    'SW1',\n",
    "    'SW11',\n",
    "    'taillength',\n",
    "    'SW3',\n",
    "    'SW4',\n",
    "    'SW19',\n",
    "    'SW20',\n",
    "    'SW22',\n",
    "    'SW24',\n",
    "    'testisweight']\n",
    "\n",
    "# FEAR CONDITIONING TRAITS AND COVARIATES\n",
    "# ------------------------\n",
    "# For all fear conditioning traits, the cage used for testing appears\n",
    "# to have an effect on the phenotype, so we include binary indicators\n",
    "# for cage as covariates for all FC phenotypes. Further, the FC\n",
    "# phenotype measurements in Round SW17 show a noticeably different\n",
    "# distribution in the FC phenotypes from the other rounds, so we\n",
    "# include a binary indicator for round SW17 as a covariate in all FC\n",
    "# traits.\n",
    "#\n",
    "# These analyses control for proportion of freezing on day 1 during\n",
    "# exposure to the tone (\"AvToneD1\"). AvToneD1 explains 11-25% of the\n",
    "# variance in the Day 2 and Day 3 freezing measures. Note that here we\n",
    "# can map QTLs for freezing to the altered context on Day 3\n",
    "# (\"AvAltContextD3\") as a quantitative trait after conditioning on\n",
    "# AvToneD1 because the distribution for this trait is no longer quite\n",
    "# so bimodal, and looks fairly \"normal\". So there is no need to map\n",
    "# QTLs for the binary version of this trait.\n",
    "#\n",
    "# PreTrainD1 is a very ugly trait with massive box effects and a lot\n",
    "# of low values, which might have to be removed as outliers. It is\n",
    "# quite likely that these outliers represent the \"deaf\" mice that\n",
    "# might be skewing the whole results. These outliers are present in\n",
    "# every box, so not a box-specific effect.\n",
    "fear_cond_traits_pheno = [\n",
    "    'AVContextD2',\n",
    "    'AVToneD1',\n",
    "    'FCbox1',\n",
    "    'FCbox2',\n",
    "    'FCbox3',\n",
    "    'SW17',\n",
    "    'AVAltContextD3',\n",
    "    'AvToneD3',\n",
    "    'PreTrainD1',\n",
    "    'SW10',\n",
    "    'SW16',\n",
    "    'SW20',\n",
    "    'SW7',\n",
    "    'SW14']\n",
    "\n",
    "# METHAMPHETAMINE SENSITIVITY, LOCOMOTOR ACTIVITY AND ANXIETY-LIKE BEHAVIOR AND COVARIATES\n",
    "# -------------------------------------------------------------------------\n",
    "# We checked all the cages used in these tests to see whether the\n",
    "# phenotypes measured using any given cage departed noticeably from\n",
    "# the other cages. Cage #7 consistently has a large effect.\n",
    "meth_loco_anxiety_pheno1 = [\n",
    "    'D1totaldist0to15',\n",
    "    'D1totaldist0to30',\n",
    "    'D1TOTDIST5',\n",
    "    'D1TOTDIST10',\n",
    "    'D1TOTDIST15',\n",
    "    'D1TOTDIST20',\n",
    "    'D1TOTDIST25',\n",
    "    'D1TOTDIST30',\n",
    "    'D1ctrtime0to15',\n",
    "    'D1ctrtime0to30',\n",
    "    'D1hact0to15',\n",
    "    'D1hact0to30',\n",
    "    'D1vact0to15',\n",
    "    'D1vact0to30',\n",
    "    'methcage7',\n",
    "    'methcage8',\n",
    "    'methcage9',\n",
    "    'methcage10',\n",
    "    'methcage11',\n",
    "    'methcage12']\n",
    "\n",
    "meth_loco_anxiety_pheno2 = [\n",
    "    'D2totaldist0to15',\n",
    "    'D2totaldist0to30',\n",
    "    'D2TOTDIST5',\n",
    "    'D2TOTDIST10',\n",
    "    'D2TOTDIST15',\n",
    "    'D2TOTDIST20',\n",
    "    'D2TOTDIST25',\n",
    "    'D2TOTDIST30',\n",
    "    'D2ctrtime0to15',\n",
    "    'D2ctrtime0to30',\n",
    "    'D2hact0to15',\n",
    "    'D2hact0to30',\n",
    "    'D2vact0to15',\n",
    "    'D2vact0to30',\n",
    "    'methcage7',\n",
    "    'methcage8',\n",
    "    'methcage9',\n",
    "    'methcage10',\n",
    "    'methcage11',\n",
    "    'methcage12']\n",
    "\n",
    " \n",
    "\n",
    "meth_loco_anxiety_pheno3 = [\n",
    "    'D3totaldist0to15',\n",
    "    'D3totaldist0to30',\n",
    "    'D3TOTDIST5',\n",
    "    'D3TOTDIST10',\n",
    "    'D3TOTDIST15',\n",
    "    'D3TOTDIST20',\n",
    "    'D3TOTDIST25',\n",
    "    'D3TOTDIST30',\n",
    "    'D3ctrtime0to15',\n",
    "    'D3ctrtime0to30',\n",
    "    'D3hact0to15',\n",
    "    'D3hact0to30',\n",
    "    'D3vact0to15',\n",
    "    'D3vact0to30',\n",
    "    'methcage7',\n",
    "    'methcage8',\n",
    "    'methcage9',\n",
    "    'methcage10',\n",
    "    'methcage11',\n",
    "    'methcage12']\n",
    "\n",
    "# PREPULSE INHIBITION (PPI) PHENOTYPES AND COVARIATES\n",
    "# ------------------------------------\n",
    "# All boxes appear to have some effect on some of the PPI phenotypes,\n",
    "# with Box #3 having a particularly large effect on some phenotypes,\n",
    "# so we include all PPI box indicators as covariates in analysis of the\n",
    "# PPI phenotypes.\n",
    "#\n",
    "# We also map QTLs for habituation to pulses by analyzing the startle\n",
    "# response during the fourth block of pulse-alone trials against the\n",
    "# startle response during the first block of pulse-alone trials.\n",
    "ppi_pheno = [\n",
    "    'pp3PPIavg',\n",
    "    'pp6PPIavg',\n",
    "    'pp12PPIavg',\n",
    "    'PPIavg',\n",
    "    'startle',\n",
    "    'p120b4',\n",
    "    'PPIbox1',\n",
    "    'PPIbox2',\n",
    "    'PPIbox3',\n",
    "    'PPIbox4',\n",
    "    'p120b1']\n",
    "\n",
    "\n",
    "pheno_batches = [bone_muscle_pheno, other_physio_traits_pheno, fear_cond_traits_pheno, meth_loco_anxiety_pheno1, \n",
    "                 meth_loco_anxiety_pheno2, meth_loco_anxiety_pheno3, ppi_pheno]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read in the pheno data saved from map notebook and discard lines with no data\n",
    "\n",
    "import pandas as pd\n",
    "phenofile = data_path / \"pheno_new.csv\"\n",
    "pheno_data = pd.read_csv(phenofile)\n",
    "pheno_data = pheno_data[pheno_data[\"cageid\"].notnull()]\n",
    "pheno_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create synthetic training files for each phenome batch. Create one with \"id\" for joining with the genome\n",
    "# data, and one without \"id\" for phenome training\n",
    "\n",
    "for i in range(len(pheno_batches)):\n",
    "    columns_use = pheno_batches[i]\n",
    "    columns_use.append(\"id\")\n",
    "    pheno_batch = pheno_data.filter(columns_use) \n",
    "    pheno_batch.dropna(inplace=True)\n",
    "    pheno_batch = pheno_batch.round(4)\n",
    "    pheno_batch_file = \"pheno_batch\" + str(i) + \"_withID.csv\"\n",
    "    filename = data_path / pheno_batch_file\n",
    "    pheno_batch.to_csv(filename, header=True, index=False)\n",
    "    pheno_batch = pheno_batch.drop(['id'], axis=1)\n",
    "    pheno_batch_file = \"pheno_batch\" + str(i) + \".csv\"\n",
    "    filename = data_path / pheno_batch_file\n",
    "    pheno_batch.to_csv(filename, header=True, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make one big phenome dataset using all the relevant columns\n",
    "\n",
    "columns_use = []\n",
    "for next_batch in pheno_batches:\n",
    "    columns_use = columns_use + next_batch\n",
    "    \n",
    "# Remove duplicates\n",
    "columns_uniq = list(set(columns_use))\n",
    "\n",
    "# Filter down to just these columns\n",
    "pheno_alldata = pheno_data.filter(columns_uniq)\n",
    "\n",
    "# Save data out for later comparison with synthetic data\n",
    "phenofile = data_path / \"phenome_alldata.csv\"\n",
    "pheno_alldata.to_csv(phenofile, index=False, header=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}