synthetic-data-genomics / Git / [befbfc] /synthetics/03_build_genome_training

Models:
MarcoTheBlack/
synthetic-data-genomics
Downloads: 1
[befbfc]: / synthetics / 03_build_genome_training_data.ipynb
History
Download this file
401 lines (400 with data), 13.3 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the mouse genome training dataset\n",
    "This notebook creates a mouse genome training datasets by batching the genome data by position on the chromosome. You can pick the quantity and characteristics of the batches you'd like to create training \n",
    "datasets for. Your choices are written to a file that the next notebook, 04_create_synthetic_mouse_genomes, reads from."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pathlib\n",
    "import pandas as pd\n",
    "\n",
    "base_path = pathlib.Path(os.getcwd().replace(\"/synthetics\", \"\"))\n",
    "data_path = base_path / 'mice_data_set' / 'data' \n",
    "experiment_path = base_path / 'mice_data_set' / 'out' \n",
    "data_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read in the geno data and remove the discards (slow)\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "genofile = data_path / \"geno.txt\"\n",
    "geno = pd.read_csv(genofile, sep=' ')\n",
    "geno = geno[geno[\"discard\"] == \"no\"]\n",
    "geno.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather the subset of phenotype data you plan to analyze\n",
    "# Note, it is important that any rounding you do on these values\n",
    "# mirrors what you did when you created the seed file in 02\n",
    "\n",
    "phenofile = data_path / \"phenome_alldata.csv\"\n",
    "\n",
    "pheno_alldata = pd.read_csv(phenofile)\n",
    "\n",
    "filename = data_path / 'pheno_analysis.csv'\n",
    "pheno_analysis_df = pd.read_csv(filename)\n",
    "pheno_analysis = list(pheno_analysis_df[\"pheno\"])\n",
    "\n",
    "filename = data_path / 'pheno_and_covariates.csv'\n",
    "pheno_and_cov_df = pd.read_csv(filename)\n",
    "pheno_and_cov = list(pheno_and_cov_df[\"pheno_and_cov\"])\n",
    "\n",
    "# Here we're going to set the SNP count to use per training batch to be\n",
    "# 19 minus the number of phenotypes and covariates being analyzed\n",
    "snp_cnt = 19 - len(pheno_and_cov)\n",
    "print(\"Using \" + str(snp_cnt) + \" SNPs per training batch\")\n",
    "\n",
    "columns_use = pheno_and_cov\n",
    "columns_use.append('id')\n",
    "pheno = pheno_alldata.filter(columns_use).round(4)\n",
    "\n",
    "# Now you must drop any NaN's in the subset of pheno's you decided to use\n",
    "pheno = pheno.dropna()\n",
    "pheno.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grab the original GWAS linear model results for each phenotype being analyzed\n",
    "# These files also contain the chromosome position information\n",
    "\n",
    "all_gwas_scores = []\n",
    "for next_pheno in pheno_analysis:\n",
    "    filename = \"lm_\" + next_pheno + \"_1_79646.csv\"\n",
    "    gwasfile = experiment_path / filename\n",
    "    gwas_scores = pd.read_csv(gwasfile)\n",
    "    all_gwas_scores.append(gwas_scores)\n",
    "\n",
    "all_gwas_scores[0].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort each by pvalue\n",
    "\n",
    "gwas_scores_sorted = []\n",
    "for gwas_scores in all_gwas_scores:\n",
    "    scores_sorted = gwas_scores.sort_values(by=['p']).reset_index()\n",
    "    gwas_scores_sorted.append(scores_sorted)\n",
    "    \n",
    "gwas_scores_sorted[0].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group the SNPs into batches of snp_cnt (determined above)\n",
    "# Set min_chromo and max_chromo if you plan to analyze all batches\n",
    "# on certain chromozomes\n",
    "\n",
    "interesting_threshold = 5e-8\n",
    "#interesting_threshold = 5e-7\n",
    "min_chromo = 11\n",
    "max_chromo = 14\n",
    "\n",
    "batches = {}\n",
    "batch_num = -1\n",
    "max_snps_per_batch = snp_cnt\n",
    "snp_cnt = 0\n",
    "last_chromo = -1\n",
    "batch_avg_pvalue = {}\n",
    "all_avg_pvalues = []\n",
    "all_int_cnt = []\n",
    "all_not_int_cnt = []\n",
    "all_batch_nums = []\n",
    "snp_to_batch = {}\n",
    "\n",
    "# Get a list of the SNPs sort by position on the chromosome\n",
    "chromo_sort = all_gwas_scores[0].sort_values(by=['chr','pos'], ascending=False).reset_index()\n",
    "\n",
    "for i in range(len(chromo_sort)):\n",
    "    chromo = chromo_sort.loc[i]['chr']\n",
    "    pos = chromo_sort.loc[i]['pos'] \n",
    "    snp = chromo_sort.loc[i]['snp'] \n",
    "    if ((snp_cnt == max_snps_per_batch) or (chromo != last_chromo)):\n",
    "        batch_num += 1\n",
    "        batches[batch_num] = {}\n",
    "        batches[batch_num]['chromos'] = []\n",
    "        batches[batch_num]['chr_pos'] = []\n",
    "        batches[batch_num]['snps'] = []\n",
    "        snp_cnt = 0\n",
    "        \n",
    "    batches[batch_num]['chr_pos'].append(str(chromo) + \"_\" + str(pos))\n",
    "    batches[batch_num]['chromos'].append(chromo)\n",
    "    batches[batch_num]['snps'].append(snp)\n",
    "    snp_to_batch[snp] = batch_num\n",
    "    last_chromo = chromo\n",
    "    snp_cnt += 1\n",
    "\n",
    "# Now for each phenotype we're analyzing gather the pvalue interesting cnt per batch\n",
    "\n",
    "pheno_batch_interesting = {}\n",
    "for i, next_pheno in enumerate(pheno_analysis):\n",
    "    pheno_batch_interesting[next_pheno] = {}\n",
    "    for j in range(len(batches)):\n",
    "        pheno_batch_interesting[next_pheno][j] = 0\n",
    "        \n",
    "    gwas_scores = gwas_scores_sorted[i]\n",
    "    k = 0\n",
    "    pscore = gwas_scores.loc[k]['p']\n",
    "    snp = gwas_scores.loc[k]['snp']\n",
    "    while pscore <= interesting_threshold:\n",
    "        batch_num = snp_to_batch[snp]\n",
    "        pheno_batch_interesting[next_pheno][batch_num] += 1\n",
    "        k += 1\n",
    "        pscore = gwas_scores.loc[k]['p']\n",
    "        snp = gwas_scores.loc[k]['snp']\n",
    "    \n",
    "# Sum the interesting pvalues per batch\n",
    "\n",
    "batch_interesting_sums = []\n",
    "for i in range(len(batches)):\n",
    "    interesting_sum = 0\n",
    "    for next_pheno in pheno_analysis:\n",
    "        interesting_sum += pheno_batch_interesting[next_pheno][i]\n",
    "    batch_interesting_sums.append(interesting_sum) \n",
    "\n",
    "# Save the batches in the deemed interesting range\n",
    "\n",
    "batches_in_chromo_range = []\n",
    "for i in range(len(batches)):\n",
    "    chromos = batches[i]['chromos']\n",
    "    use_batch = False\n",
    "    for chromo in chromos:\n",
    "        if ((chromo >= min_chromo) & (chromo < max_chromo)):\n",
    "            use_batch = True\n",
    "    if use_batch:\n",
    "        batches_in_chromo_range.append(i)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather up the interesting and non-interesting batches\n",
    "\n",
    "interesting_batches = [i for i in range(len(batches)) if batch_interesting_sums[i] > 0] \n",
    "noninteresting_batches = [i for i in range(len(batches)) if batch_interesting_sums[i] == 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional cell to see which batches have interesting relationships to the pheno's you're studying\n",
    "\n",
    "for i in range(len(batch_interesting_sums)):\n",
    "    count = batch_interesting_sums[i]\n",
    "    if count > 0:\n",
    "        print(\"Batch \" + str(i) + \": \")\n",
    "        for next in pheno_analysis:\n",
    "            cnt = pheno_batch_interesting[next][i]\n",
    "            print(\"\\t\" + next + \": \" + str(cnt))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create function to build a training set from a batch\n",
    "\n",
    "def build_training(batch):\n",
    "    \n",
    "    training_min_rows = 25000\n",
    "    \n",
    "    # Gather the SNPs\n",
    "    grp_columns = list(batches[batch][\"snps\"])\n",
    "    grp_columns.append(\"id\")\n",
    "    geno_grp = geno.filter(grp_columns)\n",
    "\n",
    "    # Round float values to integers\n",
    "    floats = geno_grp.select_dtypes(include=['float64']) \n",
    "    for col in floats.columns.values:\n",
    "        geno_grp[col] = round(geno_grp[col]).astype('int')    \n",
    "\n",
    "    # Add in the phenome information to the genome training set\n",
    "    genome_phenome = geno_grp.join(pheno.set_index('id'), on = \"id\", how = \"inner\")\n",
    "    columns_use = list(geno_grp.columns)\n",
    "    columns_use.reverse()\n",
    "    columns_use = columns_use + pheno_and_cov  \n",
    "    genome_train = genome_phenome.filter(columns_use)\n",
    "    \n",
    "    # Replicate training set to have a minimum of 25000 examples\n",
    "    dataset_rows = len(genome_train)\n",
    "    genome_train = pd.concat([genome_train] * (training_min_rows // dataset_rows + 1))\n",
    "    genome_train.drop(['id'], axis=1, inplace=True)\n",
    "    \n",
    "    # Save the training file\n",
    "    filename = \"geno_batch\" + str(batch) + \"_train.csv\"\n",
    "    genofile = data_path / \"genome_training_data\" / filename\n",
    "    genome_train.to_csv(genofile, index=False, header=True)\n",
    "    \n",
    "    # Now create of version of map.txt with just the SNPs in the training set of this first batch\n",
    "    mapfile = data_path / \"map.txt\"\n",
    "    mapdata = pd.read_csv(mapfile, sep=' ')\n",
    "    mapdata_use = mapdata[mapdata[\"id\"].isin(batches[batch][\"snps\"])]\n",
    "    filename = \"map_batch\" + str(batch) + \".txt\"\n",
    "    mapfile_new = data_path / \"genome_map_data\" / filename\n",
    "    mapdata_use.to_csv(mapfile_new, sep=' ', header=True, index=False)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many batches are in the intesesting chromosome range (if you chose to do that)\n",
    "len(batches_in_chromo_range)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If desired, create training sets for batches in the desired chromo range\n",
    "\n",
    "batches_to_use = batches_in_chromo_range\n",
    "all_batches_used = []\n",
    "for batch in batches_to_use:\n",
    "    build_training(batch)\n",
    "    all_batches_used.append(batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternatively, you can choose batches from the interesting and non-interesting batch lists.\n",
    "# How many batches have interesting pvalues\n",
    "len(interesting_batches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create training sets for some or all of the batches with interesting pvalues\n",
    "# Set the number you want to use below\n",
    "\n",
    "import random\n",
    "\n",
    "interesting_batches_to_use = 6\n",
    "batches_to_use = random.sample(interesting_batches, interesting_batches_to_use)\n",
    "all_batches_used = []\n",
    "for batch in batches_to_use:\n",
    "    build_training(batch)\n",
    "    all_batches_used.append(batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many batches have interesting pvalues\n",
    "len(noninteresting_batches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Also create training sets for some or all of the batches with no interesting pvalues\n",
    "\n",
    "non_interesting_batches_to_use = 19911\n",
    "batches_to_use = random.sample(noninteresting_batches, non_interesting_batches_to_use)\n",
    "for batch in batches_to_use:\n",
    "    build_training(batch)\n",
    "    all_batches_used.append(batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save to a file the list of batch numbers we created training sets for\n",
    "\n",
    "file_df = pd.DataFrame({\"batch\": all_batches_used})\n",
    "filename = data_path / \"batch_training_list.csv\"\n",
    "file_df.to_csv(filename, index=False, header=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}