synthetic-data-genomics / Git / [befbfc] /research_paper

Models:
MarcoTheBlack/
synthetic-data-genomics
Downloads: 1
[befbfc]: / research_paper_code / notebooks / map.ipynb
History
Download this file
365 lines (364 with data), 14.4 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a1c39fd9",
   "metadata": {},
   "source": [
    "# Experiment\n",
    "This notebook generates the experiment results from output using both GEMMA and linear regression in the directory output. The complete list of SNP-s and the p-value computed is generated for each algorithm and a graphical representation of the SNP-s is outputted as PNG.\n",
    "\n",
    "Generating all the output for all phenotypes would take ~3 days (ofc it can be easily parallelized by running different phenotypes in parallel). The GEMMA algorithm is optimized hence runs much faster than the linear regression.\n",
    "\n",
    "An example output is in example_output.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebb217cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ensure R kernel is installed. For a fresh install, un-comment and run this cell to \n",
    "# install R packages.\n",
    "\n",
    "#install.packages(\"qtl\", repos = \"http://cran.us.r-project.org\")\n",
    "#install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n",
    "#install.packages(\"data.table\", repos = \"http://cran.us.r-project.org\")\n",
    "#install.packages(\"stringr\", repos = \"http://cran.us.r-project.org\")\n",
    "#install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n",
    "#install.packages(\"devtools\", repos = \"http://cran.us.r-project.org\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a19f562",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up working directory structures\n",
    "library(stringr)\n",
    "base_dir        <- str_replace(getwd(), 'research_paper_code/notebooks', '')\n",
    "r_base          <- \"research_paper_code\"\n",
    "experiment_dir  <- \"mice_data_set\"\n",
    "\n",
    "setwd(base_dir)\n",
    "getwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3092da8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map QTLs for phenotypes measured in CFW outbred mice using the linear\n",
    "# mixed model (LMM) analysis implemented in GEMMA.\n",
    "library(qtl)\n",
    "library(data.table)\n",
    "source(paste(r_base, \"/src/misc.R\", sep=\"\"))\n",
    "source(paste(r_base, \"/src/gemma.R\", sep=\"\"))\n",
    "source(paste(r_base, \"/src/read.data.R\", sep=\"\"))\n",
    "source(paste(r_base, \"/src/data.manip.R\", sep=\"\"))\n",
    "source(paste(r_base, \"/src/qtl.analyses.R\", sep=\"\"))\n",
    "\n",
    "# SCRIPT PARAMETERS\n",
    "# -----------------\n",
    "chromosomes    <- NULL\n",
    "gemmadir       <- paste(experiment_dir, \"/gemma\", sep=\"\")\n",
    "gemma.exe      <- paste(\"./\", \"gemma-0.98.4-linux-static-AMD64\", sep=\"\")\n",
    "geno_txt       <- paste(experiment_dir, \"/data/geno.txt\", sep=\"\")\n",
    "map_txt        <- paste(experiment_dir, \"/data/map.txt\", sep=\"\")\n",
    "pheno_csv      <- paste(experiment_dir, \"/data/pheno.csv\", sep=\"\")\n",
    "\n",
    "# LOAD PHENOTYPE DATA\n",
    "# -------------------\n",
    "# Load the phenotype data, and discard outlying phenotype values. I\n",
    "# create binary covariates from some of the categorical phenotypes.\n",
    "cat(\"Loading phenotype data.\\n\")\n",
    "pheno_all <- read.pheno(pheno_csv)\n",
    "pheno_all <- prepare.pheno(pheno_all)\n",
    "pheno_all <- cbind(pheno_all,\n",
    "               binary.from.categorical(pheno_all$FCbox,paste0(\"FCbox\",1:4)),\n",
    "               binary.from.categorical(pheno_all$PPIbox,paste0(\"PPIbox\",1:5)),\n",
    "               binary.from.categorical(pheno_all$methcage,\n",
    "                                       paste0(\"methcage\",1:12)),\n",
    "               binary.from.categorical(pheno_all$round,paste0(\"SW\",1:25)))\n",
    "\n",
    "\n",
    "# LOAD GENOTYPE DATA\n",
    "# ------------------\n",
    "# Load the \"mean genotypes\"; i.e., the the mean alternative allele\n",
    "# counts.\n",
    "cat(\"Loading genotype data.\\n\")\n",
    "map     <- read.map(map_txt)\n",
    "out     <- read.geno.dosage(geno_txt,nrow(map))\n",
    "discard <- out$discard\n",
    "X_all   <- out$geno\n",
    "rm(out)\n",
    "\n",
    "# Discard genotype samples from mislabeled flowcell samples.\n",
    "X_all <- X_all[which(discard == \"no\"),]\n",
    "\n",
    "\n",
    "# Discard SNPs with low \"imputation quality\" assessed by inspecting\n",
    "# the genotype probabilities. Retain SNPs for which: (1) at least 95%\n",
    "# of the samples have a maximum probability genotype greater than than\n",
    "# 0.5; (2) the minor allele frequency is greater than 2%.\n",
    "f       <- apply(X_all,2,compute.maf)\n",
    "markers <- which(map$quality > 0.95 & f > 0.02)\n",
    "map     <- map[markers,]\n",
    "X_all   <- X_all[,markers]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e906e23",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_all[1:30,1:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83231fca",
   "metadata": {},
   "outputs": [],
   "source": [
    "map[1:10,]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95430e8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "pheno_all[1:10,1:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "352e8d84",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_all_csv         <- paste(experiment_dir, \"/data/geno_new.csv\", sep=\"\")\n",
    "pheno_all_csv     <- paste(experiment_dir, \"/data/pheno_new.csv\", sep=\"\")\n",
    "\n",
    "write.csv(X_all, X_all_csv, row.names = FALSE)\n",
    "write.csv(pheno_all, pheno_all_csv, row.names = FALSE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16524707",
   "metadata": {},
   "source": [
    "# Paramaters that can be changed\n",
    "\n",
    "min_var, max_var - which of the columns in genotype data (X_all above) to be considered when using linear models (mostly for speed). Please note that gemma analysis will analyze the whole chromosome\n",
    "\n",
    "analysis_selection - a list of analysis to be processed. For the complete list check src/qtl.analysis.R"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9341aa70",
   "metadata": {},
   "outputs": [],
   "source": [
    "analysis_selection <- analyses\n",
    "# min_var = 1\n",
    "# max_var = 100\n",
    "\n",
    "# Example for all variants\n",
    "min_var = 1\n",
    "max_var = dim(X_all)[2]\n",
    "\n",
    "# Example to select only one analysis\n",
    "# analysis_selection <- analyses[\"BMD\"]\n",
    "\n",
    "# Example to select only one chromosome\n",
    "# min_var = min(which(map[\"chr\"]==11))\n",
    "# max_var = max(which(map[\"chr\"]==11))\n",
    "\n",
    "# Examples from sub-graphs (b) and (c) at https://www.nature.com/articles/ng.3609/figures/3 \n",
    "# analysis_selection <- analyses[\"pp12PPIavg\"]\n",
    "# min_var = min(which(map[\"chr\"]==11))\n",
    "# max_var = max(which(map[\"chr\"]==11))\n",
    "\n",
    "# analysis_selection <- analyses[\"pp12PPIavg\"]\n",
    "# min_var = min(which(map[\"chr\"]==7))\n",
    "# max_var = max(which(map[\"chr\"]==7))\n",
    "\n",
    "# analysis_selection <- analyses[\"testis\"]\n",
    "# min_var = min(which(map[\"chr\"]==13))\n",
    "# max_var = max(which(map[\"chr\"]==13))\n",
    "\n",
    "# analysis_selection <- analyses[\"testis\"]\n",
    "# min_var = min(which(map[\"chr\"]==5))\n",
    "# max_var = max(which(map[\"chr\"]==5))\n",
    "\n",
    "chromosomes <- unique(map[min_var:max_var,\"chr\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70c940c2",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "library(qqman)\n",
    "library(data.table)\n",
    "\n",
    "for(analysis in analysis_selection) { \n",
    "\n",
    "    ##################################\n",
    "    # Cleanup data\n",
    "    cat(\"Loading experiment\\n\")\n",
    "    phenotype  <- analysis$pheno\n",
    "    covariates <- analysis$cov\n",
    "    outliers   <- analysis$outliers\n",
    "    \n",
    "    cat(\"Removing outliers from experiment\\n\")\n",
    "    pheno <- copy(pheno_all)\n",
    "    if (!is.null(outliers))\n",
    "      pheno_all <- remove.outliers(pheno,phenotype,covariates,outliers)\n",
    "\n",
    "    \n",
    "    # Only analyze samples (i.e. rows of the genotype and phenotype\n",
    "    # matrices) for which the phenotype and all the covariates are\n",
    "    # observed.\n",
    "    cat(\"Filtering experiment samples with missing data\\n\")\n",
    "    pheno <- pheno[which(none.missing.row(pheno[c(phenotype,covariates)])),]\n",
    "\n",
    "    # Align the phenotypes and genotypes\n",
    "    cat(\"Aligning phenotypes and genotypes\\n\")\n",
    "    ids   <- intersect(pheno_all$id,rownames(X_all))\n",
    "    pheno <- pheno_all[match(ids,pheno_all$id),]\n",
    "    X     <- X_all[match(ids,rownames(X_all)),]\n",
    "\n",
    "    ###################################\n",
    "    # Compute using gemma\n",
    "    # MAP QTLs\n",
    "    \n",
    "    # Note: Geno is called X, geno and pheno are paired by ID, \n",
    "    # pheno has a column called ID, ID in geno is the row number/name.\n",
    "    X_csv     <- paste(experiment_dir, \"/out/X_\", analysis$pheno, \".csv\",sep=\"\")\n",
    "    pheno_csv <- paste(experiment_dir, \"/out/pheno_\", analysis$pheno, \".csv\",sep=\"\")\n",
    "    ids_csv   <- paste(experiment_dir, \"/out/ids_\", analysis$pheno, \".csv\",sep=\"\")\n",
    "    \n",
    "    write.csv(X, X_csv, row.names = TRUE)\n",
    "    write.csv(pheno, pheno_csv, row.names = FALSE)\n",
    "    write.csv(ids, ids_csv, row.names = FALSE)\n",
    "    \n",
    "    ge_out_dat <- paste(experiment_dir, \"/out/ge_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".dat\", sep=\"\")\n",
    "    ge_out_csv <- paste(experiment_dir, \"/out/ge_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\", sep=\"\")\n",
    "    ge_out_png <- paste(experiment_dir, \"/out/ge_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".png\", sep=\"\")\n",
    "    \n",
    "    if (!file.exists(ge_out_csv) | !file.exists(ge_out_png)) {\n",
    "      # Calculate p-values using GEMMA.\n",
    "        gwscan.gemma <- run.gemma(phenotype,covariates,pheno,X,map,\n",
    "                                  gemmadir,gemma.exe,chromosomes)\n",
    "\n",
    "        # Save results to file.\n",
    "        cat(\"Saving results to file.\\n\")\n",
    "        save(list = c(\"analysis\",\"gwscan.gemma\"),file = ge_out_dat)\n",
    "        \n",
    "        named_gws <- gwscan.gemma\n",
    "        named_gws$snp = rownames(named_gws)\n",
    "        named_gws$p = 10 ^ (-named_gws$log10p)\n",
    "        \n",
    "        png(filename = ge_out_png,\n",
    "            width = 600, height = 600, units = \"px\", pointsize = 12,\n",
    "             bg = \"white\",  res = NA,\n",
    "             type = c(\"cairo\", \"cairo-png\", \"Xlib\", \"quartz\"))\n",
    "        manhattan(named_gws, chr=\"chr\", bp=\"pos\", snp='snp', p=\"p\", annotatePval = 0.1, annotateTop = TRUE )\n",
    "        dev.off()\n",
    "        \n",
    "        write.csv(data.table(named_gws)[order(rank(p)),], ge_out_csv)\n",
    "        \n",
    "    }\n",
    "    \n",
    "    ###################################\n",
    "    # Compute using linear model\n",
    "    lm_out_csv <- paste(experiment_dir, \"/out/lm_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\",sep=\"\")\n",
    "    lm_out_png <- paste(experiment_dir, \"/out/lm_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".png\",sep=\"\")\n",
    "    \n",
    "    if(!file.exists(lm_out_csv) | ! file.exists(lm_out_png)) {\n",
    "        print(dim(X)[2])\n",
    "        dt <- data.table(snp=rep(\"\",dim(X)[2]), chr=rep(0,dim(X)[2]), pos=rep(0,dim(X)[2]), p=rep(1,dim(X)[2]))\n",
    "        for (i in min_var:max_var) {\n",
    "            X_variant <- cbind(X[,i], pheno_column=pheno[,analysis$pheno])\n",
    "            colnames(X_variant)[1]<-colnames(X)[i]\n",
    "            f <- paste(\"pheno_column ~ \",colnames(X)[i])\n",
    "            # Add any covariates\n",
    "            for(cov in analysis$cov) {\n",
    "                X_variant <- cbind(X_variant, pheno_column=pheno[,cov])\n",
    "                f <- paste(f,\"+\",cov)\n",
    "            }\n",
    "            res_variant <- lm(pheno_column~., data = data.frame(X_variant))\n",
    "            dt[i,1] = colnames(X)[i]\n",
    "            dt[i,2] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"chr\"])\n",
    "            dt[i,3] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"pos\"])\n",
    "            dt[i,4] = as.numeric(summary(res_variant)$coefficients[2,4])\n",
    "        }\n",
    "\n",
    "        # Print to file\n",
    "        png(filename = lm_out_png,\n",
    "            width = 600, height = 600, units = \"px\", pointsize = 12,\n",
    "             bg = \"white\",  res = NA,\n",
    "             type = c(\"cairo\", \"cairo-png\", \"Xlib\", \"quartz\"))\n",
    "        manhattan(dt[min_var:max_var], chr=\"chr\", bp=\"pos\", snp='snp', p=\"p\", annotatePval = 0.1, annotateTop = TRUE)\n",
    "        write.csv(dt[order(rank(p)),][1:(max_var-min_var)], lm_out_csv)\n",
    "        dev.off()\n",
    "        print(paste(\"Manhattan plot output to: \", lm_out_png, \", (sorted) values saved in: \", lm_out_csv, sep=\"\"))\n",
    "\n",
    "        ## Print also on screen\n",
    "        manhattan(dt[min_var:max_var], chr=\"chr\", bp=\"pos\", snp='snp', p=\"p\", annotatePval = 0.1, annotateTop = TRUE)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08c52404",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "4.1.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}