synthetic-data-genomics / Git / Diff of /research_paper

Models:
MarcoTheBlack/
synthetic-data-genomics
Downloads: 1
Diff of /research_paper_code/notebooks/map.ipynb [000000] .. [befbfc]
Switch to side-by-side view

--- a
+++ b/research_paper_code/notebooks/map.ipynb
@@ -0,0 +1,364 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a1c39fd9",
+   "metadata": {},
+   "source": [
+    "# Experiment\n",
+    "This notebook generates the experiment results from output using both GEMMA and linear regression in the directory output. The complete list of SNP-s and the p-value computed is generated for each algorithm and a graphical representation of the SNP-s is outputted as PNG.\n",
+    "\n",
+    "Generating all the output for all phenotypes would take ~3 days (ofc it can be easily parallelized by running different phenotypes in parallel). The GEMMA algorithm is optimized hence runs much faster than the linear regression.\n",
+    "\n",
+    "An example output is in example_output.tar.gz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ebb217cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ensure R kernel is installed. For a fresh install, un-comment and run this cell to \n",
+    "# install R packages.\n",
+    "\n",
+    "#install.packages(\"qtl\", repos = \"http://cran.us.r-project.org\")\n",
+    "#install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n",
+    "#install.packages(\"data.table\", repos = \"http://cran.us.r-project.org\")\n",
+    "#install.packages(\"stringr\", repos = \"http://cran.us.r-project.org\")\n",
+    "#install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n",
+    "#install.packages(\"devtools\", repos = \"http://cran.us.r-project.org\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a19f562",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set up working directory structures\n",
+    "library(stringr)\n",
+    "base_dir        <- str_replace(getwd(), 'research_paper_code/notebooks', '')\n",
+    "r_base          <- \"research_paper_code\"\n",
+    "experiment_dir  <- \"mice_data_set\"\n",
+    "\n",
+    "setwd(base_dir)\n",
+    "getwd()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e3092da8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Map QTLs for phenotypes measured in CFW outbred mice using the linear\n",
+    "# mixed model (LMM) analysis implemented in GEMMA.\n",
+    "library(qtl)\n",
+    "library(data.table)\n",
+    "source(paste(r_base, \"/src/misc.R\", sep=\"\"))\n",
+    "source(paste(r_base, \"/src/gemma.R\", sep=\"\"))\n",
+    "source(paste(r_base, \"/src/read.data.R\", sep=\"\"))\n",
+    "source(paste(r_base, \"/src/data.manip.R\", sep=\"\"))\n",
+    "source(paste(r_base, \"/src/qtl.analyses.R\", sep=\"\"))\n",
+    "\n",
+    "# SCRIPT PARAMETERS\n",
+    "# -----------------\n",
+    "chromosomes    <- NULL\n",
+    "gemmadir       <- paste(experiment_dir, \"/gemma\", sep=\"\")\n",
+    "gemma.exe      <- paste(\"./\", \"gemma-0.98.4-linux-static-AMD64\", sep=\"\")\n",
+    "geno_txt       <- paste(experiment_dir, \"/data/geno.txt\", sep=\"\")\n",
+    "map_txt        <- paste(experiment_dir, \"/data/map.txt\", sep=\"\")\n",
+    "pheno_csv      <- paste(experiment_dir, \"/data/pheno.csv\", sep=\"\")\n",
+    "\n",
+    "# LOAD PHENOTYPE DATA\n",
+    "# -------------------\n",
+    "# Load the phenotype data, and discard outlying phenotype values. I\n",
+    "# create binary covariates from some of the categorical phenotypes.\n",
+    "cat(\"Loading phenotype data.\\n\")\n",
+    "pheno_all <- read.pheno(pheno_csv)\n",
+    "pheno_all <- prepare.pheno(pheno_all)\n",
+    "pheno_all <- cbind(pheno_all,\n",
+    "               binary.from.categorical(pheno_all$FCbox,paste0(\"FCbox\",1:4)),\n",
+    "               binary.from.categorical(pheno_all$PPIbox,paste0(\"PPIbox\",1:5)),\n",
+    "               binary.from.categorical(pheno_all$methcage,\n",
+    "                                       paste0(\"methcage\",1:12)),\n",
+    "               binary.from.categorical(pheno_all$round,paste0(\"SW\",1:25)))\n",
+    "\n",
+    "\n",
+    "# LOAD GENOTYPE DATA\n",
+    "# ------------------\n",
+    "# Load the \"mean genotypes\"; i.e., the the mean alternative allele\n",
+    "# counts.\n",
+    "cat(\"Loading genotype data.\\n\")\n",
+    "map     <- read.map(map_txt)\n",
+    "out     <- read.geno.dosage(geno_txt,nrow(map))\n",
+    "discard <- out$discard\n",
+    "X_all   <- out$geno\n",
+    "rm(out)\n",
+    "\n",
+    "# Discard genotype samples from mislabeled flowcell samples.\n",
+    "X_all <- X_all[which(discard == \"no\"),]\n",
+    "\n",
+    "\n",
+    "# Discard SNPs with low \"imputation quality\" assessed by inspecting\n",
+    "# the genotype probabilities. Retain SNPs for which: (1) at least 95%\n",
+    "# of the samples have a maximum probability genotype greater than than\n",
+    "# 0.5; (2) the minor allele frequency is greater than 2%.\n",
+    "f       <- apply(X_all,2,compute.maf)\n",
+    "markers <- which(map$quality > 0.95 & f > 0.02)\n",
+    "map     <- map[markers,]\n",
+    "X_all   <- X_all[,markers]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e906e23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_all[1:30,1:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83231fca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "map[1:10,]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95430e8f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pheno_all[1:10,1:20]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "352e8d84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_all_csv         <- paste(experiment_dir, \"/data/geno_new.csv\", sep=\"\")\n",
+    "pheno_all_csv     <- paste(experiment_dir, \"/data/pheno_new.csv\", sep=\"\")\n",
+    "\n",
+    "write.csv(X_all, X_all_csv, row.names = FALSE)\n",
+    "write.csv(pheno_all, pheno_all_csv, row.names = FALSE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "16524707",
+   "metadata": {},
+   "source": [
+    "# Paramaters that can be changed\n",
+    "\n",
+    "min_var, max_var - which of the columns in genotype data (X_all above) to be considered when using linear models (mostly for speed). Please note that gemma analysis will analyze the whole chromosome\n",
+    "\n",
+    "analysis_selection - a list of analysis to be processed. For the complete list check src/qtl.analysis.R"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9341aa70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analysis_selection <- analyses\n",
+    "# min_var = 1\n",
+    "# max_var = 100\n",
+    "\n",
+    "# Example for all variants\n",
+    "min_var = 1\n",
+    "max_var = dim(X_all)[2]\n",
+    "\n",
+    "# Example to select only one analysis\n",
+    "# analysis_selection <- analyses[\"BMD\"]\n",
+    "\n",
+    "# Example to select only one chromosome\n",
+    "# min_var = min(which(map[\"chr\"]==11))\n",
+    "# max_var = max(which(map[\"chr\"]==11))\n",
+    "\n",
+    "# Examples from sub-graphs (b) and (c) at https://www.nature.com/articles/ng.3609/figures/3 \n",
+    "# analysis_selection <- analyses[\"pp12PPIavg\"]\n",
+    "# min_var = min(which(map[\"chr\"]==11))\n",
+    "# max_var = max(which(map[\"chr\"]==11))\n",
+    "\n",
+    "# analysis_selection <- analyses[\"pp12PPIavg\"]\n",
+    "# min_var = min(which(map[\"chr\"]==7))\n",
+    "# max_var = max(which(map[\"chr\"]==7))\n",
+    "\n",
+    "# analysis_selection <- analyses[\"testis\"]\n",
+    "# min_var = min(which(map[\"chr\"]==13))\n",
+    "# max_var = max(which(map[\"chr\"]==13))\n",
+    "\n",
+    "# analysis_selection <- analyses[\"testis\"]\n",
+    "# min_var = min(which(map[\"chr\"]==5))\n",
+    "# max_var = max(which(map[\"chr\"]==5))\n",
+    "\n",
+    "chromosomes <- unique(map[min_var:max_var,\"chr\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70c940c2",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "library(qqman)\n",
+    "library(data.table)\n",
+    "\n",
+    "for(analysis in analysis_selection) { \n",
+    "\n",
+    "    ##################################\n",
+    "    # Cleanup data\n",
+    "    cat(\"Loading experiment\\n\")\n",
+    "    phenotype  <- analysis$pheno\n",
+    "    covariates <- analysis$cov\n",
+    "    outliers   <- analysis$outliers\n",
+    "    \n",
+    "    cat(\"Removing outliers from experiment\\n\")\n",
+    "    pheno <- copy(pheno_all)\n",
+    "    if (!is.null(outliers))\n",
+    "      pheno_all <- remove.outliers(pheno,phenotype,covariates,outliers)\n",
+    "\n",
+    "    \n",
+    "    # Only analyze samples (i.e. rows of the genotype and phenotype\n",
+    "    # matrices) for which the phenotype and all the covariates are\n",
+    "    # observed.\n",
+    "    cat(\"Filtering experiment samples with missing data\\n\")\n",
+    "    pheno <- pheno[which(none.missing.row(pheno[c(phenotype,covariates)])),]\n",
+    "\n",
+    "    # Align the phenotypes and genotypes\n",
+    "    cat(\"Aligning phenotypes and genotypes\\n\")\n",
+    "    ids   <- intersect(pheno_all$id,rownames(X_all))\n",
+    "    pheno <- pheno_all[match(ids,pheno_all$id),]\n",
+    "    X     <- X_all[match(ids,rownames(X_all)),]\n",
+    "\n",
+    "    ###################################\n",
+    "    # Compute using gemma\n",
+    "    # MAP QTLs\n",
+    "    \n",
+    "    # Note: Geno is called X, geno and pheno are paired by ID, \n",
+    "    # pheno has a column called ID, ID in geno is the row number/name.\n",
+    "    X_csv     <- paste(experiment_dir, \"/out/X_\", analysis$pheno, \".csv\",sep=\"\")\n",
+    "    pheno_csv <- paste(experiment_dir, \"/out/pheno_\", analysis$pheno, \".csv\",sep=\"\")\n",
+    "    ids_csv   <- paste(experiment_dir, \"/out/ids_\", analysis$pheno, \".csv\",sep=\"\")\n",
+    "    \n",
+    "    write.csv(X, X_csv, row.names = TRUE)\n",
+    "    write.csv(pheno, pheno_csv, row.names = FALSE)\n",
+    "    write.csv(ids, ids_csv, row.names = FALSE)\n",
+    "    \n",
+    "    ge_out_dat <- paste(experiment_dir, \"/out/ge_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".dat\", sep=\"\")\n",
+    "    ge_out_csv <- paste(experiment_dir, \"/out/ge_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\", sep=\"\")\n",
+    "    ge_out_png <- paste(experiment_dir, \"/out/ge_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".png\", sep=\"\")\n",
+    "    \n",
+    "    if (!file.exists(ge_out_csv) | !file.exists(ge_out_png)) {\n",
+    "      # Calculate p-values using GEMMA.\n",
+    "        gwscan.gemma <- run.gemma(phenotype,covariates,pheno,X,map,\n",
+    "                                  gemmadir,gemma.exe,chromosomes)\n",
+    "\n",
+    "        # Save results to file.\n",
+    "        cat(\"Saving results to file.\\n\")\n",
+    "        save(list = c(\"analysis\",\"gwscan.gemma\"),file = ge_out_dat)\n",
+    "        \n",
+    "        named_gws <- gwscan.gemma\n",
+    "        named_gws$snp = rownames(named_gws)\n",
+    "        named_gws$p = 10 ^ (-named_gws$log10p)\n",
+    "        \n",
+    "        png(filename = ge_out_png,\n",
+    "            width = 600, height = 600, units = \"px\", pointsize = 12,\n",
+    "             bg = \"white\",  res = NA,\n",
+    "             type = c(\"cairo\", \"cairo-png\", \"Xlib\", \"quartz\"))\n",
+    "        manhattan(named_gws, chr=\"chr\", bp=\"pos\", snp='snp', p=\"p\", annotatePval = 0.1, annotateTop = TRUE )\n",
+    "        dev.off()\n",
+    "        \n",
+    "        write.csv(data.table(named_gws)[order(rank(p)),], ge_out_csv)\n",
+    "        \n",
+    "    }\n",
+    "    \n",
+    "    ###################################\n",
+    "    # Compute using linear model\n",
+    "    lm_out_csv <- paste(experiment_dir, \"/out/lm_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\",sep=\"\")\n",
+    "    lm_out_png <- paste(experiment_dir, \"/out/lm_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".png\",sep=\"\")\n",
+    "    \n",
+    "    if(!file.exists(lm_out_csv) | ! file.exists(lm_out_png)) {\n",
+    "        print(dim(X)[2])\n",
+    "        dt <- data.table(snp=rep(\"\",dim(X)[2]), chr=rep(0,dim(X)[2]), pos=rep(0,dim(X)[2]), p=rep(1,dim(X)[2]))\n",
+    "        for (i in min_var:max_var) {\n",
+    "            X_variant <- cbind(X[,i], pheno_column=pheno[,analysis$pheno])\n",
+    "            colnames(X_variant)[1]<-colnames(X)[i]\n",
+    "            f <- paste(\"pheno_column ~ \",colnames(X)[i])\n",
+    "            # Add any covariates\n",
+    "            for(cov in analysis$cov) {\n",
+    "                X_variant <- cbind(X_variant, pheno_column=pheno[,cov])\n",
+    "                f <- paste(f,\"+\",cov)\n",
+    "            }\n",
+    "            res_variant <- lm(pheno_column~., data = data.frame(X_variant))\n",
+    "            dt[i,1] = colnames(X)[i]\n",
+    "            dt[i,2] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"chr\"])\n",
+    "            dt[i,3] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"pos\"])\n",
+    "            dt[i,4] = as.numeric(summary(res_variant)$coefficients[2,4])\n",
+    "        }\n",
+    "\n",
+    "        # Print to file\n",
+    "        png(filename = lm_out_png,\n",
+    "            width = 600, height = 600, units = \"px\", pointsize = 12,\n",
+    "             bg = \"white\",  res = NA,\n",
+    "             type = c(\"cairo\", \"cairo-png\", \"Xlib\", \"quartz\"))\n",
+    "        manhattan(dt[min_var:max_var], chr=\"chr\", bp=\"pos\", snp='snp', p=\"p\", annotatePval = 0.1, annotateTop = TRUE)\n",
+    "        write.csv(dt[order(rank(p)),][1:(max_var-min_var)], lm_out_csv)\n",
+    "        dev.off()\n",
+    "        print(paste(\"Manhattan plot output to: \", lm_out_png, \", (sorted) values saved in: \", lm_out_csv, sep=\"\"))\n",
+    "\n",
+    "        ## Print also on screen\n",
+    "        manhattan(dt[min_var:max_var], chr=\"chr\", bp=\"pos\", snp='snp', p=\"p\", annotatePval = 0.1, annotateTop = TRUE)\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08c52404",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "R",
+   "language": "R",
+   "name": "ir"
+  },
+  "language_info": {
+   "codemirror_mode": "r",
+   "file_extension": ".r",
+   "mimetype": "text/x-r-source",
+   "name": "R",
+   "pygments_lexer": "r",
+   "version": "4.1.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}