--- a +++ b/research_paper_code/notebooks/map.ipynb @@ -0,0 +1,364 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1c39fd9", + "metadata": {}, + "source": [ + "# Experiment\n", + "This notebook generates the experiment results from output using both GEMMA and linear regression in the directory output. The complete list of SNP-s and the p-value computed is generated for each algorithm and a graphical representation of the SNP-s is outputted as PNG.\n", + "\n", + "Generating all the output for all phenotypes would take ~3 days (ofc it can be easily parallelized by running different phenotypes in parallel). The GEMMA algorithm is optimized hence runs much faster than the linear regression.\n", + "\n", + "An example output is in example_output.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebb217cd", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure R kernel is installed. For a fresh install, un-comment and run this cell to \n", + "# install R packages.\n", + "\n", + "#install.packages(\"qtl\", repos = \"http://cran.us.r-project.org\")\n", + "#install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n", + "#install.packages(\"data.table\", repos = \"http://cran.us.r-project.org\")\n", + "#install.packages(\"stringr\", repos = \"http://cran.us.r-project.org\")\n", + "#install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n", + "#install.packages(\"devtools\", repos = \"http://cran.us.r-project.org\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a19f562", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up working directory structures\n", + "library(stringr)\n", + "base_dir <- str_replace(getwd(), 'research_paper_code/notebooks', '')\n", + "r_base <- \"research_paper_code\"\n", + "experiment_dir <- \"mice_data_set\"\n", + "\n", + "setwd(base_dir)\n", + "getwd()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3092da8", + "metadata": {}, + "outputs": [], + "source": [ + "# Map QTLs for phenotypes measured in CFW outbred mice using the linear\n", + "# mixed model (LMM) analysis implemented in GEMMA.\n", + "library(qtl)\n", + "library(data.table)\n", + "source(paste(r_base, \"/src/misc.R\", sep=\"\"))\n", + "source(paste(r_base, \"/src/gemma.R\", sep=\"\"))\n", + "source(paste(r_base, \"/src/read.data.R\", sep=\"\"))\n", + "source(paste(r_base, \"/src/data.manip.R\", sep=\"\"))\n", + "source(paste(r_base, \"/src/qtl.analyses.R\", sep=\"\"))\n", + "\n", + "# SCRIPT PARAMETERS\n", + "# -----------------\n", + "chromosomes <- NULL\n", + "gemmadir <- paste(experiment_dir, \"/gemma\", sep=\"\")\n", + "gemma.exe <- paste(\"./\", \"gemma-0.98.4-linux-static-AMD64\", sep=\"\")\n", + "geno_txt <- paste(experiment_dir, \"/data/geno.txt\", sep=\"\")\n", + "map_txt <- paste(experiment_dir, \"/data/map.txt\", sep=\"\")\n", + "pheno_csv <- paste(experiment_dir, \"/data/pheno.csv\", sep=\"\")\n", + "\n", + "# LOAD PHENOTYPE DATA\n", + "# -------------------\n", + "# Load the phenotype data, and discard outlying phenotype values. I\n", + "# create binary covariates from some of the categorical phenotypes.\n", + "cat(\"Loading phenotype data.\\n\")\n", + "pheno_all <- read.pheno(pheno_csv)\n", + "pheno_all <- prepare.pheno(pheno_all)\n", + "pheno_all <- cbind(pheno_all,\n", + " binary.from.categorical(pheno_all$FCbox,paste0(\"FCbox\",1:4)),\n", + " binary.from.categorical(pheno_all$PPIbox,paste0(\"PPIbox\",1:5)),\n", + " binary.from.categorical(pheno_all$methcage,\n", + " paste0(\"methcage\",1:12)),\n", + " binary.from.categorical(pheno_all$round,paste0(\"SW\",1:25)))\n", + "\n", + "\n", + "# LOAD GENOTYPE DATA\n", + "# ------------------\n", + "# Load the \"mean genotypes\"; i.e., the the mean alternative allele\n", + "# counts.\n", + "cat(\"Loading genotype data.\\n\")\n", + "map <- read.map(map_txt)\n", + "out <- read.geno.dosage(geno_txt,nrow(map))\n", + "discard <- out$discard\n", + "X_all <- out$geno\n", + "rm(out)\n", + "\n", + "# Discard genotype samples from mislabeled flowcell samples.\n", + "X_all <- X_all[which(discard == \"no\"),]\n", + "\n", + "\n", + "# Discard SNPs with low \"imputation quality\" assessed by inspecting\n", + "# the genotype probabilities. Retain SNPs for which: (1) at least 95%\n", + "# of the samples have a maximum probability genotype greater than than\n", + "# 0.5; (2) the minor allele frequency is greater than 2%.\n", + "f <- apply(X_all,2,compute.maf)\n", + "markers <- which(map$quality > 0.95 & f > 0.02)\n", + "map <- map[markers,]\n", + "X_all <- X_all[,markers]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e906e23", + "metadata": {}, + "outputs": [], + "source": [ + "X_all[1:30,1:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83231fca", + "metadata": {}, + "outputs": [], + "source": [ + "map[1:10,]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95430e8f", + "metadata": {}, + "outputs": [], + "source": [ + "pheno_all[1:10,1:20]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "352e8d84", + "metadata": {}, + "outputs": [], + "source": [ + "X_all_csv <- paste(experiment_dir, \"/data/geno_new.csv\", sep=\"\")\n", + "pheno_all_csv <- paste(experiment_dir, \"/data/pheno_new.csv\", sep=\"\")\n", + "\n", + "write.csv(X_all, X_all_csv, row.names = FALSE)\n", + "write.csv(pheno_all, pheno_all_csv, row.names = FALSE)" + ] + }, + { + "cell_type": "markdown", + "id": "16524707", + "metadata": {}, + "source": [ + "# Paramaters that can be changed\n", + "\n", + "min_var, max_var - which of the columns in genotype data (X_all above) to be considered when using linear models (mostly for speed). Please note that gemma analysis will analyze the whole chromosome\n", + "\n", + "analysis_selection - a list of analysis to be processed. For the complete list check src/qtl.analysis.R" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9341aa70", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_selection <- analyses\n", + "# min_var = 1\n", + "# max_var = 100\n", + "\n", + "# Example for all variants\n", + "min_var = 1\n", + "max_var = dim(X_all)[2]\n", + "\n", + "# Example to select only one analysis\n", + "# analysis_selection <- analyses[\"BMD\"]\n", + "\n", + "# Example to select only one chromosome\n", + "# min_var = min(which(map[\"chr\"]==11))\n", + "# max_var = max(which(map[\"chr\"]==11))\n", + "\n", + "# Examples from sub-graphs (b) and (c) at https://www.nature.com/articles/ng.3609/figures/3 \n", + "# analysis_selection <- analyses[\"pp12PPIavg\"]\n", + "# min_var = min(which(map[\"chr\"]==11))\n", + "# max_var = max(which(map[\"chr\"]==11))\n", + "\n", + "# analysis_selection <- analyses[\"pp12PPIavg\"]\n", + "# min_var = min(which(map[\"chr\"]==7))\n", + "# max_var = max(which(map[\"chr\"]==7))\n", + "\n", + "# analysis_selection <- analyses[\"testis\"]\n", + "# min_var = min(which(map[\"chr\"]==13))\n", + "# max_var = max(which(map[\"chr\"]==13))\n", + "\n", + "# analysis_selection <- analyses[\"testis\"]\n", + "# min_var = min(which(map[\"chr\"]==5))\n", + "# max_var = max(which(map[\"chr\"]==5))\n", + "\n", + "chromosomes <- unique(map[min_var:max_var,\"chr\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70c940c2", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "library(qqman)\n", + "library(data.table)\n", + "\n", + "for(analysis in analysis_selection) { \n", + "\n", + " ##################################\n", + " # Cleanup data\n", + " cat(\"Loading experiment\\n\")\n", + " phenotype <- analysis$pheno\n", + " covariates <- analysis$cov\n", + " outliers <- analysis$outliers\n", + " \n", + " cat(\"Removing outliers from experiment\\n\")\n", + " pheno <- copy(pheno_all)\n", + " if (!is.null(outliers))\n", + " pheno_all <- remove.outliers(pheno,phenotype,covariates,outliers)\n", + "\n", + " \n", + " # Only analyze samples (i.e. rows of the genotype and phenotype\n", + " # matrices) for which the phenotype and all the covariates are\n", + " # observed.\n", + " cat(\"Filtering experiment samples with missing data\\n\")\n", + " pheno <- pheno[which(none.missing.row(pheno[c(phenotype,covariates)])),]\n", + "\n", + " # Align the phenotypes and genotypes\n", + " cat(\"Aligning phenotypes and genotypes\\n\")\n", + " ids <- intersect(pheno_all$id,rownames(X_all))\n", + " pheno <- pheno_all[match(ids,pheno_all$id),]\n", + " X <- X_all[match(ids,rownames(X_all)),]\n", + "\n", + " ###################################\n", + " # Compute using gemma\n", + " # MAP QTLs\n", + " \n", + " # Note: Geno is called X, geno and pheno are paired by ID, \n", + " # pheno has a column called ID, ID in geno is the row number/name.\n", + " X_csv <- paste(experiment_dir, \"/out/X_\", analysis$pheno, \".csv\",sep=\"\")\n", + " pheno_csv <- paste(experiment_dir, \"/out/pheno_\", analysis$pheno, \".csv\",sep=\"\")\n", + " ids_csv <- paste(experiment_dir, \"/out/ids_\", analysis$pheno, \".csv\",sep=\"\")\n", + " \n", + " write.csv(X, X_csv, row.names = TRUE)\n", + " write.csv(pheno, pheno_csv, row.names = FALSE)\n", + " write.csv(ids, ids_csv, row.names = FALSE)\n", + " \n", + " ge_out_dat <- paste(experiment_dir, \"/out/ge_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".dat\", sep=\"\")\n", + " ge_out_csv <- paste(experiment_dir, \"/out/ge_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\", sep=\"\")\n", + " ge_out_png <- paste(experiment_dir, \"/out/ge_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".png\", sep=\"\")\n", + " \n", + " if (!file.exists(ge_out_csv) | !file.exists(ge_out_png)) {\n", + " # Calculate p-values using GEMMA.\n", + " gwscan.gemma <- run.gemma(phenotype,covariates,pheno,X,map,\n", + " gemmadir,gemma.exe,chromosomes)\n", + "\n", + " # Save results to file.\n", + " cat(\"Saving results to file.\\n\")\n", + " save(list = c(\"analysis\",\"gwscan.gemma\"),file = ge_out_dat)\n", + " \n", + " named_gws <- gwscan.gemma\n", + " named_gws$snp = rownames(named_gws)\n", + " named_gws$p = 10 ^ (-named_gws$log10p)\n", + " \n", + " png(filename = ge_out_png,\n", + " width = 600, height = 600, units = \"px\", pointsize = 12,\n", + " bg = \"white\", res = NA,\n", + " type = c(\"cairo\", \"cairo-png\", \"Xlib\", \"quartz\"))\n", + " manhattan(named_gws, chr=\"chr\", bp=\"pos\", snp='snp', p=\"p\", annotatePval = 0.1, annotateTop = TRUE )\n", + " dev.off()\n", + " \n", + " write.csv(data.table(named_gws)[order(rank(p)),], ge_out_csv)\n", + " \n", + " }\n", + " \n", + " ###################################\n", + " # Compute using linear model\n", + " lm_out_csv <- paste(experiment_dir, \"/out/lm_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\",sep=\"\")\n", + " lm_out_png <- paste(experiment_dir, \"/out/lm_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".png\",sep=\"\")\n", + " \n", + " if(!file.exists(lm_out_csv) | ! file.exists(lm_out_png)) {\n", + " print(dim(X)[2])\n", + " dt <- data.table(snp=rep(\"\",dim(X)[2]), chr=rep(0,dim(X)[2]), pos=rep(0,dim(X)[2]), p=rep(1,dim(X)[2]))\n", + " for (i in min_var:max_var) {\n", + " X_variant <- cbind(X[,i], pheno_column=pheno[,analysis$pheno])\n", + " colnames(X_variant)[1]<-colnames(X)[i]\n", + " f <- paste(\"pheno_column ~ \",colnames(X)[i])\n", + " # Add any covariates\n", + " for(cov in analysis$cov) {\n", + " X_variant <- cbind(X_variant, pheno_column=pheno[,cov])\n", + " f <- paste(f,\"+\",cov)\n", + " }\n", + " res_variant <- lm(pheno_column~., data = data.frame(X_variant))\n", + " dt[i,1] = colnames(X)[i]\n", + " dt[i,2] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"chr\"])\n", + " dt[i,3] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"pos\"])\n", + " dt[i,4] = as.numeric(summary(res_variant)$coefficients[2,4])\n", + " }\n", + "\n", + " # Print to file\n", + " png(filename = lm_out_png,\n", + " width = 600, height = 600, units = \"px\", pointsize = 12,\n", + " bg = \"white\", res = NA,\n", + " type = c(\"cairo\", \"cairo-png\", \"Xlib\", \"quartz\"))\n", + " manhattan(dt[min_var:max_var], chr=\"chr\", bp=\"pos\", snp='snp', p=\"p\", annotatePval = 0.1, annotateTop = TRUE)\n", + " write.csv(dt[order(rank(p)),][1:(max_var-min_var)], lm_out_csv)\n", + " dev.off()\n", + " print(paste(\"Manhattan plot output to: \", lm_out_png, \", (sorted) values saved in: \", lm_out_csv, sep=\"\"))\n", + "\n", + " ## Print also on screen\n", + " manhattan(dt[min_var:max_var], chr=\"chr\", bp=\"pos\", snp='snp', p=\"p\", annotatePval = 0.1, annotateTop = TRUE)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08c52404", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.1.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}