294 lines (293 with data), 11.2 kB
{
"cells": [
{
"cell_type": "markdown",
"id": "22d22050",
"metadata": {},
"source": [
"# Experiment\n",
"This notebook generates the experiment results from output using both GEMMA and linear regression in the directory output. The complete list of SNP-s and the p-value computed is generated for each algorithm and a graphical representation of the SNP-s is outputted as PNG.\n",
"\n",
"This notebook is similar to map.ipynb but specifically meant to be run on the new synthetic genome/phenome data."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4ba8943",
"metadata": {},
"outputs": [],
"source": [
"# Ensure R kernel is installed. For a fresh install, un-comment and run this cell to \n",
"# install R packages.\n",
"\n",
"install.packages(\"qtl\", repos = \"http://cran.us.r-project.org\")\n",
"install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n",
"install.packages(\"data.table\", repos = \"http://cran.us.r-project.org\")\n",
"install.packages(\"stringr\", repos = \"http://cran.us.r-project.org\")\n",
"install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n",
"install.packages(\"devtools\", repos = \"http://cran.us.r-project.org\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39ad6796",
"metadata": {},
"outputs": [],
"source": [
"# Set up working directory structures\n",
"\n",
"library(stringr)\n",
"base_dir <- str_replace(getwd(), 'research_paper_code/notebooks', '')\n",
"r_base <- \"research_paper_code\"\n",
"experiment_dir <- \"mice_data_set\" \n",
"real_gwas_path = paste(base_dir, \"/mice_data_set/out\", sep=\"\")\n",
"\n",
"setwd(base_dir)\n",
"getwd()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3bd2a97b",
"metadata": {},
"outputs": [],
"source": [
"# Decide which phenotype you'll be analyzing\n",
"phenotype_choice = \"abBMD\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd44e82c",
"metadata": {},
"outputs": [],
"source": [
"# Map QTLs for phenotypes measured in CFW outbred mice using the linear\n",
"# mixed model (LMM) analysis implemented in GEMMA.\n",
"library(qtl)\n",
"library(data.table)\n",
"library(qqman)\n",
"source(paste(r_base, \"/src/misc.R\", sep=\"\"))\n",
"source(paste(r_base, \"/src/gemma.R\", sep=\"\"))\n",
"source(paste(r_base, \"/src/read.data.R\", sep=\"\"))\n",
"source(paste(r_base, \"/src/data.manip.R\", sep=\"\"))\n",
"source(paste(r_base, \"/src/qtl.analyses.R\", sep=\"\"))\n",
"analysis_selection = analyses[\"abBMD\"]\n",
"for (analysis in analysis_selection) {\n",
" print(analysis)\n",
"}\n",
"\n",
"# SCRIPT PARAMETERS\n",
"# -----------------\n",
"chromosomes <- NULL\n",
"gemmadir <- paste(experiment_dir, \"/gemma\", sep=\"\")\n",
"gemma.exe <- paste(\"./\", \"gemma-0.98.4-linux-static-AMD64\", sep=\"\")\n",
"geno_txt_base <- paste(experiment_dir, \"/data/synthetic_genome_data\", sep=\"\")\n",
"map_txt_base <- paste(experiment_dir, \"/data/genome_map_data\", sep=\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5cefa75",
"metadata": {},
"outputs": [],
"source": [
"# Read in the synthetic phenotype data\n",
"pheno_synth_file <- paste(experiment_dir, \"/data/phenome_alldata_synth.csv\", sep=\"\")\n",
"pheno_all <- read.csv(pheno_synth_file,quote = \"\",header = TRUE,check.names = FALSE,\n",
" stringsAsFactors = FALSE,comment.char = \"#\")"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1360e25f",
"metadata": {},
"outputs": [],
"source": [
"# This is the main gwas function\n",
"\n",
"run_gwas <- function(geno_txt, map_txt, pheno_all, analysis, batch) {\n",
"\n",
" # LOAD GENOTYPE DATA\n",
" # ------------------\n",
" # Load the \"mean genotypes\"; i.e., the the mean alternative allele\n",
" # counts.\n",
" map <- read.map(map_txt)\n",
" out <- read.geno.dosage(geno_txt,nrow(map))\n",
" discard <- out$discard\n",
" X_all <- out$geno\n",
" rm(out)\n",
"\n",
" # Discard genotype samples from mislabeled flowcell samples.\n",
" X_all <- X_all[which(discard == \"no\"),]\n",
"\n",
"\n",
" # Discard SNPs with low \"imputation quality\" assessed by inspecting\n",
" # the genotype probabilities. Retain SNPs for which: (1) at least 95%\n",
" # of the samples have a maximum probability genotype greater than than\n",
" # 0.5; (2) the minor allele frequency is greater than 2%.\n",
" f <- apply(X_all,2,compute.maf)\n",
" markers <- which(map$quality > 0.95 & f > 0.02)\n",
" map <- map[markers,]\n",
" X_all <- X_all[,markers]\n",
" \n",
" # min_var, max_var - which of the columns in genotype data (X_all above) to be considered when \n",
" # using linear models (mostly for speed). Please note that gemma analysis will analyze the \n",
" # whole chromosome\n",
" min_var = 1\n",
" max_var = dim(X_all)[2]\n",
"\n",
" analysis_selection <- analyses[phenotype_choice]\n",
" chromosomes <- unique(map[min_var:max_var,\"chr\"])\n",
" \n",
" ##################################\n",
" # Cleanup data\n",
" phenotype <- analysis$pheno\n",
" covariates <- analysis$cov\n",
" outliers <- analysis$outliers\n",
" \n",
" pheno <- copy(pheno_all)\n",
" if (!is.null(outliers))\n",
" pheno_all <- remove.outliers(pheno,phenotype,covariates,outliers)\n",
"\n",
" \n",
" # Only analyze samples (i.e. rows of the genotype and phenotype\n",
" # matrices) for which the phenotype and all the covariates are\n",
" # observed.\n",
" pheno <- pheno[which(none.missing.row(pheno[c(phenotype,covariates)])),] \n",
" \n",
" # Align the phenotypes and genotypes\n",
" ids <- intersect(pheno_all$id,rownames(X_all))\n",
" pheno <- pheno_all[match(ids,pheno_all$id),]\n",
" X <- X_all[match(ids,rownames(X_all)),]\n",
"\n",
" ###################################\n",
" # Compute using gemma\n",
" # MAP QTLs\n",
" \n",
" \n",
" ge_out_dat <- paste(experiment_dir, \"/out_synth/ge_batch\", batch, \"_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".dat\", sep=\"\")\n",
" ge_out_csv <- paste(experiment_dir, \"/out_synth/ge_batch\", batch, \"_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\", sep=\"\")\n",
" \n",
" if (!file.exists(ge_out_csv)) {\n",
" # Calculate p-values using GEMMA.\n",
" gwscan.gemma <- run.gemma(phenotype,covariates,pheno,X,map,\n",
" gemmadir,gemma.exe,chromosomes)\n",
"\n",
" # Save results to file.\n",
" save(list = c(\"analysis\",\"gwscan.gemma\"),file = ge_out_dat)\n",
" \n",
" named_gws <- gwscan.gemma\n",
" named_gws$snp = rownames(named_gws)\n",
" named_gws$p = 10 ^ (-named_gws$log10p)\n",
" \n",
" write.csv(data.table(named_gws)[order(rank(p)),], ge_out_csv)\n",
" \n",
" }\n",
" \n",
" ###################################\n",
" # Compute using linear model\n",
" lm_out_csv <- paste(experiment_dir, \"/out_synth/lm_batch\", batch, \"_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\",sep=\"\")\n",
" \n",
" if(!file.exists(lm_out_csv)) {\n",
" print(dim(X)[2])\n",
" dt <- data.table(snp=rep(\"\",dim(X)[2]), chr=rep(0,dim(X)[2]), pos=rep(0,dim(X)[2]), p=rep(1,dim(X)[2]))\n",
" for (i in min_var:max_var) {\n",
" X_variant <- cbind(X[,i], pheno_column=pheno[,analysis$pheno])\n",
" colnames(X_variant)[1]<-colnames(X)[i]\n",
" f <- paste(\"pheno_column ~ \",colnames(X)[i])\n",
" # Add any covariates\n",
" for(cov in analysis$cov) {\n",
" X_variant <- cbind(X_variant, pheno_column=pheno[,cov])\n",
" f <- paste(f,\"+\",cov)\n",
" }\n",
" res_variant <- lm(pheno_column~., data = data.frame(X_variant))\n",
" \n",
" dt[i,1] = colnames(X)[i]\n",
" dt[i,2] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"chr\"])\n",
" dt[i,3] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"pos\"])\n",
" dt[i,4] = as.numeric(summary(res_variant)$coefficients[2,4])\n",
" }\n",
" \n",
" # Print to file\n",
"\n",
" write.csv(dt[order(rank(p)),][1:(max_var-min_var)], lm_out_csv)\n",
" print(paste(\"Sorted p-values saved in: \", lm_out_csv, sep=\"\"))\n",
"\n",
" }\n",
"\n",
" \n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f5e3d381",
"metadata": {},
"outputs": [],
"source": [
"# Now you can run GWAS for a particular synthetic genome batch\n",
"# Be sure the contents of out_synth doesn't already have output for this batch or it won't rerun\n",
"# Restart the notebook for each call to run_gwas\n",
"\n",
"batch = 2316\n",
"geno_txt <- paste(geno_txt_base, \"/synthetic_genomes_batch\", batch, \".txt\", sep=\"\")\n",
"map_txt <- paste(map_txt_base, \"/map_abBMD_batch\", batch, \".txt\", sep=\"\")\n",
"run_gwas(geno_txt, map_txt, pheno_all, analysis, batch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aee40f73",
"metadata": {},
"outputs": [],
"source": [
"# Alternatively, you can run gwas for a merged larger synthetic genome file\n",
"# Be sure that any previous files created in out_synth for this pheno aren't still there\n",
"# or new data will not be created\n",
"# Restart the notebook for each call to run_gwas\n",
"# NOTE - YOU MUST USE THE FILENAMES THAT YOU CREATED AT THE END OF THE 04 NOTEBOOK\n",
"\n",
"batch = \"all\"\n",
"geno_txt <- paste(geno_txt_base, \"/synthetic_genomes_allbatches.txt\", sep=\"\")\n",
"map_txt <- paste(map_txt_base, \"/map_abBMD_allbatches.txt\", sep=\"\")\n",
"run_gwas(geno_txt, map_txt, pheno_all, analysis, batch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "31f34dc0",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "4.1.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}