shaprs / Git / Diff of /R/shaPRS.R

Models:

MarcoTheBlack/

shaprs

Downloads: 1

Diff of /R/shaPRS.R [000000] .. [6b94fb]

Switch to unified view

 b/R/shaPRS.R
+#' Creates a new set of summary statistics
+#'
+#' Performs both steps of the shaPRS method in a single call:
+#' (1) Creates lFDR corrected Q-test statistics for each SNP (via shaPRS_adjust)
+#' (2) Produces summary statistics according to a continuous weighting scheme (via shaPRS_blend_overlap)
+#' (3) Writes to disk the following files with postfixes:   "_adjustinput", "_SNP_lFDR", "_shaprs", "_meta", which are the input and output for shaPRS_adjust (also required for shaPRS_LDGen), and shaPRS adjusted summary stats and a fixed-effect meta-analysis
+#'
+#' @param proximalLoc proximal LDPred formatted GWAS summary statistics table  that has header with the following columns: chr  pos SNP A1  A2  Freq1.Hapmap    b   se  p   N
+#' @param adjunctLoc dataframe for adjunct dataset of the same signature
+#' @param outputLoc the location of the output files
+#' @param rho estimate of correlation between studies due to shared subjects. 0 for no overlap and 1 for complete overlap. default: 0. Obtain this from shaPRS_rho()
+#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
+#' @param useProximalForMissing (optional) if SNPs missing from the adjunct data should be kept using the proximal data or not (default TRUE)
+#' @return called for its side effects: the four files written next to outputLoc
+#'
+#' @importFrom stats na.omit pchisq pnorm cor
+#' @importFrom utils read.table write.table
+#'
+#' @examples
+#' proximalLoc <- system.file("extdata", "phenoA_sumstats", package = "shaPRS")
+#' adjunctLoc <- system.file("extdata", "phenoB_sumstats", package = "shaPRS")
+#' shaPRS(proximalLoc, adjunctLoc, tempfile())
+#'
+#' @export
+shaPRS = function(proximalLoc, adjunctLoc, outputLoc, rho = 0, discardAmbiguousSNPs = TRUE, useProximalForMissing = TRUE) {
+  # load proximal and adjunct datas
+  proximal = read.table(proximalLoc, header = TRUE)
+  adjunct = read.table(adjunctLoc, header = TRUE)
+  # Create input file for blending factors step with following format:
+  # SNP CHR BP  Beta_A  SE_A  A1.x  A2.x    Beta_B  SE_B  A1.y  A2.y
+  # the positional indices below rely on both tables having exactly the 10 documented columns in the documented order
+  inputData = merge(proximal, adjunct, by.x = "SNP", by.y = "SNP")[, c(1, 2, 3, 7, 8, 4, 5, 16, 17, 13, 14)]
+  colnames(inputData) = c("SNP", "CHR", "BP", "Beta_A", "SE_A", "A1.x", "A2.x", "Beta_B", "SE_B", "A1.y", "A2.y")
+  inputDataLoc = paste0(outputLoc, "_adjustinput")
+  # write to disk
+  write.table(inputData, inputDataLoc, row.names = FALSE, col.names = TRUE, quote = FALSE)
+  # I) Generate blending factors
+  # 1. lFDR estimation
+  results = shaPRS_adjust(inputData, rho = rho, discardAmbiguousSNPs = discardAmbiguousSNPs)
+  # 2. write out a table of lFDR values for each SNP
+  lFDRTable <- results$lFDRTable
+  colnames(lFDRTable) = c("SNP", "lFDR", "Qval")
+  blendFactorLoc = paste0(outputLoc, "_SNP_lFDR")
+  write.table(lFDRTable, blendFactorLoc, row.names = FALSE, col.names = TRUE, quote = FALSE)
+  print(paste("written lFDRs and Qvals for SNPs to",blendFactorLoc))
+  # II) produce final summary statistics
+  # INVERSE VARIANCE FIXED EFFECT META ANALYSIS:
+  CombinedPheno = inverse_metaAnalaysis(proximal, adjunct, rho = rho, discardAmbiguousSNPs = discardAmbiguousSNPs)
+  # the blending factors are read back from the file that was just written
+  blendingFactors = read.table(blendFactorLoc, header = TRUE)
+  # 3. create new data frame to store the new summary stats
+  blendedSumstats = shaPRS_blend_overlap(proximal, adjunct, blendingFactors, rho, discardAmbiguousSNPs = discardAmbiguousSNPs)
+  # if enabled, we fill in SNPs that did not make it into the blended table from the proximal study
+  if (useProximalForMissing) {
+    # a logical mask is used instead of negative indexing: x[-integer(0)] selects nothing,
+    # so the back-fill would silently be skipped exactly when every SNP is missing
+    isMissing = !(proximal$SNP %in% blendedSumstats$SNP)
+    if (any(isMissing)) {
+      # extract these
+      missingProximal = proximal[isMissing, ]
+      # concat them into both shaPRS and meta
+      blendedSumstats = rbind(blendedSumstats, missingProximal)
+      CombinedPheno = rbind(CombinedPheno, missingProximal)
+    }
+  }
+  # 4. write blended stats to disk
+  filen = paste0(outputLoc, "_shaprs")
+  write.table(blendedSumstats, filen, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
+  print(paste("written shaPRS sumstats to",filen))
+  # 5. write Combined stats to disk too
+  filen = paste0(outputLoc, "_meta")
+  write.table(CombinedPheno, filen, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
+  print(paste("written meta-analysis sumstats to",filen))
+}
+#' Create lFDR corrected Q-test statistics for each SNP
+#'
+#' it performs:
+#' (1) modified Cochran's Q-test which optionally adjusts for overlapping controls
+#' (2) lFDR estimation on the p-values from the above Cochran's Q-test
+#' (3) lists SNPs that fail the heterogeneity test at specified thresholds (optional)
+#'
+#' @param inputData summary statistics table that has header with the following columns: SNP    CHR BP  Beta_A  SE_A    A1.x    A2.x    Beta_B  SE_B    A1.y    A2.y
+#' @param rho estimate of correlation between studies due to shared subjects. 0 for no overlap and 1 for complete overlap. default: 0. Obtain this from shaPRS_rho()
+#' @param thresholds vector of thresholds to be used to create list of SNPs (default empty)
+#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
+#' @return returns object with two fields, (1) lFDRTable: a 3 column table with the following signature SNP lFDR Qval (2) hardThresholds: a list with one entry per threshold, each holding the ids of the SNPs whose lFDR is below that threshold
+#'
+#' @importFrom stats na.omit pchisq pnorm cor
+#'
+#' @examples
+#' inputDataLoc <- system.file("extdata", "shapersToydata.txt", package = "shaPRS")
+#' inputData= read.table(inputDataLoc, header = TRUE)
+#' results = shaPRS_adjust(inputData, thresholds=c(0.5,0.99))
+#'
+#' @export
+shaPRS_adjust = function(inputData, rho = 0, thresholds =  vector(), discardAmbiguousSNPs = TRUE) {
+  # must go through 'character' first, otherwise a factor would be converted to its integer level codes instead of its values
+  toNumeric = function(values) as.numeric(as.character(values))
+  # remove rows holding non numeric data in cols that must be numeric
+  inputData <- inputData[!is.na(toNumeric(inputData$SE_A)),]
+  inputData <- inputData[!is.na(toNumeric(inputData$SE_B)),]
+  inputData <- inputData[!is.na(toNumeric(inputData$Beta_A)),]
+  inputData <- inputData[!is.na(toNumeric(inputData$Beta_B)),]
+  # now actually cast them to numeric
+  inputData$SE_A = toNumeric(inputData$SE_A)
+  inputData$SE_B = toNumeric(inputData$SE_B)
+  inputData$Beta_A = toNumeric(inputData$Beta_A)
+  inputData$Beta_B = toNumeric(inputData$Beta_B)
+  inputData = alignStrands(inputData, discardAmbiguousSNPs = discardAmbiguousSNPs)
+  # 0. Reverse effect sizes where the effect alleles of the two studies differ
+  misalignedAlleleIndices = which( as.character(inputData$A1.x) != as.character(inputData$A1.y) ) # compare as character, as if we have non-SNPs with different alleles factors will break
+  inputData$Beta_B[misalignedAlleleIndices] = -inputData$Beta_B[misalignedAlleleIndices] # flip effects
+  if (length(misalignedAlleleIndices) > 0) message(length(misalignedAlleleIndices), " misaligned allele(s) effects were reversed")
+  # 1. Cochran's Q-test formula: from 'Meta-Analysis of Genome-wide Association Studies with Overlapping Subjects' ncbi.nlm.nih.gov/pmc/articles/PMC2790578/
+  Vhat = (inputData$SE_A^2 + inputData$SE_B^2 - 2 * rho * inputData$SE_A * inputData$SE_B)
+  Q_vals = (inputData$Beta_A - inputData$Beta_B)^2/ Vhat
+  df = 2-1 # degrees of freedom, 2 studies -1
+  Q_pvals = pchisq(Q_vals, df = df, lower.tail = FALSE)
+  # 2. lFDR estimation
+  lfdr_obj = qvalue::qvalue(p = Q_pvals)
+  lfdr_qvals <- lfdr_obj$lfdr
+  # 3. prepare table of lFDR values for each SNP
+  lFDRTable <- data.frame(inputData$SNP, lfdr_qvals, Q_vals)
+  colnames(lFDRTable) = c("SNP", "lFDR", "Qval")
+  # 4. Create list for each threshold of SNPs that fail the heterogeneity test at the specified thresholds
+  hard_threshold_results = list()
+  for (i in seq_along(thresholds)) {
+    # the columns were renamed above, so they must be addressed as 'lFDR' / 'SNP';
+    # the pre-rename names evaluate to NULL and would leave this list permanently empty
+    belowThreshold = which(lFDRTable$lFDR < thresholds[i])
+    hard_threshold_results[[i]] = as.character(lFDRTable$SNP[belowThreshold])
+  }
+  results <- list("lFDRTable" = lFDRTable, "hardThresholds" = hard_threshold_results)
+  return(results)
+}
+# Returns the complementary-strand version of a vector of alleles:
+# "A" <-> "T" and "G" <-> "C"; every other value is passed through unchanged.
+# e.g. c("G", "G", "C", "G") becomes c("C", "C", "G", "C")
+flipStrand = function(alleles) {
+  # each entry is c(from, to); positions are always located on the untouched input,
+  # so a base that has just been rewritten can never be rewritten a second time
+  swaps = list(c("T", "A"), c("C", "G"), c("A", "T"), c("G", "C"))
+  allelesFlipped = alleles
+  for (swap in swaps) {
+    allelesFlipped[which(alleles == swap[1])] = swap[2]
+  }
+  return(allelesFlipped)
+}
+# inputData = proximal_adjunct
+#inputData = adjunctPheno_blending
+#inputData = sumstatsDataAll
+#' Aligns strands between 2 summary statistics data
+#'
+#' Optionally removes strand-ambiguous SNPs (G/C and A/T) and, only when that filter is enabled,
+#' rewrites the study 2 alleles of variants that match study 1 on the complementary strand.
+#' Finally keeps only the variants whose allele pairs agree between the two studies (in either order).
+#'
+#' @param inputData dataframe of both studies with columns:  A1.x, A2.x, A1.y, A2.y
+#' @param A1.x (optional) column name for A1 allele in study 1 (default "A1.x")
+#' @param A2.x (optional) column name for A2 allele in study 1 (default "A2.x")
+#' @param A1.y (optional) column name for A1 allele in study 2 (default "A1.y")
+#' @param A2.y (optional) column name for A2 allele in study 2 (default "A2.y")
+#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
+#' @return returns inputData restricted to the variants whose alleles match between the two studies, with the four allele columns as character; when discardAmbiguousSNPs is TRUE it additionally carries the helper columns A1.y_flipped and A2.y_flipped
+#'
+#'
+#' @export
+alignStrands = function(inputData, A1.x ="A1.x", A2.x ="A2.x", A1.y ="A1.y", A2.y ="A2.y", discardAmbiguousSNPs = TRUE) {
+  # work on character alleles throughout: with factor columns, writing a flipped allele that is not
+  # already a level of that column would turn it into NA and the variant would be lost
+  for (alleleCol in c(A1.x, A2.x, A1.y, A2.y)) {
+    inputData[[alleleCol]] = as.character(inputData[[alleleCol]])
+  }
+  if (discardAmbiguousSNPs) {
+    print("Ambiguous SNP (G/C and A/T) filter enabled. This can be changed by setting discardAmbiguousSNPs to FALSE.")
+    # TRUE where an allele pair is its own complement (G/C, C/G, A/T, T/A)
+    isAmbiguousPair = function(a1, a2) {
+      a1 == "G" & a2 == "C" | a1 == "C" & a2 == "G" | a1 == "A" & a2 == "T" | a1 == "T" & a2 == "A"
+    }
+    ambiguousSNPIndices = which(isAmbiguousPair(inputData[[A1.x]], inputData[[A2.x]]) |
+                                isAmbiguousPair(inputData[[A1.y]], inputData[[A2.y]]))
+    print(paste0("removed ", length(ambiguousSNPIndices), " ambiguous SNPs out of ", nrow(inputData), " variants"))
+    # guard needed: negative indexing with an empty index vector would drop every row
+    if (length(ambiguousSNPIndices) > 0) inputData = inputData[-ambiguousSNPIndices,]
+    # 0. Reverse strands: this is only attempted once ambiguous SNPs are gone, as a complementary-strand
+    # match is only meaningful if it cannot be caused by the ambiguity itself
+    inputData$A1.y_flipped = flipStrand(inputData[[A1.y]])
+    inputData$A2.y_flipped = flipStrand(inputData[[A2.y]])
+    # study 1 alleles equal the complemented study 2 alleles, in the same or in swapped order
+    flippedIndices = which(inputData[[A1.x]] == inputData$A1.y_flipped & inputData[[A2.x]] == inputData$A2.y_flipped |
+                           inputData[[A1.x]] == inputData$A2.y_flipped & inputData[[A2.x]] == inputData$A1.y_flipped)
+    print(paste0("flipped strand for ", length(flippedIndices), " variants"))
+    inputData[flippedIndices, A1.y] = inputData$A1.y_flipped[flippedIndices]
+    inputData[flippedIndices, A2.y] = inputData$A2.y_flipped[flippedIndices]
+  } else {
+    # without the filter only the original strands are compared, no flipping is attempted
+    print("Ambiguous SNP (G/C and A/T) filter disabled. This can be changed by setting discardAmbiguousSNPs to TRUE.")
+  }
+  # one check covers both cases: where flipping happened, the study 2 columns already hold the flipped alleles
+  matchedIndices = which(inputData[[A1.x]] == inputData[[A1.y]] & inputData[[A2.x]] == inputData[[A2.y]] |
+                         inputData[[A1.x]] == inputData[[A2.y]] & inputData[[A2.x]] == inputData[[A1.y]])
+  print(paste0("matched ", length(matchedIndices), " out of ", nrow(inputData), " variants"))
+  # exclude unmatchables
+  inputData = inputData[matchedIndices,]
+  return(inputData)
+}
+#' Blended shaPRS (with overlapping datasets): produce summary statistics according to a continuous weighting scheme
+#'
+#' Joins the proximal and adjunct summary statistics with the per-SNP lFDR values and
+#' blends the proximal estimate with the fixed-effect meta-analysis estimate, using the lFDR
+#' as the weight on the meta-analysis. The result is an LDPred formatted table.
+#'
+#' @param proximal  Proximal LDPred formatted GWAS summary statistics table  that has header with the following columns: chr    pos SNP A1  A2  Freq1.Hapmap    b   se  p   N
+#' @param adjunct dataframe for adjunct dataset of the same signature
+#' @param blendingFactors a 3 column table of: SNP lFDR Qval, (produced by shaPRS_adjust)
+#' @param rho (optional) sample overlap between studies
+#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
+#' @return returns an LDPred formatted summary statistics table
+#'
+#' @importFrom stats na.omit pchisq pnorm cor
+#'
+#' @examples
+#' proximalLoc <- system.file("extdata", "phenoA_sumstats", package = "shaPRS")
+#' adjunctLoc <- system.file("extdata", "phenoB_sumstats", package = "shaPRS")
+#' blendFactorLoc <- system.file("extdata", "myOutput_SNP_lFDR", package = "shaPRS")
+#' proximal= read.table(proximalLoc, header = TRUE)
+#' adjunct= read.table(adjunctLoc, header = TRUE)
+#' blendingFactors= read.table(blendFactorLoc, header = TRUE)
+#' blendedSumstats = shaPRS_blend_overlap(proximal, adjunct, blendingFactors)
+#'
+#' @export
+shaPRS_blend_overlap = function(proximal, adjunct, blendingFactors, rho = 0, discardAmbiguousSNPs = T) {
+  # drop rows whose b / se are not numeric and cast those columns
+  proximal = RemoveNonNumerics(proximal)
+  adjunct = RemoveNonNumerics(adjunct)
+  # 1. join the three tables on the SNP id (.x = proximal, .y = adjunct) so that every row describes
+  #    one variant, then align the strands of the two studies
+  merged = merge(merge(proximal, adjunct, by = "SNP"), blendingFactors, by = "SNP")
+  merged = alignStrands(merged, discardAmbiguousSNPs = discardAmbiguousSNPs)
+  # 2. where the effect alleles still differ, reverse the sign of the adjunct effect
+  #    (compared as character so that factor columns with different levels cannot break the test)
+  needsSignFlip = which(as.character(merged$A1.x) != as.character(merged$A1.y))
+  merged$b.y[needsSignFlip] = -merged$b.y[needsSignFlip]
+  if (length(needsSignFlip) > 0) message(length(needsSignFlip), " misaligned allele(s) effects were reversed")
+  # rows with an NA in any column are dropped before the arithmetic
+  merged = na.omit(merged)
+  # w is the weight given to the meta-analysis estimate, (1 - w) stays on the proximal estimate
+  w = merged$lFDR
+  seX = merged$se.x
+  seY = merged$se.y
+  tao1 = 1/seX^2
+  tao2 = 1/seY^2
+  # inverse variance weighted meta-analysis coefficient
+  meta_coef = (merged$b.x / seX^2 + merged$b.y / seY^2) / (tao1 + tao2)
+  # 3. blend the proximal and the meta-analysis estimates
+  CovB12 = ( 1 + rho * sqrt(tao2/tao1) ) / (tao1 + tao2)
+  blendedSE = sqrt( (1-w)^2 /tao1 + w^2 * (tao1 +tao2 + rho * sqrt(tao1 * tao2) ) / (tao1 + tao2)^2 + 2 * w*(1-w) * CovB12 )
+  blendedBeta = merged$b.x * (1-w) + meta_coef * w
+  blendedp = 2*pnorm( abs(blendedBeta)/blendedSE, lower.tail=FALSE)
+  # sample size: blended between the proximal N and the combined N scaled by (1 - rho)
+  CombinedN = merged$N.x + merged$N.y
+  blendedN = round(merged$N.x * (1-w) + CombinedN * w * (1-rho))
+  # 4. assemble the output in the column order of the proximal table and reuse its header
+  blendedSumstats = data.frame(merged$chr.x, merged$pos.x, merged$SNP, merged$A1.x, merged$A2.x, merged$Freq1.Hapmap.x,
+                               blendedBeta,
+                               blendedSE,
+                               blendedp,
+                               blendedN)
+  colnames(blendedSumstats) = colnames(proximal)
+  # restore the row order of the proximal table; SNPs without a match become all-NA rows and are removed
+  blendedSumstats = blendedSumstats[match(proximal$SNP, blendedSumstats$SNP),]
+  blendedSumstats = na.omit(blendedSumstats)
+  return(blendedSumstats)
+}
+#' Calculate rho to be used in shaPRS_adjust()
+#'
+#' Convenience function to estimate the correlation between  two studies with overlapping controls
+#' for more details see:
+#' Lin et al. 'Meta-Analysis of Genome-wide Association Studies with Overlapping Subjects' (2009)
+#' ncbi.nlm.nih.gov/pmc/articles/PMC2790578
+#'
+#' @param nkl0 number of controls overlapping between studies
+#' @param nk1 number of cases in study k
+#' @param nk0 number of controls in study k
+#' @param nl1 number of cases in study l
+#' @param nl0 number of controls in study l
+#' @return returns real value of the approximate correlation
+#'
+#' @examples
+#' rho = shaPRS_rho(nkl0 = 9492,nk1 = 3810, nk0= 9492, nl1= 3765,nl0= 9492)
+#'
+#' @export
+shaPRS_rho = function(nkl0,nk1, nk0, nl1,nl0) {
+  # promote the counts to double first: when they are passed as R integers (e.g. from nrow()),
+  # the products below overflow to NA as soon as they exceed .Machine$integer.max
+  nkl0 = as.numeric(nkl0)
+  nk1 = as.numeric(nk1)
+  nk0 = as.numeric(nk0)
+  nl1 = as.numeric(nl1)
+  nl0 = as.numeric(nl0)
+  nk = nk1+nk0 # total number of indis in k
+  nl = nl1+nl0 # total number of indis in l
+  approx_cor = (nkl0 * sqrt(nk1*nl1 / (nk0*nl0) )   ) / sqrt(nk*nl)
+  return(approx_cor )
+}
+#' Generic inverse variance meta analysis
+#'
+#' Convenience function to produce the combined phenotype estimate when only the summary
+#' statistics of the two studies are available
+#'
+#' @param proximal dataframe of the proximal study
+#' @param adjunct dataframe of the adjunct study, of the same signature
+#' @param rho (optional) overlap between studies
+#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
+#' @return returns Combinedpheno dataframe that can be plugged into shaPRS_blend or shaPRS_composite
+#'
+#'
+#' @export
+inverse_metaAnalaysis = function(proximal,adjunct, rho = 0, discardAmbiguousSNPs = T) {
+  # drop rows whose b / se are not numeric and cast those columns
+  proximal = RemoveNonNumerics(proximal)
+  adjunct = RemoveNonNumerics(adjunct)
+  # 1. join the two tables on the SNP id (.x = proximal, .y = adjunct) so that every row describes
+  #    one variant, then align the strands of the two studies
+  merged = alignStrands(merge(proximal, adjunct, by = "SNP"), discardAmbiguousSNPs = discardAmbiguousSNPs)
+  # 2. where the effect alleles still differ, reverse the sign of the adjunct effect
+  #    (compared as character so that factor columns with different levels cannot break the test)
+  needsSignFlip = which(as.character(merged$A1.x) != as.character(merged$A1.y))
+  merged$b.y[needsSignFlip] = -merged$b.y[needsSignFlip]
+  if (length(needsSignFlip) > 0) message(length(needsSignFlip), " misaligned allele(s) effects were reversed")
+  seX = merged$se.x
+  seY = merged$se.y
+  # INVERSE VARIANCE FIXED EFFECT META ANALYSIS: https://en.wikipedia.org/wiki/Inverse-variance_weighting
+  meta_coef = (merged$b.x / seX^2 + merged$b.y / seY^2) / (1/seX^2 + 1/seY^2)
+  if (rho == 0) {
+    # no overlap, use simple formula
+    meta_se = sqrt( (seX^(-2) + seY^(-2) )^(-1) )
+  } else {
+    # there is an overlap, use the formula that includes rho
+    tao1 = 1/seX^2
+    tao2 = 1/seY^2
+    meta_se = sqrt( (tao1 + tao2 + rho *sqrt(tao1 * tao2) ) / ( (tao1 + tao2)^2 ) )
+  }
+  meta_p = 2*pnorm(-abs(meta_coef/meta_se))
+  # 3. assemble the output in the column order of the proximal table and reuse its header
+  CombinedPheno = data.frame(merged$chr.x, merged$pos.x, merged$SNP, merged$A1.x, merged$A2.x, merged$Freq1.Hapmap.x,
+                             meta_coef,
+                             meta_se,
+                             meta_p,
+                             merged$N.x + merged$N.y)
+  colnames(CombinedPheno) = colnames(proximal)
+  # restore the row order of the proximal table; SNPs without a match become all-NA rows and are removed
+  CombinedPheno = CombinedPheno[match(proximal$SNP, CombinedPheno$SNP),]
+  CombinedPheno = na.omit(CombinedPheno)
+  return(CombinedPheno)
+}
+# helper: drops the rows whose 'b' or 'se' entry cannot be read as a number,
+# then stores both columns as numeric
+RemoveNonNumerics = function(proximal) {
+  # going through 'character' first matters: a factor would otherwise be converted to its
+  # integer level codes instead of the values it displays
+  toNumeric = function(values) as.numeric(as.character(values))
+  # filter on 'b' first, then on 'se' of the remaining rows
+  proximal = proximal[!is.na(toNumeric(proximal$b)), ]
+  proximal = proximal[!is.na(toNumeric(proximal$se)), ]
+  # every remaining entry is convertible, so the casts cannot introduce NAs
+  proximal$b = toNumeric(proximal$b)
+  proximal$se = toNumeric(proximal$se)
+  return(proximal)
+}
+#' Convenience function to generate a shaPRS specific LD reference panel for cross-ancestry analyses
+#'
+#' Wrapper function that loads and processes the LD data for two populations, aligns it with summary data for shaPRS and then generate a full new LD-ref panel for 22 autosomes (this should be used instead of LDRefBlend)
+#'
+#' @param Pop1LDRefLoc Location of the folder of the LDpred2 formatted LD-reference matrices for the 22 autosomes together with a map.rds file for the proximal study
+#' @param Pop2LDRefLoc Location of the folder of the LDpred2 formatted LD-reference matrices for the 22 autosomes together with a map.rds file for the adjunct study
+#' @param blendFactorLoc Location for the lFDR data file produced by shaPRS(), postfix: "_SNP_lFDR"
+#' @param adjustinputLoc Location for the file produced by the shaPRS(), postfix: "_adjustinput"
+#' @param outputLoc Output location
+#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded when aligning the summary statistics (default FALSE)
+#' @param memoryEfficiency (optional) larger numbers result in longer runs but lower memory usage (default 5)
+#' @return called for its side effects: the per-chromosome LD matrices, SNP lists and the overall map.rds it writes
+#'
+#' @import Matrix compiler
+#' @importFrom utils read.table write.table
+#'
+#' @examples
+#' Pop1LDRefLoc <- paste0(system.file("extdata", "", package = "shaPRS"), "/")
+#' Pop2LDRefLoc <- paste0(system.file("extdata", "", package = "shaPRS"), "/")
+#' blendFactorLoc <- system.file("extdata", "pop_SNP_lFDR", package = "shaPRS")
+#' adjustinputLoc <- system.file("extdata", "pop_adjustinput", package = "shaPRS")
+#' outputLoc <- "<YOUR LOCATION>"
+#' # shaPRS_LDGen(Pop1LDRefLoc, Pop2LDRefLoc, blendFactorLoc, adjustinputLoc, outputLoc)
+#'
+#' @export
+shaPRS_LDGen = function(Pop1LDRefLoc,Pop2LDRefLoc, blendFactorLoc, adjustinputLoc, outputLoc, discardAmbiguousSNPs = FALSE, memoryEfficiency = 5) {
+  # 1. load Map data
+  pop1_map_rds = readRDS(file = paste0(Pop1LDRefLoc,"map.rds") )
+  pop2_map_rds = readRDS(file = paste0(Pop2LDRefLoc,"map.rds") )
+  # load raw blending factors and summary stats for the entire genome
+  blendingFactors= read.table(blendFactorLoc, header = TRUE)
+  sumsData= read.table(adjustinputLoc, header = TRUE)
+  dir.create(file.path(paste0(outputLoc,"/") ), recursive = TRUE, showWarnings = FALSE)
+  # merge them
+  sumstatsDataAll =  merge(blendingFactors,sumsData,by.x = "SNP",by.y = "SNP")
+  # remove non numeric data (via 'character', so that factors are converted by value and not by level code)
+  sumstatsDataAll <- sumstatsDataAll[!is.na(as.numeric(as.character(sumstatsDataAll$lFDR))),]
+  sumstatsDataAll <- sumstatsDataAll[!is.na(as.numeric(as.character(sumstatsDataAll$SE_A))),]
+  sumstatsDataAll <- sumstatsDataAll[!is.na(as.numeric(as.character(sumstatsDataAll$SE_B))),]
+  # now actually cast them to numeric
+  sumstatsDataAll$lFDR = as.numeric(as.character(sumstatsDataAll$lFDR ))
+  sumstatsDataAll$SE_A = as.numeric(as.character(sumstatsDataAll$SE_A ))
+  sumstatsDataAll$SE_B = as.numeric(as.character(sumstatsDataAll$SE_B ))
+  # align the summary for phe A and B
+  sumstatsDataAll = alignStrands(sumstatsDataAll, discardAmbiguousSNPs = discardAmbiguousSNPs)
+  # go through each chrom
+  for(chromNum in 1:22){
+    # load the two chromosomes from each population
+    pop1LDmatrix = readRDS(file = paste0(Pop1LDRefLoc,"LD_chr",chromNum,".rds") )
+    pop2LDmatrix = readRDS(file = paste0(Pop2LDRefLoc,"LD_chr",chromNum,".rds") )
+    # 2. grab the RSids from the map for the SNPS on this chrom, each LD mat has a potentially different subset of SNPs
+    pop1_chrom_SNPs = pop1_map_rds[ which(pop1_map_rds$chr == chromNum),] # expected to be in the same order as the pop1LDmatrix
+    pop2_chrom_SNPs = pop2_map_rds[ which(pop2_map_rds$chr == chromNum),] # expected to be in the same order as the pop2LDmatrix
+    # remember each SNP's row/column position in its LD matrix;
+    # seq_len() keeps this valid for a chromosome without map rows, where 1:nrow() would yield c(1, 0)
+    pop1_chrom_SNPs$pop1_id = seq_len(nrow(pop1_chrom_SNPs))
+    pop2_chrom_SNPs$pop2_id = seq_len(nrow(pop2_chrom_SNPs))
+    # intersect the 2 SNP lists so that we only use the ones common to both LD matrices by merging them
+    chrom_SNPs_df  <- merge(pop1_chrom_SNPs,pop2_chrom_SNPs, by = "rsid")
+    # align the two LD matrices
+    # NOTE(review): discardAmbiguousSNPs is not forwarded here, so alignStrands runs with its own default for the LD maps - confirm this is intended
+    chrom_SNPs_df = alignStrands(chrom_SNPs_df, A1.x ="a1.x", A2.x ="a0.x", A1.y ="a1.y", A2.y ="a0.y")
+    # subset sumstats data to the same chrom
+    sumstatsData = sumstatsDataAll[which(sumstatsDataAll$CHR == chromNum ),]
+    if(nrow(sumstatsData) > 0) {
+      # merge sumstats with common LD map data
+      sumstatsData  <- merge(chrom_SNPs_df,sumstatsData, by.x="rsid", by.y = "SNP")
+      # remove duplicates
+      sumstatsData = sumstatsData[ !duplicated(sumstatsData$rsid) ,]
+      # use the effect alleles for the sumstats data with the effect allele of the LD mat
+      # as we are aligning the LD mats against each other, not against the summary stats
+      # we only use the lFDR /SE from the sumstats, which are directionless, so those dont need to be aligned
+      sumstatsData$A1.x =sumstatsData$a1.x
+      sumstatsData$A1.y =sumstatsData$a1.y
+      # make sure the sumstats is ordered the same way as the LD matrix: https://stackoverflow.com/questions/17878048/merge-two-data-frames-while-keeping-the-original-row-order
+      sumstatsData = sumstatsData[order(sumstatsData$pop1_id), ]
+      # subset the LD matrices to the SNPs we actually have
+      pop1LDmatrix = pop1LDmatrix[sumstatsData$pop1_id,sumstatsData$pop1_id]
+      pop2LDmatrix = pop2LDmatrix[sumstatsData$pop2_id,sumstatsData$pop2_id]
+      # generate the blended LD matrix
+      cormat = LDRefBlend(pop1LDmatrix,pop2LDmatrix, sumstatsData, memoryEfficiency = memoryEfficiency)
+      fileLoc= paste0(outputLoc,"/LD_chr",chromNum,".rds")
+      saveRDS(cormat,file = fileLoc)
+      print(paste0("written PRS specific LD mat to ",fileLoc ))
+    } else {print(paste0("no variants on chrom", chromNum))}
+    # also need to write out the list of SNPs that made it into the final subset, as after all LD matrices are done, we need to create a map.rds too
+    write.table(sumstatsData$rsid, paste0(outputLoc,chromNum,"_snps"), sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
+    # map the final list of SNPs back to the original map file's indices
+    map_rds_new = pop1_map_rds[which(pop1_map_rds$chr == chromNum),]
+    map_rds_new2 = map_rds_new[which(map_rds_new$rsid %in% sumstatsData$rsid),] # match the first to the second
+    fileLoc= paste0(outputLoc,"/LD_chr",chromNum,"_map.rds")
+    saveRDS(map_rds_new2,file = fileLoc)
+    print(paste0("written chr map to ",fileLoc ))
+  }
+  # at the end concat all of the map files into a single file and write it to disk;
+  # the per-chromosome map files are deleted once they have been read back
+  all_map_rds = NULL
+  for(chromNum in 1:22){
+    filLoc=paste0(outputLoc,"/LD_chr",chromNum,"_map.rds")
+    chr_map_rds = readRDS(file = filLoc )
+    all_map_rds = rbind(all_map_rds,chr_map_rds)
+    file.remove(filLoc)
+  }
+  fileLoc= paste0(outputLoc,"/map.rds")
+  saveRDS(all_map_rds,file = fileLoc)
+  print(paste0("written overall map to ",fileLoc ))
+}
+#' Generate shaPRS specific LD reference panel
+#'
+#' Generates a PRS specific LD reference matrix by blending together two LD ref panels according to
+#' shaPRS produced lFDR and standard errors
+#'
+#' @param pop1LDmatrix LD reference matrix in RDS (dsCMatrix) format for target population
+#' @param pop2LDmatrix LD reference matrix in RDS (dsCMatrix) format for other population
+#' @param sumstatsData summary data with required columns of SE_A, SE_B, A1.x, A1.y, and lFDR
+#' @param memoryEfficiency larger numbers result in longer runs but lower memory usage (default 5)
+#' @return returns a PRS specific LD matrix
+#'
+#' @import Matrix compiler
+#'
+#' @examples
+#' sumstatsData = readRDS(file = system.file("extdata", "sumstatsData_toy.rds", package = "shaPRS") )
+#'
+#' # read SNP map files (same toy data for the example)
+#' pop1_map_rds = readRDS(file = system.file("extdata", "my_data.rds", package = "shaPRS") )
+#' pop2_map_rds = readRDS(file = system.file("extdata", "my_data2.rds", package = "shaPRS") )
+#'
+#' # use chrom 21 as an example
+#' chromNum=21
+#'
+#' # load the two chromosomes from each population ( same toy data for the example)
+#' pop1LDmatrix = readRDS(file = system.file("extdata", "LDref.rds", package = "shaPRS") )
+#' pop2LDmatrix = readRDS(file = system.file("extdata", "LDref2.rds", package = "shaPRS") )
+#'
+#'
+#' # 2. grab the RSids from the map for the SNPS on this chrom,
+#' # each LD mat has a potentially different subset of SNPs
+#' # this is guaranteed to be the same order as the pop1LDmatrix
+#' pop1_chrom_SNPs = pop1_map_rds[ which(pop1_map_rds$chr == chromNum),]
+#' # this is guaranteed to be the same order as the pop2LDmatrix
+#' pop2_chrom_SNPs = pop2_map_rds[ which(pop2_map_rds$chr == chromNum),]
+#' pop1_chrom_SNPs$pop1_id = 1:nrow(pop1_chrom_SNPs)
+#' pop2_chrom_SNPs$pop2_id = 1:nrow(pop2_chrom_SNPs)
+#'
+#'
+#' # intersect the 2 SNP lists so that we only use the ones common to both LD matrices by merging them
+#' chrom_SNPs_df  <- merge(pop1_chrom_SNPs,pop2_chrom_SNPs, by = "rsid")
+#'
+#' # align the two LD matrices
+#' chrom_SNPs_df = alignStrands(chrom_SNPs_df, A1.x ="a1.x", A2.x ="a0.x", A1.y ="a1.y", A2.y ="a0.y")
+#'
+#'
+#' # align the summary for phe A and B
+#' sumstatsData = alignStrands(sumstatsData)
+#'
+#' # subset sumstats data to the same chrom
+#' sumstatsData = sumstatsData[which(sumstatsData$CHR == chromNum ),]
+#'
+#' # merge sumstats with common LD map data
+#' sumstatsData  <- merge(chrom_SNPs_df,sumstatsData, by.x="rsid", by.y = "SNP")
+#'
+#' # remove duplicates
+#' sumstatsData = sumstatsData[ !duplicated(sumstatsData$rsid) ,]
+#' # use the effect alleles for the sumstats data with the effect allele of the LD mat
+#' # as we are aligning the LD mats against each other, not against the summary stats
+#' # we only use the lFDR /SE from the sumstats,
+#' # which are directionless, so those dont need to be aligned
+#' sumstatsData$A1.x =sumstatsData$a1.x
+#' sumstatsData$A1.y =sumstatsData$a1.y
+#'
+#' # make sure the sumstats is ordered the same way as the LD matrix:
+#' sumstatsData = sumstatsData[order(sumstatsData$pop1_id), ]
+#' # it doesn't matter which matrix to use to order the sumstats as they are the same
+#'
+#' # subset the LD matrices to the SNPs we actually have
+#' pop1LDmatrix = pop1LDmatrix[sumstatsData$pop1_id,sumstatsData$pop1_id]
+#' pop2LDmatrix = pop2LDmatrix[sumstatsData$pop2_id,sumstatsData$pop2_id]
+#'
+#' # generate the blended LD matrix
+#' cormat = LDRefBlend(pop1LDmatrix,pop2LDmatrix, sumstatsData)
+#'
+#' # create a new map file that matches the SNPs common to both LD panels
+#' map_rds_new = pop1_map_rds[which(pop1_map_rds$chr == chromNum),]
+#' map_rds_new2 = map_rds_new[which(map_rds_new$rsid %in% sumstatsData$rsid),]
+#'
+#' # save the new LD matrix to a location of your choice
+#' # saveRDS(cormat,file =paste0(<YOUR LOCATION>,"/LD_chr",chromNum,".rds"))
+#'
+#' # save its Map file too
+#' # saveRDS(map_rds_new2,file = paste0(<YOUR LOCATION>,"/LD_chr",chromNum,"_map.rds"))
+#'
+#' @export
+LDRefBlend = function(pop1LDmatrix,pop2LDmatrix, sumstatsData, memoryEfficiency = 5) {
+  # Blends the two LD reference matrices into one correlation matrix, weighting each SNP by its lFDR and standard errors.
+  # Row i of sumstatsData must describe the same SNP as row/column i of both LD matrices (the caller subsets and orders them that way).
+  #library(compiler)
+  # NOTE(review): enableJIT() changes the JIT level of the whole R session and the previous level is not restored here
+  enableJIT(3)
+  #library(Matrix)
+  ## per-SNP quantities: 'SNP A' and 'SNP B' of every pair come from the same rows of sumstatsData, so a single set of vectors serves both sides of the outer products below
+  w = 1 - sumstatsData$lFDR      # one minus the SNP's lFDR between study 1 and study 2
+  tau1 = 1/sumstatsData$SE_A^2   # study 1 precision
+  tau2 = 1/sumstatsData$SE_B^2   # study 2 precision
+  ## build covariance matrix
+  # LD between SNPA and SNPB in study (population) 1; must convert to dense otherwise will get memory error later
+  # use the caller's memoryEfficiency here too (this was hard-coded to 5, so the setting only applied to the second matrix)
+  r1 = convertDSCToDense(pop1LDmatrix, numparts = memoryEfficiency)
+  coefStudy1 = (tau1 + w*tau2)/(tau1 + tau2)/sqrt(tau1) # one value per SNP, pre-calculated for all SNPs
+  # the outer product expands the per-SNP vector to the same dim as the LD matrix, IE it performs the all-pairs loop in one go
+  term1 = outer(coefStudy1,coefStudy1,"*") * r1 # multiplying elementwise by the LD
+  remove(r1) # free up RAM
+  # align the two LD matrices: invert correlations where one of the studies' effect allele is flipped wrt other studies' effect allele
+  misalignedAlleleIndices = which( as.character(sumstatsData$A1.x) != as.character(sumstatsData$A1.y) ) # compare as character, as if we have non-SNPs with different alleles factors will break
+  if(length(misalignedAlleleIndices) > 0) message(length(misalignedAlleleIndices), " misaligned variants correlation were reversed" )
+  # create a vector of the flipped alleles, 1 for aligned, and -1 for misaligned
+  flipped_mask = rep(1, nrow(sumstatsData) )
+  flipped_mask[misalignedAlleleIndices] = -1
+  # the outer product of the mask is +1 where both or neither SNP of a pair is flipped and -1 where exactly one is, so it flips correlations via elementwise multiplication
+  flippedMat = outer(flipped_mask,flipped_mask,"*")
+  r2 = convertDSCToDense(pop2LDmatrix, numparts = memoryEfficiency) # LD between SNPA and SNPB in study (population) 2
+  r2 = r2 * flippedMat # apply the flipping to the second LD mat
+  remove(flippedMat) # free up RAM
+  coefStudy2 = ((1-w) * tau2)/(tau1 + tau2)/sqrt(tau2)
+  term2 = outer(coefStudy2,coefStudy2,"*") * r2
+  remove(r2) # free up RAM
+  covmat = term1 + term2 # sum of the study 1 and study 2 contributions
+  # free up ram
+  remove(term1)
+  remove(term2)
+  ## need variance to get cor
+  # per-SNP variance: Vbar = w^2 / tau1 + (1 - w^2) / (tau1 + tau2)
+  Vbar = w^2/tau1 + (1-w^2)/(tau1+tau2)
+  # all pairwise products of the per-SNP SEs, same dim as covmat
+  outerProd = outer(sqrt(Vbar),sqrt(Vbar),"*")
+  # scale the covariances to correlations
+  cormat = covmat / outerProd
+  # free up ram
+  remove(covmat)
+  remove(outerProd)
+  return(cormat)
+}
+#' Convert DSC to Dense matrix
+#'
+#' Helper function that converts an LD from sparse DSC to Dense format, in a given number of parts (to overcome RAM limitations).
+#' The columns are converted in consecutive chunks, which are then bound back together in their original order.
+#'
+#' @param pop1LDmatrix LD reference matrix in RDS (dsCMatrix) format for target population
+#' @param numparts (optional) how many parts should be used for converting the matrix, if memory becomes an issue use higher numbers. Must be at least 1; fractional values are rounded down and values larger than the number of columns are capped at the number of columns
+#' @return returns a dense LD matrix with the same dimensions as the input
+#'
+#' @export
+convertDSCToDense = function(pop1LDmatrix, numparts = 3) {
+  # convert dsc to dense matrix in a non-crashing way
+  totalCols = ncol(pop1LDmatrix)
+  # nothing to split up: without this guard the column range below would be 1:0, which indexes a non-existent column
+  if (totalCols == 0) return(as.matrix(pop1LDmatrix))
+  if (length(numparts) != 1 || is.na(numparts) || numparts < 1) stop("numparts must be a single number >= 1")
+  # never use more parts than there are columns: a zero-width part would turn into the range 1:0 and re-add the first column
+  numparts = min(floor(numparts), totalCols)
+  partWidth = floor(totalCols/numparts)
+  # first and last column of each part; the last part has to go to the end to pick up the remainder
+  partStarts = (seq_len(numparts) - 1) * partWidth + 1
+  partEnds = c(partStarts[-1] - 1, totalCols)
+  # drop = FALSE keeps single-column parts as matrices, so their dimnames survive
+  denseParts = lapply(seq_len(numparts), function(i) {
+    as.matrix(pop1LDmatrix[, partStarts[i]:partEnds[i], drop = FALSE])
+  })
+  return(do.call(cbind, denseParts))
+}