Diff of /R/shaPRS.R [000000] .. [6b94fb]

Switch to unified view

a b/R/shaPRS.R
1
#' Creates a new set of summary statistics
2
#'
3
#' it performs both steps of the shaPRS method in a single call:
4
#' (1) Creates lFDR corrected Q-test statistics for each SNP (via shaPRS_adjust)
5
#' (2) produce summary statistics according to a continuous weighting scheme (via shaPRS_blend_overlap)
6
#' (3) It writes to disk the following files with postfixes:   "_adjustinput", "_SNP_lFDR", "_shaprs", "_meta", which are the input and output for shaPRS_adjust (also required for shaPRS_LDGen), and shaPRS adjusted summary stats and a fixed-effect meta-analysis
7
#'
8
#' @param proximalLoc proximal LDPred formatted GWAS summary statistics table  that has header with the following columns: chr  pos SNP A1  A2  Freq1.Hapmap    b   se  p   N
9
#' @param adjunctLoc dataframe for adjunct dataset of the same signature
10
#' @param outputLoc the location of the output files
11
#' @param rho estimate of correlation between studies due to shared subjects. 0 for no overlap and 1 for complete overlap. default: 0. Obtain this from shaPRS_rho()
12
#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
13
#' @param useProximalForMissing (optional) if SNPs missing from the adjunct data should be kept using the proximal data or not (default TRUE)
14
#'
15
#' @importFrom stats na.omit pchisq pnorm cor
16
#' @importFrom utils read.table write.table
17
#'
18
#' @examples
19
#' proximalLoc <- system.file("extdata", "phenoA_sumstats", package = "shaPRS")
20
#' adjunctLoc <- system.file("extdata", "phenoB_sumstats", package = "shaPRS")
21
#' shaPRS(proximalLoc, adjunctLoc, tempfile())
22
#'
23
#' @export
24
shaPRS = function(proximalLoc, adjunctLoc, outputLoc, rho = 0, discardAmbiguousSNPs = T, useProximalForMissing = T) {
25
26
  # load proximal and adjunct datas
27
  proximal= read.table(proximalLoc, header = T)
28
  adjunct= read.table(adjunctLoc, header = T)
29
30
  # Create input file for blending factors step with following format:
31
  # SNP CHR BP  Beta_A  SE_A  A1.x  A2.x    Beta_B  SE_B  A1.y  A2.y
32
  inputData = merge(proximal,adjunct,by.x = "SNP",by.y = "SNP")[,c(1,2,3,7,8,4,5  ,16,17,13,14)]
33
  colnames(inputData) = c("SNP",    "CHR",  "BP",   "Beta_A",   "SE_A",  "A1.x",  "A2.x",   "Beta_B",   "SE_B",  "A1.y",  "A2.y")
34
  inputDataLoc = paste0(outputLoc,"_adjustinput")
35
  # write to disk
36
  write.table(inputData, inputDataLoc, row.names = F, col.names = T, quote = FALSE)
37
38
  # I) Generate blending factors
39
  # 1. load data
40
  # inputData= read.table(inputDataLoc, header = T)
41
42
  # 2. lFDR estimation
43
  results = shaPRS_adjust(inputData, rho = rho, discardAmbiguousSNPs =discardAmbiguousSNPs)
44
45
  # 3. write out a table of lFDR values for each SNP
46
  lFDRTable <- results$lFDRTable
47
  colnames(lFDRTable) = c("SNP", "lFDR", "Qval")
48
  blendFactorLoc = paste(outputLoc, "_SNP_lFDR" , sep="")
49
  write.table(lFDRTable, blendFactorLoc, row.names = F, col.names = T, quote = FALSE)
50
  print(paste("written lFDRs and Qvals for SNPs to",blendFactorLoc))
51
52
  # II) produce final summary statistics
53
54
  # INVERSE VARIANCE FIXED EFFECT META ANALYSIS:
55
  CombinedPheno = inverse_metaAnalaysis(proximal, adjunct, rho = rho, discardAmbiguousSNPs =discardAmbiguousSNPs)
56
57
  blendingFactors= read.table(blendFactorLoc, header = T)
58
59
  # 4. create new data frame to store the new summary stats
60
  blendedSumstats = shaPRS_blend_overlap(proximal, adjunct, blendingFactors,rho, discardAmbiguousSNPs =discardAmbiguousSNPs)
61
62
  # if enabled, we fill in SNPs that were missing from the adjunct data from the proximal study
63
  if(useProximalForMissing) {
64
65
    # find IDs of missing SNPs that only exist in proximal
66
    allIDs = 1:nrow(proximal)
67
    nonMissingSNPs = match(blendedSumstats$SNP, proximal$SNP)
68
69
    missingIDs = allIDs[-nonMissingSNPs]
70
71
    if( length(missingIDs) > 0) {
72
      # extract these
73
      missingProximal = proximal[missingIDs,]
74
75
      # concat them into both shaPRS and meta
76
      blendedSumstats = rbind(blendedSumstats,missingProximal)
77
      CombinedPheno = rbind(CombinedPheno,missingProximal)
78
    }
79
80
  }
81
82
  # 5. write blended stats to disk
83
  filen=paste0(outputLoc,"_shaprs")
84
  write.table(blendedSumstats, filen, sep = "\t", row.names = F, col.names = T, quote = FALSE)
85
  print(paste("written shaPRS sumstats to",filen))
86
  uga= read.table(filen, header = T)
87
88
  # 6. write Combined stats to disk too
89
  filen=paste0(outputLoc,"_meta")
90
  write.table(CombinedPheno, filen, sep = "\t", row.names = F, col.names = T, quote = FALSE)
91
  print(paste("written meta-analysis sumstats to",filen))
92
93
}
94
95
96
#' Create lFDR corrected Q-test statistics for each SNP
97
#'
98
#' it performs:
99
#' (1) modified Cochran's Q-test which optionally adjusts for overlapping controls
100
#' (2) lFDR estimation on the p-values from the above Cochran's Q-test
101
#' (3) lists SNPs that fail the heterogeneity test at specified thresholds (optional)
102
#'
103
#' @param inputData summary statistics table that has header with the following columns: SNP    CHR BP  Beta_A  SE_A    A1.x    A2.x    Beta_B  SE_B    A1.y    A2.y
104
#' @param rho estimate of correlation between studies due to shared subjects. 0 for no overlap and 1 for complete overlap. default: 0. Obtain this from shaPRS_rho()
105
#' @param thresholds vector of thresholds to be used to create list of SNPs (default empty)
106
#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
107
#' @return returns object with two fields, (1) lFDRTable: a 3 column file with the following signature SNP lFDR Qval (2) hardThresholds list of SNPids that failed the heterogeneity test at each threshold
108
#'
109
#' @importFrom stats na.omit pchisq pnorm cor
110
#'
111
#' @examples
112
#' inputDataLoc <- system.file("extdata", "shapersToydata.txt", package = "shaPRS")
113
#' inputData= read.table(inputDataLoc, header = TRUE)
114
#' results = shaPRS_adjust(inputData, thresholds=c(0.5,0.99))
115
#'
116
#' @export
117
shaPRS_adjust = function(inputData, rho = 0, thresholds =  vector(), discardAmbiguousSNPs = T) {
118
119
  # remove non numeric data from cols that must be numeric (must cast to 'characer' otherwise R may turn a value eg 0.8249 into a large integer like 8150 on nix systems)
120
  inputData <- inputData[!is.na(as.numeric(as.character(inputData$SE_A))),]
121
  inputData <- inputData[!is.na(as.numeric(as.character(inputData$SE_B))),]
122
  inputData <- inputData[!is.na(as.numeric(as.character(inputData$Beta_A))),]
123
  inputData <- inputData[!is.na(as.numeric(as.character(inputData$Beta_B))),]
124
  # now actually cast them to numeric
125
  inputData$SE_A = as.numeric(as.character(inputData$SE_A ))
126
  inputData$SE_B = as.numeric(as.character(inputData$SE_B ))
127
  inputData$Beta_A = as.numeric(as.character(inputData$Beta_A ))
128
  inputData$Beta_B = as.numeric(as.character(inputData$Beta_B ))
129
130
  inputData = alignStrands(inputData, discardAmbiguousSNPs = discardAmbiguousSNPs)
131
132
133
  # 0. Reverse effect sizes alleles
134
  misalignedAlleleIndices = which( as.character(inputData$A1.x) != as.character(inputData$A1.y) ) # compare as character, as if we have non-SNPs with different alleles factors will break
135
  inputData$Beta_B[misalignedAlleleIndices] = -inputData$Beta_B[misalignedAlleleIndices] # flip effects
136
  if(length(misalignedAlleleIndices) > 0) message(paste0(length(misalignedAlleleIndices)), " misaligned allele(s) effects were reversed" )
137
138
  # 1. Cochran's Q-test formula: from 'Meta-Analysis of Genome-wide Association Studies with Overlapping Subjects' ncbi.nlm.nih.gov/pmc/articles/PMC2790578/
139
  Vhat = (inputData$SE_A^2 + inputData$SE_B^2 - 2 * rho * inputData$SE_A * inputData$SE_B)
140
  Q_vals = (inputData$Beta_A - inputData$Beta_B)^2/ Vhat
141
  df=2-1 # degrees of freedom, 2 studies -1
142
  Q_pvals = pchisq(Q_vals, df = df, lower.tail = F)
143
144
  # 2. lFDR estimation
145
  lfdr_obj = qvalue::qvalue(p = Q_pvals)
146
  lfdr_qvals <- lfdr_obj$lfdr
147
148
  # 3. prepare table of lFDR values for each SNP
149
  lFDRTable <- data.frame(inputData$SNP, lfdr_qvals, Q_vals)
150
  colnames(lFDRTable) = c("SNP", "lFDR", "Qval")
151
  # 4. Create list for each threshold of SNPs that fail the heterogeneity test at the specified thresholds
152
  hard_threshold_results = list()
153
  if (length(thresholds) > 0) {
154
    for (i in 1:length(thresholds)) {
155
      currentThreshold=thresholds[i]
156
      sigSNPs = lFDRTable[which(lFDRTable$lfdr_qvals <currentThreshold),]
157
158
      # SNPs which are heterogeneous, will be sourced from the proximaltpye
159
      CompositePheno = inputData # start from the composite pheno
160
      hard_threshold_results[[i]] = sigSNPs$inputData.SNP
161
    }
162
  }
163
164
  results <- list("lFDRTable" = lFDRTable, "hardThresholds" = hard_threshold_results)
165
  return(results)
166
}
167
168
169
170
#alleles = c("G", "G","C","G")
171
#switches As with Ts, and Gs with Cs (and vica versa)
172
flipStrand = function(alleles) {
173
  allelesFlipped = alleles
174
  whereAsare = which(alleles == "A")
175
  whereGsare = which(alleles == "G")
176
  allelesFlipped[which(allelesFlipped == "T")] = "A" # flip Ts to As
177
  allelesFlipped[which(allelesFlipped == "C")] = "G" # flip Cs to Gs
178
  allelesFlipped[whereAsare] = "T" # flip As to Ts
179
  allelesFlipped[whereGsare] = "C" # flip Gs to Cs
180
  return(allelesFlipped)
181
}
182
183
184
# inputData = proximal_adjunct
185
#inputData = adjunctPheno_blending
186
#inputData = sumstatsDataAll
187
188
#' Aligns strands between 2 summary statistics data
189
#'
190
#'
191
#'
192
#' @param inputData dataframe of both studies with columns:  A1.x, A2.x, A1.y, A2.y
193
#' @param A1.x (optional) column name for A1 allele in study 1 (default "A1.x")
194
#' @param A2.x (optional) column name for A2 allele in study 1 (default "A2.x")
195
#' @param A1.y (optional) column name for A1 allele in study 2 (default "A1.y")
196
#' @param A2.y (optional) column name for A1 allele in study 2 (default "A2.y")
197
#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
198
#' @return returns real value of the approximate correlation
199
#'
200
#'
201
#' @export
202
alignStrands = function(inputData, A1.x ="A1.x", A2.x ="A2.x", A1.y ="A1.y", A2.y ="A2.y", discardAmbiguousSNPs = T) {
203
204
  #flipSNPs=F
205
  # exclude ambiguous SNPs
206
  if(discardAmbiguousSNPs) {
207
    print("Ambiguous SNP (G/C and A/T) filter enabled. This can be changed by setting discardAmbiguousSNPs to FALSE.")
208
209
    ambiguousSNPIndices = which(inputData[,A1.x] == "G" & inputData[,A2.x] == "C" |  inputData[,A1.x] == "C" & inputData[,A2.x] == "G" | inputData[,A1.x] == "A" & inputData[,A2.x] == "T"  | inputData[,A1.x] == "T" &  inputData[,A2.x] ==  "A" |
210
                                inputData[,A1.y] == "G" & inputData[,A2.y ]== "C" |  inputData[,A1.y] == "C" & inputData[,A2.y] == "G" | inputData[,A1.y] == "A" & inputData[,A2.y] == "T"  | inputData[,A1.y] == "T" &  inputData[,A2.y] ==  "A")
211
212
    print(paste0("removed ", length(ambiguousSNPIndices), " ambiguous SNPs out of ", nrow(inputData), " variants"))
213
    if(length(ambiguousSNPIndices) > 0) inputData = inputData[-ambiguousSNPIndices,] # otherwise R would remove all as R is shit
214
215
    # 0. Reverse strands: (this assumes that ambiguous SNPs have been excluded prior to this, as it only makes sense to try to match via flipped strands if we can be sure that the flipped strands don't match because of the ambiguity itself)
216
    # cache flipped strands for phenoB
217
    inputData$A1.y_flipped =  flipStrand(inputData[,A1.y])
218
    inputData$A2.y_flipped =  flipStrand(inputData[,A2.y])
219
    #flipSNPs = T # take note that we will need to flip SNPs
220
221
222
    flippedIndices = which(as.character(inputData[,A1.x]) == as.character(inputData$A1.y_flipped) & as.character(inputData[,A2.x]) == as.character(inputData$A2.y_flipped) | as.character(inputData[,A1.x]) == as.character(inputData$A2.y_flipped) & as.character(inputData[,A2.x]) ==  as.character(inputData$A1.y_flipped) )
223
    # print( head(inputData[flippedIndices,]) )
224
225
    print(paste0("flipped strand for ", length(flippedIndices), " variants"))
226
    inputData[flippedIndices,A1.y] = as.character(inputData$A1.y_flipped[flippedIndices])
227
    inputData[flippedIndices,A2.y] = as.character(inputData$A2.y_flipped[flippedIndices])
228
229
    #                                                                                 regular match                                                                                                                         reverse match                                                                                                    flipped match                                                                                                    reverse flipped match
230
    # matchedIndices = which(as.character(inputData[,A1.x]) == as.character(inputData[,A1.y]) & as.character(inputData[,A2.x]) == as.character(inputData[,A2.y]) |
231
    #                          as.character(inputData[,A1.x]) == as.character(inputData[,A2.y]) & as.character(inputData[,A2.x]) == as.character(inputData[,A1.y]) |
232
    #                          as.character(inputData[,A1.x]) == as.character(inputData$A1.y_flipped) & as.character(inputData[,A2.x]) == as.character(inputData$A2.y_flipped)  |
233
    #                          as.character( inputData[,A1.x]) == as.character(inputData$A2.y_flipped) &  as.character(inputData[,A2.x]) == as.character(inputData$A1.y_flipped) )
234
235
    } else {
236
    print("Ambiguous SNP (G/C and A/T) filter disabled. This can be changed by setting discardAmbiguousSNPs to TRUE.")
237
238
      # here we only match against the original strands, not the flipped one
239
      # matchedIndices = which(as.character(inputData[,A1.x]) == as.character(inputData[,A1.y]) & as.character(inputData[,A2.x]) == as.character(inputData[,A2.y]) |
240
      #                          as.character(inputData[,A1.x]) == as.character(inputData[,A2.y]) & as.character(inputData[,A2.x]) == as.character(inputData[,A1.y])  )
241
242
    }
243
  # this will work for both forward and reverse strands now, as if we enabled flipping above then A1.y matches the A1.y_flipped now, so we only need to check
244
  matchedIndices = which(as.character(inputData[,A1.x]) == as.character(inputData[,A1.y]) & as.character(inputData[,A2.x]) == as.character(inputData[,A2.y]) |
245
                           as.character(inputData[,A1.x]) == as.character(inputData[,A2.y]) & as.character(inputData[,A2.x]) == as.character(inputData[,A1.y])  )
246
247
248
  print(paste0("matched ", length(matchedIndices), " out of ", nrow(inputData), " variants"))
249
  # as there may be non-SNPs, we need to cast them as character
250
251
252
  # exclude unmatchables
253
  inputData = inputData[matchedIndices,]
254
  #inputData = inputData_orig
255
256
  # if(flipSNPs) {
257
  #   # flip those A1/A2s which were flipped matches
258
  #   #                                                                                       flipped match                                                                                                                                reverse flipped match
259
  #   flippedIndices = which(as.character(inputData[,A1.x]) == as.character(inputData$A1.y_flipped) & as.character(inputData[,A2.x]) == as.character(inputData$A2.y_flipped) | as.character(inputData[,A1.x]) == as.character(inputData$A2.y_flipped) & as.character(inputData[,A2.x]) ==  as.character(inputData$A1.y_flipped) )
260
  #  # print( head(inputData[flippedIndices,]) )
261
  #
262
  #   print(paste0("flipped strand for ", length(flippedIndices), " variants"))
263
  #   inputData[flippedIndices,A1.y] = as.character(inputData$A1.y_flipped[flippedIndices])
264
  #   inputData[flippedIndices,A2.y] = as.character(inputData$A2.y_flipped[flippedIndices])
265
  # }
266
  return(inputData)
267
}
268
269
270
271
#' Blended shaPRS (with overlapping datasets): produce summary statistics according to a continuous weighting scheme
272
#'
273
#' This function continuously blends the two sub-phenotype statistics
274
#' and generates an LDPred formatted table.
275
#'
276
#' @param proximal  Proximal LDPred formatted GWAS summary statistics table  that has header with the following columns: chr    pos SNP A1  A2  Freq1.Hapmap    b   se  p   N
277
#' @param adjunct dataframe for adjunct dataset of the same signature
278
#' @param blendingFactors a 3 column table of: SNP lFDR Qval, (produced by shaPRS_adjust)
279
#' @param rho (optional) sample overlap between studies
280
#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
281
#' @return returns an LDPred formatted summary statistics table
282
#'
283
#' @importFrom stats na.omit pchisq pnorm cor
284
#'
285
#' @examples
286
#' proximalLoc <- system.file("extdata", "phenoA_sumstats", package = "shaPRS")
287
#' adjunctLoc <- system.file("extdata", "phenoB_sumstats", package = "shaPRS")
288
#' blendFactorLoc <- system.file("extdata", "myOutput_SNP_lFDR", package = "shaPRS")
289
#' proximal= read.table(proximalLoc, header = TRUE)
290
#' adjunct= read.table(adjunctLoc, header = TRUE)
291
#' blendingFactors= read.table(blendFactorLoc, header = TRUE)
292
#' blendedSumstats = shaPRS_blend_overlap(proximal, adjunct, blendingFactors)
293
#'
294
#' @export
295
shaPRS_blend_overlap = function(proximal, adjunct, blendingFactors, rho = 0, discardAmbiguousSNPs = T) {
296
  # cast as numeric
297
  proximal = RemoveNonNumerics(proximal)
298
  adjunct = RemoveNonNumerics(adjunct)
299
300
  # 1.  Merge first the 3 tables together by RSid, so they are always aligned, x = proximal  and    y = CombinedPheno ( ensure that when we check allele alignment we are comparing the same SNPs
301
  adjunctPheno = merge(proximal,adjunct,by.x = "SNP",by.y = "SNP")
302
  adjunctPheno_blending = merge(adjunctPheno,blendingFactors, by.x = "SNP", by.y = "SNP")
303
304
305
  adjunctPheno_blending = alignStrands(adjunctPheno_blending, discardAmbiguousSNPs = discardAmbiguousSNPs)
306
307
  # 2. Align PheB/B alleles
308
  misalignedAlleleIndices = which( as.character(adjunctPheno_blending$A1.x) != as.character(adjunctPheno_blending$A1.y) ) # compare as character, as if we have non-SNPs with different alleles factors will break
309
  adjunctPheno_blending$b.y[misalignedAlleleIndices] = -adjunctPheno_blending$b.y[misalignedAlleleIndices] # flip effects
310
  if(length(misalignedAlleleIndices) > 0) message(paste0(length(misalignedAlleleIndices)), " misaligned allele(s) effects were reversed" )
311
312
  # sanitize each input of NAs
313
  adjunctPheno_blending <- na.omit(adjunctPheno_blending) # remove any NAs of SNPs
314
315
  w = adjunctPheno_blending$lFDR
316
  tao1 = 1/adjunctPheno_blending$se.x^2
317
  tao2 = 1/adjunctPheno_blending$se.y^2
318
319
  # calculate the meta analysis beta coefficients and standard errors
320
  meta_se = sqrt( (tao1 + tao2 + rho *sqrt(tao1 * tao2) ) / ( (tao1 + tao2)^2 ) ) # when rho ==0, this is identical to sqrt(CovB12), but otherwwise this is different
321
  meta_coef = (adjunctPheno_blending$b.x*1/adjunctPheno_blending$se.x^2 + adjunctPheno_blending$b.y* 1/adjunctPheno_blending$se.y^2) / (1/adjunctPheno_blending$se.x^2+ 1/adjunctPheno_blending$se.y^2)
322
323
324
325
  # 3. Blend the proximal and CombinedPheno together and create new summary statistics via following logic:
326
  # Theoretical (Chris's orginal):
327
  CovB12 =  ( 1 + rho * sqrt(tao2/tao1) ) / (tao1 + tao2)
328
  blendedSE =  sqrt( (1-w)^2 /tao1 + w^2 * (tao1 +tao2 + rho * sqrt(tao1 * tao2) ) / (tao1 + tao2)^2 + 2 * w*(1-w) * CovB12 )
329
330
  # Empirical corr
331
  #CovB12_empirical = cor(adjunctPheno_blending$b.x,meta_coef)* adjunctPheno_blending$se.x*meta_se
332
  #blendedSE =  sqrt( (1-w)^2 /tao1 + w^2 * (tao1 +tao2 + rho * sqrt(tao1 * tao2) ) / (tao1 + tao2)^2 + 2 * w*(1-w) * CovB12_empirical )
333
334
335
336
  blendedBeta=adjunctPheno_blending$b.x * (1-adjunctPheno_blending$lFDR) + meta_coef * adjunctPheno_blending$lFDR
337
  blendedp=2*pnorm( abs(blendedBeta)/blendedSE,lower.tail=FALSE)
338
339
  # also need the combined sample size
340
  CombinedN = adjunctPheno_blending$N.x + adjunctPheno_blending$N.y
341
  # 3. create new data frame to store the new summary stats
342
  blendedSumstats = data.frame(adjunctPheno_blending$chr.x, adjunctPheno_blending$pos.x, adjunctPheno_blending$SNP, adjunctPheno_blending$A1.x, adjunctPheno_blending$A2.x, adjunctPheno_blending$Freq1.Hapmap.x,
343
                               blendedBeta,
344
                               blendedSE,
345
                               blendedp,
346
                               round(adjunctPheno_blending$N.x * (1-adjunctPheno_blending$lFDR) + CombinedN * adjunctPheno_blending$lFDR * (1-rho))
347
  )
348
  colnames(blendedSumstats) = colnames(proximal)
349
  blendedSumstats= blendedSumstats[match(proximal$SNP, blendedSumstats$SNP),]
350
  blendedSumstats <- na.omit(blendedSumstats) # remove any NAs of SNPs that couldn't be matched
351
352
  return(blendedSumstats)
353
}
354
355
356
357
#' Calculate rho to be used in shaPRS_adjust()
358
#'
359
#' Convenience function to estimate the correlation between  two studies with overlapping controls
360
#' for more details see:
361
#' Lin et al. 'Meta-Analysis of Genome-wide Association Studies with Overlapping Subjects' (2009
362
#' ncbi.nlm.nih.gov/pmc/articles/PMC2790578
363
#'
364
#' @param nkl0 number of controls overlapping between studies
365
#' @param nk1 number of cases in study k
366
#' @param nk0 number of controls in study k
367
#' @param nl1 number of cases in study l
368
#' @param nl0 number of controls in study l
369
#' @return returns real value of the approximate correlation
370
#'
371
#' @examples
372
#' rho = shaPRS_rho(nkl0 = 9492,nk1 = 3810, nk0= 9492, nl1= 3765,nl0= 9492)
373
#'
374
#' @export
375
shaPRS_rho = function(nkl0,nk1, nk0, nl1,nl0) {
376
  nk=nk1+nk0 # total number indis in k
377
  nl= nl1+nl0 # total number of indis in l
378
  approx_cor= (nkl0 * sqrt(nk1*nl1 / (nk0*nl0) )   ) / sqrt(nk*nl)
379
  return(approx_cor )
380
}
381
382
383
#' Generic inverse variance meta analysis
384
#'
385
#' Convenience function to produce the combined phenotype estimate if we only have summary stats for phenoA and pheno B
386
#'
387
#' @param proximal dataframe for main proximal
388
#' @param adjunct dataframe for other proximal
389
#' @param rho (optional) overlap between studies
390
#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
391
#' @return returns Combinedpheno dataframe that can be plugged into shaPRS_blend or shaPRS_composite
392
#'
393
#'
394
#' @export
395
inverse_metaAnalaysis = function(proximal,adjunct, rho = 0, discardAmbiguousSNPs = T) {
396
397
  # cast as numeric
398
  proximal = RemoveNonNumerics(proximal)
399
  adjunct = RemoveNonNumerics(adjunct)
400
401
  # 1.  Merge first the tables together by RSid, so they are always aligned, x = proximal  and    y = adjunct ( ensure that when we check allele alignment we are comparing the same SNPs
402
  proximal_adjunct = merge(proximal,adjunct,by.x = "SNP",by.y = "SNP")
403
404
405
  proximal_adjunct = alignStrands(proximal_adjunct, discardAmbiguousSNPs = discardAmbiguousSNPs)
406
407
  # 2. Align PheB/B alleles
408
  misalignedAlleleIndices = which( as.character(proximal_adjunct$A1.x) != as.character(proximal_adjunct$A1.y) ) # compare as character, as if we have non-SNPs with different alleles factors will break
409
  proximal_adjunct$b.y[misalignedAlleleIndices] = -proximal_adjunct$b.y[misalignedAlleleIndices] # flip effects for phe B
410
  if(length(misalignedAlleleIndices) > 0) message(paste0(length(misalignedAlleleIndices)), " misaligned allele(s) effects were reversed" )
411
412
413
  # INVERSE VARIANCE FIXED EFFECT META ANALYSIS: https://en.wikipedia.org/wiki/Inverse-variance_weighting
414
  meta_coef = (proximal_adjunct$b.x*1/proximal_adjunct$se.x^2 + proximal_adjunct$b.y* 1/proximal_adjunct$se.y^2) / (1/proximal_adjunct$se.x^2+ 1/proximal_adjunct$se.y^2)
415
416
  # no overlap, use simple formula
417
  if(rho == 0) { meta_se = sqrt( (proximal_adjunct$se.x^(-2) + proximal_adjunct$se.y^(-2) )^(-1) ) }
418
  else { # there is an overlap, use Chris' updated formula
419
    tao1 = 1/proximal_adjunct$se.x^2
420
    tao2 = 1/proximal_adjunct$se.y^2
421
    meta_se = sqrt( (tao1 + tao2 + rho *sqrt(tao1 * tao2) ) / ( (tao1 + tao2)^2 ) )
422
  }
423
424
425
  meta_p = 2*pnorm(-abs(meta_coef/meta_se))
426
427
428
  # 3. create new data frame to store the new summary stats
429
  CombinedPheno = data.frame(proximal_adjunct$chr.x, proximal_adjunct$pos.x, proximal_adjunct$SNP, proximal_adjunct$A1.x, proximal_adjunct$A2.x, proximal_adjunct$Freq1.Hapmap.x,
430
                             meta_coef,
431
                             meta_se,
432
                             meta_p,
433
                             proximal_adjunct$N.x + proximal_adjunct$N.y)
434
435
  colnames(CombinedPheno) = colnames(proximal)
436
  CombinedPheno= CombinedPheno[match(proximal$SNP, CombinedPheno$SNP),]
437
  CombinedPheno <- na.omit(CombinedPheno) # remove any NAs of SNPs that couldn't be matched
438
439
  return(CombinedPheno)
440
}
441
442
# helper function that removes and casts all columns that should be numeric as numeric
443
RemoveNonNumerics = function(proximal) {
444
  # remove non numeric data from cols that must be numeric (must cast to 'characer' otherwise R may turn a value eg 0.8249 into a large integer like 8150 on nix systems)
445
  proximal <- proximal[!is.na(as.numeric(as.character(proximal$b))),]
446
  proximal <- proximal[!is.na(as.numeric(as.character(proximal$se))),]
447
448
  # now actually cast them to numeric
449
  proximal$b = as.numeric(as.character(proximal$b ))
450
  proximal$se = as.numeric(as.character(proximal$se ))
451
  return(proximal)
452
}
453
454
455
#' Convenience function to generate a shaPRS specific LD reference panel for cross-ancestry analyses
456
#'
457
#' Wrapper function that loads and processes the LD data for two populations, aligns it with summary data for shaPRS and then generate a full new LD-ref panel for 22 autosomes (this should be used instead of LDRefBlend)
458
#'
459
#' @param Pop1LDRefLoc Location of the folder of the LDpred2 formatted LD-reference matrices for the 22 autosomes together with a map.rds file for the proximal study
460
#' @param Pop2LDRefLoc Location of the folder of the LDpred2 formatted LD-reference matrices for the 22 autosomes together with a map.rds file for the adjunct study
461
#' @param blendFactorLoc Location for the lFDR data file produced by shaPRS(), postfix: "_SNP_lFDR"
462
#' @param adjustinputLoc Location for the file produced by the shaPRS(), postfix: "_adjustinput"
463
#' @param outputLoc Output location
464
#' @param discardAmbiguousSNPs (optional) if ambiguous SNPs (G/C and A/T) should be discarded (default TRUE)
465
#' @param memoryEfficiency (optional) larger numbers result in longer runs but lower memory usage (default 5)
466
#'
467
#' @import Matrix compiler
468
#' @importFrom utils read.table write.table
469
#'
470
#' @examples
471
#' Pop1LDRefLoc <- paste0(system.file("extdata", "", package = "shaPRS"), "/")
472
#' Pop2LDRefLoc <- paste0(system.file("extdata", "", package = "shaPRS"), "/")
473
#' blendFactorLoc <- system.file("extdata", "pop_SNP_lFDR", package = "shaPRS")
474
#' adjustinputLoc <- system.file("extdata", "pop_adjustinput", package = "shaPRS")
475
#' outputLoc <- "<YOUR LOCATION>"
476
#' # shaPRS_LDGen(Pop1LDRefLoc, Pop2LDRefLoc, blendFactorLoc, adjustinputLoc, outputLoc)
477
#'
478
#' @export
479
shaPRS_LDGen = function(Pop1LDRefLoc,Pop2LDRefLoc, blendFactorLoc, adjustinputLoc, outputLoc, discardAmbiguousSNPs = F, memoryEfficiency = 5) {
480
481
  # 1. load Map data
482
  pop1_map_rds = readRDS(file = paste0(Pop1LDRefLoc,"map.rds") )
483
  pop2_map_rds = readRDS(file = paste0(Pop2LDRefLoc,"map.rds") )
484
485
  # load raw blending factors and summary stats for the entire genome
486
  blendingFactors= read.table(blendFactorLoc, header = T)
487
  sumsData= read.table(adjustinputLoc, header = T)
488
489
  dir.create(file.path(paste0(outputLoc,"/") ), recursive = T, showWarnings = F)
490
491
  # merge them
492
  sumstatsDataAll =  merge(blendingFactors,sumsData,by.x = "SNP",by.y = "SNP")
493
494
  # remove non numeric data
495
  sumstatsDataAll <- sumstatsDataAll[!is.na(as.numeric(as.character(sumstatsDataAll$lFDR))),]
496
  sumstatsDataAll <- sumstatsDataAll[!is.na(as.numeric(as.character(sumstatsDataAll$SE_A))),]
497
  sumstatsDataAll <- sumstatsDataAll[!is.na(as.numeric(as.character(sumstatsDataAll$SE_B))),]
498
499
  # now actually cast them to numeric
500
  sumstatsDataAll$lFDR = as.numeric(as.character(sumstatsDataAll$lFDR ))
501
  sumstatsDataAll$SE_A = as.numeric(as.character(sumstatsDataAll$SE_A ))
502
  sumstatsDataAll$SE_B = as.numeric(as.character(sumstatsDataAll$SE_B ))
503
504
  # align the summary for phe A and B
505
  sumstatsDataAll = alignStrands(sumstatsDataAll, discardAmbiguousSNPs = discardAmbiguousSNPs)
506
  #chromNum=21
507
  # go through each chrom
508
  for(chromNum in 1:22){
509
    # load the two chromosomes from each population
510
    pop1LDmatrix = readRDS(file = paste0(Pop1LDRefLoc,"LD_chr",chromNum,".rds") )
511
    pop2LDmatrix = readRDS(file = paste0(Pop2LDRefLoc,"LD_chr",chromNum,".rds") )
512
513
    # 2. grab the RSids from the map for the SNPS on this chrom, each LD mat has a potentiall different subset of SNPs
514
    pop1_chrom_SNPs = pop1_map_rds[ which(pop1_map_rds$chr == chromNum),] # this is guaranteed to be the same order as the pop1LDmatrix
515
    pop2_chrom_SNPs = pop2_map_rds[ which(pop2_map_rds$chr == chromNum),] # this is guaranteed to be the same order as the pop2LDmatrix
516
    pop1_chrom_SNPs$pop1_id = 1:nrow(pop1_chrom_SNPs)
517
    pop2_chrom_SNPs$pop2_id = 1:nrow(pop2_chrom_SNPs)
518
519
    # intersect the 2 SNP lists so that we only use the ones common to both LD matrices by merging them
520
    chrom_SNPs_df  <- merge(pop1_chrom_SNPs,pop2_chrom_SNPs, by = "rsid")
521
522
    # align the two LD matrices
523
    chrom_SNPs_df = alignStrands(chrom_SNPs_df, A1.x ="a1.x", A2.x ="a0.x", A1.y ="a1.y", A2.y ="a0.y")
524
525
    # subset sumstats data to the same chrom
526
    sumstatsData = sumstatsDataAll[which(sumstatsDataAll$CHR == chromNum ),]
527
528
    if(nrow(sumstatsData) > 0) {
529
      # merge sumstats with common LD map data
530
      sumstatsData  <- merge(chrom_SNPs_df,sumstatsData, by.x="rsid", by.y = "SNP")
531
532
      # remove duplicates
533
      sumstatsData = sumstatsData[ !duplicated(sumstatsData$rsid) ,]
534
      # use the effect alleles for the sumstats data with the effect allele of the LD mat
535
      # as we are aligning the LD mats against each other, not against the summary stats
536
      # we only use the lFDR /SE from the sumstats, which are directionless, so those dont need to be aligned
537
      sumstatsData$A1.x =sumstatsData$a1.x
538
      sumstatsData$A1.y =sumstatsData$a1.y
539
540
      # make sure the sumstats is ordered the same way as the LD matrix: https://stackoverflow.com/questions/17878048/merge-two-data-frames-while-keeping-the-original-row-order
541
      sumstatsData = sumstatsData[order(sumstatsData$pop1_id), ] # it doesn't matter which matrix to use to order the sumstats as they are the same
542
543
544
      # subset the LD matrices to the SNPs we actualy have
545
      pop1LDmatrix = pop1LDmatrix[sumstatsData$pop1_id,sumstatsData$pop1_id]
546
      pop2LDmatrix = pop2LDmatrix[sumstatsData$pop2_id,sumstatsData$pop2_id]
547
548
      # generate the blended LD matrix
549
      cormat = LDRefBlend(pop1LDmatrix,pop2LDmatrix, sumstatsData, memoryEfficiency = memoryEfficiency)
550
551
      fileLoc= paste0(outputLoc,"/LD_chr",chromNum,".rds")
552
      saveRDS(cormat,file = fileLoc)
553
      print(paste0("written PRS specific LD mat to ",fileLoc ))
554
    } else {print(paste0("no variants on chrom", chromNum))}
555
556
557
    # also need to write out the list of SNPs that made it into the final subset, as after all LD matrices are done, we need to create a map.rds too
558
    write.table(sumstatsData$rsid, paste0(outputLoc,chromNum,"_snps"), sep = "\t", row.names = F, col.names = F, quote = FALSE)
559
560
561
    # map the final list of SNPs back to the original map file's indices
562
    map_rds_new = pop1_map_rds[which(pop1_map_rds$chr == chromNum),]
563
    map_rds_new2 = map_rds_new[which(map_rds_new$rsid %in% sumstatsData$rsid),] # match the first to the second
564
565
    fileLoc= paste0(outputLoc,"/LD_chr",chromNum,"_map.rds")
566
    saveRDS(map_rds_new2,file = fileLoc)
567
    print(paste0("written chr map to ",fileLoc ))
568
569
   # mem_used()
570
  }
571
572
  # at the end concat all of the map files into a single file and write it to disk
573
  all_map_rds = NULL
574
  for(chromNum in 1:22){
575
    filLoc=paste0(outputLoc,"/LD_chr",chromNum,"_map.rds")
576
    chr_map_rds = readRDS(file = filLoc )
577
    all_map_rds = rbind(all_map_rds,chr_map_rds)
578
    file.remove(filLoc)
579
  }
580
581
  fileLoc= paste0(outputLoc,"/map.rds")
582
  saveRDS(all_map_rds,file = fileLoc)
583
  print(paste0("written overall map to ",fileLoc ))
584
}
585
586
587
588
#' Generate shaPRS specific LD reference panel
589
#'
590
#' Generates a PRS specific LD reference matrix by blending together two LD ref panels according to
591
#' shaPRS produced lFDR and standard errors
592
#'
593
#' @param pop1LDmatrix LD reference matrix in RDS (dsCMatrix) format for target population
594
#' @param pop2LDmatrix LD reference matrix in RDS (dsCMatrix) format for other population
595
#' @param sumstatsData summary data with required columns of SE_A, SE_B, A1.x, A1.y, and lFDR
596
#' @param memoryEfficiency larger numbers result in longer runs but lower memory usage (default 5)
597
#' @return returns a PRS specific LD matrix
598
#'
599
#' @import Matrix compiler
600
#'
601
#' @examples
602
#' sumstatsData = readRDS(file = system.file("extdata", "sumstatsData_toy.rds", package = "shaPRS") )
603
#'
604
#' # read SNP map files (same toy data for the example)
605
#' pop1_map_rds = readRDS(file = system.file("extdata", "my_data.rds", package = "shaPRS") )
606
#' pop2_map_rds = readRDS(file = system.file("extdata", "my_data2.rds", package = "shaPRS") )
607
#'
608
#' # use chrom 21 as an example
609
#' chromNum=21
610
#'
611
#' # load the two chromosomes from each population ( same toy data for the example)
612
#' pop1LDmatrix = readRDS(file = system.file("extdata", "LDref.rds", package = "shaPRS") )
613
#' pop2LDmatrix = readRDS(file = system.file("extdata", "LDref2.rds", package = "shaPRS") )
614
#'
615
#'
616
#' # 2. grab the RSids from the map for the SNPS on this chrom,
617
#' # each LD mat has a potentially different subset of SNPs
618
#' # this is guaranteed to be the same order as the pop1LDmatrix
619
#' pop1_chrom_SNPs = pop1_map_rds[ which(pop1_map_rds$chr == chromNum),]
620
#' # this is guaranteed to be the same order as the pop2LDmatrix
621
#' pop2_chrom_SNPs = pop2_map_rds[ which(pop2_map_rds$chr == chromNum),]
622
#' pop1_chrom_SNPs$pop1_id = 1:nrow(pop1_chrom_SNPs)
623
#' pop2_chrom_SNPs$pop2_id = 1:nrow(pop2_chrom_SNPs)
624
#'
625
#'
626
#' # intersect the 2 SNP lists so that we only use the ones common to both LD matrices by merging them
627
#' chrom_SNPs_df  <- merge(pop1_chrom_SNPs,pop2_chrom_SNPs, by = "rsid")
628
#'
629
#' # align the two LD matrices
630
#' chrom_SNPs_df = alignStrands(chrom_SNPs_df, A1.x ="a1.x", A2.x ="a0.x", A1.y ="a1.y", A2.y ="a0.y")
631
#'
632
#'
633
#' # align the summary for phe A and B
634
#' sumstatsData = alignStrands(sumstatsData)
635
#'
636
#' # subset sumstats data to the same chrom
637
#' sumstatsData = sumstatsData[which(sumstatsData$CHR == chromNum ),]
638
#'
639
#' # merge sumstats with common LD map data
640
#' sumstatsData  <- merge(chrom_SNPs_df,sumstatsData, by.x="rsid", by.y = "SNP")
641
#'
642
#' # remove duplicates
643
#' sumstatsData = sumstatsData[ !duplicated(sumstatsData$rsid) ,]
644
#' # use the effect alleles for the sumstats data with the effect allele of the LD mat
645
#' # as we are aligning the LD mats against each other, not against the summary stats
646
#' # we only use the lFDR /SE from the sumstats,
647
#' # which are directionless, so those dont need to be aligned
648
#' sumstatsData$A1.x =sumstatsData$a1.x
649
#' sumstatsData$A1.y =sumstatsData$a1.y
650
#'
651
#' # make sure the sumstats is ordered the same way as the LD matrix:
652
#' sumstatsData = sumstatsData[order(sumstatsData$pop1_id), ]
653
#' # it doesn't matter which matrix to use to order the sumstats as they are the same
654
#'
655
#' # subset the LD matrices to the SNPs we actually have
656
#' pop1LDmatrix = pop1LDmatrix[sumstatsData$pop1_id,sumstatsData$pop1_id]
657
#' pop2LDmatrix = pop2LDmatrix[sumstatsData$pop2_id,sumstatsData$pop2_id]
658
#'
659
#' # generate the blended LD matrix
660
#' cormat = LDRefBlend(pop1LDmatrix,pop2LDmatrix, sumstatsData)
661
#'
662
#' # create a new map file that matches the SNPs common to both LD panels
663
#' map_rds_new = pop1_map_rds[which(pop1_map_rds$chr == chromNum),]
664
#' map_rds_new2 = map_rds_new[which(map_rds_new$rsid %in% sumstatsData$rsid),]
665
#'
666
#' # save the new LD matrix to a location of your choice
667
#' # saveRDS(cormat,file =paste0(<YOUR LOCATION>,"/LD_chr",chromNum,".rds"))
668
#'
669
#' # save its Map file too
670
#' # saveRDS(map_rds_new2,file = paste0(<YOUR LOCATION>,"/LD_chr",chromNum,"_map.rds"))
671
#'
672
#' @export
673
LDRefBlend = function(pop1LDmatrix,pop2LDmatrix, sumstatsData, memoryEfficiency = 5) {
674
  #library(compiler)
675
  enableJIT(3)
676
677
  #library(Matrix)
678
679
  ## use symbols from the tex
680
  wA = 1 - sumstatsData$lFDR    # wA is SNP_A's lFDR , between study 1 and study 2
681
  tauA1 = 1/sumstatsData$SE_A^2 # tauA1 is SNP_A's study 1 precision
682
  tauA2 = 1/sumstatsData$SE_B^2 # tauA2 is SNP_A's study 2 precision
683
684
  wB = 1 - sumstatsData$lFDR    # wB is SNP_B's lFDR, between study 1 and study 2
685
  tauB1 = 1/sumstatsData$SE_A^2 # tauB1 is SNP_B's study 1 precision
686
  tauB2 = 1/sumstatsData$SE_B^2 # tauB2 is SNP_B's study 2 precision
687
688
  ## build covariance matrix
689
  r1 = convertDSCToDense(pop1LDmatrix, numparts = 5) # LD between SNPA and SNPB in study (population) 1 # must convert to dense otherwise will get memory error later
690
  prodA=(tauA1 + wA*tauA2)/(tauA1 + tauA2)/sqrt(tauA1) # vector of 5, IE pre-calculating all for all 'A' and all 'B; SNPs, all terms
691
  prodB=(tauB1 + wB*tauB2)/(tauB1 + tauB2)/sqrt(tauB1) # vector of 5
692
  # mat of 5x5, an outer product of A and B, IE make it the same dim as the LD matrix, IE the outer product expands out and performs all calculations in the loop in one go
693
  # IE outer product =~ loop
694
  term1=outer(prodA,prodB,"*") * r1   # multiplying elementwise by the LD
695
  remove(r1) # free up RAM
696
697
  # align the two LD matrices: invert correlations where one of the studies' effect allele is flipped wrt other studies' effect allele
698
  misalignedAlleleIndices = which( as.character(sumstatsData$A1.x) != as.character(sumstatsData$A1.y) ) # compare as character, as if we have non-SNPs with different alleles factors will break
699
  if(length(misalignedAlleleIndices) > 0) message(paste0(length(misalignedAlleleIndices)), " misaligned variants correlation were reversed" )
700
  # create a vector of the flipped alleles, 1 for aloigned, and -1 for misaligned
701
  flipped_mask =  rep(1, nrow(sumstatsData) )
702
  flipped_mask[misalignedAlleleIndices] = -1 # set
703
  # create a mask of the flipped alleles: this is just the outer product of the above vector
704
  flippedMat=outer(flipped_mask,flipped_mask,"*")  # create a matrix that can be used to flip correlations via elementwise multiplications
705
706
707
  r2 = convertDSCToDense(pop2LDmatrix, numparts = memoryEfficiency) # LD between SNPA and SNPB in study (population) 2
708
  r2 = r2 * flippedMat # apply the flipping to the second LD mat
709
  remove(flippedMat) # free up RAM
710
711
  prodA=((1-wA) * tauA2)/(tauA1 + tauA2)/sqrt(tauA2)
712
  prodB=((1-wB) * tauB2)/(tauB1 + tauB2)/sqrt(tauB2)
713
  term2=outer(prodA,prodB,"*") * r2
714
  remove(r2) # free up RAM
715
  covmat=term1 + term2 # reconstruct full eq from 3.1
716
717
  # free up ram
718
  remove(term1)
719
  remove(term2)
720
721
  ## need variance to get cor
722
  # this replaces my
723
  # SEbar_A = sqrt( VarAbar )
724
  # VarAbar = wA^2 /taoA1 + (1-wA^2) / (taoA1 + taoA2)
725
  VbarA=wA^2/tauA1 + (1-wA^2)/(tauA1+tauA2) # vector of 5, of the variances
726
  VbarB=wB^2/tauB1 + (1-wB^2)/(tauB1+tauB2)
727
  outerProd = outer(sqrt(VbarA),sqrt(VbarB),"*") # this multiplies all combination of SEs together to create a 5x5 dim matrix
728
  # and thus replacing the loop that generates
729
  cormat=covmat / outerProd
730
731
  # free up ram
732
  remove(covmat)
733
  remove(outerProd)
734
  return(cormat)
735
}
736
737
738
739
#' Convert DSC to Dense matrix
740
#'
741
#' Helper function that converts an LD from sparse DSC to Dense format, in a given number of parts (to overcome RAM limitations)
742
#'
743
#' @param pop1LDmatrix LD reference matrix in RDS (dsCMatrix) format for target population
744
#' @param numparts (optional) how many parts should be used for converting the matrix, if memory becomes an issue use higher numbers
745
#' @return returns a dense LD matrix
746
#'
747
#' @export
748
convertDSCToDense = function(pop1LDmatrix, numparts = 3) {
749
  # convert dsc to dense matrix in a non-crashing way
750
  firstHalf = floor(ncol(pop1LDmatrix)/numparts)
751
  start = 0;
752
  end = 0
753
  r1 = NULL
754
  for (i in 1:numparts) {
755
    start = end
756
    if (i == numparts) { # last one has to go to the end
757
      end = ncol(pop1LDmatrix)
758
    } else {
759
      end = end + firstHalf
760
    }
761
762
    r1_p1 = as.matrix(pop1LDmatrix[,(start+1):end])
763
    r1 = cbind(r1,r1_p1)
764
    remove(r1_p1)
765
  }
766
  return(r1)
767
}