Diff of /lib/handy.R [000000] .. [0375db]

Switch to unified view

a b/lib/handy.R
1
################################################################################
2
###  VARIABLE DEFINITION  ######################################################
3
################################################################################
4
default.logfile.name <- 'log.txt'
5
6
################################################################################
7
###  DATA FRAMES  ##############################################################
8
################################################################################
9
10
sample.df <- function(df, size = 1, replace = FALSE, prob = NULL) {
11
  # Samples rows from a data frame.
12
  #
13
  # Args:
14
  #      df: A data frame.
15
  #   size, replace, prob: Arguments from the sample function.
16
  #
17
  # Returns:
18
  #   size rows from this data frame (default 1), with or without replacement
19
  #   and with an optional probability vector.
20
  df[sample(nrow(df), size, replace, prob), , drop = FALSE]
21
}
22
23
bootstrapSampleDf <- function(df) {
24
  # Quick wrapper function for simple bootstrap sampling of a data frame.
25
  #
26
  # Args:
27
  #      df: A data frame.
28
  #
29
  # Returns:
30
  #  A data frame with the same number of rows as the original, but randomly
31
  #  sampled with replacement.
32
  sample.df(df, size = nrow(df), replace = TRUE)
33
}
34
35
withoutCols <- function(df, cols) {
36
  # Returns a vector to allow certain columns to be excluded from a
37
  # data frame. The case where the columns being excluded are referred to
38
  # numerically is trivial, but is included as well for generality.
39
  #
40
  # Args:
41
  #      df: a data frame.
42
  #    cols: the columns to be excluded, as a vector or scalar of number(s) or
43
  #          name(s).
44
  #
45
  # Returns:
46
  #      In the text case, a vector of booleans; TRUE for columns to include.
47
  #      In the numerical case, R understands df[-3,], so just minus the input.
48
  if(is.character(cols)) { return(!(names(df) %in% cols)) }
49
  else { return(-cols) }
50
}
51
52
explodeByCol <- function(df, cols, sep=',', regex = NULL,
53
                         fixed = TRUE) {
54
  # If your data frame contains multiple values in a single column, this splits
55
  # multiple values across different rows, using either a separator character or
56
  # a regular expression.
57
  #
58
  # Args:
59
  #       df: a data frame.
60
  #     cols: one or more column(s) to explode by.
61
  #      sep: the separator of the multiple values.
62
  #    regex: a regular expression which matches the values you're looking for;
63
  #           overrides sep.
64
  #    fixed: for strsplit, this variable determines whether the string passed
65
  #           in sep is fixed (TRUE) or a regular expression (FALSE).
66
  #
67
  # Returns:
68
  #      A data frame with new rows, one for each value in the exploded column.
69
  
70
  # data frames fairly often come in with 'character' columns which are factors,
71
  # and these string-based functions can't handle that, so convert with a
72
  # warning
73
74
  # instantiate a list to contain the exploded output
75
  exploded <- list()
76
  n.exploded <- list()
77
78
  for(col in cols) {
79
    if(is.factor(df[, col])) {
80
      warning(
81
        paste0('Column ', col, ' is a factor, and has been coerced ',
82
               'to a character for exploding.')
83
      )
84
      df[, col] <- as.character(df[, col])
85
    } else if(!is.character(df[, col])) {
86
      # if it's not character data, it won't work, so pass an error
87
      stop(
88
        paste0('Column  ', col, ' passed to explodeByType should be character ',
89
               'data; it is of class ', class(df[, col]), '.'
90
        )
91
      )
92
    }
93
    # if regex is NULL, use the separator provided
94
    if(is.null(regex)) {
95
      exploded[col] <- list(strsplit(df[, col], sep, fixed = fixed))
96
    # otherwise, use a regular expression to split the column
97
    } else {
98
      exploded[col] <- list(regmatches(df[, col], gregexpr(regex, df[, col])))
99
    }
100
    # how many of each row should I create? ie 1,1,2,1,0
101
    n.exploded[[col]] <- sapply(exploded[[col]], length)
102
  }
103
104
  # check the n.exploded values are the same for all columns
105
  if(!allSame(n.exploded)) {
106
    stop(
107
      paste0('The columns provided have inconsistent numbers of elements ',
108
        'after exploding.'
109
        )
110
      )
111
  }
112
  # turn the first element of n.exploded into a list of data frame row indices,
113
  # ie 1,2,3,3,4
114
  n.exploded.rows <- rep(1:length(n.exploded[[cols[1]]]), n.exploded[[cols[1]]])
115
116
  # take the data frame and repeat rows the relevant number of times
117
  df <- df[n.exploded.rows, ]
118
  # fill its exploded column(s) with the appropriate values
119
  for(col in cols) {
120
    df[, col] <- unlist(exploded[[col]])
121
  }
122
  df
123
}
124
125
################################################################################
126
###  CHARACTERS  ###############################################################
127
################################################################################
128
129
removeWhitespace <- function(x) { gsub("\\s","", x) }
130
# For a character or vector of characters x, removes all spaces and line
131
# breaks.
132
#
133
# Args:
134
#      x: A character or vector of characters.
135
#
136
# Returns:
137
#      The character with whitespace removed.
138
139
pastePlus <- function(..., sep=" ", collapse = NULL, recycleZeroLength = TRUE) {
140
  # Version of the base R paste function which optionally returns nothing if any
141
  # of the ...s being concatenated have zero length. (Default behaviour is to
142
  # recycle them to "".)
143
  #
144
  # Args:
145
  # ..., sep, collapse: as paste in base R
146
  #   ignoreZeroLength: 
147
  #
148
  # Returns:
149
  #      If any of the passed objects has zero length, NULL; otherwise, the
150
  #      result of the paste function.
151
  if(!recycleZeroLength &
152
       any(lapply(list(...), length) == 0)) {
153
    return(NULL);
154
  }
155
  paste(..., sep = sep, collapse = collapse)
156
}
157
158
paste0Plus <- function(..., collapse = NULL, recycleZeroLength = TRUE) {
159
  pastePlus(..., sep="", collapse = collapse,
160
            recycleZeroLength = recycleZeroLength)
161
}
162
163
strPos <- function(..., fixed = TRUE) {
164
  # Wrapper function which returns the positions of the first occurrence of a
165
  # pattern in some text. Simplifies regexpr which returns a variety of things
166
  # other than simply the position. Defaults to fixed rather than regular
167
  # expression searching.
168
  #
169
  # Args:
170
  #    ...: see grep in base R; usually (pattern, text)
171
  #  fixed: Logical. If true, pattern is matched as-is.
172
  #
173
  # Returns:
174
  #      The position of the first occurrence of the pattern in the text.
175
  regexpr(..., fixed = fixed)[1]
176
}
177
178
startsWithAny <- function(x, prefixes) {
179
  # Function extending startsWith for use with a vector of many prefixes.
180
  #
181
  # Args:
182
  #         x: Vector of characters whose starts will be examined.
183
  #  prefixes: Vector of characters which may be those starts.
184
  #
185
  # Returns:
186
  #      A logical vector denoting whether a given string in x starts with any
187
  #      of the prefixes provided.
188
  apply(
189
    # sapply startsWith over all prefixes, giving a table of logicals
190
    sapply(
191
      prefixes,
192
      function(prefix) {
193
        startsWith(x, prefix)
194
      }
195
    ),
196
    # ...then, apply a massive logical 'or' over the rows of that table
197
    MARGIN = 1, FUN = any
198
  )
199
}
200
201
textAfter <- function(x, prefix) {
202
  # Find and return text from strings starting with a prefix, after that prefix.
203
  #
204
  # Args:
205
  #       x: Vector of characters to examine.
206
  #  prefix: Single character string to search for and then discard where
207
  #          present. Accepts a vector as startsWith, but this would be a
208
  #          slightly strange use-case.
209
  #
210
  # Returns:
211
  #      A character vector of pieces of text which occur after the prefix
212
  #      specified. eg textAfter(c('a1', 'a2', 'b1'), 'a') would return
213
  #      c('1', '2').
214
  i <- startsWith(x, prefix)
215
  substr(x[i], nchar(prefix) + 1, nchar(x[i]))
216
}
217
218
randomString <- function(l, characters = letters, disallowed = NULL) {
219
  # Generate a random string, with optional excision of disallowed sequences.
220
  #
221
  # Args:
222
  #      l: The length of the string to generate in number of components
223
  #         (usually single characters, see below)
224
  #  characters: Either a string eg 'Argh' which will be split into individual
225
  #         characters to act as string components, or a vector of components;
226
  #         no check is made so these can be multi-character
227
  #  disallowed: A vector of disallowed sequences. Defaults to NULL, which
228
  #         lets anything through.
229
  #
230
  # Returns:
231
  #      A string of length l picked from the characters provided, without any
232
  #      disallowed strings.
233
  
234
  
235
  # if passed a single-element vector, it's almost certain they don't want a
236
  # single string repeated 'randomly' over and over, so split it into characters
237
  if(length(characters == 1)) {
238
    characters <- unlist(strsplit(characters, ''))
239
  }
240
  # generate a random string by sampling from characters
241
  random.string <- paste0(
242
    sample(
243
      characters,
244
      l,
245
      replace = TRUE
246
    ),
247
    collapse=''
248
  )
249
  # if they've passed a disallowed vector, let's check none of the parts of the
250
  # string contain it
251
  if(!is.null(disallowed)) {
252
    # loop over elements of disallowed
253
    for(not.allowed in disallowed) {
254
      # find matches of the forbidden string
255
      not.allowed.matches <- gregexpr(not.allowed, random.string)
256
      # if some matches are found...
257
      while(not.allowed.matches[[1]][1] != -1) {
258
        # ...loop over them, getting rid of one at a time
259
        for(i in 1:length(not.allowed.matches[[1]])) {
260
          substring(
261
            random.string,
262
            # start at the match point
263
            not.allowed.matches[[1]][i],
264
            # end at match point plus match length
265
            not.allowed.matches[[1]][i] + attr(not.allowed.matches[[1]], 'match.length')[i]
266
          ) <-
267
            # and what better to replace them with than a string generated at
268
            # random with this very function!
269
            randomString(
270
              attr(not.allowed.matches[[1]], 'match.length')[i],
271
              characters,
272
              disallowed
273
            )
274
        }
275
        # and then perform the test again to make sure we didn't introduce
276
        # any unexpected disallowed patterns with the replacements...
277
        not.allowed.matches <- gregexpr(not.allowed, random.string)
278
      }
279
    }
280
  }
281
  # return the random string
282
  random.string
283
}
284
285
randomStrings <- function(n, l, characters = letters, disallowed = NULL) {
286
  # Generate n random strings; wrapper for the randomString function.
287
  #
288
  # Args:
289
  #      n: Number of random strings to generate
290
  #    ...: For other arguments, see randomString
291
  #
292
  # Returns:
293
  #      n random strings with the specified properties
294
  replicate(n, randomString(l, characters, disallowed))
295
}
296
297
################################################################################
298
###  FACTORS  ##################################################################
299
################################################################################
300
301
concatFactors <- function(...) {
302
  # Takes some factors and concatenates them. R coerces factors to integers if
303
  # you don't convert them to character vectors at the intermediate stage, so
304
  # this saves typing that every time.
305
  #
306
  # Args:
307
  #      ...: Some factors
308
  #
309
  # Returns:
310
  #      A big factor.
311
  factor(unlist(lapply(list(...), FUN=as.character)))
312
}
313
314
factorChooseFirst <- function(x, first) {
315
  # Move a chosen level to be the first in a factor.
316
  #
317
  # Args:
318
  #         x: A factor.
319
  #     first: The level in the factor you want to be first.
320
  #
321
  # Returns:
322
  #      A factor with the first level redefined to be the one specified.
323
  
324
  # if the level requested to be first isn't present, this ain't gonna work
325
  if (!(first %in% levels(x))) {
326
    stop(paste("Error: the level", first, "doesn't appear in the factor",
327
               deparse(substitute(x))))
328
  }
329
  factor(x, levels = c(first, levels(x)[levels(x) != first]))
330
}
331
332
factorNAfix <- function(x, NAval = 'NA', force = FALSE) {
333
  # Make NA values in a factor into their own level.
334
  #
335
  # Args:
336
  #         x: A factor.
337
  #     NAval: The value to replace NAs with. The string 'NA' by default.
338
  #     force: Whether to force the operation even if there aren't any NAs in
339
  #            the passed factor.
340
  #
341
  # Returns:
342
  #      A factor with NAs replaced by a specific level.
343
  
344
  # if it's forced, or if it's not but there are NAs present...
345
  if(force | sum(is.na(x)) > 0) {
346
    levels(x) <- c(levels(x), NAval)
347
    x[is.na(x)] <- NAval
348
  }
349
  x
350
}
351
352
factorOrderedLevels <- function(x, ...) {
353
  # Create a factor with levels in the order of the 
354
  #
355
  # Args:
356
  #         x: A list or vector. Works if the list's elements are themselves
357
  #            lists or vectors.
358
  #
359
  # Returns:
360
  #      TRUE or FALSE, depending.
361
  if(length(x) == length(unique(x))) {
362
    return(factor(x, levels = x))
363
  } else {
364
    stop('Elements of x must all be unique.')
365
  }
366
}
367
368
allSame <- function(x) {
369
  # Work out whether all elements of a list or vector are the same.
370
  #
371
  # Args:
372
  #         x: A list or vector. Works if the list's elements are themselves
373
  #            lists or vectors.
374
  #
375
  # Returns:
376
  #      TRUE or FALSE, depending.
377
  length(unique(x)) == 1
378
}
379
380
allSameLength <- function(x) {
381
  # Work out whether all elements of a list are the same length.
382
  #
383
  # Args:
384
  #         x: A list.
385
  #
386
  # Returns:
387
  #      TRUE or FALSE, depending.
388
  length(unique(lapply(x, length))) == 1
389
}
390
391
################################################################################
392
###  LISTS  ####################################################################
393
################################################################################
394
395
list2dataframe <- function(x)  {
396
  # Simple wrapper to very naively turn a list into a data frame. If your list
397
  # elements have different numbers of elements, this will go wrong!
398
  #
399
  # Args:
400
  #      x: A list.
401
  #
402
  # Returns:
403
  #      A data frame made from the passed list.
404
  data.frame(matrix(unlist(x), ncol = length(x[[1]]), byrow = TRUE))
405
}
406
407
################################################################################
408
###  FILES  ####################################################################
409
################################################################################
410
411
list.dirs <- function(path=".", pattern=NULL, all.dirs=FALSE,
412
                      ignore.case=FALSE) {
413
  # Lists the directories present within a path.
414
  # Credit: http://stackoverflow.com/questions/4749783
415
  #
416
  # Args:
417
  #      See list.files
418
  #
419
  # Returns:
420
  #      A vector of directories within the path being searched.
421
  all <- list.files(path, pattern, all.dirs,
422
                    full.names=TRUE, recursive=FALSE, ignore.case)
423
  all[file.info(all)$isdir]
424
}
425
426
writeTablePlus <- function(data, filename, comment = '', sep = '\t',
427
                           comment.char = '#', row.names = FALSE, 
428
                           col.names = TRUE, ...) {
429
  # A wrapper for the write.table function which adds a comment of your choice
430
  # at the top of the file.
431
  #
432
  # Args:
433
  #     filename: The name of the file to be written.
434
  #      comment: The comment to be added at the top of the file.
435
  #          sep: The separator for the data, tab by default.
436
  # comment.char: The character denoting comments, # by default.
437
  #    row.names: FALSE by default, because who wants row names?
438
  #    col.names: TRUE by default, because everyone wants column names!
439
  #         ... : Allows arbitrary extra arguments relevant to write.table.
440
  #
441
  # Returns:
442
  #      Nothing!
443
  
444
  f <- file(filename, open="wt") # open a connection to the file
445
  # if there's a comment, write that first
446
  if(nchar(comment) > 0) {
447
    # wrap the comment at 80 lines prefixed the comment character plus space
448
    comment <-
449
      strwrap(comment, width = 80, prefix = paste(comment.char, ' ', sep = ''))
450
    writeLines(comment, f)
451
  }
452
  write.table(
453
    data, f, sep=sep, row.names = row.names, col.names = col.names, ...
454
  )
455
  close(f)
456
}
457
458
readTablePlus <- function(filename, sep='\t', comment.char='#', header=TRUE,
459
                          ...) {
460
  # Handy wrapper for the read.table function to make it compatible with the
461
  # writeTablePlus function with its default options.
462
  #
463
  # Args:
464
  #  filename: The name of the file to read. If it's a vector, the function
465
  #            read the list of files provided and concatenate them into a
466
  #            single data frame.
467
  if(length(filename) > 1) {
468
    do.call(rbind, lapply(filename, readTablePlus))
469
  } else {
470
    read.table(filename, sep=sep, comment.char=comment.char, header=header, ...)
471
  }
472
}
473
474
readXlsxWriteTables <- function(filename, output.ext = 'tsv', sheet.names = NA,
475
                                ...) {
476
  # Write a multi-workbook xlsx file to a series of text files. Currently not
477
  # that useful, as neither read.xlsx nor read.xlsx2 does a very good job of
478
  # identifying column types, and you end up with a TSV composed of strings
479
  #
480
  # Args:
481
  #  filename:    The name of the Excel file to read.
482
  #   output.ext: The extension of the output file(s).
483
  #  sheet.names: The names of the sheets to be written out. Default NA will
484
  #               write all sheets which are present.
485
  #         ... : Allows arbitrary extra arguments to writeTablePlus
486
  
487
  # Require the xlsx library - not loaded by default as it's not often needed
488
  requirePlus('xlsx')
489
  
490
  # If no sheet names were provided...
491
  if(is.na(sheet.names)) {
492
    # ...get the sheet names by a few nested functions
493
    sheet.names <- names(getSheets(loadWorkbook(filename)))
494
  }
495
  
496
  base.filename <- justFilename(filename)
497
  
498
  # For the provided sheet names, go through and read them, and then write them
499
  for(sheet.name in sheet.names) {
500
    writeTablePlus(
501
      # read.xlsx2 is faster, and also assumes tabular data with a header,
502
      # probably the more likely use-case here
503
      read.xlsx2(filename, sheetName = sheet.name),
504
      # filename_sheetname.tsv
505
      paste0(base.filename, '_', sheet.name, '.', output.ext),
506
      ...
507
    )
508
  }
509
}
510
511
justFilename <- function(x) {
512
  # Returns filenames without extensions.
513
  #
514
  # Args:
515
  #       x: A character or vector of characters containing filenames, with or
516
  #          without paths.
517
  #
518
  # Returns:
519
  #   The string or vector with everything after and including a final full stop
520
  #   removed.
521
  sapply(strsplit(basename(x),"\\."),
522
         function(x) paste(x[1:(length(x)-1)],
523
                           collapse=".")
524
         )
525
}
526
527
fileExt <- function(x) {
528
  # Returns just extensions from filenames.
529
  #
530
  # Args:
531
  #       x: A character or vector of characters containing filenames, with or
532
  #          without paths.
533
  #
534
  # Returns:
535
  #   Everything after a final full stop.
536
  
537
  # split the strings by full stops, and only take the final element
538
  extensions <- sapply(strsplit(basename(x),"\\."),
539
                       function(x) tail(x, 1)
540
                      )
541
  # where the extension is the same as the input filename, there is no extension
542
  extensions[extensions == x] <- ''
543
  
544
  extensions
545
}
546
547
suffixFilename <- function(x, suffix = '_1') {
548
  # Returns a filename with a suffix appended before its extension.
549
  #
550
  # Args:
551
  #       x: A character or vector of characters containing filenames.
552
  #
553
  # Returns:
554
  #   filename_suffix.ext
555
  paste0(justFilename(x), suffix, '.', fileExt(x))
556
}
557
558
################################################################################
559
###  MATHEMATICS  ##############################################################
560
################################################################################
561
562
floorPlus <- function(x, digits = 0) {
563
  # the function floor but with a "digits" option to floor not just to the
564
  # nearest integer
565
  #
566
  # Args:
567
  #         x: vector to floor
568
  #         digits: Number of decimal places to be used
569
  #
570
  # Returns:
571
  #   integer/vector.
572
  floor(x*(10^digits)) / 10^digits
573
}
574
575
ceilPlus <- function(x, digits = 0) {
576
  # the function ceil but with a "digits" option to floor not just to the
577
  # nearest integer
578
  #
579
  # Args:
580
  #         x: vector to floor
581
  #         digits: Number of decimal places to be used
582
  #
583
  # Returns:
584
  #   integer/vector.
585
  ceil(x*(10^digits)) / 10^digits
586
}
587
588
tri <- function(x) {
589
  # Calculates the xth triangular number.
590
  #
591
  # Args:
592
  #       x: A number.
593
  #
594
  # Returns:
595
  #   The xth triangular number.
596
  x * (x + 1) / 2
597
}
598
599
trirt <- function(x) {
600
  # Calculates the triangular root of a number.
601
  #
602
  # Args:
603
  #       x: A number.
604
  #
605
  # Returns:
606
  #   Its triangular root.
607
  (sqrt(8*x + 1) - 1) / 2
608
}
609
610
# A series of functions which allow arithmetic on quantities with uncertainty.
611
# Create a quantity by passing values or vectors to unum(x, dx), and then add,
612
# subtract, multiply or divide with the functions below.
613
unum <- function(x, dx) { data.frame(x=x, dx=dx) }
614
  # Calculates the triangular root of a number.
615
  #
616
  # Args:
617
  #       x: A number or vector of numbers.
618
  #      dx: A number or vector of numbers representing the uncertainty on x.
619
  #
620
  # Returns:
621
  #   A data frame with columns x and dx which can be used for further
622
  #   operations.
623
uadd <- function(a, b) {
624
  z <- a$x + b$x
625
  dz <- sqrt(a$dx^2 + b$dx^2)
626
  unum(z, dz)
627
}
628
usub <- function(a, b) {
629
  z <- a$x - b$x
630
  dz <- sqrt(a$dx^2 + b$dx^2)
631
  unum(z, dz)
632
}
633
umul <- function(a, b) {
634
  z <- a$x * b$x
635
  dz <- z * sqrt((a$dx/a$x)^2 + (b$dx/b$x)^2)
636
  unum(z, dz)
637
}
638
udiv <- function(a, b) {
639
  z <- a$x / b$x
640
  dz <- z * sqrt((a$dx/a$x)^2 + (b$dx/b$x)^2)
641
  unum(z, dz)
642
}
643
644
normalise <- function(x, FUN = sum) {
645
  # Returns a vector normalised by the function FUN, default being sum so the
646
  # vector would now sum to 1. Another example would be max, so the largest
647
  # value in x becomes 1.
648
  #
649
  # Args:
650
  #       x: A vector.
651
  #     FUN: A function which returns a single value when applied to a vector.
652
  #
653
  # Returns:
654
  #   A vector, normalised appropriately.
655
  if(!is.function(FUN)) stop('Passed FUN is not a function')
656
  x / FUN(x)
657
}
658
659
minGt <- function(x, gt = 0) {
660
  # Return the minimum value of a vector greater than a value gt.
661
  #
662
  # Args:
663
  #   x: A vector.
664
  #   q: The value which this minimum value must be greater than, defaulting to
665
  #      0 (which returns the minimum positive value).
666
  #
667
  # Returns:
668
  #   The minimum positive value, eg c(-1, 0, 2, 4) would return 2.
669
  min(x[x > gt])
670
}
671
672
minPositive <- function(x) {
673
  # Return the minimum positive value of a vector. Wrapper for minGt.
674
  #
675
  # Args:
676
  #       x: A vector.
677
  # Returns:
678
  #   The minimum positive value, eg c(-1, 0, 2, 4) would return 2.
679
  minGt(x, 0)
680
}
681
682
################################################################################
683
###  STATISTICS  ###############################################################
684
################################################################################
685
686
stdErr <- function(x) { sqrt(var(x)/length(x)) }
687
  # For a vector x, returns the standard error on the mean.
688
  #
689
  # Args:
690
  #      x: A vector.
691
  #
692
  # Returns:
693
  #      The standard error on the mean.
694
695
cv <- function(x) { sd(x)/mean(x) }
696
  # For a vector x, returns the coefficient of variation.
697
  #
698
  # Args:
699
  #      x: A vector.
700
  #
701
  # Returns:
702
  #      The coefficient of variation.
703
704
covar <- function(x) {
705
  # Wrapper function which returns the variance for a single-column vector and
706
  # a covariance matrix for a multi-column vector.
707
  #
708
  # Args:
709
  #      x: Some data.
710
  #
711
  # Returns:
712
  #      The covariance matrix.
713
  if(is.null(dim(x))) {
714
    return(var(x))
715
  } else {
716
    return(cov(x))
717
  }
718
}
719
720
popvar <- function(x, na.rm = FALSE) {
721
  # Calculates population variance instead of sample variance (which is the
722
  # default of the var() function in R).
723
  # 
724
  # Args:
725
  #      x: a vector of the population data.
726
  #  na.rm: a logical value indicating whether NA values should be stripped
727
  #         before the computation proceeds.
728
  #
729
  # Returns:
730
  #      The population variance.
731
  if(na.rm) {
732
    x   <- x[!is.na(x)]
733
  } else if(any(is.na(x))) {
734
    return(NA)
735
  }
736
  mean((x-mean(x))^2)
737
}
738
739
weightedMeanPlus <- function(x, w, na.rm = FALSE) {
740
  # Compute a weighted mean, where the na.rm argument ignores NA values in both
741
  # the values and their weights. (The default R function returns NA if any
742
  # weight is NA even with na.rm = TRUE.)
743
  # 
744
  # Args:
745
  #      x: an object containing the values whose weighted mean is to be
746
  #         computed.
747
  #      w: a numerical vector of weights the same length as x giving the
748
  #         weights to use for elements of x.
749
  #  na.rm: a logical value indicating whether NA values should be stripped
750
  #         before the computation proceeds.
751
  #
752
  # Returns:
753
  #      The weighted mean.
754
  if(na.rm) {
755
    not.na <- !(is.na(x) | is.na(w))
756
    wm <- weighted.mean(x[not.na], w[not.na])
757
  } else {
758
    wm <- weighted.mean(x, w)
759
  }
760
  wm
761
}
762
763
################################################################################
764
###  MISCELLANEOUS  ############################################################
765
################################################################################
766
767
NA2val <- function(x, val = 0) {
768
  # Wrapper to turn NAs in an object into a value of your choice.
769
  #
770
  # Args:
771
  #       x: The object containing errant NA values.
772
  #     val: The value to replace the NAs with, default 0.
773
  #
774
  # Returns:
775
  #   The object with the NAs replaced appropriately.
776
  x[is.na(x)] <- val
777
  x
778
}
779
780
isExactlyNA <- function(x) {
781
  # Is an object NA, or is it another kind of object? Unlike is.na, this is not
782
  # a vector operation and doesn't return eg a vector with whether or not each
783
  # value is NA, it would simply return FALSE because the object isn't an NA.
784
  # This is to prevent warnings when performing if statements on things like
785
  # optional arguments where NA is the default, but a vector could be passed,
786
  # and the if statement then warns that only the first element was used.
787
  #
788
  # Args:
789
  #     x: The object to be tested.
790
  #
791
  # Returns:
792
  #   TRUE if the object is literally just NA; FALSE otherwise.
793
  
794
  # NA is of type logical, so in order to be a true NA, it must be...
795
  if(is.logical(x) & length(x) == 1) {
796
    # If so, is it NA?
797
    return(is.na(x))
798
  } else {
799
    # Otherwise, return FALSE
800
    return(FALSE)
801
  }
802
}
803
804
firstElement <- function(x) {
805
  # Function for apply-ing to lists which will return the first element of a
806
  # list element
807
  # Args:
808
  #       x: An object with elements.
809
  #
810
  # Returns:
811
  #   The first element of that object.
812
  x[1]
813
}
814
815
NArm <- function(x) {
816
  # Remove NAs from a vector.
817
  x[!is.na(x)]
818
}
819
820
821
samplePlus <- function(x, ..., na.rm = TRUE, only.unique = FALSE) {
822
  # Extension of the sample function from base R with the option of only
823
  # sampling from non-missing values.
824
  #
825
  # Args:
826
  #      x: A vector of one or more elements from which to choose.
827
  #    ...: Other arguments to sample (ie size, replace, prob)
828
  #  na.rm: Whether or not to remove NAs. Default TRUE since otherwise why are
829
  #         you using this wrapper function?
830
  #  only.unique: Sample from only the unique values of x?
831
  #
832
  # Returns:
833
  #   A sample from the vector (see sample documentation), without NAs if na.rm
834
  #   is set to TRUE, and only drawn from unique values of x is only.unique is
835
  #   set to TRUE.
836
  
837
  if(na.rm) {
838
    x <- NArm(x)
839
  }
840
  if(only.unique) { 
841
    x <- unique(x) 
842
  }
843
  sample(x, ...)
844
}
845
846
permute <-
847
function(
848
  # Randomly permute (some) elements of a vector or character string to create a
849
  # (slightly) randomised version of it.
850
  #
851
  # Args:
852
          x,
853
  #       A vector or character string to have its contents permuted.
854
          frac = 1.0,
855
  #       The fraction of the contents to be permuted, from 0 (no permutation)
856
  #       to 1 (permute everything).
857
          n.permute = NA
858
  #       The number of items to permute. Defaults to being calculated from frac
859
  #       but can be specified manually too. Must be a multiple of 2, because
860
  #       elements are swapped in pairs.
861
  #
862
  # Returns:
863
  #       The vector or string with n.permute of its elements permuted.
864
) {
865
  # if frac is not a fraction, throw an error
866
  if(frac < 0.0 | frac > 1.0) {
867
    stop(paste0('frac = ', frac,'; it must be between 0 and 1.'))
868
  } else if(!is.na(n.permute) & n.permute %% 2 != 0) {
869
    stop(paste0('n.permute = ', n.permute,'; it must be divisible by 2.'))
870
  }
871
      
872
  # if x is a character string, make it into a vector for processing and set a
873
  # reminder to put it back as a string before returning
874
  if(class(x) == 'character') {
875
    x.is.string <- TRUE
876
    x <- strsplit(x, '')[[1]]
877
  } else {
878
    x.is.string <- FALSE
879
  }
880
  
881
  # if n.permute was not provided, we can now calculate it
882
  if(is.na(n.permute)) {
883
    n.permute <- round(length(x)*frac / 2) * 2 # make sure it's a multiple of 2!
884
  }
885
  
886
  # if n.permute is longer than the vector...
887
  if(n.permute > length(x)) {
888
    stop(paste0('n.permute = ', n.permute,', which is greater than the length ',
889
                'of the string or vector provided, ', length(x)))
890
  }
891
  
892
  # Create a random sample for pairs of positions to swap between
893
  swapsies <- sample(1:length(x), n.permute)
894
  
895
  # take those swapping positions and move them around; reversing the indices in
896
  # the second part of the function implies that position 1 will swap with
897
  # position n, 2 with n-1, etc...
898
  x <- replace(x, swapsies, x[rev(swapsies)])
899
  
900
  # if it was a string then reassemble it before returning
901
  if(x.is.string) {
902
    x <- paste(x, collapse='')
903
  }
904
  
905
  x
906
}
907
908
requirePlus <- function(..., install = TRUE, quietly = TRUE) {
909
  # Simply require a number of packages in the same command, and install them if
910
  # not present.
911
  #
912
  # Args:
913
  #   packages: A vector of the names of the packages to be imported.
914
  #    install: Logical indicating whether missing packages should be installed.
915
  #    quietly: As with require, quietly suppresses messages.
916
  #
917
  # Returns:
918
  #   Nothing (though warning and error messages are displayed on failure)
919
920
  package.list <- c(...)
921
  # if the install parameter is true, install missing packages
922
  if(install) {
923
    # Check for missing packages
924
    packages.present <- package.list %in% rownames(installed.packages())
925
    # And, if there are any, install them
926
    if(any(!packages.present)) {
927
      message(
928
        paste('Installing missing packages',
929
              paste(package.list[!packages.present], collapse = ', ')
930
        )
931
      )
932
      install.packages(package.list[!packages.present])
933
    }
934
  }
935
  # loop over packages, importing them
936
  require.success <- unlist(
937
    # suppress warnings, because we'll tell the user which packages failed later
938
    suppressWarnings(
939
      lapply(package.list, require, character.only = TRUE, quietly = quietly)
940
    )
941
  )
942
  
943
  # If at least some packages imported successfully and the user hasn't asked
944
  # for quietness...
945
  if(sum(require.success) > 0 & !quietly) {
946
    message(
947
      paste('Successfully imported packages',
948
            paste(package.list[require.success], collapse = ', ')
949
      )
950
    )
951
  }
952
  
953
  # If at least some packages failed, warn the user regardless of quietly arg
954
  if(sum(!require.success) > 0) {
955
    warning(
956
      paste('Failed to import packages',
957
            paste(package.list[!require.success], collapse = ', ')
958
      )
959
    )
960
  }
961
}
962
963
initParallel <- function(cores = NULL, backend = 'doMC') {
964
  # Wrapper to initialise parallel computing functionality.
965
  #
966
  # Args:
967
  #   cores: The number of cores to use simultaneously. If absent, use the
968
  #          default from the relevant backend.
969
  # backend: Which backend to use. Currently supports doMC and doParallel.
970
  #
971
  # Returns:
972
  #   Nothing.
973
  
974
  if(backend == 'doMC') {
975
    requirePlus('doMC', 'foreach')
976
    registerDoMC(cores)
977
  } else if(backend == 'doParallel') {
978
    requirePlus('doParallel', 'foreach')
979
    cl <- makeCluster(cores)
980
    registerDoParallel(cl)
981
    return(cl)
982
  } else {
983
    stop('Unrecognised backend', backend)
984
  }
985
}
986
987
inRange <-
988
  function(x, rang, incl.end = rep(FALSE, length(rang)), na.false = FALSE) {
989
  # Returns a vector of booleans specifying whether values of x fall within the
990
  # range rang.
991
  #
992
  # Args:
993
  #          x: A vector of numbers.
994
  #       rang: A vector specifying the range within which they are permitted to
995
  #             fall. It can be of any length; only the smallest and largest
996
  #             values are used. Called 'rang' so as not to clash with the
997
  #             'range' function in base R.
998
  #   incl.end: A vector of the same length as rang specifying whether a given
999
  #             endpoint is included or excluded from the range.
1000
  #   na.false: Return NA values as not in range?
1001
  #
1002
  # Returns:
1003
  #   A boolean with the same length of x, TRUE if within the range specified,
1004
  #   FALSE otherwise.
1005
  
1006
  # find the indices of the smallest and largest values of the vector
1007
  # (presumably usually there will only be two!)
1008
  i.min <- which.min(rang)
1009
  i.max <- which.max(rang)
1010
  
1011
  # find those which are bigger than the minimum value (including endpoint if
1012
  # specified)...
1013
  if(incl.end[i.min]) {x >= rang[i.min]} else {x > rang[i.min]} &
1014
  # ...and those smaller than the max...
1015
  if(incl.end[i.max]) {x <= rang[i.max]} else {x < rang[i.max]} &
1016
  # ...and, if applicable, set NA values to false?
1017
  if(na.false) {!is.na(x)} else {TRUE}
1018
  # ...and return it!
1019
}
1020
1021
logfileStart <- function(filename = default.logfile.name) {
1022
  # Wrapper for creating a new blank log file during script execution.
1023
  # NB This will silently overwrite existing files!
1024
  #
1025
  # Args:
1026
  #  filename: The name of the file to create.
1027
  #
1028
  # Returns:
1029
  #   Nothing.
1030
  #
1031
  # Globals:
1032
  #   Creates a global variable called logfileName so that related functions
1033
  #   know where to write to.
1034
  logfileName <<- filename
1035
  cat('', file = filename)
1036
}
1037
1038
logfileCat <- function(...,
1039
                       newline = TRUE, sep = "", fill = FALSE,
1040
                       filename = logfileName
1041
                       ) {
1042
  # Wrapper for adding an entry to a log file.
1043
  #
1044
  # Args:
1045
  #      ... : Stuff to write to the file
1046
  #   newline: Whether to start a new line after the entry.
1047
  #       sep: Separator between objects to write.
1048
  #      fill: The fill option for cat().
1049
  #  filename: The name of the file to write to; default being the global
1050
  #            variable set by logfileStart.
1051
  #
1052
  # Returns:
1053
  #   Nothing.
1054
  if(newline & !fill) append.me <- "\n" else append.me <- NULL
1055
  cat(..., append.me, file = filename, sep = sep, fill = fill,
1056
      append = TRUE)
1057
}
1058
1059
logfileEnd <- function() {
1060
  # Wrapper for blanking the existing logfileName such that no further entries
1061
  # are written to it given the default options for logfileCat.
1062
  #
1063
  # Args:
1064
  #  None.
1065
  #
1066
  # Returns:
1067
  #   Nothing.
1068
  #
1069
  # Globals:
1070
  #   Sets logfileName  to "".
1071
  logfileName <<- ""
1072
}
1073
1074
unixTimestamp <-function() {
1075
  # Quick function to generate the UNIX timestamp.
1076
  #
1077
  # Args:
1078
  #   None.
1079
  #
1080
  # Returns:
1081
  #   Time in whole seconds since the start of the Unix epoch (01/01/1970 UTC)
1082
  as.numeric(Sys.time())
1083
}
1084
1085
getUserInput <- function(s, parse.fun = NULL, validate.fun = NULL, e = NULL) {
1086
  # Get input from the user typing at the terminal.
1087
  #
1088
  # Args:
1089
  #         s: The question to present the user with.
1090
  # parse.fun: Optional function with which to parse the input string.
1091
  # validate.fun: Optional function with which to validate the input string.
1092
  #         e: Error message to display if the (cleaned) value fails validation.
1093
  #
1094
  # Returns:
1095
  #   A parsed, validated input value.
1096
  
1097
  # loop, to keep asking for input if there's a problem
1098
  repeat {
1099
    # get the user's input by asking them s
1100
    user.input <- readline(s)
1101
    # if a function to clean input has been specified, run it
1102
    if(!is.null(parse.fun)) {
1103
      user.input <- parse.fun(user.input)
1104
    }
1105
    # if a function to validate input has been specified
1106
    if(!is.null(validate.fun)) {
1107
      # if the input validates...
1108
      if(validate.fun(user.input)) {
1109
        # ...the pass it back
1110
        return(user.input)
1111
      }
1112
    } else {
1113
      # if there's no validation to perform, return it anyway
1114
      return(user.input)
1115
    }
1116
    # if we've got this far, validation must have failed...print an error
1117
    # message and have another try...
1118
    cat(e)
1119
  }
1120
}
1121
1122
getUserInputInteger <- function(s) {
1123
  # Wrapper function to use getUserInput to acquire an integer from the user.
1124
  #
1125
  # Args:
1126
  #         s: The question to present the user with.
1127
  #
1128
  # Returns:
1129
  #   An integer.
1130
  getUserInput(s,
1131
               parse.fun = function(x){
1132
                  suppressWarnings(as.integer(x))
1133
                 },
1134
               validate.fun = function(x) {
1135
                  ifelse(!is.na(x), TRUE, FALSE)
1136
                 },
1137
               e = 'Could not parse input as an integer.')
1138
}
1139
1140
1141
1142
handyTimer <- function(t = NA, numeric = TRUE) {
1143
  # Wrapper function for easy to read timing code.
1144
  #
1145
  # Args:
1146
  #   t: The time. Pass nothing (or NA) and the function will return the current
1147
  #      time, ie start the clock. Pass a numeric or time object (ie a
1148
  #      previously stored start time), and the function returns the difference
1149
  #      (ie a time you wanted to measure).
1150
  #   numeric: Whether to convert your time into a simple numerical value. FALSE
1151
  #      results in returning an R time or time difference object.
1152
  #
1153
  # Returns:
1154
  #   Either the current time, or the time since t.
1155
  
1156
  # If t is NA, get current time
1157
  if(is.na(t)) {
1158
    t <- proc.time()['elapsed']
1159
    # Otherwise, get time difference
1160
  } else {
1161
    t <- proc.time()['elapsed'] - t
1162
  }
1163
  
1164
  # If the user wanted a numerical answer, as.numeric it
1165
  if(numeric) {
1166
    t <- as.numeric(t)
1167
  }
1168
  
1169
  # Return t
1170
  t
1171
}
1172
1173
varName <- function(...) {
1174
  # Get the name of a variable as a string. Inspired by:
1175
  # http://stackoverflow.com/questions/14577412/
1176
  # Args:
1177
  #   x: A variable, eg myvar.
1178
  #
1179
  # Returns:
1180
  #   The name of the variable, eg 'myvar'.
1181
  
1182
  do.call(
1183
    function(x) { deparse(substitute(x)) },
1184
    ...
1185
      
1186
  )
1187
}
1188
1189
varsToTable <- function(df, filename, index.cols = 1, ...) {
1190
  # Function to serialise variables for storage as a simple table in a text
1191
  # file. Takes a data frame df with columns (eg id1, id2, var1, var2) and, if
1192
  # a preexisting value is found with the same index.cols (eg c('id1', 'id2'),
1193
  # or 1:2 in this example), replaces the values (var1 and var2) or, if not,
1194
  # appends them, then writes the results to filename.
1195
  #
1196
  # TODO: Check that df has the same shape/column names(?) as the file.
1197
  # TODO: Check for repeated indices in the df provided.
1198
  #
1199
  # Args:
1200
  #   df:         A data frame of indices and values to store.
1201
  #   filename:   A file in which to store them.
1202
  #   index.cols: Which columns to use as indices, to find and replace existing
1203
  #               values with the same index.
1204
  #   ...:        Extra arguments for writeTablePlus.
1205
  #
1206
  # Returns:
1207
  #   Nothing, just writes the data to file.
1208
  if(file.exists(filename)) {
1209
    vars.table <- readTablePlus(filename)
1210
    # First, append our data frame to the existing table
1211
    vars.table <- rbind(vars.table, df)
1212
1213
    # Find the duplicate values of just the index columns, searching from the
1214
    # end because we want to keep the newer values we just rbind-ed in that
1215
    # case, and only keep unique ones.
1216
    vars.table <-
1217
      vars.table[!(duplicated(vars.table[, index.cols], fromLast = TRUE)), ]
1218
    
1219
  } else {
1220
    # If there's no existing file, just create a fresh data frame
1221
    vars.table <- df
1222
  }
1223
  
1224
  # Write the resulting data frame to a file of the given name
1225
  writeTablePlus(vars.table, filename, ...)
1226
}