synthetic-data-genomics / Git / Diff of /research_paper

Models:

MarcoTheBlack/

synthetic-data-genomics

Downloads: 1

Diff of /research_paper_code/src/misc.R [000000] .. [befbfc]

Switch to unified view

 b/research_paper_code/src/misc.R
+# SUMMARY
+# -------
+# This file contains some function definitions that don't fit in any
+# other file. Here is an overview of the functions defined in this
+# file:
+#
+#    caterase(s)
+#    logit10(x)
+#    project.onto.interval(x,a,b)
+#    center.columns(X)
+#    is.binary.factor(x)
+#    is.binary(x)
+#    binfactor2num(x)
+#    binary.from.categorical(x,col.names)
+#    none.missing.row(x)
+#    compute.maf(geno)
+#    check.normal.quantiles(x)
+#
+# FUNCTION DEFINITIONS
+# ----------------------------------------------------------------------
+# Output the string using 'cat', then move the cursor back to the
+# beginning of the string so that subsequent output will overwrite
+# this string.
+caterase <- function (s)
+  cat(s,rep("\b",nchar(s)),sep="")
+# ----------------------------------------------------------------------
+# Return the logit of x. (Here we use the base-10 logarithm instead of the
+# more commonly used natural logarithm.)
+logit10 <- function (x) {
+  eps <- .Machine$double.eps
+  return(log10((x + eps)/(1 - x + eps)))
+}
+# ----------------------------------------------------------------------
+# Return x projected onto interval [a,b].
+project.onto.interval <- function (x, a, b)
+  pmin(b,pmax(a,x))
+# ----------------------------------------------------------------------
+# Center the columns of matrix X so that the entries in each column
+# of X add up to 0.
+#
+center.columns <- function (X) {
+  y <- colMeans(X)
+  X <- X - matrix(y,nrow(X),ncol(X),byrow = TRUE)
+  return(X)
+}
+# ----------------------------------------------------------------------
+# Return TRUE if x is a factor with exactly 2 levels.
+is.binary.factor <- function (x)
+  is.factor(x) & nlevels(x) == 2
+# ----------------------------------------------------------------------
+# Return TRUE if all elements of a numeric vector are either 0 or 1,
+# ignoring missing values (NA).
+is.binary <- function (x)
+  is.numeric(x) & sum(!(x == 0 | x == 1),na.rm = TRUE) == 0
+# ----------------------------------------------------------------------
+# Convert a factor with exactly 2 levels to a numeric vector with
+# values 0 or 1.
+binfactor2num <- function (x) {
+  if (!is.binary.factor(x))
+    stop("Factor must have exactly 2 levels")
+  return(as.numeric(x) - 1)
+}
+# ----------------------------------------------------------------------
+# Return a data frame with one column for each level of factor x. The
+# columns of the data frame encode the categorical variable with n
+# levels as n binary variables.
+binary.from.categorical <- function (x, col.names = NULL) {
+  # Create a binary factor for each value of the categorical variable.
+  d <- list()
+  for (value in levels(x))
+    d[[value]] <- factor(as.integer(x == value))
+  # Convert the list to a data frame, and adjust the column names, if
+  # requested.
+  d <- data.frame(d,check.names = FALSE)
+  if (!is.null(col.names))
+    names(d) <- col.names
+  # Output the newly created data frame.
+  return(d)
+}
+# ----------------------------------------------------------------------
+# For each row of the matrix or data frame, return TRUE if all the
+# entries in the row are provided (that is, they are not missing).
+none.missing.row <- function (x)
+  rowSums(is.na(x)) == 0
+# ----------------------------------------------------------------------
+# Returns the minor allele frequency given a vector of genotypes
+# encoded as allele counts.
+compute.maf <- function (geno) {
+  f <- mean(geno,na.rm = TRUE)/2
+  return(min(f,1-f))
+}
+# ----------------------------------------------------------------------
+# Compare the empirical quantiles of x against the expected valuesx
+# under the normal distribution.
+check.normal.quantiles <- function (x) {
+  # Discard the missing values.
+  x <- x[!is.na(x)]
+  # Transform the observations so that they have zero mean and unit
+  # variance.
+  x <- (x - mean(x))/sqrt(var(x))
+  # Create a data frame giving the observed and expected proportion of
+  # samples within 1, 2 and 3 standard deviations of the mean.
+  return(data.frame(exp = c(pnorm(1) - pnorm(-1),
+                            pnorm(2) - pnorm(-2),
+                            pnorm(3) - pnorm(-3)),
+                    obs = c(mean(-1 < x & x < 1),
+                            mean(-2 < x & x < 2),
+                            mean(-3 < x & x < 3)),
+                    row.names = c("sd1","sd2","sd3")))
+}