synthetic-data-genomics / Git / [befbfc] /research_paper

Models:
MarcoTheBlack/
synthetic-data-genomics
Downloads: 1
[befbfc]: / research_paper_code / src / misc.R
History
Download this file
127 lines (110 with data), 4.4 kB

# SUMMARY
# -------
# This file contains some function definitions that don't fit in any
# other file. Here is an overview of the functions defined in this
# file:
#
#    caterase(s)
#    logit10(x)
#    project.onto.interval(x,a,b)
#    center.columns(X)
#    is.binary.factor(x)
#    is.binary(x)
#    binfactor2num(x)
#    binary.from.categorical(x,col.names)
#    none.missing.row(x)
#    compute.maf(geno)
#    check.normal.quantiles(x)
#
# FUNCTION DEFINITIONS
# ----------------------------------------------------------------------
# Output the string using 'cat', then move the cursor back to the
# beginning of the string so that subsequent output will overwrite
# this string.
caterase <- function (s)
  cat(s,rep("\b",nchar(s)),sep="")

# ----------------------------------------------------------------------
# Return the logit of x. (Here we use the base-10 logarithm instead of the 
# more commonly used natural logarithm.)
logit10 <- function (x) {
  eps <- .Machine$double.eps
  return(log10((x + eps)/(1 - x + eps)))
}

# ----------------------------------------------------------------------
# Return x projected onto interval [a,b].
project.onto.interval <- function (x, a, b)
  pmin(b,pmax(a,x))

# ----------------------------------------------------------------------
# Center the columns of matrix X so that the entries in each column
# of X add up to 0.
# 
center.columns <- function (X) {
  y <- colMeans(X)
  X <- X - matrix(y,nrow(X),ncol(X),byrow = TRUE)
  return(X)
}

# ----------------------------------------------------------------------
# Return TRUE if x is a factor with exactly 2 levels.
is.binary.factor <- function (x)
  is.factor(x) & nlevels(x) == 2
    
# ----------------------------------------------------------------------
# Return TRUE if all elements of a numeric vector are either 0 or 1,
# ignoring missing values (NA).
is.binary <- function (x)
  is.numeric(x) & sum(!(x == 0 | x == 1),na.rm = TRUE) == 0

# ----------------------------------------------------------------------
# Convert a factor with exactly 2 levels to a numeric vector with
# values 0 or 1.
binfactor2num <- function (x) {
  if (!is.binary.factor(x))
    stop("Factor must have exactly 2 levels")
  return(as.numeric(x) - 1)
}
  
# ----------------------------------------------------------------------
# Return a data frame with one column for each level of factor x. The
# columns of the data frame encode the categorical variable with n
# levels as n binary variables.
binary.from.categorical <- function (x, col.names = NULL) {

  # Create a binary factor for each value of the categorical variable.
  d <- list()
  for (value in levels(x))
    d[[value]] <- factor(as.integer(x == value))

  # Convert the list to a data frame, and adjust the column names, if
  # requested.
  d <- data.frame(d,check.names = FALSE)
  if (!is.null(col.names))
    names(d) <- col.names

  # Output the newly created data frame.
  return(d)
}

# ----------------------------------------------------------------------
# For each row of the matrix or data frame, return TRUE if all the
# entries in the row are provided (that is, they are not missing).
none.missing.row <- function (x)
  rowSums(is.na(x)) == 0

# ----------------------------------------------------------------------
# Returns the minor allele frequency given a vector of genotypes
# encoded as allele counts.
compute.maf <- function (geno) {
  f <- mean(geno,na.rm = TRUE)/2
  return(min(f,1-f))
}

# ----------------------------------------------------------------------
# Compare the empirical quantiles of x against the expected valuesx
# under the normal distribution.
check.normal.quantiles <- function (x) {

  # Discard the missing values.
  x <- x[!is.na(x)]
    
  # Transform the observations so that they have zero mean and unit
  # variance.
  x <- (x - mean(x))/sqrt(var(x))
  
  # Create a data frame giving the observed and expected proportion of
  # samples within 1, 2 and 3 standard deviations of the mean.
  return(data.frame(exp = c(pnorm(1) - pnorm(-1),
                            pnorm(2) - pnorm(-2),
                            pnorm(3) - pnorm(-3)),
                    obs = c(mean(-1 < x & x < 1),
                            mean(-2 < x & x < 2),
                            mean(-3 < x & x < 3)),
                    row.names = c("sd1","sd2","sd3")))
}