|
a |
|
b/research_paper_code/src/misc.R |
|
|
1 |
# SUMMARY |
|
|
2 |
# ------- |
|
|
3 |
# This file contains some function definitions that don't fit in any |
|
|
4 |
# other file. Here is an overview of the functions defined in this |
|
|
5 |
# file: |
|
|
6 |
# |
|
|
7 |
# caterase(s) |
|
|
8 |
# logit10(x) |
|
|
9 |
# project.onto.interval(x,a,b) |
|
|
10 |
# center.columns(X) |
|
|
11 |
# is.binary.factor(x) |
|
|
12 |
# is.binary(x) |
|
|
13 |
# binfactor2num(x) |
|
|
14 |
# binary.from.categorical(x,col.names) |
|
|
15 |
# none.missing.row(x) |
|
|
16 |
# compute.maf(geno) |
|
|
17 |
# check.normal.quantiles(x) |
|
|
18 |
# |
|
|
19 |
# FUNCTION DEFINITIONS |
|
|
20 |
# ---------------------------------------------------------------------- |
|
|
21 |
# Output the string using 'cat', then move the cursor back to the |
|
|
22 |
# beginning of the string so that subsequent output will overwrite |
|
|
23 |
# this string. |
|
|
24 |
caterase <- function (s) |
|
|
25 |
cat(s,rep("\b",nchar(s)),sep="") |
|
|
26 |
|
|
|
27 |
# ---------------------------------------------------------------------- |
|
|
28 |
# Return the logit of x. (Here we use the base-10 logarithm instead of the |
|
|
29 |
# more commonly used natural logarithm.) |
|
|
30 |
logit10 <- function (x) { |
|
|
31 |
eps <- .Machine$double.eps |
|
|
32 |
return(log10((x + eps)/(1 - x + eps))) |
|
|
33 |
} |
|
|
34 |
|
|
|
35 |
# ---------------------------------------------------------------------- |
|
|
36 |
# Return x projected onto interval [a,b]. |
|
|
37 |
project.onto.interval <- function (x, a, b) |
|
|
38 |
pmin(b,pmax(a,x)) |
|
|
39 |
|
|
|
40 |
# ---------------------------------------------------------------------- |
|
|
41 |
# Center the columns of matrix X so that the entries in each column |
|
|
42 |
# of X add up to 0. |
|
|
43 |
# |
|
|
44 |
center.columns <- function (X) { |
|
|
45 |
y <- colMeans(X) |
|
|
46 |
X <- X - matrix(y,nrow(X),ncol(X),byrow = TRUE) |
|
|
47 |
return(X) |
|
|
48 |
} |
|
|
49 |
|
|
|
50 |
# ---------------------------------------------------------------------- |
|
|
51 |
# Return TRUE if x is a factor with exactly 2 levels. |
|
|
52 |
is.binary.factor <- function (x) |
|
|
53 |
is.factor(x) & nlevels(x) == 2 |
|
|
54 |
|
|
|
55 |
# ---------------------------------------------------------------------- |
|
|
56 |
# Return TRUE if all elements of a numeric vector are either 0 or 1, |
|
|
57 |
# ignoring missing values (NA). |
|
|
58 |
is.binary <- function (x) |
|
|
59 |
is.numeric(x) & sum(!(x == 0 | x == 1),na.rm = TRUE) == 0 |
|
|
60 |
|
|
|
61 |
# ---------------------------------------------------------------------- |
|
|
62 |
# Convert a factor with exactly 2 levels to a numeric vector with |
|
|
63 |
# values 0 or 1. |
|
|
64 |
binfactor2num <- function (x) { |
|
|
65 |
if (!is.binary.factor(x)) |
|
|
66 |
stop("Factor must have exactly 2 levels") |
|
|
67 |
return(as.numeric(x) - 1) |
|
|
68 |
} |
|
|
69 |
|
|
|
70 |
# ---------------------------------------------------------------------- |
|
|
71 |
# Return a data frame with one column for each level of factor x. The |
|
|
72 |
# columns of the data frame encode the categorical variable with n |
|
|
73 |
# levels as n binary variables. |
|
|
74 |
binary.from.categorical <- function (x, col.names = NULL) { |
|
|
75 |
|
|
|
76 |
# Create a binary factor for each value of the categorical variable. |
|
|
77 |
d <- list() |
|
|
78 |
for (value in levels(x)) |
|
|
79 |
d[[value]] <- factor(as.integer(x == value)) |
|
|
80 |
|
|
|
81 |
# Convert the list to a data frame, and adjust the column names, if |
|
|
82 |
# requested. |
|
|
83 |
d <- data.frame(d,check.names = FALSE) |
|
|
84 |
if (!is.null(col.names)) |
|
|
85 |
names(d) <- col.names |
|
|
86 |
|
|
|
87 |
# Output the newly created data frame. |
|
|
88 |
return(d) |
|
|
89 |
} |
|
|
90 |
|
|
|
91 |
# ---------------------------------------------------------------------- |
|
|
92 |
# For each row of the matrix or data frame, return TRUE if all the |
|
|
93 |
# entries in the row are provided (that is, they are not missing). |
|
|
94 |
none.missing.row <- function (x) |
|
|
95 |
rowSums(is.na(x)) == 0 |
|
|
96 |
|
|
|
97 |
# ---------------------------------------------------------------------- |
|
|
98 |
# Returns the minor allele frequency given a vector of genotypes |
|
|
99 |
# encoded as allele counts. |
|
|
100 |
compute.maf <- function (geno) { |
|
|
101 |
f <- mean(geno,na.rm = TRUE)/2 |
|
|
102 |
return(min(f,1-f)) |
|
|
103 |
} |
|
|
104 |
|
|
|
105 |
# ---------------------------------------------------------------------- |
|
|
106 |
# Compare the empirical quantiles of x against the expected valuesx |
|
|
107 |
# under the normal distribution. |
|
|
108 |
check.normal.quantiles <- function (x) { |
|
|
109 |
|
|
|
110 |
# Discard the missing values. |
|
|
111 |
x <- x[!is.na(x)] |
|
|
112 |
|
|
|
113 |
# Transform the observations so that they have zero mean and unit |
|
|
114 |
# variance. |
|
|
115 |
x <- (x - mean(x))/sqrt(var(x)) |
|
|
116 |
|
|
|
117 |
# Create a data frame giving the observed and expected proportion of |
|
|
118 |
# samples within 1, 2 and 3 standard deviations of the mean. |
|
|
119 |
return(data.frame(exp = c(pnorm(1) - pnorm(-1), |
|
|
120 |
pnorm(2) - pnorm(-2), |
|
|
121 |
pnorm(3) - pnorm(-3)), |
|
|
122 |
obs = c(mean(-1 < x & x < 1), |
|
|
123 |
mean(-2 < x & x < 2), |
|
|
124 |
mean(-3 < x & x < 3)), |
|
|
125 |
row.names = c("sd1","sd2","sd3"))) |
|
|
126 |
} |