[727782]: / R / utilities.R

Download this file

169 lines (124 with data), 4.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
get_AICtab<-function(fit){
########################
# Flag invalid options #
########################
if (!class(fit) %in% c('cpglm', 'zcpglm', 'glmmTMB')){
stop('Not supported. Valid options are cplm , zcpglm, and glmmTMB')
}
######################
# Initialize AICtab #
######################
AICtab<-rep(NA, 5)
###########################
# Case-by-Case Extraction #
###########################
if (class(fit)=='cpglm'){
##########################################
# Back calculate logLik and BIC from AIC #
##########################################
AIC<-fit$aic
AIC_multiplier<-length(fit$y) - fit$df.residual
logLik<-(AIC - 2*AIC_multiplier)/2
BIC_multiplier<-AIC_multiplier*log(length(fit$y))
BIC<-BIC_multiplier + 2*logLik
deviance<-fit$deviance
df.resid<-fit$df.residual
# Coherent output
AICtab<-c(AIC, BIC, logLik, deviance, df.resid)
}
if (class(fit)=='zcpglm'){
##########################################
# Back calculate AIC and BIC from logLik #
##########################################
logLik<--fit$llik
AIC_multiplier<-length(fit$y) - fit$df.residual
BIC_multiplier<-AIC_multiplier*log(length(fit$y))
AIC<-2*AIC_multiplier + 2*logLik
BIC<-BIC_multiplier + 2*logLik
deviance<-NA
df.resid<-fit$df.residual
# Coherent output
AICtab<-c(AIC, BIC, logLik, deviance, df.resid)
}
if (class(fit)=='glmmTMB'){
#######################################
# Extract AICtab from glmmTMB objects #
#######################################
AICtab<-summary(fit)["AICtab"]$AICtab
}
##########
# Return #
##########
names(AICtab)<-c('AIC', 'BIC', 'logLik', 'deviance', 'df.resid')
return(AICtab)
}
# Adapted form: https://rstudio-pubs-static.s3.amazonaws.com/455435_30729e265f7a4d049400d03a18e218db.html
#' @export
entropy <- function(target) {
#if(all(is.na(target))) 0
freq <- table(target)/length(target)
# vectorize
vec <- as.data.frame(freq)[,2]
#drop 0 to avoid NaN resulting from log2
vec<-vec[vec>0]
#compute entropy
-sum(vec * log2(vec))
}
IG_numeric<-function(data, feature, target, bins=4) {
#Strip out rows where feature is NA
data<-data[!is.na(data[,feature]),]
#compute entropy for the parent
e0<-entropy(data[,target])
data$cat<-cut(data[,feature], breaks=bins, labels=c(1:bins))
#use dplyr to compute e and p for each value of the feature
dd_data <- data %>% group_by(cat) %>% summarise(e=entropy(get(target)),
n=length(get(target)),
min=min(get(feature)),
max=max(get(feature))
)
#calculate p for each value of feature
dd_data$p<-dd_data$n/nrow(data)
#compute IG
IG<-e0-sum(dd_data$p*dd_data$e)
return(IG)
}
#returns IG for categorical variables.
IG_cat<-function(data,feature,target){
#Strip out rows where feature is NA
data<-data[!is.na(data[,feature]),]
#use dplyr to compute e and p for each value of the feature
dd_data <- data %>% group_by_at(feature) %>% summarise(e=entropy(get(target)),
n=length(get(target))
)
#compute entropy for the parent
e0<-entropy(data[,target])
#calculate p for each value of feature
dd_data$p<-dd_data$n/nrow(data)
#compute IG
IG<-e0-sum(dd_data$p*dd_data$e)
return(IG)
}
# entropy (c("A", "A", "A", "A", "A", "B", "B"))
# 0.8631206
#entropy (c("A", "A", "A", "A"))
# 0
#entropy (c("A", "A", "A", "A", "B", "B", "B", "B"))
#1
#entropy (c("C", "A", "A", "A", "B", "B", "B", "B"))
# 1.405639
#entropy (c("C", "A", "D", "A", "B", "B", "B", "B"))
# 1.75
# entropy (c(1, 1, 2, 1, 1, 1, 2, 1))
#0.8112781
# Written by Grace
extractAssay <- function(input, assay_name = "counts") {
# Extract assay name based on the user input
if ("counts" %in% assayNames(input)) {
counts_data <- assay(input, assay_name)
cat("The specified assay has been extracted\n")
return(as.data.frame(as.matrix(counts_data)))
} else {
cat("The specified assay was not found\n")
return(NULL)
}
}