Diff of /R/fit_ZSCP.R [000000] .. [727782]

Switch to unified view

a b/R/fit_ZSCP.R
1
fit.ZSCP <- function(features,
2
                     metadata,
3
                     link = "log",
4
                     formula = NULL,
5
                     random_effects_formula = NULL,
6
                     cutoff_ZSCP = 0.3,
7
                     correction = 'BH',
8
                     cores = 4,
9
                     optimizer = 'nlminb',
10
                     na.action = na.exclude) {
11
  ######################################
12
  # Fit and summary functions for ZSCP #
13
  ######################################
14
  
15
  if (is.null(random_effects_formula)) {
16
    ##########################
17
    # Fixed effects modeling #
18
    ##########################
19
    
20
    model_function <- function(formula,
21
                               data,
22
                               link,
23
                               optimizer,
24
                               na.action,
25
                               cutoff_ZSCP) {
26
      ##########################################
27
      # Calculate stratifier for ZSCP modeling #
28
      ##########################################
29
      
30
      zero_percentage <- mean(data$expr == 0, na.rm = TRUE)
31
      
32
      ################################
33
      # Stratified per-feature model #
34
      ################################
35
      
36
      ifelse(zero_percentage <= cutoff_ZSCP,
37
             return(
38
               cplm::cpglm(
39
                 formula = formula,
40
                 data = data,
41
                 link = link,
42
                 optimizer = optimizer,
43
                 na.action = na.action
44
               )
45
             ),
46
             return(
47
               zcpglm(
48
                 formula = formula,
49
                 data = data,
50
                 link = link,
51
                 optimizer = optimizer,
52
                 na.action = na.action
53
               )
54
             ))
55
    }
56
    
57
    summary_function <- function(fit) {
58
      if (class(fit) == "cpglm") {
59
        cplm_out <-
60
          capture.output(cplm_summary <- cplm::summary(fit)$coefficients)
61
        para <- as.data.frame(cplm_summary)[-1,-3]
62
        para$base.model <- 'CPLM'
63
        para$tweedie.index <- round(fit$p, 3)
64
        para$name <- rownames(cplm_summary)[-1]
65
        return(para)
66
      }
67
      if (class(fit) == "zcpglm") {
68
        zicp_out <-
69
          capture.output(zicp_summary <-
70
                           cplm::summary(fit)$coefficients$tweedie)
71
        para <- as.data.frame(zicp_summary)[-1,-3]
72
        para$base.model <- 'ZICP'
73
        para$tweedie.index <- round(fit$p, 3)
74
        para$name <- rownames(zicp_summary)[-1]
75
        return(para)
76
      }
77
    }
78
    
79
  } else{
80
    ###########################
81
    # Random effects modeling #
82
    ###########################
83
    
84
    formula <-
85
      paste('. ~', paste(all.vars(formula)[-1], collapse = ' + '), '.', sep = ' + ')
86
    formula <- update(random_effects_formula, formula)
87
    
88
    model_function <- function(formula,
89
                               data,
90
                               link,
91
                               optimizer,
92
                               na.action,
93
                               cutoff_ZSCP) {
94
      ##########################################
95
      # Calculate stratifier for ZSCP modeling #
96
      ##########################################
97
      
98
      zero_percentage <- mean(data$expr == 0, na.rm = TRUE)
99
      
100
      ################################
101
      # Stratified per-feature model #
102
      ################################
103
      
104
      ifelse(zero_percentage <= cutoff_ZSCP,
105
             return(
106
               glmmTMB::glmmTMB(
107
                 formula = formula,
108
                 data = data,
109
                 family = glmmTMB::tweedie(link = link),
110
                 ziformula = ~ 0,
111
                 na.action = na.action
112
               )
113
             ),
114
             return(
115
               glmmTMB::glmmTMB(
116
                 formula = formula,
117
                 data = data,
118
                 family = glmmTMB::tweedie(link = link),
119
                 ziformula = ~ 1,
120
                 na.action = na.action
121
               )
122
             ))
123
    }
124
    
125
    summary_function <- function(fit) {
126
      glmmTMB_summary <- coef(summary(fit))
127
      para <- as.data.frame(glmmTMB_summary$cond)[-1,-3]
128
      para$base.model <-
129
        ifelse(is.null(glmmTMB_summary$zi), 'CPLM', 'ZICP')
130
      para$tweedie.index <-
131
        round(unname(plogis(fit$fit$par["thetaf"]) + 1), 3)
132
      para$name <- rownames(glmmTMB_summary$cond)[-1]
133
      return(para)
134
    }
135
  }
136
  
137
  
138
  #######################################
139
  # Init cluster for parallel computing #
140
  #######################################
141
  
142
  cluster <- NULL
143
  if (cores > 1)
144
  {
145
    logging::loginfo("Creating cluster of %s R processes", cores)
146
    cluster <- parallel::makeCluster(cores)
147
    clusterExport(
148
      cluster,
149
      c(
150
        "features",
151
        "metadata",
152
        "formula",
153
        "link",
154
        "optimizer",
155
        "na.action",
156
        "model_function",
157
        "summary_function",
158
        "cutoff_ZSCP"
159
      ),
160
      envir = environment()
161
    )
162
  }
163
  
164
  ##############################
165
  # Apply per-feature modeling #
166
  ##############################
167
  
168
  outputs <-
169
    pbapply::pblapply(seq_len(ncol(features)), cl = cluster, function(x) {
170
      #################################
171
      # Create per-feature data frame #
172
      #################################
173
      
174
      featuresVector <- features[, x]
175
      logging::loginfo("Fitting model to feature number %d, %s",
176
                       x,
177
                       colnames(features)[x])
178
      dat_sub <-
179
        data.frame(expr = as.numeric(featuresVector), metadata)
180
      
181
      #############
182
      # Fit model #
183
      #############
184
      
185
      fit <- tryCatch({
186
        fit1 <-
187
          model_function(
188
            formula = formula,
189
            data = dat_sub,
190
            link = link,
191
            optimizer = optimizer,
192
            na.action = na.action,
193
            cutoff_ZSCP = cutoff_ZSCP
194
          )
195
      }, error = function(err) {
196
        fit1 <-
197
          try({
198
            model_function(
199
              formula = formula,
200
              data = dat_sub,
201
              link = link,
202
              optimizer = optimizer,
203
              na.action = na.action,
204
              cutoff_ZSCP = cutoff_ZSCP
205
            )
206
          })
207
        return(fit1)
208
      })
209
      
210
      #################
211
      # Gather Output #
212
      #################
213
      
214
      output <- list()
215
      if (all(!inherits(fit, "try-error"))) {
216
        output$para <- summary_function(fit)
217
      }
218
      else{
219
        logging::logwarn(paste("Fitting problem for feature", x, "returning NA"))
220
        output$para <-
221
          as.data.frame(matrix(NA,  nrow = ncol(metadata) - 1, ncol = 5)) # Everything except offset
222
        output$para$name <-
223
          colnames(metadata)[-ncol(metadata)] # Everything except offset
224
      }
225
      colnames(output$para) <-
226
        c('coef',
227
          'stderr' ,
228
          'pval',
229
          'base.model',
230
          'tweedie.index',
231
          'name')
232
      output$para$feature <- colnames(features)[x]
233
      return(output)
234
    })
235
  
236
  ####################
237
  # Stop the cluster #
238
  ####################
239
  
240
  if (!is.null(cluster))
241
    parallel::stopCluster(cluster)
242
  
243
  #####################################
244
  # Bind the results for each feature #
245
  #####################################
246
  
247
  paras <-
248
    do.call(rbind, lapply(outputs, function(x) {
249
      return(x$para)
250
    }))
251
  
252
  ################################
253
  # Apply correction to p-values #
254
  ################################
255
  
256
  paras$qval <-
257
    as.numeric(p.adjust(paras$pval, method = correction))
258
  
259
  #####################################################
260
  # Determine the metadata names from the model names #
261
  #####################################################
262
  
263
  metadata_names <- setdiff(colnames(metadata), "offset")
264
  # order the metadata names by decreasing length
265
  metadata_names_ordered <-
266
    metadata_names[order(nchar(metadata_names), decreasing = TRUE)]
267
  # find the metadata name based on the match
268
  # to the beginning of the string
269
  extract_metadata_name <- function(name) {
270
    return(metadata_names_ordered[mapply(startsWith,
271
                                         name,
272
                                         metadata_names_ordered)][1])
273
  }
274
  paras$metadata <-
275
    unlist(lapply(paras$name, extract_metadata_name))
276
  # compute the value as the model contrast minus metadata
277
  paras$value <-
278
    mapply(function(x, y) {
279
      if (x == y)
280
        x
281
      else
282
        gsub(x, "", y)
283
    }, paras$metadata, paras$name)
284
  
285
  ##############################
286
  # Sort by decreasing q-value #
287
  ##############################
288
  
289
  paras <- paras[order(paras$qval, decreasing = FALSE),]
290
  paras <-
291
    dplyr::select(paras,
292
                  c('feature', 'metadata', 'value'),
293
                  dplyr::everything())
294
  paras <- dplyr::select(paras,-name)
295
  rownames(paras) <- NULL
296
  return(list("results" = paras))
297
}