Diff of /inmoose/edgepy/glmFit.py [000000] .. [ea0fd6]

Switch to unified view

a b/inmoose/edgepy/glmFit.py
1
# -----------------------------------------------------------------------------
2
# Copyright (C) 2008-2022 Yunshun Chen, Aaron TL Lun, Davis J McCarthy, Matthew E Ritchie, Belinda Phipson, Yifang Hu, Xiaobei Zhou, Mark D Robinson, Gordon K Smyth
3
# Copyright (C) 2022-2024 Maximilien Colange
4
5
# This program is free software: you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation, either version 3 of the License, or
8
# (at your option) any later version.
9
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
# GNU General Public License for more details.
14
15
# You should have received a copy of the GNU General Public License
16
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
17
# -----------------------------------------------------------------------------
18
19
# This file is based on the file 'R/glmfit.R' of the Bioconductor edgeR package (version 3.38.4).
20
21
22
import numpy as np
23
import pandas as pd
24
import scipy
25
from patsy import DesignMatrix, dmatrix
26
27
from ..utils import asfactor
28
from .DGEGLM import DGEGLM
29
from .DGELRT import DGELRT
30
from .makeCompressedMatrix import _compressDispersions, _compressOffsets
31
from .mglmLevenberg import mglmLevenberg
32
from .mglmOneWay import designAsFactor, mglmOneWay
33
from .nbinomDeviance import nbinomDeviance
34
from .predFC import predFC
35
36
37
def glmFit_DGEList(self, design=None, dispersion=None, prior_count=0.125, start=None):
38
    """
39
    Fit a negative binomial generalized log-linear model to the read counts for
40
    each gene. Conduct genewise statistical tests for a given coefficient or
41
    coefficient contrast.
42
43
    See also
44
    --------
45
    glmFit
46
47
    Arguments
48
    ---------
49
    design : matrix, optional
50
        design matrix for the genewise linear models. Must be of full column
51
        rank. Defaults to a single column of ones, equivalent to treating the
52
        columns as replicate libraries.
53
    dispersion : float or array_like
54
        scalar, vector or matrix of negative binomial dispersions. Can be a
55
        common value for all genese, a vector of dispersion values with one for
56
        each gene, or a matrix of dispersion values with one for each observation.
57
        If :code:`None`, it will be extracted from :code:`y`, with order of
58
        precedence: genewise dispersion, trended dispersion, common dispersion.
59
    prior_count : float
60
        average prior count to be added to observation to shrink the estimated
61
        log-fold-change towards zero.
62
    start : matrix, optional
63
        initial estimates for the linear model coefficients
64
65
    Returns
66
    -------
67
    DGEGLM
68
        object containing the data about the fit
69
    """
70
71
    # The design matrix defaults to the oneway layout defined by self.samples["group"]
72
    # If there is only one group, then the design matrix is left None so that a matrix with a single intercept column will be set later by glmFit.
73
    if design is None:
74
        design = self.design
75
        if design is None:
76
            group = asfactor(self.samples["group"]).droplevels()
77
            if group.nlevels() > 1:
78
                design = dmatrix("~C(self.samples['group'])")
79
80
    if dispersion is None:
81
        dispersion = self.getDispersion()
82
    if dispersion is None:
83
        raise ValueError("No dispersion values found in DGEList object")
84
    offset = self.getOffset()
85
    if self.AveLogCPM is None:
86
        self.AveLogCPM = self.aveLogCPM()
87
88
    fit = glmFit(
89
        y=self.counts,
90
        design=design,
91
        dispersion=dispersion,
92
        offset=offset,
93
        lib_size=None,
94
        weights=self.weights,
95
        prior_count=prior_count,
96
        start=start,
97
    )
98
99
    fit.samples = self.samples
100
    fit.genes = self.genes
101
    fit.prior_df = self.prior_df
102
    fit.AveLogCPM = self.AveLogCPM
103
    return fit
104
105
106
def glmFit(
107
    y,
108
    design=None,
109
    dispersion=None,
110
    offset=None,
111
    lib_size=None,
112
    weights=None,
113
    prior_count=0.125,
114
    start=None,
115
):
116
    """
117
    Fit a negative binomial generalized log-linear model to the read counts for
118
    each gene. Conduct genewise statistical tests for a given coefficient or
119
    coefficient contrast.
120
121
    This function implements one of the GLM methods developed by [McCarthy2012]_.
122
123
    :code:`glmFit` fits genewise negative binomial GLMs, all with the same
124
    design matrix but possibly different dispersions, offsets and weights.
125
    When the design matrix defines a one-way layout, or can be re-parameterized
126
    to a one-way layout, the GLMs are fitting very quickly using
127
    :func:`mglmOneGroup`. Otherwise the default fitting method, implemented in
128
    :func:`mglmLevenberg`, uses a Fisher scoring algorithm with Levenberg-style
129
    damping.
130
131
    Positive :code:`prior_count` cause the returned coefficients to be shrunk in
132
    such a way that fold-changes between the treatment conditions are decreased.
133
    In particular, infinite fold-changes are avoided. Larger values cause more
134
    shrinkage. The returned coefficients are affected but not the likelihood
135
    ratio tests or p-values.
136
137
    See also
138
    --------
139
    mglmOneGroup : low-level computations
140
    mglmLevenberg : low-level computations
141
142
    Arguments
143
    ---------
144
    y : pd.DataFrame
145
        matrix of counts
146
    design : matrix, optional
147
        design matrix for the genewise linear models. Must be of full column
148
        rank. Defaults to a single column of ones, equivalent to treating the
149
        columns as replicate libraries.
150
    dispersion : float or array_like
151
        scalar, vector or matrix of negative binomial dispersions. Can be a
152
        common value for all genes, a vector of dispersion values with one for
153
        each gene, or a matrix of dispersion values with one for each
154
        observation.
155
    offset : float or array_like, optional
156
        matrix of the same shape as :code:`y` giving offsets for the log-linear
157
        models. Can be a scalar or a vector of length :code:`y.shape[1]`, in
158
        which case it is broadcasted to the shape of :code:`y`.
159
    lib_size : array_like, optional
160
        vector of length :code:`y.shape[1]` giving library sizes. Only used if
161
        :code:`offset=None`, in which case :code:`offset` is set to
162
        :code:`log(lib_size)`. Defaults to :code:`colSums(y)`.
163
    weights : matrix, optional
164
        prior weights for the observations (for each library and gene) to be
165
        used in the GLM calculations
166
    prior_count : float
167
        average prior count to be added to observation to shrink the estimated
168
        log-fold-change towards zero.
169
    start : matrix, optional
170
        initial estimates for the linear model coefficients
171
172
    Returns
173
    -------
174
    DGEGLM
175
        object containing:
176
177
        - :code:`counts`, the input matrix of counts
178
        - :code:`design`, the input design matrix
179
        - :code:`weights`, the input weights matrix
180
        - :code:`offset`, matrix of linear model offsets
181
        - :code:`dispersion`, vector of dispersions used for the fit
182
        - :code:`coefficients`, matrix of estimated coefficients from the GLM
183
          fits, on the natural log scale, of size :code:`y.shape[0]` by
184
          :code:`design.shape[1]`.
185
        - :code:`unshrunk_coefficients`, matrix of estimated coefficients from
186
          the GLM fits when no log-fold-changes shrinkage is applied, on the
187
          natural log scale, of size :code:`y.shape[0]` by
188
          :code:`design.shape[1]`. It exists only when :code:`prior_count` is
189
          not 0.
190
        - :code:`fitted_values`, matrix of fitted values from GLM fits, same
191
          shape as :code:`y`
192
        - :code:`deviance`, numeric vector of deviances, one for each gene
193
    """
194
    # Check y
195
    (ntag, nlib) = y.shape
196
197
    # Check design
198
    if design is None:
199
        design = dmatrix("~1", pd.DataFrame(y.T))
200
    try:
201
        design = DesignMatrix(
202
            np.asarray(design, order="F"), design_info=design.design_info
203
        )
204
    except AttributeError:
205
        design = np.asarray(design, order="F")
206
207
    if design.shape[0] != nlib:
208
        raise ValueError("design should have as many rows as y has columns")
209
    if np.linalg.matrix_rank(design) < design.shape[1]:
210
        raise ValueError(
211
            "Design matrix is not full rank. Some coefficients are not estimable"
212
        )
213
214
    # Check dispersion
215
    if dispersion is None:
216
        raise ValueError("No dispersion values provided")
217
    dispersion = np.asanyarray(dispersion)
218
    # TODO check dispersion for NaN and non-numeric values
219
    if dispersion.shape not in [(), (1,), (ntag,), y.shape]:
220
        raise ValueError("Dimensions of dispersion do not agree with dimensions of y")
221
    dispersion_mat = _compressDispersions(y, dispersion)
222
223
    # Check offset
224
    if offset is not None:
225
        # TODO check that offset is numeric
226
        offset = np.asanyarray(offset)
227
        if offset.shape not in [(), (1,), (nlib,), y.shape]:
228
            raise ValueError("Dimensions of offset do not agree with dimensions of y")
229
230
    # Check lib_size
231
    if lib_size is not None:
232
        # TODO check that lib_size is numeric
233
        lib_size = np.asarray(lib_size)
234
        if lib_size.shape not in [(), (1,), (nlib,)]:
235
            raise ValueError("lib_size has wrong length, should agree with ncol(y)")
236
237
    # Consolidate lib_size and offset into a compressed matrix
238
    offset = _compressOffsets(y=y, lib_size=lib_size, offset=offset)
239
240
    # weights are checked in lower-level functions
241
242
    # Fit the tagwise GLMs
243
    # If the design is equivalent to a oneway layout, use a shortcut algorithm
244
    group = designAsFactor(design)
245
    if group.nlevels() == design.shape[1]:
246
        (coef, fitted_values) = mglmOneWay(
247
            y,
248
            design=design,
249
            group=group,
250
            dispersion=dispersion_mat,
251
            offset=offset,
252
            weights=weights,
253
            coef_start=start,
254
        )
255
        deviance = nbinomDeviance(
256
            y=y, mean=fitted_values, dispersion=dispersion_mat, weights=weights
257
        )
258
        fit_method = "oneway"
259
        fit = (coef, fitted_values, deviance, None, None)
260
    else:
261
        fit = mglmLevenberg(
262
            y,
263
            design=design,
264
            dispersion=dispersion_mat,
265
            offset=offset,
266
            weights=weights,
267
            coef_start=start,
268
            maxit=250,
269
        )
270
        fit_method = "levenberg"
271
272
    # Prepare output
273
    fit = DGEGLM(fit)
274
    fit.counts = y
275
    fit.method = fit_method
276
    if prior_count > 0:
277
        fit.unshrunk_coefficients = fit.coefficients
278
        fit.coefficients = predFC(
279
            y,
280
            design,
281
            offset=offset,
282
            dispersion=dispersion_mat,
283
            prior_count=prior_count,
284
            weights=weights,
285
        ) * np.log(2)
286
287
    # counts N,M
288
    # design M,P
289
    assert y.shape[1] == design.shape[0]
290
    w_vec = fit.fitted_values / (1.0 + dispersion_mat * fit.fitted_values)
291
    if weights is not None:
292
        w_vec = weights * w_vec
293
    ridge = np.diag(np.repeat(1e-6 / (np.log(2) ** 2), design.shape[1]))
294
    xtwxr_inv = np.linalg.inv(design.T @ (design * w_vec[:, :, None]) + ridge)
295
    sigma = xtwxr_inv @ design.T @ (design * w_vec[:, :, None]) @ xtwxr_inv
296
    fit.coeff_SE = np.diagonal(sigma, axis1=-2, axis2=-1)
297
298
    # FIXME (from original R source) we are not allowing missing values, so df.residual must be same for all tags
299
    fit.df_residual = np.full(ntag, nlib - design.shape[1])
300
    fit.design = design
301
    fit.offset = offset
302
    fit.dispersion = dispersion
303
    fit.weights = weights
304
    fit.prior_count = prior_count
305
    return fit
306
307
308
def glmLRT(glmfit, coef=None, contrast=None):
309
    """
310
    Conduct genewise statistical tests for a given coefficient or coefficient contrast.
311
312
    This function implements one of the GLM methods developed by [McCarthy2012]_.
313
314
    :func:`glmLRT` conducts likelihood ratio tests for one or more coefficients
315
    in the linear model. If :code:`coef` is used, the null hypothesis is that
316
    all the coefficients indicated by :code:`coef` are equal to zero. If
317
    :code:`contrast` is non-null, then the null hypothesis is that the
318
    specified contrasts of the coefficients are equal to zero. For example, a
319
    contrast of :code:`[0,1,-1]`, assuming there are three coefficients, would
320
    test the hypothesis that the second and third coefficients are equal.
321
322
    Arguments
323
    ---------
324
    glmfit : DGEGLM
325
        a :class:`DGEGLM` object, usually output from :func:`glmFit`
326
    coef : array_like of integers or strings
327
        vector indicating which coefficients of the linear model are to be
328
        tested equal to zero. Values must be column indices or column names of
329
        :code:`design`. Defaults to the last coefficient. Ignored if
330
        :code:`contrast` is specified.
331
    contrast : array or matrix of integers
332
        vector or matrix specifying one or more contrasts of the linear model
333
        coefficients to be tested equal to zero. Number of rows must equal to
334
        the number of columns of :code:`design`. If specified, then takes
335
        precedence over :code:`coef`.
336
337
    Returns
338
    -------
339
    DGELRT
340
        dataframe with two additional components:
341
342
        - :code:`fit` containing the result of :func:`glmFit`
343
        - :code:`comparison`, string describing the coefficient or the contrast
344
          being tested
345
346
        The dataframe has the same rows as :code:`y` and is ready to be
347
        displayed by :func:`topTags`. It contains the following columns:
348
349
        - :code:`"log2FoldChange"`, log2-fold-change of expression between
350
          conditions being tested.
351
        - :code:`"lfcSE"`, standard error of log2-fold-change.
352
        - :code:`"logCPM"`, average log2-counts per million, the average taken
353
          over all libraries in :code:`y`.
354
        - :code:`"stat"`, likelihood ratio statistics.
355
        - :code:`"pvalue"`, *p*-values.
356
    """
357
    if coef is None:
358
        coef = glmfit.design.shape[1] - 1
359
    if not isinstance(glmfit, DGEGLM):
360
        raise ValueError("glmfit must be a DGEGLM object (usually produced by glmFit).")
361
362
    if glmfit.AveLogCPM is None:
363
        glmfit.AveLogCPM = glmfit.aveLogCPM()
364
    nlibs = glmfit.coefficients.shape[1]
365
366
    # check design matrix
367
    design = glmfit.design
368
    nbeta = design.shape[1]
369
    if nbeta < 2:
370
        raise ValueError(
371
            "Need at least two columns for design, usually the first is the intercept columns"
372
        )
373
    coef_names = design.design_info.column_names
374
375
    # Evaluate logFC for coef to be tested
376
    # Note that contrast takes precedence over coef: if contrast is given then reform
377
    # design matrix so that contrast of interest is last column
378
    if contrast is None:
379
        if not isinstance(coef, (list, np.ndarray)):
380
            coef = [coef]
381
        if isinstance(coef[0], str):
382
            check_coef = np.isin(coef, design.design_info.column_names)
383
            if (~check_coef).any():
384
                raise ValueError(
385
                    "One or more named coef arguments do not match a column of the design matrix."
386
                )
387
            coef_name = coef
388
            coef = np.nonzero([design.design_info.column_names == c for c in coef])[0]
389
        else:
390
            coef_name = [coef_names[c] for c in coef]
391
        logFC = glmfit.coefficients[:, coef] / np.log(2)
392
        lfcSE = glmfit.coeff_SE[:, coef] / np.log(2)
393
    else:
394
        # TODO make sure contrast is a matrix
395
        if contrast.shape[0] != glmfit.coefficients.shape[1]:
396
            raise ValueError(
397
                "contrast vector of wrong length, should be equal to number of coefficients in the linear model"
398
            )
399
        ncontrasts = np.linalg.matrix_rank(contrast)
400
        Q, R = np.linalg.qr(contrast)
401
        if ncontrasts == 0:
402
            raise ValueError("contrasts are all zero")
403
        coef = np.arange(ncontrasts)
404
        logFC = (glmfit.coefficients @ contrast) / np.log(2)
405
        lfcSE = (glmfit.coeff_SE @ contrast) / np.log(2)
406
        if ncontrasts > 1:
407
            coef_name = f"LR test on {ncontrasts} degrees of freedom"
408
        else:
409
            contrast = np.squeeze(contrast)
410
            i = contrast != 0
411
            coef_name = " ".join(
412
                [f"{a}*{b}" for a, b in zip(contrast[i], coef_names[i])]
413
            )
414
        Dvec = np.ones(nlibs, int)
415
        Dvec[coef] = np.diag(R)[coef]
416
        Q = Q * Dvec
417
        design = design @ Q
418
419
    # Null design matrix
420
    non_coef = np.setdiff1d(np.arange(design.shape[1]), coef)
421
    design0 = design[:, non_coef]
422
423
    # Null fit
424
    fit_null = glmFit(
425
        glmfit.counts,
426
        design=design0,
427
        offset=glmfit.offset,
428
        weights=glmfit.weights,
429
        dispersion=glmfit.dispersion,
430
        prior_count=0,
431
    )
432
433
    # Likelihood ratio statistic
434
    LR = np.subtract(fit_null.deviance, glmfit.deviance)
435
    df_test = fit_null.df_residual - glmfit.df_residual
436
    LRT_pvalue = scipy.stats.chi2.sf(LR, df=df_test)
437
    tab = pd.DataFrame()
438
    if logFC.ndim > 1:
439
        for i in range(logFC.shape[1]):
440
            tab[f"logFC{i}"] = logFC[:, i]
441
            tab[f"lfcSE{i}"] = lfcSE[:, i]
442
        tab.columns = [
443
            "log2FoldChange" if i % 2 == 0 else "lfcSE"
444
            for i in range(2 * logFC.shape[1])
445
        ]
446
447
    else:
448
        tab["log2FoldChange"] = logFC
449
        tab["lfcSE"] = lfcSE
450
    tab["logCPM"] = glmfit.AveLogCPM
451
    tab["stat"] = LR
452
    tab["pvalue"] = LRT_pvalue
453
    tab.index = glmfit.counts.index
454
    res = DGELRT(tab, glmfit)
455
    res.comparison = coef_name
456
    res.df_test = df_test
457
    return res