Switch to unified view

a b/inmoose/edgepy/mglmOneGroup.py
1
# -----------------------------------------------------------------------------
2
# Copyright (C) 2008-2022 Yunshun Chen, Aaron TL Lun, Davis J McCarthy, Matthew E Ritchie, Belinda Phipson, Yifang Hu, Xiaobei Zhou, Mark D Robinson, Gordon K Smyth
3
# Copyright (C) 2022-2023 Maximilien Colange
4
5
# This program is free software: you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation, either version 3 of the License, or
8
# (at your option) any later version.
9
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
# GNU General Public License for more details.
14
15
# You should have received a copy of the GNU General Public License
16
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
17
# -----------------------------------------------------------------------------
18
19
# This file is based on the file 'R/mglmOneGroup.R' of the Bioconductor edgeR package (version 3.38.4).
20
21
22
import numpy as np
23
24
from ..utils import LOGGER
25
from .glm_one_group import fit_one_group
26
from .makeCompressedMatrix import (
27
    _compressDispersions,
28
    _compressOffsets,
29
    _compressWeights,
30
)
31
from .utils import _isAllZero
32
33
34
def mglmOneGroup(
35
    y,
36
    dispersion=0,
37
    offset=0,
38
    weights=None,
39
    coef_start=None,
40
    maxit=50,
41
    tol=1e-10,
42
    verbose=False,
43
):
44
    """
45
    Fit single-group negative-binomial GLMs genewise.
46
47
    This is a low-level work-horse used by higher-level functions, especially
48
    :func:`glmFit`. Most users will not need to call this function directly.
49
50
    This function fits a negative binomial GLM to each row of :code:`y`. The
51
    row-wise GLMs all have the same design matrix but possibly different
52
    dispersions, offsets and weights. It is low-level in that it
53
    operates on atomic objects (matrices and vectors).
54
55
    This function fits an intercept only model to each response vector. In other
56
    words, it treats all the libraries as belonging to one group. It implements
57
    Fisher scoring with a score-statistic stopping criterion for each gene.  It
58
    treats the dispersion parameter of the negative binomial distribution as a
59
    known input.  Excellent starting values are available for the null model so
60
    this function seldom has any problems with convergence. It is used by other
61
    functions to compute the overall abundance for each gene.
62
63
    Arguments
64
    ---------
65
    y : array_like
66
        matrix of negative binomial counts. Rows for genes and columns for
67
        libraries.
68
    dispersion : float or array_like
69
        scalar or vector giving the dispersion parameter for each GLM. Can be a
70
        scalar giving one value for all genes, or a vector of length equal to
71
        the number of genes giving genewise dispersions.
72
    offset : array_like
73
        vector or matrix giving the offset that is to be included in the log
74
        linear model predictor. Can be a scalar, a vector of length equal to the
75
        number of libraries, or a matrix of the same shape as :code:`y`.
76
    weights : matrix, optional
77
        vector or matrix of non-negative quantitative weights. Can be a vector
78
        of length equal to the number of libraries, or a matrix of the same
79
        shape as :code:`y`.
80
    coef_start : array_like, optional
81
        matrix of starting values for the linear model coefficient. Number of
82
        rows should agree with :code:`y` and a single column. This argument does
83
        not usually need to be set as the automatic starting values perform well.
84
    maxit : int
85
        the maximum number of iterations for the Fisher scoring algorithm. The
86
        iteration will be stopped when this limit is reached even if the
87
        convergence criterion has not been satisfied.
88
    tol : float
89
        the convergence tolerance. Convergence if judged successful when the
90
        step size falls below :code:`tol` in absolute size.
91
92
    Returns
93
    -------
94
    ndarray
95
        vector of coefficients
96
    """
97
    # Check y
98
    # TODO check that y is a matrix and numeric
99
    _isAllZero(y)
100
101
    # Check dispersion
102
    dispersion = _compressDispersions(y, dispersion)
103
104
    # Check offset
105
    offset = _compressOffsets(y, offset=offset)
106
107
    # Check starting values
108
    if coef_start is None:
109
        coef_start = np.nan
110
    coef_start = np.full((y.shape[0],), coef_start, dtype="double")
111
112
    # Check weights
113
    weights = _compressWeights(y, weights)
114
115
    # Fisher scoring iteration
116
    output = fit_one_group(y, offset, dispersion, weights, maxit, tol, coef_start)
117
118
    # Convergence achieved for all tags?
119
    if np.count_nonzero(output[1]) > 0:
120
        LOGGER.debug(f"max iterations exceeded for {np.count_nonzero(output[1])} tags")
121
122
    return output[0]