|
a |
|
b/inmoose/edgepy/mglmOneGroup.py |
|
|
1 |
# ----------------------------------------------------------------------------- |
|
|
2 |
# Copyright (C) 2008-2022 Yunshun Chen, Aaron TL Lun, Davis J McCarthy, Matthew E Ritchie, Belinda Phipson, Yifang Hu, Xiaobei Zhou, Mark D Robinson, Gordon K Smyth |
|
|
3 |
# Copyright (C) 2022-2023 Maximilien Colange |
|
|
4 |
|
|
|
5 |
# This program is free software: you can redistribute it and/or modify |
|
|
6 |
# it under the terms of the GNU General Public License as published by |
|
|
7 |
# the Free Software Foundation, either version 3 of the License, or |
|
|
8 |
# (at your option) any later version. |
|
|
9 |
|
|
|
10 |
# This program is distributed in the hope that it will be useful, |
|
|
11 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
|
12 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
|
13 |
# GNU General Public License for more details. |
|
|
14 |
|
|
|
15 |
# You should have received a copy of the GNU General Public License |
|
|
16 |
# along with this program. If not, see <https://www.gnu.org/licenses/>. |
|
|
17 |
# ----------------------------------------------------------------------------- |
|
|
18 |
|
|
|
19 |
# This file is based on the file 'R/mglmOneGroup.R' of the Bioconductor edgeR package (version 3.38.4). |
|
|
20 |
|
|
|
21 |
|
|
|
22 |
import numpy as np |
|
|
23 |
|
|
|
24 |
from ..utils import LOGGER |
|
|
25 |
from .glm_one_group import fit_one_group |
|
|
26 |
from .makeCompressedMatrix import ( |
|
|
27 |
_compressDispersions, |
|
|
28 |
_compressOffsets, |
|
|
29 |
_compressWeights, |
|
|
30 |
) |
|
|
31 |
from .utils import _isAllZero |
|
|
32 |
|
|
|
33 |
|
|
|
34 |
def mglmOneGroup( |
|
|
35 |
y, |
|
|
36 |
dispersion=0, |
|
|
37 |
offset=0, |
|
|
38 |
weights=None, |
|
|
39 |
coef_start=None, |
|
|
40 |
maxit=50, |
|
|
41 |
tol=1e-10, |
|
|
42 |
verbose=False, |
|
|
43 |
): |
|
|
44 |
""" |
|
|
45 |
Fit single-group negative-binomial GLMs genewise. |
|
|
46 |
|
|
|
47 |
This is a low-level work-horse used by higher-level functions, especially |
|
|
48 |
:func:`glmFit`. Most users will not need to call this function directly. |
|
|
49 |
|
|
|
50 |
This function fits a negative binomial GLM to each row of :code:`y`. The |
|
|
51 |
row-wise GLMs all have the same design matrix but possibly different |
|
|
52 |
dispersions, offsets and weights. It is low-level in that it |
|
|
53 |
operates on atomic objects (matrices and vectors). |
|
|
54 |
|
|
|
55 |
This function fits an intercept only model to each response vector. In other |
|
|
56 |
words, it treats all the libraries as belonging to one group. It implements |
|
|
57 |
Fisher scoring with a score-statistic stopping criterion for each gene. It |
|
|
58 |
treats the dispersion parameter of the negative binomial distribution as a |
|
|
59 |
known input. Excellent starting values are available for the null model so |
|
|
60 |
this function seldom has any problems with convergence. It is used by other |
|
|
61 |
functions to compute the overall abundance for each gene. |
|
|
62 |
|
|
|
63 |
Arguments |
|
|
64 |
--------- |
|
|
65 |
y : array_like |
|
|
66 |
matrix of negative binomial counts. Rows for genes and columns for |
|
|
67 |
libraries. |
|
|
68 |
dispersion : float or array_like |
|
|
69 |
scalar or vector giving the dispersion parameter for each GLM. Can be a |
|
|
70 |
scalar giving one value for all genes, or a vector of length equal to |
|
|
71 |
the number of genes giving genewise dispersions. |
|
|
72 |
offset : array_like |
|
|
73 |
vector or matrix giving the offset that is to be included in the log |
|
|
74 |
linear model predictor. Can be a scalar, a vector of length equal to the |
|
|
75 |
number of libraries, or a matrix of the same shape as :code:`y`. |
|
|
76 |
weights : matrix, optional |
|
|
77 |
vector or matrix of non-negative quantitative weights. Can be a vector |
|
|
78 |
of length equal to the number of libraries, or a matrix of the same |
|
|
79 |
shape as :code:`y`. |
|
|
80 |
coef_start : array_like, optional |
|
|
81 |
matrix of starting values for the linear model coefficient. Number of |
|
|
82 |
rows should agree with :code:`y` and a single column. This argument does |
|
|
83 |
not usually need to be set as the automatic starting values perform well. |
|
|
84 |
maxit : int |
|
|
85 |
the maximum number of iterations for the Fisher scoring algorithm. The |
|
|
86 |
iteration will be stopped when this limit is reached even if the |
|
|
87 |
convergence criterion has not been satisfied. |
|
|
88 |
tol : float |
|
|
89 |
the convergence tolerance. Convergence if judged successful when the |
|
|
90 |
step size falls below :code:`tol` in absolute size. |
|
|
91 |
|
|
|
92 |
Returns |
|
|
93 |
------- |
|
|
94 |
ndarray |
|
|
95 |
vector of coefficients |
|
|
96 |
""" |
|
|
97 |
# Check y |
|
|
98 |
# TODO check that y is a matrix and numeric |
|
|
99 |
_isAllZero(y) |
|
|
100 |
|
|
|
101 |
# Check dispersion |
|
|
102 |
dispersion = _compressDispersions(y, dispersion) |
|
|
103 |
|
|
|
104 |
# Check offset |
|
|
105 |
offset = _compressOffsets(y, offset=offset) |
|
|
106 |
|
|
|
107 |
# Check starting values |
|
|
108 |
if coef_start is None: |
|
|
109 |
coef_start = np.nan |
|
|
110 |
coef_start = np.full((y.shape[0],), coef_start, dtype="double") |
|
|
111 |
|
|
|
112 |
# Check weights |
|
|
113 |
weights = _compressWeights(y, weights) |
|
|
114 |
|
|
|
115 |
# Fisher scoring iteration |
|
|
116 |
output = fit_one_group(y, offset, dispersion, weights, maxit, tol, coef_start) |
|
|
117 |
|
|
|
118 |
# Convergence achieved for all tags? |
|
|
119 |
if np.count_nonzero(output[1]) > 0: |
|
|
120 |
LOGGER.debug(f"max iterations exceeded for {np.count_nonzero(output[1])} tags") |
|
|
121 |
|
|
|
122 |
return output[0] |