inmoose/edgepy/dispCoxReid.py

# -----------------------------------------------------------------------------
# Copyright (C) 2008-2022 Yunshun Chen, Aaron TL Lun, Davis J McCarthy, Matthew E Ritchie, Belinda Phipson, Yifang Hu, Xiaobei Zhou, Mark D Robinson, Gordon K Smyth
# Copyright (C) 2022-2023 Maximilien Colange

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------

# This file is based on the file 'R/dispCoxReid.R' of the Bioconductor edgeR package (version 3.38.4).


import numpy as np
from scipy.optimize import minimize_scalar

from .adjustedProfileLik import adjustedProfileLik
from .aveLogCPM import aveLogCPM
from .makeCompressedMatrix import _compressOffsets, _compressWeights
from .systematicSubset import systematicSubset


def dispCoxReid(
    y,
    design=None,
    offset=None,
    weights=None,
    AveLogCPM=None,
    interval=(0, 4),
    tol=1e-5,
    min_row_sum=5,
    subset=10000,
):
    """
    Estimate a common dispersion parameter across multiple negative binomial
    GLMs, by maximizing the Cox-Reid adjusted profile likelihood.

    This is a low-level function called by :func:`estimateGLMCommonDisp`.

    Estimation is done by maximizing the Cox-Reid adjusted profile likelihood
    (Cox and Reid, 1987 [1]_), through :func:`scipy.optimize.minimize_scalar`.

    Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed that
    the Pearson (pseudo-likelihood) estimator typically under-estimates the true
    dispersion. It can be seriously biased when the number of libraries is small.
    On the other hand, the deviance (quasi-likelihood) estimator typically
    over-estimates the true dispersion when the number of libraries is small.
    Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed the
    Cox-Reid estimator to be the least biased of the three options.

    Arguments
    ---------
    y : matrix
        matrix of counts. A GLM is fitted to each row
    design : matrix, optional
        design matrix, as in :func:`glmFit`
    offset : array_like, optional
        vector or matrix of offsets for the log-linear models, as in
        :func:`glmFit`. Defaults to :code:`log(colSums(y))`.
    weights : matrix, optional
        observation weights
    AveLogCPM : array_like, optional
        vector giving average log2 counts per million
    interval : tuple, optional
        pair giving minimum and maximum allowed values for the dispersion,
        passed to :func:`scipy.optimize.minimize_scalar`
    tol : float, optional
        the desired accuracy, see :func:`scipy.optimize.minimize_scalar`
    min_row_sum : int, optional
        only rows with at least this number of counts are used
    subset : int, optional
        number of rows to use in the calculation. Rows used are chosen evenly
        spaced by :code:`AveLogCPM`.

    Returns
    -------
    float
        the estimated common dispersion

    References
    ----------
    .. [1] D. R. Cox, N. Reid. 1987. Parameter orthogonality and approximate
       conditional inference. Journal of the Royal Statistical Society Series B
       49, 1-39.
    .. [2] M. D. Robinson, G. K. Smyth. 2008. Small-sample estimation of
       negative binomial dispersion, with applications to SAGE data.
       Biostatistics 9, 321-332. :doi:`10.1093/biostatistics/kxm030`
    .. [3] D. J. McCarthy, Y. Chen, G. K. Smyth. 2012. Differential expression
       analysis of multifactor RNA-Seq experiments with respect to biological
       variation. Nucleic Acids Research 40, 4288-4297. :doi:`10.1093/nar/gks042`
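
    Examples
    --------
    A minimal, illustrative sketch on simulated counts (the counts, group
    labels and seed below are arbitrary; real analyses pass the experiment's
    count matrix and design matrix):

    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> counts = rng.negative_binomial(n=5, p=0.1, size=(100, 4))
    >>> design = np.array([[1, 0], [1, 0], [1, 1], [1, 1]])
    >>> disp = dispCoxReid(counts, design=design)  # common dispersion estimate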
    """
    # Check y
    y = np.asarray(y)

    # Check design
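    # with no design given, an intercept-only model (a single column of ones)
    # is fitted to each row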
    if design is None:
        design = np.ones((y.shape[1], 1))
    else:
        design = np.asarray(design)

    # Check offset
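    # default offsets are the log library sizes (log column sums of y); a 1-d
    # offset vector is broadcast to one value per observation so that it has
    # the same shape as y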
    if offset is None:
        offset = np.log(y.sum(axis=0))
    if offset.ndim == 1:
        offset = np.broadcast_to(offset, y.shape)
    assert offset.shape == y.shape

    if interval[0] < 0:
        raise ValueError("please give a non-negative interval for the dispersion")

    if AveLogCPM is not None:
        AveLogCPM = np.asarray(AveLogCPM)

    # Apply min row count
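    # rows with fewer than min_row_sum total counts are dropped; the matching
    # rows of offset, weights and AveLogCPM are dropped as well to keep all
    # objects aligned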
    small_row_sum = y.sum(axis=1) < min_row_sum
    if small_row_sum.any():
        y = y[np.logical_not(small_row_sum)]
        offset = offset[np.logical_not(small_row_sum)]
        if weights is not None:
            weights = weights[np.logical_not(small_row_sum)]
        if AveLogCPM is not None:
            AveLogCPM = AveLogCPM[np.logical_not(small_row_sum)]
    if y.shape[0] < 1:
        raise ValueError("no data rows with required number of counts")

    # Subsetting
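    # to save time, the estimation may be run on a systematic subset of rows,
    # chosen evenly spaced by AveLogCPM; only done when a subset size is given
    # and it is at most half the number of remaining rows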
    if subset is not None and subset <= y.shape[0] / 2:
        if AveLogCPM is None:
            AveLogCPM = aveLogCPM(y, offset=offset, weights=weights)
        i = systematicSubset(subset, AveLogCPM)
        y = y[i, :]
        offset = offset[i, :]
        if weights is not None:
            weights = weights[i, :]

    # Function for optimizing
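    # the objective is the negative sum, over the retained rows, of the
    # Cox-Reid adjusted profile log-likelihoods evaluated at a single common
    # dispersion; it is minimized over delta = dispersion**0.25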
    def sumAPL(par, y, design, offset, weights):
        return -sum(adjustedProfileLik(par**4, y, design, offset, weights=weights))

    # anticipate the calls to _compress* in adjustedProfileLik
    y = np.asarray(y)
    offset = _compressOffsets(y, offset=offset)
    weights = _compressWeights(y, weights)
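    # the search interval is mapped to the delta = dispersion**0.25 scale used
    # by sumAPL, and the minimizer is mapped back to a dispersion with **4 below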
    out = minimize_scalar(
        sumAPL,
        args=(y, design, offset, weights),
        bounds=(interval[0] ** 0.25, interval[1] ** 0.25),
        options={"xatol": tol},
    )
    return out.x**4