|
a |
|
b/inmoose/edgepy/dispCoxReid.py |
|
|
1 |
# ----------------------------------------------------------------------------- |
|
|
2 |
# Copyright (C) 2008-2022 Yunshun Chen, Aaron TL Lun, Davis J McCarthy, Matthew E Ritchie, Belinda Phipson, Yifang Hu, Xiaobei Zhou, Mark D Robinson, Gordon K Smyth |
|
|
3 |
# Copyright (C) 2022-2023 Maximilien Colange |
|
|
4 |
|
|
|
5 |
# This program is free software: you can redistribute it and/or modify |
|
|
6 |
# it under the terms of the GNU General Public License as published by |
|
|
7 |
# the Free Software Foundation, either version 3 of the License, or |
|
|
8 |
# (at your option) any later version. |
|
|
9 |
|
|
|
10 |
# This program is distributed in the hope that it will be useful, |
|
|
11 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
|
12 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
|
13 |
# GNU General Public License for more details. |
|
|
14 |
|
|
|
15 |
# You should have received a copy of the GNU General Public License |
|
|
16 |
# along with this program. If not, see <https://www.gnu.org/licenses/>. |
|
|
17 |
# ----------------------------------------------------------------------------- |
|
|
18 |
|
|
|
19 |
# This file is based on the file 'R/dispCoxReid.R' of the Bioconductor edgeR package (version 3.38.4). |
|
|
20 |
|
|
|
21 |
|
|
|
22 |
import numpy as np |
|
|
23 |
from scipy.optimize import minimize_scalar |
|
|
24 |
|
|
|
25 |
from .adjustedProfileLik import adjustedProfileLik |
|
|
26 |
from .aveLogCPM import aveLogCPM |
|
|
27 |
from .makeCompressedMatrix import _compressOffsets, _compressWeights |
|
|
28 |
from .systematicSubset import systematicSubset |
|
|
29 |
|
|
|
30 |
|
|
|
31 |
def dispCoxReid( |
|
|
32 |
y, |
|
|
33 |
design=None, |
|
|
34 |
offset=None, |
|
|
35 |
weights=None, |
|
|
36 |
AveLogCPM=None, |
|
|
37 |
interval=(0, 4), |
|
|
38 |
tol=1e-5, |
|
|
39 |
min_row_sum=5, |
|
|
40 |
subset=10000, |
|
|
41 |
): |
|
|
42 |
""" |
|
|
43 |
Estimate a common dispersion parameter across multiple negative binomial |
|
|
44 |
GLMs, by maximizing the Cox-Reid adjusted profile likelihood. |
|
|
45 |
|
|
|
46 |
This is a low-level function called by :func:`estimateGLMCommonDisp`. |
|
|
47 |
|
|
|
48 |
Estimation is done by maximizing the Cox-Reid adjusted profile likelihood |
|
|
49 |
(Cox and Reid, 1987 [1]_), through :func:`scipy.optimize.minimize_scalar`. |
|
|
50 |
|
|
|
51 |
Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed that |
|
|
52 |
the Pearson (pseudo-likelihood) estimator typically under-estimates the true |
|
|
53 |
dispersion. It can be seriously biased when the number of libraries is small. |
|
|
54 |
On the other hand, the deviance (quasi-likelihood) estimator typically |
|
|
55 |
over-estimates the true dispersion when the number of libraries is small. |
|
|
56 |
Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed the |
|
|
57 |
Cox-Reid estimator to be the least biased of the three options. |
|
|
58 |
|
|
|
59 |
Arguments |
|
|
60 |
--------- |
|
|
61 |
y : matrix |
|
|
62 |
matrix of counts. A GLM is fitted to each row |
|
|
63 |
design : matrix, optional |
|
|
64 |
design matrix, as in :func:`glmFit` |
|
|
65 |
offset : array_like, optional |
|
|
66 |
vector or matrix of offsets for the log-linear models, as in |
|
|
67 |
:func:`glmFit`. Defaults to :code:`log(colSums(y))`. |
|
|
68 |
weights : matrix, optional |
|
|
69 |
observation weights |
|
|
70 |
AveLogCPM : array_like, optional |
|
|
71 |
vector giving average log2 counts per million |
|
|
72 |
interval : tuple, optional |
|
|
73 |
pair giving minimum and maximum allowed values for the dispersion, |
|
|
74 |
passed to :func:`scipy.optimize.minimize_scalar` |
|
|
75 |
tol : float, optional |
|
|
76 |
the desired accuracy, see :func:`scipy.optimize.minimize_scalar` |
|
|
77 |
min_row_sum : int, optional |
|
|
78 |
only rows with at least this number of counts are used |
|
|
79 |
subset : int, optional |
|
|
80 |
number of rows to use in the calculation. Rows used are chosen evenly |
|
|
81 |
space by :code:`AveLogCPM`. |
|
|
82 |
|
|
|
83 |
Returns |
|
|
84 |
------- |
|
|
85 |
float |
|
|
86 |
the estimated common dispersion |
|
|
87 |
|
|
|
88 |
References |
|
|
89 |
---------- |
|
|
90 |
.. [1] D. R. Cox, N. Reid. 1987. Parameter orthogonality and approximate |
|
|
91 |
conditional inference. Journal of the Royal Statistical Society Series B |
|
|
92 |
49, 1-39. |
|
|
93 |
.. [2] M. D. Robinson, G. K. Smyth. 2008. Small-sample estimation of |
|
|
94 |
negative binomial dispersion, with applications to SAGE data. |
|
|
95 |
Biostatistics 9, 321-332. :doi:`10.1093/biostatistics/kxm030` |
|
|
96 |
.. [3] D. J. McCarthy, Y. Chen, G. K. Smyth. 2012. Differential expression |
|
|
97 |
analysis of multifactor RNA-Seq experiments with respect to biological |
|
|
98 |
variation. Nucleic Acids Research 40, 4288-4297. :doi:`10.1093/nar/gks042` |
|
|
99 |
""" |
|
|
100 |
# Check y |
|
|
101 |
y = np.asarray(y) |
|
|
102 |
|
|
|
103 |
# Check design |
|
|
104 |
if design is None: |
|
|
105 |
design = np.ones((y.shape[1], 1)) |
|
|
106 |
else: |
|
|
107 |
design = np.asarray(design) |
|
|
108 |
|
|
|
109 |
# Check offset |
|
|
110 |
if offset is None: |
|
|
111 |
offset = np.log(y.sum(axis=0)) |
|
|
112 |
if offset.ndim == 1: |
|
|
113 |
offset = np.broadcast_to(offset, y.shape) |
|
|
114 |
assert offset.shape == y.shape |
|
|
115 |
|
|
|
116 |
if interval[0] < 0: |
|
|
117 |
raise ValueError("please give a non-negative interval for the dispersion") |
|
|
118 |
|
|
|
119 |
if AveLogCPM is not None: |
|
|
120 |
AveLogCPM = np.asarray(AveLogCPM) |
|
|
121 |
|
|
|
122 |
# Apply min row count |
|
|
123 |
small_row_sum = y.sum(axis=1) < min_row_sum |
|
|
124 |
if small_row_sum.any(): |
|
|
125 |
y = y[np.logical_not(small_row_sum)] |
|
|
126 |
offset = offset[np.logical_not(small_row_sum)] |
|
|
127 |
if weights is not None: |
|
|
128 |
weights = weights[np.logical_not(small_row_sum)] |
|
|
129 |
if AveLogCPM is not None: |
|
|
130 |
AveLogCPM = AveLogCPM[np.logical_not(small_row_sum)] |
|
|
131 |
if y.shape[0] < 1: |
|
|
132 |
raise ValueError("no data rows with required number of counts") |
|
|
133 |
|
|
|
134 |
# Subsetting |
|
|
135 |
if subset is not None and subset <= y.shape[0] / 2: |
|
|
136 |
if AveLogCPM is None: |
|
|
137 |
AveLogCPM = aveLogCPM(y, offset=offset, weights=weights) |
|
|
138 |
i = systematicSubset(subset, AveLogCPM) |
|
|
139 |
y = y[i, :] |
|
|
140 |
offset = offset[i, :] |
|
|
141 |
if weights is not None: |
|
|
142 |
weights = weights[i, :] |
|
|
143 |
|
|
|
144 |
# Function for optimizing |
|
|
145 |
def sumAPL(par, y, design, offset, weights): |
|
|
146 |
return -sum(adjustedProfileLik(par**4, y, design, offset, weights=weights)) |
|
|
147 |
|
|
|
148 |
# anticipate the calls to _compress* in adjustedProfileLik |
|
|
149 |
y = np.asarray(y) |
|
|
150 |
offset = _compressOffsets(y, offset=offset) |
|
|
151 |
weights = _compressWeights(y, weights) |
|
|
152 |
out = minimize_scalar( |
|
|
153 |
sumAPL, |
|
|
154 |
args=(y, design, offset, weights), |
|
|
155 |
bounds=(interval[0] ** 0.25, interval[1] ** 0.25), |
|
|
156 |
options={"xatol": tol}, |
|
|
157 |
) |
|
|
158 |
return out.x**4 |