[ea0fd6]: / inmoose / edgepy / dispCoxReid.py

Download this file

159 lines (135 with data), 6.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -----------------------------------------------------------------------------
# Copyright (C) 2008-2022 Yunshun Chen, Aaron TL Lun, Davis J McCarthy, Matthew E Ritchie, Belinda Phipson, Yifang Hu, Xiaobei Zhou, Mark D Robinson, Gordon K Smyth
# Copyright (C) 2022-2023 Maximilien Colange
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------
# This file is based on the file 'R/dispCoxReid.R' of the Bioconductor edgeR package (version 3.38.4).
import numpy as np
from scipy.optimize import minimize_scalar
from .adjustedProfileLik import adjustedProfileLik
from .aveLogCPM import aveLogCPM
from .makeCompressedMatrix import _compressOffsets, _compressWeights
from .systematicSubset import systematicSubset
def dispCoxReid(
y,
design=None,
offset=None,
weights=None,
AveLogCPM=None,
interval=(0, 4),
tol=1e-5,
min_row_sum=5,
subset=10000,
):
"""
Estimate a common dispersion parameter across multiple negative binomial
GLMs, by maximizing the Cox-Reid adjusted profile likelihood.
This is a low-level function called by :func:`estimateGLMCommonDisp`.
Estimation is done by maximizing the Cox-Reid adjusted profile likelihood
(Cox and Reid, 1987 [1]_), through :func:`scipy.optimize.minimize_scalar`.
Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed that
the Pearson (pseudo-likelihood) estimator typically under-estimates the true
dispersion. It can be seriously biased when the number of libraries is small.
On the other hand, the deviance (quasi-likelihood) estimator typically
over-estimates the true dispersion when the number of libraries is small.
Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed the
Cox-Reid estimator to be the least biased of the three options.
Arguments
---------
y : matrix
matrix of counts. A GLM is fitted to each row
design : matrix, optional
design matrix, as in :func:`glmFit`
offset : array_like, optional
vector or matrix of offsets for the log-linear models, as in
:func:`glmFit`. Defaults to :code:`log(colSums(y))`.
weights : matrix, optional
observation weights
AveLogCPM : array_like, optional
vector giving average log2 counts per million
interval : tuple, optional
pair giving minimum and maximum allowed values for the dispersion,
passed to :func:`scipy.optimize.minimize_scalar`
tol : float, optional
the desired accuracy, see :func:`scipy.optimize.minimize_scalar`
min_row_sum : int, optional
only rows with at least this number of counts are used
subset : int, optional
number of rows to use in the calculation. Rows used are chosen evenly
space by :code:`AveLogCPM`.
Returns
-------
float
the estimated common dispersion
References
----------
.. [1] D. R. Cox, N. Reid. 1987. Parameter orthogonality and approximate
conditional inference. Journal of the Royal Statistical Society Series B
49, 1-39.
.. [2] M. D. Robinson, G. K. Smyth. 2008. Small-sample estimation of
negative binomial dispersion, with applications to SAGE data.
Biostatistics 9, 321-332. :doi:`10.1093/biostatistics/kxm030`
.. [3] D. J. McCarthy, Y. Chen, G. K. Smyth. 2012. Differential expression
analysis of multifactor RNA-Seq experiments with respect to biological
variation. Nucleic Acids Research 40, 4288-4297. :doi:`10.1093/nar/gks042`
"""
# Check y
y = np.asarray(y)
# Check design
if design is None:
design = np.ones((y.shape[1], 1))
else:
design = np.asarray(design)
# Check offset
if offset is None:
offset = np.log(y.sum(axis=0))
if offset.ndim == 1:
offset = np.broadcast_to(offset, y.shape)
assert offset.shape == y.shape
if interval[0] < 0:
raise ValueError("please give a non-negative interval for the dispersion")
if AveLogCPM is not None:
AveLogCPM = np.asarray(AveLogCPM)
# Apply min row count
small_row_sum = y.sum(axis=1) < min_row_sum
if small_row_sum.any():
y = y[np.logical_not(small_row_sum)]
offset = offset[np.logical_not(small_row_sum)]
if weights is not None:
weights = weights[np.logical_not(small_row_sum)]
if AveLogCPM is not None:
AveLogCPM = AveLogCPM[np.logical_not(small_row_sum)]
if y.shape[0] < 1:
raise ValueError("no data rows with required number of counts")
# Subsetting
if subset is not None and subset <= y.shape[0] / 2:
if AveLogCPM is None:
AveLogCPM = aveLogCPM(y, offset=offset, weights=weights)
i = systematicSubset(subset, AveLogCPM)
y = y[i, :]
offset = offset[i, :]
if weights is not None:
weights = weights[i, :]
# Function for optimizing
def sumAPL(par, y, design, offset, weights):
return -sum(adjustedProfileLik(par**4, y, design, offset, weights=weights))
# anticipate the calls to _compress* in adjustedProfileLik
y = np.asarray(y)
offset = _compressOffsets(y, offset=offset)
weights = _compressWeights(y, weights)
out = minimize_scalar(
sumAPL,
args=(y, design, offset, weights),
bounds=(interval[0] ** 0.25, interval[1] ** 0.25),
options={"xatol": tol},
)
return out.x**4