inmoose / Git / Diff of /inmoose/edgepy/dispCoxReid.py

Models:
AlyssaS/
inmoose
Downloads: 1
Diff of /inmoose/edgepy/dispCoxReid.py [000000] .. [ea0fd6]
Switch to side-by-side view

--- a
+++ b/inmoose/edgepy/dispCoxReid.py
@@ -0,0 +1,158 @@
+# -----------------------------------------------------------------------------
+# Copyright (C) 2008-2022 Yunshun Chen, Aaron TL Lun, Davis J McCarthy, Matthew E Ritchie, Belinda Phipson, Yifang Hu, Xiaobei Zhou, Mark D Robinson, Gordon K Smyth
+# Copyright (C) 2022-2023 Maximilien Colange
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------
+
+# This file is based on the file 'R/dispCoxReid.R' of the Bioconductor edgeR package (version 3.38.4).
+
+
+import numpy as np
+from scipy.optimize import minimize_scalar
+
+from .adjustedProfileLik import adjustedProfileLik
+from .aveLogCPM import aveLogCPM
+from .makeCompressedMatrix import _compressOffsets, _compressWeights
+from .systematicSubset import systematicSubset
+
+
+def dispCoxReid(
+    y,
+    design=None,
+    offset=None,
+    weights=None,
+    AveLogCPM=None,
+    interval=(0, 4),
+    tol=1e-5,
+    min_row_sum=5,
+    subset=10000,
+):
+    """
+    Estimate a common dispersion parameter across multiple negative binomial
+    GLMs, by maximizing the Cox-Reid adjusted profile likelihood.
+
+    This is a low-level function called by :func:`estimateGLMCommonDisp`.
+
+    Estimation is done by maximizing the Cox-Reid adjusted profile likelihood
+    (Cox and Reid, 1987 [1]_), through :func:`scipy.optimize.minimize_scalar`.
+
+    Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed that
+    the Pearson (pseudo-likelihood) estimator typically under-estimates the true
+    dispersion. It can be seriously biased when the number of libraries is small.
+    On the other hand, the deviance (quasi-likelihood) estimator typically
+    over-estimates the true dispersion when the number of libraries is small.
+    Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed the
+    Cox-Reid estimator to be the least biased of the three options.
+
+    Arguments
+    ---------
+    y : matrix
+        matrix of counts. A GLM is fitted to each row
+    design : matrix, optional
+        design matrix, as in :func:`glmFit`
+    offset : array_like, optional
+        vector or matrix of offsets for the log-linear models, as in
+        :func:`glmFit`. Defaults to :code:`log(colSums(y))`.
+    weights : matrix, optional
+        observation weights
+    AveLogCPM : array_like, optional
+        vector giving average log2 counts per million
+    interval : tuple, optional
+        pair giving minimum and maximum allowed values for the dispersion,
+        passed to :func:`scipy.optimize.minimize_scalar`
+    tol : float, optional
+        the desired accuracy, see :func:`scipy.optimize.minimize_scalar`
+    min_row_sum : int, optional
+        only rows with at least this number of counts are used
+    subset : int, optional
+        number of rows to use in the calculation. Rows used are chosen evenly
+        space by :code:`AveLogCPM`.
+
+    Returns
+    -------
+    float
+        the estimated common dispersion
+
+    References
+    ----------
+    .. [1] D. R. Cox, N. Reid. 1987. Parameter orthogonality and approximate
+       conditional inference. Journal of the Royal Statistical Society Series B
+       49, 1-39.
+    .. [2] M. D. Robinson, G. K. Smyth. 2008. Small-sample estimation of
+       negative binomial dispersion, with applications to SAGE data.
+       Biostatistics 9, 321-332. :doi:`10.1093/biostatistics/kxm030`
+    .. [3] D. J. McCarthy, Y. Chen, G. K. Smyth. 2012. Differential expression
+       analysis of multifactor RNA-Seq experiments with respect to biological
+       variation. Nucleic Acids Research 40, 4288-4297. :doi:`10.1093/nar/gks042`
+    """
+    # Check y
+    y = np.asarray(y)
+
+    # Check design
+    if design is None:
+        design = np.ones((y.shape[1], 1))
+    else:
+        design = np.asarray(design)
+
+    # Check offset
+    if offset is None:
+        offset = np.log(y.sum(axis=0))
+    if offset.ndim == 1:
+        offset = np.broadcast_to(offset, y.shape)
+    assert offset.shape == y.shape
+
+    if interval[0] < 0:
+        raise ValueError("please give a non-negative interval for the dispersion")
+
+    if AveLogCPM is not None:
+        AveLogCPM = np.asarray(AveLogCPM)
+
+    # Apply min row count
+    small_row_sum = y.sum(axis=1) < min_row_sum
+    if small_row_sum.any():
+        y = y[np.logical_not(small_row_sum)]
+        offset = offset[np.logical_not(small_row_sum)]
+        if weights is not None:
+            weights = weights[np.logical_not(small_row_sum)]
+        if AveLogCPM is not None:
+            AveLogCPM = AveLogCPM[np.logical_not(small_row_sum)]
+    if y.shape[0] < 1:
+        raise ValueError("no data rows with required number of counts")
+
+    # Subsetting
+    if subset is not None and subset <= y.shape[0] / 2:
+        if AveLogCPM is None:
+            AveLogCPM = aveLogCPM(y, offset=offset, weights=weights)
+        i = systematicSubset(subset, AveLogCPM)
+        y = y[i, :]
+        offset = offset[i, :]
+        if weights is not None:
+            weights = weights[i, :]
+
+    # Function for optimizing
+    def sumAPL(par, y, design, offset, weights):
+        return -sum(adjustedProfileLik(par**4, y, design, offset, weights=weights))
+
+    # anticipate the calls to _compress* in adjustedProfileLik
+    y = np.asarray(y)
+    offset = _compressOffsets(y, offset=offset)
+    weights = _compressWeights(y, weights)
+    out = minimize_scalar(
+        sumAPL,
+        args=(y, design, offset, weights),
+        bounds=(interval[0] ** 0.25, interval[1] ** 0.25),
+        options={"xatol": tol},
+    )
+    return out.x**4