inmoose / Git / [ea0fd6] /inmoose/edgepy/dispCoxReid.py

Models:
AlyssaS/
inmoose
Downloads: 1
[ea0fd6]: / inmoose / edgepy / dispCoxReid.py
History
Download this file
159 lines (135 with data), 6.1 kB

# -----------------------------------------------------------------------------
# Copyright (C) 2008-2022 Yunshun Chen, Aaron TL Lun, Davis J McCarthy, Matthew E Ritchie, Belinda Phipson, Yifang Hu, Xiaobei Zhou, Mark D Robinson, Gordon K Smyth
# Copyright (C) 2022-2023 Maximilien Colange

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------

# This file is based on the file 'R/dispCoxReid.R' of the Bioconductor edgeR package (version 3.38.4).


import numpy as np
from scipy.optimize import minimize_scalar

from .adjustedProfileLik import adjustedProfileLik
from .aveLogCPM import aveLogCPM
from .makeCompressedMatrix import _compressOffsets, _compressWeights
from .systematicSubset import systematicSubset


def dispCoxReid(
    y,
    design=None,
    offset=None,
    weights=None,
    AveLogCPM=None,
    interval=(0, 4),
    tol=1e-5,
    min_row_sum=5,
    subset=10000,
):
    """
    Estimate a common dispersion parameter across multiple negative binomial
    GLMs, by maximizing the Cox-Reid adjusted profile likelihood.

    This is a low-level function called by :func:`estimateGLMCommonDisp`.

    Estimation is done by maximizing the Cox-Reid adjusted profile likelihood
    (Cox and Reid, 1987 [1]_), through :func:`scipy.optimize.minimize_scalar`.

    Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed that
    the Pearson (pseudo-likelihood) estimator typically under-estimates the true
    dispersion. It can be seriously biased when the number of libraries is small.
    On the other hand, the deviance (quasi-likelihood) estimator typically
    over-estimates the true dispersion when the number of libraries is small.
    Robinson and Smyth (2008) [2]_ and McCarthy et al. (2012) [3]_ showed the
    Cox-Reid estimator to be the least biased of the three options.

    Arguments
    ---------
    y : matrix
        matrix of counts. A GLM is fitted to each row
    design : matrix, optional
        design matrix, as in :func:`glmFit`
    offset : array_like, optional
        vector or matrix of offsets for the log-linear models, as in
        :func:`glmFit`. Defaults to :code:`log(colSums(y))`.
    weights : matrix, optional
        observation weights
    AveLogCPM : array_like, optional
        vector giving average log2 counts per million
    interval : tuple, optional
        pair giving minimum and maximum allowed values for the dispersion,
        passed to :func:`scipy.optimize.minimize_scalar`
    tol : float, optional
        the desired accuracy, see :func:`scipy.optimize.minimize_scalar`
    min_row_sum : int, optional
        only rows with at least this number of counts are used
    subset : int, optional
        number of rows to use in the calculation. Rows used are chosen evenly
        space by :code:`AveLogCPM`.

    Returns
    -------
    float
        the estimated common dispersion

    References
    ----------
    .. [1] D. R. Cox, N. Reid. 1987. Parameter orthogonality and approximate
       conditional inference. Journal of the Royal Statistical Society Series B
       49, 1-39.
    .. [2] M. D. Robinson, G. K. Smyth. 2008. Small-sample estimation of
       negative binomial dispersion, with applications to SAGE data.
       Biostatistics 9, 321-332. :doi:`10.1093/biostatistics/kxm030`
    .. [3] D. J. McCarthy, Y. Chen, G. K. Smyth. 2012. Differential expression
       analysis of multifactor RNA-Seq experiments with respect to biological
       variation. Nucleic Acids Research 40, 4288-4297. :doi:`10.1093/nar/gks042`
    """
    # Check y
    y = np.asarray(y)

    # Check design
    if design is None:
        design = np.ones((y.shape[1], 1))
    else:
        design = np.asarray(design)

    # Check offset
    if offset is None:
        offset = np.log(y.sum(axis=0))
    if offset.ndim == 1:
        offset = np.broadcast_to(offset, y.shape)
    assert offset.shape == y.shape

    if interval[0] < 0:
        raise ValueError("please give a non-negative interval for the dispersion")

    if AveLogCPM is not None:
        AveLogCPM = np.asarray(AveLogCPM)

    # Apply min row count
    small_row_sum = y.sum(axis=1) < min_row_sum
    if small_row_sum.any():
        y = y[np.logical_not(small_row_sum)]
        offset = offset[np.logical_not(small_row_sum)]
        if weights is not None:
            weights = weights[np.logical_not(small_row_sum)]
        if AveLogCPM is not None:
            AveLogCPM = AveLogCPM[np.logical_not(small_row_sum)]
    if y.shape[0] < 1:
        raise ValueError("no data rows with required number of counts")

    # Subsetting
    if subset is not None and subset <= y.shape[0] / 2:
        if AveLogCPM is None:
            AveLogCPM = aveLogCPM(y, offset=offset, weights=weights)
        i = systematicSubset(subset, AveLogCPM)
        y = y[i, :]
        offset = offset[i, :]
        if weights is not None:
            weights = weights[i, :]

    # Function for optimizing
    def sumAPL(par, y, design, offset, weights):
        return -sum(adjustedProfileLik(par**4, y, design, offset, weights=weights))

    # anticipate the calls to _compress* in adjustedProfileLik
    y = np.asarray(y)
    offset = _compressOffsets(y, offset=offset)
    weights = _compressWeights(y, weights)
    out = minimize_scalar(
        sumAPL,
        args=(y, design, offset, weights),
        bounds=(interval[0] ** 0.25, interval[1] ** 0.25),
        options={"xatol": tol},
    )
    return out.x**4