Switch to unified view

a b/inmoose/limma/squeezeVar.py
1
# -----------------------------------------------------------------------------
2
# Copyright (C) 2004-2022 Gordon Smyth, Yifang Hu, Matthew Ritchie, Jeremy Silver, James Wettenhall, Davis McCarthy, Di Wu, Wei Shi, Belinda Phipson, Aaron Lun, Natalie Thorne, Alicia Oshlack, Carolyn de Graaf, Yunshun Chen, Mette Langaas, Egil Ferkingstad, Marcus Davy, Francois Pepin, Dongseok Choi
3
# Copyright (C) 2024 Maximilien Colange
4
5
# This program is free software: you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation, either version 3 of the License, or
8
# (at your option) any later version.
9
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
# GNU General Public License for more details.
14
15
# You should have received a copy of the GNU General Public License
16
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
17
# -----------------------------------------------------------------------------
18
19
# This file is based on the file 'R/squeezeVar.R' of the Bioconductor limma package (version 3.55.1).
20
21
22
import numpy as np
23
24
from .fitFDist import fitFDist
25
26
27
def squeezeVar(var, df, covariate=None, robust=False, winsor_tail_p=(0.05, 0.1)):
28
    """
29
    Squeeze a set of sample variances together by computing empirical Bayes posterior means
30
    This function implements empirical Bayes algorithms proposed by
31
    [Smyth2004]_ and [Phipson2016]_.
32
33
    A conjugate Bayesian hierarchical model is assumed for a set of sample
34
    variances. The hyperparameters are estimated by fitting a scaled
35
    F-distribution to the sample variances. The function returns the posterior
36
    variances and the estimated hyperparameters.
37
38
    Specifically, the sample variances :code:`var` are assumed to follow scaled
39
    chi-squared distributions, conditional on the true variances, and a scaled
40
    inverse chi-squared prior is assumed for the true variances.  The scale and
41
    degrees of freedom of this prior distribution are estimated from the values
42
    of :code:`var`.
43
44
    The effect of this function is to squeeze the variances towards a common
45
    value, or to a global trend if a :code:`covariate` is provided. The
46
    squeezed variances have a smaller expected mean square error to the true
47
    variances than do the sample variances themselves.
48
49
    If :code:`covariate` is not :code:`None`, then the scale parameter of the
50
    prior distribution is assumed to depend on the covariate. If the covariate
51
    is average log-expression, then the effect is an intensity-dependent trend
52
    similar to that in [Sartor2006]_.
53
54
    :code:`robust=True` implements the robust empirical Bayes procedure of
55
    [Phipson2016]_ which allows some of the :code:`var` values to be outliers.
56
57
    Arguments
58
    ---------
59
    var : array_like
60
        1-D array of independent sample variances
61
    df : array_like
62
        1-D array of degrees of freedom for the sample variances
63
    covariate : ??
64
        if not :code:`None`, :code:`var_prior` will depend on this numeric
65
        covariate. Otherwise, :code:`var_prior` is constant.
66
    robust : bool
67
        whether the estimation of :code:`df_prior` and :code:`var_prior` be
68
        robustified against outlier sample variances
69
    winsor_tail_p : float or pair of floats
70
        left and right tail proportions of :code:`x` to Winsorize. Only used
71
        when :code:`robust=True`
72
73
    Returns
74
    -------
75
    dict
76
        a dictionnary with keys:
77
78
        - :code:`"var_post"`, 1-D array of posterior variances. Same length as
79
          :code:`var`.
80
        - :code:`"var_prior"`, location or scale of prior distribution. 1-D
81
          array of same length as :code:`var` if :code:`covariate` is not
82
          :code:`None`, otherwise a single value.
83
        - :code:`"df_prior"`, degrees of freedom of prior distribution. 1-D
84
          array of same length as :code:`var` if :code:`robust=True`, otherwise
85
          a single value.
86
    """
87
    n = len(var)
88
89
    # Degenerate special cases
90
    if n == 0:
91
        raise ValueError("var is empty")
92
    if n == 1:
93
        return {"var_post": var, "var_prior": var, "df_prior": 0}
94
95
    # When df==0, guard against missing or infinite values in var
96
    df = np.broadcast_to(df, var.shape)
97
    var[df == 0] = 0
98
99
    # Estimate hyperparameters
100
    if robust:
101
        raise NotImplementedError("Robust estimation in squeezeVar is not implemented")
102
    else:
103
        fit = fitFDist(var, df1=df, covariate=covariate)
104
        df_prior = fit["df2"]
105
    if np.isnan(df_prior).any():
106
        raise RuntimeError("Could not estimate prior df")
107
108
    # Posterior variances
109
    var_post = _squeezeVar(var=var, df=df, var_prior=fit["scale"], df_prior=df_prior)
110
111
    return {"var_post": var_post, "var_prior": fit["scale"], "df_prior": df_prior}
112
113
114
def _squeezeVar(var, df, var_prior, df_prior):
115
    """
116
    Squeeze posterior variances given hyperparameters
117
118
    NAs not allowed in :code:`df_prior`
119
120
    Arguments
121
    ---------
122
    var : array_like
123
        1-D array of independent sample variances
124
    df : array_like
125
        1-D array of degrees of freedom for the sample variances
126
    var_prior : array_like
127
        array of prior variances
128
    df_prior :
129
        array of degrees of freedom for the prior variances
130
131
    Returns
132
    -------
133
    ndarray
134
        array of posterior variances
135
    """
136
    df = np.broadcast_to(df, var.shape)
137
    var_prior = np.broadcast_to(var_prior, var.shape)
138
    df_prior = np.broadcast_to(df_prior, var.shape)
139
140
    isfin = np.isfinite(df_prior)
141
    var_post = np.zeros(var.shape)
142
143
    # For infinite df_prior, return var_prior
144
    var_post[~isfin] = var_prior[~isfin]
145
146
    var_post[isfin] = (df[isfin] * var[isfin] + df_prior[isfin] * var_prior[isfin]) / (
147
        df[isfin] + df_prior[isfin]
148
    )
149
150
    return var_post