Switch to unified view

a b/third_party/nucleus/util/math.h
1
/*
2
 * Copyright 2018 Google LLC.
3
 *
4
 * Redistribution and use in source and binary forms, with or without
5
 * modification, are permitted provided that the following conditions
6
 * are met:
7
 *
8
 * 1. Redistributions of source code must retain the above copyright notice,
9
 *    this list of conditions and the following disclaimer.
10
 *
11
 * 2. Redistributions in binary form must reproduce the above copyright
12
 *    notice, this list of conditions and the following disclaimer in the
13
 *    documentation and/or other materials provided with the distribution.
14
 *
15
 * 3. Neither the name of the copyright holder nor the names of its
16
 *    contributors may be used to endorse or promote products derived from this
17
 *    software without specific prior written permission.
18
 *
19
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
 * POSSIBILITY OF SUCH DAMAGE.
30
 *
31
 */
32
33
// Core mathematical routines.
34
//
35
// A quick note on terminology here.
36
//
37
// There are a bunch kinds of probabilities used commonly in genomics:
38
//
39
// -- pError: the probability of being wrong.
40
// -- pTrue: the probability of being correct.
41
//
42
// Normalized probabilities vs. unnormalized likelihoods:
43
//
44
// -- Normalized probabilities: p_1, ..., p_n such that sum(p_i) == 1 are said
45
//    said to be normalized because they represent a valid probability
46
//    distribution over the states 1 ... n.
47
// -- Unnormalized likelihoods: p_1, ..., p_n where sum(p_i) != 1. These are not
48
//    normalized and so aren't a valid probabilities distribution.
49
//
50
// To add even more complexity, probabilities are often represented in three
51
// semi-equivalent spaces:
52
//
53
// -- Real-space: the classic space, with values ranging from [0.0, 1.0]
54
//    inclusive.
55
// -- log10-space: If p is the real-space value, in log10-space this would be
56
//    represented as log10(p). How the p == 0 case is handled is often function
57
//    dependent, which may accept/return -Inf or not handle the case entirely.
58
// -- Phred-scaled: See https://en.wikipedia.org/wiki/Phred_quality_score for
59
//    more information. But briefly, the Phred-scale maintains resolution in the
60
//    lower parts of the probability space using integer quality scores (though
61
//    using ints is optional, really). The phred-scale is defined as
62
//
63
//      `phred(p) = -10 * log10(p)`
64
//
65
//    where p is a real-space probability.
66
//
67
// The code in math.h dealing with probabilities is very explicit about what
68
// kind probability and representation is expects and returns, as unfortunately
69
// these are all represented commonly as doubles in C++. Though tempting to
70
// address this issue with classic software engineering practices like creating
71
// a Probability class, in practice this is extremely difficult to do as this
72
// code is often performance critical and the low-level mathematical operations
73
// used in this code (e.g., log10) don't distiguish themselves among the types
74
// of probabilities.
75
#ifndef THIRD_PARTY_NUCLEUS_UTIL_MATH_H_
76
#define THIRD_PARTY_NUCLEUS_UTIL_MATH_H_
77
78
#include <vector>
79
80
namespace nucleus {
81
82
// Converts Phred scale to probability scale. Phred value must be >= 0.
83
double PhredToPError(int phred);
84
85
// Converts Phred scale to log10 scale. Phred value must be >= 0.
86
double PhredToLog10PError(int phred);
87
88
// Converts Phred scale to pError probability scale.
89
// Note: There is no Phred Scale equivalent for PError = 0 (would be
90
// infinity), so this function does not accept PError == 0.
91
double PErrorToPhred(double perror);
92
int PErrorToRoundedPhred(double perror);
93
94
// Converts probability space to Log10 space.
95
// Note: There is no Phred Scale equivalent for probability = 0 (would be
96
// infinity), so this function does not accept probability == 0.
97
double PErrorToLog10PError(double perror);
98
99
// Converts Log10 scale to Phred scale.
100
double Log10PErrorToPhred(double log10_perror);
101
int Log10PErrorToRoundedPhred(double log10_perror);
102
103
// Converts a Log10(ptrue) value into a phred-scaled value of 1 - 10^log10p.
104
//
105
// This operation is common when you've got a probability of an event occurring,
106
// p, and you want to emit the Phred-equivalent of it being wrong, which is
107
// -10 * log10(1 - p). The operation 1 - p can easily underflow, causing the us
108
// to evaluate log10(0), leading to an infinite value. In that case, the
109
// function returns value_if_not_finite.
110
double Log10PTrueToPhred(double log10_ptrue, double value_if_not_finite);
111
112
// Converts Log10 scale to real scale.
113
double Log10ToReal(double log10_probability);
114
115
// Takes the maximum value (remember, likelihoods are in log10 space and are all
116
// negative values) and subtract it from all genotype likelihoods so that the
117
// most likely likelihood is 0. This gives a bit more resolution in the
118
// conversion.
119
std::vector<double> ZeroShiftLikelihoods(
120
    const std::vector<double>& likelihoods);
121
122
}  // namespace nucleus
123
124
#endif  // THIRD_PARTY_NUCLEUS_UTIL_MATH_H_