[d01132]: /bin/entropy.py

"""
Calculate the entropy of the given h5ad
"""
import os
import sys
import argparse
import logging
import numpy as np
from entropy_estimators import continuous as ce
from pyitlib import discrete_random_variable as drv
import scipy.stats
import anndata as ad
SRC_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "babel",
)
assert os.path.isdir(SRC_DIR)
sys.path.append(SRC_DIR)
import utils
logging.basicConfig(level=logging.INFO)
"""
NOTES
Estimator for continuous variables
https://github.com/paulbrodersen/entropy_estimators
- This estimator is NOT symmetric
>>> x = np.random.randn(3000, 30000)
>>> x
array([[ 1.01757666,  0.14706194,  0.17207894, ..., -0.5776106 ,
         1.27110965, -0.80688082],
       [-0.46566731, -1.65503883,  0.34362236, ..., -0.56790773,
         1.58161324,  0.6875425 ],
       [ 0.21598618,  0.15462247, -0.66670242, ..., -1.28547741,
        -0.1731192 ,  0.19815154],
       ...,
       [ 0.30699781,  0.24104934,  0.30279376, ...,  1.95658979,
         0.78125961,  0.26259683],
       [-1.94023222, -0.79838041, -0.10267371, ..., -0.67825156,
         0.75047044,  0.773398  ],
       [ 0.73951081,  0.3485434 , -0.17277407, ..., -0.32622845,
        -0.59264903,  1.27659335]])
>>> x.shape
(3000, 30000)
>>> h = continuous.get_h(x)
>>> h
69901.37779787864
>>> h = continuous.get_h(x.T)
>>> h
6346.646780095286
(Simple) estimator for discrete variables
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html
For a binary variable, we can calculate (base e) entropy in several ways.
We can specify a torch distribution and get entropy per dimension, or we can
ask scipy to calculate it from an input of unnormalized probs.
Both give the same result.
>>> b = torch.distributions.bernoulli.Bernoulli(torch.tensor([0.1, 0.9, 0.00001, 0.5]))
>>> b.entropy()
tensor([3.2508e-01, 3.2508e-01, 1.2541e-04, 6.9315e-01])
>>> scipy.stats.entropy([0.1, 0.9])
0.3250829733914482
>>> scipy.stats.entropy([1, 1]) # scipy normalizes the input probs
0.6931471805599453
Another estimator for discrete variables
https://github.com/pafoster/pyitlib
- This supports calculation of joint entropy
>>> x
array([[1, 1, 1, 0],
       [0, 0, 0, 1]])
>>> drv.entropy_joint(x)
0.8112781244591328
>>> drv.entropy_joint(x.T)
1.0
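These values follow directly from the definition H = -sum(p * log(p)):
- binary case: -0.1 * ln(0.1) - 0.9 * ln(0.9) ~= 0.32508, matching the torch and
  scipy outputs above
- joint case: realisations are indexed by the last axis, so the joint realisations
  of x are its columns (1,0), (1,0), (1,0), (0,1), i.e. probabilities [3/4, 1/4];
  with pyitlib's default base of 2, -(3/4)*log2(3/4) - (1/4)*log2(1/4) ~= 0.81128
- for x.T, the two joint realisations (1,1,1,0) and (0,0,0,1) each have
  probability 1/2, giving exactly 1 bit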
"""
def build_parser():
    """Build CLI parser"""
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("h5ad", type=str, help=".h5ad file to evaluate entropy for")
    parser.add_argument(
        "--discrete", action="store_true", help="Use discrete calculation for entropy"
    )
    return parser


def main():
    """Run script"""
    parser = build_parser()
    args = parser.parse_args()

    adata = ad.read_h5ad(args.h5ad)
    logging.info(f"Read {args.h5ad} for adata of {adata.shape}")

    if args.discrete:
        # Use the discrete algorithm from pyitlib
        # https://pafoster.github.io/pyitlib/#discrete_random_variable.entropy_joint
        # https://github.com/pafoster/pyitlib/blob/master/pyitlib/discrete_random_variable.py#L3535
        # Successive realisations of a random variable are indexed by the last
        # axis in the array; multiple random variables may be specified using
        # preceding axes. In other words, variables are on axis 0 and samples
        # are on axis 1. This is the opposite of the usual ML layout (samples
        # on axis 0, variables on axis 1), so we must transpose.
        input_arr = utils.ensure_arr(adata.X).T
        h = drv.entropy_joint(input_arr, base=np.e)
        logging.info(f"Found discrete joint entropy of {h:.6f}")
    else:
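        # A continuous-entropy branch is not implemented. One possible sketch
        # (untested) would use the continuous estimator imported above as ce,
        # e.g. h = ce.get_h(utils.ensure_arr(adata.X)); per the NOTES above,
        # that estimator is not symmetric, so the orientation of adata.X
        # (cells as rows vs. columns) changes the result.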
        raise NotImplementedError


if __name__ == "__main__":
    main()