"""
Calculate the entropy of the given .h5ad file
"""
import os
import sys
import argparse
import logging
import numpy as np
from entropy_estimators import continuous as ce
from pyitlib import discrete_random_variable as drv
import scipy.stats
import anndata as ad
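# Make the sibling "babel" package importable so that its utils module can be reused below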
SRC_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "babel",
)
assert os.path.isdir(SRC_DIR)
sys.path.append(SRC_DIR)
import utils
logging.basicConfig(level=logging.INFO)
"""
NOTES
Estimator for continuous variables
https://github.com/paulbrodersen/entropy_estimators
- This estimator is NOT symmetric under transposition: get_h treats rows as samples
  and columns as dimensions, so x and x.T pose different estimation problems and yield
  different entropies, as shown below
>>> x = np.random.randn(3000, 30000)
>>> x
array([[ 1.01757666,  0.14706194,  0.17207894, ..., -0.5776106 ,
         1.27110965, -0.80688082],
       [-0.46566731, -1.65503883,  0.34362236, ..., -0.56790773,
         1.58161324,  0.6875425 ],
       [ 0.21598618,  0.15462247, -0.66670242, ..., -1.28547741,
        -0.1731192 ,  0.19815154],
       ...,
       [ 0.30699781,  0.24104934,  0.30279376, ...,  1.95658979,
         0.78125961,  0.26259683],
       [-1.94023222, -0.79838041, -0.10267371, ..., -0.67825156,
         0.75047044,  0.773398  ],
       [ 0.73951081,  0.3485434 , -0.17277407, ..., -0.32622845,
        -0.59264903,  1.27659335]])
>>> x.shape
(3000, 30000)
>>> h = continuous.get_h(x)
>>> h
69901.37779787864
>>> h = continuous.get_h(x.T)
>>> h
6346.646780095286
(Simple) estimator for discrete variables
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html
For a binary variable, we can calculate (base e) entropy in several ways
We can specify a torch distribution and get entropy per dimension
We can ask scipy to calculate this from an input of unnormalized probs
Both give us the same results
>>> b = torch.distributions.bernoulli.Bernoulli(torch.tensor([0.1, 0.9, 0.00001, 0.5]))
>>> b.entropy()
tensor([3.2508e-01, 3.2508e-01, 1.2541e-04, 6.9315e-01])
>>> scipy.stats.entropy([0.1, 0.9])
0.3250829733914482
>>> scipy.stats.entropy([1, 1]) # scipy normalizes the input probs
0.6931471805599453
Another estimator for discrete variables
https://github.com/pafoster/pyitlib
- This supports calculation of joint entropy
>>> x
array([[1, 1, 1, 0],
       [0, 0, 0, 1]])
>>> drv.entropy_joint(x)
0.8112781244591328
>>> drv.entropy_joint(x.T)
1.0
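Note that entropy_joint defaults to base 2 (bits), consistent with the values above;
the script below passes base=np.e to obtain nats instead (0.811278 bits * ln 2 ≈ 0.562 nats).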
"""
def build_parser():
"""Build CLI parser"""
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("h5ad", type=str, help=".h5ad file to evaluate entropy for")
parser.add_argument(
"--discrete", action="store_true", help="Use discrete calculation for entropy"
)
return parser
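# Example invocation (script and file names hypothetical):
#   python calc_entropy.py --discrete atac_counts.h5ad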
def main():
"""Run script"""
parser = build_parser()
args = parser.parse_args()
adata = ad.read_h5ad(args.h5ad)
logging.info(f"Read {args.h5ad} for adata of {adata.shape}")
    if args.discrete:
        # Use the discrete algorithm from pyitlib
        # https://pafoster.github.io/pyitlib/#discrete_random_variable.entropy_joint
        # https://github.com/pafoster/pyitlib/blob/master/pyitlib/discrete_random_variable.py#L3535
        # Per the pyitlib docs, successive realisations of a random variable are indexed by
        # the last axis in the array, and multiple random variables are specified using the
        # preceding axes. In other words, variables are axis 0 and samples are axis 1, which
        # is the opposite of the usual ML convention (samples axis 0, variables axis 1).
        # Therefore we must transpose.
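        # utils.ensure_arr is assumed to densify adata.X when it is stored as a sparse matrix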
        input_arr = utils.ensure_arr(adata.X).T
        h = drv.entropy_joint(input_arr, base=np.e)
        logging.info(f"Found discrete joint entropy of {h:.6f}")
    else:
        raise NotImplementedError("Continuous entropy estimation is not implemented")
if __name__ == "__main__":
    main()