Diff of /bin/entropy.py [000000] .. [d01132]

--- a
+++ b/bin/entropy.py
@@ -0,0 +1,118 @@
+"""
+Calculate the entropy of the given .h5ad file
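+
+Example invocation (illustrative; any .h5ad path works):
+    python bin/entropy.py data.h5ad --discrete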
+"""
+
+import os
+import sys
+import argparse
+import logging
+import numpy as np
+from entropy_estimators import continuous as ce
+from pyitlib import discrete_random_variable as drv
+import scipy.stats
+
+import anndata as ad
+
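+# Make the adjacent "babel" source directory importable so its utils module can be used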
+SRC_DIR = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "babel",
+)
+assert os.path.isdir(SRC_DIR)
+sys.path.append(SRC_DIR)
+import utils
+
+logging.basicConfig(level=logging.INFO)
+
+"""
+NOTES
+Estimator for continuous variables
+https://github.com/paulbrodersen/entropy_estimators
+- This estimator is NOT symmetric: get_h(x) and get_h(x.T) give different values (see below)
+>>> x = np.random.randn(3000, 30000)
+>>> x
+array([[ 1.01757666,  0.14706194,  0.17207894, ..., -0.5776106 ,
+         1.27110965, -0.80688082],
+       [-0.46566731, -1.65503883,  0.34362236, ..., -0.56790773,
+         1.58161324,  0.6875425 ],
+       [ 0.21598618,  0.15462247, -0.66670242, ..., -1.28547741,
+        -0.1731192 ,  0.19815154],
+       ...,
+       [ 0.30699781,  0.24104934,  0.30279376, ...,  1.95658979,
+         0.78125961,  0.26259683],
+       [-1.94023222, -0.79838041, -0.10267371, ..., -0.67825156,
+         0.75047044,  0.773398  ],
+       [ 0.73951081,  0.3485434 , -0.17277407, ..., -0.32622845,
+        -0.59264903,  1.27659335]])
+>>> x.shape
+(3000, 30000)
+>>> h = continuous.get_h(x)
+>>> h
+69901.37779787864
+>>> h = continuous.get_h(x.T)
+>>> h
+6346.646780095286
+
+(Simple) estimator for discrete variables
+https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html
+
+For a binary variable, we can calculate (base e) entropy in several ways.
+We can specify a torch distribution and get the entropy per dimension,
+or we can ask scipy to calculate it from an input of unnormalized probs.
+Both give the same results:
+>>> b = torch.distributions.bernoulli.Bernoulli(torch.tensor([0.1, 0.9, 0.00001, 0.5]))
+>>> b.entropy()
+tensor([3.2508e-01, 3.2508e-01, 1.2541e-04, 6.9315e-01])
+>>> scipy.stats.entropy([0.1, 0.9])
+0.3250829733914482
+>>> scipy.stats.entropy([1, 1])  # scipy normalizes the input probs
+0.6931471805599453
+
+Another estimator for discrete variables
+https://github.com/pafoster/pyitlib
+- This supports calculation of joint entropy
+>>> x
+array([[1, 1, 1, 0],
+       [0, 0, 0, 1]])
+>>> drv.entropy_joint(x)
+0.8112781244591328
+>>> drv.entropy_joint(x.T)
+1.0
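+The default base is 2 (bits); main() below passes base=np.e, giving nats
+(the same value multiplied by ln 2)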
+"""
+
+
+def build_parser():
+    """Build CLI parser"""
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("h5ad", type=str, help=".h5ad file to evaluate entropy for")
+    parser.add_argument(
+        "--discrete", action="store_true", help="Use discrete calculation for entropy"
+    )
+    return parser
+
+
+def main():
+    """Run script"""
+    parser = build_parser()
+    args = parser.parse_args()
+
+    adata = ad.read_h5ad(args.h5ad)
+    logging.info(f"Read {args.h5ad} for adata of {adata.shape}")
+
+    if args.discrete:
+        # Use the discrete algorithm from pyitlib
+        # https://pafoster.github.io/pyitlib/#discrete_random_variable.entropy_joint
+        # https://github.com/pafoster/pyitlib/blob/master/pyitlib/discrete_random_variable.py#L3535
+        # Per the docs: "Successive realisations of a random variable are indexed by the
+        # last axis in the array; multiple random variables may be specified using
+        # preceding axes."
+        # In other words, variables are along axis 0 and samples along axis 1.
+        # This is the opposite of the usual ML convention (samples on axis 0,
+        # variables on axis 1), so we must transpose.
+        input_arr = utils.ensure_arr(adata.X).T
+        h = drv.entropy_joint(input_arr, base=np.e)
+        logging.info(f"Found discrete joint entropy of {h:.6f}")
+    else:
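+        # A continuous estimate is not implemented. A rough sketch (untested, and
+        # sensitive to orientation since the estimator is not symmetric; see NOTES)
+        # might use the k-nearest-neighbour estimator from entropy_estimators:
+        #   input_arr = utils.ensure_arr(adata.X)
+        #   h = ce.get_h(input_arr)
+        #   logging.info(f"Found continuous entropy of {h:.6f}")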
+        raise NotImplementedError
+
+
+if __name__ == "__main__":
+    main()