Diff of /bin/convert_to_adata.py [000000] .. [d01132]

Switch to side-by-side view

--- a
+++ b/bin/convert_to_adata.py
@@ -0,0 +1,120 @@
+"""
+Script to convert files to adata objects for feeding into BABEL
+
+Formatting for all files is autodetected
+"""
+
+import os, sys
+import argparse
+import logging
+from typing import *
+
+import pandas as pd
+import anndata as ad
+
+SRC_DIR = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+    "babel",
+)
+assert os.path.isdir(SRC_DIR)
+sys.path.append(SRC_DIR)
+import utils
+
+logging.basicConfig(level=logging.INFO)
+
+
+def auto_read_matrix_values(fname: str) -> pd.DataFrame:
+    """Read the given counts file"""
+    assert os.path.isfile(fname)
+    ext = utils.get_file_extension_no_gz(fname)
+
+    if ext == "csv":
+        df = pd.read_csv(fname, sep=",")
+    elif ext == "txt" or ext == "tsv":
+        df = pd.read_csv(fname, sep="\t")
+    else:
+        raise ValueError(f"Cannot recognize file extension for {fname}")
+    return df
+
+
+def auto_read_metadata(fname: str, index_col: Optional[int] = None) -> pd.DataFrame:
+    """Read the given metadata file"""
+    assert os.path.isfile(fname)
+    ext = utils.get_file_extension_no_gz(fname)
+    if ext == "csv":
+        df = pd.read_csv(fname, sep=",")
+    elif ext == "tsv" or ext == "txt":
+        df = pd.read_csv(fname, sep="\t")
+    else:
+        raise ValueError(f"Cannot recognize file extension for {fname}")
+    if index_col:
+        df.set_index(df.columns[index_col], inplace=True)
+    return df
+
+
+def build_parser():
+    """Build CLI parser"""
+    parser = argparse.ArgumentParser(
+        usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "matfile",
+        type=str,
+        help="File containing matrix of vlaues. Expects (cell x feature), use transpose otherwise.",
+    )
+    parser.add_argument("out_h5ad", type=str, help="Output file (should end in .h5ad)")
+    parser.add_argument(
+        "-t",
+        "--transpose",
+        action="store_true",
+        help="Apply transpose to matrix values to match expected (cell x feature) format",
+    )
+    parser.add_argument(
+        "--obsinfo", type=str, help="Optional file for cell (observation) annotations"
+    )
+    parser.add_argument(
+        "--obscol",
+        type=int,
+        help="Column of obs table to use as obs names",
+    )
+    parser.add_argument(
+        "--varinfo",
+        type=str,
+        help="Optional file for feature (gene or peaks) annotations",
+    )
+    parser.add_argument(
+        "--varcol",
+        type=int,
+        help="Column of var table to use as var names",
+    )
+    return parser
+
+
+def main():
+    """Run script"""
+    parser = build_parser()
+    args = parser.parse_args()
+
+    # Read in the counts file
+    counts_df = auto_read_matrix_values(args.matfile)
+    if args.transpose:
+        counts_df = counts_df.T
+    logging.info(f"Read input matrix of (cell x feature) {counts_df.shape}")
+
+    # Read in metadata if given
+    obs_df = None
+    if args.obsinfo:
+        obs_df = auto_read_metadata(args.obsinfo, index_col=args.obscol)
+    var_df = None
+    if args.varinfo:
+        var_df = auto_read_metadata(args.varinfo, index_col=args.varcol)
+
+    adata = ad.AnnData(counts_df, obs=obs_df, var=var_df)
+    logging.info(f"Created anndata object: {adata}")
+
+    logging.info(f"Writing adata object to {args.out_h5ad}")
+    adata.write(args.out_h5ad)
+
+
+if __name__ == "__main__":
+    main()