"""
Script to convert files to adata objects for feeding into BABEL

Formatting for all files is autodetected
"""

import argparse
import logging
import os
import sys
from typing import Optional

import anndata as ad
import pandas as pd

# The helper module ``utils`` is imported from the "babel" directory that sits
# two levels above this file, so that directory is appended to the import path.
SRC_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    "babel",
)
assert os.path.isdir(SRC_DIR)
sys.path.append(SRC_DIR)
import utils  # noqa: E402  (must follow the sys.path tweak above)

logging.basicConfig(level=logging.INFO)

# Field separator to use for each recognized file extension.
_SEPARATORS = {"csv": ",", "tsv": "\t", "txt": "\t"}


def _read_table(fname: str) -> pd.DataFrame:
    """Read a delimited text file, choosing the separator from its extension.

    Raises:
        FileNotFoundError: if ``fname`` is not an existing file.
        ValueError: if the extension is not one of csv, tsv or txt.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not os.path.isfile(fname):
        raise FileNotFoundError(f"No such file: {fname}")
    # NOTE(review): presumably returns the extension without the leading dot
    # and with any trailing ".gz" removed — confirm against babel/utils.
    ext = utils.get_file_extension_no_gz(fname)
    try:
        sep = _SEPARATORS[ext]
    except KeyError:
        raise ValueError(f"Cannot recognize file extension for {fname}") from None
    return pd.read_csv(fname, sep=sep)


def auto_read_matrix_values(fname: str) -> pd.DataFrame:
    """Read the given counts file.

    Comma-separated for ``csv``, tab-separated for ``tsv``/``txt``.

    NOTE(review): no index column is set here, so a leading label column in
    the file would remain a regular data column — confirm this matches the
    expected input layout.

    Raises:
        FileNotFoundError: if ``fname`` is not an existing file.
        ValueError: if the file extension is not recognized.
    """
    return _read_table(fname)


def auto_read_metadata(fname: str, index_col: Optional[int] = None) -> pd.DataFrame:
    """Read the given metadata file.

    Args:
        fname: Path to a csv, tsv or txt annotation table.
        index_col: Positional index of the column to use as the table index.
            ``None`` (the default) leaves the default index in place.

    Returns:
        The table as a DataFrame, indexed by the requested column if given.

    Raises:
        FileNotFoundError: if ``fname`` is not an existing file.
        ValueError: if the file extension is not recognized.
    """
    df = _read_table(fname)
    # Compare against None explicitly: column 0 is a valid choice but falsy.
    if index_col is not None:
        df = df.set_index(df.columns[index_col])
    return df


def build_parser() -> argparse.ArgumentParser:
    """Build CLI parser."""
    # The module docstring is passed as the description (not ``usage``) so
    # argparse still generates the argument synopsis itself.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "matfile",
        type=str,
        help=(
            "File containing matrix of values. "
            "Expects (cell x feature), use transpose otherwise."
        ),
    )
    parser.add_argument("out_h5ad", type=str, help="Output file (should end in .h5ad)")
    parser.add_argument(
        "-t",
        "--transpose",
        action="store_true",
        help="Apply transpose to matrix values to match expected (cell x feature) format",
    )
    parser.add_argument(
        "--obsinfo", type=str, help="Optional file for cell (observation) annotations"
    )
    parser.add_argument(
        "--obscol",
        type=int,
        help="Column of obs table to use as obs names",
    )
    parser.add_argument(
        "--varinfo",
        type=str,
        help="Optional file for feature (gene or peaks) annotations",
    )
    parser.add_argument(
        "--varcol",
        type=int,
        help="Column of var table to use as var names",
    )
    return parser


def main() -> None:
    """Run script.

    Reads the matrix file, optionally transposes it, reads the optional obs
    and var annotation tables, builds an AnnData object from them and writes
    it to the requested output path.
    """
    parser = build_parser()
    args = parser.parse_args()

    # Read in the counts file
    counts_df = auto_read_matrix_values(args.matfile)
    if args.transpose:
        counts_df = counts_df.T
    logging.info("Read input matrix of (cell x feature) %s", counts_df.shape)

    # Read in metadata if given
    obs_df = None
    if args.obsinfo:
        obs_df = auto_read_metadata(args.obsinfo, index_col=args.obscol)
    var_df = None
    if args.varinfo:
        var_df = auto_read_metadata(args.varinfo, index_col=args.varcol)

    adata = ad.AnnData(counts_df, obs=obs_df, var=var_df)
    logging.info("Created anndata object: %s", adata)

    logging.info("Writing adata object to %s", args.out_h5ad)
    adata.write(args.out_h5ad)


if __name__ == "__main__":
    main()