a b/bin/convert_to_adata.py
1
"""
2
Script to convert files to adata objects for feeding into BABEL
3
4
Formatting for all files is autodetected
5
"""
6
7
import os, sys
8
import argparse
9
import logging
10
from typing import *
11
12
import pandas as pd
13
import anndata as ad
14
15
# SRC_DIR is the "babel" directory that sits next to this script's own
# directory, i.e. <parent of this script's directory>/babel.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
SRC_DIR = os.path.join(os.path.dirname(_THIS_DIR), "babel")
assert os.path.isdir(SRC_DIR)
# The path must be appended before the import below; ``utils`` is presumably
# resolved from SRC_DIR.
sys.path.append(SRC_DIR)
import utils

logging.basicConfig(level=logging.INFO)
24
25
26
def auto_read_matrix_values(fname: str) -> pd.DataFrame:
    """Read a delimited matrix file into a DataFrame.

    The delimiter is chosen from the extension reported by
    ``utils.get_file_extension_no_gz``: ``csv`` is read comma-separated,
    while ``tsv`` and ``txt`` are read tab-separated.

    Args:
        fname: Path to the matrix file.

    Returns:
        The table as returned by ``pandas.read_csv`` called with only the
        separator set.

    Raises:
        FileNotFoundError: If ``fname`` is not an existing regular file.
        ValueError: If the extension is not ``csv``, ``tsv`` or ``txt``.
    """
    # Raise explicitly instead of using ``assert`` so the check is not
    # stripped when Python runs with -O, and so the error names the file.
    if not os.path.isfile(fname):
        raise FileNotFoundError(f"Cannot find matrix file: {fname}")

    separators = {"csv": ",", "tsv": "\t", "txt": "\t"}
    ext = utils.get_file_extension_no_gz(fname)
    if ext not in separators:
        raise ValueError(f"Cannot recognize file extension for {fname}")
    return pd.read_csv(fname, sep=separators[ext])
38
39
40
def auto_read_metadata(fname: str, index_col: Optional[int] = None) -> pd.DataFrame:
    """Read a delimited annotation table, optionally promoting a column to the index.

    The delimiter is chosen from the extension reported by
    ``utils.get_file_extension_no_gz``: ``csv`` is read comma-separated,
    while ``tsv`` and ``txt`` are read tab-separated.

    Args:
        fname: Path to the metadata file.
        index_col: Positional index of the column to use as the DataFrame
            index. ``None`` (the default) leaves the index as parsed.

    Returns:
        The parsed table, indexed by the requested column when one is given.

    Raises:
        FileNotFoundError: If ``fname`` is not an existing regular file.
        ValueError: If the extension is not ``csv``, ``tsv`` or ``txt``.
    """
    # Raise explicitly instead of using ``assert`` so the check is not
    # stripped when Python runs with -O, and so the error names the file.
    if not os.path.isfile(fname):
        raise FileNotFoundError(f"Cannot find metadata file: {fname}")

    ext = utils.get_file_extension_no_gz(fname)
    if ext == "csv":
        sep = ","
    elif ext in ("tsv", "txt"):
        sep = "\t"
    else:
        raise ValueError(f"Cannot recognize file extension for {fname}")
    df = pd.read_csv(fname, sep=sep)

    # Compare against None rather than relying on truthiness: 0 is a valid
    # column position and must not be treated as "no index column requested".
    if index_col is not None:
        df = df.set_index(df.columns[index_col])
    return df
53
54
55
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this script.

    Positional arguments:
        matfile: Path of the input matrix file.
        out_h5ad: Path of the output file.

    Optional arguments:
        -t / --transpose: Flag; transpose the matrix after reading.
        --obsinfo / --obscol: Cell annotation file and the integer column
            position to use for obs names.
        --varinfo / --varcol: Feature annotation file and the integer column
            position to use for var names.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    # NOTE(review): usage=__doc__ replaces argparse's auto-generated usage
    # line with the module docstring; description=__doc__ may have been the
    # intent -- confirm before changing, since it alters the --help layout.
    parser = argparse.ArgumentParser(
        usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Required positionals: input matrix and output path
    parser.add_argument(
        "matfile",
        type=str,
        help="File containing matrix of values. Expects (cell x feature), use transpose otherwise.",
    )
    parser.add_argument("out_h5ad", type=str, help="Output file (should end in .h5ad)")
    parser.add_argument(
        "-t",
        "--transpose",
        action="store_true",
        help="Apply transpose to matrix values to match expected (cell x feature) format",
    )

    # Optional annotation tables; the *col options are left as None when omitted
    parser.add_argument(
        "--obsinfo", type=str, help="Optional file for cell (observation) annotations"
    )
    parser.add_argument(
        "--obscol",
        type=int,
        help="Column of obs table to use as obs names",
    )
    parser.add_argument(
        "--varinfo",
        type=str,
        help="Optional file for feature (gene or peaks) annotations",
    )
    parser.add_argument(
        "--varcol",
        type=int,
        help="Column of var table to use as var names",
    )
    return parser
91
92
93
def main():
    """Convert the matrix file named on the command line into an h5ad file.

    Parses the CLI arguments, reads the matrix (transposing it when
    requested), reads the optional obs/var annotation tables, wraps all of
    it in an AnnData object and writes that object to the output path.
    """
    args = build_parser().parse_args()

    # Load the matrix, flipping it when -t/--transpose was passed
    matrix = auto_read_matrix_values(args.matfile)
    matrix = matrix.T if args.transpose else matrix
    logging.info(f"Read input matrix of (cell x feature) {matrix.shape}")

    # Annotation tables are optional; AnnData receives None for any not given
    obs_table = (
        auto_read_metadata(args.obsinfo, index_col=args.obscol)
        if args.obsinfo
        else None
    )
    var_table = (
        auto_read_metadata(args.varinfo, index_col=args.varcol)
        if args.varinfo
        else None
    )

    adata = ad.AnnData(matrix, obs=obs_table, var=var_table)
    logging.info(f"Created anndata object: {adata}")

    logging.info(f"Writing adata object to {args.out_h5ad}")
    adata.write(args.out_h5ad)
117
118
119
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()