|
a |
|
b/bin/convert_to_adata.py |
|
|
1 |
""" |
|
|
2 |
Script to convert files to adata objects for feeding into BABEL |
|
|
3 |
|
|
|
4 |
Formatting for all files is autodetected |
|
|
5 |
""" |
|
|
6 |
|
|
|
7 |
import os, sys |
|
|
8 |
import argparse |
|
|
9 |
import logging |
|
|
10 |
from typing import * |
|
|
11 |
|
|
|
12 |
import pandas as pd |
|
|
13 |
import anndata as ad |
|
|
14 |
|
|
|
15 |
# Locate the "babel" directory that sits next to this script's parent directory
# and put it on the import path so that `utils` can be imported from it.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
SRC_DIR = os.path.join(os.path.dirname(_SCRIPT_DIR), "babel")
assert os.path.isdir(SRC_DIR)
sys.path.append(SRC_DIR)
import utils  # must come after SRC_DIR has been appended to sys.path

# Emit INFO-level messages (and above) from the root logger
logging.basicConfig(level=logging.INFO)
|
|
24 |
|
|
|
25 |
|
|
|
26 |
def auto_read_matrix_values(fname: str) -> pd.DataFrame:
    """Load a delimited matrix file into a DataFrame.

    The delimiter is picked from the extension reported by
    ``utils.get_file_extension_no_gz``: comma for "csv", tab for "txt"/"tsv".

    Raises:
        AssertionError: if ``fname`` is not an existing file (checked via assert).
        ValueError: if the extension is not one of the recognized ones.
    """
    assert os.path.isfile(fname)

    # Map each recognized extension onto the separator handed to pandas
    delimiters = {"csv": ",", "txt": "\t", "tsv": "\t"}
    suffix = utils.get_file_extension_no_gz(fname)
    if suffix not in delimiters:
        raise ValueError(f"Cannot recognize file extension for {fname}")
    return pd.read_csv(fname, sep=delimiters[suffix])
|
|
38 |
|
|
|
39 |
|
|
|
40 |
def auto_read_metadata(fname: str, index_col: Optional[int] = None) -> pd.DataFrame:
    """Read the given metadata file into a DataFrame.

    The delimiter is picked from the extension reported by
    ``utils.get_file_extension_no_gz``: comma for "csv", tab for "tsv"/"txt".

    Args:
        fname: Path to an existing metadata file.
        index_col: Position of the column to promote to the row index.
            ``None`` (the default) leaves the index as ``pandas.read_csv``
            produced it. Position 0 is a valid choice and is honored.

    Returns:
        The parsed table, re-indexed by the chosen column when one is given.

    Raises:
        AssertionError: if ``fname`` is not an existing file (checked via assert).
        ValueError: if the extension is not one of the recognized ones.
    """
    assert os.path.isfile(fname)
    ext = utils.get_file_extension_no_gz(fname)
    if ext == "csv":
        df = pd.read_csv(fname, sep=",")
    elif ext in ("tsv", "txt"):
        df = pd.read_csv(fname, sep="\t")
    else:
        raise ValueError(f"Cannot recognize file extension for {fname}")
    # Compare against None explicitly: a bare truthiness test would treat
    # column 0 as "no column requested" and silently skip the re-indexing.
    if index_col is not None:
        df.set_index(df.columns[index_col], inplace=True)
    return df
|
|
53 |
|
|
|
54 |
|
|
|
55 |
def build_parser() -> argparse.ArgumentParser:
    """Build CLI parser.

    Positional arguments are the input matrix file and the output .h5ad path.
    Optional arguments control transposition and supply observation/variable
    annotation files together with the column to use for their names.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    # The module docstring is supplied as the usage text shown by --help
    parser = argparse.ArgumentParser(
        usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "matfile",
        type=str,
        help="File containing matrix of values. Expects (cell x feature), use transpose otherwise.",
    )
    parser.add_argument("out_h5ad", type=str, help="Output file (should end in .h5ad)")
    parser.add_argument(
        "-t",
        "--transpose",
        action="store_true",
        help="Apply transpose to matrix values to match expected (cell x feature) format",
    )
    parser.add_argument(
        "--obsinfo", type=str, help="Optional file for cell (observation) annotations"
    )
    parser.add_argument(
        "--obscol",
        type=int,
        help="Column of obs table to use as obs names",
    )
    parser.add_argument(
        "--varinfo",
        type=str,
        help="Optional file for feature (gene or peaks) annotations",
    )
    parser.add_argument(
        "--varcol",
        type=int,
        help="Column of var table to use as var names",
    )
    return parser
|
|
91 |
|
|
|
92 |
|
|
|
93 |
def main():
    """Run script: parse the CLI, assemble the AnnData object, write it to disk."""
    args = build_parser().parse_args()

    # Load the matrix, flipping it when --transpose was requested
    counts_df = auto_read_matrix_values(args.matfile)
    counts_df = counts_df.T if args.transpose else counts_df
    logging.info(f"Read input matrix of (cell x feature) {counts_df.shape}")

    # Annotation tables are optional; each stays None when no file was given
    obs_annot = (
        auto_read_metadata(args.obsinfo, index_col=args.obscol)
        if args.obsinfo
        else None
    )
    var_annot = (
        auto_read_metadata(args.varinfo, index_col=args.varcol)
        if args.varinfo
        else None
    )

    adata = ad.AnnData(counts_df, obs=obs_annot, var=var_annot)
    logging.info(f"Created anndata object: {adata}")

    logging.info(f"Writing adata object to {args.out_h5ad}")
    adata.write(args.out_h5ad)


if __name__ == "__main__":
    main()