a b/bin/gene_activity.py
1
"""
2
Script for calculating gene activity scores
3
"""
4
5
import os
6
import sys
7
import logging
8
import argparse
9
10
import numpy as np
11
import pandas as pd
12
import anndata as ad
13
import scanpy as sc
14
import scipy
15
16
SRC_DIR = os.path.join(
17
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "babel",
18
)
19
assert os.path.isdir(SRC_DIR)
20
sys.path.append(SRC_DIR)
21
import sc_data_loaders
22
import adata_utils
23
import metrics
24
import atac_utils
25
import utils
26
27
logging.basicConfig(level=logging.INFO)
28
29
30
ANNOTATION_DICT = {
31
    "hg19": sc_data_loaders.HG19_GTF,
32
    "hg38": sc_data_loaders.HG38_GTF,
33
}
34
35
36
def build_parser():
37
    """Build parser for running as a script"""
38
    parser = argparse.ArgumentParser(
39
        usage="Convert ATAC h5ad to infered gene activity scores",
40
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
41
    )
42
    parser.add_argument(
43
        "input_h5ad",
44
        type=str,
45
        nargs="*",
46
        help="ATAC h5ad to infer gene activity scores for",
47
    )
48
    parser.add_argument(
49
        "output_h5ad", type=str, help="h5ad file to write gene activity scores to"
50
    )
51
    parser.add_argument(
52
        "--genome",
53
        "-g",
54
        choices=ANNOTATION_DICT.keys(),
55
        default="hg38",
56
        help="Genome annotation to use",
57
    )
58
    parser.add_argument("--raw", action="store_true", help="Use raw atribaute of input")
59
    parser.add_argument(
60
        "--sizenorm",
61
        action="store_true",
62
        help="Normalize gene activity scores by span of gene",
63
    )
64
    parser.add_argument(
65
        "--naive",
66
        action="store_true",
67
        help="Use naive method instead of archr-derived method",
68
    )
69
    return parser
70
71
72
def main():
73
    """On the fly testing"""
74
    parser = build_parser()
75
    args = parser.parse_args()
76
    logging.info(f"Inputs: {args.input_h5ad}")
77
78
    # Note that all inputs should have the same suffix
79
    for input_h5ad in args.input_h5ad:
80
        assert input_h5ad.endswith(".h5ad") or input_h5ad.endswith(".h5")
81
    assert args.output_h5ad.endswith(".h5ad")
82
    logging.info(f"Calculating gene activity scores for: {args.input_h5ad}")
83
84
    reader_func = (
85
        ad.read_h5ad
86
        if args.input_h5ad[0].endswith(".h5ad")
87
        else lambda x: utils.sc_read_10x_h5_ft_type(x, "Peaks")
88
    )
89
    # adata = reader_func(args.input_h5ad)
90
    adata = utils.sc_read_multi_files(args.input_h5ad, reader=reader_func)
91
    logging.info(f"Received input of {adata.shape}")
92
    if args.naive:
93
        gene_act_adata = atac_utils.gene_activity_matrix_from_adata(
94
            adata if not args.raw else adata.raw,
95
            size_norm=args.sizenorm,
96
            annotation=ANNOTATION_DICT[args.genome],
97
        )
98
    else:
99
        logging.warning("Using ArchR based method, size norm is always on")
100
        gene_act_adata = atac_utils.archr_gene_activity_matrix_from_adata(
101
            adata if not args.raw else adata.raw,
102
            annotation=ANNOTATION_DICT[args.genome],
103
        )
104
    logging.info(f"Writing gene activity scores to {args.output_h5ad}")
105
    gene_act_adata.write(args.output_h5ad)
106
107
108
if __name__ == "__main__":
109
    main()