|
a |
|
b/bin/gene_activity.py |
|
|
1 |
""" |
|
|
2 |
Script for calculating gene activity scores |
|
|
3 |
""" |
|
|
4 |
|
|
|
5 |
import os |
|
|
6 |
import sys |
|
|
7 |
import logging |
|
|
8 |
import argparse |
|
|
9 |
|
|
|
10 |
import numpy as np |
|
|
11 |
import pandas as pd |
|
|
12 |
import anndata as ad |
|
|
13 |
import scanpy as sc |
|
|
14 |
import scipy |
|
|
15 |
|
|
|
16 |
SRC_DIR = os.path.join( |
|
|
17 |
os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "babel", |
|
|
18 |
) |
|
|
19 |
assert os.path.isdir(SRC_DIR) |
|
|
20 |
sys.path.append(SRC_DIR) |
|
|
21 |
import sc_data_loaders |
|
|
22 |
import adata_utils |
|
|
23 |
import metrics |
|
|
24 |
import atac_utils |
|
|
25 |
import utils |
|
|
26 |
|
|
|
27 |
logging.basicConfig(level=logging.INFO) |
|
|
28 |
|
|
|
29 |
|
|
|
30 |
ANNOTATION_DICT = { |
|
|
31 |
"hg19": sc_data_loaders.HG19_GTF, |
|
|
32 |
"hg38": sc_data_loaders.HG38_GTF, |
|
|
33 |
} |
|
|
34 |
|
|
|
35 |
|
|
|
36 |
def build_parser(): |
|
|
37 |
"""Build parser for running as a script""" |
|
|
38 |
parser = argparse.ArgumentParser( |
|
|
39 |
usage="Convert ATAC h5ad to infered gene activity scores", |
|
|
40 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
|
|
41 |
) |
|
|
42 |
parser.add_argument( |
|
|
43 |
"input_h5ad", |
|
|
44 |
type=str, |
|
|
45 |
nargs="*", |
|
|
46 |
help="ATAC h5ad to infer gene activity scores for", |
|
|
47 |
) |
|
|
48 |
parser.add_argument( |
|
|
49 |
"output_h5ad", type=str, help="h5ad file to write gene activity scores to" |
|
|
50 |
) |
|
|
51 |
parser.add_argument( |
|
|
52 |
"--genome", |
|
|
53 |
"-g", |
|
|
54 |
choices=ANNOTATION_DICT.keys(), |
|
|
55 |
default="hg38", |
|
|
56 |
help="Genome annotation to use", |
|
|
57 |
) |
|
|
58 |
parser.add_argument("--raw", action="store_true", help="Use raw atribaute of input") |
|
|
59 |
parser.add_argument( |
|
|
60 |
"--sizenorm", |
|
|
61 |
action="store_true", |
|
|
62 |
help="Normalize gene activity scores by span of gene", |
|
|
63 |
) |
|
|
64 |
parser.add_argument( |
|
|
65 |
"--naive", |
|
|
66 |
action="store_true", |
|
|
67 |
help="Use naive method instead of archr-derived method", |
|
|
68 |
) |
|
|
69 |
return parser |
|
|
70 |
|
|
|
71 |
|
|
|
72 |
def main(): |
|
|
73 |
"""On the fly testing""" |
|
|
74 |
parser = build_parser() |
|
|
75 |
args = parser.parse_args() |
|
|
76 |
logging.info(f"Inputs: {args.input_h5ad}") |
|
|
77 |
|
|
|
78 |
# Note that all inputs should have the same suffix |
|
|
79 |
for input_h5ad in args.input_h5ad: |
|
|
80 |
assert input_h5ad.endswith(".h5ad") or input_h5ad.endswith(".h5") |
|
|
81 |
assert args.output_h5ad.endswith(".h5ad") |
|
|
82 |
logging.info(f"Calculating gene activity scores for: {args.input_h5ad}") |
|
|
83 |
|
|
|
84 |
reader_func = ( |
|
|
85 |
ad.read_h5ad |
|
|
86 |
if args.input_h5ad[0].endswith(".h5ad") |
|
|
87 |
else lambda x: utils.sc_read_10x_h5_ft_type(x, "Peaks") |
|
|
88 |
) |
|
|
89 |
# adata = reader_func(args.input_h5ad) |
|
|
90 |
adata = utils.sc_read_multi_files(args.input_h5ad, reader=reader_func) |
|
|
91 |
logging.info(f"Received input of {adata.shape}") |
|
|
92 |
if args.naive: |
|
|
93 |
gene_act_adata = atac_utils.gene_activity_matrix_from_adata( |
|
|
94 |
adata if not args.raw else adata.raw, |
|
|
95 |
size_norm=args.sizenorm, |
|
|
96 |
annotation=ANNOTATION_DICT[args.genome], |
|
|
97 |
) |
|
|
98 |
else: |
|
|
99 |
logging.warning("Using ArchR based method, size norm is always on") |
|
|
100 |
gene_act_adata = atac_utils.archr_gene_activity_matrix_from_adata( |
|
|
101 |
adata if not args.raw else adata.raw, |
|
|
102 |
annotation=ANNOTATION_DICT[args.genome], |
|
|
103 |
) |
|
|
104 |
logging.info(f"Writing gene activity scores to {args.output_h5ad}") |
|
|
105 |
gene_act_adata.write(args.output_h5ad) |
|
|
106 |
|
|
|
107 |
|
|
|
108 |
if __name__ == "__main__": |
|
|
109 |
main() |