a b/neusomatic/python/merge_post_vcfs.py
1
#!/usr/bin/env python
2
#-------------------------------------------------------------------------
3
# merge_post_vcfs.py
4
# Merge resolved variants and other predicted variants and output the final NeuSomatic .vcf
5
#-------------------------------------------------------------------------
6
import argparse
7
import traceback
8
import fileinput
9
import logging
10
11
import numpy as np
12
13
from utils import get_chromosomes_order
14
15
16
def merge_post_vcfs(ref, resolved_vcf, no_resolve_vcf, out_vcf,
17
                    pass_threshold, lowqual_threshold):
18
19
    logger = logging.getLogger(merge_post_vcfs.__name__)
20
21
    logger.info("------------------------Merge vcfs-------------------------")
22
23
    chroms_order = get_chromosomes_order(reference=ref)
24
25
    good_records = []
26
    # Use fileinput to stream as if the two files were concatenated
27
    for line in fileinput.input([no_resolve_vcf, resolved_vcf]):
28
        if line[0] == "#":
29
            continue
30
        chrom, pos, _, ref, alt, score, _, info, format_, gt = line.strip().split()
31
        good_records.append([chrom, pos, ref, alt, gt, score])
32
33
    with open(out_vcf, "w") as o_f:
34
        o_f.write("##fileformat=VCFv4.2\n")
35
        o_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
36
        for record in sorted(good_records, key=lambda x: [chroms_order[x[0]], x[1]]):
37
            chrom, pos, ref, alt, gt, score = record
38
            prob = np.round(1 - (10**(-float(score) / 10)), 4)
39
            filter_ = "REJECT"
40
            if prob >= pass_threshold:
41
                filter_ = "PASS"
42
            elif prob >= lowqual_threshold:
43
                filter_ = "LowQual"
44
            o_f.write("\t".join([chrom, pos, ".", ref, alt,
45
                                 "{:.4f}".format(
46
                                     float(score)), filter_, "SCORE={:.4f}".format(prob),
47
                                 "GT", "0/1"]) + "\n")
48
49
if __name__ == '__main__':
50
51
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
52
    logging.basicConfig(level=logging.INFO, format=FORMAT)
53
    logger = logging.getLogger(__name__)
54
55
    parser = argparse.ArgumentParser(description='merge pred vcfs')
56
    parser.add_argument(
57
        '--ref', help='reference fasta filename', required=True)
58
    parser.add_argument('--resolved_vcf', help='resolved_vcf', required=True)
59
    parser.add_argument('--no_resolve_vcf',
60
                        help='no resolve vcf', required=True)
61
    parser.add_argument('--out_vcf', help='output vcf', required=True)
62
    args = parser.parse_args()
63
    logger.info(args)
64
    try:
65
        merge_post_vcfs(args.ref, args.resolved_vcf,
66
                        args.no_resolve_vcf, args.out_vcf)
67
    except Exception as e:
68
        logger.error(traceback.format_exc())
69
        logger.error("Aborting!")
70
        logger.error(
71
            "merge_post_vcfs.py failure on arguments: {}".format(args))
72
        raise e