Switch to unified view

a b/singlecellmultiomics/fastaProcessing/fastaMaskVariants.py
1
#!/usr/bin/env python3
2
import pysam
3
import numpy as np
4
import argparse
5
import os
6
import itertools
7
import gzip
8
import pandas as pd
9
import multiprocessing
10
11
if __name__ == '__main__':
12
    argparser = argparse.ArgumentParser(
13
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
14
        description="""Convert genome FASTA file to convert alternative bases to Ns""")
15
    argparser.add_argument(
16
        'fasta',
17
        metavar='fastafile',
18
        type=str,
19
        help='Fasta file to mask')
20
    argparser.add_argument(
21
        'vcf',
22
        metavar='vcffile',
23
        type=str,
24
        help='VCF file(s) to extract variants from')
25
    argparser.add_argument(
26
        '-o',
27
        type=str,
28
        default='./masked.fasta.gz',
29
        help='output file')
30
    argparser.add_argument(
31
        '-t',
32
        type=int,
33
        default=10,
34
        help='Amount of contigs to process in parallel')
35
    args = argparser.parse_args()
36
37
    try:
38
        faIn = pysam.FastaFile(args.fasta)
39
    except ValueError as e:
40
        print('FAI index missing. Now creating...')
41
        os.system(f'samtools faidx {args.fasta}')
42
    except Exception as e:
43
        raise
44
45
    try:
46
        faIn = pysam.FastaFile(args.fasta)
47
    except Exception as e:
48
        raise
49
50
    if args.o.endswith('.gz'):
51
        outputHandle = gzip.open(args.o, 'wb')
52
    else:
53
        outputHandle = open(args.o, 'wb')
54
    referenceNames = faIn.references
55
    referenceLengths = dict(zip(referenceNames, faIn.lengths))
56
    totalMasked = 0
57
58
    def get_masked_bytearray( jargs ):
59
60
        totalMasked = 0
61
        (chrom, fasta_file_path, variant_file_path) = jargs
62
        with pysam.FastaFile(fasta_file_path) as faIn, pysam.VariantFile(variant_file_path, threads=4) as  variants:
63
64
            chrom_seq = bytearray(faIn.fetch(chrom), 'ascii')
65
            if chrom in variants.index:
66
                print(f'masking {chrom}')
67
                for rec in variants.fetch(chrom):
68
                    if rec.start >= (referenceLengths[rec.chrom]):
69
                        print(
70
                            f"WARNING: record {rec.chrom} {rec.pos} defines a variant outside the supplied fasta file!")
71
                        continue
72
73
                    if len(rec.alleles) == 1 and len(rec.alleles[0]) == 1:
74
                        chrom_seq[rec.pos-1] = 78  # ord N
75
                        totalMasked += 1
76
                    elif len(rec.alleles[0]) == 1 and len(rec.alleles[1]) == 1:
77
78
                        chrom_seq[rec.pos-1] = 78  # ord N
79
                        totalMasked += 1
80
            print(f'Masked {totalMasked} bases of {chrom}')
81
            return chrom_seq
82
83
84
    workers = multiprocessing.Pool(args.t)
85
86
87
    for chrom, chrom_seq in zip(
88
                referenceNames,
89
                workers.imap(
90
                    get_masked_bytearray,
91
                    ((chrom, args.fasta, args.vcf )
92
                    for chrom in referenceNames ))):
93
        # Write chromsome
94
        outputHandle.write(f'>{chrom}\n'.encode('ascii'))
95
        outputHandle.write(chrom_seq)
96
        outputHandle.write('\n'.encode('ascii'))
97
98
    outputHandle.close()