Switch to side-by-side view

--- a
+++ b/singlecellmultiomics/fastaProcessing/fastaMaskVariants.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+import pysam
+import numpy as np
+import argparse
+import os
+import itertools
+import gzip
+import pandas as pd
+import multiprocessing
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="""Convert genome FASTA file to convert alternative bases to Ns""")
+    argparser.add_argument(
+        'fasta',
+        metavar='fastafile',
+        type=str,
+        help='Fasta file to mask')
+    argparser.add_argument(
+        'vcf',
+        metavar='vcffile',
+        type=str,
+        help='VCF file(s) to extract variants from')
+    argparser.add_argument(
+        '-o',
+        type=str,
+        default='./masked.fasta.gz',
+        help='output file')
+    argparser.add_argument(
+        '-t',
+        type=int,
+        default=10,
+        help='Amount of contigs to process in parallel')
+    args = argparser.parse_args()
+
+    try:
+        faIn = pysam.FastaFile(args.fasta)
+    except ValueError as e:
+        print('FAI index missing. Now creating...')
+        os.system(f'samtools faidx {args.fasta}')
+    except Exception as e:
+        raise
+
+    try:
+        faIn = pysam.FastaFile(args.fasta)
+    except Exception as e:
+        raise
+
+    if args.o.endswith('.gz'):
+        outputHandle = gzip.open(args.o, 'wb')
+    else:
+        outputHandle = open(args.o, 'wb')
+    referenceNames = faIn.references
+    referenceLengths = dict(zip(referenceNames, faIn.lengths))
+    totalMasked = 0
+
+    def get_masked_bytearray( jargs ):
+
+        totalMasked = 0
+        (chrom, fasta_file_path, variant_file_path) = jargs
+        with pysam.FastaFile(fasta_file_path) as faIn, pysam.VariantFile(variant_file_path, threads=4) as  variants:
+
+            chrom_seq = bytearray(faIn.fetch(chrom), 'ascii')
+            if chrom in variants.index:
+                print(f'masking {chrom}')
+                for rec in variants.fetch(chrom):
+                    if rec.start >= (referenceLengths[rec.chrom]):
+                        print(
+                            f"WARNING: record {rec.chrom} {rec.pos} defines a variant outside the supplied fasta file!")
+                        continue
+
+                    if len(rec.alleles) == 1 and len(rec.alleles[0]) == 1:
+                        chrom_seq[rec.pos-1] = 78  # ord N
+                        totalMasked += 1
+                    elif len(rec.alleles[0]) == 1 and len(rec.alleles[1]) == 1:
+
+                        chrom_seq[rec.pos-1] = 78  # ord N
+                        totalMasked += 1
+            print(f'Masked {totalMasked} bases of {chrom}')
+            return chrom_seq
+
+
+    workers = multiprocessing.Pool(args.t)
+
+
+    for chrom, chrom_seq in zip(
+                referenceNames,
+                workers.imap(
+                    get_masked_bytearray,
+                    ((chrom, args.fasta, args.vcf )
+                    for chrom in referenceNames ))):
+        # Write chromsome
+        outputHandle.write(f'>{chrom}\n'.encode('ascii'))
+        outputHandle.write(chrom_seq)
+        outputHandle.write('\n'.encode('ascii'))
+
+    outputHandle.close()