SingleCellMultiOmics / Git / [45ad7e] /singlecellmultiomics/alleleTools/heterozygousSNPedit.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[45ad7e]: / singlecellmultiomics / alleleTools / heterozygousSNPedit.py
History
Download this file
69 lines (52 with data), 3.6 kB

#!/usr/bin/env python3

import pysam
from collections import Counter
import argparse

if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Alter heterozygous SNPs in the reference vcf to the SNPs present in your data. Currently only works for 129Sv/Bl6 data.""")
    argparser.add_argument('-bamfile', '-bam', metavar='bamfile', help = "Input bam file. This is used to detect which heterozygous SNPs are present in the library.",
                           type=str, required=True)
    argparser.add_argument('-vcffile','-vcf', metavar='vcffile', help = "Input vcf file. This file will be altered based on the SNPS present in the supplied library.",
                           type=str, required=True)
    argparser.add_argument('-allele', help = "define the allele that you wish to alter. Default is 129S1_SvImJ", type = str, default = "129S1_SvImJ" )
    argparser.add_argument('-o', type=str, help="Output vcf file, containing the altered heterozygous SNPs.", required=True)

    args = argparser.parse_args()

origin_vcf = pysam.VariantFile(args.vcffile)
origin_bam = pysam.AlignmentFile(args.bamfile)

### trying to alter only the genotype part of the VCF - so the 1 and 0 that assign the ref/alt base

with pysam.VariantFile(args.o, mode = 'w', header = origin_vcf.header) as output_vcf: #change to 'wb' if output is vcf.gz

    for record in origin_vcf: #.fetch('12',114682700, 114683000): #load in only one small region to check
        record.chrom, record.pos
        
        base_obs = Counter()
        pos = record.pos - 1 
        
# calculate number of observations per base on this position, determined by the input .bam file
        for pileupcolumn in origin_bam.pileup(record.chrom, pos, pos + 1):
            if pileupcolumn.pos != pos: 
                continue
            for pileupread in pileupcolumn.pileups:
                if not pileupread.is_del and not pileupread.is_refskip:
                    base_obs[pileupread.alignment.query_sequence[pileupread.query_position]] +=1
                    
# we need to couple the genotype (0:ref, 1, 2 etc alternative bases) to the base itself (A,C,G,T) to make the genotype compatible with the base counter
        allele_genotype = {} 
        for number in range(len(record.samples[args.allele]['GT'])):
            allele_genotype[record.samples[args.allele]['GT'][number]] = record.samples[args.allele].alleles[number]

# select which alleles to change: we only want to alter the allele if it is not reference and count is >30%    
        new_GT = tuple(GT for GT in record.samples[args.allele]['GT'] if GT != 0 # if GT = 0, it is reference
                       and (sum(base_obs.values()) > 0 # If there are no observations, nothing should be changed
                            and float(base_obs[allele_genotype[GT]])/float(sum(base_obs.values())) > 0.3)) # only change if we have >30% observations
 
 # if the length is 0, nothing should change        
        if len(new_GT) == 0:
            new_GT = record.samples[args.allele]['GT']
 
 # if length is 1, the site should be homozygous               
        if len(new_GT) == 1:
            new_GT = tuple( (new_GT[0], new_GT[0]))

        if record.samples[args.allele].alleles == (None, None):
            continue

# to prevent crashing on monomorphic sites
        if record.alts == None:
            continue

# write altered SNPs to record and to output file      
        record.samples[args.allele]['GT'] = new_GT
        output_vcf.write(record)