#!/usr/bin/env python3
"""Build a per-site mappability index for an in-silico digested genome.

Digests a reference FASTA at a restriction motif (default CATG), simulates a
forward and a reverse read for every digest fragment, maps them back to the
reference with bwa, and records for each site how often its simulated reads
map back to where they came from ('correct'), map elsewhere ('lost'), or how
often foreign reads land on it ('wrong_gain'). Sites whose reads behave
perfectly are additionally written to a separate "safe" table.
"""
import singlecellmultiomics.utils
import pysam
import collections
import numpy as np
import re
import more_itertools
import gzip
import argparse
import uuid
import os
from singlecellmultiomics.utils import BlockZip


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""In silico digest genome""")
    argparser.add_argument(
        'fasta',
        metavar='fastafile',
        type=str,
        help='Fasta file to digest')
    argparser.add_argument(
        '-minlen',
        type=int,
        default=20,
        help='minimum read length')
    argparser.add_argument(
        '-maxlen',
        type=int,
        default=69,
        help='maximum read length')
    argparser.add_argument('-digest_sequence', type=str, default='CATG')
    argparser.add_argument('-head', type=int)
    argparser.add_argument(
        '-contigs',
        type=str,
        default=None,
        help='Comma separated contigs to run analysis for, when None specified all contigs are used')
    args = argparser.parse_args()

    r1_read_length = args.maxlen
    minlen = args.minlen
    # Length of the digest motif; used both to extend fragments and to map
    # a reverse alignment end back to its site start (was hard-coded to 4,
    # which is only correct for the default 'CATG').
    digest_len = len(args.digest_sequence)
    # Escape the motif so regex metacharacters cannot change what is matched,
    # and compile once instead of re-scanning with a raw pattern.
    digest_re = re.compile(re.escape(args.digest_sequence))

    ref = pysam.FastaFile(args.fasta)

    def seq_to_fastq(seq, header):
        # Four-line FASTQ record; every base gets the (high) quality 'E'.
        return f'@{header}\n{seq}\n+\n{"E"*len(seq)}\n'

    selected_contigs = None if args.contigs is None else set(
        args.contigs.split(','))

    fastq_path = f'simulated_{args.digest_sequence}_single_{r1_read_length}_{uuid.uuid4()}.fastq.gz'
    outbam = f'simulated_{args.digest_sequence}_single_{r1_read_length}.bam'

    with gzip.open(fastq_path, 'wt') as out:
        processed = 0

        for contig, contig_len in zip(ref.references, ref.lengths):
            if selected_contigs is not None and contig not in selected_contigs:
                continue
            print(f'{contig}\t{contig_len}')
            seq = ref.fetch(contig).upper()

            # Walk over consecutive pairs of digest-site starts; each pair
            # delimits one fragment (end extended to include the motif).
            site_starts = (m.start() for m in digest_re.finditer(seq))
            for i, (start, end_excluding_catg) in enumerate(
                    more_itertools.windowed(site_starts, 2)):
                # Skip the final incomplete window, or fragments past -head:
                if end_excluding_catg is None or (
                        args.head is not None and i >= (args.head - 1)):
                    continue
                end = end_excluding_catg + digest_len

                forward_read = seq[start:end][:r1_read_length]
                if len(forward_read) >= minlen:
                    out.write(seq_to_fastq(
                        forward_read,
                        f'DR:fwd;ct:{contig};ds:{start};qn:{len(forward_read)}'))

                reverse_read = singlecellmultiomics.utils.reverse_complement(
                    seq[start:end])[:r1_read_length]
                if len(reverse_read) >= minlen:
                    out.write(seq_to_fastq(
                        reverse_read,
                        f'DR:rev;ct:{contig};ds:{end_excluding_catg};qn:{len(reverse_read)}'))

            processed += contig_len

    # Map the simulated reads back to the reference.
    # NOTE(review): os.system with an interpolated shell string; the fasta
    # path is user supplied, so paths containing shell metacharacters will
    # misbehave — consider subprocess.run with a list. The original chained
    # `samtools index ... & rm ...`, backgrounding the index and deleting the
    # fastq unconditionally; && makes indexing finish first and keeps the
    # fastq around if indexing fails.
    print("Now mapping ...")
    os.system(
        f"bwa mem -t 4 {args.fasta} {fastq_path} | samtools view -b - > ./{outbam}.unsorted.bam; "
        f"samtools sort -T ./temp_sort -@ 4 ./{outbam}.unsorted.bam > ./{outbam}.unfinished.bam ; "
        f"mv ./{outbam}.unfinished.bam ./{outbam} ; "
        f"samtools index ./{outbam} && rm {fastq_path}")

    print("Creating site database ...")
    # site (contig, pos, strand) -> {'correct', 'wrong_gain', 'lost'} counts
    sites = {}

    ref_reads = pysam.AlignmentFile(outbam)
    # Perform site detection:
    for read in ref_reads:
        if read.is_supplementary:
            continue

        # Recover the simulated origin from the query name written above
        # (DR: direction, ct: contig, ds: digest site, qn: read length).
        kv = {field.split(':')[0]: field.split(':')[1]
              for field in read.query_name.split(';')}

        site_pos = int(kv['ds'])
        site_chrom = kv['ct']
        strand = (kv['DR'] == 'rev')

        # A reverse alignment ends digest_len bases after the site start.
        read_site = (read.reference_end - digest_len
                     if read.is_reverse else read.reference_start)
        read_contig = read.reference_name

        key = (site_chrom, site_pos, strand)
        key_mapped = (read_contig, read_site, read.is_reverse)

        if key not in sites:
            sites[key] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}

        if (read_site == site_pos and site_chrom == read_contig
                and read.is_reverse == strand):
            # Correct assignment: read maps back to its simulated origin.
            sites[key]['correct'] += 1
        else:
            # Assign a missing count to the origin site:
            sites[key]['lost'] += 1
            # Assign a wrong annotation to the site the read landed on:
            if key_mapped not in sites:
                sites[key_mapped] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}
            sites[key_mapped]['wrong_gain'] += 1

    print("Writing site statistics ...")
    outtab = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.stats.bgzf'
    outtabsafe = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.safe.bgzf'

    # Unmapped reads have reference_name None; drop them and sort for bgzf.
    keys = sorted(
        [(contig, pos, strand)
         for (contig, pos, strand) in sites.keys()
         if contig is not None])

    with BlockZip(outtab, 'w') as stats, BlockZip(outtabsafe, 'w') as safe:
        for (contig, pos, strand) in keys:
            measured = sites[(contig, pos, strand)]
            stats.write(
                contig, pos, strand,
                f'{measured["correct"]}\t{measured["lost"]}\t{measured["wrong_gain"]}')

            # "Safe" sites: the single simulated read maps back uniquely and
            # no foreign read was assigned here.
            if (measured['wrong_gain'] == 0 and measured['lost'] == 0
                    and measured['correct'] == 1):
                safe.write(contig, pos, strand, 'ok')