SingleCellMultiOmics / Git / [45ad7e] /singlecellmultiomics/fastaProcessing/createMappabilityIndex.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[45ad7e]: / singlecellmultiomics / fastaProcessing / createMappabilityIndex.py
History
Download this file
155 lines (127 with data), 5.8 kB

#!/usr/bin/env python3
import singlecellmultiomics.utils
import pysam
import collections
import numpy as np
import re
import more_itertools
import gzip
import argparse
import uuid
import os
from singlecellmultiomics.utils import BlockZip


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""In silico digest genome""")
    argparser.add_argument(
        'fasta',
        metavar='fastafile',
        type=str,
        help='Fasta file to digest')
    argparser.add_argument(
        '-minlen',
        type=int,
        default=20,
        help='minimum read length')
    argparser.add_argument(
        '-maxlen',
        type=int,
        default=69,
        help='maximum read length')
    argparser.add_argument('-digest_sequence', type=str, default='CATG')
    argparser.add_argument('-head', type=int)
    argparser.add_argument(
        '-contigs',
        type=str,
        default=None,
        help='Comma separated contigs to run analysis for, when None specified all contigs are used')
    args = argparser.parse_args()

    r1_read_length = args.maxlen
    minlen = args.minlen

    ref = pysam.FastaFile(args.fasta)

    def seq_to_fastq(seq, header):
        return f'@{header}\n{seq}\n+\n{"E"*len(seq)}\n'

    selected_contigs = None if args.contigs is None else set(
        args.contigs.split(','))

    fastq_path = f'simulated_{args.digest_sequence}_single_{r1_read_length}_{uuid.uuid4()}.fastq.gz'

    outbam = f'simulated_{args.digest_sequence}_single_{r1_read_length}.bam'

    with gzip.open(fastq_path, 'wt') as out:
        processed = 0

        for contig, contig_len in zip(ref.references, ref.lengths):
            if selected_contigs is not None and contig not in selected_contigs:
                continue
            print(f'{contig}\t{contig_len}')
            seq = ref.fetch(contig).upper()
            frag_locations = np.diff(
                [m.start() for m in re.finditer(args.digest_sequence, seq)])
            # Generate fragments:
            for i, (start, end_excluding_catg) in enumerate(more_itertools.windowed(
                    (m.start() for m in re.finditer(args.digest_sequence, seq)), 2)):
                if end_excluding_catg is None or args.head is not None and i >= (
                        args.head - 1):
                    continue
                end = end_excluding_catg + len(args.digest_sequence)

                forward_read = seq[start:end][:r1_read_length]
                if len(forward_read) >= minlen:
                    out.write(
                        seq_to_fastq(
                            forward_read,
                            f'DR:fwd;ct:{contig};ds:{start};qn:{len(forward_read)}'))
                reverse_read = singlecellmultiomics.utils.reverse_complement(seq[start:end])[
                    :r1_read_length]
                if len(reverse_read) >= minlen:
                    out.write(
                        seq_to_fastq(
                            reverse_read,
                            f'DR:rev;ct:{contig};ds:{end_excluding_catg};qn:{len(reverse_read)}'))

            processed += contig_len

    # Map the fastq file:
    print("Now mapping ...")
    os.system(f"bwa mem -t 4 {args.fasta} {fastq_path} | samtools view -b - > ./{outbam}.unsorted.bam; samtools sort -T ./temp_sort -@ 4 ./{outbam}.unsorted.bam > ./{outbam}.unfinished.bam ; mv ./{outbam}.unfinished.bam ./{outbam} ; samtools index ./{outbam} & rm {fastq_path}")

    print("Creating site database ...")
    # Create site dictionary:
    sites = {}  # site-> wrongly_assinged_to, correctly_assigned, lost

    ref_reads = pysam.AlignmentFile(outbam)
    # Perform site detection:
    for read in ref_reads:

        if read.is_supplementary:
            continue
        # Check if there is reads mapping to this site

        kv = {kv.split(':')[0]: kv.split(':')[1]
              for kv in read.query_name.split(';')}

        site_pos = int(kv['ds'])
        site_chrom = kv['ct']
        strand = (kv['DR'] == 'rev')

        read_site = read.reference_end - 4 if read.is_reverse else read.reference_start
        read_contig = read.reference_name

        key = (site_chrom, site_pos, strand)
        key_mapped = (read_contig, read_site, read.is_reverse)

        if key not in sites:
            sites[key] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}

        if read_site == site_pos and site_chrom == read_contig and read.is_reverse == strand:  # Correct assignment
            sites[key]['correct'] += 1
        else:
            # Assign a missing count to the origin site:
            sites[key]['lost'] += 1

            # Assing a wrong annotation to the target site:
            if key_mapped not in sites:
                sites[key_mapped] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}
            sites[key_mapped]['wrong_gain'] += 1

    print("Writing site statistics ...")
    outtab = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.stats.bgzf'
    outtabsafe = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.safe.bgzf'

    keys = sorted(
    [(contig, pos, strand)
     for  (contig, pos, strand) in sites.keys()
     if contig is not None])

    with BlockZip(outtab, 'w') as stats, BlockZip(outtabsafe, 'w') as safe:
        for (contig, pos, strand) in keys:
            measured= sites[(contig, pos, strand)]
            stats.write(
                contig,
                pos,
                strand,
                f'{measured["correct"]}\t{measured["lost"]}\t{measured["wrong_gain"]}')

            if measured['wrong_gain'] == 0 and measured['lost'] == 0 and measured['correct'] == 1:
                safe.write(contig, pos, strand, 'ok')