|
a |
|
b/singlecellmultiomics/fastaProcessing/createMappabilityIndex.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
import singlecellmultiomics.utils |
|
|
3 |
import pysam |
|
|
4 |
import collections |
|
|
5 |
import numpy as np |
|
|
6 |
import re |
|
|
7 |
import more_itertools |
|
|
8 |
import gzip |
|
|
9 |
import argparse |
|
|
10 |
import uuid |
|
|
11 |
import os |
|
|
12 |
from singlecellmultiomics.utils import BlockZip |
|
|
13 |
|
|
|
14 |
|
|
|
15 |
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""In silico digest genome""")
    argparser.add_argument(
        'fasta',
        metavar='fastafile',
        type=str,
        help='Fasta file to digest')
    argparser.add_argument(
        '-minlen',
        type=int,
        default=20,
        help='minimum read length')
    argparser.add_argument(
        '-maxlen',
        type=int,
        default=69,
        help='maximum read length')
    argparser.add_argument('-digest_sequence', type=str, default='CATG')
    argparser.add_argument('-head', type=int)
    argparser.add_argument(
        '-contigs',
        type=str,
        default=None,
        help='Comma separated contigs to run analysis for, when None specified all contigs are used')
    args = argparser.parse_args()

    r1_read_length = args.maxlen
    minlen = args.minlen

    ref = pysam.FastaFile(args.fasta)

    def seq_to_fastq(seq, header):
        """Return a 4-line FASTQ record for *seq* with constant 'E' base quality."""
        return f'@{header}\n{seq}\n+\n{"E"*len(seq)}\n'

    selected_contigs = None if args.contigs is None else set(
        args.contigs.split(','))

    fastq_path = f'simulated_{args.digest_sequence}_single_{r1_read_length}_{uuid.uuid4()}.fastq.gz'

    outbam = f'simulated_{args.digest_sequence}_single_{r1_read_length}.bam'

    # The digest motif is a literal DNA sequence, not a regex; escape it so a
    # motif containing regex metacharacters is still matched verbatim.
    digest_pattern = re.compile(re.escape(args.digest_sequence))
    digest_len = len(args.digest_sequence)

    # Step 1: write a simulated single-end read in both orientations for every
    # digest fragment (the span between two consecutive motif starts).
    with gzip.open(fastq_path, 'wt') as out:
        processed = 0  # total bases of all contigs handled so far
        for contig, contig_len in zip(ref.references, ref.lengths):
            if selected_contigs is not None and contig not in selected_contigs:
                continue
            print(f'{contig}\t{contig_len}')
            seq = ref.fetch(contig).upper()
            # Generate fragments: sliding window of size 2 over motif starts.
            for i, (start, end_excluding_catg) in enumerate(more_itertools.windowed(
                    (m.start() for m in digest_pattern.finditer(seq)), 2)):
                # Skip the incomplete trailing window, and stop early when the
                # -head fragment limit has been reached.
                if end_excluding_catg is None or (
                        args.head is not None and i >= (args.head - 1)):
                    continue
                end = end_excluding_catg + digest_len

                # Forward read: fragment 5' end, clipped to the read length.
                forward_read = seq[start:end][:r1_read_length]
                if len(forward_read) >= minlen:
                    out.write(
                        seq_to_fastq(
                            forward_read,
                            f'DR:fwd;ct:{contig};ds:{start};qn:{len(forward_read)}'))
                # Reverse read: reverse complement of the fragment, so it
                # starts from the fragment's 3' end (the terminal motif).
                reverse_read = singlecellmultiomics.utils.reverse_complement(
                    seq[start:end])[:r1_read_length]
                if len(reverse_read) >= minlen:
                    out.write(
                        seq_to_fastq(
                            reverse_read,
                            f'DR:rev;ct:{contig};ds:{end_excluding_catg};qn:{len(reverse_read)}'))

            processed += contig_len

    # Step 2: map the simulated reads back to the reference.
    # NOTE(review): the command interpolates CLI arguments into a shell string;
    # paths with shell metacharacters will break it. Acceptable for an
    # operator-run tool, but subprocess.run([...]) would be safer.
    # The index step is chained with ';' (not '&') so os.system only returns
    # once indexing has finished.
    print("Now mapping ...")
    os.system(
        f"bwa mem -t 4 {args.fasta} {fastq_path} | samtools view -b - > ./{outbam}.unsorted.bam; "
        f"samtools sort -T ./temp_sort -@ 4 ./{outbam}.unsorted.bam > ./{outbam}.unfinished.bam ; "
        f"mv ./{outbam}.unfinished.bam ./{outbam} ; "
        f"samtools index ./{outbam} ; rm {fastq_path}")

    print("Creating site database ...")
    # Per digest site: how many simulated reads were correctly assigned back to
    # it, wrongly gained from another site, or lost to another location.
    sites = {}  # (contig, pos, strand) -> {'correct', 'wrong_gain', 'lost'}

    ref_reads = pysam.AlignmentFile(outbam)
    # Step 3: compare each read's mapped position against the origin site
    # encoded in its query name ('DR:fwd;ct:chr1;ds:1000;qn:69').
    for read in ref_reads:

        if read.is_supplementary:
            continue

        tags = {field.split(':')[0]: field.split(':')[1]
                for field in read.query_name.split(';')}

        site_pos = int(tags['ds'])
        site_chrom = tags['ct']
        strand = (tags['DR'] == 'rev')  # True for the reverse-orientation read

        key = (site_chrom, site_pos, strand)
        if key not in sites:
            sites[key] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}

        if read.is_unmapped:
            # An unmapped read is a loss for its origin site; there is no
            # mapped location to credit a wrong gain to. Without this guard,
            # read.reference_end below is None and would raise.
            sites[key]['lost'] += 1
            continue

        # For reverse reads the encoded site is the start of the fragment's
        # terminal motif, i.e. digest_len bases before the alignment end.
        read_site = read.reference_end - digest_len if read.is_reverse else read.reference_start
        read_contig = read.reference_name

        key_mapped = (read_contig, read_site, read.is_reverse)

        if read_site == site_pos and site_chrom == read_contig and read.is_reverse == strand:
            # Correct assignment
            sites[key]['correct'] += 1
        else:
            # Assign a missing count to the origin site:
            sites[key]['lost'] += 1

            # Assign a wrong annotation to the target site:
            if key_mapped not in sites:
                sites[key_mapped] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}
            sites[key_mapped]['wrong_gain'] += 1

    print("Writing site statistics ...")
    outtab = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.stats.bgzf'
    outtabsafe = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.safe.bgzf'

    # BlockZip requires sorted keys; drop entries with a None contig.
    keys = sorted(
        (contig, pos, strand)
        for (contig, pos, strand) in sites.keys()
        if contig is not None)

    with BlockZip(outtab, 'w') as stats, BlockZip(outtabsafe, 'w') as safe:
        for (contig, pos, strand) in keys:
            measured = sites[(contig, pos, strand)]
            stats.write(
                contig,
                pos,
                strand,
                f'{measured["correct"]}\t{measured["lost"]}\t{measured["wrong_gain"]}')

            # A site is 'safe' when exactly its one simulated read maps back
            # uniquely and correctly.
            if measured['wrong_gain'] == 0 and measured['lost'] == 0 and measured['correct'] == 1:
                safe.write(contig, pos, strand, 'ok')