[45ad7e]: / singlecellmultiomics / fastaProcessing / createMappabilityIndex.py

Download this file

155 lines (127 with data), 5.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python3
import singlecellmultiomics.utils
import pysam
import collections
import numpy as np
import re
import more_itertools
import gzip
import argparse
import uuid
import os
from singlecellmultiomics.utils import BlockZip
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="""In silico digest genome""")
argparser.add_argument(
'fasta',
metavar='fastafile',
type=str,
help='Fasta file to digest')
argparser.add_argument(
'-minlen',
type=int,
default=20,
help='minimum read length')
argparser.add_argument(
'-maxlen',
type=int,
default=69,
help='maximum read length')
argparser.add_argument('-digest_sequence', type=str, default='CATG')
argparser.add_argument('-head', type=int)
argparser.add_argument(
'-contigs',
type=str,
default=None,
help='Comma separated contigs to run analysis for, when None specified all contigs are used')
args = argparser.parse_args()
r1_read_length = args.maxlen
minlen = args.minlen
ref = pysam.FastaFile(args.fasta)
def seq_to_fastq(seq, header):
return f'@{header}\n{seq}\n+\n{"E"*len(seq)}\n'
selected_contigs = None if args.contigs is None else set(
args.contigs.split(','))
fastq_path = f'simulated_{args.digest_sequence}_single_{r1_read_length}_{uuid.uuid4()}.fastq.gz'
outbam = f'simulated_{args.digest_sequence}_single_{r1_read_length}.bam'
with gzip.open(fastq_path, 'wt') as out:
processed = 0
for contig, contig_len in zip(ref.references, ref.lengths):
if selected_contigs is not None and contig not in selected_contigs:
continue
print(f'{contig}\t{contig_len}')
seq = ref.fetch(contig).upper()
frag_locations = np.diff(
[m.start() for m in re.finditer(args.digest_sequence, seq)])
# Generate fragments:
for i, (start, end_excluding_catg) in enumerate(more_itertools.windowed(
(m.start() for m in re.finditer(args.digest_sequence, seq)), 2)):
if end_excluding_catg is None or args.head is not None and i >= (
args.head - 1):
continue
end = end_excluding_catg + len(args.digest_sequence)
forward_read = seq[start:end][:r1_read_length]
if len(forward_read) >= minlen:
out.write(
seq_to_fastq(
forward_read,
f'DR:fwd;ct:{contig};ds:{start};qn:{len(forward_read)}'))
reverse_read = singlecellmultiomics.utils.reverse_complement(seq[start:end])[
:r1_read_length]
if len(reverse_read) >= minlen:
out.write(
seq_to_fastq(
reverse_read,
f'DR:rev;ct:{contig};ds:{end_excluding_catg};qn:{len(reverse_read)}'))
processed += contig_len
# Map the fastq file:
print("Now mapping ...")
os.system(f"bwa mem -t 4 {args.fasta} {fastq_path} | samtools view -b - > ./{outbam}.unsorted.bam; samtools sort -T ./temp_sort -@ 4 ./{outbam}.unsorted.bam > ./{outbam}.unfinished.bam ; mv ./{outbam}.unfinished.bam ./{outbam} ; samtools index ./{outbam} & rm {fastq_path}")
print("Creating site database ...")
# Create site dictionary:
sites = {} # site-> wrongly_assinged_to, correctly_assigned, lost
ref_reads = pysam.AlignmentFile(outbam)
# Perform site detection:
for read in ref_reads:
if read.is_supplementary:
continue
# Check if there is reads mapping to this site
kv = {kv.split(':')[0]: kv.split(':')[1]
for kv in read.query_name.split(';')}
site_pos = int(kv['ds'])
site_chrom = kv['ct']
strand = (kv['DR'] == 'rev')
read_site = read.reference_end - 4 if read.is_reverse else read.reference_start
read_contig = read.reference_name
key = (site_chrom, site_pos, strand)
key_mapped = (read_contig, read_site, read.is_reverse)
if key not in sites:
sites[key] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}
if read_site == site_pos and site_chrom == read_contig and read.is_reverse == strand: # Correct assignment
sites[key]['correct'] += 1
else:
# Assign a missing count to the origin site:
sites[key]['lost'] += 1
# Assing a wrong annotation to the target site:
if key_mapped not in sites:
sites[key_mapped] = {'correct': 0, 'wrong_gain': 0, 'lost': 0}
sites[key_mapped]['wrong_gain'] += 1
print("Writing site statistics ...")
outtab = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.stats.bgzf'
outtabsafe = f'simulated_{args.digest_sequence}_single_{r1_read_length}.mappability.safe.bgzf'
keys = sorted(
[(contig, pos, strand)
for (contig, pos, strand) in sites.keys()
if contig is not None])
with BlockZip(outtab, 'w') as stats, BlockZip(outtabsafe, 'w') as safe:
for (contig, pos, strand) in keys:
measured= sites[(contig, pos, strand)]
stats.write(
contig,
pos,
strand,
f'{measured["correct"]}\t{measured["lost"]}\t{measured["wrong_gain"]}')
if measured['wrong_gain'] == 0 and measured['lost'] == 0 and measured['correct'] == 1:
safe.write(contig, pos, strand, 'ok')