singlecellmultiomics/molecule/molecule.py
from singlecellmultiomics.utils.sequtils import hamming_distance
import pysamiterators.iterators
import singlecellmultiomics.bamProcessing
from singlecellmultiomics.fragment import Fragment
from array import array
import itertools
import numpy as np
from singlecellmultiomics.utils import style_str, prob_to_phred, phredscores_to_base_call, base_probabilities_to_likelihood, likelihood_to_prob
import textwrap
import singlecellmultiomics.alleleTools
import functools
import typing
import pysam
import pysamiterators
from singlecellmultiomics.utils import find_ranges, create_MD_tag
import pandas as pd
from uuid import uuid4
from cached_property import cached_property
from collections import Counter, defaultdict


###############

# Variant validation function
def detect_alleles(molecules,
                   contig,
                   position,
                   min_cell_obs=3,
                   base_confidence_threshold=None,
                   classifier=None):  # [ alleles]
    """
    Detect the alleles (variable bases) present at the selected location

    Args:

        molecules : generator to extract molecules from

        contig (str) : contig of the location to test

        position (int) : zero-based position of the location to test

        min_cell_obs (int) : minimum number of cells in which the allele has to be observed for it to be emitted

        base_confidence_threshold (float) : minimum confidence of the consensus base-call to be taken into account

        classifier (obj) : classifier used for the consensus call; when no classifier is supplied a majority vote is used

    """
    observed_alleles = defaultdict(set)  # base_call -> { cell, .. }
    for molecule in molecules:
        base_call = molecule.get_consensus_base(contig, position, classifier=classifier)

        # confidence = molecule.get_mean_base_quality(*variant_location, base_call)
        if base_call is not None:
            observed_alleles[base_call].add(molecule.sample)

    return [allele for allele, cells in observed_alleles.items() if len(cells) >= min_cell_obs]


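# Example (hypothetical sketch): detecting alleles at a single genomic position.
# The BAM path, contig and coordinate below are placeholders; MoleculeIterator is the
# molecule generator shipped with this package, but the exact iterator arguments may
# differ per workflow.
#
#   import pysam
#   from singlecellmultiomics.molecule import MoleculeIterator
#
#   with pysam.AlignmentFile('tagged.bam') as alignments:
#       molecules = MoleculeIterator(alignments, contig='1', start=1_000_000, end=1_000_001)
#       alleles = detect_alleles(molecules, contig='1', position=1_000_000, min_cell_obs=3)
#       print(alleles)  # e.g. ['A', 'G'] when two alleles pass the cell threshold
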
def get_variant_phase(molecules, contig, position, variant_base, allele_resolver,
                      phasing_ratio_threshold=None):  # (location, base) -> [(location, base, identifier)]
    """Determine to which allele the supplied variant base is phased, using the allele resolver"""
    alleles = [variant_base]
    phases = defaultdict(Counter)  # base -> allele id -> observation count
    for molecule in molecules:
        # allele_obs = molecule.get_allele(return_allele_informative_base_dict=True,allele_resolver=allele_resolver)
        allele = list(molecule.get_allele(allele_resolver))
        if allele is None or len(allele) > 1 or len(allele) == 0:
            continue
        allele = allele[0]

        base = molecule.get_consensus_base(contig, position)
        if base in alleles:
            phases[base][allele] += 1
        else:
            pass

    if len(phases[variant_base]) == 0:
        raise ValueError("Phasing not established, no gSNVs available")

    phased_allele_id = phases[variant_base].most_common(1)[0][0]

    # Check if the phasing noise is below the threshold:
    if phasing_ratio_threshold is not None:
        correct = phases[variant_base].most_common(1)[0][1]
        total = sum(phases[variant_base].values())
        phasing_ratio = correct / total
        if phasing_ratio < phasing_ratio_threshold:
            raise ValueError(f'Phasing ratio not met. ({phasing_ratio}) < {phasing_ratio_threshold}')
    # Check if the other allele i

    return phased_allele_id


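# Example (hypothetical sketch): establishing which allele a variant base is linked to, using
# germline SNVs known to the allele resolver. The inputs below are placeholders.
#
#   phased_allele = get_variant_phase(molecules, contig='1', position=1_000_000,
#                                     variant_base='T', allele_resolver=resolver,
#                                     phasing_ratio_threshold=0.9)
#   # Raises ValueError when no informative gSNVs are covered or the phasing ratio is too low.
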
###############

@functools.lru_cache(maxsize=1000)
def might_be_variant(chrom, pos, variants, ref_base=None):
    """Returns True if a variant exists at the given coordinate"""
    if ref_base == 'N':
        return False
    try:
        for record in variants.fetch(chrom, pos, pos + 1):
            return True
    except ValueError:
        return False  # Happens when the contig does not exist
    return False

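# Example (hypothetical sketch): `variants` is typically an open pysam.VariantFile. Note that the
# function is lru-cached, so the handle has to stay open while the cache is in use.
#
#   import pysam
#   known = pysam.VariantFile('known_sites.vcf.gz')
#   if might_be_variant('1', 1_000_000, known):
#       pass  # for example: exclude this site from consensus training
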
def consensii_default_vector():
    """A numpy vector with 5 elements initialised as zeros"""
    return np.zeros(5)

# primer_read: 1 = read1, 2 = read2
def molecule_to_random_primer_dict(
        molecule,
        primer_length=6,
        primer_read=2,
        max_N_distance=0):
    rp = defaultdict(list)

    # First add all reactions without an N in the sequence:
    for fragment in molecule:

        h_contig, hstart, hseq = fragment.get_random_primer_hash()
        if hseq is None:
            # This should really not happen with freshly demultiplexed data; it means we cannot extract the random primer sequence,
            # which should be present as a tag (rS) in the record
            rp[None, None, None].append(fragment)
        elif 'N' not in hseq:
            rp[h_contig, hstart, hseq].append(fragment)

    # Try to match reactions with an N to reactions without an N
    for fragment in molecule:

        h_contig, hstart, hseq = fragment.get_random_primer_hash()
        if hseq is not None and 'N' in hseq:
            # find nearest
            for other_contig, other_start, other_seq in rp:
                if other_contig != h_contig or other_start != hstart:
                    continue

                if hseq.count('N') > max_N_distance:
                    continue

                if 'N' in other_seq:
                    continue

                if hamming_distance(hseq, other_seq) == 0:
                    rp[other_contig, other_start, other_seq].append(fragment)
    return rp


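# Example (hypothetical sketch): grouping the fragments of a molecule by their random-primer
# start site, for instance to count distinct RT reactions.
#
#   primer_groups = molecule_to_random_primer_dict(molecule)
#   n_rt_reactions = len(primer_groups)
#   for (contig, primer_start, primer_seq), fragments in primer_groups.items():
#       pass  # fragments sharing a random primer originate from the same RT reaction
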
class Molecule():
    """Molecule class, contains one or more associated fragments

    Attributes:
        fragments (list): associated fragments

        spanStart (int): starting coordinate of molecule, None if not available

        spanEnd (int): ending coordinate of molecule, None if not available

        chromosome (str): mapping chromosome of molecule, None if not available

        cache_size (int): radius of molecule assignment cache

        reference (pysam.FastaFile) : reference file, used to obtain base contexts
            and correct aligned_pairs iteration when the MD tag is not correct

        strand (bool): mapping strand.
            True when strand is REVERSE
            False when strand is FORWARD
            None when strand is not determined
    """
    def get_empty_clone(self, fragments=None):
        return type(self)(fragments,
                          cache_size=self.cache_size,
                          reference=self.reference,
                          min_max_mapping_quality=self.min_max_mapping_quality,
                          allele_assingment_method=self.allele_assingment_method,
                          allele_resolver=self.allele_resolver,
                          mapability_reader=self.mapability_reader,
                          max_associated_fragments=self.max_associated_fragments,
                          **self.kwargs)

    def __init__(self,
                 fragments: typing.Optional[typing.Iterable] = None,
                 cache_size: int = 10_000,
                 reference: typing.Union[pysam.FastaFile, pysamiterators.CachedFasta] = None,
                 # When all fragments have a mapping quality below this value
                 # the is_valid method will return False,
                 min_max_mapping_quality: typing.Optional[int] = None,
                 mapability_reader: typing.Optional[singlecellmultiomics.bamProcessing.MapabilityReader] = None,
                 allele_resolver: typing.Optional[singlecellmultiomics.alleleTools.AlleleResolver] = None,
                 max_associated_fragments=None,
                 allele_assingment_method=1,  # 0: all variants from the same allele, 1: likelihood
                 **kwargs

                 ):
        """Initialise Molecule

        Parameters
        ----------
        fragments :  list(singlecellmultiomics.fragment.Fragment)
            Fragment to assign to Molecule. More fragments can be added later

        min_max_mapping_quality : When all fragments have a mapping quality below this value the is_valid method will return False

        allele_resolver : alleleTools.AlleleResolver or None. Supply an allele resolver in order to assign an allele to the molecule

        mapability_reader : singlecellmultiomics.bamProcessing.MapabilityReader, supply a mapability_reader to set the mapping quality to 0 for molecules mapping to locations which do not map uniquely during in-silico library generation.

        cache_size (int): radius of molecule assignment cache

        max_associated_fragments(int) : Maximum number of fragments associated to the molecule. If more fragments are added using add_fragment() they are not added to the molecule

        """
        self.kwargs = kwargs
        self.reference = reference
        self.fragments = []
        self.spanStart = None
        self.spanEnd = None
        self.chromosome = None
        self.cache_size = cache_size
        self.strand = None
        self.umi = None
        self.overflow_fragments = 0
        self.umi_hamming_distance = None
        # when set, a fragment to be added has to match this hash
        # when it is compared to this molecule
        self.fragment_match = None
        self.min_max_mapping_quality = min_max_mapping_quality
        self.umi_counter = Counter()  # Observations of umis
        self.max_associated_fragments = max_associated_fragments
        if fragments is not None:
            if isinstance(fragments, list):
                for frag in fragments:
                    self.add_fragment(frag)
            else:
                self.add_fragment(fragments)

        self.allele_resolver = allele_resolver
        self.mapability_reader = mapability_reader
        self.allele_assingment_method = allele_assingment_method
        self.methylation_call_dict = None
        self.finalised = False
        self.obtained_allele_likelihoods = None

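    # Example (hypothetical sketch): building a molecule from a fragment and adding more later.
    # `fragment_a` and `fragment_b` are placeholders for singlecellmultiomics Fragment objects.
    #
    #   molecule = Molecule(fragment_a, min_max_mapping_quality=20)
    #   molecule.add_fragment(fragment_b)
    #   molecule.write_tags()
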
    @cached_property
    def can_be_split_into_allele_molecules(self):
        """True when likelihoods for more than one allele are available, so the molecule can be split"""
        l = self.allele_likelihoods
        if l is None or len(l) <= 1:
            return False
        return True

    def split_into_allele_molecules(self):
        """
        Split this molecule into multiple molecules, associated to multiple alleles
        Returns:
            list_of_molecules: list
        """
        # Perform allele based clustering
        allele_clustered_frags = {}
        for fragment in self:
            n = self.get_empty_clone(fragment)
            if n.allele not in allele_clustered_frags:
                allele_clustered_frags[n.allele] = []
            allele_clustered_frags[n.allele].append(n)

        allele_clustered = {}
        for allele, assigned_molecules in allele_clustered_frags.items():
            for i, m in enumerate(assigned_molecules):
                if i == 0:
                    allele_clustered[allele] = m
                else:
                    allele_clustered[allele].add_molecule(m)

        if len(allele_clustered) > 1:
            for m in allele_clustered.values():
                m.set_meta('cr', 'SplitUponAlleleClustering')

        return list(allele_clustered.values())

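    # Example (hypothetical sketch): resolving a molecule that mixes fragments from both alleles.
    #
    #   if molecule.can_be_split_into_allele_molecules:
    #       for allele_molecule in molecule.split_into_allele_molecules():
    #           allele_molecule.write_tags()
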
    @cached_property
    def allele(self):
        if self.allele_resolver is None:
            return None
        if self.allele_assingment_method == 0:
            # Obtain allele if available
            try:
                hits = self.get_allele(self.allele_resolver)
                # Only return when we have a unique single hit:
                if len(hits) == 1:
                    return list(hits)[0]
            except ValueError:
                # This happens when a consensus can not be obtained
                pass
            return None
        elif self.allele_assingment_method == 1:
            al = Counter(self.allele_likelihoods)
            if al is None or len(al) < 1:
                return None
            return al.most_common(1)[0][0]

        raise NotImplementedError(f'allele_assingment_method {self.allele_assingment_method} is not defined')

    def __finalise__(self):
        """This function is called when all associated fragments have been gathered"""

        # Perform allele assignment based on likelihood:
        # this is now only generated upon demand, see the .allele property

        if self.mapability_reader is not None:
            self.update_mapability()

        self.finalised = True

    def update_mapability(self, set_mq_zero=False):
        """ Update mapability of this molecule.
        Mapping qualities are set to 0 if the mapability_reader returns False
        for site_is_mapable

        The mapability_reader can be set when initiating the molecule, or added later.

        Args:
            set_mq_zero(bool) : set the mapping quality of associated reads to 0 when the
                mapability reader returns a bad verdict

        Tip:
            Use `createMapabilityIndex.py` to create an index to feed to the mapability_reader
        """

        mapable = None
        try:
            mapable = self.mapability_reader.site_is_mapable(
                *self.get_cut_site())
        except TypeError:
            pass
        except Exception:
            raise

        if mapable is False:
            self.set_meta('mp', 'bad')
            if set_mq_zero:
                for read in self.iter_reads():
                    read.mapping_quality = 0
        elif mapable is True:
            self.set_meta('mp', 'unique')
        else:
            self.set_meta('mp', 'unknown')

    def calculate_consensus(self, consensus_model, molecular_identifier, out, **model_kwargs):
        """
        Create consensus read for molecule

        Args:

            consensus_model : classifier used to generate the consensus base calls

            molecular_identifier (str) : identifier for this molecule, will be suffixed to the reference_id

            out(pysam.AlignmentFile) : target bam file

            **model_kwargs : arguments passed to the consensus model

        """
        try:
            consensus_reads = self.deduplicate_to_single_CIGAR_spaced(
                out,
                f'c_{self.get_a_reference_id()}_{molecular_identifier}',
                consensus_model,
                NUC_RADIUS=model_kwargs['consensus_k_rad']
            )
            for consensus_read in consensus_reads:
                consensus_read.set_tag('RG', self[0].get_read_group())
                consensus_read.set_tag('mi', molecular_identifier)
                out.write(consensus_read)

        except Exception:

            self.set_rejection_reason('CONSENSUS_FAILED', set_qcfail=True)
            self.write_pysam(out)

    def get_a_reference_id(self):
        """
        Obtain the reference id of an arbitrary associated mapped read, or -1 when no mapped read is present
        """
        for read in self.iter_reads():
            if not read.is_unmapped:
                return read.reference_id
        return -1

    def get_consensus_read(self, target_file,
                           read_name, consensus=None,
                           phred_scores=None,
                           cigarstring=None,
                           mdstring=None,
                           start=None,
                           supplementary=False

                           ):
        """get pysam.AlignedSegment containing aggregated molecule information

        Args:
            target_file(pysam.AlignmentFile) : File to create the read for

            read_name(str) : name of the read to write
        Returns:
            read(pysam.AlignedSegment)
        """
        if start is None:
            start = self.spanStart
        if consensus is None:
            try:  # Obtain the consensus sequence
                consensus = self.get_consensus()
            except Exception:
                raise
        if isinstance(consensus, str):
            sequence = consensus
        else:
            sequence = ''.join(
                (consensus.get(
                    (self.chromosome, ref_pos), 'N') for ref_pos in range(
                    self.spanStart, self.spanEnd + 1)))

        if isinstance(phred_scores, dict):
            phred_score_array = list(
                phred_scores.get(
                    (self.chromosome, ref_pos), 0) for ref_pos in range(
                    self.spanStart, self.spanEnd + 1))
        else:
            phred_score_array = phred_scores

        # Construct the consensus read
        cread = pysam.AlignedSegment(header=target_file.header)
        cread.reference_name = self.chromosome
        cread.reference_start = start
        cread.query_name = read_name
        cread.query_sequence = sequence
        cread.query_qualities = phred_score_array
        cread.is_supplementary = supplementary
        if cigarstring is not None:
            cread.cigarstring = cigarstring
        else:
            cread.cigarstring = f'{len(sequence)}M'
        cread.mapping_quality = self.get_max_mapping_qual()

        cread.is_reverse = self.strand
        if mdstring is not None:
            cread.set_tag('MD', mdstring)

        self.write_tags_to_psuedoreads((cread,))

        return cread

    """ method = 1
        sequence = []
        cigar = []
        if method==0:
            prev_end = None
            for block_start,block_end in molecule.get_aligned_blocks():
                if molecule.strand:
                    print(block_end>block_start,block_start, block_end)
                if prev_end is not None:
                    cigar.append(f'{block_start - prev_end}D')

                block_len = block_end-block_start+1
                cigar.append(f'{block_len}M')
                for ref_pos in range(block_start,block_end+1):
                    call = consensus.get((molecule.chromosome, ref_pos),'N')
                    sequence.append(call)
                prev_end = block_end+1

            cigarstring = ''.join(cigar)
        """

    def get_feature_vector(self, window_size=90):
        """ Obtain a feature vector representation of the molecule

        Returns:
            feature_vector(np.array)
        """

        return np.array([
            self.get_strand(),
            self.has_valid_span(),
            self.get_umi_error_rate(),
            self.get_consensus_gc_ratio(),
            len(self.get_raw_barcode_sequences()),
            self.get_safely_aligned_length(),
            self.get_max_mapping_qual(),
            (self.alleles is None),
            self.contains_valid_fragment(),
            self.is_multimapped(),
            self.get_feature_window(window_size=window_size)
        ])

    def get_tag_counter(self):
        """
        Obtain a dictionary with tag -> value -> frequency

        Returns:
            tag_obs (defaultdict(Counter)):
                { tag(str) : { value(int/str) : frequency (int) } }

        """
        tags_obs = defaultdict(Counter)
        for tag, value in itertools.chain(
                *[r.tags for r in self.iter_reads()]):
            try:
                tags_obs[tag][value] += 1
            except TypeError:
                # Don't count unhashable values such as arrays
                pass
        return tags_obs

    def write_tags(self):
        """ Write BAM tags to all reads associated to this molecule

        This function sets the following tags:
            - mI : most common umi
            - DA : allele
            - af : number of associated fragments
            - rt : rt_reaction_index
            - rd : rt_duplicate_index
            - TR : total RT reactions
            - ap : phasing information (if allele_resolver is set)
            - TF : total fragments
            - ms : size of the molecule (largest fragment)
        """
        self.is_valid(set_rejection_reasons=True)
        if self.umi is not None:
            self.set_meta('mI', self.umi)
        if self.allele is not None:
            self.set_meta('DA', str(self.allele))

        # Set total number of associated fragments
        self.set_meta('TF', len(self.fragments) + self.overflow_fragments)
        try:
            self.set_meta('ms', self.estimated_max_length)
        except Exception:
            # There is no properly defined aligned length
            pass
        # associatedFragmentCount :
        self.set_meta('af', len(self))
        for rc, frag in enumerate(self):
            frag.set_meta('RC', rc)
            if rc > 0:
                # Set duplicate bit
                for read in frag:
                    if read is not None:
                        read.is_duplicate = True

        # Write RT reaction tags (rt: rt reaction index, rd: rt duplicate index)
        # This is only required for fragments which have defined random primers
        rt_reaction_index = None
        for rt_reaction_index, ((contig, random_primer_start, random_primer_sequence), frags) in enumerate(
                self.get_rt_reactions().items()):

            for rt_duplicate_index, frag in enumerate(frags):
                frag.set_meta('rt', rt_reaction_index)
                frag.set_meta('rd', rt_duplicate_index)
                frag.set_meta('rp', random_primer_start)
        self.set_meta('TR', 0 if (rt_reaction_index is None) else rt_reaction_index + 1)

        if self.allele_resolver is not None:
            self.write_allele_phasing_information_tag()

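    # Example (hypothetical sketch): write the molecule-level tags and then write all associated
    # reads to an output BAM file (`out_bam` is an open pysam.AlignmentFile in write mode).
    #
    #   molecule.write_tags()
    #   molecule.write_pysam(out_bam)
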
    def write_tags_to_psuedoreads(self, reads):
        """
        Write molecule information to the supplied reads as BAM tags
        """
        # write methylation tags to new reads if applicable:
        if self.methylation_call_dict is not None:
            self.set_methylation_call_tags(
                self.methylation_call_dict, reads=reads)

        for read in reads:
            read.set_tag('SM', self.sample)
            if hasattr(self, 'get_cut_site'):
                read.set_tag('DS', self.get_cut_site()[1])

            if self.umi is not None:
                read.set_tag('RX', self.umi)
                bc = list(self.get_barcode_sequences())[0]
                read.set_tag('BC', bc)
                read.set_tag('MI', bc + self.umi)

            # Store total number of RT reactions:
            read.set_tag('TR', len(self.get_rt_reactions()))
            read.set_tag('TF', len(self.fragments) + self.overflow_fragments)

            if self.allele is not None:
                read.set_tag('DA', self.allele)

        if self.allele_resolver is not None:
            self.write_allele_phasing_information_tag(
                self.allele_resolver, reads=reads)

    def deduplicate_to_single(
            self,
            target_bam,
            read_name,
            classifier,
            reference=None):
        """
        Deduplicate all reads associated to this molecule to a single pseudoread

        Args:
            target_bam (pysam.AlignmentFile) : file to associate the read with
            read_name (str) : name of the pseudoread
            classifier (sklearn classifier) : classifier for consensus prediction

        Returns:
            read (pysam.AlignedSegment) : Pseudo-read containing aggregated information
        """
        # Set all associated reads to duplicate
        for read in self.iter_reads():
            read.is_duplicate = True

        features = self.get_base_calling_feature_matrix(reference=reference)

        # We only use the predicted class probabilities:
        base_calling_probs = classifier.predict_proba(features)
        predicted_sequence = ['ACGT'[i] for i in np.argmax(base_calling_probs, 1)]
        phred_scores = np.rint(-10 * np.log10(np.clip(1 - base_calling_probs.max(1), 0.000000001, 0.999999))).astype(
            'B')

        read = self.get_consensus_read(
            read_name=read_name,
            target_file=target_bam,
            consensus=''.join(predicted_sequence),
            phred_scores=phred_scores)
        read.is_read1 = True
        return read

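    # Example (hypothetical sketch): collapsing a molecule into one consensus pseudoread with a
    # trained base-calling classifier. `clf` is a fitted sklearn classifier and `out_bam` an open
    # pysam.AlignmentFile in write mode.
    #
    #   consensus = molecule.deduplicate_to_single(out_bam, f'consensus_{molecule.umi}', clf)
    #   out_bam.write(consensus)
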
    def deduplicate_to_single_CIGAR_spaced(
            self,
            target_bam,
            read_name,
            classifier=None,
            max_N_span=300,
            reference=None,
            **feature_matrix_args
    ):
        """
        Deduplicate all associated reads to a single pseudoread; when the span is larger than max_N_span
        the read is split up into multiple segments. Uncovered locations are spaced using N's in the CIGAR.

        Args:
            target_bam (pysam.AlignmentFile) : file to associate the read with
            read_name (str) : name of the pseudoread
            classifier (sklearn classifier) : classifier for consensus prediction
        Returns:
            reads( list [ pysam.AlignedSegment ] )

        """
        # Set all associated reads to duplicate
        for read in self.iter_reads():
            read.is_duplicate = True

        if classifier is not None:
            features, reference_bases, CIGAR, alignment_start, alignment_end = self.get_base_calling_feature_matrix_spaced(
                True, reference=reference, **feature_matrix_args)

            base_calling_probs = classifier.predict_proba(features)
            predicted_sequence = ['ACGT'[i] for i in np.argmax(base_calling_probs, 1)]

            reference_sequence = ''.join(
                [base for chrom, pos, base in reference_bases])
            # predicted_sequence[ features[:, [ x*8 for x in range(4) ] ].sum(1)==0 ] ='N'
            predicted_sequence = ''.join(predicted_sequence)

            phred_scores = np.rint(
                -10 * np.log10(np.clip(1 - base_calling_probs.max(1),
                                       0.000000001,
                                       0.999999)
                               )).astype('B')

        reads = []

        query_index_start = 0
        query_index_end = 0
        reference_position = alignment_start  # pointer to current position
        reference_start = alignment_start  # pointer to alignment start of current read
        supplementary = False
        partial_CIGAR = []
        partial_MD = []

        for operation, amount in CIGAR:
            if operation == 'M':  # Consume query and reference
                query_index_end += amount
                reference_position += amount
                partial_CIGAR.append(f'{amount}{operation}')

            if operation == 'N':
                # Consume reference:
                reference_position += amount
                if amount > max_N_span:  # Split up in a supplementary alignment
                    # Eject previous
                    # reference_seq =

                    consensus_read = self.get_consensus_read(
                        read_name=read_name,
                        target_file=target_bam,
                        consensus=predicted_sequence[query_index_start:query_index_end],
                        phred_scores=phred_scores[query_index_start:query_index_end],
                        cigarstring=''.join(partial_CIGAR),
                        mdstring=create_MD_tag(
                            reference_sequence[query_index_start:query_index_end],
                            predicted_sequence[query_index_start:query_index_end]
                        ),
                        start=reference_start,
                        supplementary=supplementary
                    )
                    reads.append(consensus_read)
                    if not supplementary:
                        consensus_read.is_read1 = True

                    supplementary = True
                    # Start new:
                    query_index_start = query_index_end
                    reference_start = reference_position
                    partial_CIGAR = []
                else:
                    partial_CIGAR.append(f'{amount}{operation}')

        reads.append(self.get_consensus_read(
            read_name=read_name,
            target_file=target_bam,
            consensus=''.join(predicted_sequence[query_index_start:query_index_end]),
            phred_scores=phred_scores[query_index_start:query_index_end],
            cigarstring=''.join(partial_CIGAR),
            mdstring=create_MD_tag(
                reference_sequence[query_index_start:query_index_end],
                predicted_sequence[query_index_start:query_index_end]

            ),
            start=reference_start,
            supplementary=supplementary
        ))

        # Write last index tag to last read ..
        if supplementary:
            reads[-1].is_read2 = True

        # Write NH tag (the number of records with the same query read):
        for read in reads:
            read.set_tag('NH', len(reads))

        return reads

    def extract_stretch_from_dict(self, base_call_dict, alignment_start, alignment_end):
        """Extract the base calls and Phred scores for [alignment_start, alignment_end) from a (contig, pos) -> (base, confidence) dictionary"""
        base_calling_probs = np.array(
            [base_call_dict.get((self.chromosome, pos), ('N', 0))[1] for pos in range(alignment_start, alignment_end)])
        predicted_sequence = [base_call_dict.get((self.chromosome, pos), ('N', 0))[0] for pos in
                              range(alignment_start, alignment_end)]
        predicted_sequence = ''.join(predicted_sequence)
        phred_scores = np.rint(
            -10 * np.log10(np.clip(1 - base_calling_probs,
                                   0.000000001,
                                   0.999999999)
                           )).astype('B')
        return predicted_sequence, phred_scores

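    # The conversion above maps the probability of a correct call to a Phred score,
    # Q = -10 * log10(1 - p), clipped to avoid infinities. A minimal, self-contained check:
    #
    #   import numpy as np
    #   p = np.array([0.9, 0.99, 0.999])
    #   q = np.rint(-10 * np.log10(np.clip(1 - p, 1e-9, 0.999999999))).astype('B')
    #   # q == array([10, 20, 30], dtype=uint8)
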
    def get_base_confidence_dict(self):
        """
        Get dictionary containing base calls per position and the corresponding confidences

        Returns:
            obs (dict) : (contig (str), position (int)) -> base (str) -> list of probabilities that the call is correct
        """
        # Convert (contig, position) -> (base_call) into:
        # (contig, position) -> (base_call, confidence)
        obs = defaultdict(lambda: defaultdict(list))
        for read in self.iter_reads():
            for qpos, rpos in read.get_aligned_pairs(matches_only=True):
                qbase = read.seq[qpos]
                qqual = read.query_qualities[qpos]
                # @todo: reads which span multiple chromosomes
                obs[(self.chromosome, rpos)][qbase].append(1 - np.power(10, -qqual / 10))
        return obs

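    # Example (hypothetical sketch): inspecting the per-position evidence of a molecule. Each
    # entry maps (contig, position) to the observed bases and the probability that each
    # observation is correct, derived from the read Phred scores.
    #
    #   obs = molecule.get_base_confidence_dict()
    #   for (contig, pos), base_probs in obs.items():
    #       base, confidence = phredscores_to_base_call(base_probs)
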
    def deduplicate_majority(self, target_bam, read_name, max_N_span=None):
        """Deduplicate all reads associated to this molecule into pseudoread(s) using a majority vote per position"""

        obs = self.get_base_confidence_dict()

        reads = list(self.get_dedup_reads(read_name,
                                          target_bam,
                                          obs={reference_position: phredscores_to_base_call(probs)
                                               for reference_position, probs in obs.items()},
                                          max_N_span=max_N_span))
        self.write_tags_to_psuedoreads([read for read in reads if read is not None])
        return reads

    def generate_partial_reads(self, obs, max_N_span=None):
        """Yield (reference_start, reference_end, partial_sequence, partial_phred, partial_CIGAR, partial_MD) tuples describing the consensus read segments of this molecule"""
        CIGAR, alignment_start, alignment_end = self.get_CIGAR()
        query_index_start = 0
        query_index_end = 0
        reference_position = alignment_start  # pointer to current position
        reference_start = alignment_start  # pointer to alignment start of current read
        reference_end = None
        partial_CIGAR = []
        partial_MD = []
        partial_sequence = []
        partial_phred = []

        for operation, amount in CIGAR:
            if operation == 'N':
                if max_N_span is not None and amount > max_N_span:
                    yield reference_start, reference_end, partial_sequence, partial_phred, partial_CIGAR, partial_MD
                    # Clear all
                    partial_CIGAR = []
                    partial_MD = []
                    partial_sequence = []
                    partial_phred = []
                else:
                    # Increment
                    partial_CIGAR.append(f'{amount}{operation}')
                    query_index_start += sum((len(s) for s in partial_sequence))

                reference_position += amount
            elif operation == 'M':  # Consume query and reference

                query_index_end += amount
                if len(partial_CIGAR) == 0:
                    reference_start = reference_position
                start_fetch = reference_position

                reference_position += amount
                reference_end = reference_position
                partial_CIGAR.append(f'{amount}{operation}')

                predicted_sequence, phred_scores = self.extract_stretch_from_dict(obs, start_fetch, reference_end)  # [start .. end)

                partial_sequence.append(predicted_sequence)
                partial_phred.append(phred_scores)

        yield reference_start, reference_end, partial_sequence, partial_phred, partial_CIGAR, partial_MD

    def get_dedup_reads(self, read_name, target_bam, obs, max_N_span=None):
        if self.chromosome is None:
            return None  # We cannot perform this action
        for reference_start, reference_end, partial_sequence, partial_phred, partial_CIGAR, partial_MD in self.generate_partial_reads(
                obs, max_N_span=max_N_span):
            consensus_read = self.get_consensus_read(
                read_name=read_name,
                target_file=target_bam,
                consensus=''.join(partial_sequence),
                phred_scores=array('B', np.concatenate(partial_phred)),  # Needs to be cast to array
                cigarstring=''.join(partial_CIGAR),
                mdstring=create_MD_tag(
                    self.reference.fetch(self.chromosome, reference_start, reference_end),
                    ''.join(partial_sequence)
                ),
                start=reference_start,
                supplementary=False
            )

            consensus_read.is_reverse = self.strand
            yield consensus_read

    def deduplicate_to_single_CIGAR_spaced_from_dict(
            self,
            target_bam,
            read_name,
            base_call_dict,  # (contig, position) -> (base_call, confidence)
            max_N_span=300,
    ):
        """
        Deduplicate all associated reads to a single pseudoread; when the span is larger than max_N_span
        the read is split up into multiple segments. Uncovered locations are spaced using N's in the CIGAR.

        Args:
            target_bam (pysam.AlignmentFile) : file to associate the read with
            read_name (str) : name of the pseudoread
            base_call_dict (dict) : (contig, position) -> (base_call, confidence) mapping used for the consensus
        Returns:
            reads( list [ pysam.AlignedSegment ] )

        """
        # Set all associated reads to duplicate
        for read in self.iter_reads():
            read.is_duplicate = True

        CIGAR, alignment_start, alignment_end = self.get_CIGAR()

        reads = []

        query_index_start = 0
        query_index_end = 0
        reference_position = alignment_start  # pointer to current position
        reference_start = alignment_start  # pointer to alignment start of current read
        reference_end = None
        supplementary = False
        partial_CIGAR = []
        partial_MD = []

        partial_sequence = []
        partial_phred = []

        for operation, amount in CIGAR:

            if operation == 'N':
                # Pop the previous read..
                if len(partial_sequence):
                    assert reference_end is not None
                    consensus_read = self.get_consensus_read(
                        read_name=read_name,
                        target_file=target_bam,
                        consensus=''.join(partial_sequence),
                        phred_scores=array('B', np.concatenate(partial_phred)),
                        cigarstring=''.join(partial_CIGAR),
                        mdstring=create_MD_tag(
                            self.reference.fetch(self.chromosome, reference_start, reference_end),
                            ''.join(partial_sequence)
                        ),
                        start=reference_start,
                        supplementary=supplementary
                    )
                    reads.append(consensus_read)
                    if not supplementary:
                        consensus_read.is_read1 = True

                    supplementary = True
                    reference_start = reference_position
                    partial_CIGAR = []
                    partial_phred = []
                    partial_sequence = []

                # Consume reference:
                reference_position += amount
                partial_CIGAR.append(f'{amount}{operation}')

            if operation == 'M':  # Consume query and reference
                query_index_end += amount
                # This should only be reset upon a new read:
                if len(partial_CIGAR) == 0:
                    reference_start = reference_position
                reference_position += amount
                reference_end = reference_position

                partial_CIGAR.append(f'{amount}{operation}')

                predicted_sequence, phred_scores = self.extract_stretch_from_dict(base_call_dict, reference_start,
                                                                                  reference_end)  # [start .. end)

                partial_sequence.append(predicted_sequence)
                partial_phred.append(phred_scores)

        consensus_read = self.get_consensus_read(
            read_name=read_name,
            target_file=target_bam,
            consensus=''.join(partial_sequence),
            phred_scores=array('B', np.concatenate(partial_phred)),
            cigarstring=''.join(partial_CIGAR),
            mdstring=create_MD_tag(
                self.reference.fetch(self.chromosome, reference_start, reference_end),
                ''.join(partial_sequence)
            ),
            start=reference_start,
            supplementary=supplementary
        )
        reads.append(consensus_read)
        if not supplementary:
            consensus_read.is_read1 = True

        supplementary = True
        reference_start = reference_position
        partial_CIGAR = []

        # Write last index tag to last read ..
        if supplementary:
            reads[-1].is_read2 = True
            reads[0].is_read1 = True

        # Write NH tag (the number of records with the same query read):
        for read in reads:
            read.set_tag('NH', len(reads))

        return reads

    def get_base_calling_feature_matrix(
            self,
            return_ref_info=False,
            start=None,
            end=None,
            reference=None,
            NUC_RADIUS=1,
            USE_RT=True,
            select_read_groups=None):
        """
        Obtain feature matrix for base calling

        Args:
            return_ref_info (bool) : return both X and an array with feature information
            start (int) : start of range, genomic position
            end (int) : end of range (inclusive), genomic position
            reference(pysam.FastaFile) : reference to fetch reference bases from, if not supplied the MD tag is used
            NUC_RADIUS(int) : include the features of this many neighbouring positions around the target nucleotide (k-mer context)
            USE_RT(bool) : use RT reaction features
            select_read_groups(set) : only use reads from these read groups to generate features
        """
        if start is None:
            start = self.spanStart
        if end is None:
            end = self.spanEnd

        with np.errstate(divide='ignore', invalid='ignore'):
            BASE_COUNT = 5
            RT_INDEX = 7 if USE_RT else None
            STRAND_INDEX = 0
            PHRED_INDEX = 1
            RC_INDEX = 2
            MATE_INDEX = 3
            CYCLE_INDEX = 4
            MQ_INDEX = 5
            FS_INDEX = 6

            COLUMN_OFFSET = 0
            features_per_block = 8 - (not USE_RT)

            origin_start = start
            origin_end = end

            end += NUC_RADIUS
            start -= NUC_RADIUS

            features = np.zeros(
                (end - start + 1, (features_per_block * BASE_COUNT) + COLUMN_OFFSET))

            if return_ref_info:
                ref_bases = {}

            for rt_id, fragments in self.get_rt_reactions().items():
                # we need to keep track of which positions were covered by this RT
                # reaction
                RT_reaction_coverage = set()  # (pos, base_call)
                for fragment in fragments:
                    for read in fragment:
                        if select_read_groups is not None:
                            if not read.has_tag('RG'):
                                raise ValueError(
                                    "Not all reads in the BAM file have a read group defined.")
                            if not read.get_tag('RG') in select_read_groups:
                                continue
                        # Skip reads outside range
                        if read is None or read.reference_start > (
                                end + 1) or read.reference_end < start:
                            continue
                        for cycle, q_pos, ref_pos, ref_base in pysamiterators.ReadCycleIterator(
                                read, matches_only=True, with_seq=True, reference=reference):

                            row_index = ref_pos - start
                            if row_index < 0 or row_index >= features.shape[0]:
                                continue

                            query_base = read.seq[q_pos]
                            # Base index block:
                            block_index = 'ACGTN'.index(query_base)

                            # Update rt_reactions
                            if USE_RT:
                                if (ref_pos, query_base) not in RT_reaction_coverage:
                                    features[row_index][RT_INDEX +
                                                        COLUMN_OFFSET +
                                                        features_per_block *
                                                        block_index] += 1
                                RT_reaction_coverage.add((ref_pos, query_base))

                            # Update total phred score
                            features[row_index][PHRED_INDEX +
                                                COLUMN_OFFSET +
                                                features_per_block *
                                                block_index] += read.query_qualities[q_pos]

                            # Update total reads

                            features[row_index][RC_INDEX + COLUMN_OFFSET +
                                                features_per_block * block_index] += 1

                            # Update mate index
                            features[row_index][MATE_INDEX +
                                                COLUMN_OFFSET +
                                                features_per_block *
                                                block_index] += read.is_read2

                            # Update fragment sizes:
                            features[row_index][FS_INDEX +
                                                COLUMN_OFFSET +
                                                features_per_block *
                                                block_index] += abs(fragment.span[1] -
                                                                    fragment.span[2])

                            # Update cycle
                            features[row_index][CYCLE_INDEX +
                                                COLUMN_OFFSET +
                                                features_per_block *
                                                block_index] += cycle

                            # Update MQ:
                            features[row_index][MQ_INDEX +
                                                COLUMN_OFFSET +
                                                features_per_block *
                                                block_index] += read.mapping_quality

                            # update strand:
                            features[row_index][STRAND_INDEX +
                                                COLUMN_OFFSET +
                                                features_per_block *
                                                block_index] += read.is_reverse

                            if return_ref_info:
                                row_index_in_output = ref_pos - origin_start
                                if row_index_in_output < 0 or row_index_in_output >= origin_end - origin_start + 1:
                                    continue

                                ref_bases[ref_pos] = ref_base.upper()

            # Normalize all and return

            for block_index in range(BASE_COUNT):  # ACGTN
                for index in (
                        PHRED_INDEX,
                        MATE_INDEX,
                        CYCLE_INDEX,
                        MQ_INDEX,
                        FS_INDEX,
                        STRAND_INDEX):
                    features[:, index +
                                COLUMN_OFFSET +
                                features_per_block *
                                block_index] /= features[:, RC_INDEX +
                                                            COLUMN_OFFSET +
                                                            features_per_block *
                                                            block_index]
            # np.nan_to_num( features, nan=-1, copy=False )
            features[np.isnan(features)] = -1

            if NUC_RADIUS > 0:
                # duplicate columns in a shifted manner
                x = features
                features = np.zeros(
                    (x.shape[0] - NUC_RADIUS * 2, x.shape[1] * (1 + NUC_RADIUS * 2)))
                for offset in range(0, NUC_RADIUS * 2 + 1):
                    slice_start = offset
                    slice_end = -(NUC_RADIUS * 2) + offset
                    if slice_end == 0:
                        features[:, features_per_block *
                                    BASE_COUNT *
                                    offset:features_per_block *
                                           BASE_COUNT *
                                           (offset +
                                            1)] = x[slice_start:, :]
                    else:
                        features[:, features_per_block *
                                    BASE_COUNT *
                                    offset:features_per_block *
                                           BASE_COUNT *
                                           (offset +
                                            1)] = x[slice_start:slice_end, :]

            if return_ref_info:
                ref_info = [
                    (self.chromosome, ref_pos, ref_bases.get(ref_pos, 'N'))
                    for ref_pos in range(origin_start, origin_end + 1)]
                return features, ref_info
            return features

    def get_CIGAR(self, reference=None):
        """ Get the CIGAR representation of the aligned blocks of this molecule

        Returns:
            CIGAR (list) : list of (operation, count) tuples, where 'M' blocks are spaced by 'N' gaps
            alignment_start (int) : first aligned reference position
            alignment_end (int) : last aligned reference position
        """

        CIGAR = []
        prev_end = None
        alignment_start = None
        alignment_end = None
        for start, end in self.get_aligned_blocks():

            if prev_end is not None:
                CIGAR.append(('N', start - prev_end - 1))
            CIGAR.append(('M', (end - start + 1)))
            prev_end = end

            if alignment_start is None:
                alignment_start = start
                alignment_end = end
            else:
                alignment_start = min(alignment_start, start)
                alignment_end = max(alignment_end, end)

        return CIGAR, alignment_start, alignment_end

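    # Illustration (hypothetical numbers): two aligned blocks [100, 149] and [160, 209] yield
    # CIGAR tuples [('M', 50), ('N', 10), ('M', 50)] with alignment_start=100 and
    # alignment_end=209; the 'N' operation spaces the uncovered reference positions.
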
    @functools.lru_cache(maxsize=4)
    def get_base_calling_feature_matrix_spaced(
            self,
            return_ref_info=False,
            reference=None,
            **feature_matrix_args):
        """
        Obtain a base-calling feature matrix for all reference aligned bases.

        Args:
            reference(pysam.FastaFile) : reference to fetch reference bases from, if not supplied the MD tag is used

        Returns:
            X : feature matrix
            y : reference bases (only returned when return_ref_info is True)
            CIGAR : alignment of the feature matrix to the reference, as (operation, count) tuples
            alignment_start (int) : first aligned reference position
            alignment_end (int) : last aligned reference position
        """

        X = None
        if return_ref_info:
            y = []
        CIGAR = []
        prev_end = None
        alignment_start = None
        alignment_end = None
        for start, end in self.get_aligned_blocks():
            if return_ref_info:
                x, y_ = self.get_base_calling_feature_matrix(
                    return_ref_info=return_ref_info, start=start, end=end,
                    reference=reference, **feature_matrix_args
                )
                y += y_
            else:
                x = self.get_base_calling_feature_matrix(
                    return_ref_info=return_ref_info,
                    start=start,
                    end=end,
                    reference=reference,
                    **feature_matrix_args)
            if X is None:
                X = x
            else:
                X = np.append(X, x, axis=0)

            if prev_end is not None:
                CIGAR.append(('N', start - prev_end - 1))
            CIGAR.append(('M', (end - start + 1)))
            prev_end = end

            if alignment_start is None:
                alignment_start = start
                alignment_end = end
            else:
                alignment_start = min(alignment_start, start)
                alignment_end = max(alignment_end, end)

        if return_ref_info:
            return X, y, CIGAR, alignment_start, alignment_end
        else:
            return X, CIGAR, alignment_start, alignment_end

    def get_base_calling_training_data(
            self,
            mask_variants=None,
            might_be_variant_function=None,
            reference=None,
            **feature_matrix_args):
        """Obtain an (X, y) training pair for base calling, where y contains the reference bases and positions overlapping mask_variants are excluded"""
        if mask_variants is not None and might_be_variant_function is None:
            might_be_variant_function = might_be_variant

        features, feature_info, _CIGAR, _alignment_start, _alignment_end = self.get_base_calling_feature_matrix_spaced(
            True, reference=reference, **feature_matrix_args)

        # Edge case: it can happen that not a single base can be used for base calling;
        # in that case features will be None and None is returned
        if features is None or len(features) == 0:
            return None

        # check which bases should not be used
        use_indices = [
            mask_variants is None or
            not might_be_variant_function(chrom, pos, mask_variants, base)
            for chrom, pos, base in feature_info]

        X_molecule = features[use_indices]
        y_molecule = [
            base for use, (chrom, pos, base) in
            zip(use_indices, feature_info) if use
        ]
        return X_molecule, y_molecule

    def has_valid_span(self):
        """Check if the span of the molecule is determined

        Returns:
            has_valid_span (bool)
        """
        if self.spanStart is not None and self.spanEnd is not None:
            return True
        return False

    def get_strand_repr(self, unknown='?'):
        """Get string representation of mapping strand

        Args:
            unknown (str) : set what character/string to return
                            when the strand is not available

        Returns:
            strand_repr (str) : + forward, - reverse, ? unknown
        """
        s = self.get_strand()
        if s is None:
            return unknown
        if s:
            return '-'
        else:
            return '+'

1319
    def set_rejection_reason(self, reason, set_qcfail=False):
1320
        """ Add rejection reason to all fragments associated to this molecule
1321
1322
        Args:
1323
            reason (str) : rejection reason to set
1324
1325
            set_qcfail(bool) : set qcfail bit to True for all associated reads
1326
        """
1327
        for fragment in self:
1328
            fragment.set_rejection_reason(reason, set_qcfail=set_qcfail)
1329
1330
    def is_valid(self, set_rejection_reasons=False):
1331
        """Check if the molecule is valid
1332
        All of the following requirements should be met:
1333
        - no multimapping
1334
        - no low mapping quality (change molecule.min_max_mapping_quality to set the threshold)
1335
        - molecule is associated with at least one valid fragment
1336
1337
        Args:
1338
            set_rejection_reasons (bool) : When set to True, all reads get a
1339
            rejection reason (RR tag) written to them if the molecule is rejected.
1340
1341
        Returns:
1342
            is_valid (bool) : True when all requirements are met, False otherwise
1343
1344
        """
1345
        if self.is_multimapped():
1346
            if set_rejection_reasons:
1347
                self.set_rejection_reason('multimapping')
1348
            return False
1349
1350
        if self.min_max_mapping_quality is not None and \
1351
                self.get_max_mapping_qual() < self.min_max_mapping_quality:
1352
            if set_rejection_reasons:
1353
                self.set_rejection_reason('MQ')
1354
            return False
1355
1356
        if not self.contains_valid_fragment():
1357
            if set_rejection_reasons:
1358
                self.set_rejection_reason('invalid_fragments')
1359
            return False
1360
1361
        return True
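
    # Usage sketch (hypothetical; assumes `molecules` is an iterable of Molecule
    # objects): keep only molecules passing the checks above and write a
    # rejection reason (RR tag) to the reads of rejected molecules.
    #
    #   valid = [m for m in molecules if m.is_valid(set_rejection_reasons=True)]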
1362
1363
    def get_aligned_blocks(self):
1364
        """ get all consecutive blocks of aligned reference positions
1365
1366
        Returns:
1367
            sorted list of aligned blocks (list) : [ (start, end), (start, end) ]
1368
        """
1369
        return find_ranges(
1370
            sorted(list(set(
1371
                (ref_pos
1372
                 for read in self.iter_reads()
1373
                 for q_pos, ref_pos in read.get_aligned_pairs(matches_only=True, with_seq=False)))))
1374
        )
1375
1376
    def __len__(self):
1377
        """Obtain the amount of fragments associated to the molecule"""
1378
        return len(self.fragments)
1379
1380
    def get_consensus_base_frequencies(self, allow_N=False):
1381
        """Obtain the frequency of bases in the molecule consensus sequence
1382
1383
        Returns:
1384
            base_frequencies (Counter) : Counter containing base frequencies, for example: { 'A':10, 'T':3, 'C':4 }
1385
        """
1386
        return Counter(
1387
            self.get_consensus(
1388
                allow_N=allow_N).values())
1389
1390
    def get_feature_vector(self):
1391
        """ Obtain a feature vector representation of the molecule
1392
1393
        Returns:
1394
            feature_vector(np.array)
1395
        """
1396
1397
        return np.array([
1398
            len(self),
1399
            self.get_strand(),
1400
            self.has_valid_span(),
1401
            self.get_umi_error_rate(),
1402
            self.get_consensus_gc_ratio(),
1403
            len(self.get_raw_barcode_sequences()),
1404
            self.get_safely_aligned_length(),
1405
            self.get_max_mapping_qual(),
1406
            (self.allele is None),
1407
            self.contains_valid_fragment(),
1408
            self.is_multimapped(),
1409
            self.get_undigested_site_count(),
1410
            self.is_valid()
1411
        ])
1412
1413
    def get_alignment_tensor(self,
1414
                             max_reads,
1415
                             window_radius=20,
1416
                             centroid=None,
1417
                             mask_centroid=False,
1418
                             refence_backed=False,
1419
                             skip_missing_reads=False
1420
                             ):
1421
        """ Obtain a tensor representation of the molecule alignment around the given centroid
1422
1423
        Args:
1424
            max_reads (int) : maximum amount of reads encoded in the tensor, this will be the amount of rows/5 of the returned feature matrix
1425
1426
            window_radius (int) : radius of bp around centroid
1427
1428
            centroid(int) : center of extracted window, when not specified the cut location of the molecule is used
1429
1430
            mask_centroid(bool) : when True, mask reference base at centroid with N
1431
1432
            refence_backed(bool) : when True the molecule's reference is used to emit reference bases instead of the MD tag
1433
1434
        Returns:
1435
            tensor_repr(np.array) : feature matrix of shape (5 * max_reads, 2 * window_radius)
1436
        """
1437
        reference = None
1438
        if refence_backed:
1439
            reference = self.reference
1440
            if self.reference is None:
1441
                raise ValueError(
1442
                    "refence_backed set to True, but the molecule has no reference assigned. Assing one using pysam.FastaFile()")
1443
1444
        height = max_reads
1445
        chromosome = self.chromosome
1446
        if centroid is None:
1447
            _, centroid, strand = self.get_cut_site()
1448
        span_start = centroid - window_radius
1449
        span_end = centroid + window_radius
1450
        span_len = abs(span_start - span_end)
1451
        base_content_table = np.zeros((height, span_len))
1452
        base_mismatches_table = np.zeros((height, span_len))
1453
        base_indel_table = np.zeros((height, span_len))
1454
        base_qual_table = np.zeros((height, span_len))
1455
        base_clip_table = np.zeros((height, span_len))
1456
        pointer = 0
1457
1458
        mask = None
1459
        if mask_centroid:
1460
            mask = {(chromosome, centroid)}  # set holding a single (chromosome, position) tuple
1461
1462
        for _, frags in self.get_rt_reactions().items():
1463
            for frag in frags:
1464
                pointer = frag.write_tensor(
1465
                    chromosome,
1466
                    span_start,
1467
                    span_end,
1468
                    index_start=pointer,
1469
                    base_content_table=base_content_table,
1470
                    base_mismatches_table=base_mismatches_table,
1471
                    base_indel_table=base_indel_table,
1472
                    base_qual_table=base_qual_table,
1473
                    base_clip_table=base_clip_table,
1474
                    height=height,
1475
                    mask_reference_bases=mask,
1476
                    reference=reference,
1477
                    skip_missing_reads=skip_missing_reads)
1478
        x = np.vstack(
1479
            [
1480
                base_content_table,
1481
                base_mismatches_table,
1482
                base_indel_table,
1483
                base_qual_table,
1484
                base_clip_table
1485
            ])
1486
1487
        return x
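
    # Usage sketch (hypothetical; assumes `molecule` has a cut site and at most
    # 8 associated reads): obtain a fixed-size matrix for a model. The result
    # stacks 5 tables, each of shape (max_reads, 2 * window_radius).
    #
    #   tensor = molecule.get_alignment_tensor(max_reads=8, window_radius=20)
    #   flat = tensor.flatten()  # flatten if the model expects a 1D feature vector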
1488
1489
    def get_consensus_gc_ratio(self):
1490
        """Obtain the GC ratio of the molecule consensus sequence
1491
1492
        Returns:
1493
            gc_ratio(float) : GC ratio
1494
        """
1495
        bf = self.get_consensus_base_frequencies()
1496
        return (bf['G'] + bf['C']) / sum(bf.values())
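
    # Worked example: with consensus base frequencies {'A': 10, 'T': 3, 'C': 4},
    # the GC ratio is (0 + 4) / (10 + 3 + 4) ≈ 0.235.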
1497
1498
    def get_umi_error_rate(self):
1499
        """Obtain fraction of fragments that are associated
1500
        to the molecule with an exactly matching UMI vs the total amount of associated fragments
1501
        Returns:
1502
            exact_matching_fraction (float)
1503
        """
1504
        mc = 0
1505
        other = 0
1506
        for i, (umi, obs) in enumerate(self.umi_counter.most_common()):
1507
            if i == 0:
1508
                mc = obs
1509
            else:
1510
                other += obs
1511
1512
        return mc / (other + mc)
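
    # Worked example: with umi_counter == Counter({'AACG': 5, 'AACT': 1}) the
    # most common UMI accounts for 5 of 6 fragments, so the returned
    # exact-matching fraction is 5 / (5 + 1) ≈ 0.83.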
1513
1514
    def get_barcode_sequences(self):
1515
        """Obtain (Cell) barcode sequences associated to molecule
1516
1517
        Returns:
1518
            barcode sequences (set) : barcode sequence(s)
1519
        """
1520
        return set(read.get_tag('BC') for read in self.iter_reads())
1521
1522
    def get_raw_barcode_sequences(self):
1523
        """Obtain (Cell) barcode sequences associated to molecule, not hamming corrected
1524
1525
        Returns:
1526
            barcode sequences (set) : barcode sequence(s)
1527
        """
1528
        return set(read.get_tag('bc') for read in self.iter_reads())
1529
1530
    def get_strand(self):
1531
        """Obtain mapping strand of molecule
1532
1533
        Returns:
1534
            strand : True,False,None
1535
                True when strand is REVERSE
1536
                False when strand is FORWARD
1537
                None when strand is not determined
1538
        """
1539
        return self.strand
1540
1541
    def __repr__(self):
1542
1543
        max_show = 6  # maximum amount of fragments to show
1544
        frag_repr = '\n\t'.join([textwrap.indent(str(fragment), ' ' * 4)
1545
                                 for fragment in self.fragments[:max_show]])
1546
1547
        return f"""{self.__class__.__name__}
1548
        with {len(self.fragments)} assigned fragments
1549
        {"Allele :" + (self.allele if self.allele is not None else "No allele assigned")}
1550
        """ + frag_repr + (
1551
            '' if len(self.fragments) < max_show else f'... {len(self.fragments) - max_show} fragments not shown')
1552
1553
    def update_umi(self):
1554
        """Set UMI
1555
        sets self.umi (str) to the most common UMI associated with the molecule
1556
        """
1557
        self.umi = self.umi_counter.most_common(1)[0][0]
1558
1559
    def get_umi(self):
1560
        """Obtain umi of molecule
1561
1562
        Returns:
1563
            umi (str):
1564
                return main umi associated with this molecule
1565
        """
1566
1567
        return self.umi
1568
1569
    def get_sample(self):
1570
        """Obtain sample
1571
1572
        Returns:
1573
            sample (str):
1574
                Sample associated with the molecule. Usually extracted from SM tag.
1575
                Calls fragment.get_sample() to obtain the sample
1576
        """
1577
        for fragment in self.fragments:
1578
            return fragment.get_sample()
1579
1580
    def get_cut_site(self):
1581
        """For restriction based protocol data, obtain genomic location of cut site
1582
1583
        Returns:
1584
            None if site is not available
1585
1586
            chromosome (str)
1587
            position (int)
1588
            strand (bool)
1589
        """
1590
1591
        for fragment in self.fragments:
1592
            try:
1593
                site = fragment.get_site_location()
1594
            except AttributeError:
1595
                return None
1596
            if site is not None:
1597
                return tuple((*site, fragment.get_strand()))
1598
        return None
1599
1600
    def get_mean_mapping_qual(self):
1601
        """Get mean mapping quality of the molecule
1602
1603
        Returns:
1604
            mean_mapping_qual (float)
1605
        """
1606
        return np.mean([fragment.mapping_quality for fragment in self])
1607
1608
    def get_max_mapping_qual(self):
1609
        """Get max mapping quality of the molecule
1610
        Returns:
1611
            max_mapping_qual (float)
1612
        """
1613
        return max([fragment.mapping_quality for fragment in self])
1614
1615
    def get_min_mapping_qual(self) -> float:
1616
        """Get min mapping quality of the molecule
1617
        Returns:
1618
            min_mapping_qual (float)
1619
        """
1620
        return min([fragment.mapping_quality for fragment in self])
1621
1622
    def contains_valid_fragment(self):
1623
        """Check if an associated fragment exists which returns True for is_valid()
1624
1625
        Returns:
1626
            contains_valid_fragment (bool) : True when any associated fragment is_valid()
1627
        """
1628
        return any(
1629
            (hasattr(fragment, 'is_valid') and fragment.is_valid()
1630
             for fragment in self.fragments))
1631
1632
    def is_multimapped(self):
1633
        """Check if the molecule is multimapping
1634
1635
        Returns:
1636
            is_multimapped (bool) : True when multimapping
1637
        """
1638
        for fragment in self.fragments:
1639
            if not fragment.is_multimapped:
1640
                return False
1641
        return True
1642
1643
    def add_molecule(self, other):
1644
        """
1645
        Merge other molecule into this molecule.
1646
        Merges by assigning all fragments in other to this molecule.
1647
        """
1648
        for fragment in other:
1649
            self._add_fragment(fragment)
1650
1651
1652
    def get_span_sequence(self, reference=None):
1653
        """Obtain the sequence between the start and end of the molecule
1654
        Args:
1655
            reference(pysam.FastaFile) : reference  to use.
1656
                If not specified `self.reference` is used
1657
        Returns:
1658
            sequence (str)
1659
        """
1660
        if self.chromosome is None:
1661
            return ''
1662
1663
        if reference is None:
1664
            if self.reference is None:
1665
                raise ValueError('Please supply a reference (PySAM.FastaFile)')
1666
            reference = self.reference
1667
        return reference.fetch(
1668
            self.chromosome,
1669
            self.spanStart,
1670
            self.spanEnd).upper()
1671
1672
    def get_fragment_span_sequence(self, reference=None):
1673
        return self.get_span_sequence(reference)
1674
1675
    def _add_fragment(self, fragment):
1676
1677
        # Do not process the fragment when the max_associated_fragments threshold is exceeded
1678
        if self.max_associated_fragments is not None and len(self.fragments) >= (self.max_associated_fragments):
1679
            self.overflow_fragments += 1
1680
            raise OverflowError()
1681
1682
        self.match_hash = fragment.match_hash
1683
1684
        # if we already had a fragment, this fragment is a duplicate:
1685
        if len(self.fragments) > 1:
1686
            fragment.set_duplicate(True)
1687
1688
        self.fragments.append(fragment)
1689
1690
        # Update span:
1691
        add_span = fragment.get_span()
1692
1693
        # It is possible that the span is not defined, then set the respective keys to None
1694
        # This indicates the molecule is qcfail
1695
1696
        #if not self.has_valid_span():
1697
        #    self.spanStart, self.spanEnd, self.chromosome = None,None, None
1698
        #else:
1699
        self.spanStart = add_span[1] if self.spanStart is None else min(
1700
            add_span[1], self.spanStart)
1701
        self.spanEnd = add_span[2] if self.spanEnd is None else max(
1702
            add_span[2], self.spanEnd)
1703
        self.chromosome = add_span[0]
1704
1705
        self.span = (self.chromosome, self.spanStart, self.spanEnd)
1706
        if fragment.strand is not None:
1707
            self.strand = fragment.strand
1708
        self.umi_counter[fragment.umi] += 1
1709
        self.umi_hamming_distance = fragment.umi_hamming_distance
1710
        self.saved_base_obs = None
1711
        self.update_umi()
1712
        return True
1713
1714
    @property
1715
    def aligned_length(self) -> int:
1716
        if self.has_valid_span():
1717
            return self.spanEnd - self.spanStart
1718
        else:
1719
            return None
1720
1721
    @property
1722
    def is_completely_matching(self) -> bool:
1723
        """
1724
        Checks if all associated reads are completely mapped:
1725
        checks if all cigar operations are M,
1726
        Returns True when all cigar operations are M, False otherwise
1727
        """
1728
1729
        return all(
1730
                (
1731
                     all(
1732
                     [ (operation==0)
1733
                        for operation, amount in read.cigartuples] )
1734
                for read in self.iter_reads()))
1735
1736
1737
    @property
1738
    def estimated_max_length(self) -> int:
1739
        """
1740
        Obtain the estimated maximum fragment size of the molecule,
1741
        returns None when estimation is not possible
1742
        Takes into account removed bases (R2)
1743
        Assumes inwards sequencing orientation
1744
        """
1745
        max_size = None
1746
        for frag in self:
1747
            r = frag.estimated_length
1748
            if r is None :
1749
                continue
1750
            if max_size is None:
1751
                max_size = r
1752
            elif r>max_size:
1753
                max_size = r
1754
        return max_size
1755
1756
    def get_safely_aligned_length(self):
1757
        """Get the amount of safely aligned bases (excludes primers)
1758
        Returns:
1759
            aligned_bases (int) : Amount of safely aligned bases
1760
             or None when this cannot be determined because both mates are not mapped
1761
        """
1762
        if self.spanStart is None or self.spanEnd is None:
1763
            return None
1764
1765
        start = None
1766
        end = None
1767
        contig = None
1768
        for fragment in self:
1769
            if not fragment.safe_span:
1770
                continue
1771
1772
            if contig is None:
1773
                contig = fragment.span[0]
1774
            if contig == fragment.span[0]:
1775
                f_start, f_end = fragment.get_safe_span()
1776
                if start is None:
1777
                    start = f_start
1778
                    end = f_end
1779
                else:
1780
                    start = min(f_start, start)
1781
                    end = min(f_end, end)
1782
1783
        if end is None:
1784
            raise ValueError('Not safe')
1785
        return abs(end - start)
1786
1787
    def add_fragment(self, fragment, use_hash=True):
1788
        """Associate a fragment with this Molecule
1789
1790
        Args:
1791
            fragment (singlecellmultiomics.fragment.Fragment) : Fragment to associate
1792
        Returns:
1793
            has_been_added (bool) : Returns False when the fragments which have already been associated to the molecule refuse the fragment
1794
1795
        Raises:
1796
            OverflowError : when too many fragments have been associated already
1797
                            control this with .max_associated_fragments attribute
1798
        """
1799
1800
        if len(self.fragments) == 0:
1801
            self._add_fragment(fragment)
1802
            self.sample = fragment.sample
1803
            return True
1804
1805
        if use_hash:
1806
            if self == fragment:
1807
                self._add_fragment(fragment)
1808
                return True
1809
1810
        else:
1811
            for f in self.fragments:
1812
                if f == fragment:
1813
                    # it matches this molecule:
1814
                    self._add_fragment(fragment)
1815
                    return True
1816
1817
        return False
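
    # Usage sketch (hypothetical; assumes `fragments` yields Fragment objects in
    # coordinate order): greedily assign each fragment to the first molecule
    # that accepts it, otherwise start a new molecule. The package's
    # MoleculeIterator wraps this pattern with buffering and yielding logic.
    #
    #   molecules = []
    #   for fragment in fragments:
    #       if not any(m.add_fragment(fragment) for m in molecules):
    #           molecules.append(Molecule(fragment))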
1818
1819
    def can_be_yielded(self, chromosome, position):
1820
        """Check if the molecule is far enough away from the supplied location to be ejected from a buffer.
1821
1822
        Args:
1823
            chromosome (str) : chromosome / contig of location to test
1824
            position (int) : genomic location of location to test
1825
1826
        Returns:
1827
            can_be_yielded (bool) : True when the molecule is far enough away from the supplied location to be ejected from a buffer.
1828
        """
1829
1830
        if chromosome is None:
1831
            return False
1832
        if chromosome != self.chromosome:
1833
            return True
1834
        return position < (
1835
                self.spanStart -
1836
                self.cache_size *
1837
                0.5) or position > (
1838
                       self.spanEnd +
1839
                       self.cache_size *
1840
                       0.5)
1841
1842
    def get_rt_reactions(self) -> dict:
1843
        """Obtain RT reaction dictionary
1844
1845
        returns:
1846
            rt_dict (dict):  {(primer,pos) : [fragment, fragment..] }
1847
        """
1848
        return molecule_to_random_primer_dict(self)
1849
1850
    def get_rt_reaction_fragment_sizes(self, max_N_distance=1):
1851
        """Obtain all RT reaction fragment sizes
1852
        Returns:
1853
            rt_sizes (list of ints)
1854
        """
1855
1856
        rt_reactions = molecule_to_random_primer_dict(
1857
            self, max_N_distance=max_N_distance)
1858
        amount_of_rt_reactions = len(rt_reactions)
1859
1860
        # this obtains the maximum fragment size:
1861
        frag_chrom, frag_start, frag_end = pysamiterators.iterators.getListSpanningCoordinates(
1862
            [v for v in itertools.chain.from_iterable(self) if v is not None])
1863
1864
        # Obtain the fragment sizes of all RT reactions:
1865
        rt_sizes = []
1866
        for (rt_contig, rt_end, hexamer), fragments in rt_reactions.items():
1867
1868
            if rt_end is None:
1869
                continue
1870
1871
            rt_chrom, rt_start, rt_end = pysamiterators.iterators.getListSpanningCoordinates(
1872
                itertools.chain.from_iterable(
1873
                    [fragment for fragment in fragments if
1874
                     fragment is not None and fragment.get_random_primer_hash()[0] is not None]))
1875
1876
            rt_sizes.append([rt_end - rt_start])
1877
        return rt_sizes
1878
1879
    def get_mean_rt_fragment_size(self):
1880
        """Obtain the mean RT reaction fragment size
1881
1882
        Returns:
1883
            mean_rt_size (float)
1884
        """
1885
        return np.nanmean(
1886
            self.get_rt_reaction_fragment_sizes()
1887
        )
1888
1889
    def write_pysam(self, target_file, consensus=False, no_source_reads=False, consensus_name=None, consensus_read_callback=None, consensus_read_callback_kwargs=None):
1890
        """Write all associated reads to the target file
1891
1892
        Args:
1893
            target_file (pysam.AlignmentFile) : Target file
1894
            consensus (bool) : write consensus
1895
            no_source_reads (bool) : while in consensus mode, don't write original reads
1896
            consensus_read_callback (function) : this function is called with every consensus read as an arguments
1897
            consensus_read_callback_kwargs (dict) : arguments to pass to the callback function
1898
        """
1899
        if consensus:
1900
            reads = self.deduplicate_majority(target_file,
1901
                                              f'molecule_{uuid4()}' if consensus_name is None else consensus_name)
1902
            if consensus_read_callback is not None:
1903
                if consensus_read_callback_kwargs is not None:
1904
                    consensus_read_callback(reads, **consensus_read_callback_kwargs)
1905
                else:
1906
                    consensus_read_callback(reads)
1907
1908
            for read in reads:
1909
                target_file.write(read)
1910
1911
            if not no_source_reads:
1912
                for read in self.iter_reads():
1913
                    read.is_duplicate=True
1914
                for fragment in self:
1915
                    fragment.write_pysam(target_file)
1916
1917
        else:
1918
            for fragment in self:
1919
                fragment.write_pysam(target_file)
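
    # Usage sketch (hypothetical file names; assumes `molecule` is a valid
    # Molecule and the output BAM shares the input header): write a
    # majority-vote consensus read plus the duplicate-flagged source reads.
    #
    #   with pysam.AlignmentFile('in.bam', 'rb') as fin, \
    #        pysam.AlignmentFile('consensus.bam', 'wb', header=fin.header) as fout:
    #       molecule.write_pysam(fout, consensus=True, no_source_reads=False)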
1920
1921
    def set_methylation_call_tags(self,
1922
                                  call_dict, bismark_call_tag='XM',
1923
                                  total_methylated_tag='MC',
1924
                                  total_unmethylated_tag='uC',
1925
                                  total_methylated_CPG_tag='sZ',
1926
                                  total_unmethylated_CPG_tag='sz',
1927
                                  total_methylated_CHH_tag='sH',
1928
                                  total_unmethylated_CHH_tag='sh',
1929
                                  total_methylated_CHG_tag='sX',
1930
                                  total_unmethylated_CHG_tag='sx',
1931
                                  reads=None
1932
                                  ):
1933
        """Set methylation call tags given a methylation dictionary
1934
1935
        This method sets multiple tags in every read associated to the molecule.
1936
        The tags being set are the bismark_call_tag, in which every aligned base is annotated
1937
        with one of zZxXhH or ".", and tags for both the total methylated and unmethylated C's
1938
1939
        Args:
1940
            call_dict (dict) : Dictionary containing bismark calls (chrom,pos) :
1941
                        {'context': letter, 'reference_base': letter, 'consensus': letter, optional: 'qual': phred_score (int) }
1942
1943
            bismark_call_tag (str) : tag to write bismark call string
1944
1945
            total_methylated_tag (str) : tag to write total methylated bases
1946
1947
            total_unmethylated_tag (str) : tag to write total unmethylated bases
1948
1949
            reads (iterable) : reads to write the tags to, when not supplied, the tags are written to all associated reads
1950
        Returns:
1951
            None
1952
        """
1953
        self.methylation_call_dict = call_dict
1954
1955
        # molecule_XM dictionary containing count of contexts
1956
        molecule_XM = Counter(
1957
            list(
1958
                d.get(
1959
                    'context',
1960
                    '.') for d in self.methylation_call_dict.values()))
1961
        # Contruct XM strings
1962
        if reads is None:
1963
            reads = self.iter_reads()
1964
        for read in reads:
1965
1966
            bis_met_call_string = ''.join([
1967
                call_dict.get(
1968
                    (read.reference_name, rpos), {}).get('context', '.')
1969
                # Obtain all aligned positions from the call dict
1970
                # iterate all positions in the alignment
1971
                for qpos, rpos in read.get_aligned_pairs(matches_only=True)
1972
                if qpos is not None and rpos is not None])
1973
            # make sure to ignore non-matching positions ? is this necessary?
1974
1975
            read.set_tag(
1976
                # Write the methylation tag to the read
1977
                bismark_call_tag,
1978
                bis_met_call_string
1979
            )
1980
1981
            # Set total methylated bases
1982
            read.set_tag(
1983
                total_methylated_tag,
1984
                molecule_XM['Z'] + molecule_XM['X'] + molecule_XM['H'])
1985
1986
            # Set total unmethylated bases
1987
            read.set_tag(
1988
                total_unmethylated_tag,
1989
                molecule_XM['z'] + molecule_XM['x'] + molecule_XM['h'])
1990
1991
            # Set total CPG methylated and unmethylated:
1992
            read.set_tag(
1993
                total_methylated_CPG_tag,
1994
                molecule_XM['Z'])
1995
1996
            read.set_tag(
1997
                total_unmethylated_CPG_tag,
1998
                molecule_XM['z'])
1999
2000
            # Set total CHG methylated and unmethylated:
2001
            read.set_tag(
2002
                total_methylated_CHG_tag,
2003
                molecule_XM['X'])
2004
2005
            read.set_tag(
2006
                total_unmethylated_CHG_tag,
2007
                molecule_XM['x'])
2008
2009
            # Set total CHH methylated and unmethylated:
2010
            read.set_tag(
2011
                total_methylated_CHH_tag,
2012
                molecule_XM['H'])
2013
2014
            read.set_tag(
2015
                total_unmethylated_CHH_tag,
2016
                molecule_XM['h'])
2017
2018
2019
            # Set XR (Read conversion string)
2020
            # @todo: this is TAPS specific, inefficient, ugly
2021
            try:
2022
                fwd = 0
2023
                rev = 0
2024
                for (qpos, rpos, ref_base), call in zip(
2025
                    read.get_aligned_pairs(matches_only=True,with_seq=True),
2026
                    bis_met_call_string):
2027
                    qbase = read.query_sequence[qpos]
2028
                    if call.isupper():
2029
                        if qbase=='A':
2030
                            rev+=1
2031
                        elif qbase=='T':
2032
                            fwd+=1
2033
2034
                # Set XG (genome conversion string)
2035
                if rev>fwd:
2036
                    read.set_tag('XR','CT')
2037
                    read.set_tag('XG','GA')
2038
                else:
2039
                    read.set_tag('XR','CT')
2040
                    read.set_tag('XG','CT')
2041
            except ValueError:
2042
                # Dont set the tag
2043
                pass
2044
2045
    def set_meta(self, tag, value):
2046
        """Set meta information to all fragments
2047
2048
        Args:
2049
            tag (str):
2050
                2 letter tag
2051
            value: value to set
2052
2053
        """
2054
        for f in self:
2055
            f.set_meta(tag, value)
2056
2057
    def __getitem__(self, index):
2058
        """Obtain a fragment belonging to this molecule.
2059
2060
        Args:
2061
            index (int):
2062
                index of the fragment [0 ,1 , 2 ..]
2063
2064
        Returns:
2065
            fragment (singlecellmultiomics.fragment.Fragment)
2066
        """
2067
        return self.fragments[index]
2068
2069
    def get_alignment_stats(self):
2070
        """Get dictionary containing mean amount clip/insert/deletion/matches per base
2071
2072
        Returns:
2073
            cigar_stats(dict): dictionary {
2074
                clips_per_bp(int),
2075
                deletions_per_bp(int),
2076
                matches_per_bp(int),
2077
                inserts_per_bp(int),
2078
                alternative_hits_per_read(int),
2079
2080
                }
2081
        """
2082
        clips = 0
2083
        matches = 0
2084
        inserts = 0
2085
        deletions = 0
2086
        totalbases = 0
2087
        total_reads = 0
2088
        total_alts = 0
2089
        for read in self.iter_reads():
2090
            totalbases += read.query_length
2091
            total_reads += 1
2092
            for operation, amount in read.cigartuples:
2093
                if operation == 4:
2094
                    clips += amount
2095
                elif operation == 2:
2096
                    deletions += amount
2097
                elif operation == 0:
2098
                    matches += amount
2099
                elif operation == 1:
2100
                    inserts += amount
2101
            if read.has_tag('XA'):
2102
                total_alts += len(read.get_tag('XA').split(';'))
2103
2104
        clips_per_bp = clips / totalbases
2105
        inserts_per_bp = inserts / totalbases
2106
        deletions_per_bp = deletions / totalbases
2107
        matches_per_bp = matches / totalbases
2108
2109
        alt_per_read = total_alts / total_reads
2110
2111
        return {
2112
            'clips_per_bp': clips_per_bp,
2113
            'inserts_per_bp': inserts_per_bp,
2114
            'deletions_per_bp': deletions_per_bp,
2115
            'matches_per_bp': matches_per_bp,
2116
            'alt_per_read': alt_per_read,
2117
            'total_bases':totalbases,
2118
            'total_reads':total_reads,
2119
        }
2120
2121
    def get_mean_cycle(
2122
            self,
2123
            chromosome,
2124
            position,
2125
            base=None,
2126
            not_base=None):
2127
        """Get the mean cycle at the supplied coordinate and base-call
2128
2129
        Args:
2130
            chromosome (str)
2131
            position (int)
2132
            base (str) : select only reads with this base
2133
            not_base(str) : select only reads without this base
2134
2135
        Returns:
2136
            mean_cycles (tuple): mean cycle for R1 and R2
2137
        """
2138
        assert (base is not None or not_base is not None), "Supply base or not_base"
2139
2140
        cycles_R1 = []
2141
        cycles_R2 = []
2142
        for read in self.iter_reads():
2143
2144
            if read is None or read.reference_name != chromosome:
2145
                continue
2146
2147
2148
            for cycle, query_pos, ref_pos in pysamiterators.iterators.ReadCycleIterator(
2149
                    read, with_seq=False):
2150
2151
                if query_pos is None or ref_pos != position:
2152
                    continue
2153
2154
                if not_base is not None and read.seq[query_pos] == not_base:
2155
                    continue
2156
                if base is not None and read.seq[query_pos] != base:
2157
                    continue
2158
2159
                if read.is_read2:
2160
                    cycles_R2.append(cycle)
2161
                else:
2162
                    cycles_R1.append(cycle)
2163
        if len(cycles_R2) == 0 and len(cycles_R1)==0:
2164
            raise IndexError(
2165
                "There are no observations if the supplied base/location combination")
2166
        return (np.mean(cycles_R1) if len(cycles_R1) else np.nan),  (np.mean(cycles_R2) if len(cycles_R2) else np.nan)
2167
2168
    def get_mean_base_quality(
2169
                self,
2170
                chromosome,
2171
                position,
2172
                base=None,
2173
                not_base=None):
2174
            """Get the mean phred score at the supplied coordinate and base-call
2175
2176
            Args:
2177
                chromosome (str)
2178
                position (int)
2179
                base (str) : select only reads with this base
2180
                not_base(str) : select only reads without this base
2181
2182
            Returns:
2183
                mean_phred_score (float)
2184
            """
2185
            assert (base is not None or not_base is not None), "Supply base or not_base"
2186
2187
            qualities = []
2188
            for read in self.iter_reads():
2189
2190
                if read is None or read.reference_name != chromosome:
2191
                    continue
2192
2193
                for query_pos, ref_pos in read.get_aligned_pairs(
2194
                        with_seq=False, matches_only=True):
2195
2196
                    if query_pos is None or ref_pos != position:
2197
                        continue
2198
2199
                    if not_base is not None and read.seq[query_pos] == not_base:
2200
                        continue
2201
                    if base is not None and read.seq[query_pos] != base:
2202
                        continue
2203
2204
                    qualities.append(read.query_qualities[query_pos])  # Phred score, without the ASCII offset
2205
            if len(qualities) == 0:
2206
                raise IndexError(
2207
                    "There are no observations if the supplied base/location combination")
2208
            return np.mean(qualities)
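
    # Usage sketch (hypothetical coordinates): compare the support for a
    # candidate variant base against all other base calls at the same position.
    #
    #   alt_qual = molecule.get_mean_base_quality('chr1', 1_000_000, base='T')
    #   other_qual = molecule.get_mean_base_quality('chr1', 1_000_000, not_base='T')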
2209
2210
    @cached_property
2211
    def allele_likelihoods(self):
2212
        """
2213
        Per allele likelihood
2214
2215
        Returns:
2216
            likelihoods (dict) : {allele_name : likelihood}
2217
2218
        """
2219
        return self.get_allele_likelihoods()[0]
2220
2221
    @property
2222
    def library(self):
2223
        """
2224
        Associated library
2225
2226
        Returns:
2227
           library (str) : Library associated with the first read of this molecule
2228
2229
        """
2230
        for read in self.iter_reads():
2231
            if read.has_tag('LY'):
2232
                return read.get_tag('LY')
2233
2234
    @cached_property
2235
    def allele_probabilities(self):
2236
        """
2237
        Per allele probability
2238
2239
        Returns:
2240
            likelihoods (dict) : {allele_name : prob}
2241
2242
        """
2243
        return likelihood_to_prob( self.get_allele_likelihoods()[0] )
2244
2245
2246
    @cached_property
2247
    def allele_confidence(self) -> int:
2248
        """
2249
        Returns:
2250
            confidence(int) : a Phred-scaled confidence value for the allele
2251
            assignment, returns zero when no allele is associated to the molecule
2252
        """
2253
        l = self.allele_probabilities
2254
        if l is None or len(l) == 0 :
2255
            return 0
2256
        return int(prob_to_phred( Counter(l).most_common(1)[0][1] ))
2257
2258
    @cached_property
2259
    def base_confidences(self):
2260
        return self.get_base_confidence_dict()
2261
2262
    @cached_property
2263
    def base_likelihoods(self):
2264
        return {(chrom, pos):base_probabilities_to_likelihood(probs) for (chrom, pos),probs in self.base_confidences.items()}
2265
2266
    @cached_property
2267
    def base_probabilities(self):
2268
        # Optimization which is equal to {location:likelihood_to_prob(liks) for location,liks in self.base_likelihoods.items()}
2269
        obs = {}
2270
        for read in self.iter_reads():
2271
            for qpos, rpos in read.get_aligned_pairs(matches_only=True):
2272
                qbase = read.seq[qpos]
2273
                qqual = read.query_qualities[qpos]
2274
                if qbase=='N':
2275
                    continue
2276
                # @ todo reads which span multiple chromosomes
2277
                k = (self.chromosome, rpos)
2278
                p = 1 - np.power(10, -qqual / 10)
2279
2280
                if not k in obs:
2281
                    obs[k] = {}
2282
                if not qbase in obs[k]:
2283
                    obs[k][qbase] = [p,1] # likelihood, n
2284
                    obs[k]['N'] = [1-p,1] # likelihood, n
2285
                else:
2286
                    obs[k][qbase][0] *= p
2287
                    obs[k][qbase][1] += 1
2288
2289
                    obs[k]['N'][0] *= 1-p # likelihood, n
2290
                    obs[k]['N'][1] += 1 # likelihood, n
2291
        # Perform likelihood conversion and convert to probs
2292
        return { location: likelihood_to_prob({
2293
            base:likelihood/np.power(0.25,n-1)
2294
                    for base,(likelihood,n) in base_likelihoods.items() })
2295
                    for location,base_likelihoods in obs.items()}
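
    # Worked example of the per-base probability model above (a sketch of the
    # arithmetic, not additional functionality): two independent 'T' calls with
    # phred 30 each give p = 1 - 10**(-30/10) = 0.999 per call, a joint
    # likelihood of 0.999 * 0.999 divided by 0.25**(n-1) = 0.25, after which
    # likelihood_to_prob normalises over all candidate bases at that location.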
2296
2297
    ## This is a duplicate of the above but only calculates for allele informative positions
2298
    @cached_property
2299
    def allele_informative_base_probabilities(self):
2300
        # Optimization which is equal to {location:likelihood_to_prob(liks) for location,liks in self.base_likelihoods.items()}
2301
        obs = {}
2302
        for read in self.iter_reads():
2303
            for qpos, rpos in read.get_aligned_pairs(matches_only=True):
2304
                if not self.allele_resolver.has_location( read.reference_name, rpos ):
2305
                    continue
2306
                qbase = read.seq[qpos]
2307
                qqual = read.query_qualities[qpos]
2308
                if qbase=='N':
2309
                    continue
2310
                # @ todo reads which span multiple chromosomes
2311
                k = (self.chromosome, rpos)
2312
                p = 1 - np.power(10, -qqual / 10)
2313
2314
                if not k in obs:
2315
                    obs[k] = {}
2316
                if not qbase in obs[k]:
2317
                    obs[k][qbase] = [p,1] # likelihood, n
2318
                    obs[k]['N'] = [1-p,1] # likelihood, n
2319
                else:
2320
                    obs[k][qbase][0] *= p
2321
                    obs[k][qbase][1] += 1
2322
2323
                    obs[k]['N'][0] *= 1-p # likelihood, n
2324
                    obs[k]['N'][1] += 1 # likelihood, n
2325
        # Perform likelihood conversion and convert to probs
2326
        return { location: likelihood_to_prob({
2327
            base:likelihood/np.power(0.25,n-1)
2328
                    for base,(likelihood,n) in base_likelihoods.items() })
2329
                    for location,base_likelihoods in obs.items()}
2330
2331
2332
2333
    def calculate_allele_likelihoods(self):
2334
        self.aibd = defaultdict(list)
2335
        self.obtained_allele_likelihoods = Counter()  # Allele -> [prob, prob, prob]
2336
2337
        for (chrom, pos), base_probs in self.allele_informative_base_probabilities.items():
2338
2339
            for base, p in base_probs.items():
2340
                if base == 'N':
2341
                    continue
2342
2343
                assoc_alleles = self.allele_resolver.getAllelesAt(chrom, pos, base)
2344
                if assoc_alleles is not None and len(assoc_alleles) == 1:
2345
                    allele = list(assoc_alleles)[0]
2346
                    self.obtained_allele_likelihoods[allele] += p
2347
2348
                    self.aibd[allele].append((chrom, pos, base, p))
2349
2350
2351
2352
    def get_allele_likelihoods(self,):
2353
        """Obtain the allele(s) this molecule maps to
2354
2355
        Args:
2356
            allele_resolver(singlecellmultiomics.alleleTools.AlleleResolver)  : resolver used
2357
            return_allele_informative_base_dict(bool) : return dictionary containing the bases used for allele determination
2358
            defaultdict(list,
2359
            {'allele1': [
2360
              ('chr18', 410937, 'T'),
2361
              ('chr18', 410943, 'G'),
2362
              ('chr18', 410996, 'G'),
2363
              ('chr18', 411068, 'A')]})
2364
2365
        Returns:
2366
            likelihoods (Counter), aibd (dict) : { 'allele_a': likelihood, 'allele_b': likelihood } and {allele : [(chrom, pos, base, prob), ...]}
2367
        """
2368
        if self.obtained_allele_likelihoods is None:
2369
            self.calculate_allele_likelihoods()
2370
2371
        return self.obtained_allele_likelihoods, self.aibd
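
    # Usage sketch (hypothetical allele names; assumes molecule.allele_resolver
    # is set): pick the most supported allele and the bases supporting it.
    #
    #   likelihoods, informative_bases = molecule.get_allele_likelihoods()
    #   if len(likelihoods) > 0:
    #       best_allele, support = likelihoods.most_common(1)[0]
    #       evidence = informative_bases[best_allele]  # [(chrom, pos, base, prob), ...]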
2372
2373
2374
2375
    def get_allele(
2376
            self,
2377
            allele_resolver=None,
2378
            return_allele_informative_base_dict=False):
2379
        """Obtain the allele(s) this molecule maps to
2380
2381
        Args:
2382
            allele_resolver(singlecellmultiomics.alleleTools.AlleleResolver)  : resolver used
2383
            return_allele_informative_base_dict(bool) : return dictionary containing the bases used for allele determination
2384
            defaultdict(list,
2385
            {'allele1': [('chr18', 410937, 'T'),
2386
              ('chr18', 410943, 'G'),
2387
              ('chr18', 410996, 'G'),
2388
              ('chr18', 411068, 'A')]})
2389
2390
        Returns:
2391
            alleles(set( str )) : Set of strings containing associated alleles
2392
        """
2393
2394
        if allele_resolver is None:
2395
            if self.allele_resolver is not None:
2396
                allele_resolver = self.allele_resolver
2397
            else:
2398
                raise ValueError(
2399
                    "Supply allele resolver or set it to molecule.allele_resolver")
2400
2401
        alleles = set()
2402
        if return_allele_informative_base_dict:
2403
            aibd = defaultdict(list)
2404
        try:
2405
            for (chrom, pos), base in self.get_consensus(
2406
                    base_obs=self.get_base_observation_dict_NOREF()).items():
2407
                c = allele_resolver.getAllelesAt(chrom, pos, base)
2408
                if c is not None and len(c) == 1:
2409
                    alleles.update(c)
2410
                    if return_allele_informative_base_dict:
2411
                        aibd[list(c)[0]].append((chrom, pos, base))
2412
2413
        except Exception as e:
2414
            if return_allele_informative_base_dict:
2415
                return dict()
2416
            else:
2417
                return {}
2418
2419
        if return_allele_informative_base_dict:
2420
            return aibd
2421
        return alleles
2422
2423
    def write_allele_phasing_information_tag(
2424
            self, allele_resolver=None, tag='ap', reads=None):
2425
        """
2426
        Write allele phasing information to ap tag
2427
2428
        For every associated read a tag will be written containing:
2429
        chromosome,position,base,allele_name|chromosome,position,base,allele_name|...
2430
        for all variants found by the AlleleResolver
2431
        """
2432
        if reads is None:
2433
            reads = self.iter_reads()
2434
2435
        use_likelihood = (self.allele_assingment_method==1)
2436
2437
        if not use_likelihood:
2438
            haplotype = self.get_allele(
2439
                return_allele_informative_base_dict=True,
2440
                allele_resolver=allele_resolver)
2441
2442
            phased_locations = [
2443
                (allele, chromosome, position, base)
2444
                for allele, bps in haplotype.items()
2445
                for chromosome, position, base in bps]
2446
2447
            phase_str = '|'.join(
2448
                [
2449
                    f'{chromosome},{position},{base},{allele}' for allele,
2450
                                                                   chromosome,
2451
                                                                   position,
2452
                                                                   base in phased_locations])
2453
        else:
2454
2455
            allele_likelihoods, aibd = self.get_allele_likelihoods()
2456
            allele_likelihoods = likelihood_to_prob(allele_likelihoods)
2457
2458
            phased_locations = [
2459
                (allele, chromosome, position, base, confidence)
2460
                for allele, bps in aibd.items()
2461
                for chromosome, position, base, confidence in bps]
2462
2463
            phase_str = '|'.join(
2464
                [
2465
                    f'{chromosome},{position},{base},{allele},{ prob_to_phred(confidence) }' for allele,
2466
                                                                   chromosome,
2467
                                                                   position,
2468
                                                                   base,
2469
                                                                   confidence in phased_locations])
2470
2471
2472
2473
        if len(phase_str) > 0:
2474
            for read in reads:
2475
                read.set_tag(tag, phase_str)
2476
                if use_likelihood:
2477
                    read.set_tag('al', self.allele_confidence)
2478
2479
    def get_base_observation_dict_NOREF(self, allow_N=False):
2480
        '''
2481
        identical to get_base_observation_dict but does not obtain reference bases,
2482
        has to be used when no MD tag is present
2483
        Args:
2484
            allow_N ( bool ):
                keep N base calls in the observations

        Returns:
            { genome_location (tuple) : base (string) : obs (int) }
2491
        '''
2492
2493
        base_obs = defaultdict(Counter)
2494
2495
        used = 0  # some alignments yielded valid calls
2496
        ignored = 0
2497
        for fragment in self:
2498
            _, start, end = fragment.span
2499
            for read in fragment:
2500
                if read is None:
2501
                    continue
2502
2503
                for cycle, query_pos, ref_pos in pysamiterators.iterators.ReadCycleIterator(
2504
                        read, with_seq=False):
2505
2506
                    if query_pos is None or ref_pos is None or ref_pos < start or ref_pos > end:
2507
                        continue
2508
                    query_base = read.seq[query_pos]
2509
                    if query_base == 'N' and not allow_N:
2510
                        continue
2511
                    base_obs[(read.reference_name, ref_pos)][query_base] += 1
2512
2513
        if used == 0 and ignored > 0:
2514
            raise ValueError('Could not extract any safe data from molecule')
2515
2516
        return base_obs
2517
2518
    def get_base_observation_dict(self, return_refbases=False, allow_N=False,
2519
        allow_unsafe=True, one_call_per_frag=False, min_cycle_r1=None,
2520
         max_cycle_r1=None, min_cycle_r2=None, max_cycle_r2=None, use_cache=True, min_bq=None):
2521
        '''
2522
        Obtain observed bases at reference aligned locations
2523
2524
        Args:
2525
            return_refbases ( bool ):
2526
                return both observed bases and reference bases
2527
            allow_N (bool): Keep N base calls in observations
2528
2529
            min_cycle_r1(int) : Exclude read 1 base calls with a cycle smaller than this value (excludes bases which are trimmed before mapping)
2530
2531
            max_cycle_r1(int) : Exclude read 1 base calls with a cycle larger than this value (excludes bases which are trimmed before mapping)
2532
2533
            min_cycle_r2(int) : Exclude read 2 base calls with a cycle smaller than this value (excludes bases which are trimmed before mapping)
2534
2535
            max_cycle_r2(int) : Exclude read 2 base calls with a cycle larger than this value (excludes bases which are trimmed before mapping)
2536
2537
2538
        Returns:
2539
            { genome_location (tuple) : base (string) : obs (int) }
2540
            and
2541
            { genome_location (tuple) : base (string) if return_refbases is True }
2542
        '''
2543
2544
        # Check if cached is available
2545
        if use_cache:
2546
            if self.saved_base_obs is not None :
2547
                if not return_refbases:
2548
                    return self.saved_base_obs[0]
2549
                else:
2550
                    if self.saved_base_obs[1] is not None:
2551
                        return self.saved_base_obs
2552
2553
        base_obs = defaultdict(Counter)
2554
2555
        ref_bases = {}
2556
        used = 0  # some alignments yielded valid calls
2557
        ignored = 0
2558
        error = None
2559
        for fragment in self:
2560
            _, start, end = fragment.span
2561
2562
            used += 1
2563
2564
            if one_call_per_frag:
2565
                frag_location_obs = set()
2566
2567
            for read in fragment:
2568
                if read is None:
2569
                    continue
2570
2571
                if allow_unsafe:
2572
                    for query_pos, ref_pos, ref_base in read.get_aligned_pairs(matches_only=True, with_seq=True):
2573
                        if query_pos is None or ref_pos is None:  # or ref_pos < start or ref_pos > end:
2574
                            continue
2575
2576
                        query_base = read.seq[query_pos]
2577
                        # query_qual = read.qual[query_pos]
2578
                        if min_bq is not None and read.query_qualities[query_pos]<min_bq:
2579
                            continue
2580
2581
                        if query_base == 'N':
2582
                            continue
2583
2584
                        k = (read.reference_name, ref_pos)
2585
2586
                        if one_call_per_frag:
2587
                            if k in frag_location_obs:
2588
                                continue
2589
                            frag_location_obs.add(k)
2590
2591
                        base_obs[k][query_base] += 1
2592
2593
                        if return_refbases:
2594
                            ref_bases[(read.reference_name, ref_pos)
2595
                            ] = ref_base.upper()
2596
2597
2598
                else:
2599
                    for cycle, query_pos, ref_pos, ref_base in pysamiterators.iterators.ReadCycleIterator(
2600
                            read, with_seq=True, reference=self.reference):
2601
2602
                        if query_pos is None or ref_pos is None:  # or ref_pos < start or ref_pos > end:
2603
                            continue
2604
2605
                        # Verify cycle filters:
2606
                        if (not read.is_paired or read.is_read1) and (
2607
                                ( min_cycle_r1 is not None and cycle <  min_cycle_r1 ) or
2608
                                ( max_cycle_r1 is not None and  cycle >  max_cycle_r1 )):
2609
                            continue
2610
2611
                        if (read.is_paired and read.is_read2) and (
2612
                                ( min_cycle_r2 is not None and cycle <  min_cycle_r2 ) or
2613
                                ( max_cycle_r2 is not None and cycle >  max_cycle_r2 )):
2614
                            continue
2615
2616
                        query_base = read.seq[query_pos]
2617
                        # Skip bases with low bq:
2618
                        if min_bq is not None and read.query_qualities[query_pos]<min_bq:
2619
                            continue
2620
2621
                        if query_base == 'N':
2622
                            continue
2623
2624
                        k = (read.reference_name, ref_pos)
2625
                        if one_call_per_frag:
2626
                            if k in frag_location_obs:
2627
                                continue
2628
                            frag_location_obs.add(k)
2629
2630
                        base_obs[(read.reference_name, ref_pos)][query_base] += 1
2631
2632
                        if return_refbases:
2633
                            ref_bases[(read.reference_name, ref_pos)
2634
                            ] = ref_base.upper()
2635
2636
        if used == 0 and ignored > 0:
2637
            raise ValueError(f'Could not extract any safe data from molecule {error}')
2638
2639
        self.saved_base_obs = (base_obs, ref_bases)
2640
2641
        if return_refbases:
2642
            return base_obs, ref_bases
2643
2644
        return base_obs
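
    # Usage sketch (hypothetical coordinates): count how often each base was
    # observed at a position of interest, restricted to base qualities >= 20.
    #
    #   base_obs, ref_bases = molecule.get_base_observation_dict(
    #       return_refbases=True, min_bq=20)
    #   counts = base_obs.get(('chr1', 1_000_000), Counter())  # e.g. Counter({'T': 3})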
2645
2646
    def get_match_mismatch_frequency(self, ignore_locations=None):
2647
        """Get amount of base-calls matching and mismatching the reference sequence,
2648
           mismatches in every read are counted
2649
2650
        Args:
2651
            ignore_locations (iterable(tuple([chrom(str),pos(int)])) ) :
2652
                Locations not to take into account for the match and mismatch frequency
2653
2654
        Returns:
2655
            matches(int), mismatches(int)
2656
        """
2657
        matches = 0
2658
        mismatches = 0
2659
2660
        base_obs, ref_bases = self.get_base_observation_dict(
2661
            return_refbases=True)
2662
        for location, obs in base_obs.items():
2663
            if ignore_locations is not None and location in ignore_locations:
2664
                continue
2665
2666
            if location in ref_bases:
2667
                ref = ref_bases[location]
2668
                if ref not in 'ACTG':  # don't count weird bases in the reference @warn
2669
                    continue
2670
                matches += obs[ref]
2671
                mismatches += sum((base_obs for base,
2672
                                                base_obs in obs.most_common() if base != ref))
2673
2674
        return matches, mismatches
2675
2676
    def get_consensus(self,
2677
                    dove_safe: bool = False,
2678
                    only_include_refbase: str = None,
2679
                    allow_N=False,
2680
                    with_probs_and_obs=False, **get_consensus_dictionaries_kwargs):
2681
        """
2682
        Obtain consensus base-calls for the molecule
2683
        """
2684
2685
        if allow_N:
2686
            raise NotImplementedError()
2687
2688
        consensii = defaultdict(consensii_default_vector)  # location -> obs (A,C,G,T,N)
2689
        if with_probs_and_obs:
2690
            phred_scores = defaultdict(lambda:defaultdict(list))
2691
        for fragment in self:
2692
            if dove_safe and (not fragment.has_R2() or not fragment.has_R1()):
2693
                continue
2694
2695
            try:
                for position, (q_base, phred_score) in fragment.get_consensus(
                        dove_safe=dove_safe, only_include_refbase=only_include_refbase,
                        **get_consensus_dictionaries_kwargs).items():

                    if q_base == 'N':
                        continue

                    if with_probs_and_obs:
                        phred_scores[position][q_base].append(phred_score)

                    consensii[position]['ACGTN'.index(q_base)] += 1
            except ValueError:
                # For example: ValueError('This method only works for inwards facing reads')
                pass

        if len(consensii) == 0:
            if with_probs_and_obs:
                return dict(), None, None
            else:
                return dict()

        locations = np.empty(len(consensii), dtype=object)
        locations[:] = sorted(consensii.keys())

        v = np.vstack([consensii[location] for location in locations])
        majority_base_indices = np.argmax(v, axis=1)

        # Check for ties: a tie yields multiple maxima in a row of v, in which case no
        # reliable majority call can be made and the position is dropped.
        proper = (v == v[np.arange(v.shape[0]), majority_base_indices][:, np.newaxis]).sum(1) == 1

        if with_probs_and_obs:
            return (
                dict(zip(locations[proper], ['ACGTN'[idx] for idx in majority_base_indices[proper]])),
                phred_scores,
                consensii
            )
        else:
            return dict(zip(locations[proper], ['ACGTN'[idx] for idx in majority_base_indices[proper]]))
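
    # Illustrative sketch (not part of the original module): majority-vote consensus
    # calling with access to the underlying observations.
    #
    #   calls, phreds, observations = molecule.get_consensus(with_probs_and_obs=True)
    #   for (chrom, pos), base in calls.items():
    #       support = observations[(chrom, pos)]['ACGTN'.index(base)]
    #       print(chrom, pos, base, support)
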
    def get_consensus_old(
            self,
            base_obs=None,
            classifier=None,
            store_consensus=True,
            reuse_cached_consensus=True,
            allow_unsafe=False,
            allow_N=False):
        """Get dictionary containing consensus calls with respect to the reference.
        By default majority voting is used to determine the consensus base.
        If a classifier is supplied, the classifier is used to determine the consensus base.

        Args:
            base_obs (defaultdict(Counter)) :
                { genome_location (tuple) : base (string) : obs (int) }

            classifier : fitted classifier to use for consensus calling. When no classifier is provided the consensus is determined by majority voting

            store_consensus (bool) : Store the generated consensus for re-use

        Returns:
            consensus (dict)  :  {location : base}
        """
        consensus = {}  # position -> base, the key is not set when no call could be made

        if classifier is not None:

            if reuse_cached_consensus and hasattr(
                    self, 'classifier_consensus') and self.classifier_consensus is not None:
                return self.classifier_consensus

            features, reference_bases, CIGAR, alignment_start, alignment_end = self.get_base_calling_feature_matrix_spaced(
                True)

            if features is None:
                # We cannot determine the consensus as there are no features
                return dict()

            base_calling_probs = classifier.predict_proba(features)
            predicted_sequence = ['ACGT'[i] for i in np.argmax(base_calling_probs, 1)]

            reference_sequence = ''.join(
                [base for chrom, pos, base in reference_bases])

            phred_scores = np.rint(
                -10 * np.log10(np.clip(1 - base_calling_probs.max(1),
                                       0.000000001,
                                       0.999999999)
                               )).astype('B')

            consensus = {(chrom, pos): consensus_base
                         for (chrom, pos, ref_base), consensus_base in
                         zip(reference_bases, predicted_sequence)}

            if store_consensus:
                self.classifier_consensus = consensus
                self.classifier_phred_scores = phred_scores
            return consensus

        if base_obs is None:
            try:
                base_obs, ref_bases = self.get_base_observation_dict(
                    return_refbases=True, allow_N=allow_N, allow_unsafe=allow_unsafe)
            except ValueError:
                # We cannot determine safe regions
                raise

        for location, obs in base_obs.items():
            votes = obs.most_common()
            if len(votes) == 1 or votes[1][1] < votes[0][1]:
                consensus[location] = votes[0][0]

        if store_consensus:
            self.majority_consensus = consensus

        return consensus
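
    # Illustrative sketch (not part of the original module): comparing majority-vote and
    # classifier-based consensus calls. `clf` is an assumed classifier fitted on the base
    # calling feature matrix (any model exposing predict_proba).
    #
    #   majority_calls = molecule.get_consensus_old()
    #   classifier_calls = molecule.get_consensus_old(classifier=clf)
    #   disagreements = {loc for loc, base in classifier_calls.items()
    #                    if majority_calls.get(loc, base) != base}
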
    def get_consensus_base(self, contig, position, classifier=None):
        """Obtain the base call at a single position of the molecule

        Args:
            contig (str) : contig to extract the base call from

            position (int) : position to extract the base call from (zero based)

            classifier (obj) : base calling classifier

        Returns:

            base_call (str) : base call, or None when no base call could be made
        """

        try:
            # Pass the classifier by keyword so it is not consumed as a positional argument:
            if classifier is not None:
                c = self.get_consensus_old(classifier=classifier)
            else:
                c = self.get_consensus()
        except ValueError:
            return None
        return c.get((contig, position), None)
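
    # Illustrative sketch (not part of the original module): querying a single position,
    # mirroring how detect_alleles() uses this method.
    #
    #   base = molecule.get_consensus_base('chr1', 1_000_000)
    #   if base is not None:
    #       print('consensus base call:', base)
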
    def check_variants(self, variants, exclude_other_calls=True):
        """Verify variants in molecule

        Args:
            variants (pysam.VariantFile) : Variant file handle to extract variants from

            exclude_other_calls (bool) : when enabled, base calls that match neither the
                reference nor an alternative allele are not counted; otherwise they are
                counted under the key None

        Returns:
            variant_calls (defaultdict( Counter )) : { (chrom, pos) : { call (tuple) : observations (int) } }
        """
        variant_dict = {}
        for variant in variants.fetch(
                self.chromosome,
                self.spanStart,
                self.spanEnd):
            variant_dict[(variant.chrom, variant.pos - 1)] = (variant.ref, variant.alts)

        variant_calls = defaultdict(Counter)
        for fragment in self:

            _, start, end = fragment.span
            for read in fragment:
                if read is None:
                    continue

                for cycle, query_pos, ref_pos in pysamiterators.iterators.ReadCycleIterator(
                        read):

                    if query_pos is None or ref_pos is None or ref_pos < start or ref_pos > end:
                        continue
                    query_base = read.seq[query_pos]

                    k = (read.reference_name, ref_pos)
                    if k in variant_dict:
                        call = None
                        ref, alts = variant_dict[k]
                        if query_base == ref:
                            call = ('ref', query_base)
                        elif query_base in alts:
                            call = ('alt', query_base)

                        if not exclude_other_calls or call is not None:
                            variant_calls[k][call] += 1

        return variant_calls
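
    # Illustrative sketch (not part of the original module): counting reference and
    # alternative allele support for variants overlapping a molecule. `variants_path`
    # is an assumed path to a tabix-indexed VCF file.
    #
    #   with pysam.VariantFile(variants_path) as variants:
    #       for (chrom, pos), calls in molecule.check_variants(variants).items():
    #           print(chrom, pos, dict(calls))  # e.g. {('ref', 'A'): 4, ('alt', 'T'): 1}
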
    def get_aligned_reference_bases_dict(self):
        """Get dictionary containing all reference bases to which this molecule aligns
        Returns:
            aligned_reference_positions (dict) :  { (chrom, pos) : 'A', (chrom, pos) : 'T', .. }
        """
        aligned_reference_positions = {}
        for read in self.iter_reads():
            for read_pos, ref_pos, ref_base in read.get_aligned_pairs(
                    with_seq=True, matches_only=True):
                aligned_reference_positions[(
                    read.reference_name, ref_pos)] = ref_base.upper()
        return aligned_reference_positions
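
    # Illustrative sketch (not part of the original module): listing positions where the
    # consensus call deviates from the aligned reference base.
    #
    #   reference_bases = molecule.get_aligned_reference_bases_dict()
    #   consensus = molecule.get_consensus()
    #   mismatch_positions = [loc for loc, base in consensus.items()
    #                         if reference_bases.get(loc, base) != base]
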
    def iter_reads(self):
        """Iterate over all associated reads

        Yields:
            read (pysam.AlignedSegment)
        """

        for fragment in self.fragments:
            for read in fragment:
                if read is not None:
                    yield read

    def __iter__(self):
        """Iterate over all associated fragments

        Yields:
            singlecellmultiomics.fragment.Fragment
        """
        for fragment in self.fragments:
            yield fragment
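
    # Illustrative sketch (not part of the original module): the iteration helpers above
    # make it easy to summarise a molecule at the read or fragment level.
    #
    #   read_names = {read.query_name for read in molecule.iter_reads()}
    #   fragment_count = sum(1 for fragment in molecule)
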
    @property
    def span_len(self):
        return abs(self.spanEnd - self.spanStart)

    def get_methylated_count(self, context=3):
        """Get the total amount of methylated bases

        Args:
            context (int) : 3 or 4 base context

        Returns:
            r (Counter) : sum of methylated bases in contexts
        """

        r = Counter()

    def get_html(
            self,
            reference=None,
            consensus=None,
            show_reference_sequence=True,
            show_consensus_sequence=True,
            reference_bases=None):
        """Get html representation of the molecule
        Returns:
            html_rep(str) : Html representation of the molecule
        """

        html = f"""<h3>{self.chromosome}:{self.spanStart}-{self.spanEnd}
            sample:{self.get_sample()}  {'valid molecule' if self[0].is_valid() else 'Non valid molecule'}</h3>
            <h5>UMI:{self.get_umi()} Mapping qual:{round(self.get_mean_mapping_qual(), 1)} Cut loc: {"%s:%s" % self[0].get_site_location()} </h5>
            <div style="white-space:nowrap; font-family:monospace; color:#888">"""
        # undigested:{self.get_undigested_site_count()}

        # Obtain reference bases dictionary:
        if reference_bases is None:
            if reference is None:
                reference_bases = self.get_aligned_reference_bases_dict()

            else:
                # obtain reference_bases from reference file
                raise NotImplementedError()

        for fragment in itertools.chain(*self.get_rt_reactions().values()):
            html += f'<h5>{fragment.get_R1().query_name}</h5>'
            for readid, read in [
                (1, fragment.get_R1()),
                (2, fragment.get_R2())]:  # go over R1 and R2:
                # This is just the sequence:
                if read is None:
                    continue
                html += fragment.get_html(
                    self.chromosome,
                    self.spanStart,
                    self.spanEnd,
                    show_read1=(readid == 1),
                    show_read2=(readid == 2)
                ) + '<br />'

        # Obtain the consensus sequence when it was not supplied:
        if consensus is None:
            consensus = self.get_consensus()

        span_len = self.spanEnd - self.spanStart
        visualized = ['.'] * span_len
        reference_vis = ['?'] * span_len
        for location, query_base in consensus.items():
            try:
                if reference_bases is None or reference_bases.get(
                        location, '?') == query_base:
                    visualized[location[1] - self.spanStart] = query_base
                    if reference_bases is not None:
                        reference_vis[location[1] -
                                      self.spanStart] = query_base
                else:
                    visualized[location[1] -
                               self.spanStart] = style_str(query_base, color='red', weight=800)
                    if reference_bases is not None:
                        reference_vis[location[1] - self.spanStart] = style_str(
                            reference_bases.get(location, '?'), color='black', weight=800)
            except IndexError:
                pass  # Tried to visualize a base outside the view

        if show_consensus_sequence:
            html += ''.join(visualized) + '<br />'

        if show_reference_sequence:
            html += ''.join(reference_vis) + '<br />'

        html += "</div>"
        return html
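
    # Illustrative sketch (not part of the original module): rendering the HTML
    # representation in a Jupyter notebook.
    #
    #   from IPython.display import HTML, display
    #   display(HTML(molecule.get_html()))
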
    def get_methylation_dict(self):
        """Obtain methylation dictionary

        Returns:
            methylated_positions (Counter):
                (read.reference_name, rpos) : times seen methylated

            methylated_state (dict):
                {(read.reference_name, rpos) : 1/0/-1 }
                1 for methylated
                0 for unmethylated
                -1 for unknown

        """
        methylated_positions = Counter()  # chrom-pos -> count
        methylated_state = dict()  # chrom-pos -> 1, 0, -1
        for fragment in self:
            for read in fragment:
                if read is None or not read.has_tag('XM'):
                    continue
                methylation_status_string = read.get_tag('XM')
                for qpos, rpos, ref_base in read.get_aligned_pairs(
                        with_seq=True):
                    if qpos is None:
                        continue
                    if ref_base is None:
                        continue
                    if rpos is None:
                        continue
                    # The XM tag contains one character per query base, so the query
                    # position is used to look up the methylation status:
                    methylation_status = methylation_status_string[qpos]
                    if methylation_status.isupper():
                        methylated_positions[(read.reference_name, rpos)] += 1
                        if methylated_state.get(
                                (read.reference_name, rpos), 1) == 1:
                            methylated_state[(read.reference_name, rpos)] = 1
                        else:
                            methylated_state[(read.reference_name, rpos)] = -1
                    else:
                        if methylation_status == '.':
                            pass
                        else:
                            if methylated_state.get(
                                    (read.reference_name, rpos), 0) == 0:
                                methylated_state[(
                                    read.reference_name, rpos)] = 0
                            else:
                                methylated_state[(
                                    read.reference_name, rpos)] = -1
        return methylated_positions, methylated_state
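
    # Illustrative sketch (not part of the original module): summarising the methylation
    # state of a molecule (requires reads carrying a Bismark-style XM tag).
    #
    #   methylated_counts, methylation_state = molecule.get_methylation_dict()
    #   n_methylated = sum(1 for state in methylation_state.values() if state == 1)
    #   n_unmethylated = sum(1 for state in methylation_state.values() if state == 0)
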
    def _get_allele_from_reads(self) -> str:
        """
        Obtain the associated allele based on the DA tag of the associated reads;
        returns None when no read carries the tag.
        """
        for frag in self:
            for read in frag:
                if read is None or not read.has_tag('DA'):
                    continue
                return read.get_tag('DA')
        return None
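
    # Illustrative sketch (not part of the original module): reading back the allele
    # assignment that was written to the DA tag, falling back to None when absent.
    #
    #   allele = molecule._get_allele_from_reads()
    #   if allele is None:
    #       print('molecule could not be assigned to an allele')
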