--- a
+++ b/singlecellmultiomics/bamProcessing/variantStats.py
@@ -0,0 +1,701 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import pysam
+import singlecellmultiomics.molecule
+import singlecellmultiomics.fragment
+import singlecellmultiomics.pyutils
+import pysamiterators
+import collections
+import glob
+import pickle
+import pandas as pd
+from colorama import Fore, Style
+from singlecellmultiomics.bamProcessing.bamFunctions import sorted_bam_file, sort_and_index, get_reference_from_pysam_alignmentFile, add_readgroups_to_header, write_program_tag, GATK_indel_realign
+from singlecellmultiomics.pyutils import meanOfCounter
+import argparse
+import os
+import sys
+import traceback
+f'Please use an environment with python 3.6 or higher!'
+
+
+def obtain_variant_statistics(
+    alignment_file_paths,
+    cell_obs, statistics, cell_call_data,
+    reference,
+    chromosome,
+    ssnv_position, gsnv_position, haplotype_scores,
+    WINDOW_RADIUS, out, min_read_obs, read_groups,
+    umi_hamming_distance,
+    args
+):
+    """
+    Obtain statistics from known gsnv-phased variant location
+
+    Args:
+        alignment_file_paths (list) : List of handles to Pysam.Alignment files from which to extract molecules
+
+        cell_obs ( collections.defaultdict(lambda: collections.defaultdict( collections.Counter) ) )
+
+        statistics ( collections.defaultdict(lambda: collections.defaultdict( collections.Counter) ) )
+
+        cell_call_data(collections.defaultdict(dict) )
+
+        haplotype_scores(dict)
+
+        reference (pysamiterators.CachedFasta)
+
+        chromosome (str)
+
+        ssnv_position (int) : zero based ssnv position
+
+        gsnv_position (int) : zero based gsnv position
+
+        WINDOW_RADIUS(int)
+
+        out(pysam.AlignmentFile )
+
+        min_read_obs(int)
+
+        read_groups(set)
+
+        umi_hamming_distance(int)
+
+        args
+
+
+    """
+
+    sSNV_ref_base = reference.fetch(
+        chromosome, ssnv_position, ssnv_position + 1)
+    gSNV_ref_base = reference.fetch(
+        chromosome, gsnv_position, gsnv_position + 1)
+
+    window_molecules = []
+
+    cell_read_obs = collections.defaultdict(
+        collections.Counter)  # sample -> tuple -> amount of reads
+
+    region_start = ssnv_position - WINDOW_RADIUS
+    region_end = ssnv_position + WINDOW_RADIUS
+
+    for pathi, alignment_path in enumerate(alignment_file_paths):
+
+        # Perform re-alignment:
+        if args.realign:
+            target_bam = f'align_{chromosome}_{region_start}_{region_end}.bam'
+
+            if not os.path.exists(target_bam):
+                temp_target_bam = f'{target_bam}.temp.bam'
+                temp_target_bai = f'{target_bam}.temp.bai'
+                GATK_indel_realign(
+                    alignment_path,
+                    temp_target_bam,
+                    chromosome,
+                    region_start,
+                    region_end,
+                    args.indelvcf,
+                    gatk_path=args.gatk3_path,
+                    interval_path=None,
+                    java_cmd=f'java -jar -Xmx{args.realign_mem}G -Djava.io.tmpdir=./gatk_tmp',
+                    reference=reference.handle.filename.decode('utf8'),
+                    interval_write_path=f'./align_{chromosome}_{region_start}_{region_end}.intervals')
+
+                print(f'Renaming {temp_target_bam} > {target_bam}')
+                os.rename(temp_target_bam,target_bam)
+                os.rename(temp_target_bai,target_bam.replace('.bam','.bai'))
+            alignment_path = target_bam
+
+        with pysam.AlignmentFile(alignment_path, ignore_truncation=args.ignore_bam_issues) as alignments:
+
+            for molecule_id, molecule in enumerate(
+                    singlecellmultiomics.molecule.MoleculeIterator(alignments,
+                                                                   fragment_class_args={
+                                                                       'umi_hamming_distance': umi_hamming_distance,
+                                                                   },
+                                                                   molecule_class_args={
+                                                                       'reference': reference
+                                                                   },
+                                                                   molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
+                                                                   fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
+                                                                   start=ssnv_position - WINDOW_RADIUS,
+                                                                   end=ssnv_position + WINDOW_RADIUS,
+                                                                   contig=chromosome
+                                                                   )):
+
+                # For every molecule obtain the consensus from which to extract
+                # the gSNV and sSNV:
+                try:
+                    consensus = molecule.get_consensus()
+                except Exception as e:
+                    if str(e) == 'Could not extract any safe data from molecule':
+                        statistics[(chromosome, ssnv_position)
+                                   ]['R2_unmapped'][True] += 1
+                    else:
+                        print(e)
+                    continue
+
+                # Extract the gSNV and sSNV:
+                ssnv_state = consensus.get((chromosome, ssnv_position))
+                gsnv_state = consensus.get((chromosome, gsnv_position))
+
+                # Store all used molecules in the window for inspection:
+                window_molecules.append((molecule, ssnv_state, gsnv_state))
+
+                # If both the ssnv and gsnv are none there is no information we
+                # can use.
+                if ssnv_state is None and gsnv_state is None:
+                    continue
+
+                # Store the observation
+                # the amount of reads of evidence is len(molecule)
+                cell_obs[(chromosome, ssnv_position)][molecule.get_sample()][(
+                    ssnv_state, gsnv_state)] += 1
+                cell_read_obs[molecule.get_sample()][(
+                    ssnv_state, gsnv_state)] += len(molecule)
+
+                # Store statistics
+                statistics[(chromosome, ssnv_position)
+                           ]['max_mapping_quality'][molecule.get_max_mapping_qual()] += 1
+                statistics[(chromosome, ssnv_position)
+                           ]['fragment_size'][molecule.get_safely_aligned_length()] += 1
+                statistics[(chromosome, ssnv_position)]['ivt_dups'][len(
+                    molecule.get_rt_reactions())] += 1
+                statistics[(chromosome, ssnv_position)
+                           ]['undigested'][molecule.get_undigested_site_count()] += 1
+                statistics[(chromosome, ssnv_position)
+                           ]['reads'][len(molecule)] += 1
+                statistics[(chromosome, ssnv_position)]['molecules'][1] += 1
+
+                # Store alignment statistics:
+                for operation, per_bp in molecule.get_alignment_stats().items():
+                    statistics[(chromosome, ssnv_position)
+                               ][operation][per_bp] += 1
+
+                try:
+                    statistics[(chromosome, ssnv_position)]['ssnv_ref_phred'][molecule.get_mean_base_quality(
+                        chromosome, ssnv_position, sSNV_ref_base)] += 1
+                except BaseException:
+                    pass
+
+                try:
+                    statistics[(chromosome, ssnv_position)]['ssnv_alt_phred'][molecule.get_mean_base_quality(
+                        chromosome, ssnv_position, not_base=sSNV_ref_base)] += 1
+                except BaseException:
+                    pass
+
+                try:
+                    statistics[(chromosome, ssnv_position)]['gsnv_ref_phred'][molecule.get_mean_base_quality(
+                        chromosome, gsnv_position, gSNV_ref_base)] += 1
+                except BaseException:
+                    pass
+
+                try:
+                    statistics[(chromosome, ssnv_position)]['gsnv_any_alt_phred'][molecule.get_mean_base_quality(
+                        chromosome, gsnv_position, not_base=gSNV_ref_base)] += 1
+                except BaseException:
+                    pass
+
+    # After finishing iteration over all molecules assign genotypes
+    chrom, pos = chromosome, ssnv_position
+    obs_for_cells = cell_obs[(chrom, pos)]
+
+    sSNV_alt_base = None
+    gSNV_alt_base = None
+
+    genotype_obs = collections.Counter()
+    complete_genotype_obs = collections.Counter()
+    sSNV_obs_phased = collections.Counter()
+    gSNV_obs_phased = collections.Counter()
+
+    sSNV_obs = collections.Counter()
+    gSNV_obs = collections.Counter()
+
+    for cell, cell_data in obs_for_cells.items():
+        for ssnv, gsnv in cell_data:
+            genotype_obs[(ssnv, gsnv)] += 1
+
+            gSNV_obs[gsnv] += 1
+            sSNV_obs[ssnv] += 1
+
+            if ssnv is not None and gsnv is not None:
+                complete_genotype_obs[(ssnv, gsnv)] += 1
+                # Only count these when the germline variant is detected
+                gSNV_obs_phased[gsnv] += 1
+                sSNV_obs_phased[ssnv] += 1
+
+    print(
+        Style.BRIGHT +
+        f'Genotype observations for variant {chrom}:{pos}' +
+        Style.RESET_ALL)
+    print('som\tgerm\tobs')
+    for (ssnv, gsnv), obs in complete_genotype_obs.most_common():
+        print(f' {ssnv}\t{gsnv}\t{obs}')
+
+    if len(complete_genotype_obs) <= 2:
+        print(f'not enough genotype observations for a variant call (<=2)')
+
+    ### Conbase algorithm : ###
+    #
+    # determine if there is an alternative base in the first place
+    # a fraction of the reads in a cell need to vote for a tuple,
+    # this fraction is stored in the alpha parameter , or a minimum amount of reads, stored in the beta parameter
+    # determine tp*, the alleles we expect observe
+    # ϴ τ α γ κ λ ν ξ ρ ϕ
+    α = 0.2  # minimum relative abundance of sSNV voting reads in single sample
+    β = 3  # minimum amount of sSSNV reads in cell, or in total if α is exceeded
+    γ = 0.9  # minimum amount of votes for sSNV
+    ε = 2  # minimum amount of cells voting for sSNV
+    ω = 0.9  # gsnv majority
+
+    sSNV_votes = collections.Counter()  # { sSNV_alt_base : votes }
+    total_samples_which_voted = 0
+    for sample, observed_tuples in cell_read_obs.items():
+        # First we create a Counter just counting the amount of evidence per
+        # base for this sample :
+        evidence_total_reads = collections.Counter()
+        total_reads = 0
+        for (sSNV_state, gSNV_state), reads in observed_tuples.most_common():
+            if sSNV_state is None:
+                continue
+            evidence_total_reads[sSNV_state] += reads
+            total_reads += reads
+
+        # this is the amount of reads which contain evidence for the reference
+        # base
+        ref_sSNV_reads = evidence_total_reads[sSNV_ref_base]
+        votes_for_this_sample = set()  # the alternative bases this sample votes for
+        for sSNV_state, sSNV_supporting_reads in evidence_total_reads.most_common():
+            # The reference base does not vote.
+            if sSNV_state == sSNV_ref_base or sSNV_state is None:
+                continue
+
+            # check if at least alpha reads vote for the sSNV
+            alpha_value = 0 if ref_sSNV_reads==0 else sSNV_supporting_reads/ref_sSNV_reads
+
+            vote = (
+                1 if (
+                    alpha_value >= α and (
+                        sSNV_supporting_reads +
+                        ref_sSNV_reads) >= β) or (
+                    alpha_value < α and sSNV_supporting_reads >= β) else 0)
+
+            print(f'{sample}\tsSNV alt:{sSNV_state}\t{sSNV_supporting_reads}\tsSNV ref:{ref_sSNV_reads}\t{ref_sSNV_reads}\tα:{alpha_value}\t{"votes" if vote else "no vote"}')
+
+            if vote:
+                votes_for_this_sample.add(sSNV_state)
+                sSNV_votes[sSNV_state] += 1
+                total_samples_which_voted += 1
+
+    # done voting.
+    # the most probable variant base is at least 90% voted for (lambda parameter)
+    # and at least ε cells need to vote for it
+    statistics[(chromosome, ssnv_position)
+               ]['total_samples_voted'] = total_samples_which_voted
+
+    if total_samples_which_voted < ε:
+        # We don't have enough votes
+        print(f'Not enough votes {total_samples_which_voted} < ε:{ε}')
+        return
+    else:
+        print(f'Enough votes {total_samples_which_voted} >= ε:{ε}')
+
+    sSNV_alt_base, sSNV_alt_obs = sSNV_votes.most_common()[0]
+    statistics[(chromosome, ssnv_position)]['sSNV_alt_vote_ratio'] = (
+        sSNV_alt_obs / total_samples_which_voted)
+    if (sSNV_alt_obs / total_samples_which_voted) < γ:
+        # The ratio of votes is below threshold
+        print(f'sSNV alt is {sSNV_alt_base}, ratio threshold γ:{γ} , not met with {sSNV_alt_obs / total_samples_which_voted}')
+        return
+
+    print(f'sSNV alt is {sSNV_alt_base}, γ: {sSNV_alt_obs / total_samples_which_voted} >= {γ}')
+
+    ### Here the "Stats" part of Conbase ends ###
+    #############################################
+
+    # Now we determined the sSNV alt base,
+    # now determine the linked gSNV
+    gSNV_alt_base = None  # Lazy not defined before
+    for basecall, obs in gSNV_obs_phased.most_common():
+        if basecall != gSNV_ref_base:
+            gSNV_alt_base = basecall
+            break
+
+    if sSNV_alt_base is None or gSNV_alt_base is None:
+        # No phased alt base found ...
+        print(f'No phased allele found')
+        return
+
+    # Determine the phase (most common genotypes)
+    sSNV_phase = None
+    wt_allele_gSNV = None
+    snv_allele_gSNV = None
+    sSNV_phased_votes = sum((obs
+                             for (sSNV_state, gSNV_state), obs
+                             in complete_genotype_obs.most_common()
+                             if sSNV_state == sSNV_alt_base and  gSNV_state is not None
+                             ))
+
+    if sSNV_phased_votes==0:
+        print('No votes cast for phasing the selected alternative allele')
+        return
+    # Find the phased germline variant:
+    for (sSNV_state, gSNV_state), this_phase_obs in complete_genotype_obs.most_common():
+        if sSNV_state != sSNV_alt_base or  gSNV_state is None:
+            continue
+
+        print(f'There are {sSNV_phased_votes} votes for the haplotype {sSNV_state} {gSNV_state}, ratio:{this_phase_obs / sSNV_phased_votes} ')
+
+        if (this_phase_obs / sSNV_phased_votes) < ω:
+            print(f'This does not pass the threshold ω {ω} ')
+            return
+        else:
+            print(f'This does pass the threshold ω {ω} ')
+        break
+
+    sSNV_phase = (sSNV_state, gSNV_state)
+    phased_gSNV = gSNV_state
+    snv_allele_gSNV = None
+    if gSNV_state == gSNV_ref_base:
+        # the reference allele is alt
+        wt_allele_gSNV = gSNV_alt_base
+        snv_allele_gSNV = gSNV_ref_base
+    else:
+        wt_allele_gSNV = gSNV_ref_base
+        snv_allele_gSNV = gSNV_alt_base
+            # the reference allele is ref
+        # break ? why?
+
+    statistics[(chromosome, ssnv_position)]['sSNV_gSNV_phase'] = snv_allele_gSNV
+
+    if snv_allele_gSNV is None:
+        print("No germline variant was phased")
+        return
+
+    # The valid tuples are thus:
+    uninformative_allele = (sSNV_ref_base, wt_allele_gSNV)
+    informative_allele_wt = (sSNV_ref_base, snv_allele_gSNV)
+
+    valid_tuples = [sSNV_phase,  # mutated
+                    informative_allele_wt,  # wt
+                    uninformative_allele]
+
+    # As we have umi's we just have a threshold for the least amount of reads
+    # we want to observe for a molecule to be taken into account
+    # Count how often we found valid and invalid genotypes
+    valid = 0
+    invalid = 0
+    valid_var = 0
+    invalid_var = 0
+    for (ssnv, gsnv), tuple_obs in complete_genotype_obs.most_common():
+        if ssnv == sSNV_alt_base:  # variant:
+            if (ssnv, gsnv) in valid_tuples:
+                valid_var += tuple_obs
+            else:
+                invalid_var += tuple_obs
+
+        if (ssnv, gsnv) in valid_tuples:
+            valid += tuple_obs
+        else:
+            invalid += tuple_obs
+
+    phase_ratio = 0
+    if valid_var + invalid_var > 0:
+        phase_ratio = valid_var / (valid_var + invalid_var)
+
+    # Score Tuples with evidence for variant
+    haplotype_scores[(chrom, pos)] = {
+        'valid_tuples': valid,
+        'invalid_tuples': invalid,
+        'valid_var_tuples': valid_var,
+        'invalid_var_tuples': invalid_var,
+        'phasing_ratio': phase_ratio,
+        'gSNV_allelic_bias': gSNV_obs[gSNV_ref_base] / (gSNV_obs[gSNV_ref_base] + gSNV_obs[gSNV_alt_base])
+    }
+
+    print(f'Germline variant obs: {gSNV_ref_base} {gSNV_alt_base}')
+    print(f'sSNV obs: {sSNV_ref_base} {sSNV_alt_base}')
+    if sSNV_phase is not None:
+        print(f'sSNV variant is phased with {phased_gSNV}')
+    print(Style.BRIGHT + 'Valid tuples:' + Style.RESET_ALL)
+    for g, s in valid_tuples:
+        print(f' {g}\t{s}')
+
+    print(Style.BRIGHT + 'Scores:' + Style.RESET_ALL)
+    for name, obs in haplotype_scores[(chrom, pos)].items():
+        print(f' {name}\t{obs}')
+
+    # Create the cell call dictionary
+
+    uninformative_obs = 0 # This is important, otherwise we might use a SNP ..
+    for cell, observed_tuples in cell_read_obs.items():
+        print(cell,  observed_tuples)
+        total_reads = 0
+        phased_variant_support_reads = 0
+        unphased_variant_support_reads = 0
+        variant_neg_support_reads = 0
+        uninformative_reads = 0
+        conflict_reads = 0
+        for (sSNV_state, gSNV_state), reads in observed_tuples.items():
+            if sSNV_state is None:
+                continue
+            total_reads += reads
+            if sSNV_state == sSNV_alt_base:
+                if gSNV_state == wt_allele_gSNV:
+                    conflict_reads += reads
+                elif gSNV_state == snv_allele_gSNV:
+                    # reads containing the sSNV and gSNV as expected
+                    phased_variant_support_reads += reads
+                elif gSNV_state is None:
+                    # reads containing sSNV but not overlapping with gSNV
+                    unphased_variant_support_reads += reads
+
+            elif sSNV_state == sSNV_ref_base:
+                if gSNV_state == snv_allele_gSNV:
+                    # reads on informative allele where we found evidence
+                    # of the sSNV not being present
+                    variant_neg_support_reads += reads
+                elif gSNV_state == wt_allele_gSNV:
+                    uninformative_reads += reads
+                    uninformative_obs += 1
+
+        if total_reads==0:
+            continue
+        if variant_neg_support_reads>0:
+            cell_call_data[(chrom, pos)][cell] = 0
+
+        if (unphased_variant_support_reads +
+                phased_variant_support_reads) / total_reads > 0.1:
+            cell_call_data[(chrom, pos)][cell] = 1
+            if unphased_variant_support_reads + phased_variant_support_reads >= 3:
+                cell_call_data[(chrom, pos)][cell] = 10
+
+        if (phased_variant_support_reads) / total_reads > 0.1:
+            cell_call_data[(chrom, pos)][cell] = 2
+            if phased_variant_support_reads >= 3:
+                cell_call_data[(chrom, pos)][cell] = 20
+
+        #if uninformative_reads / total_reads > 0.1:
+        #    # 0.1 for ref allele obs
+    #        cell_call_data[(chrom, pos)][cell] += 0.1
+
+        if conflict_reads / (total_reads) > 0.2:
+            cell_call_data[(chrom, pos)][cell] = -1  # invalid
+
+
+    haplotype_scores[(chrom, pos)]['uninformative_obs'] = uninformative_obs
+    # Annotate every molecule...
+
+    for molecule_id, (m, ssnv_state, gsnv_state) in enumerate(
+            window_molecules):
+        m.set_meta('mi', molecule_id)
+        if gsnv_state is None:
+            m.set_meta('gv', '?')
+        else:
+            m.set_meta('gv', gsnv_state)
+
+        if ssnv_state is None:
+            m.set_meta('sv', '?')
+        else:
+            m.set_meta('sv', ssnv_state)
+
+        if ssnv_state is None:
+            m.set_meta('VD', 'NO_SNV_OVERLAP')
+            continue
+
+        if gsnv_state is not None and not (
+                ssnv_state, gsnv_state) in valid_tuples:
+            m.set_meta('VD', 'INVALID_PHASE')
+            continue
+        if ssnv_state == sSNV_alt_base:
+            m.set_meta('VD', 'SNV_ALT')
+            continue
+
+        if ssnv_state == sSNV_ref_base and gsnv_state == phased_gSNV:
+            m.set_meta('VD', 'SNV_REF')
+            continue
+        if gsnv_state != phased_gSNV:
+            m.set_meta('VD', 'UNINFORMATIVE_ALLELE')
+            continue
+
+        m.set_meta('VD', 'REJECTED')
+    # write
+
+    for m, ssnv_state, gsnv_state in window_molecules:
+        m.write_tags()
+        m.write_pysam(out)
+
+        # Update read groups
+        for fragment in m:
+            read_groups.add(fragment.get_read_group())
+
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="""
+    Known variant locations extraction tool
+    """)
+    argparser.add_argument('bamfiles', nargs='+')
+    argparser.add_argument(
+        '-ssnv',
+        help="sSNV bed file, or single sSNV location chr:pos",
+        type=str,
+        required=True)
+    argparser.add_argument(
+        '-gsnv',
+        help="gSNV bed file, or single gSNV location chr:pos",
+        type=str,
+        required=True)
+    argparser.add_argument(
+        '-reference',
+        help="reference fasta file",
+        type=str,
+        required=True)
+    argparser.add_argument('-head', type=int)
+    argparser.add_argument('-min_read_obs', type=int, default=2)
+    argparser.add_argument(
+        '--realign',
+        action='store_true',
+        help='Perform re-alignment using GATK')
+    argparser.add_argument(
+        '--ignore_bam_issues',
+        action='store_true',
+        help='')
+    argparser.add_argument(
+        '-gatk3_path',
+        type=str,
+        default='GenomeAnalysisTK.jar')
+    argparser.add_argument('-indelvcf', type=str)
+
+    argparser.add_argument('-prefix', type=str, default='')
+
+    argparser.add_argument('-window_radius', type=int, default=250)
+    argparser.add_argument('-realign_mem', type=int, default=25)
+    argparser.add_argument('--cluster', action='store_true')
+    args = argparser.parse_args()
+
+    if args.realign and args.indelvcf is None:
+        raise ValueError("supply -indelvcf")
+
+    WINDOW_RADIUS = args.window_radius
+    paths = args.bamfiles
+
+    # Load probed variants
+    probed_variants = {}
+    if ':' in args.ssnv:
+        chrom, snv_pos = args.ssnv.strip().split(':')
+        chrom_g, gsnv_pos = args.gsnv.strip().split(':')
+        assert chrom==chrom_g, 'germline SNV should be on the same chromosome as the sSNV'
+        probed_variants[(chrom, int(snv_pos))] = int( gsnv_pos)
+
+    else:
+        with open(args.ssnv) as s, \
+                open(args.gsnv) as g:
+            for i, (ssnv_line, gsnv_line) in enumerate(zip(s, g)):
+                if ssnv_line.startswith('track name'):
+                    continue
+                chrom, snv_pos, _ = ssnv_line.strip().split()
+                _, gsnv_pos, __ = gsnv_line.strip().split()
+                snv_pos, gsnv_pos = int(snv_pos), int(gsnv_pos)
+                probed_variants[(chrom, snv_pos)] = gsnv_pos
+
+    if args.cluster:
+        for i,((chrom, snv_pos), gsnv_pos) in enumerate(probed_variants.items()):
+            arguments = " ".join(
+                [x for x in sys.argv if x != '--cluster' and '.bed' not in x and '-ssnv'!=x and '-gsnv'!=x ])
+
+            job_name = f'vstat_{i}'
+            out_folder = './variantStats'
+            if not os.path.exists(out_folder):
+                os.makedirs(out_folder)
+            print('submission.py' + f' -y --py36 -time 50 -t 1 -m 50 -N {job_name} "{arguments} -ssnv {chrom}:{snv_pos} -gsnv {chrom}:{gsnv_pos} -prefix {out_folder}/{chrom}_{snv_pos}" ')
+        exit()
+    reference = pysamiterators.CachedFasta(pysam.FastaFile(args.reference))
+
+    cell_obs = collections.defaultdict(
+        lambda: collections.defaultdict(
+            collections.Counter))
+    statistics = collections.defaultdict(
+        lambda: collections.defaultdict(
+            collections.Counter))
+    cell_call_data = collections.defaultdict(dict)  # location->cell->haplotype
+    haplotype_scores = {}
+
+    read_groups = set()  # Store unique read groups in this set
+    with sorted_bam_file(f'{args.prefix}_evidence.bam', origin_bam=pysam.AlignmentFile(paths[0], ignore_truncation=args.ignore_bam_issues), read_groups=read_groups) as out:
+
+        for variant_index, ((chromosome, ssnv_position), potential_gsnv_position) in enumerate(
+                probed_variants.items()):
+
+            try:
+                obtain_variant_statistics(
+                    alignment_file_paths=paths,
+                    cell_obs=cell_obs,
+                    cell_call_data=cell_call_data,
+                    statistics=statistics,
+                    reference=reference,
+                    chromosome=chromosome,
+                    ssnv_position=ssnv_position,
+                    gsnv_position=potential_gsnv_position,
+                    WINDOW_RADIUS=WINDOW_RADIUS,
+                    haplotype_scores=haplotype_scores,
+                    out=out, min_read_obs=args.min_read_obs,
+                    read_groups=read_groups,
+                    args=args,
+                    umi_hamming_distance=1
+                )
+            except Exception as e:
+                traceback.print_exc()
+
+            if args.head and (variant_index > args.head - 1):
+                print(
+                    f'Stopping at variant {variant_index+1} because head was supplied ')
+                break
+
+    lambda_free_dict = {}
+    for key, stats in statistics.items():
+        lambda_free_dict[key] = {
+
+            'mean_clip_pbp': meanOfCounter(stats['clips_per_bp']),
+            'mean_ins_pbp': meanOfCounter(stats['inserts_per_bp']),
+            'mean_del_pbp': meanOfCounter(stats['deletions_per_bp']),
+            'mean_matches_pbp': meanOfCounter(stats['matches_per_bp']),
+            'mean_alt_mapping_per_read': meanOfCounter(stats['alt_per_read']),
+
+            'ssnv_ref_phred': meanOfCounter(stats['ssnv_ref_phred']),
+            'ssnv_alt_phred': meanOfCounter(stats['ssnv_alt_phred']),
+            'gsnv_ref_phred': meanOfCounter(stats['gsnv_ref_phred']),
+            'gsnv_any_alt_phred': meanOfCounter(stats['gsnv_any_alt_phred']),
+
+            'mean_max_mapping_quality': meanOfCounter(stats['max_mapping_quality']),
+            'mean_ivt_dups': meanOfCounter(stats['ivt_dups']),
+            'mean_undigested': meanOfCounter(stats['undigested']),
+            'R2_unmapped': stats['R2_unmapped'][True],
+
+
+            'mean_fragment_size': meanOfCounter(stats['fragment_size']),
+            'mean_reads': meanOfCounter(stats['reads']),
+            'total_reads': sum((amount * frequency for amount, frequency in stats['reads'].most_common())),
+            'total_molecules': sum((amount * frequency for amount, frequency in stats['molecules'].most_common()))
+
+        }
+
+    print('Writing final site table')
+    try:
+        site_stats = pd.DataFrame(lambda_free_dict).T.join(
+            pd.DataFrame(haplotype_scores).T)
+    except Exception as e:
+        print(e)
+        site_stats = pd.DataFrame(lambda_free_dict).T
+
+    site_stats.to_pickle(f'{args.prefix}_site_stats.pickle.gz')
+    site_stats.to_csv(f'{args.prefix}_site_stats.csv')
+
+    print('Writing final cell table')
+    try:
+        cell_call_df = pd.DataFrame(cell_call_data)
+        cell_call_df.to_pickle(f'{args.prefix}_cell_calls.pickle.gz')
+        cell_call_df.to_csv(f'{args.prefix}_cell_calls.csv')
+    except Exception as e:
+        print(e)