--- a +++ b/singlecellmultiomics/bamProcessing/bamToCountTable.py @@ -0,0 +1,747 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import singlecellmultiomics.modularDemultiplexer +import os +import sys +import pysam +import collections +import argparse +import pandas as pd +import numpy as np +import itertools +import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods +import gzip # for loading blacklist bedfiles +TagDefinitions = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions + + +def coordinate_to_sliding_bin_locations(dp, bin_size, sliding_increment): + """ + Convert a single value to a list of overlapping bins + + Parameters + ---------- + point : int + coordinate to look up + + bin_size : int + bin size + + sliding_increment : int + sliding window offset, this is the increment between bins + + Returns + ------- + start : int + the start coordinate of the first overlapping bin + end :int + the end of the last overlapping bin + + start_id : int + the index of the first overlapping bin + end_id : int + the index of the last overlapping bin + + """ + start_id = int(np.ceil(((dp - bin_size) / sliding_increment))) + start = start_id * sliding_increment + end_id = int(np.floor(dp / sliding_increment)) + end = end_id * sliding_increment + bin_size + return start, end, start_id, end_id + + +def coordinate_to_bins(point, bin_size, sliding_increment): + """ + Convert a single value to a list of overlapping bins + + Parameters + ---------- + point : int + coordinate to look up + + bin_size : int + bin size + + sliding_increment : int + sliding window offset, this is the increment between bins + + Returns + ------- + list: [(bin_start,bin_end), .. ] + + """ + start, end, start_id, end_id = coordinate_to_sliding_bin_locations( + point, bin_size, sliding_increment) + return [(i * sliding_increment, i * sliding_increment + bin_size) + for i in range(start_id, end_id + 1)] + + +def read_has_alternative_hits_to_non_alts(read): + if read.has_tag('XA'): + for alt_align in read.get_tag('XA').split(';'): + if len(alt_align) == 0: # Sometimes this tag is empty for some reason + continue + + hchrom, hpos, hcigar, hflag = alt_align.split(',') + if not hchrom.endswith('_alt'): + return True + return False + + +def readTag(read, tag, defective='None'): + try: + value = singlecellmultiomics.modularDemultiplexer.metaFromRead( + read, tag) + except Exception as e: + value = defective + return value + +# define if a read should be used + + +def read_should_be_counted(read, args, blacklist_dic = None): + """ + Check if a read should be counted given the filter arguments + + Parameters + ---------- + read : pysam.AlignedSegment or None + read to check if it should be counted + + Returns + ------- + bool + """ + + if args.r1only and read.is_read2: + return False + if args.r2only and read.is_read1: + return False + + if args.filterMP: + if not read.has_tag('mp'): + return False + + if read.get_tag('mp')!='unique': + return False + + if read is None or read.is_qcfail: + return False + + # Mapping quality below threshold + if read.mapping_quality < args.minMQ: + return False + + + if args.proper_pairs_only and not read.is_proper_pair: + return False + + if args.no_indels and ('I' in read.cigarstring or 'D' in read.cigarstring): + return False + + if args.max_base_edits is not None and read.has_tag('NM') and int(read.get_tag('NM'))>args.max_base_edits: + return False + + if args.no_softclips and 'S' in read.cigarstring: + return False + + # Read has alternative hits + if 
args.filterXA: + if read_has_alternative_hits_to_non_alts(read): + return False + + # Read is a duplicate + # (args.dedup and ( not read.has_tag('RC') or (read.has_tag('RC') and read.get_tag('RC')!=1))): + if read.is_unmapped or (args.dedup and ( + read.has_tag("RR") or read.is_duplicate)): + return False + + # Read is in blacklist + if blacklist_dic is not None: + if read.reference_name in blacklist_dic: + # iterate through tuples of startend to check + for startend in blacklist_dic[read.reference_name]: + # start is 0-based inclusive, end is 0-based exclusive + start_bad = read.reference_start >= startend[0] and read.reference_start < startend[1] + end_bad = read.reference_end >= startend[0] and read.reference_end < startend[1] + if start_bad or end_bad: + return False + + return True + + +def tagToHumanName(tag, TagDefinitions): + if tag not in TagDefinitions: + return tag + return TagDefinitions[tag].humanName + + +def assignReads( + read, + countTable, + args, + joinFeatures, + featureTags, + sampleTags, + more_args=[], + blacklist_dic = None): + + assigned = 0 + if not read_should_be_counted(read, args, blacklist_dic): + return assigned + + # Get the sample to which this read belongs + sample = tuple(readTag(read, tag) for tag in sampleTags) + + # Decide how many counts this read yields + countToAdd = 1 + + if args.r1only or args.r2only: + countToAdd = 1 + elif not args.doNotDivideFragments: # not not = True + # IF the read is paired, and the mate mapped, we should count 0.5, and will end up + # with 1 in total + + countToAdd = (0.5 if (read.is_paired and not read.mate_is_unmapped) else 1) + + assigned += 1 + + if args.divideMultimapping: + if read.has_tag('XA'): + countToAdd = countToAdd / len(read.get_tag('XA').split(';')) + elif read.has_tag('NH'): + countToAdd = countToAdd / int(read.get_tag('NH')) + else: + countToAdd = countToAdd + + # Define what counts to add to what samples + # [ (sample, feature, increment), .. 
] + count_increment = [] + + feature_dict = {} + joined_feature = [] + used_features = [] + for tag in featureTags: + feat = str(readTag(read, tag)) + feature_dict[tag] = feat + if args.bin is not None and args.binTag == tag: + continue + if args.byValue is not None and tag == args.byValue: + continue + joined_feature.append(feat) + used_features.append(tag) + + if joinFeatures: + if args.splitFeatures: + # Obtain all key states: + key_states = [] + for tag in featureTags: + value = feature_dict.get(tag) + key_states.append(value.split(args.featureDelimiter)) + + for state in itertools.product(*key_states): + joined_feature = [] + feature_dict = { + feature: value + for feature, value in zip(featureTags, state) + } + for feature, value in zip(featureTags, state): + if args.bin is not None and args.binTag == feature: + continue + + if len(value) > 0: + joined_feature.append(value) + else: + joined_feature.append('None') + + if args.byValue is not None: + raise NotImplementedError( + 'By value is not implemented for --splitFeatures') + + else: + count_increment.append({ + 'key': tuple(joined_feature), + 'features': feature_dict, + 'samples': [sample], + 'increment': countToAdd}) + joined_feature[0] + else: + if args.byValue is not None: + + try: + add = float(feature_dict.get(args.byValue, 0)) + except ValueError: + add = 0 + + count_increment.append({ + 'key': tuple(joined_feature), + 'features': feature_dict, + 'samples': [sample], + 'increment': add}) + + else: + count_increment.append({ + 'key': tuple(joined_feature), + 'features': feature_dict, + 'samples': [sample], + 'increment': countToAdd}) + else: + if args.bin is not None: + raise NotImplementedError('Try using -joinedFeatureTags') + + for feature, value in feature_dict.items(): + if args.byValue is not None and feature == args.byValue: + + try: + add = float(feature_dict.get(args.byValue, 0)) + except ValueError: + add = 0 + + count_increment.append({ + 'key': tuple(joined_feature), + 'features': feature_dict, + 'samples': [sample], + 'increment': add}) + + elif args.splitFeatures: + for f in value.split(args.featureDelimiter): + count_increment.append({ + 'key': (f), + 'features': {feature: f}, + 'samples': [sample], + 'increment': countToAdd + }) + + else: + count_increment.append({ + 'key': (value, ), + 'features': {feature: value}, + 'samples': [sample], + 'increment': countToAdd}) + + """ + Now we have a list of dicts: + { + 'key':feature, + 'features':{feature:value}, + 'samples':[sample], + 'increment':countToAdd}) + } + """ + + # increment the count table accordingly: + if args.bin is not None: + for dtable in count_increment: + key = dtable['key'] + countToAdd = dtable['increment'] + samples = dtable['samples'] + value_to_be_binned = dtable['features'].get(args.binTag, None) + + if value_to_be_binned is None or value_to_be_binned == 'None': + continue + + for start, end in coordinate_to_bins( + int(value_to_be_binned), args.bin, args.sliding): + # Reject bins outside boundary + if not args.keepOverBounds and ( + start < 0 or end > args.ref_lengths[read.reference_name]): + continue + for sample in samples: + countTable[sample][tuple( + list(key) + [start, end])] += countToAdd + + elif args.bedfile is not None: + + for dtable in count_increment: + if args.byValue: + key = [args.byValue] + else: + key = dtable['key'] + countToAdd = dtable['increment'] + samples = dtable['samples'] + + # Get features from bedfile + start, end, bname = more_args[0], more_args[1], more_args[2] + jfeat = tuple(list(key) + [start, end, bname]) + 
if len(key): + countTable[sample][jfeat] += countToAdd + # else: this will also emit non assigned reads + # countTable[sample][ 'None' ] += countToAdd + + else: + for dtable in count_increment: + key = dtable['key'] + countToAdd = dtable['increment'] + samples = dtable['samples'] + + for sample in samples: + if len(key) == 1: + countTable[sample][key[0]] += countToAdd + else: + countTable[sample][key] += countToAdd + + return assigned + + +def create_count_table(args, return_df=False): + + if len(args.alignmentfiles) == 0: + raise ValueError("Supply at least one bam file") + if args.bedfile is not None: + assert(os.path.isfile(args.bedfile)) + + if args.sliding is None: + args.sliding = args.bin + + if not return_df and args.o is None and args.alignmentfiles is not None: + args.showtags = True + + if args.showtags: + # Find which tags are available in the file: + head = 1000 + tagObs = collections.Counter() + for bamFile in args.alignmentfiles: + with pysam.AlignmentFile(bamFile) as f: + for i, read in enumerate(f): + tagObs += collections.Counter([k for k, + v in read.get_tags(with_value_type=False)]) + if i == (head - 1): + break + import colorama + + print( + f'{colorama.Style.BRIGHT}Tags seen in the supplied bam file(s):{colorama.Style.RESET_ALL}') + for observedTag in tagObs: + tag = observedTag + if observedTag in TagDefinitions: + t = TagDefinitions[observedTag] + humanName = t.humanName + isPhred = t.isPhred + else: + t = observedTag + isPhred = False + humanName = f'{colorama.Style.RESET_ALL}<No information available>' + + allReadsHaveTag = (tagObs[tag] == head) + color = colorama.Fore.GREEN if allReadsHaveTag else colorama.Fore.YELLOW + print( + f'{color}{ colorama.Style.BRIGHT}{tag}{colorama.Style.RESET_ALL}\t{color+humanName}\t{colorama.Style.DIM}{"PHRED" if isPhred else ""}{colorama.Style.RESET_ALL}' + + f'\t{"" if allReadsHaveTag else "Not all reads have this tag"}') + + print(f'{colorama.Style.BRIGHT}\nAVO lab tag list:{colorama.Style.RESET_ALL}') + for tag, t in TagDefinitions.items(): + print(f'{colorama.Style.BRIGHT}{tag}{colorama.Style.RESET_ALL}\t{t.humanName}\t{colorama.Style.DIM}{"PHRED" if t.isPhred else ""}{colorama.Style.RESET_ALL}') + exit() + + if args.o is None and not return_df: + raise ValueError('Supply an output file') + + if args.alignmentfiles is None: + raise ValueError('Supply alignment (BAM) files') + + joinFeatures = False + featureTags = [] + if args.featureTags is not None: + featureTags = args.featureTags.split(',') + + if args.joinedFeatureTags is not None: + featureTags = args.joinedFeatureTags.split(',') + joinFeatures = True + + # When -byValue is used, and joined feature tags are supplied, automatically append the args.byValue tag, otherwise it will get lost + if args.byValue is not None and len(featureTags)>0 and args.byValue not in featureTags: + featureTags.append(args.byValue) + + + if args.bin is not None and args.binTag not in featureTags: + print("The bin tag was not supplied as feature, automatically appending the bin feature.") + featureTags.append(args.binTag) + + if len(featureTags) == 0: + raise ValueError( + "No features supplied! 
Please supply -featureTags -joinedFeatureTags and or -binTag") + + sampleTags = args.sampleTags.split(',') + countTable = collections.defaultdict( + collections.Counter) # cell->feature->count + + if args.blacklist is not None: + # create blacklist dictionary {chromosome : [ (start1, end1), ..., (startN, endN) ]} + # used to check each read and exclude if it is within any of these start end sites + # + blacklist_dic = {} + print("Creating blacklist dictionary:") + with open(args.blacklist, mode='r') as blfile: + for row in blfile: + parts = row.strip().split() + chromo, start, end = parts[0], int(parts[1]), int(parts[2]) + if chromo not in blacklist_dic: + # init chromo + blacklist_dic[chromo] = [] # list of tuples + blacklist_dic[chromo].append( (start, end) ) + print(blacklist_dic) + else: + blacklist_dic = None + + assigned = 0 + for bamFile in args.alignmentfiles: + + with pysam.AlignmentFile(bamFile) as f: + i = 0 # make sure i is defined + if args.bin: + # Obtain the reference sequence lengths + ref_lengths = { + r: f.get_reference_length(r) for r in f.references} + args.ref_lengths = ref_lengths + if args.bedfile is None: + # for adding counts associated with a tag OR with binning + if args.contig is not None: + pysam_iterator = f.fetch(args.contig) + else: + pysam_iterator = f + + for i, read in enumerate(pysam_iterator): + if i % 1_000_000 == 0: + print( + f"{bamFile} Processed {i} reads, assigned {assigned}, completion:{100*(i/(0.001+f.mapped+f.unmapped+f.nocoordinate))}%") + + if args.head is not None and i > args.head: + break + + assigned += assignReads(read, + countTable, + args, + joinFeatures, + featureTags, + sampleTags, + blacklist_dic = blacklist_dic) + else: # args.bedfile is not None + # for adding counts associated with a bedfile + with open(args.bedfile, "r") as bfile: + #breader = csv.reader(bfile, delimiter = "\t") + for row in bfile: + + parts = row.strip().split() + chromo, start, end, bname = parts[0], int(float( + parts[1])), int(float(parts[2])), parts[3] + if args.contig is not None and chromo != args.contig: + continue + for i, read in enumerate(f.fetch(chromo, start, end)): + if i % 1_000_000 == 0: + print( + f"{bamFile} Processed {i} reads, assigned {assigned}, completion:{100*(i/(0.001+f.mapped+f.unmapped+f.nocoordinate))}%") + assigned += assignReads(read, + countTable, + args, + joinFeatures, + featureTags, + sampleTags, + more_args=[start, + end, + bname], + blacklist_dic = blacklist_dic) + + if args.head is not None and i > args.head: + break + + print( + f"Finished: {bamFile} Processed {i} reads, assigned {assigned}") + print(f"Finished counting, now exporting to {args.o}") + df = pd.DataFrame.from_dict(countTable) + + # Set names of indices + if not args.noNames: + df.columns.set_names([tagToHumanName(t, TagDefinitions) + for t in sampleTags], inplace=True) + + try: + if args.bin is not None: + index_names = [tagToHumanName( + t, TagDefinitions) for t in featureTags if t != args.binTag] + ['start', 'end'] + df.index.set_names(index_names, inplace=True) + elif args.bedfile is not None: + index_names = [tagToHumanName( + t, TagDefinitions) for t in featureTags if t != args.binTag] + ['start', 'end', 'bname'] + df.index.set_names(index_names, inplace=True) + elif joinFeatures: + index_names = [ + tagToHumanName( + t, TagDefinitions) for t in featureTags] + df.index.set_names(index_names, inplace=True) + else: + index_names = ','.join( + [tagToHumanName(t, TagDefinitions) for t in featureTags]) + df.index.set_names(index_names, inplace=True) + except 
Exception as e:
+        pass
+
+    if return_df:
+        return df
+
+    if args.bulk:
+        sum_row = df.sum(axis=1)
+        df = pd.DataFrame(sum_row)
+        df.columns = ["Bulkseq"]
+
+    if args.o.endswith('.pickle') or args.o.endswith('.pickle.gz'):
+        df.to_pickle(args.o)
+    else:
+        df.to_csv(args.o)
+    print("Finished export.")
+    return args.o
+
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description='Convert BAM file(s) which have been annotated using featureCounts or otherwise into a count matrix')
+    argparser.add_argument(
+        '-o',
+        type=str,
+        help="Output CSV path; a pickled pandas DataFrame is written instead when the path ends with .pickle or .pickle.gz",
+        required=False)
+    argparser.add_argument(
+        '-featureTags',
+        type=str,
+        default=None,
+        help='Tag(s) used for defining the ROWS (index) of the matrix. Single dimension.')
+    argparser.add_argument(
+        '-joinedFeatureTags',
+        type=str,
+        default=None,
+        help='These define the ROWS (multi-index) of your matrix. For example if you want allele (DA) and restriction site (DS) use DA,DS. If you want an index level containing the chromosome mapped to, use "chrom" as feature. Use this argument if you want to use a multidimensional index')
+    argparser.add_argument(
+        '-sampleTags',
+        type=str,
+        default='SM',
+        help='Comma separated tags defining the names of the COLUMNS in the output matrix')
+    argparser.add_argument('alignmentfiles', type=str, nargs='*')
+    argparser.add_argument(
+        '-head',
+        type=int,
+        help='Run the algorithm only on the first N reads to check if the result looks like what you expect.')
+
+    argparser.add_argument(
+        '--bulk',
+        action='store_true',
+        help='Include this when making a count table for bulk-seq data. This operation will sum the counts of all samples into a single column.')
+    argparser.add_argument(
+        '--r1only',
+        action='store_true',
+        help='Only count R1')
+
+    argparser.add_argument(
+        '--r2only',
+        action='store_true',
+        help='Only count R2')
+
+    argparser.add_argument(
+        '--splitFeatures',
+        action='store_true',
+        help='Split features on the feature delimiter. For example, if a read has the feature "Foo,Bar", increase counts for both Foo and Bar')
+    argparser.add_argument('-featureDelimiter', type=str, default=',')
+    argparser.add_argument(
+        '-contig',
+        type=str,
+        help='Run only on this chromosome')
+
+    multimapping_args = argparser.add_argument_group('Multimapping', '')
+    multimapping_args.add_argument(
+        '--divideMultimapping',
+        action='store_true',
+        help='Divide multimapping reads over all targets. Requires the XA or NH tag to be set.')
+    multimapping_args.add_argument(
+        '--doNotDivideFragments',
+        action='store_true',
+        help='When used, every read is counted once, so a mapped fragment counts as two reads. Otherwise each mate of a mapped pair counts as 0.5')
+
+    binning_args = argparser.add_argument_group('Binning', '')
+    #binning_args.add_argument('-offset', type=int, default=0, help="Add offset to bin. If bin=1000, offset=200, f=1999 -> 1200. f=4199 -> 3200")
+    binning_args.add_argument(
+        '-sliding',
+        type=int,
+        help="Make bins overlapping; the step size between bins equals the supplied value. If nothing is supplied this value equals the bin size")
+    binning_args.add_argument(
+        '--keepOverBounds',
+        action='store_true',
+        help="Keep bins which extend beyond chromosome bounds (start < 0 or end > chromosome length)")
+
+    binning_args.add_argument(
+        '-bin',
+        type=int,
+        help="Divide and floor to bin features. 
If bin=1000, f=1999 -> 1000.")
+    # binning_args.add_argument('--showBinEnd', action='store_true', help="If
+    # True, then show DS column as 120000-220000, otherwise 120000 only. This
+    # specifies the bin range in which the read was counted" ) this is now
+    # always on!
+    binning_args.add_argument('-binTag', default='DS', help='Tag containing the coordinate to bin, default DS (restriction site)')
+    binning_args.add_argument(
+        '-byValue',
+        type=str,
+        help='Extract the value from the supplied tag and use this as the count to add')
+    binning_args.add_argument('-blacklist', metavar='BED FILE BLACKLIST TO FILTER OUT', help='Bed file of blacklist regions to exclude', default=None)
+    bed_args = argparser.add_argument_group('Bedfiles', '')
+    bed_args.add_argument(
+        '-bedfile',
+        type=str,
+        help="Bed file containing 4 columns (chromosome, start, end, name); counts are fetched per region")
+
+    filters = argparser.add_argument_group('Filters', '')
+
+    filters.add_argument(
+        '--proper_pairs_only',
+        action='store_true',
+        help='Only count reads mapped in a proper pair (within the expected insert size range)')
+
+    filters.add_argument(
+        '--no_softclips',
+        action='store_true',
+        help='Only count reads without softclips')
+
+    filters.add_argument(
+        '-max_base_edits',
+        type=int,
+        help='Only count reads with at most this many bases differing from the reference (NM tag)')
+
+    filters.add_argument(
+        '--no_indels',
+        action='store_true',
+        help='Only count reads without indels')
+
+    filters.add_argument(
+        '--dedup',
+        action='store_true',
+        help='Count only the first occurrence of a molecule; reads carrying the RR tag or flagged as duplicates are excluded')
+
+    filters.add_argument(
+        '-minMQ',
+        type=int,
+        default=0,
+        help="Minimum mapping quality")
+
+    filters.add_argument(
+        '--filterMP',
+        action='store_true',
+        help="Filter out reads which are not uniquely mappable; this is based on the presence of the `mp` tag, which can be generated by bamtagmultiome.py")
+
+    filters.add_argument(
+        '--filterXA',
+        action='store_true',
+        help="Do not count reads where the XA (alternative hits) tag has been set for a non-alternative locus.")
+
+    argparser.add_argument(
+        '--noNames',
+        action='store_true',
+        help='Do not set names of the index and columns; this simplifies the resulting CSV/pickle file')
+    argparser.add_argument(
+        '--showtags',
+        action='store_true',
+        help='Show a list of commonly used tags')
+
+    args = argparser.parse_args()
+    create_count_table(args)
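For reference, a minimal sketch of how the binning helpers added in this diff behave. The import path follows the file location in the patch header and is assumed to be importable; the coordinate (1999) and bin settings are illustrative only:

from singlecellmultiomics.bamProcessing.bamToCountTable import coordinate_to_bins

# Non-overlapping 1 kb bins: coordinate 1999 falls in a single bin (1000, 2000).
print(coordinate_to_bins(1999, bin_size=1000, sliding_increment=1000))  # [(1000, 2000)]

# Overlapping bins with a 500 bp step: the same coordinate is assigned to two bins.
print(coordinate_to_bins(1999, bin_size=1000, sliding_increment=500))   # [(1000, 2000), (1500, 2500)]

These are the (start, end) bins that -bin and -sliding produce for the value stored in -binTag; unless --keepOverBounds is passed, bins extending beyond the chromosome bounds are dropped during counting.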