SingleCellMultiOmics / Git / [2c420a] /singlecellmultiomics/bamProcessing/bamToMethylationCalls.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[2c420a]: / singlecellmultiomics / bamProcessing / bamToMethylationCalls.py
History
Download this file
255 lines (209 with data), 12.3 kB

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import multiprocessing
from singlecellmultiomics.bamProcessing.bamBinCounts import generate_commands, count_methylation_binned
import argparse
from colorama import Fore, Style
from singlecellmultiomics.utils.export import dataframe_to_wig
from singlecellmultiomics.methylation import MethylationCountMatrix
from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_from_pysam_alignmentFile
from colorama import Fore,Style


def prefilter(counts, cell_names, min_measurements, min_variance):
    if min_measurements>0:
        counts = counts.loc[:, (counts >= 0).sum() > min_measurements]
    if min_variance>0:
        return counts.loc[:, counts.var() >= min_variance].reindex(cell_names)
    else:
        return counts.reindex(cell_names)


def panda_and_prefilter(args):
    d, args = args # counts_dict ,(cell_names, min_measurements, min_variance)
    return prefilter(pd.DataFrame(d), *args)


def get_methylation_count_matrix(bam_path,
                                 bin_size: int,
                                 bp_per_job: int,
                                 min_samples: int = None,
                                 min_variance: int = None,
                                 min_mapping_qual: int = None,
                                 skip_contigs: set = None,
                                 known_variants: str = None,
                                 maxtime: int = None,
                                 head: int=None,
                                 threads: int = None,
                                 count_reads: bool = True,
                                 **kwargs
                                 ):


    all_kwargs = {'known': known_variants,
            'maxtime': maxtime,
            'single_location':bin_size==1,
            'min_samples':min_samples,
            'min_variance':min_variance,
            'threads':threads,
            'count_reads':count_reads
            }

    all_kwargs.update(kwargs)
    commands = generate_commands(
        alignments_path=bam_path,
        bin_size=bin_size if bin_size!=1 else bp_per_job,
        key_tags=None,
        max_fragment_size=0,
        dedup=True,
        head=head,
        bins_per_job= int(bp_per_job / bin_size) if bin_size!=1 else 1, min_mq=min_mapping_qual,
        kwargs=all_kwargs,
        skip_contigs=skip_contigs
    )


    count_mat = MethylationCountMatrix()
    read_count_mat = dict()

    if threads==1:
        for command in commands:
            result, result_read_counts = count_methylation_binned(command)
            #result.prune(min_samples=min_samples, min_variance=min_variance)
            count_mat.update( result )
            if count_reads:
                read_count_mat.update(result_read_counts)

    else:
        with multiprocessing.Pool(threads) as workers:

            for result,result_read_counts in workers.imap_unordered(count_methylation_binned, commands):
                #result.prune(min_samples=min_samples, min_variance=min_variance)
                count_mat.update(result)
                if count_reads:
                    read_count_mat.update(result_read_counts)

    return count_mat, read_count_mat



if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Extract methylation calls from bam file
    """)
    argparser.add_argument('bamfile', metavar='bamfile', type=str)
    argparser.add_argument('-bin_size', default=500, type=int, help='bin size, set to 1 for single CpG')
    argparser.add_argument('-bp_per_job', default=1_000_000, type=int, help='Amount of basepairs to be processed per thread per chunk')
    argparser.add_argument('-threads', default=None, type=int, help='Amount of threads to use for counting, None to use the amount of available threads')
    argparser.add_argument('-threads_agg', default=1, type=int, help='Amount of threads to use for aggregation. Aggregation is very memory intensive, so this amount of threads should probably be lower than -threads')
    argparser.add_argument('-reference', default=None, type=str, help='Path to reference fasta file')

    argparser.add_argument('--mirror_cpg_dyad',action='store_true', help='Count CpG methylation of a single dyad as one position')
    argparser.add_argument('--stranded',action='store_true', help='Perform strand specific methylation calls')


    fi = argparser.add_argument_group("Filters")
    fi.add_argument('-min_variance', default=None, type=float)
    fi.add_argument('-min_mapping_qual', default=40, type=int)
    fi.add_argument('-head', default=None, type=int, help='Process the first n bins')
    fi.add_argument('-min_samples', default=1, type=int)
    fi.add_argument('-skip_contigs', type=str, help='Comma separated contigs to skip', default='MT,chrM')
    fi.add_argument('-known_variants',
                           help='VCF file with known variants, will be not taken into account as methylated/unmethylated',
                           type=str)

    fi.add_argument('-contexts_to_capture', default=None, type=str, help='Contexts to capture, comma separated')\

    og = argparser.add_argument_group("Output")
    #og.add_argument('-bed', type=str, help='Bed file to write methylation calls to')
    og.add_argument('-default_sample_name', type=str, help='Default sample name to use when SM tag is not available in the read')
    og.add_argument('-wig_beta', type=str, help='WIG file to write mean methylation per bin to')
    og.add_argument('-wig_n_samples', type=str, help='WIG file to write amount of samples covering to')
    og.add_argument('-capture_read_depth', default=None, type=str, help='Capture read depth into this csv file')

    og.add_argument('-betas', type=str, help='CSV or pickle file to write single cell methylation betas to')
    #og.add_argument('-mets', type=str, help='CSV or pickle file to write single cell methylation unmethylated frame to')
    #og.add_argument('-unmets', type=str, help='CSV or pickle file to write single cell methylation frame to')

    og.add_argument('-bismark_tabfile', type=str, help='Tabulated file to write to, contains: chr | start | end | unmethylated_counts | methylated_counts | beta_value')
    og.add_argument('-tabfile', type=str,
                    help='Tabulated file to write to, contains: chr | start | end | unmethylated_counts | methylated_counts | beta_value | variance | n_samples')
    og.add_argument('-distmat', type=str, help='CSV or pickle file to write single cell distance matrix to')
    og.add_argument('-distmat_plot', type=str, help='.PNG file to write distance matrix image to')

    args = argparser.parse_args()

    if args.contexts_to_capture is None:
        contexts_to_capture = None
        context_radius = None
    else:
        assert args.mirror_cpg_dyad is False, '--mirror_cpg_dyad can only be used when no contexts_to_capture are supplied. (CpG mode) '
        contexts_to_capture = args.contexts_to_capture.split(',')
        lengths = set( (len(context) for context in contexts_to_capture ) )
        assert len(lengths)==1, 'Please only supply contexts of the same length'
        context_radius = int((list(lengths)[0] - 1) / 2)
        for context in contexts_to_capture:
            assert context[context_radius]=='C', 'Supply only contexts with a C in the middle'

        assert args.reference is not None, 'Please supply supply a fasta file using -reference'


    if args.stranded:
        assert args.mirror_cpg_dyad is False, '--mirror_cpg_dyad can only be used in non-strand specific mode '
        assert args.wig_beta is None, '-wig_beta cannot be used in stranded mode as WIG files are not strand specific'
        assert args.wig_n_samples is None, '-wig_n_samples cannot be used in stranded mode as WIG files are not strand specific'

    if args.distmat_plot is not None and not args.distmat_plot.endswith('.png'):
        args.distmat_plot += '.png'

    print('Obtaining counts ', end="")
    counts, read_dictionary = get_methylation_count_matrix(bam_path=args.bamfile,
                                          bin_size=args.bin_size,
                                          bp_per_job=args.bp_per_job,
                                          min_samples=args.min_samples,
                                          min_variance=args.min_variance,
                                          known_variants=args.known_variants,
                                          skip_contigs=args.skip_contigs.split(','),
                                          min_mapping_qual=args.min_mapping_qual,
                                          head=args.head,
                                          threads=args.threads,
                                          single_location=(args.bin_size==1),
                                          contexts_to_capture=contexts_to_capture,
                                          context_radius=context_radius,
                                          reference_path=args.reference,
                                          dyad_mode=args.mirror_cpg_dyad,
                                          stranded=args.stranded,
                                          count_reads=args.capture_read_depth is not None

    )
    print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")

    if args.capture_read_depth:
        print('Writing read depth file ', end="")
        pd.DataFrame(read_dictionary).sort_index().sort_index(1).to_csv(args.capture_read_depth)
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")


    print(counts)

    if args.betas is not None:
        print('Writing counts ', end="")
        if args.betas.endswith('.pickle.gz'):
            counts.to_pickle(args.betas)
        else:
            counts.get_frame('beta').to_csv(args.betas)
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")

    if args.wig_beta is not None:
        # Calculate mean beta value and write to wig:
        print('Writing WIG beta', end="")
        dataframe_to_wig(counts.get_bulk_frame()[['beta']], args.wig_beta, span=args.bin_size)
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")

    if args.wig_n_samples is not None:
        # Calculate mean beta value and write to wig:
        print('Writing WIG n_samples', end="")
        dataframe_to_wig(counts.get_bulk_frame()[['n_samples']], args.wig_n_samples, span=args.bin_size)
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")

    if args.distmat is not None or args.distmat_plot is not None:
        print('Calculating distance matrix', end="")
        dmat = counts.get_sample_distance_matrix()
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
        if args.distmat_plot is not None:
            print('Writing distmat_image', end="")
            try:
                sns.clustermap(dmat)
                plt.tight_layout()
                plt.savefig(args.distmat_plot)
                print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
            except Exception as e:
                print(e)
                print(f" [ {Fore.RED}FAIL{Style.RESET_ALL} ] ")

        if args.distmat is not None:
            print('Writing distance matrix', end="")
            if args.distmat.endswith('.pickle.gz'):
                dmat.to_pickle(args.distmat)
            else:
                dmat.to_csv(args.distmat, sep='\t')
            print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")

    if args.bismark_tabfile is not None:
        print('Writing bismark_tabfile matrix', end="")
        bf = counts.get_bulk_frame()

        bf.index.set_names(['chr','start', 'end'] + (['strand'] if args.stranded else []), inplace=True)
        bf[['unmethylated', 'methylated', 'beta']].to_csv(args.bismark_tabfile, sep='\t')
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")

    if args.tabfile is not None:
        print('Writing tabfile', end="")
        if args.tabfile.endswith('.pickle.gz'):
            counts.get_bulk_frame().to_pickle(args.tabfile)
        else:
            counts.get_bulk_frame().to_csv(args.tabfile, sep='\t')
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")