Download this file

116 lines (91 with data), 4.0 kB

from glob import glob
import collections
import os
from singlecellmultiomics.utils import get_contig_list_from_fasta
from singlecellmultiomics.bamProcessing.bamFunctions import get_samples_from_bam
"""
This workflow:
    - Detects contigs
    - Runs mutect on every contig
    - Merges results
"""
################## configuration ##################
configfile: "config.json"
# config

# Obtain contigs:
contigs = [contig for contig in get_contig_list_from_fasta(config['reference_file']) if contig!='MISC_ALT_CONTIGS_SCMO']
# If you want to select on which chromosomes to run, change and  uncomment the next line:
# contigs = ['chr1','chr2']

# Obtain sample name from normal bam
try:
    normalSampleName = list( get_samples_from_bam(config['normal_bam']) )[0]
except Exception as e:
    print(e)
    raise ValueError('Supply a NORMAL bam file which has read groups')

# Obtain sample names from tumor bam(s)
try:
    tumor_bam_samples = [list( get_samples_from_bam(tumor_bam) )[0] for tumor_bam in config['tumor_bams'] ]
    tumor_bam_mapping = dict(zip(tumor_bam_samples,config['tumor_bams']))
except Exception as e:
    print(e)
    raise ValueError('Supply a TUMOR bam file which has read groups')

print("Tumor samples:")
for sample_name, bam_path in tumor_bam_mapping.items():
    print(f'\t{sample_name} : {bam_path}')
    pass

print("Normal sample:")
print(f'\t{normalSampleName} : {config["normal_bam"]}')


def get_tumor_bam_file(wildcards):
    global tumor_bam_mapping
    return tumor_bam_mapping[wildcards.tumorSampleName]

def get_normal_bam_file(wildcards):
    global config
    return config['normal_bam']

def get_target_list():
    global normalSampleName
    targets = [f"{ts}_VS_{normalSampleName}/mutect.vcf.gz" for ts in tumor_bam_samples]
    print('TARGETS:',targets)
    return targets


rule all:
    input:
        get_target_list()
        #expand( "{tumorSampleName}_{normalSampleName}/mutect.vcf.gz" , tumorSampleName=tumor_bam_samples, normalSampleName=[normalSampleName])


rule mutect_scatter:
    input:
        tumor=get_tumor_bam_file,
        normal=get_normal_bam_file

    output:
        vcf = "{tumorSampleName}_VS_{normalSampleName}/TEMP/{contig}.vcf.gz",
        vcfindex = "{tumorSampleName}_VS_{normalSampleName}/TEMP/{contig}.vcf.gz.tbi",
        stats = "{tumorSampleName}_VS_{normalSampleName}/TEMP/{contig}.vcf.gz.stats"

    log:
        stdout="log/mutect_scatter/{tumorSampleName}_VS_{normalSampleName}_{contig}.stdout", #.format(compid=compid, contig='{contig}'),
        stderr="log/mutect_scatter/{tumorSampleName}_VS_{normalSampleName}_{contig}.stderr" #.format(compid=compid, contig='{contig}')


    threads: 1
    params:
        runtime="60h",
        reference=config['reference_file'],
        gatk_path=config['gatk_path']

    resources:
        mem_mb = lambda wildcards, attempt, input: attempt * 10000,
        runtime = lambda wildcards, attempt, input: attempt * 24

    shell:
        "{params.gatk_path} Mutect2 -I {input.tumor} -I {input.normal} --normal-sample {normalSampleName} --output {output.vcf} --reference {params.reference} --intervals {wildcards.contig}  > {log.stdout} 2> {log.stderr}"


rule mutect_gather:
    input:
        chr_vcfs =  expand(
        "{{tumorSampleName}}_VS_{{normalSampleName}}/TEMP/{contig}.vcf.gz",
        contig=contigs) #.format(compid=compid, contig='{contig}'), contig=contigs),

    output:
        vcf = "{tumorSampleName}_VS_{normalSampleName}/mutect.vcf.gz"
    log:
        stdout="log/mutect_gather/{tumorSampleName}_VS_{normalSampleName}.stdout", #.format(compid=compid, contig='{contig}'),
        stderr="log/mutect_gather/{tumorSampleName}_VS_{normalSampleName}.stderr" #.format(compid=compid, contig='{contig}')

    threads: 1
    resources:
        runtime=lambda wildcards, attempt, input: ( attempt * 24),
        mem_mb = 2000,
    message:
        'Merging contig VCF files'

    shell:
        "bcftools concat -Oz -o {output.vcf} {input.chr_vcfs}; bcftools index {output.vcf}"