
from singlecellmultiomics.libraryDetection.sequencingLibraryListing import SequencingLibraryLister
from glob import glob
import collections
from singlecellmultiomics.utils import get_contig_list_from_fasta

"""
This workflow:
    Starts off from a folder containing fastq files
    - Detects libraries
    - Demultiplexes per library, automatically detecting the right barcodes
    - Trims using cutadapt
    - Maps, sorts and indexes the reads per library
    - Deduplicates and identifies molecules in parallel per contig
    - Performs base recallibration
    - Creates QC plots
    - Creates count tables
"""
################## configuration ##################
configfile: "config.json"
# config
counting_bin_sizes = config['counting_bin_sizes']
counting_sliding_window_increments = config['counting_sliding_window_increments']
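# For reference, the keys this workflow reads from config.json look roughly
# like the sketch below. All paths and values are illustrative assumptions,
# not shipped defaults:
#
# {
#     "reference_file": "reference/genome.fa",
#     "reference_file_for_mapper": "reference/genome.fa",
#     "mapper": "bwa",
#     "mapfile": "reference/mapfile.bgzf",
#     "alleles": "alleles.vcf.gz",
#     "allele_samples": "sample_A,sample_B",
#     "molecule_threshold": 5000,
#     "counting_min_mq": 40,
#     "counting_bin_sizes": [250000],
#     "counting_sliding_window_increments": [250000]
# }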

# Obtain contigs:
contigs = get_contig_list_from_fasta(config['reference_file'])
# To restrict the workflow to specific contigs, edit and uncomment the next line:
# contigs = ['chr1','chr2']
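# For a typical human reference, contigs would look something like
# ['1', '2', ..., 'X', 'Y', 'MT'] (naming and order follow the fasta).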

# This code detects which libraries are present in the current folder:
l = SequencingLibraryLister()
LIBRARIES = l.detect(glob('*.fastq.gz'), merge='_')
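# LIBRARIES is a nested mapping of library -> lane -> mate, roughly
# (file names below are illustrative):
# {'myLib': {'L001': {'R1': ['myLib_L001_R1.fastq.gz'],
#                     'R2': ['myLib_L001_R2.fastq.gz']}}}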
# Flatten to library:[fastqfile, fastqfile, ...]
fastq_per_lib = collections.defaultdict(list)
for lib,lane_dict in LIBRARIES.items():
    for lane,read_dict in lane_dict.items():
        fastq_per_lib[lib] += read_dict['R1']
        fastq_per_lib[lib] += read_dict['R2']
libraries = list(fastq_per_lib.keys())

if len(libraries) == 0:
    # No fastq files were found; fall back to library folders that already contain a tagged.bam
    for tagpath in glob('./*/tagged.bam'):
        libraries.append(tagpath.split('/')[1])


################## configuration end ##################

def get_fastq_file_list(wildcards):
    # Return the sorted list of fastq files associated with wildcards.library
    return sorted(fastq_per_lib[wildcards.library])

def get_target_demux_list():
    targets = []
    for lib in libraries:
        targets.append(f'processed/{lib}/demultiplexedR1.fastq.gz')
        targets.append(f'processed/{lib}/demultiplexedR2.fastq.gz')
    return targets

def get_target_tagged_bam_list():
    return [f"processed/{library}/tagged.bam" for library in libraries]

rule all:
    input:
        # get_target_demux_list(),  # uncomment to produce only the demultiplexed fastq files
        get_target_tagged_bam_list(),
        expand("processed/{library}/count_table_{counting_bin_size}_{counting_sliding_window_increment}.csv",
            library=libraries,
            counting_bin_size=counting_bin_sizes,
            counting_sliding_window_increment=counting_sliding_window_increments),

        expand("processed/{library}/plots/ReadCount.png", library=libraries)

rule SCMO_demux:
    input:
        fastqfiles = get_fastq_file_list
    output:
        temp("processed/{library}/demultiplexedR1.fastq.gz"),
        temp("processed/{library}/demultiplexedR2.fastq.gz"),
        temp("processed/{library}/rejectsR1.fastq.gz"),
        temp("processed/{library}/rejectsR2.fastq.gz")

    log:
        stdout="log/demux/{library}.stdout",
        stderr="log/demux/{library}.stderr"
    params: runtime="30h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 4000
    shell:
        "demux.py -merge _ {input.fastqfiles} -o processed --y > {log.stdout} 2> {log.stderr}"


rule Trim:
    input:
        r1="processed/{library}/demultiplexedR1.fastq.gz",
        r2="processed/{library}/demultiplexedR2.fastq.gz"
    log:
        stdout="log/trim/{library}.stdout",
        stderr="log/trim/{library}.stderr"
    output:
        r1=temp("processed/{library}/trimmed.R1.fastq.gz"),
        r2=temp("processed/{library}/trimmed.R2.fastq.gz")

    params: runtime="30h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 4000

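    # Note: the double braces below are Snakemake brace-escaping; cutadapt
    # receives e.g. "N{6}", i.e. a run of six Ns in the adapter sequence.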
    shell:
        'cutadapt -o {output.r1} -p {output.r2} \
        {input.r1} {input.r2} \
        -m 3 -a "IlluminaSmallAdapterConcatBait=GGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTT" \
        -a "IlluminaIndexAdapter=GGAATTCTCGGGTGCCAAGGAACTCCAGTCACN{{6}}ATCTCGTATGCCGTCTTCTGCTTG" \
        -A "IlluminaPairedEndPCRPrimer2.0=AGATCGGAAGAGCGN{{6}}CAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
        -A "universalPrimer=GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT;min_overlap=5" \
        -a  "IlluminaGEX=TTTTTAATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGACGATC;min_overlap=5" \
        -a "IlluminaMultiplexingPCRPrimer=GGAACTCCAGTCACN{{6}}TCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
        -A "Aseq=TGGCACCCGAGAATTCCA" -a "Aseq=TGGCACCCGAGAATTCCA"  \
        -a "illuminaSmallRNAAdapter=TCGTATGCCGTCTTCTGCTTGT" > {log.stdout} 2> {log.stderr}'


rule map:
    input:
        r1="processed/{library}/trimmed.R1.fastq.gz",
        r2="processed/{library}/trimmed.R2.fastq.gz"

    params:
        ref=config['reference_file_for_mapper'],
        runtime="30h"

    output:
        bam = temp("processed/{library}/sorted.bam"),
        bam_index = temp("processed/{library}/sorted.bam.bai")

    log:
        stdout="log/map/{library}.stdout",
        stderr="log/map/{library}.stderr",
        stderr2="log/map/{library}.sort.stderr"
    threads: 8
    resources:
        mem_mb=lambda wildcards, attempt: 20000 + attempt * 8000

    run:
        # https://stackoverflow.com/questions/40996597/snakemake-remove-output-file this is probably prettier
        if config['mapper'] == 'bwa':
            # Mapping and sorting are decoupled: map to unsorted.bam first, sort into a
            # temporary name, and only rename once sorting has finished successfully.
            shell(
                "bwa mem -t {threads} {params.ref} {input.r1} {input.r2} 2> {log.stderr} | samtools view -bS - > processed/{wildcards.library}/unsorted.bam; \
                samtools sort -T processed/{wildcards.library}/temp_sort -@ {threads} processed/{wildcards.library}/unsorted.bam > processed/{wildcards.library}/sorted.unfinished.bam 2> {log.stderr2}; \
                mv processed/{wildcards.library}/sorted.unfinished.bam {output.bam}; rm processed/{wildcards.library}/unsorted.bam; samtools index {output.bam} > {log.stdout}"
                )
        elif config['mapper'] == 'bowtie2':
            # bowtie2 writes alignments to stdout, which must flow into samtools view,
            # so only stderr is redirected to the log here.
            shell(
                "bowtie2 -p {threads} -q --no-unal --local --sensitive-local -N 1 -x {params.ref} -1 {input.r1} -2 {input.r2} 2> {log.stderr} | samtools view -Sb - > processed/{wildcards.library}/unsorted.bam; \
                samtools sort -T processed/{wildcards.library}/temp_sort -@ {threads} processed/{wildcards.library}/unsorted.bam > processed/{wildcards.library}/sorted.unfinished.bam 2> {log.stderr2}; \
                mv processed/{wildcards.library}/sorted.unfinished.bam {output.bam}; rm processed/{wildcards.library}/unsorted.bam; samtools index {output.bam}"
                )
        else:
            raise ValueError(f"Unknown mapper '{config['mapper']}' in config; expected 'bwa' or 'bowtie2'")


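# The next two rules form a scatter/gather over contigs: bamtagmultiome.py
# identifies molecules and deduplicates each contig in parallel (scatter), and
# the per-contig BAMs are then merged into one tagged.bam per library (gather).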
rule SCMO_tagmultiome_NLA_parallel_scatter:
    input:
        bam = "processed/{library}/sorted.bam",

        bam_index = "processed/{library}/sorted.bam.bai",
        mapfile = config['mapfile']

    output:
        bam = temp("processed/{library}/TEMP_CONTIG/{contig}.bam"),
        bam_index = temp("processed/{library}/TEMP_CONTIG/{contig}.bam.bai")

    log:
        stdout="log/tag_scatter/{library}_{contig}.stdout",
        stderr="log/tag_scatter/{library}_{contig}.stderr"


    threads: 1
    params:
        runtime="20h",
        alleles = config['alleles'],
        allele_samples = config['allele_samples']

    resources:
        mem_mb=lambda wildcards, attempt: attempt * 10000

    shell:
        "bamtagmultiome.py -method nla -allele_samples {params.allele_samples} -alleles {params.alleles} -mapfile {input.mapfile} -contig {wildcards.contig} {input.bam} -o {output.bam} > {log.stdout} 2> {log.stderr}"


rule SCMO_tagmultiome_NLA_parallel_gather:
    input:
        chr_bams =  expand("processed/{{library}}/TEMP_CONTIG/{contig}.bam", contig=contigs),
        chr_bams_indices =  expand("processed/{{library}}/TEMP_CONTIG/{contig}.bam.bai", contig=contigs)
    output:
        bam = "processed/{library}/tagged.bam",
        bam_index = "processed/{library}/tagged.bam.bai"
    log:
        stdout="log/tag_gather/{library}.stdout",
        stderr="log/tag_gather/{library}.stderr"

    threads: 1
    params: runtime="8h"
    message:
        'Merging contig BAM files'

    shell:
        "samtools merge -c {output.bam} {input.chr_bams} > {log.stdout} 2> {log.stderr}; samtools index {output.bam}"


rule SCMO_library_stats:
    input:
        bam = "processed/{library}/tagged.bam",
        r1="processed/{library}/demultiplexedR1.fastq.gz", # Its need these to count how many raw reads were present in the lib.
        r2="processed/{library}/demultiplexedR2.fastq.gz",
        r1_rejects="processed/{library}/rejectsR1.fastq.gz",
        r2_rejects="processed/{library}/rejectsR2.fastq.gz",
        reference_file=config['reference_file']
    output:
        "processed/{library}/plots/ReadCount.png"

    log:
        stdout="log/library_stats/{library}.stdout",
        stderr="log/library_stats/{library}.stderr"

    threads: 1
    params: runtime="30h"

    shell:
        "libraryStatistics.py processed/{wildcards.library} -tagged_bam /tagged.bam > {log.stdout} 2> {log.stderr}"

rule SCMO_CN:
    input:
        bam = "processed/{library}/tagged.bam",
        ref = config['reference_file']
    output:
        rawmatplot = "processed/{library}/raw_cn_{counting_bin_size}bp.png",
        rawmat = "processed/{library}/raw_cn_{counting_bin_size}bp.pickle.gz",
        gcmatplot = "processed/{library}/gc_corrected_cn_{counting_bin_size}bp.png",
        gcmat = "processed/{library}/gc_corrected_cn_{counting_bin_size}bp.pickle.gz",
        histplot = "processed/{library}/cell_histogram_{counting_bin_size}.png",
    params:
        molecule_threshold=config['molecule_threshold'],
        counting_min_mq = config['counting_min_mq']

    log:
        stdout="log/cn_table/{library}_{counting_bin_size}.stdout",
        stderr="log/cn_table/{library}_{counting_bin_size}.stderr"

    # Assumption: bamCopyNumber.py accepts the tagged bam as a positional
    # argument, like the other SCMO bam tools; the command also redirects to
    # the log files declared above.
    shell:
        "bamCopyNumber.py -ref {input.ref} \
        -bin_size {wildcards.counting_bin_size} \
        -threads 16 \
        -min_mq {params.counting_min_mq} \
        -molecule_threshold {params.molecule_threshold} \
        -bins_per_job 5 \
        -rawmatplot {output.rawmatplot} \
        -gcmatplot {output.gcmatplot} \
        -gcmat {output.gcmat} \
        -rawmat {output.rawmat} \
        -histplot {output.histplot} \
        {input.bam} > {log.stdout} 2> {log.stderr}"


rule SCMO_count_table:
    input:
        bam = "processed/{library}/tagged.bam"
    output:
        "processed/{library}/count_table_{counting_bin_size}_{counting_sliding_window_increment}.csv"

    threads: 1
    params:
        runtime="50h",
        counting_min_mq = config['counting_min_mq']

    log:
        stdout="log/count_table/{library}_{counting_bin_size}_{counting_sliding_window_increment}.stdout",
        stderr="log/count_table/{library}_{counting_bin_size}_{counting_sliding_window_increment}.stderr"

    resources:
        mem_mb=lambda wildcards, attempt: attempt * 8000

    shell:
        "bamToCountTable.py -bin {wildcards.counting_bin_size} \
        -sliding {wildcards.counting_sliding_window_increment} \
        -minMQ {params.counting_min_mq} \
        --noNames \
        {input.bam} -sampleTags SM -joinedFeatureTags reference_name -binTag DS -o {output} --dedup > {log.stdout} 2> {log.stderr}"
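
# Illustrative invocations (core and job counts are assumptions; adjust to your
# machine or cluster), run from the folder containing the fastq files and config.json:
#
#   snakemake --cores 16
#
# or, on a SLURM cluster, something along the lines of:
#
#   snakemake --jobs 20 --cluster "sbatch -c {threads} --mem {resources.mem_mb}"
#
# Rules that declare no mem_mb resource (e.g. SCMO_CN) would additionally need
# --default-resources mem_mb=4000 for the cluster command above to resolve.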