from singlecellmultiomics.libraryDetection.sequencingLibraryListing import SequencingLibraryLister
from glob import glob
import collections

"""
This workflow:
    Starts off from a folder containing all fastq files
    - Detects libraries
    - Demultiplexes per library, automatically detecting the right barcodes
    - Trims using cutadapt
    - Maps, sorts and indexes the reads per library
    - Filters bam file based on base quality tag ('SQ') - by default, the threshold is 0.98
    - Filters bam file based on maximum insert size (distance between R1 and R2) to filter out read pairs where R1 and R2 are not mapped to the same amplicon
    - Creates QC plots
    - Creates count tables, for unfiltered data and for SQ filtered data:
       a. Count table containing rows with chromosome, allele, site, scar information and columns with cell information 
       b. Count table to check SQ values for all reads - with rows for cells and columns for SQ scores ## not yet!
"""
################## configuration ##################
configfile: "config.json"
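# A minimal sketch of the expected config.json; these are the keys this
# workflow reads, with placeholder values (assumptions, adjust to your data):
# {
#     "reference_file_for_mapper": "/path/to/reference.fa",
#     "mapper": "bwa",
#     "alleles": "/path/to/alleles.vcf.gz",
#     "allele_samples": "/path/to/allele_samples",
#     "SQ_filter": 0.98,
#     "maxInsertSize": 1000
# }
# 'mapper' must be 'bwa' or 'bowtie2'; SQ_filter defaults to 0.98 per the
# docstring above.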

# This code detects which libraries are present in the current folder:
lister = SequencingLibraryLister()
LIBRARIES = lister.detect(glob('*.fastq.gz'), merge='_')
# Flatten to library:[fastqfile, fastqfile, ...]
fastq_per_lib = collections.defaultdict(list)
for lib, lane_dict in LIBRARIES.items():
    for lane, read_dict in lane_dict.items():
        fastq_per_lib[lib] += read_dict['R1']
        fastq_per_lib[lib] += read_dict['R2']
libraries = list(fastq_per_lib.keys())
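# For example, with hypothetical inputs 'LIBX_L001_R1.fastq.gz' and
# 'LIBX_L001_R2.fastq.gz', both files end up in fastq_per_lib['LIBX'].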

################## configuration end ##################

def get_fastq_file_list(wildcards):
    # Obtain the list of fastq files associated with wildcards.library
    return sorted(fastq_per_lib[wildcards.library])

def get_target_demux_list():
    targets = []
    for lib in libraries:
        targets.append(f"processed/{lib}/demultiplexedR1.fastq.gz")
        targets.append(f"processed/{lib}/demultiplexedR2.fastq.gz")
    return targets

def get_target_tagged_bam_list():
    return [f"processed/{library}/tagged.bam" for library in libraries]

rule all:
    input:
        # get_target_demux_list(),  # uncomment to run demultiplexing only
        get_target_tagged_bam_list(),
        expand("processed/{library}/count_table_unfilteredBam.csv",
            library=libraries),
        expand("processed/{library}/count_table_filteredBam.csv",
            library=libraries),

        expand("processed/{library}/plots/ReadCount.png", library=libraries)


#### 1. Demultiplexing 
rule SCMO_demux:
    input:
        fastqfiles = get_fastq_file_list
    output:
        temp("processed/{library}/demultiplexedR1.fastq.gz"),
        temp("processed/{library}/demultiplexedR2.fastq.gz"),
        temp("processed/{library}/rejectsR1.fastq.gz"),
        temp("processed/{library}/rejectsR2.fastq.gz")
    log:
        stdout="log/demux/{library}.stdout",
        stderr="log/demux/{library}.stderr"
    params: runtime="30h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 4000
    
    shell:
        "demux.py -merge _ {input.fastqfiles} -o processed --y -use SCARC8R2 > {log.stdout} 2> {log.stderr}"

#### 2. Trimming
rule Trim:
    input:
        r1="processed/{library}/demultiplexedR1.fastq.gz",
        r2="processed/{library}/demultiplexedR2.fastq.gz"
    log:
        stdout="log/trim/{library}.stdout",
        stderr="log/trim/{library}.stderr"
    output:
        r1=temp("processed/{library}/trimmed.R1.fastq.gz"),
        r2=temp("processed/{library}/trimmed.R2.fastq.gz")

    params: runtime="30h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 4000

    shell:
        'cutadapt -o {output.r1} -p {output.r2} \
        {input.r1} {input.r2} \
        -m 3 -a "IlluminaSmallAdapterConcatBait=GGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTT" \
        -a "IlluminaIndexAdapter=GGAATTCTCGGGTGCCAAGGAACTCCAGTCACN{{6}}ATCTCGTATGCCGTCTTCTGCTTG" \
        -A "IlluminaPairedEndPCRPrimer2.0=AGATCGGAAGAGCGN{{6}}CAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
        -A "universalPrimer=GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT;min_overlap=5" \
        -a  "IlluminaGEX=TTTTTAATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGACGATC;min_overlap=5" \
        -a "IlluminaMultiplexingPCRPrimer=GGAACTCCAGTCACN{{6}}TCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
        -A "Aseq=TGGCACCCGAGAATTCCA" -a "Aseq=TGGCACCCGAGAATTCCA"  \
        -a "illuminaSmallRNAAdapter=TCGTATGCCGTCTTCTGCTTGT" > {log.stdout} 2> {log.stderr}'

#### 3. Mapping with BWA or bowtie2
rule map:
    input:
        r1="processed/{library}/trimmed.R1.fastq.gz",
        r2="processed/{library}/trimmed.R2.fastq.gz"
    params:
        ref=config['reference_file_for_mapper'],
        runtime="30h"
    output:
        bam = temp("processed/{library}/sorted.bam"),
        bam_index = temp("processed/{library}/sorted.bam.bai")
    log:
        stdout="log/map/{library}.stdout",
        stderr="log/map/{library}.stderr"
    threads: 8
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 8000

    run:
        # https://stackoverflow.com/questions/40996597/snakemake-remove-output-file this is probably prettier
        if config['mapper'] == 'bwa':
            # Mapping and sorting are performed as separate steps
            shell(
                "bwa mem -M -I 220 -t {threads} {params.ref} {input.r1} {input.r2} 2> {log.stderr} | samtools view -Sb - > processed/{wildcards.library}/unsorted.bam; \
                samtools sort -T processed/{wildcards.library}/temp_sort -@ {threads} processed/{wildcards.library}/unsorted.bam > processed/{wildcards.library}/sorted.unfinished.bam; \
                mv processed/{wildcards.library}/sorted.unfinished.bam {output.bam}; rm processed/{wildcards.library}/unsorted.bam; samtools index {output.bam} > {log.stdout}"
                )
        elif config['mapper'] == 'bowtie2':
            shell(
                "bowtie2 -p {threads} -q --no-unal --local --sensitive-local -N 1 -x {params.ref} -1 {input.r1} -2 {input.r2} 2> {log.stderr} | samtools view -Sb - > processed/{wildcards.library}/unsorted.bam; \
                samtools sort -T processed/{wildcards.library}/temp_sort -@ {threads} processed/{wildcards.library}/unsorted.bam > processed/{wildcards.library}/sorted.unfinished.bam; \
                mv processed/{wildcards.library}/sorted.unfinished.bam {output.bam}; rm processed/{wildcards.library}/unsorted.bam; samtools index {output.bam} > {log.stdout}"
                )
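
        # Untested, more compact alternative: pipe the mapper straight into
        # 'samtools sort', skipping the intermediate unsorted.bam, e.g.
        #   bwa mem ... | samtools sort -@ {threads} -o sorted.bam - && samtools index sorted.bam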

#### 4. Tagging
# (legacy equivalent: universalBamTagger.py --ftag --scar --dedup -alleles $allele_reference -o $destdir ${bamfiles})
rule SCMO_tagmultiome_Scartrace:
    input:
        bam = "processed/{library}/sorted.bam",
        bam_index = "processed/{library}/sorted.bam.bai"
    output:
        bam = "processed/{library}/tagged.bam",
        bam_index = "processed/{library}/tagged.bam.bai"
    log:
        stdout="log/tagging/{library}.stdout",
        stderr="log/tagging/{library}.stderr"
    threads: 1
    params:
        runtime="20h",
        alleles = config['alleles'],
        allele_samples = config['allele_samples']
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 10000

    shell:
        "bamtagmultiome.py -method scartrace -allele_samples {params.allele_samples} -alleles {params.alleles} {input.bam} -o {output.bam} --every_fragment_as_molecule > {log.stdout} 2> {log.stderr}"
# Still to be implemented for faster computation: --use_allele_cache
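# --every_fragment_as_molecule appears to treat each fragment as a separate
# molecule, i.e. no molecule-level deduplication (an assumption based on the
# flag name; confirm with bamtagmultiome.py -h).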

#### 5. Filter bam file based on base quality tag ('SQ')
rule SCMO_SQ_filter:
    input:
        bam = "processed/{library}/tagged.bam",
        bam_index = "processed/{library}/tagged.bam.bai" 
    output:
        bam = "processed/{library}/tagged_SQfiltered.bam",
        bam_index = "processed/{library}/tagged_SQfiltered.bam.bai"
    params:
        SQ_filter = config['SQ_filter']
    log:
        stdout="log/SQfilter/{library}.stdout",
        stderr="log/SQfilter/{library}.stderr"

    shell:
        '''bamFilter.py {input.bam} -o {output.bam} 'r.has_tag("SQ") and r.get_tag("SQ") > {params.SQ_filter}' > {log.stdout} 2> {log.stderr} '''
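# The quoted expression is evaluated per read; judging by the
# r.has_tag()/r.get_tag() calls, 'r' is the pysam read record, so reads without
# an SQ tag or with SQ at or below the threshold are discarded.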

#### 6. Filter bam file based on maximum insert size
rule SCMO_maxInsert_filtering:
    input:
        bam = "processed/{library}/tagged_SQfiltered.bam",
        bam_index = "processed/{library}/tagged_SQfiltered.bam.bai"
    output:
        bam = "processed/{library}/tagged_filtered.bam",
        bam_index = "processed/{library}/tagged_filtered.bam.bai"
    params:
        insertSizeFilter = config['maxInsertSize']
    log:
        stdout="log/insertSizeFilter/{library}.stdout",
        stderr="log/insertSizeFilter/{library}.stderr"

    shell:
        '''bamFilter.py {input.bam} -o {output.bam} 'r.has_tag("fS") and (r.get_tag("fS") < {params.insertSizeFilter})' > {log.stdout} 2> {log.stderr} '''
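# 'fS' is assumed here to be the fragment/insert size written during tagging;
# pairs whose mates map further apart than maxInsertSize (e.g. R1 and R2 on
# different amplicons, per the docstring) are removed.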

#### 7. Library statistics
rule SCMO_library_stats:
    input:
        bam = "processed/{library}/tagged_filtered.bam",
        r1="processed/{library}/demultiplexedR1.fastq.gz", # It needs these to count how many raw reads were present in the lib.
        r2="processed/{library}/demultiplexedR2.fastq.gz",
        r1_rejects="processed/{library}/rejectsR1.fastq.gz",
        r2_rejects="processed/{library}/rejectsR2.fastq.gz"
    output:
        "processed/{library}/plots/ReadCount.png"
    log:
        stdout="log/library_stats/{library}.stdout",
        stderr="log/library_stats/{library}.stderr"
    threads: 1
    params: runtime="30h"

    shell:
        "libraryStatistics.py processed/{wildcards.library} -tagged_bam {input.bam} > {log.stdout} 2> {log.stderr}"

#### 8. Make count tables of UNFILTERED bam files
rule SCMO_count_table_unfiltered:
    input:
        bam = "processed/{library}/tagged.bam"    
    output:
        countTable = "processed/{library}/count_table_unfilteredBam.csv"
    threads: 1
    params:
        runtime="50h"
    log:
        stdout="log/count_table/{library}_unfiltered.stdout",
        stderr="log/count_table/{library}_unfiltered.stderr"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 8000

    shell:
        "bamToCountTable.py {input.bam} -o {output.countTable} -joinedFeatureTags chrom,DA,DS,SD -sampleTags SM > {log.stdout} 2> {log.stderr}"

#### 9. Make count tables of FILTERED bam files
# This works exactly like step 8, but on the filtered bam file
rule SCMO_count_table_filtered:
    input:
        bam = "processed/{library}/tagged_filtered.bam"   
    output:
        countTable = "processed/{library}/count_table_filteredBam.csv"
    threads: 1
    params:
        runtime="50h"
    log:
        stdout="log/count_table/{library}_filtered.stdout",
        stderr="log/count_table/{library}_filtered.stderr"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 8000

    shell:
        "bamToCountTable.py {input.bam} -o {output.countTable} -joinedFeatureTags chrom,DA,DS,SD -sampleTags SM > {log.stdout} 2> {log.stderr}"