SingleCellMultiOmics / Git / [4f4f20] /singlecellmultiomics/snakemake_workflows/cs2

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[4f4f20]: / singlecellmultiomics / snakemake_workflows / cs2_scmo / Snakefile
History
Download this file
221 lines (191 with data), 8.3 kB

from singlecellmultiomics.libraryDetection.sequencingLibraryListing import SequencingLibraryLister
from glob import glob
import collections
from singlecellmultiomics.utils import get_contig_list_from_fasta

"""
CelSeq2 pipeline using STAR and bamtagmultiome.py

Dependencies:
# STAR
# cutadapt
# SingleCellMultiOmics package

This workflow:
    Starts off from a folder containing all fastq files
    - Detects libraries
    - Demultiplexes per library, automatically detecting the right barcodes
    - Trims using cutadapt
    - Maps using STAR, sorts and indexes the reads per library
    - Deduplication using UMI's and feature annotation is performed by bamtagmultiome.py
    - Creates library statistics plots for quality control
    - Creates count tables for deduplicated and non-deduplicated counts:
       -> Count table containing rows with gene information and columns with cell information
"""
################## configuration ##################
configfile: "config.json"

# This code detects which libraries are present in the current folder:
l = SequencingLibraryLister()
LIBRARIES = l.detect(glob('*.fastq.gz'), merge='_')
# Flatten to library:[fastqfile, fastqfile, ...]
fastq_per_lib = collections.defaultdict(list)
for lib,lane_dict in LIBRARIES.items():
    for lane,read_dict in lane_dict.items():
        fastq_per_lib[lib] += read_dict['R1']
        fastq_per_lib[lib] += read_dict['R2']
libraries =  list( fastq_per_lib.keys() )

################## configuration end ##################

def get_fastq_file_list(wildcards):
    # Obtain a list of fastq files associated to wildcards.library
    global libraries
    return sorted( fastq_per_lib[wildcards.library] )

def get_target_demux_list():
    global libraries
    targets = []
    for lib in libraries:
        targets.append('processed/' + lib + "/demultiplexedR1.fastq.gz" )
        targets.append('processed/' + lib + "/demultiplexedR2.fastq.gz" )
    return targets

def get_target_tagged_bam_list():
    return [f"processed/{library}/tagged.bam" for library in libraries]

rule all:
    input:
        # get_target_demux_list() use this for demux only
        get_target_tagged_bam_list(),
        expand("processed/{library}/count_table_not_deduplicated.csv",
            library=libraries),
        expand("processed/{library}/count_table_deduplicated.csv",
            library=libraries),

        expand("processed/{library}/plots/ReadCount.png", library=libraries)


#### 1. Demultiplexing
rule SCMO_demux:
    input:
        fastqfiles = get_fastq_file_list
    output:
        temp("processed/{library}/demultiplexedR1.fastq.gz"),
        temp("processed/{library}/demultiplexedR2.fastq.gz"),
        temp("processed/{library}/rejectsR1.fastq.gz"),
        temp("processed/{library}/rejectsR2.fastq.gz")
    log:
        stdout="log/demux/{library}.stdout",
        stderr="log/demux/{library}.stderr"
    params: runtime="30h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 4000

    shell:
        "demux.py -merge _ {input.fastqfiles} -o processed --y > {log.stdout} 2> {log.stderr}"

#### 2. Trimming
rule Trim:
    input:
        r1="processed/{library}/demultiplexedR1.fastq.gz",
        r2="processed/{library}/demultiplexedR2.fastq.gz"
    output:
        r1=temp("processed/{library}/trimmed.R1.fastq.gz"),
        r2=temp("processed/{library}/trimmed.R2.fastq.gz")
    log:
        stdout="log/trim/{library}.stdout",
        stderr="log/trim/{library}.stderr"
    params:
        runtime="30h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 4000

    shell:
        'cutadapt -o {output.r1} -p {output.r2} \
        {input.r1} {input.r2} \
        -m 3 -a "IlluminaSmallAdapterConcatBait=GGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTT" \
        -a "IlluminaIndexAdapter=GGAATTCTCGGGTGCCAAGGAACTCCAGTCACN{{6}}ATCTCGTATGCCGTCTTCTGCTTG" \
        -A "IlluminaPairedEndPCRPrimer2.0=AGATCGGAAGAGCGN{{6}}CAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
        -A "universalPrimer=GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT;min_overlap=5" \
        -a  "IlluminaGEX=TTTTTAATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGACGATC;min_overlap=5" \
        -a "IlluminaMultiplexingPCRPrimer=GGAACTCCAGTCACN{{6}}TCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
        -A "Aseq=TGGCACCCGAGAATTCCA" -a "Aseq=TGGCACCCGAGAATTCCA"  \
        -a "illuminaSmallRNAAdapter=TCGTATGCCGTCTTCTGCTTGT" > {log.stdout} 2> {log.stderr}'

#### 3. Mapping with STAR
rule map:
    input:
        r2="processed/{library}/trimmed.R2.fastq.gz"
    params:
        ref=config['STAR_index'],
        output_dir = "processed/{library}/"

    output:
        bam = "processed/{library}/Aligned.sortedByCoord.out.bam",
        SJtemp = temp("processed/{library}/SJ.out.tab")
    log:
        stdout="log/map/{library}.stdout",
        stderr="log/map/{library}.stderr"
    threads: 8
    params:
        runtime="30h",
        output_dir = "processed/{library}/"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 60000

    shell:
      "STAR --runThreadN 8 --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax 20 --outMultimapperOrder Random --outSAMattributes All --outSAMmultNmax 1 --readFilesCommand zcat  --readFilesIn {input.r2} \
      --genomeDir {params.ref} --outFileNamePrefix {params.output_dir} > {log.stdout} 2> {log.stderr}"


#### 5. Tagging and deduplication
rule SCMO_tagmultiome_CS2:
    input:
      bam = "processed/{library}/Aligned.sortedByCoord.out.bam",
      introns=config['introns'],
      exons=config['exons']
    output:
      bam = "processed/{library}/tagged.bam",
      bam_index = "processed/{library}/tagged.bam.bai"
    log:
        stdout="log/tagging/{library}.stdout",
        stderr="log/tagging/{library}.stderr"
    threads: 1
    params:
        runtime="20h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 10000

    shell:
        "bamtagmultiome.py -method cs -exons {input.exons} -introns {input.introns} -umi_hamming_distance 0 {input.bam} -o {output.bam} > {log.stdout} 2> {log.stderr}"

#### 6. Library statistics
rule SCMO_library_stats:
    input:
        bam = "processed/{library}/tagged.bam",
        r1="processed/{library}/demultiplexedR1.fastq.gz", # Its need these to count how many raw reads were present in the lib.
        r2="processed/{library}/demultiplexedR2.fastq.gz",
        r1_rejects="processed/{library}/rejectsR1.fastq.gz",
        r2_rejects="processed/{library}/rejectsR2.fastq.gz"
    output:
        "processed/{library}/plots/ReadCount.png"

    log:
        stdout="log/library_stats/{library}.stdout",
        stderr="log/library_stats/{library}.stderr"

    threads: 1
    params: runtime="30h"

    shell:
        "libraryStatistics.py processed/{wildcards.library} -tagged_bam /tagged.bam > {log.stdout} 2> {log.stderr}"

#### 7. Make dedup and no dedup count tables
#### a. Make count tables of non-deduplicated bam files
rule SCMO_count_table_nondedup:
    input:
        bam = "processed/{library}/tagged.bam"
    output:
        countTable = "processed/{library}/count_table_not_deduplicated.csv"
    threads: 1
    params:
        runtime="10h"
    log:
        stdout="log/count_table/{library}_not_deduplicated.stdout",
        stderr="log/count_table/{library}_not_deduplicated.stderr"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 8000

    shell:
        "bamToCountTable.py --noNames {input.bam} -o {output.countTable} -joinedFeatureTags GN,gn,chrom -sampleTags SM > {log.stdout} 2> {log.stderr}"

#### b. Make count tables of deduplicated bam files
# This works exactly the same as step a
rule SCMO_count_table_filtered:
    input:
        bam = "processed/{library}/tagged.bam"
    output:
        countTable = "processed/{library}/count_table_deduplicated.csv"
    threads: 1
    params:
        runtime="10h"
    log:
        stdout="log/count_table/{library}_deduplicated.stdout",
        stderr="log/count_table/{library}_deduplicated.stderr"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 8000

    shell:
        "bamToCountTable.py --dedup --noNames {input.bam} -o {output.countTable} -joinedFeatureTags GN,gn,chrom -sampleTags SM > {log.stdout} 2> {log.stderr}"