249 lines (216 with data), 9.4 kB
from singlecellmultiomics.libraryDetection.sequencingLibraryListing import SequencingLibraryLister
from glob import glob
import collections
from singlecellmultiomics.utils import get_contig_list_from_fasta
"""
CelSeq2 pipeline based on the version of Marijn van Loenhout (2018)
Dependencies:
# STAR
# samtools
# umi-tools
# cutadapt (trimgalore/atropos)
# subread
# colorama
# SingleCellMultiOmics package
This workflow:
Starts off from a folder containing all fastq files
- Detects libraries
- Demultiplexes per library, automatically detecting the right barcodes
- Trims using cutadapt
- Maps using STAR, sorts and indexes the reads per library
- uses featureCounting
- Tags and demultiplexes
- Creates library statistics plots for quality control
- Creates count tables for deduplicated and non-deduplicated counts:
-> Count table containing rows with gene information and columns with cell information
"""
################## configuration ##################
configfile: "config.json"
# This code detects which libraries are present in the current folder:
l = SequencingLibraryLister()
LIBRARIES = l.detect(glob('*.fastq.gz'), merge='_')
# Flatten to library:[fastqfile, fastqfile, ...]
fastq_per_lib = collections.defaultdict(list)
for lib,lane_dict in LIBRARIES.items():
for lane,read_dict in lane_dict.items():
fastq_per_lib[lib] += read_dict['R1']
fastq_per_lib[lib] += read_dict['R2']
libraries = list( fastq_per_lib.keys() )
################## configuration end ##################
def get_fastq_file_list(wildcards):
# Obtain a list of fastq files associated to wildcards.library
global libraries
return sorted( fastq_per_lib[wildcards.library] )
def get_target_demux_list():
global libraries
targets = []
for lib in libraries:
targets.append('processed/' + lib + "/demultiplexedR1.fastq.gz" )
targets.append('processed/' + lib + "/demultiplexedR2.fastq.gz" )
return targets
def get_target_tagged_bam_list():
return [f"processed/{library}/tagged.bam" for library in libraries]
rule all:
input:
# get_target_demux_list() use this for demux only
get_target_tagged_bam_list(),
expand("processed/{library}/count_table_not_deduplicated.csv",
library=libraries),
expand("processed/{library}/count_table_deduplicated.csv",
library=libraries),
expand("processed/{library}/plots/ReadCount.png", library=libraries)
#### 1. Demultiplexing
rule SCMO_demux:
input:
fastqfiles = get_fastq_file_list
output:
temp("processed/{library}/demultiplexedR1.fastq.gz"),
temp("processed/{library}/demultiplexedR2.fastq.gz"),
temp("processed/{library}/rejectsR1.fastq.gz"),
temp("processed/{library}/rejectsR2.fastq.gz")
log:
stdout="log/demux/{library}.stdout",
stderr="log/demux/{library}.stderr"
params: runtime="30h"
resources:
mem_mb=lambda wildcards, attempt: attempt * 4000
shell:
"demux.py -merge _ {input.fastqfiles} -o processed --y -use CS2C8U6 > {log.stdout} 2> {log.stderr}"
#### 2. Trimming
rule Trim:
input:
r1="processed/{library}/demultiplexedR1.fastq.gz",
r2="processed/{library}/demultiplexedR2.fastq.gz"
output:
r1=temp("processed/{library}/trimmed.R1.fastq.gz"),
r2=temp("processed/{library}/trimmed.R2.fastq.gz")
log:
stdout="log/trim/{library}.stdout",
stderr="log/trim/{library}.stderr"
params:
runtime="30h"
resources:
mem_mb=lambda wildcards, attempt: attempt * 4000
shell:
'cutadapt -o {output.r1} -p {output.r2} \
{input.r1} {input.r2} \
-m 3 -a "IlluminaSmallAdapterConcatBait=GGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTT" \
-a "IlluminaIndexAdapter=GGAATTCTCGGGTGCCAAGGAACTCCAGTCACN{{6}}ATCTCGTATGCCGTCTTCTGCTTG" \
-A "IlluminaPairedEndPCRPrimer2.0=AGATCGGAAGAGCGN{{6}}CAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
-A "universalPrimer=GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT;min_overlap=5" \
-a "IlluminaGEX=TTTTTAATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGACGATC;min_overlap=5" \
-a "IlluminaMultiplexingPCRPrimer=GGAACTCCAGTCACN{{6}}TCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
-A "Aseq=TGGCACCCGAGAATTCCA" -a "Aseq=TGGCACCCGAGAATTCCA" \
-a "illuminaSmallRNAAdapter=TCGTATGCCGTCTTCTGCTTGT" > {log.stdout} 2> {log.stderr}'
#### 3. Mapping with STAR
rule map:
input:
r2="processed/{library}/trimmed.R2.fastq.gz"
params:
ref=config['STAR_index'],
output_dir = "processed/{library}/"
output:
bam = temp("processed/{library}/Aligned.sortedByCoord.out.bam"),
SJtemp = temp("processed/{library}/SJ.out.tab")
log:
stdout="log/map/{library}.stdout",
stderr="log/map/{library}.stderr"
threads: 8
params:
runtime="30h",
output_dir = "processed/{library}/"
resources:
mem_mb=lambda wildcards, attempt: attempt * 8000
shell:
"STAR --runThreadN 8 --outSAMtype BAM SortedByCoordinate --outFilterMultimapNmax 20 --outMultimapperOrder Random --outSAMattributes All --outSAMmultNmax 1 --readFilesCommand zcat --readFilesIn {input.r2} \
--genomeDir {params.ref} --outFileNamePrefix {params.output_dir} > {log.stdout} 2> {log.stderr}"
#### 4. Feature counting
rule featureCounting:
input:
bam = "processed/{library}/Aligned.sortedByCoord.out.bam"
output:
bam = temp("processed/{library}/Aligned.sortedByCoord.out.bam.featureCounts.bam"),
log = temp("processed/{library}/fCounts.txt")
params:
GTF = config['GTF']
log:
stdout="log/featureCounts/{library}.stdout",
stderr="log/featureCounts/{library}.stderr"
threads: 4
params: runtime="30h"
resources:
mem_mb=lambda wildcards, attempt: attempt * 8000
shell:
'featureCounts -s 1 -T 1 -R BAM -a {params.GTF} -o processed/{wildcards.library}/fCounts.txt {input.bam} > {log.stdout} 2> {log.stderr}'
#### 5. Tagging and deduplication
rule SCMO_tagmultiome_CS2:
input:
bam = "processed/{library}/Aligned.sortedByCoord.out.bam.featureCounts.bam"
output:
bamSorted = temp("processed/{library}/sorted.bam"),
bamSorted_index = temp("processed/{library}/sorted.bam.bai"),
bam = "processed/{library}/tagged.bam",
bam_index = "processed/{library}/tagged.bam.bai"
log:
stdout="log/tagging/{library}.stdout",
stderr="log/tagging/{library}.stderr"
threads: 1
params:
runtime="20h"
resources:
mem_mb=lambda wildcards, attempt: attempt * 10000
shell:
"samtools sort -T processed/{wildcards.library}/temp_sort -@ {threads} {input.bam} > processed/{wildcards.library}/sorted.unfinished.bam; \
mv processed/{wildcards.library}/sorted.unfinished.bam {output.bamSorted}; samtools index {output.bamSorted}; \
bamtagmultiome.py -method cs_feature_counts -umi_hamming_distance 0 {output.bamSorted} -o {output.bam} > {log.stdout} 2> {log.stderr}"
#### 6. Library statistics
rule SCMO_library_stats:
input:
bam = "processed/{library}/tagged.bam",
r1="processed/{library}/demultiplexedR1.fastq.gz", # Its need these to count how many raw reads were present in the lib.
r2="processed/{library}/demultiplexedR2.fastq.gz",
r1_rejects="processed/{library}/rejectsR1.fastq.gz",
r2_rejects="processed/{library}/rejectsR2.fastq.gz"
output:
"processed/{library}/plots/ReadCount.png"
log:
stdout="log/library_stats/{library}.stdout",
stderr="log/library_stats/{library}.stderr"
threads: 1
params: runtime="30h"
shell:
"libraryStatistics.py processed/{wildcards.library} -tagged_bam /tagged.bam > {log.stdout} 2> {log.stderr}"
#### 7. Make dedup and no dedup count tables
#### a. Make count tables of non-deduplicated bam files
rule SCMO_count_table_nondedup:
input:
bam = "processed/{library}/tagged.bam"
output:
countTable = "processed/{library}/count_table_not_deduplicated.csv"
threads: 1
params:
runtime="10h"
log:
stdout="log/count_table/{library}_not_deduplicated.stdout",
stderr="log/count_table/{library}_not_deduplicated.stderr"
resources:
mem_mb=lambda wildcards, attempt: attempt * 8000
shell:
"bamToCountTable.py --noNames {input.bam} -o {output.countTable} -joinedFeatureTags XT,chrom -sampleTags SM > {log.stdout} 2> {log.stderr}"
#### b. Make count tables of deduplicated bam files
# This works exactly the same as step a
rule SCMO_count_table_filtered:
input:
bam = "processed/{library}/tagged.bam"
output:
countTable = "processed/{library}/count_table_deduplicated.csv"
threads: 1
params:
runtime="10h"
log:
stdout="log/count_table/{library}_deduplicated.stdout",
stderr="log/count_table/{library}_deduplicated.stderr"
resources:
mem_mb=lambda wildcards, attempt: attempt * 8000
shell:
"bamToCountTable.py --dedup --noNames {input.bam} -o {output.countTable} -joinedFeatureTags XT,chrom -sampleTags SM > {log.stdout} 2> {log.stderr}"