from singlecellmultiomics.libraryDetection.sequencingLibraryListing import SequencingLibraryLister
from glob import glob
import collections
from singlecellmultiomics.utils import get_contig_list_from_fasta
"""
This workflow:
Starts off from a folder containing all fastq files
- Detects libraries
- Demultiplexes per library, automatically detecting the right barcodes
- Trims using cutadapt
- Maps, sorts and indexes the reads per library
- Filters bam file based on base quality tag ('SQ') - by default, the threshold is 0.98
- Filters bam file based on maximum insert size (distance between R1 and R2) to filter out read pairs where R1 and R2 are not mapped to the same amplicon
- Creates QC plots
- Creates count tables, for unfiltered data and for SQ filtered data:
a. Count table containing rows with chromosome, allele, site, scar information and columns with cell information
b. Count table to check SQ values for all reads - with rows for cells and columns for SQ scores ## not yet!
"""
################## configuration ##################
configfile: "config.json"
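
# For reference, a minimal config.json covering the keys this workflow reads
# (values below are illustrative examples, not shipped defaults; 0.98 is the
# SQ threshold mentioned in the docstring above):
# {
#     "mapper": "bwa",
#     "reference_file_for_mapper": "/path/to/reference.fa",
#     "alleles": "/path/to/allele_reference",
#     "allele_samples": "/path/to/allele_samples",
#     "SQ_filter": 0.98,
#     "maxInsertSize": 1000
# }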
# This code detects which libraries are present in the current folder:
l = SequencingLibraryLister()
LIBRARIES = l.detect(glob('*.fastq.gz'), merge='_')
# Flatten to library:[fastqfile, fastqfile, ...]
fastq_per_lib = collections.defaultdict(list)
for lib, lane_dict in LIBRARIES.items():
    for lane, read_dict in lane_dict.items():
        fastq_per_lib[lib] += read_dict['R1']
        fastq_per_lib[lib] += read_dict['R2']
libraries = list(fastq_per_lib.keys())
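# Illustrative shape of the structures above (library and lane names made up):
# LIBRARIES = {'libA': {'L001': {'R1': ['libA_L001_R1.fastq.gz'],
#                                'R2': ['libA_L001_R2.fastq.gz']}}}
# fastq_per_lib = {'libA': ['libA_L001_R1.fastq.gz', 'libA_L001_R2.fastq.gz']}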
################## configuration end ##################
def get_fastq_file_list(wildcards):
    # Obtain a sorted list of fastq files associated with wildcards.library
    return sorted(fastq_per_lib[wildcards.library])

def get_target_demux_list():
    targets = []
    for lib in libraries:
        targets.append('processed/' + lib + "/demultiplexedR1.fastq.gz")
        targets.append('processed/' + lib + "/demultiplexedR2.fastq.gz")
    return targets

def get_target_tagged_bam_list():
    return [f"processed/{library}/tagged.bam" for library in libraries]
rule all:
    input:
        # get_target_demux_list(),  # use this to run demultiplexing only
        get_target_tagged_bam_list(),
        expand("processed/{library}/count_table_unfilteredBam.csv",
               library=libraries),
        expand("processed/{library}/count_table_filteredBam.csv",
               library=libraries),
        expand("processed/{library}/plots/ReadCount.png", library=libraries)
#### 1. Demultiplexing
rule SCMO_demux:
    input:
        fastqfiles = get_fastq_file_list
    output:
        temp("processed/{library}/demultiplexedR1.fastq.gz"),
        temp("processed/{library}/demultiplexedR2.fastq.gz"),
        temp("processed/{library}/rejectsR1.fastq.gz"),
        temp("processed/{library}/rejectsR2.fastq.gz")
    log:
        stdout="log/demux/{library}.stdout",
        stderr="log/demux/{library}.stderr"
    params:
        runtime="30h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 4000
    shell:
        "demux.py -merge _ {input.fastqfiles} -o processed --y -use SCARC8R2 > {log.stdout} 2> {log.stderr}"
#### 2. Trimming
rule Trim:
    input:
        r1="processed/{library}/demultiplexedR1.fastq.gz",
        r2="processed/{library}/demultiplexedR2.fastq.gz"
    log:
        stdout="log/trim/{library}.stdout",
        stderr="log/trim/{library}.stderr"
    output:
        r1=temp("processed/{library}/trimmed.R1.fastq.gz"),
        r2=temp("processed/{library}/trimmed.R2.fastq.gz")
    params:
        runtime="30h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 4000
    shell:
        'cutadapt -o {output.r1} -p {output.r2} \
        {input.r1} {input.r2} \
        -m 3 -a "IlluminaSmallAdapterConcatBait=GGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTT" \
        -a "IlluminaIndexAdapter=GGAATTCTCGGGTGCCAAGGAACTCCAGTCACN{{6}}ATCTCGTATGCCGTCTTCTGCTTG" \
        -A "IlluminaPairedEndPCRPrimer2.0=AGATCGGAAGAGCGN{{6}}CAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
        -A "universalPrimer=GATCGTCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT;min_overlap=5" \
        -a "IlluminaGEX=TTTTTAATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGACGATC;min_overlap=5" \
        -a "IlluminaMultiplexingPCRPrimer=GGAACTCCAGTCACN{{6}}TCTCGTATGCCGTCTTCTGCTTG;min_overlap=5" \
        -A "Aseq=TGGCACCCGAGAATTCCA" -a "Aseq=TGGCACCCGAGAATTCCA" \
        -a "illuminaSmallRNAAdapter=TCGTATGCCGTCTTCTGCTTGT" > {log.stdout} 2> {log.stderr}'
#### 3. Mapping with BWA or bowtie2
rule map:
    input:
        r1="processed/{library}/trimmed.R1.fastq.gz",
        r2="processed/{library}/trimmed.R2.fastq.gz"
    output:
        bam = temp("processed/{library}/sorted.bam"),
        bam_index = temp("processed/{library}/sorted.bam.bai")
    log:
        stdout="log/map/{library}.stdout",
        stderr="log/map/{library}.stderr"
    threads: 8
    params:
        ref=config['reference_file_for_mapper'],
        runtime="30h"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 8000
    run:
        # https://stackoverflow.com/questions/40996597/snakemake-remove-output-file is probably a prettier way to clean up the intermediate files
        if config['mapper'] == 'bwa':
            # Mapping and sorting are performed as separate steps:
            shell(
                "bwa mem -M -I 220 -t {threads} {params.ref} {input.r1} {input.r2} 2> {log.stderr} | samtools view -Sb - > processed/{wildcards.library}/unsorted.bam; \
                samtools sort -T processed/{wildcards.library}/temp_sort -@ {threads} processed/{wildcards.library}/unsorted.bam > processed/{wildcards.library}/sorted.unfinished.bam; \
                mv processed/{wildcards.library}/sorted.unfinished.bam {output.bam}; rm processed/{wildcards.library}/unsorted.bam; samtools index {output.bam} > {log.stdout}"
            )
        elif config['mapper'] == 'bowtie2':
            shell(
                "bowtie2 -p {threads} -q --no-unal --local --sensitive-local -N 1 -x {params.ref} -1 {input.r1} -2 {input.r2} 2> {log.stderr} | samtools view -Sb - > processed/{wildcards.library}/unsorted.bam; \
                samtools sort -T processed/{wildcards.library}/temp_sort -@ {threads} processed/{wildcards.library}/unsorted.bam > processed/{wildcards.library}/sorted.unfinished.bam; \
                mv processed/{wildcards.library}/sorted.unfinished.bam {output.bam}; rm processed/{wildcards.library}/unsorted.bam; samtools index {output.bam} > {log.stdout}"
            )
#### 4. Tagging
# Legacy equivalent: universalBamTagger.py --ftag --scar --dedup -alleles $allele_reference -o $destdir ${bamfiles}
rule SCMO_tagmultiome_Scartrace:
    input:
        bam = "processed/{library}/sorted.bam",
        bam_index = "processed/{library}/sorted.bam.bai"
    output:
        bam = "processed/{library}/tagged.bam",
        bam_index = "processed/{library}/tagged.bam.bai"
    log:
        stdout="log/tagging/{library}.stdout",
        stderr="log/tagging/{library}.stderr"
    threads: 1
    params:
        runtime="20h",
        alleles = config['alleles'],
        allele_samples = config['allele_samples']
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 10000
    shell:
        "bamtagmultiome.py -method scartrace -allele_samples {params.allele_samples} -alleles {params.alleles} {input.bam} -o {output.bam} --every_fragment_as_molecule > {log.stdout} 2> {log.stderr}"
# Still to be implemented for faster computation: --use_allele_cache
#### 5. Filter bam file based on the base quality tag ('SQ')
rule SCMO_SQ_filter:
    input:
        bam = "processed/{library}/tagged.bam",
        bam_index = "processed/{library}/tagged.bam.bai"
    output:
        bam = "processed/{library}/tagged_SQfiltered.bam",
        bam_index = "processed/{library}/tagged_SQfiltered.bam.bai"
    params:
        SQ_filter = config['SQ_filter']
    log:
        stdout="log/SQfilter/{library}.stdout",
        stderr="log/SQfilter/{library}.stderr"
    shell:
        '''bamFilter.py {input.bam} -o {output.bam} 'r.has_tag("SQ") and r.get_tag("SQ") > {params.SQ_filter}' > {log.stdout} 2> {log.stderr}'''
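# bamFilter.py evaluates the quoted expression once per read, with the pysam
# AlignedSegment bound to `r`. A rough standalone equivalent of the filter
# above, for illustration only (not used by the workflow):
def _passes_sq_filter(read, threshold=0.98):
    # Keep only reads carrying the SQ (base quality) tag with a value above
    # the threshold; 0.98 is the default mentioned in the docstring.
    return read.has_tag("SQ") and read.get_tag("SQ") > threshold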
#### 6. Filter bam file based on maximum insert size
rule SCMO_maxInsert_filtering:
    input:
        bam = "processed/{library}/tagged_SQfiltered.bam",
        bam_index = "processed/{library}/tagged_SQfiltered.bam.bai"
    output:
        bam = "processed/{library}/tagged_filtered.bam",
        bam_index = "processed/{library}/tagged_filtered.bam.bai"
    params:
        insertSizeFilter = config['maxInsertSize']
    log:
        stdout="log/insertSizeFilter/{library}.stdout",
        stderr="log/insertSizeFilter/{library}.stderr"
    shell:
        '''bamFilter.py {input.bam} -o {output.bam} 'r.has_tag("fS") and (r.get_tag("fS") < {params.insertSizeFilter})' > {log.stdout} 2> {log.stderr}'''
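# Same mechanism as the SQ filter above: fS holds the fragment (insert) size,
# so only pairs whose R1 and R2 map within the configured maxInsertSize
# survive. Rough standalone equivalent, for illustration only:
def _passes_insert_size_filter(read, max_insert):
    # Keep only reads whose fS (fragment size) tag is below the configured maximum.
    return read.has_tag("fS") and read.get_tag("fS") < max_insert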
#### 7. Library statistics
rule SCMO_library_stats:
    input:
        bam = "processed/{library}/tagged_filtered.bam",
        # The fastq files are needed to count how many raw reads were present in the library:
        r1="processed/{library}/demultiplexedR1.fastq.gz",
        r2="processed/{library}/demultiplexedR2.fastq.gz",
        r1_rejects="processed/{library}/rejectsR1.fastq.gz",
        r2_rejects="processed/{library}/rejectsR2.fastq.gz"
    output:
        "processed/{library}/plots/ReadCount.png"
    log:
        stdout="log/library_stats/{library}.stdout",
        stderr="log/library_stats/{library}.stderr"
    threads: 1
    params:
        runtime="30h"
    shell:
        "libraryStatistics.py processed/{wildcards.library} -tagged_bam {input.bam} > {log.stdout} 2> {log.stderr}"
#### 8. Make count tables of the UNFILTERED bam files
rule SCMO_count_table_unfiltered:
    input:
        bam = "processed/{library}/tagged.bam"
    output:
        countTable = "processed/{library}/count_table_unfilteredBam.csv"
    threads: 1
    params:
        runtime="50h"
    log:
        stdout="log/count_table/{library}_unfiltered.stdout",
        stderr="log/count_table/{library}_unfiltered.stderr"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 8000
    shell:
        "bamToCountTable.py {input.bam} -o {output.countTable} -joinedFeatureTags chrom,DA,DS,SD -sampleTags SM > {log.stdout} 2> {log.stderr}"
#### 9. Make count tables of the FILTERED bam files
# This works exactly the same as step 8.
rule SCMO_count_table_filtered:
    input:
        bam = "processed/{library}/tagged_filtered.bam"
    output:
        countTable = "processed/{library}/count_table_filteredBam.csv"
    threads: 1
    params:
        runtime="50h"
    log:
        stdout="log/count_table/{library}_filtered.stdout",
        stderr="log/count_table/{library}_filtered.stderr"
    resources:
        mem_mb=lambda wildcards, attempt: attempt * 8000
    shell:
        "bamToCountTable.py {input.bam} -o {output.countTable} -joinedFeatureTags chrom,DA,DS,SD -sampleTags SM > {log.stdout} 2> {log.stderr}"