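# Paired-end mapping workflow for long RNA-seq reads: reads are mapped
# sequentially with STAR (spike-in -> UniVec -> rRNA -> genome -> circRNA),
# and reads left unmapped at each step are passed to the next index.
# Example invocation (file and config names are illustrative):
#   snakemake --snakefile mapping_long_pe.snakemake --configfile config.yaml --cores 8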
include: 'common.snakemake'
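# Sequential STAR mapping steps, in filtering order; duplicate-removed BAMs for
# genome and circRNA are tracked as extra steps when remove_duplicates_long is set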
star_map_steps = ['spikein', 'univec', 'rRNA', 'genome', 'circRNA']
map_steps = list(star_map_steps)
if config['remove_duplicates_long']:
map_steps += ['genome_rmdup', 'circRNA_rmdup']
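# Collect the final targets requested by rule all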
def get_all_inputs(wildcards):
available_inputs = dict(
        map_pe=expand('{output_dir}/bam/{sample_id}/{map_step}.bam',
output_dir=output_dir, sample_id=sample_ids, map_step=map_steps),
summarize_read_length=expand('{output_dir}/stats/mapped_read_length_by_sample/{sample_id}',
output_dir=output_dir, sample_id=sample_ids),
summarize_insert_size=expand('{output_dir}/stats/mapped_insert_size_by_sample/{sample_id}',
output_dir=output_dir, sample_id=sample_ids),
summarize_mapping_star=expand('{output_dir}/summary/mapping_star.html',
output_dir=output_dir)
)
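    # all available inputs are enabled by default; remove keys from this list
    # to skip parts of the workflow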
enabled_inputs = list(available_inputs.keys())
inputs = []
for key, l in available_inputs.items():
if key in enabled_inputs:
inputs += l
return inputs
rule all:
input:
get_all_inputs
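
# Rename reads to zero-padded sequential IDs (uniform, compact names) and
# recompress; the awk script rewrites the header and '+' lines of each FASTQ record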
rule rename_fastq_pe:
input:
auto_gzip_input('{output_dir}/cutadapt/{sample_id}_{mate_index}.fastq')
output:
'{output_dir}/unmapped/{sample_id}/clean_{mate_index}.fastq.gz'
threads:
1
wildcard_constraints:
mate_index='[12]'
shell:
r'''{bin_dir}/auto_uncompress {input} \
| awk 'NR%4==1{{printf "@%012d\n", int(NR/4);next}} NR%4==3{{printf "+\n";next}} {{print}}' \
| pigz -c -p {threads} > {output}
'''
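
# Shared STAR command template used by all map_*_pe rules below: map paired
# reads, report at most one alignment per multi-mapper (--outSAMmultNmax 1),
# and gzip the unmapped mates (--outReadsUnmapped Fastx) for the next step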
map_command_pe = '''STAR --genomeDir {params.index} \
--readFilesIn {input.reads1} {input.reads2} \
--runThreadN {threads} \
--outFileNamePrefix {params.output_prefix} \
--outSAMtype BAM Unsorted \
--outReadsUnmapped Fastx \
--readFilesCommand gzip -d -c \
--outSAMmultNmax 1 \
--seedPerWindowNmax {params.seedPerWindowNmax}
mv {params.output_prefix}Aligned.out.bam {output.bam}
gzip -c {params.output_prefix}Unmapped.out.mate1 > {output.unmapped1}
gzip -c {params.output_prefix}Unmapped.out.mate2 > {output.unmapped2}
rm -f {params.output_prefix}Unmapped.out.mate1 {params.output_prefix}Unmapped.out.mate2
'''
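
# First filtering step: map against spike-in sequences
# (index path can be overridden with config['spikein_index'])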
rule map_spikein_pe:
input:
reads1='{output_dir}/unmapped/{sample_id}/clean_1.fastq.gz',
reads2='{output_dir}/unmapped/{sample_id}/clean_2.fastq.gz',
index=config.get('spikein_index', genome_dir + '/index/star/spikein_long') + '/SA'
output:
bam='{output_dir}/bam/{sample_id}/spikein.bam',
unmapped1='{output_dir}/unmapped/{sample_id}/spikein_1.fastq.gz',
unmapped2='{output_dir}/unmapped/{sample_id}/spikein_2.fastq.gz',
log='{output_dir}/mapping_star/{sample_id}/spikein/Log.final.out'
params:
output_prefix='{output_dir}/mapping_star/{sample_id}/spikein/',
        index=config.get('spikein_index', genome_dir + '/index/star/spikein_long'),
seedPerWindowNmax=20
threads:
config['threads_mapping']
run:
shell(map_command_pe)
rule map_univec_pe:
input:
reads1='{output_dir}/unmapped/{sample_id}/spikein_1.fastq.gz',
reads2='{output_dir}/unmapped/{sample_id}/spikein_2.fastq.gz',
index=genome_dir + '/index/star/univec/SA'
output:
bam='{output_dir}/bam/{sample_id}/univec.bam',
unmapped1='{output_dir}/unmapped/{sample_id}/univec_1.fastq.gz',
unmapped2='{output_dir}/unmapped/{sample_id}/univec_2.fastq.gz',
log='{output_dir}/mapping_star/{sample_id}/univec/Log.final.out'
params:
output_prefix='{output_dir}/mapping_star/{sample_id}/univec/',
index=genome_dir + '/index/star/univec',
seedPerWindowNmax=20
threads:
config['threads_mapping']
run:
shell(map_command_pe)
rule map_rRNA_pe:
input:
reads1='{output_dir}/unmapped/{sample_id}/univec_1.fastq.gz',
reads2='{output_dir}/unmapped/{sample_id}/univec_2.fastq.gz',
index=genome_dir + '/index/star/rRNA/SA'
output:
bam='{output_dir}/bam/{sample_id}/rRNA.bam',
unmapped1='{output_dir}/unmapped/{sample_id}/rRNA_1.fastq.gz',
unmapped2='{output_dir}/unmapped/{sample_id}/rRNA_2.fastq.gz',
log='{output_dir}/mapping_star/{sample_id}/rRNA/Log.final.out'
params:
output_prefix='{output_dir}/mapping_star/{sample_id}/rRNA/',
index=genome_dir + '/index/star/rRNA',
seedPerWindowNmax=20
threads:
config['threads_mapping']
run:
shell(map_command_pe)
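
# Genome mapping keeps seedPerWindowNmax at 50 (STAR's default); the filtering
# steps above lower it to 20 to trade some sensitivity for speed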
rule map_genome_pe:
input:
reads1='{output_dir}/unmapped/{sample_id}/rRNA_1.fastq.gz',
reads2='{output_dir}/unmapped/{sample_id}/rRNA_2.fastq.gz',
index=genome_dir + '/index/star/genome_long_rna/SA'
output:
bam='{output_dir}/bam/{sample_id}/genome.bam',
unmapped1='{output_dir}/unmapped/{sample_id}/genome_1.fastq.gz',
unmapped2='{output_dir}/unmapped/{sample_id}/genome_2.fastq.gz',
log='{output_dir}/mapping_star/{sample_id}/genome/Log.final.out'
params:
output_prefix='{output_dir}/mapping_star/{sample_id}/genome/',
index=genome_dir + '/index/star/genome_long_rna',
seedPerWindowNmax=50
threads:
config['threads_mapping']
run:
shell(map_command_pe)
rule map_circRNA_pe:
input:
reads1='{output_dir}/unmapped/{sample_id}/genome_1.fastq.gz',
reads2='{output_dir}/unmapped/{sample_id}/genome_2.fastq.gz',
index=genome_dir + '/index/star/circRNA/SA'
output:
bam='{output_dir}/bam/{sample_id}/circRNA.bam',
unmapped1='{output_dir}/unmapped/{sample_id}/circRNA_1.fastq.gz',
unmapped2='{output_dir}/unmapped/{sample_id}/circRNA_2.fastq.gz',
log='{output_dir}/mapping_star/{sample_id}/circRNA/Log.final.out'
params:
output_prefix='{output_dir}/mapping_star/{sample_id}/circRNA/',
index=genome_dir + '/index/star/circRNA',
seedPerWindowNmax=20
threads:
config['threads_mapping']
run:
shell(map_command_pe)
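
# Sort BAM by read name for mate-aware downstream processing. Note that
# samtools sort -T takes a temporary-file prefix; recent samtools versions
# write temporary files inside it when it is an existing directory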
rule sort_bam_by_name:
input:
'{output_dir}/bam/{sample_id}/{map_step}.bam'
output:
'{output_dir}/bam_sorted_by_name/{sample_id}/{map_step}.bam'
params:
temp_dir=config['temp_dir']
shell:
'''samtools sort -n -T {params.temp_dir} -o {output} {input}
'''
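
# Remove PCR duplicates with Picard MarkDuplicates. READ_NAME_REGEX=null
# disables optical duplicate detection, which cannot work here anyway because
# reads were renamed to sequential integers and carry no tile/x/y coordinates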
rule remove_duplicates:
input:
bam='{output_dir}/bam/{sample_id}/{map_step}.bam'
output:
bam='{output_dir}/bam/{sample_id}/{map_step}_rmdup.bam',
metrics='{output_dir}/log/{map_step}_rmdup/{sample_id}'
wildcard_constraints:
map_step='(rRNA)|(genome)|(circRNA)'
shell:
'''picard MarkDuplicates REMOVE_DUPLICATES=true \
ASSUME_SORT_ORDER=queryname \
I={input.bam} \
O={output.bam} \
M={output.metrics} \
READ_NAME_REGEX=null
'''
rule samtools_stats:
    '''Compute alignment statistics for mapped reads with samtools stats
    '''
input:
'{output_dir}/bam/{sample_id}/{map_step}.bam'
output:
'{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
shell:
'''samtools stats {input} > {output}
'''
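
# Extract per-step metrics from the samtools stats output sections:
# SN = summary numbers, IS = insert size histogram, RL = read length histogram.
# Fragment count = (reads mapped and paired) / 2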
rule parse_samtools_stats_pe:
input:
'{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
output:
fragment_counts='{output_dir}/stats/fragment_counts/{sample_id}/{map_step}',
insert_size_average='{output_dir}/stats/insert_size_average/{sample_id}/{map_step}',
insert_size_hist='{output_dir}/stats/insert_size_hist/{sample_id}/{map_step}',
read_length_hist='{output_dir}/stats/read_length_hist/{sample_id}/{map_step}'
wildcard_constraints:
map_step='(?!clean).*'
shell:
'''awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "reads mapped and paired:") print int($3/2)}}' {input} > {output.fragment_counts}
awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "insert size average:") print $3}}' {input} > {output.insert_size_average}
awk 'BEGIN{{OFS="\t";FS="\t"}}/^IS/{{print $2,$3}}' {input} > {output.insert_size_hist}
awk 'BEGIN{{OFS="\t";FS="\t"}}/^RL/{{print $2,$3}}' {input} > {output.read_length_hist}
'''
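
# Merge per-step read length histograms into one matrix per sample
# (rows: read length, columns: map steps)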
rule summarize_read_length:
input:
lambda wildcards: expand('{output_dir}/stats/read_length_hist/{sample_id}/{map_step}',
output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, map_step=map_steps)
output:
'{output_dir}/stats/mapped_read_length_by_sample/{sample_id}'
run:
import pandas as pd
matrix = {}
for filename in input:
map_step = filename.split('/')[-1]
matrix[map_step] = pd.read_table(filename, sep='\t', index_col=0, header=None, names=['read_length', map_step]).iloc[:, 0]
matrix = pd.DataFrame(matrix)
matrix = matrix.loc[:, map_steps]
matrix.fillna(0, inplace=True)
matrix = matrix.astype('int')
matrix.index.name = 'read_length'
matrix.to_csv(output[0], sep='\t', header=True, index=True)
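
# Merge per-step insert size histograms into one matrix per sample
# (rows: insert size, columns: map steps)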
rule summarize_insert_size:
input:
lambda wildcards: expand('{output_dir}/stats/insert_size_hist/{sample_id}/{map_step}',
output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, map_step=map_steps)
output:
'{output_dir}/stats/mapped_insert_size_by_sample/{sample_id}'
run:
import pandas as pd
matrix = {}
for filename in input:
map_step = filename.split('/')[-1]
matrix[map_step] = pd.read_table(filename, sep='\t', index_col=0, header=None, names=['insert_size', map_step]).iloc[:, 0]
matrix = pd.DataFrame(matrix)
matrix = matrix.loc[:, map_steps]
matrix.fillna(0, inplace=True)
matrix = matrix.astype('int')
matrix.index.name = 'insert_size'
matrix.to_csv(output[0], sep='\t', header=True, index=True)
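
# Parse the "key | value" lines of each STAR Log.final.out into one summary
# table; the column order follows the first log file encountered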
rule summarize_mapping_star:
input:
lambda wildcards: expand('{output_dir}/mapping_star/{sample_id}/{map_step}/Log.final.out',
output_dir=wildcards.output_dir, sample_id=sample_ids, map_step=star_map_steps)
output:
'{output_dir}/summary/mapping_star.txt'
run:
import pandas as pd
records = []
columns = ['sample_id', 'map_step']
for i, filename in enumerate(input):
map_step = filename.split('/')[-2]
sample_id = filename.split('/')[-3]
record = {'sample_id': sample_id, 'map_step': map_step}
with open(filename, 'r') as fin:
for line in fin:
c = line.strip().split('|')
if len(c) == 2:
key, val = c[0].strip(), c[1].strip()
record[key] = val
if i == 0:
columns.append(key)
records.append(record)
summary = pd.DataFrame.from_records(records)
summary = summary.reindex(columns=columns)
summary.set_index('sample_id', inplace=True)
summary.index.name = 'sample_id'
summary.to_csv(output[0], sep='\t', header=True, na_rep='NA', index=True)
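
# Render the summary notebook template to HTML; nbconvert_command is assumed
# to be defined in common.snakemake (included at the top of this file)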
rule summarize_fragment_counts_jupyter:
input:
summary='{output_dir}/summary/mapping_star.txt',
jupyter=package_dir + '/templates/summarize_mapping_star.ipynb'
output:
jupyter='{output_dir}/summary/mapping_star.ipynb',
html='{output_dir}/summary/mapping_star.html'
run:
shell(nbconvert_command)