--- /dev/null
+++ b/exseek/snakefiles/mapping_small.snakemake
@@ -0,0 +1,290 @@
+include: 'common.snakemake'
+
+from collections import OrderedDict
+
+data_dir = config['data_dir']
+output_dir = config['output_dir']
+rna_types = config['rna_types']
+adaptor = config['adaptor']
+min_read_length = config['min_read_length']
+genome_dir = config['genome_dir']
+max_read_length = config['max_read_length']
+min_base_quality = config['min_base_quality']
+temp_dir = config['temp_dir']
+
+def get_all_inputs(wildcards):
+    available_inputs = dict(
+        summarize_read_counts=expand('{output_dir}/summary/read_counts.txt',
+            output_dir=output_dir),
+        mapped_read_length=expand('{output_dir}/stats/mapped_read_length_by_sample/{sample_id}',
+            output_dir=output_dir, sample_id=sample_ids),
+        summarize_read_counts_jupyter=expand('{output_dir}/summary/read_counts.html',
+            output_dir=output_dir),
+        unmapped_other=expand('{output_dir}/unmapped/{sample_id}/other.fa.gz',
+            output_dir=output_dir, sample_id=sample_ids),
+        tbam=expand('{output_dir}/tbam/{sample_id}/{rna_type}.bam',
+            output_dir=output_dir, sample_id=sample_ids, rna_type=config['rna_types']),
+        map_genome=expand('{output_dir}/gbam/{sample_id}/genome.bam',
+            output_dir=output_dir, sample_id=sample_ids)
+    )
+    # all input groups are enabled by default; remove keys from this list to
+    # disable individual groups
+    enabled_inputs = list(available_inputs.keys())
+    inputs = []
+    for key, l in available_inputs.items():
+        if key in enabled_inputs:
+            inputs += l
+    return inputs
+
+rule all:
+    input:
+        get_all_inputs
+
+rule fastq_to_fasta_se:
+    input:
+        auto_gzip_input('{output_dir}/cutadapt/{sample_id}.fastq')
+    output:
+        '{output_dir}/unmapped/{sample_id}/clean.fa.gz'
+    threads:
+        config['threads_compress']
+    shell:
+        '''{bin_dir}/auto_uncompress {input} \
+        | fastq_to_fasta -r -n \
+        | pigz -p {threads} -c > {output}
+        '''
+
+# mapping statistics
+rule read_counts_raw:
+    input:
+        data_dir + '/fastq/{sample_id}.fastq'
+    output:
+        '{output_dir}/stats/read_counts_raw/{sample_id}'
+    shell:
+        '''wc -l < {input} | awk '{{print int($0/4)}}' > {output}
+        '''
+
+rule read_counts_mapped:
+    input:
+        lambda wildcards: '{output_dir}/{bam_type}/{sample_id}/{rna_type}.bam'.format(
+            output_dir=wildcards.output_dir,
+            bam_type='gbam' if wildcards.rna_type == 'other' else 'tbam',
+            sample_id=wildcards.sample_id, rna_type=wildcards.rna_type
+        )
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/{rna_type}'
+    wildcard_constraints:
+        rna_type='(?!promoter$)(?!enhancer$)(?!intron$)(?!repeats$)(?!genome).*'
+    shell:
+        '''bamtools count -in {input} > {output}
+        '''
+
+rule read_counts_unmapped:
+    input:
+        '{output_dir}/unmapped/{sample_id}/{rna_type}.fa.gz'
+    output:
+        '{output_dir}/stats/read_counts_unmapped/{sample_id}/{rna_type}'
+    threads:
+        config['threads_compress']
+    shell:
+        '''pigz -p {threads} -d -c {input} | wc -l | awk '{{print int($0/2)}}' > {output}
+        '''
+
+rule summarize_read_counts:
+    input:
+        mapped=lambda wildcards: expand('{output_dir}/stats/read_counts_mapped/{sample_id}/{rna_type}',
+            output_dir=wildcards.output_dir, sample_id=sample_ids,
+            rna_type=config['rna_types'] + ['other', 'promoter', 'enhancer', 'intron', 'repeats']),
+        unmapped=lambda wildcards: expand('{output_dir}/stats/read_counts_unmapped/{sample_id}/{rna_type}',
+            output_dir=wildcards.output_dir, sample_id=sample_ids, rna_type=['clean', 'other'])
+    output:
+        '{output_dir}/summary/read_counts.txt'
+    run:
+        import pandas as pd
+        import os
+        from collections import OrderedDict
+
+        records = OrderedDict()
+        for sample_id in sample_ids:
+            records[sample_id] = {}
+        for filename in input.mapped:
+            sample_id, rna_type = filename.split(os.path.sep)[-2:]
+            with open(filename, 'r') as f:
+                records[sample_id][rna_type + '.mapped'] = int(f.read().strip())
+        for filename in input.unmapped:
+            sample_id, rna_type = filename.split(os.path.sep)[-2:]
+            with open(filename, 'r') as f:
+                records[sample_id][rna_type + '.unmapped'] = int(f.read().strip())
+        # build a table with one column per sample and one row per read category
+        records = pd.DataFrame(records)
+        records.columns.name = 'sample_id'
+        records.index.name = 'reads_type'
+        records.to_csv(output[0], sep='\t', header=True, index=True)
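+
+# Illustration only (not part of the workflow): a minimal sketch of how the
+# summary written above can be consumed downstream. The path is an assumed
+# example; rows are categories such as 'miRNA.mapped', columns are samples.
+#
+#   import pandas as pd
+#   counts = pd.read_table('output/summary/read_counts.txt', index_col=0)
+#   mapped = counts.loc[counts.index.str.endswith('.mapped')]
+#   # per-sample fraction of each category among mapped reads
+#   fractions = mapped / mapped.sum(axis=0)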
+
+rule summarize_read_counts_jupyter:
+    input:
+        read_counts='{output_dir}/summary/read_counts.txt',
+        jupyter=package_dir + '/templates/summarize_read_counts_small.ipynb'
+    output:
+        jupyter='{output_dir}/summary/read_counts.ipynb',
+        html='{output_dir}/summary/read_counts.html'
+    shell:
+        '''cp {input.jupyter} {output.jupyter}
+        jupyter nbconvert --execute --to html \
+            --HTMLExporter.exclude_code_cell=False \
+            --HTMLExporter.exclude_input_prompt=True \
+            --HTMLExporter.exclude_output_prompt=True \
+            {output.jupyter}
+        '''
+
+rule mapped_read_length:
+    input:
+        '{output_dir}/tbam/{sample_id}/{rna_type}.bam'
+    output:
+        '{output_dir}/stats/mapped_read_length/{sample_id}/{rna_type}'
+    shell:
+        '''{bin_dir}/statistics.py read_length_hist --max-length 600 -i {input} -o {output}
+        '''
+
+rule merge_mapped_read_length:
+    input:
+        lambda wildcards: expand('{output_dir}/stats/mapped_read_length/{sample_id}/{rna_type}',
+            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=rna_types)
+    output:
+        '{output_dir}/stats/mapped_read_length_by_sample/{sample_id}'
+    run:
+        import os
+        import pandas as pd
+        from collections import OrderedDict
+
+        table = OrderedDict()
+        for filename in input:
+            # each per-RNA-type input file is named after its RNA type
+            rna_type = os.path.basename(filename)
+            df = pd.read_table(filename, index_col=0)
+            table[rna_type] = df['query']
+        table = pd.DataFrame(table)
+        table.to_csv(output[0], sep='\t', header=True, index=True)
+
+rule bam_to_bed:
+    input:
+        bam='{output_dir}/gbam/{sample_id}/{rna_type}.bam'
+    output:
+        bed='{output_dir}/gbed/{sample_id}/{rna_type}.bed.gz'
+    shell:
+        '''bedtools bamtobed -i {input.bam} | bedtools sort -i - | pigz -c > {output.bed}
+        '''
+
+rule count_genomic_reads:
+    '''Count genome-mapped reads by feature, assigning each read to a single
+    category in a fixed priority order (intron > promoter > enhancer > repeats)
+    '''
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        intron=genome_dir + '/bed/long_RNA.intron.bed',
+        promoter=genome_dir + '/bed/promoter.merged.bed',
+        enhancer=genome_dir + '/bed/enhancer.merged.bed',
+        repeats=genome_dir + '/bed/rmsk.bed'
+    output:
+        intron='{output_dir}/stats/read_counts_mapped/{sample_id}/intron',
+        promoter='{output_dir}/stats/read_counts_mapped/{sample_id}/promoter',
+        enhancer='{output_dir}/stats/read_counts_mapped/{sample_id}/enhancer',
+        repeats='{output_dir}/stats/read_counts_mapped/{sample_id}/repeats'
+    threads:
+        1
+    shell:
+        r'''pigz -c -d -p {threads} {input.reads} \
+        | bedtools annotate -counts -s -i - -files {input.intron} {input.promoter} {input.enhancer} {input.repeats} \
+        | awk 'BEGIN{{OFS="\t";FS="\t";intron=0;promoter=0;enhancer=0;repeats=0}}
+        {{
+            if($7 > 0) intron++
+            else if($8 > 0) promoter++
+            else if($9 > 0) enhancer++
+            else if($10 > 0) repeats++
+        }}END{{
+            print intron > "{output.intron}"
+            print promoter > "{output.promoter}"
+            print enhancer > "{output.enhancer}"
+            print repeats > "{output.repeats}"
+        }}'
+        '''
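+
+# In the awk filter above, columns 7-10 carry the per-file overlap counts
+# appended by `bedtools annotate -counts`, in the order the annotation files
+# are passed to `-files`, so each read is assigned to exactly one category by
+# priority. A rough Python equivalent of that rule, for illustration only:
+#
+#   def assign_category(intron, promoter, enhancer, repeats):
+#       for name, count in (('intron', intron), ('promoter', promoter),
+#                           ('enhancer', enhancer), ('repeats', repeats)):
+#           if count > 0:
+#               return name
+#       return None  # overlaps none of the annotations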
+
+"""
+rule count_reads_intron:
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        bed=genome_dir + '/bed/long_RNA.intron.bed'
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/intron'
+    shell:
+        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -s -a - -b {input.bed} | wc -l > {output}
+        '''
+
+rule count_reads_promoter:
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        bed=genome_dir + '/bed/promoter.merged.bed'
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/promoter'
+    shell:
+        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -a - -b {input.bed} | wc -l > {output}
+        '''
+
+rule count_reads_enhancer:
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        bed=genome_dir + '/bed/enhancer.merged.bed'
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/enhancer'
+    shell:
+        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -a - -b {input.bed} | wc -l > {output}
+        '''
+
+rule count_reads_rmsk:
+    input:
+        reads='{output_dir}/gbed/{sample_id}/other.bed.gz',
+        bed=genome_dir + '/bed/rmsk.bed'
+    output:
+        '{output_dir}/stats/read_counts_mapped/{sample_id}/repeats'
+    shell:
+        '''pigz -d -c {input.reads} | bedtools intersect -bed -wa -s -a - -b {input.bed} | wc -l > {output}
+        '''
+
+rule map_circRNA:
+    input:
+        reads='{output_dir}/unmapped/{sample_id}/other.fa.gz',
+        index=genome_dir + '/index/bowtie2/circRNA.1.bt2'
+    output:
+        unmapped='{output_dir}/unmapped/{sample_id}/circRNA.fa.gz',
+        unmapped_aligner='{output_dir}/unmapped/{sample_id}/circRNA.aligner.fa.gz',
+        bam='{output_dir}/tbam/{sample_id}/circRNA.bam',
+        bam_filtered='{output_dir}/tbam/{sample_id}/circRNA.filtered.bam'
+    params:
+        index=genome_dir + '/index/bowtie2/circRNA'
+    threads:
+        config['threads_mapping']
+    shell:
+        '''pigz -d -c {input.reads} \
+        | bowtie2 -f -p {threads} --norc --sensitive --no-unal \
+            --un-gz {output.unmapped_aligner} -x {params.index} - -S - \
+        | {bin_dir}/preprocess.py filter_circrna_reads --filtered-file >(samtools view -b -o {output.bam_filtered}) \
+        | samtools view -b -o {output.bam}
+
+        {{
+        pigz -d -c {output.unmapped_aligner}
+        samtools fasta {output.bam_filtered}
+        }} | pigz -c > {output.unmapped}
+        '''
+"""
+
+rule map_genome:
+    input:
+        reads='{output_dir}/unmapped/{sample_id}/clean.fa.gz',
+        index=genome_dir + '/genome_index/bowtie2/genome.1.bt2'
+    output:
+        unmapped='{output_dir}/unmapped/{sample_id}/genome.fa.gz',
+        bam='{output_dir}/gbam/{sample_id}/genome.bam'
+    params:
+        index=genome_dir + '/genome_index/bowtie2/genome'
+    threads:
+        config['threads_mapping']
+    shell:
+        '''pigz -d -c {input.reads} \
+        | bowtie2 -f -p {threads} --sensitive --no-unal \
+            --un-gz {output.unmapped} -x {params.index} - -S - \
+        | samtools view -b -o {output.bam}
+        '''
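
As a quick usage sketch (illustration only): assuming a Snakemake config file that defines the keys read at the top of this file (data_dir, output_dir, rna_types, genome_dir, threads_mapping, ...) plus the sample_ids and helpers provided by common.snakemake, the workflow can be invoked directly with Snakemake; the config file name below is an assumed example, and in practice these snakefiles are typically driven by the package's own command-line wrapper:

    snakemake --snakefile exseek/snakefiles/mapping_small.snakemake \
        --configfile config.yaml --cores 8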