Switch to side-by-side view

--- a
+++ b/exseek/snakefiles/count_matrix_long.snakemake
@@ -0,0 +1,370 @@
+include: 'common.snakemake'
+
+map_steps = ['clean', 'spikein', 'univec', 'rRNA', 
+    'genome', 'exon', 'intron', 'promoter', 'enhancer', 'repeats', 'circRNA']
+if config['remove_duplicates_long']:
+    map_steps += ['genome_rmdup', 'exon_rmdup', 'enhancer_rmdup',
+        'promoter_rmdup', 'repeats_rmdup', 'circRNA_rmdup']
+
+count_method_regex = '(featurecounts)|(htseq)'
+
+def get_all_inputs(wildcards):
+    inputs = {}
+
+    inputs['count_matrix'] = expand('{output_dir}/count_matrix/{count_method}.txt',
+            output_dir=output_dir, count_method=config['count_method'])
+    inputs['count_genomic_reads'] = expand('{output_dir}/stats/fragment_counts/{sample_id}/{map_step}',
+            output_dir=output_dir, sample_id=sample_ids, map_step=map_steps)
+    #rmdup
+
+    # summary
+    inputs['summarize_read_counts'] = expand('{output_dir}/summary/read_counts.txt',
+            output_dir=output_dir, sample_id=sample_ids)
+    return inputs
+    
+rule all:
+    input:
+        unpack(get_all_inputs)
+
+rule featurecounts:
+    input:
+        bam='{output_dir}/bam/{sample_id}/{map_step}.bam',
+        gtf=genome_dir + '/gtf/long_RNA.gtf'
+    output:
+        counts='{output_dir}/counts/featurecounts/{sample_id}/{map_step}',
+        summary='{output_dir}/counts/featurecounts/{sample_id}/{map_step}.summary'
+    params:
+        strandness={'no': 0, 'forward': 1, 'reverse': 2}.get(config['strandness'], 0),
+        paired_end={True: '-p', False: ''}[config['paired_end']],
+        min_mapping_quality=config['min_mapping_quality'],
+        count_multimap_reads='-M' if config['count_multimap_reads'] else '',
+        count_overlapping_features='-O' if config['count_overlapping_features'] else ''
+    wildcard_constraints:
+        map_step='(genome)|(genome_rmdup)'
+    log:
+        '{output_dir}/log/featurecounts/{sample_id}/{map_step}'
+    shell:
+        '''featureCounts {params.count_overlapping_features} -t exon -g gene_id {params.count_multimap_reads} \
+            -s {params.strandness} -Q {params.min_mapping_quality} \
+            {params.paired_end} -a {input.gtf} -o {output.counts} {input.bam} > {log}
+        '''
+
+rule htseq:
+    input:
+        bam='{output_dir}/bam/{sample_id}/{map_step}.bam',
+        gtf=genome_dir + '/gtf/long_RNA.gtf'
+    output:
+        counts='{output_dir}/counts/htseq/{sample_id}/{map_step}'
+    params:
+        strandness={'forward': 'yes', 'reverse': 'reverse'}.get(config['strandness'], 'no'),
+        min_mapping_quality=config['min_mapping_quality'],
+        count_overlapping_features='all' if config['count_overlapping_features'] else 'none'
+    wildcard_constraints:
+        map_step='(genome)|(genome_rmdup)'
+    shell:
+        '''htseq-count -t exon -i gene_id -f bam -a {params.min_mapping_quality} \
+            --nonunique {params.count_overlapping_features} -s {params.strandness} {input.bam} {input.gtf} > {output.counts}
+        '''
+
+rule count_circRNA:
+    input:
+        '{output_dir}/bam/{sample_id}/{map_step}.bam'
+    output:
+        '{output_dir}/counts/count_circrna/{sample_id}/{map_step}'
+    params:
+        paired_end={True: '-p', False: ''}[config['paired_end']],
+        strandness=config['strandness'],
+        min_mapping_quality=config['min_mapping_quality']
+    wildcard_constraints:
+        map_step='(circRNA)|(circRNA_rmdup)'
+    shell:
+        '''{bin_dir}/count_reads.py count_circrna -s {params.strandness} \
+            -q {params.min_mapping_quality} {params.paired_end} -i {input} -o {output}
+        '''
+
+class get_rmdup_counts:
+    def __init__(self, template, rmdup=False):
+        if rmdup:
+            self.template = template + '_rmdup'
+        else:
+            self.template = template
+
+    def __call__(self, wildcards):
+        return expand(self.template, sample_id=sample_ids, **wildcards)
+
+rule count_matrix_circrna:
+    input:
+        circrna_counts=get_rmdup_counts('{output_dir}/counts/count_circrna/{sample_id}/circRNA', rmdup=config['remove_duplicates_long']),
+        circrna_sizes=genome_dir + '/chrom_sizes/circRNA'
+    output:
+        '{output_dir}/count_matrix/circRNA.txt'
+    run:
+        import pandas as pd
+
+        # read circRNA counts from individual files
+        matrix_circrna = {}
+        for filename in input.circrna_counts:
+            sample_id = filename.split('/')[-2]
+            matrix_circrna[sample_id] = pd.read_table(filename, sep='\t', header=None, index_col=0).iloc[:, 0]
+        matrix_circrna = pd.DataFrame(matrix_circrna)
+        matrix_circrna = matrix_circrna.loc[:, sample_ids]
+        # fill missing counts with 0
+        matrix_circrna.fillna(0, inplace=True)
+        matrix_circrna = matrix_circrna.astype('int')
+        # remove circular RNAs with zero counts in all samples
+        matrix_circrna = matrix_circrna.loc[matrix_circrna.sum(axis=1) > 0].copy()
+        # annotate circRNA 
+        circrna_sizes = pd.read_table(input.circrna_sizes, sep='\t', header=None, index_col=0).iloc[:, 0]
+        circrna_ids = matrix_circrna.index.values
+        matrix_circrna.index = circrna_ids + '|circRNA|' + circrna_ids + '|' + circrna_ids\
+             + '|' + circrna_ids + '|0|' + circrna_sizes.loc[circrna_ids].values.astype('str')
+        matrix_circrna.index.name = 'feature'
+        matrix_circrna.to_csv(output[0], sep='\t', header=True, index=True, na_rep='NA')
+
+rule count_matrix:
+    input:
+        counts=get_rmdup_counts('{output_dir}/counts/{count_method}/{sample_id}/genome', rmdup=config['remove_duplicates_long']),
+        matrix_circrna='{output_dir}/count_matrix/circRNA.txt',
+        transcript_table=genome_dir + '/transcript_table/long.txt',
+        gene_length=genome_dir + '/gene_length/long_RNA'
+    output:
+        '{output_dir}/count_matrix/{count_method}.txt'
+    wildcard_constraints:
+        count_method='(featurecounts)|(htseq)'
+    run:
+        import pandas as pd
+        import re
+
+        # annotate features
+        transcript_table = pd.read_table(input.transcript_table, sep='\t', dtype='str')
+        transcript_table = transcript_table.drop_duplicates('gene_id', keep='first')
+        transcript_table.set_index('gene_id', inplace=True, drop=False)
+        # read gene counts from individual files
+        matrix = {}
+        sample_ids = []
+        for filename in input.counts:
+            sample_id = filename.split('/')[-2]
+            sample_ids.append(sample_id)
+            if wildcards.count_method == 'featurecounts':
+                matrix[sample_id] = pd.read_table(filename, comment='#', sep='\t', index_col=0)
+                matrix[sample_id] = matrix[sample_id].iloc[:, -1]
+            elif wildcards.count_method == 'htseq':
+                matrix[sample_id] = pd.read_table(filename, comment='__', sep='\t', header=None, index_col=0).iloc[:, 0]
+        matrix = pd.DataFrame(matrix)
+        matrix = matrix.loc[:, sample_ids]
+        # remove all-zero features
+        matrix = matrix.loc[matrix.sum(axis=1) > 0].copy()
+        gene_ids = matrix.index.values
+        
+        
+        # remove features not in transcript table
+        gene_ids = gene_ids[~(transcript_table.loc[gene_ids, 'gene_id'].isna().values)]
+        matrix = matrix.loc[gene_ids]
+        # read gene lengths
+        gene_lengths = pd.read_table(input.gene_length, sep='\t', index_col=0, dtype='str').loc[:, 'merged']
+        # remove features not in gene length
+        gene_ids = gene_ids[~(gene_lengths.reindex(gene_ids).isna().values)]
+        matrix = matrix.loc[gene_ids]
+        # annotate features
+        feature_names = transcript_table.loc[gene_ids, 'gene_id'].values \
+            + '|' + transcript_table.loc[gene_ids, 'gene_type'].values \
+            + '|' + transcript_table.loc[gene_ids, 'gene_name'].values \
+            + '|' + transcript_table.loc[gene_ids, 'gene_id'].values \
+            + '|' + transcript_table.loc[gene_ids, 'gene_id'].values \
+            + '|0|' + gene_lengths.loc[gene_ids].values
+        
+        matrix.index = feature_names
+        # merge gene matrix and circRNA matrix
+        matrix_circrna = pd.read_table(input.matrix_circrna, sep='\t', index_col=0)
+        matrix = pd.concat([matrix, matrix_circrna], axis=0)
+        matrix.index.name = 'feature'
+        
+        matrix.to_csv(output[0], sep='\t', header=True, index=True, na_rep='NA')
+
+rule count_matrix_mrna:
+    input:
+        '{output_dir}/count_matrix/{count_method}.txt'
+    output:
+        '{output_dir}/count_matrix/{count_method}_mrna.txt'
+    wildcard_constraints:
+        count_method=count_method_regex
+    shell:
+        '''awk 'NR==1{{print}}NR>1{{split($0,a,"|");if(a[2] == "mRNA") print}}' {input} > {output}
+        '''
+
+rule count_matrix_lncrna:
+    input:
+        '{output_dir}/count_matrix/{count_method}.txt'
+    output:
+        '{output_dir}/count_matrix/{count_method}_lncrna.txt'
+    wildcard_constraints:
+        count_method=count_method_regex
+    shell:
+        '''awk 'NR==1{{print}}NR>1{{split($0,a,"|");if(a[2] == "lncRNA") print}}' {input} > {output}
+        '''
+
+rule count_clean_reads_paired:
+    input:
+        '{output_dir}/unmapped/{sample_id}/clean_1.fastq.gz'
+    output:
+        '{output_dir}/stats/read_counts_clean/{sample_id}'
+    shell:
+        '''gzip -d -c {input} | wc -l | awk '{print int($0/4)}' > {output}
+        '''
+
+rule count_clean_fragments:
+    '''Count number of clean read pairs'''
+    input:
+        '{output_dir}/unmapped/{sample_id}/clean_1.fastq.gz'
+    output:
+        '{output_dir}/stats/fragment_counts/{sample_id}/clean'
+    threads:
+        config['threads_compress']
+    shell:
+        '''pigz -p {threads} -d -c {input} | wc -l | awk '{{print int($0/4)}}' > {output}
+        '''
+
+rule samtools_stats:
+    '''Statistics for mapped reads
+    '''
+    input:
+        '{output_dir}/bam/{sample_id}/{map_step}.bam'
+    output:
+        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
+    shell:
+        '''samtools stats {input} > {output}
+        '''
+
+rule parse_samtools_stats_pe:
+    input:
+        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
+    output:
+        fragment_counts='{output_dir}/stats/fragment_counts/{sample_id}/{map_step}',
+        insert_size_average='{output_dir}/stats/insert_size_average/{sample_id}/{map_step}',
+        insert_size_hist='{output_dir}/stats/insert_size_hist/{sample_id}/{map_step}',
+        read_length_hist='{output_dir}/stats/read_length_hist/{sample_id}/{map_step}'
+    wildcard_constraints:
+        map_step='(spikein)|(univec)|(rRNA)|(genome)'
+    shell:
+        '''awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "reads mapped and paired:") print int($3/2)}}' {input} > {output.fragment_counts}
+        awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "insert size average:") print $3}}' {input} > {output.insert_size_average}
+        awk 'BEGIN{{OFS="\t";FS="\t"}}/^IS/{{print $2,$3}}' {input} > {output.insert_size_hist}
+        awk 'BEGIN{{OFS="\t";FS="\t"}}/^RL/{{print $2,$3}}' {input} > {output.read_length_hist}
+        '''
+
+count_genomic_reads_command = r'''
+# output columns: read_id, exon_count, intron_count, promoter_count, enhancer_count, repeat_count
+# only keep proper pairs
+# convert BAM to BEDPE format
+# convert each record of BEDPE to 2 BED records
+# flip strand of mate 2
+# count reads in every annotation BED
+# count a read pair if any of its mate intersects with annotation file
+# assign read pairs to annotations
+samtools view -bf 0x2 {input.bam} \
+    | bedtools bamtobed -bedpe -mate1 \
+    | awk 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,$2,$3,$7,$8,$9; s=$10;if(s=="+"){{s="-"}}else if(s=="-"){{s="+"}}print $4,$5,$6,$7,$8,s}}' \
+    | bedtools annotate -counts {params.strand} -i - -files {input.exon} {input.intron} {input.promoter} {input.enhancer} {input.repeats} \
+    | awk 'BEGIN{{OFS="\t";FS="\t"}}{{if(NR%2==1){{a0=$4;a1=$7;a2=$8;a3=$9;a4=$10;a5=$11}}else{{print a0,a1,a2,a3,a4,a5,$4,$7,$8,$9,$10,$11}} }}' \
+    | awk 'BEGIN{{OFS="\t";FS="\t";exon=0;intron=0;promoter=0;enhancer=0;repeats=0}}
+    {{
+        if(($2>0)||($8>0)) exon++
+        else if(($3>0)||($9>0)) intron++
+        else if(($4>0)||($10>0)) promoter++
+        else if(($5>0)||($11>0)) enhancer++
+        else if(($6>0)||($12>0)) repeats++
+    }}END{{
+        print exon > "{output.exon}"
+        print intron > "{output.intron}"
+        print promoter > "{output.promoter}"
+        print enhancer > "{output.enhancer}"
+        print repeats > "{output.repeats}"
+    }}'
+'''
+
+rule count_genomic_reads:
+    '''Annotate genomic reads that are not mapped to any gene
+    '''
+    input:
+        bam='{output_dir}/bam/{sample_id}/genome.bam',
+        exon=genome_dir + '/bed/long_RNA.exon.bed',
+        intron=genome_dir + '/bed/long_RNA.intron.bed',
+        promoter=genome_dir + '/bed/promoter.stranded.bed',
+        enhancer=genome_dir + '/bed/enhancer.stranded.bed',
+        repeats=genome_dir + '/bed/rmsk.bed'
+    output:
+        exon='{output_dir}/stats/fragment_counts/{sample_id}/exon',
+        intron='{output_dir}/stats/fragment_counts/{sample_id}/intron',
+        promoter='{output_dir}/stats/fragment_counts/{sample_id}/promoter',
+        enhancer='{output_dir}/stats/fragment_counts/{sample_id}/enhancer',
+        repeats='{output_dir}/stats/fragment_counts/{sample_id}/repeats'
+    params:
+        strand=lambda wildcards: {'forward': '-s', 'reverse': '-S'}.get(config['strandness'], '')
+    run:
+        shell(count_genomic_reads_command)
+        
+rule count_genomic_reads_rmdup:
+    '''Annotate genomic reads that are not mapped to any gene
+    '''
+    input:
+        bam='{output_dir}/bam/{sample_id}/genome_rmdup.bam',
+        exon=genome_dir + '/bed/long_RNA.exon.bed',
+        intron=genome_dir + '/bed/long_RNA.intron.bed',
+        promoter=genome_dir + '/bed/promoter.stranded.bed',
+        enhancer=genome_dir + '/bed/enhancer.stranded.bed',
+        repeats=genome_dir + '/bed/rmsk.bed'
+    output:
+        exon='{output_dir}/stats/fragment_counts/{sample_id}/exon_rmdup',
+        intron='{output_dir}/stats/fragment_counts/{sample_id}/intron_rmdup',
+        promoter='{output_dir}/stats/fragment_counts/{sample_id}/promoter_rmdup',
+        enhancer='{output_dir}/stats/fragment_counts/{sample_id}/enhancer_rmdup',
+        repeats='{output_dir}/stats/fragment_counts/{sample_id}/repeats_rmdup'
+    params:
+        strand=lambda wildcards: {'forward': '-s', 'reverse': '-S'}.get(config['strandness'], '')
+    run:
+        shell(count_genomic_reads_command)
+
+rule summarize_fragment_counts:
+    input:
+        fragment_counts=lambda wildcards: expand('{output_dir}/stats/fragment_counts/{sample_id}/{map_step}',
+            output_dir=wildcards.output_dir, sample_id=sample_ids, map_step=map_steps),
+        count_matrix='{output_dir}/count_matrix/featurecounts.txt'
+    output:
+        '{output_dir}/summary/read_counts.txt'
+    wildcard_constraints:
+        count_method=count_method_regex
+    run:
+        import pandas as pd
+        # read fragment counts for each mapping step
+        fragment_counts = pd.DataFrame(index=map_steps, columns=sample_ids)
+        for filename in input.fragment_counts:
+            c = filename.split('/')
+            sample_id = c[-2]
+            map_step = c[-1]
+            with open(filename, 'r') as f:
+                fragment_counts.loc[map_step, sample_id] = int(f.read().strip())
+        fragment_counts = fragment_counts.astype('int')
+        fragment_counts.columns.name = 'sample_id'
+        fragment_counts.drop(index='circRNA', inplace=True)
+
+        # read count matrix
+        count_matrix = pd.read_table(input.count_matrix, sep='\t', index_col=0)
+        feature_info = count_matrix.index.to_series().str.split('|', expand=True)
+        feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'domain_id', 'transcript_id', 'start', 'end']
+        count_matrix = pd.concat([feature_info, count_matrix], axis=1)
+        counts_by_rnatype = count_matrix.groupby('gene_type')[sample_ids].sum()
+        counts_by_rnatype = counts_by_rnatype.loc[:, sample_ids]
+
+        matrix = pd.concat([fragment_counts, counts_by_rnatype], axis=0)
+        matrix.index.name = 'rna_type'
+        matrix.to_csv(output[0], sep='\t', header=True, index=True, na_rep='NA')
+
+rule summarize_fragment_counts_jupyter:
+    input:
+        summary='{output_dir}/summary/read_counts.txt',
+        jupyter=package_dir + '/templates/summarize_read_counts_long.ipynb'
+    output:
+        jupyter='{output_dir}/summary/read_counts.ipynb',
+        html='{output_dir}/summary/read_counts.html'
+    run:
+        shell(nbconvert_command)
\ No newline at end of file