[41c1e8]: / exseek / snakefiles / call_domains.snakemake

Download this file

202 lines (188 with data), 9.4 kB

include: 'common.snakemake'

rule all:
    input:
        merge_reads_by_rnatype=expand('{output_dir}/tbed_long_RNA/{sample_id}.bed.gz',
            output_dir=output_dir, sample_id=sample_ids),
        domain_count_matrix=expand('{output_dir}/count_matrix/long_fragments.txt',
            output_dir=output_dir),
        domains_localmax=expand('{output_dir}/domains_localmax/domains.bed', output_dir=output_dir),
        domains_localmax_genome=expand('{output_dir}/domains_localmax_genome/domains.bed', output_dir=output_dir),
        domains_localmax_by_sample_genome=expand('{output_dir}/domains_localmax_by_sample_genome/{sample_id}.bed',
            output_dir=output_dir, sample_id=sample_ids)

rule tbam_to_bed:
    input:
        '{output_dir}/tbam/{sample_id}/{rna_type}.bam'
    output:
        '{output_dir}/tbed/{sample_id}/{rna_type}.bed.gz'
    threads: 1
    wildcard_constraints:
        rna_type='(?!other).*'
    shell:
        '''bedtools bamtobed -i {input} | pigz -c -p {threads} > {output}
        '''

rule merge_reads_by_rnatype:
    input:
        tbed=lambda wildcards: expand('{output_dir}/tbed/{sample_id}/{rna_type}.bed.gz',
            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=long_rna_types),
        other='{output_dir}/gbed/{sample_id}/other.bed.gz'
    output:
        '{output_dir}/tbed_long_RNA/{sample_id}.bed.gz'
    threads: 2
    shell:
        '''pigz -p {threads} -d -c {input} \
            | bedtools sort \
            | pigz -c -p {threads} > {output}'''


# call domains using localmax
rule call_domain_localmax:
    input:
        tbigwig='{output_dir}/tbigwig/{sample_id}.transcriptome.bigWig'
    output:
        bed='{output_dir}/domains_localmax_by_sample/{sample_id}.bed'
    shell:
        '''{bin_dir}/call_peak.py call_peaks_localmax -i {input.tbigwig} -o {output.bed}
        '''

rule domain_localmax_recurrence:
    input:
        bed=lambda wildcards: expand('{output_dir}/domains_localmax_by_sample/{sample_id}.bed',
            output_dir=wildcards.output_dir, sample_id=sample_ids),
        chrom_sizes=genome_dir + '/chrom_sizes/transcriptome_genome'
    output:
        '{output_dir}/domains_localmax_recurrence/recurrence.bed'
    shell:
        '''cat {input.bed} \
            | bedtools sort \
            | bedtools genomecov -i - -strand + -g {input.chrom_sizes} -bg \
            | awk 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,$2,$3,"X",$4,"+"}}' \
            > {output}
        '''

rule filter_domain_localmax_by_recurrence:
    input:
        '{output_dir}/domains_localmax_recurrence/recurrence.bed'
    output:
        '{output_dir}/domains_localmax/domains.bed'
    params:
        cov_threshold=len(sample_ids)*config['cov_threshold'],
        min_peak_length=10
    shell:
        r'''awk -v c={params.cov_threshold} '$5 > c' {input} \
            | bedtools merge -s -c 2,3,5,6 -o collapse,collapse,collapse,collapse \
            | awk 'BEGIN{{OFS="\t";FS="\t"}} 
            {{split($4,a,/,/); split($5,b,/,/); split($6,c,/,/); split($7,d,/,/);
            cov=0.0;for(i=1;i<=length(a);i++){{cov+=c[i]*(b[i]-a[i]);}} 
            cov /= $3-$2;
            print $1,$2,$3,"peak_" NR,cov,d[1]
            }}' \
            | awk '($3-$2) >= {params.min_peak_length}' > {output}
        '''

rule map_domains_by_sample_to_genome:
    input:
        bed_long=genome_dir + '/bed/long_RNA.bed',
        bed_tRNA=genome_dir + '/bed/tRNA.bed',
        domains='{output_dir}/domains_localmax_by_sample/{sample_id}.bed'
    output:
        '{output_dir}/domains_localmax_by_sample_genome/{sample_id}.bed'
    shell:
        '''{{
            grep -v '^chr' {input.domains} | {bin_dir}/tbed2gbed <(cat {input.bed_long} {input.bed_tRNA}) /dev/stdin /dev/stdout
            awk 'BEGIN{{OFS="\t";FS="\t"}}/^chr/{{print $1,$2,$3,$4,$5,$6,0,0,0,1,$3-$2,0}}' {input.domains}
           }} | bedtools sort > {output}
        '''
    
rule map_domains_to_genome:
    input:
        bed_long=genome_dir + '/bed/long_RNA.bed',
        bed_tRNA=genome_dir + '/bed/tRNA.bed',
        domains='{output_dir}/domains_localmax/domains.bed'
    output:
        '{output_dir}/domains_localmax_genome/domains.bed'
    shell:
        '''{{
            grep -v '^chr' {input.domains} | {bin_dir}/tbed2gbed <(cat {input.bed_long} {input.bed_tRNA}) /dev/stdin /dev/stdout
            awk 'BEGIN{{OFS="\t";FS="\t"}}/^chr/{{print $1,$2,$3,$4,$5,$6,0,0,0,1,$3-$2,0}}' {input.domains}
           }} | bedtools sort > {output}
        '''

rule domain_read_counts:
    input:
        bed='{output_dir}/tbed_long_RNA/{sample_id}.bed.gz',
        domains='{output_dir}/domains_localmax/domains.bed'
    output:
        bed='{output_dir}/domain_localmax_counts/{sample_id}.bed',
        counts=temp('{output_dir}/domain_localmax_counts/{sample_id}.txt')
    threads: config['threads']
    shell:
        r'''pigz -d -c -p {threads} {input.bed} \
                | bedtools map -s -c 4 -o collapse \
                    -a - -b {input.domains} \
                | awk 'BEGIN{{OFS="\t";FS="\t"}} {{if($NF==".") next; split($NF,a,",");i=int(rand()*length(a)) + 1;count[a[i]]+=1}}
                    END{{for(name in count) print name,count[name]}}' > {output.counts}
            awk 'BEGIN{{OFS="\t";FS="\t"}}FNR==NR{{count[$1]=$2;next}}{{$5=count[$4];if($5 == "")$5=0; print $1,$2,$3,$4,$5,$6}}'\
                {output.counts} {input.domains} > {output.bed}
        '''

rule domain_count_matrix:
    input:
        peaks=lambda wildcards: expand('{output_dir}/domain_localmax_counts/{sample_id}.bed',
            output_dir=wildcards.output_dir,
            bin_size=config['bin_size'], 
            pvalue=config['call_domain_pvalue'], 
            sample_id=sample_ids),
        transcript_table=genome_dir + '/transcript_table/all.txt',
        domains='{output_dir}/domains_localmax/domains.bed',
        chrom_sizes=genome_dir + '/chrom_sizes/genome'
    output:
        '{output_dir}/count_matrix/long_fragments.txt'
    run:
        import pandas as pd
        import re
        import numpy as np

        transcript_table = pd.read_table(input.transcript_table, sep='\t', dtype='str')
        transcript_table.drop_duplicates(['transcript_id'], inplace=True)
        transcript_table.set_index('transcript_id', drop=False, inplace=True)
        transcript_table = transcript_table.loc[:, ['gene_id', 'gene_name', 'gene_type', 'transcript_id', 'start', 'end']].copy()
        # extend transcript_table with genome regions
        chrom_sizes = pd.read_table(input.chrom_sizes, sep='\t', names=['chrom', 'end'])
        chrom_sizes.set_index('chrom', drop=False, inplace=True)
        domains = pd.read_table(input.domains, sep='\t', header=None,
            names=['chrom', 'start', 'end', 'domain_id', 'score', 'strand'], dtype='str')

        pat_cov = re.compile(r'{output_dir}/domain_localmax_counts/(?P<sample_id>[^\.]+).bed'.format(output_dir=output_dir))
        mat = []
        peak_labels = None
        for filename in input.peaks:
            sample_id = pat_cov.match(filename).groupdict()['sample_id']
            df = pd.read_table(filename, header=None)
            if peak_labels is None:
                peak_labels = df.iloc[:, 3].values
            df.index = df.iloc[:, 3]
            cov = df.iloc[:, 4].copy()
            cov.name = sample_id
            mat.append(cov)
        mat = pd.concat(mat, axis=1)
        # get seq
        seq_ids = domains['chrom'].values
        # get transcript peaks
        is_genome_peaks = np.isin(seq_ids, chrom_sizes['chrom'].values)
        seq_ids_genome = seq_ids[is_genome_peaks]
        seq_ids_transcript = seq_ids[~is_genome_peaks]
        # annotate transcript peaks with gene information
        # feature name format: gene_id|gene_type|gene_name|domain_id|transcript_id|start|end
        feature_names = np.empty(mat.shape[0], dtype='object')
        print(np.sum(~is_genome_peaks), seq_ids_transcript.shape, transcript_table.loc[seq_ids_transcript, 'gene_name'].values.shape)
        feature_names[~is_genome_peaks] = transcript_table.loc[seq_ids_transcript, 'gene_id'].values \
            + '|' + transcript_table.loc[seq_ids_transcript, 'gene_type'].values \
            + '|' + transcript_table.loc[seq_ids_transcript, 'gene_name'].values \
            + '|' + domains['domain_id'].values[~is_genome_peaks] \
            + '|' + transcript_table.loc[seq_ids_transcript, 'transcript_id'].values \
            + '|' + domains['start'].values[~is_genome_peaks] \
            + '|' + domains['end'].values[~is_genome_peaks]
        # annotate genome peaks
        print(seq_ids_genome.shape, np.sum(is_genome_peaks))
        gene_ids_genome = seq_ids_genome + '_' + domains['start'].values[is_genome_peaks] \
            + '_' + domains['end'].values[is_genome_peaks] + '_' + domains['strand'].values[is_genome_peaks]
        feature_names[is_genome_peaks] = gene_ids_genome \
            + '|' + 'genomic' \
            + '|' + gene_ids_genome \
            + '|' + domains['domain_id'].values[is_genome_peaks] \
            + '|' + domains['chrom'].values[is_genome_peaks] \
            + '|' + domains['start'].values[is_genome_peaks] \
            + '|' + domains['end'].values[is_genome_peaks]
        mat.index = feature_names
        mat.index.name = 'feature'
        mat.to_csv(output[0], sep='\t', header=True, index=True)