exSEEK / Git / Diff of /exseek/snakefiles/call

Models:
DanielG/
exSEEK
Downloads: 1
Diff of /exseek/snakefiles/call_domains.snakemake [000000] .. [4c33d4]
Switch to side-by-side view

--- a
+++ b/exseek/snakefiles/call_domains.snakemake
@@ -0,0 +1,201 @@
+include: 'common.snakemake'
+
+rule all:
+    input:
+        merge_reads_by_rnatype=expand('{output_dir}/tbed_long_RNA/{sample_id}.bed.gz',
+            output_dir=output_dir, sample_id=sample_ids),
+        domain_count_matrix=expand('{output_dir}/count_matrix/long_fragments.txt',
+            output_dir=output_dir),
+        domains_localmax=expand('{output_dir}/domains_localmax/domains.bed', output_dir=output_dir),
+        domains_localmax_genome=expand('{output_dir}/domains_localmax_genome/domains.bed', output_dir=output_dir),
+        domains_localmax_by_sample_genome=expand('{output_dir}/domains_localmax_by_sample_genome/{sample_id}.bed',
+            output_dir=output_dir, sample_id=sample_ids)
+
+rule tbam_to_bed:
+    input:
+        '{output_dir}/tbam/{sample_id}/{rna_type}.bam'
+    output:
+        '{output_dir}/tbed/{sample_id}/{rna_type}.bed.gz'
+    threads: 1
+    wildcard_constraints:
+        rna_type='(?!other).*'
+    shell:
+        '''bedtools bamtobed -i {input} | pigz -c -p {threads} > {output}
+        '''
+
+rule merge_reads_by_rnatype:
+    input:
+        tbed=lambda wildcards: expand('{output_dir}/tbed/{sample_id}/{rna_type}.bed.gz',
+            output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=long_rna_types),
+        other='{output_dir}/gbed/{sample_id}/other.bed.gz'
+    output:
+        '{output_dir}/tbed_long_RNA/{sample_id}.bed.gz'
+    threads: 2
+    shell:
+        '''pigz -p {threads} -d -c {input} \
+            | bedtools sort \
+            | pigz -c -p {threads} > {output}'''
+
+
+# call domains using localmax
+rule call_domain_localmax:
+    input:
+        tbigwig='{output_dir}/tbigwig/{sample_id}.transcriptome.bigWig'
+    output:
+        bed='{output_dir}/domains_localmax_by_sample/{sample_id}.bed'
+    shell:
+        '''{bin_dir}/call_peak.py call_peaks_localmax -i {input.tbigwig} -o {output.bed}
+        '''
+
+rule domain_localmax_recurrence:
+    input:
+        bed=lambda wildcards: expand('{output_dir}/domains_localmax_by_sample/{sample_id}.bed',
+            output_dir=wildcards.output_dir, sample_id=sample_ids),
+        chrom_sizes=genome_dir + '/chrom_sizes/transcriptome_genome'
+    output:
+        '{output_dir}/domains_localmax_recurrence/recurrence.bed'
+    shell:
+        '''cat {input.bed} \
+            | bedtools sort \
+            | bedtools genomecov -i - -strand + -g {input.chrom_sizes} -bg \
+            | awk 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,$2,$3,"X",$4,"+"}}' \
+            > {output}
+        '''
+
+rule filter_domain_localmax_by_recurrence:
+    input:
+        '{output_dir}/domains_localmax_recurrence/recurrence.bed'
+    output:
+        '{output_dir}/domains_localmax/domains.bed'
+    params:
+        cov_threshold=len(sample_ids)*config['cov_threshold'],
+        min_peak_length=10
+    shell:
+        r'''awk -v c={params.cov_threshold} '$5 > c' {input} \
+            | bedtools merge -s -c 2,3,5,6 -o collapse,collapse,collapse,collapse \
+            | awk 'BEGIN{{OFS="\t";FS="\t"}} 
+            {{split($4,a,/,/); split($5,b,/,/); split($6,c,/,/); split($7,d,/,/);
+            cov=0.0;for(i=1;i<=length(a);i++){{cov+=c[i]*(b[i]-a[i]);}} 
+            cov /= $3-$2;
+            print $1,$2,$3,"peak_" NR,cov,d[1]
+            }}' \
+            | awk '($3-$2) >= {params.min_peak_length}' > {output}
+        '''
+
+rule map_domains_by_sample_to_genome:
+    input:
+        bed_long=genome_dir + '/bed/long_RNA.bed',
+        bed_tRNA=genome_dir + '/bed/tRNA.bed',
+        domains='{output_dir}/domains_localmax_by_sample/{sample_id}.bed'
+    output:
+        '{output_dir}/domains_localmax_by_sample_genome/{sample_id}.bed'
+    shell:
+        '''{{
+            grep -v '^chr' {input.domains} | {bin_dir}/tbed2gbed <(cat {input.bed_long} {input.bed_tRNA}) /dev/stdin /dev/stdout
+            awk 'BEGIN{{OFS="\t";FS="\t"}}/^chr/{{print $1,$2,$3,$4,$5,$6,0,0,0,1,$3-$2,0}}' {input.domains}
+           }} | bedtools sort > {output}
+        '''
+    
+rule map_domains_to_genome:
+    input:
+        bed_long=genome_dir + '/bed/long_RNA.bed',
+        bed_tRNA=genome_dir + '/bed/tRNA.bed',
+        domains='{output_dir}/domains_localmax/domains.bed'
+    output:
+        '{output_dir}/domains_localmax_genome/domains.bed'
+    shell:
+        '''{{
+            grep -v '^chr' {input.domains} | {bin_dir}/tbed2gbed <(cat {input.bed_long} {input.bed_tRNA}) /dev/stdin /dev/stdout
+            awk 'BEGIN{{OFS="\t";FS="\t"}}/^chr/{{print $1,$2,$3,$4,$5,$6,0,0,0,1,$3-$2,0}}' {input.domains}
+           }} | bedtools sort > {output}
+        '''
+
+rule domain_read_counts:
+    input:
+        bed='{output_dir}/tbed_long_RNA/{sample_id}.bed.gz',
+        domains='{output_dir}/domains_localmax/domains.bed'
+    output:
+        bed='{output_dir}/domain_localmax_counts/{sample_id}.bed',
+        counts=temp('{output_dir}/domain_localmax_counts/{sample_id}.txt')
+    threads: config['threads']
+    shell:
+        r'''pigz -d -c -p {threads} {input.bed} \
+                | bedtools map -s -c 4 -o collapse \
+                    -a - -b {input.domains} \
+                | awk 'BEGIN{{OFS="\t";FS="\t"}} {{if($NF==".") next; split($NF,a,",");i=int(rand()*length(a)) + 1;count[a[i]]+=1}}
+                    END{{for(name in count) print name,count[name]}}' > {output.counts}
+            awk 'BEGIN{{OFS="\t";FS="\t"}}FNR==NR{{count[$1]=$2;next}}{{$5=count[$4];if($5 == "")$5=0; print $1,$2,$3,$4,$5,$6}}'\
+                {output.counts} {input.domains} > {output.bed}
+        '''
+
+rule domain_count_matrix:
+    input:
+        peaks=lambda wildcards: expand('{output_dir}/domain_localmax_counts/{sample_id}.bed',
+            output_dir=wildcards.output_dir,
+            bin_size=config['bin_size'], 
+            pvalue=config['call_domain_pvalue'], 
+            sample_id=sample_ids),
+        transcript_table=genome_dir + '/transcript_table/all.txt',
+        domains='{output_dir}/domains_localmax/domains.bed',
+        chrom_sizes=genome_dir + '/chrom_sizes/genome'
+    output:
+        '{output_dir}/count_matrix/long_fragments.txt'
+    run:
+        import pandas as pd
+        import re
+        import numpy as np
+
+        transcript_table = pd.read_table(input.transcript_table, sep='\t', dtype='str')
+        transcript_table.drop_duplicates(['transcript_id'], inplace=True)
+        transcript_table.set_index('transcript_id', drop=False, inplace=True)
+        transcript_table = transcript_table.loc[:, ['gene_id', 'gene_name', 'gene_type', 'transcript_id', 'start', 'end']].copy()
+        # extend transcript_table with genome regions
+        chrom_sizes = pd.read_table(input.chrom_sizes, sep='\t', names=['chrom', 'end'])
+        chrom_sizes.set_index('chrom', drop=False, inplace=True)
+        domains = pd.read_table(input.domains, sep='\t', header=None,
+            names=['chrom', 'start', 'end', 'domain_id', 'score', 'strand'], dtype='str')
+
+        pat_cov = re.compile(r'{output_dir}/domain_localmax_counts/(?P<sample_id>[^\.]+).bed'.format(output_dir=output_dir))
+        mat = []
+        peak_labels = None
+        for filename in input.peaks:
+            sample_id = pat_cov.match(filename).groupdict()['sample_id']
+            df = pd.read_table(filename, header=None)
+            if peak_labels is None:
+                peak_labels = df.iloc[:, 3].values
+            df.index = df.iloc[:, 3]
+            cov = df.iloc[:, 4].copy()
+            cov.name = sample_id
+            mat.append(cov)
+        mat = pd.concat(mat, axis=1)
+        # get seq
+        seq_ids = domains['chrom'].values
+        # get transcript peaks
+        is_genome_peaks = np.isin(seq_ids, chrom_sizes['chrom'].values)
+        seq_ids_genome = seq_ids[is_genome_peaks]
+        seq_ids_transcript = seq_ids[~is_genome_peaks]
+        # annotate transcript peaks with gene information
+        # feature name format: gene_id|gene_type|gene_name|domain_id|transcript_id|start|end
+        feature_names = np.empty(mat.shape[0], dtype='object')
+        print(np.sum(~is_genome_peaks), seq_ids_transcript.shape, transcript_table.loc[seq_ids_transcript, 'gene_name'].values.shape)
+        feature_names[~is_genome_peaks] = transcript_table.loc[seq_ids_transcript, 'gene_id'].values \
+            + '|' + transcript_table.loc[seq_ids_transcript, 'gene_type'].values \
+            + '|' + transcript_table.loc[seq_ids_transcript, 'gene_name'].values \
+            + '|' + domains['domain_id'].values[~is_genome_peaks] \
+            + '|' + transcript_table.loc[seq_ids_transcript, 'transcript_id'].values \
+            + '|' + domains['start'].values[~is_genome_peaks] \
+            + '|' + domains['end'].values[~is_genome_peaks]
+        # annotate genome peaks
+        print(seq_ids_genome.shape, np.sum(is_genome_peaks))
+        gene_ids_genome = seq_ids_genome + '_' + domains['start'].values[is_genome_peaks] \
+            + '_' + domains['end'].values[is_genome_peaks] + '_' + domains['strand'].values[is_genome_peaks]
+        feature_names[is_genome_peaks] = gene_ids_genome \
+            + '|' + 'genomic' \
+            + '|' + gene_ids_genome \
+            + '|' + domains['domain_id'].values[is_genome_peaks] \
+            + '|' + domains['chrom'].values[is_genome_peaks] \
+            + '|' + domains['start'].values[is_genome_peaks] \
+            + '|' + domains['end'].values[is_genome_peaks]
+        mat.index = feature_names
+        mat.index.name = 'feature'
+        mat.to_csv(output[0], sep='\t', header=True, index=True)