--- a +++ b/exseek/snakefiles/call_domains.snakemake @@ -0,0 +1,201 @@ +include: 'common.snakemake' + +rule all: + input: + merge_reads_by_rnatype=expand('{output_dir}/tbed_long_RNA/{sample_id}.bed.gz', + output_dir=output_dir, sample_id=sample_ids), + domain_count_matrix=expand('{output_dir}/count_matrix/long_fragments.txt', + output_dir=output_dir), + domains_localmax=expand('{output_dir}/domains_localmax/domains.bed', output_dir=output_dir), + domains_localmax_genome=expand('{output_dir}/domains_localmax_genome/domains.bed', output_dir=output_dir), + domains_localmax_by_sample_genome=expand('{output_dir}/domains_localmax_by_sample_genome/{sample_id}.bed', + output_dir=output_dir, sample_id=sample_ids) + +rule tbam_to_bed: + input: + '{output_dir}/tbam/{sample_id}/{rna_type}.bam' + output: + '{output_dir}/tbed/{sample_id}/{rna_type}.bed.gz' + threads: 1 + wildcard_constraints: + rna_type='(?!other).*' + shell: + '''bedtools bamtobed -i {input} | pigz -c -p {threads} > {output} + ''' + +rule merge_reads_by_rnatype: + input: + tbed=lambda wildcards: expand('{output_dir}/tbed/{sample_id}/{rna_type}.bed.gz', + output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=long_rna_types), + other='{output_dir}/gbed/{sample_id}/other.bed.gz' + output: + '{output_dir}/tbed_long_RNA/{sample_id}.bed.gz' + threads: 2 + shell: + '''pigz -p {threads} -d -c {input} \ + | bedtools sort \ + | pigz -c -p {threads} > {output}''' + + +# call domains using localmax +rule call_domain_localmax: + input: + tbigwig='{output_dir}/tbigwig/{sample_id}.transcriptome.bigWig' + output: + bed='{output_dir}/domains_localmax_by_sample/{sample_id}.bed' + shell: + '''{bin_dir}/call_peak.py call_peaks_localmax -i {input.tbigwig} -o {output.bed} + ''' + +rule domain_localmax_recurrence: + input: + bed=lambda wildcards: expand('{output_dir}/domains_localmax_by_sample/{sample_id}.bed', + output_dir=wildcards.output_dir, sample_id=sample_ids), + chrom_sizes=genome_dir + '/chrom_sizes/transcriptome_genome' + output: + '{output_dir}/domains_localmax_recurrence/recurrence.bed' + shell: + '''cat {input.bed} \ + | bedtools sort \ + | bedtools genomecov -i - -strand + -g {input.chrom_sizes} -bg \ + | awk 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,$2,$3,"X",$4,"+"}}' \ + > {output} + ''' + +rule filter_domain_localmax_by_recurrence: + input: + '{output_dir}/domains_localmax_recurrence/recurrence.bed' + output: + '{output_dir}/domains_localmax/domains.bed' + params: + cov_threshold=len(sample_ids)*config['cov_threshold'], + min_peak_length=10 + shell: + r'''awk -v c={params.cov_threshold} '$5 > c' {input} \ + | bedtools merge -s -c 2,3,5,6 -o collapse,collapse,collapse,collapse \ + | awk 'BEGIN{{OFS="\t";FS="\t"}} + {{split($4,a,/,/); split($5,b,/,/); split($6,c,/,/); split($7,d,/,/); + cov=0.0;for(i=1;i<=length(a);i++){{cov+=c[i]*(b[i]-a[i]);}} + cov /= $3-$2; + print $1,$2,$3,"peak_" NR,cov,d[1] + }}' \ + | awk '($3-$2) >= {params.min_peak_length}' > {output} + ''' + +rule map_domains_by_sample_to_genome: + input: + bed_long=genome_dir + '/bed/long_RNA.bed', + bed_tRNA=genome_dir + '/bed/tRNA.bed', + domains='{output_dir}/domains_localmax_by_sample/{sample_id}.bed' + output: + '{output_dir}/domains_localmax_by_sample_genome/{sample_id}.bed' + shell: + '''{{ + grep -v '^chr' {input.domains} | {bin_dir}/tbed2gbed <(cat {input.bed_long} {input.bed_tRNA}) /dev/stdin /dev/stdout + awk 'BEGIN{{OFS="\t";FS="\t"}}/^chr/{{print $1,$2,$3,$4,$5,$6,0,0,0,1,$3-$2,0}}' {input.domains} + }} | bedtools sort > {output} + ''' + +rule map_domains_to_genome: + input: + bed_long=genome_dir + '/bed/long_RNA.bed', + bed_tRNA=genome_dir + '/bed/tRNA.bed', + domains='{output_dir}/domains_localmax/domains.bed' + output: + '{output_dir}/domains_localmax_genome/domains.bed' + shell: + '''{{ + grep -v '^chr' {input.domains} | {bin_dir}/tbed2gbed <(cat {input.bed_long} {input.bed_tRNA}) /dev/stdin /dev/stdout + awk 'BEGIN{{OFS="\t";FS="\t"}}/^chr/{{print $1,$2,$3,$4,$5,$6,0,0,0,1,$3-$2,0}}' {input.domains} + }} | bedtools sort > {output} + ''' + +rule domain_read_counts: + input: + bed='{output_dir}/tbed_long_RNA/{sample_id}.bed.gz', + domains='{output_dir}/domains_localmax/domains.bed' + output: + bed='{output_dir}/domain_localmax_counts/{sample_id}.bed', + counts=temp('{output_dir}/domain_localmax_counts/{sample_id}.txt') + threads: config['threads'] + shell: + r'''pigz -d -c -p {threads} {input.bed} \ + | bedtools map -s -c 4 -o collapse \ + -a - -b {input.domains} \ + | awk 'BEGIN{{OFS="\t";FS="\t"}} {{if($NF==".") next; split($NF,a,",");i=int(rand()*length(a)) + 1;count[a[i]]+=1}} + END{{for(name in count) print name,count[name]}}' > {output.counts} + awk 'BEGIN{{OFS="\t";FS="\t"}}FNR==NR{{count[$1]=$2;next}}{{$5=count[$4];if($5 == "")$5=0; print $1,$2,$3,$4,$5,$6}}'\ + {output.counts} {input.domains} > {output.bed} + ''' + +rule domain_count_matrix: + input: + peaks=lambda wildcards: expand('{output_dir}/domain_localmax_counts/{sample_id}.bed', + output_dir=wildcards.output_dir, + bin_size=config['bin_size'], + pvalue=config['call_domain_pvalue'], + sample_id=sample_ids), + transcript_table=genome_dir + '/transcript_table/all.txt', + domains='{output_dir}/domains_localmax/domains.bed', + chrom_sizes=genome_dir + '/chrom_sizes/genome' + output: + '{output_dir}/count_matrix/long_fragments.txt' + run: + import pandas as pd + import re + import numpy as np + + transcript_table = pd.read_table(input.transcript_table, sep='\t', dtype='str') + transcript_table.drop_duplicates(['transcript_id'], inplace=True) + transcript_table.set_index('transcript_id', drop=False, inplace=True) + transcript_table = transcript_table.loc[:, ['gene_id', 'gene_name', 'gene_type', 'transcript_id', 'start', 'end']].copy() + # extend transcript_table with genome regions + chrom_sizes = pd.read_table(input.chrom_sizes, sep='\t', names=['chrom', 'end']) + chrom_sizes.set_index('chrom', drop=False, inplace=True) + domains = pd.read_table(input.domains, sep='\t', header=None, + names=['chrom', 'start', 'end', 'domain_id', 'score', 'strand'], dtype='str') + + pat_cov = re.compile(r'{output_dir}/domain_localmax_counts/(?P<sample_id>[^\.]+).bed'.format(output_dir=output_dir)) + mat = [] + peak_labels = None + for filename in input.peaks: + sample_id = pat_cov.match(filename).groupdict()['sample_id'] + df = pd.read_table(filename, header=None) + if peak_labels is None: + peak_labels = df.iloc[:, 3].values + df.index = df.iloc[:, 3] + cov = df.iloc[:, 4].copy() + cov.name = sample_id + mat.append(cov) + mat = pd.concat(mat, axis=1) + # get seq + seq_ids = domains['chrom'].values + # get transcript peaks + is_genome_peaks = np.isin(seq_ids, chrom_sizes['chrom'].values) + seq_ids_genome = seq_ids[is_genome_peaks] + seq_ids_transcript = seq_ids[~is_genome_peaks] + # annotate transcript peaks with gene information + # feature name format: gene_id|gene_type|gene_name|domain_id|transcript_id|start|end + feature_names = np.empty(mat.shape[0], dtype='object') + print(np.sum(~is_genome_peaks), seq_ids_transcript.shape, transcript_table.loc[seq_ids_transcript, 'gene_name'].values.shape) + feature_names[~is_genome_peaks] = transcript_table.loc[seq_ids_transcript, 'gene_id'].values \ + + '|' + transcript_table.loc[seq_ids_transcript, 'gene_type'].values \ + + '|' + transcript_table.loc[seq_ids_transcript, 'gene_name'].values \ + + '|' + domains['domain_id'].values[~is_genome_peaks] \ + + '|' + transcript_table.loc[seq_ids_transcript, 'transcript_id'].values \ + + '|' + domains['start'].values[~is_genome_peaks] \ + + '|' + domains['end'].values[~is_genome_peaks] + # annotate genome peaks + print(seq_ids_genome.shape, np.sum(is_genome_peaks)) + gene_ids_genome = seq_ids_genome + '_' + domains['start'].values[is_genome_peaks] \ + + '_' + domains['end'].values[is_genome_peaks] + '_' + domains['strand'].values[is_genome_peaks] + feature_names[is_genome_peaks] = gene_ids_genome \ + + '|' + 'genomic' \ + + '|' + gene_ids_genome \ + + '|' + domains['domain_id'].values[is_genome_peaks] \ + + '|' + domains['chrom'].values[is_genome_peaks] \ + + '|' + domains['start'].values[is_genome_peaks] \ + + '|' + domains['end'].values[is_genome_peaks] + mat.index = feature_names + mat.index.name = 'feature' + mat.to_csv(output[0], sep='\t', header=True, index=True)