include: 'common.snakemake'
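# Long RNA domain (peak) calling rules.
# output_dir, sample_ids, genome_dir, bin_dir, long_rna_types and config are assumed to be
# defined by common.snakemake and the pipeline configuration.
# A typical invocation (file names below are placeholders) might look like:
#   snakemake --snakefile call_domains_long.snakemake --configfile config.yaml --cores 8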
rule all:
input:
merge_reads_by_rnatype=expand('{output_dir}/tbed_long_RNA/{sample_id}.bed.gz',
output_dir=output_dir, sample_id=sample_ids),
domain_count_matrix=expand('{output_dir}/count_matrix/long_fragments.txt',
output_dir=output_dir),
domains_localmax=expand('{output_dir}/domains_localmax/domains.bed', output_dir=output_dir),
domains_localmax_genome=expand('{output_dir}/domains_localmax_genome/domains.bed', output_dir=output_dir),
domains_localmax_by_sample_genome=expand('{output_dir}/domains_localmax_by_sample_genome/{sample_id}.bed',
output_dir=output_dir, sample_id=sample_ids)
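# Convert transcriptome alignments (tbam) of each long RNA type to compressed BED.
# The 'other' category is excluded here (see wildcard_constraints) and is taken from the
# genome-level gbed output in merge_reads_by_rnatype instead.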
rule tbam_to_bed:
input:
'{output_dir}/tbam/{sample_id}/{rna_type}.bam'
output:
'{output_dir}/tbed/{sample_id}/{rna_type}.bed.gz'
threads: 1
wildcard_constraints:
rna_type='(?!other).*'
shell:
'''bedtools bamtobed -i {input} | pigz -c -p {threads} > {output}
'''
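# Pool reads from all long RNA types (transcript coordinates) together with the
# genome-mapped 'other' reads into one sorted BED file per sample.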
rule merge_reads_by_rnatype:
input:
tbed=lambda wildcards: expand('{output_dir}/tbed/{sample_id}/{rna_type}.bed.gz',
output_dir=wildcards.output_dir, sample_id=wildcards.sample_id, rna_type=long_rna_types),
other='{output_dir}/gbed/{sample_id}/other.bed.gz'
output:
'{output_dir}/tbed_long_RNA/{sample_id}.bed.gz'
threads: 2
shell:
'''pigz -p {threads} -d -c {input} \
| bedtools sort -i stdin \
| pigz -c -p {threads} > {output}'''
# call domains using localmax
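# call_peak.py is assumed to be shipped in {bin_dir} (configured in common.snakemake);
# it reports candidate domains in transcript coordinates, one BED file per sample.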
rule call_domain_localmax:
input:
tbigwig='{output_dir}/tbigwig/{sample_id}.transcriptome.bigWig'
output:
bed='{output_dir}/domains_localmax_by_sample/{sample_id}.bed'
shell:
'''{bin_dir}/call_peak.py call_peaks_localmax -i {input.tbigwig} -o {output.bed}
'''
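# Per-base recurrence across samples: for every position, count how many per-sample domains
# cover it (bedtools genomecov -bg on the pooled domain calls) and reformat the result as
# BED6 with the recurrence value in the score column.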
rule domain_localmax_recurrence:
input:
bed=lambda wildcards: expand('{output_dir}/domains_localmax_by_sample/{sample_id}.bed',
output_dir=wildcards.output_dir, sample_id=sample_ids),
chrom_sizes=genome_dir + '/chrom_sizes/transcriptome_genome'
output:
'{output_dir}/domains_localmax_recurrence/recurrence.bed'
shell:
'''cat {input.bed} \
| bedtools sort -i stdin \
| bedtools genomecov -i - -strand + -g {input.chrom_sizes} -bg \
| awk 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,$2,$3,"X",$4,"+"}}' \
> {output}
'''
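# Keep positions recurrent in more than a configurable fraction (cov_threshold) of samples,
# merge adjacent positions into consensus domains, score each domain by its length-weighted
# mean recurrence, and drop domains shorter than min_peak_length.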
rule filter_domain_localmax_by_recurrence:
input:
'{output_dir}/domains_localmax_recurrence/recurrence.bed'
output:
'{output_dir}/domains_localmax/domains.bed'
params:
cov_threshold=len(sample_ids)*config['cov_threshold'],
min_peak_length=10
shell:
r'''awk -v c={params.cov_threshold} '$5 > c' {input} \
| bedtools merge -i stdin -s -c 2,3,5,6 -o collapse,collapse,collapse,collapse \
| awk 'BEGIN{{OFS="\t";FS="\t"}}
{{split($4,a,/,/); split($5,b,/,/); split($6,c,/,/); split($7,d,/,/);
cov=0.0;for(i=1;i<=length(a);i++){{cov+=c[i]*(b[i]-a[i]);}}
cov /= $3-$2;
print $1,$2,$3,"peak_" NR,cov,d[1]
}}' \
| awk '($3-$2) >= {params.min_peak_length}' > {output}
'''
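# Project per-sample domains from transcript to genome coordinates: transcript records
# (names not starting with 'chr') go through tbed2gbed using the long RNA and tRNA
# annotations, while records already in genome coordinates are emitted as single-block
# BED12 entries.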
rule map_domains_by_sample_to_genome:
input:
bed_long=genome_dir + '/bed/long_RNA.bed',
bed_tRNA=genome_dir + '/bed/tRNA.bed',
domains='{output_dir}/domains_localmax_by_sample/{sample_id}.bed'
output:
'{output_dir}/domains_localmax_by_sample_genome/{sample_id}.bed'
shell:
'''{{
grep -v '^chr' {input.domains} | {bin_dir}/tbed2gbed <(cat {input.bed_long} {input.bed_tRNA}) /dev/stdin /dev/stdout
awk 'BEGIN{{OFS="\t";FS="\t"}}/^chr/{{print $1,$2,$3,$4,$5,$6,0,0,0,1,$3-$2,0}}' {input.domains}
}} | bedtools sort -i stdin > {output}
'''
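# Same transcript-to-genome projection as above, applied to the recurrence-filtered
# consensus domains.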
rule map_domains_to_genome:
input:
bed_long=genome_dir + '/bed/long_RNA.bed',
bed_tRNA=genome_dir + '/bed/tRNA.bed',
domains='{output_dir}/domains_localmax/domains.bed'
output:
'{output_dir}/domains_localmax_genome/domains.bed'
shell:
'''{{
grep -v '^chr' {input.domains} | {bin_dir}/tbed2gbed <(cat {input.bed_long} {input.bed_tRNA}) /dev/stdin /dev/stdout
awk 'BEGIN{{OFS="\t";FS="\t"}}/^chr/{{print $1,$2,$3,$4,$5,$6,0,0,0,1,$3-$2,0}}' {input.domains}
}} | bedtools sort -i stdin > {output}
'''
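# Count reads per consensus domain for each sample. A read overlapping several domains is
# assigned to one of them at random (awk's rand() is not seeded, so the assignment is
# reproducible across runs); counts are then written back onto the domain BED as the score
# column.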
rule domain_read_counts:
input:
bed='{output_dir}/tbed_long_RNA/{sample_id}.bed.gz',
domains='{output_dir}/domains_localmax/domains.bed'
output:
bed='{output_dir}/domain_localmax_counts/{sample_id}.bed',
counts=temp('{output_dir}/domain_localmax_counts/{sample_id}.txt')
threads: config['threads']
shell:
r'''pigz -d -c -p {threads} {input.bed} \
| bedtools map -s -c 4 -o collapse \
-a - -b {input.domains} \
| awk 'BEGIN{{OFS="\t";FS="\t"}} {{if($NF==".") next; split($NF,a,",");i=int(rand()*length(a)) + 1;count[a[i]]+=1}}
END{{for(name in count) print name,count[name]}}' > {output.counts}
awk 'BEGIN{{OFS="\t";FS="\t"}}FNR==NR{{count[$1]=$2;next}}{{$5=count[$4];if($5 == "")$5=0; print $1,$2,$3,$4,$5,$6}}'\
{output.counts} {input.domains} > {output.bed}
'''
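# Assemble the domain x sample read-count matrix. Row names follow the format
# gene_id|gene_type|gene_name|domain_id|transcript_id|start|end; domains on genomic
# sequences use chrom_start_end_strand as gene_id/gene_name and 'genomic' as the gene_type.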
rule domain_count_matrix:
input:
peaks=lambda wildcards: expand('{output_dir}/domain_localmax_counts/{sample_id}.bed',
output_dir=wildcards.output_dir,
sample_id=sample_ids),
transcript_table=genome_dir + '/transcript_table/all.txt',
domains='{output_dir}/domains_localmax/domains.bed',
chrom_sizes=genome_dir + '/chrom_sizes/genome'
output:
'{output_dir}/count_matrix/long_fragments.txt'
run:
import pandas as pd
import re
import numpy as np
transcript_table = pd.read_table(input.transcript_table, sep='\t', dtype='str')
transcript_table.drop_duplicates(['transcript_id'], inplace=True)
transcript_table.set_index('transcript_id', drop=False, inplace=True)
transcript_table = transcript_table.loc[:, ['gene_id', 'gene_name', 'gene_type', 'transcript_id', 'start', 'end']].copy()
# extend transcript_table with genome regions
chrom_sizes = pd.read_table(input.chrom_sizes, sep='\t', names=['chrom', 'end'])
chrom_sizes.set_index('chrom', drop=False, inplace=True)
domains = pd.read_table(input.domains, sep='\t', header=None,
names=['chrom', 'start', 'end', 'domain_id', 'score', 'strand'], dtype='str')
pat_cov = re.compile(r'{output_dir}/domain_localmax_counts/(?P<sample_id>[^\.]+).bed'.format(output_dir=output_dir))
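        # NOTE: assumes sample IDs contain no '.' and that output_dir contains no regex metacharacters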
mat = []
peak_labels = None
for filename in input.peaks:
sample_id = pat_cov.match(filename).groupdict()['sample_id']
df = pd.read_table(filename, header=None)
if peak_labels is None:
peak_labels = df.iloc[:, 3].values
df.index = df.iloc[:, 3]
cov = df.iloc[:, 4].copy()
cov.name = sample_id
mat.append(cov)
mat = pd.concat(mat, axis=1)
# get seq
seq_ids = domains['chrom'].values
# get transcript peaks
is_genome_peaks = np.isin(seq_ids, chrom_sizes['chrom'].values)
seq_ids_genome = seq_ids[is_genome_peaks]
seq_ids_transcript = seq_ids[~is_genome_peaks]
# annotate transcript peaks with gene information
# feature name format: gene_id|gene_type|gene_name|domain_id|transcript_id|start|end
feature_names = np.empty(mat.shape[0], dtype='object')
print(np.sum(~is_genome_peaks), seq_ids_transcript.shape, transcript_table.loc[seq_ids_transcript, 'gene_name'].values.shape)
feature_names[~is_genome_peaks] = transcript_table.loc[seq_ids_transcript, 'gene_id'].values \
+ '|' + transcript_table.loc[seq_ids_transcript, 'gene_type'].values \
+ '|' + transcript_table.loc[seq_ids_transcript, 'gene_name'].values \
+ '|' + domains['domain_id'].values[~is_genome_peaks] \
+ '|' + transcript_table.loc[seq_ids_transcript, 'transcript_id'].values \
+ '|' + domains['start'].values[~is_genome_peaks] \
+ '|' + domains['end'].values[~is_genome_peaks]
# annotate genome peaks
print(seq_ids_genome.shape, np.sum(is_genome_peaks))
gene_ids_genome = seq_ids_genome + '_' + domains['start'].values[is_genome_peaks] \
+ '_' + domains['end'].values[is_genome_peaks] + '_' + domains['strand'].values[is_genome_peaks]
feature_names[is_genome_peaks] = gene_ids_genome \
+ '|' + 'genomic' \
+ '|' + gene_ids_genome \
+ '|' + domains['domain_id'].values[is_genome_peaks] \
+ '|' + domains['chrom'].values[is_genome_peaks] \
+ '|' + domains['start'].values[is_genome_peaks] \
+ '|' + domains['end'].values[is_genome_peaks]
mat.index = feature_names
mat.index.name = 'feature'
mat.to_csv(output[0], sep='\t', header=True, index=True)