include: 'common.snakemake'
map_steps = ['clean', 'spikein', 'univec', 'rRNA',
             'genome', 'exon', 'intron', 'promoter', 'enhancer', 'repeats', 'circRNA']
if config['remove_duplicates_long']:
    map_steps += ['genome_rmdup', 'exon_rmdup', 'enhancer_rmdup',
                  'promoter_rmdup', 'repeats_rmdup', 'circRNA_rmdup']
count_method_regex = '(featurecounts)|(htseq)'
def get_all_inputs(wildcards):
    inputs = {}
    inputs['count_matrix'] = expand('{output_dir}/count_matrix/{count_method}.txt',
                                    output_dir=output_dir, count_method=config['count_method'])
    inputs['count_genomic_reads'] = expand('{output_dir}/stats/fragment_counts/{sample_id}/{map_step}',
                                           output_dir=output_dir, sample_id=sample_ids, map_step=map_steps)
    # summary
    inputs['summarize_read_counts'] = expand('{output_dir}/summary/read_counts.txt',
                                             output_dir=output_dir, sample_id=sample_ids)
    return inputs
rule all:
    input:
        unpack(get_all_inputs)
rule featurecounts:
    input:
        bam='{output_dir}/bam/{sample_id}/{map_step}.bam',
        gtf=genome_dir + '/gtf/long_RNA.gtf'
    output:
        counts='{output_dir}/counts/featurecounts/{sample_id}/{map_step}',
        summary='{output_dir}/counts/featurecounts/{sample_id}/{map_step}.summary'
    params:
        strandness={'no': 0, 'forward': 1, 'reverse': 2}.get(config['strandness'], 0),
        paired_end={True: '-p', False: ''}[config['paired_end']],
        min_mapping_quality=config['min_mapping_quality'],
        count_multimap_reads='-M' if config['count_multimap_reads'] else '',
        count_overlapping_features='-O' if config['count_overlapping_features'] else ''
    wildcard_constraints:
        map_step='(genome)|(genome_rmdup)'
    log:
        '{output_dir}/log/featurecounts/{sample_id}/{map_step}'
    shell:
        '''featureCounts {params.count_overlapping_features} -t exon -g gene_id {params.count_multimap_reads} \
            -s {params.strandness} -Q {params.min_mapping_quality} \
            {params.paired_end} -a {input.gtf} -o {output.counts} {input.bam} > {log}
        '''
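# featureCounts writes a tab-separated table whose last column holds the
# per-feature counts; rule count_matrix below relies on this when it selects
# `.iloc[:, -1]`.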
rule htseq:
    input:
        bam='{output_dir}/bam/{sample_id}/{map_step}.bam',
        gtf=genome_dir + '/gtf/long_RNA.gtf'
    output:
        counts='{output_dir}/counts/htseq/{sample_id}/{map_step}'
    params:
        strandness={'forward': 'yes', 'reverse': 'reverse'}.get(config['strandness'], 'no'),
        min_mapping_quality=config['min_mapping_quality'],
        count_overlapping_features='all' if config['count_overlapping_features'] else 'none'
    wildcard_constraints:
        map_step='(genome)|(genome_rmdup)'
    shell:
        '''htseq-count -t exon -i gene_id -f bam -a {params.min_mapping_quality} \
            --nonunique {params.count_overlapping_features} -s {params.strandness} {input.bam} {input.gtf} > {output.counts}
        '''
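# htseq-count appends special counter rows (__no_feature, __ambiguous,
# __too_low_aQual, __not_aligned, __alignment_not_unique) to its output;
# rule count_matrix below drops rows whose ID starts with '__'.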
rule count_circRNA:
    input:
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
    output:
        '{output_dir}/counts/count_circrna/{sample_id}/{map_step}'
    params:
        paired_end={True: '-p', False: ''}[config['paired_end']],
        strandness=config['strandness'],
        min_mapping_quality=config['min_mapping_quality']
    wildcard_constraints:
        map_step='(circRNA)|(circRNA_rmdup)'
    shell:
        '''{bin_dir}/count_reads.py count_circrna -s {params.strandness} \
            -q {params.min_mapping_quality} {params.paired_end} -i {input} -o {output}
        '''
class get_rmdup_counts:
    '''Input function factory: expands a per-sample count path template,
    appending "_rmdup" to the map step when duplicate removal is enabled
    '''
    def __init__(self, template, rmdup=False):
        if rmdup:
            self.template = template + '_rmdup'
        else:
            self.template = template

    def __call__(self, wildcards):
        return expand(self.template, sample_id=sample_ids, **wildcards)
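# Example (hypothetical sample IDs): with sample_ids = ['s1', 's2'],
# get_rmdup_counts('{output_dir}/counts/count_circrna/{sample_id}/circRNA', rmdup=True)
# called with wildcards {'output_dir': 'output'} returns
# ['output/counts/count_circrna/s1/circRNA_rmdup',
#  'output/counts/count_circrna/s2/circRNA_rmdup'].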
rule count_matrix_circrna:
    input:
        circrna_counts=get_rmdup_counts('{output_dir}/counts/count_circrna/{sample_id}/circRNA', rmdup=config['remove_duplicates_long']),
        circrna_sizes=genome_dir + '/chrom_sizes/circRNA'
    output:
        '{output_dir}/count_matrix/circRNA.txt'
    run:
        import pandas as pd
        # read circRNA counts from individual files
        matrix_circrna = {}
        for filename in input.circrna_counts:
            sample_id = filename.split('/')[-2]
            matrix_circrna[sample_id] = pd.read_table(filename, sep='\t', header=None, index_col=0).iloc[:, 0]
        matrix_circrna = pd.DataFrame(matrix_circrna)
        matrix_circrna = matrix_circrna.loc[:, sample_ids]
        # fill missing counts with 0
        matrix_circrna.fillna(0, inplace=True)
        matrix_circrna = matrix_circrna.astype('int')
        # remove circular RNAs with zero counts in all samples
        matrix_circrna = matrix_circrna.loc[matrix_circrna.sum(axis=1) > 0].copy()
        # annotate circRNA
        circrna_sizes = pd.read_table(input.circrna_sizes, sep='\t', header=None, index_col=0).iloc[:, 0]
        circrna_ids = matrix_circrna.index.values
        matrix_circrna.index = circrna_ids + '|circRNA|' + circrna_ids + '|' + circrna_ids \
            + '|' + circrna_ids + '|0|' + circrna_sizes.loc[circrna_ids].values.astype('str')
        matrix_circrna.index.name = 'feature'
        matrix_circrna.to_csv(output[0], sep='\t', header=True, index=True, na_rep='NA')
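# Feature IDs in the count matrices use 7 '|'-separated fields
# (gene_id|gene_type|gene_name|domain_id|transcript_id|start|end, as parsed in
# summarize_fragment_counts below). For circRNA rows the circRNA ID fills the
# ID-like fields and the last field is the circRNA length, e.g. a hypothetical
# row name: 'circFOO|circRNA|circFOO|circFOO|circFOO|0|250'.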
rule count_matrix:
    input:
        counts=get_rmdup_counts('{output_dir}/counts/{count_method}/{sample_id}/genome', rmdup=config['remove_duplicates_long']),
        matrix_circrna='{output_dir}/count_matrix/circRNA.txt',
        transcript_table=genome_dir + '/transcript_table/long.txt',
        gene_length=genome_dir + '/gene_length/long_RNA'
    output:
        '{output_dir}/count_matrix/{count_method}.txt'
    wildcard_constraints:
        count_method='(featurecounts)|(htseq)'
    run:
        import pandas as pd
        # annotate features
        transcript_table = pd.read_table(input.transcript_table, sep='\t', dtype='str')
        transcript_table = transcript_table.drop_duplicates('gene_id', keep='first')
        transcript_table.set_index('gene_id', inplace=True, drop=False)
        # read gene counts from individual files
        matrix = {}
        sample_ids = []
        for filename in input.counts:
            sample_id = filename.split('/')[-2]
            sample_ids.append(sample_id)
            if wildcards.count_method == 'featurecounts':
                # featureCounts: the last column holds the counts
                matrix[sample_id] = pd.read_table(filename, comment='#', sep='\t', index_col=0)
                matrix[sample_id] = matrix[sample_id].iloc[:, -1]
            elif wildcards.count_method == 'htseq':
                # pandas accepts only single-character comment markers, so read all
                # rows and drop the htseq special counters (IDs starting with '__')
                counts = pd.read_table(filename, sep='\t', header=None, index_col=0).iloc[:, 0]
                matrix[sample_id] = counts[~counts.index.str.startswith('__')]
        matrix = pd.DataFrame(matrix)
        matrix = matrix.loc[:, sample_ids]
        # remove all-zero features
        matrix = matrix.loc[matrix.sum(axis=1) > 0].copy()
        gene_ids = matrix.index.values
        # remove features not in the transcript table (use reindex so that
        # missing gene IDs yield NaN instead of a KeyError)
        gene_ids = gene_ids[~(transcript_table['gene_id'].reindex(gene_ids).isna().values)]
        matrix = matrix.loc[gene_ids]
        # read gene lengths
        gene_lengths = pd.read_table(input.gene_length, sep='\t', index_col=0, dtype='str').loc[:, 'merged']
        # remove features not in gene length
        gene_ids = gene_ids[~(gene_lengths.reindex(gene_ids).isna().values)]
        matrix = matrix.loc[gene_ids]
        # annotate features
        feature_names = transcript_table.loc[gene_ids, 'gene_id'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_type'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_name'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_id'].values \
            + '|' + transcript_table.loc[gene_ids, 'gene_id'].values \
            + '|0|' + gene_lengths.loc[gene_ids].values
        matrix.index = feature_names
        # merge gene matrix and circRNA matrix
        matrix_circrna = pd.read_table(input.matrix_circrna, sep='\t', index_col=0)
        matrix = pd.concat([matrix, matrix_circrna], axis=0)
        matrix.index.name = 'feature'
        matrix.to_csv(output[0], sep='\t', header=True, index=True, na_rep='NA')
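# The two rules below subset the combined matrix by gene_type, i.e. field 2 of
# the '|'-separated feature ID, keeping the header line intact.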
rule count_matrix_mrna:
    input:
        '{output_dir}/count_matrix/{count_method}.txt'
    output:
        '{output_dir}/count_matrix/{count_method}_mrna.txt'
    wildcard_constraints:
        count_method=count_method_regex
    shell:
        '''awk 'NR==1{{print}}NR>1{{split($0,a,"|");if(a[2] == "mRNA") print}}' {input} > {output}
        '''
rule count_matrix_lncrna:
    input:
        '{output_dir}/count_matrix/{count_method}.txt'
    output:
        '{output_dir}/count_matrix/{count_method}_lncrna.txt'
    wildcard_constraints:
        count_method=count_method_regex
    shell:
        '''awk 'NR==1{{print}}NR>1{{split($0,a,"|");if(a[2] == "lncRNA") print}}' {input} > {output}
        '''
rule count_clean_reads_paired:
    '''Count clean reads from the first mate file'''
    input:
        '{output_dir}/unmapped/{sample_id}/clean_1.fastq.gz'
    output:
        '{output_dir}/stats/read_counts_clean/{sample_id}'
    shell:
        '''gzip -d -c {input} | wc -l | awk '{{print int($0/4)}}' > {output}
        '''
rule count_clean_fragments:
    '''Count number of clean read pairs'''
    input:
        '{output_dir}/unmapped/{sample_id}/clean_1.fastq.gz'
    output:
        '{output_dir}/stats/fragment_counts/{sample_id}/clean'
    threads:
        config['threads_compress']
    shell:
        '''pigz -p {threads} -d -c {input} | wc -l | awk '{{print int($0/4)}}' > {output}
        '''
rule samtools_stats:
    '''Statistics for mapped reads'''
    input:
        '{output_dir}/bam/{sample_id}/{map_step}.bam'
    output:
        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
    shell:
        '''samtools stats {input} > {output}
        '''
rule parse_samtools_stats_pe:
    '''Extract paired-end fragment counts, insert size and read length
    histograms from the SN/IS/RL sections of samtools stats output
    '''
    input:
        '{output_dir}/samtools_stats/{sample_id}/{map_step}.txt'
    output:
        fragment_counts='{output_dir}/stats/fragment_counts/{sample_id}/{map_step}',
        insert_size_average='{output_dir}/stats/insert_size_average/{sample_id}/{map_step}',
        insert_size_hist='{output_dir}/stats/insert_size_hist/{sample_id}/{map_step}',
        read_length_hist='{output_dir}/stats/read_length_hist/{sample_id}/{map_step}'
    wildcard_constraints:
        map_step='(spikein)|(univec)|(rRNA)|(genome)'
    shell:
        '''awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "reads mapped and paired:") print int($3/2)}}' {input} > {output.fragment_counts}
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^SN/{{if($2 == "insert size average:") print $3}}' {input} > {output.insert_size_average}
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^IS/{{print $2,$3}}' {input} > {output.insert_size_hist}
        awk 'BEGIN{{OFS="\t";FS="\t"}}/^RL/{{print $2,$3}}' {input} > {output.read_length_hist}
        '''
count_genomic_reads_command = r'''
# For each proper pair, count overlaps with each annotation BED and assign the
# pair to the first matching category (exon > intron > promoter > enhancer > repeats).
# Steps:
# 1. keep only properly paired reads
# 2. convert BAM to BEDPE format
# 3. convert each BEDPE record to 2 BED records, flipping the strand of mate 2
# 4. count overlaps of every read with each annotation BED
# 5. count a read pair if either of its mates overlaps the annotation
samtools view -bf 0x2 {input.bam} \
    | bedtools bamtobed -bedpe -mate1 \
    | awk 'BEGIN{{OFS="\t";FS="\t"}}{{print $1,$2,$3,$7,$8,$9; s=$10;if(s=="+"){{s="-"}}else if(s=="-"){{s="+"}}print $4,$5,$6,$7,$8,s}}' \
    | bedtools annotate -counts {params.strand} -i - -files {input.exon} {input.intron} {input.promoter} {input.enhancer} {input.repeats} \
    | awk 'BEGIN{{OFS="\t";FS="\t"}}{{if(NR%2==1){{a0=$4;a1=$7;a2=$8;a3=$9;a4=$10;a5=$11}}else{{print a0,a1,a2,a3,a4,a5,$4,$7,$8,$9,$10,$11}} }}' \
    | awk 'BEGIN{{OFS="\t";FS="\t";exon=0;intron=0;promoter=0;enhancer=0;repeats=0}}
    {{
        if(($2>0)||($8>0)) exon++
        else if(($3>0)||($9>0)) intron++
        else if(($4>0)||($10>0)) promoter++
        else if(($5>0)||($11>0)) enhancer++
        else if(($6>0)||($12>0)) repeats++
    }}END{{
        print exon > "{output.exon}"
        print intron > "{output.intron}"
        print promoter > "{output.promoter}"
        print enhancer > "{output.enhancer}"
        print repeats > "{output.repeats}"
    }}'
'''
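# A hypothetical line after the pair-merging awk (12 columns: mate-1 read ID and
# its five overlap counts, then the same for mate 2):
#   readA  1  0  0  0  0  readA  1  0  0  0  0
# Here both mates of readA overlap an exon, so the pair is counted as 'exon'.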
rule count_genomic_reads:
    '''Annotate genomic reads that are not mapped to any gene'''
    input:
        bam='{output_dir}/bam/{sample_id}/genome.bam',
        exon=genome_dir + '/bed/long_RNA.exon.bed',
        intron=genome_dir + '/bed/long_RNA.intron.bed',
        promoter=genome_dir + '/bed/promoter.stranded.bed',
        enhancer=genome_dir + '/bed/enhancer.stranded.bed',
        repeats=genome_dir + '/bed/rmsk.bed'
    output:
        exon='{output_dir}/stats/fragment_counts/{sample_id}/exon',
        intron='{output_dir}/stats/fragment_counts/{sample_id}/intron',
        promoter='{output_dir}/stats/fragment_counts/{sample_id}/promoter',
        enhancer='{output_dir}/stats/fragment_counts/{sample_id}/enhancer',
        repeats='{output_dir}/stats/fragment_counts/{sample_id}/repeats'
    params:
        strand=lambda wildcards: {'forward': '-s', 'reverse': '-S'}.get(config['strandness'], '')
    run:
        shell(count_genomic_reads_command)
rule count_genomic_reads_rmdup:
    '''Annotate genomic reads (duplicates removed) that are not mapped to any gene'''
    input:
        bam='{output_dir}/bam/{sample_id}/genome_rmdup.bam',
        exon=genome_dir + '/bed/long_RNA.exon.bed',
        intron=genome_dir + '/bed/long_RNA.intron.bed',
        promoter=genome_dir + '/bed/promoter.stranded.bed',
        enhancer=genome_dir + '/bed/enhancer.stranded.bed',
        repeats=genome_dir + '/bed/rmsk.bed'
    output:
        exon='{output_dir}/stats/fragment_counts/{sample_id}/exon_rmdup',
        intron='{output_dir}/stats/fragment_counts/{sample_id}/intron_rmdup',
        promoter='{output_dir}/stats/fragment_counts/{sample_id}/promoter_rmdup',
        enhancer='{output_dir}/stats/fragment_counts/{sample_id}/enhancer_rmdup',
        repeats='{output_dir}/stats/fragment_counts/{sample_id}/repeats_rmdup'
    params:
        strand=lambda wildcards: {'forward': '-s', 'reverse': '-S'}.get(config['strandness'], '')
    run:
        shell(count_genomic_reads_command)
rule summarize_fragment_counts:
    input:
        fragment_counts=lambda wildcards: expand('{output_dir}/stats/fragment_counts/{sample_id}/{map_step}',
                                                 output_dir=wildcards.output_dir, sample_id=sample_ids, map_step=map_steps),
        count_matrix='{output_dir}/count_matrix/featurecounts.txt'
    output:
        '{output_dir}/summary/read_counts.txt'
    wildcard_constraints:
        count_method=count_method_regex
    run:
        import pandas as pd
        # read fragment counts for each mapping step
        fragment_counts = pd.DataFrame(index=map_steps, columns=sample_ids)
        for filename in input.fragment_counts:
            c = filename.split('/')
            sample_id = c[-2]
            map_step = c[-1]
            with open(filename, 'r') as f:
                fragment_counts.loc[map_step, sample_id] = int(f.read().strip())
        fragment_counts = fragment_counts.astype('int')
        fragment_counts.columns.name = 'sample_id'
        fragment_counts.drop(index='circRNA', inplace=True)
        # read count matrix
        count_matrix = pd.read_table(input.count_matrix, sep='\t', index_col=0)
        feature_info = count_matrix.index.to_series().str.split('|', expand=True)
        feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'domain_id', 'transcript_id', 'start', 'end']
        count_matrix = pd.concat([feature_info, count_matrix], axis=1)
        counts_by_rnatype = count_matrix.groupby('gene_type')[sample_ids].sum()
        counts_by_rnatype = counts_by_rnatype.loc[:, sample_ids]
        matrix = pd.concat([fragment_counts, counts_by_rnatype], axis=0)
        matrix.index.name = 'rna_type'
        matrix.to_csv(output[0], sep='\t', header=True, index=True, na_rep='NA')
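# A minimal sketch (pandas assumed; the path is illustrative) of reading the
# summary back for downstream inspection:
#   import pandas as pd
#   read_counts = pd.read_table('output/summary/read_counts.txt', index_col=0)
#   # rows: mapping steps plus per-gene-type totals; columns: sample IDs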
rule summarize_fragment_counts_jupyter:
    input:
        summary='{output_dir}/summary/read_counts.txt',
        jupyter=package_dir + '/templates/summarize_read_counts_long.ipynb'
    output:
        jupyter='{output_dir}/summary/read_counts.ipynb',
        html='{output_dir}/summary/read_counts.html'
    run:
        shell(nbconvert_command)