115 lines (104 with data), 4.7 kB
shell.prefix('set -x;')
include: 'common.snakemake'
def get_all_inputs(wildcards):
if config['small_rna']:
map_steps = ['transcriptome']
else:
if config['remove_duplicates_long']:
map_steps = ['genome_rmdup', 'rRNA_rmdup', 'circRNA_rmdup']
else:
map_steps = ['genome', 'rRNA', 'circRNA']
track_types = ['bigwig', 'bam']
inputs = []
inputs += expand('igv/html/{dataset}/{map_step}.{track_type}.html',
dataset=dataset, map_step=map_steps, track_type=track_types)
inputs += expand('igv/config/{dataset}/{map_step}.{track_type}.yaml',
dataset=dataset, map_step=map_steps, track_type=track_types)
return inputs
rule all:
input:
get_all_inputs
def get_reference(wildcards):
if wildcards.map_step in ('transcriptome', 'genome', 'genome_rmdup'):
return 'templates/igv/config/genome.yaml'
elif wildcards.map_step in ('rRNA', 'rRNA_rmdup'):
return 'templates/igv/config/rRNA.yaml'
elif wildcards.map_step in ('circRNA', 'circRNA_rmdup'):
return 'templates/igv/config/circRNA.yaml'
else:
raise ValueError('unknown map_step: {}'.format(wildcards.map_step))
def get_track_pattern(wildcards):
if wildcards.track_type == 'bigwig':
return 'bigwig/{dataset}/{{sample_id}}.{map_step}.{{strand}}.bigWig'.format(**wildcards)
elif wildcards.track_type == 'bam':
return 'bam/{dataset}/{{sample_id}}/{map_step}.bam'.format(**wildcards)
else:
raise ValueError('unknown track type: {}'.format(wildcards.track_type))
rule igv_config:
input:
reference=get_reference,
sample_classes=data_dir + '/sample_classes.txt'
output:
config='igv/config/{dataset}/{map_step}.{track_type}.yaml',
params:
igv_base_url=config['igv_base_url'],
max_samples_per_class=lambda wildcards: 10 if wildcards.track_type == 'bigwig' else 1,
strand=lambda wildcards: '--strand "+"' if wildcards.map_step not in ('transcriptome', 'genome', 'genome_rmdup') else '',
track_pattern=get_track_pattern
shell:
'''{bin_dir}/create_igv.py generate_config {params.strand} \
--sample-classes {input.sample_classes} \
--track-pattern '{params.track_pattern}' \
--base-url '{params.igv_base_url}' \
--max-samples-per-class {params.max_samples_per_class} \
--reference '{input.reference}' \
-o '{output.config}'
'''
def get_count_matrix(wildcards):
if config['small_rna']:
return expand('{output_dir}/count_matrix/transcript.txt', **wildcards)
else:
return expand('{output_dir}/count_matrix/featurecounts.txt', **wildcards)
rule abundant_gene_ids:
input:
matrix=get_count_matrix
output:
'{output_dir}/igv_reference/{gene_type}/gene_ids.txt'
params:
max_genes=100
run:
import pandas as pd
matrix = pd.read_table(str(input.matrix), sep='\t', index_col=0)
feature_info = matrix.index.to_series().str.split('|', expand=True)
feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'domain_id', 'transcript_id', 'start', 'end']
matrix = matrix.loc[feature_info[feature_info['gene_type'] == wildcards.gene_type].index.values]
features = matrix.mean(axis=1).sort_values(ascending=False).head(params.max_genes).index.values
gene_ids = feature_info.loc[features, 'gene_id'].values
pd.Series(gene_ids).to_csv(output[0], index=False, header=False)
rule create_reference:
input:
fa=genome_dir + '/fasta/{gene_type}.fa',
fai=genome_dir + '/fasta/{gene_type}.fa.fai',
gene_ids='{output_dir}/igv_reference/{gene_type}/gene_ids.txt'
output:
fa='{output_dir}/igv_reference/{gene_type}/reference.fa',
fai='{output_dir}/igv_reference/{gene_type}/reference.fa.fai',
cytoband='{output_dir}/igv_reference/{gene_type}/cytoband.txt',
bed='{output_dir}/igv_reference/{gene_type}/annotation.bed'
params:
output_dir='{output_dir}/igv_reference/{gene_type}'
shell:
'''{bin_dir}/create_igv.py create_reference --genome '{wildcards.gene_type}' \
--name '{wildcards.gene_type}' --fasta {input.fa} \
--gene-ids {input.gene_ids} -o {params.output_dir}
'''
rule igv_html:
input:
config='igv/config/{dataset}/{map_step}.yaml',
template='templates/igv/main.html'
output:
html='igv/html/{dataset}/{map_step}.html'
shell:
'''{bin_dir}/create_igv.py render -i {input.template} \
-c {input.config} -o {output.html}
'''