# exseek/snakefiles/common.snakemake

shell.prefix('set -x; set -e;')
import os
import re
import yaml
from collections import OrderedDict
from glob import glob

def require_variable(variable, condition=None):
    value = config.get(variable)
    if value is None:
        raise ValueError('configuration variable "{}" is required'.format(variable))
    if (condition == 'input_dir') and (not os.path.isdir(value)):
        raise ValueError('cannot find input directory {}: {}'.format(variable, value))
    elif (condition == 'input_file') and (not os.path.isfile(value)):
        raise ValueError('cannot find input file {}: {}'.format(variable, value))
    return value

def get_config_file(filename):
    for config_dir in config_dirs:
        if os.path.isfile(os.path.join(config_dir, filename)):
            return os.path.join(config_dir, filename)

# set up global variables
package_dir = require_variable('package_dir')
root_dir = require_variable('root_dir', 'input_dir')
config_dirs = require_variable('config_dirs')
config_dirs = config_dirs.split(':')
# load default config
with open(get_config_file('default_config.yaml'), 'r') as f:
    default_config = yaml.safe_load(f)
# load selector/classifier (machine learning) configuration
with open(get_config_file('machine_learning.yaml'), 'r') as f:
    default_config['machine_learning'] = yaml.safe_load(f)

default_config.update(config)
config = default_config

dataset = config['dataset']
data_dir = require_variable('data_dir', 'input_dir')
genome_dir = require_variable('genome_dir')
bin_dir = require_variable('bin_dir', 'input_dir')
output_dir = require_variable('output_dir')
rna_types = require_variable('rna_types')
tools_dir = require_variable('tools_dir')
temp_dir = require_variable('temp_dir')
# r_dir = require_variable('r_dir')
# create temp_dir
if not os.path.isdir(temp_dir):
    os.makedirs(temp_dir)
    
# read sample ids from file
with open(os.path.join(data_dir, 'sample_ids.txt'), 'r') as f:
    sample_ids = f.read().split()
for sample_id in sample_ids:
    if '.' in sample_id:
        raise ValueError('"." is not allowed in sample ID: {}'.format(sample_id))

# RNA types that have an annotation GTF
rna_types_with_gtf = []
for rna_type in config['rna_types']:
    if rna_type not in ('univec', 'rRNA', 'spikein'):
        rna_types_with_gtf.append(rna_type)

# long RNA types
long_rna_types = list(filter(lambda x: x not in ('miRNA', 'piRNA', 'rRNA', 'spikein', 'univec'), config['rna_types']))

# read adapter sequences
for key in ('adaptor', 'adaptor_5p'):
    if isinstance(config[key], str):
        if config[key].startswith('file:'):
            filename = config[key][5:].strip()
            adapters = {}
            with open(filename, 'r') as f:
                for lineno, line in enumerate(f):
                    c = line.strip().split('\t')
                    if len(c) != 2:
                        raise ValueError('expected 2 columns in adapter file {} (line {}), found {}'.format(filename, lineno + 1, len(c)))
                    adapters[c[0]] = c[1]
            config[key] = adapters
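# An adapter file referenced via 'file:' is expected to contain one
# tab-separated "sample_id<TAB>adapter_sequence" pair per line, e.g.
# (hypothetical sequences):
#   sample1	AGATCGGAAGAGC
#   sample2	TGGAATTCTCGG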
    
def get_preprocess_methods():
    '''Get combinations of preprocess methods for feature selection and feature evaluation
    '''
    preprocess_methods = []
    for batch_removal_method in config['batch_removal_methods']:
        if batch_removal_method in ('ComBat', 'limma'):
            template = 'filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}'
            preprocess_methods += expand(template,
                imputation_method=config['imputation_methods'],
                normalization_method=config['normalization_methods'],
                batch_removal_method=batch_removal_method,
                batch_index=config['batch_index'])
        elif batch_removal_method in ('RUV', 'RUVn', 'null'):
            template = 'filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_1'
            preprocess_methods += expand(template,
                imputation_method=config['imputation_methods'],
                normalization_method=config['normalization_methods'],
                batch_removal_method=batch_removal_method)
    return preprocess_methods
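# For example, with imputation_methods=['scimpute_count'],
# normalization_methods=['TMM'] and batch_removal_methods=['ComBat'] with
# batch_index=1, this yields: ['filter.scimpute_count.Norm_TMM.Batch_ComBat_1']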

def auto_gzip_input(template):
    '''Input function that automatically detects gzipped input files
    '''
    def get_filename(wildcards):
        gzip_names = expand(template + '.gz', **wildcards)
        if all(os.path.exists(f) for f in gzip_names):
            return gzip_names
        original_names = expand(template, **wildcards)
        # otherwise fall back to the uncompressed file names
        return original_names
        
    return get_filename
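# Usage sketch (hypothetical rule and path): prefer clean.fa.gz over clean.fa
# when the gzipped file already exists.
#   rule example:
#       input:
#           auto_gzip_input('{output_dir}/unmapped/{sample_id}/clean.fa')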

def parse_fastqc_data(fp):
    '''Parse a FastQC fastqc_data.txt file object (opened in binary mode)

    Returns an OrderedDict with the Basic Statistics entries followed by the
    pass/warn/fail status of each QC module
    '''
    section = None
    qc_status = OrderedDict()
    basic_statistics = OrderedDict()
    for line in fp:
        line = str(line, encoding='utf-8')
        line = line.strip()
        if line.startswith('>>'):
            if line == '>>END_MODULE':
                continue
            section, status = line[2:].split('\t')
            qc_status[section] = status
        else:
            if section == 'Basic Statistics':
                key, val = line.split('\t')
                basic_statistics[key] = val
    for key, val in qc_status.items():
        basic_statistics[key] = val
    return basic_statistics
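# Expected fastqc_data.txt layout (module sections delimited by '>>'), e.g.
# (hypothetical values):
#   >>Basic Statistics	pass
#   #Measure	Value
#   Total Sequences	20000
#   >>END_MODULE
#   >>Per base sequence quality	warn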
    
def get_input_matrix(wildcards):
    # Use RPM for small RNA
    if config['small_rna']:
        return '{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt'.format(**wildcards)
    # Use RPKM for long RNA
    else:
        return '{output_dir}/rpkm/{preprocess_method}.{count_method}.txt'.format(**wildcards)
    
def get_known_biomarkers():
    biomarkers = []
    for filename in glob(os.path.join(data_dir, 'known_biomarkers', '*/*.txt')):
        c = filename.split('/')
        compare_group = c[-2]
        feature_set = os.path.splitext(c[-1])[0]
        biomarkers.append((compare_group, feature_set))
    return biomarkers
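# Known biomarker lists are expected at
# {data_dir}/known_biomarkers/{compare_group}/{feature_set}.txt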

def to_list(obj):
    '''Wrap objects as a list
    '''
    if isinstance(obj, list):
        return obj
    if isinstance(obj, str) or isinstance(obj, bytes):
        return [obj]
    return list(obj)
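# e.g. to_list('abc') -> ['abc'], to_list(('a', 'b')) -> ['a', 'b'],
# to_list(['a']) is returned unchanged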

def get_cutadapt_adapter_args(wildcards, adapter, option):
    '''Build cutadapt adapter arguments for a sample

    Parameters:
        adapter: adapter config value (a string, a list of strings, or a
            dict mapping sample IDs to adapter sequences)
        option: cutadapt option to prepend to each adapter sequence

    Returns:
        arguments: command-line arguments for cutadapt
    '''
    if isinstance(adapter, str):
        if len(adapter) == 0:
            return ''
        else:
            return option + ' ' + adapter
    elif isinstance(adapter, list):
        return ' '.join(expand(option + ' {a}', a=adapter))
    elif isinstance(adapter, dict):
        adapter_seq = adapter.get(wildcards.sample_id)
        if adapter_seq is None:
            raise ValueError('adapter sequence is not found for sample {}'.format(wildcards.sample_id))
        return option + ' ' + adapter_seq
    else:
        return ''
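# Examples (hypothetical adapter sequences):
#   get_cutadapt_adapter_args(wildcards, 'AGATCGGAAGAGC', '-a')
#     -> '-a AGATCGGAAGAGC'
#   get_cutadapt_adapter_args(wildcards, ['AGATCGG', 'TGGAATT'], '-a')
#     -> '-a AGATCGG -a TGGAATT'
#   get_cutadapt_adapter_args(wildcards, {'sample1': 'AGATCGG'}, '-a')
#     -> '-a AGATCGG' when wildcards.sample_id == 'sample1'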
        
def get_library_size_small(summary_file, sample_id):
    '''Get library size for a sample from summary file
    '''
    columns = {}
    data = {}
    with open(summary_file, 'r') as f:
        for lineno, line in enumerate(f):
            c = line.strip().split('\t')
            if lineno == 0:
                columns = {c[i]:i for i in range(1, len(c))}
                column = columns[sample_id]
            else:
                data[c[0]] = int(c[column])
    library_size = data['clean.unmapped'] - data['other.unmapped']
    # remove these types from library size calculation
    for key in ['spikein.mapped', 'univec.mapped', 'rRNA.mapped']:
        if key in data:
            library_size -= data[key]
    return library_size
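# The summary file is expected to be a TSV with read categories in the first
# column and one column per sample, e.g. (hypothetical counts):
#   	sample1	sample2
#   clean.unmapped	100000	120000
#   other.unmapped	2000	1500
#   rRNA.mapped	5000	4000
# giving library_size = clean.unmapped - other.unmapped
# minus spikein.mapped, univec.mapped and rRNA.mapped when present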

# template for nbconvert
nbconvert_command = '''cp {input.jupyter} {output.jupyter}
jupyter nbconvert --execute --to html \
    --HTMLExporter.exclude_code_cell=False \
    --HTMLExporter.exclude_input_prompt=True \
    --HTMLExporter.exclude_output_prompt=True \
    {output.jupyter}
'''
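# Intended for report rules that define input.jupyter and output.jupyter
# (hypothetical sketch):
#   rule render_report:
#       input: jupyter='templates/report.ipynb'
#       output: jupyter=output_dir + '/report.ipynb'
#       shell: nbconvert_command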

# prepend singularity wrapper scripts to PATH so they shadow the native tools
use_singularity = config.get('use_singularity')
if use_singularity:
    os.environ['PATH'] = config['container']['wrapper_dir'] + ':' + os.environ['PATH']

sub_matrix_regex = '(mirna_only)'
# known count methods: featurecounts, htseq, transcript, mirna_and_long_fragments, featurecounts_lncrna
# accept any count method name that does not contain a dot
count_method_regex = r'[^\.]+'
imputation_method_regex = '(scimpute_count)|(viper_count)|(null)'
normalization_method_regex = '(SCnorm)|(TMM)|(RLE)|(CPM)|(CPM_top)|(CPM_rm)|(CPM_refer)|(UQ)|(null)'
batch_removal_method_with_batchinfo_regex = '(ComBat)|(limma)'
batch_removal_method_without_batchinfo_regex = '(RUV)|(RUVn)|(null)'

# whether batch information is provided
has_batch_info = os.path.isfile(os.path.join(data_dir, 'batch_info.txt'))
# clustering scores for normalization, feature_selection and evaluate_features
clustering_scores = ['uca_score']
if has_batch_info:
    #clustering_scores += ['kbet_score', 'combined_score', 'knn_score']
    clustering_scores += ['combined_score', 'knn_score']