--- /dev/null
+++ b/exseek/snakefiles/common.snakemake
@@ -0,0 +1,242 @@
+shell.prefix('set -x; set -e;')
+import os
+import yaml
+import re
+from glob import glob
+from collections import OrderedDict
+
+def require_variable(variable, condition=None):
+    '''Return a required config variable, optionally checking that it is an
+    existing directory (condition='input_dir') or file (condition='input_file')
+    '''
+    value = config.get(variable)
+    if value is None:
+        raise ValueError('configuration variable "{}" is required'.format(variable))
+    if (condition == 'input_dir') and (not os.path.isdir(value)):
+        raise ValueError('cannot find input directory {}: {}'.format(variable, value))
+    elif (condition == 'input_file') and (not os.path.isfile(value)):
+        raise ValueError('cannot find input file {}: {}'.format(variable, value))
+    return value
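+# Example (hypothetical values): with config = {'genome_dir': 'genome/hg38'},
+# require_variable('genome_dir') returns 'genome/hg38', while
+# require_variable('missing_key') raises ValueError.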
+
+def get_config_file(filename):
+    '''Search config_dirs in order and return the first existing file with
+    the given name
+    '''
+    for config_dir in config_dirs:
+        if os.path.isfile(os.path.join(config_dir, filename)):
+            return os.path.join(config_dir, filename)
+
+# setup global variables
+package_dir = require_variable('package_dir')
+root_dir = require_variable('root_dir', 'input_dir')
+config_dirs = require_variable('config_dirs')
+config_dirs = config_dirs.split(':')
+# load default config
+with open(get_config_file('default_config.yaml'), 'r') as f:
+    default_config = yaml.safe_load(f)
+# load selector-classifier settings for machine learning
+with open(get_config_file('machine_learning.yaml'), 'r') as f:
+    default_config['machine_learning'] = yaml.safe_load(f)
+
+default_config.update(config)
+config = default_config
+
+dataset = config['dataset']
+data_dir = require_variable('data_dir', 'input_dir')
+genome_dir = require_variable('genome_dir')
+bin_dir = require_variable('bin_dir', 'input_dir')
+output_dir = require_variable('output_dir')
+rna_types = require_variable('rna_types')
+tools_dir = require_variable('tools_dir')
+temp_dir = require_variable('temp_dir')
+# create temp_dir if it does not exist
+os.makedirs(temp_dir, exist_ok=True)
+
+# read sample ids from file
+with open(os.path.join(data_dir, 'sample_ids.txt'), 'r') as f:
+    sample_ids = f.read().split()
+for sample_id in sample_ids:
+    if '.' in sample_id:
+        raise ValueError('"." is not allowed in sample ID: {}'.format(sample_id))
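+# sample_ids.txt is expected to contain one whitespace-separated sample ID
+# per line (illustrative):
+#   sample1
+#   sample2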
+
+# rna types with annotation gtf
+rna_types_with_gtf = []
+for rna_type in config['rna_types']:
+    if rna_type not in ('univec', 'rRNA', 'spikein'):
+        rna_types_with_gtf.append(rna_type)
+
+# long RNA types
+long_rna_types = list(filter(lambda x: x not in ('miRNA', 'piRNA', 'rRNA', 'spikein', 'univec'), config['rna_types']))
+
+# read adapter sequences
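+# When a value starts with 'file:', adapters are read from a two-column
+# tab-separated file mapping sample IDs to sequences (illustrative):
+#   sample1<TAB>AGATCGGAAGAGC
+#   sample2<TAB>TGGAATTCTCGG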
+for key in ('adaptor', 'adaptor_5p'):
+    if isinstance(config[key], str):
+        if config[key].startswith('file:'):
+            filename = config[key][5:].strip()
+            adapters = {}
+            with open(filename, 'r') as f:
+                for lineno, line in enumerate(f):
+                    c = line.strip().split('\t')
+                    if len(c) != 2:
+                        raise ValueError('expected 2 columns in adapter file {}, found {}'.format(filename, len(c)))
+                    adapters[c[0]] = c[1]
+            config[key] = adapters
+
+def get_preprocess_methods():
+    '''Get combinations of preprocess methods for feature selection and feature evaluation
+    '''
+    preprocess_methods = []
+    for batch_removal_method in config['batch_removal_methods']:
+        if batch_removal_method in ('ComBat', 'limma'):
+            template = 'filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}'
+            preprocess_methods += expand(template,
+                imputation_method=config['imputation_methods'],
+                normalization_method=config['normalization_methods'],
+                batch_removal_method=batch_removal_method,
+                batch_index=config['batch_index'])
+        elif batch_removal_method in ('RUV', 'RUVn', 'null'):
+            template = 'filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_1'
+            preprocess_methods += expand(template,
+                imputation_method=config['imputation_methods'],
+                normalization_method=config['normalization_methods'],
+                batch_removal_method=batch_removal_method)
+    return preprocess_methods
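+# Example entry (hypothetical config values): imputation 'null',
+# normalization 'TMM', batch removal 'ComBat' and batch index 1 yield
+# 'filter.null.Norm_TMM.Batch_ComBat_1'.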
+
+def auto_gzip_input(template):
+    '''Input function that automatically detects gzipped input: use
+    {template}.gz if all gzipped files exist, otherwise fall back to the
+    original file names
+    '''
+    def get_filename(wildcards):
+        gzip_names = expand(template + '.gz', **wildcards)
+        if all(os.path.exists(f) for f in gzip_names):
+            return gzip_names
+        return expand(template, **wildcards)
+
+    return get_filename
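+# Example usage in a rule (sketch; the path template is illustrative):
+#   rule example:
+#       input:
+#           auto_gzip_input('{output_dir}/unmapped/{sample_id}.fa')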
+
+def parse_fastqc_data(fp):
+    '''Parse a FastQC fastqc_data.txt file object (binary mode) and return
+    an OrderedDict with Basic Statistics entries and per-module QC status
+    '''
+    section = None
+    qc_status = OrderedDict()
+    basic_statistics = OrderedDict()
+    for line in fp:
+        line = str(line, encoding='utf-8')
+        line = line.strip()
+        if line.startswith('>>'):
+            if line == '>>END_MODULE':
+                continue
+            section, status = line[2:].split('\t')
+            qc_status[section] = status
+        else:
+            if section == 'Basic Statistics':
+                key, val = line.split('\t')
+                basic_statistics[key] = val
+    for key, val in qc_status.items():
+        basic_statistics[key] = val
+    return basic_statistics
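+# Expected fastqc_data.txt fragment (file object is opened in binary mode,
+# hence the utf-8 decoding above; values are illustrative):
+#   >>Basic Statistics<TAB>pass
+#   Filename<TAB>sample1.fastq.gz
+#   Total Sequences<TAB>1000000
+#   >>END_MODULE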
+
+def get_input_matrix(wildcards):
+    # Use RPM for small RNA
+    if config['small_rna']:
+        return '{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt'.format(**wildcards)
+    # Use RPKM for long RNA
+    else:
+        return '{output_dir}/rpkm/{preprocess_method}.{count_method}.txt'.format(**wildcards)
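+# Example resolved path (hypothetical wildcard values):
+#   'output/dataset1/matrix_processing/filter.null.Norm_TMM.Batch_ComBat_1.featurecounts.txt'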
+
+def get_known_biomarkers():
+    '''Scan known_biomarkers/ under data_dir and return a list of
+    (compare_group, feature_set) tuples
+    '''
+    biomarkers = []
+    for filename in glob(os.path.join(data_dir, 'known_biomarkers', '*/*.txt')):
+        c = filename.split('/')
+        compare_group = c[-2]
+        feature_set = os.path.splitext(c[-1])[0]
+        biomarkers.append((compare_group, feature_set))
+    return biomarkers
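+# Expected layout (illustrative): one feature list per file under
+#   {data_dir}/known_biomarkers/<compare_group>/<feature_set>.txt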
+
+def to_list(obj):
+    '''Wrap an object in a list if it is not already a list
+    '''
+    if isinstance(obj, list):
+        return obj
+    if isinstance(obj, str) or isinstance(obj, bytes):
+        return [obj]
+    return list(obj)
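+# Examples: to_list('a') -> ['a'], to_list(['a', 'b']) -> ['a', 'b'],
+# to_list(range(2)) -> [0, 1]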
+
+def get_cutadapt_adapter_args(wildcards, adapter, option):
+    '''Build cutadapt adapter arguments for a sample
+
+    Parameters:
+        adapter: adapter config value (str, list, or dict mapping sample IDs
+            to sequences)
+        option: cutadapt option to prepend to each adapter sequence (e.g. -a)
+
+    Returns:
+        arguments: command-line arguments for cutadapt
+    '''
+    if isinstance(adapter, str):
+        if len(adapter) == 0:
+            return ''
+        else:
+            return option + ' ' + adapter
+    elif isinstance(adapter, list):
+        return ' '.join(expand(option + ' {a}', a=adapter))
+    elif isinstance(adapter, dict):
+        adapter_seq = adapter.get(wildcards.sample_id)
+        if adapter_seq is None:
+            raise ValueError('adapter sequence is not found for sample {}'.format(wildcards.sample_id))
+        return option + ' ' + adapter_seq
+    else:
+        return ''
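+# Example (hypothetical values): get_cutadapt_adapter_args(wildcards,
+# 'AGATCGGAAGAGC', '-a') returns '-a AGATCGGAAGAGC'; with a dict value, the
+# sequence is looked up by wildcards.sample_id.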
+
+def get_library_size_small(summary_file, sample_id):
+    '''Get library size for a sample from summary file
+    '''
+    columns = {}
+    data = {}
+    with open(summary_file, 'r') as f:
+        for lineno, line in enumerate(f):
+            c = line.strip().split('\t')
+            if lineno == 0:
+                columns = {c[i]:i for i in range(1, len(c))}
+                column = columns[sample_id]
+            else:
+                data[c[0]] = int(c[column])
+    library_size = data['clean.unmapped'] - data['other.unmapped']
+    # remove these types from library size calculation
+    for key in ['spikein.mapped', 'univec.mapped', 'rRNA.mapped']:
+        if key in data:
+            library_size -= data[key]
+    return library_size
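+# Expected summary file layout (illustrative; first column is a category,
+# remaining columns are samples):
+#   <TAB>sample1<TAB>sample2
+#   clean.unmapped<TAB>900000<TAB>850000
+#   other.unmapped<TAB>50000<TAB>40000
+#   rRNA.mapped<TAB>10000<TAB>8000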
+
+# template for nbconvert
+nbconvert_command = '''cp {input.jupyter} {output.jupyter}
+jupyter nbconvert --execute --to html \
+    --HTMLExporter.exclude_code_cell=False \
+    --HTMLExporter.exclude_input_prompt=True \
+    --HTMLExporter.exclude_output_prompt=True \
+    {output.jupyter}
+'''
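+# Sketch of intended usage (rule name and paths are hypothetical):
+#   rule report:
+#       input: jupyter='templates/report.ipynb'
+#       output: jupyter=os.path.join(output_dir, 'report.ipynb')
+#       shell: nbconvert_command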
+
+# prepend singularity wrapper directory to PATH so wrapped tools are found first
+use_singularity = config.get('use_singularity')
+if use_singularity:
+    os.environ['PATH'] = config['container']['wrapper_dir'] + ':' + os.environ['PATH']
+
+sub_matrix_regex = '(mirna_only)'
+# allow any count method name without a dot (supersedes the stricter
+# '(featurecounts)|(htseq)|(transcript)|...' pattern)
+count_method_regex = r'[^\.]+'
+imputation_method_regex = '(scimpute_count)|(viper_count)|(null)'
+normalization_method_regex = '(SCnorm)|(TMM)|(RLE)|(CPM)|(CPM_top)|(CPM_rm)|(CPM_refer)|(UQ)|(null)'
+batch_removal_method_with_batchinfo_regex = '(ComBat)|(limma)'
+batch_removal_method_without_batchinfo_regex = '(RUV)|(RUVn)|(null)'
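+# These patterns are presumably consumed as wildcard_constraints in the
+# snakefiles, e.g. (sketch):
+#   wildcard_constraints:
+#       count_method=count_method_regex,
+#       imputation_method=imputation_method_regex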
+
+# batch information is provided
+has_batch_info = os.path.isfile(os.path.join(data_dir, 'batch_info.txt'))
+# clustering scores for normalization, feature_selection and evaluate_features
+clustering_scores = ['uca_score']
+if has_batch_info:
+    # kbet_score is currently disabled
+    clustering_scores += ['combined_score', 'knn_score']