# exseek/snakefiles/evaluate_features.snakemake
# Evaluate known biomarker feature sets with cross-validated classifiers.
1
include: 'common.snakemake'

import os
import yaml

# Comparison group definitions: maps group name -> [negative_class, positive_class].
# safe_load: these are plain data files; yaml.load without a Loader is
# deprecated (PyYAML >= 5.1) and unsafe on untrusted input.
with open(data_dir + '/compare_groups.yaml', 'r') as f:
    compare_groups = yaml.safe_load(f)
# Cross-validation configuration: global CV params plus one section per classifier.
with open(get_config_file('evaluate_features.yaml'), 'r') as f:
    cv_config = yaml.safe_load(f)

# Names of all classifiers to evaluate each feature set with.
classifiers = list(cv_config['classifiers'].keys())
11
12
# Target paths requested by `rule all`, keyed by pipeline stage.
inputs = {'evaluate_features': []}
# One evaluation directory per (compare group, biomarker set, preprocessing
# combination, classifier).
for group_name, biomarker_set in get_known_biomarkers():
    inputs['evaluate_features'].extend(expand(
        '{output_dir}/evaluate_features/{compare_group}/{feature_set}/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}/{classifier}',
        output_dir=output_dir,
        imputation_method=config['imputation_method'],
        normalization_method=config['normalization_method'],
        batch_removal_method=config['batch_removal_method'],
        batch_index=config['batch_index'],
        count_method=config['count_method'],
        classifier=classifiers,
        compare_group=group_name,
        feature_set=biomarker_set))
# Summary tables aggregated over all evaluation runs (train and test metrics).
inputs['summarize_evaluate_features'] = expand(
    '{output_dir}/summary/evaluate_features/{summary_name}.txt',
    output_dir=output_dir,
    summary_name=['metrics.train', 'metrics.test'])
26
27
28
# Top-level target: request every evaluation directory and both summary tables.
rule all:
    input:
        unpack(lambda wildcards: inputs)
31
32
33
# Scale (and log-transform) a biomarker count matrix before evaluation.
rule preprocess_features:
    input:
        '{output_dir}/evaluate_features/matrix/{compare_group}/{feature_set}.txt'
    output:
        '{output_dir}/evaluate_features/preprocess_features/{compare_group}/{feature_set}.txt'
    params:
        # Scaling method name taken from the global config (passed to --scaler).
        scaler=config['scale_method']
    shell:
        '''{bin_dir}/feature_selection.py preprocess_features -i {input} --scaler {params.scaler} \
            --use-log --transpose -o {output}
        '''
44
45
rule evaluate_features:
46
    input:
47
        matrix='{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt',
48
        sample_classes=data_dir+ '/sample_classes.txt',
49
        features=data_dir + '/known_biomarkers/{compare_group}/{featureset}.txt'
50
    output:
51
        dir=directory('{output_dir}/evaluate_features/{compare_group}/{featureset}/{preprocess_method}.{count_method}/{classifier}')
52
    run:
53
        from copy import deepcopy
54
55
        output_config = {}
56
        # copy global config parameters
57
        for key in ('transpose', 'features', 'cv_params', 'sample_weight', 'preprocess_steps'):
58
            if key in cv_config:
59
                output_config[key] = cv_config[key]
60
        # copy classifier config
61
        classifier_config = deepcopy(cv_config['classifiers'][wildcards.classifier])
62
        classifier_config['params'] = classifier_config.get('params', {})
63
        output_config['classifier'] = classifier_config['classifier']
64
        output_config['classifier_params'] = classifier_config.get('classifier_params', {})
65
        # copy classifier grid search params
66
        if classifier_config.get('grid_search', False):
67
            grid_search_params = deepcopy(cv_config['classifier_grid_search_params'])
68
            grid_search_params.update(classifier_config['grid_search_params'])
69
            # add classifier grid search config
70
            output_config['grid_search'] = True
71
            output_config['grid_search_params'] = grid_search_params
72
        # write output config
73
        if not os.path.isdir(output.dir):
74
            os.makedirs(output.dir)
75
        output_config_file = os.path.join(output.dir, 'config.yaml')
76
        with open(output_config_file, 'w') as f:
77
            yaml.dump(output_config, f, default_flow_style=False)
78
        command = [
79
            os.path.join(config['bin_dir'], 'machine_learning.py'), 'run_pipeline',
80
            '--matrix', input.matrix,
81
            '--sample-classes', input.sample_classes,
82
            '--output-dir', output.dir,
83
            '--features', input.features,
84
            '--positive-class', '"' + compare_groups[wildcards.compare_group][1] + '"',
85
            '--negative-class', '"' + compare_groups[wildcards.compare_group][0] + '"',
86
            '--config', output_config_file
87
        ]
88
        shell(' '.join(command))
89
90
"""
91
rule evaluate_features:
92
    input:
93
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}.txt',
94
        sample_classes=data_dir+ '/sample_classes.txt',
95
        features=data_dir + '/known_biomarkers/{compare_group}/{feature_set}.txt'
96
    output:
97
        directory('{output_dir}/evaluate_features/{compare_group}/{feature_set}/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}/{classifier}')
98
    params:
99
        count_method=count_method_regex
100
    run:
101
        import json
102
        import os
103
        import subprocess
104
        from shlex import quote
105
        from copy import deepcopy
106
107
        command = [
108
            os.path.join(config['bin_dir'], 'machine_learning.py'), 'cross_validation',
109
            '--matrix', input.matrix,
110
            '--sample-classes', input.sample_classes,
111
            '--output-dir', output[0],
112
            '--transpose',
113
            '--positive-class', compare_groups[wildcards.compare_group][1],
114
            '--negative-class', compare_groups[wildcards.compare_group][0],
115
            '--cv-params', json.dumps(config['cv_params']),
116
            '--selector', 'null',
117
            '--features', input.features
118
        ]
119
        if config['log_transform']:
120
            command += ['--log-transform', '--log-transform-params', json.dumps(config['log_transform_params'])]
121
        if config['scaler']:
122
            command += ['--scaler', config['scaler'], '--scaler-params', json.dumps(config['scaler_params'].get(config['scaler'], {}))]
123
        #if config['grid_search']:
124
        #    command += ['--grid-search', '--grid-search-params', json.dumps(config['grid_search_params'])]
125
        if config['sample_weight']:
126
            command += ['--sample-weight', config['sample_weight']]
127
        command += ['--classifier', wildcards.classifier, 
128
            '--classifier-params', json.dumps(config['classifier_params'].get(wildcards.classifier, {}))]
129
        command = list(map(str, command))
130
        print(' '.join(map(quote, command)))
131
        subprocess.check_call(command)
132
"""
133
134
# Aggregate per-job cross-validation metrics into two summary tables
# (train split and test split) under {output_dir}/summary/.
rule summarize_evaluate_features:
    input:
        input_dir=inputs['evaluate_features']
    output:
        metrics_test='{output_dir}/summary/{cross_validation}/metrics.test.txt',
        metrics_train='{output_dir}/summary/{cross_validation}/metrics.train.txt'
    script:
        'scripts/summarize_cross_validation.py'