exseek/snakefiles/evaluate_features.snakemake

include: 'common.snakemake'

import os
import yaml

# Load the comparison groups and the cross-validation configuration.
# yaml.safe_load replaces the deprecated, unsafe default loader of yaml.load.
with open(data_dir + '/compare_groups.yaml', 'r') as f:
    compare_groups = yaml.safe_load(f)
with open(get_config_file('evaluate_features.yaml'), 'r') as f:
    cv_config = yaml.safe_load(f)
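
# For reference, hypothetical examples of the two files loaded above
# (key names follow what this Snakefile reads; all values are made up):
#
#   # compare_groups.yaml: maps a comparison name to its
#   # [negative_class, positive_class] pair of sample labels.
#   Normal-CRC: [Normal, CRC]
#
#   # evaluate_features.yaml: global CV settings plus one entry per classifier.
#   cv_params: {n_splits: 50, test_size: 0.2}
#   sample_weight: balanced
#   classifier_grid_search_params: {cv: 5}
#   classifiers:
#     logistic_regression:
#       classifier: LogisticRegression
#       classifier_params: {C: 1.0}
#       grid_search: true
#       grid_search_params: {param_grid: {C: [0.01, 0.1, 1.0]}}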

classifiers = list(cv_config['classifiers'].keys())

# Enumerate all evaluation targets: one output directory per
# (comparison group, known-biomarker feature set, classifier) combination.
inputs = {'evaluate_features': []}
for compare_group, feature_set in get_known_biomarkers():
    inputs['evaluate_features'] += expand(
        '{output_dir}/evaluate_features/{compare_group}/{feature_set}/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}/{classifier}',
        output_dir=output_dir,
        imputation_method=config['imputation_method'],
        normalization_method=config['normalization_method'],
        batch_removal_method=config['batch_removal_method'],
        batch_index=config['batch_index'],
        count_method=config['count_method'],
        classifier=classifiers,
        compare_group=compare_group,
        feature_set=feature_set)
inputs['summarize_evaluate_features'] = expand(
    '{output_dir}/summary/evaluate_features/{summary_name}.txt',
    output_dir=output_dir, summary_name=['metrics.train', 'metrics.test'])
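
# A fully expanded target path might look like this (all parameter values
# below are hypothetical, for illustration only):
#   output/evaluate_features/Normal-CRC/panel1/filter.scimpute_count.Norm_TMM.Batch_limma_1.featurecounts/logistic_regression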

rule all:
    input:
        # inputs is a dict of target lists; unpack() expands it into
        # named inputs of the all rule.
        unpack(lambda wildcards: inputs)

rule preprocess_features:
    input:
        '{output_dir}/evaluate_features/matrix/{compare_group}/{feature_set}.txt'
    output:
        '{output_dir}/evaluate_features/preprocess_features/{compare_group}/{feature_set}.txt'
    params:
        scaler=config['scale_method']
    shell:
        '''{bin_dir}/feature_selection.py preprocess_features -i {input} --scaler {params.scaler} \
        --use-log --transpose -o {output}
        '''
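
# For illustration, with hypothetical paths and scaler, the shell command
# above expands to something like:
#   bin/feature_selection.py preprocess_features \
#       -i output/evaluate_features/matrix/Normal-CRC/panel1.txt \
#       --scaler robust --use-log --transpose \
#       -o output/evaluate_features/preprocess_features/Normal-CRC/panel1.txt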

rule evaluate_features:
    input:
        matrix='{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt',
        features=data_dir + '/known_biomarkers/{compare_group}/{feature_set}.txt'
    output:
        dir=directory('{output_dir}/evaluate_features/{compare_group}/{feature_set}/{preprocess_method}.{count_method}/{classifier}')
    run:
        from copy import deepcopy
        from shlex import quote

        output_config = {}
        # Copy global config parameters.
        for key in ('transpose', 'features', 'cv_params', 'sample_weight', 'preprocess_steps'):
            if key in cv_config:
                output_config[key] = cv_config[key]
        # Copy the per-classifier config.
        classifier_config = deepcopy(cv_config['classifiers'][wildcards.classifier])
        classifier_config['params'] = classifier_config.get('params', {})
        output_config['classifier'] = classifier_config['classifier']
        output_config['classifier_params'] = classifier_config.get('classifier_params', {})
        # Merge global grid-search parameters with per-classifier overrides.
        if classifier_config.get('grid_search', False):
            grid_search_params = deepcopy(cv_config['classifier_grid_search_params'])
            grid_search_params.update(classifier_config.get('grid_search_params', {}))
            output_config['grid_search'] = True
            output_config['grid_search_params'] = grid_search_params
        # Write the merged config into the output directory.
        if not os.path.isdir(output.dir):
            os.makedirs(output.dir)
        output_config_file = os.path.join(output.dir, 'config.yaml')
        with open(output_config_file, 'w') as f:
            yaml.dump(output_config, f, default_flow_style=False)
        # Build the command; shlex.quote protects class names and paths
        # that contain spaces or shell metacharacters.
        command = [
            os.path.join(config['bin_dir'], 'machine_learning.py'), 'run_pipeline',
            '--matrix', input.matrix,
            '--sample-classes', input.sample_classes,
            '--output-dir', output.dir,
            '--features', input.features,
            '--positive-class', compare_groups[wildcards.compare_group][1],
            '--negative-class', compare_groups[wildcards.compare_group][0],
            '--config', output_config_file
        ]
        shell(' '.join(quote(str(c)) for c in command))
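
# The config.yaml written by the run block might look like this
# (hypothetical values, for illustration only):
#   cv_params: {n_splits: 50, test_size: 0.2}
#   sample_weight: balanced
#   classifier: LogisticRegression
#   classifier_params: {C: 1.0}
#   grid_search: true
#   grid_search_params: {param_grid: {C: [0.01, 0.1, 1.0]}}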

# Earlier implementation, disabled by the triple-quoted string and kept for
# reference: it called the cross_validation subcommand and passed every
# option on the command line instead of through a generated config file.
"""
rule evaluate_features:
    input:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt',
        features=data_dir + '/known_biomarkers/{compare_group}/{feature_set}.txt'
    output:
        directory('{output_dir}/evaluate_features/{compare_group}/{feature_set}/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}/{classifier}')
    params:
        count_method=count_method_regex
    run:
        import json
        import os
        import subprocess
        from shlex import quote

        command = [
            os.path.join(config['bin_dir'], 'machine_learning.py'), 'cross_validation',
            '--matrix', input.matrix,
            '--sample-classes', input.sample_classes,
            '--output-dir', output[0],
            '--transpose',
            '--positive-class', compare_groups[wildcards.compare_group][1],
            '--negative-class', compare_groups[wildcards.compare_group][0],
            '--cv-params', json.dumps(config['cv_params']),
            '--selector', 'null',
            '--features', input.features
        ]
        if config['log_transform']:
            command += ['--log-transform', '--log-transform-params',
                        json.dumps(config['log_transform_params'])]
        if config['scaler']:
            command += ['--scaler', config['scaler'], '--scaler-params',
                        json.dumps(config['scaler_params'].get(config['scaler'], {}))]
        #if config['grid_search']:
        #    command += ['--grid-search', '--grid-search-params', json.dumps(config['grid_search_params'])]
        if config['sample_weight']:
            command += ['--sample-weight', config['sample_weight']]
        command += ['--classifier', wildcards.classifier,
                    '--classifier-params',
                    json.dumps(config['classifier_params'].get(wildcards.classifier, {}))]
        command = list(map(str, command))
        print(' '.join(map(quote, command)))
        subprocess.check_call(command)
"""

rule summarize_evaluate_features:
    input:
        input_dir=inputs['evaluate_features']
    output:
        # {cross_validation} matches the 'evaluate_features' subdirectory
        # used by the summary targets defined at the top of this file.
        metrics_test='{output_dir}/summary/{cross_validation}/metrics.test.txt',
        metrics_train='{output_dir}/summary/{cross_validation}/metrics.train.txt'
    script:
        'scripts/summarize_cross_validation.py'