# exseek/snakefiles/normalization.snakemake
include: 'common.snakemake'

import re

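# Normalization methods to try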
config['normalization_methods'] = ["TMM", "RLE", "CPM", "CPM_top"]
# Batch effect removal methods to try (set "null" to skip batch effect removal)
config['batch_removal_methods'] = ["null", "RUV", "RUVn"]
# ComBat and limma need explicit batch labels, so they are only tried
# when batch information is available
if has_batch_info:
    config['batch_removal_methods'] += ['ComBat', 'limma']
config['imputation_methods'] = ['null']

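# Build the full target list for `rule all`: every combination of imputation,
# normalization and batch-removal method applied to each count matrix, e.g.
# filter.null.Norm_TMM.Batch_ComBat_1.mirna_and_domains_rna.txt (assuming
# batch_index 1), plus the per-score lists of selected preprocessing methods.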
def get_all_inputs(wildcards):
    available_inputs = []
    available_inputs += expand('{output_dir}/matrix_processing/filter.{count_method}.txt',
            output_dir=output_dir, count_method=config['count_method'])
    for batch_removal_method in config['batch_removal_methods']:
        template = '{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}.txt'
        available_inputs += expand(template,
            output_dir=output_dir,
            imputation_method=config['imputation_methods'],
            normalization_method=config['normalization_methods'],
            batch_removal_method=batch_removal_method,
            batch_index=config['batch_index'],
            count_method=config['count_method'])
    available_inputs += expand('{output_dir}/select_preprocess_method/{score}/{count_method}/selected_methods.txt',
        output_dir=output_dir, score=clustering_scores, count_method=config['count_method'])
    return available_inputs

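# Request every preprocessed matrix and method-selection summary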
rule all:
    input:
        get_all_inputs

#include: 'rules/filter.snakemake'

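# Step 1: filter lowly expressed features. Thresholds <= 0 disable the
# corresponding filter; small RNA matrices are filtered on CPM, long RNA
# on RPKM. The actual filtering logic lives in {bin_dir}/matrix-process.R.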
rule filter_step:
    input:
        matrix='{output_dir}/count_matrix/{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{count_method}.txt'
    threads:
        config['threads']
    run:
        command = '''Rscript {bin_dir}/matrix-process.R -s filter \
            -c {input.sample_classes} \
            -i {input.matrix} \
            -o {output.matrix} \
            -p {threads}'''
        if config['filtercount'] > 0:
            command += ' --filtercount {}'.format(config['filtercount'])
        if config['filterexpv'] > 0:
            # small RNA libraries are filtered on CPM, long RNA on RPKM
            if config['small_rna']:
                command += ' --filtercpm {}'.format(config['filterexpv'])
            else:
                command += ' --filterrpkm {}'.format(config['filterexpv'])
        command += ' --filtersample {}'.format(config['filtersample'])
        shell(command)

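# Step 2: imputation. Only the pass-through 'null' method is enabled in
# config['imputation_methods'] above; any other imputer supported by
# matrix-process.R plugs in via the {imputation_method} wildcard.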
rule imputation_step:
    input:
        matrix='{output_dir}/matrix_processing/filter.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.{count_method}.txt'
    threads:
        config['threads']
    params:
        imputecluster=5,
        temp_dir='{output_dir}/matrix_processing/filter.{imputation_method}.{count_method}.tmp'
    wildcard_constraints:
        imputation_method=imputation_method_regex,
        count_method=count_method_regex
    shell:
        '''Rscript {bin_dir}/matrix-process.R -s imputation \
        -i {input.matrix} \
        -c {input.sample_classes} \
        -o {output.matrix} \
        --temp-dir {params.temp_dir} \
        --method {wildcards.imputation_method} \
        --imputecluster {params.imputecluster} \
        -p {threads}
        '''

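# Step 3: normalization (TMM, RLE, CPM or CPM_top). normtopk, cvthreshold and
# remove-gene-types tune reference-feature selection for methods that need it,
# e.g. CPM_top's top-k features; exact semantics are defined in matrix-process.R.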
rule normalization_step:
    input:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.{count_method}.txt'
    threads:
        config['threads']
    wildcard_constraints:
        normalization_method=normalization_method_regex,
        count_method=count_method_regex
    params:
        cvthreshold=0.5,
        remove_gene_types='miRNA,piRNA',
        normtopk=20
    shell:
        '''Rscript {bin_dir}/matrix-process.R -s normalization \
        -i {input.matrix} \
        -o {output.matrix} \
        -c {input.sample_classes} \
        -p {threads} \
        --method {wildcards.normalization_method} \
        --normtopk {params.normtopk} \
        --remove-gene-types {params.remove_gene_types} \
        --cvthreshold {params.cvthreshold}
        '''

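# Project the combined small-RNA matrix onto miRNA features only, keeping the
# header row and any row whose feature ID matches /miRNA/.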
rule sub_matrix_filter:
    input:
        '{output_dir}/matrix_processing/filter.mirna_and_domains_rna.txt'
    output:
        '{output_dir}/matrix_processing/filter.mirna_only.txt'
    shell:
        r'''awk 'BEGIN{{OFS="\t";FS="\t"}}(NR==1)||($1 ~/miRNA/){{print}}' {input} > {output}
        '''

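# Same miRNA-only projection for fully preprocessed (normalized and
# batch-corrected) matrices.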
rule sub_matrix:
    input:
        '{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.mirna_and_domains_rna.txt'
    output:
        '{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.mirna_only.txt'
    shell:
        r'''awk 'BEGIN{{OFS="\t";FS="\t"}}(NR==1)||($1 ~/miRNA/){{print}}' {input} > {output}
        '''

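# Step 4a: batch removal with methods that need explicit batch labels
# (ComBat, limma); --batch-index selects the batch variable in batch_info.txt.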
rule batch_removal_step_with_batchinfo:
    input:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt',
        batch_info=data_dir + '/batch_info.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}.txt'
    threads:
        config['threads']
    wildcard_constraints:
        batch_removal_method=batch_removal_method_with_batchinfo_regex
    shell:
        '''Rscript {bin_dir}/matrix-process.R -s batch_removal \
        -i {input.matrix} \
        -c {input.sample_classes} \
        -b {input.batch_info} \
        -o {output.matrix} \
        -p {threads} \
        --method {wildcards.batch_removal_method} \
        --batch-index {wildcards.batch_index}
        '''

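# Step 4b: batch removal with methods that estimate unwanted variation from
# the data itself (the RUV variants), so no batch_info.txt is needed. The
# {batch_index} wildcard is kept in the output name only to preserve the
# uniform naming scheme.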
rule batch_removal_step_without_batchinfo:
    input:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}.txt'
    threads:
        config['threads']
    wildcard_constraints:
        batch_removal_method=batch_removal_method_without_batchinfo_regex
    shell:
        '''Rscript {bin_dir}/matrix-process.R -s batch_removal \
        -i {input.matrix} \
        -c {input.sample_classes} \
        -o {output.matrix} \
        -p {threads} \
        --method {wildcards.batch_removal_method}
        '''

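# uca_score: clustering score on the log-transformed, transposed matrix
# measuring how well samples separate by sample class (higher is better);
# see feature_selection.py for the exact definition.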
rule uca_score:
    input:
        matrix='{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        '{output_dir}/clustering_scores/uca_score/{count_method}/{preprocess_method}'
    shell:
        '''{bin_dir}/feature_selection.py calculate_clustering_score \
        --matrix {input.matrix} --method uca_score \
        --sample-classes {input.sample_classes} --transpose --use-log > {output}
        '''

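# knn_score: measures how strongly nearest neighbors share a batch; high
# values indicate residual batch effect, which is why combined_score below
# uses (1 - knn_score).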
rule knn_score:
    input:
        matrix='{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt',
        batch=data_dir + '/batch_info.txt'
    output:
        '{output_dir}/clustering_scores/knn_score/{count_method}/{preprocess_method}'
    params:
        batch_index=config['batch_index']
    shell:
        '''{bin_dir}/feature_selection.py calculate_clustering_score \
        --matrix {input.matrix} --method knn_score \
        --batch {input.batch} --batch-index {params.batch_index} --transpose --use-log > {output}
        '''

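# kBET (k-nearest-neighbour batch-effect test): an R-based estimate of how
# well batches are mixed after correction.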
rule kbet_score:
    input:
        matrix='{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt',
        batch=data_dir + '/batch_info.txt'
    output:
        '{output_dir}/clustering_scores/kbet_score/{count_method}/{preprocess_method}'
    params:
        batch_index=config['batch_index']
    shell:
        '''{bin_dir}/calculate_kbet_score.R --matrix {input.matrix} \
        --batch {input.batch} --batch-index {params.batch_index} -o {output}
        '''

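# Combine class separation and batch mixing into one number:
# combined_score = 0.5 * ((1 - knn_score) + uca_score)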
rule combined_score:
    input:
        knn_score='{output_dir}/clustering_scores/knn_score/{count_method}/{preprocess_method}',
        clustering_score='{output_dir}/clustering_scores/uca_score/{count_method}/{preprocess_method}'
    output:
        '{output_dir}/clustering_scores/combined_score/{count_method}/{preprocess_method}'
    run:
        with open(input.knn_score, 'r') as f:
            knn_score = float(f.read().strip())
        with open(input.clustering_score, 'r') as f:
            clustering_score = float(f.read().strip())
        combined_score = 0.5*((1.0 - knn_score) + clustering_score)
        with open(output[0], 'w') as f:
            f.write(str(combined_score))

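# Rank all preprocessing pipelines by the chosen score and keep the top
# n_selected_preprocess_method; summary.txt stores the full ranking,
# selected_methods.txt the selected pipeline names (one per line).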
rule select_preprocess_method:
    input:
        lambda wildcards: expand('{output_dir}/clustering_scores/{score}/{count_method}/{preprocess_method}',
            output_dir=wildcards.output_dir, score=wildcards.score,
            preprocess_method=get_preprocess_methods(), count_method=wildcards.count_method)
    output:
        summary='{output_dir}/select_preprocess_method/{score}/{count_method}/summary.txt',
        selected_methods='{output_dir}/select_preprocess_method/{score}/{count_method}/selected_methods.txt'
    params:
        n_selected_preprocess_method=3
    run:
        import pandas as pd

        scores = {}
        for score_file in input:
            # the preprocess method is encoded as the last path component
            preprocess_method = score_file.split('/')[-1]
            with open(score_file, 'r') as f:
                score = float(f.read().strip())
                scores[preprocess_method] = score
        scores = pd.Series(scores)
        scores.index.name = 'preprocess_method'
        scores.name = wildcards.score
        scores = scores.sort_values(ascending=False)
        scores.to_csv(output.summary, sep='\t', na_rep='NA', index=True, header=True)

        selected_methods = scores.index.to_series()[:params.n_selected_preprocess_method]
        selected_methods.to_csv(output.selected_methods, index=False, header=False)