# exseek/snakefiles/normalization.snakemake
include: 'common.snakemake'

import re

# Normalization methods to try
config['normalization_methods'] = ["TMM", "RLE", "CPM", "CPM_top"]
# Batch effect removal methods to try (set "null" to skip batch effect removal)
config['batch_removal_methods'] = ["null", "RUV", "RUVn"]
# ComBat and limma require known batch labels
if has_batch_info:
    config['batch_removal_methods'] += ['ComBat', 'limma']
config['imputation_methods'] = ['null']

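# Processed matrices are written to {output_dir}/matrix_processing/ with names that
# encode each preprocessing step, e.g.
#   filter.<imputation>.Norm_<normalization>.Batch_<batch_method>_<batch_index>.<count_method>.txt
# get_all_inputs() below enumerates every combination of the methods configured above.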
def get_all_inputs(wildcards):
    available_inputs = []
    available_inputs += expand('{output_dir}/matrix_processing/filter.{count_method}.txt',
        output_dir=output_dir, count_method=config['count_method'])
    for batch_removal_method in config['batch_removal_methods']:
        template = '{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}.txt'
        available_inputs += expand(template,
            output_dir=output_dir,
            imputation_method=config['imputation_methods'],
            normalization_method=config['normalization_methods'],
            batch_removal_method=batch_removal_method,
            batch_index=config['batch_index'],
            count_method=config['count_method'])
    template = '{output_dir}/matrix_processing/filter.{count_method}.txt'
    available_inputs += expand(template,
        output_dir=output_dir,
        count_method=config['count_method'])
    available_inputs += expand('{output_dir}/select_preprocess_method/{score}/{count_method}/selected_methods.txt',
        output_dir=output_dir, score=clustering_scores, count_method=config['count_method'])

    return available_inputs


rule all:
    input:
        get_all_inputs

#include: 'rules/filter.snakemake'

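# Filter out lowly expressed features before any other processing. Thresholds come
# from the config: filtercount (minimum read count), filterexpv (minimum CPM for
# small RNA libraries, otherwise minimum RPKM) and filtersample (how many samples
# must pass the thresholds; the exact semantics are defined by matrix-process.R).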
rule filter_step:
    input:
        matrix='{output_dir}/count_matrix/{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{count_method}.txt'
    threads:
        config['threads']
    run:
        command = '''Rscript {bin_dir}/matrix-process.R -s filter \
            -c {input.sample_classes} \
            -i {input.matrix} \
            -o {output.matrix} \
            -p {threads}'''
        if config['filtercount'] > 0:
            command = command + ' --filtercount {}'.format(config['filtercount'])
        if config['filterexpv'] > 0:
            if config['small_rna']:
                command = command + ' --filtercpm {}'.format(config['filterexpv'])
            else:
                command = command + ' --filterrpkm {}'.format(config['filterexpv'])
        command = command + ' --filtersample {}'.format(config['filtersample'])
        shell(command)

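# Impute missing/zero values in the filtered matrix. With the default
# config['imputation_methods'] = ['null'] this step is presumably a pass-through;
# any other method name is forwarded verbatim to matrix-process.R via --method.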
rule imputation_step:
    input:
        matrix='{output_dir}/matrix_processing/filter.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.{count_method}.txt'
    threads:
        config['threads']
    params:
        imputecluster=5,
        temp_dir='{output_dir}/matrix_processing/filter.{imputation_method}.{count_method}.tmp'
    wildcard_constraints:
        imputation_method=imputation_method_regex,
        count_method=count_method_regex
    shell:
        '''Rscript {bin_dir}/matrix-process.R -s imputation \
        -i {input.matrix} \
        -c {input.sample_classes} \
        -o {output.matrix} \
        --temp-dir {params.temp_dir} \
        --method {wildcards.imputation_method} \
        --imputecluster {params.imputecluster} \
        -p {threads}
        '''

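# Normalize the filtered/imputed matrix with the method named by the
# normalization_method wildcard (TMM, RLE, CPM or CPM_top). The normtopk,
# remove-gene-types and cvthreshold parameters are forwarded to matrix-process.R;
# their exact effect depends on the chosen method.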
rule normalization_step:
    input:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.{count_method}.txt'
    threads:
        config['threads']
    wildcard_constraints:
        normalization_method=normalization_method_regex,
        count_method=count_method_regex
    params:
        cvthreshold=0.5,
        remove_gene_types='miRNA,piRNA',
        normtopk=20
    shell:
        '''Rscript {bin_dir}/matrix-process.R -s normalization \
        -i {input.matrix} \
        -o {output.matrix} \
        -c {input.sample_classes} \
        -p {threads} \
        --method {wildcards.normalization_method} \
        --normtopk {params.normtopk} \
        --remove-gene-types {params.remove_gene_types} \
        --cvthreshold {params.cvthreshold}
        '''

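# The two rules below extract an miRNA-only sub-matrix from the
# "mirna_and_domains_rna" matrices: awk keeps the header row (NR==1) and
# every row whose feature ID (column 1) contains "miRNA".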
rule sub_matrix_filter:
    input:
        '{output_dir}/matrix_processing/filter.mirna_and_domains_rna.txt'
    output:
        '{output_dir}/matrix_processing/filter.mirna_only.txt'
    shell:
        r'''awk 'BEGIN{{OFS="\t";FS="\t"}}(NR==1)||($1 ~/miRNA/){{print}}' {input} > {output}
        '''

rule sub_matrix:
    input:
        '{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.mirna_and_domains_rna.txt'
    output:
        '{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.mirna_only.txt'
    shell:
        r'''awk 'BEGIN{{OFS="\t";FS="\t"}}(NR==1)||($1 ~/miRNA/){{print}}' {input} > {output}
        '''

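# Batch effect removal. Methods matched by batch_removal_method_with_batchinfo_regex
# (e.g. ComBat, limma) need explicit batch labels from batch_info.txt, while the
# remaining methods (null and the RUV variants) work without them, so they are
# handled by two separate rules.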
rule batch_removal_step_with_batchinfo:
    input:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt',
        batch_info=data_dir + '/batch_info.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}.txt'
    threads:
        config['threads']
    wildcard_constraints:
        batch_removal_method=batch_removal_method_with_batchinfo_regex
    shell:
        '''Rscript {bin_dir}/matrix-process.R -s batch_removal \
        -i {input.matrix} \
        -c {input.sample_classes} \
        -b {input.batch_info} \
        -o {output.matrix} \
        -p {threads} \
        --method {wildcards.batch_removal_method} \
        --batch-index {wildcards.batch_index}
        '''

rule batch_removal_step_without_batchinfo:
    input:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        matrix='{output_dir}/matrix_processing/filter.{imputation_method}.Norm_{normalization_method}.Batch_{batch_removal_method}_{batch_index}.{count_method}.txt'
    threads:
        config['threads']
    wildcard_constraints:
        batch_removal_method=batch_removal_method_without_batchinfo_regex
    shell:
        '''Rscript {bin_dir}/matrix-process.R -s batch_removal \
        -i {input.matrix} \
        -c {input.sample_classes} \
        -o {output.matrix} \
        -p {threads} \
        --method {wildcards.batch_removal_method}
        '''

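# Clustering-based evaluation of each preprocessing combination.
# uca_score is computed against the sample class labels, while knn_score and
# kbet_score are computed against the batch labels (i.e. how well batches are
# mixed); the details live in feature_selection.py and calculate_kbet_score.R.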
rule uca_score:
    input:
        matrix='{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt',
        sample_classes=data_dir + '/sample_classes.txt'
    output:
        '{output_dir}/clustering_scores/uca_score/{count_method}/{preprocess_method}'
    shell:
        '''{bin_dir}/feature_selection.py calculate_clustering_score \
        --matrix {input.matrix} --method uca_score \
        --sample-classes {input.sample_classes} --transpose --use-log > {output}
        '''

rule knn_score:
    input:
        matrix='{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt',
        batch=data_dir + '/batch_info.txt'
    output:
        '{output_dir}/clustering_scores/knn_score/{count_method}/{preprocess_method}'
    params:
        batch_index=config['batch_index']
    shell:
        '''{bin_dir}/feature_selection.py calculate_clustering_score \
        --matrix {input.matrix} --method knn_score \
        --batch {input.batch} --batch-index {params.batch_index} --transpose --use-log > {output}
        '''

rule kbet_score:
    input:
        matrix='{output_dir}/matrix_processing/{preprocess_method}.{count_method}.txt',
        batch=data_dir + '/batch_info.txt'
    output:
        '{output_dir}/clustering_scores/kbet_score/{count_method}/{preprocess_method}'
    params:
        batch_index=config['batch_index']
    shell:
        '''{bin_dir}/calculate_kbet_score.R --matrix {input.matrix} \
        --batch {input.batch} --batch-index {params.batch_index} -o {output}
        '''

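# Combine the class-separation score (uca_score) and the batch-mixing score
# (knn_score) into a single number:
#   combined_score = 0.5 * ((1 - knn_score) + uca_score)
# e.g. knn_score = 0.2 and uca_score = 0.9 give 0.5 * (0.8 + 0.9) = 0.85,
# so a lower knn_score and a higher uca_score both raise the combined score.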
rule combined_score:
    input:
        knn_score='{output_dir}/clustering_scores/knn_score/{count_method}/{preprocess_method}',
        clustering_score='{output_dir}/clustering_scores/uca_score/{count_method}/{preprocess_method}'
    output:
        '{output_dir}/clustering_scores/combined_score/{count_method}/{preprocess_method}'
    run:
        with open(input.knn_score, 'r') as f:
            knn_score = float(f.read().strip())
        with open(input.clustering_score, 'r') as f:
            clustering_score = float(f.read().strip())
        combined_score = 0.5*((1.0 - knn_score) + clustering_score)
        with open(output[0], 'w') as f:
            f.write(str(combined_score))

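# Rank all preprocessing combinations by a given clustering score and keep the
# top n_selected_preprocess_method (3) method names for downstream steps.
# Note that every score type is sorted in descending order here, i.e. higher
# values are treated as better.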
rule select_preprocess_method:
    input:
        lambda wildcards: expand('{output_dir}/clustering_scores/{score}/{count_method}/{preprocess_method}',
            output_dir=wildcards.output_dir, score=wildcards.score,
            preprocess_method=get_preprocess_methods(), count_method=wildcards.count_method)
    output:
        summary='{output_dir}/select_preprocess_method/{score}/{count_method}/summary.txt',
        selected_methods='{output_dir}/select_preprocess_method/{score}/{count_method}/selected_methods.txt'
    params:
        n_selected_preprocess_method=3
    run:
        import pandas as pd

        # Collect the score of each preprocessing method (one number per input file)
        scores = {}
        for score_file in input:
            preprocess_method = score_file.split('/')[-1]
            with open(score_file, 'r') as f:
                score = float(f.read())
            scores[preprocess_method] = score
        scores = pd.Series(scores)
        scores.index.name = 'preprocess_method'
        scores.name = wildcards.score
        # Sort from best to worst and write the full ranking
        scores = scores.sort_values(ascending=False)
        scores.to_csv(output.summary, sep='\t', na_rep='NA', index=True, header=True)
        # Keep only the names of the top-ranked methods
        selected_methods = scores.index.to_series()[:params.n_selected_preprocess_method]
        selected_methods.to_csv(output.selected_methods, index=False, header=False)