Diff of /rna/snakemake/Snakefile [000000] .. [fe0e8b]

Switch to unified view

a b/rna/snakemake/Snakefile
1
import os
2
from re import search
3
import getpass
4
5
6
############
7
## Config ##
8
############
9
10
host = os.uname()[1]
11
if search("BI2404M", host) and getpass.getuser()=="argelagr":
12
    configfile: "config_ricard_local.yaml"
13
elif search("[headstone|pebble]", host) and getpass.getuser()=="argelagr":
14
    configfile: "config_ricard_babraham.yaml"
15
elif search("[headstone|pebble]", host) and getpass.getuser()=="stephen":
16
    configfile: "config_stephen_babraham.yaml"
17
else:
18
    print("Computer not recognised")
19
    exit()
20
21
# validate(config, schema="schemas/config.schema.yaml")
22
23
#############################################
24
## Wildcard constraints to avoid ambiguity ##
25
#############################################
26
27
# honestly I don't understand why do I have to do this, but otherwise I get ambiguity and strange wildcards
28
29
wildcard_constraints:
30
    trajectory_metacells = '|'.join([re.escape(x) for x in config["run_metacells_trajectory"]["trajectories"]]),
31
    sample_metacell = '|'.join([re.escape(x) for x in config["samples"]]),
32
    celltypeA = '|'.join([re.escape(x) for x in config["celltypes"]]),
33
    celltypeB = '|'.join([re.escape(x) for x in config["celltypes"]]),
34
    group_by = '|'.join([re.escape(x) for x in config["pseudobulk_rna"]["group_by"]])
35
36
##################################
37
## Filter wildcard combinations ##
38
##################################
39
40
from itertools import product
41
42
def filter_combinator(combinator, blacklist):
43
    def filtered_combinator(*args, **kwargs):
44
        for wc_comb in combinator(*args, **kwargs):
45
            if frozenset(wc_comb) not in blacklist:
46
                yield wc_comb
47
    return filtered_combinator
48
49
forbidden = {
50
    frozenset({("celltypeA",i),("celltypeB",i)}) for i in config["celltypes"]
51
     # frozenset({("celltypeA", "Epiblast"), ("celltypeB", "Epiblast")}),
52
    # frozenset({("text", "C"), ("num", 2)})
53
}
54
55
filtered_product_celltypes = filter_combinator(product, forbidden)
56
57
###########
58
## Rules ##
59
###########
60
61
rule all:
62
    input: 
63
        # Create Seurat and SingleCellExperiment objects
64
        config["directories"]["processed_data"]+"/seurat.rds",
65
        config["directories"]["processed_data"]+"/SingleCellExperiment.rds",
66
67
        # QC
68
        config["directories"]["results"]+"/rna/qc/sample_metadata_after_qc.txt.gz",
69
        config["directories"]["results"]+"/rna/doublet_detection/sample_metadata_after_doublets.txt.gz",
70
71
        # Plot stats
72
        config["directories"]["results"]+"/rna/stats/ncells_per_sample.pdf",
73
74
        # Mapping and cell type proportions
75
        config["directories"]["results"]+"/rna/mapping/sample_metadata_after_mapping.txt.gz",
76
        expand("%s/rna/mapping/mapping_mnn_{sample}.txt.gz" % config["directories"]["results"], sample=config["samples"]),
77
        config["directories"]["results"]+"/rna/mapping/pdf/umap_mapped_allcells.pdf",
78
        config["directories"]["results"]+"/rna/celltype_proportions/celltype_proportions_stacked_barplots.pdf",
79
80
        # Pseudobulk
81
        expand("%s/rna/pseudobulk/{group_by}/SingleCellExperiment_pseudobulk.rds" % config["directories"]["results"], group_by=config["pseudobulk_rna"]["group_by"]),
82
        expand("%s/rna/pseudobulk/{group_by}/SingleCellExperiment_pseudobulk_with_replicates.rds" % config["directories"]["results"], group_by=config["pseudobulk_rna"]["group_by"]),
83
84
        # Dimensionality reduction (all cells)
85
        expand("%s/rna/dimensionality_reduction/sce/per_stage/{stage}/pca_features{dimred_sce_features}_pcs{dimred_sce_npcs}.txt.gz" % (config["directories"]["results"]), 
86
            dimred_sce_features = config["dimensionality_reduction_sce"]["features"], 
87
            dimred_sce_npcs = config["dimensionality_reduction_sce"]["npcs"],
88
            stage = config["stages"]
89
            ),
90
        expand("%s/rna/dimensionality_reduction/sce/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}/pca_features{dimred_sce_features}_pcs{dimred_sce_npcs}.txt.gz" % (config["directories"]["results"]), 
91
            dimred_sce_features = config["dimensionality_reduction_sce"]["features"], 
92
            batch_variable = config["dimensionality_reduction_sce"]["batch_variable"], 
93
            remove_ExE_cells = config["dimensionality_reduction_sce"]["remove_ExE_cells"], 
94
            dimred_sce_npcs = config["dimensionality_reduction_sce"]["npcs"]
95
            ),
96
        # expand("%s/rna/dimensionality_reduction/seurat/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}/pca_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}.txt.gz" % (config["directories"]["results"]), 
97
        #     dimred_seurat_features = config["dimensionality_reduction_seurat"]["features"], 
98
        #     batch_variable = config["dimensionality_reduction_seurat"]["batch_variable"], 
99
        #     remove_ExE_cells = config["dimensionality_reduction_seurat"]["remove_ExE_cells"], 
100
        #     dimred_seurat_npcs = config["dimensionality_reduction_seurat"]["npcs"]
101
        #     ),
102
103
        # Dimensionality reduction (per stage)
104
        expand("%s/rna/dimensionality_reduction/seurat/per_stage/{stage}/pca_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}.txt.gz" % (config["directories"]["results"]), 
105
            dimred_seurat_features = config["dimensionality_reduction_seurat"]["features"], 
106
            dimred_seurat_npcs = config["dimensionality_reduction_seurat"]["npcs"],
107
            stage = config["stages"]
108
            ),
109
110
        # Differential expression between cell types
111
        expand(config["directories"]["results"]+"/rna/differential/cells/celltype/{celltypeA}_vs_{celltypeB}.txt.gz", filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]),
112
        expand(config["directories"]["results"]+"/rna/differential/metacells/celltype/{celltypeA}_vs_{celltypeB}.txt.gz", filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]),
113
        expand(config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/{celltypeA}_vs_{celltypeB}.txt.gz", filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]),
114
    
115
        # Parse differential expression results
116
        config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/diff_expr_results.txt.gz",
117
        config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/diff_expr_results.txt.gz",
118
        config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/diff_expr_results.txt.gz",
119
        config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype/parsed/diff_expr_results.txt.gz",
120
121
        # Extract TFs from differential analysis
122
        config["directories"]["results"] + "/rna/differential/cells/celltype/parsed/diff_expr_results_tfs.txt.gz",
123
        config["directories"]["results"] + "/rna/differential/metacells/celltype/parsed/diff_expr_results_tfs.txt.gz",
124
        config["directories"]["results"] + "/rna/differential/pseudobulk/celltype/parsed/diff_expr_results_tfs.txt.gz",
125
126
        # Differential expression between genotypes
127
        expand(config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype/{diff_celltype_genotype}.txt.gz", diff_celltype_genotype=config["celltypes"]),
128
129
        # Marker genes
130
        # config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/marker_genes_all.txt.gz",
131
        # config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/marker_genes_all.txt.gz",
132
        config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_genes_all.txt.gz",
133
134
        # Marker TFs
135
        # # config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/marker_tfs_all.txt.gz",
136
        # config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/marker_tfs_all.txt.gz",
137
        config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_tfs_all.txt.gz",
138
139
        # TF2gene correlations
140
        expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_pseudobulk.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
141
        expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_pseudobulk.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
142
        expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_pseudobulk_no_ExE.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
143
        expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_pseudobulk_no_ExE.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
144
        expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_metacells.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
145
        expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_metacells.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
146
        # expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_metacells_no_ExE.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
147
        # expand(config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_metacells_no_ExE.rds", cor_test=config["tf2gene_cor"]["cor_test"]),
148
149
        # Extract TF matrices
150
        config["directories"]["processed_data"]+"/SingleCellExperiment_TFs.rds",
151
        expand(config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_TFs_pseudobulk.rds", group_by=config["pseudobulk_rna"]["group_by"]),
152
        config["directories"]["results"] + "/rna/metacells/all_cells/SingleCellExperiment_TFs_metacells.rds",
153
154
        # Trajectory inference
155
        # expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_SingleCellExperiment.rds", 
156
        #     trajectory_name=config["infer_trajectories"]["trajectory_name"]),
157
        # expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_adata.h5ad", 
158
        #     trajectory_name=config["infer_trajectories"]["trajectory_name"]),
159
        # expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_trajectory.txt.gz", 
160
        #     trajectory_name=config["infer_trajectories"]["trajectory_name"]),
161
        # expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_sample_metadata.txt.gz", 
162
        #     trajectory_name=config["infer_trajectories"]["trajectory_name"]),
163
        # expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2gene_single_cells.rds", 
164
        #     trajectory_name=config["infer_trajectories"]["trajectory_name"]),
165
        # expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2tf_single_cells.rds",
166
        #     trajectory_name=config["infer_trajectories"]["trajectory_name"])
167
        # expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2gene_single_cells_denoised.rds", 
168
        #     trajectory_name=config["infer_trajectories"]["trajectory_name"]),
169
        # expand(config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2tf_single_cells_denoised.rds",
170
        #     trajectory_name=config["infer_trajectories"]["trajectory_name"])
171
172
        # anndata
173
        config["directories"]["processed_data"] + "/anndata.h5ad",
174
175
        # velocity
176
        # expand("%s/{sample}/outs/cellsorted_possorted_genome_bam.bam" % config["directories"]["original_data"], sample=config["samples"][0]),
177
        expand("%s/{sample}/velocyto/{sample}.loom" % config["directories"]["original_data"], sample=config["samples"]),
178
        config["directories"]["processed_data"] + "/velocyto/anndata_velocyto.h5ad",
179
180
        # Infer metacells
181
        expand("%s/rna/metacells/all_cells/{sample_metacell}/cell2metacell_assignment.txt.gz" % config["directories"]["results"], sample_metacell=config["samples"]),
182
        expand("%s/rna/metacells/all_cells/{sample_metacell}/anndata_metacells.h5ad" % config["directories"]["results"], sample_metacell=config["samples"]),
183
        config["directories"]["results"] + "/rna/metacells/all_cells/metacells_metadata.txt.gz",
184
        config["directories"]["results"] + "/rna/metacells/all_cells/SingleCellExperiment_metacells.rds",
185
        config["directories"]["results"] + "/rna/metacells/all_cells/anndata_metacells.h5ad",
186
187
        # Infer metacells for tajectories
188
        expand("%s/rna/metacells/trajectories/{trajectory_metacells}/cell2metacell_assignment.txt.gz" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
189
        # expand("%s/rna/metacells/trajectories/{trajectory_metacells}/SingleCellExperiment_metacells.rds" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
190
        expand("%s/rna/metacells/trajectories/{trajectory_metacells}/metacells_metadata.txt.gz" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
191
        expand("%s/rna/metacells/trajectories/{trajectory_metacells}/anndata_metacells.h5ad" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
192
        expand("%s/rna/metacells/trajectories/{trajectory_metacells}/metacell_trajectory.txt.gz" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"])
193
194
        # Trajectory-specific TF2gene coexpression using metacells
195
        # expand("%s/rna/coexpression/trajectories/{trajectory_metacells}/correlation_matrix_tf2gene_metacells.rds" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"]),
196
        # expand("%s/rna/coexpression/trajectories/{trajectory_metacells}/correlation_matrix_tf2tf_metacells.rds" % config["directories"]["results"], trajectory_metacells=config["run_metacells_trajectory"]["trajectories"])
197
198
##################################################
199
## Load count matrices and create Seurat object ##
200
##################################################
201
202
rule create_seurat:
203
    input:
204
        script=config["scripts"]["create_seurat"],
205
        input_dir=config["directories"]["original_data"]
206
    output:
207
        seurat=config["directories"]["processed_data"]+"/seurat.rds",
208
        metadata=config["directories"]["processed_data"]+"/metadata.txt.gz"
209
    params:
210
        outdir=config["directories"]["processed_data"],
211
        samples=expand("{sample}", sample=config["samples"])
212
    conda:
213
        "environment.yaml"
214
    log: 
215
        "logs/create_seurat.log"
216
    threads: 
217
        config["slurm"]["create_seurat"]["threads"]
218
    resources:
219
        mem_mb = config["slurm"]["create_seurat"]["memory"]
220
    shell:
221
        "Rscript {input.script} --inputdir {input.input_dir} --outdir {params.outdir} --samples {params.samples} > {log}"
222
223
#####################
224
## Quality control ##
225
#####################
226
227
rule qc:
228
    input:
229
        metadata=rules.create_seurat.output.metadata,
230
        script=config["scripts"]["qc"]
231
    output:
232
        # qc_metrics_boxplot=config["directories"]["results"]+"/rna/qc/qc_metrics_boxplot.pdf",
233
        qc_metrics_histogram=config["directories"]["results"]+"/rna/qc/qc_metrics_histogram.pdf",
234
        metadata=config["directories"]["results"]+"/rna/qc/sample_metadata_after_qc.txt.gz"
235
    params:
236
        sample = expand("{sample}", sample=config["samples"]),
237
        min_nFeature_RNA = config["qc"]["min_nFeature_RNA"],
238
        max_nFeature_RNA = config["qc"]["max_nFeature_RNA"],
239
        percent_mt = config["qc"]["percent_mt"],
240
        percent_rib = config["qc"]["percent_rib"],
241
        outdir=config["directories"]["results"]+"/rna/qc"
242
    conda:
243
        "environment.yaml"
244
    log: 
245
        "logs/qc.log"
246
    threads: 
247
        config["slurm"]["qc"]["threads"]
248
    resources:
249
        mem_mb = config["slurm"]["qc"]["memory"]
250
    shell:
251
        "Rscript {input.script} --metadata {input.metadata} --outputdir {params.outdir} --samples {params.sample} --min_nFeature_RNA {params.min_nFeature_RNA} \
252
        --max_nFeature_RNA {params.max_nFeature_RNA} --ribosomal_percent_RNA {params.percent_rib} \
253
        --mitochondrial_percent_RNA {params.percent_mt} > {log}"
254
255
###################################################
256
## Convert Seurat object to SingleCellExperiment ##
257
###################################################
258
259
rule seurat_to_sce:
260
    input:
261
        seurat = rules.create_seurat.output.seurat,
262
        metadata = rules.qc.output.metadata,
263
        script = config["scripts"]["seurat_to_sce"],
264
    output:
265
        config["directories"]["processed_data"]+"/SingleCellExperiment.rds",
266
    params:
267
        sample = expand("{sample}", sample=config["samples"])
268
    conda:
269
        "environment.yaml"
270
    log: 
271
        "logs/seurat_to_sce.log"
272
    threads: 
273
        config["slurm"]["seurat_to_sce"]["threads"]
274
    resources:
275
        mem_mb = config["slurm"]["seurat_to_sce"]["memory"]
276
    shell:
277
        "Rscript {input.script} --samples {params.sample} --seurat {input.seurat} \
278
        --metadata {input.metadata} --outfile {output} > {log}"
279
280
#######################
281
## Doublet detection ##
282
#######################
283
284
rule doublet_detection:
285
    input:
286
        sce = rules.seurat_to_sce.output,
287
        metadata = rules.qc.output.metadata,
288
        script = config["scripts"]["doublet_detection"]
289
    output:
290
        outfile=config["directories"]["results"]+"/rna/doublet_detection/doublets_{sample}_{doublet_score_threshold}.txt.gz"
291
        # metadata=config["directories"]["results"]+"/rna/doublet_detection/sample_metadata_after_doublet_detection.txt.gz"
292
    conda:
293
        "environment.yaml"
294
    log: 
295
        "logs/doublet_detection_{sample}_{doublet_score_threshold}.log"
296
    threads: 
297
        config["slurm"]["doublet_detection"]["threads"]
298
    resources:
299
        mem_mb = config["slurm"]["doublet_detection"]["memory"]
300
    shell:
301
        "Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --samples {wildcards.sample} \
302
        --hybrid_score_threshold {wildcards.doublet_score_threshold}  --outfile {output} > {log}"
303
304
rule parse_doublet_results:
305
    input:
306
        metadata = rules.qc.output.metadata,
307
        script = config["scripts"]["parse_doublets"],
308
        doublet_files=expand(config["directories"]["results"]+"/rna/doublet_detection/doublets_{sample}_{doublet_score_threshold}.txt.gz", 
309
            sample=config["samples"], doublet_score_threshold=config["doublet_detection"]["doublet_score_threshold"])
310
        # doublet_files = rules.doublet_detection.output
311
    output:
312
        config["directories"]["results"]+"/rna/doublet_detection/sample_metadata_after_doublets.txt.gz"
313
    conda:
314
        "environment.yaml"
315
    log: 
316
        "logs/parse_doublet_results.log"
317
    threads: 
318
        config["slurm"]["parse_doublet_results"]["threads"]
319
    resources:
320
        mem_mb = config["slurm"]["parse_doublet_results"]["memory"]
321
    shell:
322
        "Rscript {input.script} --metadata {input.metadata} --doublet_files {input.doublet_files} --outfile {output} > {log}"
323
324
################
325
## Plot stats ##
326
################
327
328
rule plot_stats_per_sample:
329
    input:
330
        sce = rules.seurat_to_sce.output,
331
        metadata = rules.parse_doublet_results.output,
332
        script = config["scripts"]["plot_stats_per_sample"]
333
    output:
334
        config["directories"]["results"]+"/rna/stats/ncells_per_sample.pdf"
335
    params:
336
        outdir = config["directories"]["results"]+"/rna/stats",
337
        samples = expand("{sample}", sample=config["samples"])
338
    conda:
339
        "environment.yaml"
340
    log: 
341
        "logs/plot_stats_per_sample.log"
342
    threads: 
343
        config["slurm"]["plot_stats_per_sample"]["threads"]
344
    resources:
345
        mem_mb = config["slurm"]["plot_stats_per_sample"]["memory"]
346
    shell:
347
        "Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --samples {params.samples} \
348
        --outdir {params.outdir} > {log}"
349
350
##########################
351
## Mapping to the atlas ##
352
##########################
353
354
rule mapping_mnn:
355
    input:
356
        atlas_sce = config["directories"]["atlas"]+"/processed/SingleCellExperiment.rds",
357
        atlas_metadata = config["directories"]["atlas"]+"/sample_metadata.txt.gz",
358
        query_sce = rules.seurat_to_sce.output,
359
        query_metadata = rules.parse_doublet_results.output,
360
        script = config["scripts"]["mapping_mnn"]
361
    output:
362
        config["directories"]["results"]+"/rna/mapping/mapping_mnn_{sample}.txt.gz"
363
    params:
364
        # sample = expand("{sample}", sample=config["samples"]),
365
        atlas_stages=config["mapping_mnn"]["atlas_stages"],
366
        npcs = config["mapping_mnn"]["npcs"],
367
        n_neighbours = config["mapping_mnn"]["n_neighbours"]
368
    conda:
369
        "environment.yaml"
370
    log: 
371
        "logs/mapping_mnn_{sample}.log"
372
    threads: 
373
        config["slurm"]["mapping_mnn"]["threads"]
374
    resources:
375
        mem_mb = config["slurm"]["mapping_mnn"]["memory"]
376
    shell:
377
        "Rscript {input.script} --query_samples {wildcards.sample} --atlas_stages {params.atlas_stages} --query_sce {input.query_sce} \
378
        --atlas_sce {input.atlas_sce} --atlas_metadata {input.atlas_metadata} --query_metadata {input.query_metadata} \
379
        --npcs {params.npcs} --n_neighbours {params.n_neighbours} --outfile {output} > {log}"
380
381
rule mapping_seurat:
382
    input:
383
        atlas_sce = config["directories"]["atlas"]+"/processed/SingleCellExperiment.rds",
384
        atlas_metadata = config["directories"]["atlas"]+"/sample_metadata.txt.gz",
385
        query_sce = rules.seurat_to_sce.output,
386
        query_metadata = rules.parse_doublet_results.output,
387
        script = config["scripts"]["mapping_seurat"]
388
    output:
389
        config["directories"]["results"]+"/rna/mapping/mapping_seurat_{sample}.txt.gz"
390
    params:
391
        # sample = expand("{sample}", sample=config["samples"]),
392
        atlas_stages=config["mapping_seurat"]["atlas_stages"],
393
        npcs = config["mapping_seurat"]["npcs"],
394
        n_neighbours = config["mapping_seurat"]["n_neighbours"]
395
    conda:
396
        "environment.yaml"
397
    log: 
398
        "logs/mapping_seurat_{sample}.log"
399
    threads: 
400
        config["slurm"]["mapping_seurat"]["threads"]
401
    resources:
402
        mem_mb = config["slurm"]["mapping_seurat"]["memory"]
403
    shell:
404
        "Rscript {input.script} --query_samples {wildcards.sample} --atlas_stages {params.atlas_stages} --query_sce {input.query_sce} \
405
        --atlas_sce {input.atlas_sce} --atlas_metadata {input.atlas_metadata} --query_metadata {input.query_metadata} \
406
        --npcs {params.npcs} --n_neighbours {params.n_neighbours} --outfile {output}  > {log}"
407
408
rule parse_mapping_results:
409
    input:
410
        query_metadata = rules.parse_doublet_results.output,
411
        # mapping_seurat = expand(config["directories"]["results"]+"/rna/mapping/mapping_seurat_{sample}.txt.gz", sample=config["samples"]),
412
        mapping_mnn = expand(config["directories"]["results"]+"/rna/mapping/mapping_mnn_{sample}.txt.gz", sample=config["samples"]),
413
        # mapping_seurat = rules.mapping_seurat.output,
414
        # mapping_mnn = rules.mapping_mnn.output,
415
        script = config["scripts"]["parse_mapping"]
416
    output:
417
        config["directories"]["results"]+"/rna/mapping/sample_metadata_after_mapping.txt.gz"
418
    conda:
419
        "environment.yaml"
420
    log: 
421
        "logs/parse_mapping_results.log"
422
    threads: 
423
        config["slurm"]["parse_mapping_results"]["threads"]
424
    resources:
425
        mem_mb = config["slurm"]["parse_mapping_results"]["memory"]
426
    shell:
427
        "Rscript {input.script} --metadata {input.query_metadata} --mapping_mnn {input.mapping_mnn} --outfile {output} > {log}"
428
        # "Rscript {input.script} --metadata {input.query_metadata} --mapping_seurat {input.mapping_seurat} --mapping_mnn {input.mapping_mnn} --outfile {output} > {log}"
429
430
##############################
431
## Dimensionality reduction ##
432
##############################
433
434
rule dimensionality_reduction_sce: 
435
    input:
436
        script = config["scripts"]["dimensionality_reduction_sce"],
437
        sce = rules.seurat_to_sce.output,
438
        metadata = rules.parse_mapping_results.output
439
    output:
440
        config["directories"]["results"]+"/rna/dimensionality_reduction/sce/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}/pca_features{dimred_sce_features}_pcs{dimred_sce_npcs}.txt.gz"
441
    params:
442
        outdir = config["directories"]["results"]+"/rna/dimensionality_reduction/sce/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}",
443
        n_neighbors = config["dimensionality_reduction_sce"]["n_neighbors"],
444
        min_dist = config["dimensionality_reduction_sce"]["min_dist"],
445
        vars_to_regress = config["dimensionality_reduction_sce"]["vars_to_regress"],
446
        colour_by = config["dimensionality_reduction_sce"]["colour_by"]
447
    conda:
448
        "environment.yaml"
449
    log: 
450
        "logs/dimensionality_reduction_features{dimred_sce_features}_pcs{dimred_sce_npcs}_batch_correction_{batch_variable}_remove_ExE_cells{remove_ExE_cells}.log"
451
    threads: 
452
        config["slurm"]["dimensionality_reduction_sce"]["threads"]
453
    resources:
454
        mem_mb = config["slurm"]["dimensionality_reduction_sce"]["memory"]
455
    shell:
456
        "Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --npcs {wildcards.dimred_sce_npcs} --features {wildcards.dimred_sce_features} \
457
        --vars_to_regress {params.vars_to_regress}  --remove_ExE_cells {wildcards.remove_ExE_cells} --batch_variable {wildcards.batch_variable} \
458
        --n_neighbors {params.n_neighbors} --min_dist {params.min_dist} --colour_by {params.colour_by} --outdir {params.outdir} > {log}"
459
460
rule dimensionality_reduction_sce_per_stage: 
461
    input:
462
        script=config["scripts"]["dimensionality_reduction_sce"],
463
        sce=rules.seurat_to_sce.output,
464
        metadata=rules.parse_mapping_results.output
465
    output:
466
        # config["directories"]["results"]+"/rna/dimensionality_reduction/umap_features{dimred_sce_features}_pcs{dimred_sce_npcs}_neigh{n_neighbors}_dist{min_dist}.txt.gz",
467
        config["directories"]["results"]+"/rna/dimensionality_reduction/sce/per_stage/{stage}/pca_features{dimred_sce_features}_pcs{dimred_sce_npcs}.txt.gz"
468
    params:
469
        outdir = config["directories"]["results"]+"/rna/dimensionality_reduction/sce/per_stage/{stage}",
470
        n_neighbors = config["dimensionality_reduction_sce"]["n_neighbors"],
471
        min_dist = config["dimensionality_reduction_sce"]["min_dist"],
472
        vars_to_regress = config["dimensionality_reduction_sce"]["vars_to_regress"],
473
        colour_by = config["dimensionality_reduction_sce"]["colour_by"]
474
    conda:
475
        "environment.yaml"
476
    log: 
477
        "logs/dimensionality_reduction_features{dimred_sce_features}_pcs{dimred_sce_npcs}_per_stage_{stage}.log"
478
    threads: 
479
        config["slurm"]["dimensionality_reduction_sce"]["threads"]
480
    resources:
481
        mem_mb = config["slurm"]["dimensionality_reduction_sce"]["memory"]
482
    shell:
483
        "Rscript {input.script} --sce {input.sce} --stages {wildcards.stage} --metadata {input.metadata} --npcs {wildcards.dimred_sce_npcs} --features {wildcards.dimred_sce_features} \
484
        --vars_to_regress {params.vars_to_regress} --n_neighbors {params.n_neighbors} --min_dist {params.min_dist} --colour_by {params.colour_by} --outdir {params.outdir} > {log}"
485
486
rule dimensionality_reduction_seurat: 
487
    input:
488
        script=config["scripts"]["dimensionality_reduction_seurat"],
489
        seurat=rules.create_seurat.output.seurat,
490
        metadata=rules.parse_mapping_results.output
491
    output:
492
        config["directories"]["results"]+"/rna/dimensionality_reduction/seurat/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}/pca_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}.txt.gz"
493
    params:
494
        outdir = config["directories"]["results"]+"/rna/dimensionality_reduction/seurat/batch_correction_{batch_variable}_remove_ExE_cells_{remove_ExE_cells}",
495
        colour_by = config["dimensionality_reduction_seurat"]["colour_by"],
496
        n_neighbors = config["dimensionality_reduction_seurat"]["n_neighbors"],
497
        min_dist = config["dimensionality_reduction_seurat"]["min_dist"],
498
        vars_to_regress = config["dimensionality_reduction_seurat"]["vars_to_regress"],
499
        seed = config["dimensionality_reduction_seurat"]["seed"]
500
    conda:
501
        "environment.yaml"
502
    log: 
503
        "logs/dimensionality_reduction_seurat_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}_batch_correction_{batch_variable}_remove_ExE_cells{remove_ExE_cells}.log"
504
    threads: 
505
        config["slurm"]["dimensionality_reduction_seurat"]["threads"]
506
    resources:
507
        mem_mb = config["slurm"]["dimensionality_reduction_seurat"]["memory"]
508
    shell:
509
        "Rscript {input.script} --seurat {input.seurat} --metadata {input.metadata} --npcs {wildcards.dimred_seurat_npcs} \
510
        --features {wildcards.dimred_seurat_features} --n_neighbors {params.n_neighbors} --min_dist {params.min_dist} \
511
        --vars_to_regress {params.vars_to_regress} --remove_ExE_cells {wildcards.remove_ExE_cells} --batch_variable {wildcards.batch_variable} \
512
        --seed {params.seed} --colour_by {params.colour_by} --outdir {params.outdir} > {log}"
513
514
rule dimensionality_reduction_seurat_per_stage: 
515
    input:
516
        script=config["scripts"]["dimensionality_reduction_seurat"],
517
        seurat=rules.create_seurat.output.seurat,
518
        metadata=rules.parse_mapping_results.output
519
    output:
520
        config["directories"]["results"]+"/rna/dimensionality_reduction/seurat/per_stage/{stage}/pca_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}.txt.gz"
521
    params:
522
        outdir = config["directories"]["results"]+"/rna/dimensionality_reduction/seurat/per_stage/{stage}",
523
        colour_by = config["dimensionality_reduction_seurat"]["colour_by"],
524
        n_neighbors = config["dimensionality_reduction_seurat"]["n_neighbors"],
525
        min_dist = config["dimensionality_reduction_seurat"]["min_dist"],
526
        vars_to_regress = config["dimensionality_reduction_seurat"]["vars_to_regress"],
527
        seed = config["dimensionality_reduction_seurat"]["seed"]
528
    conda:
529
        "environment.yaml"
530
    log: 
531
        "logs/dimensionality_reduction_seurat_features{dimred_seurat_features}_pcs{dimred_seurat_npcs}_per_stage_{stage}.log"
532
    threads: 
533
        config["slurm"]["dimensionality_reduction_seurat_per_stage"]["threads"]
534
    resources:
535
        mem_mb = config["slurm"]["dimensionality_reduction_seurat_per_stage"]["memory"]
536
    shell:
537
        "Rscript {input.script} --seurat {input.seurat} --metadata {input.metadata} --stage {wildcards.stage} --features {wildcards.dimred_seurat_features} \
538
        --npcs {wildcards.dimred_seurat_npcs} --n_neighbors {params.n_neighbors} --min_dist {params.min_dist} \
539
        --vars_to_regress {params.vars_to_regress} --seed {params.seed}\
540
        --colour_by {params.colour_by} --outdir {params.outdir} > {log}"
541
542
rule plot_mapping_results: 
543
    input:
544
        script = config["scripts"]["plot_mapping_results"],
545
        query_metadata=rules.parse_mapping_results.output,
546
        atlas_metadata = config["directories"]["atlas"]+"/sample_metadata.txt.gz"
547
    output:
548
        config["directories"]["results"]+"/rna/mapping/pdf/umap_mapped_allcells.pdf"
549
    params:
550
        samples = expand("{sample}", sample=config["samples"]),
551
        outdir = config["directories"]["results"]+"/rna/mapping/pdf"
552
    conda:
553
        "environment.yaml"
554
    log: 
555
        "logs/plot_mapping_results.log"
556
    threads: 
557
        config["slurm"]["plot_mapping_results"]["threads"]
558
    resources:
559
        mem_mb = config["slurm"]["plot_mapping_results"]["memory"]        
560
    shell:
561
        "Rscript {input.script} --query_metadata {input.query_metadata} --atlas_metadata {input.atlas_metadata} \
562
        --samples {params.samples} --outdir {params.outdir} > {log}"
563
564
####################
565
## Create anndata ##
566
####################
567
568
rule convert_SingleCellExperiment_to_anndata: 
569
    input:
570
        script = config["scripts"]["convert_SingleCellExperiment_to_anndata"],
571
        metadata = rules.parse_mapping_results.output,
572
        sce = rules.seurat_to_sce.output
573
    output:
574
        config["directories"]["processed_data"] + "/anndata.h5ad"
575
    # params:
576
    #     python = config["resources"]["python"]
577
    conda:
578
        "environment.yaml"
579
    log: 
580
        "logs/convert_SingleCellExperiment_to_anndata.log"
581
    threads: 
582
        config["slurm"]["convert_SingleCellExperiment_to_anndata"]["threads"]
583
    resources: 
584
        mem_mb = config["slurm"]["convert_SingleCellExperiment_to_anndata"]["memory"]
585
    shell:
586
        # "Rscript {input.script} --python_path {params.python} --sce {input.sce} --metadata {input.metadata} --outfile {output} > {log}"
587
        "Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --outfile {output} > {log}"
588
589
################
590
## Pseudobulk ##
591
################
592
593
rule pseudobulk_rna:
594
    input:
595
        sce = rules.seurat_to_sce.output,
596
        metadata = rules.parse_mapping_results.output,
597
        script = config["scripts"]["pseudobulk_rna"]
598
    output:
599
        # seurat = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/Seurat_pseudobulk.rds",
600
        sce = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_pseudobulk.rds",
601
        stats = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/stats.txt"
602
    params:
603
        normalisation_method = config["pseudobulk_rna"]["normalisation_method"],
604
        outdir = config["directories"]["results"]+"/rna/pseudobulk/{group_by}"
605
    conda:
606
        "environment.yaml"
607
    log: 
608
        "logs/pseudobulk_rna_{group_by}.log"
609
    threads: 
610
        config["slurm"]["pseudobulk_rna"]["threads"]
611
    resources:
612
        mem_mb = config["slurm"]["pseudobulk_rna"]["memory"]
613
    shell:
614
        "Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --group_by {wildcards.group_by} \
615
        --normalisation_method {params.normalisation_method} --outdir {params.outdir} > {log}"
616
617
rule pseudobulk_rna_with_replicates:
618
    input:
619
        sce = rules.seurat_to_sce.output,
620
        metadata = rules.parse_mapping_results.output,
621
        script = config["scripts"]["pseudobulk_rna_with_replicates"]
622
    output:
623
        sce = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_pseudobulk_with_replicates.rds",
624
        cell2replicate = config["directories"]["results"]+"/rna/pseudobulk/{group_by}/cell2replicate.txt.gz"
625
    params:
626
        min_cells = config["pseudobulk_rna_with_replicates"]["min_cells"],
627
        nrep = config["pseudobulk_rna_with_replicates"]["nrep"],
628
        fraction_cells_per_replicate = config["pseudobulk_rna_with_replicates"]["fraction_cells_per_replicate"],
629
        outdir = config["directories"]["results"]+"/rna/pseudobulk/{group_by}"
630
    conda:
631
        "environment.yaml"
632
    log: 
633
        "logs/pseudobulk_rna_with_replicates_{group_by}.log"
634
    threads: 
635
        config["slurm"]["pseudobulk_rna_with_replicates"]["threads"]
636
    resources:
637
        mem_mb = config["slurm"]["pseudobulk_rna_with_replicates"]["memory"]
638
    shell:
639
        "Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --group_by {wildcards.group_by} --nrep {params.nrep} \
640
        --min_cells {params.min_cells} --fraction_cells_per_replicate {params.fraction_cells_per_replicate} --outdir {params.outdir} > {log}"
641
642
###############
643
## Metacells ##
644
###############
645
646
rule run_metacells: 
647
    input:
648
        script = config["scripts"]["run_metacells"],
649
        metadata = rules.parse_mapping_results.output,
650
        anndata = rules.convert_SingleCellExperiment_to_anndata.output
651
    output:
652
        cell2metacell_assignments = config["directories"]["results"] + "/rna/metacells/all_cells/{sample_metacell}/cell2metacell_assignment.txt.gz",
653
        anndata = config["directories"]["results"] + "/rna/metacells/all_cells/{sample_metacell}/anndata_metacells.h5ad"
654
    params:
655
        # python = config["resources"]["python"],
656
        # python = "/bi/group/reik/ricard/software/miniconda3/envs/main/envs/metacells/bin/python",
657
        percent_metacells = config["run_metacells"]["percent_metacells"],
658
        n_pcs = config["run_metacells"]["n_pcs"],
659
        n_features = config["run_metacells"]["n_features"],
660
        outdir = config["directories"]["results"] + "/rna/metacells/all_cells/{sample_metacell}"
661
    conda:
662
        "environment.yaml"
663
    log: 
664
        "logs/run_metacells_{sample_metacell}.log"
665
    threads: 
666
        config["slurm"]["run_metacells"]["threads"]
667
    resources: 
668
        mem_mb = config["slurm"]["run_metacells"]["memory"]
669
    shell:
670
        # "{params.python} {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples {wildcards.sample_metacell} \
671
        "python {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples {wildcards.sample_metacell} \
672
        --percent_metacells {params.percent_metacells} --n_pcs {params.n_pcs} --n_features {params.n_features} --outdir {params.outdir} > {log}"
673
674
rule run_metacells_trajectory: 
675
    input:
676
        script = config["scripts"]["run_metacells_trajectory"],
677
        metadata = rules.parse_mapping_results.output,
678
        anndata = rules.convert_SingleCellExperiment_to_anndata.output
679
    output:
680
        cell2metacell_assignments = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/cell2metacell_assignment.txt.gz",
681
        trajectory = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/metacell_trajectory.txt.gz",
682
        anndata = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/anndata_metacells.h5ad"
683
    params:
684
        # python = config["resources"]["python"],
685
        # python = "/bi/group/reik/ricard/software/miniconda3/envs/main/envs/metacells/bin/python",
686
        percent_metacells = config["run_metacells_trajectory"]["percent_metacells"],
687
        n_pcs = config["run_metacells_trajectory"]["n_pcs"],
688
        n_features = config["run_metacells_trajectory"]["n_features"],
689
        outdir = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}"
690
    conda:
691
        "environment.yaml"
692
    log: 
693
        "logs/run_metacells_trajectory_{trajectory_metacells}.log"
694
    threads: 
695
        config["slurm"]["run_metacells_trajectory"]["threads"]
696
    resources: 
697
        mem_mb = config["slurm"]["run_metacells_trajectory"]["memory"]
698
    shell:
699
        # "{params.python} {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples all --trajectory {wildcards.trajectory_metacells} \
700
        "python {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples all --trajectory {wildcards.trajectory_metacells} \
701
        --percent_metacells {params.percent_metacells} --n_pcs {params.n_pcs} --n_features {params.n_features} --outdir {params.outdir} > {log}"
702
703
rule aggregate_rna_metacells: 
704
    input:
705
        script = config["scripts"]["aggregate_rna_metacells"],
706
        metadata = rules.parse_mapping_results.output,
707
        sce = rules.seurat_to_sce.output,
708
        cell2metacell = expand(rules.run_metacells.output.cell2metacell_assignments, sample_metacell=config["samples"])
709
    output:
710
        metadata = config["directories"]["results"] + "/rna/metacells/all_cells/metacells_metadata.txt.gz",
711
        sce = config["directories"]["results"] + "/rna/metacells/all_cells/SingleCellExperiment_metacells.rds",
712
        anndata = config["directories"]["results"] + "/rna/metacells/all_cells/anndata_metacells.h5ad"
713
    params:
714
        outdir = config["directories"]["results"] + "/rna/metacells/all_cells",
715
        python = config["resources"]["python"],
716
        metacell_min_reads = config["aggregate_rna_metacells"]["metacell_min_reads"],
717
        normalisation_method = config["aggregate_rna_metacells"]["normalisation_method"]
718
    conda:
719
        "environment.yaml"
720
    log: 
721
        "logs/aggregate_rna_metacells.log"
722
    threads: 
723
        config["slurm"]["aggregate_rna_metacells"]["threads"]
724
    resources: 
725
        mem_mb = config["slurm"]["aggregate_rna_metacells"]["memory"]
726
    shell:
727
        "Rscript {input.script} --python {params.python} --sce {input.sce} --metadata {input.metadata} --cell2metacell {input.cell2metacell} \
728
        --metacell_min_reads {params.metacell_min_reads} --normalisation_method {params.normalisation_method} \
729
        --outdir {params.outdir} > {log}"
730
731
rule aggregate_rna_metacells_trajectory: 
732
    input:
733
        script = config["scripts"]["aggregate_rna_metacells"],
734
        metadata = rules.parse_mapping_results.output,
735
        sce = rules.seurat_to_sce.output,
736
        cell2metacell = rules.run_metacells_trajectory.output.cell2metacell_assignments
737
    output:
738
        metadata = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/metacells_metadata.txt.gz",
739
        sce = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}/SingleCellExperiment_metacells.rds"
740
    params:
741
        outdir = config["directories"]["results"] + "/rna/metacells/trajectories/{trajectory_metacells}",
742
        python = config["resources"]["python"],
743
        metacell_min_reads = config["aggregate_rna_metacells"]["metacell_min_reads"],
744
        normalisation_method = config["aggregate_rna_metacells"]["normalisation_method"]
745
    conda:
746
        "environment.yaml"
747
    log: 
748
        "logs/aggregate_rna_metacells_{trajectory_metacells}.log"
749
    threads: 
750
        config["slurm"]["aggregate_rna_metacells"]["threads"]
751
    resources: 
752
        mem_mb = config["slurm"]["aggregate_rna_metacells"]["memory"]
753
    shell:
754
        "Rscript {input.script} --python {params.python} --sce {input.sce} --metadata {input.metadata} --cell2metacell {input.cell2metacell} \
755
        --metacell_min_reads {params.metacell_min_reads} --normalisation_method {params.normalisation_method} \
756
        --outdir {params.outdir} > {log}"
757
758
############################### 
759
## Extract TF RNA expression ##
760
###############################
761
762
rule extract_TF_SingleCellExperiment_cells:
763
    input:
764
        sce = rules.seurat_to_sce.output,
765
        script = config["scripts"]["extract_TF_SingleCellExperiment"],
766
        TF_file = config["resources"]["TFs_file"]
767
    output:
768
        config["directories"]["processed_data"]+"/SingleCellExperiment_TFs.rds"
769
    conda:
770
        "environment.yaml"
771
    log: 
772
        "logs/extract_TF_SingleCellExperiment_cells.log"
773
    threads: 
774
        config["slurm"]["extract_TF_SingleCellExperiment_cells"]["threads"]
775
    resources:
776
        mem_mb = config["slurm"]["extract_TF_SingleCellExperiment_cells"]["memory"]
777
    shell:
778
        "Rscript {input.script} --sce {input.sce} --TF_file {input.TF_file} --outfile {output} > {log}"
779
780
rule extract_TF_SingleCellExperiment_metacells:
781
    input:
782
        sce = rules.aggregate_rna_metacells.output.sce,
783
        script = config["scripts"]["extract_TF_SingleCellExperiment"],
784
        TF_file = config["resources"]["TFs_file"]
785
    output:
786
        config["directories"]["results"] + "/rna/metacells/all_cells/SingleCellExperiment_TFs_metacells.rds"
787
    conda:
788
        "environment.yaml"
789
    log: 
790
        "logs/extract_TF_SingleCellExperiment_metacells.log"
791
    threads: 
792
        config["slurm"]["extract_TF_SingleCellExperiment_metacells"]["threads"]
793
    resources:
794
        mem_mb = config["slurm"]["extract_TF_SingleCellExperiment_metacells"]["memory"]
795
    shell:
796
        "Rscript {input.script} --sce {input.sce} --TF_file {input.TF_file} --outfile {output} > {log}"
797
798
rule extract_TF_SingleCellExperiment_pseudobulk:
799
    input:
800
        sce = rules.pseudobulk_rna.output.sce,
801
        script = config["scripts"]["extract_TF_SingleCellExperiment"],
802
        TF_file = config["resources"]["TFs_file"]
803
    output:
804
        config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_TFs_pseudobulk.rds"
805
    conda:
806
        "environment.yaml"
807
    log: 
808
        "logs/extract_TF_SingleCellExperiment_pseudobulk_{group_by}.log"
809
    threads: 
810
        config["slurm"]["extract_TF_SingleCellExperiment_pseudobulk"]["threads"]
811
    resources:
812
        mem_mb = config["slurm"]["extract_TF_SingleCellExperiment_pseudobulk"]["memory"]
813
    shell:
814
        "Rscript {input.script} --sce {input.sce} --TF_file {input.TF_file} --outfile {output} > {log}"
815
816
rule extract_TF_SingleCellExperiment_pseudobulk_with_replicates:
817
    input:
818
        sce = rules.pseudobulk_rna_with_replicates.output.sce,
819
        script = config["scripts"]["extract_TF_SingleCellExperiment"],
820
        TF_file = config["resources"]["TFs_file"]
821
    output:
822
        config["directories"]["results"]+"/rna/pseudobulk/{group_by}/SingleCellExperiment_TFs_pseudobulk_with_replicates.rds"
823
    conda:
824
        "environment.yaml"
825
    log: 
826
        "logs/extract_TF_SingleCellExperiment_pseudobulk_with_replicates_{group_by}.log"
827
    threads: 
828
        config["slurm"]["extract_TF_SingleCellExperiment_pseudobulk"]["threads"]
829
    resources:
830
        mem_mb = config["slurm"]["extract_TF_SingleCellExperiment_pseudobulk"]["memory"]
831
    shell:
832
        "Rscript {input.script} --sce {input.sce} --TF_file {input.TF_file} --outfile {output} > {log}"
833
834
################################
835
## Plot cell type proportions ##
836
################################
837
838
rule plot_celltype_proportions: 
839
    input:
840
        script = config["scripts"]["plot_celltype_proportions"],
841
        metadata=rules.parse_mapping_results.output
842
    output:
843
        config["directories"]["results"]+"/rna/celltype_proportions/celltype_proportions_stacked_barplots.pdf"
844
        # expand(config["directories"]["results"]+"/rna/celltype_proportions/celltype_proportions_{stage}_horizontal_barplots.pdf", stage=config["stages"])
845
    params:
846
        samples = expand("{sample}", sample=config["samples"]),
847
        celltype_label = config["plot_celltype_proportions"]["celltype_label"],
848
        outdir = config["directories"]["results"]+"/rna/celltype_proportions"
849
    conda:
850
        "environment.yaml"
851
    log: 
852
        "logs/plot_celltype_proportions.log"
853
    threads: 
854
        config["slurm"]["plot_celltype_proportions"]["threads"]
855
    resources:
856
        mem_mb = config["slurm"]["plot_celltype_proportions"]["memory"]        
857
    shell:
858
        "Rscript {input.script} --metadata {input.metadata} --celltype_label {params.celltype_label} \
859
        --samples {params.samples} --outdir {params.outdir} > {log}"
860
861
#############################
862
## TF vs gene coexpression ##
863
#############################
864
865
# TO-DO: COMPRESS RULES
866
867
rule coexpression_TF_vs_gene_single_cells: 
868
    input:
869
        script = config["scripts"]["coexpression_TF_vs_gene_single_cells"],
870
        TFs_file = config["resources"]["TFs_file"],
871
        metadata = rules.parse_mapping_results.output,
872
        sce = rules.seurat_to_sce.output
873
    output:
874
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_single_cells.rds",
875
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_single_cells.rds"
876
    params:
877
        outdir = config["directories"]["results"]+"/rna/coexpression"
878
    conda:
879
        "environment.yaml"
880
    log: 
881
        "logs/coexpression_TF_vs_gene_{cor_test}_single_cells.log"
882
    threads: 
883
        config["slurm"]["coexpression_TF_vs_gene_single_cells"]["threads"]
884
    resources: 
885
        mem_mb = config["slurm"]["coexpression_TF_vs_gene_single_cells"]["memory"]
886
    shell:
887
        "Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --cor_test {wildcards.cor_test} \
888
        --TFs_file {input.TFs_file} --outdir {params.outdir} > {log}"
889
890
rule coexpression_TF_vs_gene_single_cells_no_ExE: 
891
    input:
892
        script = config["scripts"]["coexpression_TF_vs_gene_single_cells"],
893
        metadata=rules.parse_mapping_results.output,
894
        sce=rules.seurat_to_sce.output
895
    output:
896
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_single_cells_no_ExE.rds",
897
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_single_cells_no_ExE.rds"
898
    params:
899
        TFs_file = config["resources"]["TFs_file"],
900
        outdir = config["directories"]["results"]+"/rna/coexpression"
901
    conda:
902
        "environment.yaml"
903
    log: 
904
        "logs/coexpression_TF_vs_gene_{cor_test}_single_cells_no_ExE.log"
905
    threads: 
906
        config["slurm"]["coexpression_TF_vs_gene_single_cells"]["threads"]
907
    resources: 
908
        mem_mb = config["slurm"]["coexpression_TF_vs_gene_single_cells"]["memory"]
909
    shell:
910
        "Rscript {input.script} --metadata {input.metadata} --sce {input.sce} \
911
        --TFs_file {params.TFs_file} --remove_ExE_cells --cor_test {wildcards.cor_test} --outdir {params.outdir} > {log}"
912
913
rule coexpression_TF_vs_gene_pseudobulk: 
914
    input:
915
        script = config["scripts"]["coexpression_TF_vs_gene_pseudobulk"],
916
        sce = expand(rules.pseudobulk_rna.output.sce, group_by="celltype")
917
    output:
918
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_pseudobulk.rds",
919
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_pseudobulk.rds"
920
    params:
921
        TFs_file = config["resources"]["TFs_file"],
922
        outdir = config["directories"]["results"]+"/rna/coexpression"
923
    conda:
924
        "environment.yaml"
925
    log: 
926
        "logs/coexpression_TF_vs_gene_{cor_test}_pseudobulk.log"
927
    threads: 
928
        config["slurm"]["coexpression_TF_vs_gene_pseudobulk"]["threads"]
929
    resources: 
930
        mem_mb = config["slurm"]["coexpression_TF_vs_gene_pseudobulk"]["memory"]
931
    shell:
932
        "Rscript {input.script} --sce {input.sce} --TFs_file {params.TFs_file} --cor_test {wildcards.cor_test} --outdir {params.outdir} > {log}"
933
934
rule coexpression_TF_vs_gene_pseudobulk_no_ExE: 
935
    input:
936
        script = config["scripts"]["coexpression_TF_vs_gene_pseudobulk"],
937
        sce = expand(rules.pseudobulk_rna.output.sce, group_by="celltype")
938
    output:
939
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_pseudobulk_no_ExE.rds",
940
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_pseudobulk_no_ExE.rds"
941
    params:
942
        TFs_file = config["resources"]["TFs_file"],
943
        outdir = config["directories"]["results"]+"/rna/coexpression"
944
    conda:
945
        "environment.yaml"
946
    log: 
947
        "logs/coexpression_TF_vs_gene_{cor_test}_pseudobulk_no_ExE.log"
948
    threads: 
949
        config["slurm"]["coexpression_TF_vs_gene_pseudobulk"]["threads"]
950
    resources: 
951
        mem_mb = config["slurm"]["coexpression_TF_vs_gene_pseudobulk"]["memory"]
952
    shell:
953
        "Rscript {input.script} --sce {input.sce} --TFs_file {params.TFs_file} --remove_ExE_cells --cor_test {wildcards.cor_test} --outdir {params.outdir} > {log}"
954
955
rule coexpression_TF_vs_gene_metacells: 
956
    input:
957
        script = config["scripts"]["coexpression_TF_vs_gene_metacells"],
958
        metadata = rules.aggregate_rna_metacells.output.metadata,
959
        sce = rules.aggregate_rna_metacells.output.sce
960
    output:
961
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2gene_metacells.rds",
962
        config["directories"]["results"]+"/rna/coexpression/correlation_matrix_{cor_test}_tf2tf_metacells.rds"
963
    params:
964
        TFs_file = config["resources"]["TFs_file"],
965
        outdir = config["directories"]["results"]+"/rna/coexpression"
966
    conda:
967
        "environment.yaml"
968
    log: 
969
        "logs/coexpression_TF_vs_gene_{cor_test}_metacells.log"
970
    threads: 
971
        config["slurm"]["coexpression_TF_vs_gene_metacells"]["threads"]
972
    resources: 
973
        mem_mb = config["slurm"]["coexpression_TF_vs_gene_metacells"]["memory"]
974
    shell:
975
        "Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --TFs_file {params.TFs_file} --cor_test {wildcards.cor_test} --outdir {params.outdir} > {log}"
976
977
rule coexpression_TF_vs_gene_metacells_trajectories: 
978
    input:
979
        script = config["scripts"]["coexpression_TF_vs_gene_metacells"],
980
        metadata = rules.aggregate_rna_metacells_trajectory.output.metadata,
981
        sce = rules.aggregate_rna_metacells_trajectory.output.sce
982
    output:
983
        config["directories"]["results"]+"/rna/coexpression/trajectories/{trajectory_metacells}/correlation_matrix_tf2gene_metacells.rds",
984
        config["directories"]["results"]+"/rna/coexpression/trajectories/{trajectory_metacells}/correlation_matrix_tf2tf_metacells.rds"
985
    params:
986
        TFs_file = config["resources"]["TFs_file"],
987
        outdir = config["directories"]["results"]+"/rna/coexpression/trajectories/{trajectory_metacells}"
988
    conda:
989
        "environment.yaml"
990
    log: 
991
        "logs/coexpression_TF_vs_gene_metacells_{trajectory_metacells}.log"
992
    threads: 
993
        config["slurm"]["coexpression_TF_vs_gene_metacells"]["threads"]
994
    resources: 
995
        mem_mb = config["slurm"]["coexpression_TF_vs_gene_metacells"]["memory"]
996
    shell:
997
        "Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --TFs_file {params.TFs_file} --outdir {params.outdir} > {log}"
998
999
1000
##################
1001
## Trajectories ##
1002
##################
1003
1004
# FIX CELL_METADATA
1005
1006
rule infer_trajectories: 
1007
    input:
1008
        script = config["scripts"]["infer_trajectories"],
1009
        # metadata = rules.parse_mapping_results.output, 
1010
        metadata = config["resources"]["cell_metadata"], # this is part of the ATAC pipeline
1011
        sce = rules.seurat_to_sce.output
1012
    output:
1013
        sce = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_SingleCellExperiment.rds",
1014
        adata = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_adata.h5ad",
1015
        txt = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_trajectory.txt.gz",
1016
        metadata = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/{trajectory_name}_sample_metadata.txt.gz"
1017
    params:
1018
        celltype_label = config["infer_trajectories"]["celltype_label"],
1019
        outdir = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}"
1020
    conda:
1021
        "environment.yaml"
1022
    log: 
1023
        "logs/infer_trajectories_{trajectory_name}.log"
1024
    threads: 
1025
        config["slurm"]["infer_trajectories"]["threads"]
1026
    resources:
1027
        mem_mb = config["slurm"]["infer_trajectories"]["memory"]        
1028
    shell:
1029
        "Rscript {input.script} --metadata {input.metadata} --sce {input.sce} --trajectory_name {wildcards.trajectory_name} --celltype_label {params.celltype_label} \
1030
        --outdir {params.outdir} > {log}"
1031
1032
rule coexpression_TF_vs_gene_trajectories: 
1033
    input:
1034
        script = config["scripts"]["coexpression_TF_vs_gene_single_cells"],
1035
        metadata = rules.infer_trajectories.output.metadata,
1036
        sce = rules.infer_trajectories.output.sce
1037
    output:
1038
        config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2gene_single_cells.rds",
1039
        config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2tf_single_cells.rds"
1040
    params:
1041
        TFs_file = config["resources"]["TFs_file"],
1042
        outdir = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}"
1043
    conda:
1044
        "environment.yaml"
1045
    log: 
1046
        "logs/coexpression_TF_vs_gene_{trajectory_name}_trajectory.log"
1047
    threads: 
1048
        config["slurm"]["coexpression_TF_vs_gene_single_cells"]["threads"]
1049
    resources: 
1050
        mem_mb = config["slurm"]["coexpression_TF_vs_gene_single_cells"]["memory"]
1051
    shell:
1052
        "Rscript {input.script} --metadata {input.metadata} --sce {input.sce} \
1053
        --TFs_file {params.TFs_file} --outdir {params.outdir} > {log}"
1054
1055
rule coexpression_TF_vs_gene_trajectories_denoised: 
1056
    input:
1057
        script = config["scripts"]["coexpression_TF_vs_gene_single_cells"],
1058
        metadata = rules.infer_trajectories.output.metadata,
1059
        sce = rules.infer_trajectories.output.sce
1060
    output:
1061
        config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2gene_single_cells_denoised.rds",
1062
        config["directories"]["results"]+"/rna/trajectories/{trajectory_name}/correlation_matrix_tf2tf_single_cells_denoised.rds"
1063
    params:
1064
        knn = 25,
1065
        TFs_file = config["resources"]["TFs_file"],
1066
        outdir = config["directories"]["results"]+"/rna/trajectories/{trajectory_name}"
1067
    conda:
1068
        "environment.yaml"
1069
    log: 
1070
        "logs/coexpression_TF_vs_gene_{trajectory_name}_trajectory.log"
1071
    threads: 
1072
        config["slurm"]["coexpression_TF_vs_gene_single_cells"]["threads"]
1073
    resources: 
1074
        mem_mb = config["slurm"]["coexpression_TF_vs_gene_single_cells"]["memory"]
1075
    shell:
1076
        "Rscript {input.script} --metadata {input.metadata} --sce {input.sce} \
1077
        --TFs_file {params.TFs_file} --denoise --knn {params.knn} --outdir {params.outdir} > {log}"
1078
1079
##############
1080
## Velocyto ##
1081
##############
1082
1083
# TO-DO: NEEDS A CONDA ENVIRONEMNT
1084
1085
rule run_velocyto: 
1086
    input:
1087
        # rules.samtools_sort.output,
1088
        mask = config["resources"]["mm10_mask"],
1089
        genes_gtf = config["resources"]["genes_gtf"]
1090
    output:
1091
        config["directories"]["original_data"] + "/{sample}/velocyto/{sample}.loom"
1092
    params:
1093
        input_dir = config["directories"]["original_data"],
1094
        output_folder = config["directories"]["original_data"] + "/{sample}/velocyto"
1095
    conda:
1096
        "environment.yaml"
1097
    log: 
1098
        "logs/run_velocyto_{sample}.log"
1099
    threads: 
1100
        config["slurm"]["run_velocyto"]["threads"]
1101
    resources: 
1102
        mem_mb = config["slurm"]["run_velocyto"]["memory"],
1103
        mem_mb_thread = int(config["slurm"]["run_velocyto"]["memory"] / (1.25*config["slurm"]["run_velocyto"]["threads"]))
1104
    shell:
1105
        "velocyto run -e {wildcards.sample} -@ {threads} --samtools-memory {resources.mem_mb_thread} -m {input.mask} -b {params.input_dir}/{wildcards.sample}/outs/barcodes.tsv.gz {params.input_dir}/{wildcards.sample}/outs/gex_possorted.bam {input.genes_gtf} -o {params.output_folder} "
1106
1107
rule create_anndata_velocyto: 
1108
    input:
1109
        loom_files = expand(rules.run_velocyto.output, sample=config["samples"]),
1110
        script = config["scripts"]["create_anndata_velocyto"],
1111
        metadata = rules.parse_mapping_results.output,
1112
        anndata = rules.convert_SingleCellExperiment_to_anndata.output
1113
    output:
1114
        config["directories"]["processed_data"] + "/velocyto/anndata_velocyto.h5ad"
1115
    params:
1116
        samples = config["samples"][:1],
1117
    conda:
1118
        "environment.yaml"
1119
    log: 
1120
        "logs/create_anndata_velocyto.log"
1121
    threads: 
1122
        config["slurm"]["create_anndata_velocyto"]["threads"]
1123
    resources: 
1124
        mem_mb = config["slurm"]["create_anndata_velocyto"]["memory"],
1125
    shell:
1126
        "python {input.script} --anndata {input.anndata} --metadata {input.metadata} --samples {params.samples} --outfile {output} > {log} "
1127
1128
################################################
1129
## Differential expression between cell types ##
1130
################################################
1131
1132
rule differential_celltype_cells: 
1133
    input:
1134
        script = config["scripts"]["differential_celltype_cells"],
1135
        metadata = rules.parse_mapping_results.output,
1136
        sce = rules.seurat_to_sce.output
1137
    output:
1138
        config["directories"]["results"]+"/rna/differential/cells/celltype/{celltypeA}_vs_{celltypeB}.txt.gz"
1139
    params:
1140
        min_cells = config["differential_celltype_cells"]["min_cells"]
1141
    log: 
1142
        "logs/differential_celltype_cells_{celltypeA}_vs_{celltypeB}.log"
1143
    threads: 
1144
        config["slurm"]["differential_cells"]["threads"]
1145
    resources:
1146
        mem_mb = config["slurm"]["differential_cells"]["memory"]
1147
    shell:
1148
        "Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --groupA {wildcards.celltypeA} --groupB {wildcards.celltypeB} \
1149
        --group_variable celltype --min_cells {params.min_cells} --outfile {output} > {log}"
1150
1151
rule parse_differential_celltype_cells: 
1152
    input:
1153
        expand(rules.differential_celltype_cells.output, filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]), # dummy wildcard assignment
1154
        script = config["scripts"]["parse_differential_celltype_cells"],
1155
    output:
1156
        results = config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/diff_expr_results.txt.gz",
1157
        stats = config["directories"]["results"]+"/rna/differential/cells/celltype/parsed/diff_expr_stats.txt.gz"
1158
    params:
1159
        diff_results_dir = config["directories"]["results"]+"/rna/differential/cells/celltype",
1160
        outdir = config["directories"]["results"]+"/rna/differential/cells/celltype/parsed",
1161
        min_cells = config["differential_celltype_cells"]["min_cells"]
1162
    log: 
1163
        "logs/parse_differential_celltype_cells.log"
1164
    threads: 
1165
        config["slurm"]["parse_differential_celltype"]["threads"]
1166
    resources:
1167
        mem_mb = config["slurm"]["parse_differential_celltype"]["memory"]
1168
    shell:
1169
        "Rscript {input.script} --diff_results_dir {params.diff_results_dir} --min_cells {params.min_cells} --outdir {params.outdir} > {log}"
1170
1171
rule differential_celltype_metacells: 
1172
    input:
1173
        script = config["scripts"]["differential_celltype_metacells"],
1174
        metadata = rules.aggregate_rna_metacells.output.metadata,
1175
        sce = rules.aggregate_rna_metacells.output.sce
1176
    output:
1177
        config["directories"]["results"]+"/rna/differential/metacells/celltype/{celltypeA}_vs_{celltypeB}.txt.gz"
1178
    params:
1179
        min_cells = config["differential_celltype_metacells"]["min_cells"]
1180
    log: 
1181
        "logs/differential_celltype_metacells_{celltypeA}_vs_{celltypeB}.log"
1182
    threads: 
1183
        config["slurm"]["differential_metacells"]["threads"]
1184
    resources:
1185
        mem_mb = config["slurm"]["differential_metacells"]["memory"]
1186
    shell:
1187
        "Rscript {input.script} --sce {input.sce} --metadata {input.metadata} --groupA {wildcards.celltypeA} --groupB {wildcards.celltypeB} \
1188
        --group_variable celltype --min_cells {params.min_cells} --outfile {output} > {log}"
1189
1190
rule parse_differential_celltype_metacells: 
1191
    input:
1192
        expand(rules.differential_celltype_metacells.output, filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]), # dummy wildcard assignment
1193
        script = config["scripts"]["parse_differential_celltype_metacells"]
1194
    output:
1195
        results = config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/diff_expr_results.txt.gz",
1196
        stats = config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed/diff_expr_stats.txt.gz"
1197
    params:
1198
        diff_results_dir = config["directories"]["results"]+"/rna/differential/metacells/celltype",
1199
        outdir = config["directories"]["results"]+"/rna/differential/metacells/celltype/parsed",
1200
        min_cells = config["differential_celltype_metacells"]["min_cells"]
1201
    log: 
1202
        "logs/parse_differential_celltype_metacells.log"
1203
    threads: 
1204
        config["slurm"]["parse_differential_celltype"]["threads"]
1205
    resources:
1206
        mem_mb = config["slurm"]["parse_differential_celltype"]["memory"]
1207
    shell:
1208
        "Rscript {input.script} --diff_results_dir {params.diff_results_dir} --min_cells {params.min_cells} --outdir {params.outdir} > {log}"
1209
1210
rule differential_celltype_pseudobulk: 
1211
    input:
1212
        script = config["scripts"]["differential_celltype_pseudobulk"],
1213
        sce = expand(rules.pseudobulk_rna_with_replicates.output.sce, group_by="celltype")
1214
    output:
1215
        config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/{celltypeA}_vs_{celltypeB}.txt.gz"
1216
    log: 
1217
        "logs/differential_celltype_pseudobulk_{celltypeA}_vs_{celltypeB}.log"
1218
    threads: 
1219
        config["slurm"]["differential_pseudobulk"]["threads"]
1220
    resources:
1221
        mem_mb = config["slurm"]["differential_pseudobulk"]["memory"]
1222
    shell:
1223
        "Rscript {input.script} --sce {input.sce} --groupA {wildcards.celltypeA} --groupB {wildcards.celltypeB} --outfile {output} > {log}"
1224
1225
rule parse_differential_celltype_pseudobulk: 
1226
    input:
1227
        expand(rules.differential_celltype_pseudobulk.output, filtered_product_celltypes, celltypeA=config["celltypes"][0], celltypeB=config["celltypes"]), # dummy wildcard assignment
1228
        script = config["scripts"]["parse_differential_celltype_pseudobulk"]
1229
    output:
1230
        config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/diff_expr_results.txt.gz"
1231
    params:
1232
        diff_results_dir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype",
1233
        outdir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed"
1234
    log: 
1235
        "logs/parse_differential_celltype_pseudobulk.log"
1236
    threads: 
1237
        config["slurm"]["parse_differential_celltype"]["threads"]
1238
    resources:
1239
        mem_mb = config["slurm"]["parse_differential_celltype"]["memory"]
1240
    shell:
1241
        "Rscript {input.script} --diff_results_dir {params.diff_results_dir} --outdir {params.outdir} > {log}"
1242
1243
###############################################
1244
## Differential expression between genotypes ##
1245
###############################################
1246
1247
rule differential_celltype_genotype_pseudobulk: 
1248
    input:
1249
        script = config["scripts"]["differential_celltype_genotype_pseudobulk"],
1250
        sce = expand(rules.pseudobulk_rna_with_replicates.output.sce, group_by="celltype_genotype")
1251
    output:
1252
        config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype/{diff_celltype_genotype}.txt.gz"
1253
    log: 
1254
        "logs/differential_celltype_genotype_pseudobulk_{diff_celltype_genotype}.log"
1255
    threads: 
1256
        config["slurm"]["differential_pseudobulk"]["threads"]
1257
    resources:
1258
        mem_mb = config["slurm"]["differential_pseudobulk"]["memory"]
1259
    shell:
1260
        "Rscript {input.script} --sce {input.sce} --celltypes {wildcards.diff_celltype_genotype} --groupA WT --groupB T_KO --outfile {output} > {log}"
1261
1262
rule parse_differential_celltype_genotype_pseudobulk: 
1263
    input:
1264
        expand(rules.differential_celltype_genotype_pseudobulk.output, diff_celltype_genotype=config["celltypes"]),
1265
        script = config["scripts"]["parse_differential_celltype_genotype_pseudobulk"]
1266
    output:
1267
        config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype/parsed/diff_expr_results.txt.gz"
1268
    params:
1269
        diff_results_dir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype_genotype"
1270
    log: 
1271
        "logs/parse_differential_celltype_genotype_pseudobulk.log"
1272
    threads: 
1273
        config["slurm"]["parse_differential_celltype"]["threads"]
1274
    resources:
1275
        mem_mb = config["slurm"]["parse_differential_celltype"]["memory"]
1276
    shell:
1277
        "Rscript {input.script} --diff_results_dir {params.diff_results_dir} --outfile {output} > {log}"
1278
1279
1280
############################################
1281
## Extract TFs from differential analysis ##
1282
############################################
1283
1284
rule differential_celltype_extract_TFs_cells: 
1285
    input:
1286
        diff_results = rules.parse_differential_celltype_cells.output.results,
1287
        script = config["scripts"]["differential_extract_TFs"],
1288
        TFs = config["resources"]["TFs_file"],
1289
    output:
1290
        config["directories"]["results"] + "/rna/differential/cells/celltype/parsed/diff_expr_results_tfs.txt.gz"
1291
    log: 
1292
        "logs/differential_celltype_extract_TFs_cells.log"
1293
    threads: 
1294
        config["slurm"]["differential_celltype_extract_TFs"]["threads"]
1295
    resources:
1296
        mem_mb = config["slurm"]["differential_celltype_extract_TFs"]["memory"]
1297
    shell:
1298
        "Rscript {input.script} --diff_results {input.diff_results} --TFs {input.TFs} --outfile {output} > {log}"
1299
1300
rule differential_celltype_extract_TFs_metacells: 
1301
    input:
1302
        diff_results = rules.parse_differential_celltype_metacells.output.results,
1303
        script = config["scripts"]["differential_extract_TFs"],
1304
        TFs = config["resources"]["TFs_file"],
1305
    output:
1306
        config["directories"]["results"] + "/rna/differential/metacells/celltype/parsed/diff_expr_results_tfs.txt.gz"
1307
    log: 
1308
        "logs/differential_celltype_extract_TFs_metacells.log"
1309
    threads: 
1310
        config["slurm"]["differential_celltype_extract_TFs"]["threads"]
1311
    resources:
1312
        mem_mb = config["slurm"]["differential_celltype_extract_TFs"]["memory"]
1313
    shell:
1314
        "Rscript {input.script} --diff_results {input.diff_results} --TFs {input.TFs} --outfile {output} > {log}"
1315
1316
rule differential_celltype_extract_TFs_pseudobulk: 
1317
    input:
1318
        diff_results = rules.parse_differential_celltype_pseudobulk.output,
1319
        script = config["scripts"]["differential_extract_TFs"],
1320
        TFs = config["resources"]["TFs_file"],
1321
    output:
1322
        config["directories"]["results"] + "/rna/differential/pseudobulk/celltype/parsed/diff_expr_results_tfs.txt.gz"
1323
    log: 
1324
        "logs/differential_celltype_extract_TFs_pseudobulk.log"
1325
    threads: 
1326
        config["slurm"]["differential_celltype_extract_TFs"]["threads"]
1327
    resources:
1328
        mem_mb = config["slurm"]["differential_celltype_extract_TFs"]["memory"]
1329
    shell:
1330
        "Rscript {input.script} --diff_results {input.diff_results} --TFs {input.TFs} --outfile {output} > {log}"
1331
1332
##################################
1333
## Define celltype marker genes ##
1334
##################################
1335
1336
rule define_marker_genes_pseudobulk: 
1337
    input:
1338
        script = config["scripts"]["define_marker_genes_pseudobulk"],
1339
        differential_results = rules.parse_differential_celltype_pseudobulk.output
1340
    output:
1341
        all_genes = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_genes_all.txt.gz",
1342
        filt_genes = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_genes_filtered.txt.gz"
1343
    params:
1344
        outdir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed",
1345
        min_fold_change = config["define_marker_genes"]["min_fold_change"],
1346
        min_score = config["define_marker_genes"]["min_score"],
1347
        fdr = config["define_marker_genes"]["fdr"]
1348
    log: 
1349
        "logs/define_marker_genes_pseudobulk.log"
1350
    threads: 
1351
        config["slurm"]["define_marker_genes"]["threads"]
1352
    resources:
1353
        mem_mb = config["slurm"]["define_marker_genes"]["memory"]
1354
    shell:
1355
        "Rscript {input.script} --differential_results {input.differential_results} --fdr {params.fdr} --min_score {params.min_score} --min_fold_change {params.min_fold_change} --outdir {params.outdir} > {log}"
1356
1357
rule define_marker_tfs_pseudobulk: 
1358
    input:
1359
        script = config["scripts"]["define_marker_TFs_pseudobulk"],
1360
        differential_results = rules.differential_celltype_extract_TFs_pseudobulk.output
1361
    output:
1362
        all_genes = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_tfs_all.txt.gz",
1363
        filt_genes = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed/marker_tfs_filtered.txt.gz"
1364
    params:
1365
        outdir = config["directories"]["results"]+"/rna/differential/pseudobulk/celltype/parsed",
1366
        min_fold_change = config["define_marker_genes"]["min_fold_change"],
1367
        min_score = config["define_marker_genes"]["min_score"],
1368
        fdr = config["define_marker_genes"]["fdr"]
1369
    log: 
1370
        "logs/define_marker_genes_pseudobulk.log"
1371
    threads: 
1372
        config["slurm"]["define_marker_genes"]["threads"]
1373
    resources:
1374
        mem_mb = config["slurm"]["define_marker_genes"]["memory"]
1375
    shell:
1376
        "Rscript {input.script} --differential_results {input.differential_results} --fdr {params.fdr} --min_score {params.min_score} --min_fold_change {params.min_fold_change} --outdir {params.outdir} > {log}"