[4c33d4]: / exseek / config / default_config.yaml

Download this file

142 lines (132 with data), 4.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# RNA types for sequential mapping in the small-RNA pipeline.
# The continuation line of the flow sequence is indented so that strict YAML
# parsers accept the multi-line value.
rna_types: [univec, rRNA, lncRNA, mature_miRNA, miRNA, mRNA, piRNA, snoRNA,
  snRNA, srpRNA, tRNA, tucpRNA, Y_RNA]
# A domain is treated as recurrent when it is called in a fraction of samples
# above this value
cov_threshold: 0.05
# Maximum number of features to select (given as a list; one value by default)
n_features_to_select: [10]
# Parameters for evaluation of features
evaluation_features:
  # Classifier used for evaluation
  classifier: logistic_regression
  # Classifier parameters, keyed by classifier name
  classifier_params:
    logistic_regression:
      penalty: l2
# Type of counts for feature selection. Documented options:
#   domains_combined: combine miRNA/piRNA with long RNA domains
#   transcript: transcript-level features
#   featurecounts: gene-level features counted using featureCounts
# NOTE(review): the default below (mirna_and_long_fragments) is not one of the
# options listed above - confirm the full set of valid values.
count_method: mirna_and_long_fragments
# Read counts below this value are considered low expression
filtercount: 5
# Threshold for filtering low expression features
filterexpv: 0
# Quantification method for the low expression filter
filtermethod: filtercount
# Keep features with high expression in a fraction of samples above this value
filtersample: 0.2
# Imputation methods to try (use the string 'null' to skip imputation)
#imputation_methods: ["viper_count", "null"]
imputation_method: ['null']
# Read depth normalization methods to try
normalization_method: ['TMM']
# Batch effect removal methods to try (use the string 'null' to skip batch
# effect removal)
batch_removal_method: ['ComBat']
# Column index of the batch effect in batch_info.txt to consider for ComBat
batch_index: 1
# Root directory
root_dir: '.'
# Directory holding sequences and annotations
genome_dir: 'genome/hg38'
# Temporary directory (used by e.g. samtools sort, sort)
temp_dir: 'tmp'
# Directory holding third-party tools
tools_dir: 'tools'
# Directory holding exSeek scripts
bin_dir: 'bin'
# Directory holding spike-in sequences and index
spikein_dir: 'genome/hg38/spikein'
# bin path to R
# r_dir: "/usr/bin"
# Input files are clean reads
input_clean_reads: false
# Number of threads for uncompression and compression
threads_compress: 1
# Default number of threads to use
threads: 1
# Alignment software to use (valid choices: bowtie, star)
# NOTE(review): the default below is bowtie2, which is not in the listed
# choices - confirm whether the choice list should read "bowtie2, star".
aligner: bowtie2
# 3'-end adaptor sequence to remove from single-end reads
adaptor: ''
# 5'-end adaptor sequence to remove from single-end reads
adaptor_5p: ''
# 3'-end adaptor sequence to remove from the first read in a pair
adaptor1: ''
# 3'-end adaptor sequence to remove from the second read in a pair
adaptor2: ''
# 5'-end adaptor sequence to remove from the first read in a pair
adaptor1_5p: ''
# 5'-end adaptor sequence to remove from the second read in a pair
adaptor2_5p: ''
# Exact number of bases to trim from the 5'-end
trim_5p: 0
# Exact number of bases to trim from the 3'-end
trim_3p: 0
# Trim the exact number of bases after adapter trimming
trim_after_adapter: false
# Discard reads of length below this value
min_read_length: 16
# Maximum read length
max_read_length: 100
# Trim bases with quality below this value from 3'-end
# NOTE(review): this description is identical to the one for
# min_base_quality_3p - confirm how the two settings differ.
min_base_quality: 30
# Trim bases with quality below this value from 5'-end
min_base_quality_5p: 30
# Trim bases with quality below this value from 3'-end
min_base_quality_3p: 30
# Quality encoding in FASTQ files
quality_base: 33
# Strandness (valid choices: forward, reverse, no)
# When choosing "no", write it quoted: a bare no is read as boolean false
# by YAML 1.1 parsers.
strandness: forward
# Filter out reads with mapping quality below this value
min_mapping_quality: 0
# Only consider the longest transcript for transcriptome mapping
use_longest_transcript: true
# Expected read length for mapping using STAR
# (the options below are grouped under star_genome_generate)
star_genome_generate:
  sjdbOverhang: 100
  limitGenomeGenerateRAM: 31000000000
# Number of threads for mapping
threads_mapping: 4
# Remove duplicates for long RNA-seq before feature counting
remove_duplicates_long: true
# Input reads are paired-end
paired_end: false
# Use the small RNA-seq pipeline (sequential mapping)
small_rna: true
# Remove UMI tags (leading nucleotides)
umi_tags: false
# Length of the UMI barcode
umi_length: 0
# Evaluate published biomarkers (empty list by default)
evaluate_features_preprocess_methods: []
# Differential expression method(s)
# Available methods: deseq2, edger_glmlrt, edger_glmqlf, edger_exact, wilcox
diffexp_method:
  - deseq2
  - edger_glmlrt
# Count multi-mapping reads
count_multimap_reads: true
# Count overlapping features
count_overlapping_features: true
# Base URL for the IGV web server
igv_base_url: 'http://127.0.0.1:5000'
# Configuration for containers (singularity / udocker)
container:
  singularity_path: singularity
  udocker_path: udocker
  image: singularity/exseek.simg
  wrapper_dir: singularity/wrappers
# Configuration for cluster jobs
cluster:
  # Command template for submitting a job to the cluster; the {cluster.*}
  # placeholders are kept verbatim inside the single-quoted string
  submit_command: 'bsub -q {cluster.queue} -J {cluster.name} -e {cluster.stderr} -o {cluster.stdout} -R {cluster.resources} -n {cluster.threads}'
  # Snakemake configuration file for cluster jobs
  config_file: config/cluster.yaml