68 lines (57 with data), 2.1 kB
// AWK_EXTRACT: keep the header line of a delimited text file plus every row whose
// value in a named column is one of a given set of values.
//
// Inputs:
//   meta         - map; meta.id is used for the task tag and the default output prefix
//   column_name  - header name of the column to filter on
//   values       - comma-separated string of accepted values (a collection is joined with ',')
//   meta2, data  - the delimited file to filter; meta2 is not used by the script
// Outputs:
//   extracted_data - [ meta, "<prefix>.<extension>" ], the extension being the last
//                    dot-separated part of the input file name
//   versions       - versions.yml recording the awk version
// Options:
//   task.ext.args.separator - field separator (only read when ext.args is a map);
//                             defaults to ',' for *.csv inputs and a tab otherwise
//   task.ext.prefix         - output file prefix, defaults to "<meta.id>.extracted"
// The task fails with exit status 1 if the input is empty or the column is not in the header.
process AWK_EXTRACT {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/gawk:5.3.0' :
        'biocontainers/gawk:5.3.0' }"

    input:
    tuple val(meta), val(column_name), val(values)
    tuple val(meta2), path(data)

    output:
    tuple val(meta), path("${prefix}.${extension}"), emit: extracted_data
    path("versions.yml")                           , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // ext.args may be unset or a plain string; only a map can carry a separator option.
    def args = task.ext.args ?: [:]
    def separator = (args instanceof Map && args.separator) ? args.separator : (data.name.endsWith('.csv') ? ',' : '\t')
    // Accept either a comma-separated string or a collection of values.
    def wanted_values = (values instanceof Collection) ? values.join(',') : values.toString()
    // Wrap a value in single quotes for bash, escaping any embedded single quote.
    def shellQuote = { value -> "'" + value.toString().replace("'", "'\\''") + "'" }
    def column_arg = shellQuote(column_name)
    def values_arg = shellQuote(wanted_values)
    def separator_arg = shellQuote(separator)
    // prefix and extension are deliberately not declared with def: the output block reads them.
    prefix = task.ext.prefix ?: "${meta.id}.extracted"
    extension = data.name.split("\\.").last()
    """
    # The column name and the wanted values reach awk through the environment rather
    # than -v, so backslashes in them are not rewritten by awk escape processing.
    COLUMN_NAME=${column_arg} \\
    WANTED_VALUES=${values_arg} \\
    awk -v FS=${separator_arg} '
        BEGIN {
            target = ENVIRON["COLUMN_NAME"]
            count = split(ENVIRON["WANTED_VALUES"], items, ",")
            for (i = 1; i <= count; i++) {
                wanted[items[i]] = 1
            }
        }
        # Header line: locate the column by comparing whole fields, then print the header.
        NR == 1 {
            for (i = 1; i <= NF; i++) {
                field = \$i
                # Tolerate a header field wrapped in double quotes.
                gsub(/^"|"\$/, "", field)
                # Appending "" forces a string comparison, never a numeric one.
                if ((field "") == (target "")) {
                    col = i
                    break
                }
            }
            if (!col) {
                printf "Column \\047%s\\047 not found in the header of %s.\\n", target, FILENAME > "/dev/stderr"
                exit 1
            }
            print
            next
        }
        # Data lines: keep the row when its value in the column is one of the wanted values.
        (\$col in wanted)
        END {
            if (NR == 0) {
                print "Input file is empty, no header to search." > "/dev/stderr"
                exit 1
            }
        }
    ' ${data} > ${prefix}.${extension}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
    END_VERSIONS
    """

    stub:
    prefix = task.ext.prefix ?: "${meta.id}.extracted"
    extension = data.name.split("\\.").last()
    """
    touch ${prefix}.${extension}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
    END_VERSIONS
    """
}