diff --git a/conf/modules.config b/conf/modules.config index 2dcf97c..f828bef 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,10 +18,29 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: 'CATPACK_ADDNAMES_BINS' { + ext.args = '--only_official' + publishDir = [ + path: { "${params.outdir}/${params.mode}/taxonomy" }, + mode: params.publish_dir_mode, + pattern: "*.txt", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CHECKM2_PREDICT' { + publishDir = [ + path: { "${params.outdir}/${params.mode}/checkm2" }, + mode: params.publish_dir_mode, + pattern: "*_checkm2_report.tsv", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'COVERM_GENOME' { ext.args = '--min-covered-fraction 0 --methods mean' publishDir = [ - path: { "${params.outdir}/coverage" }, + path: { "${params.outdir}/${params.mode}/coverage" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] diff --git a/conf/test.config b/conf/test.config index e69de29..bb8ae69 100644 --- a/conf/test.config +++ b/conf/test.config @@ -0,0 +1,7 @@ +process { + resourceLimits = [ + cpus: 2, + memory: '15.GB', + time: '1.h' + ] +} diff --git a/modules.json b/modules.json index e300169..158c21f 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,36 @@ "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", "installed_by": ["modules"] }, + "catpack/addnames": { + "branch": "master", + "git_sha": "1100099613c12e944931426f46f10b14c4a74b3d", + "installed_by": ["fasta_classify_catpack"] + }, + "catpack/bins": { + "branch": "master", + "git_sha": "1100099613c12e944931426f46f10b14c4a74b3d", + "installed_by": ["fasta_classify_catpack"] + }, + "catpack/contigs": { + "branch": "master", + "git_sha": "1100099613c12e944931426f46f10b14c4a74b3d", + "installed_by": ["fasta_classify_catpack"] + }, + "catpack/download": { + "branch": "master", + "git_sha": "1100099613c12e944931426f46f10b14c4a74b3d", + "installed_by": ["fasta_classify_catpack"] + }, + "catpack/prepare": { + "branch": "master", + "git_sha": "1100099613c12e944931426f46f10b14c4a74b3d", + "installed_by": ["fasta_classify_catpack"] + }, + "catpack/summarise": { + "branch": "master", + "git_sha": "1100099613c12e944931426f46f10b14c4a74b3d", + "installed_by": ["fasta_classify_catpack"] + }, "checkm2/databasedownload": { "branch": "master", "git_sha": "81470b59ebadb3d01dcfcd37d44f88eb890f4851", @@ -45,11 +75,22 @@ "branch": "master", "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", "installed_by": ["modules"] + }, + "untar": { + "branch": "master", + "git_sha": "447f7bc0fa41dfc2400c8cad4c0291880dc060cf", + "installed_by": ["fasta_classify_catpack"] } } }, "subworkflows": { "nf-core": { + "fasta_classify_catpack": { + "branch": "master", + "git_sha": "1e1d30d328949507ee5af79b7ff58b09738b8227", + "installed_by": ["subworkflows"], + "patch": 
"subworkflows/nf-core/fasta_classify_catpack/fasta_classify_catpack.diff" + }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", diff --git a/modules/local/rename_fasta_for_catpack/environment.yml b/modules/local/rename_fasta_for_catpack/environment.yml new file mode 100644 index 0000000..1eb8f53 --- /dev/null +++ b/modules/local/rename_fasta_for_catpack/environment.yml @@ -0,0 +1,5 @@ +--- +channels: + - conda-forge +dependencies: + - "conda-forge::gzip" diff --git a/modules/local/rename_fasta_for_catpack/main.nf b/modules/local/rename_fasta_for_catpack/main.nf new file mode 100644 index 0000000..4a2d089 --- /dev/null +++ b/modules/local/rename_fasta_for_catpack/main.nf @@ -0,0 +1,29 @@ +process RENAME_FASTA_FOR_CATPACK { + tag "${meta.id}" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("output/*.fasta"), emit: renamed_fasta + + script: + def is_compressed = fasta.name.endsWith('.gz') + extension = '.fasta' + def base = fasta.name + .replaceAll(/\.gz$/, '') + .replaceAll(/\.(fa|fasta|fna)$/, '') + def output_name = base + extension + + if (is_compressed) { + """ + mkdir -p output + gunzip -c ${fasta} > output/${output_name} + """ + } else { + """ + mkdir -p output + ln -s ../${fasta} output/${output_name} + """ + } +} diff --git a/modules/local/rename_fasta_for_catpack/meta.yml b/modules/local/rename_fasta_for_catpack/meta.yml new file mode 100644 index 0000000..167a6e0 --- /dev/null +++ b/modules/local/rename_fasta_for_catpack/meta.yml @@ -0,0 +1,47 @@ +name: "rename_fasta_for_catpack" +description: | + Renames a FASTA file (stripping the original extension and using .fasta) for compatibility with CAT/BAT (CATpack). Compressed inputs are decompressed; uncompressed inputs are symlinked. +keywords: + - fasta + - rename + - catpack + - cat +tools: + - "gzip": + description: "Standard compression/decompression utility, used here to decompress .gz FASTA files." 
+ homepage: "https://www.gnu.org/software/gzip/" + documentation: "https://www.gnu.org/software/gzip/manual/" + licence: ["GPL-3.0-or-later"] + identifier: null + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. `[ id:'sample1' ]` + - fasta: + type: file + description: | + FASTA file (compressed .gz or uncompressed). Any of .fa, .fna, or .fasta extensions are accepted. + pattern: "*.{fa,fna,fasta,fa.gz,fna.gz,fasta.gz}" + +output: + renamed_fasta: + - - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. `[ id:'sample1' ]` + - "output/*.fasta": + type: file + description: | + Renamed FASTA file with a .fasta extension, decompressed if the input was compressed. + pattern: "output/*.fasta" + +authors: + - "@KateSakharova" + - "@ochkalova" +maintainers: + - "@KateSakharova" + - "@ochkalova" diff --git a/modules/local/rename_fasta_for_catpack/tests/main.nf.test b/modules/local/rename_fasta_for_catpack/tests/main.nf.test new file mode 100644 index 0000000..d5c5a33 --- /dev/null +++ b/modules/local/rename_fasta_for_catpack/tests/main.nf.test @@ -0,0 +1,60 @@ +nextflow_process { + + name "Test Process RENAME_FASTA_FOR_CATPACK" + script "../main.nf" + process "RENAME_FASTA_FOR_CATPACK" + + tag "modules" + tag "rename_fasta_for_catpack" + + test("RENAME_FASTA_FOR_CATPACK - uncompressed fasta") { + + when { + process { + """ + input[0] = [ + [ id: 'test' ], + file("${moduleDir}/tests/test.fasta", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.renamed_fasta.size() == 1 }, + { assert process.out.renamed_fasta[0][1].toString().endsWith(".fasta") }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("RENAME_FASTA_FOR_CATPACK - compressed fasta") { + + when { + process { + """ + input[0] = [ + [ id: 'test_compressed' ], + file("${moduleDir}/tests/test_compressed.fa.gz", checkIfExists: true) + ] + """ + } + } 
+ + then { + assertAll( + { assert process.success }, + { assert process.out.renamed_fasta.size() == 1 }, + // compressed input should be decompressed and renamed to .fasta (not .fasta.gz) + { assert process.out.renamed_fasta[0][1].toString().endsWith(".fasta") }, + { assert !process.out.renamed_fasta[0][1].toString().endsWith(".fa.fasta") }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/local/rename_fasta_for_catpack/tests/test.fasta b/modules/local/rename_fasta_for_catpack/tests/test.fasta new file mode 100644 index 0000000..b533dcc --- /dev/null +++ b/modules/local/rename_fasta_for_catpack/tests/test.fasta @@ -0,0 +1,4 @@ +>contig_1 +ATGCATGCATGCATGCATGC +>contig_2 +TTTTGGGGCCCCAAAAT diff --git a/modules/local/rename_fasta_for_catpack/tests/test_compressed.fa.gz b/modules/local/rename_fasta_for_catpack/tests/test_compressed.fa.gz new file mode 100644 index 0000000..4df71f5 Binary files /dev/null and b/modules/local/rename_fasta_for_catpack/tests/test_compressed.fa.gz differ diff --git a/modules/nf-core/catpack/addnames/environment.yml b/modules/nf-core/catpack/addnames/environment.yml new file mode 100644 index 0000000..39264bf --- /dev/null +++ b/modules/nf-core/catpack/addnames/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::cat=6.0.1 diff --git a/modules/nf-core/catpack/addnames/main.nf b/modules/nf-core/catpack/addnames/main.nf new file mode 100644 index 0000000..ed3cd1b --- /dev/null +++ b/modules/nf-core/catpack/addnames/main.nf @@ -0,0 +1,46 @@ +process CATPACK_ADDNAMES { + tag "${meta.id}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/cat:6.0.1--hdfd78af_1' + : 'biocontainers/cat:6.0.1--hdfd78af_1'}" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(taxonomy) + + output: + tuple val(meta), path("${prefix}.txt"), emit: txt + tuple val("${task.process}"), val('catpack'), eval("CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'"), topic: versions, emit: versions_catpack + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + if ("${input}" == "${prefix}.txt") { + error("Input and output names are the same, set prefix in module configuration to disambiguate!") + } + """ + CAT_pack add_names \\ + ${args} \\ + -i ${input} \\ + -t ${taxonomy} \\ + -o ${prefix}.txt + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + echo "CAT_pack add_names \\ + ${args} \\ + -i ${input} \\ + -o ${prefix}.txt" + + touch ${prefix}.txt + """ +} diff --git a/modules/nf-core/catpack/addnames/meta.yml b/modules/nf-core/catpack/addnames/meta.yml new file mode 100644 index 0000000..86182a4 --- /dev/null +++ b/modules/nf-core/catpack/addnames/meta.yml @@ -0,0 +1,89 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "catpack_addnames" +description: Taxonomic classification of long DNA sequences and metagenome assembled + genomes (e.g. MAGs / bins). 
+keywords: + - taxonomic classification + - classification + - long reads + - mags + - assembly +tools: + - "catpack": + description: "CAT/BAT: tool for taxonomic classification of contigs and metagenome-assembled + genomes (MAGs)" + homepage: "https://github.com/MGXlab/CAT_pack" + documentation: "https://github.com/MGXlab/CAT_pack" + tool_dev_url: "https://github.com/MGXlab/CAT_pack" + doi: "10.1186/s13059-019-1817-x" + licence: ["MIT"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - input: + type: file + description: Classification or ORF2LCA output file from CAT/BAT/RAT + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" + + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - taxonomy: + type: directory + description: "Directory containing taxonomy files: names.dmp, nodes.dmp, acc2taxid.txt" + pattern: "*/" + ontologies: + - edam: "http://edamontology.org/data_1049" + +output: + txt: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + + - ${prefix}.txt: + type: file + description: | + CAT/BAT/RAT classification file with added taxonomic names + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" + + versions_catpack: + - - ${task.process}: + type: string + description: The process the versions were collected from + - catpack: + type: string + description: The tool name + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - "${task.process}": + type: string + description: The name of the process + - catpack: + type: string + description: The name of the tool + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/catpack/addnames/tests/main.nf.test b/modules/nf-core/catpack/addnames/tests/main.nf.test new file mode 100644 index 0000000..0e50115 --- /dev/null +++ b/modules/nf-core/catpack/addnames/tests/main.nf.test @@ -0,0 +1,84 @@ +nextflow_process { + + name "Test Process CATPACK_ADDNAMES" + script "../main.nf" + process "CATPACK_ADDNAMES" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "catpack" + tag "catpack/addnames" + tag "catpack/prepare" + tag "catpack/contigs" + + setup { + run('CATPACK_PREPARE') { + script '../../prepare/main.nf' + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] ] + input[1] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_nodes.dmp', checkIfExists: true) ] + input[2] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_names.dmp', checkIfExists: true) ] + input[3] = [ file(params.modules_testdata_base_path + 
'genomics/sarscov2/metagenome/prot.accession2taxid.gz', checkIfExists: true) ] + """ + } + } + + run('CATPACK_CONTIGS') { + script '../../contigs/main.nf' + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = [[:], []] + input[4] = [[:], []] + """ + } + } + } + + test("sarscov2 - genome - fasta") { + + when { + process { + """ + input[0] = CATPACK_CONTIGS.out.contig2classification + input[1] = CATPACK_PREPARE.out.taxonomy + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - genome -fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = CATPACK_CONTIGS.out.contig2classification + input[1] = CATPACK_PREPARE.out.taxonomy + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/catpack/addnames/tests/main.nf.test.snap b/modules/nf-core/catpack/addnames/tests/main.nf.test.snap new file mode 100644 index 0000000..9a2029b --- /dev/null +++ b/modules/nf-core/catpack/addnames/tests/main.nf.test.snap @@ -0,0 +1,84 @@ +{ + "sarscov2 - genome -fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "CATPACK_ADDNAMES", + "catpack", + "6.0" + ] + ], + "txt": [ + [ + { + "id": "test" + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_catpack": [ + [ + "CATPACK_ADDNAMES", + "catpack", + "6.0" + ] + ] + } + ], + "timestamp": "2026-03-08T13:21:37.033366607", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - genome - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.txt:md5,bf9914b7772f148d4ed2604a5eba680f" + ] + 
], + "1": [ + [ + "CATPACK_ADDNAMES", + "catpack", + "6.0" + ] + ], + "txt": [ + [ + { + "id": "test" + }, + "test.txt:md5,bf9914b7772f148d4ed2604a5eba680f" + ] + ], + "versions_catpack": [ + [ + "CATPACK_ADDNAMES", + "catpack", + "6.0" + ] + ] + } + ], + "timestamp": "2026-03-08T13:21:31.204783657", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/catpack/addnames/tests/nextflow.config b/modules/nf-core/catpack/addnames/tests/nextflow.config new file mode 100644 index 0000000..07f3352 --- /dev/null +++ b/modules/nf-core/catpack/addnames/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CATPACK_PREPARE { + ext.args = "--common_prefix database" + } +} diff --git a/modules/nf-core/catpack/bins/environment.yml b/modules/nf-core/catpack/bins/environment.yml new file mode 100644 index 0000000..39264bf --- /dev/null +++ b/modules/nf-core/catpack/bins/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::cat=6.0.1 diff --git a/modules/nf-core/catpack/bins/main.nf b/modules/nf-core/catpack/bins/main.nf new file mode 100644 index 0000000..d3d7239 --- /dev/null +++ b/modules/nf-core/catpack/bins/main.nf @@ -0,0 +1,58 @@ +process CATPACK_BINS { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/cat:6.0.1--hdfd78af_1' + : 'biocontainers/cat:6.0.1--hdfd78af_1'}" + + input: + tuple val(meta), path(bins, stageAs: 'bins/*') + tuple val(meta2), path(database) + tuple val(meta3), path(taxonomy) + tuple val(meta4), path(proteins) + tuple val(meta5), path(diamond_table) + val(bin_suffix) + + output: + tuple val(meta), path("*.ORF2LCA.txt"), emit: orf2lca + tuple val(meta), path("*.bin2classification.txt"), emit: bin2classification + tuple val(meta), path("*.log"), emit: log + tuple val(meta), path("*.diamond"), optional: true, emit: diamond + tuple val(meta), path("*.predicted_proteins.faa"), optional: true, emit: faa + tuple val(meta), path("*.gff"), optional: true, emit: gff + tuple val("${task.process}"), val('catpack'), eval("CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'"), topic: versions, emit: versions_catpack + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def premade_proteins = proteins ? "-p ${proteins}" : '' + def premade_table = diamond_table ? 
"--diamond_alignment ${diamond_table}" : '' + """ + CAT_pack bins \\ + -n ${task.cpus} \\ + -b bins/ \\ + -d ${database} \\ + -t ${taxonomy} \\ + -s ${bin_suffix} \\ + ${premade_proteins} \\ + ${premade_table} \\ + -o ${prefix} \\ + ${args} + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.ORF2LCA.txt + touch ${prefix}.bin2classification.txt + touch ${prefix}.log + touch ${prefix}.diamond + touch ${prefix}.predicted_proteins.faa + touch ${prefix}.predicted_proteins.gff + """ +} diff --git a/modules/nf-core/catpack/bins/meta.yml b/modules/nf-core/catpack/bins/meta.yml new file mode 100644 index 0000000..a4fd2c6 --- /dev/null +++ b/modules/nf-core/catpack/bins/meta.yml @@ -0,0 +1,188 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "catpack_bins" +description: Taxonomic classification of long DNA sequences and metagenome assembled + genomes (e.g. MAGs / bins). +keywords: + - taxonomic classification + - classification + - long reads + - mags + - assembly +tools: + - "catpack": + description: "CAT/BAT: tool for taxonomic classification of contigs and metagenome-assembled + genomes (MAGs)" + homepage: "https://github.com/MGXlab/CAT_pack" + documentation: "https://github.com/MGXlab/CAT_pack" + tool_dev_url: "https://github.com/MGXlab/CAT_pack" + doi: "10.1186/s13059-019-1817-x" + licence: ["MIT"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - bins: + type: file + description: One or more nucleotide FASTA files containing binned long DNA sequences. + pattern: "*.{fasta,fna,fa,fas}" + ontologies: + - edam: "http://edamontology.org/format_1929" + + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + + - database: + type: directory + description: Directory containing CAT_pack database files (e.g. output from + CAT_pack prepare) + pattern: "*/" + ontologies: + - edam: "http://edamontology.org/data_1049" + + - - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - taxonomy: + type: directory + description: Directory containing CAT_pack taxonomy files (e.g. output from + CAT_pack prepare) + pattern: "*/" + ontologies: + - edam: "http://edamontology.org/data_1049" + + - - meta4: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - proteins: + type: file + description: Optional pre-predicted proteins FASTA + pattern: "*.{fasta,faa,fa,fas}" + ontologies: + - edam: "http://edamontology.org/format_1929" + + - - meta5: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - diamond_table: + type: file + description: Optional pre-made DIAMOND alignment table + pattern: "*" + ontologies: + - edam: "http://edamontology.org/format_3751" + + - bin_suffix: + type: string + description: Suffix to search for in the input files when `bins` is a directory. + +output: + orf2lca: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.ORF2LCA.txt": + type: file + description: A TSV file with per-ORF hit stats and identified lineage + pattern: "*.ORF2LCA.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" + bin2classification: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - "*.bin2classification.txt": + type: file + description: A TSV file with per-bin hit stats and assignment justification + information + pattern: "*.bin2classification.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" + log: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.log": + type: file + description: Log file with run messages and basic statistics + ontologies: + - edam: "http://edamontology.org/format_2330" + diamond: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.diamond": + type: file + description: Intermediate DIAMOND TSV summary output file with alignment results + ontologies: + - edam: "http://edamontology.org/format_3475" + faa: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.predicted_proteins.faa": + type: file + description: FAA file of DIAMOND predicted proteins hits + ontologies: + - edam: "http://edamontology.org/format_3475" + gff: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - "*.gff": + type: file + description: GFF file of DIAMOND predicted proteins hits + ontologies: + - edam: "http://edamontology.org/format_2305" + versions_catpack: + - - ${task.process}: + type: string + description: The process the versions were collected from + - catpack: + type: string + description: The tool name + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - "${task.process}": + type: string + description: The name of the process + - catpack: + type: string + description: The name of the tool + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/catpack/bins/tests/main.nf.test b/modules/nf-core/catpack/bins/tests/main.nf.test new file mode 100644 index 0000000..d72239a --- /dev/null +++ b/modules/nf-core/catpack/bins/tests/main.nf.test @@ -0,0 +1,138 @@ +nextflow_process { + + name "Test Process CATPACK_BINS" + script "../main.nf" + process "CATPACK_BINS" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "catpack" + tag "catpack/bins" + tag "catpack/prepare" + tag "catpack/contigs" + + setup { + run('CATPACK_PREPARE') { + script '../../prepare/main.nf' + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] ] + input[1] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_nodes.dmp', checkIfExists: true) ] + input[2] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_names.dmp', checkIfExists: true) ] + input[3] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot.accession2taxid.gz', checkIfExists: true) ] + """ + } + } + } + + 
test("sarscov2 - genome - fasta") { + + when { + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = [[:], []] + input[4] = [[:], []] + input[5] = '.fasta' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot ( + process.out.orf2lca, + process.out.bin2classification, + process.out.diamond, + process.out.faa, + process.out.gff, + process.out.versions_catpack, + path(process.out.log.get(0).get(1)).readLines().last().contains("CAT is done!") + ).match() + } + ) + } + + } + + test("sarscov2 - genome - fasta - premade proteins") { + + setup { + run('CATPACK_CONTIGS') { + script '../../contigs/main.nf' + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = [[:], []] + input[4] = [[:], []] + """ + } + } + } + + when { + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = CATPACK_CONTIGS.out.faa + input[4] = [[:], []] + input[5] = '.fasta' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot ( + process.out.orf2lca, + process.out.bin2classification, + process.out.diamond, + process.out.gff, + process.out.versions_catpack, + path(process.out.log.get(0).get(1)).readLines().last().contains("CAT is done!") + ).match() + } + ) + } + + } + + test("sarscov2 - genome - fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', 
checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = [[:], []] + input[4] = [[:], []] + input[5] = '.fasta' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out + ).match() + } + ) + } + + } + +} diff --git a/modules/nf-core/catpack/bins/tests/main.nf.test.snap b/modules/nf-core/catpack/bins/tests/main.nf.test.snap new file mode 100644 index 0000000..760f213 --- /dev/null +++ b/modules/nf-core/catpack/bins/tests/main.nf.test.snap @@ -0,0 +1,224 @@ +{ + "sarscov2 - genome - fasta": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.ORF2LCA.txt:md5,27e3ca35dc7b977653b5bbf18076fc26" + ] + ], + [ + [ + { + "id": "test" + }, + "test.bin2classification.txt:md5,8cb8c364c1dea229f68e8c7a1747205d" + ] + ], + [ + [ + { + "id": "test" + }, + "test.concatenated.alignment.diamond:md5,9e2f9c188b183c18dd9572395a48a066" + ] + ], + [ + [ + { + "id": "test" + }, + "test.concatenated.predicted_proteins.faa:md5,1f8550f87d044d117422ca02827e4d18" + ] + ], + [ + [ + { + "id": "test" + }, + "test.concatenated.predicted_proteins.gff:md5,cb63331a0282175669107585cf4a66c1" + ] + ], + [ + [ + "CATPACK_BINS", + "catpack", + "6.0" + ] + ], + false + ], + "timestamp": "2026-03-08T13:21:42.728994553", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - genome - fasta - premade proteins": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.ORF2LCA.txt:md5,27e3ca35dc7b977653b5bbf18076fc26" + ] + ], + [ + [ + { + "id": "test" + }, + "test.bin2classification.txt:md5,8cb8c364c1dea229f68e8c7a1747205d" + ] + ], + [ + [ + { + "id": "test" + }, + "test.concatenated.alignment.diamond:md5,9e2f9c188b183c18dd9572395a48a066" + ] + ], + [ + + ], + [ + [ + "CATPACK_BINS", + "catpack", + "6.0" + ] + ], + false + ], + "timestamp": "2026-03-08T13:21:49.452713707", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - genome - fasta - stub": 
{ + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.ORF2LCA.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.bin2classification.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.diamond:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test.predicted_proteins.faa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "test" + }, + "test.predicted_proteins.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + "CATPACK_BINS", + "catpack", + "6.0" + ] + ], + "bin2classification": [ + [ + { + "id": "test" + }, + "test.bin2classification.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "diamond": [ + [ + { + "id": "test" + }, + "test.diamond:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "faa": [ + [ + { + "id": "test" + }, + "test.predicted_proteins.faa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gff": [ + [ + { + "id": "test" + }, + "test.predicted_proteins.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "orf2lca": [ + [ + { + "id": "test" + }, + "test.ORF2LCA.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_catpack": [ + [ + "CATPACK_BINS", + "catpack", + "6.0" + ] + ] + } + ], + "timestamp": "2026-03-08T13:21:54.940068383", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/catpack/bins/tests/nextflow.config b/modules/nf-core/catpack/bins/tests/nextflow.config new file mode 100644 index 0000000..c1ac485 --- /dev/null +++ b/modules/nf-core/catpack/bins/tests/nextflow.config @@ -0,0 +1,13 @@ +process { + withName: CATPACK_PREPARE { + ext.args = "--common_prefix database" + } + + withName: CATPACK_BINS { 
+ ext.args = "--bin_suffix .fasta" + } + + withName: CATPACK_ADDNAMES { + ext.args = "--only_official" + } +} diff --git a/modules/nf-core/catpack/contigs/environment.yml b/modules/nf-core/catpack/contigs/environment.yml new file mode 100644 index 0000000..39264bf --- /dev/null +++ b/modules/nf-core/catpack/contigs/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::cat=6.0.1 diff --git a/modules/nf-core/catpack/contigs/main.nf b/modules/nf-core/catpack/contigs/main.nf new file mode 100644 index 0000000..76c6763 --- /dev/null +++ b/modules/nf-core/catpack/contigs/main.nf @@ -0,0 +1,56 @@ +process CATPACK_CONTIGS { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/cat:6.0.1--hdfd78af_1' + : 'biocontainers/cat:6.0.1--hdfd78af_1'}" + + input: + tuple val(meta), path(contigs) + tuple val(meta2), path(database) + tuple val(meta3), path(taxonomy) + tuple val(meta4), path(proteins) + tuple val(meta5), path(diamond_table) + + output: + tuple val(meta), path("*.ORF2LCA.txt"), emit: orf2lca + tuple val(meta), path("*.contig2classification.txt"), emit: contig2classification + tuple val(meta), path("*.log"), emit: log + tuple val(meta), path("*.diamond"), optional: true, emit: diamond + tuple val(meta), path("*.predicted_proteins.faa"), optional: true, emit: faa + tuple val(meta), path("*.gff"), optional: true, emit: gff + tuple val("${task.process}"), val('catpack'), eval("CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'"), topic: versions, emit: versions_catpack + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def 
premade_proteins = proteins ? "--proteins_fasta ${proteins}" : '' + def premade_table = diamond_table ? "--diamond_alignment ${diamond_table}" : '' + """ + CAT_pack contigs \\ + --nproc ${task.cpus} \\ + --contigs_fasta ${contigs} \\ + --database_folder ${database} \\ + --taxonomy_folder ${taxonomy} \\ + --out_prefix ${prefix} \\ + ${premade_proteins} \\ + ${premade_table} \\ + ${args} + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.ORF2LCA.txt + touch ${prefix}.contig2classification.txt + touch ${prefix}.log + touch ${prefix}.diamond + touch ${prefix}.predicted_proteins.faa + touch ${prefix}.predicted_proteins.gff + """ +} diff --git a/modules/nf-core/catpack/contigs/meta.yml b/modules/nf-core/catpack/contigs/meta.yml new file mode 100644 index 0000000..6059790 --- /dev/null +++ b/modules/nf-core/catpack/contigs/meta.yml @@ -0,0 +1,184 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "catpack_contigs" +description: Taxonomic classification of long DNA sequences and metagenome assembled + genomes (e.g. contigs, MAGs / bins). +keywords: + - taxonomic classification + - classification + - long reads + - mags + - assembly +tools: + - "catpack": + description: "CAT/BAT: tool for taxonomic classification of contigs and metagenome-assembled + genomes (MAGs)" + homepage: "https://github.com/MGXlab/CAT_pack" + documentation: "https://github.com/MGXlab/CAT_pack" + tool_dev_url: "https://github.com/MGXlab/CAT_pack" + doi: "10.1186/s13059-019-1817-x" + licence: ["MIT"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - contigs: + type: file + description: A nucleotide FASTA file containing long DNA sequences such as contigs. 
+ pattern: "*.{fasta,fna,fa,fas}" + ontologies: + - edam: "http://edamontology.org/format_1929" + + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - database: + type: directory + description: Directory containing CAT_pack database files (e.g. output from + CAT_pack prepare) + pattern: "*/" + ontologies: + - edam: "http://edamontology.org/data_1049" + + - - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - taxonomy: + type: directory + description: Directory containing CAT_pack taxonomy files (e.g. output from + CAT_pack prepare) + pattern: "*/" + ontologies: + - edam: "http://edamontology.org/data_1049" + + - - meta4: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - proteins: + type: directory + description: Optional pre-predicted proteins FASTA + pattern: "*.{fasta,faa,fa,fas}" + ontologies: + - edam: "http://edamontology.org/format_1929" + + - - meta5: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - diamond_table: + type: directory + description: Optional pre-made DIAMOND alignment table + pattern: "*" + ontologies: + - edam: "http://edamontology.org/format_3751" + +output: + orf2lca: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
`[ id:'sample1', single_end:false ]` + - "*.contig2classification.txt": + type: file + description: A TSV file with per-contig hit stats and assignment justification + information + pattern: "*.contig2classification.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" + log: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.log": + type: file + description: Log file with run messages and basic statistics + ontologies: + - edam: "http://edamontology.org/format_2330" + diamond: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.diamond": + type: file + description: Intermediate DIAMOND TSV summary output file with alignment results + ontologies: + - edam: "http://edamontology.org/format_3475" + faa: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.predicted_proteins.faa": + type: file + description: FAA file of DIAMOND predicted proteins hits + ontologies: + - edam: "http://edamontology.org/format_3475" + gff: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - "*.gff": + type: file + description: GFF file of DIAMOND predicted proteins hits + ontologies: + - edam: "http://edamontology.org/format_2305" + versions_catpack: + - - ${task.process}: + type: string + description: The process the versions were collected from + - catpack: + type: string + description: The tool name + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - "${task.process}": + type: string + description: The name of the process + - catpack: + type: string + description: The name of the tool + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/catpack/contigs/tests/main.nf.test b/modules/nf-core/catpack/contigs/tests/main.nf.test new file mode 100644 index 0000000..e1569d2 --- /dev/null +++ b/modules/nf-core/catpack/contigs/tests/main.nf.test @@ -0,0 +1,88 @@ +nextflow_process { + + name "Test Process CATPACK_CONTIGS" + script "../main.nf" + process "CATPACK_CONTIGS" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "catpack" + tag "catpack/contigs" + tag "catpack/prepare" + + setup { + run('CATPACK_PREPARE') { + script '../../prepare/main.nf' + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] ] + input[1] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_nodes.dmp', checkIfExists: true) ] + input[2] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_names.dmp', checkIfExists: true) ] + input[3] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot.accession2taxid.gz', checkIfExists: true) ] + """ + } + } + } + + 
test("sarscov2 - genome - fasta") { + + when { + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = [[:], []] + input[4] = [[:], []] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot ( + process.out.orf2lca, + process.out.contig2classification, + process.out.diamond, + process.out.faa, + process.out.gff, + process.out.versions_catpack, + path(process.out.log.get(0).get(1)).readLines().last().contains("CAT is done!") + ).match() + } + ) + } + + } + + test("sarscov2 - genome - fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = [[:], []] + input[4] = [[:], []] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out + ).match() + } + ) + } + + } + +} diff --git a/modules/nf-core/catpack/contigs/tests/main.nf.test.snap b/modules/nf-core/catpack/contigs/tests/main.nf.test.snap new file mode 100644 index 0000000..f8b074d --- /dev/null +++ b/modules/nf-core/catpack/contigs/tests/main.nf.test.snap @@ -0,0 +1,180 @@ +{ + "sarscov2 - genome - fasta": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.ORF2LCA.txt:md5,a623b47f20751db12ce18c9ca7ac2536" + ] + ], + [ + [ + { + "id": "test" + }, + "test.contig2classification.txt:md5,3c3c79045bf6ae8b1292ae9afa2ec4af" + ] + ], + [ + [ + { + "id": "test" + }, + "test.alignment.diamond:md5,9e2f9c188b183c18dd9572395a48a066" + ] + ], + [ + [ + { + "id": "test" + }, + "test.predicted_proteins.faa:md5,1f8550f87d044d117422ca02827e4d18" + ] + ], + [ + [ + { + "id": "test" + }, + 
"test.predicted_proteins.gff:md5,4fc7a311726723ce5b17cede1dd1059c" + ] + ], + [ + [ + "CATPACK_CONTIGS", + "catpack", + "6.0" + ] + ], + true + ], + "timestamp": "2026-03-08T13:22:00.69823573", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - genome - fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.ORF2LCA.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.contig2classification.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.diamond:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test.predicted_proteins.faa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "test" + }, + "test.predicted_proteins.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + "CATPACK_CONTIGS", + "catpack", + "6.0" + ] + ], + "contig2classification": [ + [ + { + "id": "test" + }, + "test.contig2classification.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "diamond": [ + [ + { + "id": "test" + }, + "test.diamond:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "faa": [ + [ + { + "id": "test" + }, + "test.predicted_proteins.faa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gff": [ + [ + { + "id": "test" + }, + "test.predicted_proteins.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "orf2lca": [ + [ + { + "id": "test" + }, + "test.ORF2LCA.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_catpack": [ + [ + "CATPACK_CONTIGS", + "catpack", + "6.0" + ] + ] + } + ], + "timestamp": "2026-03-08T13:22:06.153850855", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git 
a/modules/nf-core/catpack/contigs/tests/nextflow.config b/modules/nf-core/catpack/contigs/tests/nextflow.config new file mode 100644 index 0000000..07f3352 --- /dev/null +++ b/modules/nf-core/catpack/contigs/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CATPACK_PREPARE { + ext.args = "--common_prefix database" + } +} diff --git a/modules/nf-core/catpack/download/environment.yml b/modules/nf-core/catpack/download/environment.yml new file mode 100644 index 0000000..39264bf --- /dev/null +++ b/modules/nf-core/catpack/download/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::cat=6.0.1 diff --git a/modules/nf-core/catpack/download/main.nf b/modules/nf-core/catpack/download/main.nf new file mode 100644 index 0000000..688ee80 --- /dev/null +++ b/modules/nf-core/catpack/download/main.nf @@ -0,0 +1,53 @@ +process CATPACK_DOWNLOAD { + tag "${meta.id}" + label 'process_single' + label 'process_long' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/cat:6.0.1--hdfd78af_1' + : 'biocontainers/cat:6.0.1--hdfd78af_1'}" + + input: + tuple val(meta), val(db) + + output: + tuple val(meta), path("${prefix}/*.${db}.gz"), emit: fasta + tuple val(meta), path("${prefix}/*.names.dmp"), emit: names + tuple val(meta), path("${prefix}/*.nodes.dmp"), emit: nodes + tuple val(meta), path("${prefix}/*accession2taxid*.gz"), emit: acc2tax + tuple val(meta), path("${prefix}/*.log"), emit: log + tuple val("${task.process}"), val('catpack'), eval("CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'"), topic: versions, emit: versions_catpack + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + CAT_pack \\ + download \\ + ${args} \\ + --db ${db} \\ + -o ${prefix}/ + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + echo "CAT_pack \\ + download \\ + ${args} \\ + --db ${db} + -o ${prefix}/" + + mkdir ${prefix}/ + echo "" | gzip > ${prefix}/${prefix}.${db}.gz + touch ${prefix}/${prefix}.names.dmp + touch ${prefix}/${prefix}.nodes.dmp + echo "" | gzip > ${prefix}/${prefix}.accession2taxid.gz + touch ${prefix}/${prefix}.log + """ +} diff --git a/modules/nf-core/catpack/download/meta.yml b/modules/nf-core/catpack/download/meta.yml new file mode 100644 index 0000000..6aa97e6 --- /dev/null +++ b/modules/nf-core/catpack/download/meta.yml @@ -0,0 +1,117 @@ +name: "catpack_download" +description: Downloads the required files for either Nr or GTDB for building into + a CAT database +keywords: + - taxonomic classification + - classification + - database + - download +tools: + - "catpack": + description: "CAT/BAT: tool for taxonomic classification of contigs and metagenome-assembled + genomes (MAGs)" + homepage: "https://github.com/MGXlab/CAT_pack" + documentation: "https://github.com/MGXlab/CAT_pack" + tool_dev_url: "https://github.com/MGXlab/CAT_pack" + doi: 
"10.1186/s13059-019-1817-x" + licence: ["MIT"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - db: + type: string + description: Which database to download + pattern: "nr|GTDB" + +output: + fasta: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - ${prefix}/*.${db}.gz: + type: file + description: FASTA file containing all the NCBI NR or GTDB sequences + pattern: "*.${db}.gz" + ontologies: + - edam: "http://edamontology.org/format_1929" + names: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - ${prefix}/*.names.dmp: + type: file + description: NCBI taxonomy-style names.dmp text file + pattern: "*.names.dmp" + ontologies: + - edam: "http://edamontology.org/format_1964" + nodes: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - ${prefix}/*.nodes.dmp: + type: file + description: NCBI taxonomy-style nodes.dmp text file + pattern: "*.nodes.dmp" + ontologies: + - edam: "http://edamontology.org/format_1964" + acc2tax: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - ${prefix}/*accession2taxid*.gz: + type: file + description: NCBI taxonomy names accession to taxonomy file + pattern: "*accession2taxid*" + ontologies: + - edam: "http://edamontology.org/format_1964" + log: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - ${prefix}/*.log: + type: file + description: Log file of the download process + pattern: "*.log" + ontologies: + - edam: "http://edamontology.org/format_1964" + + versions_catpack: + - - ${task.process}: + type: string + description: The process the versions were collected from + - catpack: + type: string + description: The tool name + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - "${task.process}": + type: string + description: The name of the process + - catpack: + type: string + description: The name of the tool + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/catpack/download/tests/main.nf.test b/modules/nf-core/catpack/download/tests/main.nf.test new file mode 100644 index 0000000..cc9aae7 --- /dev/null +++ b/modules/nf-core/catpack/download/tests/main.nf.test @@ -0,0 +1,37 @@ +nextflow_process { + + name "Test Process CATPACK_DOWNLOAD" + script "../main.nf" + process "CATPACK_DOWNLOAD" + + tag "modules" + tag "modules_nfcore" + tag "catpack" + tag "catpack/download" + + // Only stub because module downloads extremely large + test("nr - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'nr', single_end:false ], // meta map + 'nr', + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/catpack/download/tests/main.nf.test.snap b/modules/nf-core/catpack/download/tests/main.nf.test.snap new file mode 100644 index 0000000..7cd33be --- /dev/null +++ b/modules/nf-core/catpack/download/tests/main.nf.test.snap @@ -0,0 +1,117 @@ +{ + "nr - stub": { + "content": [ + { + "0": [ + [ + { + "id": "nr", + 
"single_end": false + }, + "nr.nr.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "nr", + "single_end": false + }, + "nr.names.dmp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "nr", + "single_end": false + }, + "nr.nodes.dmp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "nr", + "single_end": false + }, + "nr.accession2taxid.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "nr", + "single_end": false + }, + "nr.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + "CATPACK_DOWNLOAD", + "catpack", + "6.0" + ] + ], + "acc2tax": [ + [ + { + "id": "nr", + "single_end": false + }, + "nr.accession2taxid.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "fasta": [ + [ + { + "id": "nr", + "single_end": false + }, + "nr.nr.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "log": [ + [ + { + "id": "nr", + "single_end": false + }, + "nr.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "names": [ + [ + { + "id": "nr", + "single_end": false + }, + "nr.names.dmp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "nodes": [ + [ + { + "id": "nr", + "single_end": false + }, + "nr.nodes.dmp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_catpack": [ + [ + "CATPACK_DOWNLOAD", + "catpack", + "6.0" + ] + ] + } + ], + "timestamp": "2026-03-08T13:22:08.84438062", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/catpack/prepare/environment.yml b/modules/nf-core/catpack/prepare/environment.yml new file mode 100644 index 0000000..39264bf --- /dev/null +++ b/modules/nf-core/catpack/prepare/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::cat=6.0.1 diff --git a/modules/nf-core/catpack/prepare/main.nf 
b/modules/nf-core/catpack/prepare/main.nf new file mode 100644 index 0000000..b058d96 --- /dev/null +++ b/modules/nf-core/catpack/prepare/main.nf @@ -0,0 +1,52 @@ +process CATPACK_PREPARE { + tag "${meta.id}" + label 'process_medium' + label 'process_long' + label 'process_high_memory' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/cat:6.0.1--hdfd78af_1' + : 'biocontainers/cat:6.0.1--hdfd78af_1'}" + + input: + tuple val(meta), path(db_fasta) + path names + path nodes + path acc2tax + + output: + tuple val(meta), path("${prefix}/db/"), emit: db + tuple val(meta), path("${prefix}/tax/"), emit: taxonomy + tuple val("${task.process}"), val('catpack'), eval("CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'"), topic: versions, emit: versions_catpack + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + CAT_pack prepare \\ + -n ${task.cpus} \\ + --db_fasta ${db_fasta} \\ + --names ${names} \\ + --nodes ${nodes} \\ + --acc2tax ${acc2tax} \\ + --db_dir ${prefix}/ \\ + ${args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch database.log + mkdir -p ${prefix}/db + touch ${prefix}/db/database.dmnd + touch ${prefix}/db/database.fastaid2LCAtaxid + touch ${prefix}/db/database.taxids_with_multiple_offspring + mkdir -p ${prefix}/tax + touch ${prefix}/tax/nodes.dmp + touch ${prefix}/tax/names.dmp + """ +} diff --git a/modules/nf-core/catpack/prepare/meta.yml b/modules/nf-core/catpack/prepare/meta.yml new file mode 100644 index 0000000..119215e --- /dev/null +++ b/modules/nf-core/catpack/prepare/meta.yml @@ -0,0 +1,102 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "catpack_prepare" +description: Creates a CAT_pack database based on input 
FASTAs +keywords: + - catpack + - cat + - prepare + - database + - profiling + - build +tools: + - "catpack": + description: "CAT/BAT: tool for taxonomic classification of contigs and metagenome-assembled + genomes (MAGs)" + homepage: "https://github.com/MGXlab/CAT_pack" + documentation: "https://github.com/MGXlab/CAT_pack" + tool_dev_url: "https://github.com/MGXlab/CAT_pack" + doi: "10.1186/s13059-019-1817-x" + licence: ["MIT"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - db_fasta: + type: file + description: A FASTA file containing all sequences to be included in the database + pattern: "*.{fasta,fa,fna}" + ontologies: + - edam: "http://edamontology.org/format_1929" + - names: + type: file + description: An NCBI taxonomy-style names text file + pattern: "*" + ontologies: + - edam: "http://edamontology.org/format_1964" + - nodes: + type: file + description: An NCBI taxonomy-style nodes text file + pattern: "*" + ontologies: + - edam: "http://edamontology.org/format_1964" + - acc2tax: + type: file + description: An NCBI taxonomy names accession to taxonomy file + pattern: "*" + ontologies: + - edam: "http://edamontology.org/format_1964" + +output: + db: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - ${prefix}/db/: + type: directory + description: Directory containing CAT database files + pattern: "${db}/" + ontologies: + - edam: "http://edamontology.org/data_1049" + taxonomy: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - ${prefix}/tax/: + type: directory + description: Directory containing CAT prepared taxonomy database files + pattern: "${db}/" + ontologies: + - edam: "http://edamontology.org/data_1049" + versions_catpack: + - - ${task.process}: + type: string + description: The process the versions were collected from + - catpack: + type: string + description: The tool name + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - "${task.process}": + type: string + description: The name of the process + - catpack: + type: string + description: The name of the tool + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/catpack/prepare/tests/main.nf.test b/modules/nf-core/catpack/prepare/tests/main.nf.test new file mode 100644 index 0000000..5f62b94 --- /dev/null +++ b/modules/nf-core/catpack/prepare/tests/main.nf.test @@ -0,0 +1,68 @@ +nextflow_process { + + name "Test Process CATPACK_PREPARE" + script "../main.nf" + process "CATPACK_PREPARE" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "catpack" + tag "catpack/prepare" + + test("metagenome -sarscov2 - fasta") { + + when { + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] ] + input[1] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_nodes.dmp', checkIfExists: true) ] + input[2] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_names.dmp', checkIfExists: true) ] + input[3] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot.accession2taxid.gz', checkIfExists: true) ] + """ + } + } + + then { + def 
stablefiles = [] + file(process.out.db.get(0).get(1)).eachFileRecurse{ file -> if (!file.isDirectory() && !["database.dmnd", "database.log", "database.fastaid2LCAtaxid", "database.taxids_with_multiple_offspring"].find {file.toString().endsWith(it)}) {stablefiles.add(file)} } + def unstablefiles = [] + file(process.out.db.get(0).get(1)).eachFileRecurse{ file -> if (["database.dmnd", "database.log", "database.fastaid2LCAtaxid", "database.taxids_with_multiple_offspring"].find {file.toString().endsWith(it)}) {unstablefiles.add(file.getName().toString())} } + assertAll( + { assert process.success }, + { assert snapshot( + stablefiles.sort(), + unstablefiles.sort(), + process.out.taxonomy, + process.out.versions_catpack + ).match() } + ) + } + + } + + test("metagenome -sarscov2 - fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] ] + input[1] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_nodes.dmp', checkIfExists: true) ] + input[2] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_names.dmp', checkIfExists: true) ] + input[3] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot.accession2taxid.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/catpack/prepare/tests/main.nf.test.snap b/modules/nf-core/catpack/prepare/tests/main.nf.test.snap new file mode 100644 index 0000000..4e979ad --- /dev/null +++ b/modules/nf-core/catpack/prepare/tests/main.nf.test.snap @@ -0,0 +1,108 @@ +{ + "metagenome -sarscov2 - fasta": { + "content": [ + [ + + ], + [ + "database.dmnd", + "database.fastaid2LCAtaxid", + "database.taxids_with_multiple_offspring" + ], + [ + [ + { + "id": "test" + }, + [ + 
"names.dmp:md5,c471c27a4ce85ae74d2c63633c9ce1e3", + "nodes.dmp:md5,130f9132095562e09c732679c562f5e9" + ] + ] + ], + [ + [ + "CATPACK_PREPARE", + "catpack", + "6.0" + ] + ] + ], + "timestamp": "2026-03-08T13:23:36.040383644", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "metagenome -sarscov2 - fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "database.dmnd:md5,d41d8cd98f00b204e9800998ecf8427e", + "database.fastaid2LCAtaxid:md5,d41d8cd98f00b204e9800998ecf8427e", + "database.taxids_with_multiple_offspring:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + { + "id": "test" + }, + [ + "names.dmp:md5,d41d8cd98f00b204e9800998ecf8427e", + "nodes.dmp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "2": [ + [ + "CATPACK_PREPARE", + "catpack", + "6.0" + ] + ], + "db": [ + [ + { + "id": "test" + }, + [ + "database.dmnd:md5,d41d8cd98f00b204e9800998ecf8427e", + "database.fastaid2LCAtaxid:md5,d41d8cd98f00b204e9800998ecf8427e", + "database.taxids_with_multiple_offspring:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "taxonomy": [ + [ + { + "id": "test" + }, + [ + "names.dmp:md5,d41d8cd98f00b204e9800998ecf8427e", + "nodes.dmp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions_catpack": [ + [ + "CATPACK_PREPARE", + "catpack", + "6.0" + ] + ] + } + ], + "timestamp": "2026-03-08T13:23:40.348857324", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/catpack/prepare/tests/nextflow.config b/modules/nf-core/catpack/prepare/tests/nextflow.config new file mode 100644 index 0000000..07f3352 --- /dev/null +++ b/modules/nf-core/catpack/prepare/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CATPACK_PREPARE { + ext.args = "--common_prefix database" + } +} diff --git a/modules/nf-core/catpack/summarise/environment.yml b/modules/nf-core/catpack/summarise/environment.yml new file mode 100644 index 0000000..39264bf --- 
/dev/null +++ b/modules/nf-core/catpack/summarise/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::cat=6.0.1 diff --git a/modules/nf-core/catpack/summarise/main.nf b/modules/nf-core/catpack/summarise/main.nf new file mode 100644 index 0000000..e1e977c --- /dev/null +++ b/modules/nf-core/catpack/summarise/main.nf @@ -0,0 +1,52 @@ +process CATPACK_SUMMARISE { + tag "${meta.id}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/cat:6.0.1--hdfd78af_1' + : 'biocontainers/cat:6.0.1--hdfd78af_1'}" + + input: + tuple val(meta), path(classification) + tuple val(meta2), path(contigs) + + output: + tuple val(meta), path("*.txt"), emit: txt + tuple val("${task.process}"), val('catpack'), eval("CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'"), topic: versions, emit: versions_catpack + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("${classification}" == "${prefix}.txt") { + error("Input and output names are the same, set prefix in module configuration to disambiguate!") + } + def insert_contigs = contigs ? "-c ${contigs}" : '' + """ + CAT_pack summarise \\ + ${args} \\ + -i ${classification} \\ + ${insert_contigs} \\ + -o ${prefix}.txt + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("${classification}" == "${prefix}.txt") { + error("Input and output names are the same, set prefix in module configuration to disambiguate!") + } + def insert_contigs = contigs ? 
"-c ${contigs}" : '' + """ + echo "CAT_pack summarise \\ + ${args} \\ + -i ${classification} \\ + ${insert_contigs} \\ + -o ${prefix}.txt" + + touch ${prefix}.txt + """ +} diff --git a/modules/nf-core/catpack/summarise/meta.yml b/modules/nf-core/catpack/summarise/meta.yml new file mode 100644 index 0000000..64b72c3 --- /dev/null +++ b/modules/nf-core/catpack/summarise/meta.yml @@ -0,0 +1,89 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "catpack_summarise" +description: Summarises results from CAT/BAT/RAT classification steps +keywords: + - taxonomic classification + - classification + - long reads + - mags + - assembly +tools: + - "catpack": + description: "CAT/BAT: tool for taxonomic classification of contigs and metagenome-assembled + genomes (MAGs)" + homepage: "https://github.com/MGXlab/CAT_pack" + documentation: "https://github.com/MGXlab/CAT_pack" + tool_dev_url: "https://github.com/MGXlab/CAT_pack" + doi: "10.1186/s13059-019-1817-x" + licence: ["MIT"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - classification: + type: file + description: CAT/BAT/RAT classification table annotated with official names + (from CAT_pack addnames) + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - contigs: + type: file + description: Optional nucleotide FASTA file containing long DNA sequences such + as contigs that were classified (only if classification table is from CAT_pack + contigs) + pattern: "*.{fasta,fna,fa,fas}" + ontologies: + - edam: "http://edamontology.org/format_1929" + +output: + txt: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - "*.txt": + type: file + description: | + Summary statistics table of CAT/BAT/RAT results + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" + + versions_catpack: + - - ${task.process}: + type: string + description: The process the versions were collected from + - catpack: + type: string + description: The tool name + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - "${task.process}": + type: string + description: The name of the process + - catpack: + type: string + description: The name of the tool + - "CAT_pack --version | sed 's/CAT_pack pack v//g;s/ .*//g'": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/catpack/summarise/tests/main.nf.test b/modules/nf-core/catpack/summarise/tests/main.nf.test new file mode 100644 index 0000000..03ef73b --- /dev/null +++ b/modules/nf-core/catpack/summarise/tests/main.nf.test @@ -0,0 +1,159 @@ +nextflow_process { + + name "Test Process CATPACK_SUMMARISE" + script "../main.nf" + process "CATPACK_SUMMARISE" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "catpack" + tag "catpack/summarise" + tag "catpack/prepare" + tag "catpack/contigs" + tag "catpack/bins" + tag "catpack/addnames" + + setup { + run('CATPACK_PREPARE') { + script '../../prepare/main.nf' + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] ] + input[1] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_nodes.dmp', checkIfExists: true) ] + input[2] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_names.dmp', checkIfExists: true) ] + input[3] = [ file(params.modules_testdata_base_path + 
'genomics/sarscov2/metagenome/prot.accession2taxid.gz', checkIfExists: true) ] + """ + } + } + } + + test("sarscov2 - contigs - with names") { + + setup { + run('CATPACK_CONTIGS') { + script '../../contigs/main.nf' + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = [[:], []] + input[4] = [[:], []] + """ + } + } + + run('CATPACK_ADDNAMES') { + script '../../addnames/main.nf' + process { + """ + input[0] = CATPACK_CONTIGS.out.contig2classification + input[1] = CATPACK_PREPARE.out.taxonomy + """ + } + } + } + + when { + process { + """ + input[0] = CATPACK_ADDNAMES.out.txt + input[1] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bins - raw") { + + setup { + run('CATPACK_BINS') { + script '../../bins/main.nf' + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = [[:], []] + input[4] = [[:], []] + input[5] = '.fasta' + """ + } + } + + run('CATPACK_ADDNAMES') { + script '../../addnames/main.nf' + process { + """ + input[0] = CATPACK_BINS.out.bin2classification + input[1] = CATPACK_PREPARE.out.taxonomy + """ + } + } + } + + when { + process { + """ + input[0] = CATPACK_ADDNAMES.out.txt + input[1] = [[:],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - contigs - stub") { + + options "-stub" + + setup { + run('CATPACK_CONTIGS') { + script '../../contigs/main.nf' + process { 
+ """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + input[1] = CATPACK_PREPARE.out.db + input[2] = CATPACK_PREPARE.out.taxonomy + input[3] = [[:], []] + input[4] = [[:], []] + """ + } + } + } + + when { + process { + """ + input[0] = CATPACK_CONTIGS.out.contig2classification + input[1] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/catpack/summarise/tests/main.nf.test.snap b/modules/nf-core/catpack/summarise/tests/main.nf.test.snap new file mode 100644 index 0000000..5c5f848 --- /dev/null +++ b/modules/nf-core/catpack/summarise/tests/main.nf.test.snap @@ -0,0 +1,125 @@ +{ + "sarscov2 - contigs - with names": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_summary.txt:md5,f4ae1e889f762d6123e4b8d9e57aa4cc" + ] + ], + "1": [ + [ + "CATPACK_SUMMARISE", + "catpack", + "6.0" + ] + ], + "txt": [ + [ + { + "id": "test" + }, + "test_summary.txt:md5,f4ae1e889f762d6123e4b8d9e57aa4cc" + ] + ], + "versions_catpack": [ + [ + "CATPACK_SUMMARISE", + "catpack", + "6.0" + ] + ] + } + ], + "timestamp": "2026-03-08T13:24:15.055181823", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - contigs - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "CATPACK_SUMMARISE", + "catpack", + "6.0" + ] + ], + "txt": [ + [ + { + "id": "test" + }, + "test_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_catpack": [ + [ + "CATPACK_SUMMARISE", + "catpack", + "6.0" + ] + ] + } + ], + "timestamp": "2026-03-08T13:24:28.01946467", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - bins - 
raw": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_summary.txt:md5,2fc7ecc57cfe0c3362fba27e9693d266" + ] + ], + "1": [ + [ + "CATPACK_SUMMARISE", + "catpack", + "6.0" + ] + ], + "txt": [ + [ + { + "id": "test" + }, + "test_summary.txt:md5,2fc7ecc57cfe0c3362fba27e9693d266" + ] + ], + "versions_catpack": [ + [ + "CATPACK_SUMMARISE", + "catpack", + "6.0" + ] + ] + } + ], + "timestamp": "2026-03-08T13:24:21.784200923", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/catpack/summarise/tests/nextflow.config b/modules/nf-core/catpack/summarise/tests/nextflow.config new file mode 100644 index 0000000..0302d9c --- /dev/null +++ b/modules/nf-core/catpack/summarise/tests/nextflow.config @@ -0,0 +1,17 @@ +process { + withName: CATPACK_PREPARE { + ext.args = "--common_prefix database" + } + + withName: CATPACK_BINS { + ext.args = "--bin_suffix .fasta" + } + + withName: CATPACK_ADDNAMES { + ext.args = "--only_official" + } + + withName: CATPACK_SUMMARISE { + ext.prefix = { "${meta.id}_summary" } + } +} diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml new file mode 100644 index 0000000..9b926b1 --- /dev/null +++ b/modules/nf-core/untar/environment.yml @@ -0,0 +1,12 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::coreutils=9.5 + - conda-forge::grep=3.11 + - conda-forge::gzip=1.13 + - conda-forge::lbzip2=2.5 + - conda-forge::sed=4.8 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 0000000..b9c324d --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,75 @@ +process UNTAR { + tag "${archive}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 
'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/52/52ccce28d2ab928ab862e25aae26314d69c8e38bd41ca9431c67ef05221348aa/data' + : 'community.wave.seqera.io/library/coreutils_grep_gzip_lbzip2_pruned:838ba80435a629f8'}" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("${prefix}"), emit: untar + tuple val("${task.process}"), val('untar'), eval('tar --version 2>&1 | head -1 | sed "s/tar (GNU tar) //; s/ Copyright.*//"'), emit: versions_untar, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir ${prefix} + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C ${prefix} --strip-components 1 \\ + -xavf \\ + ${args} \\ + ${archive} \\ + ${args2} + else + tar \\ + -C ${prefix} \\ + -xavf \\ + ${args} \\ + ${archive} \\ + ${args2} + fi + + """ + + stub: + prefix = task.ext.prefix ?: (meta.id ? 
"${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir ${prefix} + ## Dry-run untaring the archive to get the files and place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + for i in `tar -tf ${archive}`; + do + if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]]; + then + touch \${i} + else + mkdir -p \${i} + fi + done + else + for i in `tar -tf ${archive}`; + do + if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]]; + then + touch ${prefix}/\${i} + else + mkdir -p ${prefix}/\${i} + fi + done + fi + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 0000000..571d807 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,73 @@ +name: untar +description: Extract files from tar, tar.gz, tar.bz2, tar.xz archives +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar, tar.gz, tar.bz2, tar.xz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untarred + pattern: "*.{tar,tar.gz,tar.bz2,tar.xz}" + ontologies: + - edam: http://edamontology.org/format_3981 # TAR format + - edam: http://edamontology.org/format_3989 # GZIP format +output: + untar: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + pattern: "*/" + - ${prefix}: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + pattern: "*/" + versions_untar: + - - ${task.process}: + type: string + description: The name of the process + - untar: + type: string + description: The name of the tool + - tar --version 2>&1 | head -1 | sed "s/tar (GNU tar) //; s/ Copyright.*//": + type: eval + description: The expression to obtain the version of the tool + +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - untar: + type: string + description: The name of the tool + - tar --version 2>&1 | head -1 | sed "s/tar (GNU tar) //; s/ Copyright.*//": + type: eval + description: The expression to obtain the version of the tool + +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test new file mode 100644 index 0000000..fde8db1 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test @@ -0,0 +1,97 @@ +nextflow_process { + + name "Test Process UNTAR" + script "../main.nf" + process "UNTAR" + tag "modules" + tag "modules_nfcore" + tag "untar" + + test("test_untar") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.untar, + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() }, + ) + } + } + + test("test_untar_onlyfiles") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.untar, + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() }, + ) + } + } + + test("test_untar - stub") { + + 
options "-stub" + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.untar, + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() }, + ) + } + } + + test("test_untar_onlyfiles - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.untar, + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() }, + ) + } + } +} diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap new file mode 100644 index 0000000..51a414d --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -0,0 +1,118 @@ +{ + "test_untar_onlyfiles": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ], + { + "versions_untar": [ + [ + "UNTAR", + "untar", + "1.34" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-28T17:49:32.000491" + }, + "test_untar_onlyfiles - stub": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + { + "versions_untar": [ + [ + "UNTAR", + "untar", + "1.34" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-28T17:49:58.812479" + }, + "test_untar - stub": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "opts.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "taxo.k2d:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + { + "versions_untar": [ + [ + "UNTAR", + "untar", + "1.34" + ] + ] + } + ], + "meta": { + 
"nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-28T17:49:48.119456" + }, + "test_untar": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ], + { + "versions_untar": [ + [ + "UNTAR", + "untar", + "1.34" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-28T17:49:17.252494" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 0464317..d0c9146 100644 --- a/nextflow.config +++ b/nextflow.config @@ -33,6 +33,10 @@ params { checkm2_db = null checkm2_db_zenodo_id = 14897628 + // NCBI taxonomy + cat_db = null + cat_db_download_id = 'nr' + // MultiQC options multiqc_config = null multiqc_title = null @@ -182,7 +186,7 @@ profiles { singularity.runOptions = '--nv' } // TODO: figure out how to better orginise tests for different workflow types (bins, mags, metagenomic_assemblies) - // test { includeConfig 'conf/test.config' } + test { includeConfig 'conf/test.config' } test_genome { includeConfig 'conf/test_genome.config' } test_assembly { includeConfig 'conf/test_assembly.config' } test_full { includeConfig 'conf/test_full.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index c8eac29..12399c3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -63,6 +63,28 @@ } } }, + "taxonomy_assignment_options": { + "title": "Assigning NCBI taxonomy with CAT_pack", + "type": "object", + "fa_icon": "fas fa-university", + "description": "CAT_pack required data", + "help_text": "Provide a path to pre-downloaded DB or DB id for automatic download", + "properties": { + "cat_db": { + "type": "string", + "description": "Path to CAT_pack DB", + "help": "Database should be pre-downloaded using CATpack commands. 
The folder with database should contain sub-folders 'db' with files .dmnd, .fastaid2LCAtaxid, .taxids_with_multiple_offspring, etc. And subfolder 'tax' that should contain taxonomy files like .dmp, .taxdump.tar.gz, .prot.accession2taxid.FULL.gz, etc", + "fa_icon": "fas fa-users-cog" + }, + "cat_db_download_id": { + "type": "string", + "description": "CAT_pack database ID for download", + "help": "There are two options available: nr (default) NCBI non-redundant protein database and gtdb (Genome Taxonomy Database (GTDB) proteins). We use NCBI proteins because ENA supports only NCBI format of taxonomy for submission", + "fa_icon": "fas fa-users-cog", + "default": "nr" + } + } + }, "genome_evaluation_options": { "title": "Evaluation options", "type": "object", @@ -311,6 +333,9 @@ { "$ref": "#/$defs/genome_evaluation_options" }, + { + "$ref": "#/$defs/taxonomy_assignment_options" + }, { "$ref": "#/$defs/institutional_config_options" }, diff --git a/subworkflows/nf-core/fasta_classify_catpack/fasta_classify_catpack.diff b/subworkflows/nf-core/fasta_classify_catpack/fasta_classify_catpack.diff new file mode 100644 index 0000000..6a541ba --- /dev/null +++ b/subworkflows/nf-core/fasta_classify_catpack/fasta_classify_catpack.diff @@ -0,0 +1,28 @@ +Changes in component 'nf-core/fasta_classify_catpack' +'subworkflows/nf-core/fasta_classify_catpack/meta.yml' is unchanged +Changes in 'fasta_classify_catpack/main.nf': +--- subworkflows/nf-core/fasta_classify_catpack/main.nf ++++ subworkflows/nf-core/fasta_classify_catpack/main.nf +@@ -32,11 +32,15 @@ + + // Handle pre-built db: untar if compressed, or use directory directly + ch_cat_db_input = ch_cat_db +- .branch { _meta, db -> ++ .map { meta, db -> ++ def dbPath = db instanceof Path ? 
db : file(db) ++ return [meta, dbPath] ++ } ++ .branch { meta, db -> + tar: db.name.endsWith('.tar.gz') + dir: db.isDirectory() + other: true +- } ++ } + + ch_cat_db_input.other.subscribe { _meta, _db -> + error("Error: A DB was provided to FASTA_CLASSIFY_CATPACK that is not a `.tar.gz` or a directory.") + +'subworkflows/nf-core/fasta_classify_catpack/tests/main.nf.test.snap' is unchanged +'subworkflows/nf-core/fasta_classify_catpack/tests/nextflow.config' is unchanged +'subworkflows/nf-core/fasta_classify_catpack/tests/main.nf.test' is unchanged +************************************************************ diff --git a/subworkflows/nf-core/fasta_classify_catpack/main.nf b/subworkflows/nf-core/fasta_classify_catpack/main.nf new file mode 100644 index 0000000..e362a5c --- /dev/null +++ b/subworkflows/nf-core/fasta_classify_catpack/main.nf @@ -0,0 +1,133 @@ +/* + * CAT/BAT/RAT: tools for taxonomic classification of contigs and metagenome-assembled genomes (MAGs) + */ +include { CATPACK_ADDNAMES as CATPACK_ADDNAMES_BINS } from '../../../modules/nf-core/catpack/addnames/main' +include { CATPACK_ADDNAMES as CATPACK_ADDNAMES_CONTIGS } from '../../../modules/nf-core/catpack/addnames/main' +include { CATPACK_BINS } from '../../../modules/nf-core/catpack/bins/main' +include { CATPACK_CONTIGS } from '../../../modules/nf-core/catpack/contigs/main' +include { CATPACK_DOWNLOAD } from '../../../modules/nf-core/catpack/download/main' +include { CATPACK_PREPARE } from '../../../modules/nf-core/catpack/prepare/main' +include { CATPACK_SUMMARISE as CATPACK_SUMMARISE_BINS } from '../../../modules/nf-core/catpack/summarise/main' +include { CATPACK_SUMMARISE as CATPACK_SUMMARISE_CONTIGS } from '../../../modules/nf-core/catpack/summarise/main' +include { UNTAR as CAT_DB_UNTAR } from '../../../modules/nf-core/untar/main' + +workflow FASTA_CLASSIFY_CATPACK { + + take: + ch_bins // channel: [ val(meta), path(fasta) ] - binned MAGs/contigs + ch_contigs // channel: [ val(meta), path(fasta) ] 
- contigs; provide channel.empty() to skip contig classification + ch_cat_db // channel: [ val(meta), path(db) ] - pre-built db as directory (with db/ and tax/ subdirs) or .tar.gz + // provide channel.empty() to trigger automatic download via ch_cat_db_download_id + ch_cat_db_download_id // channel: [ val(meta), val(db_id) ] - db ID for CATPACK_DOWNLOAD (e.g. 'nr') + // provide channel.empty() if supplying a pre-built db via ch_cat_db + // supplying both ch_cat_db and ch_cat_db_download_id will cause a runtime error + run_summarise // val: boolean - whether to run CATPACK_SUMMARISE; requires ext.args = "--only_official" on CATPACK_ADDNAMES_BINS/CONTIGS + bin_suffix // val: string - file extension of bin FASTA files (e.g. '.fa' or '.fasta') + + main: + + // + // Database preparation + // + + // Handle pre-built db: untar if compressed, or use directory directly + ch_cat_db_input = ch_cat_db + .map { meta, db -> + def dbPath = db instanceof Path ? db : file(db) + return [meta, dbPath] + } + .branch { meta, db -> + tar: db.name.endsWith('.tar.gz') + dir: db.isDirectory() + other: true + } + + ch_cat_db_input.other.subscribe { _meta, _db -> + error("Error: A DB was provided to FASTA_CLASSIFY_CATPACK that is not a `.tar.gz` or a directory.") + } + + CAT_DB_UNTAR(ch_cat_db_input.tar) + + ch_prepared_from_dir = ch_cat_db_input.dir + .mix(CAT_DB_UNTAR.out.untar) + .multiMap { meta, dir -> + db: [meta, dir / 'db'] + taxonomy: [meta, dir / 'tax'] + } + + // Download and prepare db from scratch if no pre-built db provided + CATPACK_DOWNLOAD(ch_cat_db_download_id) + + CATPACK_PREPARE( + CATPACK_DOWNLOAD.out.fasta, + CATPACK_DOWNLOAD.out.names.map { _meta, names -> names }, + CATPACK_DOWNLOAD.out.nodes.map { _meta, nodes -> nodes }, + CATPACK_DOWNLOAD.out.acc2tax.map { _meta, acc2tax -> acc2tax }, + ) + + // Combine db sources - one of these channels will be empty depending on inputs + // Guard: fail if both ch_cat_db and ch_cat_db_download_id are provided simultaneously. 
+ // .combine() only emits when both channels have at least one element. + ch_prepared_from_dir.db.combine(CATPACK_PREPARE.out.db).subscribe { + error("Error: Both a pre-built DB and a download ID were provided to FASTA_CLASSIFY_CATPACK! Provide only one via ch_cat_db or ch_cat_db_download_id.") + } + + ch_db = ch_prepared_from_dir.db.mix(CATPACK_PREPARE.out.db).first() + ch_taxonomy = ch_prepared_from_dir.taxonomy.mix(CATPACK_PREPARE.out.taxonomy).first() + + // + // Bin taxonomic classification (optional - skipped when ch_bins is channel.empty()) + // + + CATPACK_BINS( + ch_bins, + ch_db, + ch_taxonomy, + [[:], []], + [[:], []], + bin_suffix, + ) + + CATPACK_ADDNAMES_BINS(CATPACK_BINS.out.bin2classification, ch_taxonomy) + + ch_bat_summary = channel.empty() + if (run_summarise) { + CATPACK_SUMMARISE_BINS(CATPACK_ADDNAMES_BINS.out.txt, [[:], []]) + ch_bat_summary = CATPACK_SUMMARISE_BINS.out.txt + } + + // + // Contig taxonomic classification (optional - skipped when ch_contigs is channel.empty()) + // + + CATPACK_CONTIGS( + ch_contigs, + ch_db, + ch_taxonomy, + [[:], []], + [[:], []], + ) + + CATPACK_ADDNAMES_CONTIGS(CATPACK_CONTIGS.out.contig2classification, ch_taxonomy) + + ch_contigs_summary = channel.empty() + if (run_summarise) { + ch_contigs_input = CATPACK_ADDNAMES_CONTIGS.out.txt + .join(ch_contigs) + .multiMap { meta, names, contigs -> + names: [meta, names] + contigs: [meta, contigs] + } + + CATPACK_SUMMARISE_CONTIGS(ch_contigs_input.names, ch_contigs_input.contigs) + ch_contigs_summary = CATPACK_SUMMARISE_CONTIGS.out.txt + } + + emit: + bin2classification = CATPACK_BINS.out.bin2classification // channel: [ val(meta), path(txt) ] + bat_classification = CATPACK_ADDNAMES_BINS.out.txt // channel: [ val(meta), path(txt) ] + bat_summary = ch_bat_summary // channel: [ val(meta), path(txt) ] + contig2classification = CATPACK_CONTIGS.out.contig2classification // channel: [ val(meta), path(txt) ] + contigs_classification = CATPACK_ADDNAMES_CONTIGS.out.txt // 
channel: [ val(meta), path(txt) ] + contigs_summary = ch_contigs_summary // channel: [ val(meta), path(txt) ] +} diff --git a/subworkflows/nf-core/fasta_classify_catpack/meta.yml b/subworkflows/nf-core/fasta_classify_catpack/meta.yml new file mode 100644 index 0000000..e94f4a4 --- /dev/null +++ b/subworkflows/nf-core/fasta_classify_catpack/meta.yml @@ -0,0 +1,138 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fasta_classify_catpack" +description: Taxonomic classification of binned MAGs and contigs using CAT/BAT (CAT_pack). +keywords: + - metagenomics + - taxonomy + - classification + - bins + - MAGs + - contigs + - CAT + - BAT +components: + - catpack/addnames + - catpack/bins + - catpack/contigs + - catpack/download + - catpack/prepare + - catpack/summarise + - untar +input: + - ch_bins: + type: file + description: | + Channel containing binned MAG FASTA files. + Structure: [ val(meta), path(fasta) ] + pattern: "*.{fa,fasta,fna}" + ontologies: + - edam: "http://edamontology.org/format_1929" # FASTA + - ch_contigs: + type: file + description: | + Channel containing contig FASTA files. Provide channel.empty() to skip + contig classification. + Structure: [ val(meta), path(fasta) ] + pattern: "*.{fa,fasta,fna}" + ontologies: + - edam: "http://edamontology.org/format_1929" # FASTA + - ch_cat_db: + type: directory + description: | + Channel containing a pre-built CAT/BAT database. Can be a directory with db/ and tax/ + subdirectories, or a .tar.gz archive of such a directory. Provide channel.empty() to + trigger automatic database download using ch_cat_db_download_id. Supplying both + ch_cat_db and ch_cat_db_download_id will cause a runtime error. 
+ Structure: [ val(meta), path(db) ] + ontologies: + - edam: "http://edamontology.org/data_1049" # Genome identifier (database) + - ch_cat_db_download_id: + type: string + description: | + Channel containing the database identifier to download via CATPACK_DOWNLOAD (e.g. 'nr'). + Only used when ch_cat_db is channel.empty(). Provide channel.empty() when supplying a + pre-built database via ch_cat_db. Supplying both inputs will cause a runtime error. + Structure: [ val(meta), val(db_id) ] + - run_summarise: + type: boolean + description: | + Whether to run CATPACK_SUMMARISE on the classification outputs. Requires + ext.args = "--only_official" to be set on CATPACK_ADDNAMES_BINS and + CATPACK_ADDNAMES_CONTIGS in the pipeline configuration, as CATPACK_SUMMARISE + requires official-rank headers in its input. + - bin_suffix: + type: string + description: | + File extension of the bin FASTA files passed to CATPACK_BINS (e.g. '.fa'). +output: + - bin2classification: + type: file + description: | + Raw per-bin taxonomic classification file produced by CATPACK_BINS, before human-readable + names are added by CATPACK_ADDNAMES. Useful for downstream tools that consume the raw + CAT_pack output directly. + Structure: [ val(meta), path(txt) ] + pattern: "*.bin2classification.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" # Textual format (TSV) + - bat_classification: + type: file + description: | + Per-bin taxonomic classification with human-readable names added by CATPACK_ADDNAMES. + Structure: [ val(meta), path(txt) ] + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" # Textual format (TSV) + - bat_summary: + type: file + description: | + Summary of bin classifications produced by CATPACK_SUMMARISE. Empty channel when + run_summarise is false. 
+ Structure: [ val(meta), path(txt) ] + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" # Textual format (TSV) + - contig2classification: + type: file + description: | + Raw per-contig taxonomic classification file produced by CATPACK_CONTIGS, before + human-readable names are added by CATPACK_ADDNAMES. Empty channel when ch_contigs + is channel.empty(). Useful for downstream tools that consume the raw CAT_pack output directly. + Structure: [ val(meta), path(txt) ] + pattern: "*.contig2classification.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" # Textual format (TSV) + - contigs_classification: + type: file + description: | + Per-contig taxonomic classification with human-readable names added by CATPACK_ADDNAMES. + Empty channel when ch_contigs is channel.empty(). + Structure: [ val(meta), path(txt) ] + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" # Textual format (TSV) + - contigs_summary: + type: file + description: | + Summary of contig classifications produced by CATPACK_SUMMARISE. Empty channel when + ch_contigs is channel.empty() or run_summarise is false. + Structure: [ val(meta), path(txt) ] + pattern: "*.txt" + ontologies: + - edam: "http://edamontology.org/format_3475" # Textual format (TSV) + - versions: + type: file + description: | + Channel containing software versions. 
+ Structure: versions +authors: + - "@mberacochea" + - "@dialvarezs" +maintainers: + - "@mberacochea" + - "@dialvarezs" + - "@jfy133" + - "@prototaxites" + - "@dialvarezs" + - "@d4straub" + - "@muabnezor" diff --git a/subworkflows/nf-core/fasta_classify_catpack/tests/main.nf.test b/subworkflows/nf-core/fasta_classify_catpack/tests/main.nf.test new file mode 100644 index 0000000..71bd96d --- /dev/null +++ b/subworkflows/nf-core/fasta_classify_catpack/tests/main.nf.test @@ -0,0 +1,174 @@ +nextflow_workflow { + + name "Test Subworkflow FASTA_CLASSIFY_CATPACK" + script "../main.nf" + workflow "FASTA_CLASSIFY_CATPACK" + config './nextflow.config' + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fasta_classify_catpack" + tag "catpack" + tag "catpack/bins" + tag "catpack/contigs" + tag "catpack/addnames" + tag "catpack/summarise" + tag "catpack/prepare" + tag "catpack/download" + tag "untar" + + setup { + run('CATPACK_PREPARE') { + script 'modules/nf-core/catpack/prepare/main.nf' + process { + """ + input[0] = [ [id:'test'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] ] + input[1] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_nodes.dmp', checkIfExists: true) ] + input[2] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot_names.dmp', checkIfExists: true) ] + input[3] = [ file(params.modules_testdata_base_path + 'genomics/sarscov2/metagenome/prot.accession2taxid.gz', checkIfExists: true) ] + """ + } + } + } + + // Full integration tests: these run real Diamond alignment via CATPACK_BINS/CATPACK_CONTIGS + // and are therefore slow by nature. Only three real tests are kept to limit CI time: + // bins only and bins plus unbinned contigs, both with run_summarise = true to exercise + // CATPACK_SUMMARISE, and bins only with run_summarise = false. + // Structural paths and the conflicting-DB-inputs guard are covered by the stub tests below.
+ + test("sarscov2 - bins only - summarise") { + + when { + workflow { + """ + input[0] = channel.of([ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + ]) + input[1] = channel.empty() + input[2] = CATPACK_PREPARE.out.db.map { meta, db_path -> [ meta, db_path.parent ] } + input[3] = channel.empty() + input[4] = true + input[5] = '.fasta' + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(sanitizeOutput(workflow.out)).match() } + ) + } + } + + test("sarscov2 - bins and unbinned - summarise") { + + when { + workflow { + """ + input[0] = channel.of([ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + ]) + input[1] = channel.of([ + [ id:'test_unbinned' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + ]) + input[2] = CATPACK_PREPARE.out.db.map { meta, db_path -> [ meta, db_path.parent ] } + input[3] = channel.empty() + input[4] = true + input[5] = '.fasta' + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(sanitizeOutput(workflow.out)).match() } + ) + } + } + + test("sarscov2 - bins only") { + + when { + workflow { + """ + input[0] = channel.of([ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + ]) + input[1] = channel.empty() + input[2] = CATPACK_PREPARE.out.db.map { meta, db_path -> [ meta, db_path.parent ] } + input[3] = channel.empty() + input[4] = false + input[5] = '.fasta' + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(sanitizeOutput(workflow.out)).match() } + ) + } + } + + test("sarscov2 - bins only - stub") { + + options "-stub" + + when { + workflow { + """ + input[0] = channel.of([ + [ id:'test' ], + file(params.modules_testdata_base_path + 
'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + ]) + input[1] = channel.empty() + input[2] = CATPACK_PREPARE.out.db.map { meta, db_path -> [ meta, db_path.parent ] } + input[3] = channel.empty() + input[4] = false + input[5] = '.fasta' + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(sanitizeOutput(workflow.out)).match() } + ) + } + } + + test("sarscov2 - both ch_cat_db and ch_cat_db_download_id provided - should fail - stub") { + + options "-stub" + + when { + workflow { + """ + input[0] = channel.of([ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + ]) + input[1] = channel.empty() + input[2] = CATPACK_PREPARE.out.db.map { meta, db_path -> [ meta, db_path.parent ] } + input[3] = channel.of([ [id:'cat_db_nr'], 'nr' ]) + input[4] = false + input[5] = '.fasta' + """ + } + } + + then { + assert workflow.failed + assert workflow.errorReport.contains("Both a pre-built DB and a download ID were provided") + } + } +} diff --git a/subworkflows/nf-core/fasta_classify_catpack/tests/main.nf.test.snap b/subworkflows/nf-core/fasta_classify_catpack/tests/main.nf.test.snap new file mode 100644 index 0000000..d2e753e --- /dev/null +++ b/subworkflows/nf-core/fasta_classify_catpack/tests/main.nf.test.snap @@ -0,0 +1,183 @@ +{ + "sarscov2 - bins and unbinned - summarise": { + "content": [ + { + "bat_classification": [ + [ + { + "id": "test" + }, + "test-bins_bin2classification.names.txt:md5,4694012f5a1ade31ed08a068b03ab018" + ] + ], + "bat_summary": [ + [ + { + "id": "test" + }, + "test-bins_summary.txt:md5,2fc7ecc57cfe0c3362fba27e9693d266" + ] + ], + "bin2classification": [ + [ + { + "id": "test" + }, + "test-bins.bin2classification.txt:md5,8cb8c364c1dea229f68e8c7a1747205d" + ] + ], + "contig2classification": [ + [ + { + "id": "test_unbinned" + }, + "test_unbinned-contigs.contig2classification.txt:md5,3c3c79045bf6ae8b1292ae9afa2ec4af" + ] + ], + 
"contigs_classification": [ + [ + { + "id": "test_unbinned" + }, + "test_unbinned-contigs_contig2classification.names.txt:md5,b7d53cc4b7194bb86e463ac893690a26" + ] + ], + "contigs_summary": [ + [ + { + "id": "test_unbinned" + }, + "test_unbinned-contigs_summary.txt:md5,22976c5d2f46ce65e0a26da5f51bdbba" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.3" + }, + "timestamp": "2026-03-12T17:26:25.519266" + }, + "sarscov2 - bins only - stub": { + "content": [ + { + "bat_classification": [ + [ + { + "id": "test" + }, + "test-bins_bin2classification.names.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bat_summary": [ + + ], + "bin2classification": [ + [ + { + "id": "test" + }, + "test-bins.bin2classification.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "contig2classification": [ + + ], + "contigs_classification": [ + + ], + "contigs_summary": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.3" + }, + "timestamp": "2026-03-12T17:26:54.788797" + }, + "sarscov2 - bins only - summarise": { + "content": [ + { + "bat_classification": [ + [ + { + "id": "test" + }, + "test-bins_bin2classification.names.txt:md5,4694012f5a1ade31ed08a068b03ab018" + ] + ], + "bat_summary": [ + [ + { + "id": "test" + }, + "test-bins_summary.txt:md5,2fc7ecc57cfe0c3362fba27e9693d266" + ] + ], + "bin2classification": [ + [ + { + "id": "test" + }, + "test-bins.bin2classification.txt:md5,8cb8c364c1dea229f68e8c7a1747205d" + ] + ], + "contig2classification": [ + + ], + "contigs_classification": [ + + ], + "contigs_summary": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.3" + }, + "timestamp": "2026-03-12T17:26:06.69212" + }, + "sarscov2 - bins only": { + "content": [ + { + "bat_classification": [ + [ + { + "id": "test" + }, + "test-bins_bin2classification.names.txt:md5,4694012f5a1ade31ed08a068b03ab018" + ] + ], + "bat_summary": [ + + ], + "bin2classification": [ + [ + { + "id": "test" + }, + 
"test-bins.bin2classification.txt:md5,8cb8c364c1dea229f68e8c7a1747205d" + ] + ], + "contig2classification": [ + + ], + "contigs_classification": [ + + ], + "contigs_summary": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.3" + }, + "timestamp": "2026-03-12T17:26:42.058346" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/fasta_classify_catpack/tests/nextflow.config b/subworkflows/nf-core/fasta_classify_catpack/tests/nextflow.config new file mode 100644 index 0000000..a61e194 --- /dev/null +++ b/subworkflows/nf-core/fasta_classify_catpack/tests/nextflow.config @@ -0,0 +1,31 @@ +process { + withName: CATPACK_PREPARE { + ext.args = "--common_prefix database" + } + + withName: 'FASTA_CLASSIFY_CATPACK:CATPACK_BINS' { + ext.prefix = { "${meta.id}-bins" } + } + + withName: 'FASTA_CLASSIFY_CATPACK:CATPACK_ADDNAMES_BINS' { + ext.prefix = { "${meta.id}-bins_bin2classification.names" } + ext.args = "--only_official" + } + + withName: 'FASTA_CLASSIFY_CATPACK:CATPACK_SUMMARISE_BINS' { + ext.prefix = { "${meta.id}-bins_summary" } + } + + withName: 'FASTA_CLASSIFY_CATPACK:CATPACK_CONTIGS' { + ext.prefix = { "${meta.id}-contigs" } + } + + withName: 'FASTA_CLASSIFY_CATPACK:CATPACK_ADDNAMES_CONTIGS' { + ext.prefix = { "${meta.id}-contigs_contig2classification.names" } + ext.args = "--only_official" + } + + withName: 'FASTA_CLASSIFY_CATPACK:CATPACK_SUMMARISE_CONTIGS' { + ext.prefix = { "${meta.id}-contigs_summary" } + } +} diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index 2e7ae0a..23d653c 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -3,19 +3,21 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { GENOME_UPLOAD } from '../modules/local/genome_upload' -include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' +include { GENOME_UPLOAD } from '../modules/local/genome_upload' +include { 
ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' +include { RENAME_FASTA_FOR_CATPACK } from '../modules/local/rename_fasta_for_catpack' -include { COVERM_GENOME } from '../modules/nf-core/coverm/genome' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-schema' +include { COVERM_GENOME } from '../modules/nf-core/coverm/genome' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { paramsSummaryMap } from 'plugin/nf-schema' -include { GENOME_EVALUATION } from '../subworkflows/local/genome_evaluation' -include { RNA_DETECTION } from '../subworkflows/local/rna_detection' +include { GENOME_EVALUATION } from '../subworkflows/local/genome_evaluation' +include { RNA_DETECTION } from '../subworkflows/local/rna_detection' +include { FASTA_CLASSIFY_CATPACK } from '../subworkflows/nf-core/fasta_classify_catpack/main' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_seqsubmit_pipeline' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_seqsubmit_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -72,20 +74,20 @@ workflow GENOMESUBMIT { genome_reads = genome_fasta_and_reads.map{meta, _fasta, reads -> [meta, reads]} // --------- Genome coverage calculation + genome_fasta + .branch { meta, fasta -> + genome_coverage_ref_input: meta.genome_coverage == null + genome_coverage_present: true // Everything else goes here + } + .set { branched_coverage_results } genome_reads.filter { meta, reads -> meta.genome_coverage == null } .map { meta, reads -> [meta, 
reads] } .set { genome_coverage_fq_input } - genome_fasta.filter { meta, fasta -> meta.genome_coverage == null } - .map { meta, fasta -> [meta, fasta] } - .set { genome_coverage_ref_input } - genome_fasta.filter { meta, fasta -> meta.genome_coverage != null } - .map { meta, fasta -> [meta, fasta] } - .set { genome_coverage_present } COVERM_GENOME ( genome_coverage_fq_input, - genome_coverage_ref_input, + branched_coverage_results.genome_coverage_ref_input, false, false, 'file' @@ -93,58 +95,57 @@ workflow GENOMESUBMIT { ch_versions = ch_versions.mix( COVERM_GENOME.out.versions ) // Update metadata for records missing coverage - fasta_updated_with_coverage = COVERM_GENOME.out.coverage.join(genome_coverage_ref_input) + fasta_updated_with_coverage = COVERM_GENOME.out.coverage.join(branched_coverage_results.genome_coverage_ref_input) .map{ meta, coverage_tsv, fasta -> def coverage = coverage_tsv.readLines()[1].split('\t')[1]; // skip header def updated_meta = meta.clone() updated_meta.genome_coverage = coverage; return [updated_meta, fasta] } - .mix(genome_coverage_present) + .mix(branched_coverage_results.genome_coverage_present) // --------- For genomes without RNA_presence info, calculate rRNA and tRNA - fasta_updated_with_coverage.filter { meta, fasta -> meta.RNA_presence == null } - .map { meta, fasta -> [meta, fasta] } - .set { rna_prediction_input } - fasta_updated_with_coverage.filter { meta, fasta -> meta.RNA_presence != null } - .map { meta, fasta -> [meta, fasta] } - .set { rna_present } + fasta_updated_with_coverage + .branch { meta, fasta -> + rna_prediction_input: meta.RNA_presence == null + rna_present: true // Everything else goes here + } + .set { branched_rna_results } RNA_DETECTION ( - rna_prediction_input + branched_rna_results.rna_prediction_input ) ch_versions = ch_versions.mix( RNA_DETECTION.out.versions ) // Update metadata for records missing RNA - fasta_updated_with_rna = RNA_DETECTION.out.rna_detected.join(rna_prediction_input) + 
fasta_updated_with_rna = RNA_DETECTION.out.rna_detected.join(branched_rna_results.rna_prediction_input) .map{ meta, rna_decision, fasta -> def decision = rna_decision.readLines()[0].split('\t')[1]; def updated_meta = meta.clone() updated_meta.RNA_presence = decision; return [updated_meta, fasta] } - .mix(rna_present) + .mix(branched_rna_results.rna_present) // --------- Completeness and contamination calculation - - fasta_updated_with_rna.filter { meta, fasta -> meta.completeness == null || meta.contamination == null || meta.stats_generation_software == null } - .map { meta, fasta -> [meta, fasta] } - .set { genome_evaluation_input } - fasta_updated_with_rna.filter { meta, fasta -> meta.completeness != null && meta.contamination != null && meta.stats_generation_software != null} - .map { meta, fasta -> [meta, fasta] } - .set { evaluation_present } + fasta_updated_with_rna + .branch { meta, fasta -> + genome_evaluation_input: meta.completeness == null || meta.contamination == null || meta.stats_generation_software == null + evaluation_present: true // Everything else goes here + } + .set { branched_stats_results } GENOME_EVALUATION ( - genome_evaluation_input + branched_stats_results.genome_evaluation_input ) // Create a value channel with the version string def stats_version_ch = GENOME_EVALUATION.out.stats_versions - .map { process_name, tool_name, version_output -> return "${tool_name}_v${version_output}" + .map { _process_name, tool_name, version_output -> return "${tool_name}_v${version_output}" }.first() fasta_updated_with_stats = GENOME_EVALUATION.out.genome_evaluation - .join(genome_evaluation_input) + .join(branched_stats_results.genome_evaluation_input) .combine(stats_version_ch) .map { meta, stats_tsv, fasta, stats_version -> def line = stats_tsv.readLines()[1].split('\t') @@ -155,12 +156,53 @@ workflow GENOMESUBMIT { return [updated_meta, fasta] } - .mix(evaluation_present) + .mix(branched_stats_results.evaluation_present) + + // --------- Taxonomy + 
fasta_updated_with_stats + .branch { meta, fasta -> + genome_taxonomy_input: meta.NCBI_lineage == null + taxonomy_present: true // Everything else goes here + } + .set { branched_taxonomy_results } + + // Rename every genome that still needs taxonomy to a .fasta extension, because CATPACK takes the file suffix as an input + RENAME_FASTA_FOR_CATPACK ( + branched_taxonomy_results.genome_taxonomy_input + ) + + // Build the CAT_DB input channels depending on which database parameters were provided + def cat_db_input = (params.cat_db != null && params.cat_db != '') + ? channel.of( [['id': 'CAT_DB'], file(params.cat_db)] ) + : channel.empty() + + def cat_db_id_input = (params.cat_db_download_id != null && params.cat_db_download_id != '') + ? channel.of( [['id': 'CAT_DB_id'], params.cat_db_download_id] ) + : channel.empty() + + FASTA_CLASSIFY_CATPACK ( + RENAME_FASTA_FOR_CATPACK.out.renamed_fasta, + channel.empty(), + cat_db_input, + cat_db_id_input, + false, // run_summarise: summaries are not generated here + '.fasta' + ) + + fasta_updated_with_taxonomy = FASTA_CLASSIFY_CATPACK.out.bat_classification + .join(branched_taxonomy_results.genome_taxonomy_input) + .map { meta, taxa_tsv, fasta -> + def line = taxa_tsv.readLines()[1].split('\t') + def updated_meta = meta.clone() + updated_meta.NCBI_lineage = line[3] + return [updated_meta, fasta] + } + .mix(branched_taxonomy_results.taxonomy_present) // --------- Combine metadata into TSV - genome_metadata_csv = fasta_updated_with_stats + genome_metadata_csv = fasta_updated_with_taxonomy .map { meta, fasta -> - def row = [ + [ meta.id, fasta, meta.accession, @@ -181,8 +223,26 @@ workflow GENOMESUBMIT { ].join('\t') } .collectFile( - name: "${params.outdir}/genomes_metadata.csv", - seed: 'genome_name\tgenome_path\taccessions\tassembly_software\tbinning_software\tbinning_parameters\tstats_generation_software\tcompleteness\tcontamination\tgenome_coverage\tmetagenome\tco-assembly\tbroad_environment\tlocal_environment\tenvironmental_medium\trRNA_presence\tNCBI_lineage', + name:
"${params.outdir}/${params.mode}/genomes_metadata.csv", + seed: [ + 'genome_name', + 'genome_path', + 'accessions', + 'assembly_software', + 'binning_software', + 'binning_parameters', + 'stats_generation_software', + 'completeness', + 'contamination', + 'genome_coverage', + 'metagenome', + 'co-assembly', + 'broad_environment', + 'local_environment', + 'environmental_medium', + 'rRNA_presence', + 'NCBI_lineage' + ].join('\t'), newLine: true )