diff --git a/.nf-core.yml b/.nf-core.yml index f4278d6..fe3b66c 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -2,6 +2,8 @@ lint: files_exist: - conf/igenomes.config - conf/igenomes_ignored.config + nextflow_config: + - params.input files_unchanged: - .github/PULL_REQUEST_TEMPLATE.md nf_core_version: 3.5.1 diff --git a/README.md b/README.md index a629c8f..22f9661 100644 --- a/README.md +++ b/README.md @@ -21,18 +21,20 @@ ## Introduction -**nf-core/seqsubmit** is a bioinformatics pipeline that submits data to public archives such as [ENA](https://www.ebi.ac.uk/ena/browser/home) +**nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home). +Currently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure: -Pipeline will have several modes +- `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow +- `bins` for bins submission with `GENOMESUBMIT` workflow +- `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow -- `mags` for MAGs submission with **genome_submitter** wf -- `bins` for bins submission with **genome_submitter** wf -- `assemblies` for assembly submission with **assembly_submitter** wf +![seqsubmit workflow diagram](assets/seqsubmit_schema.png) ## Requirements -- Webin account registered https://www.ebi.ac.uk/ena/submit/webin/login -- Raw reads submitted into [INSDC](https://www.insdc.org/) +- [Nextflow](https://www.nextflow.io/) `>=25.04.0` +- Webin account registered at https://www.ebi.ac.uk/ena/submit/webin/login +- Raw reads used to assemble contigs submitted to [INSDC](https://www.insdc.org/) and associated accessions available Setup your environment secrets before running the pipeline: @@ -40,52 +42,147 @@ Setup your environment secrets before running the pipeline: `nextflow secrets set WEBIN_PASSWORD "XXX"` -Make sure you update with your authorised 
credentials. +Make sure you update commands above with your authorised credentials. -## genome_submitter +## Input samplesheets -Workflow to submit MAGs and/or bins to ENA. +### `mags` and `bins` modes (`GENOMESUBMIT`) -It takes input `samplesheet.csv` with fields required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader). Fields described in [docs](https://github.com/EBI-Metagenomics/genome_uploader/blob/main/README.md#input-tsv-and-fields). -For now workflow converts CSV into required TSV. +The input must follow `assets/schema_input_genome.json`. -_Future implementation will consider missing fields (for example completeness and contamination) and would run steps to fill in the gaps._ +Required columns: - +- `sample` +- `fasta` (must end with `.fa.gz` or `.fasta.gz`) +- `accession` +- `assembly_software` +- `binning_software` +- `binning_parameters` +- `stats_generation_software` +- `metagenome` +- `environmental_medium` +- `broad_environment` +- `local_environment` +- `co-assembly` - -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +Columns that required for now, but will be optional in the nearest future: + +- `completeness` +- `contamination` +- `genome_coverage` +- `rRNA_presence` +- `NCBI_lineage` + +Those fields are metadata required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package. They are described in [docs](https://github.com/EBI-Metagenomics/genome_uploader/blob/main/README.md#input-tsv-and-fields). 
+ +Example `samplesheet_genome.csv`: + +```csv +sample,fasta,accession,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,rRNA_presence,NCBI_lineage +lachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,false,marine,cable_bacteria,marine_sediment,false,d__Bacteria;p__Proteobacteria;s_unclassified_Proteobacteria +``` + +### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`) + +The input must follow `assets/schema_input_assembly.json`. + +Required columns: + +- `sample` +- `fasta` (must end with `.fa.gz` or `.fasta.gz`) +- `run_accession` +- `assembler` +- `assembler_version` + +At least one of the following must be provided per row: + +- reads (`fastq_1`, optional `fastq_2` for paired-end) +- `coverage` + +If `coverage` is missing and reads are provided, the workflow calculates average coverage with `coverm`. + +Example `samplesheet_assembly.csv`: + +```csv +sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version +assembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,,ERR011322,SPAdes,3.15.5 +assembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 +``` ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. 
- -Now, you can run the pipeline using: +Validation run (submission to the ENA TEST server) in `metagenomic_assemblies` mode: - +```bash +nextflow run nf-core/seqsubmit \ + -profile docker \ + --mode metagenomic_assemblies \ + --input assets/samplesheet_assembly.csv \ + --submission_study \ + --centre_name TEST_CENTER \ + --webincli_submit true \ + --test_upload true \ + --outdir results/validate_assemblies +``` + +Live submission example: ```bash nextflow run nf-core/seqsubmit \ - -profile \ - --input samplesheet.csv \ - --outdir + -profile docker \ + --mode metagenomic_assemblies \ + --input assets/samplesheet_assembly.csv \ + --submission_study PRJEB98843 \ + --test_upload false \ + --webincli_submit true \ + --outdir results/live_assembly ``` > [!WARNING] @@ -93,13 +190,16 @@ nextflow run nf-core/seqsubmit \ For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/seqsubmit/usage) and the [parameter documentation](https://nf-co.re/seqsubmit/parameters). - +Key output locations in `--outdir`: + +- `upload/manifests/`: generated manifest files for submission +- `upload/webin_cli/`: ENA Webin CLI reports +- `multiqc/`: MultiQC summary report +- `pipeline_info/`: execution reports, trace, DAG, and software versions + +For full details, see the [output documentation](https://nf-co.re/seqsubmit/output). 
## Credits diff --git a/assets/samplesheet_assembly.csv b/assets/samplesheet_assembly.csv new file mode 100644 index 0000000..35945f5 --- /dev/null +++ b/assets/samplesheet_assembly.csv @@ -0,0 +1,4 @@ +sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version +sample1,tests/data/contigs.fasta.gz,tests/data/fastq_1.fastq,tests/data/fastq_2.fastq,,ERR000001,SPAdes,3.15 +sample2,tests/data/invalid_assembly.fasta.gz,,,45,ERR000002,Velvet,1.2.10 +sample3,tests/data/contigs.fasta.gz,,,30,ERR000003,MEGAHIT,1.2.9 diff --git a/assets/samplesheet.csv b/assets/samplesheet_genomes.csv similarity index 100% rename from assets/samplesheet.csv rename to assets/samplesheet_genomes.csv diff --git a/assets/schema_input_assembly.json b/assets/schema_input_assembly.json new file mode 100644 index 0000000..3b55e28 --- /dev/null +++ b/assets/schema_input_assembly.json @@ -0,0 +1,114 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/seqsubmit/main/assets/schema_input_assembly.json", + "title": "nf-core/seqsubmit pipeline - params.input schema", + "description": "Schema for the sample sheet provided with params.input if params.mode is set to 'metagenomic_assemblies'", + "type": "array", + "items": { + "type": "object", + "properties": { + "sample": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sample must be provided and cannot contain spaces", + "meta": ["id"] + }, + "fasta": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?a\\.gz$", + "errorMessage": "FASTA file must be provided and have extension '.fa', '.fasta', '.fas', '.fna' (optionally gzipped)", + "description": "Metagenomic assembly FASTA file" + }, + "fastq_1": { + "anyOf": [ + { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(fq|fastq)(\\.gz)?$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "errorMessage": 
"FASTQ file must have extension '.fq' or '.fastq' (optionally gzipped)", + "description": "Forward reads if paired-end or single-end reads FASTQ file" + }, + "fastq_2": { + "anyOf": [ + { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(fq|fastq)(\\.gz)?$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "errorMessage": "FASTQ file for reverse reads must have extension '.fq' or '.fastq' (optionally gzipped)", + "description": "Reverse reads FASTQ file if paired-end. Leave empty for single-end reads" + }, + "coverage": { + "anyOf": [ + { + "type": "number", + "minimum": 0 + }, + { + "type": "string", + "maxLength": 0 + } + ], + "errorMessage": "Coverage must be a positive number or empty", + "description": "Estimated value of assembly coverage" + }, + "run_accession": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Accession must be provided and cannot contain spaces", + "description": "Accession of the run used to generate the assembly" + }, + "assembler": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Assembler must be provided and cannot contain spaces", + "description": "Name of the assembler software used to generate the assembly" + }, + "assembler_version": { + "anyOf": [{ "type": "string" }, { "type": "number" }], + "pattern": "^\\S+$", + "errorMessage": "Assembler version must be provided and cannot contain spaces", + "description": "Version of the assembler software used to generate the assembly" + } + }, + "required": ["sample", "fasta", "run_accession", "assembler", "assembler_version"], + "anyOf": [ + { + "properties": { + "fastq_1": { + "type": "string", + "minLength": 1 + } + }, + "required": ["fastq_1"] + }, + { + "properties": { + "coverage": { + "type": "number", + "minimum": 0 + } + }, + "required": ["coverage"] + } + ], + "errorMessage": { + "anyOf": "Either reads or coverage must be provided in the sample sheet for each assembly" + } + } +} diff --git 
a/assets/schema_input.json b/assets/schema_input_genome.json similarity index 97% rename from assets/schema_input.json rename to assets/schema_input_genome.json index d06d150..388673d 100644 --- a/assets/schema_input.json +++ b/assets/schema_input_genome.json @@ -1,8 +1,8 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://raw.githubusercontent.com/nf-core/seqsubmit/main/assets/schema_input.json", + "$id": "https://raw.githubusercontent.com/nf-core/seqsubmit/main/assets/schema_input_genome.json", "title": "nf-core/seqsubmit pipeline - params.input schema", - "description": "Schema for the file provided with params.input", + "description": "Schema for the file provided with params.input if params.mode is set to 'mags' or 'bins'", "type": "array", "items": { "type": "object", diff --git a/assets/seqsubmit_schema.png b/assets/seqsubmit_schema.png new file mode 100644 index 0000000..aa9a81f Binary files /dev/null and b/assets/seqsubmit_schema.png differ diff --git a/conf/modules.config b/conf/modules.config index 4f90f95..1eadfb0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,7 +20,7 @@ process { withName: 'GENOME_UPLOAD' { publishDir = [ - path: { "${params.outdir}/upload/manifests" }, + path: { "${params.outdir}/${params.mode}/upload/manifests" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -28,7 +28,7 @@ process { withName: 'ENA_WEBIN_CLI' { publishDir = [ - path: { "${params.outdir}/upload/webin_cli" }, + path: { "${params.outdir}/${params.mode}/upload/webin_cli" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -37,10 +37,13 @@ process { withName: 'MULTIQC' { ext.args = { params.multiqc_title ? 
"--title \"$params.multiqc_title\"" : '' } publishDir = [ - path: { "${params.outdir}/multiqc" }, + path: { "${params.outdir}/${params.mode}/multiqc" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI|REGISTERSTUDY' { + ext.args = { params.test_upload ? "--test" : "" } + } } diff --git a/conf/test.config b/conf/test.config index f2d8871..e69de29 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,34 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. - - Use as follows: - nextflow run nf-core/seqsubmit -profile test, --outdir - ----------------------------------------------------------------------------------------- -*/ - -process { - resourceLimits = [ - cpus: 4, - memory: '15.GB', - time: '1.h' - ] -} - -params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - - // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/samplesheet_genomesubmit.csv' - - mode = "mags" - ena_genome_study_accession = "PRJEB98843" - centre_name = "TEST_CENTER" - -} diff --git a/conf/test_assembly.config b/conf/test_assembly.config new file mode 100644 index 0000000..d94b5bc --- /dev/null +++ b/conf/test_assembly.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'assets/samplesheet_assembly.csv' + outdir = 'test_output' + + mode = "metagenomic_assemblies" + submission_study = "PRJEB98843" + ena_raw_reads_study_accession = "PRJEB65995" + library = "metagenome" + centre_name = "TEST_CENTER" + +} diff --git a/conf/test_genome.config b/conf/test_genome.config new file mode 100644 index 0000000..43235a8 --- /dev/null +++ b/conf/test_genome.config @@ -0,0 +1,35 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/samplesheet_genomesubmit.csv' + outdir = 'test_output' + + mode = "mags" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" + +} diff --git a/main.nf b/main.nf index 16304a2..9bc5af7 100644 --- a/main.nf +++ b/main.nf @@ -15,8 +15,8 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { SEQSUBMIT } from './workflows/seqsubmit' include { GENOMESUBMIT } from './workflows/genomesubmit' +include { ASSEMBLYSUBMIT } from './workflows/assemblysubmit' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_seqsubmit_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_seqsubmit_pipeline' /* @@ -34,21 +34,28 @@ workflow NFCORE_SEQSUBMIT { samplesheet // channel: samplesheet read in from --input main: - ch_multiqc_report = Channel.empty() + ch_multiqc_report = channel.empty() // // WORKFLOW: Run pipeline // - if ((params.mode == "mags") || (params.mode == "bins")) { + // Depending on the input type (mags/bins or metagenomic_assemblies), one or the another workflow will be triggered + if (params.mode == "mags") { GENOMESUBMIT ( samplesheet, - params.mode + "mags" ) ch_multiqc_report = GENOMESUBMIT.out.multiqc_report - } else { - SEQSUBMIT ( + } else if (params.mode == "bins") { + GENOMESUBMIT ( + samplesheet, + "bins" 
+ ) + ch_multiqc_report = GENOMESUBMIT.out.multiqc_report + } else if (params.mode == "metagenomic_assemblies") { + ASSEMBLYSUBMIT ( samplesheet ) - ch_multiqc_report = SEQSUBMIT.out.multiqc_report + ch_multiqc_report = ASSEMBLYSUBMIT.out.multiqc_report } @@ -68,13 +75,14 @@ workflow { // // SUBWORKFLOW: Run initialisation tasks // + PIPELINE_INITIALISATION ( params.version, params.validate_params, - params.monochrome_logs, args, params.outdir, params.input, + params.mode, params.help, params.help_full, params.show_hidden diff --git a/modules.json b/modules.json index 9601d63..0727774 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "coverm/contig": { + "branch": "master", + "git_sha": "27e31681fdbf1e511257355a236037a8cd9c2b2e", + "installed_by": ["modules"] + }, "fastavalidator": { "branch": "master", "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", diff --git a/modules/local/generate_assembly_manifest/main.nf b/modules/local/generate_assembly_manifest/main.nf index 4cbacfd..fcaf32f 100644 --- a/modules/local/generate_assembly_manifest/main.nf +++ b/modules/local/generate_assembly_manifest/main.nf @@ -2,7 +2,7 @@ process GENERATE_ASSEMBLY_MANIFEST { tag "$meta.id" label 'process_single' - container "community.wave.seqera.io/library/pip_assembly-uploader:7e9461afbdd7a521" + container "community.wave.seqera.io/library/pip_assembly-uploader:2a65298c0161c561" input: tuple val(meta), path(assembly_fasta), path(data_csv) @@ -18,17 +18,19 @@ process GENERATE_ASSEMBLY_MANIFEST { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def tpa = params.upload_tpa ? "--tpa" : "" """ assembly_manifest \\ --study ${assembly_study} \\ --data ${data_csv} \\ --assembly_study ${assembly_study} \\ - --output ${assembly_study}_upload \\ + --output-dir "." 
\\ + ${tpa} \\ ${args} cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version) + assembly_uploader: \$(assembly_manifest --version) END_VERSIONS """ @@ -41,7 +43,7 @@ process GENERATE_ASSEMBLY_MANIFEST { cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version) + assembly_uploader: \$(assembly_manifest --version) END_VERSIONS """ } diff --git a/modules/local/generate_assembly_manifest/nextflow.config b/modules/local/generate_assembly_manifest/nextflow.config new file mode 100644 index 0000000..9a4037e --- /dev/null +++ b/modules/local/generate_assembly_manifest/nextflow.config @@ -0,0 +1,9 @@ +process { + withName: GENERATE_ASSEMBLY_MANIFEST { + ext.args2 = '--test' + } +} +env { + ENA_WEBIN = secrets.WEBIN_ACCOUNT + ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD +} diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test b/modules/local/generate_assembly_manifest/tests/main.nf.test index 1a94d78..897744a 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test @@ -2,6 +2,7 @@ nextflow_process { name "Test Process GENERATE_ASSEMBLY_MANIFEST" script "../main.nf" + config "../nextflow.config" process "GENERATE_ASSEMBLY_MANIFEST" tag "modules" diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap index b57f887..7fef896 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap @@ -11,7 +11,7 @@ ] ], "1": [ - "versions.yml:md5,188d7f56d019104ee5883d69eede27b4" + "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" ], "manifest": [ [ @@ -22,20 +22,20 @@ ] ], "versions": [ - "versions.yml:md5,188d7f56d019104ee5883d69eede27b4" + "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "python": "Python 3.14.0" + 
"assembly_uploader": "assembly_uploader 1.3.3" } } ], "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.0" + "nf-test": "0.9.0", + "nextflow": "25.04.1" }, - "timestamp": "2025-10-29T11:00:23.072784" + "timestamp": "2025-10-30T15:10:02.229709" }, "GENERATE_ASSEMBLY_MANIFEST completes with expected outputs": { "content": [ @@ -45,34 +45,34 @@ { "id": "test" }, - "233126d4c4d023f18c7836ed36395e3c.manifest:md5,b31fec00db575da44ad17950ae2e37ff" + "233126d4c4d023f18c7836ed36395e3c.manifest:md5,3152b34ddec05a2c9937a2e03416e5e1" ] ], "1": [ - "versions.yml:md5,188d7f56d019104ee5883d69eede27b4" + "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" ], "manifest": [ [ { "id": "test" }, - "233126d4c4d023f18c7836ed36395e3c.manifest:md5,b31fec00db575da44ad17950ae2e37ff" + "233126d4c4d023f18c7836ed36395e3c.manifest:md5,3152b34ddec05a2c9937a2e03416e5e1" ] ], "versions": [ - "versions.yml:md5,188d7f56d019104ee5883d69eede27b4" + "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "python": "Python 3.14.0" + "assembly_uploader": "assembly_uploader 1.3.3" } } ], "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.0" + "nf-test": "0.9.0", + "nextflow": "25.04.1" }, - "timestamp": "2025-10-29T11:00:16.801535" + "timestamp": "2025-10-30T15:09:57.708757" } } \ No newline at end of file diff --git a/modules/local/genome_upload/main.nf b/modules/local/genome_upload/main.nf index 204c791..03e3d2c 100644 --- a/modules/local/genome_upload/main.nf +++ b/modules/local/genome_upload/main.nf @@ -34,7 +34,7 @@ process GENOME_UPLOAD { export ENA_WEBIN_PASSWORD=\$WEBIN_PASSWORD genome_upload \\ - -u $params.ena_genome_study_accession \\ + -u $params.submission_study \\ --genome_info ${table_for_upload} \\ --centre_name $params.centre_name \\ --${mags_or_bins_flag} \\ diff --git a/modules/local/registerstudy/main.nf b/modules/local/registerstudy/main.nf index 10d29ac..96c4e47 100644 --- a/modules/local/registerstudy/main.nf +++ 
b/modules/local/registerstudy/main.nf @@ -3,15 +3,15 @@ process REGISTERSTUDY { label 'process_single' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/pip_assembly-uploader:7e9461afbdd7a521" + container "community.wave.seqera.io/library/pip_assembly-uploader:2a65298c0161c561" input: tuple val(meta), val(study), val(center), val(library) output: - tuple val(meta), env(STUDY_ID), emit: study_accession - path "versions.yml" , emit: versions + tuple val(meta), env("STUDY_ID"), emit: study_accession + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -37,7 +37,7 @@ process REGISTERSTUDY { cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version) + assembly_uploader: \$(study_xmls --version) END_VERSIONS """ @@ -49,7 +49,7 @@ process REGISTERSTUDY { cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version) + assembly_uploader: \$(study_xmls --version) END_VERSIONS """ } diff --git a/modules/local/registerstudy/nextflow.config b/modules/local/registerstudy/nextflow.config index aa9135e..3f71a8e 100644 --- a/modules/local/registerstudy/nextflow.config +++ b/modules/local/registerstudy/nextflow.config @@ -3,3 +3,7 @@ process { ext.args2 = '--test' } } +env { + ENA_WEBIN = secrets.WEBIN_ACCOUNT + ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD +} diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index a43da6d..d11a6d1 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -9,12 +9,12 @@ nextflow_process { tag "modules" tag "registerstudy" -/* + test("registerstudy - should register a study on ENA test server") { when { process { - """ + """ input[0] = [ [ id:'test', single_end:false ], // meta map "PRJNA318468", @@ -32,7 +32,6 @@ nextflow_process { ) } } -*/ test("registerstudy - stub") { diff --git 
a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 4e6c14b..1dd3a79 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -47,7 +47,7 @@ ] ], "1": [ - "versions.yml:md5,049c28351d1fd639673904f17b95c213" + "versions.yml:md5,1d079512d28737f6b925e85563aa2c53" ], "study_accession": [ [ @@ -59,14 +59,14 @@ ] ], "versions": [ - "versions.yml:md5,049c28351d1fd639673904f17b95c213" + "versions.yml:md5,1d079512d28737f6b925e85563aa2c53" ] } ], "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.0" + "nf-test": "0.9.0", + "nextflow": "25.04.1" }, - "timestamp": "2025-10-29T10:59:32.499098" + "timestamp": "2025-10-30T14:58:53.721718" } } \ No newline at end of file diff --git a/modules/nf-core/coverm/contig/environment.yml b/modules/nf-core/coverm/contig/environment.yml new file mode 100644 index 0000000..33e9e6c --- /dev/null +++ b/modules/nf-core/coverm/contig/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::coverm=0.7.0 diff --git a/modules/nf-core/coverm/contig/main.nf b/modules/nf-core/coverm/contig/main.nf new file mode 100644 index 0000000..884a67a --- /dev/null +++ b/modules/nf-core/coverm/contig/main.nf @@ -0,0 +1,55 @@ +process COVERM_CONTIG { + tag "${meta.id}" + label "process_medium" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/coverm:0.7.0--hcb7b614_4' : + 'biocontainers/coverm:0.7.0--hcb7b614_4' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(reference) + val bam_input + val interleaved + + output: + tuple val(meta), path("*.depth.txt"), emit: coverage + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def fastq_input = meta.single_end ? "--single" : interleaved ? "--interleaved" : "--coupled" + def input_type = bam_input ? "--bam-files" : "${fastq_input}" + def reference_str = bam_input ? "" : "--reference ${reference}" + """ + TMPDIR=. + + coverm contig \\ + --threads ${task.cpus} \\ + ${input_type} ${input} \\ + ${reference_str} \\ + ${args} \\ + --output-file ${prefix}.depth.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coverm: \$(coverm --version | sed 's/coverm //') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.depth.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coverm: \$(coverm --version | sed 's/coverm //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/coverm/contig/meta.yml b/modules/nf-core/coverm/contig/meta.yml new file mode 100644 index 0000000..d738299 --- /dev/null +++ b/modules/nf-core/coverm/contig/meta.yml @@ -0,0 +1,80 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "coverm_contig" +description: Map reads to contigs and estimate coverage +keywords: + - mapping + - genomics + - metagenomics + - coverage +tools: + - "coverm": + description: "CoverM aims to be a configurable, easy to use and fast DNA read + coverage and relative abundance calculator focused on metagenomics applications" + homepage: "https://github.com/wwood/CoverM" + documentation: 
"https://wwood.github.io/CoverM/coverm-contig.html" + tool_dev_url: "https://github.com/wwood/CoverM" + doi: "10.5281/zenodo.10531253" + licence: ["GPL v3"] + identifier: biotools:coverm + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - input: + type: file + description: | + FASTA/FASTQ containing reads (can be gzipped), or sorted BAM files of reads mapped to a reference. + If supplying PE fasta for multiple samples, should be in the order "sample1_1, sample1_2, sample2_1, sample2_2...". + pattern: "*.{fa,fq,fa.gz,fq.gz,bam}" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'sample1', single_end:false ]` + + - reference: + type: file + description: Reference FASTA file to map reads to, or minimap2/strobealign index. + Not required if using BAM input. + pattern: "*.{fasta,fasta.gz,mmi,sti}" + + ontologies: [] + - bam_input: + type: boolean + description: True if input is bam files + + - interleaved: + type: boolean + description: True if input is interleaved fastq file + +output: + coverage: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + pattern: "*.depths.{txt}" + - "*.depth.txt": + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + pattern: "*.depths.{txt}" + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@prototaxites" +maintainers: + - "@prototaxites" diff --git a/modules/nf-core/coverm/contig/tests/main.nf.test b/modules/nf-core/coverm/contig/tests/main.nf.test new file mode 100644 index 0000000..845068c --- /dev/null +++ b/modules/nf-core/coverm/contig/tests/main.nf.test @@ -0,0 +1,107 @@ +nextflow_process { + + name "Test Process COVERM_CONTIG" + script "../main.nf" + process "COVERM_CONTIG" + + tag "modules" + tag "modules_nfcore" + tag "coverm" + tag "coverm/contig" + + test("coverm_contig - fastq") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test2_2.fastq.gz', checkIfExists: true), + ] + ] + input[1] = [ + [ id:'test_ref' ], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz') + ] + input[2] = false + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("coverm_contig - bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 
'genomics/prokaryotes/bacteroides_fragilis/illumina/bam/test1.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/bam/test2.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [], + [] + ] + input[2] = true + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("coverm_contig - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_2.fastq.gz', checkIfExists: true), + ] + ] + input[1] = [ + [ id:'test_ref' ], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz') + ] + input[2] = false + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/coverm/contig/tests/main.nf.test.snap b/modules/nf-core/coverm/contig/tests/main.nf.test.snap new file mode 100644 index 0000000..0b7ef09 --- /dev/null +++ b/modules/nf-core/coverm/contig/tests/main.nf.test.snap @@ -0,0 +1,107 @@ +{ + "coverm_contig - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.depth.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,c430e2f31175deec3a7e349f635e881b" + ], + "coverage": [ + [ + { + "id": "test", + "single_end": false + }, + "test.depth.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c430e2f31175deec3a7e349f635e881b" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": 
"2024-11-08T10:22:23.71903442" + }, + "coverm_contig - fastq": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.depth.txt:md5,2d68cbaf0dc3c88a024197386a68d71d" + ] + ], + "1": [ + "versions.yml:md5,c430e2f31175deec3a7e349f635e881b" + ], + "coverage": [ + [ + { + "id": "test", + "single_end": false + }, + "test.depth.txt:md5,2d68cbaf0dc3c88a024197386a68d71d" + ] + ], + "versions": [ + "versions.yml:md5,c430e2f31175deec3a7e349f635e881b" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-07-25T14:35:02.975504442" + }, + "coverm_contig - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.depth.txt:md5,e62a5d74dbb62d1ef9f52c12d9a8dcb2" + ] + ], + "1": [ + "versions.yml:md5,c430e2f31175deec3a7e349f635e881b" + ], + "coverage": [ + [ + { + "id": "test", + "single_end": false + }, + "test.depth.txt:md5,e62a5d74dbb62d1ef9f52c12d9a8dcb2" + ] + ], + "versions": [ + "versions.yml:md5,c430e2f31175deec3a7e349f635e881b" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-08T10:22:12.710853713" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastavalidator/fastavalidator.diff b/modules/nf-core/fastavalidator/fastavalidator.diff index e3a26eb..4937db2 100644 --- a/modules/nf-core/fastavalidator/fastavalidator.diff +++ b/modules/nf-core/fastavalidator/fastavalidator.diff @@ -17,17 +17,33 @@ Changes in 'fastavalidator/meta.yml': Changes in 'fastavalidator/main.nf': --- modules/nf-core/fastavalidator/main.nf +++ modules/nf-core/fastavalidator/main.nf -@@ -9,6 +9,7 @@ +@@ -9,11 +9,12 @@ input: tuple val(meta), path(fasta) + val(is_metagenome) output: - tuple val(meta), path('*.success.log') , emit: success_log , optional: true -@@ -25,6 +26,25 @@ +- tuple val(meta), path('*.success.log') , emit: success_log , optional: true +- tuple val(meta), path('*.error.log') , emit: error_log , optional: 
true +- path "versions.yml" , emit: versions ++ tuple val(meta), path('*.success.log'), emit: success_log , optional: true ++ tuple val(meta), path('*.error.log') , emit: error_log , optional: true ++ path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when +@@ -21,10 +22,32 @@ + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ ++ # Ensure *.error.log file exists to append to, even if py_fasta_validator doesn't produce any errors ++ touch "${prefix}.error.log" ++ + py_fasta_validator \\ -f $fasta \\ - 2> "${prefix}.error.log" \\ +- 2> "${prefix}.error.log" \\ ++ 2>> "${prefix}.error.log" \\ || echo "Errors from fasta_validate printed to ${prefix}.error.log" + + # One more check: count contigs. More than 1 contig required. @@ -35,9 +51,9 @@ Changes in 'fastavalidator/main.nf': + + if [ "${is_metagenome}" = true ]; then + if [[ "${fasta}" == *.gz ]]; then -+ CONTIGS=\$(zgrep -c '^>' "${fasta}" || true) ++ CONTIGS=\$(zcat "${fasta}" | grep -c '^>') + else -+ CONTIGS=\$(grep -c '^>' "${fasta}" || true) ++ CONTIGS=\$(grep -c '^>' "${fasta}") + fi + + echo "[INFO] Contigs detected: \${CONTIGS}" @@ -149,7 +165,7 @@ Changes in 'fastavalidator/tests/main.nf.test': + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map -+ file("${moduleDir}/tests/test_data/contig_test.fasta", checkIfExists: true) ++ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = true // is_metagenome flag + """ @@ -169,5 +185,4 @@ Changes in 'fastavalidator/tests/main.nf.test': + } } -'modules/nf-core/fastavalidator/tests/test_data/contig_test.fasta' was created ************************************************************ diff --git a/modules/nf-core/fastavalidator/main.nf b/modules/nf-core/fastavalidator/main.nf index a5fcad0..87db6ca 100644 --- a/modules/nf-core/fastavalidator/main.nf +++ b/modules/nf-core/fastavalidator/main.nf @@ -22,9 +22,12 @@ process FASTAVALIDATOR { script: 
def prefix = task.ext.prefix ?: "${meta.id}" """ + # Ensure *.error.log file exists to append to, even if py_fasta_validator doesn't produce any errors + touch "${prefix}.error.log" + py_fasta_validator \\ -f $fasta \\ - 2> "${prefix}.error.log" \\ + 2>> "${prefix}.error.log" \\ || echo "Errors from fasta_validate printed to ${prefix}.error.log" # One more check: count contigs. More than 1 contig required. @@ -32,9 +35,9 @@ process FASTAVALIDATOR { if [ "${is_metagenome}" = true ]; then if [[ "${fasta}" == *.gz ]]; then - CONTIGS=\$(zgrep -c '^>' "${fasta}" || true) + CONTIGS=\$(zcat "${fasta}" | grep -c '^>') else - CONTIGS=\$(grep -c '^>' "${fasta}" || true) + CONTIGS=\$(grep -c '^>' "${fasta}") fi echo "[INFO] Contigs detected: \${CONTIGS}" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index 33316a7..2e56ff3 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -15,7 +15,7 @@ nextflow_process { when { process { """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) + input[0] = channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) input[1] = [] input[2] = [] input[3] = [] @@ -41,8 +41,8 @@ nextflow_process { when { process { """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) + input[0] = channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) + input[1] = channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) 
input[2] = [] input[3] = [] input[4] = [] @@ -68,7 +68,7 @@ nextflow_process { when { process { """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) + input[0] = channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) input[1] = [] input[2] = [] input[3] = [] diff --git a/nextflow.config b/nextflow.config index 61ceb11..18d11c5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,15 +10,20 @@ params { // Input options - input = null + input = null + mode = null // {mags, bins, metagenomic_assemblies} - mode = "mags" // {mags, bins, contigs} - ena_genome_study_accession = null - centre_name = null - upload_tpa = true - upload_force = true - test_upload = true - webincli_submit = true + // TODO rewrite register_study script to remove these unnecessary parameters + ena_raw_reads_study_accession = null + library = null + + submission_study = null + centre_name = null + upload_tpa = false + // TODO: remove this parameter because it will never be used, and update the genome_uploader module accordingly + upload_force = true + test_upload = true + webincli_submit = true // MultiQC options multiqc_config = null @@ -168,8 +173,11 @@ profiles { apptainer.runOptions = '--nv' singularity.runOptions = '--nv' } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + // TODO: figure out how to better organise tests for different workflow types (bins, mags, metagenomic_assemblies) + // test { includeConfig 'conf/test.config' } + test_genome { includeConfig 'conf/test_genome.config' } + test_assembly { includeConfig 'conf/test_assembly.config' } + test_full { includeConfig 'conf/test_full.config' } } // Load nf-core custom profiles from different institutions @@ -199,10 +207,12 @@ charliecloud.registry = 'quay.io' // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ 
for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. env { - PYTHONNOUSERSITE = 1 - R_PROFILE_USER = "/.Rprofile" - R_ENVIRON_USER = "/.Renviron" - JULIA_DEPOT_PATH = "/usr/local/share/julia" + PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" + JULIA_DEPOT_PATH = "/usr/local/share/julia" + ENA_WEBIN = secrets.WEBIN_ACCOUNT + ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } // Set bash options diff --git a/nextflow_schema.json b/nextflow_schema.json index 11968e2..5744a32 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,17 +10,16 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": ["outdir", "input"], "properties": { "input": { "type": "string", "format": "file-path", "exists": true, - "schema": "assets/schema_input.json", "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/seqsubmit/usage#samplesheet-input).", + "description": "Path to comma-separated file describing the data to be submitted. Format depends on the pipeline mode (mags/bins/metagenomic_assemblies).", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with a set of columns depending on the type of data being submitted. 
See [usage docs](https://nf-co.re/seqsubmit/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, "outdir": { @@ -204,6 +203,7 @@ "title": "Pipeline flow control parameters", "type": "object", "description": "Use these parameters to control the flow of the pipeline execution.", + "required": ["centre_name", "mode"], "properties": { "centre_name": { "type": "string", @@ -212,16 +212,16 @@ }, "upload_tpa": { "type": "boolean", - "description": "Flag to control type of assembly study (third party assembly or not)", - "default": true, - "help": "Use this flag if the study is a third party assembly (TPA). Default: true" + "description": "Flag to control the type of assembly study (third party assembly or not)", + "default": false, + "help": "Use this flag if the study is a third party assembly (TPA). Default: false" }, "mode": { "type": "string", - "default": "mags", + "default": null, "description": "Type of upload", - "help_text": "Different types of data require specific upload steps. That mode controls what upload workflow to run depending on type of data (mags/bins/contigs)", - "enum": ["mags", "bins", "contigs"] + "help_text": "Different types of data require specific upload steps. That mode controls what upload workflow to run depending on type of data (mags/bins/metagenomic_assemblies)", + "enum": ["mags", "bins", "metagenomic_assemblies"] }, "test_upload": { "type": "boolean", @@ -235,11 +235,22 @@ "default": true, "help": "Forces reset of sample xmls generation. This is useful if you changed something in your tsv table, or if ENA metadata haven't been downloaded correctly (you can check this in ENA_backup.json). 
Default: true" }, - "ena_genome_study_accession": { + "submission_study": { "type": "string", - "description": "ENA study accession (PRJ/ERP) to submit MAGs/bins", + "description": "ENA study accession (PRJ/ERP) to submit the data to", "help_text": "Current implementation of pipeline requires to pre-register ENA project (PRJ/ERP) where you want to upload data to. Documentation how to register study: https://ena-docs.readthedocs.io/en/latest/submit/study.html" }, + "library": { + "type": "string", + "enum": ["metagenome", "metatranscriptome"], + "description": "Type of library for the submission. Required for creation of the new submission study.", + "help_text": "Uses script register_study from assembly_uploader package that requires this parameter to compose study title." + }, + "ena_raw_reads_study_accession": { + "type": "string", + "description": "ENA study accession (PRJ/ERP) of the raw reads study associated with the assembly submission. Required for creation of the new submission study.", + "help_text": "Uses script register_study from assembly_uploader package that requires this parameter to compose study title and description." 
+ }, "webincli_submit": { "type": "boolean", "description": "Submit or validate", diff --git a/nf-test.config b/nf-test.config index 3a1fff5..3525ead 100644 --- a/nf-test.config +++ b/nf-test.config @@ -15,7 +15,7 @@ config { profile "test" // list of filenames or patterns that should be trigger a full test run - triggers 'nextflow.config', 'nf-test.config', 'conf/test.config', 'tests/nextflow.config', 'tests/.nftignore' + triggers 'nextflow.config', 'nf-test.config', 'conf/test_genome.config', 'conf/test_assembly.config', 'tests/nextflow.config', 'tests/.nftignore' // load the necessary plugins plugins { diff --git a/subworkflows/local/utils_nfcore_seqsubmit_pipeline/main.nf b/subworkflows/local/utils_nfcore_seqsubmit_pipeline/main.nf index 7d0200c..f1def27 100644 --- a/subworkflows/local/utils_nfcore_seqsubmit_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_seqsubmit_pipeline/main.nf @@ -29,10 +29,10 @@ workflow PIPELINE_INITIALISATION { take: version // boolean: Display version and exit validate_params // boolean: Boolean whether to validate parameters against the schema at runtime - monochrome_logs // boolean: Do not use coloured log outputs nextflow_cli_args // array: List of positional nextflow CLI args outdir // string: The output directory where the results will be saved input // string: Path to input samplesheet + mode // string: Type of input data (mags, bins, metagenomic_assemblies) help // boolean: Display help message and exit help_full // boolean: Show the full help message show_hidden // boolean: Show hidden parameters in the help message @@ -96,8 +96,15 @@ workflow PIPELINE_INITIALISATION { // Create channel from input file provided through params.input // - ch_samplesheet = channel - .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) + if ( mode == "mags" || mode == "bins" ) { + ch_samplesheet = channel + .fromList(samplesheetToList(input, "${projectDir}/assets/schema_input_genome.json")) + } else if ( mode 
== "metagenomic_assemblies" ) { + ch_samplesheet = channel + .fromList(samplesheetToList(input, "${projectDir}/assets/schema_input_assembly.json")) + } else { + error("Invalid or missing --mode '${mode}'. Please set --mode to one of: mags, bins, metagenomic_assemblies") + } emit: samplesheet = ch_samplesheet diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf new file mode 100644 index 0000000..918e1d7 --- /dev/null +++ b/workflows/assemblysubmit.nf @@ -0,0 +1,231 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { COVERM_CONTIG } from '../modules/nf-core/coverm/contig/main' +include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main' +include { GENERATE_ASSEMBLY_MANIFEST } from '../modules/local/generate_assembly_manifest/main' +include { REGISTERSTUDY } from '../modules/local/registerstudy/main' +include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' + +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_seqsubmit_pipeline' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN THE WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow ASSEMBLYSUBMIT { + + take: + ch_samplesheet // channel: samplesheet read in from --input + + main: + ch_versions = channel.empty() + ch_multiqc_files = channel.empty() + + // Create assembly channel with proper metadata structure + assembly_fasta = ch_samplesheet + .map { row 
-> + def meta = [ + id: row[0].id, + single_end: row[3] ? false : true, + coverage: row[4] ?: null, + run_accession: row[5], + assembler: row[6], + assembler_version: row[7] + ] + [meta, file(row[1])] + } + + reads_fastq = ch_samplesheet + .filter { row -> row[2] && row[2] != "" } // Check if fastq_1 exists and is not empty + .map { row -> + def meta = [ + id: row[0].id, + single_end: row[3] ? false : true, + coverage: row[4] ?: null, + run_accession: row[5], + assembler: row[6], + assembler_version: row[7] + ] + + if (row[3] && row[3] != "") { + // If paired end reads + [meta, [file(row[2]), file(row[3])]] + } else { + // If single end + [meta, file(row[2])] + } + } + + // Check fasta files are properly formatted + FASTAVALIDATOR ( + assembly_fasta, + "true" // is_metagenome flag + ) + // TODO add some logging here to track discarded assemblies + validated_fastas = assembly_fasta.join(FASTAVALIDATOR.out.success_log) + .map { meta, fasta, _log -> + [meta, fasta] + } + + // TODO add human decontamination step + + // For assemblies without coverage, calculate coverage with CoverM + validated_fastas.filter { meta, _fasta -> meta.coverage == null } + .join(reads_fastq) + .multiMap { meta, fasta, fastq -> + assembly: [ meta, fasta ] + reads: [ meta, fastq ] + } + .set { coverm_input } + COVERM_CONTIG ( + coverm_input.reads, + coverm_input.assembly, + false, // bam_input + false // interleaved + ) + + // Calculate average coverage using map operator + average_coverage_ch = COVERM_CONTIG.out.coverage + .map { meta, coverage_file -> + // Read the file and calculate average + def lines = coverage_file.readLines() + def coverages = lines[1..-1].collect { line -> + line.split('\t')[1] as Double + } + def average = coverages.sum() / coverages.size() + return [meta, average] + } + + // Update metadata with calculated coverage + validated_fastas + .filter { meta, _fasta -> meta.coverage == null } + .join( average_coverage_ch ) + .map { meta, fasta, avg_coverage -> + def 
updated_meta = meta.clone() + updated_meta.coverage = avg_coverage + [updated_meta, fasta] + } + .set { assemblies_with_added_cov_ch } + + // Combine assemblies with updated metadata (for samples that had coverage calculated) + // and assemblies that already had coverage + assemblies_with_coverage = validated_fastas + .filter { meta, _fasta -> meta.coverage != null } + .mix( assemblies_with_added_cov_ch ) + + // TODO add validation step to check number of lines in CSV matches number of assemblies + + assembly_metadata_csv = assemblies_with_coverage + .map { meta, fasta -> + def header = 'Runs,Coverage,Assembler,Version,Filepath,Sample' + def row = [ + meta.run_accession ?: '', + meta.coverage ?: '', + meta.assembler ?: '', + meta.assembler_version ?: '', + fasta.name, + '' // Sample column left empty because co assemblies are not supported + ].join(',') + + def content = "${header}\n${row}" + def csv_file = file("${params.outdir}/${params.mode}/${meta.id}_assembly_metadata.csv") + csv_file.text = content + + [meta, csv_file] + } + + def study_accession_ch + if (params.submission_study) { + // Use provided study accession directly + study_accession_ch = channel.of(params.submission_study) + } else { + // Register a new study + REGISTERSTUDY( + [[id:"study"], params.ena_raw_reads_study_accession, params.centre_name, params.library ] + ) + study_accession_ch = REGISTERSTUDY.out.study_accession.map { _meta, accession -> accession } + } + + // Generate assembly manifest files and submit them to ENA + GENERATE_ASSEMBLY_MANIFEST( + assemblies_with_coverage.join(assembly_metadata_csv), + study_accession_ch.first() + ) + + ENA_WEBIN_CLI( + assemblies_with_coverage.join(GENERATE_ASSEMBLY_MANIFEST.out.manifest) + ) + + // + // Collate and save software versions + // + softwareVersionsToYAML(ch_versions) + .collectFile( + storeDir: "${params.outdir}/pipeline_info", + name: 'nf_core_' + 'seqsubmit_software_' + 'mqc_' + 'versions.yml', + sort: true, + newLine: true + ).set { 
ch_collated_versions } + + + // + // MODULE: MultiQC + // + ch_multiqc_config = channel.fromPath( + "$projectDir/assets/multiqc_config.yml", checkIfExists: true) + ch_multiqc_custom_config = params.multiqc_config ? + channel.fromPath(params.multiqc_config, checkIfExists: true) : + channel.empty() + ch_multiqc_logo = params.multiqc_logo ? + channel.fromPath(params.multiqc_logo, checkIfExists: true) : + channel.empty() + + summary_params = paramsSummaryMap( + workflow, parameters_schema: "nextflow_schema.json") + ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) + ch_multiqc_files = ch_multiqc_files.mix( + ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_custom_methods_description = params.multiqc_methods_description ? + file(params.multiqc_methods_description, checkIfExists: true) : + file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + ch_methods_description = channel.value( + methodsDescriptionText(ch_multiqc_custom_methods_description)) + + ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) + ch_multiqc_files = ch_multiqc_files.mix( + ch_methods_description.collectFile( + name: 'methods_description_mqc.yaml', + sort: true + ) + ) + + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList(), + [], + [] + ) + + emit: + multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] + +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index 90e0cf9..8d1b319 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -26,8 +26,8 @@ workflow GENOMESUBMIT { main: - ch_versions = 
Channel.empty() - ch_multiqc_files = Channel.empty() + ch_versions = channel.empty() + ch_multiqc_files = channel.empty() // Create channel with meta and fasta ch_mags = ch_samplesheet @@ -116,24 +116,24 @@ workflow GENOMESUBMIT { // // MODULE: MultiQC // - ch_multiqc_config = Channel.fromPath( + ch_multiqc_config = channel.fromPath( "$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? - Channel.fromPath(params.multiqc_config, checkIfExists: true) : - Channel.empty() + channel.fromPath(params.multiqc_config, checkIfExists: true) : + channel.empty() ch_multiqc_logo = params.multiqc_logo ? - Channel.fromPath(params.multiqc_logo, checkIfExists: true) : - Channel.empty() + channel.fromPath(params.multiqc_logo, checkIfExists: true) : + channel.empty() summary_params = paramsSummaryMap( workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) + ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) ch_multiqc_files = ch_multiqc_files.mix( ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value( + ch_methods_description = channel.value( methodsDescriptionText(ch_multiqc_custom_methods_description)) ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) diff --git a/workflows/seqsubmit.nf b/workflows/seqsubmit.nf deleted file mode 100644 index 5bf6367..0000000 --- a/workflows/seqsubmit.nf +++ /dev/null @@ -1,106 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_seqsubmit_pipeline' - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -workflow SEQSUBMIT { - - take: - ch_samplesheet // channel: samplesheet read in from --input - main: - - ch_versions = channel.empty() - ch_multiqc_files = channel.empty() - - // - // Collate and save software versions - // - def topic_versions = Channel.topic("versions") - .distinct() - .branch { entry -> - versions_file: entry instanceof Path - versions_tuple: true - } - - def topic_versions_string = topic_versions.versions_tuple - .map { process, tool, version -> - [ process[process.lastIndexOf(':')+1..-1], " ${tool}: ${version}" ] - } - .groupTuple(by:0) - .map { process, tool_versions -> - tool_versions.unique().sort() - 
"${process}:\n${tool_versions.join('\n')}" - } - - softwareVersionsToYAML(ch_versions.mix(topic_versions.versions_file)) - .mix(topic_versions_string) - .collectFile( - storeDir: "${params.outdir}/pipeline_info", - name: 'nf_core_' + 'seqsubmit_software_' + 'mqc_' + 'versions.yml', - sort: true, - newLine: true - ).set { ch_collated_versions } - - - // - // MODULE: MultiQC - // - ch_multiqc_config = channel.fromPath( - "$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? - channel.fromPath(params.multiqc_config, checkIfExists: true) : - channel.empty() - ch_multiqc_logo = params.multiqc_logo ? - channel.fromPath(params.multiqc_logo, checkIfExists: true) : - channel.empty() - - summary_params = paramsSummaryMap( - workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_files = ch_multiqc_files.mix( - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
- file(params.multiqc_methods_description, checkIfExists: true) : - file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = channel.value( - methodsDescriptionText(ch_multiqc_custom_methods_description)) - - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix( - ch_methods_description.collectFile( - name: 'methods_description_mqc.yaml', - sort: true - ) - ) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList(), - [], - [] - ) - - emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] - -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/