From 521cda46fc3df47ce6ff8e50c158ab7b74a99e4a Mon Sep 17 00:00:00 2001
From: Oliver Schwengers
Date: Fri, 17 Jan 2025 09:36:59 +0100
Subject: [PATCH] refactor DB NF scripts to DSL2

---
Reviewer notes (placed below the "---" marker, so not part of the commit message):

Fixes applied to added lines of the submitted patch (hunk line counts unchanged):
  * build-db-refseq.nf: the FASTA publishDir path read "params.omain", a
    parameter that is declared nowhere in the script; it now reads
    "params.domain", matching the db.tsv storeDir of the same workflow.
  * build-db-refseq.nf: the workflow read "params.assemblySummary", while the
    removed DSL1 code and the new parameter comment both name "params.ass_sum";
    the workflow now reads "params.ass_sum" so existing invocations keep working.
  * build-plasmids-db-plsdb.nf and build-plasmids-db-refseq.nf: the "set" input
    qualifier is replaced by "tuple", as the other two scripts already do.

Open point (TODO: confirm against the full file):
  * build-plasmids-db-refseq.nf keeps "Channel.fromPath(plasmids)" as an
    unchanged context line although "plasmids" is no longer assigned, and no
    hunk touches the lines between it and the sketch process. Presumably the
    legacy top-level channel block (removed from the plsdb script by this
    patch) is still present there and has to be deleted as well.

Caveats:
  * Blank context lines and indentation were rebuilt from the hunk headers and
    the diffstat; verify them against the working tree, or apply with
    whitespace checks relaxed.
  * The blob ids on the "index" lines predate the fixes listed above.

 db-scripts/build-db-gtdb.nf            | 57 +++++++++++-----------
 db-scripts/build-db-refseq.nf          | 66 ++++++++++++++------------
 db-scripts/build-plasmids-db-plsdb.nf  | 51 ++++++++++----------
 db-scripts/build-plasmids-db-refseq.nf | 38 +++++++++++----
 4 files changed, 120 insertions(+), 92 deletions(-)

diff --git a/db-scripts/build-db-gtdb.nf b/db-scripts/build-db-gtdb.nf
index 7da541c..6026212 100644
--- a/db-scripts/build-db-gtdb.nf
+++ b/db-scripts/build-db-gtdb.nf
@@ -1,25 +1,11 @@
-nextflow.enable.dsl=1
+nextflow.enable.dsl=2
 
 import java.nio.file.*
 
 
-metadata = params.metadata
-representatives = params.representatives
-domain = params.domain
-
-
-Channel.fromPath( metadata )
-    .splitCsv( skip: 1, sep: '\t' )
-    .filter( { it[18].toLowerCase() == 't' } )
-    .map( {
-        def acc = it[0] - 'RS_' - 'GB_'
-        def orgName = it[65].split(';').last() - 's__'
-        def path = acc.substring(0,3) + '/' + acc.substring(4,7) + '/' + acc.substring(7,10) + '/' + acc.substring(10,13)
-        def status = it[48].toLowerCase()
-        return [ acc, '-', status, orgName, path ]
-    } )
-    .dump()
-    .set { validGenomes }
+// params.metadata
+// params.representatives
+// params.domain
 
 
 process sketch {
@@ -32,16 +18,16 @@ process sketch {
     container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'
 
     input:
-    tuple val(acc), val(taxId), val(status), val(orgName), val(path) from validGenomes
+    tuple val(acc), val(taxId), val(status), val(orgName), val(path)
 
     output:
-    tuple val(acc), val(taxId), val(status), val(orgName) into chDbEntries
-    file("${acc}.msh") into outMash
-    file("${acc}.fna.gz") into outFasta
-
-    publishDir pattern: '*.fna.gz', path: "./${domain}/", mode: 'move'
-    publishDir pattern: '*.msh', path: './sketches/', mode: 'move'
+    tuple val(acc), val(taxId), val(status), val(orgName), emit: results
+    path("${acc}.msh")
+    path("${acc}.fna.gz")
 
+    publishDir pattern: "${acc}.msh", path: './sketches/', mode: 'move'
+    publishDir pattern: "${acc}.fna.gz", path: "./${params.domain}/", mode: 'move'
+
     script:
     """
     gzip -dc ${params.representatives}/${path}/${acc}_genomic.fna.gz > ${acc}
@@ -52,5 +38,22 @@ process sketch {
 }
 
 
-chDbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}\n" }
-    .collectFile( name: 'db.tsv', sort: false, tempDir: "${workDir}/", storeDir: "./${domain}/" )
+workflow {
+
+    processedGenomes = Channel.fromPath( params.metadata )
+        | splitCsv( skip: 1, sep: '\t' )
+        | filter( { it[18].toLowerCase() == 't' } )
+        | map( {
+            def acc = it[0] - 'RS_' - 'GB_'
+            def orgName = it[65].split(';').last() - 's__'
+            def path = acc.substring(0,3) + '/' + acc.substring(4,7) + '/' + acc.substring(7,10) + '/' + acc.substring(10,13)
+            def status = it[48].toLowerCase()
+            return [ acc, '-', status, orgName, path ]
+        } )
+        | sketch
+
+    processedGenomes.results
+        | map( { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}\n" } )
+        | collectFile( name: 'db.tsv', sort: false, tempDir: "${workDir}/", storeDir: "./${params.domain}/" )
+
+}
diff --git a/db-scripts/build-db-refseq.nf b/db-scripts/build-db-refseq.nf
index 8e7e9ad..65abf15 100644
--- a/db-scripts/build-db-refseq.nf
+++ b/db-scripts/build-db-refseq.nf
@@ -1,25 +1,11 @@
+nextflow.enable.dsl=2
 
 import java.nio.file.*
 
 
-assemblySummary = params.ass_sum
-ncbiPath = params.ncbiPath
-domain = params.domain
-
-
-Channel.fromPath( assemblySummary )
-    .splitCsv( skip: 2, sep: '\t' )
-    .filter( { (it[11].toLowerCase() == 'complete genome') || (it[4].toLowerCase() == 'representative genome') || (it[4].toLowerCase() == 'reference genome') } )
-    .map( {
-        def species = it[7]
-        def strain = it[8] - 'strain='
-        def status = it[11].split(' ')[0].toLowerCase()
-        if( species.contains( strain ) )
-            return [ it[0], it[5], species, status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
-        else
-            return [ it[0], it[5], "${species} ${strain}", status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
-    } )
-    .set { chValidGenomes }
+// params.ass_sum
+// params.ncbiPath
+// params.domain
 
 
 process download {
@@ -32,14 +18,14 @@ process download {
     maxRetries 3
 
     input:
-    tuple val(acc), val(taxId), val(orgName), val(status), val(path) from chValidGenomes
+    tuple val(acc), val(taxId), val(orgName), val(status), val(path)
 
     output:
-    tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz") into chDownloadedGenomes
+    tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz")
 
     script:
     """
-    wget -O ${acc}.gz ${ncbiPath}/${path}/${path.split('/').last()}_genomic.fna.gz
+    wget -O ${acc}.gz ${params.ncbiPath}/${path}/${path.split('/').last()}_genomic.fna.gz
     """
 }
 
@@ -54,16 +40,16 @@ process sketch {
     container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'
 
     input:
-    tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz") from chDownloadedGenomes
+    tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz")
 
     output:
-    tuple val(acc), val(taxId), val(status), val(orgName) into chDbEntries
-    file("${acc}.msh") into outMash
-    file("${acc}.fna.gz") into outFasta
-
-    publishDir pattern: '*.fna.gz', path: "./${domain}-refseq/", mode: 'move'
-    publishDir pattern: '*.msh', path: './sketches/', mode: 'move'
+    tuple val(acc), val(taxId), val(status), val(orgName), emit: results
+    file("${acc}.msh")
+    file("${acc}.fna.gz")
 
+    publishDir pattern: "${acc}.msh", path: './sketches/', mode: 'move'
+    publishDir pattern: "${acc}.fna.gz", path: "./${params.domain}-refseq/", mode: 'move'
+
     script:
     """
     gunzip -c ${acc}.gz > ${acc}
@@ -74,5 +60,25 @@ process sketch {
 }
 
 
-chDbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
-    .collectFile( name: 'db.tsv', storeDir: "./${domain}-refseq/", newLine: true )
+workflow {
+
+    processedGenomes = Channel.fromPath( params.ass_sum )
+        | splitCsv( skip: 2, sep: '\t' )
+        | filter( { (it[11].toLowerCase() == 'complete genome') || (it[4].toLowerCase() == 'representative genome') || (it[4].toLowerCase() == 'reference genome') } )
+        | map( {
+            def species = it[7]
+            def strain = it[8] - 'strain='
+            def status = it[11].split(' ')[0].toLowerCase()
+            if( species.contains( strain ) )
+                return [ it[0], it[5], species, status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
+            else
+                return [ it[0], it[5], "${species} ${strain}", status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
+        } )
+        | download
+        | sketch
+
+    processedGenomes.results
+        | map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
+        | collectFile( name: 'db.tsv', storeDir: "./${params.domain}-refseq/", newLine: true )
+
+}
diff --git a/db-scripts/build-plasmids-db-plsdb.nf b/db-scripts/build-plasmids-db-plsdb.nf
index c0b0403..0d3ea16 100644
--- a/db-scripts/build-plasmids-db-plsdb.nf
+++ b/db-scripts/build-plasmids-db-plsdb.nf
@@ -1,23 +1,9 @@
+nextflow.enable.dsl=2
 
 import java.nio.file.*
 
 
-plasmids = params.plasmids
-
-
-Channel.fromPath(plasmids)
-    .into { fastaFiles; fastaEntries }
-
-fastaFiles
-    .splitFasta( by: 1, file: true)
-    .set { fastaFiles }
-
-fastaEntries
-    .splitFasta( record: [id: true, desc: true ] )
-    .map( {
-        return [ it.id, '', it.desc, 'complete' ]
-    } )
-    .into { validGenomes; dbEntries }
+// params.plasmids
 
 
 process sketch {
@@ -30,16 +16,16 @@ process sketch {
     container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'
 
     input:
-    file(sequence) from fastaFiles
-    set val(acc), val(taxId), val(orgName), val(status) from validGenomes
+    file(sequence)
+    tuple val(acc), val(taxId), val(orgName), val(status)
 
     output:
-    file("${acc}.msh") into outMash
-    file("${acc}.fna.gz") into outFasta
-
-    publishDir pattern: '*.fna.gz', path: "./plasmids-plsdb/", mode: 'move'
-    publishDir pattern: '*.msh', path: './sketches/', mode: 'move'
+    file("${acc}.msh")
+    file("${acc}.fna.gz")
 
+    publishDir pattern: "${acc}.msh", path: './sketches/', mode: 'move'
+    publishDir pattern: "${acc}.fna.gz", path: "./plasmids-plsdb/", mode: 'move'
+
     script:
     """
     mv ${sequence} ${acc}
@@ -50,5 +36,20 @@ process sketch {
 }
 
 
-dbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
-    .collectFile( name: 'db.tsv', storeDir: './plasmids-plsdb/', newLine: true )
+workflow {
+
+    plasmidsFasta = Channel.fromPath(params.plasmids)
+
+    plasmidSequences = plasmidsFasta
+        | splitFasta( by: 1, file: true)
+
+    plasmidRecords = plasmidsFasta
+        | splitFasta( record: [id: true, desc: true ] )
+        | map( { [ it.id, '', it.desc, 'complete' ] } )
+
+    sketch( plasmidSequences, plasmidRecords)
+
+    plasmidRecords
+        | map( { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" } )
+        | collectFile( name: 'db.tsv', storeDir: './plasmids-plsdb/', newLine: true )
+}
diff --git a/db-scripts/build-plasmids-db-refseq.nf b/db-scripts/build-plasmids-db-refseq.nf
index d219d3a..2c84d43 100644
--- a/db-scripts/build-plasmids-db-refseq.nf
+++ b/db-scripts/build-plasmids-db-refseq.nf
@@ -1,8 +1,9 @@
+nextflow.enable.dsl=2
 
 import java.nio.file.*
 
 
-plasmids = params.plasmids
+// params.plasmids
 
 
 Channel.fromPath(plasmids)
@@ -31,16 +32,16 @@ process sketch {
     container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'
 
     input:
-    file(sequence) from fastaFiles
-    set val(acc), val(taxId), val(orgName), val(status) from validGenomes
+    file(sequence)
+    tuple val(acc), val(taxId), val(orgName), val(status)
 
     output:
-    file("${acc}.msh") into outMash
-    file("${acc}.fna.gz") into outFasta
+    file("${acc}.msh")
+    file("${acc}.fna.gz")
 
-    publishDir pattern: '*.fna.gz', path: "./plasmids-refseq/", mode: 'move'
     publishDir pattern: '*.msh', path: './sketches/', mode: 'move'
-
+    publishDir pattern: '*.fna.gz', path: "./plasmids-refseq/", mode: 'move'
+
     script:
     """
     mv ${sequence} ${acc}
@@ -50,6 +51,23 @@ process sketch {
     """
 }
 
-
-dbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
-    .collectFile( name: 'db.tsv', storeDir: './plasmids-refseq/', newLine: true )
+workflow {
+
+    plasmidsFasta = Channel.fromPath(params.plasmids)
+
+    plasmidSequences = plasmidsFasta
+        | splitFasta( by: 1, file: true)
+
+    plasmidRecords = plasmidsFasta
+        | splitFasta( record: [id: true, desc: true ] )
+        | map( {
+            def plasmidName = it.desc - ', complete sequence'
+            return [ it.id, '', plasmidName, 'complete' ]
+        } )
+
+    sketch( plasmidSequences, plasmidRecords)
+
+    plasmidRecords
+        | map( { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" } )
+        | collectFile( name: 'db.tsv', storeDir: './plasmids-refseq/', newLine: true )
+}