From 521cda46fc3df47ce6ff8e50c158ab7b74a99e4a Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Jan 2025 09:36:59 +0100
Subject: [PATCH] refactor DB NF scripts to DSL2

---
 db-scripts/build-db-gtdb.nf            | 57 +++++++++++-----------
 db-scripts/build-db-refseq.nf          | 66 ++++++++++++++------------
 db-scripts/build-plasmids-db-plsdb.nf  | 51 ++++++++++----------
 db-scripts/build-plasmids-db-refseq.nf | 38 +++++++++++----
 4 files changed, 120 insertions(+), 92 deletions(-)

diff --git a/db-scripts/build-db-gtdb.nf b/db-scripts/build-db-gtdb.nf
index 7da541c..6026212 100644
--- a/db-scripts/build-db-gtdb.nf
+++ b/db-scripts/build-db-gtdb.nf
@@ -1,25 +1,11 @@
-nextflow.enable.dsl=1
+nextflow.enable.dsl=2
 
 import java.nio.file.*
 
 
-metadata        = params.metadata
-representatives = params.representatives
-domain          = params.domain
-
-
-Channel.fromPath( metadata )
-    .splitCsv( skip: 1, sep: '\t'  )
-    .filter( { it[18].toLowerCase() == 't' } )
-    .map( {
-        def acc = it[0] - 'RS_' - 'GB_'
-        def orgName = it[65].split(';').last() - 's__'
-        def path = acc.substring(0,3) + '/' + acc.substring(4,7) + '/' + acc.substring(7,10) + '/' + acc.substring(10,13)
-        def status = it[48].toLowerCase()
-        return [ acc, '-', status, orgName, path ]
-    } )
-    .dump()
-    .set { validGenomes }
+// params.metadata
+// params.representatives
+// params.domain
 
 
 process sketch {
@@ -32,16 +18,16 @@ process sketch {
     container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'
 
     input:
-    tuple val(acc), val(taxId), val(status), val(orgName), val(path) from validGenomes
+    tuple val(acc), val(taxId), val(status), val(orgName), val(path)
 
     output:
-    tuple val(acc), val(taxId), val(status), val(orgName) into chDbEntries
-    file("${acc}.msh") into outMash
-    file("${acc}.fna.gz") into outFasta
-
-    publishDir pattern: '*.fna.gz', path: "./${domain}/", mode: 'move'
-    publishDir pattern: '*.msh', path: './sketches/',  mode: 'move'
+    tuple val(acc), val(taxId), val(status), val(orgName), emit: results
+    path("${acc}.msh")
+    path("${acc}.fna.gz")
 
+    publishDir pattern: "${acc}.msh", path: './sketches/',  mode: 'move'
+    publishDir pattern: "${acc}.fna.gz", path: "./${params.domain}/", mode: 'move'
+    
     script:
     """
     gzip -dc ${params.representatives}/${path}/${acc}_genomic.fna.gz > ${acc}
@@ -52,5 +38,22 @@ process sketch {
 }
 
 
-chDbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}\n" }
-    .collectFile( name: 'db.tsv', sort: false, tempDir: "${workDir}/", storeDir: "./${domain}/" )
+workflow {
+
+    processedGenomes = Channel.fromPath( params.metadata )
+        | splitCsv( skip: 1, sep: '\t'  )
+        | filter( { it[18].toLowerCase() == 't' } )
+        | map( {
+            def acc = it[0] - 'RS_' - 'GB_'
+            def orgName = it[65].split(';').last() - 's__'
+            def path = acc.substring(0,3) + '/' + acc.substring(4,7) + '/' + acc.substring(7,10) + '/' + acc.substring(10,13)
+            def status = it[48].toLowerCase()
+            return [ acc, '-', status, orgName, path ]
+        } )
+        | sketch
+
+    processedGenomes.results
+        | map( { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}\n" } )
+        | collectFile( name: 'db.tsv', sort: false, tempDir: "${workDir}/", storeDir: "./${params.domain}/" )
+
+}
diff --git a/db-scripts/build-db-refseq.nf b/db-scripts/build-db-refseq.nf
index 8e7e9ad..65abf15 100644
--- a/db-scripts/build-db-refseq.nf
+++ b/db-scripts/build-db-refseq.nf
@@ -1,25 +1,11 @@
+nextflow.enable.dsl=2
 
 import java.nio.file.*
 
 
-assemblySummary = params.ass_sum
-ncbiPath        = params.ncbiPath
-domain          = params.domain
-
-
-Channel.fromPath( assemblySummary )
-    .splitCsv( skip: 2, sep: '\t'  )
-    .filter( { (it[11].toLowerCase() == 'complete genome')  ||  (it[4].toLowerCase() == 'representative genome')  ||  (it[4].toLowerCase() == 'reference genome') } )
-    .map( {
-        def species = it[7]
-        def strain  = it[8] - 'strain='
-	def status  = it[11].split(' ')[0].toLowerCase()
-        if( species.contains( strain ) )
-            return [ it[0], it[5], species, status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
-        else
-            return [ it[0], it[5], "${species} ${strain}", status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
-    } )
-    .set { chValidGenomes }
+// params.ass_sum
+// params.ncbiPath
+// params.domain
 
 
 process download {
@@ -32,14 +18,14 @@ process download {
     maxRetries 3
 
     input:
-    tuple val(acc), val(taxId), val(orgName), val(status), val(path) from chValidGenomes
+    tuple val(acc), val(taxId), val(orgName), val(status), val(path)
 
     output:
-    tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz") into chDownloadedGenomes
+    tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz")
 
     script:
     """
-    wget -O ${acc}.gz ${ncbiPath}/${path}/${path.split('/').last()}_genomic.fna.gz
+    wget -O ${acc}.gz ${params.ncbiPath}/${path}/${path.split('/').last()}_genomic.fna.gz
     """
 }
 
@@ -54,16 +40,16 @@ process sketch {
     container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'
 
     input:
-    tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz") from chDownloadedGenomes
+    tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz")
 
     output:
-    tuple val(acc), val(taxId), val(status), val(orgName) into chDbEntries
-    file("${acc}.msh") into outMash
-    file("${acc}.fna.gz") into outFasta
-
-    publishDir pattern: '*.fna.gz', path: "./${domain}-refseq/", mode: 'move'
-    publishDir pattern: '*.msh', path: './sketches/',  mode: 'move'
+    tuple val(acc), val(taxId), val(status), val(orgName), emit: results
+    path("${acc}.msh")
+    path("${acc}.fna.gz")
 
+    publishDir pattern: "${acc}.msh", path: './sketches/',  mode: 'move'
+    publishDir pattern: "${acc}.fna.gz", path: "./${params.domain}-refseq/", mode: 'move'
+    
     script:
     """
     gunzip -c ${acc}.gz > ${acc}
@@ -74,5 +60,25 @@ process sketch {
 }
 
 
-chDbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
-    .collectFile( name: 'db.tsv', storeDir: "./${domain}-refseq/", newLine: true )
+workflow {
+
+    processedGenomes = Channel.fromPath( params.ass_sum )
+        | splitCsv( skip: 2, sep: '\t'  )
+        | filter( { (it[11].toLowerCase() == 'complete genome')  ||  (it[4].toLowerCase() == 'representative genome')  ||  (it[4].toLowerCase() == 'reference genome') } )
+        | map( {
+            def species = it[7]
+            def strain  = it[8] - 'strain='
+	        def status  = it[11].split(' ')[0].toLowerCase()
+            if( species.contains( strain ) )
+                return [ it[0], it[5], species, status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
+            else
+                return [ it[0], it[5], "${species} ${strain}", status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
+        } )
+        | download
+        | sketch
+
+    processedGenomes.results
+        | map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
+        | collectFile( name: 'db.tsv', storeDir: "./${params.domain}-refseq/", newLine: true )
+
+}
diff --git a/db-scripts/build-plasmids-db-plsdb.nf b/db-scripts/build-plasmids-db-plsdb.nf
index c0b0403..0d3ea16 100644
--- a/db-scripts/build-plasmids-db-plsdb.nf
+++ b/db-scripts/build-plasmids-db-plsdb.nf
@@ -1,23 +1,9 @@
+nextflow.enable.dsl=2
 
 import java.nio.file.*
 
 
-plasmids        = params.plasmids
-
-
-Channel.fromPath(plasmids)
-    .into { fastaFiles; fastaEntries }
-
-fastaFiles
-    .splitFasta( by: 1, file: true)
-    .set { fastaFiles }
-
-fastaEntries
-    .splitFasta( record: [id: true, desc: true ] )
-    .map( {
-	    return [ it.id, '', it.desc, 'complete' ]
-    } )
-    .into { validGenomes; dbEntries }
+// params.plasmids
 
 
 process sketch {
@@ -30,16 +16,16 @@ process sketch {
     container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'
 
     input:
-    file(sequence) from fastaFiles
-    set val(acc), val(taxId), val(orgName), val(status) from validGenomes
+    path(sequence)
+    tuple val(acc), val(taxId), val(orgName), val(status)
 
     output:
-    file("${acc}.msh") into outMash
-    file("${acc}.fna.gz") into outFasta
-
-    publishDir pattern: '*.fna.gz', path: "./plasmids-plsdb/", mode: 'move'
-    publishDir pattern: '*.msh', path: './sketches/', mode: 'move'
+    path("${acc}.msh")
+    path("${acc}.fna.gz")
 
+    publishDir pattern: "${acc}.msh", path: './sketches/', mode: 'move'
+    publishDir pattern: "${acc}.fna.gz", path: "./plasmids-plsdb/", mode: 'move'
+    
     script:
     """
     mv ${sequence} ${acc}
@@ -50,5 +36,20 @@ process sketch {
 }
 
 
-dbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
-    .collectFile( name: 'db.tsv', storeDir: './plasmids-plsdb/', newLine: true )
+workflow {
+    
+    plasmidsFasta = Channel.fromPath(params.plasmids)
+
+    plasmidSequences = plasmidsFasta
+        | splitFasta( by: 1, file: true)
+    
+    plasmidRecords = plasmidsFasta
+        | splitFasta( record: [id: true, desc: true ] )
+        | map( { [ it.id, '', it.desc, 'complete' ] } )
+
+    sketch( plasmidSequences, plasmidRecords)
+
+    plasmidRecords
+        | map( { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" } )
+        | collectFile( name: 'db.tsv', storeDir: './plasmids-plsdb/', newLine: true )
+}
diff --git a/db-scripts/build-plasmids-db-refseq.nf b/db-scripts/build-plasmids-db-refseq.nf
index d219d3a..2c84d43 100644
--- a/db-scripts/build-plasmids-db-refseq.nf
+++ b/db-scripts/build-plasmids-db-refseq.nf
@@ -1,8 +1,9 @@
+nextflow.enable.dsl=2
 
 import java.nio.file.*
 
 
-plasmids        = params.plasmids
+// params.plasmids
 
 
 Channel.fromPath(plasmids)
@@ -31,16 +32,16 @@ process sketch {
     container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'
 
     input:
-    file(sequence) from fastaFiles
-    set val(acc), val(taxId), val(orgName), val(status) from validGenomes
+    path(sequence)
+    tuple val(acc), val(taxId), val(orgName), val(status)
 
     output:
-    file("${acc}.msh") into outMash
-    file("${acc}.fna.gz") into outFasta
+    path("${acc}.msh")
+    path("${acc}.fna.gz")
 
-    publishDir pattern: '*.fna.gz', path: "./plasmids-refseq/", mode: 'move'
     publishDir pattern: '*.msh', path: './sketches/', mode: 'move'
-
+    publishDir pattern: '*.fna.gz', path: "./plasmids-refseq/", mode: 'move'
+    
     script:
     """
     mv ${sequence} ${acc}
@@ -50,6 +51,23 @@ process sketch {
     """
 }
 
-
-dbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
-    .collectFile( name: 'db.tsv', storeDir: './plasmids-refseq/', newLine: true )
+workflow {
+    
+    plasmidsFasta = Channel.fromPath(params.plasmids)
+
+    plasmidSequences = plasmidsFasta
+        | splitFasta( by: 1, file: true)
+    
+    plasmidRecords = plasmidsFasta
+        | splitFasta( record: [id: true, desc: true ] )
+        | map( {
+            def plasmidName = it.desc - ', complete sequence'
+	        return [ it.id, '', plasmidName, 'complete' ]
+        } )
+
+    sketch( plasmidSequences, plasmidRecords)
+
+    plasmidRecords
+        | map( { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" } )
+        | collectFile( name: 'db.tsv', storeDir: './plasmids-refseq/', newLine: true )
+}