Skip to content

Commit

Permalink
refactor DB NF scripts to DSL2
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Jan 17, 2025
1 parent 48478b7 commit 521cda4
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 92 deletions.
57 changes: 30 additions & 27 deletions db-scripts/build-db-gtdb.nf
Original file line number Diff line number Diff line change
@@ -1,25 +1,11 @@
nextflow.enable.dsl=1
nextflow.enable.dsl=2

import java.nio.file.*


metadata = params.metadata
representatives = params.representatives
domain = params.domain


Channel.fromPath( metadata )
.splitCsv( skip: 1, sep: '\t' )
.filter( { it[18].toLowerCase() == 't' } )
.map( {
def acc = it[0] - 'RS_' - 'GB_'
def orgName = it[65].split(';').last() - 's__'
def path = acc.substring(0,3) + '/' + acc.substring(4,7) + '/' + acc.substring(7,10) + '/' + acc.substring(10,13)
def status = it[48].toLowerCase()
return [ acc, '-', status, orgName, path ]
} )
.dump()
.set { validGenomes }
// params.metadata
// params.representatives
// params.domain


process sketch {
Expand All @@ -32,16 +18,16 @@ process sketch {
container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'

input:
tuple val(acc), val(taxId), val(status), val(orgName), val(path) from validGenomes
tuple val(acc), val(taxId), val(status), val(orgName), val(path)

output:
tuple val(acc), val(taxId), val(status), val(orgName) into chDbEntries
file("${acc}.msh") into outMash
file("${acc}.fna.gz") into outFasta

publishDir pattern: '*.fna.gz', path: "./${domain}/", mode: 'move'
publishDir pattern: '*.msh', path: './sketches/', mode: 'move'
tuple val(acc), val(taxId), val(status), val(orgName), emit: results
path("${acc}.msh")
path("${acc}.fna.gz")

publishDir pattern: "${acc}.msh", path: './sketches/', mode: 'move'
publishDir pattern: "${acc}.fna.gz", path: "./${params.domain}/", mode: 'move'

script:
"""
gzip -dc ${params.representatives}/${path}/${acc}_genomic.fna.gz > ${acc}
Expand All @@ -52,5 +38,22 @@ process sketch {
}


chDbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}\n" }
.collectFile( name: 'db.tsv', sort: false, tempDir: "${workDir}/", storeDir: "./${domain}/" )
workflow {

    // Read the metadata table as tab-separated rows (first line skipped) and keep
    // only rows whose column 18 equals 't', ignoring case.
    selectedRows = Channel.fromPath( params.metadata )
        .splitCsv( skip: 1, sep: '\t' )
        .filter { row -> row[18].toLowerCase() == 't' }

    // Build the input tuple expected by `sketch`: [ acc, taxId, status, orgName, path ].
    sketchInputs = selectedRows.map { row ->
        // accession without its 'RS_' / 'GB_' prefix
        def accession = row[0] - 'RS_' - 'GB_'
        // last ';'-separated element of column 65, minus the 's__' marker
        def organism = row[65].split(';').last() - 's__'
        // relative directory assembled from fixed character ranges of the accession
        def dirParts = [ accession.substring(0,3), accession.substring(4,7), accession.substring(7,10), accession.substring(10,13) ]
        def genomeStatus = row[48].toLowerCase()
        // no taxonomy id is taken from the table; '-' is used as a placeholder
        [ accession, '-', genomeStatus, organism, dirParts.join('/') ]
    }

    sketch( sketchInputs )

    // One tab-separated line per sketched genome, gathered into a single db.tsv.
    sketch.out.results
        .map { entry -> "${entry[0]}\t${entry[1]}\t${entry[2]}\t${entry[3]}\n" }
        .collectFile( name: 'db.tsv', sort: false, tempDir: "${workDir}/", storeDir: "./${params.domain}/" )

}
66 changes: 36 additions & 30 deletions db-scripts/build-db-refseq.nf
Original file line number Diff line number Diff line change
@@ -1,25 +1,11 @@
nextflow.enable.dsl=2

import java.nio.file.*


assemblySummary = params.ass_sum
ncbiPath = params.ncbiPath
domain = params.domain


Channel.fromPath( assemblySummary )
.splitCsv( skip: 2, sep: '\t' )
.filter( { (it[11].toLowerCase() == 'complete genome') || (it[4].toLowerCase() == 'representative genome') || (it[4].toLowerCase() == 'reference genome') } )
.map( {
def species = it[7]
def strain = it[8] - 'strain='
def status = it[11].split(' ')[0].toLowerCase()
if( species.contains( strain ) )
return [ it[0], it[5], species, status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
else
return [ it[0], it[5], "${species} ${strain}", status, it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/' ]
} )
.set { chValidGenomes }
// params.ass_sum
// params.ncbiPath
// params.domain


process download {
Expand All @@ -32,14 +18,14 @@ process download {
maxRetries 3

input:
tuple val(acc), val(taxId), val(orgName), val(status), val(path) from chValidGenomes
tuple val(acc), val(taxId), val(orgName), val(status), val(path)

output:
tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz") into chDownloadedGenomes
tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz")

script:
"""
wget -O ${acc}.gz ${ncbiPath}/${path}/${path.split('/').last()}_genomic.fna.gz
wget -O ${acc}.gz ${params.ncbiPath}/${path}/${path.split('/').last()}_genomic.fna.gz
"""
}

Expand All @@ -54,16 +40,16 @@ process sketch {
container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'

input:
tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz") from chDownloadedGenomes
tuple val(acc), val(taxId), val(orgName), val(status), path("${acc}.gz")

output:
tuple val(acc), val(taxId), val(status), val(orgName) into chDbEntries
file("${acc}.msh") into outMash
file("${acc}.fna.gz") into outFasta

publishDir pattern: '*.fna.gz', path: "./${domain}-refseq/", mode: 'move'
publishDir pattern: '*.msh', path: './sketches/', mode: 'move'
tuple val(acc), val(taxId), val(status), val(orgName), emit: results
file("${acc}.msh")
file("${acc}.fna.gz")

publishDir pattern: "${acc}.msh", path: './sketches/', mode: 'move'
publishDir pattern: "${acc}.fna.gz", path: "./${params.domain}-refseq/", mode: 'move'

script:
"""
gunzip -c ${acc}.gz > ${acc}
Expand All @@ -74,5 +60,25 @@ process sketch {
}


chDbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
.collectFile( name: 'db.tsv', storeDir: "./${domain}-refseq/", newLine: true )
workflow {

    // Location of the assembly summary table. The parameter list at the top of this
    // script names it `params.ass_sum`; `params.assemblySummary` is kept as a fallback
    // so invocations that already use that name keep working.
    def assemblySummary = params.ass_sum ?: params.assemblySummary

    // Parse the table as tab-separated rows (first two lines skipped), keep rows that
    // are a 'complete genome' (column 11) or a 'representative genome' / 'reference genome'
    // (column 4), convert each row into the tuple expected by `download`,
    // i.e. [ acc, taxId, orgName, status, path ], then download and sketch each genome.
    processedGenomes = Channel.fromPath( assemblySummary )
        | splitCsv( skip: 2, sep: '\t' )
        | filter( { (it[11].toLowerCase() == 'complete genome') || (it[4].toLowerCase() == 'representative genome') || (it[4].toLowerCase() == 'reference genome') } )
        | map( {
            def species = it[7]
            def strain = it[8] - 'strain='
            // first word of column 11, lower-cased
            def status = it[11].split(' ')[0].toLowerCase()
            // column 19 with the FTP URL prefix removed, leaving a relative path
            def path = it[19] - 'https://ftp.ncbi.nlm.nih.gov/genomes/'
            // append the strain only when the species text does not already contain it
            if( species.contains( strain ) )
                return [ it[0], it[5], species, status, path ]
            else
                return [ it[0], it[5], "${species} ${strain}", status, path ]
        } )
        | download
        | sketch

    // One tab-separated line per sketched genome, gathered into a single db.tsv.
    processedGenomes.results
        | map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
        | collectFile( name: 'db.tsv', storeDir: "./${params.domain}-refseq/", newLine: true )

}
51 changes: 26 additions & 25 deletions db-scripts/build-plasmids-db-plsdb.nf
Original file line number Diff line number Diff line change
@@ -1,23 +1,9 @@
nextflow.enable.dsl=2

import java.nio.file.*


plasmids = params.plasmids


Channel.fromPath(plasmids)
.into { fastaFiles; fastaEntries }

fastaFiles
.splitFasta( by: 1, file: true)
.set { fastaFiles }

fastaEntries
.splitFasta( record: [id: true, desc: true ] )
.map( {
return [ it.id, '', it.desc, 'complete' ]
} )
.into { validGenomes; dbEntries }
// params.plasmids


process sketch {
Expand All @@ -30,16 +16,16 @@ process sketch {
container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'

input:
file(sequence) from fastaFiles
set val(acc), val(taxId), val(orgName), val(status) from validGenomes
file(sequence)
tuple val(acc), val(taxId), val(orgName), val(status)

output:
file("${acc}.msh") into outMash
file("${acc}.fna.gz") into outFasta

publishDir pattern: '*.fna.gz', path: "./plasmids-plsdb/", mode: 'move'
publishDir pattern: '*.msh', path: './sketches/', mode: 'move'
file("${acc}.msh")
file("${acc}.fna.gz")

publishDir pattern: "${acc}.msh", path: './sketches/', mode: 'move'
publishDir pattern: "${acc}.fna.gz", path: "./plasmids-plsdb/", mode: 'move'

script:
"""
mv ${sequence} ${acc}
Expand All @@ -50,5 +36,20 @@ process sketch {
}


dbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
.collectFile( name: 'db.tsv', storeDir: './plasmids-plsdb/', newLine: true )
workflow {

    plasmidsFasta = Channel.fromPath(params.plasmids)

    // One file per FASTA record; these are the sequences handed to `sketch`.
    plasmidSequences = plasmidsFasta.splitFasta( by: 1, file: true)

    // Per-record tuple [ id, '', description, 'complete' ] built from the FASTA headers.
    plasmidRecords = plasmidsFasta
        .splitFasta( record: [id: true, desc: true ] )
        .map { rec -> [ rec.id, '', rec.desc, 'complete' ] }

    sketch( plasmidSequences, plasmidRecords)

    // One tab-separated line per record, gathered into a single db.tsv.
    plasmidRecords
        .map { entry -> "${entry[0]}\t${entry[1]}\t${entry[2]}\t${entry[3]}" }
        .collectFile( name: 'db.tsv', storeDir: './plasmids-plsdb/', newLine: true )
}
38 changes: 28 additions & 10 deletions db-scripts/build-plasmids-db-refseq.nf
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
nextflow.enable.dsl=2

import java.nio.file.*


plasmids = params.plasmids
// params.plasmids


Channel.fromPath(plasmids)
Expand Down Expand Up @@ -31,16 +32,16 @@ process sketch {
container 'quay.io/biocontainers/mash:2.3--hd3113c8_6'

input:
file(sequence) from fastaFiles
set val(acc), val(taxId), val(orgName), val(status) from validGenomes
file(sequence)
tuple val(acc), val(taxId), val(orgName), val(status)

output:
file("${acc}.msh") into outMash
file("${acc}.fna.gz") into outFasta
file("${acc}.msh")
file("${acc}.fna.gz")

publishDir pattern: '*.fna.gz', path: "./plasmids-refseq/", mode: 'move'
publishDir pattern: '*.msh', path: './sketches/', mode: 'move'

publishDir pattern: '*.fna.gz', path: "./plasmids-refseq/", mode: 'move'

script:
"""
mv ${sequence} ${acc}
Expand All @@ -50,6 +51,23 @@ process sketch {
"""
}


dbEntries.map { "${it[0]}\t${it[1]}\t${it[2]}\t${it[3]}" }
.collectFile( name: 'db.tsv', storeDir: './plasmids-refseq/', newLine: true )
workflow {

    plasmidsFasta = Channel.fromPath(params.plasmids)

    // One file per FASTA record; these are the sequences handed to `sketch`.
    plasmidSequences = plasmidsFasta.splitFasta( by: 1, file: true)

    // Per-record tuple [ id, '', name, 'complete' ], where the name is the FASTA
    // description with the ', complete sequence' text removed.
    plasmidRecords = plasmidsFasta
        .splitFasta( record: [id: true, desc: true ] )
        .map { rec ->
            def plasmidName = rec.desc - ', complete sequence'
            [ rec.id, '', plasmidName, 'complete' ]
        }

    sketch( plasmidSequences, plasmidRecords)

    // One tab-separated line per record, gathered into a single db.tsv.
    plasmidRecords
        .map { entry -> "${entry[0]}\t${entry[1]}\t${entry[2]}\t${entry[3]}" }
        .collectFile( name: 'db.tsv', storeDir: './plasmids-refseq/', newLine: true )
}

0 comments on commit 521cda4

Please sign in to comment.