forked from IARCbioinfo/abra-nf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
abra.nf
executable file
·278 lines (234 loc) · 12.9 KB
/
abra.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#! /usr/bin/env nextflow
// Copyright (C) 2017 IARC/WHO
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
params.help = null
params.tumor_bam_folder = null
params.normal_bam_folder = null
params.bam_folder = null
params.bed = null
params.single = null
params.ref = null
params.abra_path = '/opt/conda/envs/abra-nf/share/abra2*/abra2.jar'
params.junctions = null
params.gtf = null
params.rna = null
params.ignore_bad_assembly = null
log.info ""
log.info "--------------------------------------------------------"
log.info " abra2-nf v3.0: Nextflow pipeline for ABRA2 "
log.info "--------------------------------------------------------"
log.info "Copyright (C) IARC/WHO"
log.info "This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE"
log.info "This is free software, and you are welcome to redistribute it"
log.info "under certain conditions; see LICENSE for details."
log.info "--------------------------------------------------------"
log.info ""
if (params.help) {
log.info ''
log.info '--------------------------------------------------'
log.info ' USAGE '
log.info '--------------------------------------------------'
log.info ''
log.info 'Usage: '
log.info 'nextflow run iarcbioinf/abra-nf --tumor_bam_folder tumor_BAM/ --normal_bam_folder normal_BAM/ --ref ref.fasta'
log.info ''
log.info 'Mandatory arguments:'
log.info ' When using Tumor/Normal pairs:'
log.info ' --tumor_bam_folder FOLDER Folder containing tumor BAM files.'
log.info ' --normal_bam_folder FOLDER Folder containing matched normal BAM files.'
log.info ' In other cases:'
log.info ' --bam_folder FOLDER Folder containing BAM files.'
log.info ' In all cases:'
log.info ' --ref FILE (with index) Reference fasta file indexed.'
log.info ' --abra_path FILE abra.jar explicit path.'
log.info 'Optional arguments:'
log.info ' When using Tumor/Normal pairs:'
log.info ' --suffix_tumor STRING Suffix identifying tumor bam (default: "_T").'
log.info ' --suffix_normal STRING Suffix identifying normal bam (default: "_N").'
log.info ' In all cases:'
log.info ' --single Flag for single-end sequencing.'
log.info ' --bed FILE Bed file containing target intervals.'
log.info ' --junctions Flag to use STAR identified junctions.'
log.info ' --gtf FILE GTF file containing junction annotations.'
log.info ' --rna Flag to add RNA-specific recommended ABRA2 parameters.'
log.info ' --mem INTEGER RAM used (in GB, default: 16)'
log.info ' --threads INTEGER Number of threads (default: 4)'
log.info ' --output_folder FOLDER Output folder (default: abra_BAM).'
log.info ''
exit 0
}else {
/* Software information */
log.info "bam_folder = ${params.bam_folder}"
log.info "ref = ${params.ref}"
log.info "cpu = ${params.cpu}"
log.info "mem = ${params.mem}"
log.info "output_folder= ${params.output_folder}"
log.info "bed = ${params.bed}"
log.info "abra_path = ${params.abra_path}"
log.info "gtf = ${params.gtf}"
log.info "junctions = ${params.junctions}"
log.info "help=${params.help}"
}
assert (params.ref != true) && (params.ref != null) : "please specify --ref option (--ref reference.fasta(.gz))"
if (params.bam_folder) {
assert (params.bam_folder != true) && (params.bam_folder != null) : "please specify --bam_folder option (--bam_folder bamfolder)"
} else {
assert (params.normal_bam_folder != true) && (params.normal_bam_folder != null) : "please specify --normal_bam_folder option (--normal_bam_folder bamfolder)"
assert (params.tumor_bam_folder != true) && (params.tumor_bam_folder != null) : "please specify --tumor_bam_folder option (--tumor_bam_folder bamfolder)"
}
if (params.bed!=null) {
assert (params.bed != true) : "please specify file when using --bed option (--bed regions.bed)"
try { assert file(params.bed).exists() : "\n WARNING : input bed file not located in execution directory" } catch (AssertionError e) { println e.getMessage() }
}
bed = params.bed ? file(params.bed) : file('nothing')
if (params.gtf!=null) {
assert (params.gtf != true) : "please specify file when using --gtf option (--gtf annotations.gtf)"
try { assert file(params.gtf).exists() : "\n WARNING : input gtf file not located in execution directory" } catch (AssertionError e) { println e.getMessage() }
}
gtf = params.gtf ? file(params.gtf) : file('nothing')
assert (params.abra_path != true) && (params.abra_path != null) : "please specify --abra_path option (--abra_path /path/to/abra.jar)"
fasta_ref = file(params.ref)
fasta_ref_fai = file( params.ref+'.fai' )
fasta_ref_sa = file( params.ref+'.sa' )
fasta_ref_bwt = file( params.ref+'.bwt' )
fasta_ref_ann = file( params.ref+'.ann' )
fasta_ref_amb = file( params.ref+'.amb' )
fasta_ref_pac = file( params.ref+'.pac' )
params.suffix_tumor = "_T"
params.suffix_normal = "_N"
params.mem = 16
params.cpu = 4
params.output_folder = "abra_BAM"
try { assert fasta_ref.exists() : "\n WARNING : fasta reference not located in execution directory. Make sure reference index is in the same folder as fasta reference" } catch (AssertionError e) { println e.getMessage() }
if (fasta_ref.exists()) {assert fasta_ref_fai.exists() : "input fasta reference does not seem to have a .fai index (use samtools faidx)"}
if (fasta_ref.exists() && params.ref.tokenize('.')[-1] == 'gz') {assert fasta_ref_gzi.exists() : "input gz fasta reference does not seem to have a .gzi index (use samtools faidx)"}
if(params.bam_folder) {
try { assert file(params.bam_folder).exists() : "\n WARNING : input BAM folder not located in execution directory" } catch (AssertionError e) { println e.getMessage() }
assert file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0 : "BAM folder contains no BAM"
// recovering of bam files
bams = Channel.fromPath( params.bam_folder+'/*.bam' )
.ifEmpty { error "Cannot find any bam file in: ${params.bam_folder}" }
.map { path -> [ path.name.replace(".bam",""), path ] }
//.println()
// recovering of bai files
bais = Channel.fromPath( params.bam_folder+'/*.bam.bai' )
.ifEmpty { error "Cannot find any bai file in: ${params.bam_folder}" }
.map { path -> [ path.name.replace(".bam.bai",""), path ] }
//.println()
if(params.junctions){
// recovering junctions files
junctions = Channel.fromPath( params.bam_folder+'/*.SJ.out.tab' )
.ifEmpty { error "Cannot find any junction tab files in: ${params.bam_folder}" }
.map { path -> [ path.name.replace(".SJ.out.tab","").replace("STAR.",""), path ] }
// building bam-bai pairs
bam_bai = bams
.join(bais)
.join(junctions)
}else{
// building bam-bai pairs
bam_bai = bams
.join(bais)
//.map { bam, bai -> [ bam[1], bai[1] ] }
}
process abra {
cpus params.cpu
memory params.mem+'GB'
tag { bam_tag }
publishDir params.output_folder, mode: 'move'
input:
set bam_tag, file(bam), file(bai), file(junctions) from bam_bai
file bed
file fasta_ref
file fasta_ref_fai
file fasta_ref_sa
file fasta_ref_bwt
file fasta_ref_ann
file fasta_ref_amb
file fasta_ref_pac
output:
file("${bam_tag}_abra.ba*") into bam_out
shell:
java_mem = params.mem - 2
abra_single = params.single ? '--single --mapq 20' : ''
abra_bed = params.bed ? "--targets $bed" : ''
abra_junctions = params.junctions ? "--junctions $junctions" : ''
abra_gtf = params.gtf ? "--gtf $gtf" : ''
abra_rna = params.rna ? "--sua --dist 500000" : ''
abra_iba = params.ignore_bad_assembly ? "--ignore-bad-assembly ": ""
'''
java -Xmx!{java_mem}g -jar !{params.abra_path} --in !{bam_tag}.bam --out "!{bam_tag}_abra.bam" --ref !{fasta_ref} --tmpdir . --threads !{params.cpu} --index !{abra_single} !{abra_bed} !{abra_junctions} !{abra_gtf} !{abra_rna} > !{bam_tag}_abra.log 2>&1
'''
}
} else {
try { assert file(params.tumor_bam_folder).exists() : "\n WARNING : input tumor BAM folder not located in execution directory" } catch (AssertionError e) { println e.getMessage() }
assert file(params.tumor_bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0 : "tumor BAM folder contains no BAM"
try { assert file(params.normal_bam_folder).exists() : "\n WARNING : input normal BAM folder not located in execution directory" } catch (AssertionError e) { println e.getMessage() }
assert file(params.normal_bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0 : "normal BAM folder contains no BAM"
// FOR TUMOR
// recovering of bam files
tumor_bams = Channel.fromPath( params.tumor_bam_folder+'/*'+params.suffix_tumor+'.bam' )
.ifEmpty { error "Cannot find any bam file in: ${params.tumor_bam_folder}" }
.map { path -> [ path.name.replace("${params.suffix_tumor}.bam",""), path ] }
// recovering of bai files
tumor_bais = Channel.fromPath( params.tumor_bam_folder+'/*'+params.suffix_tumor+'.bam.bai' )
.ifEmpty { error "Cannot find any bai file in: ${params.tumor_bam_folder}" }
.map { path -> [ path.name.replace("${params.suffix_tumor}.bam.bai",""), path ] }
// building bam-bai pairs
tumor_bam_bai = tumor_bams
.phase(tumor_bais)
.map { tumor_bam, tumor_bai -> [ tumor_bam[0], tumor_bam[1], tumor_bai[1] ] }
// FOR NORMAL
// recovering of bam files
normal_bams = Channel.fromPath( params.normal_bam_folder+'/*'+params.suffix_normal+'.bam' )
.ifEmpty { error "Cannot find any bam file in: ${params.normal_bam_folder}" }
.map { path -> [ path.name.replace("${params.suffix_normal}.bam",""), path ] }
// recovering of bai files
normal_bais = Channel.fromPath( params.normal_bam_folder+'/*'+params.suffix_normal+'.bam.bai' )
.ifEmpty { error "Cannot find any bai file in: ${params.normal_bam_folder}" }
.map { path -> [ path.name.replace("${params.suffix_normal}.bam.bai",""), path ] }
// building bam-bai pairs
normal_bam_bai = normal_bams
.phase(normal_bais)
.map { normal_bam, normal_bai -> [ normal_bam[0], normal_bam[1], normal_bai[1] ] }
// building 4-uplets corresponding to {tumor_bam, tumor_bai, normal_bam, normal_bai}
tn_bambai = tumor_bam_bai
.phase(normal_bam_bai)
.map {tumor_bb, normal_bb -> [ tumor_bb[1], tumor_bb[2], normal_bb[1], normal_bb[2] ] }
// here each element X of tn_bambai channel is a 4-uplet. X[0] is the tumor bam, X[1] the tumor bai, X[2] the normal bam and X[3] the normal bai.
process abra_TN {
cpus params.cpu
memory params.mem+'GB'
tag { tumor_normal_tag }
publishDir params.output_folder, mode: 'move'
input:
file tn from tn_bambai
file bed
file fasta_ref
file fasta_ref_fai
file fasta_ref_gzi
file fasta_ref_sa
file fasta_ref_bwt
file fasta_ref_ann
file fasta_ref_amb
file fasta_ref_pac
output:
file("${tumor_normal_tag}${params.suffix_normal}_abra.ba*") into normal_output
file("${tumor_normal_tag}${params.suffix_tumor}_abra.ba*") into tumor_output
shell:
tumor_normal_tag = tn[0].baseName.replace(params.suffix_tumor,"")
abra_single = params.single ? '--single --mapq 20' : ''
abra_bed = params.bed ? "--targets $bed" : ''
'''
java -Xmx!{params.mem}g -jar !{params.abra_path} --in !{tumor_normal_tag}!{params.suffix_normal}.bam,!{tumor_normal_tag}!{params.suffix_tumor}.bam --out "!{tumor_normal_tag}!{params.suffix_normal}_abra.bam","!{tumor_normal_tag}!{params.suffix_tumor}_abra.bam" --ref !{fasta_ref} --threads !{params.cpu} --index !{abra_single} !{abra_bed} > !{tumor_normal_tag}_abra.log 2>&1
'''
}
}