-
Notifications
You must be signed in to change notification settings - Fork 0
/
nextflow.config
125 lines (112 loc) · 5.1 KB
/
nextflow.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/*
* -------------------------------------------------
* TRON-Bioinformatics/tronflow-bam-preprocessing Nextflow config file
* -------------------------------------------------
* Default config options for all environments.
*/
profiles {
conda {
params.enable_conda = true
conda.enabled = true
}
debug { process.beforeScript = 'echo $HOSTNAME' }
ci {
params.prepare_bam_cpus = 1
params.prepare_bam_memory = "3g"
params.mark_duplicates_cpus = 1
params.mark_duplicates_memory = "3g"
params.realignment_around_indels_cpus = 1
params.realignment_around_indels_memory = "3g"
params.bqsr_cpus = 1
params.bqsr_memory = "3g"
params.metrics_cpus = 1
params.metrics_memory = "3g"
params.index_cpus = 1
params.index_memory = "3g"
timeline.enabled = false
report.enabled = false
trace.enabled = false
dag.enabled = false
}
test {
params.input_files = "test_data/test_input.txt"
params.reference = "$baseDir/test_data/ucsc.hg19.minimal.fasta"
params.known_indels1 = "$baseDir/test_data/1000G_phase1.indels.hg19.sites.minimal.vcf"
params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
params.intervals = "$baseDir/test_data/minimal_intervals.bed"
params.dbsnp = "$baseDir/test_data/dbsnp_138.hg19.minimal.vcf"
}
}
// Export this variable to prevent local Python libraries from conflicting with those in the container
env {
PYTHONNOUSERSITE = 1
}
// Capture exit codes from upstream processes when piping
process.shell = ['/bin/bash', '-euo', 'pipefail']
cleanup = true
VERSION = '2.1.0'
DOI = 'https://zenodo.org/badge/latestdoi/358400957'
manifest {
name = 'TRON-Bioinformatics/tronflow-bam-preprocessing'
author = 'Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola'
homePage = 'https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing'
description = 'Picard and GATK BAM preprocessing pipeline'
mainScript = 'main.nf'
nextflowVersion = '>=19.10.0'
version = VERSION
doi = DOI
}
params.manifest = manifest
params.help_message = """
TronFlow bam preprocessing v${VERSION} ${DOI}
Usage:
main.nf --input_files input_files
Input:
* --input_bam: the path to a single BAM (this option is not compatible with --input_files)
* --input_files: the path to a tab-separated values file containing in each row the sample name, sample type (eg: tumor or normal) and path to the BAM file. The sample name should be unique for each bam file. (this option is not compatible with --input_bam)
Sample type will be added to the BAM header @SN sample name
The input file does not have header!
Example input file:
name1_t tumor tumor.1.bam
name1_n normal normal.1.bam
name2_t tumor tumor.2.bam
* --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
Optional input:
* --input_name: the name of the sample. Only used when --input_bam is provided (default: normal)
* --dbsnp: path to the dbSNP VCF (required to perform BQSR)
* --known_indels1: path to a VCF of known indels (optional to perform realignment around indels)
* --known_indels2: path to a second VCF of known indels (optional to perform realignment around indels)
* --intervals: path to a BED file to collect coverage and HS metrics from (default: None)
* --collect_hs_minimum_base_quality: minimum base quality for a base to contribute coverage (default: 20).
* --collect_hs_minimum_mapping_quality: minimum mapping quality for a read to contribute coverage (default: 20).
* --skip_bqsr: optionally skip BQSR (default: false)
* --skip_realignment: optionally skip realignment (default: false)
* --skip_deduplication: optionally skip deduplication (default: false)
* --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
* --skip_metrics: optionally skip metrics (default: false)
* --output: the folder where to publish output (default: ./output)
* --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)
* --split_cigarn: split reads that contain Ns in their cigar string (e.g. spanning splicing events in RNAseq data) using GATKs SplitNCigarReads
* --split_cigarn_args: additional arguments for SplitNCigarReads
Computational resources:
* --prepare_bam_cpus: (default: 3)
* --prepare_bam_memory: (default: 8g)
* --mark_duplicates_cpus: (default: 16)
* --mark_duplicates_memory: (default: 64g)
* --realignment_around_indels_cpus: (default: 2)
* --realignment_around_indels_memory: (default: 31g)
* --bqsr_cpus: (default: 3)
* --bqsr_memory: (default: 4g)
* --metrics_cpus: (default: 1)
* --metrics_memory: (default: 8g)
Output:
* Preprocessed and indexed BAMs
* Tab-separated values file with the absolute paths to the preprocessed BAMs, preprocessed_bams.txt
Optional output:
* Recalibration report
* Deduplication metrics
* Realignment intervals
* GATK multiple metrics
* HS metrics
* Horizontal and vertical coverage metrics
"""