nextflow.config

/*
 * -------------------------------------------------
 *  TRON-Bioinformatics/tronflow-bam-preprocessing Nextflow config file
 * -------------------------------------------------
 * Default config options for all environments.
 */

profiles {
  conda { 
    params.enable_conda = true 
    conda.enabled = true
  }
  debug { process.beforeScript = 'echo $HOSTNAME' }
  ci {
    params.prepare_bam_cpus = 1
    params.prepare_bam_memory = "3g"
    params.mark_duplicates_cpus = 1
    params.mark_duplicates_memory = "3g"
    params.realignment_around_indels_cpus = 1
    params.realignment_around_indels_memory = "3g"
    params.bqsr_cpus = 1
    params.bqsr_memory = "3g"
    params.metrics_cpus = 1
    params.metrics_memory = "3g"
    params.index_cpus = 1
    params.index_memory = "3g"
    timeline.enabled = false
    report.enabled = false
    trace.enabled = false
    dag.enabled = false
  }
  test {
    params.input_files = "test_data/test_input.txt"
    params.reference = "$baseDir/test_data/ucsc.hg19.minimal.fasta"
    params.known_indels1 = "$baseDir/test_data/1000G_phase1.indels.hg19.sites.minimal.vcf"
    params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
    params.intervals = "$baseDir/test_data/minimal_intervals.bed"
    params.dbsnp = "$baseDir/test_data/dbsnp_138.hg19.minimal.vcf"
  }
}

// Export this variable to prevent local Python libraries from conflicting with those in the container
env {
  PYTHONNOUSERSITE = 1
}

// Capture exit codes from upstream processes when piping
process.shell = ['/bin/bash', '-euo', 'pipefail']

cleanup = true

VERSION = '2.1.0'
DOI = 'https://zenodo.org/badge/latestdoi/358400957'

manifest {
  name = 'TRON-Bioinformatics/tronflow-bam-preprocessing'
  author = 'Pablo Riesgo-Ferreiro, Özlem Muslu, Luisa Bresadola'
  homePage = 'https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing'
  description = 'Picard and GATK BAM preprocessing pipeline'
  mainScript = 'main.nf'
  nextflowVersion = '>=19.10.0'
  version = VERSION
  doi = DOI
}
params.manifest = manifest

params.help_message = """
TronFlow bam preprocessing v${VERSION} ${DOI}

Usage:
    main.nf --input_files input_files

Input:
    * --input_bam: the path to a single BAM (this option is not compatible with --input_files)
    * --input_files: the path to a tab-separated values file containing in each row the sample name, sample type (eg: tumor or normal) and path to the BAM file. The sample name should be unique for each bam file. (this option is not compatible with --input_bam)
    Sample type will be added to the BAM header @SN sample name
    The input file does not have header!
    Example input file:
    name1_t   tumor   tumor.1.bam
    name1_n   normal  normal.1.bam
    name2_t   tumor   tumor.2.bam
    * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)

Optional input:
    * --input_name: the name of the sample. Only used when --input_bam is provided (default: normal)
    * --dbsnp: path to the dbSNP VCF (required to perform BQSR)
    * --known_indels1: path to a VCF of known indels (optional to perform realignment around indels)
    * --known_indels2: path to a second VCF of known indels (optional to perform realignment around indels)
    * --intervals: path to a BED file to collect coverage and HS metrics from (default: None)
    * --collect_hs_minimum_base_quality: minimum base quality for a base to contribute coverage (default: 20).
    * --collect_hs_minimum_mapping_quality: minimum mapping quality for a read to contribute coverage (default: 20).
    * --skip_bqsr: optionally skip BQSR (default: false)
    * --skip_realignment: optionally skip realignment (default: false)
    * --skip_deduplication: optionally skip deduplication (default: false)
    * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true)
    * --skip_metrics: optionally skip metrics (default: false)
    * --output: the folder where to publish output (default: ./output)
    * --platform: the platform to be added to the BAM header. Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA)
    * --split_cigarn: split reads that contain Ns in their cigar string (e.g. spanning splicing events in RNAseq data) using GATKs SplitNCigarReads
    * --split_cigarn_args: additional arguments for SplitNCigarReads

Computational resources:
    * --prepare_bam_cpus: (default: 3)
    * --prepare_bam_memory: (default: 8g)
    * --mark_duplicates_cpus: (default: 16)
    * --mark_duplicates_memory: (default: 64g)
    * --realignment_around_indels_cpus: (default: 2)
    * --realignment_around_indels_memory: (default: 31g)
    * --bqsr_cpus: (default: 3)
    * --bqsr_memory: (default: 4g)
    * --metrics_cpus: (default: 1)
    * --metrics_memory: (default: 8g)

 Output:
    * Preprocessed and indexed BAMs
    * Tab-separated values file with the absolute paths to the preprocessed BAMs, preprocessed_bams.txt

Optional output:
    * Recalibration report
    * Deduplication metrics
    * Realignment intervals
    * GATK multiple metrics
    * HS metrics
    * Horizontal and vertical coverage metrics
  """