main.nf

#!/usr/bin/env nextflow
/*
========================================================================================
                         nf-core/chipseq
========================================================================================
 nf-core/chipseq Analysis Pipeline.
 #### Homepage / Documentation
 https://github.com/nf-core/chipseq
----------------------------------------------------------------------------------------
*/

/**
 * Log the pipeline usage text.
 *
 * First logs whatever nfcoreHeader() returns (defined elsewhere in the
 * pipeline), then the list of supported command-line parameters, both via
 * log.info. The text is purely informational; nothing is validated here.
 */
def helpMessage() {
    log.info nfcoreHeader()
    log.info"""
    Usage:

    The typical command for running the pipeline is as follows:

      nextflow run nf-core/chipseq --input design.csv --genome GRCh37 -profile docker

    Mandatory arguments:
      --input [file]                  Comma-separated file containing information about the samples in the experiment (see docs/usage.md)
      --fasta [file]                  Path to Fasta reference. Not mandatory when using reference in iGenomes config via --genome
      --gtf [file]                    Path to GTF file. Not mandatory when using reference in iGenomes config via --genome
      -profile [str]                  Configuration profile to use. Can use multiple (comma separated)
                                      Available: conda, docker, singularity, awsbatch, test

    Generic
      --single_end [bool]             Specifies that the input is single-end reads
      --seq_center [str]              Sequencing center information to be added to read group of BAM files
      --fragment_size [int]           Estimated fragment size used to extend single-end reads (Default: 200)
      --fingerprint_bins [int]        Number of genomic bins to use when calculating fingerprint plot (Default: 500000)

    References                        If not specified in the configuration file or you wish to overwrite any of the references
      --genome [str]                  Name of iGenomes reference
      --bwa_index [file]              Full path to directory containing BWA index including base name i.e. /path/to/index/genome.fa
      --gene_bed [file]               Path to BED file containing gene intervals
      --tss_bed [file]                Path to BED file containing transcription start sites
      --macs_gsize [str]              Effective genome size parameter required by MACS2. If using iGenomes config, values have only been provided when --genome is set as GRCh37, GRCm38, hg19, mm10, BDGP6 and WBcel235
      --blacklist [file]              Path to blacklist regions (.BED format), used for filtering alignments
      --save_reference [bool]         If generated by the pipeline save the BWA index in the results directory

    Trimming
      --clip_r1 [int]                 Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end reads) (Default: 0)
      --clip_r2 [int]                 Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads only) (Default: 0)
      --three_prime_clip_r1 [int]     Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed (Default: 0)
      --three_prime_clip_r2 [int]     Instructs Trim Galore to remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed (Default: 0)
      --trim_nextseq [int]            Instructs Trim Galore to apply the --nextseq=X option, to trim based on quality after removing poly-G tails (Default: 0)
      --skip_trimming [bool]          Skip the adapter trimming step
      --save_trimmed [bool]           Save the trimmed FastQ files in the results directory

    Alignments
      --keep_dups [bool]              Duplicate reads are not filtered from alignments
      --keep_multi_map [bool]         Reads mapping to multiple locations are not filtered from alignments
      --save_align_intermeds [bool]   Save the intermediate BAM files from the alignment step - not done by default

    Peaks
      --narrow_peak [bool]            Run MACS2 in narrowPeak mode
      --broad_cutoff [float]          Specifies broad cutoff value for MACS2. Only used when --narrow_peak isn't specified (Default: 0.1)
      --min_reps_consensus [int]      Number of biological replicates required from a given condition for a peak to contribute to a consensus peak (Default: 1)
      --save_macs_pileup [bool]       Instruct MACS2 to create bedGraph files normalised to signal per million reads
      --skip_diff_analysis [bool]     Skip differential binding analysis

    QC
      --skip_fastqc [bool]            Skip FastQC
      --skip_picard_metrics [bool]    Skip Picard CollectMultipleMetrics
      --skip_preseq [bool]            Skip Preseq
      --skip_plot_profile [bool]      Skip deepTools plotProfile
      --skip_plot_fingerprint [bool]  Skip deepTools plotFingerprint
      --skip_spp [bool]               Skip Phantompeakqualtools
      --skip_igv [bool]               Skip IGV
      --skip_multiqc [bool]           Skip MultiQC

    Other
      --outdir [file]                 The output directory where the results will be saved
      --email [email]                 Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
      --email_on_fail [email]         Same as --email, except only send mail if the workflow is not successful
      --max_multiqc_email_size [str]  Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
      -name [str]                     Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic

    AWSBatch
      --awsqueue [str]                The AWSBatch JobQueue that needs to be set when running on AWSBatch
      --awsregion [str]               The AWS Region for your AWS Batch job to run on
    """.stripIndent()
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                SET UP CONFIGURATION VARIABLES                       -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * SET UP CONFIGURATION VARIABLES
 */

// --help: print the usage text and stop with a success exit code
if (params.help) {
    helpMessage()
    exit 0
}

// A genome key was requested and an iGenomes map is loaded:
// abort if the key is not one of the configured genomes
if (params.genomes && params.genome) {
    if (!params.genomes.containsKey(params.genome)) {
        exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
    }
}

////////////////////////////////////////////////////
/* --         DEFAULT PARAMETER VALUES         -- */
////////////////////////////////////////////////////

// Configurable variables
// Look up the iGenomes entry once. The lookup is null-safe: when --genome is
// given but no `params.genomes` map is loaded, indexing the missing map would
// raise a NullPointerException; here every reference simply defaults to false
// so the explicit "not specified" checks further down report the problem.
def igenomes_entry = (params.genome && params.genomes) ? params.genomes[ params.genome ] : null

// Each reference falls back to false when the genome entry or attribute is missing
params.fasta      = igenomes_entry?.fasta      ?: false
params.bwa_index  = igenomes_entry?.bwa        ?: false
params.gtf        = igenomes_entry?.gtf        ?: false
params.gene_bed   = igenomes_entry?.bed12      ?: false
params.macs_gsize = igenomes_entry?.macs_gsize ?: false
params.blacklist  = igenomes_entry?.blacklist  ?: false

// Global variables
// Peak flavour label: "narrowPeak" when --narrow_peak is set, "broadPeak" otherwise
def PEAK_TYPE = "broadPeak"
if (params.narrow_peak) {
    PEAK_TYPE = "narrowPeak"
}

// Has the run name been specified by the user?
//  A run name matching <word>_<word> is treated as auto-generated, in which
//  case params.name is used; any other run name is taken as user-supplied.
//  This has the bonus effect of catching both -name and --name
def name_from_params = params.name
def runname_is_mnemonic = (workflow.runName ==~ /[a-z]+_[a-z]+/)
custom_runName = runname_is_mnemonic ? name_from_params : workflow.runName

////////////////////////////////////////////////////
/* --          CONFIG FILES                    -- */
////////////////////////////////////////////////////

// Pipeline output documentation (must exist)
ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true)

// JSON file required by BAMTools for alignment filtering:
// the single-end or paired-end variant is picked according to --single_end
ch_bamtools_filter_config = file(
    params.single_end ? params.bamtools_filter_se_config : params.bamtools_filter_pe_config,
    checkIfExists: true
)

// MultiQC configuration
ch_multiqc_config = file(params.multiqc_config, checkIfExists: true)

// Header files for the custom MultiQC sections (all must exist)
ch_peak_count_header        = file("$baseDir/assets/multiqc/peak_count_header.txt", checkIfExists: true)
ch_frip_score_header        = file("$baseDir/assets/multiqc/frip_score_header.txt", checkIfExists: true)
ch_peak_annotation_header   = file("$baseDir/assets/multiqc/peak_annotation_header.txt", checkIfExists: true)
ch_deseq2_pca_header        = file("$baseDir/assets/multiqc/deseq2_pca_header.txt", checkIfExists: true)
ch_deseq2_clustering_header = file("$baseDir/assets/multiqc/deseq2_clustering_header.txt", checkIfExists: true)
ch_spp_correlation_header   = file("$baseDir/assets/multiqc/spp_correlation_header.txt", checkIfExists: true)
ch_spp_nsc_header           = file("$baseDir/assets/multiqc/spp_nsc_header.txt", checkIfExists: true)
ch_spp_rsc_header           = file("$baseDir/assets/multiqc/spp_rsc_header.txt", checkIfExists: true)

////////////////////////////////////////////////////
/* --          VALIDATE INPUTS                 -- */
////////////////////////////////////////////////////

// Validate inputs
// Samples design file is mandatory
if (params.input) {
    ch_input = file(params.input, checkIfExists: true)
} else {
    exit 1, "Samples design file not specified!"
}

// GTF annotation is mandatory
if (params.gtf) {
    ch_gtf = file(params.gtf, checkIfExists: true)
} else {
    exit 1, "GTF annotation file not specified!"
}

// Gene / TSS BED files are optional; they are generated later when not given
if (params.gene_bed) {
    ch_gene_bed = file(params.gene_bed, checkIfExists: true)
}
if (params.tss_bed) {
    ch_tss_bed = file(params.tss_bed, checkIfExists: true)
}

// Blacklist is optional: an empty channel stands in when none is supplied
ch_blacklist = params.blacklist ? Channel.fromPath(params.blacklist, checkIfExists: true) : Channel.empty()

// Fasta is mandatory; its file name is the default BWA index base name
if (!params.fasta) {
    exit 1, "Fasta file not specified!"
} else {
    lastPath = params.fasta.lastIndexOf(File.separator)
    bwa_base = params.fasta.substring(lastPath+1)
    ch_fasta = file(params.fasta, checkIfExists: true)
}

// A pre-built BWA index is given as <directory>/<base name>:
// split it into both parts; the base name overrides the fasta-derived one
if (params.bwa_index) {
    lastPath = params.bwa_index.lastIndexOf(File.separator)
    bwa_dir =  params.bwa_index.substring(0,lastPath+1)
    bwa_base = params.bwa_index.substring(lastPath+1)
    ch_bwa_index = Channel.fromPath(bwa_dir, checkIfExists: true)
}

////////////////////////////////////////////////////
/* --                   AWS                    -- */
////////////////////////////////////////////////////

// `workflow.profile` holds the raw -profile value, which may be a
// comma-separated list (e.g. "awsbatch,test"), so test for membership
// instead of comparing the whole string.
if (workflow.profile.tokenize(',').contains('awsbatch')) {
    // AWSBatch sanity checking
    if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!"
    // Check outdir paths to be S3 buckets if running on AWSBatch
    // related: https://github.com/nextflow-io/nextflow/issues/813
    if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!"
    // Prevent trace files to be stored on S3 since S3 does not support rolling files.
    // The `workflow` metadata object has no `tracedir` property; the trace
    // directory is a pipeline parameter.
    // NOTE(review): assumes `params.tracedir` is declared in nextflow.config - confirm.
    if (params.tracedir && params.tracedir.toString().startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles."
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                       HEADER LOG INFO                               -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// Header log info
// Builds an insertion-ordered map of run settings and logs it as
// "key: value" lines. Entry order is the order of the assignments below.
log.info nfcoreHeader()
def summary = [:]
summary['Run Name']               = custom_runName ?: workflow.runName
summary['Data Type']              = params.single_end ? 'Single-End' : 'Paired-End'
summary['Design File']            = params.input
summary['Genome']                 = params.genome ?: 'Not supplied'
summary['Fasta File']             = params.fasta
summary['GTF File']               = params.gtf
if (params.gene_bed)              summary['Gene BED File'] = params.gene_bed
if (params.tss_bed)               summary['TSS BED File'] = params.tss_bed
if (params.bwa_index)             summary['BWA Index'] = params.bwa_index
if (params.blacklist)             summary['Blacklist BED'] = params.blacklist
summary['MACS2 Genome Size']      = params.macs_gsize ?: 'Not supplied'
summary['Min Consensus Reps']     = params.min_reps_consensus
if (params.macs_gsize)            summary['MACS2 Narrow Peaks'] = params.narrow_peak ? 'Yes' : 'No'
if (!params.narrow_peak)          summary['MACS2 Broad Cutoff'] = params.broad_cutoff
if (params.skip_trimming) {
    summary['Trimming Step']      = 'Skipped'
} else {
    summary['Trim R1']            = "$params.clip_r1 bp"
    summary['Trim R2']            = "$params.clip_r2 bp"
    summary["Trim 3' R1"]         = "$params.three_prime_clip_r1 bp"
    summary["Trim 3' R2"]         = "$params.three_prime_clip_r2 bp"
    summary["NextSeq Trim"]       = "$params.trim_nextseq bp"
}
if (params.seq_center)            summary['Sequencing Center'] = params.seq_center
if (params.single_end)            summary['Fragment Size'] = "$params.fragment_size bp"
summary['Fingerprint Bins']       = params.fingerprint_bins
if (params.keep_dups)             summary['Keep Duplicates'] = 'Yes'
if (params.keep_multi_map)        summary['Keep Multi-mapped'] = 'Yes'
summary['Save Genome Index']      = params.save_reference ? 'Yes' : 'No'
if (params.save_trimmed)          summary['Save Trimmed'] = 'Yes'
if (params.save_align_intermeds)  summary['Save Intermeds'] =  'Yes'
if (params.save_macs_pileup)      summary['Save MACS2 Pileup'] = 'Yes'
if (params.skip_diff_analysis)    summary['Skip Diff Analysis'] = 'Yes'
if (params.skip_fastqc)           summary['Skip FastQC'] = 'Yes'
if (params.skip_picard_metrics)   summary['Skip Picard Metrics'] = 'Yes'
if (params.skip_preseq)           summary['Skip Preseq'] = 'Yes'
if (params.skip_plot_profile)     summary['Skip plotProfile'] = 'Yes'
if (params.skip_plot_fingerprint) summary['Skip plotFingerprint'] = 'Yes'
if (params.skip_spp)              summary['Skip spp'] = 'Yes'
if (params.skip_igv)              summary['Skip IGV'] = 'Yes'
if (params.skip_multiqc)          summary['Skip MultiQC'] = 'Yes'
summary['Max Resources']          = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output Dir']             = params.outdir
summary['Launch Dir']             = workflow.launchDir
summary['Working Dir']            = workflow.workDir
summary['Script Dir']             = workflow.projectDir
summary['User']                   = workflow.userName
// -profile may be a comma-separated list (e.g. "awsbatch,test"): test for
// membership so the AWS settings are also reported for multi-profile runs
if (workflow.profile.tokenize(',').contains('awsbatch')) {
    summary['AWS Region']         = params.awsregion
    summary['AWS Queue']          = params.awsqueue
}
summary['Config Profile']         = workflow.profile
if (params.config_profile_description) summary['Config Description'] = params.config_profile_description
if (params.config_profile_contact)     summary['Config Contact']     = params.config_profile_contact
if (params.config_profile_url)         summary['Config URL']         = params.config_profile_url
if (params.email || params.email_on_fail) {
    summary['E-mail Address']     = params.email
    summary['E-mail on failure']  = params.email_on_fail
    summary['MultiQC Max Size']   = params.max_multiqc_email_size
}
// One line per entry, keys right-padded to 20 characters
log.info summary.collect { k,v -> "${k.padRight(20)}: $v" }.join("\n")
log.info "-\033[2m--------------------------------------------------\033[0m-"

// Check the hostnames against configured profiles
checkHostname()

// Without a MACS genome size, emit a prominent multi-line warning.
// The wording depends on whether a --genome key was given.
if (!params.macs_gsize) {
    def warnstring
    if (params.genome) {
        warnstring = "supported for '${params.genome}'"
    } else {
        warnstring = 'supplied'
    }
    def warning_lines = [
        "=================================================================",
        "  WARNING! MACS genome size parameter not $warnstring.",
        "  Peak calling, annotation and differential analysis will be skipped.",
        "  Please specify value for '--macs_gsize' to run these steps.",
        "======================================================================="
    ]
    log.warn warning_lines.join("\n")
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                     PARSE DESIGN FILE                               -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * PREPROCESSING - REFORMAT DESIGN FILE, CHECK VALIDITY & CREATE IP vs CONTROL MAPPINGS
 */
process CheckDesign {
    // Runs check_design.py on the design file; it is expected to write the
    // two CSVs declared as outputs, which are also copied to pipeline_info
    tag "${design}"
    publishDir "${params.outdir}/pipeline_info", mode: 'copy'

    input:
    file(design) from ch_input

    output:
    file("design_reads.csv") into ch_design_reads_csv
    file("design_controls.csv") into ch_design_controls_csv

    script:  // check_design.py is bundled with the pipeline, in nf-core/chipseq/bin/
    """
    check_design.py ${design} design_reads.csv design_controls.csv
    """
}

/*
 * Create channels for input fastq files
 *
 * Each CSV row becomes [ sample_id, [ fastq files ] ]: one file for
 * single-end data, two for paired-end. Every file must exist.
 */
ch_design_reads_csv
    .splitCsv(header:true, sep:',')
    .map { row ->
        def fastqs = [ file(row.fastq_1, checkIfExists: true) ]
        if (!params.single_end) {
            fastqs << file(row.fastq_2, checkIfExists: true)
        }
        [ row.sample_id, fastqs ]
    }
    .into { ch_raw_reads_fastqc;
            ch_raw_reads_trimgalore }

/*
 * Create a channel with [sample_id, control id, antibody, replicatesExist, multipleGroups]
 * (the last two CSV columns are converted from text to booleans)
 */
ch_design_controls_csv
    .splitCsv(header:true, sep:',')
    .map { row ->
        def replicates_exist = row.replicatesExist.toBoolean()
        def multiple_groups = row.multipleGroups.toBoolean()
        [ row.sample_id, row.control_id, row.antibody, replicates_exist, multiple_groups ]
    }
    .set { ch_design_controls_csv }

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                     PREPARE ANNOTATION FILES                        -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * PREPROCESSING - Build BWA index
 */
if (!params.bwa_index) {
    // Only defined when no pre-built index was supplied
    process BWAIndex {
        tag "${fasta}"
        label 'process_high'
        // Files are published only with --save_reference: otherwise saveAs
        // returns null for every file
        publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
            saveAs: { filename -> params.save_reference ? filename : null }, mode: 'copy'

        input:
        file(fasta) from ch_fasta

        output:
        file("BWAIndex") into ch_bwa_index

        script:
        // Index the fasta, then collect every file sharing its name prefix
        // into the BWAIndex directory
        """
        bwa index -a bwtsw ${fasta}
        mkdir BWAIndex && mv ${fasta}* BWAIndex
        """
    }
}

/*
 * PREPROCESSING - Generate gene BED file
 */
if (!params.gene_bed) {
    // Only defined when no gene BED file was supplied: derive one from the GTF
    process MakeGeneBED {
        tag "${gtf}"
        label 'process_low'
        publishDir "${params.outdir}/reference_genome", mode: 'copy'

        input:
        file(gtf) from ch_gtf

        output:
        file("*.bed") into ch_gene_bed

        script: // gtf2bed is bundled with the pipeline, in nf-core/chipseq/bin/
        """
        gtf2bed ${gtf} > ${gtf.baseName}.bed
        """
    }
}

/*
 * PREPROCESSING - Generate TSS BED file
 */
if (!params.tss_bed) {
    // Only defined when no TSS BED file was supplied
    process MakeTSSBED {
        tag "${bed}"
        publishDir "${params.outdir}/reference_genome", mode: 'copy'

        input:
        file(bed) from ch_gene_bed

        output:
        file("*.bed") into ch_tss_bed

        script:
        // Collapse each interval to one base: for "+" strand rows the end
        // becomes start+1, otherwise the start becomes end-1. Only the first
        // six columns are kept.
        """
        cat ${bed} | awk -v FS='\t' -v OFS='\t' '{ if(\$6=="+") \$3=\$2+1; else \$2=\$3-1; print \$1, \$2, \$3, \$4, \$5, \$6;}' > ${bed.baseName}.tss.bed
        """
    }
}

/*
 * PREPROCESSING - Prepare genome intervals for filtering
 */
process MakeGenomeFilter {
    tag "${fasta}"
    publishDir "${params.outdir}/reference_genome", mode: 'copy'

    input:
    file(fasta) from ch_fasta
    file(blacklist) from ch_blacklist.ifEmpty([])

    output:
    file("$fasta") into ch_genome_fasta                 // FASTA FILE FOR IGV
    file("*.fai") into ch_genome_fai                    // FAI INDEX FOR REFERENCE GENOME
    file("*.bed") into ch_genome_filter_regions         // BED FILE WITHOUT BLACKLIST REGIONS
    file("*.sizes") into ch_genome_sizes_bigwig         // CHROMOSOME SIZES FILE FOR BEDTOOLS

    script:
    // With a blacklist: keep the complement of the blacklisted intervals.
    // Without one: emit a single 0..length interval per sequence.
    if (params.blacklist) {
        blacklist_filter = "sortBed -i $blacklist -g ${fasta}.sizes | complementBed -i stdin -g ${fasta}.sizes"
    } else {
        blacklist_filter = "awk '{print \$1, '0' , \$2}' OFS='\t' ${fasta}.sizes"
    }
    """
    samtools faidx ${fasta}
    cut -f 1,2 ${fasta}.fai > ${fasta}.sizes
    ${blacklist_filter} > ${fasta}.include_regions.bed
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                        FASTQ QC                                     -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 1 - FastQC
 */
process FastQC {
    tag "${name}"
    label 'process_medium'
    // Zip archives go to a "zips" sub-folder; everything else stays at the top level
    publishDir "${params.outdir}/fastqc", mode: 'copy',
        saveAs: { filename ->
                      if (filename.endsWith(".zip")) {
                          return "zips/$filename"
                      }
                      return "$filename"
                }

    when:
    !params.skip_fastqc

    input:
    set val(name), file(reads) from ch_raw_reads_fastqc

    output:
    file("*.{zip,html}") into ch_fastqc_reports_mqc

    script:
    // The original fastqs are soft-linked to <name>[_1|_2].fastq.gz first,
    // for consistent naming in MultiQC
    if (!params.single_end) {
        """
        [ ! -f  ${name}_1.fastq.gz ] && ln -s ${reads[0]} ${name}_1.fastq.gz
        [ ! -f  ${name}_2.fastq.gz ] && ln -s ${reads[1]} ${name}_2.fastq.gz
        fastqc -q -t ${task.cpus} ${name}_1.fastq.gz
        fastqc -q -t ${task.cpus} ${name}_2.fastq.gz
        """
    } else {
        """
        [ ! -f  ${name}.fastq.gz ] && ln -s ${reads} ${name}.fastq.gz
        fastqc -q -t ${task.cpus} ${name}.fastq.gz
        """
    }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                        ADAPTER TRIMMING                             -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 2 - Trim Galore!
 */
if (!params.skip_trimming) {
    process TrimGalore {
        tag "${name}"
        label 'process_long'
        // FastQC reports and trimming logs are always published; any other
        // file (the trimmed reads) only with --save_trimmed
        publishDir "${params.outdir}/trim_galore", mode: 'copy',
            saveAs: { filename ->
                          if (filename.endsWith(".html")) return "fastqc/$filename"
                          if (filename.endsWith(".zip")) return "fastqc/zips/$filename"
                          if (filename.endsWith("trimming_report.txt")) return "logs/$filename"
                          return params.save_trimmed ? filename : null
                    }

        input:
        set val(name), file(reads) from ch_raw_reads_trimgalore

        output:
        set val(name), file("*.fq.gz") into ch_trimmed_reads
        file("*.txt") into ch_trimgalore_results_mqc
        file("*.{zip,html}") into ch_trimgalore_fastqc_reports_mqc

        script:
        // Each optional flag stays empty unless its parameter is greater than zero
        c_r1 = ''
        if (params.clip_r1 > 0) { c_r1 = "--clip_r1 ${params.clip_r1}" }
        c_r2 = ''
        if (params.clip_r2 > 0) { c_r2 = "--clip_r2 ${params.clip_r2}" }
        tpc_r1 = ''
        if (params.three_prime_clip_r1 > 0) { tpc_r1 = "--three_prime_clip_r1 ${params.three_prime_clip_r1}" }
        tpc_r2 = ''
        if (params.three_prime_clip_r2 > 0) { tpc_r2 = "--three_prime_clip_r2 ${params.three_prime_clip_r2}" }
        nextseq = ''
        if (params.trim_nextseq > 0) { nextseq = "--nextseq ${params.trim_nextseq}" }

        // The original fastqs are soft-linked first, for consistent naming in MultiQC
        if (!params.single_end) {
            """
            [ ! -f  ${name}_1.fastq.gz ] && ln -s ${reads[0]} ${name}_1.fastq.gz
            [ ! -f  ${name}_2.fastq.gz ] && ln -s ${reads[1]} ${name}_2.fastq.gz
            trim_galore --paired --fastqc --gzip ${c_r1} ${c_r2} ${tpc_r1} ${tpc_r2} ${nextseq} ${name}_1.fastq.gz ${name}_2.fastq.gz
            """
        } else {
            """
            [ ! -f  ${name}.fastq.gz ] && ln -s ${reads} ${name}.fastq.gz
            trim_galore --fastqc --gzip ${c_r1} ${tpc_r1} ${nextseq} ${name}.fastq.gz
            """
        }
    }
} else {
    // Trimming skipped: raw reads are passed on unchanged and the
    // Trim Galore report outputs are empty lists
    ch_trimmed_reads = ch_raw_reads_trimgalore
    ch_trimgalore_results_mqc = []
    ch_trimgalore_fastqc_reports_mqc = []
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                        ALIGN                                        -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 3.1 - Align reads (single- or paired-end) with BWA-MEM
 */
process BWAMem {
    tag "${name}"
    label 'process_high'

    input:
    set val(name), file(reads) from ch_trimmed_reads
    file(index) from ch_bwa_index.collect()

    output:
    set val(name), file("*.bam") into ch_bwa_bam

    script:
    prefix = "${name}.Lb"
    // Read group: ID and LB are the full name; SM is the name with its last
    // "_"-separated token removed. A CN field is appended only when
    // --seq_center is given.
    def center_field = params.seq_center ? "\\tCN:${params.seq_center}" : ''
    rg = "\'@RG\\tID:${name}\\tSM:${name.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${name}\\tPU:1${center_field}\'"
    // Align, then write BAM excluding alignments flagged 0x0100
    """
    bwa mem \\
        -t ${task.cpus} \\
        -M \\
        -R ${rg} \\
        ${index}/${bwa_base} \\
        ${reads} \\
        | samtools view -@ ${task.cpus} -b -h -F 0x0100 -O BAM -o ${prefix}.bam -
    """
}

/*
 * STEP 3.2 - Convert .bam to coordinate sorted .bam
 */
process SortBAM {
    tag "${name}"
    label 'process_medium'
    // Library-level files are published only with --save_align_intermeds;
    // the three samtools report types go to a samtools_stats sub-folder
    if (params.save_align_intermeds) {
        publishDir path: "${params.outdir}/bwa/library", mode: 'copy',
            saveAs: { filename ->
                          def is_samtools_report = [".flagstat", ".idxstats", ".stats"].any { suffix -> filename.endsWith(suffix) }
                          is_samtools_report ? "samtools_stats/$filename" : filename
                    }
    }

    input:
    set val(name), file(bam) from ch_bwa_bam

    output:
    set val(name), file("*.sorted.{bam,bam.bai}") into ch_sort_bam_merge
    file("*.{flagstat,idxstats,stats}") into ch_sort_bam_flagstat_mqc

    script:
    prefix = "${name}.Lb"
    // Sort, index, then write the three samtools reports for the sorted BAM
    """
    samtools sort -@ ${task.cpus} -o ${prefix}.sorted.bam -T ${name} ${bam}
    samtools index ${prefix}.sorted.bam
    samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
    samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
    samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                    MERGE LIBRARY BAM                                -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 4.1 Merge BAM files for all libraries from same replicate
 */
// Strip the last '_'-separated field from each library name to obtain the merge key,
// then gather every sorted BAM / index that shares this key into one flat list.
ch_sort_bam_merge
    .map { name, files -> [ name.split('_')[0..-2].join('_'), files ] }
    .groupTuple(by: [0])
    .map { name, files -> [ name, files.flatten() ] }
    .set { ch_sort_bam_merge }

// Merges the sorted BAMs grouped under one name with Picard MergeSamFiles (skipped
// when the group holds a single BAM), marks duplicates with Picard MarkDuplicates
// and writes samtools index / idxstats / flagstat / stats for the marked BAM.
process MergeBAM {
    tag "$name"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary", mode: 'copy',
        saveAs: { filename ->
                      def is_stats_report = ['.flagstat', '.idxstats', '.stats'].any { ext -> filename.endsWith(ext) }
                      if (is_stats_report) return "samtools_stats/$filename"
                      if (filename.endsWith(".metrics.txt")) return "picard_metrics/$filename"
                      // everything else (BAM / index) is only published on request
                      return params.save_align_intermeds ? filename : null
                }

    input:
    set val(name), file(bams) from ch_sort_bam_merge

    output:
    set val(name), file("*${prefix}.sorted.{bam,bam.bai}") into ch_merge_bam_filter,
                                                                ch_merge_bam_preseq
    file "*.{flagstat,idxstats,stats}" into ch_merge_bam_stats_mqc
    file "*.txt" into ch_merge_bam_metrics_mqc

    script:
    prefix = "${name}.mLb.mkD"
    // the staged list also contains .bai files; only the BAMs are passed to Picard
    bam_files = bams.findAll { it.toString().endsWith('.bam') }.sort()
    // heap size handed to the JVM: the task's memory in GB, or 3 when none is configured
    def avail_mem = task.memory ? task.memory.toGiga() : 3
    if (!task.memory) {
        log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this."
    }
    if (bam_files.size() <= 1) {
      // nothing to merge: mark duplicates directly on the only BAM
      """
      picard -Xmx${avail_mem}g MarkDuplicates \\
          INPUT=${bam_files[0]} \\
          OUTPUT=${prefix}.sorted.bam \\
          ASSUME_SORTED=true \\
          REMOVE_DUPLICATES=false \\
          METRICS_FILE=${prefix}.MarkDuplicates.metrics.txt \\
          VALIDATION_STRINGENCY=LENIENT \\
          TMP_DIR=tmp

      samtools index ${prefix}.sorted.bam
      samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
      samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
      samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
      """
    } else {
        // several libraries: merge first, then mark duplicates on the merged BAM
        """
        picard -Xmx${avail_mem}g MergeSamFiles \\
            ${'INPUT='+bam_files.join(' INPUT=')} \\
            OUTPUT=${name}.sorted.bam \\
            SORT_ORDER=coordinate \\
            VALIDATION_STRINGENCY=LENIENT \\
            TMP_DIR=tmp
        samtools index ${name}.sorted.bam

        picard -Xmx${avail_mem}g MarkDuplicates \\
            INPUT=${name}.sorted.bam \\
            OUTPUT=${prefix}.sorted.bam \\
            ASSUME_SORTED=true \\
            REMOVE_DUPLICATES=false \\
            METRICS_FILE=${prefix}.MarkDuplicates.metrics.txt \\
            VALIDATION_STRINGENCY=LENIENT \\
            TMP_DIR=tmp

        samtools index ${prefix}.sorted.bam
        samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
        samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
        samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
        """
    }
}

/*
 * STEP 4.2 Filter BAM file at merged library-level
 *
 * Pipes `samtools view` (flag / mapping-quality / region filters chosen from the
 * pipeline parameters) into `bamtools filter`, then indexes the result and writes
 * flagstat / idxstats / stats. For paired-end data an additional name-sorted BAM
 * is produced for the next step.
 */
process MergeBAMFilter {
    tag "$name"
    label 'process_medium'
    publishDir path: "${params.outdir}/bwa/mergedLibrary", mode: 'copy',
        saveAs: { filename ->
                      // paired-end output is an intermediate: publish it only when asked to
                      if (!params.single_end && !params.save_align_intermeds) return null
                      if (['.flagstat', '.idxstats', '.stats'].any { ext -> filename.endsWith(ext) }) return "samtools_stats/$filename"
                      if (filename.endsWith(".sorted.bam") || filename.endsWith(".sorted.bam.bai")) return filename
                      return null
                }

    input:
    set val(name), file(bam) from ch_merge_bam_filter
    file bed from ch_genome_filter_regions.collect()
    file bamtools_filter_config from ch_bamtools_filter_config

    output:
    set val(name), file("*.{bam,bam.bai}") into ch_filter_bam
    set val(name), file("*.flagstat") into ch_filter_bam_flagstat
    file "*.{idxstats,stats}" into ch_filter_bam_stats_mqc

    script:
    // single-end output is already the final ".clN" BAM; paired-end gets the interim ".flT" label
    prefix = params.single_end ? "${name}.mLb.clN" : "${name}.mLb.flT"

    // optional pieces of the samtools view command line; an empty string disables the filter
    filter_params = params.single_end ? "-F 0x004" : "-F 0x004 -F 0x0008 -f 0x001"
    dup_params = params.keep_dups ? "" : "-F 0x0400"
    multimap_params = params.keep_multi_map ? "" : "-q 1"
    blacklist_params = params.blacklist ? "-L $bed" : ""

    // paired-end only: extra name-sorted copy written to ${prefix}.bam
    name_sort_bam = params.single_end ? "" : "samtools sort -n -@ $task.cpus -o ${prefix}.bam -T $prefix ${prefix}.sorted.bam"
    """
    samtools view \\
        $filter_params \\
        $dup_params \\
        $multimap_params \\
        $blacklist_params \\
        -b ${bam[0]} \\
        | bamtools filter \\
            -out ${prefix}.sorted.bam \\
            -script $bamtools_filter_config

    samtools index ${prefix}.sorted.bam
    samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
    samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
    samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats

    $name_sort_bam
    """
}

/*
 * STEP 4.3 Remove orphan reads from paired-end BAM file
 *
 * Paired-end runs go through MergeBAMRemoveOrphan; single-end runs skip it and the
 * outputs of MergeBAMFilter are forked straight into the same downstream channels.
 */
if (!params.single_end) {
    process MergeBAMRemoveOrphan {
        tag "$name"
        label 'process_medium'
        publishDir path: "${params.outdir}/bwa/mergedLibrary", mode: 'copy',
            saveAs: { filename ->
                          def is_stats_report = ['.flagstat', '.idxstats', '.stats'].any { ext -> filename.endsWith(ext) }
                          if (is_stats_report) return "samtools_stats/$filename"
                          // only the coordinate-sorted BAM and its index are published
                          def is_sorted_bam = filename.endsWith(".sorted.bam") || filename.endsWith(".sorted.bam.bai")
                          return is_sorted_bam ? filename : null
                    }

        input:
        set val(name), file(bam) from ch_filter_bam

        output:
        set val(name), file("*.sorted.{bam,bam.bai}") into ch_rm_orphan_bam_metrics,
                                                           ch_rm_orphan_bam_bigwig,
                                                           ch_rm_orphan_bam_macs_1,
                                                           ch_rm_orphan_bam_macs_2,
                                                           ch_rm_orphan_bam_phantompeakqualtools
        // BAM written by bampe_rm_orphan.py, before coordinate sorting
        set val(name), file("${prefix}.bam") into ch_rm_orphan_name_bam_counts
        set val(name), file("*.flagstat") into ch_rm_orphan_flagstat_bigwig,
                                               ch_rm_orphan_flagstat_macs,
                                               ch_rm_orphan_flagstat_mqc
        file "*.{idxstats,stats}" into ch_rm_orphan_stats_mqc

        script: // This script is bundled with the pipeline, in nf-core/chipseq/bin/
        prefix = "${name}.mLb.clN"
        """
        bampe_rm_orphan.py ${bam[0]} ${prefix}.bam --only_fr_pairs

        samtools sort -@ $task.cpus -o ${prefix}.sorted.bam -T $prefix ${prefix}.bam
        samtools index ${prefix}.sorted.bam
        samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
        samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
        samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
        """
    }
} else {
    // single-end: reuse the filtered BAMs as they are
    ch_filter_bam
        .into { ch_rm_orphan_bam_metrics;
                ch_rm_orphan_bam_bigwig;
                ch_rm_orphan_bam_macs_1;
                ch_rm_orphan_bam_macs_2;
                ch_rm_orphan_bam_phantompeakqualtools;
                ch_rm_orphan_name_bam_counts }

    ch_filter_bam_flagstat
        .into { ch_rm_orphan_flagstat_bigwig;
                ch_rm_orphan_flagstat_macs;
                ch_rm_orphan_flagstat_mqc }

    ch_filter_bam_stats_mqc
        .set { ch_rm_orphan_stats_mqc }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                 MERGE LIBRARY BAM POST-ANALYSIS                     -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 5.1 preseq analysis after merging libraries and before filtering
 *
 * Runs `preseq lc_extrap` on each tuple from ch_merge_bam_preseq and emits the
 * resulting *.ccurve.txt on ch_preseq_mqc. Skipped when params.skip_preseq is set.
 */
process Preseq {
    tag "$name"
    label 'process_low'
    publishDir "${params.outdir}/bwa/mergedLibrary/preseq", mode: 'copy'

    when:
    !params.skip_preseq

    input:
    // the tuple carries the BAM and its index as a list
    set val(name), file(bam) from ch_merge_bam_preseq

    output:
    file "*.ccurve.txt" into ch_preseq_mqc

    script:
    // NOTE(review): the output is labelled ".mLb.clN" although the input comes from
    // ch_merge_bam_preseq (MergeBAM output, named ".mLb.mkD") - confirm this is intended
    prefix = "${name}.mLb.clN"
    // bam[0] is presumably the .bam rather than the .bai - relies on the order of the staged files
    """
    preseq lc_extrap -v -output ${prefix}.ccurve.txt -bam ${bam[0]}
    """
}

/*
 * STEP 5.2 Picard CollectMultipleMetrics after merging libraries and filtering
 *
 * Runs Picard CollectMultipleMetrics against the reference FASTA for every BAM on
 * ch_rm_orphan_bam_metrics. Skipped when params.skip_picard_metrics is set.
 */
process CollectMultipleMetrics {
    tag "$name"
    label 'process_medium'
    publishDir path: "${params.outdir}/bwa/mergedLibrary", mode: 'copy',
        saveAs: { filename ->
                      // metrics tables and PDF plots go to separate folders, nothing else is published
                      if (filename.endsWith("_metrics")) return "picard_metrics/$filename"
                      return filename.endsWith(".pdf") ? "picard_metrics/pdf/$filename" : null
                }

    when:
    !params.skip_picard_metrics

    input:
    set val(name), file(bam) from ch_rm_orphan_bam_metrics
    file fasta from ch_fasta

    output:
    file "*_metrics" into ch_collectmetrics_mqc
    file "*.pdf" into ch_collectmetrics_pdf

    script:
    prefix = "${name}.mLb.clN"
    // heap size handed to the JVM: the task's memory in GB, or 3 when none is configured
    def avail_mem = task.memory ? task.memory.toGiga() : 3
    if (!task.memory) {
        log.info "[Picard CollectMultipleMetrics] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this."
    }
    """
    picard -Xmx${avail_mem}g CollectMultipleMetrics \\
        INPUT=${bam[0]} \\
        OUTPUT=${prefix}.CollectMultipleMetrics \\
        REFERENCE_SEQUENCE=$fasta \\
        VALIDATION_STRINGENCY=LENIENT \\
        TMP_DIR=tmp
    """
}

/*
 * STEP 5.3 Read depth normalised bigWig
 *
 * Derives a per-million scale factor from the flagstat report of each BAM,
 * builds a scaled bedGraph with genomeCoverageBed, converts it to bigWig and
 * writes a one-line IGV track description for the bigWig.
 */
process BigWig {
    tag "$name"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary/bigwig", mode: 'copy',
        saveAs: { filename ->
                      if (filename.endsWith("scale_factor.txt")) "scale/$filename"
                      else if (filename.endsWith(".bigWig")) "$filename"
                      else null
                }

    input:
    // BAM and flagstat are paired up by sample name
    set val(name), file(bam), file(flagstat) from ch_rm_orphan_bam_bigwig.join(ch_rm_orphan_flagstat_bigwig, by: [0])
    file sizes from ch_genome_sizes_bigwig.collect()

    output:
    set val(name), file("*.bigWig") into ch_bigwig_plotprofile
    file "*scale_factor.txt" into ch_bigwig_scale
    file "*igv.txt" into ch_bigwig_igv

    script:
    prefix = "${name}.mLb.clN"
    // paired-end: coverage of whole fragments; single-end: optionally extend reads to the fragment size
    pe_fragment = params.single_end ? "" : "-pc"
    extend = (params.single_end && params.fragment_size > 0) ? "-fs ${params.fragment_size}" : ''
    // The grep pattern is anchored on the digit preceding " mapped (" so that only the
    // "N + M mapped (" line is used. Newer samtools flagstat output also contains a
    // "primary mapped (" line; matching both would yield two scale factors and break
    // the -scale argument below.
    """
    SCALE_FACTOR=\$(grep '[0-9] mapped (' $flagstat | awk '{print 1000000/\$1}')
    echo \$SCALE_FACTOR > ${prefix}.scale_factor.txt
    genomeCoverageBed -ibam ${bam[0]} -bg -scale \$SCALE_FACTOR $pe_fragment $extend | sort -k1,1 -k2,2n >  ${prefix}.bedGraph

    bedGraphToBigWig ${prefix}.bedGraph $sizes ${prefix}.bigWig

    find * -type f -name "*.bigWig" -exec echo -e "bwa/mergedLibrary/bigwig/"{}"\\t0,0,178" \\; > ${prefix}.bigWig.igv.txt
    """
}

/*
 * STEP 5.4 generate gene body coverage plot with deepTools
 *
 * Runs `computeMatrix scale-regions` on each bigWig over the regions in the gene
 * BED file, then `plotProfile` on the resulting matrix. Skipped when
 * params.skip_plot_profile is set.
 */
process PlotProfile {
    tag "$name"
    label 'process_high'
    publishDir "${params.outdir}/bwa/mergedLibrary/deepTools/plotProfile", mode: 'copy'

    when:
    !params.skip_plot_profile

    input:
    set val(name), file(bigwig) from ch_bigwig_plotprofile
    file bed from ch_gene_bed

    output:
    // matrices (.gz) and the profile plot (.pdf)
    file '*.{gz,pdf}' into ch_plotprofile_results
    // tabular profile data, later consumed by MultiQC
    file '*.plotProfile.tab' into ch_plotprofile_mqc

    script:
    """
    computeMatrix scale-regions \\
        --regionsFileName $bed \\
        --scoreFileName $bigwig \\
        --outFileName ${name}.computeMatrix.mat.gz \\
        --outFileNameMatrix ${name}.computeMatrix.vals.mat.gz \\
        --regionBodyLength 1000 \\
        --beforeRegionStartLength 3000 \\
        --afterRegionStartLength 3000 \\
        --skipZeros \\
        --smartLabels \\
        --numberOfProcessors $task.cpus

    plotProfile --matrixFile ${name}.computeMatrix.mat.gz \\
        --outFileName ${name}.plotProfile.pdf \\
        --outFileNameData ${name}.plotProfile.tab
    """
}

/*
 * STEP 5.5 Phantompeakqualtools
 *
 * Runs run_spp.R on each BAM, then turns its results into three MultiQC custom
 * content files: the cross-correlation table and the values found in columns 9
 * and 10 of the .spp.out file, each prefixed with the matching header file.
 * Skipped when params.skip_spp is set.
 */
process PhantomPeakQualTools {
    tag "$name"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary/phantompeakqualtools", mode: 'copy'

    when:
    !params.skip_spp

    input:
    set val(name), file(bam) from ch_rm_orphan_bam_phantompeakqualtools
    file spp_correlation_header from ch_spp_correlation_header
    file spp_nsc_header from ch_spp_nsc_header
    file spp_rsc_header from ch_spp_rsc_header

    output:
    file '*.pdf' into ch_spp_plot
    file '*.spp.out' into ch_spp_out,
                          ch_spp_out_mqc
    file '*_mqc.tsv' into ch_spp_csv_mqc

    script:
    // The separator passed to write.table is written as sep=',' (single quotes): the
    // whole R expression sits inside a shell double-quoted string, so an unescaped
    // sep="," would have its quotes and comma consumed by the shell and R would
    // never receive the comma separator.
    """
    RUN_SPP=`which run_spp.R`
    Rscript -e "library(caTools); source(\\"\$RUN_SPP\\")" -c="${bam[0]}" -savp="${name}.spp.pdf" -savd="${name}.spp.Rdata" -out="${name}.spp.out" -p=$task.cpus
    cp $spp_correlation_header ${name}_spp_correlation_mqc.tsv
    Rscript -e "load('${name}.spp.Rdata'); write.table(crosscorr\\\$cross.correlation, file=\\"${name}_spp_correlation_mqc.tsv\\", sep=',', quote=FALSE, row.names=FALSE, col.names=FALSE,append=TRUE)"

    awk -v OFS='\t' '{print "${name}", \$9}' ${name}.spp.out | cat $spp_nsc_header - > ${name}_spp_nsc_mqc.tsv
    awk -v OFS='\t' '{print "${name}", \$10}' ${name}.spp.out | cat $spp_rsc_header - > ${name}_spp_rsc_mqc.tsv
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                 MERGE LIBRARY PEAK ANALYSIS                         -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// Create channel linking IP bams with control bams

// Every (name, bam) tuple is combined with every other one, giving all possible
// pairs as [ name_a, bam_a, name_b, bam_b ].
ch_rm_orphan_bam_macs_1
    .combine(ch_rm_orphan_bam_macs_2)
    .set { ch_rm_orphan_bam_macs_1 }

// Each design row is combined with every BAM pair and only the rows whose first two
// fields equal the two BAM names (it[5], it[7]) are kept. The flagstat is then joined
// on the first field and the first two fields are dropped.
// Resulting tuple, as declared by the consuming processes:
//   [ antibody, replicatesExist, multipleGroups, ip, ipbam, control, controlbam, ipflagstat ]
// NOTE(review): presumably ch_design_controls_csv emits
//   [ ip, control, antibody, replicatesExist, multipleGroups ] - confirm where it is created.
ch_design_controls_csv
    .combine(ch_rm_orphan_bam_macs_1)
    .filter { it[0] == it[5] && it[1] == it[7] }
    .join(ch_rm_orphan_flagstat_macs)
    .map { it ->  it[2..-1] }
    .into { ch_group_bam_macs;
            ch_group_bam_plotfingerprint;
            ch_group_bam_deseq }

/*
 * STEP 6.1 deepTools plotFingerprint
 *
 * Runs `plotFingerprint` on every IP / control BAM pair, using the control BAM as
 * the --JSDsample reference. Skipped when params.skip_plot_fingerprint is set.
 */
process PlotFingerprint {
    tag "${ip} vs ${control}"
    label 'process_high'
    publishDir "${params.outdir}/bwa/mergedLibrary/deepTools/plotFingerprint", mode: 'copy'

    when:
    !params.skip_plot_fingerprint

    input:
    // antibody, replicatesExist, multipleGroups and ipflagstat are part of the tuple but not used by this script
    set val(antibody), val(replicatesExist), val(multipleGroups), val(ip), file(ipbam), val(control), file(controlbam), file(ipflagstat) from ch_group_bam_plotfingerprint

    output:
    file '*.{txt,pdf}' into ch_plotfingerprint_results
    // raw counts, later consumed by MultiQC
    file '*.raw.txt' into ch_plotfingerprint_mqc

    script:
    // reads are only extended for single-end data with a positive fragment size
    extend = (params.single_end && params.fragment_size > 0) ? "--extendReads ${params.fragment_size}" : ''
    """
    plotFingerprint \\
        --bamfiles ${ipbam[0]} ${controlbam[0]} \\
        --plotFile ${ip}.plotFingerprint.pdf \\
        $extend \\
        --labels $ip $control \\
        --outRawCounts ${ip}.plotFingerprint.raw.txt \\
        --outQualityMetrics ${ip}.plotFingerprint.qcmetrics.txt \\
        --skipZeros \\
        --JSDsample ${controlbam[0]} \\
        --numberOfProcessors $task.cpus \\
        --numberOfSamples $params.fingerprint_bins
    """
}

/*
 * STEP 6.2 Call peaks with MACS2 and calculate FRiP score
 *
 * For every IP / control pair: calls peaks with `macs2 callpeak`, writes the peak
 * count and the FRiP score (reads in peaks divided by mapped reads from the IP
 * flagstat) as MultiQC tables, and writes an IGV track line for the peak file.
 * Only runs when params.macs_gsize is set.
 */
process MACSCallPeak {
    tag "${ip} vs ${control}"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary/macs/${PEAK_TYPE}", mode: 'copy',
        saveAs: { filename ->
                      if (filename.endsWith(".tsv")) "qc/$filename"
                      else if (filename.endsWith(".igv.txt")) null
                      else filename
                }

    when:
    params.macs_gsize

    input:
    set val(antibody), val(replicatesExist), val(multipleGroups), val(ip), file(ipbam), val(control), file(controlbam), file(ipflagstat) from ch_group_bam_macs
    file peak_count_header from ch_peak_count_header
    file frip_score_header from ch_frip_score_header

    output:
    set val(ip), file("*.{bed,xls,gappedPeak,bdg}") into ch_macs_output
    // the design flags travel with the peak file so that later steps can group by antibody
    set val(antibody), val(replicatesExist), val(multipleGroups), val(ip), val(control), file("*.$PEAK_TYPE") into ch_macs_homer, ch_macs_qc, ch_macs_consensus
    file "*igv.txt" into ch_macs_igv
    file "*_mqc.tsv" into ch_macs_mqc

    script:
    // broad peak calling is the default; params.narrow_peak switches it off
    broad = params.narrow_peak ? '' : "--broad --broad-cutoff ${params.broad_cutoff}"
    format = params.single_end ? "BAM" : "BAMPE"
    pileup = params.save_macs_pileup ? "-B --SPMR" : ""
    // The flagstat grep is anchored on the digit preceding " mapped (" so that only the
    // "N + M mapped (" line is used. Newer samtools flagstat output also contains a
    // "primary mapped (" line, which would otherwise add a second row to the FRiP table.
    """
    macs2 callpeak \\
        -t ${ipbam[0]} \\
        -c ${controlbam[0]} \\
        $broad \\
        -f $format \\
        -g $params.macs_gsize \\
        -n $ip \\
        $pileup \\
        --keep-dup all

    cat ${ip}_peaks.${PEAK_TYPE} | wc -l | awk -v OFS='\t' '{ print "${ip}", \$1 }' | cat $peak_count_header - > ${ip}_peaks.count_mqc.tsv

    READS_IN_PEAKS=\$(intersectBed -a ${ipbam[0]} -b ${ip}_peaks.${PEAK_TYPE} -bed -c -f 0.20 | awk -F '\t' '{sum += \$NF} END {print sum}')
    grep '[0-9] mapped (' $ipflagstat | awk -v a="\$READS_IN_PEAKS" -v OFS='\t' '{print "${ip}", a/\$1}' | cat $frip_score_header - > ${ip}_peaks.FRiP_mqc.tsv

    find * -type f -name "*.${PEAK_TYPE}" -exec echo -e "bwa/mergedLibrary/macs/${PEAK_TYPE}/"{}"\\t0,0,178" \\; > ${ip}_peaks.igv.txt
    """
}

/*
 * STEP 6.3 Annotate peaks with HOMER
 *
 * Runs `annotatePeaks.pl` on each MACS2 peak file against the reference FASTA and
 * GTF and writes ${ip}_peaks.annotatePeaks.txt. Only runs when params.macs_gsize is set.
 */
process AnnotatePeaks {
    tag "${ip} vs ${control}"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary/macs/${PEAK_TYPE}", mode: 'copy'

    when:
    params.macs_gsize

    input:
    // only ip and peak are used by the script; the other values come along with the tuple
    set val(antibody), val(replicatesExist), val(multipleGroups), val(ip), val(control), file(peak) from ch_macs_homer
    file fasta from ch_fasta
    file gtf from ch_gtf

    output:
    file "*.txt" into ch_macs_annotate

    script:
    """
    annotatePeaks.pl \\
        $peak \\
        $fasta \\
        -gid \\
        -gtf $gtf \\
        -cpu $task.cpus \\
        > ${ip}_peaks.annotatePeaks.txt
    """
}

/*
 * STEP 6.4 Aggregated QC plots for peaks, FRiP and peak-to-gene annotation
 *
 * Runs once over all peak files and all annotation files: plot_macs_qc.r and
 * plot_homer_annotatepeaks.r produce the plots / summaries, and the annotation
 * summary is prefixed with a header file to form a MultiQC table.
 * Only runs when params.macs_gsize is set.
 */
process PeakQC {
    label "process_medium"
    publishDir "${params.outdir}/bwa/mergedLibrary/macs/${PEAK_TYPE}/qc", mode: 'copy'

    when:
    params.macs_gsize

    input:
    // it[-1] is the peak file, the last element of each ch_macs_qc tuple
    file peaks from ch_macs_qc.collect{ it[-1] }
    file annos from ch_macs_annotate.collect()
    file peak_annotation_header from ch_peak_annotation_header

    output:
    file "*.{txt,pdf}" into ch_macs_qc_output
    file "*.tsv" into ch_macs_qc_mqc

    script:  // This script is bundled with the pipeline, in nf-core/chipseq/bin/
    // Sample labels (-s) are the file names with their fixed suffix removed.
    // replaceAll treats its first argument as a regular expression, so the '.' in
    // the suffixes matches any character.
    """
    plot_macs_qc.r \\
        -i ${peaks.join(',')} \\
        -s ${peaks.join(',').replaceAll("_peaks.${PEAK_TYPE}","")} \\
        -o ./ \\
        -p macs_peak

    plot_homer_annotatepeaks.r \\
        -i ${annos.join(',')} \\
        -s ${annos.join(',').replaceAll("_peaks.annotatePeaks.txt","")} \\
        -o ./ \\
        -p macs_annotatePeaks

    cat $peak_annotation_header macs_annotatePeaks.summary.txt > macs_annotatePeaks.summary_mqc.tsv
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                 CONSENSUS PEAKS ANALYSIS                            -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// From this point on peaks are grouped by antibody (first tuple element). The two
// boolean design flags are carried forward, taking the value of the first group member.
ch_macs_consensus
    .map { antibody, replicatesExist, multipleGroups, ip, control, peak -> [ antibody, replicatesExist, multipleGroups, peak ] }
    .groupTuple()
    .map { antibody, replicate_flags, group_flags, peaks -> [ antibody, replicate_flags[0], group_flags[0], peaks.sort() ] }
    .set { ch_macs_consensus }

/*
 * STEP 7.1 Consensus peaks across samples, create boolean filtering file, .saf file for featureCounts and UpSetR plot for intersection
 *
 * Runs once per antibody, and only when params.macs_gsize is set and the antibody
 * has replicates or multiple groups.
 */
process ConsensusPeakSet {
    tag "${antibody}"
    label 'process_long'
    publishDir "${params.outdir}/bwa/mergedLibrary/macs/${PEAK_TYPE}/consensus/${antibody}", mode: 'copy',
        saveAs: { filename ->
                      // the IGV helper file is only needed by the IGV process
                      filename.endsWith(".igv.txt") ? null : filename
                }

    when:
    params.macs_gsize && (replicatesExist || multipleGroups)

    input:
    set val(antibody), val(replicatesExist), val(multipleGroups), file(peaks) from ch_macs_consensus

    output:
    set val(antibody), val(replicatesExist), val(multipleGroups), file("*.bed") into ch_macs_consensus_bed
    set val(antibody), file("*.saf") into ch_macs_consensus_saf
    file "*.boolean.txt" into ch_macs_consensus_bool
    file "*.intersect.{txt,plot.pdf}" into ch_macs_consensus_intersect
    file "*igv.txt" into ch_macs_consensus_igv

    script: // scripts are bundled with the pipeline, in nf-core/chipseq/bin/
    prefix = "${antibody}.consensus_peaks"
    // mergeBed collapses columns 2..last_col; narrow peak files carry one more column
    def last_col = params.narrow_peak ? 10 : 9
    mergecols = (2..last_col).join(',')
    collapsecols = (['collapse'] * (last_col - 1)).join(',')
    expandparam = params.narrow_peak ? "--is_narrow_peak" : ""
    """
    sort -k1,1 -k2,2n ${peaks.collect{it.toString()}.sort().join(' ')} \\
        | mergeBed -c $mergecols -o $collapsecols > ${prefix}.txt

    macs2_merged_expand.py ${prefix}.txt \\
        ${peaks.collect{it.toString()}.sort().join(',').replaceAll("_peaks.${PEAK_TYPE}","")} \\
        ${prefix}.boolean.txt \\
        --min_replicates $params.min_reps_consensus \\
        $expandparam

    awk -v FS='\t' -v OFS='\t' 'FNR > 1 { print \$1, \$2, \$3, \$4, "0", "+" }' ${prefix}.boolean.txt > ${prefix}.bed

    echo -e "GeneID\tChr\tStart\tEnd\tStrand" > ${prefix}.saf
    awk -v FS='\t' -v OFS='\t' 'FNR > 1 { print \$4, \$1, \$2, \$3,  "+" }' ${prefix}.boolean.txt >> ${prefix}.saf

    plot_peak_intersect.r -i ${prefix}.boolean.intersect.txt -o ${prefix}.boolean.intersect.plot.pdf

    find * -type f -name "${prefix}.bed" -exec echo -e "bwa/mergedLibrary/macs/${PEAK_TYPE}/consensus/${antibody}/"{}"\\t0,0,0" \\; > ${prefix}.bed.igv.txt
    """
}

/*
 * STEP 7.2 Annotate consensus peaks with HOMER, and add annotation to boolean output file
 *
 * Runs `annotatePeaks.pl` on the consensus BED, then pastes a sorted subset of the
 * annotation columns next to the boolean file. Same run condition as ConsensusPeakSet.
 */
process ConsensusPeakSetAnnotate {
    tag "${antibody}"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary/macs/${PEAK_TYPE}/consensus/${antibody}", mode: 'copy'

    when:
    params.macs_gsize && (replicatesExist || multipleGroups)

    input:
    set val(antibody), val(replicatesExist), val(multipleGroups), file(bed) from ch_macs_consensus_bed
    // NOTE(review): the boolean file is not keyed by antibody; it is paired with the bed
    // purely by emission order of the two channels - confirm both stay in step
    file bool from ch_macs_consensus_bool
    file fasta from ch_fasta
    file gtf from ch_gtf

    output:
    file "*.annotatePeaks.txt" into ch_macs_consensus_annotate

    script:
    prefix = "${antibody}.consensus_peaks"
    // The second command drops the first annotation column, keeps the header line in
    // place while sorting the remaining lines on their first two columns, and then
    // keeps column 6 onwards before pasting them next to the boolean file.
    """
    annotatePeaks.pl \\
        $bed \\
        $fasta \\
        -gid \\
        -gtf $gtf \\
        -cpu $task.cpus \\
        > ${prefix}.annotatePeaks.txt

    cut -f2- ${prefix}.annotatePeaks.txt | awk 'NR==1; NR > 1 {print \$0 | "sort -k1,1 -k2,2n"}' | cut -f6- > tmp.txt
    paste $bool tmp.txt > ${prefix}.boolean.annotatePeaks.txt
    """
}

// get bam and saf files for each ip
// Step by step (tuple layout taken from the process input declarations):
//   1. re-key each row on the ip name (it[3]), keeping [ antibody, replicatesExist, multipleGroups ]
//   2. join with the per-sample BAMs from ch_rm_orphan_name_bam_counts (keyed by name)
//   3. flatten back to [ antibody, replicatesExist, multipleGroups, bam ]
//   4. group by antibody, keep the first value of each flag and a sorted, flat list of BAMs
//   5. join with the consensus .saf file of the same antibody
// Result: [ antibody, replicatesExist, multipleGroups, bams, saf ]
ch_group_bam_deseq
    .map { it -> [ it[3], [ it[0], it[1], it[2] ] ] }
    .join(ch_rm_orphan_name_bam_counts)
    .map { it -> [ it[1][0], it[1][1], it[1][2], it[2] ] }
    .groupTuple()
    .map { it -> [ it[0], it[1][0], it[2][0], it[3].flatten().sort() ] }
    .join(ch_macs_consensus_saf)
    .set { ch_group_bam_deseq }

/*
 * STEP 7.3 Count reads in consensus peaks with featureCounts and perform differential analysis with DESeq2
 *
 * Runs once per antibody, and only when params.macs_gsize is set, the antibody has
 * both replicates and multiple groups, and params.skip_diff_analysis is not set.
 */
process ConsensusPeakSetDESeq {
    tag "${antibody}"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary/macs/${PEAK_TYPE}/consensus/${antibody}/deseq2", mode: 'copy',
        saveAs: { filename ->
                      // the IGV helper file is not published
                      if (filename.endsWith(".igv.txt")) null
                      else filename
                }

    when:
    params.macs_gsize && replicatesExist && multipleGroups && !params.skip_diff_analysis

    input:
    set val(antibody), val(replicatesExist), val(multipleGroups), file(bams) ,file(saf) from ch_group_bam_deseq
    file deseq2_pca_header from ch_deseq2_pca_header
    file deseq2_clustering_header from ch_deseq2_clustering_header

    output:
    file "*featureCounts.txt" into ch_macs_consensus_counts
    file "*featureCounts.txt.summary" into ch_macs_consensus_counts_mqc
    file "*.{RData,results.txt,pdf,log}" into ch_macs_consensus_deseq_results
    file "sizeFactors" into ch_macs_consensus_deseq_factors
    file "*vs*/*.{pdf,txt}" into ch_macs_consensus_deseq_comp_results
    file "*vs*/*.bed" into ch_macs_consensus_deseq_comp_bed
    file "*igv.txt" into ch_macs_consensus_deseq_comp_igv
    file "*.tsv" into ch_macs_consensus_deseq_mqc

    script:
    prefix = "${antibody}.consensus_peaks"
    // keep only the .bam entries of the staged list (it may also contain .bai files)
    bam_files = bams.findAll { it.toString().endsWith('.bam') }.sort()
    // BAM file suffix handed to featurecounts_deseq2.r via -b; it differs between
    // single-end (".sorted.bam") and paired-end (".bam") input
    bam_ext = params.single_end ? ".mLb.clN.sorted.bam" : ".mLb.clN.bam"
    pe_params = params.single_end ? '' : "-p --donotsort"
    // The sed calls make the MultiQC section ids unique per task (task.index) and
    // prefix the section titles with the antibody name.
    """
    featureCounts \\
        -F SAF \\
        -O \\
        --fracOverlap 0.2 \\
        -T $task.cpus \\
        $pe_params \\
        -a $saf \\
        -o ${prefix}.featureCounts.txt \\
        ${bam_files.join(' ')}

    featurecounts_deseq2.r -i ${prefix}.featureCounts.txt -b '$bam_ext' -o ./ -p $prefix -s .mLb

    sed 's/deseq2_pca/deseq2_pca_${task.index}/g' <$deseq2_pca_header >tmp.txt
    sed -i -e 's/DESeq2:/${antibody} DESeq2:/g' tmp.txt
    cat tmp.txt ${prefix}.pca.vals.txt > ${prefix}.pca.vals_mqc.tsv

    sed 's/deseq2_clustering/deseq2_clustering_${task.index}/g' <$deseq2_clustering_header >tmp.txt
    sed -i -e 's/DESeq2:/${antibody} DESeq2:/g' tmp.txt
    cat tmp.txt ${prefix}.sample.dists.txt > ${prefix}.sample.dists_mqc.tsv

    find * -type f -name "*.FDR0.05.results.bed" -exec echo -e "bwa/mergedLibrary/macs/${PEAK_TYPE}/consensus/${antibody}/deseq2/"{}"\\t255,0,0" \\; > ${prefix}.igv.txt
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                             IGV                                     -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 8 - Create IGV session file
 *
 * Concatenates the per-track *.igv.txt files produced by earlier steps and passes
 * the list to igv_files_to_session.py to build igv_session.xml.
 * Skipped when params.skip_igv is set.
 */
process IGV {
    publishDir "${params.outdir}/igv/${PEAK_TYPE}", mode: 'copy'

    when:
    !params.skip_igv

    input:
    file fasta from ch_fasta
    // ifEmpty([]) lets the process run even when a step produced no track files
    file bigwigs from ch_bigwig_igv.collect().ifEmpty([])
    file peaks from ch_macs_igv.collect().ifEmpty([])
    file consensus_peaks from ch_macs_consensus_igv.collect().ifEmpty([])
    file differential_peaks from ch_macs_consensus_deseq_comp_igv.collect().ifEmpty([])

    output:
    file "*.{txt,xml}" into ch_igv_session

    script: // scripts are bundled with the pipeline, in nf-core/chipseq/bin/
    // the genome path and --path_prefix are given relative to the publish directory
    """
    cat *.txt > igv_files.txt
    igv_files_to_session.py igv_session.xml igv_files.txt ../../reference_genome/${fasta.getName()} --path_prefix '../../'
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                          MULTIQC                                    -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * Parse software version numbers
 *
 * Writes the version output of every tool into a v_<tool>.txt file and lets
 * scrape_software_versions.py turn them into software_versions_mqc.yaml.
 */
process get_software_versions {
    publishDir "${params.outdir}/pipeline_info", mode: 'copy',
        saveAs: { filename ->
                      // only the CSV is published; the YAML is passed on to MultiQC
                      if (filename.indexOf(".csv") > 0) filename
                      else null
                }

    output:
    file 'software_versions_mqc.yaml' into ch_software_versions_mqc
    file "software_versions.csv"

    script:
    // `echo \$(tool 2>&1)` captures output that a tool prints to stderr;
    // v_homer.txt is created empty (no version command is run for HOMER here).
    """
    echo $workflow.manifest.version > v_pipeline.txt
    echo $workflow.nextflow.version > v_nextflow.txt
    fastqc --version > v_fastqc.txt
    trim_galore --version > v_trim_galore.txt
    echo \$(bwa 2>&1) > v_bwa.txt
    samtools --version > v_samtools.txt
    bedtools --version > v_bedtools.txt
    echo \$(bamtools --version 2>&1) > v_bamtools.txt
    echo \$(plotFingerprint --version 2>&1) > v_deeptools.txt || true
    picard MarkDuplicates --version &> v_picard.txt  || true
    echo \$(R --version 2>&1) > v_R.txt
    python -c "import pysam; print(pysam.__version__)" > v_pysam.txt
    echo \$(macs2 --version 2>&1) > v_macs2.txt
    touch v_homer.txt
    echo \$(featureCounts -v 2>&1) > v_featurecounts.txt
    preseq &> v_preseq.txt
    multiqc --version > v_multiqc.txt
    scrape_software_versions.py &> software_versions_mqc.yaml
    """
}

/*
 * Write the workflow summary as a MultiQC custom-content YAML file.
 *
 * summary: map of label -> value; every entry becomes one <dt>/<dd> pair and
 *          values that are empty / false are shown as a greyed-out "N/A".
 * Returns the path of the file, created as workflow_summary_mqc.yaml in workDir.
 */
def create_workflow_summary(summary) {

    def yaml_file = workDir.resolve('workflow_summary_mqc.yaml')
    // The interpolated <dt> lines carry their own indentation: stripIndent() runs on
    // the final text, so they end up nested inside the "data: |" block.
    // The N/A placeholder is closed with </span> to match its opening <span> tag.
    yaml_file.text  = """
    id: 'nf-core-chipseq-summary'
    description: " - this information is collected when the pipeline is started."
    section_name: 'nf-core/chipseq Workflow Summary'
    section_href: 'https://github.com/nf-core/chipseq'
    plot_type: 'html'
    data: |
        <dl class=\"dl-horizontal\">
${summary.collect { k,v -> "            <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</span>'}</samp></dd>" }.join("\n")}
        </dl>
    """.stripIndent()

   return yaml_file
}

/*
 * STEP 9 - MultiQC
 *
 * Stages the reports of all previous steps into sub-directories and runs MultiQC
 * over them with the listed modules. Skipped when params.skip_multiqc is set.
 */
process MultiQC {
    publishDir "${params.outdir}/multiqc/${PEAK_TYPE}", mode: 'copy'

    when:
    !params.skip_multiqc

    input:
    file multiqc_config from ch_multiqc_config

    file ('software_versions/*') from ch_software_versions_mqc.collect()
    file ('workflow_summary/*') from create_workflow_summary(summary)

    // inputs declared with ifEmpty([]) are optional
    file ('fastqc/*') from ch_fastqc_reports_mqc.collect().ifEmpty([])
    file ('trimgalore/*') from ch_trimgalore_results_mqc.collect().ifEmpty([])
    file ('trimgalore/fastqc/*') from ch_trimgalore_fastqc_reports_mqc.collect().ifEmpty([])

    file ('alignment/library/*') from ch_sort_bam_flagstat_mqc.collect()
    file ('alignment/mergedLibrary/*') from ch_merge_bam_stats_mqc.collect()
    // this channel carries (name, flagstat) tuples; only the file is staged
    file ('alignment/mergedLibrary/*') from ch_rm_orphan_flagstat_mqc.collect{it[1]}
    file ('alignment/mergedLibrary/*') from ch_rm_orphan_stats_mqc.collect()
    file ('alignment/mergedLibrary/picard_metrics/*') from ch_merge_bam_metrics_mqc.collect()
    file ('alignment/mergedLibrary/picard_metrics/*') from ch_collectmetrics_mqc.collect()

    file ('macs/*') from ch_macs_mqc.collect().ifEmpty([])
    file ('macs/*') from ch_macs_qc_mqc.collect().ifEmpty([])
    file ('macs/consensus/*') from ch_macs_consensus_counts_mqc.collect().ifEmpty([])
    file ('macs/consensus/*') from ch_macs_consensus_deseq_mqc.collect().ifEmpty([])

    file ('preseq/*') from ch_preseq_mqc.collect().ifEmpty([])
    file ('deeptools/*') from ch_plotfingerprint_mqc.collect().ifEmpty([])
    file ('deeptools/*') from ch_plotprofile_mqc.collect().ifEmpty([])
    file ('phantompeakqualtools/*') from ch_spp_out_mqc.collect().ifEmpty([])
    file ('phantompeakqualtools/*') from ch_spp_csv_mqc.collect().ifEmpty([])

    output:
    file "*multiqc_report.html" into ch_multiqc_report
    file "*_data"
    file "multiqc_plots"

    script:
    // a custom run name becomes the report title and, with every run of non-word
    // characters replaced by a single '_', the report file name
    rtitle = custom_runName ? "--title \"$custom_runName\"" : ''
    rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : ''
    """
    multiqc . -f $rtitle $rfilename --config $multiqc_config \\
        -m custom_content -m fastqc -m cutadapt -m samtools -m picard -m preseq -m featureCounts -m deeptools -m phantompeakqualtools
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                       REPORTS/DOCUMENTATION                         -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 10 - Output description HTML
 */
process output_documentation {
    // Copy this process's declared outputs into <params.outdir>/Documentation
    publishDir "${params.outdir}/Documentation", mode: 'copy'

    input:
    // File(s) received from the ch_output_docs channel; handed to
    // markdown_to_html.r as its first argument
    // (presumably the pipeline's Markdown output description - confirm where ch_output_docs is created)
    file output_docs from ch_output_docs

    output:
    // The file written by the script below; it is not sent to any channel,
    // so it is only exposed through publishDir
    file "results_description.html"

    script:
    // Invoke markdown_to_html.r with the staged input and the fixed output file name
    """
    markdown_to_html.r $output_docs results_description.html
    """
}

/*
 * Completion e-mail notification
 */
workflow.onComplete {
    // Completion handler. In order, it:
    //   1. collects run metadata into the fields used by the e-mail templates,
    //   2. tries to pick up the MultiQC report (successful runs only),
    //   3. renders the TXT / HTML / sendmail templates from $baseDir/assets,
    //   4. sends the summary e-mail when a recipient address is configured,
    //   5. writes the rendered reports to ${params.outdir}/pipeline_info/,
    //   6. logs the final pipeline status.

    // Set up the e-mail variables
    def subject = "[nf-core/chipseq] Successful: $workflow.runName"
    if (!workflow.success) {
        subject = "[nf-core/chipseq] FAILED: $workflow.runName"
    }
    def email_fields = [:]
    email_fields['version'] = workflow.manifest.version
    email_fields['runName'] = custom_runName ?: workflow.runName
    email_fields['success'] = workflow.success
    email_fields['dateComplete'] = workflow.complete
    email_fields['duration'] = workflow.duration
    email_fields['exitStatus'] = workflow.exitStatus
    email_fields['errorMessage'] = (workflow.errorMessage ?: 'None')
    email_fields['errorReport'] = (workflow.errorReport ?: 'None')
    email_fields['commandLine'] = workflow.commandLine
    email_fields['projectDir'] = workflow.projectDir
    // NOTE: this stores a reference to the script-level `summary` map (not a copy),
    // so the entries added below are added to `summary` itself.
    email_fields['summary'] = summary
    email_fields['summary']['Date Started'] = workflow.start
    email_fields['summary']['Date Completed'] = workflow.complete
    email_fields['summary']['Pipeline script file path'] = workflow.scriptFile
    email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId
    if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository
    if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId
    if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision
    if (workflow.container) email_fields['summary']['Docker image'] = workflow.container
    email_fields['summary']['Nextflow Version'] = workflow.nextflow.version
    email_fields['summary']['Nextflow Build'] = workflow.nextflow.build
    email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp

    // On success try attach the multiqc report
    def mqc_report = null
    try {
        if (workflow.success) {
            mqc_report = ch_multiqc_report.getVal()
            // The channel value may be a list of reports; only the first one is attached
            if (mqc_report.getClass() == ArrayList) {
                log.warn "[nf-core/chipseq] Found multiple reports from process 'multiqc', will use only one"
                mqc_report = mqc_report[0]
            }
        }
    } catch (all) {
        // Best effort: a missing report must not prevent the summary from being produced
        log.warn "[nf-core/chipseq] Could not attach MultiQC report to summary email"
    }

    // Check if we are only sending emails on failure
    // (declared with `def` so the address stays local to this handler)
    def email_address = params.email
    if (!params.email && params.email_on_fail && !workflow.success) {
        email_address = params.email_on_fail
    }

    // Render the TXT template
    def engine = new groovy.text.GStringTemplateEngine()
    def tf = new File("$baseDir/assets/email_template.txt")
    def txt_template = engine.createTemplate(tf).make(email_fields)
    def email_txt = txt_template.toString()

    // Render the HTML template
    def hf = new File("$baseDir/assets/email_template.html")
    def html_template = engine.createTemplate(hf).make(email_fields)
    def email_html = html_template.toString()

    // Render the sendmail template
    def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ]
    def sf = new File("$baseDir/assets/sendmail_template.txt")
    def sendmail_template = engine.createTemplate(sf).make(smail_fields)
    def sendmail_html = sendmail_template.toString()

    // Send the HTML e-mail
    if (email_address) {
        try {
            // Deliberately jump to the plaintext branch below when plain text is requested.
            // `new` is required: without it Groovy looks for a *method* named GroovyException
            // and the plaintext path was only reached via the resulting MissingMethodException.
            if (params.plaintext_email) { throw new GroovyException('Send plaintext e-mail, not HTML') }
            // Try to send HTML e-mail using sendmail
            [ 'sendmail', '-t' ].execute() << sendmail_html
            log.info "[nf-core/chipseq] Sent summary e-mail to $email_address (sendmail)"
        } catch (all) {
            // Catch failures and try with plaintext
            try {
                [ 'mail', '-s', subject, email_address ].execute() << email_txt
                log.info "[nf-core/chipseq] Sent summary e-mail to $email_address (mail)"
            } catch (mail_error) {
                // Do not let a missing/broken `mail` command abort the handler:
                // the report files and the final status message below must still be produced.
                log.warn "[nf-core/chipseq] Could not send summary e-mail to $email_address: ${mail_error.message}"
            }
        }
    }

    // Write summary e-mail HTML to a file
    def output_d = new File("${params.outdir}/pipeline_info/")
    if (!output_d.exists()) {
        output_d.mkdirs()
    }
    def output_hf = new File(output_d, "pipeline_report.html")
    output_hf.withWriter { w -> w << email_html }
    def output_tf = new File(output_d, "pipeline_report.txt")
    output_tf.withWriter { w -> w << email_txt }

    // ANSI colour codes for the final status lines (empty when monochrome logs are requested)
    def c_reset = params.monochrome_logs ? '' : "\033[0m"
    def c_purple = params.monochrome_logs ? '' : "\033[0;35m"
    def c_green = params.monochrome_logs ? '' : "\033[0;32m"
    def c_red = params.monochrome_logs ? '' : "\033[0;31m"

    // A run can succeed overall while some task errors were ignored; report the counts
    if (workflow.stats.ignoredCount > 0 && workflow.success) {
        log.info "${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}"
        log.info "${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}"
        log.info "${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}"
    }

    if (workflow.success) {
        log.info "${c_purple}[nf-core/chipseq]${c_green} Pipeline completed successfully${c_reset}"
    } else {
        // On failure, also check whether the hostname suggests a different -profile
        checkHostname()
        log.info "${c_purple}[nf-core/chipseq]${c_red} Pipeline completed with errors${c_reset}"
    }

}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                       NF-CORE HEADER                                -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

def nfcoreHeader() {
    // Build the nf-core ASCII-art banner shown at start-up and in the help message.
    // Returns the banner as a String containing the pipeline name and
    // workflow.manifest.version; colours are omitted when params.monochrome_logs is set.

    // Log colors ANSI codes.
    // Declared with `def` so they are local to this function instead of being
    // written to the global script binding; the previously assigned but unused
    // cyan/white codes are dropped.
    def c_reset = params.monochrome_logs ? '' : "\033[0m"
    def c_dim = params.monochrome_logs ? '' : "\033[2m"
    def c_black = params.monochrome_logs ? '' : "\033[0;30m"
    def c_green = params.monochrome_logs ? '' : "\033[0;32m"
    def c_yellow = params.monochrome_logs ? '' : "\033[0;33m"
    def c_blue = params.monochrome_logs ? '' : "\033[0;34m"
    def c_purple = params.monochrome_logs ? '' : "\033[0;35m"

    // The banner text itself is unchanged; stripIndent() removes the common leading indentation
    return """    -${c_dim}--------------------------------------------------${c_reset}-
                                            ${c_green},--.${c_black}/${c_green},-.${c_reset}
    ${c_blue}        ___     __   __   __   ___     ${c_green}/,-._.--~\'${c_reset}
    ${c_blue}  |\\ | |__  __ /  ` /  \\ |__) |__         ${c_yellow}}  {${c_reset}
    ${c_blue}  | \\| |       \\__, \\__/ |  \\ |___     ${c_green}\\`-._,-`-,${c_reset}
                                            ${c_green}`._,._,\'${c_reset}
    ${c_purple}  nf-core/chipseq v${workflow.manifest.version}${c_reset}
    -${c_dim}--------------------------------------------------${c_reset}-
    """.stripIndent()
}

def checkHostname() {
    // Compare the machine's hostname (output of the `hostname` command) with the
    // name fragments listed per profile in params.hostnames, and log an error-level
    // notice for every fragment that matches while its profile name is not
    // contained in the active -profile string.

    // ANSI colour codes; all empty when monochrome logs are requested
    def monochrome = params.monochrome_logs
    def c_reset = monochrome ? '' : "\033[0m"
    def c_white = monochrome ? '' : "\033[0;37m"
    def c_red = monochrome ? '' : "\033[1;91m"
    def c_yellow_bold = monochrome ? '' : "\033[1;93m"

    if (params.hostnames) {
        def hostname = "hostname".execute().text.trim()
        params.hostnames.each { prof, hnames ->
            hnames.each { hname ->
                // `return` inside the closure only skips to the next hostname fragment
                if (!hostname.contains(hname)) {
                    return
                }
                if (workflow.profile.contains(prof)) {
                    return
                }
                def notice = [
                    "====================================================",
                    "  ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`",
                    "  but your machine hostname is ${c_white}'$hostname'${c_reset}",
                    "  ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`",
                    "============================================================"
                ].join("\n")
                log.error notice
            }
        }
    }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                        END OF PIPELINE                              -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////