Merge pull request #348 from sanger-tol/dev

Release 1.2.0
sanger-tol · Jan 14, 2025 · 577d30b · 577d30b
2 parents 6afa1ae + 715d33e
commit 577d30b
Show file tree

Hide file tree

Showing 269 changed files with 11,872 additions and 2,546 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -22,54 +22,27 @@ jobs:
     name: Run pipeline with test data
     # Only run on push if this is the nf-core dev branch (merged PRs)
     if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/treeval') }}"
-    runs-on: ubuntu2204-8c
+    runs-on: [ubuntu-latest]
     strategy:
       matrix:
         NXF_VER:
           - "22.10.1"
           - "latest-everything"
     steps:
-      - name: Get branch names
-        # Pulls the names of current branches in repo
-        # steps.branch-names.outputs.current_branch is used later and returns the name of the branch the PR is made FROM not to
-        id: branch-names
-        uses: tj-actions/branch-names@v8
+      - name: Check out pipeline code
+        uses: actions/checkout@v4
 
       - name: Install Nextflow
-        uses: nf-core/setup-nextflow@v1
+        uses: nf-core/setup-nextflow@v2
         with:
           version: "${{ matrix.NXF_VER }}"
 
-      - name: Setup apptainer
-        uses: eWaterCycle/setup-apptainer@main
-
-      - name: Set up Singularity
-        run: |
-          mkdir -p $NXF_SINGULARITY_CACHEDIR
-          mkdir -p $NXF_SINGULARITY_LIBRARYDIR
-
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.10"
-
-      - name: Install nf-core
-        run: |
-          pip install nf-core
-
-      - name: NF-Core Download - download singularity containers
-        # Forcibly download repo on active branch and download SINGULARITY containers into the CACHE dir if not found
-        # Must occur after singularity install or will crash trying to dl containers
-        # Zip up this fresh download and run the checked out version
-        run: |
-          nf-core download sanger-tol/treeval --revision ${{ steps.branch-names.outputs.current_branch }} --compress none -d --force --outdir sanger-treeval --container-cache-utilisation amend --container-system singularity
-
       - name: Download Tiny test data
         # Download A fungal test data set that is full enough to show some real output.
         run: |
           curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf -
 
-      - name: Singularity - Run FULL pipeline with test data
+      - name: Run FULL pipeline with test data
         # Remember that you can parallelise this by using strategy.matrix
         run: |
-          nextflow run ./sanger-treeval/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test_github,singularity --outdir ./Sing-Full
+          nextflow run ${GITHUB_WORKSPACE} -profile test_github,docker --outdir ./Sing-Full
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,8 @@ output/
 null/
 error
 out
+OscSUBSET-TEST
+OscSUBSET-TEST-MODULE-UPDATE
+OscSUBSET-TEST-MODULE-UPDATE-3
+TreeValTinyData
+yaml
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,86 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.2.0] - Ancient Destiny - [2024-11-15]
+
+Our 3rd release for sanger-tol/treeval.
+
+### Enhancements & Fixes
+
+- Togglable subworkflows
+- Adds a JBrowse Only workflow (this will lead to an update to the FULL workflow which can now call JBROWSE_ONLY and RAPID).
+- Updates to containers (local modules) to remove Anaconda dependencies following policy changes.
+- Updates to modules to remove Anaconda dependencies following policy changes
+  - The majority of these updates only remove the `default` channel from the environment.yml
+- CONDA warnings for modules which cannot use CONDA.
+- Removal of a liberal use of spaces.
+- reformat_intersect was previously not outputting version data.
+- Adding arch specification to Pretext GitHub actions runner. Hopefully this will stop the spurious errors we see on there.
+- Addition of steps into schema.
+- Adds \*ktab as an output.
+- Adds \*bin as an output for faster downstream map generation.
+- Updated singularity containers
+- Added `--metaeuk` to BUSCO_BUSCO, default was causing pipeline errors on Actions -- Needs more investigation.
+- Replaced Pyfasta split (deprecated 6 years ago) with Seqkit split which is frequently updated and very fast.
+- Allocated resource review
+
+### Parameters
+
+| Old Parameter | New Parameter |
+| ------------- | ------------- |
+| -             | --steps       |
+
+### Software dependencies
+
+Note, since the pipeline is using Nextflow DSL2, each process will be run with its own Biocontainer. This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.
+
+| Module                                 | Old Version      | New Versions      |
+| -------------------------------------- | ---------------- | ----------------- |
+| bamtobed_sort ( bedtools + samtools )  | 2.31.0 + 1.17    |                   |
+| bedtools                               | 2.31.1           | -                 |
+| busco\*                                | 5.5.0            | -                 |
+| bwa-mem2                               | 2.2.1            |                   |
+| cat                                    | 2.3.4            |                   |
+| chunk_fasta ( pyfasta )                | 0.5.2-1          | REMOVED           |
+| cooler                                 | 0.9.2            |                   |
+| cram_filter_align_bwamem2_fixmate_sort | -                |                   |
+| ^ ( samtools + bwamem2 ) ^             | 1.17 + 2.2.1     |                   |
+| coreutils                              | 9.1              |                   |
+| fastk                                  | 1.0.1            |                   |
+| gcc                                    | 10.4.0           |                   |
+| find_telomere_windows ( java-jdk )     | 8.0.112          |                   |
+| generate_cram_csv ( samtools )         | 1.17             |                   |
+| gnu-sort                               | 8.25             | 9.3               |
+| juicer_tools_pre ( java-jdk )          | 8.0.112          |                   |
+| perl                                   | 5.26.2           |                   |
+| merquryfk                              | 1.0.1            |                   |
+| minimap2 + samtools                    | 2.24 + 1.14      |                   |
+| minimap2_index                         | 2.24             | 2.28              |
+| miniprot                               | 0.11--he4a0461_2 |                   |
+| mummer                                 | 3.23             |                   |
+| paftools ( minimap2 + samtools )       | 2.24 + 1.14      |                   |
+| pretextmap + samtools                  | 0.0.2 + 1.17     | 0.0.3 + 1.17      |
+| python                                 | 3.9              | -                 |
+| - pandas                               | 1.5.2            | -                 |
+| samtools                               | 1.18             | 1.21              |
+| selfcomp_splitfasta ( perl-bioperl )   | 1.7.8-1          |                   |
+| seqtk                                  | 1.4              |                   |
+| seqkit                                 | ADDED            | 2.9.0--h9ee0642_0 |
+| tabix                                  | 1.11             |                   |
+| ucsc                                   | 377              | 447               |
+| windowmasker (blast)                   | 2.14.0           | 2.15.0            |
+
+- busco is currently pinned to v5.5.0 - Upgrading to v5.7.1 would cause GitHub Actions to crash. Further investigation needed.
+
+## [1.1.1] - Ancient Aurora (H1) - [2024-04-26]
+
+### Enhancements & Fixes
+
+- Generate CRAM CSV fix to allow for multi-readgroup cram files
+- Removing KMER_READCOV
+- tmp directory was being used
+- Output file adjustment (names and location)
+
 ## [1.1.0] - Ancient Aurora - [2024-04-26]
 
 The second release for sanger-tol, created with the [nf-core](https://nf-co.re/) template.
@@ -40,6 +120,13 @@ This builds on the initial release by adding subworkflows which generate kmer ba
 - Fix a bug in build_alignment_blocks.py to avoid indexing errors happening in large genomes.
 - Change output BEDGRAPH from EXTRACT_TELO module.
 
+#### Hot Fix 1
+
+- Generate CRAM CSV fix to allow for multi-readgroup cram files
+- Removing KMER_READCOV
+- tmp directory was being used
+- Output file adjustment (names and location)
+
 ### Parameters
 
 | Old Parameter | New Parameter |

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -108,9 +108,9 @@
 
 ## Software packaging/containerisation tools
 
-- [Anaconda](https://anaconda.com)
+- [Conda](https://conda.org/)
 
-  > Anaconda Software Distribution. 2016. Computer software. Vers. 2-2.4.0. Anaconda, Web.
+  > conda contributors. conda: A system-level, binary package and environment manager running on all major operating systems and platforms. Computer software. https://github.com/conda/conda
 
 - [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/)
 

diff --git a/README.md b/README.md
@@ -1,13 +1,13 @@
 [![Cite with Zenodo](https://zenodo.org/badge/509096312.svg)](https://zenodo.org/doi/10.5281/zenodo.10047653)
 [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
-[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
+[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)
 [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
 [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
 [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-tol/treeval)
 
 ## Introduction
 
-**sanger-tol/treeval [1.1.0 - Ancient Aurora]** is a bioinformatics best-practice analysis pipeline for the generation of data supplemental to the curation of reference quality genomes. This pipeline has been written to generate flat files compatible with [JBrowse2](https://jbrowse.org/jb2/) as well as HiC maps for use in Juicebox, PretextView and HiGlass.
+**sanger-tol/treeval [1.2.0 - Ancient Destiny]** is a bioinformatics best-practice analysis pipeline for the generation of data supplemental to the curation of reference quality genomes. This pipeline has been written to generate flat files compatible with [JBrowse2](https://jbrowse.org/jb2/) as well as HiC maps for use in Juicebox, PretextView and HiGlass.
 
 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
 
@@ -80,8 +80,6 @@ If you would like to contribute to this pipeline, please see the [contributing g
 
 ## Citations
 
-<!--TODO: Citation-->
-
 If you use sanger-tol/treeval for your analysis, please cite it using the following doi: [10.5281/zenodo.10047653](https://doi.org/10.5281/zenodo.10047653).
 
 ### Tools

diff --git a/assets/github_testing/TreeValTinyFullTest.yaml b/assets/github_testing/TreeValTinyFullTest.yaml
@@ -19,10 +19,8 @@ kmer_profile:
   kmer_length: 31
   dir: /home/runner/work/treeval/treeval/TreeValTinyData/
 alignment:
-  data_dir: /home/runner/work/treeval/treeval/TreeValTinyData/gene_alignment_data/
-  common_name: "" # For future implementation (adding bee, wasp, ant etc)
-  geneset_id: "LaetiporusSulphureus.gfLaeSulp1"
-  #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv"
+  genesets:
+    - /home/runner/work/treeval/treeval/TreeValTinyData/gene_alignment_data/fungi/csv_data/LaetiporusSulphureus.gfLaeSulp1-data.csv
 self_comp:
   motif_len: 0
   mummer_chunk: 10
@@ -31,8 +29,7 @@ intron:
 telomere:
   teloseq: TTAGGG
 synteny:
-  synteny_path: /home/runner/work/treeval/treeval/treeval/TreeValTinyData/synteny
-  synteny_genomes: "LaetiporusSulphureus"
+  - /home/runner/work/treeval/treeval/TreeValTinyData/synteny/fungi/LaetiporusSulphureus.fasta
 busco:
   lineages_path: /home/runner/work/treeval/treeval/TreeValTinyData/busco/subset/
   lineage: fungi_odb10
diff --git a/assets/local_testing/nxOscDF5033-BGA.yaml b/assets/local_testing/nxOscDF5033-BGA.yaml
@@ -11,8 +11,8 @@ assem_reads:
   hic: /workspace/treeval-curation/Oscheius_DF5033/hic-arima2/
   supplementary: path # Not currently in use
 alignment:
-  data_dir: /workspace/treeval-curation/gene_alignment_data/
-  geneset: "OscheiusTipulae.ASM1342590v1,CaenorhabditisElegans.WBcel235,Gae_host.Gae"
+  genesets:
+    - /lustre/scratch123/tol/resources/treeval/gene_alignment_data/nematode/csv_data/OscheiusTipulae.ASM1342590v1-data.csv
 self_comp:
   motif_len: 0
   mummer_chunk: 10
@@ -21,8 +21,7 @@ intron:
 telomere:
   teloseq: TTAGGG
 synteny:
-  synteny_path: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/synteny/
-  synteny_genomes: "LaetiporusSulphureus"
+  - /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/synteny/fungi/LaetiporusSulphureus.fasta
 busco:
   lineages_path: /workspace/treeval-curation/busco/v5
   lineage: nematoda_odb10
diff --git a/assets/local_testing/nxOscDF5033.yaml b/assets/local_testing/nxOscDF5033.yaml
@@ -19,10 +19,10 @@ kmer_profile:
   kmer_length: 31
   dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/
 alignment:
-  data_dir: /lustre/scratch123/tol/resources/treeval/gene_alignment_data/
-  common_name: "" # For future implementation (adding bee, wasp, ant etc)
-  geneset_id: "OscheiusTipulae.ASM1342590v1,CaenorhabditisElegans.WBcel235,Gae_host.Gae"
-  #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv"
+  genesets:
+    - /lustre/scratch123/tol/resources/treeval/gene_alignment_data/nematode/csv_data/OscheiusTipulae.ASM1342590v1-data.csv
+    - /lustre/scratch123/tol/resources/treeval/gene_alignment_data/nematode/csv_data/CaenorhabditisElegans.WBcel235-data.csv
+    - /lustre/scratch123/tol/resources/treeval/gene_alignment_data/nematode/csv_data/Gae_host.Gae-data.csv
 self_comp:
   motif_len: 0
   mummer_chunk: 10
@@ -31,8 +31,7 @@ intron:
 telomere:
   teloseq: TTAGGG
 synteny:
-  synteny_path: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/synteny/
-  synteny_genomes: ""
+  - /lustre/scratch123/tol/resources/treeval/synteny/bird/bCucCan1.fasta
 busco:
   lineages_path: /lustre/scratch123/tol/resources/busco/v5
   lineage: nematoda_odb10
diff --git a/assets/local_testing/nxOscSUBSET.yaml b/assets/local_testing/nxOscSUBSET.yaml
@@ -19,10 +19,8 @@ kmer_profile:
   kmer_length: 31
   dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/
 alignment:
-  data_dir: /lustre/scratch123/tol/resources/treeval/gene_alignment_data/
-  common_name: "" # For future implementation (adding bee, wasp, ant etc)
-  geneset_id: "Gae_host.Gae"
-  #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv"
+  genesets:
+    - /lustre/scratch123/tol/resources/treeval/gene_alignment_data/nematode/csv_data/OscheiusTipulae.ASM1342590v1-data.csv
 self_comp:
   motif_len: 0
   mummer_chunk: 10
@@ -31,8 +29,8 @@ intron:
 telomere:
   teloseq: TTAGGG
 synteny:
-  synteny_path: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/synteny/
-  synteny_genomes: ""
+  - /lustre/scratch123/tol/resources/treeval/synteny/bird/bCucCan1.fasta
+  - /lustre/scratch123/tol/resources/treeval/synteny/bird/bGalGal1.fasta
 busco:
   lineages_path: /lustre/scratch123/tol/resources/busco/v5
   lineage: nematoda_odb10
diff --git a/bin/FKprof b/bin/FKprof
diff --git a/bin/awk_filter_reads.sh b/bin/awk_filter_reads.sh
@@ -1 +1,12 @@
-awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
+# Stdin filter: lines whose first field starts with "@" pass through unchanged;
+# every other line has bit 2048 cleared in field 2 and its first character dropped.
+# Usage: awk_filter_reads.sh [-v]   (-v prints the script version instead of filtering)
+version='1.0.0'
+# "$1" is quoted because the filter normally runs with no arguments; unquoted, an empty
+# $1 collapses the test to `[ == '-v' ]`, which prints "unary operator expected" to stderr.
+if [ "$1" = '-v' ];
+then
+    echo "$version"
+else
+    awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
+fi
diff --git a/bin/bed_to_contacts.sh b/bin/bed_to_contacts.sh
@@ -1,2 +1,12 @@
 #!/bin/bash
-paste -d '\t' - - < $1 | awk 'BEGIN {FS="\t"; OFS="\t"} {if ($1 > $7) {print substr($4,1,length($4)-2),$12,$7,$8,"16",$6,$1,$2,"8",$11,$5} else {print substr($4,1,length($4)-2),$6,$1,$2,"8",$12,$7,$8,"16",$5,$11} }' | tr '\-+' '01'  | sort -k3,3d -k7,7d | awk 'NF==11'
+
+# Usage: bed_to_contacts.sh <input_file>   or   bed_to_contacts.sh -v   (-v prints the script version)
+version='1.0.0'
+# "$1" is quoted in both the test and the redirect so that a missing argument or a
+# path containing whitespace is not word-split into a malformed test / ambiguous redirect.
+if [ "$1" == '-v' ];
+then
+    echo "$version"
+else
+    paste -d '\t' - - < "$1" | awk 'BEGIN {FS="\t"; OFS="\t"} {if ($1 > $7) {print substr($4,1,length($4)-2),$12,$7,$8,"16",$6,$1,$2,"8",$11,$5} else {print substr($4,1,length($4)-2),$6,$1,$2,"8",$12,$7,$8,"16",$5,$11} }' | tr '\-+' '01'  | sort -k3,3d -k7,7d | awk 'NF==11'
+fi
diff --git a/bin/find_telomere b/bin/find_telomere
diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh
@@ -71,6 +71,11 @@ if [ -z "$1" ]; then
     exit 1
 fi
 
+if [ "$1" == "-v" ]; then  # quoted so an argument containing whitespace cannot split the test
+    echo "1.0"
+    exit 0  # printing the version is a successful run; a non-zero status would report failure to the caller
+fi
+
 cram_path=$1
 chunkn=0
 outcsv=$2

diff --git a/bin/get_avgcov.sh b/bin/get_avgcov.sh
@@ -8,7 +8,7 @@
 # Author = yy5
 # -------------------
 version='1.0.0'
-if [ $1 == '-v'];
+if [ $1 == '-v' ];
 then
     echo "$version"
 else

diff --git a/bin/get_busco_gene.sh b/bin/get_busco_gene.sh
@@ -11,5 +11,13 @@
 # Update for BUSCO 5.5.0 - by we3
 # Reorder start and end so smallest always second column. Also, trim range from scaffold name in first column.
 # -------------------
-
-cat $1| grep -v '#'|awk '$2!="Missing"'| awk '{if($4>$5){print $3"\t"$5"\t"$4"\t"$1"\t"$7"\t"$6"\t"$9}else{print $3"\t"$4"\t"$5"\t"$1"\t"$7"\t"$6"\t"$9}}'| awk -F'\t' -v OFS='\t' '{if($7==""){$7="no_orthodb_link"}; sub(/:.*/,"",$1);print $1,$2,$3,$4,$5,$6,$7}'
+# Usage: get_busco_gene.sh <input_file>   or   get_busco_gene.sh -v   (-v prints the script version)
+version='1.0.0'
+# "$1" is quoted in the test and in the cat call so that a missing argument or a
+# path containing whitespace is not word-split into a malformed test or several file names.
+if [ "$1" == '-v' ];
+then
+    echo "$version"
+else
+    cat "$1"| grep -v '#'|awk '$2!="Missing"'| awk '{if($4>$5){print $3"\t"$5"\t"$4"\t"$1"\t"$7"\t"$6"\t"$9}else{print $3"\t"$4"\t"$5"\t"$1"\t"$7"\t"$6"\t"$9}}'| awk -F'\t' -v OFS='\t' '{if($7==""){$7="no_orthodb_link"}; sub(/:.*/,"",$1);print $1,$2,$3,$4,$5,$6,$7}'
+fi
diff --git a/bin/grep_pg.sh b/bin/grep_pg.sh
@@ -7,4 +7,13 @@
 # -------------------
 # Author = yy5
 
-grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}'
+# Usage: <producer> | grep_pg.sh   or   grep_pg.sh -v   (-v prints the script version)
+version='1.0.0'
+# "$1" is quoted because the filter normally runs with no arguments; unquoted, an empty
+# $1 collapses the test to `[ == '-v' ]`, which prints "unary operator expected" to stderr.
+if [ "$1" == '-v' ];
+then
+    echo "$version"
+else
+    grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}'
+fi
diff --git a/bin/paf_to_bed.sh b/bin/paf_to_bed.sh
@@ -10,7 +10,7 @@
 
 version='1.0.0'
 
-if [ $1 == '-v'];
+if [ $1 == '-v' ];
 then
     echo "$version"
 else