diff --git a/.github/workflows/build_and_push.yml b/.github/workflows/build_and_push.yml
new file mode 100644
index 00000000..a2e6251b
--- /dev/null
+++ b/.github/workflows/build_and_push.yml
@@ -0,0 +1,26 @@
+name: build and push docker images
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - "*"
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Login to GHCR (GitHub Packages)
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      # add --with-dev to the commands below to build & push the dev image
+      - name: Build docker image
+        run: ./docker/build
+      - name: Push docker image
+        run: ./docker/push
diff --git a/README.md b/README.md
index d48211cd..f00c6f7b 100644
--- a/README.md
+++ b/README.md
@@ -107,3 +107,24 @@ Testing can be done in a second terminal (make sure to activate 'beebop_py') by
 ```
 TESTING=True poetry run pytest
 ```
+
+### Diagrams
+
+- There is a .drawio graph in the `diagrams` folder illustrating the process of running an analysis. This includes
+all the files created and how they are used in each job. You can open and view the diagram at [draw.io](https://draw.io).
+
+## Use/Deploy a specific version of PopPUNK
+
+To use a specific version, commit or branch of PopPUNK in a beebop_py deployment, update `POPPUNK_VERSION` in `docker/common`.
+
+The new dev images built with `./docker/build --with-dev` will have a *-dev* suffix.
+
+### Local Development
+
+You can build the image with `./docker/build --with-dev`; this new image can then be used by Beebop.
+
+### Deployment
+
+A pull request can be created so that GitHub Actions pushes the images to GHCR. Add `--with-dev` to the build & push commands in `.github/workflows/build_and_push.yml`.
+**Be sure to remove the `--with-dev` flag before merging the PR.**
+Then, on `beebop-deploy`, the api image can be updated with the new dev image.
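As a rough illustration of the dev-image flow the README section above describes (the script paths and the `--with-dev` flag come from this PR; a local checkout and an existing ghcr.io login are assumed):

```bash
# Build the production image and, because of the flag, the dev image too;
# the dev image is tagged <sha>-dev (see docker/common)
./docker/build --with-dev

# Re-tag the SHA images with the branch name and push them to ghcr.io;
# without --with-dev only the production image is pushed
./docker/push --with-dev
```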
diff --git a/beebop/app.py b/beebop/app.py
index b11075bb..2ac033cb 100644
--- a/beebop/app.py
+++ b/beebop/app.py
@@ -304,18 +304,23 @@ def run_poppunk_internal(sketches: dict,
                              depends_on=job_assign, **queue_kwargs)
     redis.hset("beebop:hash:job:network", p_hash, job_network.id)
     # microreact
-    # delete all previous microreact cluster job results
+    # delete all previous microreact cluster job results for this project
     redis.delete(f"beebop:hash:job:microreact:{p_hash}")
-    job_microreact = q.enqueue(visualise.microreact,
-                               args=(p_hash,
-                                     fs,
-                                     full_db_fs,
-                                     args,
-                                     name_mapping,
-                                     species,
-                                     redis_host,
-                                     queue_kwargs),
-                               depends_on=Dependency([job_assign, job_network], allow_failure=True), **queue_kwargs)
+    job_microreact = q.enqueue(
+        visualise.microreact,
+        args=(
+            p_hash,
+            fs,
+            full_db_fs,
+            args,
+            name_mapping,
+            species,
+            redis_host,
+            queue_kwargs,
+        ),
+        depends_on=Dependency([job_assign, job_network], allow_failure=True),
+        **queue_kwargs,
+    )
     redis.hset("beebop:hash:job:microreact", p_hash, job_microreact.id)
     return jsonify(
         response_success(
diff --git a/beebop/assignClusters.py b/beebop/assignClusters.py
index fc779655..3c0a6bd4 100644
--- a/beebop/assignClusters.py
+++ b/beebop/assignClusters.py
@@ -196,8 +196,10 @@ def handle_external_clusters(
         config.external_clusters_prefix,
     )
     if not_found_query_names:
-        queries_names, queries_clusters, not_found_query_clusters = filter_queries(
-            queries_names, queries_clusters, not_found_query_names
+        queries_names, queries_clusters, not_found_query_clusters = (
+            filter_queries(
+                queries_names, queries_clusters, not_found_query_names
+            )
         )
         output_full_tmp = config.fs.output_tmp(config.p_hash)
         not_found_query_names_new, not_found_query_clusters_new = (
@@ -241,15 +243,19 @@ def handle_not_found_queries(
     """
     [Handles queries that were not found in the initial external
     clusters file.
-    This function processes the sketches of the queries that were not found for external clusters from the reference db,
-    assigns clusters to them from the full db, and then summarizes the clusters. It also
+    This function processes the sketches of the queries that were not found
+    for external clusters from the reference db,
+    assigns clusters to them from the full db, and then
+    summarizes the clusters. It also
     handles all file manipulations needed]

     :param config: [ClusteringConfig with all necessary information]
-    :param sketches_dict: [dictionary with filehash (key) and sketch (value)]
+    :param sketches_dict: [dictionary
+        with filehash (key) and sketch (value)]
     :param not_found_query_names: [list of sample hashes that were not found]
     :param output_full_tmp: [path to temporary output directory]
-    :param not_found_query_clusters: [set of clusters assigned to initial not found samples]
+    :param not_found_query_clusters: [set of clusters assigned to
+        initial not found samples]
     :return tuple[list, list]: [list initial not found sample hashes,
         list of clusters assigned to initial not found samples]
     """
@@ -280,6 +286,7 @@ def handle_not_found_queries(

     return query_names, query_clusters

+
 def handle_files_manipulation(
     config: ClusteringConfig,
     output_full_tmp: str,
@@ -288,13 +295,16 @@ def handle_files_manipulation(
     """
     [Handles file manipulations for queries that were not found
     in the initial external clusters file.
-    This function copies include files from the full assign output directory
-    to the output directory, deletes include files for queries that were not found,
+    This function copies include files from the
+    full assign output directory
+    to the output directory, deletes include files for queries
+    that were not found,
     and merges the partial query graph files.]
-
+
     :param config: [ClusteringConfig with all necessary information]
     :param output_full_tmp: [path to temporary output directory]
-    :param not_found_query_clusters: [set of clusters assigned to initial not found samples]
+    :param not_found_query_clusters: [set of clusters assigned
+        to initial not found samples]
     """
     delete_include_files(
         config.fs,
@@ -302,7 +312,11 @@ def handle_files_manipulation(
         not_found_query_clusters,
     )
     copy_include_files(output_full_tmp, config.out_dir)
-    merge_txt_files(config.fs.partial_query_graph(config.p_hash), config.fs.partial_query_graph_tmp(config.p_hash))
+    merge_txt_files(
+        config.fs.partial_query_graph(config.p_hash),
+        config.fs.partial_query_graph_tmp(config.p_hash),
+    )
+

 def update_external_clusters(
     config: ClusteringConfig,
@@ -316,8 +330,10 @@ def update_external_clusters(
     using the full database.

     This function reads the external clusters from the new previous query clustering file
-    and updates the initial external clusters file on ref db with the clusters for samples
-    that were initially not found, and have now been assigned by the current query with the full database.]
+    and updates the initial external clusters
+    file on ref db with the clusters for samples
+    that were initially not found, and have now been
+    assigned by the current query with the full database.]

     :param config: [ClusteringConfig with all necessary information]
@@ -399,10 +415,11 @@ def filter_queries(

     :param queries_names: [list of sample hashes]
     :param queries_clusters: [list of sample PopPUNK clusters]
-    :param not_found: [list of sample hashes that were not found]
-    :param config: [ClusteringConfig with all necessary information]
+    :param not_found: [list of sample hashes
+        that were not found]
     :return tuple[list[str], list[str], set[str]]: [filtered sample hashes,
-        filtered sample PopPUNK clusters, set of clusters assigned to not found samples]
+        filtered sample PopPUNK clusters,
+        set of clusters assigned to not found samples]
     """
     filtered_names = [name for name in queries_names if name not in not_found]
     filtered_clusters = [
@@ -410,8 +427,12 @@ def filter_queries(
         for name, cluster in zip(queries_names, queries_clusters)
         if name not in not_found
     ]
-
-    return filtered_names, filtered_clusters, set(queries_clusters) - set(filtered_clusters)
+
+    return (
+        filtered_names,
+        filtered_clusters,
+        set(queries_clusters) - set(filtered_clusters),
+    )


 def delete_include_files(
diff --git a/beebop/visualise.py b/beebop/visualise.py
index e22ef4b5..b2601650 100644
--- a/beebop/visualise.py
+++ b/beebop/visualise.py
@@ -39,7 +39,8 @@ def microreact(
     redis = Redis(host=redis_host)
     # get results from previous job
     current_job = get_current_job(redis)
-    assign_result = current_job.dependency.result  # gets first dependency result (i.e assign_clusters)
+    # gets first dependency result (i.e. assign_clusters)
+    assign_result = current_job.dependency.result

     external_to_poppunk_clusters = None
     try:
diff --git a/buildkite/pipeline.yml b/buildkite/pipeline.yml
deleted file mode 100644
index 34b4fa6a..00000000
--- a/buildkite/pipeline.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-steps:
-  - label: ":whale::python: Build"
-    command: docker/build
-
-  - wait
-
-  - label: ":shipit: Push images"
-    command: docker/push
\ No newline at end of file
diff --git a/diagrams/Beebop-PopPUNK-run.drawio b/diagrams/Beebop-PopPUNK-run.drawio
new file mode 100644
index 00000000..ecd9b2ab
--- /dev/null
+++ b/diagrams/Beebop-PopPUNK-run.drawio
@@ -0,0 +1,543 @@
+[543 lines of draw.io XML omitted]
\ No newline at end of file
diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev
new file mode 100644
index 00000000..bb4e2587
--- /dev/null
+++ b/docker/Dockerfile.dev
@@ -0,0 +1,61 @@
+FROM continuumio/miniconda3
+
+# ensure conda does not install packages for the wrong architecture
+ENV CONDA_OVERRIDE_ARCHSPEC=skylake
+
+ARG POPPUNK_VERSION="v2.6.7"
+
+# Make RUN commands use the new environment:
+SHELL ["conda", "run", "-n", "base", "/bin/bash", "-c"]
+
+# Set up conda environment
+RUN conda install python=3.10
+RUN conda config --append channels conda-forge && \
+    conda config --append channels bioconda
+
+# Install mamba, a faster package manager than conda
+RUN conda install -c conda-forge mamba
+RUN conda config --set channel_priority flexible
+
+# Install PopPUNK conda dependencies
+RUN mamba install -y -c conda-forge graph-tool mandrake
+RUN mamba install -y \
+    # Core data packages
+    pandas \
+    requests \
+    networkx \
+    scikit-learn \
+    # Bioinformatics tools
+    pp-sketchlib \
+    biopython \
+    treeswift \
+    rapidnj \
+    # Analysis tools
+    hdbscan \
+    # Progress tracking
+    tqdm \
+    && mamba clean -afy
+
+# System dependencies
+RUN apt-get update && \
+    apt-get install -y \
+    build-essential \
+    cmake \
+    libeigen3-dev \
+    libhdf5-dev \
+    libopenblas-dev
+
+
+RUN pip install git+https://github.com/bacpop/PopPUNK@${POPPUNK_VERSION}#egg=PopPUNK
+
+# Poetry setup
+RUN pip install poetry
+COPY *.toml *.lock /
+RUN poetry config virtualenvs.create false && \
+    poetry install
+
+COPY . /beebop
+WORKDIR /beebop
+EXPOSE 5000
+
+CMD ["conda", "run", "--no-capture-output", "-n", "base", "poetry", "run", "waitress-serve", "--port=5000", "beebop.app:app"]
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile.prod
similarity index 100%
rename from docker/Dockerfile
rename to docker/Dockerfile.prod
diff --git a/docker/build b/docker/build
index d91bf1b6..a3bd7d11 100755
--- a/docker/build
+++ b/docker/build
@@ -4,11 +4,21 @@ set -ex

 HERE=$(dirname $0)
 . $HERE/common

+# Build and push the production image
 docker build --pull \
   --tag $TAG_SHA \
-  -f docker/Dockerfile \
+  -f docker/Dockerfile.prod \
   $PACKAGE_ROOT
-
 # We always push the SHA tagged versions, for debugging if the tests
 # after this step fail
 docker push $TAG_SHA
+
+# Build and push the dev image only if the --with-dev flag is set
+if [ "$1" == "--with-dev" ]; then
+  docker build --pull \
+    --build-arg POPPUNK_VERSION=$POPPUNK_VERSION \
+    --tag $TAG_DEV_SHA \
+    -f docker/Dockerfile.dev \
+    $PACKAGE_ROOT
+  docker push $TAG_DEV_SHA
+fi
diff --git a/docker/common b/docker/common
index ad2f9552..a985b07e 100644
--- a/docker/common
+++ b/docker/common
@@ -1,25 +1,24 @@
 #!/usr/bin/env bash
 set -e
-
+REGISTRY=ghcr.io
 PACKAGE_ROOT=$(realpath $HERE/..)
 PACKAGE_NAME=beebop-py
-PACKAGE_ORG=mrcide
+PACKAGE_ORG=bacpop
+PACKAGE_DEV=dev

-# Buildkite doesn't check out a full history from the remote (just the
-# single commit) so you end up with a detached head and git rev-parse
-# doesn't work
-if [ "$BUILDKITE" = "true" ]; then
-    GIT_SHA=${BUILDKITE_COMMIT:0:7}
+GIT_SHA=$(git -C "$PACKAGE_ROOT" rev-parse --short=7 HEAD)
+if [[ -v "BRANCH_NAME" ]]; then
+    GIT_BRANCH=${BRANCH_NAME}
 else
-    GIT_SHA=$(git -C "$PACKAGE_ROOT" rev-parse --short=7 HEAD)
+    GIT_BRANCH=$(git symbolic-ref --short HEAD)
 fi

-if [ "$BUILDKITE" = "true" ]; then
-    GIT_BRANCH=$BUILDKITE_BRANCH
-else
-    GIT_BRANCH=$(git -C "$PACKAGE_ROOT" symbolic-ref --short HEAD)
-fi
+# production image
+TAG_SHA="${REGISTRY}/${PACKAGE_ORG}/${PACKAGE_NAME}:${GIT_SHA}"
+TAG_BRANCH="${REGISTRY}/${PACKAGE_ORG}/${PACKAGE_NAME}:${GIT_BRANCH}"
+TAG_LATEST="${REGISTRY}/${PACKAGE_ORG}/${PACKAGE_NAME}:latest"

-TAG_SHA="${PACKAGE_ORG}/${PACKAGE_NAME}:${GIT_SHA}"
-TAG_BRANCH="${PACKAGE_ORG}/${PACKAGE_NAME}:${GIT_BRANCH}"
-TAG_LATEST="${PACKAGE_ORG}/${PACKAGE_NAME}:latest"
\ No newline at end of file
+# development image
+TAG_DEV_SHA="${TAG_SHA}-${PACKAGE_DEV}"
+TAG_DEV_BRANCH="${TAG_BRANCH}-${PACKAGE_DEV}"
+POPPUNK_VERSION=v2.6.7 # can be a version, branch or commit
\ No newline at end of file
diff --git a/docker/push b/docker/push
index 3fecfecb..adc42edb 100755
--- a/docker/push
+++ b/docker/push
@@ -5,11 +5,16 @@
 HERE=$(dirname $0)
 . $HERE/common

 # In case we switch agents between steps
-[ ! -z $(docker images -q $TAG_SHA) ] || docker pull $TAG_SHA
 docker tag $TAG_SHA $TAG_BRANCH
 docker push $TAG_BRANCH

+if [ "$1" == "--with-dev" ]; then
+  [ ! -z $(docker images -q $TAG_DEV_SHA) ] || docker pull $TAG_DEV_SHA
+  docker tag $TAG_DEV_SHA $TAG_DEV_BRANCH
+  docker push $TAG_DEV_BRANCH
+fi
+
 if [ $GIT_BRANCH == "main" ]; then
   docker tag $TAG_SHA $TAG_LATEST
   docker push $TAG_LATEST
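For a concrete sense of the tag variables defined in `docker/common` above, here is how they would expand for a hypothetical commit `abc1234` on branch `main` (illustrative values only; the real SHA and branch come from git or the `BRANCH_NAME` env var):

```bash
# Hypothetical expansion of the tags from docker/common
TAG_SHA=ghcr.io/bacpop/beebop-py:abc1234
TAG_BRANCH=ghcr.io/bacpop/beebop-py:main
TAG_LATEST=ghcr.io/bacpop/beebop-py:latest
TAG_DEV_SHA=ghcr.io/bacpop/beebop-py:abc1234-dev
TAG_DEV_BRANCH=ghcr.io/bacpop/beebop-py:main-dev
```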
diff --git a/scripts/run_app b/scripts/run_app
index 40bbe940..784b99bb 100755
--- a/scripts/run_app
+++ b/scripts/run_app
@@ -10,4 +10,4 @@ function cleanup() {
 trap cleanup INT
 trap cleanup ERR

-STORAGE_LOCATION=./storage DBS_LOCATION=./storage/dbs FLASK_APP=beebop/app.py poetry run flask run
+rq worker & STORAGE_LOCATION=./storage DBS_LOCATION=./storage/dbs FLASK_APP=beebop/app.py poetry run flask run
diff --git a/tests/test_unit.py b/tests/test_unit.py
index 68778e9f..6b03b58f 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -1130,9 +1130,7 @@ def test_handle_external_clusters_with_not_found(mocker, config):
     )

     # not found function calls
-    mock_filter_queries.assert_called_once_with(
-        q_names, q_clusters, not_found
-    )
+    mock_filter_queries.assert_called_once_with(q_names, q_clusters, not_found)
     mock_handle_not_found.assert_called_once_with(
         config, {}, not_found, tmp_output, not_found_q_clusters
     )
@@ -1184,6 +1182,7 @@ def test_handle_not_found_queries(
     assert query_names == ["hash1"]
     assert query_clusters == [10]

+
 @patch("beebop.assignClusters.merge_txt_files")
 @patch("beebop.assignClusters.copy_include_files")
 @patch("beebop.assignClusters.delete_include_files")
@@ -1192,12 +1191,18 @@ def test_handle_files_manipulation(mock_delete, mock_copy, mock_merge, config):
     not_found_query_clusters = {1234, 6969}
     config.fs.partial_query_graph.return_value = "partial_query_graph"
     config.fs.partial_query_graph_tmp.return_value = "partial_query_graph_tmp"
-
-    assignClusters.handle_files_manipulation(config, outdir_tmp, not_found_query_clusters)
-    mock_delete.assert_called_once_with(config.fs, config.p_hash, not_found_query_clusters)
+    assignClusters.handle_files_manipulation(
+        config, outdir_tmp, not_found_query_clusters
+    )
+
+    mock_delete.assert_called_once_with(
+        config.fs, config.p_hash, not_found_query_clusters
+    )
     mock_copy.assert_called_once_with(outdir_tmp, config.out_dir)
-    mock_merge.assert_called_once_with("partial_query_graph", "partial_query_graph_tmp")
+    mock_merge.assert_called_once_with(
+        "partial_query_graph", "partial_query_graph_tmp"
+    )


 @patch("beebop.assignClusters.update_external_clusters_csv")
@@ -1284,38 +1289,44 @@ def test_copy_include_files_no_conflict(tmp_path):
     assert (output_full_tmp / f).exists()  # Still in original location
     assert not (outdir / f).exists()  # Not in new location

+
 def test_copy_include_file_conflict(tmp_path):
     output_full_tmp = tmp_path / "output_full_tmp"
     outdir = tmp_path / "outdir"
     output_full_tmp.mkdir()
     outdir.mkdir()
-
-    include_files_tmp = ["include_1.txt",]
+
+    include_files_tmp = [
+        "include_1.txt",
+    ]
     include_files = ["include_1.txt"]
-
+
     # Create include files
     (output_full_tmp / include_files_tmp[0]).write_text("new content")
     (outdir / include_files[0]).write_text("original content")
-
+
     assignClusters.copy_include_files(str(output_full_tmp), str(outdir))
-
-    assert not (output_full_tmp / include_files_tmp[0]).exists()  # Original removed
+
+    assert not (
+        output_full_tmp / include_files_tmp[0]
+    ).exists()  # Original removed
     included_file_content = (outdir / include_files[0]).read_text()
     assert "new content" in included_file_content  # New content
     assert "original content" in included_file_content  # Original content

+
 def test_filter_queries():
     q_names = ["sample1", "sample2", "sample3"]
     q_clusters = [1, 2, 3]
     not_found = ["sample2"]

-    filtered_names, filtered_clusters, not_found_q_clusters = assignClusters.filter_queries(
-        q_names, q_clusters, not_found
+    filtered_names, filtered_clusters, not_found_q_clusters = (
+        assignClusters.filter_queries(q_names, q_clusters, not_found)
     )

     assert filtered_names == ["sample1", "sample3"]
     assert filtered_clusters == [1, 3]
-    assert not_found_q_clusters
+    assert not_found_q_clusters


 def test_delete_include_files(tmp_path):
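Once the pipeline has pushed a branch-tagged dev image, it can be pulled for use in `beebop-deploy` by its branch tag; a minimal sketch, with a hypothetical branch name:

```bash
# Pull the dev image for a feature branch (branch name is illustrative)
docker pull ghcr.io/bacpop/beebop-py:my-feature-dev
```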