Skip to content

Commit

Permalink
Merge pull request #192 from broadinstitute/sn_POD-2367_remove_rsync
Browse files Browse the repository at this point in the history
Remove rsync option
  • Loading branch information
snovod authored Jan 10, 2025
2 parents 2b2b0d5 + 11011e4 commit 265a280
Show file tree
Hide file tree
Showing 8 changed files with 15 additions and 117 deletions.
33 changes: 8 additions & 25 deletions python/hard_clone_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ def get_args() -> Namespace:
parser.add_argument('--batch_size', "-b", type=int,
help="Number of files validate and copy at a time. If not specified, "
"all files will be copied at once")
parser.add_argument('--metadata_only', "-m", action="store_true",
help="Only copy metadata, no actual file copy")
parser.add_argument('--do_not_update_acls', action="store_true",
help="Do not update the destination workspace ACLs with the source workspace ACLs. " +
"If you do not have owner access of the source workspace, you should use this flag.")
Expand Down Expand Up @@ -191,16 +189,6 @@ def run(self) -> None:
self.dest_workspace.update_multiple_users_acl(acl_list=src_workspace_acls_list)


def make_bucket_files(src_bucket: str, dest_bucket: str) -> None:
logging.info(f"Creating {DEST_BUCKET_FILE}")
with open(DEST_BUCKET_FILE, "w") as f:
f.write(f"gs://{dest_bucket}/")

logging.info(f"Creating {SOURCE_BUCKET_FILE}")
with open(SOURCE_BUCKET_FILE, "w") as f:
f.write(f"gs://{src_bucket}/")


def check_and_wait_for_permissions(external_bucket: str, total_hours: int) -> None:
"""Checks if the account has write permissions for a given bucket. Retries every 30 minutes
for a total time of the user provided hours. Cannot wait for more than 5 hours total.
Expand Down Expand Up @@ -257,7 +245,6 @@ def check_and_wait_for_permissions(external_bucket: str, total_hours: int) -> No
workers = args.workers
extensions_to_ignore = args.extensions_to_ignore
batch_size = args.batch_size
metadata_only = args.metadata_only
do_not_update_acls = args.do_not_update_acls
external_bucket = args.external_bucket

Expand Down Expand Up @@ -335,18 +322,14 @@ def check_and_wait_for_permissions(external_bucket: str, total_hours: int) -> No
logging.info(f"Uploading {tsv} to destination workspace")
dest_workspace.upload_metadata_to_workspace_table(entities_tsv=tsv)

if not metadata_only:
# Copy files from source workspace to destination workspace
CopyFilesToDestWorkspace(
src_bucket=src_bucket,
dest_bucket=dest_bucket,
extensions_to_ignore=extensions_to_ignore,
workers=workers,
batch_size=batch_size
).run()

# This is just done for the wdl to run rsync
make_bucket_files(src_bucket, dest_bucket)
# Copy files from source workspace to destination workspace
CopyFilesToDestWorkspace(
src_bucket=src_bucket,
dest_bucket=dest_bucket,
extensions_to_ignore=extensions_to_ignore,
workers=workers,
batch_size=batch_size
).run()

# Set the destination workspace ACLs
if not do_not_update_acls:
Expand Down
22 changes: 0 additions & 22 deletions wdl/HardCloneWithExternalBucket/HardCloneWithExternalBucket.wdl
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
version 1.0

import "../utils/GcpUtils.wdl" as gcp_utils

workflow HardCloneWithExternalBucket {
input {
String source_billing_project
Expand All @@ -10,7 +8,6 @@ workflow HardCloneWithExternalBucket {
String dest_workspace_name
String external_bucket
Boolean allow_already_created
Boolean rsync_workspace
Boolean do_not_update_acls
Int? workers
String? extensions_to_ignore
Expand All @@ -22,8 +19,6 @@ workflow HardCloneWithExternalBucket {
}

String docker_name = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"])
# Ignore HardCloneTerraWorkspace submisisons files so do not write to src as copying to dest
String rysnc_regex_exclude = ".*/HardCloneWithExternalBucket/.*"
Int memory = select_first([memory_gb, 8])

call HardCloneWithExternalBucketTask {
Expand All @@ -39,20 +34,10 @@ workflow HardCloneWithExternalBucket {
docker_name=docker_name,
memory_gb=memory,
batch_size=batch_size,
metadata_only=rsync_workspace,
do_not_update_acls=do_not_update_acls,
check_and_wait_for_permissions=check_and_wait_for_permissions,
max_permissions_wait_time=max_permissions_wait_time
}
if (rsync_workspace) {
call gcp_utils.GcloudRsync {
input:
source=HardCloneWithExternalBucketTask.src_bucket,
destination=HardCloneWithExternalBucketTask.dest_bucket,
exclude_regex=rysnc_regex_exclude
}
}
}

task HardCloneWithExternalBucketTask {
Expand All @@ -67,7 +52,6 @@ task HardCloneWithExternalBucketTask {
String? extensions_to_ignore
String docker_name
Int memory_gb
Boolean metadata_only
Boolean do_not_update_acls
Int? batch_size
Boolean check_and_wait_for_permissions
Expand All @@ -85,17 +69,11 @@ task HardCloneWithExternalBucketTask {
~{"--workers " + workers} \
~{"--extensions_to_ignore " + extensions_to_ignore} \
~{"--batch_size " + batch_size} \
~{if metadata_only then "--metadata_only" else ""} \
~{if do_not_update_acls then "--do_not_update_acls" else ""} \
~{if check_and_wait_for_permissions then "--check_and_wait_for_permissions" else ""} \
~{"--max_permissions_wait_time " + max_permissions_wait_time}
>>>

output {
String dest_bucket = read_string("dest_workspace_bucket.txt")
String src_bucket = read_string("source_workspace_bucket.txt")
}

runtime {
docker: docker_name
memory: "${memory_gb} GiB"
Expand Down
7 changes: 3 additions & 4 deletions wdl/HardCloneWithExternalBucket/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,13 @@ This WDL script creates a new workspace that is nearly identical to the source w
| **external_bucket** | External GCP bucket to copy files to and reference in metadata. Should be like gs://bucket/ | String | Yes | N/A |
| **dest_workspace_name** | The name of the new workspace to be created. | String | Yes | N/A |
| **allow_already_created** | If `true`, allows the script to proceed without failing if the destination workspace already exists. | Boolean | Yes | N/A |
| **rsync_workspace** | If you would like to use rsync for copy instead of gcloud libraries. Can be quicker if large copies. | Boolean | Yes | N/A |
| **do_not_update_acls** | If `true`, skips updating the destination workspace ACLs to match the source workspace. If you do not have OWNER access to the source workspace, you must set this to `true` or the workflow will fail. | Boolean | Yes | N/A |
| **check_and_wait_for_permissions** | When used, workflow will check write permissions on destination bucket every 30 minutes for 5 hours total before exiting. Useful for when permissions were newly added and could take some time to propagate | Boolean | Yes | N/A |
| **workers** | The number of worker threads to use for the file transfer. Only used if not rsyncing. Optional. | Int | No | 10 |
| **extensions_to_ignore** | A comma-separated list of file extensions to ignore during the file copy process. Only used if not rsyncing. Optional. Do not include spaces (i.e. ".txt,.tsv") | String | No | N/A |
| **workers** | The number of worker threads to use for the file transfer. Optional. | Int | No | 10 |
| **extensions_to_ignore** | A comma-separated list of file extensions to ignore during the file copy process. Optional. Do not include spaces (i.e. ".txt,.tsv") | String | No | N/A |
| **docker** | Specifies a custom Docker image to use. Optional. | String | No | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest |
| **memory_gb** | Amount of memory, in GiB, given to the task. | Int | No | 8 |
| **batch_size** | Can be used if you want to batch up how much files at a time to copy over. If not used will do it all in one batch. Only used if not rsyncing | Int | No | N/A |
| **batch_size** | The number of files to copy at a time. If not specified, all files are copied in one batch. | Int | No | N/A |
| **max_permissions_wait_time** | Optional total time to wait for permissions to propagate. Defaults to 5 hours. Won't run for more than 5 hours total. | Int | No | 5 |


Expand Down
1 change: 0 additions & 1 deletion wdl/HardCloneWithExternalBucket/template_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,5 @@
"HardCloneWithExternalBucket.dest_workspace_name": "String",
"HardCloneWithExternalBucket.allow_already_created": "Boolean",
"HardCloneWithExternalBucket.external_bucket": "String",
"HardCloneWithExternalBucket.rsync": "Boolean",
"HardCloneWithExternalBucket.check_and_wait_for_permissions": "Boolean"
}
22 changes: 0 additions & 22 deletions wdl/HardCloneWorkspace/HardCloneWorkspace.wdl
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
version 1.0

import "../utils/GcpUtils.wdl" as gcp_utils

workflow HardCloneTerraWorkspace {
input {
String source_billing_project
String source_workspace_name
String dest_billing_project
String dest_workspace_name
Boolean allow_already_created
Boolean rsync_workspace
Boolean do_not_update_acls
Int? workers
String? extensions_to_ignore
Expand All @@ -21,8 +18,6 @@ workflow HardCloneTerraWorkspace {
}

String docker_name = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"])
# Ignore HardCloneTerraWorkspace submisisons files so do not write to src as copying to dest
String rysnc_regex_exclude = ".*/HardCloneTerraWorkspace/.*"
Int memory = select_first([memory_gb, 8])

call HardCloneTerraWorkspaceTask {
Expand All @@ -37,20 +32,10 @@ workflow HardCloneTerraWorkspace {
docker_name=docker_name,
memory_gb=memory,
batch_size=batch_size,
metadata_only=rsync_workspace,
do_not_update_acls=do_not_update_acls,
check_and_wait_for_permissions=check_and_wait_for_permissions,
max_permissions_wait_time=max_permissions_wait_time
}
if (rsync_workspace) {
call gcp_utils.GcloudRsync {
input:
source=HardCloneTerraWorkspaceTask.src_bucket,
destination=HardCloneTerraWorkspaceTask.dest_bucket,
exclude_regex=rysnc_regex_exclude
}
}
}

task HardCloneTerraWorkspaceTask {
Expand All @@ -64,7 +49,6 @@ task HardCloneTerraWorkspaceTask {
String? extensions_to_ignore
String docker_name
Int memory_gb
Boolean metadata_only
Boolean do_not_update_acls
Int? batch_size
Boolean check_and_wait_for_permissions
Expand All @@ -81,17 +65,11 @@ task HardCloneTerraWorkspaceTask {
~{"--workers " + workers} \
~{"--extensions_to_ignore " + extensions_to_ignore} \
~{"--batch_size " + batch_size} \
~{if metadata_only then "--metadata_only" else ""} \
~{if do_not_update_acls then "--do_not_update_acls" else ""} \
~{if check_and_wait_for_permissions then "--check_and_wait_for_permissions" else ""} \
~{"--max_permissions_wait_time " + max_permissions_wait_time}
>>>

output {
String dest_bucket = read_string("dest_workspace_bucket.txt")
String src_bucket = read_string("source_workspace_bucket.txt")
}

runtime {
docker: docker_name
memory: "${memory_gb} GiB"
Expand Down
7 changes: 3 additions & 4 deletions wdl/HardCloneWorkspace/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,13 @@ This WDL script creates a new workspace that is nearly identical to the source w
| **dest_billing_project** | The billing project for the new destination workspace. | String | Yes | N/A |
| **dest_workspace_name** | The name of the new workspace to be created. | String | Yes | N/A |
| **allow_already_created** | If `true`, allows the script to proceed without failing if the destination workspace already exists. | Boolean | Yes | N/A |
| **rsync_workspace** | If you would like to use rsync for copy instead of gcloud libraries. Can be quicker if large copies. | Boolean | Yes | N/A |
| **do_not_update_acls** | If `true`, skips updating the destination workspace ACLs to match the source workspace. If you do not have OWNER access to the source workspace, you must set this to `true` or the workflow will fail. | Boolean | Yes | N/A |
| **check_and_wait_for_permissions** | When used, workflow will check write permissions on destination bucket every 30 minutes for 5 hours total before exiting. Useful for when permissions were newly added and could take some time to propagate | Boolean | Yes | N/A |
| **workers** | The number of worker threads to use for the file transfer. Only used if not rsyncing. Optional. | Int | No | 10 |
| **extensions_to_ignore** | A comma-separated list of file extensions to ignore during the file copy process. Only used if not rsyncing. Optional. Do not include spaces (i.e. ".txt,.tsv") | String | No | N/A |
| **workers** | The number of worker threads to use for the file transfer. Optional. | Int | No | 10 |
| **extensions_to_ignore** | A comma-separated list of file extensions to ignore during the file copy process. Optional. Do not include spaces (i.e. ".txt,.tsv") | String | No | N/A |
| **docker** | Specifies a custom Docker image to use. Optional. | String | No | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest |
| **memory_gb** | Amount of memory, in GiB, given to the task. | Int | No | 8 |
| **batch_size** | Can be used if you want to batch up how much files at a time to copy over. If not used will do it all in one batch. Only used if not rsyncing | Int | No | N/A |
| **batch_size** | The number of files to copy at a time. If not specified, all files are copied in one batch. | Int | No | N/A |
| **max_permissions_wait_time** | Optional total time to wait for permissions to propagate. Defaults to 5 hours. Won't run for more than 5 hours total. | Int | No | 5 |


Expand Down
3 changes: 1 addition & 2 deletions wdl/HardCloneWorkspace/template_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@
"HardCloneTerraWorkspace.source_workspace_name": "String",
"HardCloneTerraWorkspace.dest_billing_project": "String",
"HardCloneTerraWorkspace.dest_workspace_name": "String",
"HardCloneTerraWorkspace.allow_already_created": "Boolean",
"HardCloneTerraWorkspace.rsync": "Boolean"
"HardCloneTerraWorkspace.allow_already_created": "Boolean"
}
37 changes: 0 additions & 37 deletions wdl/utils/GcpUtils.wdl

This file was deleted.

0 comments on commit 265a280

Please sign in to comment.