From 9cba2a572d54d4bb491b3701471be42269e45762 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian <45041478+sahakiann@users.noreply.github.com> Date: Tue, 21 Jan 2025 14:25:46 -0500 Subject: [PATCH] POD-2374: Add optional Version to Staging Workspace (#198) * initial commit * update logic for populating workspace metrics * add more defaults --- .../set_up_staging_workspace_and_dataset.py | 87 +++++++++++++++---- wdl/SetUpStagingWorkspaceAndDataset/README.md | 35 ++++---- .../SetUpStagingWorkspaceAndDataset.wdl | 8 +- 3 files changed, 95 insertions(+), 35 deletions(-) diff --git a/python/set_up_staging_workspace_and_dataset.py b/python/set_up_staging_workspace_and_dataset.py index b83d899..f9d3259 100644 --- a/python/set_up_staging_workspace_and_dataset.py +++ b/python/set_up_staging_workspace_and_dataset.py @@ -10,7 +10,8 @@ from utils.requests_utils.request_util import RunRequest from utils.token_util import Token from utils.gcp_utils import GCPCloudFunctions -from utils import GCP, comma_separated_list +from utils import GCP, comma_separated_list, ARG_DEFAULTS + logging.basicConfig( format="%(levelname)s: %(asctime)s : %(message)s", level=logging.INFO @@ -88,6 +89,13 @@ def get_args() -> Namespace: action="store_true", help="If dataset already exists, delete it before creating a new one", ) + parser.add_argument( + "--workspace_version", + help="The version of the workspace. 
This should be used when more data needs to be " + "uploaded to a dataset, but it's not ready to be made public yet.", + required=False, + type=int, + ) return parser.parse_args() @@ -100,7 +108,8 @@ def __init__( continue_if_exists: bool, controlled_access: bool, resource_owners: list[str], - resource_members: Optional[list[str]] + resource_members: Optional[list[str]], + workspace_version: Optional[int], ): self.terra_workspace = terra_workspace self.terra_groups = terra_groups @@ -109,10 +118,12 @@ def __init__( self.controlled_access = controlled_access self.resource_owners = resource_owners self.resource_members = resource_members + self.workspace_version = workspace_version def _set_up_access_group(self) -> None: logging.info(f"Creating group {self.auth_group}") - self.terra_groups.create_group(group_name=self.auth_group, continue_if_exists=self.continue_if_exists) + continue_if_exists = True if self.workspace_version else self.continue_if_exists + self.terra_groups.create_group(group_name=self.auth_group, continue_if_exists=continue_if_exists) for user in self.resource_owners: self.terra_groups.add_user_to_group(email=user, group=self.auth_group, role=ADMIN) if self.resource_members: @@ -199,7 +210,8 @@ def __init__( controlled_access: bool, terra_billing_project: str, delete_existing_dataset: bool, - phs_id: Optional[str] = None + workspace_version: Optional[int], + phs_id: Optional[str] = None, ): self.tdr = tdr self.dataset_name = dataset_name @@ -212,6 +224,7 @@ def __init__( self.auth_group = auth_group self.delete_existing_dataset = delete_existing_dataset self.controlled_access = controlled_access + self.workspace_version = workspace_version def _create_dataset_properties(self) -> dict: additional_properties = { @@ -225,17 +238,53 @@ def _create_dataset_properties(self) -> dict: return additional_properties def _add_row_to_table(self, dataset_id: str) -> None: + dataset_metrics = tdr.get_dataset_table_metrics(dataset_id=dataset_id, 
target_table_name=self.REFERENCE_TABLE) + workspace_billing_combo = f"{self.terra_billing_project}/{self.workspace_name}" + + ingest_records = [] + if not dataset_metrics: + ingest_records.extend( + [ + { + "key": "Staging Workspace", + "value": workspace_billing_combo, + }, + { + "key": "Authorization Group", + "value": self.auth_group + } + ] + ) + else: + linked_workspaces = [w["value"] for w in dataset_metrics if dataset_metrics] + if workspace_billing_combo not in linked_workspaces: + if not self.workspace_version: + ingest_records.extend( + [ + { + "key": "Staging Workspace", + "value": workspace_billing_combo + } + ] + ) + else: + ingest_records.extend( + [ + { + "key": f"Staging Workspace Version {self.workspace_version}", + "value": workspace_billing_combo + } + ] + ) + StartAndMonitorIngest( - ingest_records=[ - {"key": "Staging Workspace", "value": f'{self.terra_billing_project}/{self.workspace_name}'}, - {"key": "Authorization Group", "value": self.auth_group} - ], + ingest_records=ingest_records, target_table_name=self.REFERENCE_TABLE, dataset_id=dataset_id, load_tag=f"{dataset_name}_initial_load", bulk_mode=False, - update_strategy="replace", - waiting_time_to_poll=15, + update_strategy=ARG_DEFAULTS["update_strategy"], + waiting_time_to_poll=ARG_DEFAULTS["waiting_time_to_poll"], tdr=self.tdr, ).run() @@ -308,7 +357,7 @@ def run(self) -> None: self.terra_groups.remove_user_from_group( group=self.auth_group, email=self.current_user_email, - role="admin" + role=ADMIN ) @@ -481,6 +530,7 @@ def run(self) -> list[WorkflowConfigs]: wdls_to_import = args.wdls_to_import notebooks_to_import = args.notebooks_to_import delete_existing_dataset = args.delete_existing_dataset + workspace_version = args.workspace_version if args.workspace_version and args.workspace_version > 1 else None # Validate wdls to import are valid and exclude any that are not if wdls_to_import: @@ -489,12 +539,15 @@ def run(self) -> list[WorkflowConfigs]: if wdl in 
GetWorkflowNames().get_workflow_names() ] - workspace_name = f"{dataset_name}_Staging" + workspace_name = ( + f"{dataset_name}_Staging_v{workspace_version}" if workspace_version else f"{dataset_name}_Staging" + ) auth_group = f"AUTH_{dataset_name}" # Set up Terra, TerraGroups, and TDR classes token = Token(cloud=GCP) - request_util = RunRequest(token=token, max_retries=5, max_backoff_time=60) + request_util = RunRequest( + token=token, max_retries=ARG_DEFAULTS["max_retries"], max_backoff_time=ARG_DEFAULTS["max_backoff_time"]) tdr = TDR(request_util=request_util) terra_groups = TerraGroups(request_util=request_util) terra_workspace = TerraWorkspace( @@ -511,7 +564,8 @@ def run(self) -> list[WorkflowConfigs]: continue_if_exists=continue_if_exists, controlled_access=controlled_access, resource_owners=resource_owners, - resource_members=resource_members + resource_members=resource_members, + workspace_version=workspace_version, ).run() logging.info("Finished setting up Terra workspace") workspace_bucket = f"gs://{terra_workspace.get_workspace_bucket()}" @@ -521,13 +575,14 @@ def run(self) -> list[WorkflowConfigs]: dataset_name=dataset_name, tdr_billing_profile=tdr_billing_profile, phs_id=phs_id, - continue_if_exists=continue_if_exists, + continue_if_exists=True if workspace_version else continue_if_exists, terra_billing_project=terra_billing_project, workspace_name=workspace_name, resource_owners=resource_owners, auth_group=auth_group, controlled_access=controlled_access, - delete_existing_dataset=delete_existing_dataset + delete_existing_dataset=delete_existing_dataset, + workspace_version=workspace_version, ) if delete_existing_dataset: sa_for_dataset_to_delete = dataset_setup.get_sa_for_dataset_to_delete() diff --git a/wdl/SetUpStagingWorkspaceAndDataset/README.md b/wdl/SetUpStagingWorkspaceAndDataset/README.md index 122a0ea..93b7036 100644 --- a/wdl/SetUpStagingWorkspaceAndDataset/README.md +++ b/wdl/SetUpStagingWorkspaceAndDataset/README.md @@ -3,23 +3,24 @@ This 
workflow automates the process of setting up a staging workspace in Terra a ## Inputs Table: -| Input Name | Description | Type | Required | Default | -|------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|----------|---------------------------------------------------------------------------------------------| -| **dataset_name** | The name of the TDR dataset which will be created. | String | Yes | N/A | -| **tdr_billing_profile** | The billing profile of the TDR dataset that will be created. | String | Yes | N/A | -| **terra_billing_project** | The billing project to use for the target Terra workspace. | String | Yes | N/A | -| **controlled_access** | Whether the Terra workspace/TDR dataset should have controlled access | Boolean | Yes | N/A | -| **resource_owners** | Comma separate list of resource owner(s). At least one resource owner is required. Do not include spaces between entries (i.e. use the following format: "user1@broadinstitute.org,user2@broadinstitute.org") | String | Yes | N/A | -| **continue_if_exists** | Whether to continue even if the workspace/dataset already exists. | Boolean | Yes | N/A | -| **delete_existing_dataset** | Delete existing dataset if one already exists. Deleting dataset WILL fail if snapshots still exist. | Boolean | Yes | N/A | -| **current_user_email** | The email of the current user (used for removing current user from workspace) | String | Yes | N/A | -| **dbgap_consent_code** | The dbGaP consent code (if it exists). Optional. | String | No | N/A | -| **phs_id** | The PHS id if it exists. Optional. | String | No | N/A | -| **duos_identifier** | The DUOS identifier (if it exists). Optional | String | No | N/A | -| **wdls_to_import** | A comma separate list of WDLs to import. Optional. Do not include spaces between entries (i.e. 
user the following format: "Workflow1,Workflow2") | String | No | N/A | -| **notebooks_to_import** | A comma separate list of notebooks to import. Optional. Do not include spaces between entries (i.e. user the following format: "Notebook1.ipynb,Notebook2.ipynb") | String | No | N/A | -| **resource_members** | Comma separate list of resource members (if they exist). Optional. Do not include spaces between entries (i.e. use the following format: "user1@broadinstitute.org,user2@broadinstitute.org") | String | No | N/A | -| **docker** | The docker image | String | No | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest | +| Input Name | Description | Type | Required | Default | +|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|----------|---------------------------------------------------------------------------------------------| +| **dataset_name** | The name of the TDR dataset which will be created. | String | Yes | N/A | +| **tdr_billing_profile** | The billing profile of the TDR dataset that will be created. | String | Yes | N/A | +| **terra_billing_project** | The billing project to use for the target Terra workspace. | String | Yes | N/A | +| **controlled_access** | Whether the Terra workspace/TDR dataset should have controlled access | Boolean | Yes | N/A | +| **resource_owners** | Comma separate list of resource owner(s). At least one resource owner is required. Do not include spaces between entries (i.e. use the following format: "user1@broadinstitute.org,user2@broadinstitute.org") | String | Yes | N/A | +| **continue_if_exists** | Whether to continue even if the workspace/dataset already exists. | Boolean | Yes | N/A | +| **delete_existing_dataset** | Delete existing dataset if one already exists. 
Deleting dataset WILL fail if snapshots still exist.                                                                                                           | Boolean | Yes      | N/A                                                                                         | +| **current_user_email**      | The email of the current user (used for removing current user from workspace)                                                                                                                                 | String  | Yes      | N/A                                                                                         | +| **dbgap_consent_code**      | The dbGaP consent code (if it exists). Optional.                                                                                                                                                              | String  | No       | N/A                                                                                         | +| **phs_id**                  | The PHS id if it exists. Optional.                                                                                                                                                                            | String  | No       | N/A                                                                                         | +| **duos_identifier**         | The DUOS identifier (if it exists). Optional                                                                                                                                                                  | String  | No       | N/A                                                                                         | +| **wdls_to_import**          | A comma-separated list of WDLs to import. Optional. Do not include spaces between entries (i.e. use the following format: "Workflow1,Workflow2")                                                               | String  | No       | N/A                                                                                         | +| **notebooks_to_import**     | A comma-separated list of notebooks to import. Optional. Do not include spaces between entries (i.e. use the following format: "Notebook1.ipynb,Notebook2.ipynb")                                              | String  | No       | N/A                                                                                         | +| **resource_members**        | Comma-separated list of resource members (if they exist). Optional. Do not include spaces between entries (i.e. use the following format: "user1@broadinstitute.org,user2@broadinstitute.org")                 | String  | No       | N/A                                                                                         | +| **workspace_version**       | An optional version number to append to the workspace name. This is useful when one batch of data has already been imported to a dataset and another batch needs to be imported before publicizing the data.  | Int     | No       | N/A                                                                                         | +| **docker**                  | The docker image                                                                                                                                                                                              | String  | No       | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest | ## Outputs Table: This script does not generate any direct outputs. But the target workspace and dataset will be generated if the workflow completes successfully. The logs, which include any errors or warnings, can be reviewed in the stderr file for additional details about the process.
diff --git a/wdl/SetUpStagingWorkspaceAndDataset/SetUpStagingWorkspaceAndDataset.wdl b/wdl/SetUpStagingWorkspaceAndDataset/SetUpStagingWorkspaceAndDataset.wdl index cfc2f25..62c1ea1 100644 --- a/wdl/SetUpStagingWorkspaceAndDataset/SetUpStagingWorkspaceAndDataset.wdl +++ b/wdl/SetUpStagingWorkspaceAndDataset/SetUpStagingWorkspaceAndDataset.wdl @@ -17,6 +17,7 @@ workflow SetUpStagingWorkspaceAndDataset { String? wdls_to_import String? notebooks_to_import String? docker + Int? workspace_version } String docker_image = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"]) @@ -37,7 +38,8 @@ workflow SetUpStagingWorkspaceAndDataset { wdls_to_import = wdls_to_import, notebooks_to_import = notebooks_to_import, delete_existing_dataset = delete_existing_dataset, - docker = docker_image + docker = docker_image, + workspace_version = workspace_version } } @@ -58,6 +60,7 @@ task SetUpStagingEnvironments { String? wdls_to_import String? notebooks_to_import String docker + Int? workspace_version } command <<< @@ -75,7 +78,8 @@ task SetUpStagingEnvironments { ~{"--duos_identifier " + duos_identifier} \ ~{"--wdls_to_import " + wdls_to_import} \ ~{"--notebooks_to_import " + notebooks_to_import} \ - ~{if delete_existing_dataset then "--delete_existing_dataset" else ""} + ~{if delete_existing_dataset then "--delete_existing_dataset" else ""} \ + ~{"--workspace_version " + workspace_version} >>> runtime {