Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
sahakiann committed Jan 21, 2025
1 parent fbdaa42 commit 5f7dd0f
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 33 deletions.
61 changes: 47 additions & 14 deletions python/set_up_staging_workspace_and_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from utils.requests_utils.request_util import RunRequest
from utils.token_util import Token
from utils.gcp_utils import GCPCloudFunctions
from utils import GCP, comma_separated_list
from utils import GCP, comma_separated_list, ARG_DEFAULTS


logging.basicConfig(
format="%(levelname)s: %(asctime)s : %(message)s", level=logging.INFO
Expand Down Expand Up @@ -88,6 +89,13 @@ def get_args() -> Namespace:
action="store_true",
help="If dataset already exists, delete it before creating a new one",
)
parser.add_argument(
"--workspace_version",
help="The version of the workspace. This should be used when more data needs to be "
"uploaded to a dataset, but it's not ready to be made public yet.",
required=False,
type=int,
)
return parser.parse_args()


Expand All @@ -100,7 +108,8 @@ def __init__(
continue_if_exists: bool,
controlled_access: bool,
resource_owners: list[str],
resource_members: Optional[list[str]]
resource_members: Optional[list[str]],
workspace_version: Optional[int],
):
self.terra_workspace = terra_workspace
self.terra_groups = terra_groups
Expand All @@ -109,10 +118,12 @@ def __init__(
self.controlled_access = controlled_access
self.resource_owners = resource_owners
self.resource_members = resource_members
self.workspace_version = workspace_version

def _set_up_access_group(self) -> None:
logging.info(f"Creating group {self.auth_group}")
self.terra_groups.create_group(group_name=self.auth_group, continue_if_exists=self.continue_if_exists)
continue_if_exists = True if self.workspace_version else self.continue_if_exists
self.terra_groups.create_group(group_name=self.auth_group, continue_if_exists=continue_if_exists)
for user in self.resource_owners:
self.terra_groups.add_user_to_group(email=user, group=self.auth_group, role=ADMIN)
if self.resource_members:
Expand Down Expand Up @@ -199,7 +210,8 @@ def __init__(
controlled_access: bool,
terra_billing_project: str,
delete_existing_dataset: bool,
phs_id: Optional[str] = None
workspace_version: Optional[int],
phs_id: Optional[str] = None,
):
self.tdr = tdr
self.dataset_name = dataset_name
Expand All @@ -212,6 +224,7 @@ def __init__(
self.auth_group = auth_group
self.delete_existing_dataset = delete_existing_dataset
self.controlled_access = controlled_access
self.workspace_version = workspace_version

def _create_dataset_properties(self) -> dict:
additional_properties = {
Expand All @@ -225,17 +238,32 @@ def _create_dataset_properties(self) -> dict:
return additional_properties

def _add_row_to_table(self, dataset_id: str) -> None:
    """Ingest reference metadata rows into the dataset's reference table.

    Records the staging workspace path and the authorization group, plus a
    versioned staging-workspace entry when a workspace version was supplied.

    Args:
        dataset_id: ID of the TDR dataset to ingest the reference rows into.
    """
    ingest_records = [
        {
            "key": "Staging Workspace",
            "value": f"{self.terra_billing_project}/{self.workspace_name}"
        },
        {
            "key": "Authorization Group",
            "value": self.auth_group
        }
    ]
    # Bug fix: original checked the bare name `workspace_version`, which is not
    # defined in this scope — the instance attribute must be used.
    if self.workspace_version:
        ingest_records.append(
            {
                "key": f"Staging Workspace Version {self.workspace_version}",
                "value": f"{self.terra_billing_project}/{self.workspace_name}"
            }
        )

    StartAndMonitorIngest(
        ingest_records=ingest_records,
        target_table_name=self.REFERENCE_TABLE,
        dataset_id=dataset_id,
        # Bug fix: original used bare `dataset_name`; use the attribute set in __init__.
        load_tag=f"{self.dataset_name}_initial_load",
        bulk_mode=False,
        update_strategy="replace",
        waiting_time_to_poll=ARG_DEFAULTS["waiting_time_to_poll"],
        tdr=self.tdr,
    ).run()

Expand Down Expand Up @@ -308,7 +336,7 @@ def run(self) -> None:
self.terra_groups.remove_user_from_group(
group=self.auth_group,
email=self.current_user_email,
role="admin"
role=ADMIN
)


Expand Down Expand Up @@ -481,6 +509,7 @@ def run(self) -> list[WorkflowConfigs]:
wdls_to_import = args.wdls_to_import
notebooks_to_import = args.notebooks_to_import
delete_existing_dataset = args.delete_existing_dataset
workspace_version = args.workspace_version if args.workspace_version and args.workspace_version > 1 else None

# Validate wdls to import are valid and exclude any that are not
if wdls_to_import:
Expand All @@ -489,7 +518,9 @@ def run(self) -> list[WorkflowConfigs]:
if wdl in GetWorkflowNames().get_workflow_names()
]

workspace_name = f"{dataset_name}_Staging"
workspace_name = (
f"{dataset_name}_Staging_v{workspace_version}" if workspace_version else f"{dataset_name}_Staging"
)
auth_group = f"AUTH_{dataset_name}"

# Set up Terra, TerraGroups, and TDR classes
Expand All @@ -511,7 +542,8 @@ def run(self) -> list[WorkflowConfigs]:
continue_if_exists=continue_if_exists,
controlled_access=controlled_access,
resource_owners=resource_owners,
resource_members=resource_members
resource_members=resource_members,
workspace_version=workspace_version,
).run()
logging.info("Finished setting up Terra workspace")
workspace_bucket = f"gs://{terra_workspace.get_workspace_bucket()}"
Expand All @@ -521,13 +553,14 @@ def run(self) -> list[WorkflowConfigs]:
dataset_name=dataset_name,
tdr_billing_profile=tdr_billing_profile,
phs_id=phs_id,
continue_if_exists=continue_if_exists,
continue_if_exists=True if workspace_version else continue_if_exists,
terra_billing_project=terra_billing_project,
workspace_name=workspace_name,
resource_owners=resource_owners,
auth_group=auth_group,
controlled_access=controlled_access,
delete_existing_dataset=delete_existing_dataset
delete_existing_dataset=delete_existing_dataset,
workspace_version=workspace_version,
)
if delete_existing_dataset:
sa_for_dataset_to_delete = dataset_setup.get_sa_for_dataset_to_delete()
Expand Down
35 changes: 18 additions & 17 deletions wdl/SetUpStagingWorkspaceAndDataset/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,24 @@ This workflow automates the process of setting up a staging workspace in Terra a


## Inputs Table:
| Input Name | Description | Type | Required | Default |
|------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|----------|---------------------------------------------------------------------------------------------|
| **dataset_name** | The name of the TDR dataset which will be created. | String | Yes | N/A |
| **tdr_billing_profile** | The billing profile of the TDR dataset that will be created. | String | Yes | N/A |
| **terra_billing_project** | The billing project to use for the target Terra workspace. | String | Yes | N/A |
| **controlled_access** | Whether the Terra workspace/TDR dataset should have controlled access | Boolean | Yes | N/A |
| **resource_owners** | Comma separate list of resource owner(s). At least one resource owner is required. Do not include spaces between entries (i.e. use the following format: "user1@broadinstitute.org,user2@broadinstitute.org") | String | Yes | N/A |
| **continue_if_exists** | Whether to continue even if the workspace/dataset already exists. | Boolean | Yes | N/A |
| **delete_existing_dataset** | Delete existing dataset if one already exists. Deleting dataset WILL fail if snapshots still exist. | Boolean | Yes | N/A |
| **current_user_email** | The email of the current user (used for removing current user from workspace) | String | Yes | N/A |
| **dbgap_consent_code** | The dbGaP consent code (if it exists). Optional. | String | No | N/A |
| **phs_id** | The PHS id if it exists. Optional. | String | No | N/A |
| **duos_identifier** | The DUOS identifier (if it exists). Optional | String | No | N/A |
| **wdls_to_import** | A comma separate list of WDLs to import. Optional. Do not include spaces between entries (i.e. user the following format: "Workflow1,Workflow2") | String | No | N/A |
| **notebooks_to_import** | A comma separate list of notebooks to import. Optional. Do not include spaces between entries (i.e. user the following format: "Notebook1.ipynb,Notebook2.ipynb") | String | No | N/A |
| **resource_members** | Comma separate list of resource members (if they exist). Optional. Do not include spaces between entries (i.e. use the following format: "user1@broadinstitute.org,user2@broadinstitute.org") | String | No | N/A |
| **docker** | The docker image | String | No | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest |
| Input Name | Description | Type | Required | Default |
|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|----------|---------------------------------------------------------------------------------------------|
| **dataset_name** | The name of the TDR dataset which will be created. | String | Yes | N/A |
| **tdr_billing_profile** | The billing profile of the TDR dataset that will be created. | String | Yes | N/A |
| **terra_billing_project** | The billing project to use for the target Terra workspace. | String | Yes | N/A |
| **controlled_access** | Whether the Terra workspace/TDR dataset should have controlled access | Boolean | Yes | N/A |
| **resource_owners**         | Comma-separated list of resource owner(s). At least one resource owner is required. Do not include spaces between entries (i.e. use the following format: "user1@broadinstitute.org,user2@broadinstitute.org")  | String  | Yes      | N/A                                                                                         |
| **continue_if_exists** | Whether to continue even if the workspace/dataset already exists. | Boolean | Yes | N/A |
| **delete_existing_dataset** | Delete existing dataset if one already exists. Deleting dataset WILL fail if snapshots still exist. | Boolean | Yes | N/A |
| **current_user_email** | The email of the current user (used for removing current user from workspace) | String | Yes | N/A |
| **dbgap_consent_code** | The dbGaP consent code (if it exists). Optional. | String | No | N/A |
| **phs_id** | The PHS id if it exists. Optional. | String | No | N/A |
| **duos_identifier** | The DUOS identifier (if it exists). Optional | String | No | N/A |
| **wdls_to_import**          | A comma-separated list of WDLs to import. Optional. Do not include spaces between entries (i.e. use the following format: "Workflow1,Workflow2")                                                                | String  | No       | N/A                                                                                         |
| **notebooks_to_import**     | A comma-separated list of notebooks to import. Optional. Do not include spaces between entries (i.e. use the following format: "Notebook1.ipynb,Notebook2.ipynb")                                               | String  | No       | N/A                                                                                         |
| **resource_members**        | Comma-separated list of resource members (if they exist). Optional. Do not include spaces between entries (i.e. use the following format: "user1@broadinstitute.org,user2@broadinstitute.org")                  | String  | No       | N/A                                                                                         |
| **workspace_version** | An optional version number to append to the workspace name. This is useful when one batch of data has already been imported to a dataset and another batch needs to be imported before publicizing the data. | Int | No | N/A |
| **docker** | The docker image | String | No | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest |

## Outputs Table:
This script does not generate any direct outputs. But the target workspace and dataset will be generated if the workflow completes successfully. The logs, which include any errors or warnings, can be reviewed in the stderr file for additional details about the process.
Expand Down
Loading

0 comments on commit 5f7dd0f

Please sign in to comment.