diff --git a/python/delete_tdr_rows.py b/python/delete_tdr_rows.py index f008914..d49da13 100644 --- a/python/delete_tdr_rows.py +++ b/python/delete_tdr_rows.py @@ -39,7 +39,7 @@ def _fetch_file_ref_columns(self) -> list[str]: table_schema = self.tdr.get_table_schema_info(dataset_id=self.dataset_id, table_name=self.table_name) return [col['name'] for col in table_schema['columns'] if col['datatype'] == 'fileref'] - def _log_rows_found_info(self, found_row_ids: list[str], file_uuids: list[str]) -> None: + def _log_rows_found_info(self, found_row_ids: list[str], file_uuids: set[str]) -> None: logging.info(f"Found {len(found_row_ids)} rows to delete") not_found_ids = set(self.ids_to_delete) - set(found_row_ids) if not_found_ids: @@ -48,12 +48,12 @@ def _log_rows_found_info(self, found_row_ids: list[str], file_uuids: list[str]) ) logging.info(f"Found {len(file_uuids)} files linked to the rows to delete") - def run(self) -> tuple[list[str], list[str]]: + def run(self) -> tuple[list[str], set[str]]: table_metrics = tdr.get_dataset_table_metrics(dataset_id=dataset_id, target_table_name=table_name) # tdr_row_ids to be deleted tdr_row_ids = [] # file uuids to be deleted later if options used - file_uuids = [] + file_uuids = set() # Used to log the ids that were not found found_row_ids = [] @@ -67,7 +67,11 @@ def run(self) -> tuple[list[str], list[str]]: tdr_row_id = row['datarepo_row_id'] # If the column is a fileref, store the file_uuid if column in file_ref_columns: - row_file_uuids.append(row[column]) + # If the column is a list, store all the file_uuids + if isinstance(row[column], list): + row_file_uuids.extend(row[column]) + else: + row_file_uuids.append(row[column]) # If the column is the id column, check if the id is in the ids_to_delete_file if column == self.id_column_name: if row[column] in self.ids_to_delete: @@ -75,7 +79,7 @@ def run(self) -> tuple[list[str], list[str]]: store = True # If the row is to be deleted, store the file_uuids and tdr_row_id if store: - file_uuids.extend(row_file_uuids) + file_uuids.update(row_file_uuids) tdr_row_ids.append(tdr_row_id) self._log_rows_found_info(found_row_ids, file_uuids) return tdr_row_ids, file_uuids @@ -111,7 +115,7 @@ def run(self) -> tuple[list[str], list[str]]: if delete_files: if file_uuids: tdr.delete_files( - file_ids=file_uuids, + file_ids=list(file_uuids), dataset_id=dataset_id ) else: diff --git a/wdl/DeleteTdrRows/README.md b/wdl/DeleteTdrRows/README.md index 582d67c..4b11fb7 100644 --- a/wdl/DeleteTdrRows/README.md +++ b/wdl/DeleteTdrRows/README.md @@ -1,17 +1,19 @@ # WDL Input Overview This WDl will delete rows from a table in a TDR. You can specify the table and the rows to delete. Optionally can delete files linked to the rows. -If data / files being deleted are STILL part of active snapshot you will run into issues. Make sure to delete the snapshots first. +## Notes +* If data / files being deleted are STILL part of active snapshot you will run into issues. Make sure to delete the snapshots first. +* This should kick off ONE workflow for all entities. Make sure you are selecting a SET containing all entities. ## Inputs Table: -| Input Name | Description | Type | Required | Default | -|--------------------------|-------------------------------------------------------------------|---------------|----------|-----------------------------------------------------------------------------------------------| -| **dataset_id** | dataset id where table exists | String | Yes | N/A | -| **tdr_table_name** | Table name in dataset | String | Yes | N/A | -| **ids_to_delete** | list of ids to look for and delete in table | Array[String] | Yes | N/A | -| **id_column_name** | Name of column where ids exist | String | Yes | N/A | -| **delete_files** | Use if want to delete files that are referenced in rows to delete | Boolean | Yes | N/A | -| **docker** | Specifies a custom Docker image to use. Optional. | String | No | "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest" | +| Input Name | Description | Type | Required | Default | +|--------------------------|--------------------------------------------------------------------------------|---------------|----------|-----------------------------------------------------------------------------------------------| +| **dataset_id** | dataset id where table exists | String | Yes | N/A | +| **tdr_table_name** | Table name in dataset | String | Yes | N/A | +| **ids_to_delete** | list of ids to look for and delete in table. This should be selected as a set. | Array[String] | Yes | N/A | +| **id_column_name** | Name of column where ids exist | String | Yes | N/A | +| **delete_files** | Use if want to delete files that are referenced in rows to delete | Boolean | Yes | N/A | +| **docker** | Specifies a custom Docker image to use. Optional. | String | No | "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest" | ## Outputs Table: