diff --git a/.dockstore.yml b/.dockstore.yml
index 3cb4044..dc79ad9 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -132,3 +132,10 @@ workflows:
       readMePath: /wdl/GetTdrSchemaJson/README.md
       testParameterFiles:
         - /wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl
+
+  - name: DeleteTdrRows
+    subclass: WDL
+    primaryDescriptorPath: /wdl/DeleteTdrRows/DeleteTdrRows.wdl
+    readMePath: /wdl/DeleteTdrRows/README.md
+    testParameterFiles:
+      - /wdl/DeleteTdrRows/DeleteTdrRows.wdl
diff --git a/python/delete_tdr_rows.py b/python/delete_tdr_rows.py
new file mode 100644
index 0000000..f008914
--- /dev/null
+++ b/python/delete_tdr_rows.py
@@ -0,0 +1,120 @@
+from argparse import ArgumentParser, Namespace
+
+from utils import GCP
+from utils.requests_utils.request_util import RunRequest
+from utils.tdr_utils.tdr_api_utils import TDR
+from utils.token_util import Token
+import logging
+
+logging.basicConfig(
+    format="%(levelname)s: %(asctime)s : %(message)s", level=logging.INFO
+)
+
+CLOUD_TYPE = GCP
+
+
+def get_args() -> Namespace:
+    parser = ArgumentParser(
+        description="Delete rows from a TDR dataset table")
+    parser.add_argument("--dataset_id", "-i", required=True)
+    parser.add_argument("--table", "-t", required=True)
+    parser.add_argument("--ids_to_delete_file", "-if", help="The file containing the ids to delete",
+                        required=True)
+    parser.add_argument("--id_column_name", "-ic", help="The column name of the id to delete",
+                        required=True)
+    parser.add_argument("--delete_files", "-df", help="Delete the files associated with the rows",
+                        action="store_true")
+    return parser.parse_args()
+
+
+class GetRowAndFileInfo:
+    def __init__(self, ids_to_delete: list[str], id_column_name: str, dataset_id: str, table_name: str, tdr: TDR):
+        self.ids_to_delete = ids_to_delete
+        self.id_column_name = id_column_name
+        self.dataset_id = dataset_id
+        self.table_name = table_name
+        self.tdr = tdr
+
+    def _fetch_file_ref_columns(self) -> list[str]:
+        table_schema = self.tdr.get_table_schema_info(dataset_id=self.dataset_id, table_name=self.table_name)
+        return [col['name'] for col in table_schema['columns'] if col['datatype'] == 'fileref']
+
+    def _log_rows_found_info(self, found_row_ids: list[str], file_uuids: list[str]) -> None:
+        logging.info(f"Found {len(found_row_ids)} rows to delete")
+        not_found_ids = set(self.ids_to_delete) - set(found_row_ids)
+        if not_found_ids:
+            logging.warning(
+                f"Could not find the following {len(not_found_ids)} ids in table {self.table_name}: {not_found_ids}"
+            )
+        logging.info(f"Found {len(file_uuids)} files linked to the rows to delete")
+
+    def run(self) -> tuple[list[str], list[str]]:
+        table_metrics = self.tdr.get_dataset_table_metrics(dataset_id=self.dataset_id, target_table_name=self.table_name)
+        # tdr_row_ids to be deleted
+        tdr_row_ids = []
+        # file uuids to be deleted later if the option is used
+        file_uuids = []
+        # Used to log the ids that were not found
+        found_row_ids = []
+
+        # Get the columns whose datatype is fileref
+        file_ref_columns = self._fetch_file_ref_columns()
+
+        for row in table_metrics:
+            store = False
+            row_file_uuids = []
+            tdr_row_id = row['datarepo_row_id']
+            for column in row:
+                # If the column is a fileref, store the file_uuid
+                if column in file_ref_columns:
+                    row_file_uuids.append(row[column])
+                # If the column is the id column, check if the id is in the ids_to_delete_file
+                if column == self.id_column_name:
+                    if row[column] in self.ids_to_delete:
+                        found_row_ids.append(row[column])
+                        store = True
+            # If the row is to be deleted, store the file_uuids and tdr_row_id
+            if store:
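+                # Files are only queued for deletion when their row matched,
+                # so unmatched rows never contribute file uuids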
+                file_uuids.extend(row_file_uuids)
+                tdr_row_ids.append(tdr_row_id)
+        self._log_rows_found_info(found_row_ids, file_uuids)
+        return tdr_row_ids, file_uuids
+
+
+if __name__ == '__main__':
+    args = get_args()
+    dataset_id = args.dataset_id
+    table_name = args.table
+    ids_to_delete_file = args.ids_to_delete_file
+    id_column_name = args.id_column_name
+    delete_files = args.delete_files
+
+    with open(ids_to_delete_file, 'r') as f:
+        ids_to_delete = list(set(f.read().splitlines()))
+    logging.info(f"Found {len(ids_to_delete)} ids in {ids_to_delete_file} to delete")
+
+    token = Token(cloud=CLOUD_TYPE)
+    request_util = RunRequest(token=token)
+    tdr = TDR(request_util=request_util)
+
+    # Get the rows to delete and the file_uuids
+    tdr_rows_to_delete, file_uuids = GetRowAndFileInfo(
+        ids_to_delete=ids_to_delete,
+        id_column_name=id_column_name,
+        dataset_id=dataset_id,
+        table_name=table_name,
+        tdr=tdr
+    ).run()
+
+    if tdr_rows_to_delete:
+        tdr.soft_delete_entries(dataset_id=dataset_id, table_name=table_name, datarepo_row_ids=tdr_rows_to_delete)
+    if delete_files:
+        if file_uuids:
+            tdr.delete_files(
+                file_ids=file_uuids,
+                dataset_id=dataset_id
+            )
+        else:
+            logging.info("No files to delete")
diff --git a/python/utils/tdr_utils/tdr_api_utils.py b/python/utils/tdr_utils/tdr_api_utils.py
index 108c37b..d89eb42 100644
--- a/python/utils/tdr_utils/tdr_api_utils.py
+++ b/python/utils/tdr_utils/tdr_api_utils.py
@@ -176,7 +176,7 @@ def delete_files(
             self,
             file_ids: list[str],
             dataset_id: str,
-            batch_size_to_delete_files: int = 100,
+            batch_size_to_delete_files: int = 250,
             check_interval: int = 15) -> None:
         """
         Delete multiple files from a dataset in batches and monitor delete jobs until completion for each batch.
diff --git a/wdl/DeleteTdrRows/DeleteTdrRows.wdl b/wdl/DeleteTdrRows/DeleteTdrRows.wdl
new file mode 100644
index 0000000..966461d
--- /dev/null
+++ b/wdl/DeleteTdrRows/DeleteTdrRows.wdl
@@ -0,0 +1,50 @@
+version 1.0
+
+workflow DeleteTdrRows {
+    input {
+        String dataset_id
+        String tdr_table_name
+        Array[String] ids_to_delete
+        String id_column_name
+        Boolean delete_files
+        String? docker
+    }
+
+    String docker_name = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"])
+
+    call DeleteTdrRowsTask {
+        input:
+            dataset_id=dataset_id,
+            table=tdr_table_name,
+            id_column_name=id_column_name,
+            delete_files=delete_files,
+            ids_to_delete=ids_to_delete,
+            docker_name=docker_name
+    }
+}
+
+task DeleteTdrRowsTask {
+    input {
+        String dataset_id
+        String table
+        Array[String] ids_to_delete
+        String id_column_name
+        Boolean delete_files
+        String docker_name
+    }
+
+    File ids_to_delete_file = write_lines(ids_to_delete)
+
+    command <<<
+        python /etc/terra_utils/python/delete_tdr_rows.py \
+        --dataset_id ~{dataset_id} \
+        --table ~{table} \
+        --ids_to_delete_file ~{ids_to_delete_file} \
+        --id_column_name ~{id_column_name} \
+        ~{if delete_files then "--delete_files" else ""}
+    >>>
+
+    runtime {
+        docker: docker_name
+    }
+}
diff --git a/wdl/DeleteTdrRows/README.md b/wdl/DeleteTdrRows/README.md
new file mode 100644
index 0000000..582d67c
--- /dev/null
+++ b/wdl/DeleteTdrRows/README.md
@@ -0,0 +1,29 @@
+# WDL Input Overview
+This WDL deletes rows from a table in a TDR dataset. You specify the table and the ids of the rows to delete, and can optionally delete the files linked to those rows.
+
+If the data or files being deleted are still part of an active snapshot, the delete jobs will fail. Make sure to delete any such snapshots first.
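+
+For reference, the task below runs the underlying Python script with a command along these lines (the dataset id, table name, and column name here are illustrative placeholders):
+
+```bash
+python /etc/terra_utils/python/delete_tdr_rows.py \
+    --dataset_id 11111111-2222-3333-4444-555555555555 \
+    --table sample \
+    --ids_to_delete_file ids_to_delete.txt \
+    --id_column_name sample_id \
+    --delete_files
+```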
+
+## Inputs Table:
+| Input Name         | Description                                                    | Type          | Required | Default                                                                                       |
+|--------------------|----------------------------------------------------------------|---------------|----------|-----------------------------------------------------------------------------------------------|
+| **dataset_id**     | The id of the TDR dataset containing the table                 | String        | Yes      | N/A                                                                                           |
+| **tdr_table_name** | The name of the table within the dataset                       | String        | Yes      | N/A                                                                                           |
+| **ids_to_delete**  | The ids to look for and delete in the table                    | Array[String] | Yes      | N/A                                                                                           |
+| **id_column_name** | The name of the column containing the ids                      | String        | Yes      | N/A                                                                                           |
+| **delete_files**   | If true, also delete the files referenced by the deleted rows  | Boolean       | Yes      | N/A                                                                                           |
+| **docker**         | Specifies a custom Docker image to use                         | String        | No       | "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest" |
+
+
+## Outputs Table:
+This workflow does not generate any outputs directly. However, logs will be provided to track the progress of the deletion, including how many rows matched the provided ids, any ids that could not be found, and the status of the soft-delete and file-delete jobs. You can review the logs in the stderr file for details.
diff --git a/wdl/DeleteTdrRows/template_inputs.json b/wdl/DeleteTdrRows/template_inputs.json
new file mode 100644
index 0000000..98715d2
--- /dev/null
+++ b/wdl/DeleteTdrRows/template_inputs.json
@@ -0,0 +1,8 @@
+{
+  "DeleteTdrRows.dataset_id": "String",
+  "DeleteTdrRows.tdr_table_name": "String",
+  "DeleteTdrRows.id_column_name": "String",
+  "DeleteTdrRows.delete_files": "Boolean",
+  "DeleteTdrRows.ids_to_delete": "Array[String]",
+  "DeleteTdrRows.docker": "String"
+}