Skip to content

Commit

Permalink
Merge pull request #196 from broadinstitute/sn_POD-2373_delete_tdr_ro…
Browse files Browse the repository at this point in the history
…ws_script

Adding script and wdl to delete rows from tdr dataset
  • Loading branch information
snovod authored Jan 17, 2025
2 parents 6bb3834 + 42e694b commit e70412a
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 1 deletion.
7 changes: 7 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,10 @@ workflows:
readMePath: /wdl/GetTdrSchemaJson/README.md
testParameterFiles:
- /wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl

- name: DeleteTdrRows
subclass: WDL
primaryDescriptorPath: /wdl/DeleteTdrRows/DeleteTdrRows.wdl
readMePath: /wdl/DeleteTdrRows/README.md
testParameterFiles:
- /wdl/DeleteTdrRows/DeleteTdrRows.wdl
118 changes: 118 additions & 0 deletions python/delete_tdr_rows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from argparse import ArgumentParser, Namespace

from utils import GCP
from utils.requests_utils.request_util import RunRequest
from utils.tdr_utils.tdr_api_utils import TDR
from utils.token_util import Token
import logging

# Configure the root logger once at import time: INFO level, "LEVEL: timestamp : message".
logging.basicConfig(
    format="%(levelname)s: %(asctime)s : %(message)s", level=logging.INFO
)

# Cloud provider flag handed to Token below; GCP comes from the project's utils package.
CLOUD_TYPE = GCP


def get_args() -> Namespace:
    """Parse and return the command-line arguments for the row-deletion script."""
    parser = ArgumentParser(description="Delete rows from TDR dataset table")
    parser.add_argument("--dataset_id", "-i", required=True)
    parser.add_argument("--table", "-t", required=True)
    parser.add_argument(
        "--ids_to_delete_file", "-if",
        help="The file containing the ids to delete",
        required=True,
    )
    parser.add_argument(
        "--id_column_name", "-ic",
        help="The column name of the id to delete",
        required=True,
    )
    parser.add_argument(
        "--delete_files", "-df",
        help="Delete the files associated with the rows",
        action="store_true",
    )
    return parser.parse_args()


class GetRowAndFileInfo:
    """Find the TDR rows whose id column matches ids_to_delete and collect linked file uuids."""

    def __init__(self, ids_to_delete: list[str], id_column_name: str, dataset_id: str, table_name: str, tdr: "TDR"):
        """
        Args:
            ids_to_delete: Id values (read from the input file) to look for in the table.
            id_column_name: Column whose values are matched against ids_to_delete.
            dataset_id: Uuid of the TDR dataset.
            table_name: Name of the table within the dataset.
            tdr: TDR API client used for schema and metrics lookups.
        """
        self.ids_to_delete = ids_to_delete
        self.id_column_name = id_column_name
        self.dataset_id = dataset_id
        self.table_name = table_name
        self.tdr = tdr

    def _fetch_file_ref_columns(self) -> list[str]:
        """Return the names of all columns in the table whose datatype is 'fileref'."""
        table_schema = self.tdr.get_table_schema_info(dataset_id=self.dataset_id, table_name=self.table_name)
        return [col['name'] for col in table_schema['columns'] if col['datatype'] == 'fileref']

    def _log_rows_found_info(self, found_row_ids: list[str], file_uuids: list[str]) -> None:
        """Log the matched row/file counts and warn about any requested ids that were not found."""
        logging.info(f"Found {len(found_row_ids)} rows to delete")
        not_found_ids = set(self.ids_to_delete) - set(found_row_ids)
        if not_found_ids:
            logging.warning(
                f"Could not find the following {len(not_found_ids)} ids in table {self.table_name}: {not_found_ids}"
            )
        logging.info(f"Found {len(file_uuids)} files linked to the rows to delete")

    def run(self) -> tuple[list[str], list[str]]:
        """Scan the table and return (datarepo_row_ids to delete, file uuids referenced by those rows).

        Bug fix: this previously read the module-level globals ``tdr``, ``dataset_id`` and
        ``table_name`` instead of the values stored on the instance, so the class only
        worked when invoked from this script's __main__ block with matching global names.
        """
        table_metrics = self.tdr.get_dataset_table_metrics(
            dataset_id=self.dataset_id, target_table_name=self.table_name
        )
        # datarepo_row_ids of rows to soft delete
        tdr_row_ids = []
        # file uuids to delete afterwards if the delete-files option was used
        file_uuids = []
        # ids actually matched; used by _log_rows_found_info to warn about misses
        found_row_ids = []

        # Columns whose datatype is fileref — their cell values are file uuids.
        file_ref_columns = self._fetch_file_ref_columns()

        for row in table_metrics:
            store = False
            row_file_uuids = []
            # Hoisted out of the column loop — it is the same for every column of the row.
            tdr_row_id = row['datarepo_row_id']
            for column in row:
                # If the column is a fileref, remember its value in case the row is deleted.
                # NOTE(review): the raw cell value is kept even if it is empty/None —
                # confirm downstream delete_files tolerates that.
                if column in file_ref_columns:
                    row_file_uuids.append(row[column])
                # If this is the id column, check whether the row was requested for deletion.
                if column == self.id_column_name:
                    if row[column] in self.ids_to_delete:
                        found_row_ids.append(row[column])
                        store = True
            # If the row is to be deleted, keep its file uuids and its datarepo row id.
            if store:
                file_uuids.extend(row_file_uuids)
                tdr_row_ids.append(tdr_row_id)
        self._log_rows_found_info(found_row_ids, file_uuids)
        return tdr_row_ids, file_uuids


if __name__ == '__main__':
    args = get_args()
    # NOTE: GetRowAndFileInfo.run() as written reads the globals tdr, dataset_id and
    # table_name directly, so these exact names must stay in place at module scope.
    dataset_id = args.dataset_id
    table_name = args.table
    ids_to_delete_file = args.ids_to_delete_file
    id_column_name = args.id_column_name
    delete_files = args.delete_files

    # One id per line; set() de-duplicates before deletion.
    with open(ids_to_delete_file, 'r') as f:
        ids_to_delete = list(set(f.read().splitlines()))
    logging.info(f"Found {len(ids_to_delete)} ids in {ids_to_delete_file} to delete")

    # Build the authenticated TDR client (GCP credentials via Token).
    token = Token(cloud=CLOUD_TYPE)
    request_util = RunRequest(token=token)
    tdr = TDR(request_util=request_util)

    # Get the rows to delete and the file_uuids
    tdr_rows_to_delete, file_uuids = GetRowAndFileInfo(
        ids_to_delete=ids_to_delete,
        id_column_name=id_column_name,
        dataset_id=dataset_id,
        table_name=table_name,
        tdr=tdr
    ).run()

    # Soft delete the matched rows first, then (optionally) the files they referenced.
    if tdr_rows_to_delete:
        tdr.soft_delete_entries(dataset_id=dataset_id, table_name=table_name, datarepo_row_ids=tdr_rows_to_delete)
    if delete_files:
        if file_uuids:
            tdr.delete_files(
                file_ids=file_uuids,
                dataset_id=dataset_id
            )
        else:
            logging.info("No files to delete")
2 changes: 1 addition & 1 deletion python/utils/tdr_utils/tdr_api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def delete_files(
self,
file_ids: list[str],
dataset_id: str,
batch_size_to_delete_files: int = 100,
batch_size_to_delete_files: int = 250,
check_interval: int = 15) -> None:
"""
Delete multiple files from a dataset in batches and monitor delete jobs until completion for each batch.
Expand Down
50 changes: 50 additions & 0 deletions wdl/DeleteTdrRows/DeleteTdrRows.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
version 1.0

# Deletes rows (and optionally their referenced files) from a TDR dataset table.
workflow DeleteTdrRows {
    input {
        String dataset_id
        String tdr_table_name
        Array[String] ids_to_delete
        String id_column_name
        Boolean delete_files
        String? docker
    }

    # Fall back to the ops-toolbox image when no custom docker is supplied.
    String docker_name = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"])

    call DeleteTdrRowsTask {
        input:
            dataset_id=dataset_id,
            table=tdr_table_name,
            id_column_name=id_column_name,
            delete_files=delete_files,
            ids_to_delete=ids_to_delete,
            docker_name=docker_name
    }
}

# Runs delete_tdr_rows.py inside the given docker image.
task DeleteTdrRowsTask {
    input {
        String dataset_id
        String table
        Array[String] ids_to_delete
        String id_column_name
        Boolean delete_files
        String docker_name
    }

    # write_lines materializes the id list as a one-id-per-line file for the script.
    # Renamed from ids_to_delete: declaring `File ids_to_delete` redeclared the input
    # name in the same scope, which is invalid WDL.
    File ids_to_delete_file = write_lines(ids_to_delete)

    command <<<
        python /etc/terra_utils/python/delete_tdr_rows.py \
        --dataset_id ~{dataset_id} \
        --table ~{table} \
        --ids_to_delete_file ~{ids_to_delete_file} \
        --id_column_name ~{id_column_name} \
        ~{if delete_files then "--delete_files" else ""}
    >>>

    runtime {
        docker: docker_name
    }
}
18 changes: 18 additions & 0 deletions wdl/DeleteTdrRows/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# WDL Input Overview
This WDl will delete rows from a table in a TDR. You can specify the table and the rows to delete. Optionally can delete files linked to the rows.

If data / files being deleted are STILL part of active snapshot you will run into issues. Make sure to delete the snapshots first.

## Inputs Table:
| Input Name | Description | Type | Required | Default |
|--------------------------|-------------------------------------------------------------------|---------------|----------|-----------------------------------------------------------------------------------------------|
| **dataset_id** | dataset id where table exists | String | Yes | N/A |
| **tdr_table_name** | Table name in dataset | String | Yes | N/A |
| **ids_to_delete** | list of ids to look for and delete in table | Array[String] | Yes | N/A |
| **id_column_name** | Name of column where ids exist | String | Yes | N/A |
| **delete_files** | Use if want to delete files that are referenced in rows to delete | Boolean | Yes | N/A |
| **docker** | Specifies a custom Docker image to use. Optional. | String | No | "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest" |


## Outputs Table:
This workflow does not generate any outputs directly. However, logs will be provided to track the progress of the row (and optional file) deletion, including which requested ids were found and how many files were linked to the deleted rows. You can review the logs in the stderr file for information about the deletion process and the status of the delete jobs.
8 changes: 8 additions & 0 deletions wdl/DeleteTdrRows/template_inputs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"DeleteTdrRows.dataset_id": "String",
"DeleteTdrRows.tdr_table_name": "String",
"DeleteTdrRows.id_column_name": "String",
"DeleteTdrRows.delete_files": "Boolean",
"DeleteTdrRows.ids_to_delete": "List",
"DeleteTdrRows.docker": "String"
}

0 comments on commit e70412a

Please # to comment.