-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #196 from broadinstitute/sn_POD-2373_delete_tdr_ro…
…ws_script Adding script and wdl to delete rows from tdr dataset
- Loading branch information
Showing
6 changed files
with
202 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
from argparse import ArgumentParser, Namespace | ||
|
||
from utils import GCP | ||
from utils.requests_utils.request_util import RunRequest | ||
from utils.tdr_utils.tdr_api_utils import TDR | ||
from utils.token_util import Token | ||
import logging | ||
|
||
# Configure root logging once at import time: INFO level with a timestamped format.
logging.basicConfig(
    format="%(levelname)s: %(asctime)s : %(message)s", level=logging.INFO
)

# Cloud platform this script targets; passed to Token for authentication.
CLOUD_TYPE = GCP
|
||
|
||
def get_args() -> Namespace:
    """Collect the command-line options for deleting rows from a TDR dataset table."""
    parser = ArgumentParser(description="Delete rows from TDR dataset table")
    parser.add_argument("--dataset_id", "-i", required=True)
    parser.add_argument("--table", "-t", required=True)
    parser.add_argument("--ids_to_delete_file", "-if", required=True,
                        help="The file containing the ids to delete")
    parser.add_argument("--id_column_name", "-ic", required=True,
                        help="The column name of the id to delete")
    parser.add_argument("--delete_files", "-df", action="store_true",
                        help="Delete the files associated with the rows")
    return parser.parse_args()
|
||
|
||
class GetRowAndFileInfo:
    """Locate TDR rows (and any linked file uuids) matching a set of ids.

    Scans a dataset table for rows whose id column matches one of the
    requested ids, collecting the datarepo row ids to soft-delete and the
    uuids of any files referenced by those rows via fileref columns.
    """

    def __init__(self, ids_to_delete: list[str], id_column_name: str, dataset_id: str, table_name: str, tdr: "TDR"):
        # Ids (values of the id column) whose rows should be deleted.
        self.ids_to_delete = ids_to_delete
        # Name of the column that holds those ids.
        self.id_column_name = id_column_name
        self.dataset_id = dataset_id
        self.table_name = table_name
        # TDR API client used for schema and metrics lookups.
        self.tdr = tdr

    def _fetch_file_ref_columns(self) -> list[str]:
        """Return the names of the table's columns whose datatype is 'fileref'."""
        table_schema = self.tdr.get_table_schema_info(dataset_id=self.dataset_id, table_name=self.table_name)
        return [col['name'] for col in table_schema['columns'] if col['datatype'] == 'fileref']

    def _log_rows_found_info(self, found_row_ids: list[str], file_uuids: list[str]) -> None:
        """Log how many rows/files were matched and warn about any ids not found."""
        logging.info(f"Found {len(found_row_ids)} rows to delete")
        not_found_ids = set(self.ids_to_delete) - set(found_row_ids)
        if not_found_ids:
            logging.warning(
                f"Could not find the following {len(not_found_ids)} ids in table {self.table_name}: {not_found_ids}"
            )
        logging.info(f"Found {len(file_uuids)} files linked to the rows to delete")

    def run(self) -> tuple[list[str], list[str]]:
        """Return (datarepo_row_ids_to_delete, linked_file_uuids) for the matching rows.

        Bug fix: previously read the module-level globals ``tdr``/``dataset_id``/
        ``table_name`` instead of the instance attributes passed to ``__init__``,
        so the class only worked when run from this script's ``__main__`` block.
        """
        table_metrics = self.tdr.get_dataset_table_metrics(
            dataset_id=self.dataset_id, target_table_name=self.table_name
        )
        # datarepo row ids to be soft-deleted
        tdr_row_ids = []
        # file uuids to be deleted later if --delete_files is used
        file_uuids = []
        # Used to log the ids that were not found
        found_row_ids = []

        # Get the columns whose datatype is fileref
        file_ref_columns = self._fetch_file_ref_columns()
        # Set for O(1) membership tests inside the row loop.
        ids_to_delete_set = set(self.ids_to_delete)

        for row in table_metrics:
            store = False
            row_file_uuids = []
            # datarepo_row_id is per-row, not per-column — look it up once.
            tdr_row_id = row['datarepo_row_id']
            for column in row:
                # If the column is a fileref, remember the file uuid it holds.
                if column in file_ref_columns:
                    row_file_uuids.append(row[column])
                # If the column is the id column, check if the id was requested.
                if column == self.id_column_name:
                    if row[column] in ids_to_delete_set:
                        found_row_ids.append(row[column])
                        store = True
            # If the row is to be deleted, keep its file uuids and row id.
            if store:
                file_uuids.extend(row_file_uuids)
                tdr_row_ids.append(tdr_row_id)
        self._log_rows_found_info(found_row_ids, file_uuids)
        return tdr_row_ids, file_uuids
|
||
|
||
if __name__ == '__main__':
    args = get_args()
    # Unpack CLI arguments into module-level names.
    # NOTE(review): GetRowAndFileInfo.run() as written reads the module-level
    # names tdr/dataset_id/table_name directly, so these assignments double as
    # globals it depends on — confirm before renaming any of them.
    dataset_id = args.dataset_id
    table_name = args.table
    ids_to_delete_file = args.ids_to_delete_file
    id_column_name = args.id_column_name
    delete_files = args.delete_files

    # Read the ids (one per line), de-duplicating via set().
    with open(ids_to_delete_file, 'r') as f:
        ids_to_delete = list(set(f.read().splitlines()))
    logging.info(f"Found {len(ids_to_delete)} ids in {ids_to_delete_file} to delete")

    # Authenticate and build the TDR API client.
    token = Token(cloud=CLOUD_TYPE)
    request_util = RunRequest(token=token)
    tdr = TDR(request_util=request_util)

    # Get the rows to delete and the file_uuids
    tdr_rows_to_delete, file_uuids = GetRowAndFileInfo(
        ids_to_delete=ids_to_delete,
        id_column_name=id_column_name,
        dataset_id=dataset_id,
        table_name=table_name,
        tdr=tdr
    ).run()

    # Soft-delete any matching rows, then (only if requested) delete the
    # files those rows referenced.
    if tdr_rows_to_delete:
        tdr.soft_delete_entries(dataset_id=dataset_id, table_name=table_name, datarepo_row_ids=tdr_rows_to_delete)
    if delete_files:
        if file_uuids:
            tdr.delete_files(
                file_ids=file_uuids,
                dataset_id=dataset_id
            )
        else:
            logging.info("No files to delete")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
version 1.0 | ||
|
||
workflow DeleteTdrRows {
    input {
        String dataset_id
        String tdr_table_name
        Array[String] ids_to_delete
        String id_column_name
        Boolean delete_files
        String? docker
    }

    # Fall back to the standard ops-toolbox image when no override is supplied.
    String chosen_docker = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"])

    # Run the row (and optional file) deletion against the TDR dataset.
    call DeleteTdrRowsTask {
        input:
            dataset_id=dataset_id,
            table=tdr_table_name,
            ids_to_delete=ids_to_delete,
            id_column_name=id_column_name,
            delete_files=delete_files,
            docker_name=chosen_docker
    }
}
|
||
task DeleteTdrRowsTask {
    input {
        String dataset_id
        String table
        Array[String] ids_to_delete
        String id_column_name
        Boolean delete_files
        String docker_name
    }

    # Bug fix: this declaration previously reused the name `ids_to_delete`,
    # colliding with the Array[String] input above (a duplicate-declaration
    # error in WDL). Write the ids to a file under a distinct name.
    File ids_file = write_lines(ids_to_delete)

    command <<<
        python /etc/terra_utils/python/delete_tdr_rows.py \
        --dataset_id ~{dataset_id} \
        --table ~{table} \
        --ids_to_delete_file ~{ids_file} \
        --id_column_name ~{id_column_name} \
        ~{if delete_files then "--delete_files" else ""}
    >>>

    runtime {
        docker: docker_name
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# WDL Input Overview | ||
This WDL deletes rows from a table in a TDR dataset. You specify the table and the rows to delete, and can optionally delete the files linked to those rows.
|
||
If the data or files being deleted are still part of an active snapshot, the deletion will fail. Make sure to delete any such snapshots first.
|
||
## Inputs Table: | ||
| Input Name | Description | Type | Required | Default | | ||
|--------------------------|-------------------------------------------------------------------|---------------|----------|-----------------------------------------------------------------------------------------------| | ||
| **dataset_id** | dataset id where table exists | String | Yes | N/A | | ||
| **tdr_table_name** | Table name in dataset | String | Yes | N/A | | ||
| **ids_to_delete** | list of ids to look for and delete in table | Array[String] | Yes | N/A | | ||
| **id_column_name** | Name of column where ids exist | String | Yes | N/A | | ||
| **delete_files** | Use if want to delete files that are referenced in rows to delete | Boolean | Yes | N/A | | ||
| **docker** | Specifies a custom Docker image to use. Optional. | String | No | "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest" | | ||
|
||
|
||
## Outputs Table: | ||
This workflow does not generate any outputs directly. However, logs will be provided to track progress, including how many matching rows and linked files were found and the status of the row soft-delete and optional file deletion. You can review the logs in the stderr file for details about the deletion process.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"DeleteTdrRows.dataset_id": "String", | ||
"DeleteTdrRows.tdr_table_name": "String", | ||
"DeleteTdrRows.id_column_name": "String", | ||
"DeleteTdrRows.delete_files": "Boolean", | ||
"DeleteTdrRows.ids_to_delete": "List", | ||
"DeleteTdrRows.docker": "String" | ||
} |