Skip to content

Commit

Permalink
Merge pull request #197 from broadinstitute/sn_update_script_to_handl…
Browse files Browse the repository at this point in the history
…e_list_of_files

Handle list of files to delete and update read me
  • Loading branch information
snovod authored Jan 17, 2025
2 parents e70412a + 91fed62 commit fbdaa42
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 15 deletions.
16 changes: 10 additions & 6 deletions python/delete_tdr_rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def _fetch_file_ref_columns(self) -> list[str]:
table_schema = self.tdr.get_table_schema_info(dataset_id=self.dataset_id, table_name=self.table_name)
return [col['name'] for col in table_schema['columns'] if col['datatype'] == 'fileref']

def _log_rows_found_info(self, found_row_ids: list[str], file_uuids: list[str]) -> None:
def _log_rows_found_info(self, found_row_ids: list[str], file_uuids: set[str]) -> None:
logging.info(f"Found {len(found_row_ids)} rows to delete")
not_found_ids = set(self.ids_to_delete) - set(found_row_ids)
if not_found_ids:
Expand All @@ -48,12 +48,12 @@ def _log_rows_found_info(self, found_row_ids: list[str], file_uuids: list[str])
)
logging.info(f"Found {len(file_uuids)} files linked to the rows to delete")

def run(self) -> tuple[list[str], list[str]]:
def run(self) -> tuple[list[str], set[str]]:
table_metrics = tdr.get_dataset_table_metrics(dataset_id=dataset_id, target_table_name=table_name)
# tdr_row_ids to be deleted
tdr_row_ids = []
# file uuids to be deleted later if options used
file_uuids = []
file_uuids = set()
# Used to log the ids that were not found
found_row_ids = []

Expand All @@ -67,15 +67,19 @@ def run(self) -> tuple[list[str], list[str]]:
tdr_row_id = row['datarepo_row_id']
# If the column is a fileref, store the file_uuid
if column in file_ref_columns:
row_file_uuids.append(row[column])
# If the column is a list, store all the file_uuids
if isinstance(row[column], list):
row_file_uuids.extend(row[column])
else:
row_file_uuids.append(row[column])
# If the column is the id column, check if the id is in the ids_to_delete_file
if column == self.id_column_name:
if row[column] in self.ids_to_delete:
found_row_ids.append(row[column])
store = True
# If the row is to be deleted, store the file_uuids and tdr_row_id
if store:
file_uuids.extend(row_file_uuids)
file_uuids.update(row_file_uuids)
tdr_row_ids.append(tdr_row_id)
self._log_rows_found_info(found_row_ids, file_uuids)
return tdr_row_ids, file_uuids
Expand Down Expand Up @@ -111,7 +115,7 @@ def run(self) -> tuple[list[str], list[str]]:
if delete_files:
if file_uuids:
tdr.delete_files(
file_ids=file_uuids,
file_ids=list(file_uuids),
dataset_id=dataset_id
)
else:
Expand Down
20 changes: 11 additions & 9 deletions wdl/DeleteTdrRows/README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
# WDL Input Overview
This WDL will delete rows from a table in a TDR dataset. You can specify the table and the rows to delete. Optionally, it can also delete the files linked to those rows.

If data / files being deleted are STILL part of active snapshot you will run into issues. Make sure to delete the snapshots first.
## Notes
* If the data / files being deleted are STILL part of an active snapshot, you will run into issues. Make sure to delete the snapshots first.
* This should kick off ONE workflow for all entities. Make sure you are selecting a SET containing all entities.

## Inputs Table:
| Input Name | Description | Type | Required | Default |
|--------------------------|-------------------------------------------------------------------|---------------|----------|-----------------------------------------------------------------------------------------------|
| **dataset_id** | dataset id where table exists | String | Yes | N/A |
| **tdr_table_name** | Table name in dataset | String | Yes | N/A |
| **ids_to_delete** | list of ids to look for and delete in table | Array[String] | Yes | N/A |
| **id_column_name** | Name of column where ids exist | String | Yes | N/A |
| **delete_files** | Use if want to delete files that are referenced in rows to delete | Boolean | Yes | N/A |
| **docker** | Specifies a custom Docker image to use. Optional. | String | No | "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest" |
| Input Name | Description | Type | Required | Default |
|--------------------------|--------------------------------------------------------------------------------|---------------|----------|-----------------------------------------------------------------------------------------------|
| **dataset_id** | dataset id where table exists | String | Yes | N/A |
| **tdr_table_name** | Table name in dataset | String | Yes | N/A |
| **ids_to_delete** | list of ids to look for and delete in table. This should be selected as a set. | Array[String] | Yes | N/A |
| **id_column_name** | Name of column where ids exist | String | Yes | N/A |
| **delete_files** | Use if you want to delete files that are referenced in the rows to delete | Boolean | Yes | N/A |
| **docker** | Specifies a custom Docker image to use. Optional. | String | No | "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest" |


## Outputs Table:
Expand Down

0 comments on commit fbdaa42

Please sign in to comment.