From 1ac13b1103c75325bbfdb266ebf35ccf5327c8a6 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian <45041478+sahakiann@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:40:40 -0500 Subject: [PATCH] POD-2359: Add workflow to get TDR schema JSON (#188) * initial commit * undo unrelated change * add template json * update wdl * fix logic for workspace entities * re add validation back in * add workflow to dockstore * update readme * fix indentation * fix import statments * add quotes * fix pascal case error * treat input file as local file * remove validation for file path * use existing functionality to transform metadata * fix pascal case issue * remove f string * fix improts * --no-verify --- .dockstore.yml | 7 ++ python/generate_tdr_schema_json.py | 119 ++++++++++++++++++++++ wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl | 109 ++++++++++++++++++++ wdl/GetTdrSchemaJson/README.md | 26 +++++ wdl/GetTdrSchemaJson/template_input.json | 6 ++ 5 files changed, 267 insertions(+) create mode 100644 python/generate_tdr_schema_json.py create mode 100644 wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl create mode 100644 wdl/GetTdrSchemaJson/README.md create mode 100644 wdl/GetTdrSchemaJson/template_input.json diff --git a/.dockstore.yml b/.dockstore.yml index 00b61c1..825266d 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -125,3 +125,10 @@ workflows: readMePath: /wdl/TerraSummaryStatistics/README.md testParameterFiles: - /wdl/TerraSummaryStatistics/TerraSummaryStatistics.wdl + + - name: GetTdrSchemaJson + subclass: WDL + primaryDescriptorPath: /wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl + readMePath: /wdl/GetTdrSchemaJson/README.md + testParameterFiles: + - /wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl diff --git a/python/generate_tdr_schema_json.py b/python/generate_tdr_schema_json.py new file mode 100644 index 0000000..05b8f87 --- /dev/null +++ b/python/generate_tdr_schema_json.py @@ -0,0 +1,119 @@ +import argparse +import json +from pathlib import Path + +from utils import GCP, 
ARG_DEFAULTS, comma_separated_list +from utils.csv_util import Csv +from utils.requests_utils.request_util import RunRequest +from utils.tdr_utils.tdr_ingest_utils import ConvertTerraTableInfoForIngest +from utils.tdr_utils.tdr_schema_utils import InferTDRSchema +from utils.terra_utils.terra_util import TerraWorkspace +from utils.token_util import Token + +CLOUD_TYPE = GCP + + +def get_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Get TDR schema JSON file") + + workspace_args = parser.add_argument_group("All arguments required if using workspace metadata as input") + workspace_args.add_argument( + "--billing_project", + required=False, + type=str, + help="The billing project for the Terra workspace" + ) + workspace_args.add_argument( + "--workspace_name", + required=False, + type=str, + help="The Terra workspace name" + ) + workspace_args.add_argument( + "--terra_table_names", + required=False, + type=comma_separated_list, + help="The name(s) of the table(s) within the Terra workspace to generate the schema JSON for" + ) + + tsv_group = parser.add_mutually_exclusive_group(required=False) + tsv_group.add_argument( + "--input_tsv", + required=False, + type=str, + help="The GCP path to the tsv containing the metadata to generate a schema JSON for", + ) + + parser.add_argument( + "--force_disparate_rows_to_string", + action="store_true", + help="If used, all rows in a column containing disparate data types will be forced to a string" + ) + + args = parser.parse_args() + + # Custom validation logic + workspace_args = [args.billing_project, args.workspace_name, args.terra_table_names] + if args.input_tsv: + if any(workspace_args): + parser.error( + "Cannot provide BOTH input_tsv AND the combination of billing_project, workspace_name, " + "and terra_table_names." + ) + else: + if not all(workspace_args): + parser.error( + "If input_tsv is not provided, you must provide ALL of workspace_name, billing_project, " + "and terra_table_names." 
+ ) + + return args + + +if __name__ == '__main__': + args = get_args() + + schema_metadata = [] + + if args.input_tsv: + metadata = Csv(file_path=args.input_tsv).create_list_of_dicts_from_tsv() + schema = InferTDRSchema( + input_metadata=metadata, + table_name=Path(args.input_tsv).stem, + all_fields_non_required=False, + allow_disparate_data_types_in_column=args.force_disparate_rows_to_string, + ).infer_schema() + schema_metadata.append(schema) + else: + token = Token(cloud=CLOUD_TYPE) + request_util = RunRequest( + token=token, + max_retries=ARG_DEFAULTS["max_retries"], + max_backoff_time=ARG_DEFAULTS["max_backoff_time"], # ignore type[arg-type] + ) + terra_workspace = TerraWorkspace( + billing_project=args.billing_project, + workspace_name=args.workspace_name, + request_util=request_util + ) + for table_name in args.terra_table_names: + table_metadata = terra_workspace.get_gcp_workspace_metrics( + entity_type=table_name, remove_dicts=True + ) + primary_key = [f"{i['entityType']}_id" for i in table_metadata][0] + + parsed_metadata = ConvertTerraTableInfoForIngest( + table_metadata=table_metadata, tdr_row_id=primary_key + ).run() + + schema = InferTDRSchema( + input_metadata=parsed_metadata, + table_name=table_name, + all_fields_non_required=False, + allow_disparate_data_types_in_column=args.force_disparate_rows_to_string, + ).infer_schema() + + schema_metadata.append(schema) + + with open("schema.json", "w") as schema_json: + schema_json.write(json.dumps(schema_metadata, indent=2)) diff --git a/wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl b/wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl new file mode 100644 index 0000000..62bbf77 --- /dev/null +++ b/wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl @@ -0,0 +1,109 @@ +version 1.0 + +workflow GetTDRSchemaJson { + input { + File? input_metadata_tsv + String? billing_project + String? workspace_name + String? terra_table_names + Boolean force_disparate_rows_to_string + String? 
docker + } + String docker_name = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"]) + + call ValidateInputs { + input: + input_metadata_tsv = input_metadata_tsv, + billing_project = billing_project, + workspace_name = workspace_name, + terra_table_names = terra_table_names, + docker_name = docker_name + } + + call GenerateSchemaJson { + input: + input_metadata_tsv = input_metadata_tsv, + billing_project = billing_project, + workspace_name = workspace_name, + terra_table_names = terra_table_names, + docker_name = docker_name, + force_disparate_rows_to_string = force_disparate_rows_to_string + } +} + +task ValidateInputs { + input { + File? input_metadata_tsv + String? billing_project + String? workspace_name + String? terra_table_names + String docker_name + } + + command <<< + set -euo pipefail + + python3 <>> + + runtime { + docker: docker_name + } +} + + +task GenerateSchemaJson { + input { + File? input_metadata_tsv + String? billing_project + String? workspace_name + String? terra_table_names + String docker_name + Boolean force_disparate_rows_to_string + } + + command <<< + if [ ! 
-z "~{input_metadata_tsv}" ]; then + python /etc/terra_utils/python/generate_tdr_schema_json.py \ + --input_tsv ~{input_metadata_tsv} \ + ~{if force_disparate_rows_to_string then "--force_disparate_rows_to_string" else ""} + + else + python /etc/terra_utils/python/generate_tdr_schema_json.py \ + --billing_project ~{billing_project} \ + --workspace_name ~{workspace_name} \ + --terra_table_names ~{terra_table_names} \ + ~{if force_disparate_rows_to_string then "--force_disparate_rows_to_string" else ""} + fi + + >>> + + output { + File tdr_schema_json = "schema.json" + } + + runtime { + docker: docker_name + } +} diff --git a/wdl/GetTdrSchemaJson/README.md b/wdl/GetTdrSchemaJson/README.md new file mode 100644 index 0000000..7b5549c --- /dev/null +++ b/wdl/GetTdrSchemaJson/README.md @@ -0,0 +1,26 @@ +# WDL Input Overview + +This WDL accepts either a GCS path to a TSV file or a combination of a billing project and workspace to generate a TDR +schema JSON based on the provided metadata. Note that this script does not interact with TDR. Its purpose is to +demonstrate what the resulting schema would look like if the data were imported into TDR. + +## Notes +* Please provide _either_ the `input_metadata_tsv` OR a combination of `billing_project`, `workspace_name`, and + `terra_table_names`. + +## Inputs Table: + +| Input Name | Description | Type | Required | Default | +|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|----------|---------------------------------------------------------------------------------------------| +| **input_metadata_tsv** | A GCS path to the TSV file containing metadata (if not using a workspace as input). 
Must start with 'gs://' | File | No | N/A | +| **billing_project** | The workspace billing project (if not using the TSV) | String | No | N/A | +| **workspace_name** | The workspace name (if not using the TSV) | String | No | N/A | +| **terra_table_names** | Comma-separated list of Terra table names to generate JSONs for. Do not include spaces between entries (i.e. use the following format: "table1,table2") | String | No | N/A | +| **force_disparate_rows_to_string** | If rows of a column are of different data types, setting this to True will force them all to be strings in the resulting TDR schema JSON. The same option will be available when importing data into TDR. | Boolean | Yes | N/A | +| **docker** | Specifies a custom Docker image to use. Optional. | String | No | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest | + + +## Outputs Table: +| Output Name | Description | +|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **tdr_schema_json** | The path to the GCS file containing the schema JSON. If multiple Terra tables were provided as input, all tables will be included in the same output JSON file. | diff --git a/wdl/GetTdrSchemaJson/template_input.json b/wdl/GetTdrSchemaJson/template_input.json new file mode 100644 index 0000000..11e0924 --- /dev/null +++ b/wdl/GetTdrSchemaJson/template_input.json @@ -0,0 +1,6 @@ +{ + "GetTDRSchemaJson.billing_project": "String", + "GetTDRSchemaJson.workspace_name": "String", + "GetTDRSchemaJson.terra_table_names": "String", + "GetTDRSchemaJson.force_disparate_rows_to_string": "Boolean" +}