Skip to content

Commit

Permalink
POD-2359: Add workflow to get TDR schema JSON (#188)
Browse files Browse the repository at this point in the history
* initial commit

* undo unrelated change

* add template json

* update wdl

* fix logic for workspace entities

* re add validation back in

* add workflow to dockstore

* update readme

* fix indentation

* fix import statments

* add quotes

* fix pascal case error

* treat input file as local file

* remove validation for file path

* use existing functionality to transform metadata

* fix pascal case issue

* remove f string

* fix improts

* --no-verify
  • Loading branch information
sahakiann authored Jan 8, 2025
1 parent 552bd99 commit 1ac13b1
Show file tree
Hide file tree
Showing 5 changed files with 267 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,10 @@ workflows:
readMePath: /wdl/TerraSummaryStatistics/README.md
testParameterFiles:
- /wdl/TerraSummaryStatistics/TerraSummaryStatistics.wdl

- name: GetTdrSchemaJson
subclass: WDL
primaryDescriptorPath: /wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl
readMePath: /wdl/GetTdrSchemaJson/README.md
testParameterFiles:
- /wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl
119 changes: 119 additions & 0 deletions python/generate_tdr_schema_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import argparse
import json
from pathlib import Path

from utils import GCP, ARG_DEFAULTS, comma_separated_list
from utils.csv_util import Csv
from utils.requests_utils.request_util import RunRequest
from utils.tdr_utils.tdr_ingest_utils import ConvertTerraTableInfoForIngest
from utils.tdr_utils.tdr_schema_utils import InferTDRSchema
from utils.terra_utils.terra_util import TerraWorkspace
from utils.token_util import Token

CLOUD_TYPE = GCP


def get_args() -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    Exactly one input mode must be chosen: either ``--input_tsv`` on its own,
    or all three of ``--billing_project``, ``--workspace_name`` and
    ``--terra_table_names``. Any other combination is reported through
    ``parser.error``, which prints the usage message and exits.

    Returns:
        The parsed argument namespace.
    """
    arg_parser = argparse.ArgumentParser(description="Get TDR schema JSON file")

    # Workspace-mode options, registered from a table of (flag, type, help text).
    terra_group = arg_parser.add_argument_group("All arguments required if using workspace metadata as input")
    terra_options = (
        ("--billing_project", str, "The billing project for the Terra workspace"),
        ("--workspace_name", str, "The Terra workspace name"),
        (
            "--terra_table_names",
            comma_separated_list,
            "The name(s) of the table(s) within the Terra workspace to generate the schema JSON for",
        ),
    )
    for flag, value_type, description in terra_options:
        terra_group.add_argument(flag, required=False, type=value_type, help=description)

    # TSV-mode option.
    tsv_only_group = arg_parser.add_mutually_exclusive_group(required=False)
    tsv_only_group.add_argument(
        "--input_tsv",
        required=False,
        type=str,
        help="The GCP path to the tsv containing the metadata to generate a schema JSON for",
    )

    arg_parser.add_argument(
        "--force_disparate_rows_to_string",
        action="store_true",
        help="If used, all rows in a column containing disparate data types will be forced to a string"
    )

    parsed = arg_parser.parse_args()

    # Cross-argument validation that argparse cannot express declaratively.
    workspace_values = (parsed.billing_project, parsed.workspace_name, parsed.terra_table_names)
    using_tsv = bool(parsed.input_tsv)

    if using_tsv and any(workspace_values):
        arg_parser.error(
            "Cannot provide BOTH input_tsv AND the combination of billing_project, workspace_name, "
            "and terra_table_names."
        )
    if not using_tsv and not all(workspace_values):
        arg_parser.error(
            "If input_tsv is not provided, you must provide ALL of workspace_name, billing_project, "
            "and terra_table_names."
        )

    return parsed


if __name__ == '__main__':
    # Script entry point: infer a TDR schema for either a single TSV or one or
    # more Terra workspace tables, then write all inferred schemas as a JSON
    # list to "schema.json" in the current working directory.
    args = get_args()

    schema_metadata = []

    if args.input_tsv:
        # TSV mode: the table name is the TSV file name without its extension.
        metadata = Csv(file_path=args.input_tsv).create_list_of_dicts_from_tsv()
        schema = InferTDRSchema(
            input_metadata=metadata,
            table_name=Path(args.input_tsv).stem,
            all_fields_non_required=False,
            allow_disparate_data_types_in_column=args.force_disparate_rows_to_string,
        ).infer_schema()
        schema_metadata.append(schema)
    else:
        # Workspace mode: one schema per requested Terra table.
        token = Token(cloud=CLOUD_TYPE)
        request_util = RunRequest(
            token=token,
            max_retries=ARG_DEFAULTS["max_retries"],
            max_backoff_time=ARG_DEFAULTS["max_backoff_time"],  # ignore type[arg-type]
        )
        terra_workspace = TerraWorkspace(
            billing_project=args.billing_project,
            workspace_name=args.workspace_name,
            request_util=request_util
        )
        for table_name in args.terra_table_names:
            table_metadata = terra_workspace.get_gcp_workspace_metrics(
                entity_type=table_name, remove_dicts=True
            )

            # The primary key column is derived from the first row's entity type.
            # An empty table has no row to derive it from, so fail with a clear
            # message instead of an opaque IndexError.
            first_row = next(iter(table_metadata), None)
            if first_row is None:
                raise ValueError(
                    f"Terra table '{table_name}' returned no rows; cannot infer a schema from an empty table."
                )
            primary_key = f"{first_row['entityType']}_id"

            parsed_metadata = ConvertTerraTableInfoForIngest(
                table_metadata=table_metadata, tdr_row_id=primary_key
            ).run()

            schema = InferTDRSchema(
                input_metadata=parsed_metadata,
                table_name=table_name,
                all_fields_non_required=False,
                allow_disparate_data_types_in_column=args.force_disparate_rows_to_string,
            ).infer_schema()

            schema_metadata.append(schema)

    with open("schema.json", "w", encoding="utf-8") as schema_json:
        json.dump(schema_metadata, schema_json, indent=2)
109 changes: 109 additions & 0 deletions wdl/GetTdrSchemaJson/GetTdrSchemaJson.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
version 1.0

# Generates a TDR schema JSON from either a metadata TSV or one or more Terra
# workspace tables. Callers supply either `input_metadata_tsv` alone, or all of
# `billing_project`, `workspace_name` and `terra_table_names`.
workflow GetTDRSchemaJson {
    input {
        File? input_metadata_tsv
        String? billing_project
        String? workspace_name
        String? terra_table_names
        Boolean force_disparate_rows_to_string
        String? docker
    }
    # Fall back to the default image when no custom docker image is supplied.
    String docker_name = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"])

    # Checks that exactly one input mode (TSV or workspace) was selected.
    # NOTE(review): GenerateSchemaJson consumes no output of this call, so there
    # is no data dependency between the two calls - confirm whether validation
    # is expected to gate generation.
    call ValidateInputs {
        input:
            input_metadata_tsv = input_metadata_tsv,
            billing_project = billing_project,
            workspace_name = workspace_name,
            terra_table_names = terra_table_names,
            docker_name = docker_name
    }

    # Runs the Python script that writes schema.json.
    call GenerateSchemaJson {
        input:
            input_metadata_tsv = input_metadata_tsv,
            billing_project = billing_project,
            workspace_name = workspace_name,
            terra_table_names = terra_table_names,
            docker_name = docker_name,
            force_disparate_rows_to_string = force_disparate_rows_to_string
    }
}

# Fails the task (non-zero exit from the embedded Python) unless exactly one
# input mode is selected: the TSV alone, or all three Terra workspace parameters.
task ValidateInputs {
    input {
        File? input_metadata_tsv
        String? billing_project
        String? workspace_name
        String? terra_table_names
        String docker_name
    }

    # The heredoc delimiter is quoted so the shell passes the script to Python
    # verbatim; WDL has already substituted the ~{...} placeholders by then.
    command <<<
        set -euo pipefail

        python3 <<'CODE'
        tsv = "~{input_metadata_tsv}"
        billing_project = "~{billing_project}"
        workspace_name = "~{workspace_name}"
        terra_table_names = "~{terra_table_names}"
        terra_params = [billing_project, workspace_name, terra_table_names]
        if tsv and any(terra_params):
            raise ValueError(
                "If the 'input_metadata_tsv' is provided, none of the terra parameters can also be provided. Please "
                "leave 'billing_project', 'workspace_name' and 'terra_table_names' all blank if providing a tsv as input."
            )
        elif not tsv and not all(terra_params):
            raise ValueError(
                "If using the Terra workspace table as input, the 'billing_project', 'workspace_name' and "
                "'terra_table_names' must ALL be provided"
            )
        # Reaching this point means exactly one input mode was selected.
        print("Input parameters validated, continuing")
        CODE
    >>>

    runtime {
        docker: docker_name
    }
}
# Runs generate_tdr_schema_json.py in either TSV mode or Terra workspace mode
# and exposes the resulting schema.json as the task output.
task GenerateSchemaJson {
    input {
        File? input_metadata_tsv
        String? billing_project
        String? workspace_name
        String? terra_table_names
        String docker_name
        Boolean force_disparate_rows_to_string
    }

    # Every interpolated value is double-quoted so values containing spaces
    # (for example a workspace name) reach the script as a single argument.
    command <<<
        set -euo pipefail

        if [ -n "~{input_metadata_tsv}" ]; then
            python /etc/terra_utils/python/generate_tdr_schema_json.py \
                --input_tsv "~{input_metadata_tsv}" \
                ~{if force_disparate_rows_to_string then "--force_disparate_rows_to_string" else ""}
        else
            python /etc/terra_utils/python/generate_tdr_schema_json.py \
                --billing_project "~{billing_project}" \
                --workspace_name "~{workspace_name}" \
                --terra_table_names "~{terra_table_names}" \
                ~{if force_disparate_rows_to_string then "--force_disparate_rows_to_string" else ""}
        fi
    >>>

    output {
        File tdr_schema_json = "schema.json"
    }

    runtime {
        docker: docker_name
    }
}
26 changes: 26 additions & 0 deletions wdl/GetTdrSchemaJson/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# WDL Input Overview

This WDL accepts either a GCS path to a TSV file or a combination of a billing project, workspace name, and Terra table
name(s) to generate a TDR schema JSON based on the provided metadata. Note that this script does not interact with TDR.
Its purpose is to demonstrate what the resulting schema would look like if the data were imported into TDR.

## Notes
* Please provide _either_ the `input_metadata_tsv` OR a combination of `billing_project`, `workspace_name`, and
`terra_table_names`.

## Inputs Table:

| Input Name | Description | Type | Required | Default |
|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|----------|---------------------------------------------------------------------------------------------|
| **input_metadata_tsv** | A GCS path to the TSV file containing metadata (if not using a workspace as input). Must start with 'gs://' | File | No | N/A |
| **billing_project** | The workspace billing project (if not using the TSV) | String | No | N/A |
| **workspace_name** | The workspace name (if not using the TSV) | String | No | N/A |
| **terra_table_names** | Comma-separated list of Terra table names to generate JSONs for. Do not include spaces between entries (i.e. use the following format: "table1,table2") | String | No | N/A |
| **force_disparate_rows_to_string** | If rows of a column are of different data types, setting this to True will force them all to be strings in the resulting TDR schema JSON. The same option will be available when importing data into TDR. | Boolean | Yes | N/A |
| **docker** | Specifies a custom Docker image to use. Optional. | String | No | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest |


## Outputs Table:
| Output Name | Description |
|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **tdr_schema_json** | The path to the GCS file containing the schema JSON. If multiple Terra tables were provided as input, all tables will be included in the same output JSON file. |
6 changes: 6 additions & 0 deletions wdl/GetTdrSchemaJson/template_input.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
  "GetTDRSchemaJson.billing_project": "String",
  "GetTDRSchemaJson.workspace_name": "String",
  "GetTDRSchemaJson.terra_table_names": "String",
  "GetTDRSchemaJson.force_disparate_rows_to_string": "Boolean"
}

0 comments on commit 1ac13b1

Please # to comment.