POD-2359: Add workflow to get TDR schema JSON (#188)
* initial commit
* undo unrelated change
* add template json
* update wdl
* fix logic for workspace entities
* re-add validation back in
* add workflow to dockstore
* update readme
* fix indentation
* fix import statements
* add quotes
* fix pascal case error
* treat input file as local file
* remove validation for file path
* use existing functionality to transform metadata
* fix pascal case issue
* remove f-string
* fix imports
* --no-verify
Showing 5 changed files with 267 additions and 0 deletions.
@@ -0,0 +1,119 @@
import argparse
import json
from pathlib import Path

from utils import GCP, ARG_DEFAULTS, comma_separated_list
from utils.csv_util import Csv
from utils.requests_utils.request_util import RunRequest
from utils.tdr_utils.tdr_ingest_utils import ConvertTerraTableInfoForIngest
from utils.tdr_utils.tdr_schema_utils import InferTDRSchema
from utils.terra_utils.terra_util import TerraWorkspace
from utils.token_util import Token

CLOUD_TYPE = GCP


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Get TDR schema JSON file")

    workspace_args = parser.add_argument_group("All arguments required if using workspace metadata as input")
    workspace_args.add_argument(
        "--billing_project",
        required=False,
        type=str,
        help="The billing project for the Terra workspace"
    )
    workspace_args.add_argument(
        "--workspace_name",
        required=False,
        type=str,
        help="The Terra workspace name"
    )
    workspace_args.add_argument(
        "--terra_table_names",
        required=False,
        type=comma_separated_list,
        help="The name(s) of the table(s) within the Terra workspace to generate the schema JSON for"
    )

    tsv_group = parser.add_mutually_exclusive_group(required=False)
    tsv_group.add_argument(
        "--input_tsv",
        required=False,
        type=str,
        help="The GCP path to the tsv containing the metadata to generate a schema JSON for",
    )

    parser.add_argument(
        "--force_disparate_rows_to_string",
        action="store_true",
        help="If used, all rows in a column containing disparate data types will be forced to a string"
    )

    args = parser.parse_args()

    # Custom validation logic: input_tsv is mutually exclusive with the workspace
    # arguments, and the workspace arguments must all be provided together.
    workspace_args = [args.billing_project, args.workspace_name, args.terra_table_names]
    if args.input_tsv:
        if any(workspace_args):
            parser.error(
                "Cannot provide BOTH input_tsv AND the combination of billing_project, workspace_name, "
                "and terra_table_names."
            )
    else:
        if not all(workspace_args):
            parser.error(
                "If input_tsv is not provided, you must provide ALL of workspace_name, billing_project, "
                "and terra_table_names."
            )

    return args


if __name__ == '__main__':
    args = get_args()

    schema_metadata = []

    if args.input_tsv:
        # TSV input: read rows from the file and infer a schema for a single table
        # named after the TSV file (without its extension).
        metadata = Csv(file_path=args.input_tsv).create_list_of_dicts_from_tsv()
        schema = InferTDRSchema(
            input_metadata=metadata,
            table_name=Path(args.input_tsv).stem,
            all_fields_non_required=False,
            allow_disparate_data_types_in_column=args.force_disparate_rows_to_string,
        ).infer_schema()
        schema_metadata.append(schema)
    else:
        # Workspace input: pull the metadata for each requested table from the Terra
        # workspace, convert it to ingest format, and infer a schema per table.
        token = Token(cloud=CLOUD_TYPE)
        request_util = RunRequest(
            token=token,
            max_retries=ARG_DEFAULTS["max_retries"],
            max_backoff_time=ARG_DEFAULTS["max_backoff_time"],  # type: ignore[arg-type]
        )
        terra_workspace = TerraWorkspace(
            billing_project=args.billing_project,
            workspace_name=args.workspace_name,
            request_util=request_util
        )
        for table_name in args.terra_table_names:
            table_metadata = terra_workspace.get_gcp_workspace_metrics(
                entity_type=table_name, remove_dicts=True
            )
            primary_key = [f"{i['entityType']}_id" for i in table_metadata][0]

            parsed_metadata = ConvertTerraTableInfoForIngest(
                table_metadata=table_metadata, tdr_row_id=primary_key
            ).run()

            schema = InferTDRSchema(
                input_metadata=parsed_metadata,
                table_name=table_name,
                all_fields_non_required=False,
                allow_disparate_data_types_in_column=args.force_disparate_rows_to_string,
            ).infer_schema()

            schema_metadata.append(schema)

    with open("schema.json", "w") as schema_json:
        schema_json.write(json.dumps(schema_metadata, indent=2))
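For reference, a minimal sketch (not part of this commit) of exercising the InferTDRSchema call used above on a few in-memory rows; it assumes the repository's utils package is importable, and the sample rows are purely illustrative:

# Minimal sketch, not from the commit: runs schema inference on a tiny in-memory
# table, mirroring the TSV branch of the script above. Assumes this repository's
# utils package is on PYTHONPATH; the sample rows are illustrative only.
import json

from utils.tdr_utils.tdr_schema_utils import InferTDRSchema

sample_rows = [
    {"sample_id": "S1", "read_count": 1000, "qc_passed": True},
    {"sample_id": "S2", "read_count": 2500, "qc_passed": False},
]

schema = InferTDRSchema(
    input_metadata=sample_rows,
    table_name="sample",
    all_fields_non_required=False,
    allow_disparate_data_types_in_column=False,
).infer_schema()

print(json.dumps(schema, indent=2))

In the script itself, the inferred schema for each table is appended to schema_metadata and written to schema.json, which the WDL below exposes as the workflow output.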
@@ -0,0 +1,109 @@
version 1.0

workflow GetTDRSchemaJson {
    input {
        File? input_metadata_tsv
        String? billing_project
        String? workspace_name
        String? terra_table_names
        Boolean force_disparate_rows_to_string
        String? docker
    }

    String docker_name = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"])

    call ValidateInputs {
        input:
            input_metadata_tsv = input_metadata_tsv,
            billing_project = billing_project,
            workspace_name = workspace_name,
            terra_table_names = terra_table_names,
            docker_name = docker_name
    }

    call GenerateSchemaJson {
        input:
            input_metadata_tsv = input_metadata_tsv,
            billing_project = billing_project,
            workspace_name = workspace_name,
            terra_table_names = terra_table_names,
            docker_name = docker_name,
            force_disparate_rows_to_string = force_disparate_rows_to_string
    }
}

task ValidateInputs {
    input {
        File? input_metadata_tsv
        String? billing_project
        String? workspace_name
        String? terra_table_names
        String docker_name
    }

    command <<<
        set -euo pipefail

        python3 <<CODE
tsv = "~{input_metadata_tsv}"
billing_project = "~{billing_project}"
workspace_name = "~{workspace_name}"
terra_table_names = "~{terra_table_names}"
terra_params = [billing_project, workspace_name, terra_table_names]
if tsv and any(terra_params):
    raise ValueError(
        "If the 'input_metadata_tsv' is provided, none of the terra parameters can also be provided. Please "
        "leave 'billing_project', 'workspace_name' and 'terra_table_names' all blank if providing a tsv as input."
    )
elif not tsv and not all(terra_params):
    raise ValueError(
        "If using the Terra workspace table as input, the 'billing_project', 'workspace_name' and "
        "'terra_table_names' must ALL be provided"
    )
if (tsv and not any(terra_params)) or (not tsv and all(terra_params)):
    print("Input parameters validated, continuing")
CODE
    >>>

    runtime {
        docker: docker_name
    }
}

task GenerateSchemaJson {
    input {
        File? input_metadata_tsv
        String? billing_project
        String? workspace_name
        String? terra_table_names
        String docker_name
        Boolean force_disparate_rows_to_string
    }

    command <<<
        if [ ! -z "~{input_metadata_tsv}" ]; then
            python /etc/terra_utils/python/generate_tdr_schema_json.py \
                --input_tsv ~{input_metadata_tsv} \
                ~{if force_disparate_rows_to_string then "--force_disparate_rows_to_string" else ""}
        else
            python /etc/terra_utils/python/generate_tdr_schema_json.py \
                --billing_project ~{billing_project} \
                --workspace_name ~{workspace_name} \
                --terra_table_names ~{terra_table_names} \
                ~{if force_disparate_rows_to_string then "--force_disparate_rows_to_string" else ""}
        fi
    >>>

    output {
        File tdr_schema_json = "schema.json"
    }

    runtime {
        docker: docker_name
    }
}
@@ -0,0 +1,26 @@ | ||
# WDL Input Overview | ||
|
||
This WDL accepts either a GCS path to a TSV file or a combination of a billing project and workspace to generate a TDR | ||
schema JSON based on the provided metadata. Note that this script does not interact with TDR. Its purpose is to | ||
demonstrate what the resulting schema would look like if the data were imported into TDR. | ||
|
||
## Notes | ||
* Please provide _either_ the `input_metadata_tsv` OR a combination of `billing_project`, `workspace_name`, and | ||
`terra_table_names`. | ||
|
||
## Inputs Table: | ||
|
||
| Input Name | Description | Type | Required | Default | | ||
|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|----------|---------------------------------------------------------------------------------------------| | ||
| **input_metadata_tsv** | A GSC path to the TSV file containing metadata (if not using a workspace as input). Must start with 'gs://' | String | No | N/A | | ||
| **billing_project** | The workspace billing project (if not using the TSV) | String | No | N/A | | ||
| **workspace_name** | The workspace name (if not using the TSV) | String | No | N/A | | ||
| **terra_table_names** | Comma separate list of Terra table names to generate JSONs for. Do not include spaces between entries (i.e. use the following format: "table1,table2") | String | No | N/A | | ||
| **force_disparate_rows_to_string** | If rows of a column are of different data types, setting this to True will force them all to be strings in the resulting TDR schema JSON. The same option will be available when importing data into TDR. | Boolean | Yes | N/A | | ||
| **docker** | Specifies a custom Docker image to use. Optional. | String | No | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest | | ||
|
||
|
||
## Outputs Table: | ||
| Output Name | Description | | ||
|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| | ||
| **tdr_schema_json** | The path to the GSC file containing the schema JSON. If multiple Terra tables were provided as input, all tables will be included in the same output JSON file. | |
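The inputs template added by this commit (below) covers the workspace-table mode. For the TSV mode, a hypothetical inputs JSON might look like the following; the bucket and file name are placeholders, not values from this commit:

{
  "GetTDRSchemaJson.input_metadata_tsv": "gs://example-bucket/metadata/sample.tsv",
  "GetTDRSchemaJson.force_disparate_rows_to_string": false
}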
@@ -0,0 +1,6 @@
{
  "GetTDRSchemaJson.billing_project": "String",
  "GetTDRSchemaJson.workspace_name": "String",
  "GetTDRSchemaJson.terra_table_names": "String",
  "GetTDRSchemaJson.force_disparate_rows_to_string": "Boolean"
}