Commit 3d82b4e (2 parents: 04c690f + b01096b)

Merge pull request #207 from broadinstitute/sn_POD-2450_create_script_to_upload_files

Adding script to upload TSV and files to Terra

File tree

6 files changed: +268 −0 lines


.dockstore.yml (+7)

@@ -139,3 +139,10 @@ workflows:
     readMePath: /wdl/DeleteTdrRows/README.md
     testParameterFiles:
       - /wdl/DeleteTdrRows/DeleteTdrRows.wdl
+
+  - name: UploadMetricsAndGcpFilesToTerra
+    subclass: WDL
+    primaryDescriptorPath: /wdl/UploadMetricsAndGcpFilesToTerra/UploadMetricsAndGcpFilesToTerra.wdl
+    readMePath: /wdl/UploadMetricsAndGcpFilesToTerra/README.md
+    testParameterFiles:
+      - /wdl/UploadMetricsAndGcpFilesToTerra/UploadMetricsAndGcpFilesToTerra.wdl
python/upload_metrics_and_files_to_terra.py (+172)

@@ -0,0 +1,172 @@

import logging
import os
from argparse import ArgumentParser, Namespace

from utils.requests_utils.request_util import RunRequest
from utils.token_util import Token
from utils import GCP, ARG_DEFAULTS, comma_separated_list
from utils.gcp_utils import GCPCloudFunctions
from utils.terra_utils.terra_util import TerraWorkspace
from utils.csv_util import Csv

logging.basicConfig(
    format="%(levelname)s: %(asctime)s : %(message)s", level=logging.INFO
)


def get_args() -> Namespace:
    parser = ArgumentParser()
    parser.add_argument("--workspace_name", "-w", type=str, required=True, help="Terra workspace name")
    parser.add_argument("--billing_project", "-b", type=str, required=True, help="Billing project name")
    parser.add_argument("--metrics_tsv", "-m", type=str, required=True, help="Path to the metrics TSV file")
    parser.add_argument("--skip_upload_column", "-s", type=comma_separated_list,
                        help="Column name(s) to skip uploading. Use comma-separated values for multiple columns. Optional")
    parser.add_argument("--flatten_path", "-f", action="store_true",
                        help="Flatten all file paths and put all files in one directory. Optional")
    parser.add_argument("--subdir", "-d", type=str, help="Subdirectory to upload files to. Optional")
    parser.add_argument("--id_column", "-i", type=str, required=True, help="Column name for the id column")
    return parser.parse_args()


class ConvertContents:
    def __init__(
            self,
            contents: list[dict],
            id_column: str,
            bucket_name: str,
            flatten_path: bool,
            subdir: str,
            skip_upload_column: list[str],
    ):
        self.contents = contents
        self.id_column = id_column
        self.flatten_path = flatten_path
        self.skip_upload_column = skip_upload_column
        self.new_bucket_path = f"gs://{bucket_name}" if not subdir else f"gs://{bucket_name}/{subdir}"
        self.files_to_copy: list[dict] = []
        self.headers: set = set()

    def _get_file_copy_dict(self, file_path: str) -> dict:
        if self.flatten_path:
            file_name = os.path.basename(file_path)
            new_path = f"{self.new_bucket_path}/{file_name}"
        else:
            path_without_bucket = '/'.join(file_path.split("/")[3:])
            new_path = f"{self.new_bucket_path}/{path_without_bucket}"
        return {"source_file": file_path, "full_destination_path": new_path}

    @staticmethod
    def _check_paths_unique(file_destinations: list[str]) -> bool:
        seen = set()
        duplicates = set()
        for file_path in file_destinations:
            if file_path in seen:
                duplicates.add(file_path)
            seen.add(file_path)
        if duplicates:
            logging.error(f"Duplicate destination file paths found; files will overwrite each other: {duplicates}")
            return False
        return True

    def _update_file_paths(self, cell_value: str) -> str:
        if cell_value.startswith("gs://"):
            copy_dict = self._get_file_copy_dict(cell_value)
            self.files_to_copy.append(copy_dict)
            return copy_dict["full_destination_path"]
        return cell_value

    def _validate_results(self) -> None:
        dest_file_paths = [copy_dict["full_destination_path"] for copy_dict in self.files_to_copy]
        if not self._check_paths_unique(dest_file_paths):
            raise ValueError("Duplicate destination file paths found; files will overwrite each other.")
        if f"entity:{self.id_column}" not in self.headers:
            raise ValueError(f"ID column {self.id_column} not found in TSV file.")

    def run(self) -> tuple[list[dict], list[dict], set]:
        new_tsv_contents = []
        for row in self.contents:
            new_row = {}
            for header, value in row.items():
                # Prefix the id column with entity: as Terra expects
                if header == self.id_column:
                    header = f"entity:{header}"
                # Build up the set of all headers
                self.headers.add(header)
                if self.skip_upload_column and header in self.skip_upload_column:
                    # Leave skip-upload columns as-is
                    new_row[header] = value
                # If the value is a list, check each entry in the list
                elif isinstance(value, list):
                    new_row[header] = [self._update_file_paths(entry) for entry in value]
                else:
                    new_row[header] = self._update_file_paths(value)
            new_tsv_contents.append(new_row)
        self._validate_results()
        return new_tsv_contents, self.files_to_copy, self.headers


class UploadContentsToTerra:
    NEW_TSV = "updated_metrics.tsv"

    def __init__(self, terra_workspace: TerraWorkspace, contents: list[dict], id_column: str, headers: set):
        self.terra_workspace = terra_workspace
        self.contents = contents
        self.id_column = f"entity:{id_column}"
        self.headers = headers

    def run(self) -> None:
        # Put the entity:id column first, followed by the remaining headers
        header_list = [self.id_column] + [header for header in self.headers if header != self.id_column]
        Csv(file_path=self.NEW_TSV).create_tsv_from_list_of_dicts(
            list_of_dicts=self.contents,
            header_list=header_list
        )
        logging.info(f"Uploading {self.NEW_TSV} to Terra")
        self.terra_workspace.upload_metadata_to_workspace_table(self.NEW_TSV)


if __name__ == '__main__':
    args = get_args()
    billing_project, workspace_name = args.billing_project, args.workspace_name
    metrics_tsv, skip_upload_column = args.metrics_tsv, args.skip_upload_column
    flatten_path, subdir = args.flatten_path, args.subdir

    token = Token(cloud=GCP)
    request_util = RunRequest(token=token)
    # Create a Terra workspace object that uses request_util for its API calls
    terra_workspace = TerraWorkspace(
        billing_project=billing_project,
        workspace_name=workspace_name,
        request_util=request_util
    )

    workspace_bucket = terra_workspace.get_workspace_bucket()
    # Read in the TSV file
    metrics_tsv_contents = Csv(file_path=metrics_tsv).create_list_of_dicts_from_tsv()

    converted_contents, files_to_copy, headers = ConvertContents(
        contents=metrics_tsv_contents,
        id_column=args.id_column,
        bucket_name=workspace_bucket,
        flatten_path=flatten_path,
        subdir=subdir,
        skip_upload_column=skip_upload_column
    ).run()

    logging.info(f"Copying {len(files_to_copy)} files to {workspace_bucket}")
    # Copy files to their new locations in the workspace bucket
    GCPCloudFunctions().multithread_copy_of_files_with_validation(
        files_to_copy=files_to_copy,
        workers=ARG_DEFAULTS['multithread_workers'],
        max_retries=5
    )

    UploadContentsToTerra(
        terra_workspace=terra_workspace,
        contents=converted_contents,
        id_column=args.id_column,
        headers=headers
    ).run()
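
To make the destination-path logic in _get_file_copy_dict concrete, here is a minimal standalone sketch; map_destination is an illustrative helper (not part of the script) and all bucket names and paths are hypothetical:

import os

def map_destination(file_path: str, new_bucket_path: str, flatten_path: bool) -> str:
    # Mirrors _get_file_copy_dict: either flatten to the basename, or keep
    # the object path after stripping "gs://<source-bucket>/"
    if flatten_path:
        return f"{new_bucket_path}/{os.path.basename(file_path)}"
    return f"{new_bucket_path}/" + "/".join(file_path.split("/")[3:])

src = "gs://source-bucket/run1/sample1/metrics.txt"
print(map_destination(src, "gs://workspace-bucket/subdir", flatten_path=True))
# -> gs://workspace-bucket/subdir/metrics.txt
print(map_destination(src, "gs://workspace-bucket/subdir", flatten_path=False))
# -> gs://workspace-bucket/subdir/run1/sample1/metrics.txt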

python/utils/gcp_utils.py (+1)

@@ -382,6 +382,7 @@ def multithread_copy_of_files_with_validation(

         Args:
             files_to_copy (list[dict]): List of dictionaries containing source and destination file paths.
+                Each dict should have the keys "source_file" and "full_destination_path".
             workers (int): Number of worker threads.
             max_retries (int): Maximum number of retries.
         """
wdl/UploadMetricsAndGcpFilesToTerra/README.md (+21)

@@ -0,0 +1,21 @@

# WDL Input Overview

This workflow copies metadata and files from a TSV into a Terra workspace. All GCP files linked from the TSV are copied to the workspace bucket, and TSV entries that contained a file path are updated to point to the new file locations.

## Prerequisites
* Make sure that your proxy service account has access to the source files in the TSV.

## Inputs Table:
| Input Name              | Description | Type | Required | Default |
|-------------------------|-------------|------|----------|---------|
| **billing_project**     | The Terra billing project. | String | Yes | N/A |
| **workspace_name**      | The GCP workspace to ingest the metadata and files into. | String | Yes | N/A |
| **metrics_tsv**         | Source TSV. | File | Yes | N/A |
| **flatten_path**        | Use if you want all paths to end up in the same directory. If not used, each source file's current path is kept (with the bucket updated). | Boolean | Yes | N/A |
| **id_column**           | Column to be used as the primary key in the Terra table. Must be present in the input TSV. | String | Yes | N/A |
| **skip_upload_columns** | Comma-separated list (no spaces) of columns you do not want files copied from. Only useful if there are columns WITH file paths that you do NOT want copied. | String | No | N/A |
| **subdir**              | Subdirectory to put files into in the new bucket. If flatten_path is used, all files will be placed directly in this directory. If flatten_path is not used, the path structure stays intact, but all paths will start with `gs://{bucket}/{subdir}/`. | String | No | N/A |
| **docker**              | Specifies a custom Docker image to use. Optional. | String | No | us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest |

## Outputs Table:
This workflow does not generate any outputs directly. However, logs will be provided to track the progress of the file ingestion and metadata transfer. These logs include ingestion status, any errors encountered, and retries if necessary. Review the stderr file for detailed information.
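
As a worked example (all sample names and paths hypothetical): given this input TSV row, run with id_column=sample_id and no flatten_path or subdir:

sample_id	cram_path	mean_coverage
sample1	gs://source-bucket/crams/sample1.cram	30.2

the CRAM is copied to gs://{workspace-bucket}/crams/sample1.cram, and the row uploaded to the Terra table becomes:

entity:sample_id	cram_path	mean_coverage
sample1	gs://{workspace-bucket}/crams/sample1.cram	30.2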
wdl/UploadMetricsAndGcpFilesToTerra/UploadMetricsAndGcpFilesToTerra.wdl (+57)

@@ -0,0 +1,57 @@

version 1.0

workflow UploadMetricsAndGcpFilesToTerra {
    input {
        String billing_project
        String workspace_name
        File metrics_tsv
        String? skip_upload_columns
        Boolean flatten_path
        String? subdir
        String id_column
        String? docker
    }

    String docker_image = select_first([docker, "us-central1-docker.pkg.dev/operations-portal-427515/ops-toolbox/ops_terra_utils_slim:latest"])

    call UploadMetricsAndFilesToTerra {
        input:
            billing_project = billing_project,
            workspace_name = workspace_name,
            metrics_tsv = metrics_tsv,
            skip_upload_columns = skip_upload_columns,
            flatten_path = flatten_path,
            subdir = subdir,
            id_column = id_column,
            docker_image = docker_image
    }
}

task UploadMetricsAndFilesToTerra {
    input {
        String billing_project
        String workspace_name
        File metrics_tsv
        String? skip_upload_columns
        Boolean flatten_path
        String? subdir
        String id_column
        String docker_image
    }

    command <<<
        python /etc/terra_utils/python/upload_metrics_and_files_to_terra.py \
            --billing_project ~{billing_project} \
            --workspace_name "~{workspace_name}" \
            --metrics_tsv ~{metrics_tsv} \
            --id_column ~{id_column} \
            ~{if flatten_path then "--flatten_path" else ""} \
            ~{"--skip_upload_column " + skip_upload_columns} \
            ~{"--subdir " + subdir}
    >>>

    runtime {
        docker: docker_image
    }
}
@@ -0,0 +1,10 @@

{
  "UploadMetricsAndGcpFilesToTerra.billing_project": "String",
  "UploadMetricsAndGcpFilesToTerra.workspace_name": "String",
  "UploadMetricsAndGcpFilesToTerra.metrics_tsv": "String",
  "UploadMetricsAndGcpFilesToTerra.skip_upload_columns": "String",
  "UploadMetricsAndGcpFilesToTerra.flatten_path": "Boolean",
  "UploadMetricsAndGcpFilesToTerra.subdir": "String",
  "UploadMetricsAndGcpFilesToTerra.id_column": "String",
  "UploadMetricsAndGcpFilesToTerra.docker": "String"
}
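
For reference, a filled-in inputs file following this template might look like the following (all values hypothetical; optional keys may be omitted):

{
  "UploadMetricsAndGcpFilesToTerra.billing_project": "my-billing-project",
  "UploadMetricsAndGcpFilesToTerra.workspace_name": "my-workspace",
  "UploadMetricsAndGcpFilesToTerra.metrics_tsv": "gs://source-bucket/metrics.tsv",
  "UploadMetricsAndGcpFilesToTerra.flatten_path": false,
  "UploadMetricsAndGcpFilesToTerra.id_column": "sample_id",
  "UploadMetricsAndGcpFilesToTerra.subdir": "metrics_upload"
}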
