diff --git a/src/sagemaker/workflow/steps.py b/src/sagemaker/workflow/steps.py index f49e457bc6..dbc37371db 100644 --- a/src/sagemaker/workflow/steps.py +++ b/src/sagemaker/workflow/steps.py @@ -18,7 +18,6 @@ from enum import Enum from typing import Dict, List, Set, Union, Optional, Any, TYPE_CHECKING -from urllib.parse import urlparse import attr @@ -465,6 +464,7 @@ def __init__( self.step_args = step_args self.estimator = estimator self.inputs = inputs + self.job_name = None self._properties = Properties( step_name=name, step=self, shape_name="DescribeTrainingJobResponse" @@ -493,19 +493,6 @@ def __init__( DeprecationWarning, ) - self.job_name = None - if estimator and (estimator.source_dir or estimator.entry_point): - # By default, `Estimator` will upload the local code to an S3 path - # containing a timestamp. This causes cache misses whenever a - # pipeline is updated, even if the underlying script hasn't changed. - # To avoid this, hash the contents of the training script and include it - # in the `job_name` passed to the `Estimator`, which will be used - # instead of the timestamped path. - if not is_pipeline_variable(estimator.source_dir) and not is_pipeline_variable( - estimator.entry_point - ): - self.job_name = self._generate_code_upload_path() - @property def arguments(self) -> RequestType: """The arguments dictionary that is used to call `create_training_job`. 
@@ -554,26 +541,6 @@ def to_request(self) -> RequestType: return request_dict - def _generate_code_upload_path(self) -> str or None: - """Generate an upload path for local training scripts based on their content.""" - from sagemaker.workflow.utilities import hash_files_or_dirs - - if self.estimator.source_dir: - source_dir_url = urlparse(self.estimator.source_dir) - if source_dir_url.scheme == "" or source_dir_url.scheme == "file": - code_hash = hash_files_or_dirs( - [self.estimator.source_dir] + self.estimator.dependencies - ) - return f"{self.name}-{code_hash}"[:1024] - elif self.estimator.entry_point: - entry_point_url = urlparse(self.estimator.entry_point) - if entry_point_url.scheme == "" or entry_point_url.scheme == "file": - code_hash = hash_files_or_dirs( - [self.estimator.entry_point] + self.estimator.dependencies - ) - return f"{self.name}-{code_hash}"[:1024] - return None - class CreateModelStep(ConfigurableRetryStep): """`CreateModelStep` for SageMaker Pipelines Workflows.""" @@ -895,16 +862,6 @@ def __init__( "code argument has to be a valid S3 URI or local file path " + "rather than a pipeline variable" ) - code_url = urlparse(code) - if code_url.scheme == "" or code_url.scheme == "file": - # By default, `Processor` will upload the local code to an S3 path - # containing a timestamp. This causes cache misses whenever a - # pipeline is updated, even if the underlying script hasn't changed. - # To avoid this, hash the contents of the script and include it - # in the `job_name` passed to the `Processor`, which will be used - # instead of the timestamped path. - self.job_name = self._generate_code_upload_path() - warnings.warn( ( 'We are deprecating the instantiation of ProcessingStep using "processor".' 
diff --git a/tests/unit/sagemaker/workflow/test_steps.py b/tests/unit/sagemaker/workflow/test_steps.py index 248fee6532..84906ce620 100644 --- a/tests/unit/sagemaker/workflow/test_steps.py +++ b/tests/unit/sagemaker/workflow/test_steps.py @@ -671,7 +671,7 @@ def test_processing_step_normalizes_args_with_local_code(mock_normalize_args, sc mock_normalize_args.return_value = [step.inputs, step.outputs] step.to_request() mock_normalize_args.assert_called_with( - job_name="MyProcessingStep-a22fc59b38f13da26f6a40b18687ba598cf669f74104b793cefd9c63eddf4ac7", + job_name=None, arguments=step.job_arguments, inputs=step.inputs, outputs=step.outputs, diff --git a/tests/unit/sagemaker/workflow/test_utils.py b/tests/unit/sagemaker/workflow/test_utils.py index e16293a1c5..b18ed71f9b 100644 --- a/tests/unit/sagemaker/workflow/test_utils.py +++ b/tests/unit/sagemaker/workflow/test_utils.py @@ -80,11 +80,11 @@ def test_repack_model_step(estimator): assert hyperparameters["inference_script"] == '"dummy_script.py"' assert hyperparameters["model_archive"] == '"s3://my-bucket/model.tar.gz"' assert hyperparameters["sagemaker_program"] == f'"{REPACK_SCRIPT_LAUNCHER}"' - assert ( - hyperparameters["sagemaker_submit_directory"] - == '"s3://my-bucket/MyRepackModelStep-717d7bdd388168c27e9ad2938ff0314e35be50b3157cf2498688c7525ea27e1e\ -/source/sourcedir.tar.gz"' - ) + + # ex: "s3://my-bucket/sagemaker-scikit-learn-2025-04-07-20-39-38-854/source/sourcedir.tar.gz" + sagemaker_submit_directory = hyperparameters["sagemaker_submit_directory"] + assert sagemaker_submit_directory.startswith('"s3://my-bucket/sagemaker-scikit-learn-') + assert sagemaker_submit_directory.endswith('/source/sourcedir.tar.gz"') del request_dict["Arguments"]["HyperParameters"] del request_dict["Arguments"]["AlgorithmSpecification"]["TrainingImage"]