Intelligent defaults for Model Trainer (#1586)

nargokul · pintaoz-aws · commit 784a18f99ed9 · 2024-12-04T01:45:28.000-08:00
* Intelligent defaults for Model Trainer

* Codestyle Fixes

* Unit test fixes

* New unit tests

* Codestyle fixes

* Codestyle fixes

* Parity support with Estimator

* Refactor

* CodeStyle fixes

* Update to use self.sagemaker_session instead

* Codestyle checks

* Fix notebooks
diff --git a/src/sagemaker/config/config_schema.py b/src/sagemaker/config/config_schema.py
@@ -116,6 +116,7 @@
 REGION_NAME = "region_name"
 TELEMETRY_OPT_OUT = "TelemetryOptOut"
 NOTEBOOK_JOB = "NotebookJob"
+MODEL_TRAINER = "ModelTrainer"
 
 
 def _simple_path(*args: str):
@@ -142,6 +143,7 @@ def _simple_path(*args: str):
 )
 TRAINING_JOB_ROLE_ARN_PATH = _simple_path(SAGEMAKER, TRAINING_JOB, ROLE_ARN)
 TRAINING_JOB_VPC_CONFIG_PATH = _simple_path(SAGEMAKER, TRAINING_JOB, VPC_CONFIG)
+TRAINING_JOB_TAGS_PATH = _simple_path(SAGEMAKER, TRAINING_JOB, TAGS)
 TRAINING_JOB_SECURITY_GROUP_IDS_PATH = _simple_path(
     TRAINING_JOB_VPC_CONFIG_PATH, SECURITY_GROUP_IDS
 )
@@ -656,6 +658,64 @@ def _simple_path(*args: str):
             "minItems": 1,
             "maxItems": 15,
         },
+        "role": {
+            TYPE: "string",
+            "pattern": r"^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$",
+            "minLength": 20,
+            "maxLength": 2048,
+        },
+        "baseJobName": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "sourceCode": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "distributed_runner": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "compute": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "networking": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "stoppingCondition": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "trainingImage": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "trainingImageConfig": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "algorithmName": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "outputDataConfig": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "trainingInputMode": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "environment": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
+        "hyperparameters": {
+            TYPE: OBJECT,
+            ADDITIONAL_PROPERTIES: True
+        },
     },
     PROPERTIES: {
         SCHEMA_VERSION: {
@@ -709,6 +769,10 @@ def _simple_path(*args: str):
                                         },
                                     },
                                 },
+                                MODEL_TRAINER: {
+                                    TYPE: OBJECT,
+                                    ADDITIONAL_PROPERTIES: True
+                                },
                                 ESTIMATOR: {
                                     TYPE: OBJECT,
                                     ADDITIONAL_PROPERTIES: False,
diff --git a/src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb b/src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb
@@ -10,8 +10,8 @@
    ]
   },
   {
-   "cell_type": "markdown",
    "metadata": {},
+   "cell_type": "markdown",
    "source": [
     "# ModelTrainer\n",
     "The ModelTrainer is a new interface for training designed to tackle many of the challenges that exist in todays Estimator class. Some key features include:\n",
diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py
@@ -19,13 +19,31 @@
 import shutil
 from tempfile import TemporaryDirectory
 
-from typing import Optional, List, Union, Dict, Any
+from typing import Optional, List, Union, Dict, Any, ClassVar
+
+from graphene.utils.str_converters import to_camel_case, to_snake_case
+
 from sagemaker_core.main import resources
 from sagemaker_core.resources import TrainingJob
 from sagemaker_core.shapes import AlgorithmSpecification
 
 from pydantic import BaseModel, ConfigDict, PrivateAttr, validate_call
 
+from sagemaker.config.config_schema import (_simple_path, SAGEMAKER,
+                                            MODEL_TRAINER, MODULES,
+                                            PYTHON_SDK,
+                                            TRAINING_JOB_ENVIRONMENT_PATH,
+                                            TRAINING_JOB_ENABLE_NETWORK_ISOLATION_PATH,
+                                            TRAINING_JOB_VPC_CONFIG_PATH,
+                                            TRAINING_JOB_SUBNETS_PATH,
+                                            TRAINING_JOB_SECURITY_GROUP_IDS_PATH,
+                                            TRAINING_JOB_OUTPUT_DATA_CONFIG_PATH,
+                                            TRAINING_JOB_PROFILE_CONFIG_PATH,
+                                            TRAINING_JOB_RESOURCE_CONFIG_PATH,
+                                            TRAINING_JOB_ROLE_ARN_PATH,
+                                            TRAINING_JOB_TAGS_PATH)
+
+from sagemaker.utils import resolve_value_from_config
 from sagemaker.modules import Session, get_execution_role
 from sagemaker.modules.configs import (
     Compute,
@@ -204,6 +222,125 @@ class ModelTrainer(BaseModel):
     tags: Optional[List[Tag]] = None
     local_container_root: Optional[str] = os.getcwd()
 
+    # Attributes that may be filled in from the ModelTrainer section of the SageMaker
+    # config when they are still None on the instance. Names are snake_case here; the
+    # lookup code converts each one to camelCase to build the config path.
+    CONFIGURABLE_ATTRIBUTES: ClassVar[List[str]] = [
+        "role",
+        "base_job_name",
+        "source_code",
+        "distributed_runner",
+        "compute",
+        "networking",
+        "stopping_condition",
+        "training_image",
+        "training_image_config",
+        "algorithm_name",
+        "output_data_config",
+        "checkpoint_config",
+        "training_input_mode",
+        "environment",
+        "hyperparameters",
+    ]
+
+    # Maps a configurable attribute to the class that is called with the raw config
+    # dict (``cls(**config)``) to build the attribute's value. The values must be the
+    # classes themselves: ``type(SomeClass)`` evaluates to that class's metaclass, not
+    # to ``SomeClass``, so it cannot construct the config object.
+    SERIALIZABLE_CONFIG_ATTRIBUTES: ClassVar[Any] = {
+        "source_code": SourceCode,
+        "distributed_runner": DistributedRunner,
+        "compute": Compute,
+        "networking": Networking,
+        "stopping_condition": StoppingCondition,
+        "training_image_config": TrainingImageConfig,
+        "output_data_config": OutputDataConfig,
+        "checkpoint_config": CheckpointConfig,
+    }
+
+    def _populate_intelligent_defaults(self):
+        """Populate unset attributes from every supported default-config space.
+
+        The Model Trainer space is applied first and the generic Training Job space
+        second. Each step only fills attributes that are still unset, so Model Trainer
+        specific configs take precedence over the generic training job ones.
+        """
+        populate_steps = (
+            self._populate_intelligent_defaults_from_model_trainer_space,
+            self._populate_intelligent_defaults_from_training_job_space,
+        )
+        for populate in populate_steps:
+            populate()
+
+    def _populate_intelligent_defaults_from_training_job_space(self):
+        """Fill unset attributes from the generic Training Job section of the config.
+
+        Only attributes that are still unset are populated (falsy for the top-level
+        attributes, ``None`` for the individual networking fields), so values that
+        are already present on the instance are not overwritten. Every lookup passes
+        ``self.sagemaker_session`` to ``resolve_value_from_config``.
+        """
+
+        def _from_config(config_path):
+            # Single place that binds the session to each config lookup so that no
+            # call site can forget to pass it.
+            return resolve_value_from_config(
+                config_path=config_path,
+                sagemaker_session=self.sagemaker_session,
+            )
+
+        if not self.environment:
+            self.environment = _from_config(TRAINING_JOB_ENVIRONMENT_PATH)
+
+        default_enable_network_isolation = _from_config(
+            TRAINING_JOB_ENABLE_NETWORK_ISOLATION_PATH
+        )
+        default_vpc_config = _from_config(TRAINING_JOB_VPC_CONFIG_PATH)
+
+        if not self.networking:
+            # Only build a Networking object when the config actually provides
+            # something network related.
+            if (
+                default_enable_network_isolation is not None
+                or default_vpc_config is not None
+            ):
+                self.networking = Networking(
+                    enable_network_isolation=default_enable_network_isolation,
+                    subnets=_from_config(TRAINING_JOB_SUBNETS_PATH),
+                    security_group_ids=_from_config(TRAINING_JOB_SECURITY_GROUP_IDS_PATH),
+                )
+        else:
+            # Networking was supplied: fill in only the individual fields left as None.
+            if self.networking.enable_network_isolation is None:
+                self.networking.enable_network_isolation = default_enable_network_isolation
+            if self.networking.subnets is None:
+                self.networking.subnets = _from_config(TRAINING_JOB_SUBNETS_PATH)
+            if self.networking.security_group_ids is None:
+                self.networking.security_group_ids = _from_config(
+                    TRAINING_JOB_SECURITY_GROUP_IDS_PATH
+                )
+
+        if not self.output_data_config:
+            default_output_data_config = _from_config(TRAINING_JOB_OUTPUT_DATA_CONFIG_PATH)
+            if default_output_data_config:
+                self.output_data_config = OutputDataConfig(
+                    **self._convert_keys_to_snake(default_output_data_config)
+                )
+
+        if not self._profiler_config:
+            default_profiler_config = _from_config(TRAINING_JOB_PROFILE_CONFIG_PATH)
+            if default_profiler_config:
+                self._profiler_config = ProfilerConfig(
+                    **self._convert_keys_to_snake(default_profiler_config)
+                )
+
+        if not self.compute:
+            default_resource_config = _from_config(TRAINING_JOB_RESOURCE_CONFIG_PATH)
+            if default_resource_config:
+                self.compute = Compute(**self._convert_keys_to_snake(default_resource_config))
+
+        if not self.role:
+            self.role = _from_config(TRAINING_JOB_ROLE_ARN_PATH)
+
+        if not self.tags:
+            self.tags = _from_config(TRAINING_JOB_TAGS_PATH)
+
+    def _convert_keys_to_snake(self, config: dict) -> dict:
+        """Return a new dict with the top-level keys of ``config`` in snake case.
+
+        Values are carried over unchanged; nested dictionaries are not descended into.
+        """
+        snake_cased = {}
+        for original_key, item in config.items():
+            snake_cased[to_snake_case(original_key)] = item
+        return snake_cased
+
+    def _populate_intelligent_defaults_from_model_trainer_space(self):
+        """Fill attributes that are still ``None`` from the Model Trainer config space.
+
+        For every name in ``CONFIGURABLE_ATTRIBUTES`` the camelCase form of the name is
+        looked up under the SageMaker / PythonSDK / Modules / ModelTrainer config path.
+        A resolved value is wrapped in the class registered for that attribute in
+        ``SERIALIZABLE_CONFIG_ATTRIBUTES`` (if any) before it is assigned.
+        """
+        builders = self.SERIALIZABLE_CONFIG_ATTRIBUTES
+        for attribute_name in self.CONFIGURABLE_ATTRIBUTES:
+            # An attribute that already holds a value is left alone.
+            if getattr(self, attribute_name) is not None:
+                continue
+
+            config_path = _simple_path(
+                SAGEMAKER,
+                PYTHON_SDK,
+                MODULES,
+                MODEL_TRAINER,
+                to_camel_case(attribute_name),
+            )
+            resolved = resolve_value_from_config(
+                config_path=config_path,
+                sagemaker_session=self.sagemaker_session,
+            )
+            if resolved is None:
+                continue
+
+            if attribute_name in builders:
+                # The raw config value is expanded as keyword arguments for the builder.
+                resolved = builders.get(attribute_name)(**resolved)
+            setattr(self, attribute_name, resolved)
+
     # Created Artifacts
     _latest_training_job: Optional[resources.TrainingJob] = PrivateAttr(default=None)
 
@@ -374,6 +511,7 @@ def train(
                 Whether to display the training container logs while training.
                 Defaults to True.
         """
+        self._populate_intelligent_defaults()
         if input_data_config:
             self.input_data_config = input_data_config
 
@@ -745,7 +883,8 @@ def with_debugger_settings(
             debug_hook_config (Optional[DebugHookConfig]):
                 Configuration information for the Amazon SageMaker Debugger hook parameters,
                 metric and tensor collections, and storage paths.
-                To learn more see: https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html
+                To learn more see:
+                 https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html
             debug_rule_configurations (Optional[List[DebugRuleConfiguration]]):
                 Configuration information for Amazon SageMaker Debugger rules for debugging
                 output ensors.
diff --git a/tests/unit/sagemaker/modules/train/test_model_trainer.py b/tests/unit/sagemaker/modules/train/test_model_trainer.py

Original file line number	Diff line number	Diff line change
`@@ -10,8 +10,8 @@`
`10`	`10`	`]`
`11`	`11`	`},`
`12`	`12`	`{`
`13`		`- "cell_type": "markdown",`
`14`	`13`	`"metadata": {},`
	`14`	`+ "cell_type": "markdown",`
`15`	`15`	`"source": [`
`16`	`16`	`"# ModelTrainer\n",`
`17`	`17`	`"The ModelTrainer is a new interface for training designed to tackle many of the challenges that exist in todays Estimator class. Some key features include:\n",`