│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/autogluon/cloud/predictor/clou │
│ d_predictor.py:273 in fit │
│ │
│ 270 │ │ if backend_kwargs is None: │
│ 271 │ │ │ backend_kwargs = {} │
│ 272 │ │ backend_kwargs = self.backend.parse_backend_fit_kwargs(backend_kwargs) │
│ ❱ 273 │ │ self.backend.fit( │
│ 274 │ │ │ predictor_init_args=predictor_init_args, │
│ 275 │ │ │ predictor_fit_args=predictor_fit_args, │
│ 276 │ │ │ image_column=image_column, │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/autogluon/cloud/backend/sagema │
│ ker_backend.py:337 in fit │
│ │
│ 334 │ │ if fit_kwargs is None: │
│ 335 │ │ │ fit_kwargs = {} │
│ 336 │ │ │
│ ❱ 337 │ │ self._fit_job.run( │
│ 338 │ │ │ role=self.role_arn, │
│ 339 │ │ │ entry_point=entry_point, │
│ 340 │ │ │ region=self._region, │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/autogluon/cloud/job/sagemaker_ │
│ job.py:206 in run │
│ │
│ 203 │ │ │ self._output_path = sagemaker_estimator.output_path + "/" + latest_training_ │
│ 204 │ │ except Exception as e: │
│ 205 │ │ │ logger.error(f"Training failed. Please check sagemaker console training jobs │
│ ❱ 206 │ │ │ raise e │
│ 207 │
│ 208 │
│ 209 class SageMakerBatchTransformationJob(SageMakerJob): │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/autogluon/cloud/job/sagemaker_ │
│ job.py:193 in run │
│ │
│ 190 │ │ ) │
│ 191 │ │ logger.log(20, f"Start sagemaker training job `{job_name}`") │
│ 192 │ │ try: │
│ ❱ 193 │ │ │ sagemaker_estimator.fit(inputs=inputs, wait=wait, job_name=job_name, **kwarg │
│ 194 │ │ │ self._job_name = job_name │
│ 195 │ │ │ self._framework_version = framework_version │
│ 196 │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/sagemaker/workflow/pipeline_co │
│ ntext.py:346 in wrapper │
│ │
│ 343 │ │ │ │
│ 344 │ │ │ return _StepArguments(retrieve_caller_name(self_instance), run_func, *args, │
│ 345 │ │ │
│ ❱ 346 │ │ return run_func(*args, **kwargs) │
│ 347 │ │
│ 348 │ return wrapper │
│ 349 │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/sagemaker/estimator.py:1349 in │
│ fit │
│ │
│ 1346 │ │ self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_confi │
│ 1347 │ │ self.jobs.append(self.latest_training_job) │
│ 1348 │ │ if wait: │
│ ❱ 1349 │ │ │ self.latest_training_job.wait(logs=logs) │
│ 1350 │ │
│ 1351 │ def _compilation_job_name(self): │
│ 1352 │ │ """Placeholder docstring""" │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/sagemaker/estimator.py:2710 in │
│ wait │
│ │
│ 2707 │ │ │ logs = log_string_map[logs] │
│ 2708 │ │ # If logs are requested, call logs_for_jobs. │
│ 2709 │ │ if logs != "None": │
│ ❱ 2710 │ │ │ self.sagemaker_session.logs_for_job(self.job_name, wait=True, log_type=logs) │
│ 2711 │ │ else: │
│ 2712 │ │ │ self.sagemaker_session.wait_for_job(self.job_name) │
│ 2713 │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/sagemaker/session.py:5853 in │
│ logs_for_job │
│ │
│ 5850 │ │ │ exceptions.CapacityError: If the training job fails with CapacityError. │
│ 5851 │ │ │ exceptions.UnexpectedStatusException: If waiting and the training job fails. │
│ 5852 │ │ """ │
│ ❱ 5853 │ │ _logs_for_job(self, job_name, wait, poll, log_type, timeout) │
│ 5854 │ │
│ 5855 │ def logs_for_processing_job(self, job_name, wait=False, poll=10): │
│ 5856 │ │ """Display logs for a given processing job, optionally tailing them until the is │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/sagemaker/session.py:8386 in │
│ _logs_for_job │
│ │
│ 8383 │ last_profiler_rule_statuses = None │
│ 8384 │ │
│ 8385 │ while True: │
│ ❱ 8386 │ │ _flush_log_streams( │
│ 8387 │ │ │ stream_names, │
│ 8388 │ │ │ instance_count, │
│ 8389 │ │ │ client, │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/sagemaker/session.py:8553 in │
│ _flush_log_streams │
│ │
│ 8550 │ │ # Log streams are created whenever a container starts writing to stdout/err, so │
│ 8551 │ │ # may be dynamic until we have a stream for every instance. │
│ 8552 │ │ try: │
│ ❱ 8553 │ │ │ streams = client.describe_log_streams( │
│ 8554 │ │ │ │ logGroupName=log_group, │
│ 8555 │ │ │ │ logStreamNamePrefix=job_name + "/", │
│ 8556 │ │ │ │ orderBy="LogStreamName", │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/botocore/client.py:565 in │
│ _api_call │
│ │
│ 562 │ │ │ │ │ f"{py_operation_name}() only accepts keyword arguments." │
│ 563 │ │ │ │ ) │
│ 564 │ │ │ # The "self" in this scope is referring to the BaseClient. │
│ ❱ 565 │ │ │ return self._make_api_call(operation_name, kwargs) │
│ 566 │ │ │
│ 567 │ │ _api_call.__name__ = str(py_operation_name) │
│ 568 │
│ │
│ /home/ec2-user/mambaforge/envs/py310/lib/python3.10/site-packages/botocore/client.py:1021 in │
│ _make_api_call │
│ │
│ 1018 │ │ │ │ "Code" │
│ 1019 │ │ │ ) │
│ 1020 │ │ │ error_class = self.exceptions.from_code(error_code) │
│ ❱ 1021 │ │ │ raise error_class(parsed_response, operation_name) │
│ 1022 │ │ else: │
│ 1023 │ │ │ return parsed_response │
│ 1024 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ClientError: An error occurred (ExpiredTokenException) when calling the DescribeLogStreams operation: The security
token included in the request is expired
response = boto3.Session().client("sts").assume_role(
RoleArn = ROLE_NAME,
RoleSessionName = 'AutoGluonCloudSession',
DurationSeconds = 99999999999999)
boto3.setup_default_session(
aws_access_key_id = response['Credentials']['AccessKeyId'],
aws_secret_access_key = response['Credentials']['SecretAccessKey'],
aws_session_token = response['Credentials']['SessionToken'])
which is 12 hours. My model training runs usually take longer than 12 hours.
I want to run `TabularCloudPredictor` with `wait=True`, but I am getting the error shown above. I was hoping I could just extend the role duration with `DurationSeconds`, but according to this:
https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html
the maximum allowed `DurationSeconds` for an assumed role is 43,200 seconds, which is 12 hours. My model training runs usually take longer than 12 hours.
Has anyone found an elegant solution for this?