Creating tmpptzb0bfs_algo-1-g4d94_1 ... done
Attaching to tmpptzb0bfs_algo-1-g4d94_1
algo-1-g4d94_1 | 2020-08-25 20:00:05,420 sagemaker-training-toolkit ERROR Reporting training FAILURE
algo-1-g4d94_1 | 2020-08-25 20:00:05,420 sagemaker-training-toolkit ERROR framework error:
algo-1-g4d94_1 | Traceback (most recent call last):
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/trainer.py", line 92, in train
algo-1-g4d94_1 | entry_point.run(
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/entry_point.py", line 92, in run
algo-1-g4d94_1 | files.download_and_extract(uri=uri, path=environment.code_dir)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/files.py", line 131, in download_and_extract
algo-1-g4d94_1 | s3_download(uri, dst)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/files.py", line 167, in s3_download
algo-1-g4d94_1 | s3.Bucket(bucket).download_file(key, dst)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/inject.py", line 244, in bucket_download_file
algo-1-g4d94_1 | return self.meta.client.download_file(
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/inject.py", line 170, in download_file
algo-1-g4d94_1 | return transfer.download_file(
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/transfer.py", line 307, in download_file
algo-1-g4d94_1 | future.result()
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/futures.py", line 106, in result
algo-1-g4d94_1 | return self._coordinator.result()
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/futures.py", line 265, in result
algo-1-g4d94_1 | raise self._exception
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/tasks.py", line 255, in _main
algo-1-g4d94_1 | self._submit(transfer_future=transfer_future, **kwargs)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/download.py", line 340, in _submit
algo-1-g4d94_1 | response = client.head_object(
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/botocore/client.py", line 316, in _api_call
algo-1-g4d94_1 | return self._make_api_call(operation_name, kwargs)
algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/botocore/client.py", line 635, in _make_api_call
algo-1-g4d94_1 | raise error_class(parsed_response, operation_name)
algo-1-g4d94_1 | botocore.exceptions.ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden
algo-1-g4d94_1 |
algo-1-g4d94_1 | An error occurred (403) when calling the HeadObject operation: Forbidden
tmpptzb0bfs_algo-1-g4d94_1 exited with code 1
Aborting on container exit...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
160 try:
--> 161 _stream_output(process)
162 except RuntimeError as e:
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in _stream_output(process)
676 if exit_code != 0:
--> 677 raise RuntimeError("Process exited with code: %s" % exit_code)
678
RuntimeError: Process exited with code: 1
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-22-059e808d1544> in <module>()
10 train_config = sagemaker.session.s3_input(input_data, content_type='application/x-parquet')
11
---> 12 local_framework.fit({'train':train_config}, logs=True)
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
491 self._prepare_for_training(job_name=job_name)
492
--> 493 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
494 self.jobs.append(self.latest_training_job)
495 if wait:
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config)
1058 train_args["enable_sagemaker_metrics"] = estimator.enable_sagemaker_metrics
1059
-> 1060 estimator.sagemaker_session.train(**train_args)
1061
1062 return cls(estimator.sagemaker_session, estimator._current_job_name)
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics)
588 LOGGER.info("Creating training-job with name: %s", job_name)
589 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
--> 590 self.sagemaker_client.create_training_job(**train_request)
591
592 def process(
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
100 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
101 logger.info("Starting training job")
--> 102 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
103
104 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
94
95 self.model_artifacts = self.container.train(
---> 96 input_data_config, output_data_config, hyperparameters, job_name
97 )
98 self.end_time = datetime.datetime.now()
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
164 # which contains the exit code and append the command line to it.
165 msg = "Failed to run: %s, %s" % (compose_command, str(e))
--> 166 raise RuntimeError(msg)
167 finally:
168 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpptzb0bfs/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1
Describe the bug
A clear and concise description of what the bug is.
When defining a custom estimator, remote training works but local training does not.
To reproduce
A clear, step-by-step set of instructions to reproduce the bug.
where
myEstimator is from:
Expected behavior
A clear and concise description of what you expected to happen.
Local mode should work whenever remote training works.
Screenshots or logs
If applicable, add screenshots or logs to help explain your problem.
Creating tmpptzb0bfs_algo-1-g4d94_1 ... Attaching to tmpptzb0bfs_algo-1-g4d94_12mdone algo-1-g4d94_1 | 2020-08-25 20:00:05,420 sagemaker-training-toolkit ERROR Reporting training FAILURE algo-1-g4d94_1 | 2020-08-25 20:00:05,420 sagemaker-training-toolkit ERROR framework error: algo-1-g4d94_1 | Traceback (most recent call last): algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/trainer.py", line 92, in train algo-1-g4d94_1 | entry_point.run( algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/entry_point.py", line 92, in run algo-1-g4d94_1 | files.download_and_extract(uri=uri, path=environment.code_dir) algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/files.py", line 131, in download_and_extract algo-1-g4d94_1 | s3_download(uri, dst) algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/files.py", line 167, in s3_download algo-1-g4d94_1 | s3.Bucket(bucket).download_file(key, dst) algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/inject.py", line 244, in bucket_download_file algo-1-g4d94_1 | return self.meta.client.download_file( algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/inject.py", line 170, in download_file algo-1-g4d94_1 | return transfer.download_file( algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/transfer.py", line 307, in download_file algo-1-g4d94_1 | future.result() algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/futures.py", line 106, in result algo-1-g4d94_1 | return self._coordinator.result() algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/futures.py", line 265, in result algo-1-g4d94_1 | raise self._exception algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/tasks.py", line 255, in _main algo-1-g4d94_1 | self._submit(transfer_future=transfer_future, **kwargs) algo-1-g4d94_1 | File 
"/miniconda3/lib/python3.8/site-packages/s3transfer/download.py", line 340, in _submit algo-1-g4d94_1 | response = client.head_object( algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/botocore/client.py", line 316, in _api_call algo-1-g4d94_1 | return self._make_api_call(operation_name, kwargs) algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/botocore/client.py", line 635, in _make_api_call algo-1-g4d94_1 | raise error_class(parsed_response, operation_name) algo-1-g4d94_1 | botocore.exceptions.ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden algo-1-g4d94_1 | algo-1-g4d94_1 | An error occurred (403) when calling the HeadObject operation: Forbidden tmpptzb0bfs_algo-1-g4d94_1 exited with code 1 Aborting on container exit... --------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name) 160 try: --> 161 _stream_output(process) 162 except RuntimeError as e: ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in _stream_output(process) 676 if exit_code != 0: --> 677 raise RuntimeError("Process exited with code: %s" % exit_code) 678 RuntimeError: Process exited with code: 1 During handling of the above exception, another exception occurred: RuntimeError Traceback (most recent call last) <ipython-input-22-059e808d1544> in <module>() 10 train_config = sagemaker.session.s3_input(input_data, content_type='application/x-parquet') 11 ---> 12 local_framework.fit({'train':train_config}, logs=True) ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config) 491 self._prepare_for_training(job_name=job_name) 492 --> 493 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config) 
494 self.jobs.append(self.latest_training_job) 495 if wait: ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config) 1058 train_args["enable_sagemaker_metrics"] = estimator.enable_sagemaker_metrics 1059 -> 1060 estimator.sagemaker_session.train(**train_args) 1061 1062 return cls(estimator.sagemaker_session, estimator._current_job_name) ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics) 588 LOGGER.info("Creating training-job with name: %s", job_name) 589 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4)) --> 590 self.sagemaker_client.create_training_job(**train_request) 591 592 def process( ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs) 100 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {} 101 logger.info("Starting training job") --> 102 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName) 103 104 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name) 94 95 self.model_artifacts = self.container.train( ---> 96 input_data_config, output_data_config, hyperparameters, job_name 97 ) 98 self.end_time = datetime.datetime.now() 
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name) 164 # which contains the exit code and append the command line to it. 165 msg = "Failed to run: %s, %s" % (compose_command, str(e)) --> 166 raise RuntimeError(msg) 167 finally: 168 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name) RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpptzb0bfs/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1
Also fails when I
System information
A description of your system. Please provide:
Additional context
Add any other context about the problem here.