
Update default handler, update mnist test file, create PT1.5 test image #89

Closed · wants to merge 4 commits

104 changes: 52 additions & 52 deletions buildspec.yml
@@ -3,15 +3,15 @@ version: 0.2
env:
variables:
FRAMEWORK_VERSION: '1.6.0'
-EIA_FRAMEWORK_VERSION: '1.3.1'
+EIA_FRAMEWORK_VERSION: '1.5.1'
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
GPU_INSTANCE_TYPE: 'ml.p2.8xlarge'
EIA_ACCELERATOR_TYPE: 'ml.eia2.medium'
ECR_REPO: 'sagemaker-test'
GITHUB_REPO: 'sagemaker-pytorch-serving-container'
DLC_ACCOUNT: '763104351884'
SETUP_FILE: 'setup_cmds.sh'
-SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip\npython3.6 -m pip install -U -e .\npython3.6 -m pip install -U -e .[test]'
+SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip==20.2.2\npython3.6 -m pip install -U -e .\npython3.6 -m pip install -U -e .[test]'


phases:
@@ -35,54 +35,54 @@ phases:
# run unit tests
- tox -e py36,py37 test/unit

-# define tags
-- GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID"
-- DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
-- DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"
+# # define tags
+# - GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID"
+# - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
+# - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"
- DLC_EIA_TAG="$FRAMEWORK_VERSION-dlc-eia-$BUILD_ID"

-# run local CPU integration tests (build and push the image to ECR repo)
-- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
-- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
-- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
-- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
-
-# launch remote GPU instance
-- prefix='ml.'
-- instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
-- create-key-pair
-- launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest
-
-# build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
-- python3 setup.py sdist
-- build_dir="test/container/$FRAMEWORK_VERSION"
-- $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
-- docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
-# push DLC GPU image to ECR
-- $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
-- docker push $PREPROD_IMAGE:$DLC_GPU_TAG
-
-# run GPU local integration tests
-- printf "$SETUP_CMDS" > $SETUP_FILE
-# no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
-- generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
-- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
-- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
-- dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
-- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
-- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
-
-# run CPU sagemaker integration tests
-- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
-- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
-- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
-- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
-
-# run GPU sagemaker integration tests
-- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
-- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
-- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
-- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
+# # run local CPU integration tests (build and push the image to ECR repo)
+# - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
+# - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
+# - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
+# - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
+
+# # launch remote GPU instance
+# - prefix='ml.'
+# - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
+# - create-key-pair
+# - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest
+
+# # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
+# - python3 setup.py sdist
+# - build_dir="test/container/$FRAMEWORK_VERSION"
+# - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
+# - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
+# # push DLC GPU image to ECR
+# - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
+# - docker push $PREPROD_IMAGE:$DLC_GPU_TAG
+
+# # run GPU local integration tests
+# - printf "$SETUP_CMDS" > $SETUP_FILE
+# # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
+# - generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
+# - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
+# - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
+# - dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
+# - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
+# - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
+
+# # run CPU sagemaker integration tests
+# - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
+# - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
+# - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
+# - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
+
+# # run GPU sagemaker integration tests
+# - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
+# - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"
+# - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
+# - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "artifacts/*"

# run EIA sagemaker integration tests
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --build-image --push-image --dockerfile-type dlc.eia --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $EIA_FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --accelerator-type $EIA_ACCELERATOR_TYPE --tag $DLC_EIA_TAG"
@@ -93,8 +93,8 @@ phases:
- cleanup-gpu-instances
- cleanup-key-pairs

-# remove ECR image
-- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
-- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
-- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
+# # remove ECR image
+# - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
+# - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
+# - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_EIA_TAG
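
Throughout this buildspec, each test command is wrapped in `execute-command-if-has-matching-changes`, which runs the command only when the PR touches one of the listed paths. That helper comes from the CI tooling and its implementation is not shown here; the Python sketch below only illustrates the assumed contract, and every name in it is a stand-in:

```python
import fnmatch
import subprocess

def execute_command_if_has_matching_changes(command, changed_files, path_patterns):
    """Illustrative stand-in: run `command` only if a changed file matches a pattern.

    The real helper used in buildspec.yml may behave differently.
    """
    def matches(path, pattern):
        # Patterns like "test/" act as directory prefixes; "src/*.py" is a glob.
        if pattern.endswith("/"):
            return path.startswith(pattern)
        return fnmatch.fnmatch(path, pattern)

    if any(matches(f, p) for f in changed_files for p in path_patterns):
        subprocess.run(command, shell=True, check=True)
```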
src/sagemaker_pytorch_serving_container/default_pytorch_inference_handler.py
@@ -28,6 +28,8 @@
INFERENCE_ACCELERATOR_PRESENT_ENV = "SAGEMAKER_INFERENCE_ACCELERATOR_PRESENT"
DEFAULT_MODEL_FILENAME = "model.pt"

+VERSIONS_USE_NEW_API = ["1.5.1"]
+


class DefaultPytorchInferenceHandler(default_inference_handler.DefaultInferenceHandler):
    VALID_CONTENT_TYPES = (content_types.JSON, content_types.NPY)
@@ -86,8 +88,15 @@ def default_predict_fn(self, data, model):
                model = model.to(device)
                input_data = data.to(device)
                model.eval()
-                with torch.jit.optimized_execution(True, {"target_device": "eia:0"}):
-                    output = model(input_data)
+                if torch.__version__ in VERSIONS_USE_NEW_API:
+                    import torcheia
+                    torch._C._jit_set_profiling_executor(False)
+                    model = torcheia.jit.attach_eia(model, 0)
+                    with torch.jit.optimized_execution(True):
+                        return model.forward(input_data)
+                else:
+                    with torch.jit.optimized_execution(True, {"target_device": "eia:0"}):
+                        output = model(input_data)
            else:
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                model = model.to(device)
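
To make the version gate above easier to follow outside the diff, here is a minimal, self-contained sketch of the same dispatch. It assumes a TorchScript model and the `torcheia` package that ships in the PyTorch 1.5.1 EIA images; the function name `eia_predict` is illustrative, not part of the handler:

```python
import torch

VERSIONS_USE_NEW_API = ["1.5.1"]  # PyTorch versions that use the torcheia API

def eia_predict(model, input_data):
    """Sketch of the version-gated EIA inference path from the handler above."""
    model.eval()
    with torch.no_grad():
        if torch.__version__ in VERSIONS_USE_NEW_API:
            import torcheia
            # The 1.5.1 EIA runtime requires the legacy (non-profiling) JIT executor.
            torch._C._jit_set_profiling_executor(False)
            # Attach the model to the first attached accelerator (ordinal 0).
            model = torcheia.jit.attach_eia(model, 0)
            with torch.jit.optimized_execution(True):
                return model.forward(input_data)
        # Older EIA releases take the target device directly.
        with torch.jit.optimized_execution(True, {"target_device": "eia:0"}):
            return model(input_data)
```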
6 changes: 6 additions & 0 deletions test/container/1.5.1/Dockerfile.dlc.eia
@@ -0,0 +1,6 @@
+ARG region
+FROM public.ecr.aws/f1e4j7w5/public_repo:1.5.1-cpu-py36-ubuntu16
+
+COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
+    rm /sagemaker_pytorch_inference.tar.gz
2 changes: 1 addition & 1 deletion test/integration/sagemaker/test_mnist.py
@@ -34,7 +34,7 @@ def test_mnist_gpu(sagemaker_session, image_uri, instance_type):
    _test_mnist_distributed(sagemaker_session, image_uri, instance_type, model_gpu_tar, mnist_gpu_script)


-@pytest.mark.skip(reason="Latest EIA version is too old - 1.3.1. Remove this after a new DLC release")
+# @pytest.mark.skip(reason="Latest EIA version is too old - 1.3.1. Remove this after a new DLC release")
@pytest.mark.eia_test
def test_mnist_eia(sagemaker_session, image_uri, instance_type, accelerator_type):
    instance_type = instance_type or 'ml.c4.xlarge'
20 changes: 16 additions & 4 deletions test/resources/mnist/model_eia/mnist.py
@@ -21,17 +21,29 @@
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

+VERSIONS_USE_NEW_API = ["1.5.1"]
+

def predict_fn(input_data, model):
    logger.info('Performing EIA inference with Torch JIT context with input of size {}'.format(input_data.shape))
    # With EI, client instance should be CPU for cost-efficiency. Subgraphs with unsupported arguments run locally. Server runs with CUDA
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    mdoel = model.to(device)
+    model = model.to(device)
    input_data = input_data.to(device)
    model = model.eval()
    with torch.no_grad():
-        # Set the target device to the accelerator ordinal
-        with torch.jit.optimized_execution(True, {'target_device': 'eia:0'}):
-            return model(input_data)
+        print("current torch version is: ", torch.__version__)
+        if torch.__version__ in VERSIONS_USE_NEW_API:
+            import torcheia
+            # we need to set the profiling executor for EIA
+            torch._C._jit_set_profiling_executor(False)
+            # Here we want to use the first attached accelerator, so we specify ordinal 0.
+            model = torcheia.jit.attach_eia(model, 0)
+            with torch.jit.optimized_execution(True):
+                return model.forward(input_data)
+        else:
+            # Set the target device to the accelerator ordinal
+            with torch.jit.optimized_execution(True, {'target_device': 'eia:0'}):
+                return model(input_data)


def model_fn(model_dir):
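
The rest of mnist.py, including the body of model_fn, is collapsed in this diff. For orientation, here is a minimal sketch of an EIA-style model_fn, assuming a TorchScript artifact named model.pt (the handler's DEFAULT_MODEL_FILENAME); the actual body is not shown above:

```python
import os
import torch

def model_fn(model_dir):
    # Assumed artifact name; EIA models are saved as TorchScript archives.
    model_path = os.path.join(model_dir, "model.pt")
    # Load on CPU; the accelerator is attached later, inside predict_fn.
    return torch.jit.load(model_path, map_location=torch.device("cpu"))
```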