Commit b903090

Adding PyTorch EI Support (#38)
* PyTorch Elastic Inference DLC dockerfile
* Add EI inference script and model tar.gz
1 parent 53c290a commit b903090

File tree

8 files changed: +231 −19 lines

buildspec.yml

Lines changed: 16 additions & 0 deletions
@@ -3,12 +3,15 @@ version: 0.2
 env:
   variables:
     FRAMEWORK_VERSION: '1.4.0'
+    EIA_FRAMEWORK_VERSION: '1.3.1'
     CPU_PY2_VERSION: '2'
     CPU_PY3_VERSION: '3'
     CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
     GPU_PY2_VERSION: '2'
     GPU_PY3_VERSION: '3'
+    EIA_PY3_VERSION: '3'
     GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
+    EIA_ACCELERATOR_TYPE: 'ml.eia2.medium'
     LOCAL_BASE_REPO: 'pytorch-base'
     ECR_REPO: 'sagemaker-test'
     GITHUB_REPO: 'sagemaker-pytorch-serving-container'
@@ -44,6 +47,7 @@ phases:

       - cpu_dockerfile="Dockerfile.cpu"
       - gpu_dockerfile="Dockerfile.gpu"
+      - eia_dockerfile="Dockerfile.eia"

       # build py2 images
       - build_dir="docker/$FRAMEWORK_VERSION/py$CPU_PY2_VERSION"
@@ -60,15 +64,22 @@ phases:
       - cp -r docker/build_artifacts/* $build_dir/
       - CPU_PY3_TAG="$FRAMEWORK_VERSION-cpu-py3-$BUILD_ID"
       - GPU_PY3_TAG="$FRAMEWORK_VERSION-gpu-py3-$BUILD_ID"
+      - EIA_PY3_TAG="$EIA_FRAMEWORK_VERSION-eia-py3-$BUILD_ID"
       - docker build -f "$build_dir/$cpu_dockerfile" -t $PREPROD_IMAGE:$CPU_PY3_TAG $build_dir
       - docker build -f "$build_dir/$gpu_dockerfile" -t $PREPROD_IMAGE:$GPU_PY3_TAG $build_dir
+      # PY2 not offered for EIA PyTorch
+      - eia_build_dir="docker/$EIA_FRAMEWORK_VERSION/py$EIA_PY3_VERSION"
+      - cp sagemaker_pytorch_inference.tar.gz $eia_build_dir/
+      - cp -r docker/build_artifacts/* $eia_build_dir/
+      - docker build -f "$eia_build_dir/$eia_dockerfile" -t $PREPROD_IMAGE:$EIA_PY3_TAG $eia_build_dir

       # push images to ecr
       - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
       - docker push $PREPROD_IMAGE:$CPU_PY2_TAG
       - docker push $PREPROD_IMAGE:$CPU_PY3_TAG
       - docker push $PREPROD_IMAGE:$GPU_PY2_TAG
       - docker push $PREPROD_IMAGE:$GPU_PY3_TAG
+      - docker push $PREPROD_IMAGE:$EIA_PY3_TAG

       # launch remote gpu instance
       - prefix='ml.'
@@ -104,6 +115,10 @@ phases:
       - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "docker/*" "buildspec.yml"
       - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "docker/*" "buildspec.yml"

+      # run eia sagemaker tests
+      - py3_cmd="pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $EIA_FRAMEWORK_VERSION --py-version $EIA_PY3_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --accelerator-type $EIA_ACCELERATOR_TYPE --tag $EIA_PY3_TAG"
+      - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "docker/*" "buildspec.yml"
+
     finally:
       # shut down remote gpu instance
       - cleanup-gpu-instances
@@ -114,3 +129,4 @@ phases:
       - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_PY3_TAG
       - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_PY2_TAG
       - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_PY3_TAG
+      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$EIA_PY3_TAG
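The gating helper execute-command-if-has-matching-changes used above ships with the CI tooling; its implementation is not part of this commit. A minimal Python sketch of what such a change-gated runner might do, assuming it compares the working tree against the target branch (the comparison base and matching rules here are assumptions):

# Hypothetical sketch of a change-gated command runner; not the real helper.
import fnmatch
import subprocess
import sys


def execute_if_changed(command, *patterns):
    # Assumption: the real helper diffs against the merge base of the target branch.
    changed = subprocess.check_output(
        ['git', 'diff', '--name-only', 'origin/master...HEAD']
    ).decode().splitlines()
    # Directory arguments like "test/" act as prefixes; the rest are globs.
    if any(path.startswith(pat) or fnmatch.fnmatch(path, pat)
           for path in changed for pat in patterns):
        sys.exit(subprocess.call(command, shell=True))
    print('No matching changes; skipping: {}'.format(command))


if __name__ == '__main__':
    execute_if_changed(sys.argv[1], *sys.argv[2:])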

docker/1.3.1/py3/Dockerfile.eia

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
FROM ubuntu:16.04
LABEL maintainer="Amazon AI"
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true

# Build arguments for the Python, PyTorch, and tooling versions
ARG PYTHON_VERSION=3.6.6
ARG PYTORCH_VERSION=1.3.1
ARG TORCHVISION_VERSION=0.4.2
ARG GRAPHVIZ_VERSION=0.13.2
ARG MMS_VERSION=1.0.8
ARG HEALTH_CHECK_VERSION=1.5.3

# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
ENV LD_LIBRARY_PATH /opt/conda/lib/:$LD_LIBRARY_PATH
ENV PATH /opt/conda/bin:$PATH
ENV SAGEMAKER_SERVING_MODULE sagemaker_pytorch_serving_container.serving:main
ENV TEMP=/home/model-server/tmp

RUN apt-get update \
 && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    git \
    jq \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libgomp1 \
    libibverbs-dev \
    libsm6 \
    libxext6 \
    libxrender-dev \
    openjdk-8-jdk-headless \
    vim \
    wget \
    zlib1g-dev

# Install OpenSSH and allow it to talk to containers without asking for confirmation
RUN apt-get install -y --no-install-recommends \
    openssh-client \
    openssh-server \
 && mkdir -p /var/run/sshd \
 && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
 && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

RUN curl -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
 && chmod +x ~/miniconda.sh \
 && ~/miniconda.sh -b -p /opt/conda \
 && rm ~/miniconda.sh \
 && /opt/conda/bin/conda update conda \
 && /opt/conda/bin/conda install -y \
    python=$PYTHON_VERSION \
    cython==0.29.12 \
    ipython==7.7.0 \
    numpy==1.16.4 \
    scipy==1.3.0 \
    typing==3.6.4 \
 && /opt/conda/bin/conda clean -ya

RUN conda install -c conda-forge \
    awscli \
    opencv==4.0.1 \
 && conda install -y \
    scikit-learn==0.21.2 \
    pandas==0.25.0 \
    pillow==6.2.1 \
    h5py==2.9.0 \
    requests==2.22.0 \
 && conda clean -ya \
 && /opt/conda/bin/conda config --set ssl_verify False \
 && pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
 && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
# The torchvision wheel must be installed first so that the PyTorch-EI framework is not overwritten.
 && pip install https://download.pytorch.org/whl/cpu/torchvision-0.4.2%2Bcpu-cp36-cp36m-linux_x86_64.whl \
 && pip install https://s3.amazonaws.com/amazonei-pytorch/torch_eia-1.3.1-cp36-cp36m-manylinux1_x86_64.whl \
 && pip install graphviz==$GRAPHVIZ_VERSION \
 && pip install mxnet-model-server==$MMS_VERSION

RUN useradd -m model-server \
 && mkdir -p /home/model-server/tmp \
 && chown -R model-server /home/model-server

COPY mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
COPY config.properties /home/model-server

RUN chmod +x /usr/local/bin/dockerd-entrypoint.py

COPY sagemaker_pytorch_inference.tar.gz /sagemaker_pytorch_inference.tar.gz
RUN pip install --no-cache-dir \
    /sagemaker_pytorch_inference.tar.gz \
 && rm /sagemaker_pytorch_inference.tar.gz

RUN curl https://aws-dlc-licenses.s3.amazonaws.com/pytorch/license.txt -o /license.txt

RUN wget https://amazonei-tools.s3.amazonaws.com/v${HEALTH_CHECK_VERSION}/ei_tools_${HEALTH_CHECK_VERSION}.tar.gz -O /opt/ei_tools_${HEALTH_CHECK_VERSION}.tar.gz \
 && tar -xvf /opt/ei_tools_${HEALTH_CHECK_VERSION}.tar.gz -C /opt/ \
 && rm -rf /opt/ei_tools_${HEALTH_CHECK_VERSION}.tar.gz \
 && chmod a+x /opt/ei_tools/bin/health_check \
 && mkdir -p /opt/ei_health_check/bin \
 && ln -s /opt/ei_tools/bin/health_check /opt/ei_health_check/bin/health_check \
 && ln -s /opt/ei_tools/lib /opt/ei_health_check/lib

EXPOSE 8080 8081
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["mxnet-model-server", "--start", "--mms-config", "/home/model-server/config.properties"]

test/conftest.py

Lines changed: 22 additions & 3 deletions
@@ -48,10 +48,12 @@ def pytest_addoption(parser):
     parser.addoption('--build-base-image', '-B', action='store_true')
     parser.addoption('--aws-id')
     parser.addoption('--instance-type')
+    parser.addoption('--accelerator-type')
     parser.addoption('--docker-base-name', default='pytorch')
     parser.addoption('--region', default='us-west-2')
     parser.addoption('--framework-version', default=PyTorch.LATEST_VERSION)
     parser.addoption('--py-version', choices=['2', '3'], default=str(sys.version_info.major))
+    # Processor is still "cpu" for EIA tests
     parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu')
     # If not specified, will default to {framework-version}-{processor}-py{py-version}
     parser.addoption('--tag', default=None)
@@ -162,6 +164,11 @@ def fixture_instance_type(request, processor):
     return provided_instance_type or default_instance_type


+@pytest.fixture(name='accelerator_type', scope='session')
+def fixture_accelerator_type(request):
+    return request.config.getoption('--accelerator-type')
+
+
 @pytest.fixture(name='docker_registry', scope='session')
 def fixture_docker_registry(aws_id, region):
     return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region)
@@ -173,10 +180,22 @@ def fixture_ecr_image(docker_registry, docker_base_name, tag):


 @pytest.fixture(autouse=True)
-def skip_by_device_type(request, use_gpu, instance_type):
+def skip_by_device_type(request, use_gpu, instance_type, accelerator_type):
     is_gpu = use_gpu or instance_type[3] in ['g', 'p']
-    if (request.node.get_closest_marker('skip_gpu') and is_gpu) or \
-            (request.node.get_closest_marker('skip_cpu') and not is_gpu):
+    is_eia = accelerator_type is not None
+
+    # Separate out cases for clearer logic.
+    # When running GPU test, skip CPU test. When running CPU test, skip GPU test.
+    if (request.node.get_closest_marker('gpu_test') and not is_gpu) or \
+            (request.node.get_closest_marker('cpu_test') and is_gpu):
+        pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))
+
+    # When running EIA test, skip the CPU and GPU tests.
+    elif (request.node.get_closest_marker('gpu_test') or request.node.get_closest_marker('cpu_test')) and is_eia:
+        pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))
+
+    # When running CPU or GPU test, skip EIA test.
+    elif request.node.get_closest_marker('eia_test') and not is_eia:
         pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))
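The three markers consumed by skip_by_device_type are plain pytest markers. A short sketch of how a test opts in, plus the marker registration that would silence pytest's unknown-marker warnings (the registration is an assumption; this commit does not show a pytest.ini or setup.cfg change):

import pytest

# Assumption: marker registration like this would live in a conftest.py or
# pytest.ini; the commit itself does not include one.
def pytest_configure(config):
    config.addinivalue_line('markers', 'cpu_test: runs only on plain CPU instances')
    config.addinivalue_line('markers', 'gpu_test: runs only on GPU instances')
    config.addinivalue_line('markers', 'eia_test: runs only when --accelerator-type is set')


@pytest.mark.eia_test
def test_requires_accelerator(accelerator_type):
    # skip_by_device_type() auto-skips this test unless --accelerator-type was passed.
    assert accelerator_type.startswith('ml.eia')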

test/integration/__init__.py

Lines changed: 9 additions & 2 deletions
@@ -19,12 +19,19 @@
 mnist_script = os.path.join(mnist_path, 'mnist.py')
 data_dir = os.path.join(mnist_path, 'data')
 training_dir = os.path.join(data_dir, 'training')
+cpu_sub_dir = 'model_cpu'
+gpu_sub_dir = 'model_gpu'
+eia_sub_dir = 'model_eia'

-model_cpu_dir = os.path.join(mnist_path, 'model_cpu')
+model_cpu_dir = os.path.join(mnist_path, cpu_sub_dir)
+mnist_cpu_script = os.path.join(model_cpu_dir, 'mnist.py')
 model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d')
 mnist_1d_script = os.path.join(model_cpu_1d_dir, 'mnist_1d.py')
-model_gpu_dir = os.path.join(mnist_path, 'model_gpu')
+model_gpu_dir = os.path.join(mnist_path, gpu_sub_dir)
+mnist_gpu_script = os.path.join(model_gpu_dir, 'mnist.py')
 model_gpu_1d_dir = os.path.join(model_gpu_dir, '1d')
+model_eia_dir = os.path.join(mnist_path, eia_sub_dir)
+mnist_eia_script = os.path.join(model_eia_dir, 'mnist.py')
 call_model_fn_once_script = os.path.join(resources_path, 'call_model_fn_once.py')

 ROLE = 'dummy/unused-role'

test/integration/sagemaker/test_mnist.py

Lines changed: 27 additions & 14 deletions
@@ -19,41 +19,54 @@
 import sagemaker
 from sagemaker.pytorch import PyTorchModel

-from test.integration import mnist_script, model_cpu_dir
+from test.integration import model_cpu_dir, mnist_cpu_script, mnist_gpu_script, model_eia_dir, mnist_eia_script
 from test.integration.sagemaker.timeout import timeout_and_delete_endpoint


-@pytest.mark.skip_gpu
+@pytest.mark.cpu_test
 def test_mnist_distributed_cpu(sagemaker_session, ecr_image, instance_type):
     instance_type = instance_type or 'ml.c4.xlarge'
-    _test_mnist_distributed(sagemaker_session, ecr_image, instance_type)
+    model_dir = os.path.join(model_cpu_dir, 'model_mnist.tar.gz')
+    _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_dir, mnist_cpu_script)


-@pytest.mark.skip_cpu
+@pytest.mark.gpu_test
 def test_mnist_distributed_gpu(sagemaker_session, ecr_image, instance_type):
     instance_type = instance_type or 'ml.p2.xlarge'
-    _test_mnist_distributed(sagemaker_session, ecr_image, instance_type)
+    model_dir = os.path.join(model_cpu_dir, 'model_mnist.tar.gz')
+    _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_dir, mnist_gpu_script)


-def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type):
-    model_dir = os.path.join(model_cpu_dir, 'model_mnist.tar.gz')
+@pytest.mark.eia_test
+def test_mnist_eia(sagemaker_session, ecr_image, instance_type, accelerator_type):
+    instance_type = instance_type or 'ml.c4.xlarge'
+    # Scripted model is serialized with torch.jit.save().
+    # Inference test for EIA doesn't need to instantiate model definition then load state_dict.
+    model_dir = os.path.join(model_eia_dir, 'model_mnist.tar.gz')
+    _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_dir, mnist_eia_script,
+                            accelerator_type=accelerator_type)
+

+def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, model_dir, mnist_script,
+                            accelerator_type=None):
     endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-pytorch-serving")

     model_data = sagemaker_session.upload_data(
         path=model_dir,
         key_prefix="sagemaker-pytorch-serving/models",
     )

-    pytorch = PyTorchModel(model_data,
-                           'SageMakerRole',
-                           mnist_script,
-                           image=ecr_image,
-                           sagemaker_session=sagemaker_session)
+    pytorch = PyTorchModel(model_data=model_data, role='SageMakerRole', entry_point=mnist_script,
+                           image=ecr_image, sagemaker_session=sagemaker_session)

     with timeout_and_delete_endpoint(endpoint_name, sagemaker_session, minutes=30):
-        predictor = pytorch.deploy(initial_instance_count=1, instance_type=instance_type,
-                                   endpoint_name=endpoint_name)
+        # Use accelerator type to differentiate EI vs. CPU and GPU. Don't use processor value.
+        if accelerator_type is not None:
+            predictor = pytorch.deploy(initial_instance_count=1, instance_type=instance_type,
+                                       accelerator_type=accelerator_type, endpoint_name=endpoint_name)
+        else:
+            predictor = pytorch.deploy(initial_instance_count=1, instance_type=instance_type,
+                                       endpoint_name=endpoint_name)

         batch_size = 100
         data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
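The hunk is truncated at the random input batch in this view. A typical continuation (hypothetical, not shown in the diff) would send the batch to the endpoint and check that each input yields ten class scores:

# Hypothetical continuation inside the `with` block above.
output = predictor.predict(data)
assert np.asarray(output).shape == (batch_size, 10)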
test/resources/mnist/model_eia/mnist.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import
import logging
import os
import sys

import torch

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


def predict_fn(input_data, model):
    logger.info('Performing EIA inference with Torch JIT context with input of size {}'.format(input_data.shape))
    # With EI, the client instance should be CPU for cost-efficiency. Subgraphs with
    # unsupported operators run locally on the client; the server runs with CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    input_data = input_data.to(device)
    with torch.no_grad():
        # Set the target device to the accelerator ordinal
        with torch.jit.optimized_execution(True, {'target_device': 'eia:0'}):
            return model(input_data)


def model_fn(model_dir):
    logger.info('model_fn: Loading model with TorchScript from {}'.format(model_dir))
    # The scripted model is serialized with torch.jit.save(), so there is no need to
    # instantiate the model definition and then load a state_dict.
    model = torch.jit.load(os.path.join(model_dir, 'model.pth'))
    return model


def save_model(model, model_dir):
    logger.info("Saving the model to {}.".format(model_dir))
    path = os.path.join(model_dir, 'model.pth')
    torch.jit.save(model, path)
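For reference, the model_mnist.tar.gz artifact consumed by test_mnist_eia could be produced with TorchScript serialization as in the save_model() helper above. A hedged sketch, in which the Net module is a stand-in, not the repo's actual MNIST model:

# Sketch of building the TorchScript model archive; Net is a placeholder.
import tarfile

import torch
import torch.nn as nn


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(28 * 28, 10)

    def forward(self, x):
        return self.fc(x.view(x.size(0), -1))


# Trace to TorchScript so model_fn() can torch.jit.load() it without the class definition.
scripted = torch.jit.trace(Net(), torch.zeros(1, 1, 28, 28))
torch.jit.save(scripted, 'model.pth')  # equivalent to save_model(scripted, '.')

# Package the artifact the way the SageMaker test uploads it.
with tarfile.open('model_mnist.tar.gz', 'w:gz') as archive:
    archive.add('model.pth')
    archive.add('mnist.py')  # the entry-point script above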
The two remaining files in the commit are binary archives (one of 128 KB) and are not shown in the diff view.