aws · jeet4320 · May 6, 2021 · May 5, 2021 · May 5, 2021 · May 5, 2021
diff --git a/pytorch/buildspec.yml b/pytorch/buildspec.yml
@@ -83,28 +83,3 @@ images:
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example, /Dockerfile., *DEVICE_TYPE ]
     context:
       <<: *TRAINING_CONTEXT
-  BuildCPUPTInferencePy3DockerImage:
-    <<: *INFERENCE_REPOSITORY
-    build: &PYTORCH_CPU_INFERENCE_PY3 false
-    image_size_baseline: 4899
-    device_type: &DEVICE_TYPE cpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py36
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *INFERENCE_CONTEXT
-  BuildGPUPTInferencePy3DockerImage:
-    <<: *INFERENCE_REPOSITORY
-    build: &PYTORCH_GPU_INFERENCE_PY3 false
-    image_size_baseline: 14000
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py36
-    cuda_version: &CUDA_VERSION cu111
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *INFERENCE_CONTEXT
@@ -6,7 +6,9 @@ LABEL dlc_major_version="1"
 ARG PYTHON=python3
 ARG PYTHON_VERSION=3.6.13
 ARG CUBLAS_VERSION=11.3.0.106
-ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
+ARG OPEN_MPI_PATH=/opt/amazon/openmpi
+ARG EFA_PATH=/opt/amazon/efa
+
 ARG CUDA_HOME=/usr/local/cuda
 ARG CONDA_PREFIX=/opt/conda
 ARG METIS=metis-5.1.0
@@ -37,6 +39,7 @@ ENV DGLBACKEND=pytorch
 ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
 ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
 ENV MANUAL_BUILD=0
+ENV RDMAV_FORK_SAFE=1
 
 ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
 ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
@@ -100,8 +103,9 @@ RUN mkdir /tmp/efa \
   && rm -rf /tmp/efa \
   && rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz
 
+RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf
 ENV PATH="$OPEN_MPI_PATH/bin:$PATH"
-ENV LD_LIBRARY_PATH="$OPEN_MPI_PATH/lib/:$LD_LIBRARY_PATH"
+ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH
 
 RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \
  && curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \

diff --git a/src/config/build_config.py b/src/config/build_config.py
@@ -5,9 +5,9 @@
 # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
 ENABLE_NEURON_MODE = False
 # Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = []
+DISABLE_FRAMEWORK_TESTS = ["mxnet", "pytorch", "huggingface_pytorch", "huggingface_tensorflow"]
 # Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = False
+DISABLE_DATETIME_TAG = True
 # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
 # before disabling new builds or tests will fail
-DISABLE_NEW_BUILDS = False
+DISABLE_NEW_BUILDS = True
diff --git a/src/config/test_config.py b/src/config/test_config.py
@@ -7,9 +7,9 @@
 # It is recommended to set DISABLE_EFA_TESTS to True to disable EFA tests if there is no change to EFA installer version or Frameworks.
 DISABLE_EFA_TESTS = False
 
-DISABLE_SANITY_TESTS = False
+DISABLE_SANITY_TESTS = True
 DISABLE_SAGEMAKER_TESTS = False
-DISABLE_ECS_TESTS = False
-DISABLE_EKS_TESTS = False
-DISABLE_EC2_TESTS = False
+DISABLE_ECS_TESTS = True
+DISABLE_EKS_TESTS = True
+DISABLE_EC2_TESTS = True
 USE_SCHEDULER = False
diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml
@@ -86,28 +86,3 @@ images:
       /Dockerfile., *DEVICE_TYPE ]
     context:
       <<: *TRAINING_CONTEXT
-  BuildTensorflowCPUInferencePy3DockerImage:
-    <<: *INFERENCE_REPOSITORY
-    build: &TENSORFLOW_CPU_INFERENCE_PY3 false
-    image_size_baseline: 4899
-    device_type: &DEVICE_TYPE cpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *INFERENCE_CONTEXT
-  BuildTensorflowGPUInferencePy3DockerImage:
-    <<: *INFERENCE_REPOSITORY
-    build: &TENSORFLOW_GPU_INFERENCE_PY3 false
-    image_size_baseline: 7738
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu110
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *INFERENCE_CONTEXT
@@ -21,12 +21,14 @@ ENV KMP_AFFINITY=granularity=fine,compact,1,0
 ENV KMP_BLOCKTIME=1
 ENV KMP_SETTINGS=0
 ENV MANUAL_BUILD=0
+ENV RDMAV_FORK_SAFE=1
 
 ARG PYTHON=python3.7
 ARG PYTHON_PIP=python3-pip
 ARG PIP=pip3
 ARG PYTHON_VERSION=3.7.10
-ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
+ARG OPEN_MPI_PATH=/opt/amazon/openmpi
+ARG EFA_PATH=/opt/amazon/efa
 ARG NCCL_VERSION=2.7.8
 ARG EFA_VERSION=1.11.2
 ARG BRANCH_OFI=1.1.1
@@ -139,8 +141,8 @@ RUN echo "hwloc_base_binding_policy = none" >> $OPEN_MPI_PATH/etc/openmpi-mca-pa
 
 # Set default NCCL parameters
 RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
-
-ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib:$LD_LIBRARY_PATH
+RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf
+ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH
 # /usr/local/lib/libpython* needs to be accessible for dynamic linking
 ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 ENV PATH=$OPEN_MPI_PATH/bin/:$PATH

@@ -42,8 +42,7 @@
               "custom_mpi_options": (
                   "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 "
                   "-x HOROVOD_FUSION_THRESHOLD=16777216 "
-                  "-x TF_CPP_MIN_LOG_LEVEL=3 "
-                  "-x RDMAV_FORK_SAFE=1"
+                  "-x TF_CPP_MIN_LOG_LEVEL=3"
               ),
             }
         },
@@ -69,8 +68,7 @@
                 "custom_mpi_options": (
                     "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 "
                     "-x HOROVOD_FUSION_THRESHOLD=16777216 "
-                    "-x TF_CPP_MIN_LOG_LEVEL=3 "
-                    "-x RDMAV_FORK_SAFE=1"
+                    "-x TF_CPP_MIN_LOG_LEVEL=3"
                 ),
             }
         },

@@ -4,7 +4,6 @@ HOME_DIR=/test
 BIN_DIR=${HOME_DIR}/bin
 LOG_DIR=${HOME_DIR}/logs
 HOROVOD_VERSION=v0.16.4
-export RDMAV_FORK_SAFE=1
 
 git clone -b ${HOROVOD_VERSION} https://github.com/horovod/horovod.git ${HOME_DIR}/artifacts/horovod
 ${BIN_DIR}/pytorch_tests/testPTHVDHelper || exit 1

@@ -5,7 +5,7 @@ BIN_DIR=${HOME_DIR}/bin
 LOG_DIR=${HOME_DIR}/logs
 TRAINING_LOG=${LOG_DIR}/tensorflow_horovod_test.log
 HOVOROD_DIR=${BIN_DIR}/examples/Horovod
-export RDMAV_FORK_SAFE=1
+
 set -e
 
 echo "Simply verify if Horovod works well. You can follow progress on the log file : $TRAINING_LOG" | tee -a $TRAINING_LOG
@@ -45,7 +45,7 @@ if [ ${RETURN_VAL} -eq 0 ]; then
         -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
         -x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
         -x NCCL_SOCKET_IFNAME=$INTERFACE -mca btl_tcp_if_exclude lo,docker0 \
-        -x TF_CPP_MIN_LOG_LEVEL=0 -x RDMAV_FORK_SAFE\
+        -x TF_CPP_MIN_LOG_LEVEL=0 \
         python -W ignore ${HOVOROD_DIR}/tf2_train_imagenet_resnet_hvd.py \
         --synthetic --batch_size 64 --num_batches 100 --clear_log 2> ${TRAINING_LOG}
 else

@@ -4,7 +4,6 @@ HOME_DIR=/test
 BIN_DIR=${HOME_DIR}/bin
 LOG_DIR=${HOME_DIR}/logs
 
-export RDMAV_FORK_SAFE=1
 python ${BIN_DIR}/testTFKerasHVD.py AMP || exit 1
 
 exit 0
@@ -4,7 +4,6 @@ HOME_DIR=/test
 BIN_DIR=${HOME_DIR}/bin
 LOG_DIR=${HOME_DIR}/logs
 
-export RDMAV_FORK_SAFE=1
 python ${BIN_DIR}/testTFKerasHVD.py FP32 || exit 1
 
 exit 0
@@ -31,8 +31,6 @@ spec:
               - -x
               - LD_LIBRARY_PATH
               - -x
-              - RDMAV_FORK_SAFE=1
-              - -x
               - PATH
               - -x
               - NCCL_SOCKET_IFNAME=eth0

diff --git a/test/dlc_tests/eks/pytorch/training/test_eks_pytorch_training.py b/test/dlc_tests/eks/pytorch/training/test_eks_pytorch_training.py
@@ -161,7 +161,7 @@ def test_eks_pytorch_dgl_single_node_training(pytorch_training, py3_only):
         run("kubectl delete pods {}".format(pod_name))
 
 
-@pytest.mark.skipif(is_pr_context(), reason=SKIP_PR_REASON)
+# @pytest.mark.skipif(is_pr_context(), reason=SKIP_PR_REASON)
 @pytest.mark.model("mnist")
 @pytest.mark.multinode(4)
 def test_eks_pytorch_multinode_node_training(pytorch_training, example_only):

diff --git a/test/dlc_tests/eks/tensorflow/training/test_eks_tensorflow_multi_node_training.py b/test/dlc_tests/eks/tensorflow/training/test_eks_tensorflow_multi_node_training.py
@@ -14,7 +14,7 @@
 
 
 # Test only runs in region us-west-2, on instance type p3.16xlarge, on PR_EKS_CLUSTER_NAME_TEMPLATE cluster
-@pytest.mark.skipif(is_pr_context(), reason=SKIP_PR_REASON)
+# @pytest.mark.skipif(is_pr_context(), reason=SKIP_PR_REASON)
 @pytest.mark.integration("horovod")
 @pytest.mark.model("resnet")
 @pytest.mark.multinode(3)

@@ -178,7 +178,7 @@ def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance
                 "mpi": {
                     "enabled": True,
                     "processes_per_host": num_processes,
-                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x RDMAV_FORK_SAFE=1 ",
+                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                 },
             },
         )
@@ -223,7 +223,7 @@ def test_smmodelparallel_mnist_multigpu_multinode_efa(n_virginia_ecr_image, efa_
                 "mpi": {
                     "enabled": True,
                     "processes_per_host": num_processes,
-                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
+                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
                 },
             },
         )

@@ -4,6 +4,6 @@
 
 set -ex
 
-smddpsinglenode -x RDMAV_FORK_SAFE=1 python smdataparallel_mnist.py
+smddpsinglenode python smdataparallel_mnist.py
 
 bash smmodelparallel_mnist_script_mode.sh
@@ -1,4 +1,4 @@
 set -ex
 
 export SM_HP_MP_PARAMETERS=\{\"ddp\":true,\"microbatches\":4,\"partitions\":2,\"pipeline\":\"interleaved\"\}
-mpirun -mca btl_vader_single_copy_mechanism none --allow-run-as-root -x RDMAV_FORK_SAFE=1 -np 8 python smmodelparallel_pt_mnist.py --assert-losses 1 --data-dir data/training
+mpirun -mca btl_vader_single_copy_mechanism none --allow-run-as-root -np 8 python smmodelparallel_pt_mnist.py --assert-losses 1 --data-dir data/training
diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py
@@ -52,6 +52,8 @@ def pytest_runtest_setup(item):
         efa_tests = [mark for mark in item.iter_markers("efa")]
         if not efa_tests:
             pytest.skip("Skipping non-efa tests")
+        if efa_tests and are_efa_tests_disabled():
+            pytest.skip('Skipping EFA tests as EFA tests are disabled.')
 
 
 def pytest_collection_modifyitems(session, config, items):

@@ -33,7 +33,7 @@ def test_distributed_training_horovod(sagemaker_session,
                                       tmpdir,
                                       framework_version):
 
-    mpi_options = '-verbose -x orte_base_help_aggregate=0 -x RDMAV_FORK_SAFE=1'
+    mpi_options = '-verbose -x orte_base_help_aggregate=0'
     estimator = TensorFlow(
         entry_point=os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py'),
         role='SageMakerRole',
@@ -63,7 +63,7 @@ def test_distributed_training_horovod_with_env_vars(
         sagemaker_session, instance_type, ecr_image, tmpdir, framework_version
 ):
 
-    mpi_options = "-verbose -x orte_base_help_aggregate=0 -x RDMAV_FORK_SAFE=1"
+    mpi_options = "-verbose -x orte_base_help_aggregate=0"
     estimator = TensorFlow(
         entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_env_vars.py"),
         role="SageMakerRole",

@@ -71,7 +71,7 @@ def test_smmodelparallel_efa(n_virginia_sagemaker_session, efa_instance_type, n_
                                "mpi": {
                                    "enabled": True,
                                    "processes_per_host": num_processes,
-                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
+                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
                                 }
                            },
                            sagemaker_session=n_virginia_sagemaker_session,
@@ -105,7 +105,7 @@ def test_smmodelparallel_multinode_efa(n_virginia_sagemaker_session, efa_instanc
                                "mpi": {
                                    "enabled": True,
                                    "processes_per_host": num_processes,
-                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
+                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
                                 }
                            },
                            sagemaker_session=n_virginia_sagemaker_session,
@@ -138,7 +138,7 @@ def test_smmodelparallel(n_virginia_sagemaker_session, instance_type, n_virginia
                                "mpi": {
                                    "enabled": True,
                                    "processes_per_host": num_processes,
-                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
+                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 ",
                                 }
                            },
                            sagemaker_session=n_virginia_sagemaker_session,
@@ -172,7 +172,7 @@ def test_smmodelparallel_multinode(n_virginia_sagemaker_session, instance_type,
                                "mpi": {
                                    "enabled": True,
                                    "processes_per_host": num_processes,
-                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
+                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 ",
                                 }
                            },
                            sagemaker_session=n_virginia_sagemaker_session,

@@ -4,5 +4,5 @@
 
 set -ex
 
-smddpsinglenode -x RDMAV_FORK_SAFE=1 python smdataparallel_mnist.py
-mpirun --allow-run-as-root -x RDMAV_FORK_SAFE=1 -np 2 python tf2_conv.py
+smddpsinglenode python smdataparallel_mnist.py
+mpirun --allow-run-as-root -np 2 python tf2_conv.py
diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py
@@ -123,6 +123,13 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type):
     else:
         integration_path = os.path.join("integration", sagemaker_test_type)
 
+    if job_type == "training":
+        if framework == "tensorflow":
+            if framework_major_version == "2":
+                integration_path = "integration/sagemaker/test_mnist.py::test_smdataparallel_smmodelparallel_mnist"
+            else:
+                integration_path = "integration/sagemaker/test_tuning_model_dir.py"
+
     # Conditions for modifying tensorflow SageMaker pytest commands
     if framework == "tensorflow" and sagemaker_test_type == SAGEMAKER_REMOTE_TEST_TYPE:
         if job_type == "inference":

diff --git a/test/testrunner.py b/test/testrunner.py
@@ -198,7 +198,7 @@ def setup_eks_cluster(framework_name, is_neuron):
     long_name = framework_name
     short_name = frameworks[long_name]
     codebuild_version = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")[0:7]
-    num_nodes = 1 if is_pr_context() else 3 if long_name != "pytorch" else 4
+    num_nodes = 4
     cluster_name = f"dlc-{short_name}-cluster-{codebuild_version}-{random.randint(1, 10000)}"
     # default volume size
     volume_size = 80
@@ -31,8 +31,6 @@ spec:
                   - -x
                   - LD_LIBRARY_PATH
                   - -x
-                  - RDMAV_FORK_SAFE=1
-                  - -x
                   - PATH
                   - -x
                   - NCCL_SOCKET_IFNAME=eth0
@@ Expand Down @@