Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ LABEL dlc_major_version="1"
ARG PYTHON=python3
ARG PYTHON_VERSION=3.6.13
ARG CUBLAS_VERSION=11.3.0.106
ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
ARG OPEN_MPI_PATH=/opt/amazon/openmpi
ARG EFA_PATH=/opt/amazon/efa

ARG CUDA_HOME=/usr/local/cuda
ARG CONDA_PREFIX=/opt/conda
ARG METIS=metis-5.1.0
Expand Down Expand Up @@ -37,6 +39,7 @@ ENV DGLBACKEND=pytorch
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
ENV MANUAL_BUILD=0
ENV RDMAV_FORK_SAFE=1

ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
Expand Down Expand Up @@ -100,8 +103,9 @@ RUN mkdir /tmp/efa \
&& rm -rf /tmp/efa \
&& rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz

RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf
ENV PATH="$OPEN_MPI_PATH/bin:$PATH"
ENV LD_LIBRARY_PATH="$OPEN_MPI_PATH/lib/:$LD_LIBRARY_PATH"
ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH

RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \
&& curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
Expand Down
8 changes: 5 additions & 3 deletions tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ ENV KMP_AFFINITY=granularity=fine,compact,1,0
ENV KMP_BLOCKTIME=1
ENV KMP_SETTINGS=0
ENV MANUAL_BUILD=0
ENV RDMAV_FORK_SAFE=1

ARG PYTHON=python3.7
ARG PYTHON_PIP=python3-pip
ARG PIP=pip3
ARG PYTHON_VERSION=3.7.10
ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
ARG OPEN_MPI_PATH=/opt/amazon/openmpi
ARG EFA_PATH=/opt/amazon/efa
ARG NCCL_VERSION=2.7.8
ARG EFA_VERSION=1.11.2
ARG BRANCH_OFI=1.1.1
Expand Down Expand Up @@ -139,8 +141,8 @@ RUN echo "hwloc_base_binding_policy = none" >> $OPEN_MPI_PATH/etc/openmpi-mca-pa

# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf

ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib:$LD_LIBRARY_PATH
RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf
ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH
# /usr/local/lib/libpython* needs to be accessible for dynamic linking
ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH=$OPEN_MPI_PATH/bin/:$PATH
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@
"custom_mpi_options": (
"-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 "
"-x HOROVOD_FUSION_THRESHOLD=16777216 "
"-x TF_CPP_MIN_LOG_LEVEL=3 "
"-x RDMAV_FORK_SAFE=1"
Copy link
Contributor Author

@jeet4320 jeet4320 May 6, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removing RDMAV_FORK_SAFE from tests

"-x TF_CPP_MIN_LOG_LEVEL=3"
),
}
},
Expand All @@ -69,8 +68,7 @@
"custom_mpi_options": (
"-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 "
"-x HOROVOD_FUSION_THRESHOLD=16777216 "
"-x TF_CPP_MIN_LOG_LEVEL=3 "
"-x RDMAV_FORK_SAFE=1"
"-x TF_CPP_MIN_LOG_LEVEL=3"
),
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ HOME_DIR=/test
BIN_DIR=${HOME_DIR}/bin
LOG_DIR=${HOME_DIR}/logs
HOROVOD_VERSION=v0.16.4
export RDMAV_FORK_SAFE=1

git clone -b ${HOROVOD_VERSION} https://github.com/horovod/horovod.git ${HOME_DIR}/artifacts/horovod
${BIN_DIR}/pytorch_tests/testPTHVDHelper || exit 1
Expand Down
4 changes: 2 additions & 2 deletions test/dlc_tests/container_tests/bin/testTF2HVDHelper
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BIN_DIR=${HOME_DIR}/bin
LOG_DIR=${HOME_DIR}/logs
TRAINING_LOG=${LOG_DIR}/tensorflow_horovod_test.log
HOVOROD_DIR=${BIN_DIR}/examples/Horovod
export RDMAV_FORK_SAFE=1

set -e

echo "Simply verify if Horovod works well. You can follow progress on the log file : $TRAINING_LOG" | tee -a $TRAINING_LOG
Expand Down Expand Up @@ -45,7 +45,7 @@ if [ ${RETURN_VAL} -eq 0 ]; then
-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
-x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
-x NCCL_SOCKET_IFNAME=$INTERFACE -mca btl_tcp_if_exclude lo,docker0 \
-x TF_CPP_MIN_LOG_LEVEL=0 -x RDMAV_FORK_SAFE\
-x TF_CPP_MIN_LOG_LEVEL=0 \
python -W ignore ${HOVOROD_DIR}/tf2_train_imagenet_resnet_hvd.py \
--synthetic --batch_size 64 --num_batches 100 --clear_log 2> ${TRAINING_LOG}
else
Expand Down
1 change: 0 additions & 1 deletion test/dlc_tests/container_tests/bin/testTFKerasHVDAMP
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ HOME_DIR=/test
BIN_DIR=${HOME_DIR}/bin
LOG_DIR=${HOME_DIR}/logs

export RDMAV_FORK_SAFE=1
python ${BIN_DIR}/testTFKerasHVD.py AMP || exit 1

exit 0
1 change: 0 additions & 1 deletion test/dlc_tests/container_tests/bin/testTFKerasHVDFP32
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ HOME_DIR=/test
BIN_DIR=${HOME_DIR}/bin
LOG_DIR=${HOME_DIR}/logs

export RDMAV_FORK_SAFE=1
python ${BIN_DIR}/testTFKerasHVD.py FP32 || exit 1

exit 0
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ spec:
- -x
- LD_LIBRARY_PATH
- -x
- RDMAV_FORK_SAFE=1
- -x
- PATH
- -x
- NCCL_SOCKET_IFNAME=eth0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance
"mpi": {
"enabled": True,
"processes_per_host": num_processes,
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x RDMAV_FORK_SAFE=1 ",
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
},
},
)
Expand Down Expand Up @@ -223,7 +223,7 @@ def test_smmodelparallel_mnist_multigpu_multinode_efa(n_virginia_ecr_image, efa_
"mpi": {
"enabled": True,
"processes_per_host": num_processes,
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
},
},
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@

set -ex

smddpsinglenode -x RDMAV_FORK_SAFE=1 python smdataparallel_mnist.py
smddpsinglenode python smdataparallel_mnist.py

bash smmodelparallel_mnist_script_mode.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
set -ex

export SM_HP_MP_PARAMETERS=\{\"ddp\":true,\"microbatches\":4,\"partitions\":2,\"pipeline\":\"interleaved\"\}
mpirun -mca btl_vader_single_copy_mechanism none --allow-run-as-root -x RDMAV_FORK_SAFE=1 -np 8 python smmodelparallel_pt_mnist.py --assert-losses 1 --data-dir data/training
mpirun -mca btl_vader_single_copy_mechanism none --allow-run-as-root -np 8 python smmodelparallel_pt_mnist.py --assert-losses 1 --data-dir data/training
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_distributed_training_horovod(sagemaker_session,
tmpdir,
framework_version):

mpi_options = '-verbose -x orte_base_help_aggregate=0 -x RDMAV_FORK_SAFE=1'
mpi_options = '-verbose -x orte_base_help_aggregate=0'
estimator = TensorFlow(
entry_point=os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py'),
role='SageMakerRole',
Expand Down Expand Up @@ -63,7 +63,7 @@ def test_distributed_training_horovod_with_env_vars(
sagemaker_session, instance_type, ecr_image, tmpdir, framework_version
):

mpi_options = "-verbose -x orte_base_help_aggregate=0 -x RDMAV_FORK_SAFE=1"
mpi_options = "-verbose -x orte_base_help_aggregate=0"
estimator = TensorFlow(
entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_env_vars.py"),
role="SageMakerRole",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_smmodelparallel_efa(n_virginia_sagemaker_session, efa_instance_type, n_
"mpi": {
"enabled": True,
"processes_per_host": num_processes,
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
}
},
sagemaker_session=n_virginia_sagemaker_session,
Expand Down Expand Up @@ -105,7 +105,7 @@ def test_smmodelparallel_multinode_efa(n_virginia_sagemaker_session, efa_instanc
"mpi": {
"enabled": True,
"processes_per_host": num_processes,
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
}
},
sagemaker_session=n_virginia_sagemaker_session,
Expand Down Expand Up @@ -138,7 +138,7 @@ def test_smmodelparallel(n_virginia_sagemaker_session, instance_type, n_virginia
"mpi": {
"enabled": True,
"processes_per_host": num_processes,
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 ",
}
},
sagemaker_session=n_virginia_sagemaker_session,
Expand Down Expand Up @@ -172,7 +172,7 @@ def test_smmodelparallel_multinode(n_virginia_sagemaker_session, instance_type,
"mpi": {
"enabled": True,
"processes_per_host": num_processes,
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 ",
}
},
sagemaker_session=n_virginia_sagemaker_session,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@

set -ex

smddpsinglenode -x RDMAV_FORK_SAFE=1 python smdataparallel_mnist.py
mpirun --allow-run-as-root -x RDMAV_FORK_SAFE=1 -np 2 python tf2_conv.py
smddpsinglenode python smdataparallel_mnist.py
mpirun --allow-run-as-root -np 2 python tf2_conv.py