diff --git a/README.md b/README.md index d1eb0b1..761d0da 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ After building, verify the image is accessible in Docker: ```bash $ docker images | grep nvidia/bobber -nvidia/bobber 6.3.0 c697a75ee482 36 minutes ago 12.4GB +nvidia/bobber 6.3.0 8e545fee7a4d 10 minutes ago 5.23GB ``` ## Save container @@ -153,7 +153,7 @@ scp -r nvidia_bobber_{version}.tar user@test-machine-3:~/bobber Do this for each host you intend to include in the test. A bash `for` loop to can be used to iterate over all systems - you could also target the high -performance network to speed up the copy further (this is a 10+ GB copy). Like +performance network to speed up the copy further (this is a 5+ GB copy). Like so: ```bash @@ -170,7 +170,7 @@ On all other nodes, load the copied Docker image. ```bash $ docker load < nvidia_bobber_{version}.tar $ docker images | grep bobber -nvidia/bobber 6.3.0 c697a75ee482 36 minutes ago 12.4GB +nvidia/bobber 6.3.0 8e545fee7a4d 10 minutes ago 5.23GB ``` ## Ensure shared filesystem is mounted, if necessary @@ -219,7 +219,7 @@ To verify the container is running, use `docker ps`: ```bash $ docker ps CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES -317b6cf928f8 c697a75ee482 "/usr/local/bin/nvid…" 30 hours ago Up 30 hours bobber +317b6cf928f8 8e545fee7a4d "/usr/local/bin/nvid…" 30 hours ago Up 30 hours bobber ``` ## Create log dir on primary test system diff --git a/bobber/lib/analysis/fio.py b/bobber/lib/analysis/fio.py index bf574ae..9b3836a 100644 --- a/bobber/lib/analysis/fio.py +++ b/bobber/lib/analysis/fio.py @@ -155,7 +155,7 @@ def fio_iops_results(log_contents: str, systems: int, string_to_match: str, return [] for result in match: iops = re.findall(r'[-+]?\d*\.\d+[kMG]|\d+[kMG]|\d+', result) - if len(iops) != 5: + if len(iops) not in [5, 6]: raise ValueError('IOPS cannot be parsed from FIO log!') iops = clean_iops(iops[0]) final_iops.append(iops) diff --git a/bobber/lib/docker/Dockerfile 
b/bobber/lib/docker/Dockerfile index bad0d7f..248f684 100644 --- a/bobber/lib/docker/Dockerfile +++ b/bobber/lib/docker/Dockerfile @@ -1,15 +1,16 @@ # SPDX-License-Identifier: MIT -FROM nvcr.io/nvidia/tensorflow:20.11-tf2-py3 +# Larger base stage with required items for building various tools +FROM nvcr.io/nvidia/cuda:11.2.0-devel-ubuntu20.04 as build ENV DEBIAN_FRONTEND=noninteractive +# Install all required build dependencies RUN apt-get update && apt-get -y install apt-utils && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ - openssh-client \ - openssh-server \ swig \ bison \ - libgfortran3 \ + gcc \ + libgfortran4 \ pkg-config \ autotools-dev \ debhelper \ @@ -42,8 +43,46 @@ RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held- kmod \ libnuma1 \ lsof \ + libopenmpi-dev && \ + rm -rf /var/lib/apt/lists/* + +# Compile NVIDIA's NCCL tests +RUN git clone https://github.com/NVIDIA/nccl-tests && \ + cd nccl-tests/ && \ + git reset --hard ec1b5e22e618d342698fda659efdd5918da6bd9f && \ + make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi + +# Compile OSU microbenchmarks +RUN wget --no-check-certificate https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.6.2.tar.gz && \ + tar zxf osu-micro-benchmarks-5.6.2.tar.gz && \ + cd osu-micro-benchmarks-5.6.2 && \ + ./configure CC=/usr/bin/mpicc CXX=/usr/bin/mpicxx --enable-cuda --with-cuda-include=/usr/local/cuda/include --with-cuda-libpath=/usr/local/cuda/lib64 && \ + make && \ + make install && \ + rm -rf ../*.tar.gz + +# Build IO500, IOR, and mdtest +RUN git clone https://github.com/jyvet/io-500-dev && \ + cd io-500-dev && \ + git reset --hard 0232acfa8e64f7c543db8930dd279009ec9c32bc && \ + utilities/prepare.sh + +# Lighter runtime stage copying only necessary build artifacts from earlier +FROM nvcr.io/nvidia/cuda:11.2.0-runtime-ubuntu20.04 + +ENV DEBIAN_FRONTEND=noninteractive + 
+RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ + openssh-client \ + openssh-server \ + git \ fio \ psmisc \ + libopenmpi-dev \ + openmpi-bin \ + python \ + python3-dev \ + python3-pip \ python3-distutils && \ rm -rf /var/lib/apt/lists/* @@ -65,32 +104,26 @@ RUN mkdir -p /var/run/sshd && \ WORKDIR / -RUN git clone https://github.com/NVIDIA/nccl-tests && \ - cd nccl-tests/ && \ - git reset --hard ec1b5e22e618d342698fda659efdd5918da6bd9f && \ - make MPI=1 MPI_HOME=/usr/local/mpi +# Copy the compiled nccl-tests binaries to the runtime image +COPY --from=build /nccl-tests/build /nccl-tests/build -RUN wget --no-check-certificate https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.6.2.tar.gz && \ - tar zxf osu-micro-benchmarks-5.6.2.tar.gz && \ - cd osu-micro-benchmarks-5.6.2 && \ - ./configure CC=/usr/local/mpi/bin/mpicc CXX=/usr/local/mpi/bin/mpicxx --enable-cuda --with-cuda-include=/usr/local/cuda/include --with-cuda-libpath=/usr/local/cuda/lib64 && \ - make && \ - make install && \ - rm -rf ../*.tar.gz - -RUN python3 -m pip install nvidia-pyindex && \ - python3 -m pip install \ - nvidia-imageinary['mxnet']>=1.1.2 +# Copy the compiled OSU microbenchmarks to the runtime image +COPY --from=build /usr/local/libexec/osu-micro-benchmarks/mpi/collective/ /usr/local/libexec/osu-micro-benchmarks/mpi/collective/ -RUN git clone https://github.com/jyvet/io-500-dev && \ - cd io-500-dev && \ - git reset --hard 0232acfa8e64f7c543db8930dd279009ec9c32bc && \ - utilities/prepare.sh +# Copy the compiled IO500 binaries to the runtime image +COPY --from=build /io-500-dev/bin /io-500-dev/bin RUN git clone https://github.com/NVIDIA/DALI dali && \ cd dali/ && \ git reset --hard fd30786d773d08185d78988b2903dce2ace0a00b +# Quote the requirement: unquoted, the shell treats ">=1.1.2" as an output redirection and drops the version pin +RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools && \ + python3 -m pip install --no-cache-dir nvidia-pyindex && \ + python3 -m pip install --no-cache-dir \ + 
'nvidia-imageinary[tfrecord]>=1.1.2' \ + nvidia-dali-cuda110 + COPY test_scripts /tests/ EXPOSE 2222