Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ After building, verify the image is accessible in Docker:

```bash
$ docker images | grep nvidia/bobber
nvidia/bobber 6.3.0 c697a75ee482 36 minutes ago 12.4GB
nvidia/bobber 6.3.0 8e545fee7a4d 10 minutes ago 5.23GB
```

## Save container
Expand Down Expand Up @@ -153,7 +153,7 @@ scp -r nvidia_bobber_{version}.tar user@test-machine-3:~/bobber

Do this for each host you intend to include in the test. A bash `for` loop
can be used to iterate over all systems - you could also target the high
performance network to speed up the copy further (this is a 10+ GB copy). Like
performance network to speed up the copy further (this is a 5+ GB copy). Like
so:

```bash
Expand All @@ -170,7 +170,7 @@ On all other nodes, load the copied Docker image.
```bash
$ docker load < nvidia_bobber_{version}.tar
$ docker images | grep bobber
nvidia/bobber 6.3.0 c697a75ee482 36 minutes ago 12.4GB
nvidia/bobber 6.3.0 8e545fee7a4d 10 minutes ago 5.23GB
```

## Ensure shared filesystem is mounted, if necessary
Expand Down Expand Up @@ -219,7 +219,7 @@ To verify the container is running, use `docker ps`:
```bash
$ docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
317b6cf928f8 c697a75ee482 "/usr/local/bin/nvid…" 30 hours ago Up 30 hours bobber
317b6cf928f8 8e545fee7a4d "/usr/local/bin/nvid…" 30 hours ago Up 30 hours bobber
```

## Create log dir on primary test system
Expand Down
2 changes: 1 addition & 1 deletion bobber/lib/analysis/fio.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def fio_iops_results(log_contents: str, systems: int, string_to_match: str,
return []
for result in match:
iops = re.findall(r'[-+]?\d*\.\d+[kMG]|\d+[kMG]|\d+', result)
if len(iops) != 5:
if len(iops) not in [5, 6]:
raise ValueError('IOPS cannot be parsed from FIO log!')
iops = clean_iops(iops[0])
final_iops.append(iops)
Expand Down
78 changes: 55 additions & 23 deletions bobber/lib/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# SPDX-License-Identifier: MIT
FROM nvcr.io/nvidia/tensorflow:20.11-tf2-py3
# Larger base stage with required items for building various tools
FROM nvcr.io/nvidia/cuda:11.2.0-devel-ubuntu20.04 as build

ENV DEBIAN_FRONTEND=noninteractive

# Install all required build dependencies
RUN apt-get update && apt-get -y install apt-utils && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
openssh-client \
openssh-server \
swig \
bison \
libgfortran3 \
gcc \
libgfortran4 \
pkg-config \
autotools-dev \
debhelper \
Expand Down Expand Up @@ -42,8 +43,46 @@ RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-
kmod \
libnuma1 \
lsof \
libopenmpi-dev && \
rm -rf /var/lib/apt/lists/*

# Compile NVIDIA's NCCL tests
RUN git clone https://github.com/NVIDIA/nccl-tests && \
cd nccl-tests/ && \
git reset --hard ec1b5e22e618d342698fda659efdd5918da6bd9f && \
make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi

# Compile OSU microbenchmarks
RUN wget --no-check-certificate https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.6.2.tar.gz && \
tar zxf osu-micro-benchmarks-5.6.2.tar.gz && \
cd osu-micro-benchmarks-5.6.2 && \
./configure CC=/usr/bin/mpicc CXX=/usr/bin/mpicxx --enable-cuda --with-cuda-include=/usr/local/cuda/include --with-cuda-libpath=/usr/local/cuda/lib64 && \
make && \
make install && \
rm -rf ../*.tar.gz

# Build IO500, IOR, and mdtest
RUN git clone https://github.com/jyvet/io-500-dev && \
cd io-500-dev && \
git reset --hard 0232acfa8e64f7c543db8930dd279009ec9c32bc && \
utilities/prepare.sh

# Lighter runtime stage copying only necessary build artifacts from earlier
FROM nvcr.io/nvidia/cuda:11.2.0-runtime-ubuntu20.04

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
openssh-client \
openssh-server \
git \
fio \
psmisc \
libopenmpi-dev \
openmpi-bin \
python \
python3-dev \
python3-pip \
python3-distutils && \
rm -rf /var/lib/apt/lists/*

Expand All @@ -65,32 +104,25 @@ RUN mkdir -p /var/run/sshd && \

WORKDIR /

RUN git clone https://github.com/NVIDIA/nccl-tests && \
cd nccl-tests/ && \
git reset --hard ec1b5e22e618d342698fda659efdd5918da6bd9f && \
make MPI=1 MPI_HOME=/usr/local/mpi
# Copy the compiled nccl-tests binaries to the runtime image
COPY --from=build /nccl-tests/build /nccl-tests/build

RUN wget --no-check-certificate https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.6.2.tar.gz && \
tar zxf osu-micro-benchmarks-5.6.2.tar.gz && \
cd osu-micro-benchmarks-5.6.2 && \
./configure CC=/usr/local/mpi/bin/mpicc CXX=/usr/local/mpi/bin/mpicxx --enable-cuda --with-cuda-include=/usr/local/cuda/include --with-cuda-libpath=/usr/local/cuda/lib64 && \
make && \
make install && \
rm -rf ../*.tar.gz

RUN python3 -m pip install nvidia-pyindex && \
python3 -m pip install \
nvidia-imageinary['mxnet']>=1.1.2
# Copy the compiled OSU microbenchmarks to the runtime image
COPY --from=build /usr/local/libexec/osu-micro-benchmarks/mpi/collective/ /usr/local/libexec/osu-micro-benchmarks/mpi/collective/

RUN git clone https://github.com/jyvet/io-500-dev && \
cd io-500-dev && \
git reset --hard 0232acfa8e64f7c543db8930dd279009ec9c32bc && \
utilities/prepare.sh
# Copy the compiled IO500 binaries to the runtime image
COPY --from=build /io-500-dev/bin /io-500-dev/bin

RUN git clone https://github.com/NVIDIA/DALI dali && \
cd dali/ && \
git reset --hard fd30786d773d08185d78988b2903dce2ace0a00b

RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools && \
python3 -m pip install --no-cache-dir nvidia-pyindex && \
python3 -m pip install --no-cache-dir \
nvidia-imageinary['tfrecord']>=1.1.2 \
nvidia-dali-cuda110

COPY test_scripts /tests/

EXPOSE 2222