Backport infra/ansible to r2.0 #5104

Merged: 1 commit, May 31, 2023
6 changes: 6 additions & 0 deletions infra/ansible/.ansible-lint
@@ -0,0 +1,6 @@
---
# .ansible-lint

profile: moderate
skip_list:
- schema[tasks]
32 changes: 32 additions & 0 deletions infra/ansible/Dockerfile
@@ -0,0 +1,32 @@
ARG python_version=3.8
ARG debian_version=buster

FROM python:${python_version}-${debian_version} AS build

WORKDIR /ansible
RUN pip install ansible
COPY . /ansible

ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=build" -e "${ansible_vars}"

FROM python:${python_version}-${debian_version} AS release

WORKDIR /ansible
RUN pip install ansible
COPY . /ansible

ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "install_deps"

WORKDIR /tmp/wheels
COPY --from=build /src/pytorch/dist/*.whl ./
COPY --from=build /src/pytorch/xla/dist/*.whl ./

RUN echo "Installing the following wheels" && ls *.whl
RUN pip install *.whl

WORKDIR /

RUN rm -rf /ansible /tmp/wheels
COPY --from=build /dist/*.whl /dist/
58 changes: 58 additions & 0 deletions infra/ansible/README.md
@@ -0,0 +1,58 @@
# Ansible playbook

This Ansible playbook performs the following actions on the localhost:
* install required pip and apt packages, depending on the specified stage,
architecture and accelerator (see [apt.yaml](config/apt.yaml) and
[pip.yaml](config/pip.yaml)),
* fetch Bazel (version configured in [vars.yaml](config/vars.yaml)),
* fetch PyTorch and XLA sources at master (or specific revisions,
see role `fetch_srcs` in [playbook.yaml](playbook.yaml)),
* set required environment variables (see [env.yaml](config/env.yaml)),
* build and install PyTorch and XLA wheels,
* run infrastructure tests (see `*/tests.yaml` files in [roles](roles)).

## Prerequisites

* Python 3.8+
* Ansible. Install with `pip install ansible`.

## Running

The playbook requires three variables to be passed explicitly; they configure
its behavior (which pip/apt packages are installed and which environment variables are set):
* `stage`: build or release. Different packages are installed depending on
the chosen stage.
* `arch`: aarch64 or amd64. Architecture of the built image and wheels.
* `accelerator`: tpu or cuda. Available accelerator.

The variables can be passed through `-e` flag: `-e "<var>=<value>"`.

Example: `ansible-playbook playbook.yaml -e "stage=build arch=amd64 accelerator=tpu"`
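The same variables are what the Dockerfiles in this directory forward to the playbook through the `ansible_vars` build argument (`stage` is set by the Dockerfile itself). A minimal sketch of composing that build command; the values and image path are examples only:

```shell
# Sketch: composing the ansible_vars build arg that the Dockerfiles pass
# through to `ansible-playbook -e`. Values here are examples only.
arch=amd64
accelerator=tpu
ansible_vars="arch=${arch} accelerator=${accelerator}"
echo "docker build --build-arg ansible_vars=\"${ansible_vars}\" -f infra/ansible/Dockerfile infra/ansible"
```

This prints the `docker build` invocation rather than running it, so the sketch stays side-effect free.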

## Config structure

The playbook configuration is split into four files, one per logical system.
The configuration is simply loaded as playbook variables which are then passed
to specific roles and tasks.
Only variables in [config/env.yaml](config/env.yaml) are passed as env variables.

* [apt.yaml](config/apt.yaml) - specifies apt packages for each stage and
architecture or accelerator.
Packages shared between all architectures and accelerators in a given stage
are specified in `*_common`. They are appended to any architecture-specific list.

This config also contains a list of required apt repos and signing keys.
These variables are mainly consumed by the [install_deps](roles/install_deps/tasks/main.yaml) role.

* [pip.yaml](config/pip.yaml) - similarly to apt.yaml, lists pip packages per stage and arch / accelerator.
In both the pip and apt config files, stage and arch / accelerator are
concatenated into a single key (e.g. build_amd64, release_tpu).

* [env.yaml](config/env.yaml) - contains Ansible variables that are passed as environment variables when
building PyTorch and XLA (`build_env`). Variables in `release_env` are saved to `/etc/environment` (applied only in the `release` stage).

* [vars.yaml](config/vars.yaml) - Ansible variables used in other config files and throughout the playbook.
Not associated with any particular system.

Variables from these config files are dynamically loaded (during playbook execution),
see [playbook.yaml](playbook.yaml).
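As a concrete illustration of the key concatenation described above, the lists consulted for `stage=build arch=amd64 accelerator=tpu` can be enumerated by hand. This only mirrors the naming convention (`<stage>_common`, `<stage>_<arch>`, `<stage>_<accelerator>`); the actual lookup happens inside the playbook, and keys absent from a config file simply contribute nothing:

```shell
# Sketch: which apt.pkgs lists apply for a given stage/arch/accelerator,
# per the <stage>_<suffix> naming convention used in apt.yaml and pip.yaml.
stage=build
arch=amd64
accelerator=tpu
for suffix in common "${arch}" "${accelerator}"; do
  echo "apt.pkgs.${stage}_${suffix}"
done
```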
16 changes: 16 additions & 0 deletions infra/ansible/ansible.cfg
@@ -0,0 +1,16 @@
# See https://docs.ansible.com/ansible/latest/reference_appendices/config.html
# for various configuration options.

[defaults]
# Displays tasks execution duration.
callbacks_enabled = profile_tasks
# The playbook is only run on the implicit localhost.
# Silence warning about empty hosts inventory.
localhost_warning = False
# Make output human-readable.
stdout_callback = yaml

[inventory]
# Silence warning about no inventory.
# This option was added in Ansible 2.14 (which requires Python 3.9+).
inventory_unparsed_warning = False
60 changes: 60 additions & 0 deletions infra/ansible/config/apt.yaml
@@ -0,0 +1,60 @@
# Contains lists of apt packages for each stage (build|release) and arch or accelerator.
apt:
pkgs:
build_common:
- ccache
- curl
- git
- gnupg
- libopenblas-dev
- ninja-build
- procps
- python3-pip
- rename
- vim
- wget
- clang-format-7
- lcov
- less

build_cuda:
- "cuda-libraries-{{ cuda_version | replace('.', '-') }}"
- "cuda-toolkit-{{ cuda_version | replace('.', '-') }}"
- "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}"
- "{{ cuda_deps['libcudnn'][cuda_version] }}"
- "{{ cuda_deps['libcudnn-dev'][cuda_version] }}"

build_amd64:
- "clang-{{ clang_version }}"

build_aarch64:
- scons
- gcc-10
- g++-10

release_common:
- curl
- git
- gnupg
- libgomp1
- libopenblas-base
- patch

release_cuda:
- "cuda-libraries-{{ cuda_version | replace('.', '-') }}"
- "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}"
- "{{ cuda_deps['libcudnn'][cuda_version] }}"

# Specify objects with string fields `url` and `keyring`.
# The keyring path should start with /usr/share/keyrings/ for debian and ubuntu.
signing_keys:
- url: https://apt.llvm.org/llvm-snapshot.gpg.key
keyring: /usr/share/keyrings/llvm.pgp
- url: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_repo }}/x86_64/3bf863cc.pub"
keyring: /usr/share/keyrings/cuda.pgp

repos:
# signed-by path should match the corresponding keyring path above.
- "deb [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main"
- "deb-src [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main"
- "deb [signed-by=/usr/share/keyrings/cuda.pgp] https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_repo }}/x86_64/ /"
12 changes: 12 additions & 0 deletions infra/ansible/config/cuda_deps.yaml
@@ -0,0 +1,12 @@
# Versions of cuda dependencies for given cuda versions.
# Note: wrap versions in quotes to ensure they're treated as strings.
cuda_deps:
# List all libcudnn8 versions with `apt list -a libcudnn8`
libcudnn:
"11.8": libcudnn8=8.8.0.121-1+cuda11.8
"11.7": libcudnn8=8.5.0.96-1+cuda11.7
"11.2": libcudnn8=8.1.1.33-1+cuda11.2
libcudnn-dev:
"11.8": libcudnn8-dev=8.8.0.121-1+cuda11.8
"11.7": libcudnn8-dev=8.5.0.96-1+cuda11.7
"11.2": libcudnn8-dev=8.1.1.33-1+cuda11.2
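To see how these entries are consumed, the Jinja expressions in apt.yaml can be evaluated by hand. A small sketch for `cuda_version: "11.8"` (the first line reproduces `replace('.', '-')` with `tr`; the second is the literal value of `cuda_deps['libcudnn']['11.8']` from the table above):

```shell
# Sketch: hand-evaluating the Jinja expressions from apt.yaml for
# cuda_version "11.8", using the cuda_deps values above.
cuda_version="11.8"
echo "cuda-libraries-$(echo "${cuda_version}" | tr . -)"   # {{ cuda_version | replace('.', '-') }}
echo "libcudnn8=8.8.0.121-1+cuda11.8"                      # {{ cuda_deps['libcudnn'][cuda_version] }}
```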
49 changes: 49 additions & 0 deletions infra/ansible/config/env.yaml
@@ -0,0 +1,49 @@
# Variables that will be stored in the ~/.bashrc and ~/.zshrc files for the release stage.
# They'll be accessible to all processes on the host, including in the development image.
release_env:
common:
# Force GCC because clang/bazel has issues.
CC: gcc
CXX: g++
# CC: "clang-{{ clang_version }}"
# CXX: "clang++-{{ clang_version }}"
LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib"

tpu:
ACCELERATOR: tpu
TPUVM_MODE: 1

cuda:
TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
XLA_CUDA: 1

# Variables that will be passed to shell environment only for building PyTorch and XLA libs.
build_env:
common:
LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib"
# Set explicitly to 0 as setup.py defaults this flag to true if unset.
BUILD_CPP_TESTS: 0
# Force GCC because clang/bazel has issues.
CC: gcc
CXX: g++
PYTORCH_BUILD_NUMBER: 1
TORCH_XLA_VERSION: "{{ package_version }}"
PYTORCH_BUILD_VERSION: "{{ package_version }}"
XLA_SANDBOX_BUILD: 1
BAZEL_REMOTE_CACHE: 1
SILO_NAME: "cache-silo-{{ arch }}-{{ accelerator }}"

amd64:
ARCH: amd64

aarch64:

cuda:
TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
XLA_CUDA: 1

tpu:
ACCELERATOR: tpu
TPUVM_MODE: 1
BUNDLE_LIBTPU: 1

53 changes: 53 additions & 0 deletions infra/ansible/config/pip.yaml
@@ -0,0 +1,53 @@
# Contains lists of pip packages for each stage (build|release) and arch or accelerator.
pip:
pkgs:
# Shared between all architectures and accelerators for the build stage.
build_common:
- astunparse
- cffi
- cloud-tpu-client
- cmake
- coverage
- dataclasses
- expecttest==0.1.3
- future
- git-archive-all
- google-api-python-client
- google-cloud-storage
- hypothesis
- lark-parser
- ninja
- numpy
- oauth2client
- pyyaml
- requests
- setuptools
- six
- tensorboard
- tensorboardX
- tqdm
- typing
- typing_extensions
- sympy
- yapf==0.30.0

build_amd64:
- mkl
- mkl-include

build_aarch64:

# Shared between all architectures and accelerators for the release stage.
release_common:
- numpy
- pyyaml
- mkl
- mkl-include

release_tpu:

# Packages that will be installed with the `--nodeps` flag.
pkgs_nodeps:
release_common:
- torchvision
- pillow
10 changes: 10 additions & 0 deletions infra/ansible/config/vars.yaml
@@ -0,0 +1,10 @@
# Used for fetching CUDA packages from the right repo, see apt.yaml.
cuda_repo: ubuntu1804
cuda_version: "11.8"
# Used for fetching clang from the right repo, see apt.yaml.
llvm_debian_repo: buster
clang_version: 10
# PyTorch and PyTorch/XLA wheel versions.
package_version: 2.0
# If set to true, wheels will be renamed to $WHEEL_NAME-nightly-cp38-cp38-linux_x86_64.whl.
nightly_release: false
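Any of the defaults above can be replaced at run time with `-e`, since extra vars have the highest variable precedence in Ansible. A hypothetical override building against CUDA 11.7 instead of 11.8 (printed rather than executed, since running it requires the full playbook environment):

```shell
# Hypothetical override: replace a vars.yaml default via `-e` on the command line.
override="cuda_version=11.7"
echo "ansible-playbook playbook.yaml -e \"stage=build arch=amd64 accelerator=cuda\" -e \"${override}\""
```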
19 changes: 19 additions & 0 deletions infra/ansible/development.Dockerfile
@@ -0,0 +1,19 @@
# Dockerfile for building a development image.
# The built image contains all required pip and apt packages for building and
# running PyTorch and PyTorch/XLA. The image doesn't contain any source code.
ARG python_version=3.8
ARG debian_version=buster

FROM python:${python_version}-${debian_version}

RUN pip install ansible

COPY . /ansible
WORKDIR /ansible

# List Ansible task tags to apply for the dev image.
ENV TAGS="bazel,configure_env,install_deps"

ARG ansible_vars
RUN ansible-playbook playbook.yaml -e "stage=build" -e "${ansible_vars}" --tags "${TAGS}"
RUN ansible-playbook playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "${TAGS}"
38 changes: 38 additions & 0 deletions infra/ansible/e2e_tests.Dockerfile
@@ -0,0 +1,38 @@
ARG python_version=3.8
ARG debian_version=buster

FROM python:${python_version}-${debian_version} AS build

WORKDIR /ansible
RUN pip install ansible
COPY . /ansible

# Build PyTorch and PyTorch/XLA wheels.
ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=build" -e "${ansible_vars}"

FROM python:${python_version}-${debian_version}
WORKDIR /ansible
RUN pip install ansible
COPY . /ansible

# Install runtime pip and apt dependencies.
ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "install_deps"

# Copy test sources.
RUN mkdir -p /src/pytorch/xla
COPY --from=build /src/pytorch/xla/test /src/pytorch/xla/test

# Copy and install wheels.
WORKDIR /tmp/wheels
COPY --from=build /src/pytorch/dist/*.whl ./
COPY --from=build /src/pytorch/xla/dist/*.whl ./

RUN echo "Installing the following wheels" && ls *.whl
RUN pip install *.whl

WORKDIR /

# Clean-up unused directories.
RUN rm -rf /ansible /tmp/wheels