
Commit bb398ee

JacoCheung, claude, and geoffreyQiu authored
fix: reduce Docker layers, add auto CI trigger, fix fake ops import (#363)
* fix: reduce Docker image layers to avoid overlay2 max depth limit Aggressively merge RUN instructions in the Dockerfile to reduce total layer count from ~126 to ~119. The inference image was hitting the overlay2 128-layer limit ("failed to register layer: max depth exceeded") on CI nodes. devel stage: 8 RUN + 1 COPY -> 4 RUN + 1 COPY (-4 layers) build stage: 4 RUN + 1 COPY -> 1 RUN + 1 COPY (-3 layers) FBGEMM and TorchRec kept as separate layers for build cache efficiency. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * ci: add pull_request_target trigger for auto CI on PR open/sync Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Fix imports for fake ops wrapper used in expor * fix: remove invalid import of hstu.hstu_ops_gpu The module hstu.hstu_ops_gpu does not exist as a Python module. The C++ source hstu_ops_gpu.cpp compiles into hstu/fbgemm_gpu_experimental_hstu.so, not a separate hstu_ops_gpu submodule. This import was incorrectly added in PR #327 and causes ModuleNotFoundError in CI. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: update FBGEMM submodule to include hstu_ops_gpu.py fake impl Update from 04df536 to 65bad42 which adds fake tensor implementations for torch.export (hstu_ops_gpu.py). This was missing since PR #340 accidentally reverted the submodule pointer. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * ci: allow /build with flags by matching prefix instead of exact string Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * ci: remove pull_request_target trigger, keep only /build comment Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Junyi Qiu <junyiq@nvidia.com>
1 parent c7b9ea2 commit bb398ee

5 files changed

Lines changed: 56 additions & 60 deletions


.github/workflows/blossom-ci.yml

Lines changed: 3 additions & 2 deletions
@@ -33,9 +33,10 @@ jobs:
     outputs:
       args: ${{ env.args }}

-    # This job only runs for pull request comments
+    # This job only runs for /build comments
     if: |
-      github.event.comment.body == '/build' && contains(fromJson('["EmmaQiaoCh","JacoCheung","kanghui0204","jiashuy","shijieliu"]'), github.actor)
+      contains(fromJson('["EmmaQiaoCh","JacoCheung","kanghui0204","jiashuy","shijieliu"]'), github.actor) &&
+      startsWith(github.event.comment.body, '/build')
     steps:
       - name: Check if comment is issued by authorized person
         run: blossom-ci
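The effect of swapping the exact-equality check for a prefix match can be sketched in plain Python (a hypothetical stand-in for the workflow expression, not code from the repo):

```python
# Hypothetical sketch of the new trigger predicate: the job fires when an
# allowlisted actor posts a comment that *starts with* "/build", so comments
# carrying flags, e.g. "/build --stage inference", now match. The old
# condition compared the comment body for exact equality with "/build"
# and therefore rejected any trailing flags.
AUTHORIZED = {"EmmaQiaoCh", "JacoCheung", "kanghui0204", "jiashuy", "shijieliu"}

def should_trigger(actor: str, comment_body: str) -> bool:
    # Mirrors contains(fromJson('[...]'), github.actor) &&
    #         startsWith(github.event.comment.body, '/build')
    return actor in AUTHORIZED and comment_body.startswith("/build")

print(should_trigger("JacoCheung", "/build --stage inference"))  # True
print(should_trigger("JacoCheung", "/build"))                    # True
print(should_trigger("someone-else", "/build"))                  # False
```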

docker/Dockerfile

Lines changed: 35 additions & 52 deletions
@@ -8,61 +8,50 @@ ARG TRITONSERVER_BUILD

 WORKDIR /workspace/deps

+# -- Layer 1: system setup, arch symlinks, tritonserver deps ---
 RUN if [ "${TRITONSERVER_BUILD}" = "1" ]; then \
     ln /bin/python3 /bin/python && \
-    apt-get update -y --fix-missing && apt-get install -y cmake && apt-get install -y patchelf; \
-    fi
-
-RUN if [ "${TRITONSERVER_BUILD}" = "1" ]; then \
+    apt-get update -y --fix-missing && apt-get install -y cmake patchelf && \
     pip3 install pandas rich cloudpickle psutil && \
     pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130; \
-    fi
-
-RUN ARCH=$([ "${TARGETPLATFORM}" = "linux/arm64" ] && echo "aarch64" || echo "x86_64") && \
+    fi && \
+    ARCH=$([ "${TARGETPLATFORM}" = "linux/arm64" ] && echo "aarch64" || echo "x86_64") && \
     rm -rf /usr/lib/${ARCH}-linux-gnu/libnvidia-ml.so.1 && \
     if [ ${ARCH} = "aarch64" ]; then \
       ln -s /usr/local/cuda-13/targets/sbsa-linux/lib/stubs/libnvidia-ml.so /usr/lib/${ARCH}-linux-gnu/libnvidia-ml.so.1; \
     else \
       ln -s /usr/local/cuda-13/targets/${ARCH}-linux/lib/stubs/libnvidia-ml.so /usr/lib/${ARCH}-linux-gnu/libnvidia-ml.so.1; \
-    fi
-
-RUN git clone -b core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \
-    pip install --no-deps -e ./megatron-lm
-
-RUN pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath pyvers
-RUN pip install cloudpickle
-RUN pip install triton==3.6.0
-RUN pip install nvidia-cutlass-dsl==4.3.0
-
-RUN pip install --no-cache-dir setuptools-git-versioning scikit-build && \
-    git clone --recursive -b v1.5.0 https://github.com/pytorch/FBGEMM.git fbgemm && \
-    cd fbgemm/fbgemm_gpu && \
-    python setup.py install --build-target=default --build-variant=cuda -DTORCH_CUDA_ARCH_LIST="7.5 8.0 9.0"
-
-RUN pip install --no-deps tensordict orjson && \
-    git clone --recursive -b release/V1.5.0 https://github.com/pytorch/torchrec.git torchrec && \
-    cd torchrec && \
-    pip install --no-deps .
-
-
-# for dev
-RUN apt update -y --fix-missing && \
-    apt install -y gdb && \
-    apt autoremove -y && \
-    apt clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN pip install --no-cache pre-commit
-
-RUN if [ "${TARGETPLATFORM}" = "linux/arm64" ]; then \
+    fi && \
+    if [ "${TARGETPLATFORM}" = "linux/arm64" ]; then \
       CUDA_TARGET_ARCH=sbsa; \
     elif [ "${TARGETPLATFORM}" = "linux/amd64" ]; then \
       CUDA_TARGET_ARCH=x86_64; \
     else \
       CUDA_TARGET_ARCH=$(uname -m); \
     fi && \
     ln -sf /usr/local/cuda-13/targets/${CUDA_TARGET_ARCH}-linux/include/cccl/cuda \
-       /usr/local/cuda/include/cuda
+       /usr/local/cuda/include/cuda && \
+    apt update -y --fix-missing && \
+    apt install -y gdb && \
+    apt autoremove -y && apt clean && rm -rf /var/lib/apt/lists/*
+
+# -- Layer 2: pip dependencies + Megatron-LM ---
+RUN git clone -b core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \
+    pip install --no-deps -e ./megatron-lm && \
+    pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath pyvers \
+        cloudpickle triton==3.6.0 nvidia-cutlass-dsl==4.3.0 --no-cache pre-commit
+
+# -- Layer 3: FBGEMM (long build, own layer for caching) ---
+RUN pip install --no-cache-dir setuptools-git-versioning scikit-build && \
+    git clone --recursive -b v1.5.0 https://github.com/pytorch/FBGEMM.git fbgemm && \
+    cd fbgemm/fbgemm_gpu && \
+    python setup.py install --build-target=default --build-variant=cuda -DTORCH_CUDA_ARCH_LIST="7.5 8.0 9.0"
+
+# -- Layer 4: TorchRec ---
+RUN pip install --no-deps tensordict orjson && \
+    git clone --recursive -b release/V1.5.0 https://github.com/pytorch/torchrec.git torchrec && \
+    cd torchrec && \
+    pip install --no-deps .

 # Install fbgemm_gpu_hstu (package: fbgemm_gpu_hstu, import: hstu) from submodule
 COPY third_party/FBGEMM /workspace/deps/fbgemm_hstu
@@ -84,23 +73,17 @@ WORKDIR /workspace/recsys-examples
 COPY . .

 RUN cd /workspace/recsys-examples/corelib/dynamicemb && \
-    python setup.py install
-
-RUN cd /workspace/deps && rm -rf nvcomp && \
+    python setup.py install && \
+    cd /workspace/deps && rm -rf nvcomp && \
     wget https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/linux-x86_64/nvcomp-linux-x86_64-5.1.0.21_cuda12-archive.tar.xz && \
     tar -xf nvcomp-linux-x86_64-5.1.0.21_cuda12-archive.tar.xz && \
     mv nvcomp-linux-x86_64-5.1.0.21_cuda12-archive nvcomp && \
-    rm nvcomp-linux-x86_64-5.1.0.21_cuda12-archive.tar.xz
-
-RUN cd /workspace/recsys-examples/examples/commons && \
-    TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0" python3 setup.py install
-
-RUN if [ "${TRITONSERVER_BUILD}" != "1" ]; then \
+    rm nvcomp-linux-x86_64-5.1.0.21_cuda12-archive.tar.xz && \
+    cd /workspace/recsys-examples/examples/commons && \
+    TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0" python3 setup.py install && \
+    if [ "${TRITONSERVER_BUILD}" != "1" ]; then \
     rm -f /usr/lib/$(uname -m)-linux-gnu/libcuda.so.1 && \
-    ln -s /usr/local/cuda-13.1/compat/lib.real/libcuda.so.1 /usr/lib/$(uname -m)-linux-gnu/libcuda.so.1; \
-    fi
-
-RUN if [ "${TRITONSERVER_BUILD}" != "1" ]; then \
+    ln -s /usr/local/cuda-13.1/compat/lib.real/libcuda.so.1 /usr/lib/$(uname -m)-linux-gnu/libcuda.so.1 && \
     cd /workspace/recsys-examples/corelib/dynamicemb && \
     mkdir -p torch_binding_build && cd torch_binding_build && \
     cmake .. && make -j; \
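Since every RUN, COPY, and ADD instruction adds one image layer, the layer savings above come purely from merging RUN instructions. A rough way to audit this is to count layer-creating instructions per stage, as in this hypothetical helper (not part of the repo):

```python
# Hypothetical helper: count layer-creating instructions (RUN, COPY, ADD) in a
# Dockerfile snippet, to estimate how close an image is to overlay2's
# 128-layer depth limit. Backslash-continued lines belong to the same
# instruction and do not add layers.
def count_layer_instructions(dockerfile_text: str) -> int:
    layer_ops = ("RUN", "COPY", "ADD")
    count = 0
    continuation = False
    for line in dockerfile_text.splitlines():
        stripped = line.strip()
        if continuation:
            continuation = stripped.endswith("\\")
            continue
        if stripped.startswith(layer_ops):
            count += 1
            continuation = stripped.endswith("\\")
    return count

# Illustrative before/after, echoing the merge done in this commit:
before = """RUN apt-get update
RUN pip install torchx
RUN pip install cloudpickle
RUN pip install triton==3.6.0"""
after = """RUN apt-get update && \\
    pip install torchx cloudpickle triton==3.6.0"""

print(count_layer_instructions(before), count_layer_instructions(after))  # 4 1
```

The trade-off noted in the commit message applies: merging everything into one RUN would minimize layers but destroy build-cache granularity, which is why the long FBGEMM and TorchRec builds stay in their own layers.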

examples/hstu/modules/exportable_embedding.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,22 @@ def _load_inference_emb_ops() -> bool:


 # Load operators before register fake ops.
-_load_inference_emb_ops()
+# isort: off
+_load_inference_emb_ops()  # registers torch.ops.INFERENCE_EMB.* before import dynamicemb
+import dynamicemb.index_range_meta as _index_range_meta  # noqa: F401 – registers fake impls for torch.export
+import dynamicemb.lookup_meta as _lookup_meta  # noqa: F401 – registers fake impls for torch.export
+
+import hstu_cuda_ops  # noqa: F401 – registers torch.ops.hstu_cuda_ops.*
+import commons.ops.cuda_ops.fake_hstu_cuda_ops  # noqa: F401 – registers fake impls for torch.export
+
+# isort: on
+
+
+# ---------------------------------------------------------------------------
+# ExportableEmbedding Module
+# ---------------------------------------------------------------------------
+
 from configs import InferenceEmbeddingConfig
 from dynamicemb import (
     DynamicEmbInitializerArgs,
@@ -66,10 +81,6 @@ def _load_inference_emb_ops() -> bool:
 from dynamicemb.exportable_tables import InferenceEmbeddingTable
 from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor

-# ---------------------------------------------------------------------------
-# ExportableEmbedding Module
-# ---------------------------------------------------------------------------
-

 class ExportableEmbedding(torch.nn.Module):
     """

examples/hstu/ops/fused_hstu_op.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
 from typing import Optional, Tuple, Union

 import hstu  # noqa: F401 – registers torch.ops.fbgemm.*
+import hstu.hstu_ops_gpu  # noqa: F401 – registers fake impls for torch.export
 import nvtx
 import torch
 from commons.utils.clear_tensor_data import clear_tensor_data

third_party/FBGEMM

Submodule FBGEMM updated 212 files
