-
Notifications
You must be signed in to change notification settings - Fork 892
gpl: opt-in HPWL GPU acceleration via Kokkos #10370
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
65d6f36
c5ac41e
6890d43
f83bd04
d56a5cb
01a89eb
5912887
12868cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,126 @@ | ||
| # SPDX-License-Identifier: BSD-3-Clause | ||
| # Copyright (c) 2026, The OpenROAD Authors | ||
|
|
||
| # Kokkos GPU backend wiring for OpenROAD. Included from the root | ||
| # CMakeLists.txt only when ENABLE_GPU=ON; not loaded otherwise. | ||
| # | ||
| # Discovers the user's Kokkos install, inherits its compute backend, turns | ||
| # on the matching CMake language so downstream targets can mark kernel | ||
| # sources with set_source_files_properties(... LANGUAGE CUDA|HIP), and | ||
| # applies the small set of nvcc / fmt / host-compiler workarounds that the | ||
| # CUDA backend currently needs in modern Linux toolchains. Per-module | ||
| # CMakeLists (e.g. src/gpl) key off ENABLE_GPU and Kokkos_ENABLE_*; they | ||
| # do not need to call find_package(Kokkos) or enable_language() themselves. | ||
|
|
||
| # Look Kokkos up quietly so that, on failure, the actionable message below is | ||
| # what the user sees. Configuration stops at the FATAL_ERROR branch. | ||
| find_package(Kokkos QUIET) | ||
| if(Kokkos_FOUND) | ||
|   message(STATUS "OpenROAD: GPU acceleration enabled (Kokkos ${Kokkos_VERSION})") | ||
| else() | ||
|   message(FATAL_ERROR | ||
|     "OpenROAD: ENABLE_GPU=ON requires the Kokkos package to be " | ||
|     "installed and discoverable by CMake, but Kokkos was not found.\n" | ||
|     " - If Kokkos is already installed: pass " | ||
|     "-DKokkos_ROOT=/path/to/kokkos (or extend CMAKE_PREFIX_PATH).\n" | ||
|     " - If not: build and install Kokkos from " | ||
|     "https://github.com/kokkos/kokkos with the desired backend " | ||
|     "(CUDA / HIP / SYCL / OpenMP) and a target architecture that " | ||
|     "matches the host GPU.\n" | ||
|     " - A future etc/DependencyInstaller.sh -gpu option will " | ||
|     "automate this step.") | ||
| endif() | ||
|
|
||
| # Backend dispatch: exactly one branch runs, chosen by the Kokkos_ENABLE_* | ||
| # flags checked below. Only the CUDA and HIP branches call enable_language(). | ||
| if(Kokkos_ENABLE_CUDA) | ||
|   # Auto-discover nvcc when the user has CUDA installed at a standard | ||
|   # location but their environment does not expose it on PATH (common | ||
|   # with IDE-launched configures: the bundled CMake does not inherit | ||
|   # the shell PATH). enable_language(CUDA) below would otherwise abort | ||
|   # with "No CMAKE_CUDA_COMPILER could be found" even though Kokkos's | ||
|   # find_package already located the toolkit. | ||
|   if(NOT DEFINED CMAKE_CUDA_COMPILER AND NOT DEFINED ENV{CUDACXX}) | ||
|     # CUDA_HOME / CUDA_PATH / CUDA_ROOT conventionally name the toolkit root, | ||
|     # while nvcc lives in <root>/bin. find_program() treats HINTS as literal | ||
|     # directories, so PATH_SUFFIXES bin is required for the ENV hints to | ||
|     # match; the explicit .../bin entries are still searched as given. | ||
|     find_program(_OPENROAD_NVCC nvcc | ||
|       HINTS ENV CUDA_HOME ENV CUDA_PATH ENV CUDA_ROOT | ||
|       /usr/local/cuda/bin | ||
|       /usr/local/cuda-13.0/bin | ||
|       /usr/local/cuda-12.8/bin /usr/local/cuda-12.0/bin | ||
|       /opt/cuda/bin | ||
|       PATH_SUFFIXES bin | ||
|     ) | ||
|     if(_OPENROAD_NVCC) | ||
|       set(CMAKE_CUDA_COMPILER "${_OPENROAD_NVCC}" CACHE FILEPATH "") | ||
|       message(STATUS "OpenROAD: auto-discovered nvcc at ${_OPENROAD_NVCC}") | ||
|     endif() | ||
|   endif() | ||
|   # nvcc < 13 cannot parse glibc 2.38+'s _Float128 type that ships with | ||
|   # gcc 13+'s C++ standard library headers (math.h template specialization | ||
|   # for __iseqsig_type<_Float128>). When a known-broken pairing is detected, | ||
|   # pin a compatible older g++ as the CUDA host compiler (the system C++ | ||
|   # compiler stays unchanged for non-CUDA TUs). Override is always | ||
|   # available via -DCMAKE_CUDA_HOST_COMPILER or CUDAHOSTCXX. | ||
|   # | ||
|   # NOTE(review): this check is gated on _OPENROAD_NVCC, which is only set by | ||
|   # the auto-discovery above. When the user supplies CMAKE_CUDA_COMPILER or | ||
|   # CUDACXX, the pairing is not inspected and no host compiler is pinned. | ||
|   # Confirm whether that is intended. | ||
|   if(NOT DEFINED CMAKE_CUDA_HOST_COMPILER AND NOT DEFINED ENV{CUDAHOSTCXX} | ||
|      AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU" | ||
|      AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0" | ||
|      AND _OPENROAD_NVCC) | ||
|     execute_process( | ||
|       COMMAND "${_OPENROAD_NVCC}" --version | ||
|       OUTPUT_VARIABLE _OPENROAD_NVCC_VERSION_OUTPUT | ||
|       ERROR_QUIET | ||
|       OUTPUT_STRIP_TRAILING_WHITESPACE) | ||
|     # nvcc prints "... release <major>.<minor>, ..."; only the major is needed. | ||
|     if(_OPENROAD_NVCC_VERSION_OUTPUT MATCHES "release ([0-9]+)") | ||
|       set(_OPENROAD_NVCC_MAJOR "${CMAKE_MATCH_1}") | ||
|       if(_OPENROAD_NVCC_MAJOR LESS 13) | ||
|         # Prefer the newest compatible g++; stop at the first one found. | ||
|         foreach(_OPENROAD_GXX_VER 12 11) | ||
|           find_program(_OPENROAD_CUDAHOST g++-${_OPENROAD_GXX_VER} | ||
|             HINTS /usr/bin /usr/local/bin) | ||
|           if(_OPENROAD_CUDAHOST) | ||
|             set(CMAKE_CUDA_HOST_COMPILER "${_OPENROAD_CUDAHOST}" | ||
|               CACHE FILEPATH "") | ||
|             message(STATUS | ||
|               "OpenROAD: pinning CUDA host compiler to " | ||
|               "${_OPENROAD_CUDAHOST} (nvcc ${_OPENROAD_NVCC_MAJOR}.x + " | ||
|               "glibc/gcc 13+ _Float128 compat)") | ||
|             break() | ||
|           endif() | ||
|           # Drop the cached NOTFOUND result before trying the next version. | ||
|           unset(_OPENROAD_CUDAHOST CACHE) | ||
|         endforeach() | ||
|         if(NOT DEFINED CMAKE_CUDA_HOST_COMPILER) | ||
|           message(FATAL_ERROR | ||
|             "OpenROAD: nvcc ${_OPENROAD_NVCC_MAJOR}.x cannot parse " | ||
|             "_Float128 declarations in glibc 2.38+ system headers used " | ||
|             "by gcc ${CMAKE_CXX_COMPILER_VERSION}, and no compatible " | ||
|             "g++-12 / g++-11 was found in /usr/bin or /usr/local/bin. " | ||
|             "Install one (e.g. apt install g++-12) or set " | ||
|             "-DCMAKE_CUDA_HOST_COMPILER=/path/to/older-g++ explicitly.") | ||
|         endif() | ||
|       endif() | ||
|     endif() | ||
|   endif() | ||
|   # Inherit the architecture list from the Kokkos package unless the user | ||
|   # already provided CMAKE_CUDA_ARCHITECTURES; fail if neither is available. | ||
|   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") | ||
|     if(DEFINED Kokkos_CUDA_ARCHITECTURES | ||
|        AND NOT "${Kokkos_CUDA_ARCHITECTURES}" STREQUAL "") | ||
|       set(CMAKE_CUDA_ARCHITECTURES "${Kokkos_CUDA_ARCHITECTURES}") | ||
|     else() | ||
|       message(FATAL_ERROR | ||
|         "OpenROAD: ENABLE_GPU=ON with Kokkos CUDA backend, but the " | ||
|         "Kokkos package does not advertise Kokkos_CUDA_ARCHITECTURES " | ||
|         "and CMAKE_CUDA_ARCHITECTURES was not provided. Set " | ||
|         "-DCMAKE_CUDA_ARCHITECTURES=<arch> explicitly (e.g. 89 for " | ||
|         "RTX 4070, 120 for RTX 5090) or rebuild Kokkos with the " | ||
|         "target architecture baked in.") | ||
|     endif() | ||
|   endif() | ||
|   enable_language(CUDA) | ||
|   find_package(CUDAToolkit REQUIRED) | ||
|   message(STATUS "OpenROAD: CUDA backend (arch=${CMAKE_CUDA_ARCHITECTURES})") | ||
|   # nvcc 12.8 cannot parse fmt 11's nontype-template-parameter user-defined | ||
|   # literals (fmt/bundled/format.h: operator""_a with fixed_string). The | ||
|   # legacy literal fallback is still available; opt into it for CUDA TUs | ||
|   # only. Project-wide CXX compilation is unaffected. | ||
|   add_compile_definitions( | ||
|     $<$<COMPILE_LANGUAGE:CUDA>:FMT_USE_NONTYPE_TEMPLATE_ARGS=0>) | ||
| elseif(Kokkos_ENABLE_HIP) | ||
|   enable_language(HIP) | ||
|   message(STATUS "OpenROAD: HIP backend") | ||
| elseif(Kokkos_ENABLE_SYCL) | ||
|   message(STATUS "OpenROAD: SYCL backend (driven by Kokkos host compiler)") | ||
| else() | ||
|   message(STATUS | ||
|     "OpenROAD: host-only Kokkos backend (Serial / OpenMP / Threads)") | ||
| endif() |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,175 @@ | ||||||
| // SPDX-License-Identifier: BSD-3-Clause | ||||||
| // Copyright (c) 2026, The OpenROAD Authors | ||||||
|
|
||||||
| // HPWL (half-perimeter wirelength) Kokkos backend. | ||||||
| // | ||||||
| // Compiled and linked only when ENABLE_GPU=ON; the OpenMP equivalent in | ||||||
| // ../hpwl.cpp is linked otherwise. Exactly one translation unit defines | ||||||
| // NesterovBaseCommon::getHpwl() per build (CMake-enforced ODR). | ||||||
| // | ||||||
| // Determinism: integer arithmetic; bit-exact across Kokkos backends | ||||||
| // (Serial / OpenMP / Threads / CUDA) and against the OpenMP CPU loop. | ||||||
|
|
||||||
| #include <Kokkos_Core.hpp> | ||||||
| #include <climits> | ||||||
| #include <cstdint> | ||||||
| #include <cstdlib> | ||||||
|
Contributor
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: included header cstdint is not used directly [misc-include-cleaner]
Suggested change
|
||||||
| #include <mutex> | ||||||
| #include <vector> | ||||||
|
|
||||||
|
Contributor
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: included header vector is not used directly [misc-include-cleaner]
Suggested change
|
||||||
| #include "nesterovBase.h" | ||||||
|
|
||||||
| namespace gpl { | ||||||
|
|
||||||
| // Lazy Kokkos lifecycle owned by gpl_lib so that the host application | ||||||
| // (the openroad binary, regression drivers, etc.) does not need to know | ||||||
| // Kokkos exists. The first GPU HPWL call initializes Kokkos and registers | ||||||
| // an atexit handler that finalizes it once at process shutdown — this is | ||||||
| // the upstream-safe pattern for opt-in CUDA backends without disrupting | ||||||
| // OpenROAD's existing main(). std::call_once keeps the initialization | ||||||
| // safe if a future caller drops the master-thread invariant. | ||||||
| namespace { | ||||||
| // Process-exit hook: finalizes Kokkos only while it is initialized and has | ||||||
| // not been finalized already. | ||||||
| void finalizeKokkosAtExit() | ||||||
| { | ||||||
|   if (!Kokkos::is_initialized() || Kokkos::is_finalized()) { | ||||||
|     return; | ||||||
|   } | ||||||
|   Kokkos::finalize(); | ||||||
| } | ||||||
|
|
||||||
| // Initializes Kokkos at most once per process, with Kokkos warnings disabled, | ||||||
| // and registers the exit hook above. If Kokkos is already initialized when | ||||||
| // the one-time section runs, nothing is initialized and no hook is added. | ||||||
| void ensureKokkosInitialized() | ||||||
| { | ||||||
|   static std::once_flag init_guard; | ||||||
|   std::call_once(init_guard, [] { | ||||||
|     if (!Kokkos::is_initialized()) { | ||||||
|       Kokkos::InitializationSettings init_settings; | ||||||
|       init_settings.set_disable_warnings(true); | ||||||
|       Kokkos::initialize(init_settings); | ||||||
|       std::atexit(finalizeKokkosAtExit); | ||||||
|     } | ||||||
|   }); | ||||||
| } | ||||||
| } // namespace | ||||||
|
|
||||||
| // Computes the total half-perimeter wirelength over all nets in gNetStor_ | ||||||
| // on Kokkos::DefaultExecutionSpace and writes every net's bounding box back | ||||||
| // through setBox(). | ||||||
| // | ||||||
| // Returns the sum over nets of (ux - lx) + (uy - ly) as int64_t. Nets with | ||||||
| // no pins add 0 to the sum; their box is still written back as | ||||||
| // (INT_MAX, INT_MAX, INT_MIN, INT_MIN). With zero nets the function returns | ||||||
| // 0 before Kokkos is initialized. | ||||||
| int64_t NesterovBaseCommon::getHpwl() | ||||||
| { | ||||||
|   const int n_nets = static_cast<int>(gNetStor_.size()); | ||||||
|   if (n_nets == 0) { | ||||||
|     return 0; | ||||||
|   } | ||||||
|
|
||||||
|   ensureKokkosInitialized(); | ||||||
|
|
||||||
|   // ---- 1. Flatten net→pins to CSR on host ---- | ||||||
|   // h_net_off[i] .. h_net_off[i + 1] is net i's slice of the pin arrays. | ||||||
|   std::vector<int> h_net_off(n_nets + 1, 0); | ||||||
|   for (int i = 0; i < n_nets; ++i) { | ||||||
|     h_net_off[i + 1] | ||||||
|         = h_net_off[i] + static_cast<int>(gNetStor_[i].getGPins().size()); | ||||||
|   } | ||||||
|   const int total_pins = h_net_off[n_nets]; | ||||||
|
|
||||||
|   std::vector<int> h_pin_cx(total_pins); | ||||||
|   std::vector<int> h_pin_cy(total_pins); | ||||||
|   for (int i = 0; i < n_nets; ++i) { | ||||||
|     int off = h_net_off[i]; | ||||||
|     for (auto* gPin : gNetStor_[i].getGPins()) { | ||||||
|       h_pin_cx[off] = gPin->cx(); | ||||||
|       h_pin_cy[off] = gPin->cy(); | ||||||
|       ++off; | ||||||
|     } | ||||||
|   } | ||||||
|
|
||||||
|   // ---- 2. Mirror inputs to device ---- | ||||||
|   using ExecSpace = Kokkos::DefaultExecutionSpace; | ||||||
|   Kokkos::View<int*, ExecSpace> d_net_off("hpwl_net_off", n_nets + 1); | ||||||
|   Kokkos::View<int*, ExecSpace> d_pin_cx("hpwl_pin_cx", total_pins); | ||||||
|   Kokkos::View<int*, ExecSpace> d_pin_cy("hpwl_pin_cy", total_pins); | ||||||
|
|
||||||
|   // Unmanaged host views wrap the std::vector storage without copying or | ||||||
|   // owning it; the vectors outlive the deep_copy calls below. | ||||||
|   Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> h_net_off_view( | ||||||
|       h_net_off.data(), n_nets + 1); | ||||||
|   Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> h_pin_cx_view( | ||||||
|       h_pin_cx.data(), total_pins); | ||||||
|   Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> h_pin_cy_view( | ||||||
|       h_pin_cy.data(), total_pins); | ||||||
|
|
||||||
|   Kokkos::deep_copy(d_net_off, h_net_off_view); | ||||||
|   Kokkos::deep_copy(d_pin_cx, h_pin_cx_view); | ||||||
|   Kokkos::deep_copy(d_pin_cy, h_pin_cy_view); | ||||||
|
|
||||||
|   // Per-net bbox outputs (kept on device for reduction; mirrored back at end). | ||||||
|   Kokkos::View<int*, ExecSpace> d_lx("hpwl_net_lx", n_nets); | ||||||
|   Kokkos::View<int*, ExecSpace> d_ly("hpwl_net_ly", n_nets); | ||||||
|   Kokkos::View<int*, ExecSpace> d_ux("hpwl_net_ux", n_nets); | ||||||
|   Kokkos::View<int*, ExecSpace> d_uy("hpwl_net_uy", n_nets); | ||||||
|
Comment on lines
+61
to
+100
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current implementation performs multiple host and device memory allocations, data flattening, and H2D transfers on every call to In accordance with performance guidelines for persistent state, consider the following optimizations:
References
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for catching this — the per-call allocation pattern is intentional for this PR's scope (pattern-establishing first cut). Persistent device views and amortized H2D transfers will be needed once multiple kernels share state across a Nesterov iteration (WLEN gradient + density gradient + HPWL all reading the same pin coordinates), and that's the natural place to introduce them. I'll revisit this For this PR, HPWL alone is small relative to placement gradient updates; the alloc overhead, while real, sits below the noise floor of an end-to-end placement run. |
||||||
|
|
||||||
|   // ---- 3. Compute per-net bbox in parallel; serial inner over pins ---- | ||||||
|   Kokkos::parallel_for( | ||||||
|       "hpwl_bbox", | ||||||
|       Kokkos::RangePolicy<ExecSpace>(0, n_nets), | ||||||
|       KOKKOS_LAMBDA(const int i) { | ||||||
|         // Inverted sentinels: a net with no pins keeps ux < lx, which the | ||||||
|         // reduction in step 4 uses to skip it. | ||||||
|         int lx = INT_MAX; | ||||||
|         int ly = INT_MAX; | ||||||
|         int ux = INT_MIN; | ||||||
|         int uy = INT_MIN; | ||||||
|         const int begin = d_net_off(i); | ||||||
|         const int end = d_net_off(i + 1); | ||||||
|         // Serial over pins for determinism (sgizler 80b04e1c1 pattern: do not | ||||||
|         // rely on parallel_reduce ordering even though min/max are commutative | ||||||
|         // — keeps results bit-identical to the CPU updateBox() loop). | ||||||
|         for (int j = begin; j < end; ++j) { | ||||||
|           const int x = d_pin_cx(j); | ||||||
|           const int y = d_pin_cy(j); | ||||||
|           if (x < lx) { | ||||||
|             lx = x; | ||||||
|           } | ||||||
|           if (y < ly) { | ||||||
|             ly = y; | ||||||
|           } | ||||||
|           if (x > ux) { | ||||||
|             ux = x; | ||||||
|           } | ||||||
|           if (y > uy) { | ||||||
|             uy = y; | ||||||
|           } | ||||||
|         } | ||||||
|         d_lx(i) = lx; | ||||||
|         d_ly(i) = ly; | ||||||
|         d_ux(i) = ux; | ||||||
|         d_uy(i) = uy; | ||||||
|       }); | ||||||
|
|
||||||
|   // ---- 4. Sum HPWL across nets (int64 reduction → backend-deterministic) ---- | ||||||
|   int64_t total_hpwl = 0; | ||||||
|   Kokkos::parallel_reduce( | ||||||
|       "hpwl_sum", | ||||||
|       Kokkos::RangePolicy<ExecSpace>(0, n_nets), | ||||||
|       KOKKOS_LAMBDA(const int i, int64_t& acc) { | ||||||
|         const int lx = d_lx(i); | ||||||
|         const int ly = d_ly(i); | ||||||
|         const int ux = d_ux(i); | ||||||
|         const int uy = d_uy(i); | ||||||
|         // Dangling net (no pins): GNet::getHpwl() returns 0 in this case. | ||||||
|         if (ux < lx) { | ||||||
|           return; | ||||||
|         } | ||||||
|         // Widen before subtracting: forming ux - lx in int first would be | ||||||
|         // signed overflow (undefined behavior) for a box wider than INT_MAX. | ||||||
|         acc += (static_cast<int64_t>(ux) - lx) + (static_cast<int64_t>(uy) - ly); | ||||||
|       }, | ||||||
|       total_hpwl); | ||||||
|
Comment on lines
+103
to
+154
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The References
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good observation. One nuance to flag: the kernels are split because Will fold this in once the kernel-launch pattern stabilizes across the WLEN / density kernels — at that point the right shape (custom reducer vs. two passes vs. a |
||||||
|
|
||||||
|   // ---- 5. Mirror per-net bbox back to host GNet objects ---- | ||||||
|   // Subsequent code paths (e.g. routeBase, timing-driven weights) read | ||||||
|   // gNet->lx() / ly() / ux() / uy() and expect them updated. | ||||||
|   auto h_lx = Kokkos::create_mirror_view(d_lx); | ||||||
|   auto h_ly = Kokkos::create_mirror_view(d_ly); | ||||||
|   auto h_ux = Kokkos::create_mirror_view(d_ux); | ||||||
|   auto h_uy = Kokkos::create_mirror_view(d_uy); | ||||||
|   Kokkos::deep_copy(h_lx, d_lx); | ||||||
|   Kokkos::deep_copy(h_ly, d_ly); | ||||||
|   Kokkos::deep_copy(h_ux, d_ux); | ||||||
|   Kokkos::deep_copy(h_uy, d_uy); | ||||||
|
|
||||||
|   for (int i = 0; i < n_nets; ++i) { | ||||||
|     gNetStor_[i].setBox(h_lx(i), h_ly(i), h_ux(i), h_uy(i)); | ||||||
|   } | ||||||
|
|
||||||
|   return total_hpwl; | ||||||
| } | ||||||
|
|
||||||
| } // namespace gpl | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: 'Kokkos_Core.hpp' file not found [clang-diagnostic-error]