Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ option(USE_SYSTEM_ABC "Use system shared ABC library" OFF)
# Allow disabling tests
option(ENABLE_TESTS "Enable OpenROAD tests" ON)

# Opt-in GPU acceleration via Kokkos. The actual compute backend (CUDA, HIP,
# SYCL, or host-only OpenMP/Threads) is determined by the installed Kokkos
# package; OpenROAD inspects Kokkos_ENABLE_* and turns on the matching CMake
# language and dependencies automatically (the wiring itself lives in
# cmake/KokkosBackend.cmake and is loaded only when this option is ON).
# See the per-module CMakeLists for how individual subsystems wire their
# GPU sources.
option(ENABLE_GPU "Enable GPU acceleration via Kokkos" OFF)

# Allow enabling address sanitizer
option(ASAN "Enable Address Sanitizer" OFF)

Expand Down Expand Up @@ -92,6 +99,13 @@ if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE RELEASE)
endif()

# GPU backend wiring (opt-in). All Kokkos / CUDA / HIP / SYCL detection,
# compiler probing, and language enablement live in cmake/KokkosBackend.cmake
# and are loaded only when the user opts in via ENABLE_GPU=ON.
# NOTE(review): include(KokkosBackend) resolves by module name, so it
# presumably relies on cmake/ already being on CMAKE_MODULE_PATH earlier
# in this file — confirm the list(APPEND CMAKE_MODULE_PATH ...) ordering.
if(ENABLE_GPU)
include(KokkosBackend)
endif()

if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8.3.0")
message(FATAL_ERROR "Insufficient gcc version. Found ${CMAKE_CXX_COMPILER_VERSION}, but require >= 8.3.0.")
Expand Down
126 changes: 126 additions & 0 deletions cmake/KokkosBackend.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2026, The OpenROAD Authors

# Kokkos GPU backend wiring for OpenROAD. Included from the root
# CMakeLists.txt only when ENABLE_GPU=ON; not loaded otherwise.
#
# Discovers the user's Kokkos install, inherits its compute backend, turns
# on the matching CMake language so downstream targets can mark kernel
# sources with set_source_files_properties(... LANGUAGE CUDA|HIP), and
# applies the small set of nvcc / fmt / host-compiler workarounds that the
# CUDA backend currently needs in modern Linux toolchains. Per-module
# CMakeLists (e.g. src/gpl) key off ENABLE_GPU and Kokkos_ENABLE_*; they
# do not need to call find_package(Kokkos) or enable_language() themselves.

# Probe quietly so the generic CMake not-found output is replaced by the
# actionable, OpenROAD-specific error below.
find_package(Kokkos QUIET)
if(NOT Kokkos_FOUND)
# Fail fast at configure time: every downstream ENABLE_GPU code path
# assumes a Kokkos install with its backend baked in.
message(FATAL_ERROR
"OpenROAD: ENABLE_GPU=ON requires the Kokkos package to be "
"installed and discoverable by CMake, but Kokkos was not found.\n"
" - If Kokkos is already installed: pass "
"-DKokkos_ROOT=/path/to/kokkos (or extend CMAKE_PREFIX_PATH).\n"
" - If not: build and install Kokkos from "
"https://github.com/kokkos/kokkos with the desired backend "
"(CUDA / HIP / SYCL / OpenMP) and a target architecture that "
"matches the host GPU.\n"
" - A future etc/DependencyInstaller.sh -gpu option will "
"automate this step.")
endif()
message(STATUS "OpenROAD: GPU acceleration enabled (Kokkos ${Kokkos_VERSION})")

if(Kokkos_ENABLE_CUDA)
  # --- nvcc discovery -------------------------------------------------------
  # Auto-discover nvcc when the user has CUDA installed at a standard
  # location but their environment does not expose it on PATH (common
  # with IDE-launched configures: the bundled CMake does not inherit
  # the shell PATH). enable_language(CUDA) below would otherwise abort
  # with "No CMAKE_CUDA_COMPILER could be found" even though Kokkos's
  # find_package already located the toolkit.
  if(NOT DEFINED CMAKE_CUDA_COMPILER AND NOT DEFINED ENV{CUDACXX})
    # PATH_SUFFIXES bin lets the environment hints work whether they point
    # at the toolkit root (the conventional CUDA_HOME=/usr/local/cuda) or
    # directly at its bin/ directory; without it, HINTS ENV CUDA_HOME only
    # finds nvcc when the variable itself already ends in /bin.
    find_program(_OPENROAD_NVCC nvcc
      HINTS ENV CUDA_HOME ENV CUDA_PATH ENV CUDA_ROOT
            /usr/local/cuda/bin
            /usr/local/cuda-13.0/bin
            /usr/local/cuda-12.8/bin /usr/local/cuda-12.0/bin
            /opt/cuda/bin
      PATH_SUFFIXES bin
    )
    if(_OPENROAD_NVCC)
      set(CMAKE_CUDA_COMPILER "${_OPENROAD_NVCC}" CACHE FILEPATH "")
      message(STATUS "OpenROAD: auto-discovered nvcc at ${_OPENROAD_NVCC}")
    endif()
  endif()

  # --- host-compiler compatibility pinning ----------------------------------
  # nvcc < 13 cannot parse glibc 2.38+'s _Float128 type that ships with
  # gcc 13+'s C++ standard library headers (math.h template specialization
  # for __iseqsig_type<_Float128>). When a known-broken pairing is detected,
  # pin a compatible older g++ as the CUDA host compiler (the system C++
  # compiler stays unchanged for non-CUDA TUs). Override is always
  # available via -DCMAKE_CUDA_HOST_COMPILER or CUDAHOSTCXX.
  if(NOT DEFINED CMAKE_CUDA_HOST_COMPILER AND NOT DEFINED ENV{CUDAHOSTCXX}
     AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU"
     AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0"
     AND _OPENROAD_NVCC)
    execute_process(
      COMMAND "${_OPENROAD_NVCC}" --version
      OUTPUT_VARIABLE _OPENROAD_NVCC_VERSION_OUTPUT
      ERROR_QUIET
      OUTPUT_STRIP_TRAILING_WHITESPACE)
    if(_OPENROAD_NVCC_VERSION_OUTPUT MATCHES "release ([0-9]+)")
      set(_OPENROAD_NVCC_MAJOR "${CMAKE_MATCH_1}")
      if(_OPENROAD_NVCC_MAJOR LESS 13)
        foreach(_OPENROAD_GXX_VER 12 11)
          find_program(_OPENROAD_CUDAHOST g++-${_OPENROAD_GXX_VER}
            HINTS /usr/bin /usr/local/bin)
          if(_OPENROAD_CUDAHOST)
            set(CMAKE_CUDA_HOST_COMPILER "${_OPENROAD_CUDAHOST}"
                CACHE FILEPATH "")
            message(STATUS
              "OpenROAD: pinning CUDA host compiler to "
              "${_OPENROAD_CUDAHOST} (nvcc ${_OPENROAD_NVCC_MAJOR}.x + "
              "glibc/gcc 13+ _Float128 compat)")
            break()
          endif()
          # find_program caches a NOTFOUND result under the same variable
          # name; clear it so the next candidate version is really searched.
          unset(_OPENROAD_CUDAHOST CACHE)
        endforeach()
        if(NOT DEFINED CMAKE_CUDA_HOST_COMPILER)
          message(FATAL_ERROR
            "OpenROAD: nvcc ${_OPENROAD_NVCC_MAJOR}.x cannot parse "
            "_Float128 declarations in glibc 2.38+ system headers used "
            "by gcc ${CMAKE_CXX_COMPILER_VERSION}, and no compatible "
            "g++-12 / g++-11 was found in /usr/bin or /usr/local/bin. "
            "Install one (e.g. apt install g++-12) or set "
            "-DCMAKE_CUDA_HOST_COMPILER=/path/to/older-g++ explicitly.")
        endif()
      endif()
    endif()
  endif()

  # --- target architecture --------------------------------------------------
  # Inherit the architecture Kokkos was built for; require an explicit
  # -DCMAKE_CUDA_ARCHITECTURES otherwise rather than guessing one.
  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES
     OR "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")
    if(DEFINED Kokkos_CUDA_ARCHITECTURES
       AND NOT "${Kokkos_CUDA_ARCHITECTURES}" STREQUAL "")
      set(CMAKE_CUDA_ARCHITECTURES "${Kokkos_CUDA_ARCHITECTURES}")
    else()
      message(FATAL_ERROR
        "OpenROAD: ENABLE_GPU=ON with Kokkos CUDA backend, but the "
        "Kokkos package does not advertise Kokkos_CUDA_ARCHITECTURES "
        "and CMAKE_CUDA_ARCHITECTURES was not provided. Set "
        "-DCMAKE_CUDA_ARCHITECTURES=<arch> explicitly (e.g. 89 for "
        "RTX 4070, 120 for RTX 5090) or rebuild Kokkos with the "
        "target architecture baked in.")
    endif()
  endif()

  enable_language(CUDA)
  find_package(CUDAToolkit REQUIRED)
  message(STATUS "OpenROAD: CUDA backend (arch=${CMAKE_CUDA_ARCHITECTURES})")
  # nvcc 12.8 cannot parse fmt 11's nontype-template-parameter user-defined
  # literals (fmt/bundled/format.h: operator""_a with fixed_string). The
  # legacy literal fallback is still available; opt into it for CUDA TUs
  # only. Project-wide CXX compilation is unaffected.
  add_compile_definitions(
    $<$<COMPILE_LANGUAGE:CUDA>:FMT_USE_NONTYPE_TEMPLATE_ARGS=0>)
elseif(Kokkos_ENABLE_HIP)
  enable_language(HIP)
  message(STATUS "OpenROAD: HIP backend")
elseif(Kokkos_ENABLE_SYCL)
  # SYCL needs no extra CMake language; Kokkos drives the host compiler.
  message(STATUS "OpenROAD: SYCL backend (driven by Kokkos host compiler)")
else()
  message(STATUS
    "OpenROAD: host-only Kokkos backend (Serial / OpenMP / Threads)")
endif()
1 change: 1 addition & 0 deletions src/gpl/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ cc_library(
"src/graphicsImpl.cpp",
"src/graphicsImpl.h",
"src/graphicsNone.cpp",
"src/hpwl.cpp",
"src/initialPlace.cpp",
"src/initialPlace.h",
"src/mbff.cpp",
Expand Down
37 changes: 37 additions & 0 deletions src/gpl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,43 @@ add_library(gpl_lib
src/mbff.cpp
)

# --- HPWL backend selection (link-time dispatch) ---
# Exactly one translation unit defines NesterovBaseCommon::getHpwl(): either
# the OpenMP loop in src/hpwl.cpp (default) or the Kokkos kernel in
# src/gpu/hpwl.cpp when ENABLE_GPU=ON. CMake enforces ODR; the
# consumer-facing headers and sources stay free of preprocessor branches.
# gpu/ is a file-layout subdirectory only (no nested CMakeLists.txt) so
# kernel build settings stay in this module's CMakeLists with the rest
# of gpl_lib.
if(ENABLE_GPU)
  target_sources(gpl_lib PRIVATE src/gpu/hpwl.cpp)
  # nesterovBase.h and other private gpl headers live in src/; sources
  # under src/gpu/ need that on the include path explicitly because
  # the compiler's default same-dir lookup points into src/gpu/ instead.
  target_include_directories(gpl_lib PRIVATE src)
  if(Kokkos_ENABLE_CUDA)
    set_source_files_properties(src/gpu/hpwl.cpp PROPERTIES LANGUAGE CUDA)
  elseif(Kokkos_ENABLE_HIP)
    set_source_files_properties(src/gpu/hpwl.cpp PROPERTIES LANGUAGE HIP)
  endif()
  # Disable FP contraction for kernels that share gpl_lib's compile
  # context so they stay bit-stable across compilers. Scoped to gpl_lib
  # but the CXX flag is also harmless on the existing CPU TUs.
  target_compile_options(gpl_lib PRIVATE
    $<$<COMPILE_LANGUAGE:CXX>:-ffp-contract=off>
    $<$<COMPILE_LANGUAGE:CUDA>:--fmad=false>
    $<$<COMPILE_LANGUAGE:HIP>:-ffp-contract=off>
  )
  # PRIVATE: Kokkos headers are included only from the GPU TU; for a static
  # gpl_lib the link requirement still propagates to final consumers.
  # NOTE(review): assumes every other target_link_libraries(gpl_lib ...) in
  # this file also uses the keyword signature — CMake errors if mixed;
  # confirm against the rest of src/gpl/CMakeLists.txt.
  target_link_libraries(gpl_lib PRIVATE Kokkos::kokkos)
  if(Kokkos_ENABLE_CUDA)
    # cuda runtime symbols are referenced from the CUDA TU; expose cudart
    # so that gpl_lib (and the openroad binary) link against libcudart.
    target_link_libraries(gpl_lib PRIVATE CUDA::cudart)
  endif()
else()
  target_sources(gpl_lib PRIVATE src/hpwl.cpp)
endif()

target_sources(gpl
PRIVATE
src/MakeReplace.cpp
Expand Down
175 changes: 175 additions & 0 deletions src/gpl/src/gpu/hpwl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
// SPDX-License-Identifier: BSD-3-Clause
// Copyright (c) 2026, The OpenROAD Authors

// HPWL (half-perimeter wirelength) Kokkos backend.
//
// Compiled and linked only when ENABLE_GPU=ON; the OpenMP equivalent in
// ../hpwl.cpp is linked otherwise. Exactly one translation unit defines
// NesterovBaseCommon::getHpwl() per build (CMake-enforced ODR).
//
// Determinism: integer arithmetic; bit-exact across Kokkos backends
// (Serial / OpenMP / Threads / CUDA) and against the OpenMP CPU loop.

#include <Kokkos_Core.hpp>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: 'Kokkos_Core.hpp' file not found [clang-diagnostic-error]

#include <Kokkos_Core.hpp>
         ^

#include <climits>
#include <cstdint>
#include <cstdlib>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: included header cstdint is not used directly [misc-include-cleaner]

Suggested change
#include <cstdlib>
#include <cstdlib>

#include <mutex>
#include <vector>

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: included header vector is not used directly [misc-include-cleaner]

Suggested change

#include "nesterovBase.h"

namespace gpl {

// Lazy, process-wide Kokkos lifecycle owned by gpl_lib so the host
// application (the openroad binary, regression drivers, etc.) never has to
// know Kokkos exists. The first GPU HPWL call initializes Kokkos (warnings
// suppressed) and registers an atexit handler that finalizes it once at
// process shutdown. A function-local static guarantees thread-safe,
// exactly-once execution (C++11 magic statics), so the guard holds even if
// a future caller drops the master-thread invariant.
namespace {
void ensureKokkosInitialized()
{
  static const bool kokkos_ready = [] {
    // Respect an embedding application that already initialized Kokkos.
    if (!Kokkos::is_initialized()) {
      Kokkos::InitializationSettings settings;
      settings.set_disable_warnings(true);
      Kokkos::initialize(settings);
      // Finalize exactly once at shutdown; guard against a host that
      // finalized Kokkos itself in the meantime.
      std::atexit([] {
        if (Kokkos::is_initialized() && !Kokkos::is_finalized()) {
          Kokkos::finalize();
        }
      });
    }
    return true;
  }();
  (void) kokkos_ready;
}
}  // namespace

int64_t NesterovBaseCommon::getHpwl()
{
const int n_nets = static_cast<int>(gNetStor_.size());
if (n_nets == 0) {
return 0;
}

ensureKokkosInitialized();

// ---- 1. Flatten net→pins to CSR on host ----
std::vector<int> h_net_off(n_nets + 1, 0);
for (int i = 0; i < n_nets; ++i) {
h_net_off[i + 1]
= h_net_off[i] + static_cast<int>(gNetStor_[i].getGPins().size());
}
const int total_pins = h_net_off[n_nets];

std::vector<int> h_pin_cx(total_pins);
std::vector<int> h_pin_cy(total_pins);
for (int i = 0; i < n_nets; ++i) {
int off = h_net_off[i];
for (auto* gPin : gNetStor_[i].getGPins()) {
h_pin_cx[off] = gPin->cx();
h_pin_cy[off] = gPin->cy();
++off;
}
}

// ---- 2. Mirror inputs to device ----
using ExecSpace = Kokkos::DefaultExecutionSpace;
Kokkos::View<int*, ExecSpace> d_net_off("hpwl_net_off", n_nets + 1);
Kokkos::View<int*, ExecSpace> d_pin_cx("hpwl_pin_cx", total_pins);
Kokkos::View<int*, ExecSpace> d_pin_cy("hpwl_pin_cy", total_pins);

Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> h_net_off_view(
h_net_off.data(), n_nets + 1);
Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> h_pin_cx_view(
h_pin_cx.data(), total_pins);
Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> h_pin_cy_view(
h_pin_cy.data(), total_pins);

Kokkos::deep_copy(d_net_off, h_net_off_view);
Kokkos::deep_copy(d_pin_cx, h_pin_cx_view);
Kokkos::deep_copy(d_pin_cy, h_pin_cy_view);

// Per-net bbox outputs (kept on device for reduction; mirrored back at end).
Kokkos::View<int*, ExecSpace> d_lx("hpwl_net_lx", n_nets);
Kokkos::View<int*, ExecSpace> d_ly("hpwl_net_ly", n_nets);
Kokkos::View<int*, ExecSpace> d_ux("hpwl_net_ux", n_nets);
Kokkos::View<int*, ExecSpace> d_uy("hpwl_net_uy", n_nets);
Comment on lines +61 to +100
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current implementation performs multiple host and device memory allocations, data flattening, and H2D transfers on every call to getHpwl(). Since this function is a hot path called frequently during the Nesterov placement loop, these overheads (especially std::vector allocations and cudaMalloc calls hidden behind Kokkos::View constructors) are likely to dominate the computation time.

In accordance with performance guidelines for persistent state, consider the following optimizations:

  1. Reuse Device Views: Move d_net_off, d_pin_cx, etc., to be persistent members so they are only allocated once or when the netlist size changes, rather than being treated as temporary scratch buffers.
  2. Avoid Host Flattening: Maintain pin coordinates on the device throughout the placement loop to avoid the cost of flattening gNetStor_ and copying to the device in every iteration.
  3. Reuse Mirror Views: Kokkos::create_mirror_view can be called once and reused to avoid host-side allocations during the D2H copy.
References
  1. When creating worker objects, distinguish between persistent state that must be copied and scratch buffers. Persistent state should be reused to avoid redundant allocations and transfers.
  2. Performance optimizations for loops are necessary when the containing function is part of a hot path called frequently, such as the Nesterov placement loop.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching this — the per-call allocation pattern is intentional for this PR's scope (pattern-establishing first cut). Persistent device views and amortized H2D transfers will be needed once multiple kernels share state across a Nesterov iteration (WLEN gradient + density gradient + HPWL all reading the same pin coordinates), and that's the natural place to introduce them. I'll revisit this getHpwl() to share the persistent state when the WLEN gradient PR lands.

For this PR, HPWL alone is small relative to placement gradient updates; the alloc overhead, while real, sits below the noise floor of an end-to-end placement run.


// ---- 3. Compute per-net bbox in parallel; serial inner over pins ----
Kokkos::parallel_for(
"hpwl_bbox",
Kokkos::RangePolicy<ExecSpace>(0, n_nets),
KOKKOS_LAMBDA(const int i) {
int lx = INT_MAX;
int ly = INT_MAX;
int ux = INT_MIN;
int uy = INT_MIN;
const int begin = d_net_off(i);
const int end = d_net_off(i + 1);
// Serial over pins for determinism (sgizler 80b04e1c1 pattern: do not
// rely on parallel_reduce ordering even though min/max are commutative
// — keeps results bit-identical to the CPU updateBox() loop).
for (int j = begin; j < end; ++j) {
const int x = d_pin_cx(j);
const int y = d_pin_cy(j);
if (x < lx) {
lx = x;
}
if (y < ly) {
ly = y;
}
if (x > ux) {
ux = x;
}
if (y > uy) {
uy = y;
}
}
d_lx(i) = lx;
d_ly(i) = ly;
d_ux(i) = ux;
d_uy(i) = uy;
});

// ---- 4. Sum HPWL across nets (int64 reduction → backend-deterministic) ----
int64_t total_hpwl = 0;
Kokkos::parallel_reduce(
"hpwl_sum",
Kokkos::RangePolicy<ExecSpace>(0, n_nets),
KOKKOS_LAMBDA(const int i, int64_t& acc) {
const int lx = d_lx(i);
const int ly = d_ly(i);
const int ux = d_ux(i);
const int uy = d_uy(i);
// Dangling net (no pins): GNet::getHpwl() returns 0 in this case.
if (ux < lx) {
return;
}
acc += static_cast<int64_t>(ux - lx) + static_cast<int64_t>(uy - ly);
},
total_hpwl);
Comment on lines +103 to +154
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The hpwl_bbox and hpwl_sum kernels can be merged into a single parallel_reduce operation. This would reduce kernel launch overhead and improve cache locality by processing each net's pins and contributing to the total HPWL sum in a single pass. This is a recommended optimization given that this function is called repeatedly within the placement loop.

References
  1. A performance optimization for a loop is necessary if the containing function is called frequently (hot path).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good observation. One nuance to flag: the kernels are split because hpwl_bbox writes per-net bboxes to four View<int*> outputs (mirrored back to the host via GNet::setBox() so downstream consumers like routeBase and timing-driven weighting see consistent gNet->lx()/ly()/ux()/uy()), while hpwl_sum only produces the reduction scalar. A fused single-pass parallel_reduce is feasible but would need a custom reducer that emits both the per-net bboxes and the running sum total in one launch.

Will fold this in once the kernel-launch pattern stabilizes across the WLEN / density kernels — at that point the right shape (custom reducer vs. two passes vs. a Kokkos::TeamPolicy two-level decomposition) will be more obvious. Keeping the simpler two-kernel form here so the reviewable surface stays focused on the dispatch shape.


// ---- 5. Mirror per-net bbox back to host GNet objects ----
// Subsequent code paths (e.g. routeBase, timing-driven weights) read
// gNet->lx() / ly() / ux() / uy() and expect them updated.
auto h_lx = Kokkos::create_mirror_view(d_lx);
auto h_ly = Kokkos::create_mirror_view(d_ly);
auto h_ux = Kokkos::create_mirror_view(d_ux);
auto h_uy = Kokkos::create_mirror_view(d_uy);
Kokkos::deep_copy(h_lx, d_lx);
Kokkos::deep_copy(h_ly, d_ly);
Kokkos::deep_copy(h_ux, d_ux);
Kokkos::deep_copy(h_uy, d_uy);

for (int i = 0; i < n_nets; ++i) {
gNetStor_[i].setBox(h_lx(i), h_ly(i), h_ux(i), h_uy(i));
}

return total_hpwl;
}

} // namespace gpl
Loading
Loading