Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ option(USE_SYSTEM_ABC "Use system shared ABC library" OFF)
# Allow disabling tests
option(ENABLE_TESTS "Enable OpenROAD tests" ON)

# Opt-in GPU acceleration via Kokkos. The actual compute backend (CUDA, HIP,
# SYCL, or host-only OpenMP/Threads) is determined by the installed Kokkos
# package; OpenROAD inspects Kokkos_ENABLE_* and turns on the matching CMake
# language and dependencies automatically (the wiring itself lives in
# cmake/KokkosBackend.cmake and is loaded only when this option is ON).
# See the per-module CMakeLists for how individual subsystems wire their
# GPU sources.
option(ENABLE_GPU "Enable GPU acceleration via Kokkos" OFF)

# Allow enabling address sanitizer
option(ASAN "Enable Address Sanitizer" OFF)

Expand Down Expand Up @@ -92,6 +99,13 @@ if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE RELEASE)
endif()

# GPU backend wiring (opt-in). All Kokkos / CUDA / HIP / SYCL detection,
# compiler probing, and language enablement live in cmake/KokkosBackend.cmake
# and are loaded only when the user opts in via ENABLE_GPU=ON.
# NOTE(review): include(KokkosBackend) resolves by module name, so it
# presumably relies on cmake/ already being on CMAKE_MODULE_PATH earlier
# in this file — confirm the list(APPEND CMAKE_MODULE_PATH ...) ordering.
if(ENABLE_GPU)
include(KokkosBackend)
endif()

if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8.3.0")
message(FATAL_ERROR "Insufficient gcc version. Found ${CMAKE_CXX_COMPILER_VERSION}, but require >= 8.3.0.")
Expand Down
126 changes: 126 additions & 0 deletions cmake/KokkosBackend.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2026, The OpenROAD Authors

# Kokkos GPU backend wiring for OpenROAD. Included from the root
# CMakeLists.txt only when ENABLE_GPU=ON; not loaded otherwise.
#
# Discovers the user's Kokkos install, inherits its compute backend, turns
# on the matching CMake language so downstream targets can mark kernel
# sources with set_source_files_properties(... LANGUAGE CUDA|HIP), and
# applies the small set of nvcc / fmt / host-compiler workarounds that the
# CUDA backend currently needs in modern Linux toolchains. Per-module
# CMakeLists (e.g. src/gpl) key off ENABLE_GPU and Kokkos_ENABLE_*; they
# do not need to call find_package(Kokkos) or enable_language() themselves.

# Probe quietly so the generic CMake not-found output is replaced by the
# actionable, OpenROAD-specific error below.
find_package(Kokkos QUIET)
if(NOT Kokkos_FOUND)
# Fail fast at configure time: every downstream ENABLE_GPU code path
# assumes a Kokkos install with its backend baked in.
message(FATAL_ERROR
"OpenROAD: ENABLE_GPU=ON requires the Kokkos package to be "
"installed and discoverable by CMake, but Kokkos was not found.\n"
" - If Kokkos is already installed: pass "
"-DKokkos_ROOT=/path/to/kokkos (or extend CMAKE_PREFIX_PATH).\n"
" - If not: build and install Kokkos from "
"https://github.com/kokkos/kokkos with the desired backend "
"(CUDA / HIP / SYCL / OpenMP) and a target architecture that "
"matches the host GPU.\n"
" - A future etc/DependencyInstaller.sh -gpu option will "
"automate this step.")
endif()
message(STATUS "OpenROAD: GPU acceleration enabled (Kokkos ${Kokkos_VERSION})")

if(Kokkos_ENABLE_CUDA)
  # --- nvcc discovery -------------------------------------------------------
  # Auto-discover nvcc when the user has CUDA installed at a standard
  # location but their environment does not expose it on PATH (common
  # with IDE-launched configures: the bundled CMake does not inherit
  # the shell PATH). enable_language(CUDA) below would otherwise abort
  # with "No CMAKE_CUDA_COMPILER could be found" even though Kokkos's
  # find_package already located the toolkit.
  if(NOT DEFINED CMAKE_CUDA_COMPILER AND NOT DEFINED ENV{CUDACXX})
    # PATH_SUFFIXES bin lets the environment hints work whether they point
    # at the toolkit root (the conventional CUDA_HOME=/usr/local/cuda) or
    # directly at its bin/ directory; without it, HINTS ENV CUDA_HOME only
    # finds nvcc when the variable itself already ends in /bin.
    find_program(_OPENROAD_NVCC nvcc
      HINTS ENV CUDA_HOME ENV CUDA_PATH ENV CUDA_ROOT
            /usr/local/cuda/bin
            /usr/local/cuda-13.0/bin
            /usr/local/cuda-12.8/bin /usr/local/cuda-12.0/bin
            /opt/cuda/bin
      PATH_SUFFIXES bin
    )
    if(_OPENROAD_NVCC)
      set(CMAKE_CUDA_COMPILER "${_OPENROAD_NVCC}" CACHE FILEPATH "")
      message(STATUS "OpenROAD: auto-discovered nvcc at ${_OPENROAD_NVCC}")
    endif()
  endif()

  # --- host-compiler compatibility pinning ----------------------------------
  # nvcc < 13 cannot parse glibc 2.38+'s _Float128 type that ships with
  # gcc 13+'s C++ standard library headers (math.h template specialization
  # for __iseqsig_type<_Float128>). When a known-broken pairing is detected,
  # pin a compatible older g++ as the CUDA host compiler (the system C++
  # compiler stays unchanged for non-CUDA TUs). Override is always
  # available via -DCMAKE_CUDA_HOST_COMPILER or CUDAHOSTCXX.
  if(NOT DEFINED CMAKE_CUDA_HOST_COMPILER AND NOT DEFINED ENV{CUDAHOSTCXX}
     AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU"
     AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0"
     AND _OPENROAD_NVCC)
    execute_process(
      COMMAND "${_OPENROAD_NVCC}" --version
      OUTPUT_VARIABLE _OPENROAD_NVCC_VERSION_OUTPUT
      ERROR_QUIET
      OUTPUT_STRIP_TRAILING_WHITESPACE)
    if(_OPENROAD_NVCC_VERSION_OUTPUT MATCHES "release ([0-9]+)")
      set(_OPENROAD_NVCC_MAJOR "${CMAKE_MATCH_1}")
      if(_OPENROAD_NVCC_MAJOR LESS 13)
        foreach(_OPENROAD_GXX_VER 12 11)
          find_program(_OPENROAD_CUDAHOST g++-${_OPENROAD_GXX_VER}
            HINTS /usr/bin /usr/local/bin)
          if(_OPENROAD_CUDAHOST)
            set(CMAKE_CUDA_HOST_COMPILER "${_OPENROAD_CUDAHOST}"
                CACHE FILEPATH "")
            message(STATUS
              "OpenROAD: pinning CUDA host compiler to "
              "${_OPENROAD_CUDAHOST} (nvcc ${_OPENROAD_NVCC_MAJOR}.x + "
              "glibc/gcc 13+ _Float128 compat)")
            break()
          endif()
          # find_program caches a NOTFOUND result under the same variable
          # name; clear it so the next candidate version is really searched.
          unset(_OPENROAD_CUDAHOST CACHE)
        endforeach()
        if(NOT DEFINED CMAKE_CUDA_HOST_COMPILER)
          message(FATAL_ERROR
            "OpenROAD: nvcc ${_OPENROAD_NVCC_MAJOR}.x cannot parse "
            "_Float128 declarations in glibc 2.38+ system headers used "
            "by gcc ${CMAKE_CXX_COMPILER_VERSION}, and no compatible "
            "g++-12 / g++-11 was found in /usr/bin or /usr/local/bin. "
            "Install one (e.g. apt install g++-12) or set "
            "-DCMAKE_CUDA_HOST_COMPILER=/path/to/older-g++ explicitly.")
        endif()
      endif()
    endif()
  endif()

  # --- target architecture --------------------------------------------------
  # Inherit the architecture Kokkos was built for; require an explicit
  # -DCMAKE_CUDA_ARCHITECTURES otherwise rather than guessing one.
  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES
     OR "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "")
    if(DEFINED Kokkos_CUDA_ARCHITECTURES
       AND NOT "${Kokkos_CUDA_ARCHITECTURES}" STREQUAL "")
      set(CMAKE_CUDA_ARCHITECTURES "${Kokkos_CUDA_ARCHITECTURES}")
    else()
      message(FATAL_ERROR
        "OpenROAD: ENABLE_GPU=ON with Kokkos CUDA backend, but the "
        "Kokkos package does not advertise Kokkos_CUDA_ARCHITECTURES "
        "and CMAKE_CUDA_ARCHITECTURES was not provided. Set "
        "-DCMAKE_CUDA_ARCHITECTURES=<arch> explicitly (e.g. 89 for "
        "RTX 4070, 120 for RTX 5090) or rebuild Kokkos with the "
        "target architecture baked in.")
    endif()
  endif()

  enable_language(CUDA)
  find_package(CUDAToolkit REQUIRED)
  message(STATUS "OpenROAD: CUDA backend (arch=${CMAKE_CUDA_ARCHITECTURES})")
  # nvcc 12.8 cannot parse fmt 11's nontype-template-parameter user-defined
  # literals (fmt/bundled/format.h: operator""_a with fixed_string). The
  # legacy literal fallback is still available; opt into it for CUDA TUs
  # only. Project-wide CXX compilation is unaffected.
  add_compile_definitions(
    $<$<COMPILE_LANGUAGE:CUDA>:FMT_USE_NONTYPE_TEMPLATE_ARGS=0>)
elseif(Kokkos_ENABLE_HIP)
  enable_language(HIP)
  message(STATUS "OpenROAD: HIP backend")
elseif(Kokkos_ENABLE_SYCL)
  # SYCL needs no extra CMake language; Kokkos drives the host compiler.
  message(STATUS "OpenROAD: SYCL backend (driven by Kokkos host compiler)")
else()
  message(STATUS
    "OpenROAD: host-only Kokkos backend (Serial / OpenMP / Threads)")
endif()
1 change: 1 addition & 0 deletions src/gpl/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ cc_library(
"src/graphicsImpl.cpp",
"src/graphicsImpl.h",
"src/graphicsNone.cpp",
"src/hpwl.cpp",
"src/initialPlace.cpp",
"src/initialPlace.h",
"src/mbff.cpp",
Expand Down
37 changes: 37 additions & 0 deletions src/gpl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,43 @@ add_library(gpl_lib
src/mbff.cpp
)

# --- HPWL backend selection (link-time dispatch) ---
# Exactly one translation unit defines NesterovBaseCommon::getHpwl(): either
# the OpenMP loop in src/hpwl.cpp (default) or the Kokkos kernel in
# src/gpu/hpwl.cpp when ENABLE_GPU=ON. CMake enforces ODR; the
# consumer-facing headers and sources stay free of preprocessor branches.
# gpu/ is a file-layout subdirectory only (no nested CMakeLists.txt) so
# kernel build settings stay in this module's CMakeLists with the rest
# of gpl_lib.
if(ENABLE_GPU)
  target_sources(gpl_lib PRIVATE src/gpu/hpwl.cpp)
  # nesterovBase.h and other private gpl headers live in src/; sources
  # under src/gpu/ need that on the include path explicitly because
  # the compiler's default same-dir lookup points into src/gpu/ instead.
  target_include_directories(gpl_lib PRIVATE src)
  if(Kokkos_ENABLE_CUDA)
    set_source_files_properties(src/gpu/hpwl.cpp PROPERTIES LANGUAGE CUDA)
  elseif(Kokkos_ENABLE_HIP)
    set_source_files_properties(src/gpu/hpwl.cpp PROPERTIES LANGUAGE HIP)
  endif()
  # Disable FP contraction for kernels that share gpl_lib's compile
  # context so they stay bit-stable across compilers. Scoped to gpl_lib
  # but the CXX flag is also harmless on the existing CPU TUs.
  target_compile_options(gpl_lib PRIVATE
    $<$<COMPILE_LANGUAGE:CXX>:-ffp-contract=off>
    $<$<COMPILE_LANGUAGE:CUDA>:--fmad=false>
    $<$<COMPILE_LANGUAGE:HIP>:-ffp-contract=off>
  )
  # PRIVATE: Kokkos headers are included only from the GPU TU; for a static
  # gpl_lib the link requirement still propagates to final consumers.
  # NOTE(review): assumes every other target_link_libraries(gpl_lib ...) in
  # this file also uses the keyword signature — CMake errors if mixed;
  # confirm against the rest of src/gpl/CMakeLists.txt.
  target_link_libraries(gpl_lib PRIVATE Kokkos::kokkos)
  if(Kokkos_ENABLE_CUDA)
    # cuda runtime symbols are referenced from the CUDA TU; expose cudart
    # so that gpl_lib (and the openroad binary) link against libcudart.
    target_link_libraries(gpl_lib PRIVATE CUDA::cudart)
  endif()
else()
  target_sources(gpl_lib PRIVATE src/hpwl.cpp)
endif()

target_sources(gpl
PRIVATE
src/MakeReplace.cpp
Expand Down
175 changes: 175 additions & 0 deletions src/gpl/src/gpu/hpwl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
// SPDX-License-Identifier: BSD-3-Clause
// Copyright (c) 2026, The OpenROAD Authors

// HPWL (half-perimeter wirelength) Kokkos backend.
//
// Compiled and linked only when ENABLE_GPU=ON; the OpenMP equivalent in
// ../hpwl.cpp is linked otherwise. Exactly one translation unit defines
// NesterovBaseCommon::getHpwl() per build (CMake-enforced ODR).
//
// Determinism: integer arithmetic; bit-exact across Kokkos backends
// (Serial / OpenMP / Threads / CUDA) and against the OpenMP CPU loop.

#include <Kokkos_Core.hpp>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: 'Kokkos_Core.hpp' file not found [clang-diagnostic-error]

#include <Kokkos_Core.hpp>
         ^

#include <climits>
#include <cstdint>
#include <cstdlib>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: included header cstdint is not used directly [misc-include-cleaner]

Suggested change
#include <cstdlib>
#include <cstdlib>

#include <mutex>
#include <vector>

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: included header vector is not used directly [misc-include-cleaner]

Suggested change

#include "nesterovBase.h"

namespace gpl {

// Lazy, process-wide Kokkos lifecycle owned by gpl_lib so the host
// application (the openroad binary, regression drivers, etc.) never has to
// know Kokkos exists. The first GPU HPWL call initializes Kokkos (warnings
// suppressed) and registers an atexit handler that finalizes it once at
// process shutdown. A function-local static guarantees thread-safe,
// exactly-once execution (C++11 magic statics), so the guard holds even if
// a future caller drops the master-thread invariant.
namespace {
void ensureKokkosInitialized()
{
  static const bool kokkos_ready = [] {
    // Respect an embedding application that already initialized Kokkos.
    if (!Kokkos::is_initialized()) {
      Kokkos::InitializationSettings settings;
      settings.set_disable_warnings(true);
      Kokkos::initialize(settings);
      // Finalize exactly once at shutdown; guard against a host that
      // finalized Kokkos itself in the meantime.
      std::atexit([] {
        if (Kokkos::is_initialized() && !Kokkos::is_finalized()) {
          Kokkos::finalize();
        }
      });
    }
    return true;
  }();
  (void) kokkos_ready;
}
}  // namespace

int64_t NesterovBaseCommon::getHpwl()
{
const int n_nets = static_cast<int>(gNetStor_.size());
if (n_nets == 0) {
return 0;
}

ensureKokkosInitialized();

// ---- 1. Flatten net→pins to CSR on host ----
std::vector<int> h_net_off(n_nets + 1, 0);
for (int i = 0; i < n_nets; ++i) {
h_net_off[i + 1]
= h_net_off[i] + static_cast<int>(gNetStor_[i].getGPins().size());
}
const int total_pins = h_net_off[n_nets];

std::vector<int> h_pin_cx(total_pins);
std::vector<int> h_pin_cy(total_pins);
for (int i = 0; i < n_nets; ++i) {
int off = h_net_off[i];
for (auto* gPin : gNetStor_[i].getGPins()) {
h_pin_cx[off] = gPin->cx();
h_pin_cy[off] = gPin->cy();
++off;
}
}

// ---- 2. Mirror inputs to device ----
using ExecSpace = Kokkos::DefaultExecutionSpace;
Kokkos::View<int*, ExecSpace> d_net_off("hpwl_net_off", n_nets + 1);
Kokkos::View<int*, ExecSpace> d_pin_cx("hpwl_pin_cx", total_pins);
Kokkos::View<int*, ExecSpace> d_pin_cy("hpwl_pin_cy", total_pins);

Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> h_net_off_view(
h_net_off.data(), n_nets + 1);
Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> h_pin_cx_view(
h_pin_cx.data(), total_pins);
Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> h_pin_cy_view(
h_pin_cy.data(), total_pins);

Kokkos::deep_copy(d_net_off, h_net_off_view);
Kokkos::deep_copy(d_pin_cx, h_pin_cx_view);
Kokkos::deep_copy(d_pin_cy, h_pin_cy_view);

// Per-net bbox outputs (kept on device for reduction; mirrored back at end).
Kokkos::View<int*, ExecSpace> d_lx("hpwl_net_lx", n_nets);
Kokkos::View<int*, ExecSpace> d_ly("hpwl_net_ly", n_nets);
Kokkos::View<int*, ExecSpace> d_ux("hpwl_net_ux", n_nets);
Kokkos::View<int*, ExecSpace> d_uy("hpwl_net_uy", n_nets);
Comment on lines +61 to +100
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current implementation performs multiple host and device memory allocations, data flattening, and H2D transfers on every call to getHpwl(). Since this function is a hot path called frequently during the Nesterov placement loop, these overheads (especially std::vector allocations and cudaMalloc calls hidden behind Kokkos::View constructors) are likely to dominate the computation time.

In accordance with performance guidelines for persistent state, consider the following optimizations:

  1. Reuse Device Views: Move d_net_off, d_pin_cx, etc., to be persistent members so they are only allocated once or when the netlist size changes, rather than being treated as temporary scratch buffers.
  2. Avoid Host Flattening: Maintain pin coordinates on the device throughout the placement loop to avoid the cost of flattening gNetStor_ and copying to the device in every iteration.
  3. Reuse Mirror Views: Kokkos::create_mirror_view can be called once and reused to avoid host-side allocations during the D2H copy.
References
  1. When creating worker objects, distinguish between persistent state that must be copied and scratch buffers. Persistent state should be reused to avoid redundant allocations and transfers.
  2. Performance optimizations for loops are necessary when the containing function is part of a hot path called frequently, such as the Nesterov placement loop.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching this — the per-call allocation pattern is intentional for this PR's scope (pattern-establishing first cut). Persistent device views and amortized H2D transfers will be needed once multiple kernels share state across a Nesterov iteration (WLEN gradient + density gradient + HPWL all reading the same pin coordinates), and that's the natural place to introduce them. I'll revisit this getHpwl() to share the persistent state when the WLEN gradient PR lands.

For this PR, HPWL alone is small relative to placement gradient updates; the alloc overhead, while real, sits below the noise floor of an end-to-end placement run.


// ---- 3. Compute per-net bbox in parallel; serial inner over pins ----
Kokkos::parallel_for(
"hpwl_bbox",
Kokkos::RangePolicy<ExecSpace>(0, n_nets),
KOKKOS_LAMBDA(const int i) {
int lx = INT_MAX;
int ly = INT_MAX;
int ux = INT_MIN;
int uy = INT_MIN;
const int begin = d_net_off(i);
const int end = d_net_off(i + 1);
// Serial over pins for determinism (sgizler 80b04e1c1 pattern: do not
// rely on parallel_reduce ordering even though min/max are commutative
// — keeps results bit-identical to the CPU updateBox() loop).
for (int j = begin; j < end; ++j) {
const int x = d_pin_cx(j);
const int y = d_pin_cy(j);
if (x < lx) {
lx = x;
}
if (y < ly) {
ly = y;
}
if (x > ux) {
ux = x;
}
if (y > uy) {
uy = y;
}
}
d_lx(i) = lx;
d_ly(i) = ly;
d_ux(i) = ux;
d_uy(i) = uy;
});

// ---- 4. Sum HPWL across nets (int64 reduction → backend-deterministic) ----
int64_t total_hpwl = 0;
Kokkos::parallel_reduce(
"hpwl_sum",
Kokkos::RangePolicy<ExecSpace>(0, n_nets),
KOKKOS_LAMBDA(const int i, int64_t& acc) {
const int lx = d_lx(i);
const int ly = d_ly(i);
const int ux = d_ux(i);
const int uy = d_uy(i);
// Dangling net (no pins): GNet::getHpwl() returns 0 in this case.
if (ux < lx) {
return;
}
acc += static_cast<int64_t>(ux - lx) + static_cast<int64_t>(uy - ly);
},
total_hpwl);
Comment on lines +103 to +154
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The hpwl_bbox and hpwl_sum kernels can be merged into a single parallel_reduce operation. This would reduce kernel launch overhead and improve cache locality by processing each net's pins and contributing to the total HPWL sum in a single pass. This is a recommended optimization given that this function is called repeatedly within the placement loop.

References
  1. A performance optimization for a loop is necessary if the containing function is called frequently (hot path).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good observation. One nuance to flag: the kernels are split because hpwl_bbox writes per-net bboxes to four View<int*> outputs (mirrored back to the host via GNet::setBox() so downstream consumers like routeBase and timing-driven weighting see consistent gNet->lx()/ly()/ux()/uy()), while hpwl_sum only produces the reduction scalar. A fused single-pass parallel_reduce is feasible but would need a custom reducer that emits both the per-net bboxes and the running sum total in one launch.

Will fold this in once the kernel-launch pattern stabilizes across the WLEN / density kernels — at that point the right shape (custom reducer vs. two passes vs. a Kokkos::TeamPolicy two-level decomposition) will be more obvious. Keeping the simpler two-kernel form here so the reviewable surface stays focused on the dispatch shape.


// ---- 5. Mirror per-net bbox back to host GNet objects ----
// Subsequent code paths (e.g. routeBase, timing-driven weights) read
// gNet->lx() / ly() / ux() / uy() and expect them updated.
auto h_lx = Kokkos::create_mirror_view(d_lx);
auto h_ly = Kokkos::create_mirror_view(d_ly);
auto h_ux = Kokkos::create_mirror_view(d_ux);
auto h_uy = Kokkos::create_mirror_view(d_uy);
Kokkos::deep_copy(h_lx, d_lx);
Kokkos::deep_copy(h_ly, d_ly);
Kokkos::deep_copy(h_ux, d_ux);
Kokkos::deep_copy(h_uy, d_uy);

for (int i = 0; i < n_nets; ++i) {
gNetStor_[i].setBox(h_lx(i), h_ly(i), h_ux(i), h_uy(i));
}

return total_hpwl;
}

} // namespace gpl
Loading
Loading