
Commit cda673c

New backend for RDNA3-igpu
There are several versions, and (for now) only mulmat is implemented.
- small optim for iGPU (V6) backend
- iGPU backend (V7): optimised tensor loading
- iGPU backend (V8)
- iGPU backend (V9)
It should be functional at least from V9 onward.
- given the results of the benchmark campaign, we can do better (even with V9 ...)
- add cmake option to enable compiler output of kernel resource usage metrics
1 parent f10ee3a commit cda673c

33 files changed: +9722 -0 lines changed

README.md

Lines changed: 132 additions & 0 deletions
@@ -1,3 +1,135 @@
# Experimental support of Ryzen 7x40 (Linux)

In my case, a 7940HS with 64 GB of RAM, running Fedora 41 / rocm-hip 6.2.1.

The backend only adds a mulmat (BF16) OP implemented with HIP (rocBLAS is not used). There is no limit on RAM usage (GTT/VRAM): weights are allocated in RAM.
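For illustration only, a naive HIP BF16 mat-vec kernel (roughly the operation this backend accelerates) could look like the sketch below. This is not one of the V1..V9 kernels from this commit; the kernel name and data layout are assumptions.

```cpp
// Illustrative sketch, NOT the backend's optimized kernel.
// Computes y[r] = sum_k A[r*cols + k] * x[k], with A stored as bf16 and x, y as fp32.
#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>

__global__ void mulmat_bf16_naive(const __hip_bfloat16 * A, const float * x,
                                  float * y, int rows, int cols) {
    const int r = blockIdx.x * blockDim.x + threadIdx.x;   // one thread per output row
    if (r >= rows) return;
    float acc = 0.0f;
    for (int k = 0; k < cols; ++k) {
        acc += __bfloat162float(A[(size_t)r * cols + k]) * x[k];
    }
    y[r] = acc;
}
```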
If you want to test:
```sh
# build:
rm -rf build/igpu
cmake -S . -B build/igpu -DGGML_IGPU=ON -DAMDGPU_TARGETS=gfx1103 -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=OFF
cmake --build build/igpu --config Release -- -j 8

# run: (please use -ngl 999 --no-mmap -ctk bf16 -ctv bf16 for best results)
build/igpu/bin/llama-cli --color -ngl 999 --no-mmap -ctk bf16 -ctv bf16 -m Meta-Llama-3.1-8B-Instruct.BF16.gguf
```

To be fair, there are some random crashes with an 'MES' error; this may require a fix in the AMD firmware.

01/03/2025: first version of the kernel (V1) (supports only BF16 quantization)
14/03/2025: new kernel (V2) (supports only BF16 quantization)
01/04/2025: V4 optimizes the small-N case
15/04/2025: V5 kernel supports BF16 & FP16 quants
25/05/2025: V7 optimized tensor loading (WIP)

Next:
- create a kernel for FP8 and support optional conversion of weights (FP16/BF16/FP32) to BFP on load.
- create a true block kernel for the CPU ("BLIS"-like)?

Some results, in tokens per second (when it did not crash):

## Llama-3.2-1B-Instruct/BF16.gguf
| model | size | params | type_k | type_v | test | CPU | V1 | V2 | V4 | V9 | Vulkan |
| --------------- | ---------: | -------: | -----: | -----: | ----: | -----: | ------: | ------: | ------: | ------: | ------: |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp1 | 23.26 | 18.53 | 27.59 | 30.14 | 30.21 | 30.99 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp2 | 45.39 | 36.20 | 34.22 | 57.68 | 57.89 | 60.76 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp4 | 90.47 | 71.78 | 65.12 | 111.07 | 111.81 | 117.07 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp8 | 176.86 | 139.26 | 119.79 | 200.94 | 201.25 | 229.28 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp16 | 344.33 | 266.42 | 200.51 | 315.39 | 314.93 | 196.28 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp32 | 562.30 | 422.50 | 429.52 | 423.95 | 596.81 | 366.10 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp48 | 665.70 | 653.25 | 601.83 | 597.82 | 912.48 | 594.74 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp64 | 679.13 | 717.96 | 760.94 | 764.79 | 1134.05 | 744.75 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp128 | 723.15 | 990.37 | 1062.69 | 1061.43 | 1632.71 | 1007.61 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp192 | 738.65 | 1131.50 | 1304.20 | 1298.02 | 1904.42 | 1054.13 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp256 | 746.87 | 1151.29 | 1326.96 | 1329.72 | 1832.45 | 1153.88 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp384 | 714.54 | 1178.65 | 1220.25 | 1197.43 | 1355.90 | 1238.02 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp512 | 677.09 | 963.16 | 950.69 | 946.85 | 958.19 | 1207.43 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | pp768 | 665.30 | 901.93 | 884.07 | 874.94 | 913.08 | 1162.78 |
| llama 1B BF16 | 2.30 GiB | 1.24 B | bf16 | bf16 | tg16 | 23.00 | 18.26 | 27.69 | 30.13 | 30.16 | 31.17 |

## Llama-3.2-3B-Instruct/BF16.gguf
| model | size | params | type_k | type_v | test | CPU | V1 | V2 | V4 | V9 | Vulkan |
| --------------- | ---------: | -------: | -----: | -----: | ----: | -----: | -----: | -----: | -----: | -----: | -----: |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp1 | 8.94 | 7.85 | 11.03 | 11.84 | 11.83 | 12.07 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp2 | 17.56 | 15.67 | 14.61 | 23.08 | 22.86 | 23.67 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp4 | 35.02 | 31.11 | 27.86 | 44.61 | 44.23 | 44.96 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp8 | 69.18 | 61.01 | 51.21 | 82.57 | 81.46 | 90.41 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp16 | 131.72 | 117.77 | 86.80 | 135.50 | 135.25 | 78.39 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp32 | 209.28 | 185.05 | 178.08 | 176.60 | 258.01 | 142.46 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp48 | 232.70 | 273.60 | 249.61 | 251.45 | 364.37 | 196.73 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp64 | 237.90 | 300.62 | 313.17 | 316.92 | 445.82 | 246.77 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp128 | 261.37 | 390.84 | 438.12 | 438.36 | 673.04 | 316.93 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp192 | 263.82 | 445.00 | 506.12 | 504.17 | 760.73 | 368.65 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp256 | 265.27 | 450.11 | 516.21 | 512.75 | 750.77 | 373.97 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp384 | 261.27 | 470.54 | 485.27 | 476.42 | 682.73 | 400.52 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp512 | 254.72 | 441.51 | 480.40 | 479.50 | 559.39 | 390.60 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | pp768 | 253.87 | 429.79 | 462.86 | 462.20 | 538.43 | 384.43 |
| llama 3B BF16 | 5.98 GiB | 3.21 B | bf16 | bf16 | tg16 | 8.90 | 7.85 | 11.02 | 11.88 | 11.89 | 12.30 |

## Meta-Llama-3.1-8B-Instruct/BF16.gguf
| model | size | params | type_k | type_v | test | CPU | V1 | V2 | V4 | V9 | Vulkan |
| --------------- | ---------: | -------: | -----: | -----: | ----: | -----: | -----: | -----: | -----: | -----: | -----: |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp1 | 3.88 | 3.88 | 4.88 | 5.21 | 5.21 | 5.35 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp2 | 7.59 | 7.74 | 7.40 | 10.12 | 10.12 | 10.60 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp4 | 15.04 | 15.43 | 14.20 | 19.67 | 19.62 | 20.59 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp8 | 29.73 | 30.23 | 26.37 | 36.74 | 36.60 | 40.71 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp16 | 56.55 | 58.55 | 45.95 | 61.51 | 61.39 | 41.17 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp32 | 84.81 | 91.54 | 83.38 | 81.09 | 117.81 | 75.68 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp48 | 90.43 | 114.77 | 116.55 | 114.14 | 163.96 | 106.00 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp64 | 85.45 | 137.17 | 139.46 | 142.46 | 200.57 | 132.83 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp128 | 103.68 | 152.59 | 195.33 | 192.79 | 277.65 | 150.98 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp192 | 107.07 | 183.30 | 215.62 | 217.06 | 294.23 | 159.43 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp256 | 107.43 | 185.74 | 235.19 | 233.90 | 304.70 | 164.52 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp384 | 106.74 | 213.56 | 230.65 | 229.00 | 316.93 | 168.15 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp512 | 104.39 | 203.01 | 232.16 | 231.73 | 306.62 | 167.31 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | pp768 | 104.19 | 194.98 | 225.46 | 225.09 | 290.90 | 165.74 |
| llama 8B BF16 | 14.96 GiB | 8.03 B | bf16 | bf16 | tg16 | 3.88 | 3.88 | 4.87 | 5.21 | 5.21 | 5.36 |

## Mistral-Nemo-Instruct-2407/BF16.gguf
| model | size | params | type_k | type_v | test | CPU | V1 | V2 | V4 | Vulkan |
| --------------- | ---------: | -------: | -----: | -----: | ----: | -----: | -----: | -----: | -----: | -----: |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp1 | 2.52 | 2.76 | 3.16 | 3.39 | 3.47 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp2 | 4.94 | 5.49 | 4.90 | 6.59 | 6.89 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp4 | 9.82 | 10.92 | 9.42 | 12.85 | 13.38 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp8 | 19.40 | 21.60 | 17.56 | 23.92 | 25.51 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp16 | 36.85 | 42.03 | 30.77 | 40.88 | 12.83 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp32 | 50.40 | 65.33 | 56.43 | 55.22 | 22.44 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp48 | 52.77 | 77.46 | 76.93 | 75.94 | 37.75 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp64 | 54.65 | 94.48 | 93.57 | 94.02 | 48.15 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp128 | 65.72 | 103.87 | 127.90 | 128.54 | 51.19 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp192 | 67.66 | 121.43 | 143.60 | 147.41 | 54.16 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp256 | 68.45 | 130.03 | 156.00 | 155.52 | 54.07 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp384 | 67.64 | 142.89 | 154.52 | 153.33 | 54.42 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp512 | 67.02 | 136.18 | 156.22 | 156.51 | 46.71 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | pp768 | 66.74 | 130.78 | 151.59 | 151.78 | 46.73 |
| llama 12B BF16 | 22.81 GiB | 12.25 B | bf16 | bf16 | tg16 | 2.52 | 2.76 | 3.16 | 3.39 | 3.48 |

## Mistral-Small-24B-Instruct-2501/BF16.gguf
| model | size | params | type_k | type_v | test | CPU | V1 | V2 | V4 |
| --------------- | ---------: | -------: | -----: | -----: | ----: | -----: | -----: | -----: | ------: |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp1 | 1.28 | 1.39 | 1.64 | 1.73 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp2 | 2.52 | 2.76 | 2.71 | 3.40 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp4 | 5.02 | 5.50 | 5.26 | 6.63 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp8 | 9.87 | 10.89 | 9.94 | 12.52 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp16 | 18.32 | 21.32 | 17.86 | 22.36 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp32 | 25.53 | 34.65 | 31.50 | 30.18 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp48 | 24.53 | 36.05 | 43.93 | 43.43 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp64 | 25.88 | 47.87 | 53.96 | 53.73 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp128 | 29.69 | 52.03 | 69.64 | 65.84 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp192 | 29.99 | 61.00 | 79.73 | 80.14 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp256 | 30.94 | 63.11 | 87.30 | 87.01 |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp384 | 32.51 | 75.00 | 86.26 | - |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp512 | 32.28 | 71.11 | 88.11 | - |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | pp768 | 32.02 | 67.33 | 85.47 | - |
| llama 24B BF16 | 43.91 GiB | 23.57 B | bf16 | bf16 | tg16 | 1.28 | 1.38 | 1.62 | - |

-------------------------------
# llama.cpp

![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

ggml/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -154,6 +154,8 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
     "ggml: BLAS library vendor")
 option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
 
+option(GGML_IGPU "ggml: use IGPU" OFF)
+
 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
@@ -272,6 +274,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-cann.h
     include/ggml-cpp.h
     include/ggml-cuda.h
+    include/ggml-igpu.h
     include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h

ggml/cmake/ggml-config.cmake.in

Lines changed: 5 additions & 0 deletions
@@ -83,6 +83,11 @@ if (NOT GGML_SHARED_LIB)
         set(GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
     endif()
 
+    if (GGML_IGPU)
+        find_package(hip REQUIRED)
+        list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host)
+    endif()
+
     if (GGML_SYCL)
         set(GGML_SYCL_INTERFACE_LINK_LIBRARIES "")
         find_package(DNNL)

ggml/include/ggml-alloc.h

Lines changed: 2 additions & 0 deletions
@@ -68,6 +68,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// There is a problem: the tensors are initialized, but their type is not "right"
+// TODO: add "enum ggml_backend_buffer_usage usage = GGML_BACKEND_BUFFER_USAGE_ANY" as a parameter!
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

ggml/include/ggml-igpu.h

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"


#ifdef __cplusplus
extern "C" {
#endif

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_igpu_reg(void);


#ifdef __cplusplus
}
#endif
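
For context, a minimal sketch of how this entry point might be used with the standard ggml-backend registry API (the device index 0 and the NULL init params are assumptions, and the backend may expose a different number of devices):

```cpp
// Sketch: obtain the iGPU backend registration and initialize its first device.
#include "ggml-backend.h"
#include "ggml-igpu.h"
#include <cstdio>

int main(void) {
    ggml_backend_reg_t reg = ggml_backend_igpu_reg();
    const size_t n_dev = ggml_backend_reg_dev_count(reg);
    std::printf("igpu backend: %zu device(s)\n", n_dev);
    if (n_dev > 0) {
        ggml_backend_dev_t dev     = ggml_backend_reg_dev_get(reg, 0);
        ggml_backend_t     backend = ggml_backend_dev_init(dev, /*params=*/NULL);
        // ... build a ggml graph and run it with ggml_backend_graph_compute(backend, graph) ...
        ggml_backend_free(backend);
    }
    return 0;
}
```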

ggml/src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -378,6 +378,7 @@ ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)
 ggml_add_backend(HIP)
+ggml_add_backend(IGPU)
 ggml_add_backend(METAL)
 ggml_add_backend(MUSA)
 ggml_add_backend(RPC)

ggml/src/ggml-backend-reg.cpp

Lines changed: 7 additions & 0 deletions
@@ -65,6 +65,10 @@
 #include "ggml-cann.h"
 #endif
 
+#ifdef GGML_USE_IGPU
+#include "ggml-igpu.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -165,6 +169,9 @@ struct ggml_backend_registry {
     std::vector<ggml_backend_dev_t> devices;
 
     ggml_backend_registry() {
+#ifdef GGML_USE_IGPU
+        register_backend(ggml_backend_igpu_reg());
+#endif
 #ifdef GGML_USE_CUDA
         register_backend(ggml_backend_cuda_reg());
 #endif

ggml/src/ggml-igpu/CMakeLists.txt

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
if (WIN32)
    message(FATAL_ERROR "Not tested on Windows")
endif()

# not yet right in:
# => /usr/share/cmake/Modules/CMakeDetermineHIPCompiler.cmake
# => /usr/lib64/cmake/hip-lang/hip-lang-config.cmake vs /usr/local/lib64/cmake/hip-lang/hip-lang-config.cmake
# hipcc -v -print-targets | grep "Found HIP installation" => wrong?
# /usr/lib64/llvm18/bin/clang++ -v -print-targets | grep "Found HIP installation" => wrong !!!
# clang-18 -v -print-targets | grep "Found HIP installation" => OK
# hipconfig --rocmpath => OK => we force it

# force the last method...
if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT)
    execute_process(
        COMMAND hipconfig --rocmpath
        OUTPUT_VARIABLE _CMAKE_HIPCONFIG_ROCMPATH
        RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT
    )
    if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND EXISTS "${_CMAKE_HIPCONFIG_ROCMPATH}")
        set(CMAKE_HIP_COMPILER_ROCM_ROOT "${_CMAKE_HIPCONFIG_ROCMPATH}")
    endif()
endif()

# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
    set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
endif()
cmake_minimum_required(VERSION 3.21)

enable_language(HIP)

find_package(hip REQUIRED)
find_package(OpenMP REQUIRED)

# no other versions tested...
if (${hip_VERSION} VERSION_LESS 6.2)
    message(FATAL_ERROR "At least ROCM/HIP V6.2 is required")
endif()

message(STATUS "HIP found")

set(TARGET_NAME ggml-igpu)

file(GLOB GGML_SOURCES_ROCM "*.cpp")
#file(GLOB SRCS "*.hip")
#list(APPEND GGML_SOURCES_ROCM ${SRCS})

ggml_add_backend_library(${TARGET_NAME}
    ../../include/ggml-igpu.h
    ggml-hip.h
    mulmat.h
    mulmat-imp.h
    tensor.h
    types.h
#   mulmat-bf16.h
    mulmat-bf16bloc_V1.h
    mulmat-bf16bloc_V2.h
    mulmat-bf16bloc_V3.h
    mulmat-bf16bloc_V4.h
    mulmat-bf16bloc_V5.h
    ${GGML_SOURCES_ROCM}
)

add_compile_definitions(GGML_USE_IGPU)

set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)

target_compile_features(${TARGET_NAME} PRIVATE c_std_11 cxx_std_20)

target_compile_options(${TARGET_NAME} PRIVATE ${OpenMP_CXX_FLAGS})
# target_link_libraries(${TARGET_NAME} PRIVATE hip::device ${OpenMP_CXX_FLAGS})

if (GGML_STATIC)
    message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()

if (GGML_HIP_EXPORT_METRICS)
    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
endif()


target_link_libraries(${TARGET_NAME} PRIVATE ggml-base hip::host OpenMP::OpenMP_CXX ${OpenMP_CXX_FLAGS})

message(STATUS "OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS} ")

ggml/src/ggml-igpu/ggml-hip.cpp

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
#include "ggml-hip.h"

namespace ggml::hip {
void setDevice(int id) {
    HIP_CHECK_ERROR(hipSetDevice(id));
}
}

ggml/src/ggml-igpu/ggml-hip.h

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
#pragma once
#include <hip/hip_runtime.h>

#include "ggml.h"

#define HIP_CHECK_ERROR(trt) \
    do { \
        hipError_t _tmpVal; \
        if((_tmpVal = trt) != hipSuccess) { \
            GGML_ABORT("HIP_ERROR(%s => %s)", hipGetErrorString(_tmpVal), #trt); \
        } \
    } while(0)

namespace ggml::hip {

template<typename T>
T* allocateHost(const std::size_t size) {
    void * ptr;
    HIP_CHECK_ERROR(hipHostMalloc(&ptr, size*sizeof(T), hipHostMallocNonCoherent));
    return reinterpret_cast<T*>(ptr);
}

template<typename T>
T* allocateDevice(const std::size_t size) {
    void * ptr;
    HIP_CHECK_ERROR(hipMalloc(&ptr, size*sizeof(T)));
    return reinterpret_cast<T*>(ptr);
}

template<typename T>
void deallocateHost(T * ptr) {
    HIP_CHECK_ERROR(hipHostFree((void*)ptr));
}

template<typename T>
void deallocateDevice(T * ptr) {
    HIP_CHECK_ERROR(hipFree((void*)ptr));
}

template<typename T>
T* getDeviceMem(T* host_adr) {
    void * ptr=nullptr;
    HIP_CHECK_ERROR(hipHostGetDevicePointer(&ptr, host_adr, 0));
    return reinterpret_cast<T*>(ptr);
}

void setDevice(int id);
}
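
For context, a small usage sketch of these helpers (the buffer size is arbitrary and the kernel launch is elided): the host allocation is pinned via hipHostMalloc, and getDeviceMem returns the device-visible pointer for that same buffer.

```cpp
// Sketch: allocate pinned host memory, obtain its device alias, then release it.
#include "ggml-hip.h"
#include <cstddef>

void igpu_buffer_example() {
    ggml::hip::setDevice(0);                             // select the (i)GPU

    const std::size_t n = 1 << 20;                       // hypothetical element count
    float * host = ggml::hip::allocateHost<float>(n);    // pinned, non-coherent host memory
    float * dev  = ggml::hip::getDeviceMem(host);        // device pointer aliasing the same memory

    for (std::size_t i = 0; i < n; ++i) host[i] = 0.0f;  // CPU writes through the host pointer
    // ... a kernel would read/write through `dev` here ...
    (void)dev;

    ggml::hip::deallocateHost(host);
}
```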
