@@ -837,7 +837,9 @@ REGISTER_DTYPE(bf8 , unsigned _BitInt(8))
// Integer dtype registrations: each REGISTER_DTYPE(name, ctype) invocation pairs a short
// dtype name (i32, u32, i16, u16, i8, u8) with a fixed-width integer type from <stdint.h>.
// The REGISTER_DTYPE macro itself is defined outside this chunk.
837837REGISTER_DTYPE(i32 , int32_t )
838838REGISTER_DTYPE(u32 , uint32_t )
839839REGISTER_DTYPE(i16 , int16_t )
// NOTE(review): only the u16 registration is compiled when __clang_major__ >= 20; the other
// integer dtypes are unconditional. The reason for gating u16 alone is not visible here -- confirm.
840+ #if __clang_major__ >= 20
840841REGISTER_DTYPE (u16 , uint16_t )
842+ #endif
841843REGISTER_DTYPE (i8 , int8_t )
842844REGISTER_DTYPE(u8 , uint8_t )
843845
@@ -1163,6 +1165,12 @@ OPUS_D constexpr auto buffer_default_config() {
// Build an __amdgpu_buffer_rsrc_t for `ptr` by forwarding to __builtin_amdgcn_make_buffer_rsrc.
//   ptr    : base address; its const qualifier is cast away because the builtin takes a plain void*.
//   size   : forwarded as the builtin's `num` argument; defaults to 0xffffffff.
//   config : forwarded as the builtin's `flags` argument; defaults to buffer_default_config().
// The stride argument is always passed as 0.
11631165OPUS_D __amdgpu_buffer_rsrc_t make_buffer_rsrc (const void * ptr, uint32_t size = 0xffffffff , uint32_t config = buffer_default_config()) {
11641166 return __builtin_amdgcn_make_buffer_rsrc (const_cast <void *>(static_cast <const void *>(ptr)), 0 , size, config); // void *p, short stride, int num, int flags
11651167}
// Fallback for clang major versions below 20: declare llvm_amdgcn_raw_buffer_load_lds without a
// body and attach an asm label so that calls bind to the llvm.amdgcn.raw.buffer.load.lds symbol name.
//   r    : buffer resource passed as an i32x4_t
//   p    : destination pointer in address_space(3)
//   size : call sites in gmem pass the element size in bytes (1, 2, 4, and 12/16 under __gfx950__)
//   vos, sos, ios, aux : forwarded offset/aux operands (gmem passes v_os, s_os, 0, aux)
// -Wundefined-inline is silenced only around this declaration (push/pop), since no definition is
// provided -- presumably OPUS_D marks the function inline; confirm against the OPUS_D definition.
// NOTE(review): the asm label string appears here with a leading space; verify against the real
// file that the label is exactly "llvm.amdgcn.raw.buffer.load.lds".
1168+ #if __clang_major__ < 20
1169+ #pragma clang diagnostic push
1170+ #pragma clang diagnostic ignored "-Wundefined-inline"
1171+ OPUS_D void llvm_amdgcn_raw_buffer_load_lds (i32x4_t r, __attribute__((address_space(3 ))) uint32_t* p, index_t size, index_t vos, index_t sos, index_t ios, index_t aux) __asm(" llvm.amdgcn.raw.buffer.load.lds" );
1172+ #pragma clang diagnostic pop
1173+ #endif
11661174template <typename T_>
11671175struct gmem {
11681176 using T = remove_cvref_t <T_>;
@@ -1193,6 +1201,16 @@ struct gmem {
11931201 else if constexpr (sizeof (type) == 12 ) { __builtin_amdgcn_raw_ptr_buffer_load_lds (cached_rsrc, dst, 12 , v_os, s_os, 0 , aux); }
11941202 else if constexpr (sizeof (type) == 16 ) { __builtin_amdgcn_raw_ptr_buffer_load_lds (cached_rsrc, dst, 16 , v_os, s_os, 0 , aux); }
11951203#endif
1204+ #else
1205+ i32x4_t cached_rsrc_;
1206+ __builtin_memcpy (&cached_rsrc_, &cached_rsrc, sizeof (i32x4_t )); // builtin memcpy, __builtin_bit_cast() can not use here due to __amdgpu_buffer_rsrc_t is non copyable
1207+ if constexpr (sizeof (type) == 1 ) {llvm_amdgcn_raw_buffer_load_lds (cached_rsrc_, reinterpret_cast <__attribute__ ((address_space (3 ))) u32_t *>(reinterpret_cast <unsigned long int >(dst)), 1 , v_os, s_os, 0 , aux); }
1208+ else if constexpr (sizeof (type) == 2 ) {llvm_amdgcn_raw_buffer_load_lds (cached_rsrc_, reinterpret_cast <__attribute__ ((address_space (3 ))) u32_t *>(reinterpret_cast <unsigned long int >(dst)), 2 , v_os, s_os, 0 , aux); }
1209+ else if constexpr (sizeof (type) == 4 ) {llvm_amdgcn_raw_buffer_load_lds (cached_rsrc_, reinterpret_cast <__attribute__ ((address_space (3 ))) u32_t *>(reinterpret_cast <unsigned long int >(dst)), 4 , v_os, s_os, 0 , aux); }
1210+ #if defined(__gfx950__)
1211+ else if constexpr (sizeof (type) == 12 ) {llvm_amdgcn_raw_buffer_load_lds (cached_rsrc_, reinterpret_cast <__attribute__ ((address_space (3 ))) u32_t *>(reinterpret_cast <unsigned long int >(dst)), 12 , v_os, s_os, 0 , aux); }
1212+ else if constexpr (sizeof (type) == 16 ) {llvm_amdgcn_raw_buffer_load_lds (cached_rsrc_, reinterpret_cast <__attribute__ ((address_space (3 ))) u32_t *>(reinterpret_cast <unsigned long int >(dst)), 16 , v_os, s_os, 0 , aux); }
1213+ #endif
11961214#endif
11971215 }
11981216
0 commit comments