29 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H 30 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H 32 #include <hip/hcc_detail/hip_common.h> 48 #ifndef __HIP_ENABLE_DEVICE_MALLOC__ 49 #define __HIP_ENABLE_DEVICE_MALLOC__ 0 52 #if __HCC_OR_HIP_CLANG__ 55 #if !defined(__align__) 56 #define __align__(x) __attribute__((aligned(x))) 60 #define CUDA_SUCCESS hipSuccess 62 #include <hip/hip_runtime_api.h> 63 #endif // __HCC_OR_HIP_CLANG__ 67 #ifdef HIP_ENABLE_PRINTF 68 #define HCC_ENABLE_ACCELERATOR_PRINTF 1 74 #include "grid_launch.h" 75 #include "hc_printf.hpp" 79 #if GENERIC_GRID_LAUNCH == 0 80 #define hipLaunchParm grid_launch_parm 83 struct Empty_launch_parm {};
85 #define hipLaunchParm hip_impl::Empty_launch_parm 86 #endif // GENERIC_GRID_LAUNCH 88 #if defined(GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1 89 #else // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0. 90 #error(HCC must support GRID_LAUNCH_20) 91 #endif // GRID_LAUNCH_VERSION 95 #if GENERIC_GRID_LAUNCH == 1 && defined __HCC__ 96 #include "grid_launch_GGL.hpp" 97 #endif // GENERIC_GRID_LAUNCH 101 #if __HCC_OR_HIP_CLANG__ 102 extern int HIP_TRACE_API;
105 #include <hip/hcc_detail/hip_ldg.h> 107 #include <hip/hcc_detail/hip_atomic.h> 109 #include <hip/hcc_detail/device_functions.h> 110 #include <hip/hcc_detail/surface_functions.h> 112 #include <hip/hcc_detail/math_functions.h> 113 #include <hip/hcc_detail/texture_functions.h> 115 #include <hip/hcc_detail/texture_fetch_functions.h> 116 #include <hip/hcc_detail/texture_indirect_functions.h> 119 #if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__) 120 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__ 124 #if defined(__HCC__) && __HIP_DEVICE_COMPILE__ == 1 126 #define assert(COND) \ 136 #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__ 140 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1) 141 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1) 142 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1) 143 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1) 144 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1) 147 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1) 148 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0) 151 #define __HIP_ARCH_HAS_DOUBLES__ (1) 154 #define __HIP_ARCH_HAS_WARP_VOTE__ (1) 155 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1) 156 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1) 157 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0) 160 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1) 161 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0) 164 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0) 165 #define __HIP_ARCH_HAS_3DGRID__ (1) 166 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0) 171 #define launch_bounds_impl0(requiredMaxThreadsPerBlock) \ 172 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock))) 173 #define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \ 174 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \ 175 amdgpu_waves_per_eu(minBlocksPerMultiprocessor))) 176 #define select_impl_(_1, _2, impl_, ...) impl_ 177 #define __launch_bounds__(...) \ 178 select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__) 181 #if defined(__cplusplus) 183 #elif defined(__STDC_VERSION__) 187 __host__ inline void* __get_dynamicgroupbaseptr() {
return nullptr; }
189 #if __HIP_ARCH_GFX701__ == 0 191 __device__
unsigned __hip_ds_bpermute(
int index,
unsigned src);
192 __device__
float __hip_ds_bpermutef(
int index,
float src);
193 __device__
unsigned __hip_ds_permute(
int index,
unsigned src);
194 __device__
float __hip_ds_permutef(
int index,
float src);
196 template <
int pattern>
197 __device__
unsigned __hip_ds_swizzle_N(
unsigned int src);
198 template <
int pattern>
199 __device__
float __hip_ds_swizzlef_N(
float src);
201 template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
202 __device__
int __hip_move_dpp_N(
int src);
204 #endif //__HIP_ARCH_GFX803__ == 1 206 #endif // __HCC_OR_HIP_CLANG__ 212 using R = decltype(hc_get_group_id(0));
215 R operator()(std::uint32_t x)
const noexcept {
return hc_get_group_id(x); }
218 using R = decltype(hc_get_group_size(0));
221 R operator()(std::uint32_t x)
const noexcept {
222 return hc_get_group_size(x);
226 using R = decltype(hc_get_num_groups(0));
229 R operator()(std::uint32_t x)
const noexcept {
230 return hc_get_num_groups(x);
234 using R = decltype(hc_get_workitem_id(0));
237 R operator()(std::uint32_t x)
const noexcept {
238 return hc_get_workitem_id(x);
243 template <
typename F>
245 using R = decltype(F{}(0));
247 struct X { __device__
operator R()
const noexcept {
return F{}(0); } };
248 struct Y { __device__
operator R()
const noexcept {
return F{}(1); } };
249 struct Z { __device__
operator R()
const noexcept {
return F{}(2); } };
251 static constexpr
X x{};
252 static constexpr
Y y{};
253 static constexpr
Z z{};
260 return hc_get_grid_size(0);
266 return hc_get_grid_size(0);
272 return hc_get_grid_size(1);
278 return hc_get_grid_size(1);
284 return hc_get_grid_size(2);
290 return hc_get_grid_size(2);
298 #define hipThreadIdx_x (hc_get_workitem_id(0)) 299 #define hipThreadIdx_y (hc_get_workitem_id(1)) 300 #define hipThreadIdx_z (hc_get_workitem_id(2)) 302 #define hipBlockIdx_x (hc_get_group_id(0)) 303 #define hipBlockIdx_y (hc_get_group_id(1)) 304 #define hipBlockIdx_z (hc_get_group_id(2)) 306 #define hipBlockDim_x (hc_get_group_size(0)) 307 #define hipBlockDim_y (hc_get_group_size(1)) 308 #define hipBlockDim_z (hc_get_group_size(2)) 310 #define hipGridDim_x (hc_get_num_groups(0)) 311 #define hipGridDim_y (hc_get_num_groups(1)) 312 #define hipGridDim_z (hc_get_num_groups(2)) 314 #endif // defined __HCC__ 316 #ifndef __OPENMP_AMDGCN__ 317 #if __HCC_OR_HIP_CLANG__ 318 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 319 #if __HIP_ENABLE_DEVICE_MALLOC__ 320 extern "C" __device__
void* __hip_malloc(
size_t);
321 extern "C" __device__
void* __hip_free(
void* ptr);
322 static inline __device__
void* malloc(
size_t size) {
return __hip_malloc(size); }
323 static inline __device__
void* free(
void* ptr) {
return __hip_free(ptr); }
325 static inline __device__
void* malloc(
size_t size) { __builtin_trap();
return nullptr; }
326 static inline __device__
void* free(
void* ptr) { __builtin_trap();
return nullptr; }
328 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 329 #endif //__HCC_OR_HIP_CLANG__ 330 #endif // !__OPENMP_AMDGCN__ 334 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE) 336 #define HIP_KERNEL_NAME(...) (__VA_ARGS__) 337 #define HIP_SYMBOL(X) #X 339 #if defined __HCC_CPP__ 340 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream,
dim3 grid,
dim3 block,
342 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream,
dim3 grid,
size_t block,
344 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream,
size_t grid,
dim3 block,
346 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream,
size_t grid,
size_t block,
348 extern void ihipPostLaunchKernel(
const char* kernelName, hipStream_t stream,
grid_launch_parm& lp,
bool unlockPostponed = 0);
350 #if GENERIC_GRID_LAUNCH == 0 354 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \ 356 grid_launch_parm lp; \ 357 lp.dynamic_group_mem_bytes = _groupMemBytes; \ 358 hipStream_t trueStream = \ 359 (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \ 360 _kernelName(lp, ##__VA_ARGS__); \ 361 ihipPostLaunchKernel(#_kernelName, trueStream, lp); \ 363 #endif // GENERIC_GRID_LAUNCH 365 #elif defined(__HCC_C__) 379 #elif defined(__clang__) && defined(__HIP__) 381 #define HIP_KERNEL_NAME(...) __VA_ARGS__ 382 #define HIP_SYMBOL(X) X 384 typedef int hipLaunchParm;
386 template <std::size_t n,
typename... Ts,
387 typename std::enable_if<n ==
sizeof...(Ts)>::type* =
nullptr>
388 void pArgs(
const std::tuple<Ts...>&,
void*) {}
390 template <std::size_t n,
typename... Ts,
391 typename std::enable_if<n !=
sizeof...(Ts)>::type* =
nullptr>
392 void pArgs(
const std::tuple<Ts...>& formals,
void** _vargs) {
393 using T =
typename std::tuple_element<n, std::tuple<Ts...> >::type;
395 static_assert(!std::is_reference<T>{},
396 "A __global__ function cannot have a reference as one of its " 398 #if defined(HIP_STRICT) 399 static_assert(std::is_trivially_copyable<T>{},
400 "Only TriviallyCopyable types can be arguments to a __global__ " 403 _vargs[n] =
const_cast<void*
>(
reinterpret_cast<const void*
>(&std::get<n>(formals)));
404 return pArgs<n + 1>(formals, _vargs);
407 template <
typename... Formals,
typename... Actuals>
408 std::tuple<Formals...> validateArgsCountType(
void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
409 static_assert(
sizeof...(Formals) ==
sizeof...(Actuals),
"Argument Count Mismatch");
410 std::tuple<Formals...> to_formals{std::move(actuals)};
414 #if defined(HIP_TEMPLATE_KERNEL_LAUNCH) 415 template <
typename... Args,
typename F = void (*)(Args...)>
416 void hipLaunchKernelGGL(F kernel,
const dim3& numBlocks,
const dim3& dimBlocks,
417 std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
418 constexpr
size_t count =
sizeof...(Args);
419 auto tup_ = std::tuple<Args...>{args...};
420 auto tup = validateArgsCountType(kernel, tup_);
422 pArgs<0>(tup, _Args);
424 auto k =
reinterpret_cast<void*
>(kernel);
425 hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
428 #define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ 430 kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \ 433 #define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__) 436 #include <hip/hip_runtime_api.h> 437 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_local_id(uint);
438 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_group_id(uint);
439 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_local_size(uint);
440 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_num_groups(uint);
441 struct __HIP_BlockIdx {
443 std::uint32_t operator()(std::uint32_t x)
const noexcept {
return __ockl_get_group_id(x); }
445 struct __HIP_BlockDim {
447 std::uint32_t operator()(std::uint32_t x)
const noexcept {
448 return __ockl_get_local_size(x);
451 struct __HIP_GridDim {
453 std::uint32_t operator()(std::uint32_t x)
const noexcept {
454 return __ockl_get_num_groups(x);
457 struct __HIP_ThreadIdx {
459 std::uint32_t operator()(std::uint32_t x)
const noexcept {
460 return __ockl_get_local_id(x);
464 template <
typename F>
465 struct __HIP_Coordinates {
466 using R = decltype(F{}(0));
468 struct X { __device__
operator R()
const noexcept {
return F{}(0); } };
469 struct Y { __device__
operator R()
const noexcept {
return F{}(1); } };
470 struct Z { __device__
operator R()
const noexcept {
return F{}(2); } };
472 static constexpr
X x{};
473 static constexpr
Y y{};
474 static constexpr
Z z{};
476 __device__
operator dim3()
const {
return dim3(x, y, z); }
480 template <
typename F>
481 #if !defined(_MSC_VER) 482 __attribute__((weak))
484 constexpr
typename __HIP_Coordinates<F>::X __HIP_Coordinates<F>::x;
485 template <
typename F>
486 #if !defined(_MSC_VER) 487 __attribute__((weak))
489 constexpr
typename __HIP_Coordinates<F>::Y __HIP_Coordinates<F>::y;
490 template <
typename F>
491 #if !defined(_MSC_VER) 492 __attribute__((weak))
494 constexpr
typename __HIP_Coordinates<F>::Z __HIP_Coordinates<F>::z;
496 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_global_size(uint);
499 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::X,
500 __HIP_Coordinates<__HIP_BlockDim>::X) noexcept {
501 return __ockl_get_global_size(0);
505 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::X,
506 __HIP_Coordinates<__HIP_GridDim>::X) noexcept {
507 return __ockl_get_global_size(0);
511 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::Y,
512 __HIP_Coordinates<__HIP_BlockDim>::Y) noexcept {
513 return __ockl_get_global_size(1);
517 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::Y,
518 __HIP_Coordinates<__HIP_GridDim>::Y) noexcept {
519 return __ockl_get_global_size(1);
523 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::Z,
524 __HIP_Coordinates<__HIP_BlockDim>::Z) noexcept {
525 return __ockl_get_global_size(2);
529 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::Z,
530 __HIP_Coordinates<__HIP_GridDim>::Z) noexcept {
531 return __ockl_get_global_size(2);
534 static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
535 static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
536 static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
537 static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
539 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_local_id(uint);
540 #define hipThreadIdx_x (__ockl_get_local_id(0)) 541 #define hipThreadIdx_y (__ockl_get_local_id(1)) 542 #define hipThreadIdx_z (__ockl_get_local_id(2)) 544 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_group_id(uint);
545 #define hipBlockIdx_x (__ockl_get_group_id(0)) 546 #define hipBlockIdx_y (__ockl_get_group_id(1)) 547 #define hipBlockIdx_z (__ockl_get_group_id(2)) 549 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_local_size(uint);
550 #define hipBlockDim_x (__ockl_get_local_size(0)) 551 #define hipBlockDim_y (__ockl_get_local_size(1)) 552 #define hipBlockDim_z (__ockl_get_local_size(2)) 554 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_num_groups(uint);
555 #define hipGridDim_x (__ockl_get_num_groups(0)) 556 #define hipGridDim_y (__ockl_get_num_groups(1)) 557 #define hipGridDim_z (__ockl_get_num_groups(2)) 559 #include <hip/hcc_detail/math_functions.h> 561 #if __HIP_HCC_COMPAT_MODE__ 563 #pragma push_macro("__DEFINE_HCC_FUNC") 564 #define __DEFINE_HCC_FUNC(hc_fun,hip_var) \ 565 inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \ 574 __DEFINE_HCC_FUNC(workitem_id, threadIdx)
575 __DEFINE_HCC_FUNC(group_id, blockIdx)
576 __DEFINE_HCC_FUNC(group_size, blockDim)
577 __DEFINE_HCC_FUNC(num_groups, gridDim)
578 #pragma pop_macro("__DEFINE_HCC_FUNC") 580 extern "C" __device__ __attribute__((
const))
size_t __ockl_get_global_id(uint);
581 inline __device__ __attribute__((always_inline)) uint
582 hc_get_workitem_absolute_id(
int dim)
584 return (uint)__ockl_get_global_id(dim);
589 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 591 #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__ 592 #pragma push_macro("__CUDA__") 594 #include <__clang_cuda_math_forward_declares.h> 595 #include <__clang_cuda_complex_builtins.h> 601 #include <include/cuda_wrappers/algorithm> 602 #include <include/cuda_wrappers/complex> 603 #include <include/cuda_wrappers/new> 605 #pragma pop_macro("__CUDA__") 606 #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__ 607 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 608 #endif // defined(__clang__) && defined(__HIP__) 610 #include <hip/hcc_detail/hip_memory.h> 612 #endif // HIP_HCC_DETAIL_RUNTIME_H Definition: hip_runtime.h:233
hipError_t hipLaunchKernel(const void *function_address, dim3 numBlocks, dim3 dimBlocks, void **args, size_t sharedMemBytes __dparm(0), hipStream_t stream __dparm(0))
C compliant kernel launch API.
Definition: hip_runtime.h:248
Definition: hip_runtime.h:249
Definition: hip_runtime.h:225
Definition: hip_runtime_api.h:330
#define __host__
Definition: host_defines.h:41
Definition: hip_runtime.h:211
Definition: grid_launch.h:31
Definition: concepts.hpp:25
Definition: hip_runtime.h:247
Definition: hip_runtime.h:217
Definition: hip_runtime.h:244