29 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H 30 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H 32 #include <hip/amd_detail/hip_common.h> 37 #if !defined(__HIPCC_RTC__) 47 #endif // !defined(__HIPCC_RTC__) 50 #ifndef __HIP_ENABLE_DEVICE_MALLOC__ 51 #define __HIP_ENABLE_DEVICE_MALLOC__ 0 54 #if __HIP_CLANG_ONLY__ 56 #if !defined(__align__) 57 #define __align__(x) __attribute__((aligned(x))) 60 #define CUDA_SUCCESS hipSuccess 62 #if !defined(__HIPCC_RTC__) 63 #include <hip/hip_runtime_api.h> 64 extern int HIP_TRACE_API;
65 #endif // !defined(__HIPCC_RTC__) 68 #include <hip/amd_detail/hip_ldg.h> 70 #include <hip/amd_detail/hip_atomic.h> 72 #include <hip/amd_detail/device_functions.h> 73 #include <hip/amd_detail/surface_functions.h> 74 #include <hip/amd_detail/texture_fetch_functions.h> 75 #include <hip/amd_detail/texture_indirect_functions.h> 78 #if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__) 79 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__ 83 #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__ 87 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1) 88 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1) 89 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1) 90 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1) 91 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1) 94 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1) 95 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1) 98 #define __HIP_ARCH_HAS_DOUBLES__ (1) 101 #define __HIP_ARCH_HAS_WARP_VOTE__ (1) 102 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1) 103 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1) 104 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0) 107 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1) 108 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0) 111 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0) 112 #define __HIP_ARCH_HAS_3DGRID__ (1) 113 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0) 118 #define launch_bounds_impl0(requiredMaxThreadsPerBlock) \ 119 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock))) 120 #define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \ 121 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \ 122 amdgpu_waves_per_eu(minBlocksPerMultiprocessor))) 123 #define select_impl_(_1, _2, impl_, ...) impl_ 124 #define __launch_bounds__(...) \ 125 select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__) 127 #if !defined(__HIPCC_RTC__) 128 __host__ inline void* __get_dynamicgroupbaseptr() {
return nullptr; }
129 #endif // !defined(__HIPCC_RTC__) 131 #if __HIP_ARCH_GFX701__ == 0 133 __device__
unsigned __hip_ds_bpermute(
int index,
unsigned src);
134 __device__
float __hip_ds_bpermutef(
int index,
float src);
135 __device__
unsigned __hip_ds_permute(
int index,
unsigned src);
136 __device__
float __hip_ds_permutef(
int index,
float src);
138 template <
int pattern>
139 __device__
unsigned __hip_ds_swizzle_N(
unsigned int src);
140 template <
int pattern>
141 __device__
float __hip_ds_swizzlef_N(
float src);
143 template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
144 __device__
int __hip_move_dpp_N(
int src);
146 #endif //__HIP_ARCH_GFX803__ == 1 148 #ifndef __OPENMP_AMDGCN__ 149 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 150 #if __HIP_ENABLE_DEVICE_MALLOC__ 151 extern "C" __device__
void* __hip_malloc(
size_t);
152 extern "C" __device__
void* __hip_free(
void* ptr);
153 static inline __device__
void* malloc(
size_t size) {
return __hip_malloc(size); }
154 static inline __device__
void* free(
void* ptr) {
return __hip_free(ptr); }
156 static inline __device__
void* malloc(
size_t size) { __builtin_trap();
return nullptr; }
157 static inline __device__
void* free(
void* ptr) { __builtin_trap();
return nullptr; }
159 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 160 #endif // !__OPENMP_AMDGCN__ 170 #if !defined(__HIPCC_RTC__) 171 #define HIP_KERNEL_NAME(...) __VA_ARGS__ 172 #define HIP_SYMBOL(X) X 174 typedef int hipLaunchParm;
176 template <std::size_t n,
typename... Ts,
177 typename std::enable_if<n ==
sizeof...(Ts)>::type* =
nullptr>
178 void pArgs(
const std::tuple<Ts...>&,
void*) {}
180 template <std::size_t n,
typename... Ts,
181 typename std::enable_if<n !=
sizeof...(Ts)>::type* =
nullptr>
182 void pArgs(
const std::tuple<Ts...>& formals,
void** _vargs) {
183 using T =
typename std::tuple_element<n, std::tuple<Ts...> >::type;
185 static_assert(!std::is_reference<T>{},
186 "A __global__ function cannot have a reference as one of its " 188 #if defined(HIP_STRICT) 189 static_assert(std::is_trivially_copyable<T>{},
190 "Only TriviallyCopyable types can be arguments to a __global__ " 193 _vargs[n] =
const_cast<void*
>(
reinterpret_cast<const void*
>(&std::get<n>(formals)));
194 return pArgs<n + 1>(formals, _vargs);
197 template <
typename... Formals,
typename... Actuals>
198 std::tuple<Formals...> validateArgsCountType(
void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
199 static_assert(
sizeof...(Formals) ==
sizeof...(Actuals),
"Argument Count Mismatch");
200 std::tuple<Formals...> to_formals{std::move(actuals)};
204 #if defined(HIP_TEMPLATE_KERNEL_LAUNCH) 205 template <
typename... Args,
typename F = void (*)(Args...)>
206 void hipLaunchKernelGGL(F kernel,
const dim3& numBlocks,
const dim3& dimBlocks,
207 std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
208 constexpr
size_t count =
sizeof...(Args);
209 auto tup_ = std::tuple<Args...>{args...};
210 auto tup = validateArgsCountType(kernel, tup_);
212 pArgs<0>(tup, _Args);
214 auto k =
reinterpret_cast<void*
>(kernel);
215 hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
218 #define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ 220 kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \ 223 #define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__) 226 #include <hip/hip_runtime_api.h> 227 #endif // !defined(__HIPCC_RTC__) 229 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_local_id(uint);
230 extern "C" __device__ __attribute__((const))
size_t __ockl_get_group_id(uint);
231 extern "C" __device__ __attribute__((const))
size_t __ockl_get_local_size(uint);
232 extern "C" __device__ __attribute__((const))
size_t __ockl_get_num_groups(uint);
233 struct __HIP_BlockIdx {
235 std::uint32_t operator()(std::uint32_t x)
const noexcept {
return __ockl_get_group_id(x); }
237 struct __HIP_BlockDim {
239 std::uint32_t operator()(std::uint32_t x)
const noexcept {
240 return __ockl_get_local_size(x);
243 struct __HIP_GridDim {
245 std::uint32_t operator()(std::uint32_t x)
const noexcept {
246 return __ockl_get_num_groups(x);
249 struct __HIP_ThreadIdx {
251 std::uint32_t operator()(std::uint32_t x)
const noexcept {
252 return __ockl_get_local_id(x);
256 #if defined(__HIPCC_RTC__) 257 typedef struct dim3 {
262 constexpr __device__
dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
265 #endif // !defined(__HIPCC_RTC__) 267 template <
typename F>
268 struct __HIP_Coordinates {
269 using R = decltype(F{}(0));
271 struct __X { __device__
operator R() const noexcept {
return F{}(0); } };
272 struct __Y { __device__
operator R() const noexcept {
return F{}(1); } };
273 struct __Z { __device__
operator R() const noexcept {
return F{}(2); } };
275 static constexpr __X x{};
276 static constexpr __Y y{};
277 static constexpr __Z z{};
279 __device__
operator dim3()
const {
return dim3(x, y, z); }
283 template <
typename F>
284 #if !defined(_MSC_VER) 285 __attribute__((weak))
287 constexpr
typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
288 template <
typename F>
289 #if !defined(_MSC_VER) 290 __attribute__((weak))
292 constexpr
typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
293 template <
typename F>
294 #if !defined(_MSC_VER) 295 __attribute__((weak))
297 constexpr
typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
299 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_global_size(uint);
302 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
303 __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
304 return __ockl_get_global_size(0);
308 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
309 __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
310 return __ockl_get_global_size(0);
314 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
315 __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
316 return __ockl_get_global_size(1);
320 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
321 __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
322 return __ockl_get_global_size(1);
326 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
327 __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
328 return __ockl_get_global_size(2);
332 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
333 __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
334 return __ockl_get_global_size(2);
337 static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
338 static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
339 static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
340 static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
342 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_local_id(uint);
343 #define hipThreadIdx_x (__ockl_get_local_id(0)) 344 #define hipThreadIdx_y (__ockl_get_local_id(1)) 345 #define hipThreadIdx_z (__ockl_get_local_id(2)) 347 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_group_id(uint);
348 #define hipBlockIdx_x (__ockl_get_group_id(0)) 349 #define hipBlockIdx_y (__ockl_get_group_id(1)) 350 #define hipBlockIdx_z (__ockl_get_group_id(2)) 352 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_local_size(uint);
353 #define hipBlockDim_x (__ockl_get_local_size(0)) 354 #define hipBlockDim_y (__ockl_get_local_size(1)) 355 #define hipBlockDim_z (__ockl_get_local_size(2)) 357 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_num_groups(uint);
358 #define hipGridDim_x (__ockl_get_num_groups(0)) 359 #define hipGridDim_y (__ockl_get_num_groups(1)) 360 #define hipGridDim_z (__ockl_get_num_groups(2)) 362 #include <hip/amd_detail/math_functions.h> 364 #if __HIP_HCC_COMPAT_MODE__ 366 #pragma push_macro("__DEFINE_HCC_FUNC") 367 #define __DEFINE_HCC_FUNC(hc_fun,hip_var) \ 368 inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \ 377 __DEFINE_HCC_FUNC(workitem_id, threadIdx)
378 __DEFINE_HCC_FUNC(group_id, blockIdx)
379 __DEFINE_HCC_FUNC(group_size, blockDim)
380 __DEFINE_HCC_FUNC(num_groups, gridDim)
381 #pragma pop_macro("__DEFINE_HCC_FUNC") 383 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_global_id(uint);
384 inline __device__ __attribute__((always_inline)) uint
385 hc_get_workitem_absolute_id(
int dim)
387 return (uint)__ockl_get_global_id(dim);
392 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 393 #if !defined(__HIPCC_RTC__) 395 #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__ 396 #pragma push_macro("__CUDA__") 398 #include <__clang_cuda_math_forward_declares.h> 399 #include <__clang_cuda_complex_builtins.h> 405 #include <include/cuda_wrappers/algorithm> 406 #include <include/cuda_wrappers/complex> 407 #include <include/cuda_wrappers/new> 409 #pragma pop_macro("__CUDA__") 410 #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__ 411 #endif // !defined(__HIPCC_RTC__) 412 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 413 #endif // __HIP_CLANG_ONLY__ 415 #include <hip/amd_detail/hip_memory.h> 417 #endif // HIP_AMD_DETAIL_RUNTIME_H hipError_t hipLaunchKernel(const void *function_address, dim3 numBlocks, dim3 dimBlocks, void **args, size_t sharedMemBytes __dparm(0), hipStream_t stream __dparm(0))
C compliant kernel launch API.
Definition: helpers.hpp:50
Definition: hip_runtime_api.h:318
#define __host__
Definition: host_defines.h:59