23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
28 #include <unordered_map>
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
37 #if (__hcc_workweek__ < 16354)
38 #error("This version of HIP requires a newer version of HCC.");
43 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
44 (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
45 #define USE_APP_PTR_FOR_CTX 1
57 extern const int release;
61 extern int HIP_LAUNCH_BLOCKING;
62 extern int HIP_API_BLOCKING;
64 extern int HIP_PRINT_ENV;
65 extern int HIP_PROFILE_API;
69 extern int HIP_STAGING_SIZE;
70 extern int HIP_STREAM_SIGNALS;
71 extern int HIP_VISIBLE_DEVICES;
72 extern int HIP_FORCE_P2P_HOST;
74 extern int HIP_HOST_COHERENT;
76 extern int HIP_HIDDEN_FREE_MEM;
79 extern int HIP_SYNC_HOST_ALLOC;
80 extern int HIP_SYNC_STREAM_WAIT;
82 extern int HIP_SYNC_NULL_STREAM;
83 extern int HIP_INIT_ALLOC;
84 extern int HIP_FORCE_NULL_STREAM;
86 extern int HIP_DUMP_CODE_OBJECT;
89 extern int HCC_OPT_FLUSH;
96 int tid()
const {
return _shortTid; };
97 pid_t pid()
const {
return _pid; };
98 uint64_t incApiSeqNum() {
return ++_apiSeqNum; };
99 uint64_t apiSeqNum()
const {
return _apiSeqNum; };
// Largest representable uint64_t; nextTrigger() returns this value when the
// trigger list is empty. The call is parenthesised so a function-like `max`
// macro cannot interfere with it.
static const uint64_t MAX_TRIGGER = (std::numeric_limits<uint64_t>::max)();
112 void print(
int tid) {
113 std::cout <<
"Enabling tracing for ";
114 for (
auto iter = _profTrigger.begin(); iter != _profTrigger.end(); iter++) {
115 std::cout <<
"tid:" << tid <<
"." << *iter <<
",";
120 uint64_t nextTrigger() {
return _profTrigger.empty() ? MAX_TRIGGER : _profTrigger.back(); };
121 void add(uint64_t trigger) { _profTrigger.push_back(trigger); };
122 void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<int>()); };
125 std::vector<uint64_t> _profTrigger;
131 extern thread_local hipError_t tls_lastHipError;
132 extern thread_local
TidInfo tls_tidInfo;
133 extern thread_local
bool tls_getPrimaryCtx;
135 extern std::vector<ProfTrigger> g_dbStartTriggers;
136 extern std::vector<ProfTrigger> g_dbStopTriggers;
146 #define KNRM "\x1B[0m"
147 #define KRED "\x1B[31m"
148 #define KGRN "\x1B[32m"
149 #define KYEL "\x1B[33m"
150 #define KBLU "\x1B[34m"
151 #define KMAG "\x1B[35m"
152 #define KCYN "\x1B[36m"
153 #define KWHT "\x1B[37m"
155 extern const char* API_COLOR;
156 extern const char* API_COLOR_END;
162 #define EVENT_THREAD_SAFE 1
164 #define STREAM_THREAD_SAFE 1
166 #define CTX_THREAD_SAFE 1
168 #define DEVICE_THREAD_SAFE 1
173 #define COMPILE_HIP_DB 1
181 #define COMPILE_HIP_TRACE_API 0x3
186 #ifndef COMPILE_HIP_ATP_MARKER
187 #define COMPILE_HIP_ATP_MARKER 0
196 #if COMPILE_HIP_ATP_MARKER
197 #include "CXLActivityLogger.h"
198 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
199 #define MARKER_END() amdtEndMarker();
200 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
201 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
204 #define MARKER_BEGIN(markerName, group)
206 #define RESUME_PROFILING
207 #define STOP_PROFILING
213 #define TRACE_ALL 0 // 0x01
214 #define TRACE_KCMD 1 // 0x02, kernel command
215 #define TRACE_MCMD 2 // 0x04, memory command
216 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
217 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
218 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
229 #define DB_MAX_FLAG 6
236 const char* _shortName;
240 static const DbName dbName[] = {
242 {KYEL,
"sync"}, {KCYN,
"mem"}, {KMAG,
"copy"}, {KRED,
"warn"},
248 #define tprintf(trace_level, ...) \
250 if (HIP_DB & (1 << (trace_level))) { \
252 snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
253 fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
254 dbName[trace_level]._shortName, tls_tidInfo.pid(), tls_tidInfo.tid(), msgStr, KNRM); \
259 #define tprintf(trace_level, ...)
263 static inline uint64_t getTicks() {
return hc::get_system_ticks(); }
266 extern uint64_t recordApiTrace(std::string* fullStr,
const std::string& apiStr);
268 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
269 #define API_TRACE(forceTrace, ...) \
270 uint64_t hipApiStartTick = 0; \
272 tls_tidInfo.incApiSeqNum(); \
274 (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
275 std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
276 std::string fullStr; \
277 hipApiStartTick = recordApiTrace(&fullStr, apiStr); \
278 if (HIP_PROFILE_API == 0x1) { \
279 MARKER_BEGIN(__func__, "HIP") \
280 } else if (HIP_PROFILE_API == 0x2) { \
281 MARKER_BEGIN(fullStr.c_str(), "HIP"); \
288 #define API_TRACE(IS_CMD, ...) tls_tidInfo.incApiSeqNum();
294 std::call_once(hip_initialized, ihipInit); \
295 ihipCtxStackUpdate();
296 #define HIP_SET_DEVICE() ihipDeviceSetState();
302 #define HIP_INIT_API(cid, ...) \
304 API_TRACE(0, __VA_ARGS__); \
305 HIP_CB_SPAWNER_OBJECT(cid);
311 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
313 API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \
314 HIP_CB_SPAWNER_OBJECT(cid);
320 #define ihipLogStatus(hipStatus) \
322 hipError_t localHipStatus = hipStatus; \
323 tls_lastHipError = localHipStatus; \
325 if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
326 auto ticks = getTicks() - hipApiStartTick; \
327 fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
328 (localHipStatus == 0) ? API_COLOR : KRED, tls_tidInfo.pid(), tls_tidInfo.tid(), \
329 tls_tidInfo.apiSeqNum(), __func__, localHipStatus, \
330 ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
332 if (HIP_PROFILE_API) { \
362 #define HIP_IPC_RESERVED_SIZE 24
369 char reserved[HIP_IPC_RESERVED_SIZE];
374 std::string fileName;
375 hsa_executable_t executable = {};
376 hsa_code_object_reader_t coReader = {};
379 if (executable.handle) hsa_executable_destroy(executable);
380 if (coReader.handle) hsa_code_object_reader_destroy(coReader);
// No-op try_lock: unconditionally reports that the lock was acquired.
bool try_lock() {
    return true;
}
394 #if EVENT_THREAD_SAFE
395 typedef std::mutex EventMutex;
// NOTE(review): this diagnostic follows the `#if EVENT_THREAD_SAFE` typedef of
// EventMutex, so it should name events; the previous text said "Stream".
#warning "Event thread-safe disabled"
401 #if STREAM_THREAD_SAFE
402 typedef std::mutex StreamMutex;
404 #warning "Stream thread-safe disabled"
410 typedef std::mutex CtxMutex;
413 #warning "Ctx thread-safe disabled"
416 #if DEVICE_THREAD_SAFE
417 typedef std::mutex DeviceMutex;
420 #warning "Device thread-safe disabled"
427 template <
typename T>
431 : _criticalData(&criticalData),
432 _autoUnlock(autoUnlock)
435 tprintf(DB_SYNC,
"locking criticalData=%p for %s..\n", _criticalData,
436 ToString(_criticalData->_parent).c_str());
437 _criticalData->_mutex.lock();
442 tprintf(DB_SYNC,
"auto-unlocking criticalData=%p for %s...\n", _criticalData,
443 ToString(_criticalData->_parent).c_str());
444 _criticalData->_mutex.unlock();
449 tprintf(DB_SYNC,
"unlocking criticalData=%p for %s...\n", _criticalData,
450 ToString(_criticalData->_parent).c_str());
451 _criticalData->_mutex.unlock();
455 T* operator->() {
return _criticalData; };
463 template <
typename MUTEX_TYPE>
467 void lock() { _mutex.lock(); }
468 void unlock() { _mutex.unlock(); }
469 bool try_lock() {
return _mutex.try_lock(); }
475 template <
typename MUTEX_TYPE>
479 : _kernelCnt(0), _av(av), _parent(parentStream){};
489 tprintf(DB_SYNC,
"munlocking criticalData=%p for %s...\n",
this,
490 ToString(this->_parent).c_str());
496 tprintf(DB_SYNC,
"mtry_locking=%d criticalData=%p for %s...\n", gotLock,
this,
497 ToString(this->_parent).c_str());
498 return gotLock ?
this :
nullptr;
505 hc::accelerator_view _av;
// Scheduling modes; enumerators keep their implicit values 0, 1, 2.
// NOTE(review): the names suggest host wait strategies - confirm against the
// code that reads _scheduleMode.
enum ScheduleMode {
    Auto,
    Spin,
    Yield
};
// Sequence-number type: an alias for uint64_t.
using SeqNum_t = uint64_t;
534 void locked_copySync(
void* dst,
const void* src,
size_t sizeBytes,
unsigned kind,
535 bool resolveOn =
true);
537 void locked_copy2DSync(
void* dst,
const void* src,
size_t width,
size_t height,
size_t srcPitch,
size_t dstPitch,
unsigned kind,
538 bool resolveOn =
true);
540 void locked_copyAsync(
void* dst,
const void* src,
size_t sizeBytes,
unsigned kind);
542 void locked_copy2DAsync(
void* dst,
const void* src,
size_t width,
size_t height,
size_t srcPitch,
size_t dstPitch,
unsigned kind);
544 void lockedSymbolCopySync(hc::accelerator& acc,
void* dst,
void* src,
size_t sizeBytes,
545 size_t offset,
unsigned kind);
546 void lockedSymbolCopyAsync(hc::accelerator& acc,
void* dst,
void* src,
size_t sizeBytes,
547 size_t offset,
unsigned kind);
553 void lockclose_postKernelCommand(
const char* kernelName, hc::accelerator_view* av);
558 hc::accelerator_view* locked_getAv() {
564 hc::completion_future locked_recordEvent(
hipEvent_t event);
567 void locked_eventWaitComplete(hc::completion_future& marker, hc::hcWaitMode waitMode);
572 hc::hcWaitMode waitMode()
const;
577 void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
578 uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
579 uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
580 uint32_t sharedMemBytes,
void* kernarg,
size_t kernSize,
591 bool isDefaultStream()
const {
return _id == 0; };
602 unsigned resolveMemcpyDirection(
bool srcInDeviceMem,
bool dstInDeviceMem);
603 void resolveHcMemcpyDirection(
unsigned hipMemKind,
const hc::AmPointerInfo* dstPtrInfo,
604 const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
605 ihipCtx_t** copyDevice,
bool* forceUnpinnedCopy);
607 bool canSeeMemory(
const ihipCtx_t* thisCtx,
const hc::AmPointerInfo* dstInfo,
608 const hc::AmPointerInfo* srcInfo);
610 void addSymbolPtrToTracker(hc::accelerator& acc,
void* ptr,
size_t sizeBytes);
618 std::mutex _hasQueueLock;
623 friend std::ostream& operator<<(std::ostream& os,
const ihipStream_t& s);
626 ScheduleMode _scheduleMode;
635 : _stream(stream), _callback(callback), _userData(userData) {
645 enum hipEventStatus_t {
646 hipEventStatusUnitialized = 0,
647 hipEventStatusCreated = 1,
648 hipEventStatusRecording = 2,
649 hipEventStatusComplete = 3,
653 enum ihipEventType_t {
654 hipEventTypeIndependent,
655 hipEventTypeStartCommand,
656 hipEventTypeStopCommand,
662 _state = hipEventStatusCreated;
665 _type = hipEventTypeIndependent;
668 void marker(
const hc::completion_future& marker) { _marker = marker; };
669 hc::completion_future& marker() {
return _marker; }
670 uint64_t timestamp()
const {
return _timestamp; };
671 ihipEventType_t type()
const {
return _type; };
673 ihipEventType_t _type;
674 hipEventStatus_t _state;
679 hc::completion_future _marker;
685 template <
typename MUTEX_TYPE>
708 void attachToCompletionFuture(
const hc::completion_future* cf,
hipStream_t stream,
709 ihipEventType_t eventType);
710 std::pair<hipEventStatus_t, uint64_t> refreshEventStatus();
716 return _criticalData._eventData;
733 template <
typename MUTEX_TYPE>
737 : _parent(parentDevice), _ctxCount(0){};
744 std::list<ihipCtx_t*>& ctxs() {
return _ctxs; };
745 const std::list<ihipCtx_t*>& const_ctxs()
const {
return _ctxs; };
746 int getcount() {
return _ctxCount; };
753 std::list<ihipCtx_t*> _ctxs;
767 ihipDevice_t(
unsigned deviceId,
unsigned deviceCnt, hc::accelerator& acc);
771 ihipCtx_t* getPrimaryCtx()
const {
return _primaryCtx; };
779 hc::accelerator _acc;
780 hsa_agent_t _hsaAgent;
809 std::vector<char> _arguments;
814 template <
typename MUTEX_TYPE>
818 : _parent(parentCtx), _peerCnt(0) {
819 _peerAgents =
new hsa_agent_t[deviceCnt];
823 if (_peerAgents !=
nullptr) {
825 _peerAgents =
nullptr;
832 std::list<ihipStream_t*>& streams() {
return _streams; };
833 const std::list<ihipStream_t*>& const_streams()
const {
return _streams; };
837 bool isPeerWatcher(
const ihipCtx_t* peer);
841 void resetPeerWatchers(
ihipCtx_t* thisDevice);
842 void printPeerWatchers(FILE* f)
const;
844 uint32_t peerCnt()
const {
return _peerCnt; };
845 hsa_agent_t* peerAgents()
const {
return _peerAgents; };
849 std::list<ihipCtx_t*> _peers;
851 std::stack<ihipExec_t> _execStack;
859 std::list<ihipStream_t*> _streams;
867 hsa_agent_t* _peerAgents;
869 void recomputePeerAgents();
900 void locked_waitAllStreams();
901 void locked_syncDefaultStream(
bool waitOnSelf,
bool syncHost);
905 const ihipDevice_t* getDevice()
const {
return _device; };
906 int getDeviceNum()
const {
return _device->_deviceId; };
909 ihipDevice_t* getWriteableDevice()
const {
return _device; };
911 std::string toString()
const;
935 extern std::once_flag hip_initialized;
936 extern unsigned g_deviceCnt;
937 extern hsa_agent_t g_cpu_agent;
938 extern hsa_agent_t* g_allAgents;
942 extern void ihipInit();
943 extern const char* ihipErrorString(hipError_t);
944 extern ihipCtx_t* ihipGetTlsDefaultCtx();
945 extern void ihipSetTlsDefaultCtx(
ihipCtx_t* ctx);
946 extern hipError_t ihipSynchronize(
void);
947 extern void ihipCtxStackUpdate();
948 extern hipError_t ihipDeviceSetState();
951 ihipCtx_t* ihipGetPrimaryCtx(
unsigned deviceIndex);
955 hipError_t ihipStreamSynchronize(
hipStream_t stream);
959 inline std::ostream& operator<<(std::ostream& os,
const ihipStream_t& s) {
961 os << s.getDevice()->_deviceId;
968 inline std::ostream& operator<<(std::ostream& os,
const dim3& s) {
979 inline std::ostream& operator<<(std::ostream& os,
const gl_dim3& s) {
991 inline std::ostream& operator<<(std::ostream& os,
const hipEvent_t& e) {
992 os <<
"event:" << std::hex << static_cast<void*>(e);
996 inline std::ostream& operator<<(std::ostream& os,
const ihipCtx_t* c) {
997 os <<
"ctx:" <<
static_cast<const void*
>(c) <<
".dev:" << c->getDevice()->_deviceId;
1003 namespace hip_internal {
1004 hipError_t memcpyAsync(
void* dst,
const void* src,
size_t sizeBytes, hipMemcpyKind kind,
Definition: hip_hcc_internal.h:234
Definition: hip_hcc_internal.h:765
Definition: hip_hcc_internal.h:464
Definition: hip_hcc_internal.h:109
friend hipError_t hipStreamQuery(hipStream_t)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:154
Definition: hip_hcc_internal.h:387
Definition: hip_hcc_internal.h:363
uint32_t x
x
Definition: hip_runtime_api.h:241
Definition: grid_launch.h:17
Definition: hip_hcc_internal.h:804
Definition: hip_hcc_internal.h:885
Definition: hip_runtime_api.h:240
uint32_t y
y
Definition: hip_runtime_api.h:242
Definition: hip_hcc_internal.h:632
Definition: hip_hcc_internal.h:734
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:783
uint32_t z
z
Definition: hip_runtime_api.h:243
friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:110
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:660
Definition: hip_hcc_internal.h:686
Definition: hip_hcc_internal.h:339
Definition: hip_hcc_internal.h:705
Definition: hip_hcc_internal.h:815
Definition: hip_hcc_internal.h:524
Definition: hip_hcc_internal.h:476
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:366
Definition: hip_hcc_internal.h:428
Definition: hip_hcc_internal.h:373
Definition: hip_hcc_internal.h:92
void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void *userData)
Definition: hip_runtime_api.h:791