HIP: Heterogeneous-computing Interface for Portability
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_util.h"
34 #include "env.h"
35 
36 
37 #if (__hcc_workweek__ < 16354)
38 #error("This version of HIP requires a newer version of HCC.");
39 #endif
40 
41 // Use the __appPtr field in the am memtracker to store the context.
42 // Requires a bug fix in HCC
43 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
44  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
45 #define USE_APP_PTR_FOR_CTX 1
46 #endif
47 
48 
49 #define USE_IPC 1
50 
51 //---
52 // Environment variables:
53 
54 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
55 // in debug+release.
56 // static const int debug = 0;
57 extern const int release;
58 
59 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
60 // kernels?
61 extern int HIP_LAUNCH_BLOCKING;
62 extern int HIP_API_BLOCKING;
63 
64 extern int HIP_PRINT_ENV;
65 extern int HIP_PROFILE_API;
66 // extern int HIP_TRACE_API;
67 extern int HIP_ATP;
68 extern int HIP_DB;
69 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
70 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
71 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
72 extern int HIP_FORCE_P2P_HOST;
73 
74 extern int HIP_HOST_COHERENT;
75 
76 extern int HIP_HIDDEN_FREE_MEM;
77 //---
78 // Chicken bits for disabling functionality to work around potential issues:
79 extern int HIP_SYNC_HOST_ALLOC;
80 extern int HIP_SYNC_STREAM_WAIT;
81 
82 extern int HIP_SYNC_NULL_STREAM;
83 extern int HIP_INIT_ALLOC;
84 extern int HIP_FORCE_NULL_STREAM;
85 
86 extern int HIP_DUMP_CODE_OBJECT;
87 
88 // TODO - remove when this is standard behavior.
89 extern int HCC_OPT_FLUSH;
90 
// Assigns a short TID to each new thread and keeps that thread's HIP API sequence counter.
// Used for HIP debugging output.
class TidInfo {
   public:
    TidInfo();

    // Short thread id.
    int tid() const { return _shortTid; }

    // Process id stored for this thread.
    pid_t pid() const { return _pid; }

    // Advances the API sequence number and returns the updated value.
    uint64_t incApiSeqNum() {
        _apiSeqNum += 1;
        return _apiSeqNum;
    }

    // Current API sequence number; does not modify it.
    uint64_t apiSeqNum() const { return _apiSeqNum; }

   private:
    int _shortTid;
    pid_t _pid;

    // Monotonically increasing API sequence number for this thread.
    uint64_t _apiSeqNum;
};
108 
// Holds a list of 64-bit trigger values for one thread.
// sort() orders the list in descending order, so after sorting the smallest pending
// trigger sits at the back and is what nextTrigger() reports.
struct ProfTrigger {
    // Sentinel returned by nextTrigger() when no trigger is registered.
    static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();

    // Prints every registered trigger to stdout as "tid:<tid>.<trigger>,".
    void print(int tid) const {
        std::cout << "Enabling tracing for ";
        for (const uint64_t trigger : _profTrigger) {
            std::cout << "tid:" << tid << "." << trigger << ",";
        }
        std::cout << "\n";
    }

    // Returns the last element of the list, or MAX_TRIGGER when the list is empty.
    uint64_t nextTrigger() const {
        if (_profTrigger.empty()) {
            return MAX_TRIGGER;
        }
        return _profTrigger.back();
    }

    // Appends a trigger; call sort() afterwards to restore descending order.
    void add(uint64_t trigger) { _profTrigger.push_back(trigger); }

    // Sorts the triggers in descending order. The comparator must use the full 64-bit type:
    // comparing through int would truncate large values and mis-order them.
    void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>()); }

   private:
    std::vector<uint64_t> _profTrigger;
};
127 
128 
129 //---
130 // Extern tls
131 extern thread_local hipError_t tls_lastHipError;
132 extern thread_local TidInfo tls_tidInfo;
133 extern thread_local bool tls_getPrimaryCtx;
134 
135 extern std::vector<ProfTrigger> g_dbStartTriggers;
136 extern std::vector<ProfTrigger> g_dbStopTriggers;
137 
138 //---
139 // Forward defs:
140 class ihipStream_t;
141 class ihipDevice_t;
142 class ihipCtx_t;
143 struct ihipEventData_t;
144 
145 // Color defs for debug messages:
146 #define KNRM "\x1B[0m"
147 #define KRED "\x1B[31m"
148 #define KGRN "\x1B[32m"
149 #define KYEL "\x1B[33m"
150 #define KBLU "\x1B[34m"
151 #define KMAG "\x1B[35m"
152 #define KCYN "\x1B[36m"
153 #define KWHT "\x1B[37m"
154 
155 extern const char* API_COLOR;
156 extern const char* API_COLOR_END;
157 
158 
159 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
160 // Can disable for performance or functional experiments - in this case
161 // the code uses a dummy "no-op" mutex.
162 #define EVENT_THREAD_SAFE 1
163 
164 #define STREAM_THREAD_SAFE 1
165 
166 #define CTX_THREAD_SAFE 1
167 
168 #define DEVICE_THREAD_SAFE 1
169 
170 
171 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
172 // May be set to 0 to remove debug if checks - possible code size and performance difference?
173 #define COMPILE_HIP_DB 1
174 
175 
176 // Compile HIP tracing capability.
177 // 0x1 = print a string at function entry with arguments.
178 // 0x2 = prints a simple message with function name + return code when function exits.
179 // 0x3 = print both.
180 // Must be enabled at runtime with HIP_TRACE_API
181 #define COMPILE_HIP_TRACE_API 0x3
182 
183 
184 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
185 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
186 #ifndef COMPILE_HIP_ATP_MARKER
187 #define COMPILE_HIP_ATP_MARKER 0
188 #endif
189 
190 
191 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
192 // boundary.
193 // TODO - currently we print the trace message at the beginning. If we waited, we could also
194 // print return codes, and any values returned through ptr-to-args (i.e. the pointers allocated
195 // by hipMalloc).
196 #if COMPILE_HIP_ATP_MARKER
197 #include "CXLActivityLogger.h"
198 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
199 #define MARKER_END() amdtEndMarker();
200 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
201 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
202 #else
203 // Swallow scoped markers:
204 #define MARKER_BEGIN(markerName, group)
205 #define MARKER_END()
206 #define RESUME_PROFILING
207 #define STOP_PROFILING
208 #endif
209 
210 
211 //---
212 // HIP Trace modes - use with HIP_TRACE_API=...
213 #define TRACE_ALL 0 // 0x01
214 #define TRACE_KCMD 1 // 0x02, kernel command
215 #define TRACE_MCMD 2 // 0x04, memory command
216 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
217 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
218 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
219 
220 
221 //---
222 // HIP_DB Debug flags:
223 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
224 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
225 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
226 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */
227 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
228 #define DB_FB 5 /* 0x20 - trace loading fat binary */
229 #define DB_MAX_FLAG 6
230 // When adding a new debug flag, also add to the char name table below.
231 //
232 //
233 
234 struct DbName {
235  const char* _color;
236  const char* _shortName;
237 };
238 
239 // This table must be kept in-sync with the defines above.
240 static const DbName dbName[] = {
241  {KGRN, "api"}, // not used,
242  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
243  {KBLU, "fatbin"},
244 };
245 
246 
247 #if COMPILE_HIP_DB
248 #define tprintf(trace_level, ...) \
249  { \
250  if (HIP_DB & (1 << (trace_level))) { \
251  char msgStr[1000]; \
252  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
253  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
254  dbName[trace_level]._shortName, tls_tidInfo.pid(), tls_tidInfo.tid(), msgStr, KNRM); \
255  } \
256  }
257 #else
258 /* Compile to empty code */
259 #define tprintf(trace_level, ...)
260 #endif
261 
262 
263 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
264 
265 //---
266 extern uint64_t recordApiTrace(std::string* fullStr, const std::string& apiStr);
267 
// API_TRACE(forceTrace, args...):
//   - increments the calling thread's API sequence number;
//   - when forceTrace is non-zero, or profiling / TRACE_ALL tracing is enabled, builds the
//     "<func> (<args>)" string, records it via recordApiTrace() and opens a profiling marker;
//   - declares the local `hipApiStartTick`, which ihipLogStatus() reads to print elapsed time.
// Both variants below declare `hipApiStartTick`, so ihipLogStatus() compiles in either one.
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#define API_TRACE(forceTrace, ...) \
    uint64_t hipApiStartTick = 0; \
    { \
        tls_tidInfo.incApiSeqNum(); \
        if ((forceTrace) || \
            (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
            std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
            std::string fullStr; \
            hipApiStartTick = recordApiTrace(&fullStr, apiStr); \
            if (HIP_PROFILE_API == 0x1) { \
                MARKER_BEGIN(__func__, "HIP") \
            } else if (HIP_PROFILE_API == 0x2) { \
                MARKER_BEGIN(fullStr.c_str(), "HIP"); \
            } \
        } \
    }

#else
// Swallow API_TRACE: no trace output, but keep the sequence number and the start-tick local.
#define API_TRACE(IS_CMD, ...) \
    uint64_t hipApiStartTick = 0; \
    tls_tidInfo.incApiSeqNum();
#endif
290 
291 
292 // Just initialize the HIP runtime, but don't log any trace information.
293 #define HIP_INIT() \
294  std::call_once(hip_initialized, ihipInit); \
295  ihipCtxStackUpdate();
296 #define HIP_SET_DEVICE() ihipDeviceSetState();
297 
298 
299 // This macro should be called at the beginning of every HIP API.
300 // It initializes the hip runtime (exactly once), and
301 // generates a trace string that can be output to stderr or to ATP file.
302 #define HIP_INIT_API(cid, ...) \
303  HIP_INIT() \
304  API_TRACE(0, __VA_ARGS__); \
305  HIP_CB_SPAWNER_OBJECT(cid);
306 
307 
308 // Like above, but will trace with a specified "special" bit.
309 // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
310 // kernel launches, copy commands, memory sets, etc.
311 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
312  HIP_INIT() \
313  API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \
314  HIP_CB_SPAWNER_OBJECT(cid);
315 
316 
317 // This macro should be called at the end of every HIP API, and only at the end of top-level hip
318 // APIs (not internal hip). It has a dual function: it logs the last error returned for use by
319 // hipGetLastError, and also prints the closing message when the debug trace is enabled.
320 #define ihipLogStatus(hipStatus) \
321  ({ \
322  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
323  tls_lastHipError = localHipStatus; \
324  \
325  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
326  auto ticks = getTicks() - hipApiStartTick; \
327  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
328  (localHipStatus == 0) ? API_COLOR : KRED, tls_tidInfo.pid(), tls_tidInfo.tid(), \
329  tls_tidInfo.apiSeqNum(), __func__, localHipStatus, \
330  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
331  } \
332  if (HIP_PROFILE_API) { \
333  MARKER_END(); \
334  } \
335  localHipStatus; \
336  })
337 
338 
339 class ihipException : public std::exception {
340  public:
341  explicit ihipException(hipError_t e) : _code(e){};
342 
343  hipError_t _code;
344 };
345 
346 
347 #ifdef __cplusplus
348 extern "C" {
349 #endif
350 
351 
352 #ifdef __cplusplus
353 }
354 #endif
355 
356 const hipStream_t hipStreamNull = 0x0;
357 
358 
362 #define HIP_IPC_RESERVED_SIZE 24
364  public:
365 #if USE_IPC
366  hsa_amd_ipc_memory_t ipc_handle;
367 #endif
368  size_t psize;
369  char reserved[HIP_IPC_RESERVED_SIZE];
370 };
371 
372 
373 struct ihipModule_t {
374  std::string fileName;
375  hsa_executable_t executable = {};
376  hsa_code_object_reader_t coReader = {};
377 
378  ~ihipModule_t() {
379  if (executable.handle) hsa_executable_destroy(executable);
380  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
381  }
382 };
383 
384 
385 //---
// No-op mutex: lock/unlock do nothing and try_lock always succeeds. Substituted for the real
// mutex to remove locking, either for performance or to provoke latent bugs.
class FakeMutex {
   public:
    void lock() { /* intentionally empty */ }

    bool try_lock() {
        const bool acquired = true;  // there is no state to contend on
        return acquired;
    }

    void unlock() { /* intentionally empty */ }
};
393 
394 #if EVENT_THREAD_SAFE
395 typedef std::mutex EventMutex;
396 #else
397 #warning "Stream thread-safe disabled"
398 typedef FakeMutex EventMutex;
399 #endif
400 
401 #if STREAM_THREAD_SAFE
402 typedef std::mutex StreamMutex;
403 #else
404 #warning "Stream thread-safe disabled"
405 typedef FakeMutex StreamMutex;
406 #endif
407 
408 // Pair Device and Ctx together, these could also be toggled separately if desired.
409 #if CTX_THREAD_SAFE
410 typedef std::mutex CtxMutex;
411 #else
412 typedef FakeMutex CtxMutex;
413 #warning "Ctx thread-safe disabled"
414 #endif
415 
416 #if DEVICE_THREAD_SAFE
417 typedef std::mutex DeviceMutex;
418 #else
419 typedef FakeMutex DeviceMutex;
420 #warning "Device thread-safe disabled"
421 #endif
422 
423 //
424 //---
425 // Protects access to the member _data with a lock acquired on construction and released on destruction.
426 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
427 template <typename T>
429  public:
430  LockedAccessor(T& criticalData, bool autoUnlock = true)
431  : _criticalData(&criticalData),
432  _autoUnlock(autoUnlock)
433 
434  {
435  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
436  ToString(_criticalData->_parent).c_str());
437  _criticalData->_mutex.lock();
438  };
439 
440  ~LockedAccessor() {
441  if (_autoUnlock) {
442  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
443  ToString(_criticalData->_parent).c_str());
444  _criticalData->_mutex.unlock();
445  }
446  }
447 
448  void unlock() {
449  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
450  ToString(_criticalData->_parent).c_str());
451  _criticalData->_mutex.unlock();
452  }
453 
454  // Syntactic sugar so -> can be used to get the underlying type.
455  T* operator->() { return _criticalData; };
456 
457  private:
458  T* _criticalData;
459  bool _autoUnlock;
460 };
461 
462 
// Base that owns a mutex of type MUTEX_TYPE and forwards lock / unlock / try_lock to it.
template <typename MUTEX_TYPE>
struct LockedBase {
    // Experts-only interface for explicit locking.
    // Most uses should go through the lock-accessor instead.
    void lock() { this->_mutex.lock(); }

    void unlock() { this->_mutex.unlock(); }

    // Returns whatever the underlying mutex's try_lock() reports.
    bool try_lock() {
        const bool acquired = this->_mutex.try_lock();
        return acquired;
    }

    MUTEX_TYPE _mutex;
};
473 
474 
475 template <typename MUTEX_TYPE>
476 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
477  public:
478  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
479  : _kernelCnt(0), _av(av), _parent(parentStream){};
480 
482 
485  return this;
486  };
487 
488  void munlock() {
489  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
490  ToString(this->_parent).c_str());
492  };
493 
495  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
496  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
497  ToString(this->_parent).c_str());
498  return gotLock ? this : nullptr;
499  };
500 
501  public:
502  ihipStream_t* _parent;
503  uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
504 
505  hc::accelerator_view _av;
506 
507  private:
508 };
509 
510 
511 // if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
512 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
513 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
514 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
515 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
516 // query.
517 
518 
521 
522 //---
523 // Internal stream structure.
525  public:
526  enum ScheduleMode { Auto, Spin, Yield };
527  typedef uint64_t SeqNum_t;
528 
529 // TODO - make av a reference to avoid shared_ptr overhead?
530  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
531  ~ihipStream_t();
532 
533  // kind is hipMemcpyKind
534  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
535  bool resolveOn = true);
536 
537  void locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
538  bool resolveOn = true);
539 
540  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
541 
542  void locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
543 
544  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
545  size_t offset, unsigned kind);
546  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
547  size_t offset, unsigned kind);
548 
549  //---
550  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
551  // the critical mutex.
552  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
553  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av);
554 
555 
556  void locked_wait();
557 
558  hc::accelerator_view* locked_getAv() {
559  LockedAccessor_StreamCrit_t crit(_criticalData);
560  return &(crit->_av);
561  };
562 
563  void locked_streamWaitEvent(ihipEventData_t& event);
564  hc::completion_future locked_recordEvent(hipEvent_t event);
565 
566  bool locked_eventIsReady(hipEvent_t event);
567  void locked_eventWaitComplete(hc::completion_future& marker, hc::hcWaitMode waitMode);
568 
569  ihipStreamCritical_t& criticalData() { return _criticalData; };
570 
571  //---
572  hc::hcWaitMode waitMode() const;
573 
574  // Use this if we already have the stream critical data mutex:
575  void wait(LockedAccessor_StreamCrit_t& crit);
576 
577  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
578  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
579  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
580  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
581  uint64_t kernel);
582 
583 
584  //-- Non-racy accessors:
585  // These functions access fields set at initialization time and are non-racy (so do not acquire
586  // mutex)
587  const ihipDevice_t* getDevice() const;
588  ihipCtx_t* getCtx() const;
589 
590  // Before calling this function, stream must be resolved from "0" to the actual stream:
591  bool isDefaultStream() const { return _id == 0; };
592 
593  public:
594  //---
595  // Public member vars - these are set at initialization and never change:
596  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
597  unsigned _flags;
598 
599 
600  private:
601  // The unsigned return is hipMemcpyKind
602  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
603  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
604  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
605  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
606 
607  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
608  const hc::AmPointerInfo* srcInfo);
609 
610  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
611 
612 
613  public: // TODO - move private
614  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
615  ihipStreamCritical_t _criticalData;
616 
617  private: // Data
618  std::mutex _hasQueueLock;
619 
620  ihipCtx_t* _ctx; // parent context that owns this stream.
621 
622  // Friends:
623  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
624  friend hipError_t hipStreamQuery(hipStream_t);
625 
626  ScheduleMode _scheduleMode;
627 };
628 
629 
630 //----
631 // Internal structure for stream callback handler
633  public:
634  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
635  : _stream(stream), _callback(callback), _userData(userData) {
636  };
637  hipStream_t _stream;
638  hipStreamCallback_t _callback;
639  void* _userData;
640 };
641 
642 
643 //----
644 // Internal event structure:
// Lifecycle state of an event.
enum hipEventStatus_t {
    // Event is uninitialized; it must be "Created" before use.
    hipEventStatusUnitialized = 0,
    // Event created, but not yet Recorded.
    hipEventStatusCreated = 1,
    // Event has been recorded into a stream but has not completed yet.
    hipEventStatusRecording = 2,
    // Event has been recorded - timestamps are valid.
    hipEventStatusComplete = 3,
};
651 
// TODO - rename to ihip type of some kind
// Kind of event. Values are spelled out; they match the implicit numbering.
enum ihipEventType_t {
    hipEventTypeIndependent = 0,
    hipEventTypeStartCommand = 1,
    hipEventTypeStopCommand = 2,
};
658 
659 
661  ihipEventData_t() {
662  _state = hipEventStatusCreated;
663  _stream = NULL;
664  _timestamp = 0;
665  _type = hipEventTypeIndependent;
666  };
667 
668  void marker(const hc::completion_future& marker) { _marker = marker; };
669  hc::completion_future& marker() { return _marker; }
670  uint64_t timestamp() const { return _timestamp; };
671  ihipEventType_t type() const { return _type; };
672 
673  ihipEventType_t _type;
674  hipEventStatus_t _state;
675  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
676  // stream when recorded
677  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
678  private:
679  hc::completion_future _marker;
680 };
681 
682 
683 //=============================================================================
684 // class ihipEventCriticalBase_t
685 template <typename MUTEX_TYPE>
686 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
687  public:
688  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
690 
691  // Keep data in structure so it can be easily copied into snapshots
692  // (used to reduce lock contention and preserve correct lock order)
693  ihipEventData_t _eventData;
694 
695  private:
696  const ihipEvent_t* _parent;
697  friend class LockedAccessor<ihipEventCriticalBase_t>;
698 };
699 
701 
703 
704 // internal hip event structure.
705 class ihipEvent_t {
706  public:
707  explicit ihipEvent_t(unsigned flags);
708  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
709  ihipEventType_t eventType);
710  std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
711 
712 
713  // Return a copy of the critical state. The critical data is locked during the copy.
714  ihipEventData_t locked_copyCrit() {
715  LockedAccessor_EventCrit_t crit(_criticalData);
716  return _criticalData._eventData;
717  };
718 
719  ihipEventCritical_t& criticalData() { return _criticalData; };
720 
721  public:
722  unsigned _flags;
723 
724  private:
725  ihipEventCritical_t _criticalData;
726 
727  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
728 };
729 
730 
731 //=============================================================================
732 // class ihipDeviceCriticalBase_t
733 template <typename MUTEX_TYPE>
735  public:
736  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
737  : _parent(parentDevice), _ctxCount(0){};
738 
740 
741  // Contexts:
742  void addContext(ihipCtx_t* ctx);
743  void removeContext(ihipCtx_t* ctx);
744  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
745  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
746  int getcount() { return _ctxCount; };
747  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
748 
749  private:
750  ihipDevice_t* _parent;
751 
752  //--- Context Tracker:
753  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
754 
755  int _ctxCount;
756 };
757 
759 
761 
762 //----
763 // Properties of the HIP device.
764 // Multiple contexts can point to same device.
766  public:
767  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
768  ~ihipDevice_t();
769 
770  // Accessors:
771  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
772  void locked_removeContext(ihipCtx_t* c);
773  void locked_reset();
774  ihipDeviceCritical_t& criticalData() { return _criticalData; };
775 
776  public:
777  unsigned _deviceId; // device ID
778 
779  hc::accelerator _acc;
780  hsa_agent_t _hsaAgent; // hsa agent handle
781 
783  unsigned _computeUnits;
784  hipDeviceProp_t _props; // saved device properties.
785 
786  // TODO - report this through device properties, base on HCC API call.
787  int _isLargeBar;
788 
789  ihipCtx_t* _primaryCtx;
790 
791  int _state; // 1 if device is set otherwise 0
792 
793  private:
794  hipError_t initProperties(hipDeviceProp_t* prop);
795 
796  private:
797  ihipDeviceCritical_t _criticalData;
798 };
799 //=============================================================================
800 
801 
802 //---
803 //
804 struct ihipExec_t {
805  dim3 _gridDim;
806  dim3 _blockDim;
807  size_t _sharedMem;
808  hipStream_t _hStream;
809  std::vector<char> _arguments;
810 };
811 
812 //=============================================================================
813 // class ihipCtxCriticalBase_t
814 template <typename MUTEX_TYPE>
815 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
816  public:
817  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
818  : _parent(parentCtx), _peerCnt(0) {
819  _peerAgents = new hsa_agent_t[deviceCnt];
820  };
821 
823  if (_peerAgents != nullptr) {
824  delete _peerAgents;
825  _peerAgents = nullptr;
826  }
827  _peerCnt = 0;
828  }
829 
830  // Streams:
831  void addStream(ihipStream_t* stream);
832  std::list<ihipStream_t*>& streams() { return _streams; };
833  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
834 
835 
836  // Peer Accessor classes:
837  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
838  // physically located on this device.
839  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
840  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
841  void resetPeerWatchers(ihipCtx_t* thisDevice);
842  void printPeerWatchers(FILE* f) const;
843 
844  uint32_t peerCnt() const { return _peerCnt; };
845  hsa_agent_t* peerAgents() const { return _peerAgents; };
846 
847 
848  // TODO - move private
849  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
850  //--- Execution stack:
851  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
852 
853  friend class LockedAccessor<ihipCtxCriticalBase_t>;
854 
855  private:
856  ihipCtx_t* _parent;
857 
858  //--- Stream Tracker:
859  std::list<ihipStream_t*> _streams; // streams associated with this device.
860 
861 
862  //--- Peer Tracker:
863  // These reflect the currently Enabled set of peers for this GPU:
864  // Enabled peers have permissions to access the memory physically allocated on this device.
865  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
866  uint32_t _peerCnt; // number of enabled peers
867  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
868  private:
869  void recomputePeerAgents();
870 };
871 // Note Mutex type Real/Fake selected based on CtxMutex
873 
874 // This type is used by functions that need access to the critical device structures.
876 //=============================================================================
877 
878 
879 //=============================================================================
880 // class ihipCtx_t:
881 // A HIP CTX (context) points at one of the existing devices and contains the streams,
882 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
883 // device.
884 //
885 class ihipCtx_t {
886  public: // Functions:
887  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
888  unsigned flags); // note: calls constructor for _criticalData
889  ~ihipCtx_t();
890 
891  // Functions which read or write the critical data are named locked_.
892  // (might be better called "locking_"
893  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
894  // locked_ function from within a locked_ function. External functions which call several
895  // locked_ functions will acquire and release the lock for each function. if this occurs in
896  // performance-sensitive code we may want to refactor by adding non-locked functions and
897  // creating a new locked_ member function to call them all.
898  void locked_removeStream(ihipStream_t* s);
899  void locked_reset();
900  void locked_waitAllStreams();
901  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
902 
903  ihipCtxCritical_t& criticalData() { return _criticalData; };
904 
905  const ihipDevice_t* getDevice() const { return _device; };
906  int getDeviceNum() const { return _device->_deviceId; };
907 
908  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
909  ihipDevice_t* getWriteableDevice() const { return _device; };
910 
911  std::string toString() const;
912 
913  public: // Data
914  // The NULL stream is used if no other stream is specified.
915  // Default stream has special synchronization properties with other streams.
916  ihipStream_t* _defaultStream;
917 
918  // Flags specified when the context is created:
919  unsigned _ctxFlags;
920 
921  private:
922  ihipDevice_t* _device;
923 
924 
925  private: // Critical data, protected with locked access:
926  // Members of _protected data MUST be accessed through the LockedAccessor.
927  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
928  // directly.
929  ihipCtxCritical_t _criticalData;
930 };
931 
932 
933 //=================================================================================================
934 // Global variable declarations (defined elsewhere):
935 extern std::once_flag hip_initialized;
936 extern unsigned g_deviceCnt;
937 extern hsa_agent_t g_cpu_agent; // the CPU agent.
938 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
939 
940 //=================================================================================================
941 // Extern functions:
942 extern void ihipInit();
943 extern const char* ihipErrorString(hipError_t);
944 extern ihipCtx_t* ihipGetTlsDefaultCtx();
945 extern void ihipSetTlsDefaultCtx(ihipCtx_t* ctx);
946 extern hipError_t ihipSynchronize(void);
947 extern void ihipCtxStackUpdate();
948 extern hipError_t ihipDeviceSetState();
949 
950 extern ihipDevice_t* ihipGetDevice(int);
951 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
952 
953 
954 hipStream_t ihipSyncAndResolveStream(hipStream_t);
955 hipError_t ihipStreamSynchronize(hipStream_t stream);
956 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
957 
958 // Stream printf functions:
959 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
960  os << "stream:";
961  os << s.getDevice()->_deviceId;
962  ;
963  os << '.';
964  os << s._id;
965  return os;
966 }
967 
968 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
969  os << '{';
970  os << s.x;
971  os << ',';
972  os << s.y;
973  os << ',';
974  os << s.z;
975  os << '}';
976  return os;
977 }
978 
979 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
980  os << '{';
981  os << s.x;
982  os << ',';
983  os << s.y;
984  os << ',';
985  os << s.z;
986  os << '}';
987  return os;
988 }
989 
990 // Stream printf functions:
991 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
992  os << "event:" << std::hex << static_cast<void*>(e);
993  return os;
994 }
995 
996 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
997  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
998  return os;
999 }
1000 
1001 
1002 // Helper functions that are used across src files:
1003 namespace hip_internal {
1004 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1005  hipStream_t stream);
1006 };
1007 
1008 
1009 #endif
Definition: hip_hcc_internal.h:234
Definition: hip_hcc_internal.h:765
Definition: hip_hcc_internal.h:464
Definition: hip_hcc_internal.h:109
friend hipError_t hipStreamQuery(hipStream_t)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:154
Definition: hip_hcc_internal.h:387
Definition: hip_hcc_internal.h:363
uint32_t x
x
Definition: hip_runtime_api.h:241
Definition: grid_launch.h:17
Definition: hip_hcc_internal.h:804
Definition: hip_hcc_internal.h:885
Definition: hip_runtime_api.h:240
uint32_t y
y
Definition: hip_runtime_api.h:242
Definition: hip_hcc_internal.h:632
Definition: hip_hcc_internal.h:734
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:783
uint32_t z
z
Definition: hip_runtime_api.h:243
friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:110
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:660
Definition: hip_hcc_internal.h:686
Definition: hip_hcc_internal.h:339
Definition: hip_hcc_internal.h:705
Definition: hip_hcc_internal.h:815
Definition: hip_hcc_internal.h:524
Definition: hip_hcc_internal.h:476
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:366
Definition: hip_hcc_internal.h:428
Definition: hip_hcc_internal.h:373
Definition: hip_hcc_internal.h:92
void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void *userData)
Definition: hip_runtime_api.h:791