Actual source code: syclcontext.sycl.cxx

  1: #include "sycldevice.hpp"
  2: #include <sycl/sycl.hpp>
  3: #include <chrono>

  5: namespace Petsc
  6: {

  8: namespace device
  9: {

 11: namespace sycl
 12: {

 14: namespace impl
 15: {

 17: class DeviceContext {
 18: public:
 19:   struct PetscDeviceContext_SYCL {
 20:     ::sycl::event event;
 21:     ::sycl::event begin; // timer-only
 22:     ::sycl::event end;   // timer-only
 23: #if PetscDefined(USE_DEBUG)
 24:     PetscBool timerInUse{};
 25: #endif
 26:     ::sycl::queue queue;

 28:     std::chrono::time_point<std::chrono::steady_clock> timeBegin{};
 29:   };

 31: private:
 32:   static bool initialized_;

 34:   static PetscErrorCode finalize_() noexcept
 35:   {
 36:     PetscFunctionBegin;
 37:     initialized_ = false;
 38:     PetscFunctionReturn(PETSC_SUCCESS);
 39:   }

 41:   static PetscErrorCode initialize_(PetscInt id, PetscDeviceContext dctx) noexcept
 42:   {
 43:     PetscFunctionBegin;
 44:     PetscCall(PetscDeviceCheckDeviceCount_Internal(id));
 45:     if (!initialized_) {
 46:       initialized_ = true;
 47:       PetscCall(PetscRegisterFinalize(finalize_));
 48:     }
 49:     PetscFunctionReturn(PETSC_SUCCESS);
 50:   }

 52: public:
 53:   const struct _DeviceContextOps ops = {destroy, changeStreamType, setUp, query, waitForContext, synchronize, getBlasHandle, getSolverHandle, getStreamHandle, beginTimer, endTimer, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};

 55:   // default constructor
 56:   DeviceContext() noexcept = default;

 58:   // All of these functions MUST be static in order to be callable from C, otherwise they
 59:   // get the implicit 'this' pointer tacked on
 60:   static PetscErrorCode destroy(PetscDeviceContext dctx) noexcept
 61:   {
 62:     PetscFunctionBegin;
 63:     delete static_cast<PetscDeviceContext_SYCL *>(dctx->data);
 64:     dctx->data = nullptr;
 65:     PetscFunctionReturn(PETSC_SUCCESS);
 66:   }

 68:   static PetscErrorCode setUp(PetscDeviceContext dctx) noexcept
 69:   {
 70:     PetscDevice dev;
 71:     PetscInt    id;

 73:     PetscFunctionBegin;
 74: #if PetscDefined(USE_DEBUG)
 75:     static_cast<PetscDeviceContext_SYCL *>(dctx->data)->timerInUse = PETSC_FALSE;
 76: #endif
 77:     PetscCall(PetscDeviceContextGetDevice(dctx, &dev));
 78:     PetscCall(PetscDeviceGetDeviceId(dev, &id));
 79:     const ::sycl::device &syclDevice = (id == PETSC_SYCL_DEVICE_HOST) ? ::sycl::device(::sycl::cpu_selector_v) : ::sycl::device::get_devices(::sycl::info::device_type::gpu)[id];

 81:     static_cast<PetscDeviceContext_SYCL *>(dctx->data)->queue = ::sycl::queue(syclDevice, ::sycl::property::queue::in_order());
 82:     PetscFunctionReturn(PETSC_SUCCESS);
 83:   }

 85:   static PetscErrorCode query(PetscDeviceContext dctx, PetscBool *idle) noexcept
 86:   {
 87:     PetscFunctionBegin;
 88:     // available in future, https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_oneapi_queue_empty.asciidoc
 89:     // *idle = static_cast(dctx->data)->queue.empty() ? PETSC_TRUE : PETSC_FALSE;
 90:     *idle = PETSC_FALSE;
 91:     PetscFunctionReturn(PETSC_SUCCESS);
 92:   }

 94:   static PetscErrorCode synchronize(PetscDeviceContext dctx) noexcept
 95:   {
 96:     PetscBool  idle = PETSC_TRUE;
 97:     const auto dci  = static_cast<PetscDeviceContext_SYCL *>(dctx->data);

 99:     PetscFunctionBegin;
100:     PetscCall(query(dctx, &idle));
101:     if (!idle) PetscCallCXX(dci->queue.wait());
102:     PetscFunctionReturn(PETSC_SUCCESS);
103:   }

105:   static PetscErrorCode getStreamHandle(PetscDeviceContext dctx, void **handle) noexcept
106:   {
107:     PetscFunctionBegin;
108:     *reinterpret_cast<::sycl::queue **>(handle) = &(static_cast<PetscDeviceContext_SYCL *>(dctx->data)->queue);
109:     PetscFunctionReturn(PETSC_SUCCESS);
110:   }

112:   static PetscErrorCode beginTimer(PetscDeviceContext dctx) noexcept
113:   {
114:     const auto dci = static_cast<PetscDeviceContext_SYCL *>(dctx->data);

116:     PetscFunctionBegin;
117: #if PetscDefined(USE_DEBUG)
118:     PetscCheck(!dci->timerInUse, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Forgot to call PetscLogGpuTimeEnd()?");
119:     dci->timerInUse = PETSC_TRUE;
120: #endif
121:     // It is not a good approach to time SYCL kernels because the timer starts at the kernel launch time at host,
122:     // not at the start of execution time on device. SYCL provides this style of kernel timing:
123:     /*
124:       sycl::queue q(sycl::default_selector_v, sycl::property::queue::enable_profiling{});
125:       sycl::event e = q.submit([&](sycl::handler &h) {
126:         ...
127:       });
128:       e.wait();
129:       auto start_time = e.get_profiling_info<sycl::info::event_profiling::command_start>();
130:       auto end_time = e.get_profiling_info<sycl::info::event_profiling::command_end>();
131:       long long kernel_duration_ns = end_time - start_time;
132:     */
133:     // It requires 1) enable profiling at the queue's creation time, and 2) store the event returned by kernel launch.
134:     // But neither we have control of the input queue, nor does PetscDeviceContext support 2), so we just use a
135:     // host side timer.
136:     PetscCallCXX(dci->timeBegin = std::chrono::steady_clock::now());
137:     PetscFunctionReturn(PETSC_SUCCESS);
138:   }

140:   static PetscErrorCode endTimer(PetscDeviceContext dctx, PetscLogDouble *elapsed) noexcept
141:   {
142:     const auto                    dci = static_cast<PetscDeviceContext_SYCL *>(dctx->data);
143:     std::chrono::duration<double> duration;

145:     PetscFunctionBegin;
146: #if PetscDefined(USE_DEBUG)
147:     PetscCheck(dci->timerInUse, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Forgot to call PetscLogGpuTimeBegin()?");
148:     dci->timerInUse = PETSC_FALSE;
149: #endif
150:     PetscCallCXX(dci->queue.wait());
151:     PetscCallCXX(duration = std::chrono::steady_clock::now() - dci->timeBegin);
152:     PetscCallCXX(*elapsed = duration.count());
153:     PetscFunctionReturn(PETSC_SUCCESS);
154:   }

156:   static PetscErrorCode changeStreamType(PetscDeviceContext, PetscStreamType) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
157:   static PetscErrorCode waitForContext(PetscDeviceContext, PetscDeviceContext) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
158:   static PetscErrorCode getBlasHandle(PetscDeviceContext, void *) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
159:   static PetscErrorCode getSolverHandle(PetscDeviceContext, void *) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
160: };

162: } // namespace impl

164: } // namespace sycl

166: } // namespace device

168: } // namespace Petsc

170: PetscErrorCode PetscDeviceContextCreate_SYCL(PetscDeviceContext dctx)
171: {
172:   using namespace Petsc::device::sycl::impl;

174:   static const DeviceContext syclctx;

176:   PetscFunctionBegin;
177:   PetscCallCXX(dctx->data = new DeviceContext::PetscDeviceContext_SYCL());
178:   dctx->ops[0] = syclctx.ops;
179:   PetscFunctionReturn(PETSC_SUCCESS);
180: }