Actual source code: veckok.kokkos.cxx
1: /*
2: Implements the sequential Kokkos vectors.
3: */
4: #include <petsc_kokkos.hpp>
5: #include <petscvec_kokkos.hpp>
7: #include <petsc/private/sfimpl.h>
8: #include <petsc/private/petscimpl.h>
9: #include <petscmath.h>
10: #include <petscviewer.h>
11: #include <KokkosBlas.hpp>
12: #include <Kokkos_Functional.hpp>
14: #include <../src/vec/vec/impls/dvecimpl.h>
15: #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
17: template <class MemorySpace>
18: static PetscErrorCode VecGetKokkosView_Private(Vec v, PetscScalarKokkosViewType<MemorySpace> *kv, PetscBool overwrite)
19: {
20: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
22: PetscFunctionBegin;
23: VecErrorIfNotKokkos(v);
24: if (!overwrite) { /* If overwrite=true, no need to sync the space, since caller will overwrite the data */
25: PetscCall(KokkosDualViewSync<MemorySpace>(veckok->v_dual, PetscGetKokkosExecutionSpace()));
26: }
27: *kv = veckok->v_dual.view<MemorySpace>();
28: PetscFunctionReturn(PETSC_SUCCESS);
29: }
31: template <class MemorySpace>
32: static PetscErrorCode VecRestoreKokkosView_Private(Vec v, PetscScalarKokkosViewType<MemorySpace> *kv, PetscBool overwrite)
33: {
34: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
36: PetscFunctionBegin;
37: VecErrorIfNotKokkos(v);
38: if (overwrite) veckok->v_dual.clear_sync_state(); /* If overwrite=true, clear the old sync state since user forced an overwrite */
39: veckok->v_dual.modify<MemorySpace>();
40: PetscCall(PetscObjectStateIncrease((PetscObject)v));
41: PetscFunctionReturn(PETSC_SUCCESS);
42: }
44: template <class MemorySpace>
45: PetscErrorCode VecGetKokkosView(Vec v, ConstPetscScalarKokkosViewType<MemorySpace> *kv)
46: {
47: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
49: PetscFunctionBegin;
50: VecErrorIfNotKokkos(v);
51: PetscCall(KokkosDualViewSync<MemorySpace>(veckok->v_dual, PetscGetKokkosExecutionSpace()));
52: *kv = veckok->v_dual.view<MemorySpace>();
53: PetscFunctionReturn(PETSC_SUCCESS);
54: }
56: /* Function template explicit instantiation */
57: template PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec, ConstPetscScalarKokkosView *);
58: template <>
59: PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec v, PetscScalarKokkosView *kv)
60: {
61: return VecGetKokkosView_Private(v, kv, PETSC_FALSE);
62: }
63: template <>
64: PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosView(Vec v, PetscScalarKokkosView *kv)
65: {
66: return VecRestoreKokkosView_Private(v, kv, PETSC_FALSE);
67: }
68: template <>
69: PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosViewWrite(Vec v, PetscScalarKokkosView *kv)
70: {
71: return VecGetKokkosView_Private(v, kv, PETSC_TRUE);
72: }
73: template <>
74: PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosViewWrite(Vec v, PetscScalarKokkosView *kv)
75: {
76: return VecRestoreKokkosView_Private(v, kv, PETSC_TRUE);
77: }
79: #if !defined(KOKKOS_ENABLE_UNIFIED_MEMORY) /* Get host views if the default memory space is not host space */
80: template PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec, ConstPetscScalarKokkosViewHost *);
81: template <>
82: PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec v, PetscScalarKokkosViewHost *kv)
83: {
84: return VecGetKokkosView_Private(v, kv, PETSC_FALSE);
85: }
86: template <>
87: PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosView(Vec v, PetscScalarKokkosViewHost *kv)
88: {
89: return VecRestoreKokkosView_Private(v, kv, PETSC_FALSE);
90: }
91: template <>
92: PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosViewWrite(Vec v, PetscScalarKokkosViewHost *kv)
93: {
94: return VecGetKokkosView_Private(v, kv, PETSC_TRUE);
95: }
96: template <>
97: PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosViewWrite(Vec v, PetscScalarKokkosViewHost *kv)
98: {
99: return VecRestoreKokkosView_Private(v, kv, PETSC_TRUE);
100: }
101: #endif
103: PetscErrorCode VecSetRandom_SeqKokkos(Vec xin, PetscRandom r)
104: {
105: const PetscInt n = xin->map->n;
106: PetscScalar *xx;
108: PetscFunctionBegin;
109: PetscCall(VecGetArrayWrite(xin, &xx)); /* TODO: generate randoms directly on device */
110: for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValue(r, &xx[i]));
111: PetscCall(VecRestoreArrayWrite(xin, &xx));
112: PetscFunctionReturn(PETSC_SUCCESS);
113: }
115: /* x = |x| */
116: PetscErrorCode VecAbs_SeqKokkos(Vec xin)
117: {
118: PetscScalarKokkosView xv;
119: auto exec = PetscGetKokkosExecutionSpace();
121: PetscFunctionBegin;
122: PetscCall(PetscLogGpuTimeBegin());
123: PetscCall(VecGetKokkosView(xin, &xv));
124: PetscCallCXX(KokkosBlas::abs(exec, xv, xv));
125: PetscCall(VecRestoreKokkosView(xin, &xv));
126: PetscCall(PetscLogGpuTimeEnd());
127: PetscFunctionReturn(PETSC_SUCCESS);
128: }
130: /* x = 1/x */
131: PetscErrorCode VecReciprocal_SeqKokkos(Vec xin)
132: {
133: PetscScalarKokkosView xv;
135: PetscFunctionBegin;
136: PetscCall(PetscLogGpuTimeBegin());
137: PetscCall(VecGetKokkosView(xin, &xv));
138: PetscCallCXX(Kokkos::parallel_for(
139: "VecReciprocal", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) {
140: if (xv(i) != (PetscScalar)0.0) xv(i) = (PetscScalar)1.0 / xv(i);
141: }));
142: PetscCall(VecRestoreKokkosView(xin, &xv));
143: PetscCall(PetscLogGpuTimeEnd());
144: PetscFunctionReturn(PETSC_SUCCESS);
145: }
147: PetscErrorCode VecMin_SeqKokkos(Vec xin, PetscInt *p, PetscReal *val)
148: {
149: ConstPetscScalarKokkosView xv;
150: Kokkos::MinLoc<PetscReal, PetscInt>::value_type result;
152: PetscFunctionBegin;
153: PetscCall(PetscLogGpuTimeBegin());
154: PetscCall(VecGetKokkosView(xin, &xv));
155: PetscCallCXX(Kokkos::parallel_reduce(
156: "VecMin", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n),
157: KOKKOS_LAMBDA(const PetscInt &i, Kokkos::MinLoc<PetscReal, PetscInt>::value_type &lupdate) {
158: if (PetscRealPart(xv(i)) < lupdate.val) {
159: lupdate.val = PetscRealPart(xv(i));
160: lupdate.loc = i;
161: }
162: },
163: Kokkos::MinLoc<PetscReal, PetscInt>(result)));
164: *val = result.val;
165: if (p) *p = result.loc;
166: PetscCall(VecRestoreKokkosView(xin, &xv));
167: PetscCall(PetscLogGpuTimeEnd());
168: PetscFunctionReturn(PETSC_SUCCESS);
169: }
171: PetscErrorCode VecMax_SeqKokkos(Vec xin, PetscInt *p, PetscReal *val)
172: {
173: ConstPetscScalarKokkosView xv;
174: Kokkos::MaxLoc<PetscReal, PetscInt>::value_type result;
176: PetscFunctionBegin;
177: PetscCall(PetscLogGpuTimeBegin());
178: PetscCall(VecGetKokkosView(xin, &xv));
179: PetscCallCXX(Kokkos::parallel_reduce(
180: "VecMax", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n),
181: KOKKOS_LAMBDA(const PetscInt &i, Kokkos::MaxLoc<PetscReal, PetscInt>::value_type &lupdate) {
182: if (PetscRealPart(xv(i)) > lupdate.val) {
183: lupdate.val = PetscRealPart(xv(i));
184: lupdate.loc = i;
185: }
186: },
187: Kokkos::MaxLoc<PetscReal, PetscInt>(result)));
188: *val = result.val;
189: if (p) *p = result.loc;
190: PetscCall(VecRestoreKokkosView(xin, &xv));
191: PetscCall(PetscLogGpuTimeEnd());
192: PetscFunctionReturn(PETSC_SUCCESS);
193: }
195: PetscErrorCode VecSum_SeqKokkos(Vec xin, PetscScalar *sum)
196: {
197: ConstPetscScalarKokkosView xv;
199: PetscFunctionBegin;
200: PetscCall(PetscLogGpuTimeBegin());
201: PetscCall(VecGetKokkosView(xin, &xv));
202: PetscCallCXX(*sum = KokkosBlas::sum(PetscGetKokkosExecutionSpace(), xv));
203: PetscCall(VecRestoreKokkosView(xin, &xv));
204: PetscCall(PetscLogGpuTimeEnd());
205: PetscFunctionReturn(PETSC_SUCCESS);
206: }
208: PetscErrorCode VecShift_SeqKokkos(Vec xin, PetscScalar shift)
209: {
210: PetscScalarKokkosView xv;
212: PetscFunctionBegin;
213: PetscCall(PetscLogGpuTimeBegin());
214: PetscCall(VecGetKokkosView(xin, &xv));
215: PetscCallCXX(Kokkos::parallel_for("VecShift", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) { xv(i) += shift; }); PetscCall(VecRestoreKokkosView(xin, &xv)));
216: PetscCall(PetscLogGpuTimeEnd());
217: PetscCall(PetscLogGpuFlops(xin->map->n));
218: PetscFunctionReturn(PETSC_SUCCESS);
219: }
221: /* y = alpha x + y */
222: PetscErrorCode VecAXPY_SeqKokkos(Vec yin, PetscScalar alpha, Vec xin)
223: {
224: PetscFunctionBegin;
225: if (alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
226: if (yin == xin) {
227: PetscCall(VecScale_SeqKokkos(yin, alpha + 1));
228: } else {
229: PetscBool xiskok, yiskok;
231: PetscCall(PetscObjectTypeCompareAny((PetscObject)xin, &xiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
232: PetscCall(PetscObjectTypeCompareAny((PetscObject)yin, &yiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
233: if (xiskok && yiskok) {
234: PetscScalarKokkosView yv;
235: ConstPetscScalarKokkosView xv;
237: PetscCall(PetscLogGpuTimeBegin());
238: PetscCall(VecGetKokkosView(xin, &xv));
239: PetscCall(VecGetKokkosView(yin, &yv));
240: PetscCallCXX(KokkosBlas::axpy(PetscGetKokkosExecutionSpace(), alpha, xv, yv));
241: PetscCall(VecRestoreKokkosView(xin, &xv));
242: PetscCall(VecRestoreKokkosView(yin, &yv));
243: PetscCall(PetscLogGpuTimeEnd());
244: PetscCall(PetscLogGpuFlops(2.0 * yin->map->n));
245: } else {
246: PetscCall(VecAXPY_Seq(yin, alpha, xin));
247: }
248: }
249: PetscFunctionReturn(PETSC_SUCCESS);
250: }
252: /* y = x + beta y */
253: PetscErrorCode VecAYPX_SeqKokkos(Vec yin, PetscScalar beta, Vec xin)
254: {
255: PetscFunctionBegin;
256: /* One needs to define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 to have optimizations for cases alpha/beta = 0,+/-1 */
257: PetscCall(VecAXPBY_SeqKokkos(yin, 1.0, beta, xin));
258: PetscFunctionReturn(PETSC_SUCCESS);
259: }
261: /* z = y^T x */
262: PetscErrorCode VecTDot_SeqKokkos(Vec xin, Vec yin, PetscScalar *z)
263: {
264: ConstPetscScalarKokkosView xv, yv;
266: PetscFunctionBegin;
267: PetscCall(PetscLogGpuTimeBegin());
268: PetscCall(VecGetKokkosView(xin, &xv));
269: PetscCall(VecGetKokkosView(yin, &yv));
270: // Kokkos always overwrites z, so no need to init it
271: PetscCallCXX(Kokkos::parallel_reduce("VecTDot", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &update) { update += yv(i) * xv(i); }, *z));
272: PetscCall(VecRestoreKokkosView(yin, &yv));
273: PetscCall(VecRestoreKokkosView(xin, &xv));
274: PetscCall(PetscLogGpuTimeEnd());
275: if (xin->map->n > 0) PetscCall(PetscLogGpuFlops(2.0 * xin->map->n));
276: PetscFunctionReturn(PETSC_SUCCESS);
277: }
279: struct TransposeDotTag { };
280: struct ConjugateDotTag { };
282: template <PetscInt ValueCount>
283: struct MDotFunctor {
284: static_assert(ValueCount >= 1 && ValueCount <= 8, "ValueCount must be in [1, 8]");
285: /* Note the C++ notation for an array typedef */
286: // noted, thanks
287: typedef PetscScalar value_type[];
288: typedef ConstPetscScalarKokkosView::size_type size_type;
290: /* Tell Kokkos the result array's number of entries. This must be a public value in the functor */
291: static constexpr size_type value_count = ValueCount;
292: ConstPetscScalarKokkosView xv, yv[8];
294: MDotFunctor(ConstPetscScalarKokkosView &xv, ConstPetscScalarKokkosView &yv0, ConstPetscScalarKokkosView &yv1, ConstPetscScalarKokkosView &yv2, ConstPetscScalarKokkosView &yv3, ConstPetscScalarKokkosView &yv4, ConstPetscScalarKokkosView &yv5, ConstPetscScalarKokkosView &yv6, ConstPetscScalarKokkosView &yv7) :
295: xv(xv)
296: {
297: yv[0] = yv0;
298: yv[1] = yv1;
299: yv[2] = yv2;
300: yv[3] = yv3;
301: yv[4] = yv4;
302: yv[5] = yv5;
303: yv[6] = yv6;
304: yv[7] = yv7;
305: }
307: KOKKOS_INLINE_FUNCTION void operator()(TransposeDotTag, const size_type i, value_type sum) const
308: {
309: PetscScalar xval = xv(i);
310: for (size_type j = 0; j < value_count; ++j) sum[j] += yv[j](i) * xval;
311: }
313: KOKKOS_INLINE_FUNCTION void operator()(ConjugateDotTag, const size_type i, value_type sum) const
314: {
315: PetscScalar xval = xv(i);
316: for (size_type j = 0; j < value_count; ++j) sum[j] += PetscConj(yv[j](i)) * xval;
317: }
319: // Per https://kokkos.github.io/kokkos-core-wiki/API/core/parallel-dispatch/parallel_reduce.html#requirements
320: // "when specifying a tag in the policy, the functor's potential init/join/final member functions must also be tagged"
321: // So we have this kind of duplicated code.
322: KOKKOS_INLINE_FUNCTION void join(TransposeDotTag, value_type dst, const value_type src) const { join(dst, src); }
323: KOKKOS_INLINE_FUNCTION void join(ConjugateDotTag, value_type dst, const value_type src) const { join(dst, src); }
325: KOKKOS_INLINE_FUNCTION void init(TransposeDotTag, value_type sum) const { init(sum); }
326: KOKKOS_INLINE_FUNCTION void init(ConjugateDotTag, value_type sum) const { init(sum); }
328: KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
329: {
330: for (size_type j = 0; j < value_count; j++) dst[j] += src[j];
331: }
333: KOKKOS_INLINE_FUNCTION void init(value_type sum) const
334: {
335: for (size_type j = 0; j < value_count; j++) sum[j] = 0.0;
336: }
337: };
339: template <class WorkTag>
340: PetscErrorCode VecMultiDot_Private(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
341: {
342: PetscInt i, j, cur = 0, ngroup = nv / 8, rem = nv % 8, N = xin->map->n;
343: ConstPetscScalarKokkosView xv, yv[8];
344: PetscScalarKokkosViewHost zv(z, nv);
345: auto exec = PetscGetKokkosExecutionSpace();
347: PetscFunctionBegin;
348: PetscCall(VecGetKokkosView(xin, &xv));
349: for (i = 0; i < ngroup; i++) { /* 8 y's per group */
350: for (j = 0; j < 8; j++) PetscCall(VecGetKokkosView(yin[cur + j], &yv[j]));
351: MDotFunctor<8> mdot(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]); /* Hope Kokkos make it asynchronous */
352: PetscCall(PetscLogGpuTimeBegin());
353: PetscCallCXX(Kokkos::parallel_reduce(Kokkos::RangePolicy<WorkTag>(exec, 0, N), mdot, Kokkos::subview(zv, Kokkos::pair<PetscInt, PetscInt>(cur, cur + 8))));
354: PetscCall(PetscLogGpuTimeEnd());
355: for (j = 0; j < 8; j++) PetscCall(VecRestoreKokkosView(yin[cur + j], &yv[j]));
356: cur += 8;
357: }
359: if (rem) { /* The remaining */
360: for (j = 0; j < rem; j++) PetscCall(VecGetKokkosView(yin[cur + j], &yv[j]));
361: Kokkos::RangePolicy<WorkTag> policy(exec, 0, N);
362: auto results = Kokkos::subview(zv, Kokkos::pair<PetscInt, PetscInt>(cur, cur + rem));
363: // clang-format off
364: PetscCall(PetscLogGpuTimeBegin());
365: switch (rem) {
366: case 1: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<1>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
367: case 2: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<2>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
368: case 3: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<3>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
369: case 4: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<4>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
370: case 5: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<5>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
371: case 6: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<6>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
372: case 7: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<7>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
373: }
374: PetscCall(PetscLogGpuTimeEnd());
375: // clang-format on
376: for (j = 0; j < rem; j++) PetscCall(VecRestoreKokkosView(yin[cur + j], &yv[j]));
377: }
378: PetscCall(VecRestoreKokkosView(xin, &xv));
379: exec.fence(); /* If reduce is async, then we need this fence to make sure z is ready for use on host */
380: PetscFunctionReturn(PETSC_SUCCESS);
381: }
383: static PetscErrorCode VecMultiDot_Verbose(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
384: {
385: PetscInt ngroup = nv / 8, rem = nv % 8, N = xin->map->n;
386: ConstPetscScalarKokkosView xv, y0, y1, y2, y3, y4, y5, y6, y7;
387: PetscScalar *zp = z;
388: const Vec *yp = yin;
389: Kokkos::RangePolicy<> policy(PetscGetKokkosExecutionSpace(), 0, N);
391: // clang-format off
392: PetscFunctionBegin;
393: PetscCall(VecGetKokkosView(xin, &xv));
394: for (PetscInt k = 0; k < ngroup; k++) { // 8 y's per group
395: PetscCall(VecGetKokkosView(yp[0], &y0));
396: PetscCall(VecGetKokkosView(yp[1], &y1));
397: PetscCall(VecGetKokkosView(yp[2], &y2));
398: PetscCall(VecGetKokkosView(yp[3], &y3));
399: PetscCall(VecGetKokkosView(yp[4], &y4));
400: PetscCall(VecGetKokkosView(yp[5], &y5));
401: PetscCall(VecGetKokkosView(yp[6], &y6));
402: PetscCall(VecGetKokkosView(yp[7], &y7));
403: PetscCall(PetscLogGpuTimeBegin()); // only for GPU kernel execution
404: Kokkos::parallel_reduce(
405: "VecMDot8", policy,
406: KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4, PetscScalar &lsum5, PetscScalar &lsum6, PetscScalar &lsum7) {
407: lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
408: lsum4 += xv(i) * PetscConj(y4(i)); lsum5 += xv(i) * PetscConj(y5(i)); lsum6 += xv(i) * PetscConj(y6(i)); lsum7 += xv(i) * PetscConj(y7(i));
409: }, zp[0], zp[1], zp[2], zp[3], zp[4], zp[5], zp[6], zp[7]);
410: PetscCall(PetscLogGpuTimeEnd());
411: PetscCall(PetscLogGpuToCpu(8 * sizeof(PetscScalar))); // for copying to z[] on host
412: PetscCall(VecRestoreKokkosView(yp[0], &y0));
413: PetscCall(VecRestoreKokkosView(yp[1], &y1));
414: PetscCall(VecRestoreKokkosView(yp[2], &y2));
415: PetscCall(VecRestoreKokkosView(yp[3], &y3));
416: PetscCall(VecRestoreKokkosView(yp[4], &y4));
417: PetscCall(VecRestoreKokkosView(yp[5], &y5));
418: PetscCall(VecRestoreKokkosView(yp[6], &y6));
419: PetscCall(VecRestoreKokkosView(yp[7], &y7));
420: yp += 8;
421: zp += 8;
422: }
424: if (rem) { /* The remaining */
425: if (rem > 0) PetscCall(VecGetKokkosView(yp[0], &y0));
426: if (rem > 1) PetscCall(VecGetKokkosView(yp[1], &y1));
427: if (rem > 2) PetscCall(VecGetKokkosView(yp[2], &y2));
428: if (rem > 3) PetscCall(VecGetKokkosView(yp[3], &y3));
429: if (rem > 4) PetscCall(VecGetKokkosView(yp[4], &y4));
430: if (rem > 5) PetscCall(VecGetKokkosView(yp[5], &y5));
431: if (rem > 6) PetscCall(VecGetKokkosView(yp[6], &y6));
432: PetscCall(PetscLogGpuTimeBegin());
433: switch (rem) {
434: case 7:
435: Kokkos::parallel_reduce(
436: "VecMDot7", policy,
437: KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4, PetscScalar &lsum5, PetscScalar &lsum6) {
438: lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
439: lsum4 += xv(i) * PetscConj(y4(i)); lsum5 += xv(i) * PetscConj(y5(i)); lsum6 += xv(i) * PetscConj(y6(i));
440: }, zp[0], zp[1], zp[2], zp[3], zp[4], zp[5], zp[6]);
441: break;
442: case 6:
443: Kokkos::parallel_reduce(
444: "VecMDot6", policy,
445: KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4, PetscScalar &lsum5) {
446: lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
447: lsum4 += xv(i) * PetscConj(y4(i)); lsum5 += xv(i) * PetscConj(y5(i));
448: }, zp[0], zp[1], zp[2], zp[3], zp[4], zp[5]);
449: break;
450: case 5:
451: Kokkos::parallel_reduce(
452: "VecMDot5", policy,
453: KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4) {
454: lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
455: lsum4 += xv(i) * PetscConj(y4(i));
456: }, zp[0], zp[1], zp[2], zp[3], zp[4]);
457: break;
458: case 4:
459: Kokkos::parallel_reduce(
460: "VecMDot4", policy,
461: KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3) {
462: lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
463: }, zp[0], zp[1], zp[2], zp[3]);
464: break;
465: case 3:
466: Kokkos::parallel_reduce(
467: "VecMDot3", policy,
468: KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2) {
469: lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i));
470: }, zp[0], zp[1], zp[2]);
471: break;
472: case 2:
473: Kokkos::parallel_reduce(
474: "VecMDot2", policy,
475: KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1) {
476: lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i));
477: }, zp[0], zp[1]);
478: break;
479: case 1:
480: Kokkos::parallel_reduce(
481: "VecMDot1", policy,
482: KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0) {
483: lsum0 += xv(i) * PetscConj(y0(i));
484: }, zp[0]);
485: break;
486: }
487: PetscCall(PetscLogGpuTimeEnd());
488: PetscCall(PetscLogGpuToCpu(rem * sizeof(PetscScalar))); // for copying to z[] on host
489: if (rem > 0) PetscCall(VecRestoreKokkosView(yp[0], &y0));
490: if (rem > 1) PetscCall(VecRestoreKokkosView(yp[1], &y1));
491: if (rem > 2) PetscCall(VecRestoreKokkosView(yp[2], &y2));
492: if (rem > 3) PetscCall(VecRestoreKokkosView(yp[3], &y3));
493: if (rem > 4) PetscCall(VecRestoreKokkosView(yp[4], &y4));
494: if (rem > 5) PetscCall(VecRestoreKokkosView(yp[5], &y5));
495: if (rem > 6) PetscCall(VecRestoreKokkosView(yp[6], &y6));
496: }
497: PetscCall(VecRestoreKokkosView(xin, &xv));
498: PetscFunctionReturn(PETSC_SUCCESS);
499: // clang-format on
500: }
502: /* z[i] = (x,y_i) = y_i^H x */
503: PetscErrorCode VecMDot_SeqKokkos(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
504: {
505: PetscFunctionBegin;
506: // With no good reason, VecMultiDot_Private() performs much worse than VecMultiDot_Verbose() with HIP,
507: // but they are on par with CUDA. Kokkos team is investigating this problem.
508: #if 0
509: PetscCall(VecMultiDot_Private<ConjugateDotTag>(xin, nv, yin, z));
510: #else
511: PetscCall(VecMultiDot_Verbose(xin, nv, yin, z));
512: #endif
513: PetscCall(PetscLogGpuFlops(PetscMax(nv * (2.0 * xin->map->n - 1), 0.0)));
514: PetscFunctionReturn(PETSC_SUCCESS);
515: }
517: /* z[i] = (x,y_i) = y_i^T x */
518: PetscErrorCode VecMTDot_SeqKokkos(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
519: {
520: PetscFunctionBegin;
521: PetscCall(VecMultiDot_Private<TransposeDotTag>(xin, nv, yin, z));
522: PetscCall(PetscLogGpuFlops(PetscMax(nv * (2.0 * xin->map->n - 1), 0.0)));
523: PetscFunctionReturn(PETSC_SUCCESS);
524: }
526: // z[i] = (x,y_i) = y_i^H x OR y_i^T x
527: static PetscErrorCode VecMultiDot_SeqKokkos_GEMV(PetscBool conjugate, Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z_h)
528: {
529: PetscInt i, j, nfail;
530: ConstPetscScalarKokkosView xv, yfirst, ynext;
531: const PetscScalar *yarray;
532: PetscBool stop = PETSC_FALSE;
533: PetscScalar *z_d = nullptr;
534: const char *trans = conjugate ? "C" : "T";
535: PetscInt64 lda = 0;
536: PetscInt m, n = xin->map->n;
538: PetscFunctionBegin;
539: PetscCall(VecGetKokkosView(xin, &xv));
540: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
541: z_d = z_h;
542: #endif
543: i = nfail = 0;
544: while (i < nv) {
545: // search a sequence of vectors with a fixed stride
546: stop = PETSC_FALSE;
547: PetscCall(VecGetKokkosView(yin[i], &yfirst));
548: yarray = yfirst.data();
549: for (j = i + 1; j < nv; j++) {
550: PetscCall(VecGetKokkosView(yin[j], &ynext));
551: if (j == i + 1) {
552: lda = ynext.data() - yarray; // arbitrary ptrdiff could be very large
553: if (lda < 0 || lda - n > 64) stop = PETSC_TRUE; // avoid using arbitrary lda; 64 bytes are a big enough alignment in VecDuplicateVecs
554: } else if (lda * (j - i) != ynext.data() - yarray) { // not in the same stride? if so, stop searching
555: stop = PETSC_TRUE;
556: }
557: PetscCall(VecRestoreKokkosView(yin[j], &ynext));
558: if (stop) break;
559: }
560: PetscCall(VecRestoreKokkosView(yin[i], &yfirst));
562: // found m vectors yin[i..j) with a stride lda at address yarray
563: m = j - i;
564: if (m > 1) {
565: if (!z_d) {
566: if (nv > PetscScalarPoolSize) { // rare case
567: PetscScalarPoolSize = nv;
568: PetscCallCXX(PetscScalarPool = static_cast<PetscScalar *>(Kokkos::kokkos_realloc(PetscScalarPool, PetscScalarPoolSize)));
569: }
570: z_d = PetscScalarPool;
571: }
572: const auto &A = Kokkos::View<const PetscScalar **, Kokkos::LayoutLeft>(yarray, lda, m);
573: const auto &Y = Kokkos::subview(A, std::pair<PetscInt, PetscInt>(0, n), Kokkos::ALL);
574: auto zv = PetscScalarKokkosDualView(PetscScalarKokkosView(z_d + i, m), PetscScalarKokkosViewHost(z_h + i, m));
575: PetscCall(PetscLogGpuTimeBegin());
576: PetscCallCXX(KokkosBlas::gemv(PetscGetKokkosExecutionSpace(), trans, 1.0, Y, xv, 0.0, zv.view_device()));
577: PetscCall(PetscLogGpuTimeEnd());
578: PetscCallCXX(zv.modify_device());
579: PetscCallCXX(zv.sync_host());
580: PetscCall(PetscLogGpuToCpu(zv.extent(0) * sizeof(PetscScalar)));
581: PetscCall(PetscLogGpuFlops(PetscMax(m * (2.0 * n - 1), 0.0)));
582: } else {
583: // we only allow falling back on VecDot once, to avoid doing VecMultiDot via individual VecDots
584: if (nfail++ == 0) {
585: if (conjugate) PetscCall(VecDot_SeqKokkos(xin, yin[i], z_h + i));
586: else PetscCall(VecTDot_SeqKokkos(xin, yin[i], z_h + i));
587: } else break; // break the while loop
588: }
589: i = j;
590: }
591: PetscCall(VecRestoreKokkosView(xin, &xv));
592: if (i < nv) { // finish the remaining if any
593: if (conjugate) PetscCall(VecMDot_SeqKokkos(xin, nv - i, yin + i, z_h + i));
594: else PetscCall(VecMTDot_SeqKokkos(xin, nv - i, yin + i, z_h + i));
595: }
596: PetscFunctionReturn(PETSC_SUCCESS);
597: }
599: PetscErrorCode VecMDot_SeqKokkos_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
600: {
601: PetscFunctionBegin;
602: PetscCall(VecMultiDot_SeqKokkos_GEMV(PETSC_TRUE, xin, nv, yin, z)); // conjugate
603: PetscFunctionReturn(PETSC_SUCCESS);
604: }
606: PetscErrorCode VecMTDot_SeqKokkos_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
607: {
608: PetscFunctionBegin;
609: PetscCall(VecMultiDot_SeqKokkos_GEMV(PETSC_FALSE, xin, nv, yin, z)); // transpose
610: PetscFunctionReturn(PETSC_SUCCESS);
611: }
613: /* x[:] = alpha */
614: PetscErrorCode VecSet_SeqKokkos(Vec xin, PetscScalar alpha)
615: {
616: PetscScalarKokkosView xv;
617: auto exec = PetscGetKokkosExecutionSpace();
619: PetscFunctionBegin;
620: PetscCall(PetscLogGpuTimeBegin());
621: PetscCall(VecGetKokkosViewWrite(xin, &xv));
622: PetscCallCXX(KokkosBlas::fill(exec, xv, alpha));
623: PetscCall(VecRestoreKokkosViewWrite(xin, &xv));
624: PetscCall(PetscLogGpuTimeEnd());
625: PetscFunctionReturn(PETSC_SUCCESS);
626: }
628: /* x = alpha x */
629: PetscErrorCode VecScale_SeqKokkos(Vec xin, PetscScalar alpha)
630: {
631: auto exec = PetscGetKokkosExecutionSpace();
633: PetscFunctionBegin;
634: if (alpha == (PetscScalar)0.0) {
635: PetscCall(VecSet_SeqKokkos(xin, alpha));
636: } else if (alpha != (PetscScalar)1.0) {
637: PetscScalarKokkosView xv;
639: PetscCall(PetscLogGpuTimeBegin());
640: PetscCall(VecGetKokkosView(xin, &xv));
641: PetscCallCXX(KokkosBlas::scal(exec, xv, alpha, xv));
642: PetscCall(VecRestoreKokkosView(xin, &xv));
643: PetscCall(PetscLogGpuTimeEnd());
644: PetscCall(PetscLogGpuFlops(xin->map->n));
645: }
646: PetscFunctionReturn(PETSC_SUCCESS);
647: }
649: /* z = y^H x */
650: PetscErrorCode VecDot_SeqKokkos(Vec xin, Vec yin, PetscScalar *z)
651: {
652: ConstPetscScalarKokkosView xv, yv;
653: auto exec = PetscGetKokkosExecutionSpace();
655: PetscFunctionBegin;
656: PetscCall(PetscLogGpuTimeBegin());
657: PetscCall(VecGetKokkosView(xin, &xv));
658: PetscCall(VecGetKokkosView(yin, &yv));
659: PetscCallCXX(*z = KokkosBlas::dot(exec, yv, xv)); /* KokkosBlas::dot(a,b) takes conjugate of a */
660: PetscCall(VecRestoreKokkosView(xin, &xv));
661: PetscCall(VecRestoreKokkosView(yin, &yv));
662: PetscCall(PetscLogGpuTimeEnd());
663: PetscCall(PetscLogGpuFlops(PetscMax(2.0 * xin->map->n - 1, 0.0)));
664: PetscFunctionReturn(PETSC_SUCCESS);
665: }
667: /* y = x, where x is VECKOKKOS, but y may be not */
668: PetscErrorCode VecCopy_SeqKokkos(Vec xin, Vec yin)
669: {
670: auto exec = PetscGetKokkosExecutionSpace();
672: PetscFunctionBegin;
673: PetscCall(PetscLogGpuTimeBegin());
674: if (xin != yin) {
675: Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(xin->spptr);
676: if (yin->offloadmask == PETSC_OFFLOAD_KOKKOS) {
677: /* y is also a VecKokkos */
678: Vec_Kokkos *ykok = static_cast<Vec_Kokkos *>(yin->spptr);
679: /* Kokkos rule: if x's host has newer data, it will copy to y's host view; otherwise to y's device view
680: In case x's host is newer, y's device is newer, it will error (though should not, I think). So we just
681: clear y's sync state.
682: */
683: ykok->v_dual.clear_sync_state();
684: PetscCallCXX(Kokkos::deep_copy(exec, ykok->v_dual, xkok->v_dual)); // either cpu2cpu or gpu2cpu, so don't log it
685: } else {
686: PetscScalar *yarray;
687: PetscCall(VecGetArrayWrite(yin, &yarray));
688: PetscScalarKokkosViewHost yv(yarray, yin->map->n);
689: if (xkok->v_dual.need_sync_host()) { // x's device has newer data
690: PetscCallCXX(Kokkos::deep_copy(exec, yv, xkok->v_dual.view_device())); // gpu2cpu
691: PetscCallCXX(exec.fence()); // finish the deep copy
692: PetscCall(PetscLogGpuToCpu(xkok->v_dual.extent(0) * sizeof(PetscScalar)));
693: } else {
694: PetscCallCXX(exec.fence()); // make sure xkok->v_dual.view_host() in ready for use on host; Kokkos might also call it inside deep_copy(). We do it here for safety.
695: PetscCallCXX(Kokkos::deep_copy(exec, yv, xkok->v_dual.view_host())); // Host view to host view deep copy, done on host
696: }
697: PetscCall(VecRestoreArrayWrite(yin, &yarray));
698: }
699: }
700: PetscCall(PetscLogGpuTimeEnd());
701: PetscFunctionReturn(PETSC_SUCCESS);
702: }
704: /* y[i] <--> x[i] */
705: PetscErrorCode VecSwap_SeqKokkos(Vec xin, Vec yin)
706: {
707: PetscFunctionBegin;
708: if (xin != yin) {
709: PetscScalarKokkosView xv, yv;
711: PetscCall(PetscLogGpuTimeBegin());
712: PetscCall(VecGetKokkosView(xin, &xv));
713: PetscCall(VecGetKokkosView(yin, &yv));
714: PetscCallCXX(Kokkos::parallel_for(
715: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) {
716: PetscScalar tmp = xv(i);
717: xv(i) = yv(i);
718: yv(i) = tmp;
719: }));
720: PetscCall(VecRestoreKokkosView(xin, &xv));
721: PetscCall(VecRestoreKokkosView(yin, &yv));
722: PetscCall(PetscLogGpuTimeEnd());
723: }
724: PetscFunctionReturn(PETSC_SUCCESS);
725: }
727: /* w = alpha x + y */
728: PetscErrorCode VecWAXPY_SeqKokkos(Vec win, PetscScalar alpha, Vec xin, Vec yin)
729: {
730: PetscFunctionBegin;
731: if (alpha == (PetscScalar)0.0) {
732: PetscCall(VecCopy_SeqKokkos(yin, win));
733: } else {
734: ConstPetscScalarKokkosView xv, yv;
735: PetscScalarKokkosView wv;
737: PetscCall(PetscLogGpuTimeBegin());
738: PetscCall(VecGetKokkosViewWrite(win, &wv));
739: PetscCall(VecGetKokkosView(xin, &xv));
740: PetscCall(VecGetKokkosView(yin, &yv));
741: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, win->map->n), KOKKOS_LAMBDA(const PetscInt &i) { wv(i) = alpha * xv(i) + yv(i); }));
742: PetscCall(VecRestoreKokkosView(xin, &xv));
743: PetscCall(VecRestoreKokkosView(yin, &yv));
744: PetscCall(VecRestoreKokkosViewWrite(win, &wv));
745: PetscCall(PetscLogGpuTimeEnd());
746: PetscCall(PetscLogGpuFlops(2.0 * win->map->n));
747: }
748: PetscFunctionReturn(PETSC_SUCCESS);
749: }
751: template <PetscInt ValueCount>
752: struct MAXPYFunctor {
753: static_assert(ValueCount >= 1 && ValueCount <= 8, "ValueCount must be in [1, 8]");
754: typedef ConstPetscScalarKokkosView::size_type size_type;
756: PetscScalarKokkosView yv;
757: PetscScalar a[8];
758: ConstPetscScalarKokkosView xv[8];
760: MAXPYFunctor(PetscScalarKokkosView yv, PetscScalar a0, PetscScalar a1, PetscScalar a2, PetscScalar a3, PetscScalar a4, PetscScalar a5, PetscScalar a6, PetscScalar a7, ConstPetscScalarKokkosView xv0, ConstPetscScalarKokkosView xv1, ConstPetscScalarKokkosView xv2, ConstPetscScalarKokkosView xv3, ConstPetscScalarKokkosView xv4, ConstPetscScalarKokkosView xv5, ConstPetscScalarKokkosView xv6, ConstPetscScalarKokkosView xv7) :
761: yv(yv)
762: {
763: a[0] = a0;
764: a[1] = a1;
765: a[2] = a2;
766: a[3] = a3;
767: a[4] = a4;
768: a[5] = a5;
769: a[6] = a6;
770: a[7] = a7;
771: xv[0] = xv0;
772: xv[1] = xv1;
773: xv[2] = xv2;
774: xv[3] = xv3;
775: xv[4] = xv4;
776: xv[5] = xv5;
777: xv[6] = xv6;
778: xv[7] = xv7;
779: }
781: KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
782: {
783: for (PetscInt j = 0; j < ValueCount; ++j) yv(i) += a[j] * xv[j](i);
784: }
785: };
787: /* y = y + sum alpha[i] x[i] */
788: PetscErrorCode VecMAXPY_SeqKokkos(Vec yin, PetscInt nv, const PetscScalar *alpha, Vec *xin)
789: {
790: PetscInt i, j, cur = 0, ngroup = nv / 8, rem = nv % 8, N = yin->map->n;
791: PetscScalarKokkosView yv;
792: PetscScalar a[8];
793: ConstPetscScalarKokkosView xv[8];
794: Kokkos::RangePolicy<> policy(PetscGetKokkosExecutionSpace(), 0, N);
796: PetscFunctionBegin;
797: PetscCall(PetscLogGpuTimeBegin());
798: PetscCall(VecGetKokkosView(yin, &yv));
799: for (i = 0; i < ngroup; i++) { /* 8 x's per group */
800: for (j = 0; j < 8; j++) { /* Fill the parameters */
801: a[j] = alpha[cur + j];
802: PetscCall(VecGetKokkosView(xin[cur + j], &xv[j]));
803: }
804: MAXPYFunctor<8> maxpy(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]);
805: PetscCallCXX(Kokkos::parallel_for(policy, maxpy));
806: for (j = 0; j < 8; j++) PetscCall(VecRestoreKokkosView(xin[cur + j], &xv[j]));
807: cur += 8;
808: }
810: if (rem) { /* The remaining */
811: for (j = 0; j < rem; j++) {
812: a[j] = alpha[cur + j];
813: PetscCall(VecGetKokkosView(xin[cur + j], &xv[j]));
814: }
815: // clang-format off
816: switch (rem) {
817: case 1: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<1>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
818: case 2: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<2>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
819: case 3: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<3>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
820: case 4: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<4>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
821: case 5: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<5>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
822: case 6: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<6>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
823: case 7: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<7>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
824: }
825: // clang-format on
826: for (j = 0; j < rem; j++) PetscCall(VecRestoreKokkosView(xin[cur + j], &xv[j]));
827: }
828: PetscCall(VecRestoreKokkosView(yin, &yv));
829: PetscCall(PetscLogGpuTimeEnd());
830: PetscCall(PetscLogGpuFlops(nv * 2.0 * yin->map->n));
831: PetscFunctionReturn(PETSC_SUCCESS);
832: }
834: /* y = y + sum alpha[i] x[i] */
835: PetscErrorCode VecMAXPY_SeqKokkos_GEMV(Vec yin, PetscInt nv, const PetscScalar *a_h, Vec *xin)
836: {
837: const PetscInt n = yin->map->n;
838: PetscInt i, j, nfail;
839: PetscScalarKokkosView yv;
840: ConstPetscScalarKokkosView xfirst, xnext;
841: PetscBool stop = PETSC_FALSE;
842: PetscInt lda = 0, m;
843: const PetscScalar *xarray;
844: PetscScalar *a_d = nullptr;
846: PetscFunctionBegin;
847: PetscCall(PetscLogGpuTimeBegin());
848: PetscCall(VecGetKokkosView(yin, &yv));
849: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
850: a_d = const_cast<PetscScalar *>(a_h);
851: #endif
852: i = nfail = 0;
853: while (i < nv) {
854: stop = PETSC_FALSE;
855: PetscCall(VecGetKokkosView(xin[i], &xfirst));
856: xarray = xfirst.data();
857: for (j = i + 1; j < nv; j++) {
858: PetscCall(VecGetKokkosView(xin[j], &xnext));
859: if (j == i + 1) {
860: lda = xnext.data() - xfirst.data();
861: if (lda < 0 || lda - n > 64) stop = PETSC_TRUE; // avoid using arbitrary lda; 64 bytes are a big enough alignment in VecDuplicateVecs
862: } else if (lda * (j - i) != xnext.data() - xarray) { // not in the same stride? if so, stop here
863: stop = PETSC_TRUE;
864: }
865: PetscCall(VecRestoreKokkosView(xin[j], &xnext));
866: if (stop) break;
867: }
868: PetscCall(VecRestoreKokkosView(xin[i], &xfirst));
870: m = j - i;
871: if (m > 1) {
872: if (!a_d) {
873: if (nv > PetscScalarPoolSize) { // rare case
874: PetscScalarPoolSize = nv;
875: PetscCallCXX(PetscScalarPool = static_cast<PetscScalar *>(Kokkos::kokkos_realloc(PetscScalarPool, PetscScalarPoolSize)));
876: }
877: a_d = PetscScalarPool;
878: }
879: const auto &B = Kokkos::View<const PetscScalar **, Kokkos::LayoutLeft>(xarray, lda, m);
880: const auto &A = Kokkos::subview(B, std::pair<PetscInt, PetscInt>(0, n), Kokkos::ALL);
881: auto av = PetscScalarKokkosDualView(PetscScalarKokkosView(a_d + i, m), PetscScalarKokkosViewHost(const_cast<PetscScalar *>(a_h) + i, m));
882: av.modify_host();
883: av.sync_device();
884: PetscCall(PetscLogCpuToGpu(av.extent(0) * sizeof(PetscScalar)));
885: PetscCallCXX(KokkosBlas::gemv(PetscGetKokkosExecutionSpace(), "N", 1.0, A, av.view_device(), 1.0, yv));
886: PetscCall(PetscLogGpuFlops(m * 2.0 * n));
887: } else {
888: // we only allow falling back on VecAXPY once
889: if (nfail++ == 0) PetscCall(VecAXPY_SeqKokkos(yin, a_h[i], xin[i]));
890: else break; // break the while loop
891: }
892: i = j;
893: }
894: // finish the remaining if any
895: PetscCall(VecRestoreKokkosView(yin, &yv));
896: if (i < nv) PetscCall(VecMAXPY_SeqKokkos(yin, nv - i, a_h + i, xin + i));
897: PetscCall(PetscLogGpuTimeEnd());
898: PetscFunctionReturn(PETSC_SUCCESS);
899: }
901: /* y = alpha x + beta y */
902: PetscErrorCode VecAXPBY_SeqKokkos(Vec yin, PetscScalar alpha, PetscScalar beta, Vec xin)
903: {
904: PetscBool xiskok, yiskok;
906: PetscFunctionBegin;
907: PetscCall(PetscObjectTypeCompareAny((PetscObject)xin, &xiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
908: PetscCall(PetscObjectTypeCompareAny((PetscObject)yin, &yiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
909: if (xiskok && yiskok) {
910: ConstPetscScalarKokkosView xv;
911: PetscScalarKokkosView yv;
913: PetscCall(PetscLogGpuTimeBegin());
914: PetscCall(VecGetKokkosView(xin, &xv));
915: PetscCall(VecGetKokkosView(yin, &yv));
916: PetscCallCXX(KokkosBlas::axpby(PetscGetKokkosExecutionSpace(), alpha, xv, beta, yv));
917: PetscCall(VecRestoreKokkosView(xin, &xv));
918: PetscCall(VecRestoreKokkosView(yin, &yv));
919: PetscCall(PetscLogGpuTimeEnd());
920: if (alpha == (PetscScalar)0.0 || beta == (PetscScalar)0.0) {
921: PetscCall(PetscLogGpuFlops(xin->map->n));
922: } else if (beta == (PetscScalar)1.0 || alpha == (PetscScalar)1.0) {
923: PetscCall(PetscLogGpuFlops(2.0 * xin->map->n));
924: } else {
925: PetscCall(PetscLogGpuFlops(3.0 * xin->map->n));
926: }
927: } else {
928: PetscCall(VecAXPBY_Seq(yin, alpha, beta, xin));
929: }
930: PetscFunctionReturn(PETSC_SUCCESS);
931: }
933: /* z = alpha x + beta y + gamma z */
934: PetscErrorCode VecAXPBYPCZ_SeqKokkos(Vec zin, PetscScalar alpha, PetscScalar beta, PetscScalar gamma, Vec xin, Vec yin)
935: {
936: ConstPetscScalarKokkosView xv, yv;
937: PetscScalarKokkosView zv;
938: Kokkos::RangePolicy<> policy(PetscGetKokkosExecutionSpace(), 0, zin->map->n);
940: PetscFunctionBegin;
941: PetscCall(PetscLogGpuTimeBegin());
942: PetscCall(VecGetKokkosView(zin, &zv));
943: PetscCall(VecGetKokkosView(xin, &xv));
944: PetscCall(VecGetKokkosView(yin, &yv));
945: if (gamma == (PetscScalar)0.0) { // a common case
946: if (alpha == -beta) {
947: PetscCallCXX(Kokkos::parallel_for( // a common case
948: policy, KOKKOS_LAMBDA(const PetscInt &i) { zv(i) = alpha * (xv(i) - yv(i)); }));
949: } else {
950: PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i) { zv(i) = alpha * xv(i) + beta * yv(i); }));
951: }
952: } else {
953: PetscCallCXX(KokkosBlas::update(PetscGetKokkosExecutionSpace(), alpha, xv, beta, yv, gamma, zv));
954: }
955: PetscCall(VecRestoreKokkosView(xin, &xv));
956: PetscCall(VecRestoreKokkosView(yin, &yv));
957: PetscCall(VecRestoreKokkosView(zin, &zv));
958: PetscCall(PetscLogGpuTimeEnd());
959: PetscCall(PetscLogGpuFlops(zin->map->n * 5.0));
960: PetscFunctionReturn(PETSC_SUCCESS);
961: }
963: /* w = x*y. Any subset of the x, y, and w may be the same vector.
965: w is of type VecKokkos, but x, y may be not.
966: */
967: PetscErrorCode VecPointwiseMult_SeqKokkos(Vec win, Vec xin, Vec yin)
968: {
969: PetscInt n;
971: PetscFunctionBegin;
972: PetscCall(PetscLogGpuTimeBegin());
973: PetscCall(VecGetLocalSize(win, &n));
974: if (xin->offloadmask != PETSC_OFFLOAD_KOKKOS || yin->offloadmask != PETSC_OFFLOAD_KOKKOS) {
975: PetscScalarKokkosViewHost wv;
976: const PetscScalar *xp, *yp;
977: PetscCall(VecGetArrayRead(xin, &xp));
978: PetscCall(VecGetArrayRead(yin, &yp));
979: PetscCall(VecGetKokkosViewWrite(win, &wv));
981: ConstPetscScalarKokkosViewHost xv(xp, n), yv(yp, n);
982: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n), KOKKOS_LAMBDA(const PetscInt &i) { wv(i) = xv(i) * yv(i); }));
984: PetscCall(VecRestoreArrayRead(xin, &xp));
985: PetscCall(VecRestoreArrayRead(yin, &yp));
986: PetscCall(VecRestoreKokkosViewWrite(win, &wv));
987: } else {
988: ConstPetscScalarKokkosView xv, yv;
989: PetscScalarKokkosView wv;
991: PetscCall(VecGetKokkosViewWrite(win, &wv));
992: PetscCall(VecGetKokkosView(xin, &xv));
993: PetscCall(VecGetKokkosView(yin, &yv));
994: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt &i) { wv(i) = xv(i) * yv(i); }));
995: PetscCall(VecRestoreKokkosView(yin, &yv));
996: PetscCall(VecRestoreKokkosView(xin, &xv));
997: PetscCall(VecRestoreKokkosViewWrite(win, &wv));
998: }
999: PetscCall(PetscLogGpuTimeEnd());
1000: PetscCall(PetscLogGpuFlops(n));
1001: PetscFunctionReturn(PETSC_SUCCESS);
1002: }
1004: /* w = x/y */
1005: PetscErrorCode VecPointwiseDivide_SeqKokkos(Vec win, Vec xin, Vec yin)
1006: {
1007: PetscInt n;
1009: PetscFunctionBegin;
1010: PetscCall(PetscLogGpuTimeBegin());
1011: PetscCall(VecGetLocalSize(win, &n));
1012: if (xin->offloadmask != PETSC_OFFLOAD_KOKKOS || yin->offloadmask != PETSC_OFFLOAD_KOKKOS) {
1013: PetscScalarKokkosViewHost wv;
1014: const PetscScalar *xp, *yp;
1015: PetscCall(VecGetArrayRead(xin, &xp));
1016: PetscCall(VecGetArrayRead(yin, &yp));
1017: PetscCall(VecGetKokkosViewWrite(win, &wv));
1019: ConstPetscScalarKokkosViewHost xv(xp, n), yv(yp, n);
1020: PetscCallCXX(Kokkos::parallel_for(
1021: Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n), KOKKOS_LAMBDA(const PetscInt &i) {
1022: if (yv(i) != 0.0) wv(i) = xv(i) / yv(i);
1023: else wv(i) = 0.0;
1024: }));
1026: PetscCall(VecRestoreArrayRead(xin, &xp));
1027: PetscCall(VecRestoreArrayRead(yin, &yp));
1028: PetscCall(VecRestoreKokkosViewWrite(win, &wv));
1029: } else {
1030: ConstPetscScalarKokkosView xv, yv;
1031: PetscScalarKokkosView wv;
1033: PetscCall(VecGetKokkosViewWrite(win, &wv));
1034: PetscCall(VecGetKokkosView(xin, &xv));
1035: PetscCall(VecGetKokkosView(yin, &yv));
1036: PetscCallCXX(Kokkos::parallel_for(
1037: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt &i) {
1038: if (yv(i) != 0.0) wv(i) = xv(i) / yv(i);
1039: else wv(i) = 0.0;
1040: }));
1041: PetscCall(VecRestoreKokkosView(yin, &yv));
1042: PetscCall(VecRestoreKokkosView(xin, &xv));
1043: PetscCall(VecRestoreKokkosViewWrite(win, &wv));
1044: }
1045: PetscCall(PetscLogGpuTimeEnd());
1046: PetscCall(PetscLogGpuFlops(win->map->n));
1047: PetscFunctionReturn(PETSC_SUCCESS);
1048: }
1050: PetscErrorCode VecNorm_SeqKokkos(Vec xin, NormType type, PetscReal *z)
1051: {
1052: const PetscInt n = xin->map->n;
1053: ConstPetscScalarKokkosView xv;
1054: auto exec = PetscGetKokkosExecutionSpace();
1056: PetscFunctionBegin;
1057: if (type == NORM_1_AND_2) {
1058: PetscCall(VecNorm_SeqKokkos(xin, NORM_1, z));
1059: PetscCall(VecNorm_SeqKokkos(xin, NORM_2, z + 1));
1060: } else {
1061: PetscCall(PetscLogGpuTimeBegin());
1062: PetscCall(VecGetKokkosView(xin, &xv));
1063: if (type == NORM_2 || type == NORM_FROBENIUS) {
1064: PetscCallCXX(*z = KokkosBlas::nrm2(exec, xv));
1065: PetscCall(PetscLogGpuFlops(PetscMax(2.0 * n - 1, 0.0)));
1066: } else if (type == NORM_1) {
1067: PetscCallCXX(*z = KokkosBlas::nrm1(exec, xv));
1068: PetscCall(PetscLogGpuFlops(PetscMax(n - 1.0, 0.0)));
1069: } else if (type == NORM_INFINITY) {
1070: PetscCallCXX(*z = KokkosBlas::nrminf(exec, xv));
1071: }
1072: PetscCall(VecRestoreKokkosView(xin, &xv));
1073: PetscCall(PetscLogGpuTimeEnd());
1074: }
1075: PetscFunctionReturn(PETSC_SUCCESS);
1076: }
1078: PetscErrorCode VecErrorWeightedNorms_SeqKokkos(Vec U, Vec Y, Vec E, NormType wnormtype, PetscReal atol, Vec vatol, PetscReal rtol, Vec vrtol, PetscReal ignore_max, PetscReal *norm, PetscInt *norm_loc, PetscReal *norma, PetscInt *norma_loc, PetscReal *normr, PetscInt *normr_loc)
1079: {
1080: ConstPetscScalarKokkosView u, y, erra, atola, rtola;
1081: PetscBool has_E = PETSC_FALSE, has_atol = PETSC_FALSE, has_rtol = PETSC_FALSE;
1082: PetscInt n, n_loc = 0, na_loc = 0, nr_loc = 0;
1083: PetscReal nrm = 0, nrma = 0, nrmr = 0;
1085: PetscFunctionBegin;
1086: PetscCall(VecGetLocalSize(U, &n));
1087: PetscCall(VecGetKokkosView(U, &u));
1088: PetscCall(VecGetKokkosView(Y, &y));
1089: if (E) {
1090: PetscCall(VecGetKokkosView(E, &erra));
1091: has_E = PETSC_TRUE;
1092: }
1093: if (vatol) {
1094: PetscCall(VecGetKokkosView(vatol, &atola));
1095: has_atol = PETSC_TRUE;
1096: }
1097: if (vrtol) {
1098: PetscCall(VecGetKokkosView(vrtol, &rtola));
1099: has_rtol = PETSC_TRUE;
1100: }
1102: if (wnormtype == NORM_INFINITY) {
1103: PetscCallCXX(Kokkos::parallel_reduce(
1104: "VecErrorWeightedNorms_INFINITY", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n),
1105: KOKKOS_LAMBDA(const PetscInt &i, PetscReal &l_nrm, PetscReal &l_nrma, PetscReal &l_nrmr, PetscInt &l_n_loc, PetscInt &l_na_loc, PetscInt &l_nr_loc) {
1106: PetscReal err, tol, tola, tolr, l_atol, l_rtol;
1107: if (PetscAbsScalar(y(i)) >= ignore_max && PetscAbsScalar(u(i)) >= ignore_max) {
1108: l_atol = has_atol ? PetscRealPart(atola(i)) : atol;
1109: l_rtol = has_rtol ? PetscRealPart(rtola(i)) : rtol;
1110: err = has_E ? PetscAbsScalar(erra(i)) : PetscAbsScalar(y(i) - u(i));
1111: tola = l_atol;
1112: tolr = l_rtol * PetscMax(PetscAbsScalar(u(i)), PetscAbsScalar(y(i)));
1113: tol = tola + tolr;
1114: if (tola > 0.) {
1115: l_nrma = PetscMax(l_nrma, err / tola);
1116: l_na_loc++;
1117: }
1118: if (tolr > 0.) {
1119: l_nrmr = PetscMax(l_nrmr, err / tolr);
1120: l_nr_loc++;
1121: }
1122: if (tol > 0.) {
1123: l_nrm = PetscMax(l_nrm, err / tol);
1124: l_n_loc++;
1125: }
1126: }
1127: },
1128: Kokkos::Max<PetscReal>(nrm), Kokkos::Max<PetscReal>(nrma), Kokkos::Max<PetscReal>(nrmr), n_loc, na_loc, nr_loc));
1129: } else {
1130: PetscCallCXX(Kokkos::parallel_reduce(
1131: "VecErrorWeightedNorms_NORM_2", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n),
1132: KOKKOS_LAMBDA(const PetscInt &i, PetscReal &l_nrm, PetscReal &l_nrma, PetscReal &l_nrmr, PetscInt &l_n_loc, PetscInt &l_na_loc, PetscInt &l_nr_loc) {
1133: PetscReal err, tol, tola, tolr, l_atol, l_rtol;
1134: if (PetscAbsScalar(y(i)) >= ignore_max && PetscAbsScalar(u(i)) >= ignore_max) {
1135: l_atol = has_atol ? PetscRealPart(atola(i)) : atol;
1136: l_rtol = has_rtol ? PetscRealPart(rtola(i)) : rtol;
1137: err = has_E ? PetscAbsScalar(erra(i)) : PetscAbsScalar(y(i) - u(i));
1138: tola = l_atol;
1139: tolr = l_rtol * PetscMax(PetscAbsScalar(u(i)), PetscAbsScalar(y(i)));
1140: tol = tola + tolr;
1141: if (tola > 0.) {
1142: l_nrma += PetscSqr(err / tola);
1143: l_na_loc++;
1144: }
1145: if (tolr > 0.) {
1146: l_nrmr += PetscSqr(err / tolr);
1147: l_nr_loc++;
1148: }
1149: if (tol > 0.) {
1150: l_nrm += PetscSqr(err / tol);
1151: l_n_loc++;
1152: }
1153: }
1154: },
1155: nrm, nrma, nrmr, n_loc, na_loc, nr_loc));
1156: }
1158: if (wnormtype == NORM_2) {
1159: *norm = PetscSqrtReal(nrm);
1160: *norma = PetscSqrtReal(nrma);
1161: *normr = PetscSqrtReal(nrmr);
1162: } else {
1163: *norm = nrm;
1164: *norma = nrma;
1165: *normr = nrmr;
1166: }
1167: *norm_loc = n_loc;
1168: *norma_loc = na_loc;
1169: *normr_loc = nr_loc;
1171: if (E) PetscCall(VecRestoreKokkosView(E, &erra));
1172: if (vatol) PetscCall(VecRestoreKokkosView(vatol, &atola));
1173: if (vrtol) PetscCall(VecRestoreKokkosView(vrtol, &rtola));
1174: PetscCall(VecRestoreKokkosView(U, &u));
1175: PetscCall(VecRestoreKokkosView(Y, &y));
1176: PetscFunctionReturn(PETSC_SUCCESS);
1177: }
1179: /* A functor for DotNorm2 so that we can compute dp and nm in one kernel */
1180: struct DotNorm2 {
1181: typedef PetscScalar value_type[];
1182: typedef ConstPetscScalarKokkosView::size_type size_type;
1184: size_type value_count;
1185: ConstPetscScalarKokkosView xv_, yv_; /* first and second vectors in VecDotNorm2. The order matters. */
1187: DotNorm2(ConstPetscScalarKokkosView &xv, ConstPetscScalarKokkosView &yv) : value_count(2), xv_(xv), yv_(yv) { }
1189: KOKKOS_INLINE_FUNCTION void operator()(const size_type i, value_type result) const
1190: {
1191: result[0] += PetscConj(yv_(i)) * xv_(i);
1192: result[1] += PetscConj(yv_(i)) * yv_(i);
1193: }
1195: KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
1196: {
1197: dst[0] += src[0];
1198: dst[1] += src[1];
1199: }
1201: KOKKOS_INLINE_FUNCTION void init(value_type result) const
1202: {
1203: result[0] = 0.0;
1204: result[1] = 0.0;
1205: }
1206: };
1208: /* dp = y^H x, nm = y^H y */
1209: PetscErrorCode VecDotNorm2_SeqKokkos(Vec xin, Vec yin, PetscScalar *dp, PetscScalar *nm)
1210: {
1211: ConstPetscScalarKokkosView xv, yv;
1212: PetscScalar result[2];
1214: PetscFunctionBegin;
1215: PetscCall(PetscLogGpuTimeBegin());
1216: PetscCall(VecGetKokkosView(xin, &xv));
1217: PetscCall(VecGetKokkosView(yin, &yv));
1218: DotNorm2 dn(xv, yv);
1219: PetscCallCXX(Kokkos::parallel_reduce(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), dn, result));
1220: *dp = result[0];
1221: *nm = result[1];
1222: PetscCall(VecRestoreKokkosView(yin, &yv));
1223: PetscCall(VecRestoreKokkosView(xin, &xv));
1224: PetscCall(PetscLogGpuTimeEnd());
1225: PetscCall(PetscLogGpuFlops(4.0 * xin->map->n));
1226: PetscFunctionReturn(PETSC_SUCCESS);
1227: }
1229: PetscErrorCode VecConjugate_SeqKokkos(Vec xin)
1230: {
1231: #if defined(PETSC_USE_COMPLEX)
1232: PetscScalarKokkosView xv;
1234: PetscFunctionBegin;
1235: PetscCall(PetscLogGpuTimeBegin());
1236: PetscCall(VecGetKokkosView(xin, &xv));
1237: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) { xv(i) = PetscConj(xv(i)); }));
1238: PetscCall(VecRestoreKokkosView(xin, &xv));
1239: PetscCall(PetscLogGpuTimeEnd());
1240: #else
1241: PetscFunctionBegin;
1242: #endif
1243: PetscFunctionReturn(PETSC_SUCCESS);
1244: }
1246: /* Temporarily replace the array in vin with a[]. Return to the original array with a call to VecResetArray() */
1247: PetscErrorCode VecPlaceArray_SeqKokkos(Vec vin, const PetscScalar *a)
1248: {
1249: Vec_Seq *vecseq = (Vec_Seq *)vin->data;
1250: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(vin->spptr);
1252: PetscFunctionBegin;
1253: PetscCall(VecPlaceArray_Seq(vin, a));
1254: PetscCall(veckok->UpdateArray<HostMirrorMemorySpace>(vecseq->array));
1255: PetscFunctionReturn(PETSC_SUCCESS);
1256: }
1258: PetscErrorCode VecResetArray_SeqKokkos(Vec vin)
1259: {
1260: Vec_Seq *vecseq = (Vec_Seq *)vin->data;
1261: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(vin->spptr);
1263: PetscFunctionBegin;
1264: /* User wants to unhook the provided host array. Sync it so that user can get the latest */
1265: PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1266: PetscCall(VecResetArray_Seq(vin)); /* Swap back the old host array, assuming its has the latest value */
1267: PetscCall(veckok->UpdateArray<HostMirrorMemorySpace>(vecseq->array));
1268: PetscFunctionReturn(PETSC_SUCCESS);
1269: }
1271: /*@C
1272: VecKokkosPlaceArray - Allows one to replace the device array in a VecKokkos vector with a
1273: device array provided by the user. This is useful to avoid copying an array into a vector.
1275: Logically Collective; No Fortran Support
1277: Input Parameters:
1278: + v - the VecKokkos vector
1279: - a - the device array
1281: Level: developer
1283: Notes:
1285: You can return to the original array with a call to `VecKokkosResetArray()`. `vec` does not take
1286: ownership of `array` in any way.
1288: The user manages the device array so PETSc doesn't care how it was allocated.
1290: The user must free `array` themselves but be careful not to
1291: do so before the vector has either been destroyed, had its original array restored with
1292: `VecKokkosResetArray()` or permanently replaced with `VecReplaceArray()`.
1294: .seealso: [](ch_vectors), `Vec`, `VecGetArray()`, `VecKokkosResetArray()`, `VecReplaceArray()`, `VecResetArray()`
1295: @*/
1296: PetscErrorCode VecKokkosPlaceArray(Vec v, PetscScalar *a)
1297: {
1298: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1300: PetscFunctionBegin;
1301: VecErrorIfNotKokkos(v);
1302: // Sync the old device view before replacing it; so that when it is put back, it has the saved value.
1303: PetscCall(KokkosDualViewSync<DefaultMemorySpace>(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1304: PetscCallCXX(veckok->unplaced_d = veckok->v_dual.view_device());
1305: // We assume a[] contains the latest data and discard the vector's old sync state
1306: PetscCall(veckok->UpdateArray<DefaultMemorySpace>(a));
1307: PetscFunctionReturn(PETSC_SUCCESS);
1308: }
1310: /*@C
1311: VecKokkosResetArray - Resets a vector to use its default memory. Call this
1312: after the use of `VecKokkosPlaceArray()`.
1314: Not Collective
1316: Input Parameter:
1317: . v - the vector
1319: Level: developer
1321: Notes:
1323: After the call, the original array placed in with `VecKokkosPlaceArray()` will contain the latest value of the vector.
1324: Note that device kernels are asynchronous. Users are responsible to sync the device if they wish to have immediate access
1325: to the data in the array. Also, after the call, `v` will contain whatever data before `VecKokkosPlaceArray()`.
1327: .seealso: [](ch_vectors), `Vec`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecKokkosPlaceArray()`
1328: @*/
1329: PetscErrorCode VecKokkosResetArray(Vec v)
1330: {
1331: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1333: PetscFunctionBegin;
1334: VecErrorIfNotKokkos(v);
1335: // User wants to unhook the provided device array. Sync it so that user can get the latest
1336: PetscCall(KokkosDualViewSync<DefaultMemorySpace>(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1337: // Put the unplaced device view back
1338: PetscCallCXX(veckok->v_dual = PetscScalarKokkosDualView(veckok->unplaced_d, veckok->v_dual.view_host()));
1339: // Indicate that unplaced_d has the latest value before replacing.
1340: PetscCallCXX(veckok->v_dual.modify_device());
1341: PetscFunctionReturn(PETSC_SUCCESS);
1342: }
1344: /* Replace the array in vin with a[] that must be allocated by PetscMalloc. a[] is owned by vin afterwards. */
1345: PetscErrorCode VecReplaceArray_SeqKokkos(Vec vin, const PetscScalar *a)
1346: {
1347: Vec_Seq *vecseq = (Vec_Seq *)vin->data;
1348: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(vin->spptr);
1350: PetscFunctionBegin;
1351: /* Make sure the users array has the latest values */
1352: if (vecseq->array != vecseq->array_allocated) PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1353: PetscCall(VecReplaceArray_Seq(vin, a));
1354: PetscCall(veckok->UpdateArray<HostMirrorMemorySpace>(vecseq->array));
1355: PetscFunctionReturn(PETSC_SUCCESS);
1356: }
1358: /* Maps the local portion of vector v into vector w */
1359: PetscErrorCode VecGetLocalVector_SeqKokkos(Vec v, Vec w)
1360: {
1361: Vec_Seq *vecseq = static_cast<Vec_Seq *>(w->data);
1362: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(w->spptr);
1364: PetscFunctionBegin;
1365: PetscCheckTypeNames(w, VECSEQKOKKOS, VECMPIKOKKOS);
1366: /* Destroy w->data, w->spptr */
1367: if (vecseq) {
1368: PetscCall(PetscFree(vecseq->array_allocated));
1369: PetscCall(PetscFree(w->data));
1370: }
1371: delete veckok;
1373: /* Replace with v's */
1374: w->data = v->data;
1375: w->spptr = v->spptr;
1376: PetscCall(PetscObjectStateIncrease((PetscObject)w));
1377: PetscFunctionReturn(PETSC_SUCCESS);
1378: }
1380: PetscErrorCode VecRestoreLocalVector_SeqKokkos(Vec v, Vec w)
1381: {
1382: PetscFunctionBegin;
1383: PetscCheckTypeNames(w, VECSEQKOKKOS, VECMPIKOKKOS);
1384: v->data = w->data;
1385: v->spptr = w->spptr;
1386: PetscCall(PetscObjectStateIncrease((PetscObject)v));
1387: /* TODO: need to think if setting w->data/spptr to NULL is safe */
1388: w->data = NULL;
1389: w->spptr = NULL;
1390: PetscFunctionReturn(PETSC_SUCCESS);
1391: }
1393: PetscErrorCode VecGetArray_SeqKokkos(Vec v, PetscScalar **a)
1394: {
1395: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1397: PetscFunctionBegin;
1398: PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1399: *a = *((PetscScalar **)v->data);
1400: PetscFunctionReturn(PETSC_SUCCESS);
1401: }
1403: PetscErrorCode VecRestoreArray_SeqKokkos(Vec v, PetscScalar **a)
1404: {
1405: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1407: PetscFunctionBegin;
1408: PetscCallCXX(veckok->v_dual.modify_host());
1409: PetscFunctionReturn(PETSC_SUCCESS);
1410: }
1412: /* Get array on host to overwrite, so no need to sync host. In VecRestoreArrayWrite() we will mark host is modified. */
1413: PetscErrorCode VecGetArrayWrite_SeqKokkos(Vec v, PetscScalar **a)
1414: {
1415: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1417: PetscFunctionBegin;
1418: PetscCallCXX(veckok->v_dual.clear_sync_state());
1419: *a = veckok->v_dual.view_host().data();
1420: PetscFunctionReturn(PETSC_SUCCESS);
1421: }
1423: PetscErrorCode VecGetArrayAndMemType_SeqKokkos(Vec v, PetscScalar **a, PetscMemType *mtype)
1424: {
1425: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1427: PetscFunctionBegin;
1428: /* Always return up-to-date in the default memory space */
1429: PetscCall(KokkosDualViewSync<DefaultMemorySpace>(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1430: *a = veckok->v_dual.view_device().data();
1431: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; // Could be PETSC_MEMTYPE_HOST when Kokkos was not configured with cuda etc.
1432: PetscFunctionReturn(PETSC_SUCCESS);
1433: }
1435: PetscErrorCode VecRestoreArrayAndMemType_SeqKokkos(Vec v, PetscScalar **a)
1436: {
1437: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1439: PetscFunctionBegin;
1440: if (std::is_same<DefaultMemorySpace, HostMirrorMemorySpace>::value) {
1441: PetscCallCXX(veckok->v_dual.modify_host());
1442: } else {
1443: PetscCallCXX(veckok->v_dual.modify_device());
1444: }
1445: PetscFunctionReturn(PETSC_SUCCESS);
1446: }
1448: PetscErrorCode VecGetArrayWriteAndMemType_SeqKokkos(Vec v, PetscScalar **a, PetscMemType *mtype)
1449: {
1450: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1452: PetscFunctionBegin;
1453: // Since the data will be overwritten, we clear the sync state to suppress potential memory copying in sync'ing
1454: PetscCallCXX(veckok->v_dual.clear_sync_state()); // So that in restore, we can safely modify_device()
1455: PetscCall(KokkosDualViewSync<DefaultMemorySpace>(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1456: *a = veckok->v_dual.view_device().data();
1457: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; // Could be PETSC_MEMTYPE_HOST when Kokkos was not configured with cuda etc.
1458: PetscFunctionReturn(PETSC_SUCCESS);
1459: }
1461: /* Copy xin's sync state to y */
1462: static PetscErrorCode VecCopySyncState_Kokkos_Private(Vec xin, Vec yout)
1463: {
1464: Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(xin->spptr);
1465: Vec_Kokkos *ykok = static_cast<Vec_Kokkos *>(yout->spptr);
1467: PetscFunctionBegin;
1468: PetscCallCXX(ykok->v_dual.clear_sync_state());
1469: if (xkok->v_dual.need_sync_host()) {
1470: PetscCallCXX(ykok->v_dual.modify_device());
1471: } else if (xkok->v_dual.need_sync_device()) {
1472: PetscCallCXX(ykok->v_dual.modify_host());
1473: }
1474: PetscFunctionReturn(PETSC_SUCCESS);
1475: }
1477: static PetscErrorCode VecCreateSeqKokkosWithArrays_Private(MPI_Comm, PetscInt, PetscInt, const PetscScalar[], const PetscScalar[], Vec *);
1479: /* Internal routine shared by VecGetSubVector_{SeqKokkos,MPIKokkos} */
1480: PetscErrorCode VecGetSubVector_Kokkos_Private(Vec x, PetscBool xIsMPI, IS is, Vec *y)
1481: {
1482: PetscBool contig;
1483: PetscInt n, N, start, bs;
1484: MPI_Comm comm;
1485: Vec z;
1487: PetscFunctionBegin;
1488: PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
1489: PetscCall(ISGetLocalSize(is, &n));
1490: PetscCall(ISGetSize(is, &N));
1491: PetscCall(VecGetSubVectorContiguityAndBS_Private(x, is, &contig, &start, &bs));
1493: if (contig) { /* We can do a no-copy (in-place) implementation with y sharing x's arrays */
1494: Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(x->spptr);
1495: const PetscScalar *array_h = xkok->v_dual.view_host().data() + start;
1496: const PetscScalar *array_d = xkok->v_dual.view_device().data() + start;
1498: /* These calls assume the input arrays are synced */
1499: if (xIsMPI) PetscCall(VecCreateMPIKokkosWithArrays_Private(comm, bs, n, N, array_h, array_d, &z)); /* x could be MPI even when x's comm size = 1 */
1500: else PetscCall(VecCreateSeqKokkosWithArrays_Private(comm, bs, n, array_h, array_d, &z));
1502: PetscCall(VecCopySyncState_Kokkos_Private(x, z)); /* Copy x's sync state to z */
1504: /* This is relevant only in debug mode */
1505: PetscInt state = 0;
1506: PetscCall(VecLockGet(x, &state));
1507: if (state) { /* x is either in read or read/write mode, therefore z, overlapped with x, can only be in read mode */
1508: PetscCall(VecLockReadPush(z));
1509: }
1511: z->ops->placearray = NULL; /* z's arrays can't be replaced, because z does not own them */
1512: z->ops->replacearray = NULL;
1514: } else { /* Have to create a VecScatter and a stand-alone vector */
1515: PetscCall(VecGetSubVectorThroughVecScatter_Private(x, is, bs, &z));
1516: }
1517: *y = z;
1518: PetscFunctionReturn(PETSC_SUCCESS);
1519: }
1521: static PetscErrorCode VecGetSubVector_SeqKokkos(Vec x, IS is, Vec *y)
1522: {
1523: PetscFunctionBegin;
1524: PetscCall(VecGetSubVector_Kokkos_Private(x, PETSC_FALSE, is, y));
1525: PetscFunctionReturn(PETSC_SUCCESS);
1526: }
1528: /* Restore subvector y to x */
1529: PetscErrorCode VecRestoreSubVector_SeqKokkos(Vec x, IS is, Vec *y)
1530: {
1531: VecScatter vscat;
1532: PETSC_UNUSED PetscObjectState dummystate = 0;
1533: PetscBool unchanged;
1535: PetscFunctionBegin;
1536: PetscCall(PetscObjectComposedDataGetInt((PetscObject)*y, VecGetSubVectorSavedStateId, dummystate, unchanged));
1537: if (unchanged) PetscFunctionReturn(PETSC_SUCCESS); /* If y's state has not changed since VecGetSubVector(), we only need to destroy it */
1539: PetscCall(PetscObjectQuery((PetscObject)*y, "VecGetSubVector_Scatter", (PetscObject *)&vscat));
1540: if (vscat) {
1541: PetscCall(VecScatterBegin(vscat, *y, x, INSERT_VALUES, SCATTER_REVERSE));
1542: PetscCall(VecScatterEnd(vscat, *y, x, INSERT_VALUES, SCATTER_REVERSE));
1543: } else { /* y and x's (host and device) arrays overlap */
1544: Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(x->spptr);
1545: Vec_Kokkos *ykok = static_cast<Vec_Kokkos *>((*y)->spptr);
1546: PetscInt state;
1548: PetscCall(VecLockGet(x, &state));
1549: PetscCheck(!state, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "Vec x is locked for read-only or read/write access");
1551: /* The tricky part: one has to carefully sync the arrays */
1552: auto exec = PetscGetKokkosExecutionSpace();
1553: if (xkok->v_dual.need_sync_device()) { /* x's host has newer data */
1554: /* Move y's latest values to host (since y is just a subset of x) */
1555: PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(ykok->v_dual, exec));
1556: } else if (xkok->v_dual.need_sync_host()) { /* x's device has newer data */
1557: PetscCall(KokkosDualViewSync<DefaultMemorySpace>(ykok->v_dual, exec)); /* Move y's latest data to device */
1558: } else { /* x's host and device data is already sync'ed; Copy y's sync state to x */
1559: PetscCall(VecCopySyncState_Kokkos_Private(*y, x));
1560: }
1561: PetscCall(PetscObjectStateIncrease((PetscObject)x)); /* Since x is updated */
1562: }
1563: PetscFunctionReturn(PETSC_SUCCESS);
1564: }
1566: static PetscErrorCode VecSetPreallocationCOO_SeqKokkos(Vec x, PetscCount ncoo, const PetscInt coo_i[])
1567: {
1568: Vec_Seq *vecseq = static_cast<Vec_Seq *>(x->data);
1569: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(x->spptr);
1570: PetscInt m;
1572: PetscFunctionBegin;
1573: PetscCall(VecSetPreallocationCOO_Seq(x, ncoo, coo_i));
1574: PetscCall(VecGetLocalSize(x, &m));
1575: PetscCall(veckok->SetUpCOO(vecseq, m));
1576: PetscFunctionReturn(PETSC_SUCCESS);
1577: }
1579: static PetscErrorCode VecSetValuesCOO_SeqKokkos(Vec x, const PetscScalar v[], InsertMode imode)
1580: {
1581: Vec_Seq *vecseq = static_cast<Vec_Seq *>(x->data);
1582: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(x->spptr);
1583: const PetscCountKokkosView &jmap1 = veckok->jmap1_d;
1584: const PetscCountKokkosView &perm1 = veckok->perm1_d;
1585: PetscScalarKokkosView xv; /* View for vector x */
1586: ConstPetscScalarKokkosView vv; /* View for array v[] */
1587: PetscInt m;
1588: PetscMemType memtype;
1590: PetscFunctionBegin;
1591: PetscCall(VecGetLocalSize(x, &m));
1592: PetscCall(PetscGetMemType(v, &memtype));
1593: if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1594: PetscCallCXX(vv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstPetscScalarKokkosViewHost(v, vecseq->coo_n)));
1595: } else {
1596: PetscCallCXX(vv = ConstPetscScalarKokkosView(v, vecseq->coo_n)); /* Directly use v[]'s memory */
1597: }
1599: if (imode == INSERT_VALUES) PetscCall(VecGetKokkosViewWrite(x, &xv)); /* write vector */
1600: else PetscCall(VecGetKokkosView(x, &xv)); /* read & write vector */
1602: PetscCallCXX(Kokkos::parallel_for(
1603: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, m), KOKKOS_LAMBDA(const PetscInt &i) {
1604: PetscScalar sum = 0.0;
1605: for (PetscCount k = jmap1(i); k < jmap1(i + 1); k++) sum += vv(perm1(k));
1606: xv(i) = (imode == INSERT_VALUES ? 0.0 : xv(i)) + sum;
1607: }));
1609: if (imode == INSERT_VALUES) PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1610: else PetscCall(VecRestoreKokkosView(x, &xv));
1611: PetscFunctionReturn(PETSC_SUCCESS);
1612: }
1614: static PetscErrorCode VecCreate_SeqKokkos_Common(Vec); // forward declaration
1616: /* Duplicate layout etc but not the values in the input vector */
1617: static PetscErrorCode VecDuplicate_SeqKokkos(Vec win, Vec *v)
1618: {
1619: PetscFunctionBegin;
1620: PetscCall(VecDuplicate_Seq(win, v)); /* It also dups ops of win */
1621: PetscCall(VecCreate_SeqKokkos_Common(*v));
1622: (*v)->ops[0] = win->ops[0]; // recover the ops[]. We need to always follow ops in win
1623: PetscFunctionReturn(PETSC_SUCCESS);
1624: }
1626: static PetscErrorCode VecDestroy_SeqKokkos(Vec v)
1627: {
1628: Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1629: Vec_Seq *vecseq = static_cast<Vec_Seq *>(v->data);
1631: PetscFunctionBegin;
1632: delete veckok;
1633: v->spptr = NULL;
1634: if (vecseq) PetscCall(VecDestroy_Seq(v));
1635: PetscFunctionReturn(PETSC_SUCCESS);
1636: }
1638: // Shared by all VecCreate/Duplicate routines for VecSeqKokkos
1639: static PetscErrorCode VecCreate_SeqKokkos_Common(Vec v)
1640: {
1641: PetscFunctionBegin;
1642: v->ops->abs = VecAbs_SeqKokkos;
1643: v->ops->reciprocal = VecReciprocal_SeqKokkos;
1644: v->ops->pointwisemult = VecPointwiseMult_SeqKokkos;
1645: v->ops->min = VecMin_SeqKokkos;
1646: v->ops->max = VecMax_SeqKokkos;
1647: v->ops->sum = VecSum_SeqKokkos;
1648: v->ops->shift = VecShift_SeqKokkos;
1649: v->ops->norm = VecNorm_SeqKokkos;
1650: v->ops->scale = VecScale_SeqKokkos;
1651: v->ops->copy = VecCopy_SeqKokkos;
1652: v->ops->set = VecSet_SeqKokkos;
1653: v->ops->swap = VecSwap_SeqKokkos;
1654: v->ops->axpy = VecAXPY_SeqKokkos;
1655: v->ops->axpby = VecAXPBY_SeqKokkos;
1656: v->ops->axpbypcz = VecAXPBYPCZ_SeqKokkos;
1657: v->ops->pointwisedivide = VecPointwiseDivide_SeqKokkos;
1658: v->ops->setrandom = VecSetRandom_SeqKokkos;
1660: v->ops->dot = VecDot_SeqKokkos;
1661: v->ops->tdot = VecTDot_SeqKokkos;
1662: v->ops->mdot = VecMDot_SeqKokkos;
1663: v->ops->mtdot = VecMTDot_SeqKokkos;
1665: v->ops->dot_local = VecDot_SeqKokkos;
1666: v->ops->tdot_local = VecTDot_SeqKokkos;
1667: v->ops->mdot_local = VecMDot_SeqKokkos;
1668: v->ops->mtdot_local = VecMTDot_SeqKokkos;
1670: v->ops->norm_local = VecNorm_SeqKokkos;
1671: v->ops->maxpy = VecMAXPY_SeqKokkos;
1672: v->ops->aypx = VecAYPX_SeqKokkos;
1673: v->ops->waxpy = VecWAXPY_SeqKokkos;
1674: v->ops->dotnorm2 = VecDotNorm2_SeqKokkos;
1675: v->ops->errorwnorm = VecErrorWeightedNorms_SeqKokkos;
1676: v->ops->placearray = VecPlaceArray_SeqKokkos;
1677: v->ops->replacearray = VecReplaceArray_SeqKokkos;
1678: v->ops->resetarray = VecResetArray_SeqKokkos;
1679: v->ops->destroy = VecDestroy_SeqKokkos;
1680: v->ops->duplicate = VecDuplicate_SeqKokkos;
1681: v->ops->conjugate = VecConjugate_SeqKokkos;
1682: v->ops->getlocalvector = VecGetLocalVector_SeqKokkos;
1683: v->ops->restorelocalvector = VecRestoreLocalVector_SeqKokkos;
1684: v->ops->getlocalvectorread = VecGetLocalVector_SeqKokkos;
1685: v->ops->restorelocalvectorread = VecRestoreLocalVector_SeqKokkos;
1686: v->ops->getarraywrite = VecGetArrayWrite_SeqKokkos;
1687: v->ops->getarray = VecGetArray_SeqKokkos;
1688: v->ops->restorearray = VecRestoreArray_SeqKokkos;
1690: v->ops->getarrayandmemtype = VecGetArrayAndMemType_SeqKokkos;
1691: v->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqKokkos;
1692: v->ops->getarraywriteandmemtype = VecGetArrayWriteAndMemType_SeqKokkos;
1693: v->ops->getsubvector = VecGetSubVector_SeqKokkos;
1694: v->ops->restoresubvector = VecRestoreSubVector_SeqKokkos;
1696: v->ops->setpreallocationcoo = VecSetPreallocationCOO_SeqKokkos;
1697: v->ops->setvaluescoo = VecSetValuesCOO_SeqKokkos;
1699: v->offloadmask = PETSC_OFFLOAD_KOKKOS; // Mark this is a VECKOKKOS; We use this flag for cheap VECKOKKOS test.
1700: PetscFunctionReturn(PETSC_SUCCESS);
1701: }
1703: /*@C
1704: VecCreateSeqKokkosWithArray - Creates a Kokkos sequential array-style vector,
1705: where the user provides the array space to store the vector values. The array
1706: provided must be a device array.
1708: Collective
1710: Input Parameters:
1711: + comm - the communicator, should be PETSC_COMM_SELF
1712: . bs - the block size
1713: . n - the vector length
1714: - darray - device memory where the vector elements are to be stored.
1716: Output Parameter:
1717: . v - the vector
1719: Notes:
1720: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1721: same type as an existing vector.
1723: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1724: The user should not free the array until the vector is destroyed.
1726: Level: intermediate
1728: .seealso: `VecCreateMPICUDAWithArray()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`,
1729: `VecCreateGhost()`, `VecCreateSeq()`, `VecCreateSeqWithArray()`,
1730: `VecCreateMPIWithArray()`
1731: @*/
1732: PetscErrorCode VecCreateSeqKokkosWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar darray[], Vec *v)
1733: {
1734: PetscMPIInt size;
1735: Vec w;
1736: Vec_Kokkos *veckok = NULL;
1737: PetscScalar *harray;
1739: PetscFunctionBegin;
1740: PetscCallMPI(MPI_Comm_size(comm, &size));
1741: PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create VECSEQKOKKOS on more than one process");
1743: PetscCall(PetscKokkosInitializeCheck());
1744: PetscCall(VecCreate(comm, &w));
1745: PetscCall(VecSetSizes(w, n, n));
1746: PetscCall(VecSetBlockSize(w, bs));
1747: if (!darray) { /* Allocate memory ourself if user provided NULL */
1748: PetscCall(VecSetType(w, VECSEQKOKKOS));
1749: } else {
1750: /* Build a VECSEQ, get its harray, and then build Vec_Kokkos along with darray */
1751: if (std::is_same<DefaultMemorySpace, HostMirrorMemorySpace>::value) {
1752: harray = const_cast<PetscScalar *>(darray);
1753: PetscCall(VecCreate_Seq_Private(w, harray)); /* Build a sequential vector with harray */
1754: } else {
1755: PetscCall(VecSetType(w, VECSEQ));
1756: harray = static_cast<Vec_Seq *>(w->data)->array;
1757: }
1758: PetscCall(PetscObjectChangeTypeName((PetscObject)w, VECSEQKOKKOS)); /* Change it to Kokkos */
1759: PetscCall(VecCreate_SeqKokkos_Common(w));
1760: PetscCallCXX(veckok = new Vec_Kokkos{n, harray, const_cast<PetscScalar *>(darray)});
1761: PetscCallCXX(veckok->v_dual.modify_device()); /* Mark the device is modified */
1762: w->spptr = static_cast<void *>(veckok);
1763: }
1764: *v = w;
1765: PetscFunctionReturn(PETSC_SUCCESS);
1766: }
1768: PetscErrorCode VecConvert_Seq_SeqKokkos_inplace(Vec v)
1769: {
1770: Vec_Seq *vecseq;
1772: PetscFunctionBegin;
1773: PetscCall(PetscKokkosInitializeCheck());
1774: PetscCall(PetscLayoutSetUp(v->map));
1775: PetscCall(PetscObjectChangeTypeName((PetscObject)v, VECSEQKOKKOS));
1776: PetscCall(VecCreate_SeqKokkos_Common(v));
1777: PetscCheck(!v->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "v->spptr not NULL");
1778: vecseq = static_cast<Vec_Seq *>(v->data);
1779: PetscCallCXX(v->spptr = new Vec_Kokkos(v->map->n, vecseq->array, NULL));
1780: PetscFunctionReturn(PETSC_SUCCESS);
1781: }
1783: // Create a VECSEQKOKKOS with layout and arrays
1784: static PetscErrorCode VecCreateSeqKokkosWithLayoutAndArrays_Private(PetscLayout map, const PetscScalar harray[], const PetscScalar darray[], Vec *v)
1785: {
1786: Vec w;
1788: PetscFunctionBegin;
1789: if (map->n > 0) PetscCheck(darray, map->comm, PETSC_ERR_ARG_WRONG, "darray cannot be NULL");
1790: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1791: PetscCheck(harray == darray, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "harray and darray must be the same");
1792: #endif
1793: PetscCall(VecCreateSeqWithLayoutAndArray_Private(map, harray, &w));
1794: PetscCall(PetscObjectChangeTypeName((PetscObject)w, VECSEQKOKKOS)); // Change it to VECKOKKOS
1795: PetscCall(VecCreate_SeqKokkos_Common(w));
1796: PetscCallCXX(w->spptr = new Vec_Kokkos(map->n, const_cast<PetscScalar *>(harray), const_cast<PetscScalar *>(darray)));
1797: *v = w;
1798: PetscFunctionReturn(PETSC_SUCCESS);
1799: }
1801: /*
1802: VecCreateSeqKokkosWithArrays_Private - Creates a Kokkos sequential array-style vector
1803: with user-provided arrays on host and device.
1805: Collective
1807: Input Parameter:
1808: + comm - the communicator, should be PETSC_COMM_SELF
1809: . bs - the block size
1810: . n - the vector length
1811: . harray - host memory where the vector elements are to be stored.
1812: - darray - device memory where the vector elements are to be stored.
1814: Output Parameter:
1815: . v - the vector
1817: Notes:
1818: Unlike VecCreate{Seq,MPI}CUDAWithArrays(), this routine is private since we do not expect users to use it directly.
1820: If there is no device, then harray and darray must be the same.
1821: If n is not zero, then harray and darray must be allocated.
1822: After the call, the created vector is supposed to be in a synchronized state, i.e.,
1823: we suppose harray and darray have the same data.
1825: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1826: Caller should not free the array until the vector is destroyed.
1827: */
1828: static PetscErrorCode VecCreateSeqKokkosWithArrays_Private(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar harray[], const PetscScalar darray[], Vec *v)
1829: {
1830: PetscMPIInt size;
1831: PetscLayout map;
1833: PetscFunctionBegin;
1834: PetscCall(PetscKokkosInitializeCheck());
1835: PetscCallMPI(MPI_Comm_size(comm, &size));
1836: PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create VECSEQKOKKOS on more than one process");
1837: PetscCall(PetscLayoutCreateFromSizes(comm, n, n, bs, &map));
1838: PetscCall(VecCreateSeqKokkosWithLayoutAndArrays_Private(map, harray, darray, v));
1839: PetscCall(PetscLayoutDestroy(&map));
1840: PetscFunctionReturn(PETSC_SUCCESS);
1841: }
1843: /* TODO: ftn-auto generates veckok.kokkosf.c */
1844: /*@C
1845: VecCreateSeqKokkos - Creates a standard, sequential array-style vector.
1847: Collective
1849: Input Parameters:
1850: + comm - the communicator, should be `PETSC_COMM_SELF`
1851: - n - the vector length
1853: Output Parameter:
1854: . v - the vector
1856: Notes:
1857: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1858: same type as an existing vector.
1860: Level: intermediate
1862: .seealso: `VecCreateMPI()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`
1863: @*/
1864: PetscErrorCode VecCreateSeqKokkos(MPI_Comm comm, PetscInt n, Vec *v)
1865: {
1866: PetscFunctionBegin;
1867: PetscCall(PetscKokkosInitializeCheck());
1868: PetscCall(VecCreate(comm, v));
1869: PetscCall(VecSetSizes(*v, n, n));
1870: PetscCall(VecSetType(*v, VECSEQKOKKOS)); /* Calls VecCreate_SeqKokkos */
1871: PetscFunctionReturn(PETSC_SUCCESS);
1872: }
1874: // Duplicate a VECSEQKOKKOS
1875: static PetscErrorCode VecDuplicateVecs_SeqKokkos_GEMV(Vec w, PetscInt m, Vec *V[])
1876: {
1877: PetscInt64 lda; // use 64-bit as we will do "m * lda"
1878: PetscScalar *array_h, *array_d;
1879: PetscLayout map;
1880: PetscScalarKokkosDualView w_dual;
1882: PetscFunctionBegin;
1883: PetscCall(PetscKokkosInitializeCheck()); // as we'll call kokkos_malloc()
1884: PetscCall(PetscMalloc1(m, V));
1885: PetscCall(VecGetLayout(w, &map));
1886: VecGetLocalSizeAligned(w, 64, &lda); // get in lda the 64-bytes aligned local size
1887: // See comments in VecCreate_SeqKokkos() on why we use DualView to allocate the memory
1888: PetscCallCXX(w_dual = PetscScalarKokkosDualView("VecDuplicateVecs", m * lda)); // Kokkos init's w_dual to zero
1890: // create the m vectors with raw arrays from v_dual
1891: array_h = w_dual.view_host().data();
1892: array_d = w_dual.view_device().data();
1893: for (PetscInt i = 0; i < m; i++) {
1894: Vec v;
1895: PetscCall(VecCreateSeqKokkosWithLayoutAndArrays_Private(map, &array_h[i * lda], &array_d[i * lda], &v));
1896: PetscCall(PetscObjectListDuplicate(((PetscObject)w)->olist, &((PetscObject)v)->olist));
1897: PetscCall(PetscFunctionListDuplicate(((PetscObject)w)->qlist, &((PetscObject)v)->qlist));
1898: v->ops[0] = w->ops[0];
1899: (*V)[i] = v;
1900: }
1902: // let the first vector own the long DualView, so when it is destroyed it will free the v_dual
1903: if (m) {
1904: Vec v = (*V)[0];
1906: static_cast<Vec_Kokkos *>(v->spptr)->w_dual = w_dual; // stash the memory
1907: // disable replacearray of the first vector, as freeing its memory also frees others in the group.
1908: // But replacearray of others is ok, as they don't own their array.
1909: if (m > 1) v->ops->replacearray = VecReplaceArray_Default_GEMV_Error;
1910: }
1911: PetscFunctionReturn(PETSC_SUCCESS);
1912: }
1914: /*MC
1915: VECSEQKOKKOS - VECSEQKOKKOS = "seqkokkos" - The basic sequential vector, modified to use Kokkos
1917: Options Database Keys:
1918: . -vec_type seqkokkos - sets the vector type to VECSEQKOKKOS during a call to VecSetFromOptions()
1920: Level: beginner
1922: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`
1923: M*/
1924: PetscErrorCode VecCreate_SeqKokkos(Vec v)
1925: {
1926: PetscBool mdot_use_gemv = PETSC_TRUE;
1927: PetscBool maxpy_use_gemv = PETSC_FALSE; // default is false as we saw bad performance with vendors' GEMV with tall skinny matrices.
1928: PetscScalarKokkosDualView v_dual;
1930: PetscFunctionBegin;
1931: PetscCall(PetscKokkosInitializeCheck());
1932: PetscCall(PetscLayoutSetUp(v->map));
1934: // Use DualView to allocate both the host array and the device array.
1935: // DualView first allocates the device array and then mirrors it to host.
1936: // With unified memory (e.g., on AMD MI300A APU), the two arrays are the same, with the host array
1937: // sharing the device array allocated by hipMalloc(), but not the other way around if we call
1938: // VecCreate_Seq() first and let the device array share the host array allocated by malloc().
1939: // hipMalloc() has an advantage over malloc() as it gives great binding and page size settings automatically, see
1940: // https://hpc.llnl.gov/documentation/user-guides/using-el-capitan-systems/introduction-and-quickstart/pro-tips
1941: PetscCallCXX(v_dual = PetscScalarKokkosDualView("v_dual", v->map->n)); // Kokkos init's v_dual to zero
1943: PetscCall(VecCreate_Seq_Private(v, v_dual.view_host().data()));
1944: PetscCall(PetscObjectChangeTypeName((PetscObject)v, VECSEQKOKKOS));
1945: PetscCall(VecCreate_SeqKokkos_Common(v));
1946: PetscCheck(!v->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "v->spptr not NULL");
1947: PetscCallCXX(v->spptr = new Vec_Kokkos(v_dual));
1949: PetscCall(PetscOptionsGetBool(NULL, NULL, "-vec_mdot_use_gemv", &mdot_use_gemv, NULL));
1950: PetscCall(PetscOptionsGetBool(NULL, NULL, "-vec_maxpy_use_gemv", &maxpy_use_gemv, NULL));
1952: // allocate multiple vectors together
1953: if (mdot_use_gemv || maxpy_use_gemv) v->ops[0].duplicatevecs = VecDuplicateVecs_SeqKokkos_GEMV;
1955: if (mdot_use_gemv) {
1956: v->ops[0].mdot = VecMDot_SeqKokkos_GEMV;
1957: v->ops[0].mtdot = VecMTDot_SeqKokkos_GEMV;
1958: v->ops[0].mdot_local = VecMDot_SeqKokkos_GEMV;
1959: v->ops[0].mtdot_local = VecMTDot_SeqKokkos_GEMV;
1960: }
1961: if (maxpy_use_gemv) v->ops[0].maxpy = VecMAXPY_SeqKokkos_GEMV;
1962: PetscFunctionReturn(PETSC_SUCCESS);
1963: }