Actual source code: mpikok.kokkos.cxx


  2: /*
  3:    This file contains routines for Parallel vector operations.
  4:  */

  6: #include <petscvec_kokkos.hpp>
  7: #include <petsc/private/deviceimpl.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
 10: #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
 11: #include <petscsf.h>

 13: PetscErrorCode VecDestroy_MPIKokkos(Vec v)
 14: {
 15:   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);

 17:   delete veckok;
 18:   VecDestroy_MPI(v);
 19:   return 0;
 20: }

 22: PetscErrorCode VecNorm_MPIKokkos(Vec xin, NormType type, PetscReal *z)
 23: {
 24:   PetscReal sum, work = 0.0;

 26:   if (type == NORM_2 || type == NORM_FROBENIUS) {
 27:     VecNorm_SeqKokkos(xin, NORM_2, &work);
 28:     work *= work;
 29:     MPIU_Allreduce(&work, &sum, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
 30:     *z = PetscSqrtReal(sum);
 31:   } else if (type == NORM_1) {
 32:     /* Find the local part */
 33:     VecNorm_SeqKokkos(xin, NORM_1, &work);
 34:     /* Find the global sum */
 35:     MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
 36:   } else if (type == NORM_INFINITY) {
 37:     /* Find the local max */
 38:     VecNorm_SeqKokkos(xin, NORM_INFINITY, &work);
 39:     /* Find the global max */
 40:     MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)xin));
 41:   } else if (type == NORM_1_AND_2) {
 42:     PetscReal temp[2];
 43:     VecNorm_SeqKokkos(xin, NORM_1, temp);
 44:     VecNorm_SeqKokkos(xin, NORM_2, temp + 1);
 45:     temp[1] = temp[1] * temp[1];
 46:     MPIU_Allreduce(temp, z, 2, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
 47:     z[1] = PetscSqrtReal(z[1]);
 48:   }
 49:   return 0;
 50: }
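
For reference, the reductions above assemble each global norm from the per-rank local norms; writing x_p for the sub-vector owned by rank p:

\[ \|x\|_2 = \Big(\sum_p \|x_p\|_2^2\Big)^{1/2}, \qquad \|x\|_1 = \sum_p \|x_p\|_1, \qquad \|x\|_\infty = \max_p \|x_p\|_\infty \]

This is why the local 2-norm is squared before the MPIU_SUM reduction and the square root is taken only afterwards; the NORM_1_AND_2 branch applies the first two identities in a single reduction of length 2.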

 52: /* z = y^H x */
 53: PetscErrorCode VecDot_MPIKokkos(Vec xin, Vec yin, PetscScalar *z)
 54: {
 55:   PetscScalar sum, work;

 57:   VecDot_SeqKokkos(xin, yin, &work);
 58:   MPIU_Allreduce(&work, &sum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
 59:   *z = sum;
 60:   return 0;
 61: }

 63: PetscErrorCode VecMDot_MPIKokkos(Vec xin, PetscInt nv, const Vec y[], PetscScalar *z)
 64: {
 65:   PetscScalar awork[128], *work = awork;

 67:   if (nv > 128) PetscMalloc1(nv, &work);
 68:   VecMDot_SeqKokkos(xin, nv, y, work);
 69:   MPIU_Allreduce(work, z, nv, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
 70:   if (nv > 128) PetscFree(work);
 71:   return 0;
 72: }

 74: /* z = y^T x */
 75: PetscErrorCode VecTDot_MPIKokkos(Vec xin, Vec yin, PetscScalar *z)
 76: {
 77:   PetscScalar sum, work;

 79:   VecTDot_SeqKokkos(xin, yin, &work);
 80:   MPIU_Allreduce(&work, &sum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
 81:   *z = sum;
 82:   return 0;
 83: }

 85: PetscErrorCode VecMTDot_MPIKokkos(Vec xin, PetscInt nv, const Vec y[], PetscScalar *z)
 86: {
 87:   PetscScalar awork[128], *work = awork;

 89:   if (nv > 128) PetscMalloc1(nv, &work);
 90:   VecMTDot_SeqKokkos(xin, nv, y, work);
 91:   MPIU_Allreduce(work, z, nv, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
 92:   if (nv > 128) PetscFree(work);
 93:   return 0;
 94: }

 96: PetscErrorCode VecMax_MPIKokkos(Vec xin, PetscInt *idx, PetscReal *z)
 97: {
 98:   PetscReal work;

100:   /* Find the local max */
101:   VecMax_SeqKokkos(xin, idx, &work);
102: #if defined(PETSC_HAVE_MPIUNI)
103:   *z = work;
104: #else
105:   /* Find the global max */
106:   if (!idx) { /* User does not need idx */
107:     MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)xin));
108:   } else {
109:     struct {
110:       PetscReal v;
111:       PetscInt  i;
112:     } in, out;

114:     in.v = work;
115:     in.i = *idx + xin->map->rstart;
116:     MPIU_Allreduce(&in, &out, 1, MPIU_REAL_INT, MPIU_MAXLOC, PetscObjectComm((PetscObject)xin));
117:     *z   = out.v;
118:     *idx = out.i;
119:   }
120: #endif
121:   return 0;
122: }
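
The value-plus-index reduction above uses MPI's MAXLOC operation on (real, int) pairs. The following is a minimal sketch of that pattern in plain MPI, assuming PetscReal is double and the index fits in an int (PETSc's MPIU_REAL_INT and MPIU_MAXLOC adapt to the configured precision and index size); the helper name global_argmax is purely illustrative.

#include <mpi.h>

/* Sketch of the MAXLOC pattern: every rank obtains the global maximum value
   together with the (global) index at which it occurs. */
static void global_argmax(double local_val, int local_idx, MPI_Comm comm, double *gval, int *gidx)
{
  struct {
    double v; /* local maximum value */
    int    i; /* its global index (local index + ownership offset) */
  } in, out;

  in.v = local_val;
  in.i = local_idx;
  MPI_Allreduce(&in, &out, 1, MPI_DOUBLE_INT, MPI_MAXLOC, comm);
  *gval = out.v;
  *gidx = out.i;
}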

124: PetscErrorCode VecMin_MPIKokkos(Vec xin, PetscInt *idx, PetscReal *z)
125: {
126:   PetscReal work;

128:   /* Find the local Min */
129:   VecMin_SeqKokkos(xin, idx, &work);
130: #if defined(PETSC_HAVE_MPIUNI)
131:   *z = work;
132: #else
133:   /* Find the global Min */
134:   if (!idx) {
135:     MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MIN, PetscObjectComm((PetscObject)xin));
136:   } else {
137:     struct {
138:       PetscReal v;
139:       PetscInt  i;
140:     } in, out;

142:     in.v = work;
143:     in.i = *idx + xin->map->rstart;
144:     MPIU_Allreduce(&in, &out, 1, MPIU_REAL_INT, MPIU_MINLOC, PetscObjectComm((PetscObject)xin));
145:     *z   = out.v;
146:     *idx = out.i;
147:   }
148: #endif
149:   return 0;
150: }

152: PetscErrorCode VecDuplicate_MPIKokkos(Vec win, Vec *vv)
153: {
154:   Vec         v;
155:   Vec_MPI    *vecmpi;
156:   Vec_Kokkos *veckok;

158:   /* Reuse VecDuplicate_MPI, which does the bulk of the setup work */
159:   VecDuplicate_MPI(win, &v); /* after the call, v is a VECMPI, with data zeroed */
160:   PetscObjectChangeTypeName((PetscObject)v, VECMPIKOKKOS);
161:   PetscMemcpy(v->ops, win->ops, sizeof(struct _VecOps));

163:   /* Build the Vec_Kokkos struct */
164:   vecmpi = static_cast<Vec_MPI *>(v->data);
165:   veckok = new Vec_Kokkos(v->map->n, vecmpi->array);
166:   Kokkos::deep_copy(veckok->v_dual.view_device(), 0.0);
167:   v->spptr       = veckok;
168:   v->offloadmask = PETSC_OFFLOAD_KOKKOS;
169:   *vv            = v;
170:   return 0;
171: }

173: PetscErrorCode VecDotNorm2_MPIKokkos(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
174: {
175:   PetscScalar work[2], sum[2];

177:   VecDotNorm2_SeqKokkos(s, t, work, work + 1);
178:   MPIU_Allreduce(work, sum, 2, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)s));
179:   *dp = sum[0];
180:   *nm = sum[1];
181:   return 0;
182: }

184: static PetscErrorCode VecGetSubVector_MPIKokkos(Vec x, IS is, Vec *y)
185: {
186:   VecGetSubVector_Kokkos_Private(x, PETSC_TRUE, is, y);
187:   return 0;
188: }

190: static PetscErrorCode VecSetPreallocationCOO_MPIKokkos(Vec x, PetscCount ncoo, const PetscInt coo_i[])
191: {
192:   Vec_MPI    *vecmpi = static_cast<Vec_MPI *>(x->data);
193:   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(x->spptr);
194:   PetscInt    m;

196:   VecGetLocalSize(x, &m);
197:   VecSetPreallocationCOO_MPI(x, ncoo, coo_i);
198:   veckok->SetUpCOO(vecmpi, m);
199:   return 0;
200: }

202: static PetscErrorCode VecSetValuesCOO_MPIKokkos(Vec x, const PetscScalar v[], InsertMode imode)
203: {
204:   Vec_MPI                    *vecmpi  = static_cast<Vec_MPI *>(x->data);
205:   Vec_Kokkos                 *veckok  = static_cast<Vec_Kokkos *>(x->spptr);
206:   const PetscCountKokkosView &jmap1   = veckok->jmap1_d;
207:   const PetscCountKokkosView &perm1   = veckok->perm1_d;
208:   const PetscCountKokkosView &imap2   = veckok->imap2_d;
209:   const PetscCountKokkosView &jmap2   = veckok->jmap2_d;
210:   const PetscCountKokkosView &perm2   = veckok->perm2_d;
211:   const PetscCountKokkosView &Cperm   = veckok->Cperm_d;
212:   PetscScalarKokkosView      &sendbuf = veckok->sendbuf_d;
213:   PetscScalarKokkosView      &recvbuf = veckok->recvbuf_d;
214:   PetscScalarKokkosView       xv;
215:   ConstPetscScalarKokkosView  vv;
216:   PetscMemType                memtype;
217:   PetscInt                    m;

219:   VecGetLocalSize(x, &m);
220:   PetscGetMemType(v, &memtype);
221:   if (PetscMemTypeHost(memtype)) { /* If user gave v[] on the host, mirror it to the device memory space (copies only if the spaces differ) */
222:     vv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstPetscScalarKokkosViewHost(v, vecmpi->coo_n));
223:   } else {
224:     vv = ConstPetscScalarKokkosView(v, vecmpi->coo_n); /* Directly use v[]'s memory */
225:   }

227:   /* Pack entries to be sent to remote */
228:   Kokkos::parallel_for(
229:     vecmpi->sendlen, KOKKOS_LAMBDA(const PetscCount i) { sendbuf(i) = vv(Cperm(i)); });
230:   PetscSFReduceWithMemTypeBegin(vecmpi->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_KOKKOS, sendbuf.data(), PETSC_MEMTYPE_KOKKOS, recvbuf.data(), MPI_REPLACE);

232:   if (imode == INSERT_VALUES) VecGetKokkosViewWrite(x, &xv); /* write vector */
233:   else VecGetKokkosView(x, &xv);                             /* read & write vector */

235:   Kokkos::parallel_for(
236:     m, KOKKOS_LAMBDA(const PetscCount i) {
237:       PetscScalar sum = 0.0;
238:       for (PetscCount k = jmap1(i); k < jmap1(i + 1); k++) sum += vv(perm1(k));
239:       xv(i) = (imode == INSERT_VALUES ? 0.0 : xv(i)) + sum;
240:     });

242:   PetscSFReduceEnd(vecmpi->coo_sf, MPIU_SCALAR, sendbuf.data(), recvbuf.data(), MPI_REPLACE);

244:   /* Add received remote entries */
245:   Kokkos::parallel_for(
246:     vecmpi->nnz2, KOKKOS_LAMBDA(PetscCount i) {
247:       for (PetscCount k = jmap2(i); k < jmap2(i + 1); k++) xv(imap2(i)) += recvbuf(perm2(k));
248:     });

250:   if (imode == INSERT_VALUES) VecRestoreKokkosViewWrite(x, &xv);
251:   else VecRestoreKokkosView(x, &xv);
252:   return 0;
253: }
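
The two COO routines above are reached through the public VecSetPreallocationCOO()/VecSetValuesCOO() interface. Below is a hedged user-side sketch, assuming x is an already created parallel Kokkos vector; the helper name, the PetscCall() error checking, and the wrap-around index choice are illustrative and not taken from this file.

#include <petscvec.h>

/* Sketch: each rank contributes one value to its own first entry and one to the
   first entry owned by the next rank, exercising the send/receive path above. */
static PetscErrorCode coo_demo(Vec x)
{
  PetscInt    rstart, rend, N;
  PetscInt    coo_i[2];
  PetscScalar coo_v[2] = {1.0, 2.0};

  PetscCall(VecGetOwnershipRange(x, &rstart, &rend));
  PetscCall(VecGetSize(x, &N));
  coo_i[0] = rstart;   /* an entry this rank owns */
  coo_i[1] = rend % N; /* first entry of the next rank (wraps to 0 on the last rank) */
  PetscCall(VecSetPreallocationCOO(x, 2, coo_i)); /* builds the jmap/perm/SF data used above */
  PetscCall(VecSetValuesCOO(x, coo_v, ADD_VALUES));
  return 0;
}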

255: static PetscErrorCode VecSetOps_MPIKokkos(Vec v)
256: {
257:   v->ops->abs             = VecAbs_SeqKokkos;
258:   v->ops->reciprocal      = VecReciprocal_SeqKokkos;
259:   v->ops->pointwisemult   = VecPointwiseMult_SeqKokkos;
260:   v->ops->setrandom       = VecSetRandom_SeqKokkos;
261:   v->ops->dotnorm2        = VecDotNorm2_MPIKokkos;
262:   v->ops->waxpy           = VecWAXPY_SeqKokkos;
263:   v->ops->norm            = VecNorm_MPIKokkos;
264:   v->ops->min             = VecMin_MPIKokkos;
265:   v->ops->max             = VecMax_MPIKokkos;
266:   v->ops->sum             = VecSum_SeqKokkos;
267:   v->ops->shift           = VecShift_SeqKokkos;
268:   v->ops->scale           = VecScale_SeqKokkos;
269:   v->ops->copy            = VecCopy_SeqKokkos;
270:   v->ops->set             = VecSet_SeqKokkos;
271:   v->ops->swap            = VecSwap_SeqKokkos;
272:   v->ops->axpy            = VecAXPY_SeqKokkos;
273:   v->ops->axpby           = VecAXPBY_SeqKokkos;
274:   v->ops->maxpy           = VecMAXPY_SeqKokkos;
275:   v->ops->aypx            = VecAYPX_SeqKokkos;
276:   v->ops->axpbypcz        = VecAXPBYPCZ_SeqKokkos;
277:   v->ops->pointwisedivide = VecPointwiseDivide_SeqKokkos;
278:   v->ops->placearray      = VecPlaceArray_SeqKokkos;
279:   v->ops->replacearray    = VecReplaceArray_SeqKokkos;
280:   v->ops->resetarray      = VecResetArray_SeqKokkos;

282:   v->ops->dot   = VecDot_MPIKokkos;
283:   v->ops->tdot  = VecTDot_MPIKokkos;
284:   v->ops->mdot  = VecMDot_MPIKokkos;
285:   v->ops->mtdot = VecMTDot_MPIKokkos;

287:   v->ops->dot_local   = VecDot_SeqKokkos;
288:   v->ops->tdot_local  = VecTDot_SeqKokkos;
289:   v->ops->mdot_local  = VecMDot_SeqKokkos;
290:   v->ops->mtdot_local = VecMTDot_SeqKokkos;

292:   v->ops->norm_local              = VecNorm_SeqKokkos;
293:   v->ops->duplicate               = VecDuplicate_MPIKokkos;
294:   v->ops->destroy                 = VecDestroy_MPIKokkos;
295:   v->ops->getlocalvector          = VecGetLocalVector_SeqKokkos;
296:   v->ops->restorelocalvector      = VecRestoreLocalVector_SeqKokkos;
297:   v->ops->getlocalvectorread      = VecGetLocalVector_SeqKokkos;
298:   v->ops->restorelocalvectorread  = VecRestoreLocalVector_SeqKokkos;
299:   v->ops->getarraywrite           = VecGetArrayWrite_SeqKokkos;
300:   v->ops->getarray                = VecGetArray_SeqKokkos;
301:   v->ops->restorearray            = VecRestoreArray_SeqKokkos;
302:   v->ops->getarrayandmemtype      = VecGetArrayAndMemType_SeqKokkos;
303:   v->ops->restorearrayandmemtype  = VecRestoreArrayAndMemType_SeqKokkos;
304:   v->ops->getarraywriteandmemtype = VecGetArrayWriteAndMemType_SeqKokkos;
305:   v->ops->getsubvector            = VecGetSubVector_MPIKokkos;
306:   v->ops->restoresubvector        = VecRestoreSubVector_SeqKokkos;

308:   v->ops->setpreallocationcoo = VecSetPreallocationCOO_MPIKokkos;
309:   v->ops->setvaluescoo        = VecSetValuesCOO_MPIKokkos;
310:   return 0;
311: }

313: /*MC
314:    VECMPIKOKKOS - VECMPIKOKKOS = "mpikokkos" - The basic parallel vector, modified to use Kokkos

316:    Options Database Keys:
317: . -vec_type mpikokkos - sets the vector type to VECMPIKOKKOS during a call to VecSetFromOptions()

319:   Level: beginner

321: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIKokkosWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`
322: M*/
323: PetscErrorCode VecCreate_MPIKokkos(Vec v)
324: {
325:   Vec_MPI    *vecmpi;
326:   Vec_Kokkos *veckok;

328:   PetscKokkosInitializeCheck();
329:   PetscLayoutSetUp(v->map);
330:   VecCreate_MPI(v); /* Calloc host array */

332:   vecmpi = static_cast<Vec_MPI *>(v->data);
333:   PetscObjectChangeTypeName((PetscObject)v, VECMPIKOKKOS);
334:   VecSetOps_MPIKokkos(v);
335:   veckok         = new Vec_Kokkos(v->map->n, vecmpi->array, NULL); /* Alloc device array but do not init it */
336:   v->spptr       = static_cast<void *>(veckok);
337:   v->offloadmask = PETSC_OFFLOAD_KOKKOS;
338:   return 0;
339: }
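
A user normally reaches this constructor through VecSetType() or the -vec_type mpikokkos option. A minimal creation sketch follows (error checking omitted to match the listing; the helper name and sizes are illustrative):

#include <petscvec.h>

/* Sketch: create a parallel Kokkos vector explicitly. */
static PetscErrorCode make_mpikokkos_vec(MPI_Comm comm, PetscInt nlocal, Vec *v)
{
  VecCreate(comm, v);
  VecSetSizes(*v, nlocal, PETSC_DECIDE);
  VecSetType(*v, VECMPIKOKKOS); /* dispatches to VecCreate_MPIKokkos() above */
  VecSet(*v, 1.0);              /* handled on the device by VecSet_SeqKokkos */
  return 0;
}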

341: /*@C
342:    VecCreateMPIKokkosWithArray - Creates a parallel, array-style vector,
343:    where the user provides the GPU array space to store the vector values.

345:    Collective

347:    Input Parameters:
348: +  comm  - the MPI communicator to use
349: .  bs    - block size, same meaning as VecSetBlockSize()
350: .  n     - local vector length, cannot be PETSC_DECIDE
351: .  N     - global vector length (or PETSC_DECIDE to have it calculated)
352: -  darray - the user-provided GPU array to store the vector values

354:    Output Parameter:
355: .  v - the vector

357:    Notes:
358:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
359:    same type as an existing vector.

361:    If the user-provided array is NULL, then VecKokkosPlaceArray() can be used
362:    at a later stage to SET the array for storing the vector values.

364:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
365:    The user should not free the array until the vector is destroyed.

367:    Level: intermediate

369: .seealso: `VecCreateSeqKokkosWithArray()`, `VecCreateMPIWithArray()`, `VecCreateSeqWithArray()`,
370:           `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
371:           `VecCreateMPI()`, `VecCreateGhostWithArray()`, `VecPlaceArray()`

373: @*/
374: PetscErrorCode VecCreateMPIKokkosWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, const PetscScalar darray[], Vec *v)
375: {
376:   Vec          w;
377:   Vec_Kokkos  *veckok;
378:   Vec_MPI     *vecmpi;
379:   PetscScalar *harray;

382:   PetscKokkosInitializeCheck();
383:   PetscSplitOwnership(comm, &n, &N);
384:   VecCreate(comm, &w);
385:   VecSetSizes(w, n, N);
386:   VecSetBlockSize(w, bs);
387:   PetscLayoutSetUp(w->map);

389:   if (std::is_same<DefaultMemorySpace, Kokkos::HostSpace>::value) {
390:     harray = const_cast<PetscScalar *>(darray);
391:   } else PetscMalloc1(w->map->n, &harray); /* If device is not the same as host, allocate the host array ourselves */

393:   VecCreate_MPI_Private(w, PETSC_FALSE /*alloc*/, 0 /*nghost*/, harray); /* Build an MPI vector with the user-provided host array */
394:   vecmpi = static_cast<Vec_MPI *>(w->data);

396:   if (!std::is_same<DefaultMemorySpace, Kokkos::HostSpace>::value) vecmpi->array_allocated = harray; /* The host array was allocated by PETSc */

398:   PetscObjectChangeTypeName((PetscObject)w, VECMPIKOKKOS);
399:   VecSetOps_MPIKokkos(w);
400:   veckok = new Vec_Kokkos(n, harray, const_cast<PetscScalar *>(darray));
401:   veckok->v_dual.modify_device(); /* Mark the device data as modified */
402:   w->spptr       = static_cast<void *>(veckok);
403:   w->offloadmask = PETSC_OFFLOAD_KOKKOS;
404:   *v             = w;
405:   return 0;
406: }
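
Below is a hedged sketch of calling VecCreateMPIKokkosWithArray() with a caller-owned device allocation; the Kokkos::View and the helper name are illustrative, and error checking is omitted as in the listing. As the notes above state, PETSc never frees the device array, so the View must outlive the Vec.

#include <petscvec_kokkos.hpp>
#include <Kokkos_Core.hpp>

/* Sketch: wrap a device array owned by the caller. */
static PetscErrorCode wrapped_vec_demo(MPI_Comm comm, PetscInt nlocal)
{
  Kokkos::View<PetscScalar *> darray("darray", nlocal); /* device memory in the default memory space */
  Vec                         v;

  Kokkos::deep_copy(darray, 1.0); /* initialize on the device */
  VecCreateMPIKokkosWithArray(comm, 1, nlocal, PETSC_DECIDE, darray.data(), &v);
  /* ... use v ... */
  VecDestroy(&v); /* darray is still owned by the View and is released when it goes out of scope */
  return 0;
}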

408: /*
409:    VecCreateMPIKokkosWithArrays_Private - Creates a Kokkos parallel, array-style vector
410:    with user-provided arrays on host and device.

412:    Collective

414:    Input Parameters:
415: +  comm - the communicator
416: .  bs - the block size
417: .  n - the local vector length
418: .  N - the global vector length
419: .  harray - host memory where the vector elements are to be stored
420: -  darray - device memory where the vector elements are to be stored

422:    Output Parameter:
423: .  v - the vector

425:    Notes:
426:    If there is no device, then harray and darray must be the same.
427:    If n is not zero, then harray and darray must be allocated.
428:    After the call, the created vector is considered to be in a synchronized state, i.e.,
429:    harray and darray are assumed to contain the same data.

431:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
432:    The user should not free the array until the vector is destroyed.
433: */
434: PetscErrorCode VecCreateMPIKokkosWithArrays_Private(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, const PetscScalar harray[], const PetscScalar darray[], Vec *v)
435: {
436:   Vec w;

438:   PetscKokkosInitializeCheck();
439:   if (n) {
442:   }
444:   VecCreateMPIWithArray(comm, bs, n, N, harray, &w);
445:   PetscObjectChangeTypeName((PetscObject)w, VECMPIKOKKOS); /* Change it to Kokkos */
446:   VecSetOps_MPIKokkos(w);
447:   w->spptr = new Vec_Kokkos(n, const_cast<PetscScalar *>(harray), const_cast<PetscScalar *>(darray));
448:   w->offloadmask = PETSC_OFFLOAD_KOKKOS;
449:   *v             = w;
450:   return 0;
451: }

453: /*MC
454:    VECKOKKOS - VECKOKKOS = "kokkos" - The basic vector, modified to use Kokkos

456:    Options Database Keys:
457: . -vec_type kokkos - sets the vector type to VECKOKKOS during a call to VecSetFromOptions()

459:   Level: beginner

461: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIKokkosWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`
462: M*/
463: PetscErrorCode VecCreate_Kokkos(Vec v)
464: {
465:   PetscMPIInt size;

467:   MPI_Comm_size(PetscObjectComm((PetscObject)v), &size);
468:   if (size == 1) VecSetType(v, VECSEQKOKKOS);
469:   else VecSetType(v, VECMPIKOKKOS);
470:   return 0;
471: }
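
Finally, the usual way to reach this dispatcher is through the options database: with -vec_type kokkos, VecSetFromOptions() calls VecSetType(), which lands in VecCreate_Kokkos() above and picks VECSEQKOKKOS or VECMPIKOKKOS from the communicator size. A small sketch (error checking omitted; the helper name and size are illustrative):

#include <petscvec.h>

/* Sketch: let the options database choose the Kokkos vector type. */
static PetscErrorCode options_demo(void)
{
  Vec x;

  VecCreate(PETSC_COMM_WORLD, &x);
  VecSetSizes(x, PETSC_DECIDE, 100);
  VecSetFromOptions(x); /* honors -vec_type kokkos */
  VecDestroy(&x);
  return 0;
}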