Actual source code: mpicuda.cu


  2: /*
  3:    This file contains routines for Parallel vector operations.
  4:  */
  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8: #include <petscsf.h>
  9: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
 10: #include <petsc/private/cudavecimpl.h>

 12: static PetscErrorCode VecResetPreallocationCOO_MPICUDA(Vec x)
 13: {
 14:   Vec_CUDA *veccuda = static_cast<Vec_CUDA *>(x->spptr);

 16:   if (veccuda) {
 17:     cudaFree(veccuda->jmap1_d);
 18:     cudaFree(veccuda->perm1_d);
 19:     cudaFree(veccuda->imap2_d);
 20:     cudaFree(veccuda->jmap2_d);
 21:     cudaFree(veccuda->perm2_d);
 22:     cudaFree(veccuda->Cperm_d);
 23:     cudaFree(veccuda->sendbuf_d);
 24:     cudaFree(veccuda->recvbuf_d);
 25:   }
 26:   return 0;
 27: }

 29: PetscErrorCode VecSetPreallocationCOO_MPICUDA(Vec x, PetscCount ncoo, const PetscInt coo_i[])
 30: {
 31:   Vec_MPI  *vecmpi  = static_cast<Vec_MPI *>(x->data);
 32:   Vec_CUDA *veccuda = static_cast<Vec_CUDA *>(x->spptr);
 33:   PetscInt  m;

 35:   VecResetPreallocationCOO_MPICUDA(x);
 36:   VecSetPreallocationCOO_MPI(x, ncoo, coo_i);
 37:   VecGetLocalSize(x, &m);

 39:   cudaMalloc((void **)&veccuda->jmap1_d, sizeof(PetscCount) * m);
 40:   cudaMalloc((void **)&veccuda->perm1_d, sizeof(PetscCount) * vecmpi->tot1);
 41:   cudaMalloc((void **)&veccuda->imap2_d, sizeof(PetscInt) * vecmpi->nnz2);
 42:   cudaMalloc((void **)&veccuda->jmap2_d, sizeof(PetscCount) * (vecmpi->nnz2 + 1));
 43:   cudaMalloc((void **)&veccuda->perm2_d, sizeof(PetscCount) * vecmpi->recvlen);
 44:   cudaMalloc((void **)&veccuda->Cperm_d, sizeof(PetscCount) * vecmpi->sendlen);
 45:   cudaMalloc((void **)&veccuda->sendbuf_d, sizeof(PetscScalar) * vecmpi->sendlen);
 46:   cudaMalloc((void **)&veccuda->recvbuf_d, sizeof(PetscScalar) * vecmpi->recvlen);

 48:   cudaMemcpy(veccuda->jmap1_d, vecmpi->jmap1, sizeof(PetscCount) * m, cudaMemcpyHostToDevice);
 49:   cudaMemcpy(veccuda->perm1_d, vecmpi->perm1, sizeof(PetscCount) * vecmpi->tot1, cudaMemcpyHostToDevice);
 50:   cudaMemcpy(veccuda->imap2_d, vecmpi->imap2, sizeof(PetscInt) * vecmpi->nnz2, cudaMemcpyHostToDevice);
 51:   cudaMemcpy(veccuda->jmap2_d, vecmpi->jmap2, sizeof(PetscCount) * (vecmpi->nnz2 + 1), cudaMemcpyHostToDevice);
 52:   cudaMemcpy(veccuda->perm2_d, vecmpi->perm2, sizeof(PetscCount) * vecmpi->recvlen, cudaMemcpyHostToDevice);
 53:   cudaMemcpy(veccuda->Cperm_d, vecmpi->Cperm, sizeof(PetscCount) * vecmpi->sendlen, cudaMemcpyHostToDevice);
 54:   cudaMemcpy(veccuda->sendbuf_d, vecmpi->sendbuf, sizeof(PetscScalar) * vecmpi->sendlen, cudaMemcpyHostToDevice);
 55:   cudaMemcpy(veccuda->recvbuf_d, vecmpi->recvbuf, sizeof(PetscScalar) * vecmpi->recvlen, cudaMemcpyHostToDevice);
 56:   return 0;
 57: }

 59: __global__ static void VecPackCOOValues(const PetscScalar vv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
 60: {
 61:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
 62:   const PetscCount grid_size = gridDim.x * blockDim.x;
 63:   for (; i < nnz; i += grid_size) buf[i] = vv[perm[i]];
 64: }

 66: __global__ static void VecAddCOOValues(const PetscScalar vv[], PetscCount m, const PetscCount jmap1[], const PetscCount perm1[], InsertMode imode, PetscScalar xv[])
 67: {
 68:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
 69:   const PetscCount grid_size = gridDim.x * blockDim.x;
 70:   for (; i < m; i += grid_size) {
 71:     PetscScalar sum = 0.0;
 72:     for (PetscCount k = jmap1[i]; k < jmap1[i + 1]; k++) sum += vv[perm1[k]];
 73:     xv[i] = (imode == INSERT_VALUES ? 0.0 : xv[i]) + sum;
 74:   }
 75: }

 77: __global__ static void VecAddRemoteCOOValues(const PetscScalar vv[], PetscCount nnz2, const PetscCount imap2[], const PetscCount jmap2[], const PetscCount perm2[], PetscScalar xv[])
 78: {
 79:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
 80:   const PetscCount grid_size = gridDim.x * blockDim.x;
 81:   for (; i < nnz2; i += grid_size) {
 82:     for (PetscCount k = jmap2[i]; k < jmap2[i + 1]; k++) xv[imap2[i]] += vv[perm2[k]];
 83:   }
 84: }

 86: PetscErrorCode VecSetValuesCOO_MPICUDA(Vec x, const PetscScalar v[], InsertMode imode)
 87: {
 88:   Vec_MPI           *vecmpi  = static_cast<Vec_MPI *>(x->data);
 89:   Vec_CUDA          *veccuda = static_cast<Vec_CUDA *>(x->spptr);
 90:   const PetscCount  *jmap1   = veccuda->jmap1_d;
 91:   const PetscCount  *perm1   = veccuda->perm1_d;
 92:   const PetscCount  *imap2   = veccuda->imap2_d;
 93:   const PetscCount  *jmap2   = veccuda->jmap2_d;
 94:   const PetscCount  *perm2   = veccuda->perm2_d;
 95:   const PetscCount  *Cperm   = veccuda->Cperm_d;
 96:   PetscScalar       *sendbuf = veccuda->sendbuf_d;
 97:   PetscScalar       *recvbuf = veccuda->recvbuf_d;
 98:   PetscScalar       *xv      = NULL;
 99:   const PetscScalar *vv      = v;
100:   PetscMemType       memtype;
101:   PetscInt           m;

103:   VecGetLocalSize(x, &m);
104:   PetscGetMemType(v, &memtype);
105:   if (PetscMemTypeHost(memtype)) { /* If user gave v[] on host, copy it to device */
106:     cudaMalloc((void **)&vv, sizeof(PetscScalar) * vecmpi->coo_n);
107:     cudaMemcpy((void *)vv, v, sizeof(PetscScalar) * vecmpi->coo_n, cudaMemcpyHostToDevice);
108:   } else {
109:     vv = (PetscScalar *)v;
110:   }

112:   /* Pack entries to be sent to remote */
113:   if (vecmpi->sendlen) {
114:     VecPackCOOValues<<<(vecmpi->sendlen + 255) / 256, 256>>>(vv, vecmpi->sendlen, Cperm, sendbuf);
115:     cudaPeekAtLastError();
116:   }

118:   PetscSFReduceWithMemTypeBegin(vecmpi->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, sendbuf, PETSC_MEMTYPE_CUDA, recvbuf, MPI_REPLACE);
119:   if (imode == INSERT_VALUES) VecCUDAGetArrayWrite(x, &xv); /* write vector */
120:   else VecCUDAGetArray(x, &xv);                             /* read & write vector */

122:   if (m) {
123:     VecAddCOOValues<<<(m + 255) / 256, 256>>>(vv, m, jmap1, perm1, imode, xv);
124:     cudaPeekAtLastError();
125:   }
126:   PetscSFReduceEnd(vecmpi->coo_sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE);

128:   /* Add received remote entries */
129:   if (vecmpi->nnz2) {
130:     VecAddRemoteCOOValues<<<(vecmpi->nnz2 + 255) / 256, 256>>>(recvbuf, vecmpi->nnz2, imap2, jmap2, perm2, xv);
131:     cudaPeekAtLastError();
132:   }

134:   if (PetscMemTypeHost(memtype)) cudaFree((void *)vv);
135:   if (imode == INSERT_VALUES) VecCUDARestoreArrayWrite(x, &xv);
136:   else VecCUDARestoreArray(x, &xv);
137:   return 0;
138: }

140: /*MC
141:    VECCUDA - VECCUDA = "cuda" - A VECSEQCUDA on a single-process communicator, and VECMPICUDA otherwise.

143:    Options Database Keys:
144: . -vec_type cuda - sets the vector type to VECCUDA during a call to VecSetFromOptions()

146:   Level: beginner

148: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECSEQCUDA`, `VECMPICUDA`, `VECSTANDARD`, `VecType`, `VecCreateMPI()`, `VecSetPinnedMemoryMin()`
149: M*/

151: PetscErrorCode VecDestroy_MPICUDA(Vec v)
152: {
153:   Vec_MPI  *vecmpi = (Vec_MPI *)v->data;
154:   Vec_CUDA *veccuda;

156:   if (v->spptr) {
157:     veccuda = (Vec_CUDA *)v->spptr;
158:     if (veccuda->GPUarray_allocated) {
159:       cudaFree(((Vec_CUDA *)v->spptr)->GPUarray_allocated);
160:       veccuda->GPUarray_allocated = NULL;
161:     }
162:     if (veccuda->stream) cudaStreamDestroy(((Vec_CUDA *)v->spptr)->stream);
163:     if (v->pinned_memory) {
164:       PetscMallocSetCUDAHost();
165:       PetscFree(vecmpi->array_allocated);
166:       PetscMallocResetCUDAHost();
167:       v->pinned_memory = PETSC_FALSE;
168:     }
169:     VecResetPreallocationCOO_MPICUDA(v);
170:     PetscFree(v->spptr);
171:   }
172:   VecDestroy_MPI(v);
173:   return 0;
174: }

176: PetscErrorCode VecNorm_MPICUDA(Vec xin, NormType type, PetscReal *z)
177: {
178:   PetscReal sum, work = 0.0;

180:   if (type == NORM_2 || type == NORM_FROBENIUS) {
181:     VecNorm_SeqCUDA(xin, NORM_2, &work);
182:     work *= work;
183:     MPIU_Allreduce(&work, &sum, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
184:     *z = PetscSqrtReal(sum);
185:   } else if (type == NORM_1) {
186:     /* Find the local part */
187:     VecNorm_SeqCUDA(xin, NORM_1, &work);
188:     /* Find the global max */
189:     MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
190:   } else if (type == NORM_INFINITY) {
191:     /* Find the local max */
192:     VecNorm_SeqCUDA(xin, NORM_INFINITY, &work);
193:     /* Find the global max */
194:     MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)xin));
195:   } else if (type == NORM_1_AND_2) {
196:     PetscReal temp[2];
197:     VecNorm_SeqCUDA(xin, NORM_1, temp);
198:     VecNorm_SeqCUDA(xin, NORM_2, temp + 1);
199:     temp[1] = temp[1] * temp[1];
200:     MPIU_Allreduce(temp, z, 2, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)xin));
201:     z[1] = PetscSqrtReal(z[1]);
202:   }
203:   return 0;
204: }

206: PetscErrorCode VecDot_MPICUDA(Vec xin, Vec yin, PetscScalar *z)
207: {
208:   PetscScalar sum, work;

210:   VecDot_SeqCUDA(xin, yin, &work);
211:   MPIU_Allreduce(&work, &sum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
212:   *z = sum;
213:   return 0;
214: }

216: PetscErrorCode VecTDot_MPICUDA(Vec xin, Vec yin, PetscScalar *z)
217: {
218:   PetscScalar sum, work;

220:   VecTDot_SeqCUDA(xin, yin, &work);
221:   MPIU_Allreduce(&work, &sum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
222:   *z = sum;
223:   return 0;
224: }

226: PetscErrorCode VecMDot_MPICUDA(Vec xin, PetscInt nv, const Vec y[], PetscScalar *z)
227: {
228:   PetscScalar awork[128], *work = awork;

230:   if (nv > 128) PetscMalloc1(nv, &work);
231:   VecMDot_SeqCUDA(xin, nv, y, work);
232:   MPIU_Allreduce(work, z, nv, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)xin));
233:   if (nv > 128) PetscFree(work);
234:   return 0;
235: }

237: /*MC
238:    VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA

240:    Options Database Keys:
241: . -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()

243:   Level: beginner

245: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`, `VecSetPinnedMemoryMin()`
246: M*/

248: PetscErrorCode VecDuplicate_MPICUDA(Vec win, Vec *v)
249: {
250:   Vec_MPI     *vw, *w = (Vec_MPI *)win->data;
251:   PetscScalar *array;

253:   VecCreate(PetscObjectComm((PetscObject)win), v);
254:   PetscLayoutReference(win->map, &(*v)->map);

256:   VecCreate_MPICUDA_Private(*v, PETSC_TRUE, w->nghost, 0);
257:   vw = (Vec_MPI *)(*v)->data;
258:   PetscMemcpy((*v)->ops, win->ops, sizeof(struct _VecOps));

260:   /* save local representation of the parallel vector (and scatter) if it exists */
261:   if (w->localrep) {
262:     VecGetArray(*v, &array);
263:     VecCreateSeqWithArray(PETSC_COMM_SELF, 1, win->map->n + w->nghost, array, &vw->localrep);
264:     PetscMemcpy(vw->localrep->ops, w->localrep->ops, sizeof(struct _VecOps));
265:     VecRestoreArray(*v, &array);
266:     vw->localupdate = w->localupdate;
267:     if (vw->localupdate) PetscObjectReference((PetscObject)vw->localupdate);
268:   }

270:   /* New vector should inherit stashing property of parent */
271:   (*v)->stash.donotstash   = win->stash.donotstash;
272:   (*v)->stash.ignorenegidx = win->stash.ignorenegidx;

274:   /* change type_name appropriately */
275:   VecCUDAAllocateCheck(*v);
276:   PetscObjectChangeTypeName((PetscObject)(*v), VECMPICUDA);

278:   PetscObjectListDuplicate(((PetscObject)win)->olist, &((PetscObject)(*v))->olist);
279:   PetscFunctionListDuplicate(((PetscObject)win)->qlist, &((PetscObject)(*v))->qlist);
280:   (*v)->map->bs   = PetscAbs(win->map->bs);
281:   (*v)->bstash.bs = win->bstash.bs;
282:   return 0;
283: }

285: PetscErrorCode VecDotNorm2_MPICUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
286: {
287:   PetscScalar work[2], sum[2];

289:   VecDotNorm2_SeqCUDA(s, t, work, work + 1);
290:   MPIU_Allreduce(&work, &sum, 2, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)s));
291:   *dp = sum[0];
292:   *nm = sum[1];
293:   return 0;
294: }

296: PetscErrorCode VecCreate_MPICUDA(Vec vv)
297: {
298:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
299:   PetscLayoutSetUp(vv->map);
300:   VecCUDAAllocateCheck(vv);
301:   VecCreate_MPICUDA_Private(vv, PETSC_FALSE, 0, ((Vec_CUDA *)vv->spptr)->GPUarray_allocated);
302:   VecCUDAAllocateCheckHost(vv);
303:   VecSet(vv, 0.0);
304:   VecSet_Seq(vv, 0.0);
305:   vv->offloadmask = PETSC_OFFLOAD_BOTH;
306:   return 0;
307: }

309: PetscErrorCode VecCreate_CUDA(Vec v)
310: {
311:   PetscMPIInt size;

313:   MPI_Comm_size(PetscObjectComm((PetscObject)v), &size);
314:   if (size == 1) {
315:     VecSetType(v, VECSEQCUDA);
316:   } else {
317:     VecSetType(v, VECMPICUDA);
318:   }
319:   return 0;
320: }

322: /*@
323:  VecCreateMPICUDA - Creates a standard, parallel array-style vector for CUDA devices.

325:  Collective

327:  Input Parameters:
328:  +  comm - the MPI communicator to use
329:  .  n - local vector length (or PETSC_DECIDE to have calculated if N is given)
330:  -  N - global vector length (or PETSC_DETERMINE to have calculated if n is given)

332:     Output Parameter:
333:  .  v - the vector

335:     Notes:
336:     Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
337:     same type as an existing vector.

339:     Level: intermediate

341:  .seealso: `VecCreateMPICUDAWithArray()`, `VecCreateMPICUDAWithArrays()`, `VecCreateSeqCUDA()`, `VecCreateSeq()`,
342:            `VecCreateMPI()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
343:            `VecCreateMPIWithArray()`, `VecCreateGhostWithArray()`, `VecMPISetGhost()`

345:  @*/
346: PetscErrorCode VecCreateMPICUDA(MPI_Comm comm, PetscInt n, PetscInt N, Vec *v)
347: {
348:   VecCreate(comm, v);
349:   VecSetSizes(*v, n, N);
350:   VecSetType(*v, VECMPICUDA);
351:   return 0;
352: }

354: /*@C
355:    VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
356:    where the user provides the GPU array space to store the vector values.

358:    Collective

360:    Input Parameters:
361: +  comm  - the MPI communicator to use
362: .  bs    - block size, same meaning as VecSetBlockSize()
363: .  n     - local vector length, cannot be PETSC_DECIDE
364: .  N     - global vector length (or PETSC_DECIDE to have calculated)
365: -  array - the user provided GPU array to store the vector values

367:    Output Parameter:
368: .  vv - the vector

370:    Notes:
371:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
372:    same type as an existing vector.

374:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
375:    at a later stage to SET the array for storing the vector values.

377:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
378:    The user should not free the array until the vector is destroyed.

380:    Level: intermediate

382: .seealso: `VecCreateMPICUDA()`, `VecCreateSeqCUDAWithArray()`, `VecCreateMPIWithArray()`, `VecCreateSeqWithArray()`,
383:           `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
384:           `VecCreateMPI()`, `VecCreateGhostWithArray()`, `VecPlaceArray()`

386: @*/
387: PetscErrorCode VecCreateMPICUDAWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, const PetscScalar array[], Vec *vv)
388: {
390:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
391:   VecCreate(comm, vv);
392:   VecSetSizes(*vv, n, N);
393:   VecSetBlockSize(*vv, bs);
394:   VecCreate_MPICUDA_Private(*vv, PETSC_FALSE, 0, array);
395:   return 0;
396: }

398: /*@C
399:    VecCreateMPICUDAWithArrays - Creates a parallel, array-style vector,
400:    where the user provides the GPU array space to store the vector values.

402:    Collective

404:    Input Parameters:
405: +  comm  - the MPI communicator to use
406: .  bs    - block size, same meaning as VecSetBlockSize()
407: .  n     - local vector length, cannot be PETSC_DECIDE
408: .  N     - global vector length (or PETSC_DECIDE to have calculated)
409: -  cpuarray - the user provided CPU array to store the vector values
410: -  gpuarray - the user provided GPU array to store the vector values

412:    Output Parameter:
413: .  vv - the vector

415:    Notes:
416:    If both cpuarray and gpuarray are provided, the caller must ensure that
417:    the provided arrays have identical values.

419:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
420:    same type as an existing vector.

422:    PETSc does NOT free the provided arrays when the vector is destroyed via
423:    VecDestroy(). The user should not free the array until the vector is
424:    destroyed.

426:    Level: intermediate

428: .seealso: `VecCreateSeqCUDAWithArrays()`, `VecCreateMPIWithArray()`, `VecCreateSeqWithArray()`,
429:           `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`,
430:           `VecCreateMPI()`, `VecCreateGhostWithArray()`, `VecCUDAPlaceArray()`, `VecPlaceArray()`,
431:           `VecCUDAAllocateCheckHost()`
432: @*/
433: PetscErrorCode VecCreateMPICUDAWithArrays(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, const PetscScalar cpuarray[], const PetscScalar gpuarray[], Vec *vv)
434: {
435:   VecCreateMPICUDAWithArray(comm, bs, n, N, gpuarray, vv);

437:   if (cpuarray && gpuarray) {
438:     Vec_MPI *s         = (Vec_MPI *)((*vv)->data);
439:     s->array           = (PetscScalar *)cpuarray;
440:     (*vv)->offloadmask = PETSC_OFFLOAD_BOTH;
441:   } else if (cpuarray) {
442:     Vec_MPI *s         = (Vec_MPI *)((*vv)->data);
443:     s->array           = (PetscScalar *)cpuarray;
444:     (*vv)->offloadmask = PETSC_OFFLOAD_CPU;
445:   } else if (gpuarray) {
446:     (*vv)->offloadmask = PETSC_OFFLOAD_GPU;
447:   } else {
448:     (*vv)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
449:   }

451:   return 0;
452: }

454: PetscErrorCode VecMax_MPICUDA(Vec xin, PetscInt *idx, PetscReal *z)
455: {
456:   PetscReal work;

458:   VecMax_SeqCUDA(xin, idx, &work);
459: #if defined(PETSC_HAVE_MPIUNI)
460:   *z = work;
461: #else
462:   if (!idx) {
463:     MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)xin));
464:   } else {
465:     struct {
466:       PetscReal v;
467:       PetscInt  i;
468:     } in, out;

470:     in.v = work;
471:     in.i = *idx + xin->map->rstart;
472:     MPIU_Allreduce(&in, &out, 1, MPIU_REAL_INT, MPIU_MAXLOC, PetscObjectComm((PetscObject)xin));
473:     *z   = out.v;
474:     *idx = out.i;
475:   }
476: #endif
477:   return 0;
478: }

480: PetscErrorCode VecMin_MPICUDA(Vec xin, PetscInt *idx, PetscReal *z)
481: {
482:   PetscReal work;

484:   VecMin_SeqCUDA(xin, idx, &work);
485: #if defined(PETSC_HAVE_MPIUNI)
486:   *z = work;
487: #else
488:   if (!idx) {
489:     MPIU_Allreduce(&work, z, 1, MPIU_REAL, MPIU_MIN, PetscObjectComm((PetscObject)xin));
490:   } else {
491:     struct {
492:       PetscReal v;
493:       PetscInt  i;
494:     } in, out;

496:     in.v = work;
497:     in.i = *idx + xin->map->rstart;
498:     MPIU_Allreduce(&in, &out, 1, MPIU_REAL_INT, MPIU_MINLOC, PetscObjectComm((PetscObject)xin));
499:     *z   = out.v;
500:     *idx = out.i;
501:   }
502: #endif
503:   return 0;
504: }

506: PetscErrorCode VecBindToCPU_MPICUDA(Vec V, PetscBool bind)
507: {
508:   V->boundtocpu = bind;
509:   if (bind) {
510:     VecCUDACopyFromGPU(V);
511:     V->offloadmask                  = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
512:     V->ops->dotnorm2                = NULL;
513:     V->ops->waxpy                   = VecWAXPY_Seq;
514:     V->ops->dot                     = VecDot_MPI;
515:     V->ops->mdot                    = VecMDot_MPI;
516:     V->ops->tdot                    = VecTDot_MPI;
517:     V->ops->norm                    = VecNorm_MPI;
518:     V->ops->scale                   = VecScale_Seq;
519:     V->ops->copy                    = VecCopy_Seq;
520:     V->ops->set                     = VecSet_Seq;
521:     V->ops->swap                    = VecSwap_Seq;
522:     V->ops->axpy                    = VecAXPY_Seq;
523:     V->ops->axpby                   = VecAXPBY_Seq;
524:     V->ops->maxpy                   = VecMAXPY_Seq;
525:     V->ops->aypx                    = VecAYPX_Seq;
526:     V->ops->axpbypcz                = VecAXPBYPCZ_Seq;
527:     V->ops->pointwisemult           = VecPointwiseMult_Seq;
528:     V->ops->setrandom               = VecSetRandom_Seq;
529:     V->ops->placearray              = VecPlaceArray_Seq;
530:     V->ops->replacearray            = VecReplaceArray_SeqCUDA;
531:     V->ops->resetarray              = VecResetArray_Seq;
532:     V->ops->dot_local               = VecDot_Seq;
533:     V->ops->tdot_local              = VecTDot_Seq;
534:     V->ops->norm_local              = VecNorm_Seq;
535:     V->ops->mdot_local              = VecMDot_Seq;
536:     V->ops->pointwisedivide         = VecPointwiseDivide_Seq;
537:     V->ops->getlocalvector          = NULL;
538:     V->ops->restorelocalvector      = NULL;
539:     V->ops->getlocalvectorread      = NULL;
540:     V->ops->restorelocalvectorread  = NULL;
541:     V->ops->getarraywrite           = NULL;
542:     V->ops->getarrayandmemtype      = NULL;
543:     V->ops->restorearrayandmemtype  = NULL;
544:     V->ops->getarraywriteandmemtype = NULL;
545:     V->ops->max                     = VecMax_MPI;
546:     V->ops->min                     = VecMin_MPI;
547:     V->ops->reciprocal              = VecReciprocal_Default;
548:     V->ops->sum                     = NULL;
549:     V->ops->shift                   = NULL;
550:     /* default random number generator */
551:     PetscFree(V->defaultrandtype);
552:     PetscStrallocpy(PETSCRANDER48, &V->defaultrandtype);
553:   } else {
554:     V->ops->dotnorm2                = VecDotNorm2_MPICUDA;
555:     V->ops->waxpy                   = VecWAXPY_SeqCUDA;
556:     V->ops->duplicate               = VecDuplicate_MPICUDA;
557:     V->ops->dot                     = VecDot_MPICUDA;
558:     V->ops->mdot                    = VecMDot_MPICUDA;
559:     V->ops->tdot                    = VecTDot_MPICUDA;
560:     V->ops->norm                    = VecNorm_MPICUDA;
561:     V->ops->scale                   = VecScale_SeqCUDA;
562:     V->ops->copy                    = VecCopy_SeqCUDA;
563:     V->ops->set                     = VecSet_SeqCUDA;
564:     V->ops->swap                    = VecSwap_SeqCUDA;
565:     V->ops->axpy                    = VecAXPY_SeqCUDA;
566:     V->ops->axpby                   = VecAXPBY_SeqCUDA;
567:     V->ops->maxpy                   = VecMAXPY_SeqCUDA;
568:     V->ops->aypx                    = VecAYPX_SeqCUDA;
569:     V->ops->axpbypcz                = VecAXPBYPCZ_SeqCUDA;
570:     V->ops->pointwisemult           = VecPointwiseMult_SeqCUDA;
571:     V->ops->setrandom               = VecSetRandom_SeqCUDA;
572:     V->ops->placearray              = VecPlaceArray_SeqCUDA;
573:     V->ops->replacearray            = VecReplaceArray_SeqCUDA;
574:     V->ops->resetarray              = VecResetArray_SeqCUDA;
575:     V->ops->dot_local               = VecDot_SeqCUDA;
576:     V->ops->tdot_local              = VecTDot_SeqCUDA;
577:     V->ops->norm_local              = VecNorm_SeqCUDA;
578:     V->ops->mdot_local              = VecMDot_SeqCUDA;
579:     V->ops->destroy                 = VecDestroy_MPICUDA;
580:     V->ops->pointwisedivide         = VecPointwiseDivide_SeqCUDA;
581:     V->ops->getlocalvector          = VecGetLocalVector_SeqCUDA;
582:     V->ops->restorelocalvector      = VecRestoreLocalVector_SeqCUDA;
583:     V->ops->getlocalvectorread      = VecGetLocalVectorRead_SeqCUDA;
584:     V->ops->restorelocalvectorread  = VecRestoreLocalVectorRead_SeqCUDA;
585:     V->ops->getarraywrite           = VecGetArrayWrite_SeqCUDA;
586:     V->ops->getarray                = VecGetArray_SeqCUDA;
587:     V->ops->restorearray            = VecRestoreArray_SeqCUDA;
588:     V->ops->getarrayandmemtype      = VecGetArrayAndMemType_SeqCUDA;
589:     V->ops->restorearrayandmemtype  = VecRestoreArrayAndMemType_SeqCUDA;
590:     V->ops->getarraywriteandmemtype = VecGetArrayWriteAndMemType_SeqCUDA;
591:     V->ops->max                     = VecMax_MPICUDA;
592:     V->ops->min                     = VecMin_MPICUDA;
593:     V->ops->reciprocal              = VecReciprocal_SeqCUDA;
594:     V->ops->sum                     = VecSum_SeqCUDA;
595:     V->ops->shift                   = VecShift_SeqCUDA;
596:     /* default random number generator */
597:     PetscFree(V->defaultrandtype);
598:     PetscStrallocpy(PETSCCURAND, &V->defaultrandtype);
599:   }
600:   return 0;
601: }

603: PetscErrorCode VecCreate_MPICUDA_Private(Vec vv, PetscBool alloc, PetscInt nghost, const PetscScalar array[])
604: {
605:   Vec_CUDA *veccuda;

607:   VecCreate_MPI_Private(vv, PETSC_FALSE, 0, 0);
608:   PetscObjectChangeTypeName((PetscObject)vv, VECMPICUDA);

610:   VecBindToCPU_MPICUDA(vv, PETSC_FALSE);
611:   vv->ops->bindtocpu = VecBindToCPU_MPICUDA;

613:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
614:   if (alloc && !array) {
615:     VecCUDAAllocateCheck(vv);
616:     VecCUDAAllocateCheckHost(vv);
617:     VecSet(vv, 0.0);
618:     VecSet_Seq(vv, 0.0);
619:     vv->offloadmask = PETSC_OFFLOAD_BOTH;
620:   }
621:   if (array) {
622:     if (!vv->spptr) {
623:       PetscReal pinned_memory_min;
624:       PetscBool flag;
625:       /* Cannot use PetscNew() here because spptr is void* */
626:       PetscCalloc(sizeof(Vec_CUDA), &vv->spptr);
627:       veccuda                         = (Vec_CUDA *)vv->spptr;
628:       vv->minimum_bytes_pinned_memory = 0;

630:       /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
631:          Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCUDAAllocateCheck(). Is there a good way to avoid this? */
632:       PetscOptionsBegin(PetscObjectComm((PetscObject)vv), ((PetscObject)vv)->prefix, "VECCUDA Options", "Vec");
633:       pinned_memory_min = vv->minimum_bytes_pinned_memory;
634:       PetscOptionsReal("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", pinned_memory_min, &pinned_memory_min, &flag);
635:       if (flag) vv->minimum_bytes_pinned_memory = pinned_memory_min;
636:       PetscOptionsEnd();
637:     }
638:     veccuda           = (Vec_CUDA *)vv->spptr;
639:     veccuda->GPUarray = (PetscScalar *)array;
640:     vv->offloadmask   = PETSC_OFFLOAD_GPU;
641:   }
642:   return 0;
643: }