Actual source code: veccuda2.cu

  1: /*
  2:    Implements the sequential CUDA vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 10: #include <petsc/private/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/functional.h>
 15: #include <thrust/iterator/counting_iterator.h>
 16: #include <thrust/reduce.h>
 17: #include <thrust/transform.h>
 18: #if defined(PETSC_USE_COMPLEX)
 19:   #include <thrust/transform_reduce.h>
 20: #endif

 22: #if THRUST_VERSION >= 101600 && !PetscDefined(USE_DEBUG)
 23: static thrust::cuda_cub::par_nosync_t::stream_attachment_type VecCUDAThrustPolicy(Vec x)
 24: {
 25:   return thrust::cuda::par_nosync.on(((Vec_CUDA *)x->spptr)->stream);
 26: }
 27: #else
 28: static thrust::cuda_cub::par_t::stream_attachment_type VecCUDAThrustPolicy(Vec x)
 29: {
 30:   return thrust::cuda::par.on(((Vec_CUDA *)x->spptr)->stream);
 31: }
 32: #endif
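
/* Note: with Thrust >= 1.16 in optimized builds, thrust::cuda::par_nosync runs Thrust algorithms on the
   vector's CUDA stream without forcing a synchronization after each algorithm call; in debug builds (or
   with older Thrust) the blocking thrust::cuda::par policy is used so that errors surface immediately. */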

 34: /*
 35:     Allocates space for the vector array on the GPU if it does not exist.
  36:    Does NOT change the vector's offload mask (v->offloadmask)
 37:     Does NOT zero the CUDA array

 39:  */
 40: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 41: {
 42:   Vec_CUDA *veccuda;
 43:   PetscBool option_set;

 45:   if (!v->spptr) {
 46:     PetscReal pinned_memory_min;
 47:     PetscCalloc(sizeof(Vec_CUDA), &v->spptr);
 48:     veccuda = (Vec_CUDA *)v->spptr;
 49:     cudaMalloc((void **)&veccuda->GPUarray_allocated, sizeof(PetscScalar) * ((PetscBLASInt)v->map->n));
 50:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 51:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 52:       if (v->data && ((Vec_Seq *)v->data)->array) {
 53:         v->offloadmask = PETSC_OFFLOAD_CPU;
 54:       } else {
 55:         v->offloadmask = PETSC_OFFLOAD_GPU;
 56:       }
 57:     }
 58:     pinned_memory_min = 0;

  60:     /* Parse the command line here for the minimum allocation size at which to use pinned memory on the host.
  61:        Note: this same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
 62:     PetscOptionsBegin(PetscObjectComm((PetscObject)v), ((PetscObject)v)->prefix, "VECCUDA Options", "Vec");
 63:     PetscOptionsReal("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", pinned_memory_min, &pinned_memory_min, &option_set);
 64:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 65:     PetscOptionsEnd();
 66:   }
 67:   return 0;
 68: }
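
/* Illustrative usage (not part of the original source): the threshold parsed above can be set at run time, e.g.

     ./app -vec_type seqcuda -vec_pinned_memory_min 100000

   so that host-side buffers of at least 100000 bytes are allocated in pinned (page-locked) memory. */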

 70: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 71: PetscErrorCode VecCUDACopyToGPU(Vec v)
 72: {
 73:   Vec_CUDA    *veccuda;
 74:   PetscScalar *varray;

 77:   VecCUDAAllocateCheck(v);
 78:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 79:     PetscLogEventBegin(VEC_CUDACopyToGPU, v, 0, 0, 0);
 80:     veccuda = (Vec_CUDA *)v->spptr;
 81:     varray  = veccuda->GPUarray;
 82:     cudaMemcpy(varray, ((Vec_Seq *)v->data)->array, v->map->n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
 83:     PetscLogCpuToGpu((v->map->n) * sizeof(PetscScalar));
 84:     PetscLogEventEnd(VEC_CUDACopyToGPU, v, 0, 0, 0);
 85:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 86:   }
 87:   return 0;
 88: }

 90: /*
 91:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
 92: */
 93: PetscErrorCode VecCUDACopyFromGPU(Vec v)
 94: {
 95:   Vec_CUDA    *veccuda;
 96:   PetscScalar *varray;

 99:   VecCUDAAllocateCheckHost(v);
100:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
101:     PetscLogEventBegin(VEC_CUDACopyFromGPU, v, 0, 0, 0);
102:     veccuda = (Vec_CUDA *)v->spptr;
103:     varray  = veccuda->GPUarray;
104:     cudaMemcpy(((Vec_Seq *)v->data)->array, varray, v->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
105:     PetscLogGpuToCpu((v->map->n) * sizeof(PetscScalar));
106:     PetscLogEventEnd(VEC_CUDACopyFromGPU, v, 0, 0, 0);
107:     v->offloadmask = PETSC_OFFLOAD_BOTH;
108:   }
109:   return 0;
110: }
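
/* Offload-mask behavior of the two copy routines above, as implemented in this file:
     PETSC_OFFLOAD_CPU  -> VecCUDACopyToGPU()   copies host -> device and sets PETSC_OFFLOAD_BOTH
     PETSC_OFFLOAD_GPU  -> VecCUDACopyFromGPU() copies device -> host and sets PETSC_OFFLOAD_BOTH
     PETSC_OFFLOAD_BOTH -> both routines are no-ops, since each side already holds current data */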

112: /*MC
113:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

115:    Options Database Keys:
116: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

118:   Level: beginner

120: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateSeqWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`, `VecCreateSeq()`, `VecSetPinnedMemoryMin()`
121: M*/
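
/* Illustrative example (a sketch, not part of the original file): creating a VECSEQCUDA vector of local
   length n directly, instead of via -vec_type seqcuda and VecSetFromOptions(); error checking is omitted
   to match the rest of this listing.

     Vec v;
     VecCreate(PETSC_COMM_SELF, &v);
     VecSetSizes(v, n, n);
     VecSetType(v, VECSEQCUDA); // equivalent to -vec_type seqcuda
     VecSet(v, 1.0);            // dispatches to VecSet_SeqCUDA() below
     VecDestroy(&v);
*/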

123: PetscErrorCode VecAYPX_SeqCUDA(Vec yin, PetscScalar alpha, Vec xin)
124: {
125:   const PetscScalar *xarray;
126:   PetscScalar       *yarray;
127:   PetscBLASInt       one = 1, bn = 0;
128:   PetscScalar        sone = 1.0;
129:   cublasHandle_t     cublasv2handle;

131:   PetscCUBLASGetHandle(&cublasv2handle);
132:   PetscBLASIntCast(yin->map->n, &bn);
133:   VecCUDAGetArrayRead(xin, &xarray);
134:   VecCUDAGetArray(yin, &yarray);
135:   PetscLogGpuTimeBegin();
136:   if (alpha == (PetscScalar)0.0) {
137:     cudaMemcpy(yarray, xarray, bn * sizeof(PetscScalar), cudaMemcpyDeviceToDevice);
138:   } else if (alpha == (PetscScalar)1.0) {
139:     cublasXaxpy(cublasv2handle, bn, &alpha, xarray, one, yarray, one);
140:     PetscLogGpuFlops(1.0 * yin->map->n);
141:   } else {
142:     cublasXscal(cublasv2handle, bn, &alpha, yarray, one);
143:     cublasXaxpy(cublasv2handle, bn, &sone, xarray, one, yarray, one);
144:     PetscLogGpuFlops(2.0 * yin->map->n);
145:   }
146:   PetscLogGpuTimeEnd();
147:   VecCUDARestoreArrayRead(xin, &xarray);
148:   VecCUDARestoreArray(yin, &yarray);
149:   PetscLogCpuToGpuScalar(sizeof(PetscScalar));
150:   return 0;
151: }

153: PetscErrorCode VecAXPY_SeqCUDA(Vec yin, PetscScalar alpha, Vec xin)
154: {
155:   const PetscScalar *xarray;
156:   PetscScalar       *yarray;
157:   PetscBLASInt       one = 1, bn = 0;
158:   cublasHandle_t     cublasv2handle;
159:   PetscBool          xiscuda;

161:   if (alpha == (PetscScalar)0.0) return 0;
162:   PetscCUBLASGetHandle(&cublasv2handle);
163:   PetscObjectTypeCompareAny((PetscObject)xin, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
164:   if (xiscuda) {
165:     PetscBLASIntCast(yin->map->n, &bn);
166:     VecCUDAGetArrayRead(xin, &xarray);
167:     VecCUDAGetArray(yin, &yarray);
168:     PetscLogGpuTimeBegin();
169:     cublasXaxpy(cublasv2handle, bn, &alpha, xarray, one, yarray, one);
170:     PetscLogGpuTimeEnd();
171:     VecCUDARestoreArrayRead(xin, &xarray);
172:     VecCUDARestoreArray(yin, &yarray);
173:     PetscLogGpuFlops(2.0 * yin->map->n);
174:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
175:   } else {
176:     VecAXPY_Seq(yin, alpha, xin);
177:   }
178:   return 0;
179: }

181: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
182: {
183:   PetscInt                              n      = xin->map->n;
184:   const PetscScalar                    *xarray = NULL, *yarray = NULL;
185:   PetscScalar                          *warray = NULL;
186:   thrust::device_ptr<const PetscScalar> xptr, yptr;
187:   thrust::device_ptr<PetscScalar>       wptr;

189:   if (xin->boundtocpu || yin->boundtocpu) {
190:     VecPointwiseDivide_Seq(win, xin, yin);
191:     return 0;
192:   }
193:   VecCUDAGetArrayWrite(win, &warray);
194:   VecCUDAGetArrayRead(xin, &xarray);
195:   VecCUDAGetArrayRead(yin, &yarray);
196:   PetscLogGpuTimeBegin();
197:   try {
198:     wptr = thrust::device_pointer_cast(warray);
199:     xptr = thrust::device_pointer_cast(xarray);
200:     yptr = thrust::device_pointer_cast(yarray);
201:     thrust::transform(VecCUDAThrustPolicy(win), xptr, xptr + n, yptr, wptr, thrust::divides<PetscScalar>());
202:   } catch (char *ex) {
203:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
204:   }
205:   PetscLogGpuTimeEnd();
206:   PetscLogGpuFlops(n);
207:   VecCUDARestoreArrayRead(xin, &xarray);
208:   VecCUDARestoreArrayRead(yin, &yarray);
209:   VecCUDARestoreArrayWrite(win, &warray);
210:   return 0;
211: }

213: PetscErrorCode VecWAXPY_SeqCUDA(Vec win, PetscScalar alpha, Vec xin, Vec yin)
214: {
215:   const PetscScalar *xarray = NULL, *yarray = NULL;
216:   PetscScalar       *warray = NULL;
217:   PetscBLASInt       one = 1, bn = 0;
218:   cublasHandle_t     cublasv2handle;
219:   cudaStream_t       stream;

221:   PetscCUBLASGetHandle(&cublasv2handle);
222:   PetscBLASIntCast(win->map->n, &bn);
223:   if (alpha == (PetscScalar)0.0) {
224:     VecCopy_SeqCUDA(yin, win);
225:   } else {
226:     VecCUDAGetArrayRead(xin, &xarray);
227:     VecCUDAGetArrayRead(yin, &yarray);
228:     VecCUDAGetArrayWrite(win, &warray);
229:     PetscLogGpuTimeBegin();
230:     cublasGetStream(cublasv2handle, &stream);
231:     cudaMemcpyAsync(warray, yarray, win->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice, stream);
232:     cublasXaxpy(cublasv2handle, bn, &alpha, xarray, one, warray, one);
233:     PetscLogGpuTimeEnd();
234:     PetscLogGpuFlops(2 * win->map->n);
235:     VecCUDARestoreArrayRead(xin, &xarray);
236:     VecCUDARestoreArrayRead(yin, &yarray);
237:     VecCUDARestoreArrayWrite(win, &warray);
238:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
239:   }
240:   return 0;
241: }

243: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv, const PetscScalar *alpha, Vec *y)
244: {
245:   PetscInt           n = xin->map->n, j;
246:   PetscScalar       *xarray;
247:   const PetscScalar *yarray;
248:   PetscBLASInt       one = 1, bn = 0;
249:   cublasHandle_t     cublasv2handle;

251:   PetscLogGpuFlops(nv * 2.0 * n);
252:   PetscLogCpuToGpuScalar(nv * sizeof(PetscScalar));
253:   PetscCUBLASGetHandle(&cublasv2handle);
254:   PetscBLASIntCast(n, &bn);
255:   VecCUDAGetArray(xin, &xarray);
256:   PetscLogGpuTimeBegin();
257:   for (j = 0; j < nv; j++) {
258:     VecCUDAGetArrayRead(y[j], &yarray);
259:     cublasXaxpy(cublasv2handle, bn, alpha + j, yarray, one, xarray, one);
260:     VecCUDARestoreArrayRead(y[j], &yarray);
261:   }
262:   PetscLogGpuTimeEnd();
263:   VecCUDARestoreArray(xin, &xarray);
264:   return 0;
265: }

267: PetscErrorCode VecDot_SeqCUDA(Vec xin, Vec yin, PetscScalar *z)
268: {
269:   const PetscScalar *xarray, *yarray;
270:   PetscBLASInt       one = 1, bn = 0;
271:   cublasHandle_t     cublasv2handle;

273:   PetscCUBLASGetHandle(&cublasv2handle);
274:   PetscBLASIntCast(yin->map->n, &bn);
275:   VecCUDAGetArrayRead(xin, &xarray);
276:   VecCUDAGetArrayRead(yin, &yarray);
277:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
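  /* In more detail: PETSc defines VecDot(x,y) = sum_i x_i * conj(y_i), while the cuBLAS dot routine used
     here conjugates its FIRST vector argument (sum_i conj(a_i) * b_i for complex scalars). Passing
     (yarray, xarray) therefore yields sum_i conj(y_i) * x_i, which is exactly VecDot(x,y); for real
     scalars the ordering is irrelevant. */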
278:   PetscLogGpuTimeBegin();
279:   cublasXdot(cublasv2handle, bn, yarray, one, xarray, one, z);
280:   PetscLogGpuTimeEnd();
281:   if (xin->map->n > 0) PetscLogGpuFlops(2.0 * xin->map->n - 1);
282:   PetscLogGpuToCpuScalar(sizeof(PetscScalar));
283:   VecCUDARestoreArrayRead(xin, &xarray);
284:   VecCUDARestoreArrayRead(yin, &yarray);
285:   return 0;
286: }

288: //
289: // CUDA kernels for MDot to follow
290: //

292: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
293: #define MDOT_WORKGROUP_SIZE 128
294: #define MDOT_WORKGROUP_NUM  128
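
// Layout of the scratch buffer used by the kernels below: each launch uses MDOT_WORKGROUP_NUM blocks of
// MDOT_WORKGROUP_SIZE threads, and block b stores its partial sum for the k-th dot product of that launch
// at group_results[b + k * gridDim.x] (the caller adds an offset of current_y_index * MDOT_WORKGROUP_NUM).
// The host code in VecMDot_SeqCUDA() then sums the MDOT_WORKGROUP_NUM partial results for each dot product
// into z[].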

296: #if !defined(PETSC_USE_COMPLEX)
297: // M = 2:
298: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, PetscInt size, PetscScalar *group_results)
299: {
300:   __shared__ PetscScalar tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
301:   PetscInt               entries_per_group = (size - 1) / gridDim.x + 1;
302:   entries_per_group                        = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
303:   PetscInt vec_start_index                 = blockIdx.x * entries_per_group;
304:   PetscInt vec_stop_index                  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

306:   PetscScalar entry_x    = 0;
307:   PetscScalar group_sum0 = 0;
308:   PetscScalar group_sum1 = 0;
309:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
310:     entry_x = x[i]; // load only once from global memory!
311:     group_sum0 += entry_x * y0[i];
312:     group_sum1 += entry_x * y1[i];
313:   }
314:   tmp_buffer[threadIdx.x]                       = group_sum0;
315:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

317:   // parallel reduction
318:   for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
319:     __syncthreads();
320:     if (threadIdx.x < stride) {
321:       tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
322:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
323:     }
324:   }

326:   // write result of group to group_results
327:   if (threadIdx.x == 0) {
328:     group_results[blockIdx.x]             = tmp_buffer[0];
329:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
330:   }
331: }

333: // M = 3:
334: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, PetscInt size, PetscScalar *group_results)
335: {
336:   __shared__ PetscScalar tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
337:   PetscInt               entries_per_group = (size - 1) / gridDim.x + 1;
338:   entries_per_group                        = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
339:   PetscInt vec_start_index                 = blockIdx.x * entries_per_group;
340:   PetscInt vec_stop_index                  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

342:   PetscScalar entry_x    = 0;
343:   PetscScalar group_sum0 = 0;
344:   PetscScalar group_sum1 = 0;
345:   PetscScalar group_sum2 = 0;
346:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
347:     entry_x = x[i]; // load only once from global memory!
348:     group_sum0 += entry_x * y0[i];
349:     group_sum1 += entry_x * y1[i];
350:     group_sum2 += entry_x * y2[i];
351:   }
352:   tmp_buffer[threadIdx.x]                           = group_sum0;
353:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE]     = group_sum1;
354:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

356:   // parallel reduction
357:   for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
358:     __syncthreads();
359:     if (threadIdx.x < stride) {
360:       tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
361:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
362:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
363:     }
364:   }

366:   // write result of group to group_results
367:   if (threadIdx.x == 0) {
368:     group_results[blockIdx.x]                 = tmp_buffer[0];
369:     group_results[blockIdx.x + gridDim.x]     = tmp_buffer[MDOT_WORKGROUP_SIZE];
370:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
371:   }
372: }

374: // M = 4:
375: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, const PetscScalar *y3, PetscInt size, PetscScalar *group_results)
376: {
377:   __shared__ PetscScalar tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
378:   PetscInt               entries_per_group = (size - 1) / gridDim.x + 1;
379:   entries_per_group                        = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
380:   PetscInt vec_start_index                 = blockIdx.x * entries_per_group;
381:   PetscInt vec_stop_index                  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

383:   PetscScalar entry_x    = 0;
384:   PetscScalar group_sum0 = 0;
385:   PetscScalar group_sum1 = 0;
386:   PetscScalar group_sum2 = 0;
387:   PetscScalar group_sum3 = 0;
388:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
389:     entry_x = x[i]; // load only once from global memory!
390:     group_sum0 += entry_x * y0[i];
391:     group_sum1 += entry_x * y1[i];
392:     group_sum2 += entry_x * y2[i];
393:     group_sum3 += entry_x * y3[i];
394:   }
395:   tmp_buffer[threadIdx.x]                           = group_sum0;
396:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE]     = group_sum1;
397:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
398:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

400:   // parallel reduction
401:   for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
402:     __syncthreads();
403:     if (threadIdx.x < stride) {
404:       tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
405:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
406:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
407:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 3 * MDOT_WORKGROUP_SIZE];
408:     }
409:   }

411:   // write result of group to group_results
412:   if (threadIdx.x == 0) {
413:     group_results[blockIdx.x]                 = tmp_buffer[0];
414:     group_results[blockIdx.x + gridDim.x]     = tmp_buffer[MDOT_WORKGROUP_SIZE];
415:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
416:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
417:   }
418: }

420: // M = 8:
421: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, const PetscScalar *y3, const PetscScalar *y4, const PetscScalar *y5, const PetscScalar *y6, const PetscScalar *y7, PetscInt size, PetscScalar *group_results)
422: {
423:   __shared__ PetscScalar tmp_buffer[8 * MDOT_WORKGROUP_SIZE];
424:   PetscInt               entries_per_group = (size - 1) / gridDim.x + 1;
425:   entries_per_group                        = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
426:   PetscInt vec_start_index                 = blockIdx.x * entries_per_group;
427:   PetscInt vec_stop_index                  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

429:   PetscScalar entry_x    = 0;
430:   PetscScalar group_sum0 = 0;
431:   PetscScalar group_sum1 = 0;
432:   PetscScalar group_sum2 = 0;
433:   PetscScalar group_sum3 = 0;
434:   PetscScalar group_sum4 = 0;
435:   PetscScalar group_sum5 = 0;
436:   PetscScalar group_sum6 = 0;
437:   PetscScalar group_sum7 = 0;
438:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
439:     entry_x = x[i]; // load only once from global memory!
440:     group_sum0 += entry_x * y0[i];
441:     group_sum1 += entry_x * y1[i];
442:     group_sum2 += entry_x * y2[i];
443:     group_sum3 += entry_x * y3[i];
444:     group_sum4 += entry_x * y4[i];
445:     group_sum5 += entry_x * y5[i];
446:     group_sum6 += entry_x * y6[i];
447:     group_sum7 += entry_x * y7[i];
448:   }
449:   tmp_buffer[threadIdx.x]                           = group_sum0;
450:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE]     = group_sum1;
451:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
452:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
453:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
454:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
455:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
456:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

458:   // parallel reduction
459:   for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
460:     __syncthreads();
461:     if (threadIdx.x < stride) {
462:       tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
463:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
464:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
465:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 3 * MDOT_WORKGROUP_SIZE];
466:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 4 * MDOT_WORKGROUP_SIZE];
467:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 5 * MDOT_WORKGROUP_SIZE];
468:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 6 * MDOT_WORKGROUP_SIZE];
469:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 7 * MDOT_WORKGROUP_SIZE];
470:     }
471:   }

473:   // write result of group to group_results
474:   if (threadIdx.x == 0) {
475:     group_results[blockIdx.x]                 = tmp_buffer[0];
476:     group_results[blockIdx.x + gridDim.x]     = tmp_buffer[MDOT_WORKGROUP_SIZE];
477:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
478:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
479:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
480:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
481:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
482:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
483:   }
484: }
485: #endif /* !defined(PETSC_USE_COMPLEX) */

487: PetscErrorCode VecMDot_SeqCUDA(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
488: {
489:   PetscInt           i, n = xin->map->n, current_y_index = 0;
490:   const PetscScalar *xptr, *y0ptr, *y1ptr, *y2ptr, *y3ptr, *y4ptr, *y5ptr, *y6ptr, *y7ptr;
491: #if !defined(PETSC_USE_COMPLEX)
492:   PetscInt     nv1 = ((nv % 4) == 1) ? nv - 1 : nv, j;
493:   PetscScalar *group_results_gpu, *group_results_cpu;
494: #endif
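  /* Note: when nv % 4 == 1 the last dot product is computed directly with cublasXdot() (the "case 1"
     branch below) and needs no scratch space, hence nv1 = nv - 1 scratch slots; the host reduction loop
     likewise runs only over the first nv1 results, so it does not overwrite that last entry of z[]. */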
495:   PetscBLASInt   one = 1, bn = 0;
496:   cublasHandle_t cublasv2handle;

498:   PetscCUBLASGetHandle(&cublasv2handle);
499:   PetscBLASIntCast(xin->map->n, &bn);
501:   /* Handle the case of local size zero first */
502:   if (!xin->map->n) {
503:     for (i = 0; i < nv; ++i) z[i] = 0;
504:     return 0;
505:   }

507: #if !defined(PETSC_USE_COMPLEX)
508:   // allocate scratchpad memory for the results of individual work groups:
509:   cudaMalloc((void **)&group_results_gpu, nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM);
510:   PetscMalloc1(nv1 * MDOT_WORKGROUP_NUM, &group_results_cpu);
511: #endif
512:   VecCUDAGetArrayRead(xin, &xptr);
513:   PetscLogGpuTimeBegin();

515:   while (current_y_index < nv) {
516:     switch (nv - current_y_index) {
517:     case 7:
518:     case 6:
519:     case 5:
520:     case 4:
521:       VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
522:       VecCUDAGetArrayRead(yin[current_y_index + 1], &y1ptr);
523:       VecCUDAGetArrayRead(yin[current_y_index + 2], &y2ptr);
524:       VecCUDAGetArrayRead(yin[current_y_index + 3], &y3ptr);
525: #if defined(PETSC_USE_COMPLEX)
526:       cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
527:       cublasXdot(cublasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
528:       cublasXdot(cublasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
529:       cublasXdot(cublasv2handle, bn, y3ptr, one, xptr, one, &z[current_y_index + 3]);
530: #else
531:       VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM, MDOT_WORKGROUP_SIZE>>>(xptr, y0ptr, y1ptr, y2ptr, y3ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
532: #endif
533:       VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
534:       VecCUDARestoreArrayRead(yin[current_y_index + 1], &y1ptr);
535:       VecCUDARestoreArrayRead(yin[current_y_index + 2], &y2ptr);
536:       VecCUDARestoreArrayRead(yin[current_y_index + 3], &y3ptr);
537:       current_y_index += 4;
538:       break;

540:     case 3:
541:       VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
542:       VecCUDAGetArrayRead(yin[current_y_index + 1], &y1ptr);
543:       VecCUDAGetArrayRead(yin[current_y_index + 2], &y2ptr);

545: #if defined(PETSC_USE_COMPLEX)
546:       cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
547:       cublasXdot(cublasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
548:       cublasXdot(cublasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
549: #else
550:       VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM, MDOT_WORKGROUP_SIZE>>>(xptr, y0ptr, y1ptr, y2ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
551: #endif
552:       VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
553:       VecCUDARestoreArrayRead(yin[current_y_index + 1], &y1ptr);
554:       VecCUDARestoreArrayRead(yin[current_y_index + 2], &y2ptr);
555:       current_y_index += 3;
556:       break;

558:     case 2:
559:       VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
560:       VecCUDAGetArrayRead(yin[current_y_index + 1], &y1ptr);
561: #if defined(PETSC_USE_COMPLEX)
562:       cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
563:       cublasXdot(cublasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
564: #else
565:       VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM, MDOT_WORKGROUP_SIZE>>>(xptr, y0ptr, y1ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
566: #endif
567:       VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
568:       VecCUDARestoreArrayRead(yin[current_y_index + 1], &y1ptr);
569:       current_y_index += 2;
570:       break;

572:     case 1:
573:       VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
574:       cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
575:       VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
576:       current_y_index += 1;
577:       break;

579:     default: // 8 or more vectors left
580:       VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
581:       VecCUDAGetArrayRead(yin[current_y_index + 1], &y1ptr);
582:       VecCUDAGetArrayRead(yin[current_y_index + 2], &y2ptr);
583:       VecCUDAGetArrayRead(yin[current_y_index + 3], &y3ptr);
584:       VecCUDAGetArrayRead(yin[current_y_index + 4], &y4ptr);
585:       VecCUDAGetArrayRead(yin[current_y_index + 5], &y5ptr);
586:       VecCUDAGetArrayRead(yin[current_y_index + 6], &y6ptr);
587:       VecCUDAGetArrayRead(yin[current_y_index + 7], &y7ptr);
588: #if defined(PETSC_USE_COMPLEX)
589:       cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
590:       cublasXdot(cublasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
591:       cublasXdot(cublasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
592:       cublasXdot(cublasv2handle, bn, y3ptr, one, xptr, one, &z[current_y_index + 3]);
593:       cublasXdot(cublasv2handle, bn, y4ptr, one, xptr, one, &z[current_y_index + 4]);
594:       cublasXdot(cublasv2handle, bn, y5ptr, one, xptr, one, &z[current_y_index + 5]);
595:       cublasXdot(cublasv2handle, bn, y6ptr, one, xptr, one, &z[current_y_index + 6]);
596:       cublasXdot(cublasv2handle, bn, y7ptr, one, xptr, one, &z[current_y_index + 7]);
597: #else
598:       VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM, MDOT_WORKGROUP_SIZE>>>(xptr, y0ptr, y1ptr, y2ptr, y3ptr, y4ptr, y5ptr, y6ptr, y7ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
599: #endif
600:       VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
601:       VecCUDARestoreArrayRead(yin[current_y_index + 1], &y1ptr);
602:       VecCUDARestoreArrayRead(yin[current_y_index + 2], &y2ptr);
603:       VecCUDARestoreArrayRead(yin[current_y_index + 3], &y3ptr);
604:       VecCUDARestoreArrayRead(yin[current_y_index + 4], &y4ptr);
605:       VecCUDARestoreArrayRead(yin[current_y_index + 5], &y5ptr);
606:       VecCUDARestoreArrayRead(yin[current_y_index + 6], &y6ptr);
607:       VecCUDARestoreArrayRead(yin[current_y_index + 7], &y7ptr);
608:       current_y_index += 8;
609:       break;
610:     }
611:   }
612:   PetscLogGpuTimeEnd();
613:   VecCUDARestoreArrayRead(xin, &xptr);

615: #if defined(PETSC_USE_COMPLEX)
616:   PetscLogGpuToCpuScalar(nv * sizeof(PetscScalar));
617: #else
618:   // copy results to CPU
619:   cudaMemcpy(group_results_cpu, group_results_gpu, nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM, cudaMemcpyDeviceToHost);

621:   // sum group results into z
622:   for (j = 0; j < nv1; ++j) {
623:     z[j] = 0;
624:     for (i = j * MDOT_WORKGROUP_NUM; i < (j + 1) * MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
625:   }
626:   PetscLogFlops(nv1 * MDOT_WORKGROUP_NUM);
627:   cudaFree(group_results_gpu);
628:   PetscFree(group_results_cpu);
629:   PetscLogGpuToCpuScalar(nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM);
630: #endif
631:   PetscLogGpuFlops(PetscMax(nv * (2.0 * n - 1), 0.0));
632:   return 0;
633: }

635: #undef MDOT_WORKGROUP_SIZE
636: #undef MDOT_WORKGROUP_NUM

638: PetscErrorCode VecSet_SeqCUDA(Vec xin, PetscScalar alpha)
639: {
640:   PetscInt                        n      = xin->map->n;
641:   PetscScalar                    *xarray = NULL;
642:   thrust::device_ptr<PetscScalar> xptr;

644:   VecCUDAGetArrayWrite(xin, &xarray);
645:   PetscLogGpuTimeBegin();
646:   if (alpha == (PetscScalar)0.0) {
647:     cudaMemset(xarray, 0, n * sizeof(PetscScalar));
648:   } else {
649:     try {
650:       xptr = thrust::device_pointer_cast(xarray);
651:       thrust::fill(xptr, xptr + n, alpha);
652:     } catch (char *ex) {
653:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
654:     }
655:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
656:   }
657:   PetscLogGpuTimeEnd();
658:   VecCUDARestoreArrayWrite(xin, &xarray);
659:   return 0;
660: }

662: struct PetscScalarReciprocal {
663:   __host__ __device__ PetscScalar operator()(const PetscScalar &s) { return (s != (PetscScalar)0.0) ? (PetscScalar)1.0 / s : 0.0; }
664: };
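
/* Note: the functor above maps zero entries to zero instead of producing Inf/NaN, mirroring the behavior
   of the host implementation of VecReciprocal(), which also leaves zero entries unchanged. */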

666: PetscErrorCode VecReciprocal_SeqCUDA(Vec v)
667: {
668:   PetscInt     n;
669:   PetscScalar *x;

671:   VecGetLocalSize(v, &n);
672:   VecCUDAGetArray(v, &x);
673:   PetscLogGpuTimeBegin();
674:   try {
675:     auto xptr = thrust::device_pointer_cast(x);
676:     thrust::transform(VecCUDAThrustPolicy(v), xptr, xptr + n, xptr, PetscScalarReciprocal());
677:   } catch (char *ex) {
678:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
679:   }
680:   PetscLogGpuTimeEnd();
681:   VecCUDARestoreArray(v, &x);
682:   return 0;
683: }

685: PetscErrorCode VecScale_SeqCUDA(Vec xin, PetscScalar alpha)
686: {
687:   PetscScalar   *xarray;
688:   PetscBLASInt   one = 1, bn = 0;
689:   cublasHandle_t cublasv2handle;

691:   if (alpha == (PetscScalar)0.0) {
692:     VecSet_SeqCUDA(xin, alpha);
693:   } else if (alpha != (PetscScalar)1.0) {
694:     PetscCUBLASGetHandle(&cublasv2handle);
695:     PetscBLASIntCast(xin->map->n, &bn);
696:     VecCUDAGetArray(xin, &xarray);
697:     PetscLogGpuTimeBegin();
698:     cublasXscal(cublasv2handle, bn, &alpha, xarray, one);
699:     VecCUDARestoreArray(xin, &xarray);
700:     PetscLogGpuTimeEnd();
701:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
702:     PetscLogGpuFlops(xin->map->n);
703:   }
704:   return 0;
705: }

707: PetscErrorCode VecTDot_SeqCUDA(Vec xin, Vec yin, PetscScalar *z)
708: {
709:   const PetscScalar *xarray, *yarray;
710:   PetscBLASInt       one = 1, bn = 0;
711:   cublasHandle_t     cublasv2handle;

713:   PetscCUBLASGetHandle(&cublasv2handle);
714:   PetscBLASIntCast(xin->map->n, &bn);
715:   VecCUDAGetArrayRead(xin, &xarray);
716:   VecCUDAGetArrayRead(yin, &yarray);
717:   PetscLogGpuTimeBegin();
718:   cublasXdotu(cublasv2handle, bn, xarray, one, yarray, one, z);
719:   PetscLogGpuTimeEnd();
720:   if (xin->map->n > 0) PetscLogGpuFlops(2.0 * xin->map->n - 1);
721:   PetscLogGpuToCpuScalar(sizeof(PetscScalar));
722:   VecCUDARestoreArrayRead(yin, &yarray);
723:   VecCUDARestoreArrayRead(xin, &xarray);
724:   return 0;
725: }

727: PetscErrorCode VecCopy_SeqCUDA(Vec xin, Vec yin)
728: {
729:   const PetscScalar *xarray;
730:   PetscScalar       *yarray;

732:   if (xin != yin) {
733:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
734:       PetscBool yiscuda;

736:       PetscObjectTypeCompareAny((PetscObject)yin, &yiscuda, VECSEQCUDA, VECMPICUDA, "");
737:       VecCUDAGetArrayRead(xin, &xarray);
738:       if (yiscuda) {
739:         VecCUDAGetArrayWrite(yin, &yarray);
740:       } else {
741:         VecGetArrayWrite(yin, &yarray);
742:       }
743:       PetscLogGpuTimeBegin();
744:       if (yiscuda) {
745:         cudaMemcpyAsync(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
746:       } else {
747:         cudaMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
748:         PetscLogGpuToCpu((yin->map->n) * sizeof(PetscScalar));
749:       }
750:       PetscLogGpuTimeEnd();
751:       VecCUDARestoreArrayRead(xin, &xarray);
752:       if (yiscuda) {
753:         VecCUDARestoreArrayWrite(yin, &yarray);
754:       } else {
755:         VecRestoreArrayWrite(yin, &yarray);
756:       }
757:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
 758:       /* copy on the CPU if the data is current on the CPU */
759:       VecCopy_SeqCUDA_Private(xin, yin);
760:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
761:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
762:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
 763:         /* copy on the CPU */
764:         VecCopy_SeqCUDA_Private(xin, yin);
765:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
 766:         /* copy on the GPU */
767:         VecCUDAGetArrayRead(xin, &xarray);
768:         VecCUDAGetArrayWrite(yin, &yarray);
769:         PetscLogGpuTimeBegin();
770:         cudaMemcpyAsync(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
771:         PetscLogGpuTimeEnd();
772:         VecCUDARestoreArrayRead(xin, &xarray);
773:         VecCUDARestoreArrayWrite(yin, &yarray);
774:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
 775:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
 776:            default to copying on the GPU (this is an arbitrary choice) */
777:         VecCUDAGetArrayRead(xin, &xarray);
778:         VecCUDAGetArrayWrite(yin, &yarray);
779:         PetscLogGpuTimeBegin();
780:         cudaMemcpyAsync(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
781:         PetscLogGpuTimeEnd();
782:         VecCUDARestoreArrayRead(xin, &xarray);
783:         VecCUDARestoreArrayWrite(yin, &yarray);
784:       } else {
785:         VecCopy_SeqCUDA_Private(xin, yin);
786:       }
787:     }
788:   }
789:   return 0;
790: }

792: PetscErrorCode VecSwap_SeqCUDA(Vec xin, Vec yin)
793: {
794:   PetscBLASInt   one = 1, bn = 0;
795:   PetscScalar   *xarray, *yarray;
796:   cublasHandle_t cublasv2handle;

798:   PetscCUBLASGetHandle(&cublasv2handle);
799:   PetscBLASIntCast(xin->map->n, &bn);
800:   if (xin != yin) {
801:     VecCUDAGetArray(xin, &xarray);
802:     VecCUDAGetArray(yin, &yarray);
803:     PetscLogGpuTimeBegin();
804:     cublasXswap(cublasv2handle, bn, xarray, one, yarray, one);
805:     PetscLogGpuTimeEnd();
806:     VecCUDARestoreArray(xin, &xarray);
807:     VecCUDARestoreArray(yin, &yarray);
808:   }
809:   return 0;
810: }

812: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin, PetscScalar alpha, PetscScalar beta, Vec xin)
813: {
814:   PetscScalar        a = alpha, b = beta;
815:   const PetscScalar *xarray;
816:   PetscScalar       *yarray;
817:   PetscBLASInt       one = 1, bn = 0;
818:   cublasHandle_t     cublasv2handle;

820:   PetscCUBLASGetHandle(&cublasv2handle);
821:   PetscBLASIntCast(yin->map->n, &bn);
822:   if (a == (PetscScalar)0.0) {
823:     VecScale_SeqCUDA(yin, beta);
824:   } else if (b == (PetscScalar)1.0) {
825:     VecAXPY_SeqCUDA(yin, alpha, xin);
826:   } else if (a == (PetscScalar)1.0) {
827:     VecAYPX_SeqCUDA(yin, beta, xin);
828:   } else if (b == (PetscScalar)0.0) {
829:     VecCUDAGetArrayRead(xin, &xarray);
830:     VecCUDAGetArray(yin, &yarray);
831:     PetscLogGpuTimeBegin();
832:     cudaMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice);
833:     cublasXscal(cublasv2handle, bn, &alpha, yarray, one);
834:     PetscLogGpuTimeEnd();
835:     PetscLogGpuFlops(xin->map->n);
836:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
837:     VecCUDARestoreArrayRead(xin, &xarray);
838:     VecCUDARestoreArray(yin, &yarray);
839:   } else {
840:     VecCUDAGetArrayRead(xin, &xarray);
841:     VecCUDAGetArray(yin, &yarray);
842:     PetscLogGpuTimeBegin();
843:     cublasXscal(cublasv2handle, bn, &beta, yarray, one);
844:     cublasXaxpy(cublasv2handle, bn, &alpha, xarray, one, yarray, one);
845:     VecCUDARestoreArrayRead(xin, &xarray);
846:     VecCUDARestoreArray(yin, &yarray);
847:     PetscLogGpuTimeEnd();
848:     PetscLogGpuFlops(3.0 * xin->map->n);
849:     PetscLogCpuToGpuScalar(2 * sizeof(PetscScalar));
850:   }
851:   return 0;
852: }

854: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin, PetscScalar alpha, PetscScalar beta, PetscScalar gamma, Vec xin, Vec yin)
855: {
856:   PetscInt n = zin->map->n;

858:   if (gamma == (PetscScalar)1.0) {
 859:     /* z = a*x + b*y + z */
860:     VecAXPY_SeqCUDA(zin, alpha, xin);
861:     VecAXPY_SeqCUDA(zin, beta, yin);
862:     PetscLogGpuFlops(4.0 * n);
863:   } else {
864:     /* z = a*x + b*y + c*z */
865:     VecScale_SeqCUDA(zin, gamma);
866:     VecAXPY_SeqCUDA(zin, alpha, xin);
867:     VecAXPY_SeqCUDA(zin, beta, yin);
868:     PetscLogGpuFlops(5.0 * n);
869:   }
870:   return 0;
871: }

873: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win, Vec xin, Vec yin)
874: {
875:   PetscInt                              n = win->map->n;
876:   const PetscScalar                    *xarray, *yarray;
877:   PetscScalar                          *warray;
878:   thrust::device_ptr<const PetscScalar> xptr, yptr;
879:   thrust::device_ptr<PetscScalar>       wptr;

881:   if (xin->boundtocpu || yin->boundtocpu) {
882:     VecPointwiseMult_Seq(win, xin, yin);
883:     return 0;
884:   }
885:   VecCUDAGetArrayRead(xin, &xarray);
886:   VecCUDAGetArrayRead(yin, &yarray);
887:   VecCUDAGetArrayWrite(win, &warray);
888:   PetscLogGpuTimeBegin();
889:   try {
890:     wptr = thrust::device_pointer_cast(warray);
891:     xptr = thrust::device_pointer_cast(xarray);
892:     yptr = thrust::device_pointer_cast(yarray);
893:     thrust::transform(VecCUDAThrustPolicy(win), xptr, xptr + n, yptr, wptr, thrust::multiplies<PetscScalar>());
894:   } catch (char *ex) {
895:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
896:   }
897:   PetscLogGpuTimeEnd();
898:   VecCUDARestoreArrayRead(xin, &xarray);
899:   VecCUDARestoreArrayRead(yin, &yarray);
900:   VecCUDARestoreArrayWrite(win, &warray);
901:   PetscLogGpuFlops(n);
902:   return 0;
903: }

 905: /* TODO: the infinity norm should be computed directly in CUDA (currently done via cublasIXamax() plus a one-element host copy) */

907: PetscErrorCode VecNorm_SeqCUDA(Vec xin, NormType type, PetscReal *z)
908: {
909:   PetscInt           n   = xin->map->n;
910:   PetscBLASInt       one = 1, bn = 0;
911:   const PetscScalar *xarray;
912:   cublasHandle_t     cublasv2handle;

914:   PetscCUBLASGetHandle(&cublasv2handle);
915:   PetscBLASIntCast(n, &bn);
916:   if (type == NORM_2 || type == NORM_FROBENIUS) {
917:     VecCUDAGetArrayRead(xin, &xarray);
918:     PetscLogGpuTimeBegin();
919:     cublasXnrm2(cublasv2handle, bn, xarray, one, z);
920:     PetscLogGpuTimeEnd();
921:     VecCUDARestoreArrayRead(xin, &xarray);
922:     PetscLogGpuFlops(PetscMax(2.0 * n - 1, 0.0));
923:   } else if (type == NORM_INFINITY) {
924:     int i;
925:     VecCUDAGetArrayRead(xin, &xarray);
926:     PetscLogGpuTimeBegin();
927:     cublasIXamax(cublasv2handle, bn, xarray, one, &i);
928:     PetscLogGpuTimeEnd();
929:     if (bn) {
930:       PetscScalar zs;
931:       cudaMemcpy(&zs, xarray + i - 1, sizeof(PetscScalar), cudaMemcpyDeviceToHost);
932:       *z = PetscAbsScalar(zs);
933:     } else *z = 0.0;
934:     VecCUDARestoreArrayRead(xin, &xarray);
935:   } else if (type == NORM_1) {
936:     VecCUDAGetArrayRead(xin, &xarray);
937:     PetscLogGpuTimeBegin();
938:     cublasXasum(cublasv2handle, bn, xarray, one, z);
939:     VecCUDARestoreArrayRead(xin, &xarray);
940:     PetscLogGpuTimeEnd();
941:     PetscLogGpuFlops(PetscMax(n - 1.0, 0.0));
942:   } else if (type == NORM_1_AND_2) {
943:     VecNorm_SeqCUDA(xin, NORM_1, z);
944:     VecNorm_SeqCUDA(xin, NORM_2, z + 1);
945:   }
946:   PetscLogGpuToCpuScalar(sizeof(PetscReal));
947:   return 0;
948: }

950: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
951: {
952:   VecDot_SeqCUDA(s, t, dp);
953:   VecDot_SeqCUDA(t, t, nm);
954:   return 0;
955: }

957: static PetscErrorCode VecResetPreallocationCOO_SeqCUDA(Vec x)
958: {
959:   Vec_CUDA *veccuda = static_cast<Vec_CUDA *>(x->spptr);

961:   if (veccuda) {
962:     cudaFree(veccuda->jmap1_d);
963:     cudaFree(veccuda->perm1_d);
964:   }
965:   return 0;
966: }

968: PetscErrorCode VecSetPreallocationCOO_SeqCUDA(Vec x, PetscCount ncoo, const PetscInt coo_i[])
969: {
970:   Vec_Seq  *vecseq  = static_cast<Vec_Seq *>(x->data);
971:   Vec_CUDA *veccuda = static_cast<Vec_CUDA *>(x->spptr);
972:   PetscInt  m;

974:   VecResetPreallocationCOO_SeqCUDA(x);
975:   VecSetPreallocationCOO_Seq(x, ncoo, coo_i);
976:   VecGetLocalSize(x, &m);
977:   cudaMalloc((void **)&veccuda->jmap1_d, sizeof(PetscCount) * (m + 1));
978:   cudaMalloc((void **)&veccuda->perm1_d, sizeof(PetscCount) * vecseq->tot1);
979:   cudaMemcpy(veccuda->jmap1_d, vecseq->jmap1, sizeof(PetscCount) * (m + 1), cudaMemcpyHostToDevice);
980:   cudaMemcpy(veccuda->perm1_d, vecseq->perm1, sizeof(PetscCount) * vecseq->tot1, cudaMemcpyHostToDevice);
981:   return 0;
982: }
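
/* The kernel below uses a grid-stride loop over the m local entries: for entry i it accumulates every COO
   contribution v[perm1[k]] with jmap1[i] <= k < jmap1[i+1], then either overwrites (INSERT_VALUES) or adds
   to (ADD_VALUES) the current value xv[i]. */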

984: __global__ static void VecAddCOOValues(const PetscScalar vv[], PetscCount m, const PetscCount jmap1[], const PetscCount perm1[], InsertMode imode, PetscScalar xv[])
985: {
986:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
987:   const PetscCount grid_size = gridDim.x * blockDim.x;
988:   for (; i < m; i += grid_size) {
989:     PetscScalar sum = 0.0;
990:     for (PetscCount k = jmap1[i]; k < jmap1[i + 1]; k++) sum += vv[perm1[k]];
991:     xv[i] = (imode == INSERT_VALUES ? 0.0 : xv[i]) + sum;
992:   }
993: }

995: PetscErrorCode VecSetValuesCOO_SeqCUDA(Vec x, const PetscScalar v[], InsertMode imode)
996: {
997:   Vec_Seq           *vecseq  = static_cast<Vec_Seq *>(x->data);
998:   Vec_CUDA          *veccuda = static_cast<Vec_CUDA *>(x->spptr);
999:   const PetscCount  *jmap1   = veccuda->jmap1_d;
1000:   const PetscCount  *perm1   = veccuda->perm1_d;
1001:   PetscScalar       *xv;
1002:   const PetscScalar *vv = v;
1003:   PetscInt           m;
1004:   PetscMemType       memtype;

1006:   VecGetLocalSize(x, &m);
1007:   PetscGetMemType(v, &memtype);

 1009:   if (PetscMemTypeHost(memtype)) { /* if the user passed v[] in host memory, copy it to the device */
1010:     cudaMalloc((void **)&vv, vecseq->coo_n * sizeof(PetscScalar));
1011:     cudaMemcpy((void *)vv, v, vecseq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
1012:   }

1014:   if (imode == INSERT_VALUES) VecCUDAGetArrayWrite(x, &xv); /* write vector */
1015:   else VecCUDAGetArray(x, &xv);                             /* read & write vector */

1017:   if (m) {
1018:     VecAddCOOValues<<<(m + 255) / 256, 256>>>(vv, m, jmap1, perm1, imode, xv);
1019:     cudaPeekAtLastError();
1020:   }
1021:   if (imode == INSERT_VALUES) VecCUDARestoreArrayWrite(x, &xv);
1022:   else VecCUDARestoreArray(x, &xv);

1024:   if (PetscMemTypeHost(memtype)) cudaFree((void *)vv);
1025:   return 0;
1026: }
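
/* Illustrative usage (a sketch, not part of the original file): the COO routines above back the generic
   assembly interface, which from user code looks like

     PetscInt    coo_i[] = {0, 1, 1, 3};          // repeated indices are summed together
     PetscScalar coo_v[] = {1.0, 2.0, 3.0, 4.0};
     VecSetPreallocationCOO(x, 4, coo_i);
     VecSetValuesCOO(x, coo_v, ADD_VALUES);

   For a VECSEQCUDA vector, coo_v may live on the host (it is copied to the device above) or already on the
   device; error checking is omitted to match this listing. */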

1028: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1029: {
1030:   Vec_CUDA *veccuda = (Vec_CUDA *)v->spptr;

1032:   if (v->spptr) {
1033:     if (veccuda->GPUarray_allocated) {
1034: #if defined(PETSC_HAVE_NVSHMEM)
1035:       if (veccuda->nvshmem) {
1036:         PetscNvshmemFree(veccuda->GPUarray_allocated);
1037:         veccuda->nvshmem = PETSC_FALSE;
1038:       } else
1039: #endif
1040:         cudaFree(veccuda->GPUarray_allocated);
1041:       veccuda->GPUarray_allocated = NULL;
1042:     }
1043:     if (veccuda->stream) cudaStreamDestroy(veccuda->stream);
1044:     VecResetPreallocationCOO_SeqCUDA(v);
1045:   }
1046:   VecDestroy_SeqCUDA_Private(v);
1047:   PetscFree(v->spptr);
1048:   return 0;
1049: }

1051: #if defined(PETSC_USE_COMPLEX)
1052: struct conjugate {
1053:   __host__ __device__ PetscScalar operator()(const PetscScalar &x) { return PetscConj(x); }
1054: };
1055: #endif

1057: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1058: {
1059: #if defined(PETSC_USE_COMPLEX)
1060:   PetscScalar                    *xarray;
1061:   PetscInt                        n = xin->map->n;
1062:   thrust::device_ptr<PetscScalar> xptr;

1064:   VecCUDAGetArray(xin, &xarray);
1065:   PetscLogGpuTimeBegin();
1066:   try {
1067:     xptr = thrust::device_pointer_cast(xarray);
1068:     thrust::transform(VecCUDAThrustPolicy(xin), xptr, xptr + n, xptr, conjugate());
1069:   } catch (char *ex) {
1070:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1071:   }
1072:   PetscLogGpuTimeEnd();
1073:   VecCUDARestoreArray(xin, &xarray);
1074: #else
1075: #endif
1076:   return 0;
1077: }
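
/* The helper below: when v is a native PETSc vector and w is a VECSEQCUDA, w simply takes over v's data
   pointers (host array, GPU array, offload mask, pinned-memory flag); otherwise it falls back to
   VecGetArray()/VecGetArrayRead() on v and marks w as valid on the CPU only.
   VecRestoreLocalVectorK_SeqCUDA() undoes whichever path was taken. */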

1079: static inline PetscErrorCode VecGetLocalVectorK_SeqCUDA(Vec v, Vec w, PetscBool read)
1080: {
1081:   PetscBool wisseqcuda;

1086:   PetscObjectTypeCompare((PetscObject)w, VECSEQCUDA, &wisseqcuda);
1087:   if (w->data && wisseqcuda) {
1088:     if (((Vec_Seq *)w->data)->array_allocated) {
1089:       if (w->pinned_memory) PetscMallocSetCUDAHost();
1090:       PetscFree(((Vec_Seq *)w->data)->array_allocated);
1091:       if (w->pinned_memory) {
1092:         PetscMallocResetCUDAHost();
1093:         w->pinned_memory = PETSC_FALSE;
1094:       }
1095:     }
1096:     ((Vec_Seq *)w->data)->array         = NULL;
1097:     ((Vec_Seq *)w->data)->unplacedarray = NULL;
1098:   }
1099:   if (w->spptr && wisseqcuda) {
1100:     if (((Vec_CUDA *)w->spptr)->GPUarray) {
1101:       cudaFree(((Vec_CUDA *)w->spptr)->GPUarray);
1102:       ((Vec_CUDA *)w->spptr)->GPUarray = NULL;
1103:     }
1104:     if (((Vec_CUDA *)w->spptr)->stream) cudaStreamDestroy(((Vec_CUDA *)w->spptr)->stream);
1105:     PetscFree(w->spptr);
1106:   }

1108:   if (v->petscnative && wisseqcuda) {
1109:     PetscFree(w->data);
1110:     w->data          = v->data;
1111:     w->offloadmask   = v->offloadmask;
1112:     w->pinned_memory = v->pinned_memory;
1113:     w->spptr         = v->spptr;
1114:     PetscObjectStateIncrease((PetscObject)w);
1115:   } else {
1116:     if (read) {
1117:       VecGetArrayRead(v, (const PetscScalar **)&((Vec_Seq *)w->data)->array);
1118:     } else {
1119:       VecGetArray(v, &((Vec_Seq *)w->data)->array);
1120:     }
1121:     w->offloadmask = PETSC_OFFLOAD_CPU;
1122:     if (wisseqcuda) VecCUDAAllocateCheck(w);
1123:   }
1124:   return 0;
1125: }

1127: static inline PetscErrorCode VecRestoreLocalVectorK_SeqCUDA(Vec v, Vec w, PetscBool read)
1128: {
1129:   PetscBool wisseqcuda;

1134:   PetscObjectTypeCompare((PetscObject)w, VECSEQCUDA, &wisseqcuda);
1135:   if (v->petscnative && wisseqcuda) {
1136:     v->data          = w->data;
1137:     v->offloadmask   = w->offloadmask;
1138:     v->pinned_memory = w->pinned_memory;
1139:     v->spptr         = w->spptr;
1140:     w->data          = 0;
1141:     w->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
1142:     w->spptr         = 0;
1143:   } else {
1144:     if (read) {
1145:       VecRestoreArrayRead(v, (const PetscScalar **)&((Vec_Seq *)w->data)->array);
1146:     } else {
1147:       VecRestoreArray(v, &((Vec_Seq *)w->data)->array);
1148:     }
1149:     if ((Vec_CUDA *)w->spptr && wisseqcuda) {
1150:       cudaFree(((Vec_CUDA *)w->spptr)->GPUarray);
1151:       ((Vec_CUDA *)w->spptr)->GPUarray = NULL;
 1152:       if (((Vec_CUDA *)w->spptr)->stream) cudaStreamDestroy(((Vec_CUDA *)w->spptr)->stream);
1153:       PetscFree(w->spptr);
1154:     }
1155:   }
1156:   return 0;
1157: }

1159: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v, Vec w)
1160: {
1161:   VecGetLocalVectorK_SeqCUDA(v, w, PETSC_FALSE);
1162:   return 0;
1163: }

1165: PetscErrorCode VecGetLocalVectorRead_SeqCUDA(Vec v, Vec w)
1166: {
1167:   VecGetLocalVectorK_SeqCUDA(v, w, PETSC_TRUE);
1168:   return 0;
1169: }

1171: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v, Vec w)
1172: {
1173:   VecRestoreLocalVectorK_SeqCUDA(v, w, PETSC_FALSE);
1174:   return 0;
1175: }

1177: PetscErrorCode VecRestoreLocalVectorRead_SeqCUDA(Vec v, Vec w)
1178: {
1179:   VecRestoreLocalVectorK_SeqCUDA(v, w, PETSC_TRUE);
1180:   return 0;
1181: }

1183: struct petscrealpart : public thrust::unary_function<PetscScalar, PetscReal> {
1184:   __host__ __device__ PetscReal operator()(const PetscScalar &x) { return PetscRealPart(x); }
1185: };

1187: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
1188:   __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt> &x) { return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>()); }
1189: };

1191: struct petscmax : public thrust::binary_function<PetscReal, PetscReal, PetscReal> {
1192:   __host__ __device__ PetscReal operator()(const PetscReal &x, const PetscReal &y) { return x < y ? y : x; }
1193: };

1195: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
1196:   __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt> &x, const thrust::tuple<PetscReal, PetscInt> &y)
1197:   {
1198:     return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) : (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1199:   }
1200: };

1202: struct petscmin : public thrust::binary_function<PetscReal, PetscReal, PetscReal> {
1203:   __host__ __device__ PetscReal operator()(const PetscReal &x, const PetscReal &y) { return x < y ? x : y; }
1204: };

1206: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
1207:   __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt> &x, const thrust::tuple<PetscReal, PetscInt> &y)
1208:   {
1209:     return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) : (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1210:   }
1211: };

1213: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1214: {
1215:   PetscInt                              n = v->map->n;
1216:   const PetscScalar                    *av;
1217:   thrust::device_ptr<const PetscScalar> avpt;

1220:   if (!n) {
1221:     *m = PETSC_MIN_REAL;
1222:     if (p) *p = -1;
1223:     return 0;
1224:   }
1225:   VecCUDAGetArrayRead(v, &av);
1226:   avpt = thrust::device_pointer_cast(av);
1227:   PetscLogGpuTimeBegin();
1228:   if (p) {
1229:     thrust::tuple<PetscReal, PetscInt> res(PETSC_MIN_REAL, -1);
1230:     auto                               zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt, thrust::counting_iterator<PetscInt>(0)));
1231:     try {
1232: #if defined(PETSC_USE_COMPLEX)
1233:       res = thrust::transform_reduce(VecCUDAThrustPolicy(v), zibit, zibit + n, petscrealparti(), res, petscmaxi());
1234: #else
1235:       res = thrust::reduce(VecCUDAThrustPolicy(v), zibit, zibit + n, res, petscmaxi());
1236: #endif
1237:     } catch (char *ex) {
1238:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1239:     }
1240:     *m = res.get<0>();
1241:     *p = res.get<1>();
1242:   } else {
1243:     try {
1244: #if defined(PETSC_USE_COMPLEX)
1245:       *m = thrust::transform_reduce(VecCUDAThrustPolicy(v), avpt, avpt + n, petscrealpart(), PETSC_MIN_REAL, petscmax());
1246: #else
1247:       *m  = thrust::reduce(VecCUDAThrustPolicy(v), avpt, avpt + n, PETSC_MIN_REAL, petscmax());
1248: #endif
1249:     } catch (char *ex) {
1250:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1251:     }
1252:   }
1253:   PetscLogGpuTimeEnd();
1254:   VecCUDARestoreArrayRead(v, &av);
1255:   return 0;
1256: }

1258: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1259: {
1260:   PetscInt                              n = v->map->n;
1261:   const PetscScalar                    *av;
1262:   thrust::device_ptr<const PetscScalar> avpt;

1265:   if (!n) {
1266:     *m = PETSC_MAX_REAL;
1267:     if (p) *p = -1;
1268:     return 0;
1269:   }
1270:   VecCUDAGetArrayRead(v, &av);
1271:   avpt = thrust::device_pointer_cast(av);
1272:   PetscLogGpuTimeBegin();
1273:   if (p) {
1274:     thrust::tuple<PetscReal, PetscInt> res(PETSC_MAX_REAL, -1);
1275:     auto                               zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt, thrust::counting_iterator<PetscInt>(0)));
1276:     try {
1277: #if defined(PETSC_USE_COMPLEX)
1278:       res = thrust::transform_reduce(VecCUDAThrustPolicy(v), zibit, zibit + n, petscrealparti(), res, petscmini());
1279: #else
1280:       res = thrust::reduce(VecCUDAThrustPolicy(v), zibit, zibit + n, res, petscmini());
1281: #endif
1282:     } catch (char *ex) {
1283:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1284:     }
1285:     *m = res.get<0>();
1286:     *p = res.get<1>();
1287:   } else {
1288:     try {
1289: #if defined(PETSC_USE_COMPLEX)
1290:       *m = thrust::transform_reduce(VecCUDAThrustPolicy(v), avpt, avpt + n, petscrealpart(), PETSC_MAX_REAL, petscmin());
1291: #else
1292:       *m  = thrust::reduce(VecCUDAThrustPolicy(v), avpt, avpt + n, PETSC_MAX_REAL, petscmin());
1293: #endif
1294:     } catch (char *ex) {
1295:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1296:     }
1297:   }
1298:   PetscLogGpuTimeEnd();
1299:   VecCUDARestoreArrayRead(v, &av);
1300:   return 0;
1301: }

1303: PetscErrorCode VecSum_SeqCUDA(Vec v, PetscScalar *sum)
1304: {
1305:   PetscInt                              n = v->map->n;
1306:   const PetscScalar                    *a;
1307:   thrust::device_ptr<const PetscScalar> dptr;

1310:   VecCUDAGetArrayRead(v, &a);
1311:   dptr = thrust::device_pointer_cast(a);
1312:   PetscLogGpuTimeBegin();
1313:   try {
1314:     *sum = thrust::reduce(VecCUDAThrustPolicy(v), dptr, dptr + n, PetscScalar(0.0));
1315:   } catch (char *ex) {
1316:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1317:   }
1318:   PetscLogGpuTimeEnd();
1319:   VecCUDARestoreArrayRead(v, &a);
1320:   return 0;
1321: }

1323: struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
1324:   const PetscScalar shift_;
1325:   petscshift(PetscScalar shift) : shift_(shift) { }
1326:   __host__ __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
1327: };

1329: PetscErrorCode VecShift_SeqCUDA(Vec v, PetscScalar shift)
1330: {
1331:   PetscInt                        n = v->map->n;
1332:   PetscScalar                    *a;
1333:   thrust::device_ptr<PetscScalar> dptr;

1336:   VecCUDAGetArray(v, &a);
1337:   dptr = thrust::device_pointer_cast(a);
1338:   PetscLogGpuTimeBegin();
1339:   try {
1340:     thrust::transform(VecCUDAThrustPolicy(v), dptr, dptr + n, dptr, petscshift(shift)); /* in-place transform */
1341:   } catch (char *ex) {
1342:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1343:   }
1344:   PetscLogGpuTimeEnd();
1345:   VecCUDARestoreArray(v, &a);
1346:   return 0;
1347: }

1349: #if defined(PETSC_HAVE_NVSHMEM)
1350: /* Free old CUDA array and re-allocate a new one from nvshmem symmetric heap.
 1351:    The new array does not retain the values of the old array. The offload mask is not changed.

1353:    Note: the function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1354:  */
1355: PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec v)
1356: {
1357:   cudaError_t cerr;
1358:   Vec_CUDA   *veccuda = (Vec_CUDA *)v->spptr;
1359:   PetscInt    n;

1361:   cudaFree(veccuda->GPUarray_allocated);
1362:   VecGetLocalSize(v, &n);
1363:   MPIU_Allreduce(MPI_IN_PLACE, &n, 1, MPIU_INT, MPI_MAX, PETSC_COMM_WORLD);
1364:   PetscNvshmemMalloc(n * sizeof(PetscScalar), (void **)&veccuda->GPUarray_allocated);
1365:   veccuda->GPUarray = veccuda->GPUarray_allocated;
1366:   veccuda->nvshmem  = PETSC_TRUE;
1367:   return 0;
1368: }
1369: #endif