Actual source code: veccuda2.cu

  1: /*
  2:    Implements the sequential CUDA vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 10: #include <petsc/private/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>
 16: #include <thrust/reduce.h>

 18: /*
 19:     Allocates space for the vector array on the GPU if it does not already exist.
 20:     Does NOT change the vector's offload mask (v->offloadmask).
 21:     Does NOT zero the CUDA array.
 23:  */
 24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 25: {
 27:   cudaError_t    err;
 28:   Vec_CUDA       *veccuda;
 29:   PetscBool      option_set;

 32:   if (!v->spptr) {
 33:     PetscReal pinned_memory_min;
 34:     PetscCalloc(sizeof(Vec_CUDA),&v->spptr);
 35:     veccuda = (Vec_CUDA*)v->spptr;
 36:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 37:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 38:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 39:       if (v->data && ((Vec_Seq*)v->data)->array) {
 40:         v->offloadmask = PETSC_OFFLOAD_CPU;
 41:       } else {
 42:         v->offloadmask = PETSC_OFFLOAD_GPU;
 43:       }
 44:     }
 45:     pinned_memory_min = 0;

 47:     /* Need to parse the command line here for the minimum size at which to use pinned memory for host allocations.
 48:        Note: this same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
 49:     PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
 50:     PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
 51:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 52:     PetscOptionsEnd();
 53:   }
 54:   return(0);
 55: }
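/* Editorial sketch (not part of the original file): the threshold parsed above can be
   set from the command line or programmatically via the API named in the option help
   string; host allocations below it use ordinary pageable memory. Error checks omitted.

     // command line:  ./app -vec_pinned_memory_min 4096
     // or in code:
     VecSetPinnedMemoryMin(v,4096);   // host buffers of >= 4096 bytes use pinned memory
*/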

 57: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 58: PetscErrorCode VecCUDACopyToGPU(Vec v)
 59: {
 61:   cudaError_t    err;
 62:   Vec_CUDA       *veccuda;
 63:   PetscScalar    *varray;

 67:   VecCUDAAllocateCheck(v);
 68:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 69:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 70:     veccuda        = (Vec_CUDA*)v->spptr;
 71:     varray         = veccuda->GPUarray;
 72:     err            = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 73:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 74:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 75:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 76:   }
 77:   return(0);
 78: }

 80: /*
 81:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
 82: */
 83: PetscErrorCode VecCUDACopyFromGPU(Vec v)
 84: {
 86:   cudaError_t    err;
 87:   Vec_CUDA       *veccuda;
 88:   PetscScalar    *varray;

 92:   VecCUDAAllocateCheckHost(v);
 93:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
 94:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
 95:     veccuda        = (Vec_CUDA*)v->spptr;
 96:     varray         = veccuda->GPUarray;
 97:     err            = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
 98:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
 99:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
100:     v->offloadmask = PETSC_OFFLOAD_BOTH;
101:   }
102:   return(0);
103: }
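/* Editorial note on the two copy routines above: together they implement a small
   coherence protocol on v->offloadmask:
     PETSC_OFFLOAD_CPU  - host copy is current; VecCUDACopyToGPU promotes to BOTH
     PETSC_OFFLOAD_GPU  - device copy is current; VecCUDACopyFromGPU promotes to BOTH
     PETSC_OFFLOAD_BOTH - both copies are current; either routine is a no-op
   Write accessors elsewhere downgrade the mask back to CPU or GPU. */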

105: /*MC
106:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

108:    Options Database Keys:
109: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

111:   Level: beginner

113: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
114: M*/
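/* A minimal creation sketch for VECSEQCUDA (editorial; error checks omitted, n assumed
   to be a declared PetscInt):

     Vec x;
     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,PETSC_DECIDE,n);
     VecSetType(x,VECSEQCUDA);        // or VecSetFromOptions(x) with -vec_type seqcuda
     VecSet(x,1.0);
     VecDestroy(&x);
*/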

116: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
117: {
118:   const PetscScalar *xarray;
119:   PetscScalar       *yarray;
120:   PetscErrorCode    ierr;
121:   PetscBLASInt      one = 1,bn = 0;
122:   PetscScalar       sone = 1.0;
123:   cublasHandle_t    cublasv2handle;
124:   cublasStatus_t    cberr;
125:   cudaError_t       err;

128:   PetscCUBLASGetHandle(&cublasv2handle);
129:   PetscBLASIntCast(yin->map->n,&bn);
130:   VecCUDAGetArrayRead(xin,&xarray);
131:   VecCUDAGetArray(yin,&yarray);
132:   PetscLogGpuTimeBegin();
133:   if (alpha == (PetscScalar)0.0) {
134:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
135:   } else if (alpha == (PetscScalar)1.0) {
136:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
137:     PetscLogGpuFlops(1.0*yin->map->n);
138:   } else {
139:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
140:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
141:     PetscLogGpuFlops(2.0*yin->map->n);
142:   }
143:   PetscLogGpuTimeEnd();
144:   VecCUDARestoreArrayRead(xin,&xarray);
145:   VecCUDARestoreArray(yin,&yarray);
146:   PetscLogCpuToGpuScalar(sizeof(PetscScalar));
147:   return(0);
148: }
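/* Editorial note: the three BLAS-style updates in this file differ only in which
   operand is scaled: VecAYPX computes y = alpha*y + x (above), VecAXPY computes
   y = y + alpha*x (below), and VecWAXPY computes w = alpha*x + y. */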

150: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
151: {
152:   const PetscScalar *xarray;
153:   PetscScalar       *yarray;
154:   PetscErrorCode    ierr;
155:   PetscBLASInt      one = 1,bn = 0;
156:   cublasHandle_t    cublasv2handle;
157:   cublasStatus_t    cberr;
158:   PetscBool         xiscuda;

161:   if (alpha == (PetscScalar)0.0) return(0);
162:   PetscCUBLASGetHandle(&cublasv2handle);
163:   PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
164:   if (xiscuda) {
165:     PetscBLASIntCast(yin->map->n,&bn);
166:     VecCUDAGetArrayRead(xin,&xarray);
167:     VecCUDAGetArray(yin,&yarray);
168:     PetscLogGpuTimeBegin();
169:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
170:     PetscLogGpuTimeEnd();
171:     VecCUDARestoreArrayRead(xin,&xarray);
172:     VecCUDARestoreArray(yin,&yarray);
173:     PetscLogGpuFlops(2.0*yin->map->n);
174:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
175:   } else {
176:     VecAXPY_Seq(yin,alpha,xin);
177:   }
178:   return(0);
179: }

181: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
182: {
183:   PetscInt                              n = xin->map->n;
184:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
185:   PetscScalar                           *warray=NULL;
186:   thrust::device_ptr<const PetscScalar> xptr,yptr;
187:   thrust::device_ptr<PetscScalar>       wptr;
188:   PetscErrorCode                        ierr;

191:   if (xin->boundtocpu || yin->boundtocpu) {
192:     VecPointwiseDivide_Seq(win,xin,yin);
193:     return(0);
194:   }
195:   VecCUDAGetArrayWrite(win,&warray);
196:   VecCUDAGetArrayRead(xin,&xarray);
197:   VecCUDAGetArrayRead(yin,&yarray);
198:   PetscLogGpuTimeBegin();
199:   try {
200:     wptr = thrust::device_pointer_cast(warray);
201:     xptr = thrust::device_pointer_cast(xarray);
202:     yptr = thrust::device_pointer_cast(yarray);
203:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
204:   } catch (char *ex) {
205:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
206:   }
207:   PetscLogGpuTimeEnd();
208:   PetscLogGpuFlops(n);
209:   VecCUDARestoreArrayRead(xin,&xarray);
210:   VecCUDARestoreArrayRead(yin,&yarray);
211:   VecCUDARestoreArrayWrite(win,&warray);
212:   return(0);
213: }
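/* Editorial sketch: thrust::divides<PetscScalar>() above does not guard against zero
   denominators. If that mattered, a custom functor in the style of
   PetscScalarReciprocal (defined later in this file) could be substituted; the functor
   below is hypothetical, not part of PETSc:

     struct guarded_divide {
       __host__ __device__
       PetscScalar operator()(const PetscScalar& x,const PetscScalar& y) {
         return (y != (PetscScalar)0.0) ? x/y : (PetscScalar)0.0;
       }
     };
     // thrust::transform(xptr,xptr+n,yptr,wptr,guarded_divide());
*/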

215: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
216: {
217:   const PetscScalar *xarray=NULL,*yarray=NULL;
218:   PetscScalar       *warray=NULL;
219:   PetscErrorCode    ierr;
220:   PetscBLASInt      one = 1,bn = 0;
221:   cublasHandle_t    cublasv2handle;
222:   cublasStatus_t    stat;
223:   cudaError_t       cerr;
224:   cudaStream_t      stream;

227:   PetscCUBLASGetHandle(&cublasv2handle);
228:   PetscBLASIntCast(win->map->n,&bn);
229:   if (alpha == (PetscScalar)0.0) {
230:     VecCopy_SeqCUDA(yin,win);
231:   } else {
232:     VecCUDAGetArrayRead(xin,&xarray);
233:     VecCUDAGetArrayRead(yin,&yarray);
234:     VecCUDAGetArrayWrite(win,&warray);
235:     PetscLogGpuTimeBegin();
236:     stat = cublasGetStream(cublasv2handle,&stream);CHKERRCUBLAS(stat);
237:     cerr = cudaMemcpyAsync(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,stream);CHKERRCUDA(cerr);
238:     stat = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(stat);
239:     PetscLogGpuTimeEnd();
240:     PetscLogGpuFlops(2*win->map->n);
241:     VecCUDARestoreArrayRead(xin,&xarray);
242:     VecCUDARestoreArrayRead(yin,&yarray);
243:     VecCUDARestoreArrayWrite(win,&warray);
244:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
245:   }
246:   return(0);
247: }

249: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
250: {
251:   PetscErrorCode    ierr;
252:   PetscInt          n = xin->map->n,j;
253:   PetscScalar       *xarray;
254:   const PetscScalar *yarray;
255:   PetscBLASInt      one = 1,bn = 0;
256:   cublasHandle_t    cublasv2handle;
257:   cublasStatus_t    cberr;

260:   PetscLogGpuFlops(nv*2.0*n);
261:   PetscLogCpuToGpuScalar(nv*sizeof(PetscScalar));
262:   PetscCUBLASGetHandle(&cublasv2handle);
263:   PetscBLASIntCast(n,&bn);
264:   VecCUDAGetArray(xin,&xarray);
265:   PetscLogGpuTimeBegin();
266:   for (j=0; j<nv; j++) {
267:     VecCUDAGetArrayRead(y[j],&yarray);
268:     cberr = cublasXaxpy(cublasv2handle,bn,alpha+j,yarray,one,xarray,one);CHKERRCUBLAS(cberr);
269:     VecCUDARestoreArrayRead(y[j],&yarray);
270:   }
271:   PetscLogGpuTimeEnd();
272:   VecCUDARestoreArray(xin,&xarray);
273:   return(0);
274: }

276: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
277: {
278:   const PetscScalar *xarray,*yarray;
279:   PetscErrorCode    ierr;
280:   PetscBLASInt      one = 1,bn = 0;
281:   cublasHandle_t    cublasv2handle;
282:   cublasStatus_t    cerr;

285:   PetscCUBLASGetHandle(&cublasv2handle);
286:   PetscBLASIntCast(yin->map->n,&bn);
287:   VecCUDAGetArrayRead(xin,&xarray);
288:   VecCUDAGetArrayRead(yin,&yarray);
289:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
290:   PetscLogGpuTimeBegin();
291:   cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
292:   PetscLogGpuTimeEnd();
 293:   if (xin->map->n > 0) {
294:     PetscLogGpuFlops(2.0*xin->map->n-1);
295:   }
296:   PetscLogGpuToCpuScalar(sizeof(PetscScalar));
297:   VecCUDARestoreArrayRead(xin,&xarray);
298:   VecCUDARestoreArrayRead(yin,&yarray);
299:   return(0);
300: }

302: //
303: // CUDA kernels for MDot to follow
304: //

306: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
307: #define MDOT_WORKGROUP_SIZE 128
308: #define MDOT_WORKGROUP_NUM  128
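/* Editorial note on the kernels below: the grid is MDOT_WORKGROUP_NUM blocks of
   MDOT_WORKGROUP_SIZE threads. Each block strides over its contiguous slice of the
   vectors, accumulating one partial sum per dot product in shared memory, then a tree
   reduction halves the active threads each step (which is why the block size must be a
   power of 2). Block b stores its partial result for dot product k at
   group_results[b + k*gridDim.x]; the final sum over blocks is done on the host in
   VecMDot_SeqCUDA. */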

310: #if !defined(PETSC_USE_COMPLEX)
311: // M = 2:
312: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
313:                                         PetscInt size, PetscScalar *group_results)
314: {
315:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
316:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
317:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
318:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
319:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

321:   PetscScalar entry_x    = 0;
322:   PetscScalar group_sum0 = 0;
323:   PetscScalar group_sum1 = 0;
324:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
325:     entry_x     = x[i];   // load only once from global memory!
326:     group_sum0 += entry_x * y0[i];
327:     group_sum1 += entry_x * y1[i];
328:   }
329:   tmp_buffer[threadIdx.x]                       = group_sum0;
330:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

332:   // parallel reduction
333:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
334:     __syncthreads();
335:     if (threadIdx.x < stride) {
336:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
337:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
338:     }
339:   }

341:   // write result of group to group_results
342:   if (threadIdx.x == 0) {
343:     group_results[blockIdx.x]             = tmp_buffer[0];
344:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
345:   }
346: }

348: // M = 3:
349: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
350:                                         PetscInt size, PetscScalar *group_results)
351: {
352:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
353:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
354:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
355:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
356:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

358:   PetscScalar entry_x    = 0;
359:   PetscScalar group_sum0 = 0;
360:   PetscScalar group_sum1 = 0;
361:   PetscScalar group_sum2 = 0;
362:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
363:     entry_x     = x[i];   // load only once from global memory!
364:     group_sum0 += entry_x * y0[i];
365:     group_sum1 += entry_x * y1[i];
366:     group_sum2 += entry_x * y2[i];
367:   }
368:   tmp_buffer[threadIdx.x]                           = group_sum0;
369:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
370:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

372:   // parallel reduction
373:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
374:     __syncthreads();
375:     if (threadIdx.x < stride) {
376:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
377:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
378:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
379:     }
380:   }

382:   // write result of group to group_results
383:   if (threadIdx.x == 0) {
384:     group_results[blockIdx.x                ] = tmp_buffer[0];
385:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
386:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
387:   }
388: }

390: // M = 4:
391: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
392:                                         PetscInt size, PetscScalar *group_results)
393: {
394:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
395:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
396:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
397:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
398:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

400:   PetscScalar entry_x    = 0;
401:   PetscScalar group_sum0 = 0;
402:   PetscScalar group_sum1 = 0;
403:   PetscScalar group_sum2 = 0;
404:   PetscScalar group_sum3 = 0;
405:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
406:     entry_x     = x[i];   // load only once from global memory!
407:     group_sum0 += entry_x * y0[i];
408:     group_sum1 += entry_x * y1[i];
409:     group_sum2 += entry_x * y2[i];
410:     group_sum3 += entry_x * y3[i];
411:   }
412:   tmp_buffer[threadIdx.x]                           = group_sum0;
413:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
414:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
415:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

417:   // parallel reduction
418:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
419:     __syncthreads();
420:     if (threadIdx.x < stride) {
421:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
422:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
423:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
424:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
425:     }
426:   }

428:   // write result of group to group_results
429:   if (threadIdx.x == 0) {
430:     group_results[blockIdx.x                ] = tmp_buffer[0];
431:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
432:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
433:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
434:   }
435: }

437: // M = 8:
438: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
439:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
440:                                           PetscInt size, PetscScalar *group_results)
441: {
442:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
443:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
444:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
445:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
446:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

448:   PetscScalar entry_x    = 0;
449:   PetscScalar group_sum0 = 0;
450:   PetscScalar group_sum1 = 0;
451:   PetscScalar group_sum2 = 0;
452:   PetscScalar group_sum3 = 0;
453:   PetscScalar group_sum4 = 0;
454:   PetscScalar group_sum5 = 0;
455:   PetscScalar group_sum6 = 0;
456:   PetscScalar group_sum7 = 0;
457:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
458:     entry_x     = x[i];   // load only once from global memory!
459:     group_sum0 += entry_x * y0[i];
460:     group_sum1 += entry_x * y1[i];
461:     group_sum2 += entry_x * y2[i];
462:     group_sum3 += entry_x * y3[i];
463:     group_sum4 += entry_x * y4[i];
464:     group_sum5 += entry_x * y5[i];
465:     group_sum6 += entry_x * y6[i];
466:     group_sum7 += entry_x * y7[i];
467:   }
468:   tmp_buffer[threadIdx.x]                           = group_sum0;
469:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
470:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
471:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
472:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
473:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
474:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
475:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

477:   // parallel reduction
478:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
479:     __syncthreads();
480:     if (threadIdx.x < stride) {
481:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
482:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
483:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
484:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
485:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
486:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
487:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
488:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
489:     }
490:   }

492:   // write result of group to group_results
493:   if (threadIdx.x == 0) {
494:     group_results[blockIdx.x                ] = tmp_buffer[0];
495:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
496:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
497:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
498:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
499:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
500:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
501:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
502:   }
503: }
504: #endif /* !defined(PETSC_USE_COMPLEX) */

506: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
507: {
508:   PetscErrorCode    ierr;
509:   PetscInt          i,n = xin->map->n,current_y_index = 0;
510:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
511: #if !defined(PETSC_USE_COMPLEX)
512:   PetscInt          nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
513:   PetscScalar       *group_results_gpu,*group_results_cpu;
514:   cudaError_t       cuda_ierr;
515: #endif
516:   PetscBLASInt      one = 1,bn = 0;
517:   cublasHandle_t    cublasv2handle;
518:   cublasStatus_t    cberr;

521:   PetscCUBLASGetHandle(&cublasv2handle);
522:   PetscBLASIntCast(xin->map->n,&bn);
523:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
524:   /* Handle the case of local size zero first */
525:   if (!xin->map->n) {
526:     for (i=0; i<nv; ++i) z[i] = 0;
527:     return(0);
528:   }

530: #if !defined(PETSC_USE_COMPLEX)
531:   // allocate scratchpad memory for the results of individual work groups:
 532:   cuda_ierr = cudaMalloc((void**)&group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);CHKERRCUDA(cuda_ierr);
533:   PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
534: #endif
535:   VecCUDAGetArrayRead(xin,&xptr);
536:   PetscLogGpuTimeBegin();

 538:   while (current_y_index < nv) {
540:     switch (nv - current_y_index) {

542:       case 7:
543:       case 6:
544:       case 5:
545:       case 4:
546:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
547:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
548:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
549:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
550: #if defined(PETSC_USE_COMPLEX)
551:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
552:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
553:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
554:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
555: #else
556:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
557: #endif
558:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
559:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
560:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
561:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
562:         current_y_index += 4;
563:         break;

565:       case 3:
566:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
567:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
568:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

570: #if defined(PETSC_USE_COMPLEX)
571:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
572:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
573:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
574: #else
575:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
576: #endif
577:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
578:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
579:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
580:         current_y_index += 3;
581:         break;

583:       case 2:
584:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
585:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
586: #if defined(PETSC_USE_COMPLEX)
587:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
588:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
589: #else
590:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
591: #endif
592:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
593:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
594:         current_y_index += 2;
595:         break;

597:       case 1:
598:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
599:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
600:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
601:         current_y_index += 1;
602:         break;

604:       default: // 8 or more vectors left
605:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
606:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
607:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
608:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
609:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
610:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
611:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
612:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
613: #if defined(PETSC_USE_COMPLEX)
614:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
615:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
616:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
617:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
618:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
619:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
620:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
621:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
622: #else
623:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
624: #endif
625:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
626:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
627:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
628:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
629:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
630:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
631:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
632:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
633:         current_y_index += 8;
634:         break;
635:     }
636:   }
637:   PetscLogGpuTimeEnd();
638:   VecCUDARestoreArrayRead(xin,&xptr);

640: #if defined(PETSC_USE_COMPLEX)
641:   PetscLogGpuToCpuScalar(nv*sizeof(PetscScalar));
642: #else
643:   // copy results to CPU
 644:   cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

646:   // sum group results into z
647:   for (j=0; j<nv1; ++j) {
648:     z[j] = 0;
649:     for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
650:   }
651:   PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
 652:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
653:   PetscFree(group_results_cpu);
654:   PetscLogGpuToCpuScalar(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
655: #endif
656:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
657:   return(0);
658: }
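/* Editorial note: nv1 above excludes the last vector when nv % 4 == 1 because that
   remainder goes through the "case 1" cublasXdot path, which writes its result
   directly into z and never touches group_results_gpu. */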

660: #undef MDOT_WORKGROUP_SIZE
661: #undef MDOT_WORKGROUP_NUM

663: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
664: {
665:   PetscInt                        n = xin->map->n;
666:   PetscScalar                     *xarray = NULL;
667:   thrust::device_ptr<PetscScalar> xptr;
668:   PetscErrorCode                  ierr;
669:   cudaError_t                     err;

672:   VecCUDAGetArrayWrite(xin,&xarray);
673:   PetscLogGpuTimeBegin();
674:   if (alpha == (PetscScalar)0.0) {
675:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
676:   } else {
677:     try {
678:       xptr = thrust::device_pointer_cast(xarray);
679:       thrust::fill(xptr,xptr+n,alpha);
680:     } catch (char *ex) {
681:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
682:     }
683:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
684:   }
685:   PetscLogGpuTimeEnd();
686:   VecCUDARestoreArrayWrite(xin,&xarray);
687:   return(0);
688: }
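/* Editorial note: the cudaMemset fast path above is correct only for alpha == 0.0,
   since the all-zero byte pattern is the IEEE-754 encoding of 0.0 (and of complex
   zero); any other value must go through thrust::fill. */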

690: struct PetscScalarReciprocal
691: {
692:   __host__ __device__
693:   PetscScalar operator()(const PetscScalar& s)
694:   {
695:     return (s != (PetscScalar)0.0) ? (PetscScalar)1.0/s : 0.0;
696:   }
697: };

699: PetscErrorCode VecReciprocal_SeqCUDA(Vec v)
700: {
702:   PetscInt       n;
703:   PetscScalar    *x;

706:   VecGetLocalSize(v,&n);
707:   VecCUDAGetArray(v,&x);
708:   PetscLogGpuTimeBegin();
709:   try {
710:     auto xptr = thrust::device_pointer_cast(x);
711:     thrust::transform(xptr,xptr+n,xptr,PetscScalarReciprocal());
712:   } catch (char *ex) {
713:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
714:   }
715:   PetscLogGpuTimeEnd();
716:   VecCUDARestoreArray(v,&x);
717:   return(0);
718: }

720: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
721: {
722:   PetscScalar    *xarray;
724:   PetscBLASInt   one = 1,bn = 0;
725:   cublasHandle_t cublasv2handle;
726:   cublasStatus_t cberr;

729:   if (alpha == (PetscScalar)0.0) {
730:     VecSet_SeqCUDA(xin,alpha);
731:   } else if (alpha != (PetscScalar)1.0) {
732:     PetscCUBLASGetHandle(&cublasv2handle);
733:     PetscBLASIntCast(xin->map->n,&bn);
734:     VecCUDAGetArray(xin,&xarray);
735:     PetscLogGpuTimeBegin();
736:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
737:     VecCUDARestoreArray(xin,&xarray);
738:     PetscLogGpuTimeEnd();
739:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
740:     PetscLogGpuFlops(xin->map->n);
741:   }
742:   return(0);
743: }

745: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
746: {
747:   const PetscScalar *xarray,*yarray;
748:   PetscErrorCode    ierr;
749:   PetscBLASInt      one = 1,bn = 0;
750:   cublasHandle_t    cublasv2handle;
751:   cublasStatus_t    cerr;

754:   PetscCUBLASGetHandle(&cublasv2handle);
755:   PetscBLASIntCast(xin->map->n,&bn);
756:   VecCUDAGetArrayRead(xin,&xarray);
757:   VecCUDAGetArrayRead(yin,&yarray);
758:   PetscLogGpuTimeBegin();
759:   cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
760:   PetscLogGpuTimeEnd();
761:   if (xin->map->n > 0) {
762:     PetscLogGpuFlops(2.0*xin->map->n-1);
763:   }
764:   PetscLogGpuToCpuScalar(sizeof(PetscScalar));
765:   VecCUDARestoreArrayRead(yin,&yarray);
766:   VecCUDARestoreArrayRead(xin,&xarray);
767:   return(0);
768: }
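/* Editorial note: VecDot_SeqCUDA conjugates y (z = sum_i x_i*conj(y_i), hence the
   swapped cublasXdot arguments there), while VecTDot above computes the unconjugated
   product z = sum_i x_i*y_i via cublasXdotu. For example, with complex x = (i) and
   y = (i), VecDot yields 1 while VecTDot yields -1. */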

770: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
771: {
772:   const PetscScalar *xarray;
773:   PetscScalar       *yarray;
774:   PetscErrorCode    ierr;
775:   cudaError_t       err;

778:   if (xin != yin) {
779:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
780:       PetscBool yiscuda;

782:       PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
783:       VecCUDAGetArrayRead(xin,&xarray);
784:       if (yiscuda) {
785:         VecCUDAGetArrayWrite(yin,&yarray);
786:       } else {
787:         VecGetArrayWrite(yin,&yarray);
788:       }
789:       PetscLogGpuTimeBegin();
790:       if (yiscuda) {
791:         err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
792:       } else {
793:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
794:       }
795:       PetscLogGpuTimeEnd();
796:       VecCUDARestoreArrayRead(xin,&xarray);
797:       if (yiscuda) {
798:         VecCUDARestoreArrayWrite(yin,&yarray);
799:       } else {
800:         VecRestoreArrayWrite(yin,&yarray);
801:       }
802:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
803:       /* copy in CPU if we are on the CPU */
804:       VecCopy_SeqCUDA_Private(xin,yin);
805:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
806:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
807:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
808:         /* copy in CPU */
809:         VecCopy_SeqCUDA_Private(xin,yin);
810:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
811:         /* copy in GPU */
812:         VecCUDAGetArrayRead(xin,&xarray);
813:         VecCUDAGetArrayWrite(yin,&yarray);
814:         PetscLogGpuTimeBegin();
815:         err  = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
816:         PetscLogGpuTimeEnd();
817:         VecCUDARestoreArrayRead(xin,&xarray);
818:         VecCUDARestoreArrayWrite(yin,&yarray);
819:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
 820:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
 821:            default to copying on the GPU (this is an arbitrary choice) */
822:         VecCUDAGetArrayRead(xin,&xarray);
823:         VecCUDAGetArrayWrite(yin,&yarray);
824:         PetscLogGpuTimeBegin();
825:         err  = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
826:         PetscLogGpuTimeEnd();
827:         VecCUDARestoreArrayRead(xin,&xarray);
828:         VecCUDARestoreArrayWrite(yin,&yarray);
829:       } else {
830:         VecCopy_SeqCUDA_Private(xin,yin);
831:       }
832:     }
833:   }
834:   return(0);
835: }
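/* Editorial summary of the copy above, keyed on xin->offloadmask:
     PETSC_OFFLOAD_GPU  - copy device-to-device (device-to-host if yin is not CUDA)
     PETSC_OFFLOAD_CPU  - plain host copy via VecCopy_SeqCUDA_Private()
     PETSC_OFFLOAD_BOTH - copy on whichever side holds yin's valid data, defaulting
                          to the GPU when yin is valid in both places. */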

837: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
838: {
840:   PetscBLASInt   one = 1,bn = 0;
841:   PetscScalar    *xarray,*yarray;
842:   cublasHandle_t cublasv2handle;
843:   cublasStatus_t cberr;

846:   PetscCUBLASGetHandle(&cublasv2handle);
847:   PetscBLASIntCast(xin->map->n,&bn);
848:   if (xin != yin) {
849:     VecCUDAGetArray(xin,&xarray);
850:     VecCUDAGetArray(yin,&yarray);
851:     PetscLogGpuTimeBegin();
852:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
853:     PetscLogGpuTimeEnd();
854:     VecCUDARestoreArray(xin,&xarray);
855:     VecCUDARestoreArray(yin,&yarray);
856:   }
857:   return(0);
858: }

860: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
861: {
862:   PetscErrorCode    ierr;
863:   PetscScalar       a = alpha,b = beta;
864:   const PetscScalar *xarray;
865:   PetscScalar       *yarray;
866:   PetscBLASInt      one = 1, bn = 0;
867:   cublasHandle_t    cublasv2handle;
868:   cublasStatus_t    cberr;
869:   cudaError_t       err;

872:   PetscCUBLASGetHandle(&cublasv2handle);
873:   PetscBLASIntCast(yin->map->n,&bn);
874:   if (a == (PetscScalar)0.0) {
875:     VecScale_SeqCUDA(yin,beta);
876:   } else if (b == (PetscScalar)1.0) {
877:     VecAXPY_SeqCUDA(yin,alpha,xin);
878:   } else if (a == (PetscScalar)1.0) {
879:     VecAYPX_SeqCUDA(yin,beta,xin);
880:   } else if (b == (PetscScalar)0.0) {
881:     VecCUDAGetArrayRead(xin,&xarray);
882:     VecCUDAGetArray(yin,&yarray);
883:     PetscLogGpuTimeBegin();
884:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
885:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
886:     PetscLogGpuTimeEnd();
887:     PetscLogGpuFlops(xin->map->n);
888:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
889:     VecCUDARestoreArrayRead(xin,&xarray);
890:     VecCUDARestoreArray(yin,&yarray);
891:   } else {
892:     VecCUDAGetArrayRead(xin,&xarray);
893:     VecCUDAGetArray(yin,&yarray);
894:     PetscLogGpuTimeBegin();
895:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
896:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
897:     VecCUDARestoreArrayRead(xin,&xarray);
898:     VecCUDARestoreArray(yin,&yarray);
899:     PetscLogGpuTimeEnd();
900:     PetscLogGpuFlops(3.0*xin->map->n);
901:     PetscLogCpuToGpuScalar(2*sizeof(PetscScalar));
902:   }
903:   return(0);
904: }

906: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
907: {
909:   PetscInt       n = zin->map->n;

912:   if (gamma == (PetscScalar)1.0) {
 913:     /* z = a*x + b*y + z */
914:     VecAXPY_SeqCUDA(zin,alpha,xin);
915:     VecAXPY_SeqCUDA(zin,beta,yin);
916:     PetscLogGpuFlops(4.0*n);
917:   } else {
918:     /* z = a*x + b*y + c*z */
919:     VecScale_SeqCUDA(zin,gamma);
920:     VecAXPY_SeqCUDA(zin,alpha,xin);
921:     VecAXPY_SeqCUDA(zin,beta,yin);
922:     PetscLogGpuFlops(5.0*n);
923:   }
924:   return(0);
925: }
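/* Editorial note on the counts above: an AXPY costs 2n flops (one multiply and one
   add per entry), so two of them give the 4n figure, and the general branch adds n
   more for the VecScale, giving 5n. */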

927: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
928: {
929:   PetscInt                              n = win->map->n;
930:   const PetscScalar                     *xarray,*yarray;
931:   PetscScalar                           *warray;
932:   thrust::device_ptr<const PetscScalar> xptr,yptr;
933:   thrust::device_ptr<PetscScalar>       wptr;
934:   PetscErrorCode                        ierr;

937:   if (xin->boundtocpu || yin->boundtocpu) {
938:     VecPointwiseMult_Seq(win,xin,yin);
939:     return(0);
940:   }
941:   VecCUDAGetArrayRead(xin,&xarray);
942:   VecCUDAGetArrayRead(yin,&yarray);
943:   VecCUDAGetArrayWrite(win,&warray);
944:   PetscLogGpuTimeBegin();
945:   try {
946:     wptr = thrust::device_pointer_cast(warray);
947:     xptr = thrust::device_pointer_cast(xarray);
948:     yptr = thrust::device_pointer_cast(yarray);
949:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
950:   } catch (char *ex) {
951:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
952:   }
953:   PetscLogGpuTimeEnd();
954:   VecCUDARestoreArrayRead(xin,&xarray);
955:   VecCUDARestoreArrayRead(yin,&yarray);
956:   VecCUDARestoreArrayWrite(win,&warray);
957:   PetscLogGpuFlops(n);
958:   return(0);
959: }

 961: /* TODO: compute the infinity norm with a dedicated CUDA kernel instead of cublasIXamax plus a one-element host copy */

963: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
964: {
965:   PetscErrorCode    ierr;
966:   PetscInt          n = xin->map->n;
967:   PetscBLASInt      one = 1, bn = 0;
968:   const PetscScalar *xarray;
969:   cublasHandle_t    cublasv2handle;
970:   cublasStatus_t    cberr;
971:   cudaError_t       err;

974:   PetscCUBLASGetHandle(&cublasv2handle);
975:   PetscBLASIntCast(n,&bn);
976:   if (type == NORM_2 || type == NORM_FROBENIUS) {
977:     VecCUDAGetArrayRead(xin,&xarray);
978:     PetscLogGpuTimeBegin();
979:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
980:     PetscLogGpuTimeEnd();
981:     VecCUDARestoreArrayRead(xin,&xarray);
982:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
983:   } else if (type == NORM_INFINITY) {
984:     int  i;
985:     VecCUDAGetArrayRead(xin,&xarray);
986:     PetscLogGpuTimeBegin();
987:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
988:     PetscLogGpuTimeEnd();
989:     if (bn) {
990:       PetscScalar zs;
991:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
992:       *z = PetscAbsScalar(zs);
993:     } else *z = 0.0;
994:     VecCUDARestoreArrayRead(xin,&xarray);
995:   } else if (type == NORM_1) {
996:     VecCUDAGetArrayRead(xin,&xarray);
997:     PetscLogGpuTimeBegin();
998:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
999:     VecCUDARestoreArrayRead(xin,&xarray);
1000:     PetscLogGpuTimeEnd();
1001:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1002:   } else if (type == NORM_1_AND_2) {
1003:     VecNorm_SeqCUDA(xin,NORM_1,z);
1004:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
1005:   }
1006:   PetscLogGpuToCpuScalar(sizeof(PetscReal));
1007:   return(0);
1008: }
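/* Editorial note: for NORM_1_AND_2 the caller must supply z with room for two
   PetscReal values; the recursion above stores the 1-norm in z[0] and the 2-norm in
   z[1]. */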

1010: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1011: {

1015:   VecDot_SeqCUDA(s,t,dp);
1016:   VecDot_SeqCUDA(t,t,nm);
1017:   return(0);
1018: }

1020: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1021: {
1023:   cudaError_t    cerr;
1024:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;

1027:   if (v->spptr) {
1028:     if (veccuda->GPUarray_allocated) {
1029:      #if defined(PETSC_HAVE_NVSHMEM)
1030:       if (veccuda->nvshmem) {
1031:         PetscNvshmemFree(veccuda->GPUarray_allocated);
1032:         veccuda->nvshmem = PETSC_FALSE;
1033:       }
1034:       else
1035:      #endif
1036:       {cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);}
1037:       veccuda->GPUarray_allocated = NULL;
1038:     }
1039:     if (veccuda->stream) {
1040:       cerr = cudaStreamDestroy(veccuda->stream);CHKERRCUDA(cerr);
1041:     }
1042:   }
1043:   VecDestroy_SeqCUDA_Private(v);
1044:   PetscFree(v->spptr);
1045:   return(0);
1046: }

1048: #if defined(PETSC_USE_COMPLEX)
1049: struct conjugate
1050: {
1051:   __host__ __device__
1052:     PetscScalar operator()(const PetscScalar& x)
1053:     {
1054:       return PetscConj(x);
1055:     }
1056: };
1057: #endif

1059: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1060: {
1061: #if defined(PETSC_USE_COMPLEX)
1062:   PetscScalar                     *xarray;
1063:   PetscErrorCode                  ierr;
1064:   PetscInt                        n = xin->map->n;
1065:   thrust::device_ptr<PetscScalar> xptr;

1068:   VecCUDAGetArray(xin,&xarray);
1069:   PetscLogGpuTimeBegin();
1070:   try {
1071:     xptr = thrust::device_pointer_cast(xarray);
1072:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1073:   } catch (char *ex) {
1074:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1075:   }
1076:   PetscLogGpuTimeEnd();
1077:   VecCUDARestoreArray(xin,&xarray);
1078: #else
1080: #endif
1081:   return(0);
1082: }

1084: PETSC_STATIC_INLINE PetscErrorCode VecGetLocalVectorK_SeqCUDA(Vec v,Vec w,PetscBool read)
1085: {
1087:   cudaError_t    err;
1088:   PetscBool      wisseqcuda;

1094:   PetscObjectTypeCompare((PetscObject)w,VECSEQCUDA,&wisseqcuda);
1095:   if (w->data && wisseqcuda) {
1096:     if (((Vec_Seq*)w->data)->array_allocated) {
1097:       if (w->pinned_memory) {
1098:         PetscMallocSetCUDAHost();
1099:       }
1100:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1101:       if (w->pinned_memory) {
1102:         PetscMallocResetCUDAHost();
1103:         w->pinned_memory = PETSC_FALSE;
1104:       }
1105:     }
1106:     ((Vec_Seq*)w->data)->array = NULL;
1107:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1108:   }
1109:   if (w->spptr && wisseqcuda) {
1110:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1111:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1112:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1113:     }
1114:     if (((Vec_CUDA*)w->spptr)->stream) {
1115:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1116:     }
1117:     PetscFree(w->spptr);
1118:   }

1120:   if (v->petscnative && wisseqcuda) {
1121:     PetscFree(w->data);
1122:     w->data = v->data;
1123:     w->offloadmask = v->offloadmask;
1124:     w->pinned_memory = v->pinned_memory;
1125:     w->spptr = v->spptr;
1126:     PetscObjectStateIncrease((PetscObject)w);
1127:   } else {
1128:     if (read) {
1129:       VecGetArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
1130:     } else {
1131:       VecGetArray(v,&((Vec_Seq*)w->data)->array);
1132:     }
1133:     w->offloadmask = PETSC_OFFLOAD_CPU;
1134:     if (wisseqcuda) {
1135:       VecCUDAAllocateCheck(w);
1136:     }
1137:   }
1138:   return(0);
1139: }

1141: PETSC_STATIC_INLINE PetscErrorCode VecRestoreLocalVectorK_SeqCUDA(Vec v,Vec w,PetscBool read)
1142: {
1144:   cudaError_t    err;
1145:   PetscBool      wisseqcuda;

1151:   PetscObjectTypeCompare((PetscObject)w,VECSEQCUDA,&wisseqcuda);
1152:   if (v->petscnative && wisseqcuda) {
1153:     v->data = w->data;
1154:     v->offloadmask = w->offloadmask;
1155:     v->pinned_memory = w->pinned_memory;
1156:     v->spptr = w->spptr;
1157:     w->data = 0;
1158:     w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1159:     w->spptr = 0;
1160:   } else {
1161:     if (read) {
1162:       VecRestoreArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
1163:     } else {
1164:       VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1165:     }
1166:     if (w->spptr && wisseqcuda) {
1167:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1168:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1169:       if (((Vec_CUDA*)w->spptr)->stream) {
1170:         err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1171:       }
1172:       PetscFree(w->spptr);
1173:     }
1174:   }
1175:   return(0);
1176: }
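/* Editorial note: the Get/Restore pair above implements a zero-copy fast path. When v
   is petscnative and w is VECSEQCUDA, w temporarily takes over v's data, spptr,
   offload mask, and pinned-memory flag, and the restore hands them back. Otherwise w
   merely wraps v's host array via VecGetArray(Read), and its offload mask drops to
   PETSC_OFFLOAD_CPU. */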

1178: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1179: {

1183:   VecGetLocalVectorK_SeqCUDA(v,w,PETSC_FALSE);
1184:   return(0);
1185: }

1187: PetscErrorCode VecGetLocalVectorRead_SeqCUDA(Vec v,Vec w)
1188: {

1192:   VecGetLocalVectorK_SeqCUDA(v,w,PETSC_TRUE);
1193:   return(0);
1194: }

1196: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1197: {

1201:   VecRestoreLocalVectorK_SeqCUDA(v,w,PETSC_FALSE);
1202:   return(0);
1203: }

1205: PetscErrorCode VecRestoreLocalVectorRead_SeqCUDA(Vec v,Vec w)
1206: {

1210:   VecRestoreLocalVectorK_SeqCUDA(v,w,PETSC_TRUE);
1211:   return(0);
1212: }

1214: struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
1215: {
1216:   __host__ __device__
1217:   PetscReal operator()(const PetscScalar& x) {
1218:     return PetscRealPart(x);
1219:   }
1220: };

1222: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1223: {
1224:   __host__ __device__
1225:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt>& x) {
1226:     return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
1227:   }
1228: };

1230: struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1231: {
1232:   __host__ __device__
1233:   PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1234:     return x < y ? y : x;
1235:   }
1236: };

1238: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1239: {
1240:   __host__ __device__
1241:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1242:     return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1243:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1244:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1245:   }
1246: };

1248: struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1249: {
1250:   __host__ __device__
1251:   PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1252:     return x < y ? x : y;
1253:   }
1254: };

1256: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1257: {
1258:   __host__ __device__
1259:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1260:     return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1261:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1262:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1263:   }
1264: };
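/* Editorial note: the *i variants above reduce (value,index) tuples that VecMax and
   VecMin build by zipping the data with a counting_iterator; ties on the value are
   broken in favor of the lower index. */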

1266: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1267: {
1268:   PetscErrorCode                        ierr;
1269:   PetscInt                              n = v->map->n;
1270:   const PetscScalar                     *av;
1271:   thrust::device_ptr<const PetscScalar> avpt;

1275:   if (!n) {
1276:     *m = PETSC_MIN_REAL;
1277:     if (p) *p = -1;
1278:     return(0);
1279:   }
1280:   VecCUDAGetArrayRead(v,&av);
1281:   avpt = thrust::device_pointer_cast(av);
1282:   PetscLogGpuTimeBegin();
1283:   if (p) {
1284:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
1285:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1286:     try {
1287: #if defined(PETSC_USE_COMPLEX)
1288:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
1289: #else
1290:       res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
1291: #endif
1292:     } catch (char *ex) {
1293:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1294:     }
1295:     *m = res.get<0>();
1296:     *p = res.get<1>();
1297:   } else {
1298:     try {
1299: #if defined(PETSC_USE_COMPLEX)
1300:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
1301: #else
1302:       *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
1303: #endif
1304:     } catch (char *ex) {
1305:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1306:     }
1307:   }
1308:   PetscLogGpuTimeEnd();
1309:   VecCUDARestoreArrayRead(v,&av);
1310:   return(0);
1311: }

1313: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1314: {
1315:   PetscErrorCode                        ierr;
1316:   PetscInt                              n = v->map->n;
1317:   const PetscScalar                     *av;
1318:   thrust::device_ptr<const PetscScalar> avpt;

1322:   if (!n) {
1323:     *m = PETSC_MAX_REAL;
1324:     if (p) *p = -1;
1325:     return(0);
1326:   }
1327:   VecCUDAGetArrayRead(v,&av);
1328:   avpt = thrust::device_pointer_cast(av);
1329:   PetscLogGpuTimeBegin();
1330:   if (p) {
1331:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
1332:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1333:     try {
1334: #if defined(PETSC_USE_COMPLEX)
1335:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
1336: #else
1337:       res = thrust::reduce(zibit,zibit+n,res,petscmini());
1338: #endif
1339:     } catch (char *ex) {
1340:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1341:     }
1342:     *m = res.get<0>();
1343:     *p = res.get<1>();
1344:   } else {
1345:     try {
1346: #if defined(PETSC_USE_COMPLEX)
1347:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
1348: #else
1349:       *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
1350: #endif
1351:     } catch (char *ex) {
1352:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1353:     }
1354:   }
1355:   PetscLogGpuTimeEnd();
1356:   VecCUDARestoreArrayRead(v,&av);
1357:   return(0);
1358: }

1360: PetscErrorCode VecSum_SeqCUDA(Vec v,PetscScalar *sum)
1361: {
1362:   PetscErrorCode                        ierr;
1363:   PetscInt                              n = v->map->n;
1364:   const PetscScalar                     *a;
1365:   thrust::device_ptr<const PetscScalar> dptr;

1369:   VecCUDAGetArrayRead(v,&a);
1370:   dptr = thrust::device_pointer_cast(a);
1371:   PetscLogGpuTimeBegin();
1372:   try {
1373:     *sum = thrust::reduce(dptr,dptr+n,PetscScalar(0.0));
1374:   } catch (char *ex) {
1375:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1376:   }
1377:   PetscLogGpuTimeEnd();
1378:   VecCUDARestoreArrayRead(v,&a);
1379:   return(0);
1380: }

1382: struct petscshift : public thrust::unary_function<PetscScalar,PetscScalar>
1383: {
1384:   const PetscScalar shift_;
1385:   petscshift(PetscScalar shift) : shift_(shift){}
1386:   __host__ __device__
1387:   PetscScalar operator()(PetscScalar x) {return x + shift_;}
1388: };
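/* Editorial note: petscshift stores the shift by value, so the functor object copied
   to the device carries its own PetscScalar; capturing by reference would leave the
   device code pointing at host memory. */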

1390: PetscErrorCode VecShift_SeqCUDA(Vec v,PetscScalar shift)
1391: {
1392:   PetscErrorCode                        ierr;
1393:   PetscInt                              n = v->map->n;
1394:   PetscScalar                           *a;
1395:   thrust::device_ptr<PetscScalar>       dptr;

1399:   VecCUDAGetArray(v,&a);
1400:   dptr = thrust::device_pointer_cast(a);
1401:   PetscLogGpuTimeBegin();
1402:   try {
1403:     thrust::transform(dptr,dptr+n,dptr,petscshift(shift)); /* in-place transform */
1404:   } catch (char *ex) {
1405:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1406:   }
1407:   PetscLogGpuTimeEnd();
1408:   VecCUDARestoreArray(v,&a);
1409:   return(0);
1410: }

1412: #if defined(PETSC_HAVE_NVSHMEM)
1413: /* Free the old CUDA array and re-allocate a new one from the NVSHMEM symmetric heap.
1414:    The new array does not retain the values of the old array. The offload mask is not changed.

1416:    Note: the function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1417:  */
1418: PetscErrorCode  VecAllocateNVSHMEM_SeqCUDA(Vec v)
1419: {
1421:   cudaError_t    cerr;
1422:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;
1423:   PetscInt       n;

1426:   cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);
1427:   VecGetLocalSize(v,&n);
1428:   MPIU_Allreduce(MPI_IN_PLACE,&n,1,MPIU_INT,MPI_MAX,PETSC_COMM_WORLD);
1429:   PetscNvshmemMalloc(n*sizeof(PetscScalar),(void**)&veccuda->GPUarray_allocated);
1430:   veccuda->GPUarray = veccuda->GPUarray_allocated;
1431:   veccuda->nvshmem  = PETSC_TRUE;
1432:   return(0);
1433: }
1434: #endif