Actual source code: veccuda2.cu

  1: /*
  2:    Implements the sequential CUDA vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 10: #include <petsc/private/cudavecimpl.h>

 12: #include <cuda_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>
 16: #include <thrust/reduce.h>

 18: /*
 19:     Allocates space for the vector array on the GPU if it does not exist.
 20:    Does NOT change the offload mask (PetscOffloadMask) of the vector
 21:     Does NOT zero the CUDA array

 23:  */
 24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 25: {
 27:   cudaError_t    err;
 28:   Vec_CUDA       *veccuda;
 29:   PetscBool      option_set;

 32:   if (!v->spptr) {
 33:     PetscReal pinned_memory_min;
 34:     PetscCalloc(sizeof(Vec_CUDA),&v->spptr);
 35:     veccuda = (Vec_CUDA*)v->spptr;
 36:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 37:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 38:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 39:       if (v->data && ((Vec_Seq*)v->data)->array) {
 40:         v->offloadmask = PETSC_OFFLOAD_CPU;
 41:       } else {
 42:         v->offloadmask = PETSC_OFFLOAD_GPU;
 43:       }
 44:     }
 45:     pinned_memory_min = 0;

 47:     /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
 48:        Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
 49:     PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
 50:     PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
 51:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 52:     PetscOptionsEnd();
 53:   }
 54:   return(0);
 55: }
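
/*
   Usage sketch (illustrative, not part of this file): the -vec_pinned_memory_min option parsed
   above controls the smallest host allocation that uses pinned (page-locked) memory. It can also
   be set programmatically with VecSetPinnedMemoryMin(); the 4096-byte threshold below is an
   arbitrary example value.

     Vec v;
     VecCreate(PETSC_COMM_SELF,&v);
     VecSetSizes(v,PETSC_DECIDE,1000000);
     VecSetType(v,VECSEQCUDA);
     VecSetPinnedMemoryMin(v,4096);   // host buffers of at least 4096 bytes use pinned memory
     VecSetFromOptions(v);            // equivalently: run with -vec_pinned_memory_min 4096
     VecDestroy(&v);
*/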

 57: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 58: PetscErrorCode VecCUDACopyToGPU(Vec v)
 59: {
 61:   cudaError_t    err;
 62:   Vec_CUDA       *veccuda;
 63:   PetscScalar    *varray;

 67:   VecCUDAAllocateCheck(v);
 68:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 69:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 70:     veccuda        = (Vec_CUDA*)v->spptr;
 71:     varray         = veccuda->GPUarray;
 72:     err            = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 73:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 74:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 75:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 76:   }
 77:   return(0);
 78: }

 80: /*
 81:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
 82: */
 83: PetscErrorCode VecCUDACopyFromGPU(Vec v)
 84: {
 86:   cudaError_t    err;
 87:   Vec_CUDA       *veccuda;
 88:   PetscScalar    *varray;

 92:   VecCUDAAllocateCheckHost(v);
 93:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
 94:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
 95:     veccuda        = (Vec_CUDA*)v->spptr;
 96:     varray         = veccuda->GPUarray;
 97:     err            = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
 98:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
 99:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
100:     v->offloadmask = PETSC_OFFLOAD_BOTH;
101:   }
102:   return(0);
103: }
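
/*
   Usage sketch (illustrative, not part of this file): the two copy routines above are normally
   triggered implicitly by the array accessors, with the offload mask recording which copy of the
   data is current.

     PetscScalar       *h;
     const PetscScalar *d;
     VecGetArray(v,&h);           // host access; offload mask ends up at PETSC_OFFLOAD_CPU
     h[0] = 3.0;
     VecRestoreArray(v,&h);
     VecCUDAGetArrayRead(v,&d);   // lazily copies host -> device; mask becomes PETSC_OFFLOAD_BOTH
     VecCUDARestoreArrayRead(v,&d);
*/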

105: /*MC
106:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

108:    Options Database Keys:
109: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

111:   Level: beginner

113: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
114: M*/
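
/*
   Usage sketch (illustrative, not part of this file): a VECSEQCUDA vector is created either
   explicitly with VecSetType() or from the options database with -vec_type seqcuda.

     Vec x;
     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,PETSC_DECIDE,100);
     VecSetType(x,VECSEQCUDA);    // or VecSetFromOptions(x) combined with -vec_type seqcuda
     VecSet(x,1.0);
     VecDestroy(&x);
*/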

116: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
117: {
118:   const PetscScalar *xarray;
119:   PetscScalar       *yarray;
120:   PetscErrorCode    ierr;
121:   PetscBLASInt      one = 1,bn = 0;
122:   PetscScalar       sone = 1.0;
123:   cublasHandle_t    cublasv2handle;
124:   cublasStatus_t    cberr;
125:   cudaError_t       err;

128:   PetscCUBLASGetHandle(&cublasv2handle);
129:   PetscBLASIntCast(yin->map->n,&bn);
130:   VecCUDAGetArrayRead(xin,&xarray);
131:   VecCUDAGetArray(yin,&yarray);
132:   PetscLogGpuTimeBegin();
133:   if (alpha == (PetscScalar)0.0) {
134:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
135:   } else if (alpha == (PetscScalar)1.0) {
136:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
137:     PetscLogGpuFlops(1.0*yin->map->n);
138:   } else {
139:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
140:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
141:     PetscLogGpuFlops(2.0*yin->map->n);
142:   }
143:   PetscLogGpuTimeEnd();
144:   VecCUDARestoreArrayRead(xin,&xarray);
145:   VecCUDARestoreArray(yin,&yarray);
146:   PetscLogCpuToGpuScalar(sizeof(PetscScalar));
147:   return(0);
148: }

150: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
151: {
152:   const PetscScalar *xarray;
153:   PetscScalar       *yarray;
154:   PetscErrorCode    ierr;
155:   PetscBLASInt      one = 1,bn = 0;
156:   cublasHandle_t    cublasv2handle;
157:   cublasStatus_t    cberr;
158:   PetscBool         xiscuda;

161:   if (alpha == (PetscScalar)0.0) return(0);
162:   PetscCUBLASGetHandle(&cublasv2handle);
163:   PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
164:   if (xiscuda) {
165:     PetscBLASIntCast(yin->map->n,&bn);
166:     VecCUDAGetArrayRead(xin,&xarray);
167:     VecCUDAGetArray(yin,&yarray);
168:     PetscLogGpuTimeBegin();
169:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
170:     PetscLogGpuTimeEnd();
171:     VecCUDARestoreArrayRead(xin,&xarray);
172:     VecCUDARestoreArray(yin,&yarray);
173:     PetscLogGpuFlops(2.0*yin->map->n);
174:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
175:   } else {
176:     VecAXPY_Seq(yin,alpha,xin);
177:   }
178:   return(0);
179: }

181: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
182: {
183:   PetscInt                              n = xin->map->n;
184:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
185:   PetscScalar                           *warray=NULL;
186:   thrust::device_ptr<const PetscScalar> xptr,yptr;
187:   thrust::device_ptr<PetscScalar>       wptr;
188:   PetscErrorCode                        ierr;

191:   VecCUDAGetArrayWrite(win,&warray);
192:   VecCUDAGetArrayRead(xin,&xarray);
193:   VecCUDAGetArrayRead(yin,&yarray);
194:   PetscLogGpuTimeBegin();
195:   try {
196:     wptr = thrust::device_pointer_cast(warray);
197:     xptr = thrust::device_pointer_cast(xarray);
198:     yptr = thrust::device_pointer_cast(yarray);
199:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
200:   } catch (char *ex) {
201:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
202:   }
203:   PetscLogGpuTimeEnd();
204:   PetscLogGpuFlops(n);
205:   VecCUDARestoreArrayRead(xin,&xarray);
206:   VecCUDARestoreArrayRead(yin,&yarray);
207:   VecCUDARestoreArrayWrite(win,&warray);
208:   return(0);
209: }

211: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
212: {
213:   const PetscScalar *xarray=NULL,*yarray=NULL;
214:   PetscScalar       *warray=NULL;
215:   PetscErrorCode    ierr;
216:   PetscBLASInt      one = 1,bn = 0;
217:   cublasHandle_t    cublasv2handle;
218:   cublasStatus_t    stat;
219:   cudaError_t       cerr;
220:   cudaStream_t      stream;

223:   PetscCUBLASGetHandle(&cublasv2handle);
224:   PetscBLASIntCast(win->map->n,&bn);
225:   if (alpha == (PetscScalar)0.0) {
226:     VecCopy_SeqCUDA(yin,win);
227:   } else {
228:     VecCUDAGetArrayRead(xin,&xarray);
229:     VecCUDAGetArrayRead(yin,&yarray);
230:     VecCUDAGetArrayWrite(win,&warray);
231:     PetscLogGpuTimeBegin();
232:     stat = cublasGetStream(cublasv2handle,&stream);CHKERRCUBLAS(stat);
233:     cerr = cudaMemcpyAsync(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,stream);CHKERRCUDA(cerr);
234:     stat = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(stat);
235:     PetscLogGpuTimeEnd();
236:     PetscLogGpuFlops(2*win->map->n);
237:     VecCUDARestoreArrayRead(xin,&xarray);
238:     VecCUDARestoreArrayRead(yin,&yarray);
239:     VecCUDARestoreArrayWrite(win,&warray);
240:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
241:   }
242:   return(0);
243: }

245: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
246: {
247:   PetscErrorCode    ierr;
248:   PetscInt          n = xin->map->n,j;
249:   PetscScalar       *xarray;
250:   const PetscScalar *yarray;
251:   PetscBLASInt      one = 1,bn = 0;
252:   cublasHandle_t    cublasv2handle;
253:   cublasStatus_t    cberr;

256:   PetscLogGpuFlops(nv*2.0*n);
257:   PetscLogCpuToGpuScalar(nv*sizeof(PetscScalar));
258:   PetscCUBLASGetHandle(&cublasv2handle);
259:   PetscBLASIntCast(n,&bn);
260:   VecCUDAGetArray(xin,&xarray);
261:   PetscLogGpuTimeBegin();
262:   for (j=0; j<nv; j++) {
263:     VecCUDAGetArrayRead(y[j],&yarray);
264:     cberr = cublasXaxpy(cublasv2handle,bn,alpha+j,yarray,one,xarray,one);CHKERRCUBLAS(cberr);
265:     VecCUDARestoreArrayRead(y[j],&yarray);
266:   }
267:   PetscLogGpuTimeEnd();
268:   VecCUDARestoreArray(xin,&xarray);
269:   return(0);
270: }

272: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
273: {
274:   const PetscScalar *xarray,*yarray;
275:   PetscErrorCode    ierr;
276:   PetscBLASInt      one = 1,bn = 0;
277:   cublasHandle_t    cublasv2handle;
278:   cublasStatus_t    cerr;

281:   PetscCUBLASGetHandle(&cublasv2handle);
282:   PetscBLASIntCast(yin->map->n,&bn);
283:   VecCUDAGetArrayRead(xin,&xarray);
284:   VecCUDAGetArrayRead(yin,&yarray);
285:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
286:   PetscLogGpuTimeBegin();
287:   cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
288:   PetscLogGpuTimeEnd();
 289:   if (xin->map->n > 0) {
290:     PetscLogGpuFlops(2.0*xin->map->n-1);
291:   }
292:   PetscLogGpuToCpuScalar(sizeof(PetscScalar));
293:   VecCUDARestoreArrayRead(xin,&xarray);
294:   VecCUDARestoreArrayRead(yin,&yarray);
295:   return(0);
296: }
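
/*
   Usage sketch (illustrative, not part of this file): for complex scalars VecDot() conjugates its
   second argument, i.e. VecDot(x,y,&z) computes z = sum_i x_i*conj(y_i) = y^H x, which is why the
   cuBLAS call above passes yarray first (cublasXdot conjugates its first vector argument).
   VecTDot() computes the unconjugated product.

     PetscScalar zdot,ztdot;
     VecDot(x,y,&zdot);    // y^H x (conjugated); same as VecTDot for real scalars
     VecTDot(x,y,&ztdot);  // y^T x (not conjugated)
*/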

298: //
299: // CUDA kernels for MDot to follow
300: //

302: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
303: #define MDOT_WORKGROUP_SIZE 128
304: #define MDOT_WORKGROUP_NUM  128

306: #if !defined(PETSC_USE_COMPLEX)
307: // M = 2:
308: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
309:                                         PetscInt size, PetscScalar *group_results)
310: {
311:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
312:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
313:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
314:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
315:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

317:   PetscScalar entry_x    = 0;
318:   PetscScalar group_sum0 = 0;
319:   PetscScalar group_sum1 = 0;
320:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
321:     entry_x     = x[i];   // load only once from global memory!
322:     group_sum0 += entry_x * y0[i];
323:     group_sum1 += entry_x * y1[i];
324:   }
325:   tmp_buffer[threadIdx.x]                       = group_sum0;
326:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

328:   // parallel reduction
329:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
330:     __syncthreads();
331:     if (threadIdx.x < stride) {
332:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
333:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
334:     }
335:   }

337:   // write result of group to group_results
338:   if (threadIdx.x == 0) {
339:     group_results[blockIdx.x]             = tmp_buffer[0];
340:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
341:   }
342: }

344: // M = 3:
345: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
346:                                         PetscInt size, PetscScalar *group_results)
347: {
348:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
349:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
350:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
351:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
352:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

354:   PetscScalar entry_x    = 0;
355:   PetscScalar group_sum0 = 0;
356:   PetscScalar group_sum1 = 0;
357:   PetscScalar group_sum2 = 0;
358:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
359:     entry_x     = x[i];   // load only once from global memory!
360:     group_sum0 += entry_x * y0[i];
361:     group_sum1 += entry_x * y1[i];
362:     group_sum2 += entry_x * y2[i];
363:   }
364:   tmp_buffer[threadIdx.x]                           = group_sum0;
365:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
366:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

368:   // parallel reduction
369:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
370:     __syncthreads();
371:     if (threadIdx.x < stride) {
372:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
373:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
374:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
375:     }
376:   }

378:   // write result of group to group_results
379:   if (threadIdx.x == 0) {
380:     group_results[blockIdx.x                ] = tmp_buffer[0];
381:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
382:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
383:   }
384: }

386: // M = 4:
387: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
388:                                         PetscInt size, PetscScalar *group_results)
389: {
390:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
391:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
392:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
393:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
394:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

396:   PetscScalar entry_x    = 0;
397:   PetscScalar group_sum0 = 0;
398:   PetscScalar group_sum1 = 0;
399:   PetscScalar group_sum2 = 0;
400:   PetscScalar group_sum3 = 0;
401:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
402:     entry_x     = x[i];   // load only once from global memory!
403:     group_sum0 += entry_x * y0[i];
404:     group_sum1 += entry_x * y1[i];
405:     group_sum2 += entry_x * y2[i];
406:     group_sum3 += entry_x * y3[i];
407:   }
408:   tmp_buffer[threadIdx.x]                           = group_sum0;
409:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
410:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
411:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

413:   // parallel reduction
414:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
415:     __syncthreads();
416:     if (threadIdx.x < stride) {
417:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
418:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
419:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
420:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
421:     }
422:   }

424:   // write result of group to group_results
425:   if (threadIdx.x == 0) {
426:     group_results[blockIdx.x                ] = tmp_buffer[0];
427:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
428:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
429:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
430:   }
431: }

433: // M = 8:
434: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
435:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
436:                                           PetscInt size, PetscScalar *group_results)
437: {
438:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
439:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
440:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
441:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
442:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

444:   PetscScalar entry_x    = 0;
445:   PetscScalar group_sum0 = 0;
446:   PetscScalar group_sum1 = 0;
447:   PetscScalar group_sum2 = 0;
448:   PetscScalar group_sum3 = 0;
449:   PetscScalar group_sum4 = 0;
450:   PetscScalar group_sum5 = 0;
451:   PetscScalar group_sum6 = 0;
452:   PetscScalar group_sum7 = 0;
453:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
454:     entry_x     = x[i];   // load only once from global memory!
455:     group_sum0 += entry_x * y0[i];
456:     group_sum1 += entry_x * y1[i];
457:     group_sum2 += entry_x * y2[i];
458:     group_sum3 += entry_x * y3[i];
459:     group_sum4 += entry_x * y4[i];
460:     group_sum5 += entry_x * y5[i];
461:     group_sum6 += entry_x * y6[i];
462:     group_sum7 += entry_x * y7[i];
463:   }
464:   tmp_buffer[threadIdx.x]                           = group_sum0;
465:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
466:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
467:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
468:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
469:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
470:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
471:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

473:   // parallel reduction
474:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
475:     __syncthreads();
476:     if (threadIdx.x < stride) {
477:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
478:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
479:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
480:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
481:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
482:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
483:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
484:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
485:     }
486:   }

488:   // write result of group to group_results
489:   if (threadIdx.x == 0) {
490:     group_results[blockIdx.x                ] = tmp_buffer[0];
491:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
492:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
493:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
494:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
495:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
496:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
497:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
498:   }
499: }
500: #endif /* !defined(PETSC_USE_COMPLEX) */

502: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
503: {
504:   PetscErrorCode    ierr;
505:   PetscInt          i,n = xin->map->n,current_y_index = 0;
506:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
507: #if !defined(PETSC_USE_COMPLEX)
508:   PetscInt          nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
509:   PetscScalar       *group_results_gpu,*group_results_cpu;
510:   cudaError_t       cuda_ierr;
511: #endif
512:   PetscBLASInt      one = 1,bn = 0;
513:   cublasHandle_t    cublasv2handle;
514:   cublasStatus_t    cberr;

517:   PetscCUBLASGetHandle(&cublasv2handle);
518:   PetscBLASIntCast(xin->map->n,&bn);
519:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
520:   /* Handle the case of local size zero first */
521:   if (!xin->map->n) {
522:     for (i=0; i<nv; ++i) z[i] = 0;
523:     return(0);
524:   }

526: #if !defined(PETSC_USE_COMPLEX)
527:   // allocate scratchpad memory for the results of individual work groups:
 528:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);CHKERRCUDA(cuda_ierr);
529:   PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
530: #endif
531:   VecCUDAGetArrayRead(xin,&xptr);
532:   PetscLogGpuTimeBegin();

534:   while (current_y_index < nv)
535:   {
536:     switch (nv - current_y_index) {

538:       case 7:
539:       case 6:
540:       case 5:
541:       case 4:
542:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
543:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
544:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
545:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
546: #if defined(PETSC_USE_COMPLEX)
547:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
548:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
549:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
550:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
551: #else
552:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
553: #endif
554:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
555:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
556:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
557:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
558:         current_y_index += 4;
559:         break;

561:       case 3:
562:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
563:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
564:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

566: #if defined(PETSC_USE_COMPLEX)
567:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
568:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
569:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
570: #else
571:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
572: #endif
573:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
574:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
575:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
576:         current_y_index += 3;
577:         break;

579:       case 2:
580:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
581:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
582: #if defined(PETSC_USE_COMPLEX)
583:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
584:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
585: #else
586:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
587: #endif
588:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
589:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
590:         current_y_index += 2;
591:         break;

593:       case 1:
594:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
595:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
596:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
597:         current_y_index += 1;
598:         break;

600:       default: // 8 or more vectors left
601:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
602:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
603:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
604:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
605:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
606:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
607:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
608:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
609: #if defined(PETSC_USE_COMPLEX)
610:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
611:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
612:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
613:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
614:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
615:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
616:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
617:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
618: #else
619:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
620: #endif
621:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
622:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
623:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
624:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
625:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
626:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
627:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
628:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
629:         current_y_index += 8;
630:         break;
631:     }
632:   }
633:   PetscLogGpuTimeEnd();
634:   VecCUDARestoreArrayRead(xin,&xptr);

636: #if defined(PETSC_USE_COMPLEX)
637:   PetscLogGpuToCpuScalar(nv*sizeof(PetscScalar));
638: #else
639:   // copy results to CPU
 640:   cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

642:   // sum group results into z
643:   for (j=0; j<nv1; ++j) {
644:     z[j] = 0;
645:     for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
646:   }
647:   PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
 648:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
649:   PetscFree(group_results_cpu);
650:   PetscLogGpuToCpuScalar(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
651: #endif
652:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
653:   return(0);
654: }
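
/*
   Usage sketch (illustrative, not part of this file): VecMDot() computes several inner products
   against the same left vector in one call, which is what lets the batched kernels above reuse
   each load of x across up to eight right-hand vectors.

     Vec         x,y[3];
     PetscScalar z[3];
     // ... x and y[0..2] created elsewhere as VECSEQCUDA vectors of the same length ...
     VecMDot(x,3,y,z);     // z[i] = sum_j x_j*conj(y[i]_j)
*/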

656: #undef MDOT_WORKGROUP_SIZE
657: #undef MDOT_WORKGROUP_NUM

659: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
660: {
661:   PetscInt                        n = xin->map->n;
662:   PetscScalar                     *xarray = NULL;
663:   thrust::device_ptr<PetscScalar> xptr;
664:   PetscErrorCode                  ierr;
665:   cudaError_t                     err;

668:   VecCUDAGetArrayWrite(xin,&xarray);
669:   PetscLogGpuTimeBegin();
670:   if (alpha == (PetscScalar)0.0) {
671:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
672:   } else {
673:     try {
674:       xptr = thrust::device_pointer_cast(xarray);
675:       thrust::fill(xptr,xptr+n,alpha);
676:     } catch (char *ex) {
677:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
678:     }
679:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
680:   }
681:   PetscLogGpuTimeEnd();
682:   VecCUDARestoreArrayWrite(xin,&xarray);
683:   return(0);
684: }

686: struct PetscScalarReciprocal
687: {
688:   __host__ __device__
689:   PetscScalar operator()(const PetscScalar& s)
690:   {
691:     return (s != (PetscScalar)0.0) ? (PetscScalar)1.0/s : 0.0;
692:   }
693: };

695: PetscErrorCode VecReciprocal_SeqCUDA(Vec v)
696: {
698:   PetscInt       n;
699:   PetscScalar    *x;

702:   VecGetLocalSize(v,&n);
703:   VecCUDAGetArray(v,&x);
704:   PetscLogGpuTimeBegin();
705:   try {
706:     auto xptr = thrust::device_pointer_cast(x);
707:     thrust::transform(xptr,xptr+n,xptr,PetscScalarReciprocal());
708:   } catch (char *ex) {
709:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
710:   }
711:   PetscLogGpuTimeEnd();
712:   VecCUDARestoreArray(v,&x);
713:   return(0);
714: }

716: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
717: {
718:   PetscScalar    *xarray;
720:   PetscBLASInt   one = 1,bn = 0;
721:   cublasHandle_t cublasv2handle;
722:   cublasStatus_t cberr;

725:   if (alpha == (PetscScalar)0.0) {
726:     VecSet_SeqCUDA(xin,alpha);
727:   } else if (alpha != (PetscScalar)1.0) {
728:     PetscCUBLASGetHandle(&cublasv2handle);
729:     PetscBLASIntCast(xin->map->n,&bn);
730:     VecCUDAGetArray(xin,&xarray);
731:     PetscLogGpuTimeBegin();
732:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
733:     VecCUDARestoreArray(xin,&xarray);
734:     PetscLogGpuTimeEnd();
735:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
736:     PetscLogGpuFlops(xin->map->n);
737:   }
738:   return(0);
739: }

741: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
742: {
743:   const PetscScalar *xarray,*yarray;
744:   PetscErrorCode    ierr;
745:   PetscBLASInt      one = 1,bn = 0;
746:   cublasHandle_t    cublasv2handle;
747:   cublasStatus_t    cerr;

750:   PetscCUBLASGetHandle(&cublasv2handle);
751:   PetscBLASIntCast(xin->map->n,&bn);
752:   VecCUDAGetArrayRead(xin,&xarray);
753:   VecCUDAGetArrayRead(yin,&yarray);
754:   PetscLogGpuTimeBegin();
755:   cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
756:   PetscLogGpuTimeEnd();
757:   if (xin->map->n > 0) {
758:     PetscLogGpuFlops(2.0*xin->map->n-1);
759:   }
760:   PetscLogGpuToCpuScalar(sizeof(PetscScalar));
761:   VecCUDARestoreArrayRead(yin,&yarray);
762:   VecCUDARestoreArrayRead(xin,&xarray);
763:   return(0);
764: }

766: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
767: {
768:   const PetscScalar *xarray;
769:   PetscScalar       *yarray;
770:   PetscErrorCode    ierr;
771:   cudaError_t       err;

774:   if (xin != yin) {
775:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
776:       PetscBool yiscuda;

778:       PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
779:       VecCUDAGetArrayRead(xin,&xarray);
780:       if (yiscuda) {
781:         VecCUDAGetArrayWrite(yin,&yarray);
782:       } else {
783:         VecGetArrayWrite(yin,&yarray);
784:       }
785:       PetscLogGpuTimeBegin();
786:       if (yiscuda) {
787:         err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
788:       } else {
789:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
790:       }
791:       PetscLogGpuTimeEnd();
792:       VecCUDARestoreArrayRead(xin,&xarray);
793:       if (yiscuda) {
794:         VecCUDARestoreArrayWrite(yin,&yarray);
795:       } else {
796:         VecRestoreArrayWrite(yin,&yarray);
797:       }
798:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
799:       /* copy in CPU if we are on the CPU */
800:       VecCopy_SeqCUDA_Private(xin,yin);
801:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
802:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
803:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
804:         /* copy in CPU */
805:         VecCopy_SeqCUDA_Private(xin,yin);
806:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
807:         /* copy in GPU */
808:         VecCUDAGetArrayRead(xin,&xarray);
809:         VecCUDAGetArrayWrite(yin,&yarray);
810:         PetscLogGpuTimeBegin();
811:         err  = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
812:         PetscLogGpuTimeEnd();
813:         VecCUDARestoreArrayRead(xin,&xarray);
814:         VecCUDARestoreArrayWrite(yin,&yarray);
815:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
 816:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck),
817:            default to copy in GPU (this is an arbitrary choice) */
818:         VecCUDAGetArrayRead(xin,&xarray);
819:         VecCUDAGetArrayWrite(yin,&yarray);
820:         PetscLogGpuTimeBegin();
821:         err  = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
822:         PetscLogGpuTimeEnd();
823:         VecCUDARestoreArrayRead(xin,&xarray);
824:         VecCUDARestoreArrayWrite(yin,&yarray);
825:       } else {
826:         VecCopy_SeqCUDA_Private(xin,yin);
827:       }
828:     }
829:   }
830:   return(0);
831: }

833: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
834: {
836:   PetscBLASInt   one = 1,bn = 0;
837:   PetscScalar    *xarray,*yarray;
838:   cublasHandle_t cublasv2handle;
839:   cublasStatus_t cberr;

842:   PetscCUBLASGetHandle(&cublasv2handle);
843:   PetscBLASIntCast(xin->map->n,&bn);
844:   if (xin != yin) {
845:     VecCUDAGetArray(xin,&xarray);
846:     VecCUDAGetArray(yin,&yarray);
847:     PetscLogGpuTimeBegin();
848:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
849:     PetscLogGpuTimeEnd();
850:     VecCUDARestoreArray(xin,&xarray);
851:     VecCUDARestoreArray(yin,&yarray);
852:   }
853:   return(0);
854: }

856: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
857: {
858:   PetscErrorCode    ierr;
859:   PetscScalar       a = alpha,b = beta;
860:   const PetscScalar *xarray;
861:   PetscScalar       *yarray;
862:   PetscBLASInt      one = 1, bn = 0;
863:   cublasHandle_t    cublasv2handle;
864:   cublasStatus_t    cberr;
865:   cudaError_t       err;

868:   PetscCUBLASGetHandle(&cublasv2handle);
869:   PetscBLASIntCast(yin->map->n,&bn);
870:   if (a == (PetscScalar)0.0) {
871:     VecScale_SeqCUDA(yin,beta);
872:   } else if (b == (PetscScalar)1.0) {
873:     VecAXPY_SeqCUDA(yin,alpha,xin);
874:   } else if (a == (PetscScalar)1.0) {
875:     VecAYPX_SeqCUDA(yin,beta,xin);
876:   } else if (b == (PetscScalar)0.0) {
877:     VecCUDAGetArrayRead(xin,&xarray);
878:     VecCUDAGetArray(yin,&yarray);
879:     PetscLogGpuTimeBegin();
880:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
881:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
882:     PetscLogGpuTimeEnd();
883:     PetscLogGpuFlops(xin->map->n);
884:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
885:     VecCUDARestoreArrayRead(xin,&xarray);
886:     VecCUDARestoreArray(yin,&yarray);
887:   } else {
888:     VecCUDAGetArrayRead(xin,&xarray);
889:     VecCUDAGetArray(yin,&yarray);
890:     PetscLogGpuTimeBegin();
891:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
892:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
893:     VecCUDARestoreArrayRead(xin,&xarray);
894:     VecCUDARestoreArray(yin,&yarray);
895:     PetscLogGpuTimeEnd();
896:     PetscLogGpuFlops(3.0*xin->map->n);
897:     PetscLogCpuToGpuScalar(2*sizeof(PetscScalar));
898:   }
899:   return(0);
900: }

902: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
903: {
905:   PetscInt       n = zin->map->n;

908:   if (gamma == (PetscScalar)1.0) {
909:     /* z = ax + b*y + z */
910:     VecAXPY_SeqCUDA(zin,alpha,xin);
911:     VecAXPY_SeqCUDA(zin,beta,yin);
912:     PetscLogGpuFlops(4.0*n);
913:   } else {
914:     /* z = a*x + b*y + c*z */
915:     VecScale_SeqCUDA(zin,gamma);
916:     VecAXPY_SeqCUDA(zin,alpha,xin);
917:     VecAXPY_SeqCUDA(zin,beta,yin);
918:     PetscLogGpuFlops(5.0*n);
919:   }
920:   return(0);
921: }

923: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
924: {
925:   PetscInt                              n = win->map->n;
926:   const PetscScalar                     *xarray,*yarray;
927:   PetscScalar                           *warray;
928:   thrust::device_ptr<const PetscScalar> xptr,yptr;
929:   thrust::device_ptr<PetscScalar>       wptr;
930:   PetscErrorCode                        ierr;

933:   VecCUDAGetArrayRead(xin,&xarray);
934:   VecCUDAGetArrayRead(yin,&yarray);
935:   VecCUDAGetArrayWrite(win,&warray);
936:   PetscLogGpuTimeBegin();
937:   try {
938:     wptr = thrust::device_pointer_cast(warray);
939:     xptr = thrust::device_pointer_cast(xarray);
940:     yptr = thrust::device_pointer_cast(yarray);
941:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
942:   } catch (char *ex) {
943:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
944:   }
945:   PetscLogGpuTimeEnd();
946:   VecCUDARestoreArrayRead(xin,&xarray);
947:   VecCUDARestoreArrayRead(yin,&yarray);
948:   VecCUDARestoreArrayWrite(win,&warray);
949:   PetscLogGpuFlops(n);
950:   return(0);
951: }

 953: /* should do infinity norm in CUDA */

955: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
956: {
957:   PetscErrorCode    ierr;
958:   PetscInt          n = xin->map->n;
959:   PetscBLASInt      one = 1, bn = 0;
960:   const PetscScalar *xarray;
961:   cublasHandle_t    cublasv2handle;
962:   cublasStatus_t    cberr;
963:   cudaError_t       err;

966:   PetscCUBLASGetHandle(&cublasv2handle);
967:   PetscBLASIntCast(n,&bn);
968:   if (type == NORM_2 || type == NORM_FROBENIUS) {
969:     VecCUDAGetArrayRead(xin,&xarray);
970:     PetscLogGpuTimeBegin();
971:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
972:     PetscLogGpuTimeEnd();
973:     VecCUDARestoreArrayRead(xin,&xarray);
974:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
975:   } else if (type == NORM_INFINITY) {
976:     int  i;
977:     VecCUDAGetArrayRead(xin,&xarray);
978:     PetscLogGpuTimeBegin();
979:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
980:     PetscLogGpuTimeEnd();
981:     if (bn) {
982:       PetscScalar zs;
983:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
984:       *z = PetscAbsScalar(zs);
985:     } else *z = 0.0;
986:     VecCUDARestoreArrayRead(xin,&xarray);
987:   } else if (type == NORM_1) {
988:     VecCUDAGetArrayRead(xin,&xarray);
989:     PetscLogGpuTimeBegin();
990:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
991:     VecCUDARestoreArrayRead(xin,&xarray);
992:     PetscLogGpuTimeEnd();
993:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
994:   } else if (type == NORM_1_AND_2) {
995:     VecNorm_SeqCUDA(xin,NORM_1,z);
996:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
997:   }
998:   PetscLogGpuToCpuScalar(sizeof(PetscReal));
999:   return(0);
1000: }
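
/*
   Usage sketch (illustrative, not part of this file): for NORM_1_AND_2 the caller passes a
   two-entry array, which is why the recursive calls above write to z and z+1.

     PetscReal norms[2];
     VecNorm(x,NORM_1_AND_2,norms);   // norms[0] = 1-norm, norms[1] = 2-norm
*/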

1002: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1003: {

1007:   VecDot_SeqCUDA(s,t,dp);
1008:   VecDot_SeqCUDA(t,t,nm);
1009:   return(0);
1010: }

1012: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1013: {
1015:   cudaError_t    cerr;
1016:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;

1019:   if (v->spptr) {
1020:     if (veccuda->GPUarray_allocated) {
1021:      #if defined(PETSC_HAVE_NVSHMEM)
1022:       if (veccuda->nvshmem) {
1023:         PetscNvshmemFree(veccuda->GPUarray_allocated);
1024:         veccuda->nvshmem = PETSC_FALSE;
1025:       }
1026:       else
1027:      #endif
1028:       {cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);}
1029:       veccuda->GPUarray_allocated = NULL;
1030:     }
1031:     if (veccuda->stream) {
1032:       cerr = cudaStreamDestroy(veccuda->stream);CHKERRCUDA(cerr);
1033:     }
1034:   }
1035:   VecDestroy_SeqCUDA_Private(v);
1036:   PetscFree(v->spptr);
1037:   return(0);
1038: }

1040: #if defined(PETSC_USE_COMPLEX)
1041: struct conjugate
1042: {
1043:   __host__ __device__
1044:     PetscScalar operator()(const PetscScalar& x)
1045:     {
1046:       return PetscConj(x);
1047:     }
1048: };
1049: #endif

1051: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1052: {
1053: #if defined(PETSC_USE_COMPLEX)
1054:   PetscScalar                     *xarray;
1055:   PetscErrorCode                  ierr;
1056:   PetscInt                        n = xin->map->n;
1057:   thrust::device_ptr<PetscScalar> xptr;

1060:   VecCUDAGetArray(xin,&xarray);
1061:   PetscLogGpuTimeBegin();
1062:   try {
1063:     xptr = thrust::device_pointer_cast(xarray);
1064:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1065:   } catch (char *ex) {
1066:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1067:   }
1068:   PetscLogGpuTimeEnd();
1069:   VecCUDARestoreArray(xin,&xarray);
1070: #else
1072: #endif
1073:   return(0);
1074: }

1076: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1077: {
1079:   cudaError_t    err;

1086:   if (w->data) {
1087:     if (((Vec_Seq*)w->data)->array_allocated) {
1088:       if (w->pinned_memory) {
1089:         PetscMallocSetCUDAHost();
1090:       }
1091:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1092:       if (w->pinned_memory) {
1093:         PetscMallocResetCUDAHost();
1094:         w->pinned_memory = PETSC_FALSE;
1095:       }
1096:     }
1097:     ((Vec_Seq*)w->data)->array = NULL;
1098:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1099:   }
1100:   if (w->spptr) {
1102:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1103:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1104:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1105:     }
1106:     if (((Vec_CUDA*)w->spptr)->stream) {
1107:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1108:     }
1109:     PetscFree(w->spptr);
1110:   }

1112:   if (v->petscnative) {
1113:     PetscFree(w->data);
1114:     w->data = v->data;
1115:     w->offloadmask = v->offloadmask;
1116:     w->pinned_memory = v->pinned_memory;
1117:     w->spptr = v->spptr;
1118:     PetscObjectStateIncrease((PetscObject)w);
1119:   } else {
1120:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1121:     w->offloadmask = PETSC_OFFLOAD_CPU;
1122:     VecCUDAAllocateCheck(w);
1123:   }
1124:   return(0);
1125: }
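
/*
   Usage sketch (illustrative, not part of this file): VecGetLocalVector()/VecRestoreLocalVector()
   expose the local part of a vector as a sequential vector, without copying when the
   representations are compatible; the routine above supplies the VECSEQCUDA implementation.

     Vec w;
     VecCreateLocalVector(v,&w);   // workspace vector compatible with v's local representation
     VecGetLocalVector(v,w);
     // ... apply sequential-only operations to w ...
     VecRestoreLocalVector(v,w);
     VecDestroy(&w);
*/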

1127: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1128: {
1130:   cudaError_t    err;


1138:   if (v->petscnative) {
1139:     v->data = w->data;
1140:     v->offloadmask = w->offloadmask;
1141:     v->pinned_memory = w->pinned_memory;
1142:     v->spptr = w->spptr;
1143:     w->data = 0;
1144:     w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1145:     w->spptr = 0;
1146:   } else {
1147:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1148:     if ((Vec_CUDA*)w->spptr) {
1149:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1150:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1151:       if (((Vec_CUDA*)w->spptr)->stream) {
1152:         err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1153:       }
1154:       PetscFree(w->spptr);
1155:     }
1156:   }
1157:   return(0);
1158: }

1160: struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
1161: {
1162:   __host__ __device__
1163:   PetscReal operator()(const PetscScalar& x) {
1164:     return PetscRealPart(x);
1165:   }
1166: };

1168: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1169: {
1170:   __host__ __device__
1171:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt>& x) {
1172:     return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
1173:   }
1174: };

1176: struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1177: {
1178:   __host__ __device__
1179:   PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1180:     return x < y ? y : x;
1181:   }
1182: };

1184: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1185: {
1186:   __host__ __device__
1187:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1188:     return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1189:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1190:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1191:   }
1192: };

1194: struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1195: {
1196:   __host__ __device__
1197:   PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1198:     return x < y ? x : y;
1199:   }
1200: };

1202: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1203: {
1204:   __host__ __device__
1205:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1206:     return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1207:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1208:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1209:   }
1210: };

1212: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1213: {
1214:   PetscErrorCode                        ierr;
1215:   PetscInt                              n = v->map->n;
1216:   const PetscScalar                     *av;
1217:   thrust::device_ptr<const PetscScalar> avpt;

1221:   if (!n) {
1222:     *m = PETSC_MIN_REAL;
1223:     if (p) *p = -1;
1224:     return(0);
1225:   }
1226:   VecCUDAGetArrayRead(v,&av);
1227:   avpt = thrust::device_pointer_cast(av);
1228:   PetscLogGpuTimeBegin();
1229:   if (p) {
1230:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
1231:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1232:     try {
1233: #if defined(PETSC_USE_COMPLEX)
1234:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
1235: #else
1236:       res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
1237: #endif
1238:     } catch (char *ex) {
1239:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1240:     }
1241:     *m = res.get<0>();
1242:     *p = res.get<1>();
1243:   } else {
1244:     try {
1245: #if defined(PETSC_USE_COMPLEX)
1246:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
1247: #else
1248:       *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
1249: #endif
1250:     } catch (char *ex) {
1251:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1252:     }
1253:   }
1254:   PetscLogGpuTimeEnd();
1255:   VecCUDARestoreArrayRead(v,&av);
1256:   return(0);
1257: }

1259: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1260: {
1261:   PetscErrorCode                        ierr;
1262:   PetscInt                              n = v->map->n;
1263:   const PetscScalar                     *av;
1264:   thrust::device_ptr<const PetscScalar> avpt;

1268:   if (!n) {
1269:     *m = PETSC_MAX_REAL;
1270:     if (p) *p = -1;
1271:     return(0);
1272:   }
1273:   VecCUDAGetArrayRead(v,&av);
1274:   avpt = thrust::device_pointer_cast(av);
1275:   PetscLogGpuTimeBegin();
1276:   if (p) {
1277:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
1278:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1279:     try {
1280: #if defined(PETSC_USE_COMPLEX)
1281:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
1282: #else
1283:       res = thrust::reduce(zibit,zibit+n,res,petscmini());
1284: #endif
1285:     } catch (char *ex) {
1286:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1287:     }
1288:     *m = res.get<0>();
1289:     *p = res.get<1>();
1290:   } else {
1291:     try {
1292: #if defined(PETSC_USE_COMPLEX)
1293:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
1294: #else
1295:       *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
1296: #endif
1297:     } catch (char *ex) {
1298:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1299:     }
1300:   }
1301:   PetscLogGpuTimeEnd();
1302:   VecCUDARestoreArrayRead(v,&av);
1303:   return(0);
1304: }

1306: PetscErrorCode VecSum_SeqCUDA(Vec v,PetscScalar *sum)
1307: {
1308:   PetscErrorCode                        ierr;
1309:   PetscInt                              n = v->map->n;
1310:   const PetscScalar                     *a;
1311:   thrust::device_ptr<const PetscScalar> dptr;

1315:   VecCUDAGetArrayRead(v,&a);
1316:   dptr = thrust::device_pointer_cast(a);
1317:   PetscLogGpuTimeBegin();
1318:   try {
1319:     *sum = thrust::reduce(dptr,dptr+n,PetscScalar(0.0));
1320:   } catch (char *ex) {
1321:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1322:   }
1323:   PetscLogGpuTimeEnd();
1324:   VecCUDARestoreArrayRead(v,&a);
1325:   return(0);
1326: }

1328: struct petscshift : public thrust::unary_function<PetscScalar,PetscScalar>
1329: {
1330:   const PetscScalar shift_;
1331:   petscshift(PetscScalar shift) : shift_(shift){}
1332:   __host__ __device__
1333:   PetscScalar operator()(PetscScalar x) {return x + shift_;}
1334: };

1336: PetscErrorCode VecShift_SeqCUDA(Vec v,PetscScalar shift)
1337: {
1338:   PetscErrorCode                        ierr;
1339:   PetscInt                              n = v->map->n;
1340:   PetscScalar                           *a;
1341:   thrust::device_ptr<PetscScalar>       dptr;

1345:   VecCUDAGetArray(v,&a);
1346:   dptr = thrust::device_pointer_cast(a);
1347:   PetscLogGpuTimeBegin();
1348:   try {
1349:     thrust::transform(dptr,dptr+n,dptr,petscshift(shift)); /* in-place transform */
1350:   } catch (char *ex) {
1351:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1352:   }
1353:   PetscLogGpuTimeEnd();
1354:   VecCUDARestoreArray(v,&a);
1355:   return(0);
1356: }

1358: #if defined(PETSC_HAVE_NVSHMEM)
1359: /* Free old CUDA array and re-allocate a new one from nvshmem symmetric heap.
1360:    New array does not retain values in the old array. The offload mask is not changed.

1362:    Note: the function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1363:  */
1364: PetscErrorCode  VecAllocateNVSHMEM_SeqCUDA(Vec v)
1365: {
1367:   cudaError_t    cerr;
1368:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;
1369:   PetscInt       n;

1372:   cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);
1373:   VecGetLocalSize(v,&n);
1374:   MPIU_Allreduce(MPI_IN_PLACE,&n,1,MPIU_INT,MPI_MAX,PETSC_COMM_WORLD);
1375:   PetscNvshmemMalloc(n*sizeof(PetscScalar),(void**)&veccuda->GPUarray_allocated);
1376:   veccuda->GPUarray = veccuda->GPUarray_allocated;
1377:   veccuda->nvshmem  = PETSC_TRUE;
1378:   return(0);
1379: }
1380: #endif