Actual source code: veccuda2.cu

  1: /*
  2:    Implements the sequential CUDA vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9: #include <petsc/private/vecimpl.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/cudavecimpl.h>

 13: #include <cuda_runtime.h>
 14: #include <thrust/device_ptr.h>
 15: #include <thrust/transform.h>
 16: #include <thrust/functional.h>
 17: #include <thrust/reduce.h>

 19: /*
 20:     Allocates space for the vector array on the GPU if it does not exist.
 21:     Does NOT change the offload mask (v->offloadmask) of the vector.
 22:     Does NOT zero the CUDA array.

 24:  */
 25: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 26: {
 28:   cudaError_t    err;
 29:   Vec_CUDA       *veccuda;
 30:   PetscBool      option_set;

 33:   if (!v->spptr) {
 34:     PetscReal pinned_memory_min;
 35:     PetscCalloc(sizeof(Vec_CUDA),&v->spptr);
 36:     veccuda = (Vec_CUDA*)v->spptr;
 37:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 38:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 39:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->offloadmask = PETSC_OFFLOAD_CPU;
 42:       } else {
 43:         v->offloadmask = PETSC_OFFLOAD_GPU;
 44:       }
 45:     }
 46:     pinned_memory_min = 0;

 48:     /* Need to parse the command line here for the minimum size to use for pinned memory allocations on the host; a command-line sketch follows this function.
 49:        Note: This same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
 50:     PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
 51:     PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
 52:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 53:     PetscOptionsEnd();
 54:   }
 55:   return(0);
 56: }
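
/*
   A minimal command-line sketch for the -vec_pinned_memory_min option parsed above (the
   executable name and the threshold value are placeholders):

     ./app -vec_type seqcuda -vec_pinned_memory_min 1000000

   With this setting, host-side allocations of at least 1000000 bytes for CUDA vectors use
   pinned (page-locked) memory, which typically speeds up host<->device transfers.
*/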

 58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 59: PetscErrorCode VecCUDACopyToGPU(Vec v)
 60: {
 62:   cudaError_t    err;
 63:   Vec_CUDA       *veccuda;
 64:   PetscScalar    *varray;

 68:   VecCUDAAllocateCheck(v);
 69:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 70:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 71:     veccuda            = (Vec_CUDA*)v->spptr;
 72:     varray             = veccuda->GPUarray;
 73:     err                = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 74:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 75:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 76:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 77:   }
 78:   return(0);
 79: }

 81: /*
 82:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
 83: */
 84: PetscErrorCode VecCUDACopyFromGPU(Vec v)
 85: {
 87:   cudaError_t    err;
 88:   Vec_CUDA       *veccuda;
 89:   PetscScalar    *varray;

 93:   VecCUDAAllocateCheckHost(v);
 94:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
 95:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
 96:     veccuda            = (Vec_CUDA*)v->spptr;
 97:     varray             = veccuda->GPUarray;
 98:     err                = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
 99:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
100:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
101:     v->offloadmask     = PETSC_OFFLOAD_BOTH;
102:   }
103:   return(0);
104: }
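
/*
   Sketch of the access pattern used by the operations below: device-side routines obtain the
   GPU array with VecCUDAGetArrayRead()/VecCUDAGetArray(), which are expected to ensure an
   up-to-date device copy (e.g. via VecCUDACopyToGPU() above), and the copy routines leave the
   vector in the PETSC_OFFLOAD_BOTH state so that repeated accesses on either side do not re-copy:

     const PetscScalar *a;
     VecCUDAGetArrayRead(x,&a);       // copies host->device only if the mask is PETSC_OFFLOAD_CPU
     ... launch kernels or cuBLAS calls on a ...
     VecCUDARestoreArrayRead(x,&a);
*/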

106: /*MC
107:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

109:    Options Database Keys:
110: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

112:   Level: beginner

114: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
115: M*/
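
/*
   Example usage (a minimal sketch; the local size n is a placeholder):

     Vec x;
     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,PETSC_DECIDE,n);
     VecSetType(x,VECSEQCUDA);        // or run with -vec_type seqcuda and call VecSetFromOptions(x)
     ...
     VecDestroy(&x);

   VecCreateSeqCUDA(PETSC_COMM_SELF,n,&x) is an equivalent one-step constructor.
*/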

117: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
118: {
119:   const PetscScalar *xarray;
120:   PetscScalar       *yarray;
121:   PetscErrorCode    ierr;
122:   PetscBLASInt      one = 1,bn = 0;
123:   PetscScalar       sone = 1.0;
124:   cublasHandle_t    cublasv2handle;
125:   cublasStatus_t    cberr;
126:   cudaError_t       err;

129:   PetscCUBLASGetHandle(&cublasv2handle);
130:   PetscBLASIntCast(yin->map->n,&bn);
131:   VecCUDAGetArrayRead(xin,&xarray);
132:   VecCUDAGetArray(yin,&yarray);
133:   PetscLogGpuTimeBegin();
134:   if (alpha == (PetscScalar)0.0) {
135:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
136:   } else if (alpha == (PetscScalar)1.0) {
137:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
138:     PetscLogGpuFlops(1.0*yin->map->n);
139:   } else {
140:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
141:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
142:     PetscLogGpuFlops(2.0*yin->map->n);
143:   }
144:   err  = WaitForCUDA();CHKERRCUDA(err);
145:   PetscLogGpuTimeEnd();
146:   VecCUDARestoreArrayRead(xin,&xarray);
147:   VecCUDARestoreArray(yin,&yarray);
148:   PetscLogCpuToGpu(sizeof(PetscScalar));
149:   return(0);
150: }

152: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
153: {
154:   const PetscScalar *xarray;
155:   PetscScalar       *yarray;
156:   PetscErrorCode    ierr;
157:   PetscBLASInt      one = 1,bn = 0;
158:   cublasHandle_t    cublasv2handle;
159:   cublasStatus_t    cberr;
160:   PetscBool         xiscuda;
161:   cudaError_t       err;

164:   if (alpha == (PetscScalar)0.0) return(0);
165:   PetscCUBLASGetHandle(&cublasv2handle);
166:   PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
167:   if (xiscuda) {
168:     PetscBLASIntCast(yin->map->n,&bn);
169:     VecCUDAGetArrayRead(xin,&xarray);
170:     VecCUDAGetArray(yin,&yarray);
171:     PetscLogGpuTimeBegin();
172:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
173:     err  = WaitForCUDA();CHKERRCUDA(err);
174:     PetscLogGpuTimeEnd();
175:     VecCUDARestoreArrayRead(xin,&xarray);
176:     VecCUDARestoreArray(yin,&yarray);
177:     PetscLogGpuFlops(2.0*yin->map->n);
178:     PetscLogCpuToGpu(sizeof(PetscScalar));
179:   } else {
180:     VecAXPY_Seq(yin,alpha,xin);
181:   }
182:   return(0);
183: }

185: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
186: {
187:   PetscInt                              n = xin->map->n;
188:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
189:   PetscScalar                           *warray=NULL;
190:   thrust::device_ptr<const PetscScalar> xptr,yptr;
191:   thrust::device_ptr<PetscScalar>       wptr;
192:   PetscErrorCode                        ierr;
193:   cudaError_t                           err;

196:   VecCUDAGetArrayWrite(win,&warray);
197:   VecCUDAGetArrayRead(xin,&xarray);
198:   VecCUDAGetArrayRead(yin,&yarray);
199:   PetscLogGpuTimeBegin();
200:   try {
201:     wptr = thrust::device_pointer_cast(warray);
202:     xptr = thrust::device_pointer_cast(xarray);
203:     yptr = thrust::device_pointer_cast(yarray);
204:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
205:     err  = WaitForCUDA();CHKERRCUDA(err);
206:   } catch (char *ex) {
207:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
208:   }
209:   PetscLogGpuTimeEnd();
210:   PetscLogGpuFlops(n);
211:   VecCUDARestoreArrayRead(xin,&xarray);
212:   VecCUDARestoreArrayRead(yin,&yarray);
213:   VecCUDARestoreArrayWrite(win,&warray);
214:   return(0);
215: }

217: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
218: {
219:   const PetscScalar *xarray=NULL,*yarray=NULL;
220:   PetscScalar       *warray=NULL;
221:   PetscErrorCode    ierr;
222:   PetscBLASInt      one = 1,bn = 0;
223:   cublasHandle_t    cublasv2handle;
224:   cublasStatus_t    stat;
225:   cudaError_t       cerr;
226:   cudaStream_t      stream;

229:   PetscCUBLASGetHandle(&cublasv2handle);
230:   PetscBLASIntCast(win->map->n,&bn);
231:   if (alpha == (PetscScalar)0.0) {
232:     VecCopy_SeqCUDA(yin,win);
233:   } else {
234:     VecCUDAGetArrayRead(xin,&xarray);
235:     VecCUDAGetArrayRead(yin,&yarray);
236:     VecCUDAGetArrayWrite(win,&warray);
237:     PetscLogGpuTimeBegin();
238:     stat = cublasGetStream(cublasv2handle,&stream);CHKERRCUBLAS(stat);
239:     cerr = cudaMemcpyAsync(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,stream);CHKERRCUDA(cerr);
240:     stat = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(stat);
241:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
242:     PetscLogGpuTimeEnd();
243:     PetscLogGpuFlops(2*win->map->n);
244:     VecCUDARestoreArrayRead(xin,&xarray);
245:     VecCUDARestoreArrayRead(yin,&yarray);
246:     VecCUDARestoreArrayWrite(win,&warray);
247:     PetscLogCpuToGpu(sizeof(PetscScalar));
248:   }
249:   return(0);
250: }

252: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
253: {
254:   PetscErrorCode    ierr;
255:   cudaError_t       err;
256:   PetscInt          n = xin->map->n,j;
257:   PetscScalar       *xarray;
258:   const PetscScalar *yarray;
259:   PetscBLASInt      one = 1,bn = 0;
260:   cublasHandle_t    cublasv2handle;
261:   cublasStatus_t    cberr;

264:   PetscLogGpuFlops(nv*2.0*n);
265:   PetscLogCpuToGpu(nv*sizeof(PetscScalar));
266:   PetscCUBLASGetHandle(&cublasv2handle);
267:   PetscBLASIntCast(n,&bn);
268:   PetscLogGpuTimeBegin();
269:   VecCUDAGetArray(xin,&xarray);
270:   for (j=0; j<nv; j++) {
271:     VecCUDAGetArrayRead(y[j],&yarray);
272:     cberr = cublasXaxpy(cublasv2handle,bn,alpha+j,yarray,one,xarray,one);CHKERRCUBLAS(cberr);
273:     VecCUDARestoreArrayRead(y[j],&yarray);
274:   }
275:   err  = WaitForCUDA();CHKERRCUDA(err);
276:   PetscLogGpuTimeEnd();
277:   VecCUDARestoreArray(xin,&xarray);
278:   return(0);
279: }

281: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
282: {
283:   const PetscScalar *xarray,*yarray;
284:   PetscErrorCode    ierr;
285:   PetscBLASInt      one = 1,bn = 0;
286:   cublasHandle_t    cublasv2handle;
287:   cublasStatus_t    cerr;

290:   PetscCUBLASGetHandle(&cublasv2handle);
291:   PetscBLASIntCast(yin->map->n,&bn);
292:   VecCUDAGetArrayRead(xin,&xarray);
293:   VecCUDAGetArrayRead(yin,&yarray);
294:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
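  /* In symbols: PETSc defines VecDot(x,y) = y^H x, i.e. the second argument is conjugated, while
     cublasXdot (BLAS dotc) conjugates its first argument; calling it below with (yarray,xarray)
     therefore yields sum_i conj(y_i)*x_i = y^H x, matching the PETSc convention. */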
295:   PetscLogGpuTimeBegin();
296:   cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
297:   PetscLogGpuTimeEnd();
 298:   if (xin->map->n > 0) {
299:     PetscLogGpuFlops(2.0*xin->map->n-1);
300:   }
301:   PetscLogGpuToCpu(sizeof(PetscScalar));
302:   VecCUDARestoreArrayRead(xin,&xarray);
303:   VecCUDARestoreArrayRead(yin,&yarray);
304:   return(0);
305: }

307: //
308: // CUDA kernels for MDot to follow
309: //

311: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
312: #define MDOT_WORKGROUP_SIZE 128
313: #define MDOT_WORKGROUP_NUM  128
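
/*
   Sketch of the reduction scheme used by the kernels below (the numbers follow the two defines
   above): each kernel is launched as <<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>, i.e. 128 blocks
   of 128 threads. Every block reduces its partial sums in shared memory and writes one PetscScalar
   per dot product to group_results, so M simultaneous dot products need a scratch buffer of
   M*MDOT_WORKGROUP_NUM = M*128 PetscScalars, which VecMDot_SeqCUDA() then sums on the host.
*/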

315: #if !defined(PETSC_USE_COMPLEX)
316: // M = 2:
317: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
318:                                         PetscInt size, PetscScalar *group_results)
319: {
320:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
321:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
322:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
323:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
324:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

326:   PetscScalar entry_x    = 0;
327:   PetscScalar group_sum0 = 0;
328:   PetscScalar group_sum1 = 0;
329:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
330:     entry_x     = x[i];   // load only once from global memory!
331:     group_sum0 += entry_x * y0[i];
332:     group_sum1 += entry_x * y1[i];
333:   }
334:   tmp_buffer[threadIdx.x]                       = group_sum0;
335:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

337:   // parallel reduction
338:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
339:     __syncthreads();
340:     if (threadIdx.x < stride) {
341:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
342:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
343:     }
344:   }

346:   // write result of group to group_results
347:   if (threadIdx.x == 0) {
348:     group_results[blockIdx.x]             = tmp_buffer[0];
349:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
350:   }
351: }

353: // M = 3:
354: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
355:                                         PetscInt size, PetscScalar *group_results)
356: {
357:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
358:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
359:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
360:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
361:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

363:   PetscScalar entry_x    = 0;
364:   PetscScalar group_sum0 = 0;
365:   PetscScalar group_sum1 = 0;
366:   PetscScalar group_sum2 = 0;
367:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
368:     entry_x     = x[i];   // load only once from global memory!
369:     group_sum0 += entry_x * y0[i];
370:     group_sum1 += entry_x * y1[i];
371:     group_sum2 += entry_x * y2[i];
372:   }
373:   tmp_buffer[threadIdx.x]                           = group_sum0;
374:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
375:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

377:   // parallel reduction
378:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
379:     __syncthreads();
380:     if (threadIdx.x < stride) {
381:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
382:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
383:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
384:     }
385:   }

387:   // write result of group to group_results
388:   if (threadIdx.x == 0) {
389:     group_results[blockIdx.x                ] = tmp_buffer[0];
390:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
391:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
392:   }
393: }

395: // M = 4:
396: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
397:                                         PetscInt size, PetscScalar *group_results)
398: {
399:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
400:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
401:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
402:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
403:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

405:   PetscScalar entry_x    = 0;
406:   PetscScalar group_sum0 = 0;
407:   PetscScalar group_sum1 = 0;
408:   PetscScalar group_sum2 = 0;
409:   PetscScalar group_sum3 = 0;
410:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
411:     entry_x     = x[i];   // load only once from global memory!
412:     group_sum0 += entry_x * y0[i];
413:     group_sum1 += entry_x * y1[i];
414:     group_sum2 += entry_x * y2[i];
415:     group_sum3 += entry_x * y3[i];
416:   }
417:   tmp_buffer[threadIdx.x]                           = group_sum0;
418:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
419:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
420:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

422:   // parallel reduction
423:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
424:     __syncthreads();
425:     if (threadIdx.x < stride) {
426:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
427:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
428:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
429:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
430:     }
431:   }

433:   // write result of group to group_results
434:   if (threadIdx.x == 0) {
435:     group_results[blockIdx.x                ] = tmp_buffer[0];
436:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
437:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
438:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
439:   }
440: }

442: // M = 8:
443: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
444:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
445:                                           PetscInt size, PetscScalar *group_results)
446: {
447:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
448:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
449:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
450:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
451:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

453:   PetscScalar entry_x    = 0;
454:   PetscScalar group_sum0 = 0;
455:   PetscScalar group_sum1 = 0;
456:   PetscScalar group_sum2 = 0;
457:   PetscScalar group_sum3 = 0;
458:   PetscScalar group_sum4 = 0;
459:   PetscScalar group_sum5 = 0;
460:   PetscScalar group_sum6 = 0;
461:   PetscScalar group_sum7 = 0;
462:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
463:     entry_x     = x[i];   // load only once from global memory!
464:     group_sum0 += entry_x * y0[i];
465:     group_sum1 += entry_x * y1[i];
466:     group_sum2 += entry_x * y2[i];
467:     group_sum3 += entry_x * y3[i];
468:     group_sum4 += entry_x * y4[i];
469:     group_sum5 += entry_x * y5[i];
470:     group_sum6 += entry_x * y6[i];
471:     group_sum7 += entry_x * y7[i];
472:   }
473:   tmp_buffer[threadIdx.x]                           = group_sum0;
474:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
475:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
476:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
477:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
478:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
479:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
480:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

482:   // parallel reduction
483:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
484:     __syncthreads();
485:     if (threadIdx.x < stride) {
486:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
487:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
488:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
489:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
490:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
491:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
492:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
493:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
494:     }
495:   }

497:   // write result of group to group_results
498:   if (threadIdx.x == 0) {
499:     group_results[blockIdx.x                ] = tmp_buffer[0];
500:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
501:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
502:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
503:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
504:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
505:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
506:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
507:   }
508: }
509: #endif /* !defined(PETSC_USE_COMPLEX) */

511: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
512: {
513:   PetscErrorCode    ierr;
514:   PetscInt          i,n = xin->map->n,current_y_index = 0;
515:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
516: #if !defined(PETSC_USE_COMPLEX)
517:   PetscInt          nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
518:   PetscScalar       *group_results_gpu,*group_results_cpu;
519:   cudaError_t       cuda_ierr;
520: #endif
521:   PetscBLASInt      one = 1,bn = 0;
522:   cublasHandle_t    cublasv2handle;
523:   cublasStatus_t    cberr;
524:   cudaError_t       err;

527:   PetscCUBLASGetHandle(&cublasv2handle);
528:   PetscBLASIntCast(xin->map->n,&bn);
529:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
530:   /* Handle the case of local size zero first */
531:   if (!xin->map->n) {
532:     for (i=0; i<nv; ++i) z[i] = 0;
533:     return(0);
534:   }

536: #if !defined(PETSC_USE_COMPLEX)
537:   // allocate scratchpad memory for the results of individual work groups:
 538:   cuda_ierr = cudaMalloc((void**)&group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);CHKERRCUDA(cuda_ierr);
539:   PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
540: #endif
541:   VecCUDAGetArrayRead(xin,&xptr);
542:   PetscLogGpuTimeBegin();

544:   while (current_y_index < nv)
545:   {
546:     switch (nv - current_y_index) {

548:       case 7:
549:       case 6:
550:       case 5:
551:       case 4:
552:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
553:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
554:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
555:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
556: #if defined(PETSC_USE_COMPLEX)
557:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
558:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
559:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
560:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
561: #else
562:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);

564: #endif
565:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
566:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
567:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
568:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
569:         current_y_index += 4;
570:         break;

572:       case 3:
573:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
574:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
575:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

577: #if defined(PETSC_USE_COMPLEX)
578:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
579:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
580:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
581: #else
582:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
583: #endif
584:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
585:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
586:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
587:         current_y_index += 3;
588:         break;

590:       case 2:
591:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
592:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
593: #if defined(PETSC_USE_COMPLEX)
594:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
595:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
596: #else
597:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
598: #endif
599:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
600:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
601:         current_y_index += 2;
602:         break;

604:       case 1:
605:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
606:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
607:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
608:         current_y_index += 1;
609:         break;

611:       default: // 8 or more vectors left
612:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
613:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
614:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
615:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
616:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
617:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
618:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
619:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
620: #if defined(PETSC_USE_COMPLEX)
621:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
622:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
623:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
624:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
625:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
626:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
627:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
628:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
629: #else
630:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
631: #endif
632:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
633:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
634:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
635:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
636:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
637:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
638:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
639:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
640:         current_y_index += 8;
641:         break;
642:     }
643:   }
644:   err  = WaitForCUDA();CHKERRCUDA(err);
645:   PetscLogGpuTimeEnd();
646:   VecCUDARestoreArrayRead(xin,&xptr);

648: #if defined(PETSC_USE_COMPLEX)
649:   PetscLogGpuToCpu(nv*sizeof(PetscScalar));
650: #else
651:   // copy results to CPU
 652:   cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

654:   // sum group results into z
655:   for (j=0; j<nv1; ++j) {
656:     z[j] = 0;
657:     for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
658:   }
659:   PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
 660:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
661:   PetscFree(group_results_cpu);
662:   PetscLogGpuToCpu(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
663: #endif
664:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
665:   return(0);
666: }

668: #undef MDOT_WORKGROUP_SIZE
669: #undef MDOT_WORKGROUP_NUM

671: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
672: {
673:   PetscInt                        n = xin->map->n;
674:   PetscScalar                     *xarray = NULL;
675:   thrust::device_ptr<PetscScalar> xptr;
676:   PetscErrorCode                  ierr;
677:   cudaError_t                     err;

680:   VecCUDAGetArrayWrite(xin,&xarray);
681:   PetscLogGpuTimeBegin();
682:   if (alpha == (PetscScalar)0.0) {
683:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
684:   } else {
685:     try {
686:       xptr = thrust::device_pointer_cast(xarray);
687:       thrust::fill(xptr,xptr+n,alpha);
688:     } catch (char *ex) {
689:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
690:     }
691:     PetscLogCpuToGpu(sizeof(PetscScalar));
692:   }
693:   err  = WaitForCUDA();CHKERRCUDA(err);
694:   PetscLogGpuTimeEnd();
695:   VecCUDARestoreArrayWrite(xin,&xarray);
696:   return(0);
697: }

699: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
700: {
701:   PetscScalar    *xarray;
703:   PetscBLASInt   one = 1,bn = 0;
704:   cublasHandle_t cublasv2handle;
705:   cublasStatus_t cberr;
706:   cudaError_t    err;

709:   if (alpha == (PetscScalar)0.0) {
710:     VecSet_SeqCUDA(xin,alpha);
711:     err  = WaitForCUDA();CHKERRCUDA(err);
712:   } else if (alpha != (PetscScalar)1.0) {
713:     PetscCUBLASGetHandle(&cublasv2handle);
714:     PetscBLASIntCast(xin->map->n,&bn);
715:     VecCUDAGetArray(xin,&xarray);
716:     PetscLogGpuTimeBegin();
717:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
718:     VecCUDARestoreArray(xin,&xarray);
719:     err  = WaitForCUDA();CHKERRCUDA(err);
720:     PetscLogGpuTimeEnd();
721:     PetscLogCpuToGpu(sizeof(PetscScalar));
722:     PetscLogGpuFlops(xin->map->n);
723:   }
724:   return(0);
725: }

727: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
728: {
729:   const PetscScalar *xarray,*yarray;
730:   PetscErrorCode    ierr;
731:   PetscBLASInt      one = 1,bn = 0;
732:   cublasHandle_t    cublasv2handle;
733:   cublasStatus_t    cerr;

736:   PetscCUBLASGetHandle(&cublasv2handle);
737:   PetscBLASIntCast(xin->map->n,&bn);
738:   VecCUDAGetArrayRead(xin,&xarray);
739:   VecCUDAGetArrayRead(yin,&yarray);
740:   PetscLogGpuTimeBegin();
741:   cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
742:   PetscLogGpuTimeEnd();
743:   if (xin->map->n > 0) {
744:     PetscLogGpuFlops(2.0*xin->map->n-1);
745:   }
746:   PetscLogGpuToCpu(sizeof(PetscScalar));
747:   VecCUDARestoreArrayRead(yin,&yarray);
748:   VecCUDARestoreArrayRead(xin,&xarray);
749:   return(0);
750: }

752: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
753: {
754:   const PetscScalar *xarray;
755:   PetscScalar       *yarray;
756:   PetscErrorCode    ierr;
757:   cudaError_t       err;

760:   if (xin != yin) {
761:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
762:       PetscBool yiscuda;

764:       PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
765:       VecCUDAGetArrayRead(xin,&xarray);
766:       if (yiscuda) {
767:         VecCUDAGetArrayWrite(yin,&yarray);
768:       } else {
769:         VecGetArrayWrite(yin,&yarray);
770:       }
771:       PetscLogGpuTimeBegin();
772:       if (yiscuda) {
773:         err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
774:       } else {
775:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
776:       }
777:       err  = WaitForCUDA();CHKERRCUDA(err);
778:       PetscLogGpuTimeEnd();
779:       VecCUDARestoreArrayRead(xin,&xarray);
780:       if (yiscuda) {
781:         VecCUDARestoreArrayWrite(yin,&yarray);
782:       } else {
783:         VecRestoreArrayWrite(yin,&yarray);
784:       }
785:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
786:       /* copy in CPU if we are on the CPU */
787:       VecCopy_SeqCUDA_Private(xin,yin);
788:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
 789:       /* if xin is valid in both places, see where yin is and copy there (because that is probably where we'll want to use it next) */
790:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
791:         /* copy in CPU */
792:         VecCopy_SeqCUDA_Private(xin,yin);
793:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
794:         /* copy in GPU */
795:         VecCUDAGetArrayRead(xin,&xarray);
796:         VecCUDAGetArrayWrite(yin,&yarray);
797:         PetscLogGpuTimeBegin();
798:         err  = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
799:         PetscLogGpuTimeEnd();
800:         VecCUDARestoreArrayRead(xin,&xarray);
801:         VecCUDARestoreArrayWrite(yin,&yarray);
802:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
 803:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
 804:            default to copying on the GPU (this is an arbitrary choice) */
805:         VecCUDAGetArrayRead(xin,&xarray);
806:         VecCUDAGetArrayWrite(yin,&yarray);
807:         PetscLogGpuTimeBegin();
808:         err  = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
809:         PetscLogGpuTimeEnd();
810:         VecCUDARestoreArrayRead(xin,&xarray);
811:         VecCUDARestoreArrayWrite(yin,&yarray);
812:       } else {
813:         VecCopy_SeqCUDA_Private(xin,yin);
814:       }
815:     }
816:   }
817:   return(0);
818: }

820: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
821: {
823:   PetscBLASInt   one = 1,bn = 0;
824:   PetscScalar    *xarray,*yarray;
825:   cublasHandle_t cublasv2handle;
826:   cublasStatus_t cberr;
827:   cudaError_t    err;

830:   PetscCUBLASGetHandle(&cublasv2handle);
831:   PetscBLASIntCast(xin->map->n,&bn);
832:   if (xin != yin) {
833:     VecCUDAGetArray(xin,&xarray);
834:     VecCUDAGetArray(yin,&yarray);
835:     PetscLogGpuTimeBegin();
836:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
837:     err  = WaitForCUDA();CHKERRCUDA(err);
838:     PetscLogGpuTimeEnd();
839:     VecCUDARestoreArray(xin,&xarray);
840:     VecCUDARestoreArray(yin,&yarray);
841:   }
842:   return(0);
843: }

845: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
846: {
847:   PetscErrorCode    ierr;
848:   PetscScalar       a = alpha,b = beta;
849:   const PetscScalar *xarray;
850:   PetscScalar       *yarray;
851:   PetscBLASInt      one = 1, bn = 0;
852:   cublasHandle_t    cublasv2handle;
853:   cublasStatus_t    cberr;
854:   cudaError_t       err;

857:   PetscCUBLASGetHandle(&cublasv2handle);
858:   PetscBLASIntCast(yin->map->n,&bn);
859:   if (a == (PetscScalar)0.0) {
860:     VecScale_SeqCUDA(yin,beta);
861:   } else if (b == (PetscScalar)1.0) {
862:     VecAXPY_SeqCUDA(yin,alpha,xin);
863:   } else if (a == (PetscScalar)1.0) {
864:     VecAYPX_SeqCUDA(yin,beta,xin);
865:   } else if (b == (PetscScalar)0.0) {
866:     VecCUDAGetArrayRead(xin,&xarray);
867:     VecCUDAGetArray(yin,&yarray);
868:     PetscLogGpuTimeBegin();
869:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
870:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
871:     err  = WaitForCUDA();CHKERRCUDA(err);
872:     PetscLogGpuTimeEnd();
873:     PetscLogGpuFlops(xin->map->n);
874:     PetscLogCpuToGpu(sizeof(PetscScalar));
875:     VecCUDARestoreArrayRead(xin,&xarray);
876:     VecCUDARestoreArray(yin,&yarray);
877:   } else {
878:     VecCUDAGetArrayRead(xin,&xarray);
879:     VecCUDAGetArray(yin,&yarray);
880:     PetscLogGpuTimeBegin();
881:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
882:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
883:     VecCUDARestoreArrayRead(xin,&xarray);
884:     VecCUDARestoreArray(yin,&yarray);
885:     err  = WaitForCUDA();CHKERRCUDA(err);
886:     PetscLogGpuTimeEnd();
887:     PetscLogGpuFlops(3.0*xin->map->n);
888:     PetscLogCpuToGpu(2*sizeof(PetscScalar));
889:   }
890:   return(0);
891: }

893: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
894: {
896:   PetscInt       n = zin->map->n;

899:   if (gamma == (PetscScalar)1.0) {
 900:     /* z = a*x + b*y + z */
901:     VecAXPY_SeqCUDA(zin,alpha,xin);
902:     VecAXPY_SeqCUDA(zin,beta,yin);
903:     PetscLogGpuFlops(4.0*n);
904:   } else {
905:     /* z = a*x + b*y + c*z */
906:     VecScale_SeqCUDA(zin,gamma);
907:     VecAXPY_SeqCUDA(zin,alpha,xin);
908:     VecAXPY_SeqCUDA(zin,beta,yin);
909:     PetscLogGpuFlops(5.0*n);
910:   }
911:   return(0);
912: }

914: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
915: {
916:   PetscInt                              n = win->map->n;
917:   const PetscScalar                     *xarray,*yarray;
918:   PetscScalar                           *warray;
919:   thrust::device_ptr<const PetscScalar> xptr,yptr;
920:   thrust::device_ptr<PetscScalar>       wptr;
921:   PetscErrorCode                        ierr;
922:   cudaError_t                           err;

925:   VecCUDAGetArrayRead(xin,&xarray);
926:   VecCUDAGetArrayRead(yin,&yarray);
927:   VecCUDAGetArrayWrite(win,&warray);
928:   PetscLogGpuTimeBegin();
929:   try {
930:     wptr = thrust::device_pointer_cast(warray);
931:     xptr = thrust::device_pointer_cast(xarray);
932:     yptr = thrust::device_pointer_cast(yarray);
933:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
934:     err  = WaitForCUDA();CHKERRCUDA(err);
935:   } catch (char *ex) {
936:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
937:   }
938:   PetscLogGpuTimeEnd();
939:   VecCUDARestoreArrayRead(xin,&xarray);
940:   VecCUDARestoreArrayRead(yin,&yarray);
941:   VecCUDARestoreArrayWrite(win,&warray);
942:   PetscLogGpuFlops(n);
943:   return(0);
944: }

 946: /* should do infinity norm in CUDA */

948: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
949: {
950:   PetscErrorCode    ierr;
951:   PetscInt          n = xin->map->n;
952:   PetscBLASInt      one = 1, bn = 0;
953:   const PetscScalar *xarray;
954:   cublasHandle_t    cublasv2handle;
955:   cublasStatus_t    cberr;
956:   cudaError_t       err;

959:   PetscCUBLASGetHandle(&cublasv2handle);
960:   PetscBLASIntCast(n,&bn);
961:   if (type == NORM_2 || type == NORM_FROBENIUS) {
962:     VecCUDAGetArrayRead(xin,&xarray);
963:     PetscLogGpuTimeBegin();
964:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
965:     PetscLogGpuTimeEnd();
966:     VecCUDARestoreArrayRead(xin,&xarray);
967:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
968:   } else if (type == NORM_INFINITY) {
969:     int  i;
970:     VecCUDAGetArrayRead(xin,&xarray);
971:     PetscLogGpuTimeBegin();
972:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
973:     PetscLogGpuTimeEnd();
974:     if (bn) {
975:       PetscScalar zs;
976:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
977:       *z = PetscAbsScalar(zs);
978:     } else *z = 0.0;
979:     VecCUDARestoreArrayRead(xin,&xarray);
980:   } else if (type == NORM_1) {
981:     VecCUDAGetArrayRead(xin,&xarray);
982:     PetscLogGpuTimeBegin();
983:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
984:     VecCUDARestoreArrayRead(xin,&xarray);
985:     PetscLogGpuTimeEnd();
986:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
987:   } else if (type == NORM_1_AND_2) {
988:     VecNorm_SeqCUDA(xin,NORM_1,z);
989:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
990:   }
991:   PetscLogGpuToCpu(sizeof(PetscReal));
992:   return(0);
993: }

995: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
996: {

1000:   VecDot_SeqCUDA(s,t,dp);
1001:   VecDot_SeqCUDA(t,t,nm);
1002:   return(0);
1003: }

1005: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1006: {
1008:   cudaError_t    cerr;
1009:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;

1012:   if (v->spptr) {
1013:     if (veccuda->GPUarray_allocated) {
1014:      #if defined(PETSC_HAVE_NVSHMEM)
1015:       if (veccuda->nvshmem) {
1016:         PetscNvshmemFree(veccuda->GPUarray_allocated);
1017:         veccuda->nvshmem = PETSC_FALSE;
1018:       }
1019:       else
1020:      #endif
1021:       {cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);}
1022:       veccuda->GPUarray_allocated = NULL;
1023:     }
1024:     if (veccuda->stream) {
1025:       cerr = cudaStreamDestroy(veccuda->stream);CHKERRCUDA(cerr);
1026:     }
1027:   }
1028:   VecDestroy_SeqCUDA_Private(v);
1029:   PetscFree(v->spptr);
1030:   return(0);
1031: }

1033: #if defined(PETSC_USE_COMPLEX)
1034: struct conjugate
1035: {
1036:   __host__ __device__
1037:     PetscScalar operator()(PetscScalar x)
1038:     {
1039:       return PetscConj(x);
1040:     }
1041: };
1042: #endif

1044: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1045: {
1046: #if defined(PETSC_USE_COMPLEX)
1047:   PetscScalar                     *xarray;
1048:   PetscErrorCode                  ierr;
1049:   PetscInt                        n = xin->map->n;
1050:   thrust::device_ptr<PetscScalar> xptr;
1051:   cudaError_t                     err;

1054:   VecCUDAGetArray(xin,&xarray);
1055:   PetscLogGpuTimeBegin();
1056:   try {
1057:     xptr = thrust::device_pointer_cast(xarray);
1058:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1059:     err  = WaitForCUDA();CHKERRCUDA(err);
1060:   } catch (char *ex) {
1061:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1062:   }
1063:   PetscLogGpuTimeEnd();
1064:   VecCUDARestoreArray(xin,&xarray);
1065: #else
1067: #endif
1068:   return(0);
1069: }

1071: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1072: {
1074:   cudaError_t    err;

1081:   if (w->data) {
1082:     if (((Vec_Seq*)w->data)->array_allocated) {
1083:       if (w->pinned_memory) {
1084:         PetscMallocSetCUDAHost();
1085:       }
1086:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1087:       if (w->pinned_memory) {
1088:         PetscMallocResetCUDAHost();
1089:         w->pinned_memory = PETSC_FALSE;
1090:       }
1091:     }
1092:     ((Vec_Seq*)w->data)->array = NULL;
1093:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1094:   }
1095:   if (w->spptr) {
1097:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1098:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1099:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1100:     }
1101:     if (((Vec_CUDA*)w->spptr)->stream) {
1102:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1103:     }
1104:     PetscFree(w->spptr);
1105:   }

1107:   if (v->petscnative) {
1108:     PetscFree(w->data);
1109:     w->data = v->data;
1110:     w->offloadmask = v->offloadmask;
1111:     w->pinned_memory = v->pinned_memory;
1112:     w->spptr = v->spptr;
1113:     PetscObjectStateIncrease((PetscObject)w);
1114:   } else {
1115:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1116:     w->offloadmask = PETSC_OFFLOAD_CPU;
1117:     VecCUDAAllocateCheck(w);
1118:   }
1119:   return(0);
1120: }

1122: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1123: {
1125:   cudaError_t    err;


1133:   if (v->petscnative) {
1134:     v->data = w->data;
1135:     v->offloadmask = w->offloadmask;
1136:     v->pinned_memory = w->pinned_memory;
1137:     v->spptr = w->spptr;
1138:     w->data = 0;
1139:     w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1140:     w->spptr = 0;
1141:   } else {
1142:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1143:     if ((Vec_CUDA*)w->spptr) {
1144:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1145:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1146:       if (((Vec_CUDA*)v->spptr)->stream) {
1147:         err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1148:       }
1149:       PetscFree(w->spptr);
1150:     }
1151:   }
1152:   return(0);
1153: }

1155: struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
1156: {
1157:   __host__ __device__
1158:   PetscReal operator()(PetscScalar x) {
1159:     return PetscRealPart(x);
1160:   }
1161: };

1163: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1164: {
1165:   __host__ __device__
1166:   thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscScalar, PetscInt> x) {
1167:     return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
1168:   }
1169: };

1171: struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1172: {
1173:   __host__ __device__
1174:   PetscReal operator()(PetscReal x, PetscReal y) {
1175:     return x < y ? y : x;
1176:   }
1177: };

1179: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1180: {
1181:   __host__ __device__
1182:   thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscReal, PetscInt> x, thrust::tuple<PetscReal, PetscInt> y) {
1183:     return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1184:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1185:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1186:   }
1187: };

1189: struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1190: {
1191:   __host__ __device__
1192:   PetscReal operator()(PetscReal x, PetscReal y) {
1193:     return x < y ? x : y;
1194:   }
1195: };

1197: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1198: {
1199:   __host__ __device__
1200:   thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscReal, PetscInt> x, thrust::tuple<PetscReal, PetscInt> y) {
1201:     return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1202:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1203:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1204:   }
1205: };

1207: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1208: {
1209:   PetscErrorCode                        ierr;
1210:   PetscInt                              n = v->map->n;
1211:   const PetscScalar                     *av;
1212:   thrust::device_ptr<const PetscScalar> avpt;

1216:   if (!n) {
1217:     *m = PETSC_MIN_REAL;
1218:     if (p) *p = -1;
1219:     return(0);
1220:   }
1221:   VecCUDAGetArrayRead(v,&av);
1222:   avpt = thrust::device_pointer_cast(av);
1223:   PetscLogGpuTimeBegin();
1224:   if (p) {
1225:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
1226:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1227:     try {
1228: #if defined(PETSC_USE_COMPLEX)
1229:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
1230: #else
1231:       res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
1232: #endif
1233:     } catch (char *ex) {
1234:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1235:     }
1236:     *m = res.get<0>();
1237:     *p = res.get<1>();
1238:   } else {
1239:     try {
1240: #if defined(PETSC_USE_COMPLEX)
1241:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
1242: #else
1243:       *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
1244: #endif
1245:     } catch (char *ex) {
1246:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1247:     }
1248:   }
1249:   PetscLogGpuTimeEnd();
1250:   VecCUDARestoreArrayRead(v,&av);
1251:   return(0);
1252: }

1254: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1255: {
1256:   PetscErrorCode                        ierr;
1257:   PetscInt                              n = v->map->n;
1258:   const PetscScalar                     *av;
1259:   thrust::device_ptr<const PetscScalar> avpt;

1263:   if (!n) {
1264:     *m = PETSC_MAX_REAL;
1265:     if (p) *p = -1;
1266:     return(0);
1267:   }
1268:   VecCUDAGetArrayRead(v,&av);
1269:   avpt = thrust::device_pointer_cast(av);
1270:   PetscLogGpuTimeBegin();
1271:   if (p) {
1272:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
1273:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1274:     try {
1275: #if defined(PETSC_USE_COMPLEX)
1276:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
1277: #else
1278:       res = thrust::reduce(zibit,zibit+n,res,petscmini());
1279: #endif
1280:     } catch (char *ex) {
1281:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1282:     }
1283:     *m = res.get<0>();
1284:     *p = res.get<1>();
1285:   } else {
1286:     try {
1287: #if defined(PETSC_USE_COMPLEX)
1288:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
1289: #else
1290:       *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
1291: #endif
1292:     } catch (char *ex) {
1293:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1294:     }
1295:   }
1296:   PetscLogGpuTimeEnd();
1297:   VecCUDARestoreArrayRead(v,&av);
1298:   return(0);
1299: }
1300: #if defined(PETSC_HAVE_NVSHMEM)
1301: /* Free the old CUDA array and re-allocate a new one from the nvshmem symmetric heap.
1302:    The new array does not retain the values of the old array. The offload mask is not changed.

1304:    Note: the function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1305:  */
1306: PetscErrorCode  VecAllocateNVSHMEM_SeqCUDA(Vec v)
1307: {
1309:   cudaError_t    cerr;
1310:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;
1311:   PetscInt       n;

1314:   cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);
1315:   VecGetLocalSize(v,&n);
1316:   MPIU_Allreduce(MPI_IN_PLACE,&n,1,MPIU_INT,MPI_MAX,PETSC_COMM_WORLD);
1317:   PetscNvshmemMalloc(n*sizeof(PetscScalar),(void**)&veccuda->GPUarray_allocated);
1318:   veccuda->GPUarray = veccuda->GPUarray_allocated;
1319:   veccuda->nvshmem  = PETSC_TRUE;
1320:   return(0);
1321: }
1322: #endif