Actual source code: veccuda2.cu
1: /*
2: Implements the sequential cuda vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <petsc/private/vecimpl.h>
9: #include <../src/vec/vec/impls/dvecimpl.h>
10: #include <petsc/private/cudavecimpl.h>
12: #include <cuda_runtime.h>
13: #include <thrust/device_ptr.h>
14: #include <thrust/functional.h>
15: #include <thrust/iterator/counting_iterator.h>
16: #include <thrust/reduce.h>
17: #include <thrust/transform.h>
18: #if defined(PETSC_USE_COMPLEX)
19: #include <thrust/transform_reduce.h>
20: #endif
22: #if THRUST_VERSION >= 101600 && !PetscDefined(USE_DEBUG)
23: static thrust::cuda_cub::par_nosync_t::stream_attachment_type VecCUDAThrustPolicy(Vec x)
24: {
25: return thrust::cuda::par_nosync.on(((Vec_CUDA *)x->spptr)->stream);
26: }
27: #else
28: static thrust::cuda_cub::par_t::stream_attachment_type VecCUDAThrustPolicy(Vec x)
29: {
30: return thrust::cuda::par.on(((Vec_CUDA *)x->spptr)->stream);
31: }
32: #endif
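/* The execution policy returned above is passed as the first argument of the
   Thrust algorithms below, e.g. thrust::transform(VecCUDAThrustPolicy(x), ...),
   so that the work is enqueued on the vector's stream; with Thrust >= 1.16 in
   optimized builds, par_nosync additionally avoids an implicit stream
   synchronization when the algorithm returns. */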
34: /*
35: Allocates space for the vector array on the GPU if it does not exist.
36: Does NOT change the vector's offload mask (except to resolve PETSC_OFFLOAD_UNALLOCATED)
37: Does NOT zero the CUDA array
39: */
40: PetscErrorCode VecCUDAAllocateCheck(Vec v)
41: {
42: Vec_CUDA *veccuda;
43: PetscBool option_set;
45: if (!v->spptr) {
46: PetscReal pinned_memory_min;
47: PetscCalloc(sizeof(Vec_CUDA), &v->spptr);
48: veccuda = (Vec_CUDA *)v->spptr;
49: cudaMalloc((void **)&veccuda->GPUarray_allocated, sizeof(PetscScalar) * ((PetscBLASInt)v->map->n));
50: veccuda->GPUarray = veccuda->GPUarray_allocated;
51: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
52: if (v->data && ((Vec_Seq *)v->data)->array) {
53: v->offloadmask = PETSC_OFFLOAD_CPU;
54: } else {
55: v->offloadmask = PETSC_OFFLOAD_GPU;
56: }
57: }
58: pinned_memory_min = 0;
60: /* Parse the command line here for the minimum size at which host allocations use pinned memory.
61: Note: this same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
62: PetscOptionsBegin(PetscObjectComm((PetscObject)v), ((PetscObject)v)->prefix, "VECCUDA Options", "Vec");
63: PetscOptionsReal("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", pinned_memory_min, &pinned_memory_min, &option_set);
64: if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
65: PetscOptionsEnd();
66: }
67: return 0;
68: }
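/* For example, with the option below (a hypothetical value; see also
   VecSetPinnedMemoryMin()) only host allocations of at least 1 MiB use
   pinned (page-locked) memory:

     ./app -vec_pinned_memory_min 1048576
*/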
70: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
71: PetscErrorCode VecCUDACopyToGPU(Vec v)
72: {
73: Vec_CUDA *veccuda;
74: PetscScalar *varray;
77: VecCUDAAllocateCheck(v);
78: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
79: PetscLogEventBegin(VEC_CUDACopyToGPU, v, 0, 0, 0);
80: veccuda = (Vec_CUDA *)v->spptr;
81: varray = veccuda->GPUarray;
82: cudaMemcpy(varray, ((Vec_Seq *)v->data)->array, v->map->n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
83: PetscLogCpuToGpu((v->map->n) * sizeof(PetscScalar));
84: PetscLogEventEnd(VEC_CUDACopyToGPU, v, 0, 0, 0);
85: v->offloadmask = PETSC_OFFLOAD_BOTH;
86: }
87: return 0;
88: }
90: /*
91: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
92: */
93: PetscErrorCode VecCUDACopyFromGPU(Vec v)
94: {
95: Vec_CUDA *veccuda;
96: PetscScalar *varray;
99: VecCUDAAllocateCheckHost(v);
100: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
101: PetscLogEventBegin(VEC_CUDACopyFromGPU, v, 0, 0, 0);
102: veccuda = (Vec_CUDA *)v->spptr;
103: varray = veccuda->GPUarray;
104: cudaMemcpy(((Vec_Seq *)v->data)->array, varray, v->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
105: PetscLogGpuToCpu((v->map->n) * sizeof(PetscScalar));
106: PetscLogEventEnd(VEC_CUDACopyFromGPU, v, 0, 0, 0);
107: v->offloadmask = PETSC_OFFLOAD_BOTH;
108: }
109: return 0;
110: }
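/* A minimal sketch (illustrative only, not part of the public API) of how the
   allocate/copy routines above combine into a read accessor; the real accessor
   is VecCUDAGetArrayRead(). */
static inline PetscErrorCode VecCUDAGetArrayRead_Sketch(Vec v, const PetscScalar **a)
{
  VecCUDAAllocateCheck(v); /* make sure the device buffer exists */
  VecCUDACopyToGPU(v);     /* refresh the device copy if only the host copy is current */
  *a = ((Vec_CUDA *)v->spptr)->GPUarray;
  return 0;
}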
112: /*MC
113: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
115: Options Database Keys:
116: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
118: Level: beginner
120: .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateSeqWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`, `VecCreateSeq()`, `VecSetPinnedMemoryMin()`
121: M*/
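/* A minimal usage sketch (illustrative only; error checking omitted for
   brevity): create a sequential CUDA vector of length 10 and fill it with ones. */
static PetscErrorCode VecSeqCUDAUsage_Sketch(void)
{
  Vec v;

  VecCreate(PETSC_COMM_SELF, &v);
  VecSetSizes(v, 10, 10);
  VecSetType(v, VECSEQCUDA); /* or call VecSetFromOptions(v) and pass -vec_type seqcuda */
  VecSet(v, 1.0);
  VecDestroy(&v);
  return 0;
}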
123: PetscErrorCode VecAYPX_SeqCUDA(Vec yin, PetscScalar alpha, Vec xin)
124: {
125: const PetscScalar *xarray;
126: PetscScalar *yarray;
127: PetscBLASInt one = 1, bn = 0;
128: PetscScalar sone = 1.0;
129: cublasHandle_t cublasv2handle;
131: PetscCUBLASGetHandle(&cublasv2handle);
132: PetscBLASIntCast(yin->map->n, &bn);
133: VecCUDAGetArrayRead(xin, &xarray);
134: VecCUDAGetArray(yin, &yarray);
135: PetscLogGpuTimeBegin();
136: if (alpha == (PetscScalar)0.0) {
137: cudaMemcpy(yarray, xarray, bn * sizeof(PetscScalar), cudaMemcpyDeviceToDevice);
138: } else if (alpha == (PetscScalar)1.0) {
139: cublasXaxpy(cublasv2handle, bn, &alpha, xarray, one, yarray, one);
140: PetscLogGpuFlops(1.0 * yin->map->n);
141: } else {
142: cublasXscal(cublasv2handle, bn, &alpha, yarray, one);
143: cublasXaxpy(cublasv2handle, bn, &sone, xarray, one, yarray, one);
144: PetscLogGpuFlops(2.0 * yin->map->n);
145: }
146: PetscLogGpuTimeEnd();
147: VecCUDARestoreArrayRead(xin, &xarray);
148: VecCUDARestoreArray(yin, &yarray);
149: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
150: return 0;
151: }
153: PetscErrorCode VecAXPY_SeqCUDA(Vec yin, PetscScalar alpha, Vec xin)
154: {
155: const PetscScalar *xarray;
156: PetscScalar *yarray;
157: PetscBLASInt one = 1, bn = 0;
158: cublasHandle_t cublasv2handle;
159: PetscBool xiscuda;
161: if (alpha == (PetscScalar)0.0) return 0;
162: PetscCUBLASGetHandle(&cublasv2handle);
163: PetscObjectTypeCompareAny((PetscObject)xin, &xiscuda, VECSEQCUDA, VECMPICUDA, "");
164: if (xiscuda) {
165: PetscBLASIntCast(yin->map->n, &bn);
166: VecCUDAGetArrayRead(xin, &xarray);
167: VecCUDAGetArray(yin, &yarray);
168: PetscLogGpuTimeBegin();
169: cublasXaxpy(cublasv2handle, bn, &alpha, xarray, one, yarray, one);
170: PetscLogGpuTimeEnd();
171: VecCUDARestoreArrayRead(xin, &xarray);
172: VecCUDARestoreArray(yin, &yarray);
173: PetscLogGpuFlops(2.0 * yin->map->n);
174: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
175: } else {
176: VecAXPY_Seq(yin, alpha, xin);
177: }
178: return 0;
179: }
181: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
182: {
183: PetscInt n = xin->map->n;
184: const PetscScalar *xarray = NULL, *yarray = NULL;
185: PetscScalar *warray = NULL;
186: thrust::device_ptr<const PetscScalar> xptr, yptr;
187: thrust::device_ptr<PetscScalar> wptr;
189: if (xin->boundtocpu || yin->boundtocpu) {
190: VecPointwiseDivide_Seq(win, xin, yin);
191: return 0;
192: }
193: VecCUDAGetArrayWrite(win, &warray);
194: VecCUDAGetArrayRead(xin, &xarray);
195: VecCUDAGetArrayRead(yin, &yarray);
196: PetscLogGpuTimeBegin();
197: try {
198: wptr = thrust::device_pointer_cast(warray);
199: xptr = thrust::device_pointer_cast(xarray);
200: yptr = thrust::device_pointer_cast(yarray);
201: thrust::transform(VecCUDAThrustPolicy(win), xptr, xptr + n, yptr, wptr, thrust::divides<PetscScalar>());
202: } catch (char *ex) {
203: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
204: }
205: PetscLogGpuTimeEnd();
206: PetscLogGpuFlops(n);
207: VecCUDARestoreArrayRead(xin, &xarray);
208: VecCUDARestoreArrayRead(yin, &yarray);
209: VecCUDARestoreArrayWrite(win, &warray);
210: return 0;
211: }
213: PetscErrorCode VecWAXPY_SeqCUDA(Vec win, PetscScalar alpha, Vec xin, Vec yin)
214: {
215: const PetscScalar *xarray = NULL, *yarray = NULL;
216: PetscScalar *warray = NULL;
217: PetscBLASInt one = 1, bn = 0;
218: cublasHandle_t cublasv2handle;
219: cudaStream_t stream;
221: PetscCUBLASGetHandle(&cublasv2handle);
222: PetscBLASIntCast(win->map->n, &bn);
223: if (alpha == (PetscScalar)0.0) {
224: VecCopy_SeqCUDA(yin, win);
225: } else {
226: VecCUDAGetArrayRead(xin, &xarray);
227: VecCUDAGetArrayRead(yin, &yarray);
228: VecCUDAGetArrayWrite(win, &warray);
229: PetscLogGpuTimeBegin();
230: cublasGetStream(cublasv2handle, &stream);
231: cudaMemcpyAsync(warray, yarray, win->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice, stream);
232: cublasXaxpy(cublasv2handle, bn, &alpha, xarray, one, warray, one);
233: PetscLogGpuTimeEnd();
234: PetscLogGpuFlops(2 * win->map->n);
235: VecCUDARestoreArrayRead(xin, &xarray);
236: VecCUDARestoreArrayRead(yin, &yarray);
237: VecCUDARestoreArrayWrite(win, &warray);
238: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
239: }
240: return 0;
241: }
243: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv, const PetscScalar *alpha, Vec *y)
244: {
245: PetscInt n = xin->map->n, j;
246: PetscScalar *xarray;
247: const PetscScalar *yarray;
248: PetscBLASInt one = 1, bn = 0;
249: cublasHandle_t cublasv2handle;
251: PetscLogGpuFlops(nv * 2.0 * n);
252: PetscLogCpuToGpuScalar(nv * sizeof(PetscScalar));
253: PetscCUBLASGetHandle(&cublasv2handle);
254: PetscBLASIntCast(n, &bn);
255: VecCUDAGetArray(xin, &xarray);
256: PetscLogGpuTimeBegin();
257: for (j = 0; j < nv; j++) {
258: VecCUDAGetArrayRead(y[j], &yarray);
259: cublasXaxpy(cublasv2handle, bn, alpha + j, yarray, one, xarray, one);
260: VecCUDARestoreArrayRead(y[j], &yarray);
261: }
262: PetscLogGpuTimeEnd();
263: VecCUDARestoreArray(xin, &xarray);
264: return 0;
265: }
267: PetscErrorCode VecDot_SeqCUDA(Vec xin, Vec yin, PetscScalar *z)
268: {
269: const PetscScalar *xarray, *yarray;
270: PetscBLASInt one = 1, bn = 0;
271: cublasHandle_t cublasv2handle;
273: PetscCUBLASGetHandle(&cublasv2handle);
274: PetscBLASIntCast(yin->map->n, &bn);
275: VecCUDAGetArrayRead(xin, &xarray);
276: VecCUDAGetArrayRead(yin, &yarray);
277: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
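/* (PETSc defines VecDot(x,y) = sum_i x_i * conj(y_i), while cublasXdot() conjugates its first vector argument) */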
278: PetscLogGpuTimeBegin();
279: cublasXdot(cublasv2handle, bn, yarray, one, xarray, one, z);
280: PetscLogGpuTimeEnd();
281: if (xin->map->n > 0) PetscLogGpuFlops(2.0 * xin->map->n - 1);
282: PetscLogGpuToCpuScalar(sizeof(PetscScalar));
283: VecCUDARestoreArrayRead(xin, &xarray);
284: VecCUDARestoreArrayRead(yin, &yarray);
285: return 0;
286: }
288: //
289: // CUDA kernels for MDot to follow
290: //
292: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
293: #define MDOT_WORKGROUP_SIZE 128
294: #define MDOT_WORKGROUP_NUM 128
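/* The kernels below implement the first stage of a two-stage reduction: each of
   the MDOT_WORKGROUP_NUM blocks accumulates one partial sum per dot product in
   shared memory and stores it at group_results[blockIdx.x + k * gridDim.x] for
   the k-th y vector; VecMDot_SeqCUDA() then sums the MDOT_WORKGROUP_NUM partial
   results of each dot product on the host. */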
296: #if !defined(PETSC_USE_COMPLEX)
297: // M = 2:
298: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, PetscInt size, PetscScalar *group_results)
299: {
300: __shared__ PetscScalar tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
301: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
302: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
303: PetscInt vec_start_index = blockIdx.x * entries_per_group;
304: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
306: PetscScalar entry_x = 0;
307: PetscScalar group_sum0 = 0;
308: PetscScalar group_sum1 = 0;
309: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
310: entry_x = x[i]; // load only once from global memory!
311: group_sum0 += entry_x * y0[i];
312: group_sum1 += entry_x * y1[i];
313: }
314: tmp_buffer[threadIdx.x] = group_sum0;
315: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
317: // parallel reduction
318: for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
319: __syncthreads();
320: if (threadIdx.x < stride) {
321: tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
322: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
323: }
324: }
326: // write result of group to group_results
327: if (threadIdx.x == 0) {
328: group_results[blockIdx.x] = tmp_buffer[0];
329: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
330: }
331: }
333: // M = 3:
334: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, PetscInt size, PetscScalar *group_results)
335: {
336: __shared__ PetscScalar tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
337: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
338: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
339: PetscInt vec_start_index = blockIdx.x * entries_per_group;
340: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
342: PetscScalar entry_x = 0;
343: PetscScalar group_sum0 = 0;
344: PetscScalar group_sum1 = 0;
345: PetscScalar group_sum2 = 0;
346: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
347: entry_x = x[i]; // load only once from global memory!
348: group_sum0 += entry_x * y0[i];
349: group_sum1 += entry_x * y1[i];
350: group_sum2 += entry_x * y2[i];
351: }
352: tmp_buffer[threadIdx.x] = group_sum0;
353: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
354: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
356: // parallel reduction
357: for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
358: __syncthreads();
359: if (threadIdx.x < stride) {
360: tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
361: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
362: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
363: }
364: }
366: // write result of group to group_results
367: if (threadIdx.x == 0) {
368: group_results[blockIdx.x] = tmp_buffer[0];
369: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
370: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
371: }
372: }
374: // M = 4:
375: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, const PetscScalar *y3, PetscInt size, PetscScalar *group_results)
376: {
377: __shared__ PetscScalar tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
378: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
379: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
380: PetscInt vec_start_index = blockIdx.x * entries_per_group;
381: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
383: PetscScalar entry_x = 0;
384: PetscScalar group_sum0 = 0;
385: PetscScalar group_sum1 = 0;
386: PetscScalar group_sum2 = 0;
387: PetscScalar group_sum3 = 0;
388: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
389: entry_x = x[i]; // load only once from global memory!
390: group_sum0 += entry_x * y0[i];
391: group_sum1 += entry_x * y1[i];
392: group_sum2 += entry_x * y2[i];
393: group_sum3 += entry_x * y3[i];
394: }
395: tmp_buffer[threadIdx.x] = group_sum0;
396: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
397: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
398: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
400: // parallel reduction
401: for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
402: __syncthreads();
403: if (threadIdx.x < stride) {
404: tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
405: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
406: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
407: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 3 * MDOT_WORKGROUP_SIZE];
408: }
409: }
411: // write result of group to group_results
412: if (threadIdx.x == 0) {
413: group_results[blockIdx.x] = tmp_buffer[0];
414: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
415: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
416: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
417: }
418: }
420: // M = 8:
421: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x, const PetscScalar *y0, const PetscScalar *y1, const PetscScalar *y2, const PetscScalar *y3, const PetscScalar *y4, const PetscScalar *y5, const PetscScalar *y6, const PetscScalar *y7, PetscInt size, PetscScalar *group_results)
422: {
423: __shared__ PetscScalar tmp_buffer[8 * MDOT_WORKGROUP_SIZE];
424: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
425: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
426: PetscInt vec_start_index = blockIdx.x * entries_per_group;
427: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
429: PetscScalar entry_x = 0;
430: PetscScalar group_sum0 = 0;
431: PetscScalar group_sum1 = 0;
432: PetscScalar group_sum2 = 0;
433: PetscScalar group_sum3 = 0;
434: PetscScalar group_sum4 = 0;
435: PetscScalar group_sum5 = 0;
436: PetscScalar group_sum6 = 0;
437: PetscScalar group_sum7 = 0;
438: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
439: entry_x = x[i]; // load only once from global memory!
440: group_sum0 += entry_x * y0[i];
441: group_sum1 += entry_x * y1[i];
442: group_sum2 += entry_x * y2[i];
443: group_sum3 += entry_x * y3[i];
444: group_sum4 += entry_x * y4[i];
445: group_sum5 += entry_x * y5[i];
446: group_sum6 += entry_x * y6[i];
447: group_sum7 += entry_x * y7[i];
448: }
449: tmp_buffer[threadIdx.x] = group_sum0;
450: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
451: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
452: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
453: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
454: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
455: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
456: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
458: // parallel reduction
459: for (PetscInt stride = blockDim.x / 2; stride > 0; stride /= 2) {
460: __syncthreads();
461: if (threadIdx.x < stride) {
462: tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
463: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + MDOT_WORKGROUP_SIZE];
464: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 2 * MDOT_WORKGROUP_SIZE];
465: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 3 * MDOT_WORKGROUP_SIZE];
466: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 4 * MDOT_WORKGROUP_SIZE];
467: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 5 * MDOT_WORKGROUP_SIZE];
468: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 6 * MDOT_WORKGROUP_SIZE];
469: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x + stride + 7 * MDOT_WORKGROUP_SIZE];
470: }
471: }
473: // write result of group to group_results
474: if (threadIdx.x == 0) {
475: group_results[blockIdx.x] = tmp_buffer[0];
476: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
477: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
478: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
479: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
480: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
481: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
482: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
483: }
484: }
485: #endif /* !defined(PETSC_USE_COMPLEX) */
487: PetscErrorCode VecMDot_SeqCUDA(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
488: {
489: PetscInt i, n = xin->map->n, current_y_index = 0;
490: const PetscScalar *xptr, *y0ptr, *y1ptr, *y2ptr, *y3ptr, *y4ptr, *y5ptr, *y6ptr, *y7ptr;
491: #if !defined(PETSC_USE_COMPLEX)
492: PetscInt nv1 = ((nv % 4) == 1) ? nv - 1 : nv, j;
493: PetscScalar *group_results_gpu, *group_results_cpu;
494: #endif
495: PetscBLASInt one = 1, bn = 0;
496: cublasHandle_t cublasv2handle;
498: PetscCUBLASGetHandle(&cublasv2handle);
499: PetscBLASIntCast(xin->map->n, &bn);
501: /* Handle the case of local size zero first */
502: if (!xin->map->n) {
503: for (i = 0; i < nv; ++i) z[i] = 0;
504: return 0;
505: }
507: #if !defined(PETSC_USE_COMPLEX)
508: // allocate scratchpad memory for the results of individual work groups:
509: cudaMalloc((void **)&group_results_gpu, nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM);
510: PetscMalloc1(nv1 * MDOT_WORKGROUP_NUM, &group_results_cpu);
511: #endif
512: VecCUDAGetArrayRead(xin, &xptr);
513: PetscLogGpuTimeBegin();
515: while (current_y_index < nv) {
516: switch (nv - current_y_index) {
517: case 7:
518: case 6:
519: case 5:
520: case 4:
521: VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
522: VecCUDAGetArrayRead(yin[current_y_index + 1], &y1ptr);
523: VecCUDAGetArrayRead(yin[current_y_index + 2], &y2ptr);
524: VecCUDAGetArrayRead(yin[current_y_index + 3], &y3ptr);
525: #if defined(PETSC_USE_COMPLEX)
526: cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
527: cublasXdot(cublasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
528: cublasXdot(cublasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
529: cublasXdot(cublasv2handle, bn, y3ptr, one, xptr, one, &z[current_y_index + 3]);
530: #else
531: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM, MDOT_WORKGROUP_SIZE>>>(xptr, y0ptr, y1ptr, y2ptr, y3ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
532: #endif
533: VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
534: VecCUDARestoreArrayRead(yin[current_y_index + 1], &y1ptr);
535: VecCUDARestoreArrayRead(yin[current_y_index + 2], &y2ptr);
536: VecCUDARestoreArrayRead(yin[current_y_index + 3], &y3ptr);
537: current_y_index += 4;
538: break;
540: case 3:
541: VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
542: VecCUDAGetArrayRead(yin[current_y_index + 1], &y1ptr);
543: VecCUDAGetArrayRead(yin[current_y_index + 2], &y2ptr);
545: #if defined(PETSC_USE_COMPLEX)
546: cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
547: cublasXdot(cublasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
548: cublasXdot(cublasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
549: #else
550: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM, MDOT_WORKGROUP_SIZE>>>(xptr, y0ptr, y1ptr, y2ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
551: #endif
552: VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
553: VecCUDARestoreArrayRead(yin[current_y_index + 1], &y1ptr);
554: VecCUDARestoreArrayRead(yin[current_y_index + 2], &y2ptr);
555: current_y_index += 3;
556: break;
558: case 2:
559: VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
560: VecCUDAGetArrayRead(yin[current_y_index + 1], &y1ptr);
561: #if defined(PETSC_USE_COMPLEX)
562: cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
563: cublasXdot(cublasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
564: #else
565: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM, MDOT_WORKGROUP_SIZE>>>(xptr, y0ptr, y1ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
566: #endif
567: VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
568: VecCUDARestoreArrayRead(yin[current_y_index + 1], &y1ptr);
569: current_y_index += 2;
570: break;
572: case 1:
573: VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
574: cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
575: VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
576: current_y_index += 1;
577: break;
579: default: // 8 or more vectors left
580: VecCUDAGetArrayRead(yin[current_y_index], &y0ptr);
581: VecCUDAGetArrayRead(yin[current_y_index + 1], &y1ptr);
582: VecCUDAGetArrayRead(yin[current_y_index + 2], &y2ptr);
583: VecCUDAGetArrayRead(yin[current_y_index + 3], &y3ptr);
584: VecCUDAGetArrayRead(yin[current_y_index + 4], &y4ptr);
585: VecCUDAGetArrayRead(yin[current_y_index + 5], &y5ptr);
586: VecCUDAGetArrayRead(yin[current_y_index + 6], &y6ptr);
587: VecCUDAGetArrayRead(yin[current_y_index + 7], &y7ptr);
588: #if defined(PETSC_USE_COMPLEX)
589: cublasXdot(cublasv2handle, bn, y0ptr, one, xptr, one, &z[current_y_index]);
590: cublasXdot(cublasv2handle, bn, y1ptr, one, xptr, one, &z[current_y_index + 1]);
591: cublasXdot(cublasv2handle, bn, y2ptr, one, xptr, one, &z[current_y_index + 2]);
592: cublasXdot(cublasv2handle, bn, y3ptr, one, xptr, one, &z[current_y_index + 3]);
593: cublasXdot(cublasv2handle, bn, y4ptr, one, xptr, one, &z[current_y_index + 4]);
594: cublasXdot(cublasv2handle, bn, y5ptr, one, xptr, one, &z[current_y_index + 5]);
595: cublasXdot(cublasv2handle, bn, y6ptr, one, xptr, one, &z[current_y_index + 6]);
596: cublasXdot(cublasv2handle, bn, y7ptr, one, xptr, one, &z[current_y_index + 7]);
597: #else
598: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM, MDOT_WORKGROUP_SIZE>>>(xptr, y0ptr, y1ptr, y2ptr, y3ptr, y4ptr, y5ptr, y6ptr, y7ptr, n, group_results_gpu + current_y_index * MDOT_WORKGROUP_NUM);
599: #endif
600: VecCUDARestoreArrayRead(yin[current_y_index], &y0ptr);
601: VecCUDARestoreArrayRead(yin[current_y_index + 1], &y1ptr);
602: VecCUDARestoreArrayRead(yin[current_y_index + 2], &y2ptr);
603: VecCUDARestoreArrayRead(yin[current_y_index + 3], &y3ptr);
604: VecCUDARestoreArrayRead(yin[current_y_index + 4], &y4ptr);
605: VecCUDARestoreArrayRead(yin[current_y_index + 5], &y5ptr);
606: VecCUDARestoreArrayRead(yin[current_y_index + 6], &y6ptr);
607: VecCUDARestoreArrayRead(yin[current_y_index + 7], &y7ptr);
608: current_y_index += 8;
609: break;
610: }
611: }
612: PetscLogGpuTimeEnd();
613: VecCUDARestoreArrayRead(xin, &xptr);
615: #if defined(PETSC_USE_COMPLEX)
616: PetscLogGpuToCpuScalar(nv * sizeof(PetscScalar));
617: #else
618: // copy results to CPU
619: cudaMemcpy(group_results_cpu, group_results_gpu, nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM, cudaMemcpyDeviceToHost);
621: // sum group results into z
622: for (j = 0; j < nv1; ++j) {
623: z[j] = 0;
624: for (i = j * MDOT_WORKGROUP_NUM; i < (j + 1) * MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
625: }
626: PetscLogFlops(nv1 * MDOT_WORKGROUP_NUM);
627: cudaFree(group_results_gpu);
628: PetscFree(group_results_cpu);
629: PetscLogGpuToCpuScalar(nv1 * sizeof(PetscScalar) * MDOT_WORKGROUP_NUM);
630: #endif
631: PetscLogGpuFlops(PetscMax(nv * (2.0 * n - 1), 0.0));
632: return 0;
633: }
635: #undef MDOT_WORKGROUP_SIZE
636: #undef MDOT_WORKGROUP_NUM
638: PetscErrorCode VecSet_SeqCUDA(Vec xin, PetscScalar alpha)
639: {
640: PetscInt n = xin->map->n;
641: PetscScalar *xarray = NULL;
642: thrust::device_ptr<PetscScalar> xptr;
644: VecCUDAGetArrayWrite(xin, &xarray);
645: PetscLogGpuTimeBegin();
646: if (alpha == (PetscScalar)0.0) {
647: cudaMemset(xarray, 0, n * sizeof(PetscScalar));
648: } else {
649: try {
650: xptr = thrust::device_pointer_cast(xarray);
651: thrust::fill(xptr, xptr + n, alpha);
652: } catch (char *ex) {
653: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
654: }
655: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
656: }
657: PetscLogGpuTimeEnd();
658: VecCUDARestoreArrayWrite(xin, &xarray);
659: return 0;
660: }
662: struct PetscScalarReciprocal {
663: __host__ __device__ PetscScalar operator()(const PetscScalar &s) { return (s != (PetscScalar)0.0) ? (PetscScalar)1.0 / s : 0.0; }
664: };
666: PetscErrorCode VecReciprocal_SeqCUDA(Vec v)
667: {
668: PetscInt n;
669: PetscScalar *x;
671: VecGetLocalSize(v, &n);
672: VecCUDAGetArray(v, &x);
673: PetscLogGpuTimeBegin();
674: try {
675: auto xptr = thrust::device_pointer_cast(x);
676: thrust::transform(VecCUDAThrustPolicy(v), xptr, xptr + n, xptr, PetscScalarReciprocal());
677: } catch (char *ex) {
678: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
679: }
680: PetscLogGpuTimeEnd();
681: VecCUDARestoreArray(v, &x);
682: return 0;
683: }
685: PetscErrorCode VecScale_SeqCUDA(Vec xin, PetscScalar alpha)
686: {
687: PetscScalar *xarray;
688: PetscBLASInt one = 1, bn = 0;
689: cublasHandle_t cublasv2handle;
691: if (alpha == (PetscScalar)0.0) {
692: VecSet_SeqCUDA(xin, alpha);
693: } else if (alpha != (PetscScalar)1.0) {
694: PetscCUBLASGetHandle(&cublasv2handle);
695: PetscBLASIntCast(xin->map->n, &bn);
696: VecCUDAGetArray(xin, &xarray);
697: PetscLogGpuTimeBegin();
698: cublasXscal(cublasv2handle, bn, &alpha, xarray, one);
699: PetscLogGpuTimeEnd();
700: VecCUDARestoreArray(xin, &xarray);
701: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
702: PetscLogGpuFlops(xin->map->n);
703: }
704: return 0;
705: }
707: PetscErrorCode VecTDot_SeqCUDA(Vec xin, Vec yin, PetscScalar *z)
708: {
709: const PetscScalar *xarray, *yarray;
710: PetscBLASInt one = 1, bn = 0;
711: cublasHandle_t cublasv2handle;
713: PetscCUBLASGetHandle(&cublasv2handle);
714: PetscBLASIntCast(xin->map->n, &bn);
715: VecCUDAGetArrayRead(xin, &xarray);
716: VecCUDAGetArrayRead(yin, &yarray);
717: PetscLogGpuTimeBegin();
718: cublasXdotu(cublasv2handle, bn, xarray, one, yarray, one, z);
719: PetscLogGpuTimeEnd();
720: if (xin->map->n > 0) PetscLogGpuFlops(2.0 * xin->map->n - 1);
721: PetscLogGpuToCpuScalar(sizeof(PetscScalar));
722: VecCUDARestoreArrayRead(yin, &yarray);
723: VecCUDARestoreArrayRead(xin, &xarray);
724: return 0;
725: }
727: PetscErrorCode VecCopy_SeqCUDA(Vec xin, Vec yin)
728: {
729: const PetscScalar *xarray;
730: PetscScalar *yarray;
732: if (xin != yin) {
733: if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
734: PetscBool yiscuda;
736: PetscObjectTypeCompareAny((PetscObject)yin, &yiscuda, VECSEQCUDA, VECMPICUDA, "");
737: VecCUDAGetArrayRead(xin, &xarray);
738: if (yiscuda) {
739: VecCUDAGetArrayWrite(yin, &yarray);
740: } else {
741: VecGetArrayWrite(yin, &yarray);
742: }
743: PetscLogGpuTimeBegin();
744: if (yiscuda) {
745: cudaMemcpyAsync(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
746: } else {
747: cudaMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
748: PetscLogGpuToCpu((yin->map->n) * sizeof(PetscScalar));
749: }
750: PetscLogGpuTimeEnd();
751: VecCUDARestoreArrayRead(xin, &xarray);
752: if (yiscuda) {
753: VecCUDARestoreArrayWrite(yin, &yarray);
754: } else {
755: VecRestoreArrayWrite(yin, &yarray);
756: }
757: } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
758: /* copy on the CPU if the data is on the CPU */
759: VecCopy_SeqCUDA_Private(xin, yin);
760: } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
761: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to use it next) */
762: if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
763: /* copy on the CPU */
764: VecCopy_SeqCUDA_Private(xin, yin);
765: } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
766: /* copy on the GPU */
767: VecCUDAGetArrayRead(xin, &xarray);
768: VecCUDAGetArrayWrite(yin, &yarray);
769: PetscLogGpuTimeBegin();
770: cudaMemcpyAsync(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
771: PetscLogGpuTimeEnd();
772: VecCUDARestoreArrayRead(xin, &xarray);
773: VecCUDARestoreArrayWrite(yin, &yarray);
774: } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
775: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
776: default to copying on the GPU (this is an arbitrary choice) */
777: VecCUDAGetArrayRead(xin, &xarray);
778: VecCUDAGetArrayWrite(yin, &yarray);
779: PetscLogGpuTimeBegin();
780: cudaMemcpyAsync(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
781: PetscLogGpuTimeEnd();
782: VecCUDARestoreArrayRead(xin, &xarray);
783: VecCUDARestoreArrayWrite(yin, &yarray);
784: } else {
785: VecCopy_SeqCUDA_Private(xin, yin);
786: }
787: }
788: }
789: return 0;
790: }
792: PetscErrorCode VecSwap_SeqCUDA(Vec xin, Vec yin)
793: {
794: PetscBLASInt one = 1, bn = 0;
795: PetscScalar *xarray, *yarray;
796: cublasHandle_t cublasv2handle;
798: PetscCUBLASGetHandle(&cublasv2handle);
799: PetscBLASIntCast(xin->map->n, &bn);
800: if (xin != yin) {
801: VecCUDAGetArray(xin, &xarray);
802: VecCUDAGetArray(yin, &yarray);
803: PetscLogGpuTimeBegin();
804: cublasXswap(cublasv2handle, bn, xarray, one, yarray, one);
805: PetscLogGpuTimeEnd();
806: VecCUDARestoreArray(xin, &xarray);
807: VecCUDARestoreArray(yin, &yarray);
808: }
809: return 0;
810: }
812: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin, PetscScalar alpha, PetscScalar beta, Vec xin)
813: {
814: PetscScalar a = alpha, b = beta;
815: const PetscScalar *xarray;
816: PetscScalar *yarray;
817: PetscBLASInt one = 1, bn = 0;
818: cublasHandle_t cublasv2handle;
820: PetscCUBLASGetHandle(&cublasv2handle);
821: PetscBLASIntCast(yin->map->n, &bn);
822: if (a == (PetscScalar)0.0) {
823: VecScale_SeqCUDA(yin, beta);
824: } else if (b == (PetscScalar)1.0) {
825: VecAXPY_SeqCUDA(yin, alpha, xin);
826: } else if (a == (PetscScalar)1.0) {
827: VecAYPX_SeqCUDA(yin, beta, xin);
828: } else if (b == (PetscScalar)0.0) {
829: VecCUDAGetArrayRead(xin, &xarray);
830: VecCUDAGetArray(yin, &yarray);
831: PetscLogGpuTimeBegin();
832: cudaMemcpy(yarray, xarray, yin->map->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice);
833: cublasXscal(cublasv2handle, bn, &alpha, yarray, one);
834: PetscLogGpuTimeEnd();
835: PetscLogGpuFlops(xin->map->n);
836: PetscLogCpuToGpuScalar(sizeof(PetscScalar));
837: VecCUDARestoreArrayRead(xin, &xarray);
838: VecCUDARestoreArray(yin, &yarray);
839: } else {
840: VecCUDAGetArrayRead(xin, &xarray);
841: VecCUDAGetArray(yin, &yarray);
842: PetscLogGpuTimeBegin();
843: cublasXscal(cublasv2handle, bn, &beta, yarray, one);
844: cublasXaxpy(cublasv2handle, bn, &alpha, xarray, one, yarray, one);
845: PetscLogGpuTimeEnd();
846: VecCUDARestoreArrayRead(xin, &xarray);
847: VecCUDARestoreArray(yin, &yarray);
848: PetscLogGpuFlops(3.0 * xin->map->n);
849: PetscLogCpuToGpuScalar(2 * sizeof(PetscScalar));
850: }
851: return 0;
852: }
854: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin, PetscScalar alpha, PetscScalar beta, PetscScalar gamma, Vec xin, Vec yin)
855: {
856: PetscInt n = zin->map->n;
858: if (gamma == (PetscScalar)1.0) {
859: /* z = ax + b*y + z */
860: VecAXPY_SeqCUDA(zin, alpha, xin);
861: VecAXPY_SeqCUDA(zin, beta, yin);
862: PetscLogGpuFlops(4.0 * n);
863: } else {
864: /* z = a*x + b*y + c*z */
865: VecScale_SeqCUDA(zin, gamma);
866: VecAXPY_SeqCUDA(zin, alpha, xin);
867: VecAXPY_SeqCUDA(zin, beta, yin);
868: PetscLogGpuFlops(5.0 * n);
869: }
870: return 0;
871: }
873: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win, Vec xin, Vec yin)
874: {
875: PetscInt n = win->map->n;
876: const PetscScalar *xarray, *yarray;
877: PetscScalar *warray;
878: thrust::device_ptr<const PetscScalar> xptr, yptr;
879: thrust::device_ptr<PetscScalar> wptr;
881: if (xin->boundtocpu || yin->boundtocpu) {
882: VecPointwiseMult_Seq(win, xin, yin);
883: return 0;
884: }
885: VecCUDAGetArrayRead(xin, &xarray);
886: VecCUDAGetArrayRead(yin, &yarray);
887: VecCUDAGetArrayWrite(win, &warray);
888: PetscLogGpuTimeBegin();
889: try {
890: wptr = thrust::device_pointer_cast(warray);
891: xptr = thrust::device_pointer_cast(xarray);
892: yptr = thrust::device_pointer_cast(yarray);
893: thrust::transform(VecCUDAThrustPolicy(win), xptr, xptr + n, yptr, wptr, thrust::multiplies<PetscScalar>());
894: } catch (char *ex) {
895: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
896: }
897: PetscLogGpuTimeEnd();
898: VecCUDARestoreArrayRead(xin, &xarray);
899: VecCUDARestoreArrayRead(yin, &yarray);
900: VecCUDARestoreArrayWrite(win, &warray);
901: PetscLogGpuFlops(n);
902: return 0;
903: }
905: /* TODO: compute the infinity norm entirely on the GPU instead of via cublasIXamax() plus a host copy */
907: PetscErrorCode VecNorm_SeqCUDA(Vec xin, NormType type, PetscReal *z)
908: {
909: PetscInt n = xin->map->n;
910: PetscBLASInt one = 1, bn = 0;
911: const PetscScalar *xarray;
912: cublasHandle_t cublasv2handle;
914: PetscCUBLASGetHandle(&cublasv2handle);
915: PetscBLASIntCast(n, &bn);
916: if (type == NORM_2 || type == NORM_FROBENIUS) {
917: VecCUDAGetArrayRead(xin, &xarray);
918: PetscLogGpuTimeBegin();
919: cublasXnrm2(cublasv2handle, bn, xarray, one, z);
920: PetscLogGpuTimeEnd();
921: VecCUDARestoreArrayRead(xin, &xarray);
922: PetscLogGpuFlops(PetscMax(2.0 * n - 1, 0.0));
923: } else if (type == NORM_INFINITY) {
924: int i;
925: VecCUDAGetArrayRead(xin, &xarray);
926: PetscLogGpuTimeBegin();
927: cublasIXamax(cublasv2handle, bn, xarray, one, &i);
928: PetscLogGpuTimeEnd();
929: if (bn) {
930: PetscScalar zs;
931: cudaMemcpy(&zs, xarray + i - 1, sizeof(PetscScalar), cudaMemcpyDeviceToHost);
932: *z = PetscAbsScalar(zs);
933: } else *z = 0.0;
934: VecCUDARestoreArrayRead(xin, &xarray);
935: } else if (type == NORM_1) {
936: VecCUDAGetArrayRead(xin, &xarray);
937: PetscLogGpuTimeBegin();
938: cublasXasum(cublasv2handle, bn, xarray, one, z);
939: PetscLogGpuTimeEnd();
940: VecCUDARestoreArrayRead(xin, &xarray);
941: PetscLogGpuFlops(PetscMax(n - 1.0, 0.0));
942: } else if (type == NORM_1_AND_2) {
943: VecNorm_SeqCUDA(xin, NORM_1, z);
944: VecNorm_SeqCUDA(xin, NORM_2, z + 1);
945: }
946: PetscLogGpuToCpuScalar(sizeof(PetscReal));
947: return 0;
948: }
950: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
951: {
952: VecDot_SeqCUDA(s, t, dp);
953: VecDot_SeqCUDA(t, t, nm);
954: return 0;
955: }
957: static PetscErrorCode VecResetPreallocationCOO_SeqCUDA(Vec x)
958: {
959: Vec_CUDA *veccuda = static_cast<Vec_CUDA *>(x->spptr);
961: if (veccuda) {
962: cudaFree(veccuda->jmap1_d);
963: cudaFree(veccuda->perm1_d);
964: }
965: return 0;
966: }
968: PetscErrorCode VecSetPreallocationCOO_SeqCUDA(Vec x, PetscCount ncoo, const PetscInt coo_i[])
969: {
970: Vec_Seq *vecseq = static_cast<Vec_Seq *>(x->data);
971: Vec_CUDA *veccuda = static_cast<Vec_CUDA *>(x->spptr);
972: PetscInt m;
974: VecResetPreallocationCOO_SeqCUDA(x);
975: VecSetPreallocationCOO_Seq(x, ncoo, coo_i);
976: VecGetLocalSize(x, &m);
977: cudaMalloc((void **)&veccuda->jmap1_d, sizeof(PetscCount) * (m + 1));
978: cudaMalloc((void **)&veccuda->perm1_d, sizeof(PetscCount) * vecseq->tot1);
979: cudaMemcpy(veccuda->jmap1_d, vecseq->jmap1, sizeof(PetscCount) * (m + 1), cudaMemcpyHostToDevice);
980: cudaMemcpy(veccuda->perm1_d, vecseq->perm1, sizeof(PetscCount) * vecseq->tot1, cudaMemcpyHostToDevice);
981: return 0;
982: }
984: __global__ static void VecAddCOOValues(const PetscScalar vv[], PetscCount m, const PetscCount jmap1[], const PetscCount perm1[], InsertMode imode, PetscScalar xv[])
985: {
986: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
987: const PetscCount grid_size = gridDim.x * blockDim.x;
988: for (; i < m; i += grid_size) {
989: PetscScalar sum = 0.0;
990: for (PetscCount k = jmap1[i]; k < jmap1[i + 1]; k++) sum += vv[perm1[k]];
991: xv[i] = (imode == INSERT_VALUES ? 0.0 : xv[i]) + sum;
992: }
993: }
995: PetscErrorCode VecSetValuesCOO_SeqCUDA(Vec x, const PetscScalar v[], InsertMode imode)
996: {
997: Vec_Seq *vecseq = static_cast<Vec_Seq *>(x->data);
998: Vec_CUDA *veccuda = static_cast<Vec_CUDA *>(x->spptr);
999: const PetscCount *jmap1 = veccuda->jmap1_d;
1000: const PetscCount *perm1 = veccuda->perm1_d;
1001: PetscScalar *xv;
1002: const PetscScalar *vv = v;
1003: PetscInt m;
1004: PetscMemType memtype;
1006: VecGetLocalSize(x, &m);
1007: PetscGetMemType(v, &memtype);
1009: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, copy it to the device */
1010: cudaMalloc((void **)&vv, vecseq->coo_n * sizeof(PetscScalar));
1011: cudaMemcpy((void *)vv, v, vecseq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
1012: }
1014: if (imode == INSERT_VALUES) VecCUDAGetArrayWrite(x, &xv); /* write vector */
1015: else VecCUDAGetArray(x, &xv); /* read & write vector */
1017: if (m) {
1018: VecAddCOOValues<<<(m + 255) / 256, 256>>>(vv, m, jmap1, perm1, imode, xv);
1019: cudaPeekAtLastError();
1020: }
1021: if (imode == INSERT_VALUES) VecCUDARestoreArrayWrite(x, &xv);
1022: else VecCUDARestoreArray(x, &xv);
1024: if (PetscMemTypeHost(memtype)) cudaFree((void *)vv);
1025: return 0;
1026: }
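/* A minimal COO-assembly usage sketch (illustrative only; error checking
   omitted): repeated indices are accumulated, so x[1] receives 3.0 + 4.0. */
static PetscErrorCode VecCOOUsage_Sketch(Vec x)
{
  const PetscInt    coo_i[] = {0, 1, 1};
  const PetscScalar coo_v[] = {2.0, 3.0, 4.0};

  VecSetPreallocationCOO(x, 3, coo_i);
  VecSetValuesCOO(x, coo_v, ADD_VALUES); /* INSERT_VALUES would overwrite the target entries with the sums instead */
  return 0;
}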
1028: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1029: {
1030: Vec_CUDA *veccuda = (Vec_CUDA *)v->spptr;
1032: if (v->spptr) {
1033: if (veccuda->GPUarray_allocated) {
1034: #if defined(PETSC_HAVE_NVSHMEM)
1035: if (veccuda->nvshmem) {
1036: PetscNvshmemFree(veccuda->GPUarray_allocated);
1037: veccuda->nvshmem = PETSC_FALSE;
1038: } else
1039: #endif
1040: cudaFree(veccuda->GPUarray_allocated);
1041: veccuda->GPUarray_allocated = NULL;
1042: }
1043: if (veccuda->stream) cudaStreamDestroy(veccuda->stream);
1044: VecResetPreallocationCOO_SeqCUDA(v);
1045: }
1046: VecDestroy_SeqCUDA_Private(v);
1047: PetscFree(v->spptr);
1048: return 0;
1049: }
1051: #if defined(PETSC_USE_COMPLEX)
1052: struct conjugate {
1053: __host__ __device__ PetscScalar operator()(const PetscScalar &x) { return PetscConj(x); }
1054: };
1055: #endif
1057: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1058: {
1059: #if defined(PETSC_USE_COMPLEX)
1060: PetscScalar *xarray;
1061: PetscInt n = xin->map->n;
1062: thrust::device_ptr<PetscScalar> xptr;
1064: VecCUDAGetArray(xin, &xarray);
1065: PetscLogGpuTimeBegin();
1066: try {
1067: xptr = thrust::device_pointer_cast(xarray);
1068: thrust::transform(VecCUDAThrustPolicy(xin), xptr, xptr + n, xptr, conjugate());
1069: } catch (char *ex) {
1070: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1071: }
1072: PetscLogGpuTimeEnd();
1073: VecCUDARestoreArray(xin, &xarray);
1074: #else
1075: #endif
1076: return 0;
1077: }
1079: static inline PetscErrorCode VecGetLocalVectorK_SeqCUDA(Vec v, Vec w, PetscBool read)
1080: {
1081: PetscBool wisseqcuda;
1086: PetscObjectTypeCompare((PetscObject)w, VECSEQCUDA, &wisseqcuda);
1087: if (w->data && wisseqcuda) {
1088: if (((Vec_Seq *)w->data)->array_allocated) {
1089: if (w->pinned_memory) PetscMallocSetCUDAHost();
1090: PetscFree(((Vec_Seq *)w->data)->array_allocated);
1091: if (w->pinned_memory) {
1092: PetscMallocResetCUDAHost();
1093: w->pinned_memory = PETSC_FALSE;
1094: }
1095: }
1096: ((Vec_Seq *)w->data)->array = NULL;
1097: ((Vec_Seq *)w->data)->unplacedarray = NULL;
1098: }
1099: if (w->spptr && wisseqcuda) {
1100: if (((Vec_CUDA *)w->spptr)->GPUarray) {
1101: cudaFree(((Vec_CUDA *)w->spptr)->GPUarray);
1102: ((Vec_CUDA *)w->spptr)->GPUarray = NULL;
1103: }
1104: if (((Vec_CUDA *)w->spptr)->stream) cudaStreamDestroy(((Vec_CUDA *)w->spptr)->stream);
1105: PetscFree(w->spptr);
1106: }
1108: if (v->petscnative && wisseqcuda) {
1109: PetscFree(w->data);
1110: w->data = v->data;
1111: w->offloadmask = v->offloadmask;
1112: w->pinned_memory = v->pinned_memory;
1113: w->spptr = v->spptr;
1114: PetscObjectStateIncrease((PetscObject)w);
1115: } else {
1116: if (read) {
1117: VecGetArrayRead(v, (const PetscScalar **)&((Vec_Seq *)w->data)->array);
1118: } else {
1119: VecGetArray(v, &((Vec_Seq *)w->data)->array);
1120: }
1121: w->offloadmask = PETSC_OFFLOAD_CPU;
1122: if (wisseqcuda) VecCUDAAllocateCheck(w);
1123: }
1124: return 0;
1125: }
1127: static inline PetscErrorCode VecRestoreLocalVectorK_SeqCUDA(Vec v, Vec w, PetscBool read)
1128: {
1129: PetscBool wisseqcuda;
1134: PetscObjectTypeCompare((PetscObject)w, VECSEQCUDA, &wisseqcuda);
1135: if (v->petscnative && wisseqcuda) {
1136: v->data = w->data;
1137: v->offloadmask = w->offloadmask;
1138: v->pinned_memory = w->pinned_memory;
1139: v->spptr = w->spptr;
1140: w->data = NULL;
1141: w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1142: w->spptr = NULL;
1143: } else {
1144: if (read) {
1145: VecRestoreArrayRead(v, (const PetscScalar **)&((Vec_Seq *)w->data)->array);
1146: } else {
1147: VecRestoreArray(v, &((Vec_Seq *)w->data)->array);
1148: }
1149: if (w->spptr && wisseqcuda) {
1150: cudaFree(((Vec_CUDA *)w->spptr)->GPUarray);
1151: ((Vec_CUDA *)w->spptr)->GPUarray = NULL;
1152: if (((Vec_CUDA *)w->spptr)->stream) cudaStreamDestroy(((Vec_CUDA *)w->spptr)->stream);
1153: PetscFree(w->spptr);
1154: }
1155: }
1156: return 0;
1157: }
1159: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v, Vec w)
1160: {
1161: VecGetLocalVectorK_SeqCUDA(v, w, PETSC_FALSE);
1162: return 0;
1163: }
1165: PetscErrorCode VecGetLocalVectorRead_SeqCUDA(Vec v, Vec w)
1166: {
1167: VecGetLocalVectorK_SeqCUDA(v, w, PETSC_TRUE);
1168: return 0;
1169: }
1171: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v, Vec w)
1172: {
1173: VecRestoreLocalVectorK_SeqCUDA(v, w, PETSC_FALSE);
1174: return 0;
1175: }
1177: PetscErrorCode VecRestoreLocalVectorRead_SeqCUDA(Vec v, Vec w)
1178: {
1179: VecRestoreLocalVectorK_SeqCUDA(v, w, PETSC_TRUE);
1180: return 0;
1181: }
1183: struct petscrealpart : public thrust::unary_function<PetscScalar, PetscReal> {
1184: __host__ __device__ PetscReal operator()(const PetscScalar &x) { return PetscRealPart(x); }
1185: };
1187: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
1188: __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt> &x) { return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>()); }
1189: };
1191: struct petscmax : public thrust::binary_function<PetscReal, PetscReal, PetscReal> {
1192: __host__ __device__ PetscReal operator()(const PetscReal &x, const PetscReal &y) { return x < y ? y : x; }
1193: };
1195: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
1196: __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt> &x, const thrust::tuple<PetscReal, PetscInt> &y)
1197: {
1198: return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) : (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1199: }
1200: };
1202: struct petscmin : public thrust::binary_function<PetscReal, PetscReal, PetscReal> {
1203: __host__ __device__ PetscReal operator()(const PetscReal &x, const PetscReal &y) { return x < y ? x : y; }
1204: };
1206: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>, thrust::tuple<PetscReal, PetscInt>> {
1207: __host__ __device__ thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt> &x, const thrust::tuple<PetscReal, PetscInt> &y)
1208: {
1209: return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) : (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1210: }
1211: };
1213: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1214: {
1215: PetscInt n = v->map->n;
1216: const PetscScalar *av;
1217: thrust::device_ptr<const PetscScalar> avpt;
1220: if (!n) {
1221: *m = PETSC_MIN_REAL;
1222: if (p) *p = -1;
1223: return 0;
1224: }
1225: VecCUDAGetArrayRead(v, &av);
1226: avpt = thrust::device_pointer_cast(av);
1227: PetscLogGpuTimeBegin();
1228: if (p) {
1229: thrust::tuple<PetscReal, PetscInt> res(PETSC_MIN_REAL, -1);
1230: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt, thrust::counting_iterator<PetscInt>(0)));
1231: try {
1232: #if defined(PETSC_USE_COMPLEX)
1233: res = thrust::transform_reduce(VecCUDAThrustPolicy(v), zibit, zibit + n, petscrealparti(), res, petscmaxi());
1234: #else
1235: res = thrust::reduce(VecCUDAThrustPolicy(v), zibit, zibit + n, res, petscmaxi());
1236: #endif
1237: } catch (char *ex) {
1238: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1239: }
1240: *m = res.get<0>();
1241: *p = res.get<1>();
1242: } else {
1243: try {
1244: #if defined(PETSC_USE_COMPLEX)
1245: *m = thrust::transform_reduce(VecCUDAThrustPolicy(v), avpt, avpt + n, petscrealpart(), PETSC_MIN_REAL, petscmax());
1246: #else
1247: *m = thrust::reduce(VecCUDAThrustPolicy(v), avpt, avpt + n, PETSC_MIN_REAL, petscmax());
1248: #endif
1249: } catch (char *ex) {
1250: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1251: }
1252: }
1253: PetscLogGpuTimeEnd();
1254: VecCUDARestoreArrayRead(v, &av);
1255: return 0;
1256: }
1258: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1259: {
1260: PetscInt n = v->map->n;
1261: const PetscScalar *av;
1262: thrust::device_ptr<const PetscScalar> avpt;
1265: if (!n) {
1266: *m = PETSC_MAX_REAL;
1267: if (p) *p = -1;
1268: return 0;
1269: }
1270: VecCUDAGetArrayRead(v, &av);
1271: avpt = thrust::device_pointer_cast(av);
1272: PetscLogGpuTimeBegin();
1273: if (p) {
1274: thrust::tuple<PetscReal, PetscInt> res(PETSC_MAX_REAL, -1);
1275: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt, thrust::counting_iterator<PetscInt>(0)));
1276: try {
1277: #if defined(PETSC_USE_COMPLEX)
1278: res = thrust::transform_reduce(VecCUDAThrustPolicy(v), zibit, zibit + n, petscrealparti(), res, petscmini());
1279: #else
1280: res = thrust::reduce(VecCUDAThrustPolicy(v), zibit, zibit + n, res, petscmini());
1281: #endif
1282: } catch (char *ex) {
1283: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1284: }
1285: *m = res.get<0>();
1286: *p = res.get<1>();
1287: } else {
1288: try {
1289: #if defined(PETSC_USE_COMPLEX)
1290: *m = thrust::transform_reduce(VecCUDAThrustPolicy(v), avpt, avpt + n, petscrealpart(), PETSC_MAX_REAL, petscmin());
1291: #else
1292: *m = thrust::reduce(VecCUDAThrustPolicy(v), avpt, avpt + n, PETSC_MAX_REAL, petscmin());
1293: #endif
1294: } catch (char *ex) {
1295: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1296: }
1297: }
1298: PetscLogGpuTimeEnd();
1299: VecCUDARestoreArrayRead(v, &av);
1300: return 0;
1301: }
1303: PetscErrorCode VecSum_SeqCUDA(Vec v, PetscScalar *sum)
1304: {
1305: PetscInt n = v->map->n;
1306: const PetscScalar *a;
1307: thrust::device_ptr<const PetscScalar> dptr;
1310: VecCUDAGetArrayRead(v, &a);
1311: dptr = thrust::device_pointer_cast(a);
1312: PetscLogGpuTimeBegin();
1313: try {
1314: *sum = thrust::reduce(VecCUDAThrustPolicy(v), dptr, dptr + n, PetscScalar(0.0));
1315: } catch (char *ex) {
1316: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1317: }
1318: PetscLogGpuTimeEnd();
1319: VecCUDARestoreArrayRead(v, &a);
1320: return 0;
1321: }
1323: struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
1324: const PetscScalar shift_;
1325: petscshift(PetscScalar shift) : shift_(shift) { }
1326: __host__ __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
1327: };
1329: PetscErrorCode VecShift_SeqCUDA(Vec v, PetscScalar shift)
1330: {
1331: PetscInt n = v->map->n;
1332: PetscScalar *a;
1333: thrust::device_ptr<PetscScalar> dptr;
1336: VecCUDAGetArray(v, &a);
1337: dptr = thrust::device_pointer_cast(a);
1338: PetscLogGpuTimeBegin();
1339: try {
1340: thrust::transform(VecCUDAThrustPolicy(v), dptr, dptr + n, dptr, petscshift(shift)); /* in-place transform */
1341: } catch (char *ex) {
1342: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1343: }
1344: PetscLogGpuTimeEnd();
1345: VecCUDARestoreArray(v, &a);
1346: return 0;
1347: }
1349: #if defined(PETSC_HAVE_NVSHMEM)
1350: /* Free the old CUDA array and allocate a new one from the nvshmem symmetric heap.
1351: The new array does not retain the values of the old array. The offload mask is not changed.
1353: Note: this function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1354: */
1355: PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec v)
1356: {
1358: Vec_CUDA *veccuda = (Vec_CUDA *)v->spptr;
1359: PetscInt n;
1361: cudaFree(veccuda->GPUarray_allocated);
1362: VecGetLocalSize(v, &n);
1363: MPIU_Allreduce(MPI_IN_PLACE, &n, 1, MPIU_INT, MPI_MAX, PETSC_COMM_WORLD);
1364: PetscNvshmemMalloc(n * sizeof(PetscScalar), (void **)&veccuda->GPUarray_allocated);
1365: veccuda->GPUarray = veccuda->GPUarray_allocated;
1366: veccuda->nvshmem = PETSC_TRUE;
1367: return 0;
1368: }
1369: #endif