Actual source code: aijcupm.hpp
1: #pragma once
3: /* Shared CUPM (CUDA/HIP) implementations for SeqAIJCUSPARSE and SeqAIJHIPSPARSE
4: that do not depend on the cuSPARSE/hipSPARSE library proper.
6: Include ordering requirement: the vendor-specific impl header
7: (cusparsematimpl.h or hipsparsematimpl.h) must be included before this
8: header so that CsrMatrix, THRUSTINTARRAY*, THRUSTARRAY and all device-specific
9: struct types are visible when this header is processed.
11: Instantiated by:
12: aijcusparse.cu (DeviceType::CUDA, using MatSeqAIJCUSPARSE_Policy)
13: aijhipsparse.hip.cxx (DeviceType::HIP, using MatSeqAIJHIPSPARSE_Policy) */
15: #include <petsc/private/cupmobject.hpp>
16: #include <petsc/private/cupmblasinterface.hpp>
17: #include <petsc/private/matimpl.h>
18: #include <../src/sys/objects/device/impls/cupm/cupmthrustutility.hpp>
19: #include <../src/mat/impls/aij/seq/aij.h>
21: #include <thrust/device_ptr.h>
22: #include <thrust/iterator/counting_iterator.h>
23: #include <thrust/iterator/permutation_iterator.h>
24: #include <thrust/functional.h>
25: #include <thrust/fill.h>
26: #include <thrust/transform.h>
27: #include <thrust/for_each.h>
28: #include <thrust/equal.h>
30: /* Forward declaration of SeqAIJ fallback function used inside template */
31: PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat, Vec);
33: namespace Petsc
34: {
36: namespace mat
37: {
39: namespace aij
40: {
42: namespace cupm
43: {
45: namespace impl
46: {
48: /* --------------------------------------------------------------------------
49: Shared device functor: left-scale CSR rows.
50: cprow[i] gives the logical row index for compressed row i; NULL = identity.
51: -------------------------------------------------------------------------- */
52: struct DiagonalScaleLeft_CSR_Functor {
53: const int *row_ptr;
54: PetscScalar *val_ptr;
55: const PetscScalar *lv_ptr;
56: const PetscInt *cprow;
58: PETSC_HOSTDEVICE_INLINE_DECL void operator()(int i) const
59: {
60: const int row = cprow ? (int)cprow[i] : i;
61: const PetscScalar s = lv_ptr[row];
62: for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) val_ptr[j] *= s;
63: }
64: };
66: /* --------------------------------------------------------------------------
67: Shared device functor: get<1>(t) = get<0>(t).
68: Replaces the identical VecCUDAEquals / VecHIPEquals structs.
69: -------------------------------------------------------------------------- */
70: struct VecCUPMEquals {
71: template <typename Tuple>
72: PETSC_HOSTDEVICE_INLINE_DECL void operator()(Tuple t) const
73: {
74: thrust::get<1>(t) = thrust::get<0>(t);
75: }
76: };
78: /* --------------------------------------------------------------------------
79: Shared __global__ kernel: accumulate COO values into a CSR array.
80: __global__ is valid for both nvcc and hipcc; the body is identical.
81: -------------------------------------------------------------------------- */
82: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
83: {
84: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
85: const PetscCount grid_size = gridDim.x * blockDim.x;
86: for (; i < nnz; i += grid_size) {
87: PetscScalar sum = 0.0;
88: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
89: a[i] = (imode == INSERT_VALUES ? (PetscScalar)0.0 : a[i]) + sum;
90: }
91: }
93: /* --------------------------------------------------------------------------
94: Shared __global__ kernel: extract the CSR diagonal.
95: -------------------------------------------------------------------------- */
96: __global__ void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
97: {
98: const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
100: if (x < (size_t)len) {
101: const PetscInt rowx = row[x], num_non0_row = row[x + 1] - rowx;
102: PetscScalar d = 0.0;
104: for (PetscInt i = 0; i < num_non0_row; i++) {
105: if (col[i + rowx] == (PetscInt)x) {
106: d = val[i + rowx];
107: break;
108: }
109: }
110: diag[x] = d;
111: }
112: }
114: /* ==========================================================================
115: MatSeqAIJCUSPARSE_CUPM<T, Policy>
117: Policy (C++11 traits class) requirements - all static methods:
119: // Device struct types
120: typedef ... mat_struct_type; // Mat_SeqAIJCUSPARSE / Mat_SeqAIJHIPSPARSE
121: typedef ... mult_struct_type; // ...MultStruct equivalent
123: // Storage-format constants (value of each format enumerator)
124: static int storage_format_csr();
125: static int storage_format_ell();
126: static int storage_format_hyb();
128: // Bookkeeping helpers (device-type specific)
129: static PetscErrorCode CopyToGPU(Mat);
130: static PetscErrorCode CopyFromGPU(Mat);
131: static PetscErrorCode InvalidateTranspose(Mat, PetscBool);
132: static PetscErrorCode ConvertFromSeqAIJ(Mat, MatType, MatReuse, Mat *);
133: static const char *mat_type_name; // "seqaijcusparse" / "seqaijhipsparse"
135: // Destruction helpers (device-type specific)
136: static PetscErrorCode Destroy(Mat);
137: static PetscErrorCode TriFactorsDestroy(void **);
139: // Compose-function keys that differ between CUDA and HIP
140: static const char *set_format_c; // "MatCUSPARSESetFormat_C" / "MatHIPSPARSESetFormat_C"
141: static const char *set_use_cpu_solve_c; // "MatCUSPARSESetUseCPUSolve_C" / "MatHIPSPARSESetUseCPUSolve_C"
142: static const char *product_seqdense_device_c; // "...seqdensecuda_C" / "...seqdensehip_C"
143: static const char *product_seqdense_c; // "...seqdense_C"
144: static const char *product_self_c; // "...seqaijcusparse_C" / "...seqaijhipsparse_C"
145: static const char *seq_convert_hypre_c; // "MatConvert_seqaijcusparse_hypre_C" / "_seqaijhipsparse_hypre_C"
147: // Vec device-array access (device-type specific)
148: static PetscErrorCode VecGetArrayRead (Vec, const PetscScalar **);
149: static PetscErrorCode VecRestoreArrayRead(Vec, const PetscScalar **);
150: static PetscErrorCode VecGetArrayWrite (Vec, PetscScalar **);
151: static PetscErrorCode VecRestoreArrayWrite(Vec, PetscScalar **);
152: ========================================================================== */
154: template <device::cupm::DeviceType T, typename Policy>
155: struct MatSeqAIJCUSPARSE_CUPM : device::cupm::impl::CUPMObject<T> {
156: PETSC_CUPMOBJECT_HEADER(T);
158: typedef typename Policy::mat_struct_type MatStructType;
159: typedef typename Policy::mult_struct_type MultStructType;
161: /* -------------------------------------------------------------------
162: Tier 1 - Trivial
163: ------------------------------------------------------------------- */
165: /* MatAssemblyEnd: delegation to SeqAIJ */
166: static PetscErrorCode AssemblyEnd(Mat A, MatAssemblyType mode) noexcept
167: {
168: PetscFunctionBegin;
169: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
170: PetscFunctionReturn(PETSC_SUCCESS);
171: }
173: /* MatDuplicate */
174: static PetscErrorCode Duplicate(Mat A, MatDuplicateOption cpvalues, Mat *B) noexcept
175: {
176: PetscFunctionBegin;
177: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
178: PetscCall(Policy::ConvertFromSeqAIJ(*B, Policy::mat_type_name, MAT_INPLACE_MATRIX, B));
179: PetscFunctionReturn(PETSC_SUCCESS);
180: }
182: /* MatGetCurrentMemType */
183: static PetscErrorCode GetCurrentMemType(PETSC_UNUSED Mat A, PetscMemType *m) noexcept
184: {
185: PetscFunctionBegin;
186: *m = PETSC_MEMTYPE_CUPM();
187: PetscFunctionReturn(PETSC_SUCCESS);
188: }
190: /* MatCOOStructDestroy: free device jmap and perm fields */
191: static PetscErrorCode COOStructDestroy(PetscCtxRt ctx) noexcept
192: {
193: MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)ctx;
195: PetscFunctionBegin;
196: PetscCallCUPM(cupmFree(coo->perm));
197: PetscCallCUPM(cupmFree(coo->jmap));
198: PetscCall(PetscFree(coo));
199: PetscFunctionReturn(PETSC_SUCCESS);
200: }
202: /* -------------------------------------------------------------------
203: Tier 2 - Straightforward
204: ------------------------------------------------------------------- */
206: /* MatZeroEntries: fill device CSR values with zero */
207: static PetscErrorCode ZeroEntries(Mat A) noexcept
208: {
209: PetscBool gpu = PETSC_FALSE;
210: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
211: MatStructType *spptr;
213: PetscFunctionBegin;
214: if (A->factortype == MAT_FACTOR_NONE) {
215: spptr = (MatStructType *)A->spptr;
216: if (spptr->mat) {
217: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
218: if (matrix->values) {
219: gpu = PETSC_TRUE;
220: PetscCallThrust(thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), (PetscScalar)0.));
221: }
222: }
223: if (spptr->matTranspose) {
224: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
225: if (matrix->values) PetscCallThrust(thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), (PetscScalar)0.));
226: }
227: }
228: if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU;
229: else {
230: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
231: A->offloadmask = PETSC_OFFLOAD_CPU;
232: }
233: PetscFunctionReturn(PETSC_SUCCESS);
234: }
236: /* MatScale: cupmBlasXscal on the device CSR values */
237: static PetscErrorCode Scale(Mat Y, PetscScalar a) noexcept
238: {
239: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
240: PetscScalar *ay = nullptr;
241: cupmBlasHandle_t blashandle;
242: PetscBLASInt one = 1, bnz = 1;
244: PetscFunctionBegin;
245: PetscCall(GetArray(Y, &ay));
246: PetscCall(GetHandles_(&blashandle));
247: PetscCall(PetscBLASIntCast(y->nz, &bnz));
248: PetscCall(PetscLogGpuTimeBegin());
249: PetscCallCUPMBLAS(cupmBlasXscal(blashandle, bnz, cupmScalarPtrCast(&a), cupmScalarPtrCast(ay), one));
250: PetscCall(PetscLogGpuFlops(bnz));
251: PetscCall(PetscLogGpuTimeEnd());
252: PetscCall(RestoreArray(Y, &ay));
253: PetscFunctionReturn(PETSC_SUCCESS);
254: }
256: /* MatDiagonalScale: Thrust-based left and right scaling of CSR values */
257: static PetscErrorCode DiagonalScale(Mat A, Vec ll, Vec rr) noexcept
258: {
259: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;
260: MatStructType *devstruct;
261: CsrMatrix *csr;
262: PetscScalar *av = nullptr;
263: PetscInt m, n, nz = aij->nz;
264: cupmStream_t stream;
266: PetscFunctionBegin;
267: PetscCall(GetHandles_(&stream));
268: PetscCall(PetscLogGpuTimeBegin());
269: PetscCall(GetArray(A, &av));
270: devstruct = (MatStructType *)A->spptr;
271: csr = (CsrMatrix *)devstruct->mat->mat;
272: if (ll) {
273: const PetscScalar *lv;
274: PetscCall(VecGetLocalSize(ll, &m));
275: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
276: PetscCall(Policy::VecGetArrayRead(ll, &lv));
277: {
278: const PetscInt *cprow = devstruct->mat->cprowIndices ? devstruct->mat->cprowIndices->data().get() : NULL;
279: DiagonalScaleLeft_CSR_Functor functor = {csr->row_offsets->data().get(), av, lv, cprow};
280: PetscCallThrust(THRUST_CALL(thrust::for_each, stream, thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(csr->num_rows), functor));
281: }
282: PetscCall(Policy::VecRestoreArrayRead(ll, &lv));
283: PetscCall(PetscLogGpuFlops(nz));
284: }
285: if (rr) {
286: const PetscScalar *rv;
287: PetscCall(VecGetLocalSize(rr, &n));
288: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
289: PetscCall(Policy::VecGetArrayRead(rr, &rv));
290: #if PetscDefined(USING_NVCC) && CCCL_VERSION >= 3001000
291: PetscCallThrust(THRUST_CALL(thrust::transform, stream, csr->values->begin(), csr->values->end(), thrust::make_permutation_iterator(thrust::device_pointer_cast(rv), csr->column_indices->begin()), csr->values->begin(), cuda::std::multiplies<PetscScalar>()));
292: #else
293: PetscCallThrust(THRUST_CALL(thrust::transform, stream, csr->values->begin(), csr->values->end(), thrust::make_permutation_iterator(thrust::device_pointer_cast(rv), csr->column_indices->begin()), csr->values->begin(), thrust::multiplies<PetscScalar>()));
294: #endif
295: PetscCall(Policy::VecRestoreArrayRead(rr, &rv));
296: PetscCall(PetscLogGpuFlops(nz));
297: }
298: PetscCall(RestoreArray(A, &av));
299: PetscCall(PetscLogGpuTimeEnd());
300: PetscFunctionReturn(PETSC_SUCCESS);
301: }
303: /* MatSeqAIJGetIJ: return device CSR row-pointer and column-index arrays */
304: static PetscErrorCode GetIJ(Mat A, PetscBool compressed, const int **i, const int **j) noexcept
305: {
306: MatStructType *cusp = (MatStructType *)A->spptr;
307: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
308: CsrMatrix *csr;
310: PetscFunctionBegin;
312: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
313: PetscCheckTypeName(A, Policy::mat_type_name);
314: PetscCheck(cusp->format != (decltype(cusp->format))Policy::storage_format_ell() && cusp->format != (decltype(cusp->format))Policy::storage_format_hyb(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
315: PetscCall(Policy::CopyToGPU(A));
316: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing MultStruct");
317: csr = (CsrMatrix *)cusp->mat->mat;
318: if (i) {
319: if (!compressed && a->compressedrow.use) { /* need full row offset */
320: if (!cusp->rowoffsets_gpu) {
321: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
322: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
323: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
324: }
325: *i = cusp->rowoffsets_gpu->data().get();
326: } else *i = csr->row_offsets->data().get();
327: }
328: if (j) *j = csr->column_indices->data().get();
329: PetscFunctionReturn(PETSC_SUCCESS);
330: }
332: /* MatSeqAIJRestoreIJ: nullify the pointers previously obtained with GetIJ */
333: static PetscErrorCode RestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) noexcept
334: {
335: PetscFunctionBegin;
337: PetscCheckTypeName(A, Policy::mat_type_name);
338: if (i) *i = NULL;
339: if (j) *j = NULL;
340: (void)compressed;
341: PetscFunctionReturn(PETSC_SUCCESS);
342: }
344: /* MatSetPreallocationCOO: copy COO bookkeeping struct to device */
345: static PetscErrorCode SetPreallocationCOO(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) noexcept
346: {
347: PetscBool dev_ij = PETSC_FALSE;
348: PetscMemType mtype = PETSC_MEMTYPE_HOST;
349: PetscInt *i, *j;
350: PetscContainer container_h;
351: MatCOOStruct_SeqAIJ *coo_h, *coo_d;
353: PetscFunctionBegin;
354: PetscCall(PetscGetMemType(coo_i, &mtype));
355: if (PetscMemTypeDevice(mtype)) {
356: dev_ij = PETSC_TRUE;
357: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
358: PetscCallCUPM(cupmMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cupmMemcpyDeviceToHost));
359: PetscCallCUPM(cupmMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cupmMemcpyDeviceToHost));
360: } else {
361: i = coo_i;
362: j = coo_j;
363: }
364: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
365: if (dev_ij) PetscCall(PetscFree2(i, j));
366: mat->offloadmask = PETSC_OFFLOAD_CPU;
367: /* Create the GPU memory */
368: PetscCall(Policy::CopyToGPU(mat));
370: /* Copy the COO struct to device */
371: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
372: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
373: PetscCall(PetscMalloc1(1, &coo_d));
374: *coo_d = *coo_h; /* shallow copy; device fields amended below */
375: PetscCallCUPM(cupmMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
376: PetscCallCUPM(cupmMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cupmMemcpyHostToDevice));
377: PetscCallCUPM(cupmMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
378: PetscCallCUPM(cupmMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cupmMemcpyHostToDevice));
380: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatSeqAIJCUSPARSE_CUPM::COOStructDestroy));
381: PetscFunctionReturn(PETSC_SUCCESS);
382: }
384: /* MatSetValuesCOO: launch MatAddCOOValues kernel */
385: static PetscErrorCode SetValuesCOO(Mat A, const PetscScalar v[], InsertMode imode) noexcept
386: {
387: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
388: MatStructType *dev = (MatStructType *)A->spptr;
389: PetscCount Annz = seq->nz;
390: PetscMemType memtype;
391: const PetscScalar *v1 = v;
392: PetscScalar *Aa = nullptr;
393: PetscContainer container;
394: MatCOOStruct_SeqAIJ *coo;
395: cupmStream_t stream;
397: PetscFunctionBegin;
398: if (!dev->mat) PetscCall(Policy::CopyToGPU(A));
400: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
401: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
403: PetscCall(PetscGetMemType(v, &memtype));
404: if (PetscMemTypeHost(memtype)) { /* copy host values to device */
405: PetscCallCUPM(cupmMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
406: PetscCallCUPM(cupmMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cupmMemcpyHostToDevice));
407: }
409: if (imode == INSERT_VALUES) PetscCall(GetArrayWrite(A, &Aa));
410: else PetscCall(GetArray(A, &Aa));
412: PetscCall(GetHandles_(&stream));
413: PetscCall(PetscLogGpuTimeBegin());
414: if (Annz) {
415: PetscCallCUPM(cupmLaunchKernel(MatAddCOOValues, (unsigned int)((Annz + 255) / 256), 256u, (size_t)0, stream, v1, Annz, coo->jmap, coo->perm, imode, Aa));
416: PetscCallCUPM(cupmGetLastError());
417: }
418: PetscCall(PetscLogGpuTimeEnd());
420: if (imode == INSERT_VALUES) PetscCall(RestoreArrayWrite(A, &Aa));
421: else PetscCall(RestoreArray(A, &Aa));
423: if (PetscMemTypeHost(memtype)) {
424: void *v1_device = (void *)v1;
425: PetscCallCUPM(cupmFree(v1_device));
426: }
427: PetscFunctionReturn(PETSC_SUCCESS);
428: }
430: /* MatSeqAIJCopySubArray: scatter-gather a sub-array of CSR values */
431: static PetscErrorCode CopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) noexcept
432: {
433: const PetscScalar *av = nullptr;
434: PetscMemType mtype;
435: PetscBool dmem;
437: PetscFunctionBegin;
438: PetscCall(PetscCUPMGetMemType(v, &mtype));
439: dmem = PetscMemTypeDevice(mtype);
440: PetscCall(GetArrayRead(A, &av));
441: if (n && idx) {
442: THRUSTINTARRAY widx(n);
443: widx.assign(idx, idx + n);
444: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
446: THRUSTARRAY *w = NULL;
447: thrust::device_ptr<PetscScalar> dv;
448: if (dmem) {
449: dv = thrust::device_pointer_cast(v);
450: } else {
451: w = new THRUSTARRAY(n);
452: dv = w->data();
453: }
454: {
455: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
456: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
457: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
458: PetscCallThrust(thrust::for_each(zibit, zieit, VecCUPMEquals{}));
459: }
460: if (w) PetscCallCUPM(cupmMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cupmMemcpyDeviceToHost));
461: delete w;
462: } else {
463: PetscCallCUPM(cupmMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost));
464: }
465: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
466: PetscCall(RestoreArrayRead(A, &av));
467: PetscFunctionReturn(PETSC_SUCCESS);
468: }
470: /* -------------------------------------------------------------------
471: Tier 3 - AXPY shared branches (SAME_NZ and DIFFERENT_NZ only).
472: The SUBSET_NZ branch calls cuSPARSE/hipSPARSE and stays in the caller.
473: ------------------------------------------------------------------- */
475: /* AXPY SAME_NONZERO_PATTERN branch: cupmBlasXaxpy */
476: static PetscErrorCode AXPY_SameNZ(Mat Y, PetscScalar a, Mat X) noexcept
477: {
478: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data;
479: const PetscScalar *ax = nullptr;
480: PetscScalar *ay = nullptr;
481: cupmBlasHandle_t blashandle;
482: PetscBLASInt one = 1, bnz = 1;
484: PetscFunctionBegin;
485: PetscCall(GetArrayRead(X, &ax));
486: PetscCall(GetArray(Y, &ay));
487: PetscCall(GetHandles_(&blashandle));
488: PetscCall(PetscBLASIntCast(x->nz, &bnz));
489: PetscCall(PetscLogGpuTimeBegin());
490: PetscCallCUPMBLAS(cupmBlasXaxpy(blashandle, bnz, cupmScalarPtrCast(&a), cupmScalarPtrCast(ax), one, cupmScalarPtrCast(ay), one));
491: PetscCall(PetscLogGpuFlops(2.0 * bnz));
492: PetscCall(PetscLogGpuTimeEnd());
493: PetscCall(RestoreArrayRead(X, &ax));
494: PetscCall(RestoreArray(Y, &ay));
495: PetscFunctionReturn(PETSC_SUCCESS);
496: }
498: /* GetDiagonal: kernel-based extraction of the CSR diagonal */
499: static PetscErrorCode GetDiagonal(Mat A, Vec diag) noexcept
500: {
501: MatStructType *devstruct = (MatStructType *)A->spptr;
502: MultStructType *matstruct = (MultStructType *)devstruct->mat;
503: PetscScalar *darray;
504: cupmStream_t stream;
506: PetscFunctionBegin;
507: if (A->offloadmask == PETSC_OFFLOAD_BOTH || A->offloadmask == PETSC_OFFLOAD_GPU) {
508: PetscInt n = A->rmap->n;
509: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
511: PetscCheck(devstruct->format == (decltype(devstruct->format))Policy::storage_format_csr(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
512: if (n > 0) {
513: PetscCall(Policy::VecGetArrayWrite(diag, &darray));
514: PetscCall(GetHandles_(&stream));
515: PetscCallCUPM(cupmLaunchKernel(GetDiagonal_CSR, (unsigned int)((n + 255) / 256), 256u, (size_t)0, stream, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), n, darray));
516: PetscCallCUPM(cupmGetLastError());
517: PetscCall(Policy::VecRestoreArrayWrite(diag, &darray));
518: }
519: } else {
520: PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
521: }
522: PetscFunctionReturn(PETSC_SUCCESS);
523: }
525: /* -------------------------------------------------------------------
526: Tier 4 - Device array access (moved here from vendor files so both
527: SeqAIJCUSPARSE and SeqAIJHIPSPARSE share one implementation).
528: ------------------------------------------------------------------- */
530: /* GetArrayRead: read-only access to device CSR value array */
531: static PetscErrorCode GetArrayRead(Mat A, const PetscScalar **a) noexcept
532: {
533: MatStructType *cusp = (MatStructType *)A->spptr;
534: CsrMatrix *csr;
536: PetscFunctionBegin;
538: PetscAssertPointer(a, 2);
539: PetscCheckTypeName(A, Policy::mat_type_name);
540: PetscCheck(cusp->format != (decltype(cusp->format))Policy::storage_format_ell() && cusp->format != (decltype(cusp->format))Policy::storage_format_hyb(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
541: PetscCall(Policy::CopyToGPU(A));
542: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing MultStruct");
543: csr = (CsrMatrix *)cusp->mat->mat;
544: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing device memory");
545: *a = csr->values->data().get();
546: PetscFunctionReturn(PETSC_SUCCESS);
547: }
549: /* RestoreArrayRead: release read-only access obtained from GetArrayRead */
550: static PetscErrorCode RestoreArrayRead(Mat A, const PetscScalar **a) noexcept
551: {
552: PetscFunctionBegin;
554: PetscAssertPointer(a, 2);
555: PetscCheckTypeName(A, Policy::mat_type_name);
556: *a = NULL;
557: PetscFunctionReturn(PETSC_SUCCESS);
558: }
560: /* GetArray: read-write access to device CSR value array */
561: static PetscErrorCode GetArray(Mat A, PetscScalar **a) noexcept
562: {
563: MatStructType *cusp = (MatStructType *)A->spptr;
564: CsrMatrix *csr;
566: PetscFunctionBegin;
568: PetscAssertPointer(a, 2);
569: PetscCheckTypeName(A, Policy::mat_type_name);
570: PetscCheck(cusp->format != (decltype(cusp->format))Policy::storage_format_ell() && cusp->format != (decltype(cusp->format))Policy::storage_format_hyb(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
571: PetscCall(Policy::CopyToGPU(A));
572: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing MultStruct");
573: csr = (CsrMatrix *)cusp->mat->mat;
574: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing device memory");
575: *a = csr->values->data().get();
576: A->offloadmask = PETSC_OFFLOAD_GPU;
577: PetscCall(Policy::InvalidateTranspose(A, PETSC_FALSE));
578: PetscFunctionReturn(PETSC_SUCCESS);
579: }
581: /* RestoreArray: restore read-write access obtained from GetArray */
582: static PetscErrorCode RestoreArray(Mat A, PetscScalar **a) noexcept
583: {
584: PetscFunctionBegin;
586: PetscAssertPointer(a, 2);
587: PetscCheckTypeName(A, Policy::mat_type_name);
588: PetscCall(PetscObjectStateIncrease((PetscObject)A));
589: *a = NULL;
590: PetscFunctionReturn(PETSC_SUCCESS);
591: }
593: /* GetArrayWrite: write-only access to device CSR value array (no host-to-device copy) */
594: static PetscErrorCode GetArrayWrite(Mat A, PetscScalar **a) noexcept
595: {
596: MatStructType *cusp = (MatStructType *)A->spptr;
597: CsrMatrix *csr;
599: PetscFunctionBegin;
601: PetscAssertPointer(a, 2);
602: PetscCheckTypeName(A, Policy::mat_type_name);
603: PetscCheck(cusp->format != (decltype(cusp->format))Policy::storage_format_ell() && cusp->format != (decltype(cusp->format))Policy::storage_format_hyb(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
604: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing MultStruct");
605: csr = (CsrMatrix *)cusp->mat->mat;
606: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing device memory");
607: *a = csr->values->data().get();
608: A->offloadmask = PETSC_OFFLOAD_GPU;
609: PetscCall(Policy::InvalidateTranspose(A, PETSC_FALSE));
610: PetscFunctionReturn(PETSC_SUCCESS);
611: }
613: /* RestoreArrayWrite: restore write-only access obtained from GetArrayWrite */
614: static PetscErrorCode RestoreArrayWrite(Mat A, PetscScalar **a) noexcept
615: {
616: PetscFunctionBegin;
618: PetscAssertPointer(a, 2);
619: PetscCheckTypeName(A, Policy::mat_type_name);
620: PetscCall(PetscObjectStateIncrease((PetscObject)A));
621: *a = NULL;
622: PetscFunctionReturn(PETSC_SUCCESS);
623: }
625: /* SeqAIJGetArray: copy GPU-to-CPU then return host value array (ops->getarray) */
626: static PetscErrorCode SeqAIJGetArray(Mat A, PetscScalar *array[]) noexcept
627: {
628: PetscFunctionBegin;
629: PetscCall(Policy::CopyFromGPU(A));
630: *array = ((Mat_SeqAIJ *)A->data)->a;
631: PetscFunctionReturn(PETSC_SUCCESS);
632: }
634: /* SeqAIJRestoreArray: mark matrix data CPU-valid (ops->restorearray) */
635: static PetscErrorCode SeqAIJRestoreArray(Mat A, PetscScalar *array[]) noexcept
636: {
637: PetscFunctionBegin;
638: A->offloadmask = PETSC_OFFLOAD_CPU;
639: *array = NULL;
640: PetscFunctionReturn(PETSC_SUCCESS);
641: }
643: /* SeqAIJGetArrayRead: copy GPU-to-CPU then return host value array read-only (ops->getarrayread) */
644: static PetscErrorCode SeqAIJGetArrayRead(Mat A, const PetscScalar *array[]) noexcept
645: {
646: PetscFunctionBegin;
647: PetscCall(Policy::CopyFromGPU(A));
648: *array = ((Mat_SeqAIJ *)A->data)->a;
649: PetscFunctionReturn(PETSC_SUCCESS);
650: }
652: /* SeqAIJRestoreArrayRead: release read-only host array (ops->restorearrayread) */
653: static PetscErrorCode SeqAIJRestoreArrayRead(Mat /*A*/, const PetscScalar *array[]) noexcept
654: {
655: PetscFunctionBegin;
656: *array = NULL;
657: PetscFunctionReturn(PETSC_SUCCESS);
658: }
660: /* SeqAIJGetArrayWrite: return host value array for write-only access (ops->getarraywrite) */
661: static PetscErrorCode SeqAIJGetArrayWrite(Mat A, PetscScalar *array[]) noexcept
662: {
663: PetscFunctionBegin;
664: *array = ((Mat_SeqAIJ *)A->data)->a;
665: PetscFunctionReturn(PETSC_SUCCESS);
666: }
668: /* SeqAIJRestoreArrayWrite: mark matrix data CPU-valid after write (ops->restorearraywrite) */
669: static PetscErrorCode SeqAIJRestoreArrayWrite(Mat A, PetscScalar *array[]) noexcept
670: {
671: PetscFunctionBegin;
672: A->offloadmask = PETSC_OFFLOAD_CPU;
673: *array = NULL;
674: PetscFunctionReturn(PETSC_SUCCESS);
675: }
677: /* CreateSeqAIJ: allocate and preallocate a seq sparse matrix of this type */
678: static PetscErrorCode CreateSeqAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) noexcept
679: {
680: PetscFunctionBegin;
681: PetscCall(MatCreate(comm, A));
682: PetscCall(MatSetSizes(*A, m, n, m, n));
683: PetscCall(MatSetType(*A, Policy::mat_type_name));
684: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
685: PetscFunctionReturn(PETSC_SUCCESS);
686: }
688: /* MatDestroy: free vendor-specific state, deregister composed functions */
689: static PetscErrorCode Destroy(Mat A) noexcept
690: {
691: PetscFunctionBegin;
692: if (A->factortype == MAT_FACTOR_NONE) PetscCall(Policy::Destroy(A));
693: else PetscCall(Policy::TriFactorsDestroy(&A->spptr));
694: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
695: PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::set_format_c, NULL));
696: PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::set_use_cpu_solve_c, NULL));
697: PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::product_seqdense_device_c, NULL));
698: PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::product_seqdense_c, NULL));
699: PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::product_self_c, NULL));
700: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
701: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
702: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
703: PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::seq_convert_hypre_c, NULL));
704: PetscCall(MatDestroy_SeqAIJ(A));
705: PetscFunctionReturn(PETSC_SUCCESS);
706: }
707: };
709: } // namespace impl
711: } // namespace cupm
713: } // namespace aij
715: } // namespace mat
717: } // namespace Petsc