Actual source code: aijcupm.hpp

  1: #pragma once

  3: /* Shared CUPM (CUDA/HIP) implementations for SeqAIJCUSPARSE and SeqAIJHIPSPARSE
  4:    that do not depend on the cuSPARSE/hipSPARSE library proper.

  6:    Include ordering requirement: the vendor-specific impl header
  7:    (cusparsematimpl.h or hipsparsematimpl.h) must be included before this
  8:    header so that CsrMatrix, THRUSTINTARRAY*, THRUSTARRAY and all device-specific
  9:    struct types are visible when this header is processed.

 11:    Instantiated by:
 12:      aijcusparse.cu      (DeviceType::CUDA, using MatSeqAIJCUSPARSE_Policy)
 13:      aijhipsparse.hip.cxx (DeviceType::HIP,  using MatSeqAIJHIPSPARSE_Policy) */

 15: #include <petsc/private/cupmobject.hpp>
 16: #include <petsc/private/cupmblasinterface.hpp>
 17: #include <petsc/private/matimpl.h>
 18: #include <../src/sys/objects/device/impls/cupm/cupmthrustutility.hpp>
 19: #include <../src/mat/impls/aij/seq/aij.h>

 21: #include <thrust/device_ptr.h>
 22: #include <thrust/iterator/counting_iterator.h>
 23: #include <thrust/iterator/permutation_iterator.h>
 24: #include <thrust/functional.h>
 25: #include <thrust/fill.h>
 26: #include <thrust/transform.h>
 27: #include <thrust/for_each.h>
 28: #include <thrust/equal.h>

 30: /* Forward declaration of SeqAIJ fallback function used inside template */
 31: PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat, Vec);

 33: namespace Petsc
 34: {

 36: namespace mat
 37: {

 39: namespace aij
 40: {

 42: namespace cupm
 43: {

 45: namespace impl
 46: {

 48: /* --------------------------------------------------------------------------
 49:    Shared device functor: left-scale CSR rows.
 50:    cprow[i] gives the logical row index for compressed row i; NULL = identity.
 51:    -------------------------------------------------------------------------- */
 52: struct DiagonalScaleLeft_CSR_Functor {
 53:   const int         *row_ptr;
 54:   PetscScalar       *val_ptr;
 55:   const PetscScalar *lv_ptr;
 56:   const PetscInt    *cprow;

 58:   PETSC_HOSTDEVICE_INLINE_DECL void operator()(int i) const
 59:   {
 60:     const int         row = cprow ? (int)cprow[i] : i;
 61:     const PetscScalar s   = lv_ptr[row];
 62:     for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) val_ptr[j] *= s;
 63:   }
 64: };

 66: /* --------------------------------------------------------------------------
 67:    Shared device functor: get<1>(t) = get<0>(t).
 68:    Replaces the identical VecCUDAEquals / VecHIPEquals structs.
 69:    -------------------------------------------------------------------------- */
 70: struct VecCUPMEquals {
 71:   template <typename Tuple>
 72:   PETSC_HOSTDEVICE_INLINE_DECL void operator()(Tuple t) const
 73:   {
 74:     thrust::get<1>(t) = thrust::get<0>(t);
 75:   }
 76: };

 78: /* --------------------------------------------------------------------------
 79:    Shared __global__ kernel: accumulate COO values into a CSR array.
 80:    __global__ is valid for both nvcc and hipcc; the body is identical.
 81:    -------------------------------------------------------------------------- */
 82: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
 83: {
 84:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
 85:   const PetscCount grid_size = gridDim.x * blockDim.x;
 86:   for (; i < nnz; i += grid_size) {
 87:     PetscScalar sum = 0.0;
 88:     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
 89:     a[i] = (imode == INSERT_VALUES ? (PetscScalar)0.0 : a[i]) + sum;
 90:   }
 91: }

 93: /* --------------------------------------------------------------------------
 94:    Shared __global__ kernel: extract the CSR diagonal.
 95:    -------------------------------------------------------------------------- */
 96: __global__ void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
 97: {
 98:   const size_t x = blockIdx.x * blockDim.x + threadIdx.x;

100:   if (x < (size_t)len) {
101:     const PetscInt rowx = row[x], num_non0_row = row[x + 1] - rowx;
102:     PetscScalar    d = 0.0;

104:     for (PetscInt i = 0; i < num_non0_row; i++) {
105:       if (col[i + rowx] == (PetscInt)x) {
106:         d = val[i + rowx];
107:         break;
108:       }
109:     }
110:     diag[x] = d;
111:   }
112: }

114: /* ==========================================================================
115:    MatSeqAIJCUSPARSE_CUPM<T, Policy>

117:    Policy (C++11 traits class) requirements - all static methods:

119:      // Device struct types
120:      typedef ... mat_struct_type;       // Mat_SeqAIJCUSPARSE / Mat_SeqAIJHIPSPARSE
121:      typedef ... mult_struct_type;      // ...MultStruct equivalent

123:      // Storage-format constants (value of each format enumerator)
124:      static int storage_format_csr();
125:      static int storage_format_ell();
126:      static int storage_format_hyb();

128:      // Bookkeeping helpers (device-type specific)
129:      static PetscErrorCode CopyToGPU(Mat);
130:      static PetscErrorCode CopyFromGPU(Mat);
131:      static PetscErrorCode InvalidateTranspose(Mat, PetscBool);
132:      static PetscErrorCode ConvertFromSeqAIJ(Mat, MatType, MatReuse, Mat *);
133:      static const char    *mat_type_name;   // "seqaijcusparse" / "seqaijhipsparse"

135:      // Destruction helpers (device-type specific)
136:      static PetscErrorCode Destroy(Mat);
137:      static PetscErrorCode TriFactorsDestroy(void **);

139:      // Compose-function keys that differ between CUDA and HIP
140:      static const char *set_format_c;          // "MatCUSPARSESetFormat_C"        / "MatHIPSPARSESetFormat_C"
141:      static const char *set_use_cpu_solve_c;   // "MatCUSPARSESetUseCPUSolve_C"   / "MatHIPSPARSESetUseCPUSolve_C"
142:      static const char *product_seqdense_device_c; // "...seqdensecuda_C"          / "...seqdensehip_C"
143:      static const char *product_seqdense_c;    // "...seqdense_C"
144:      static const char *product_self_c;        // "...seqaijcusparse_C"           / "...seqaijhipsparse_C"
145:      static const char *seq_convert_hypre_c;   // "MatConvert_seqaijcusparse_hypre_C" / "_seqaijhipsparse_hypre_C"

147:      // Vec device-array access (device-type specific)
148:      static PetscErrorCode VecGetArrayRead  (Vec, const PetscScalar **);
149:      static PetscErrorCode VecRestoreArrayRead(Vec, const PetscScalar **);
150:      static PetscErrorCode VecGetArrayWrite (Vec, PetscScalar **);
151:      static PetscErrorCode VecRestoreArrayWrite(Vec, PetscScalar **);
152:    ========================================================================== */

154: template <device::cupm::DeviceType T, typename Policy>
155: struct MatSeqAIJCUSPARSE_CUPM : device::cupm::impl::CUPMObject<T> {
156:   PETSC_CUPMOBJECT_HEADER(T);

158:   typedef typename Policy::mat_struct_type  MatStructType;
159:   typedef typename Policy::mult_struct_type MultStructType;

161:   /* -------------------------------------------------------------------
162:      Tier 1 - Trivial
163:      ------------------------------------------------------------------- */

165:   /* MatAssemblyEnd: delegation to SeqAIJ */
166:   static PetscErrorCode AssemblyEnd(Mat A, MatAssemblyType mode) noexcept
167:   {
168:     PetscFunctionBegin;
169:     PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
170:     PetscFunctionReturn(PETSC_SUCCESS);
171:   }

173:   /* MatDuplicate */
174:   static PetscErrorCode Duplicate(Mat A, MatDuplicateOption cpvalues, Mat *B) noexcept
175:   {
176:     PetscFunctionBegin;
177:     PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
178:     PetscCall(Policy::ConvertFromSeqAIJ(*B, Policy::mat_type_name, MAT_INPLACE_MATRIX, B));
179:     PetscFunctionReturn(PETSC_SUCCESS);
180:   }

182:   /* MatGetCurrentMemType */
183:   static PetscErrorCode GetCurrentMemType(PETSC_UNUSED Mat A, PetscMemType *m) noexcept
184:   {
185:     PetscFunctionBegin;
186:     *m = PETSC_MEMTYPE_CUPM();
187:     PetscFunctionReturn(PETSC_SUCCESS);
188:   }

190:   /* MatCOOStructDestroy: free device jmap and perm fields */
191:   static PetscErrorCode COOStructDestroy(PetscCtxRt ctx) noexcept
192:   {
193:     MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)ctx;

195:     PetscFunctionBegin;
196:     PetscCallCUPM(cupmFree(coo->perm));
197:     PetscCallCUPM(cupmFree(coo->jmap));
198:     PetscCall(PetscFree(coo));
199:     PetscFunctionReturn(PETSC_SUCCESS);
200:   }

202:   /* -------------------------------------------------------------------
203:      Tier 2 - Straightforward
204:      ------------------------------------------------------------------- */

206:   /* MatZeroEntries: fill device CSR values with zero */
207:   static PetscErrorCode ZeroEntries(Mat A) noexcept
208:   {
209:     PetscBool      gpu = PETSC_FALSE;
210:     Mat_SeqAIJ    *a   = (Mat_SeqAIJ *)A->data;
211:     MatStructType *spptr;

213:     PetscFunctionBegin;
214:     if (A->factortype == MAT_FACTOR_NONE) {
215:       spptr = (MatStructType *)A->spptr;
216:       if (spptr->mat) {
217:         CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
218:         if (matrix->values) {
219:           gpu = PETSC_TRUE;
220:           PetscCallThrust(thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), (PetscScalar)0.));
221:         }
222:       }
223:       if (spptr->matTranspose) {
224:         CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
225:         if (matrix->values) PetscCallThrust(thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), (PetscScalar)0.));
226:       }
227:     }
228:     if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU;
229:     else {
230:       PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
231:       A->offloadmask = PETSC_OFFLOAD_CPU;
232:     }
233:     PetscFunctionReturn(PETSC_SUCCESS);
234:   }

236:   /* MatScale: cupmBlasXscal on the device CSR values */
237:   static PetscErrorCode Scale(Mat Y, PetscScalar a) noexcept
238:   {
239:     Mat_SeqAIJ      *y  = (Mat_SeqAIJ *)Y->data;
240:     PetscScalar     *ay = nullptr;
241:     cupmBlasHandle_t blashandle;
242:     PetscBLASInt     one = 1, bnz = 1;

244:     PetscFunctionBegin;
245:     PetscCall(GetArray(Y, &ay));
246:     PetscCall(GetHandles_(&blashandle));
247:     PetscCall(PetscBLASIntCast(y->nz, &bnz));
248:     PetscCall(PetscLogGpuTimeBegin());
249:     PetscCallCUPMBLAS(cupmBlasXscal(blashandle, bnz, cupmScalarPtrCast(&a), cupmScalarPtrCast(ay), one));
250:     PetscCall(PetscLogGpuFlops(bnz));
251:     PetscCall(PetscLogGpuTimeEnd());
252:     PetscCall(RestoreArray(Y, &ay));
253:     PetscFunctionReturn(PETSC_SUCCESS);
254:   }

256:   /* MatDiagonalScale: Thrust-based left and right scaling of CSR values */
257:   static PetscErrorCode DiagonalScale(Mat A, Vec ll, Vec rr) noexcept
258:   {
259:     Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)A->data;
260:     MatStructType *devstruct;
261:     CsrMatrix     *csr;
262:     PetscScalar   *av = nullptr;
263:     PetscInt       m, n, nz = aij->nz;
264:     cupmStream_t   stream;

266:     PetscFunctionBegin;
267:     PetscCall(GetHandles_(&stream));
268:     PetscCall(PetscLogGpuTimeBegin());
269:     PetscCall(GetArray(A, &av));
270:     devstruct = (MatStructType *)A->spptr;
271:     csr       = (CsrMatrix *)devstruct->mat->mat;
272:     if (ll) {
273:       const PetscScalar *lv;
274:       PetscCall(VecGetLocalSize(ll, &m));
275:       PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
276:       PetscCall(Policy::VecGetArrayRead(ll, &lv));
277:       {
278:         const PetscInt               *cprow   = devstruct->mat->cprowIndices ? devstruct->mat->cprowIndices->data().get() : NULL;
279:         DiagonalScaleLeft_CSR_Functor functor = {csr->row_offsets->data().get(), av, lv, cprow};
280:         PetscCallThrust(THRUST_CALL(thrust::for_each, stream, thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(csr->num_rows), functor));
281:       }
282:       PetscCall(Policy::VecRestoreArrayRead(ll, &lv));
283:       PetscCall(PetscLogGpuFlops(nz));
284:     }
285:     if (rr) {
286:       const PetscScalar *rv;
287:       PetscCall(VecGetLocalSize(rr, &n));
288:       PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
289:       PetscCall(Policy::VecGetArrayRead(rr, &rv));
290: #if PetscDefined(USING_NVCC) && CCCL_VERSION >= 3001000
291:       PetscCallThrust(THRUST_CALL(thrust::transform, stream, csr->values->begin(), csr->values->end(), thrust::make_permutation_iterator(thrust::device_pointer_cast(rv), csr->column_indices->begin()), csr->values->begin(), cuda::std::multiplies<PetscScalar>()));
292: #else
293:       PetscCallThrust(THRUST_CALL(thrust::transform, stream, csr->values->begin(), csr->values->end(), thrust::make_permutation_iterator(thrust::device_pointer_cast(rv), csr->column_indices->begin()), csr->values->begin(), thrust::multiplies<PetscScalar>()));
294: #endif
295:       PetscCall(Policy::VecRestoreArrayRead(rr, &rv));
296:       PetscCall(PetscLogGpuFlops(nz));
297:     }
298:     PetscCall(RestoreArray(A, &av));
299:     PetscCall(PetscLogGpuTimeEnd());
300:     PetscFunctionReturn(PETSC_SUCCESS);
301:   }

303:   /* MatSeqAIJGetIJ: return device CSR row-pointer and column-index arrays */
304:   static PetscErrorCode GetIJ(Mat A, PetscBool compressed, const int **i, const int **j) noexcept
305:   {
306:     MatStructType *cusp = (MatStructType *)A->spptr;
307:     Mat_SeqAIJ    *a    = (Mat_SeqAIJ *)A->data;
308:     CsrMatrix     *csr;

310:     PetscFunctionBegin;
312:     if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
313:     PetscCheckTypeName(A, Policy::mat_type_name);
314:     PetscCheck(cusp->format != (decltype(cusp->format))Policy::storage_format_ell() && cusp->format != (decltype(cusp->format))Policy::storage_format_hyb(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
315:     PetscCall(Policy::CopyToGPU(A));
316:     PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing MultStruct");
317:     csr = (CsrMatrix *)cusp->mat->mat;
318:     if (i) {
319:       if (!compressed && a->compressedrow.use) { /* need full row offset */
320:         if (!cusp->rowoffsets_gpu) {
321:           cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
322:           cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
323:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
324:         }
325:         *i = cusp->rowoffsets_gpu->data().get();
326:       } else *i = csr->row_offsets->data().get();
327:     }
328:     if (j) *j = csr->column_indices->data().get();
329:     PetscFunctionReturn(PETSC_SUCCESS);
330:   }

332:   /* MatSeqAIJRestoreIJ: nullify the pointers previously obtained with GetIJ */
333:   static PetscErrorCode RestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) noexcept
334:   {
335:     PetscFunctionBegin;
337:     PetscCheckTypeName(A, Policy::mat_type_name);
338:     if (i) *i = NULL;
339:     if (j) *j = NULL;
340:     (void)compressed;
341:     PetscFunctionReturn(PETSC_SUCCESS);
342:   }

344:   /* MatSetPreallocationCOO: copy COO bookkeeping struct to device */
345:   static PetscErrorCode SetPreallocationCOO(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) noexcept
346:   {
347:     PetscBool            dev_ij = PETSC_FALSE;
348:     PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
349:     PetscInt            *i, *j;
350:     PetscContainer       container_h;
351:     MatCOOStruct_SeqAIJ *coo_h, *coo_d;

353:     PetscFunctionBegin;
354:     PetscCall(PetscGetMemType(coo_i, &mtype));
355:     if (PetscMemTypeDevice(mtype)) {
356:       dev_ij = PETSC_TRUE;
357:       PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
358:       PetscCallCUPM(cupmMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cupmMemcpyDeviceToHost));
359:       PetscCallCUPM(cupmMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cupmMemcpyDeviceToHost));
360:     } else {
361:       i = coo_i;
362:       j = coo_j;
363:     }
364:     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
365:     if (dev_ij) PetscCall(PetscFree2(i, j));
366:     mat->offloadmask = PETSC_OFFLOAD_CPU;
367:     /* Create the GPU memory */
368:     PetscCall(Policy::CopyToGPU(mat));

370:     /* Copy the COO struct to device */
371:     PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
372:     PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
373:     PetscCall(PetscMalloc1(1, &coo_d));
374:     *coo_d = *coo_h; /* shallow copy; device fields amended below */
375:     PetscCallCUPM(cupmMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
376:     PetscCallCUPM(cupmMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cupmMemcpyHostToDevice));
377:     PetscCallCUPM(cupmMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
378:     PetscCallCUPM(cupmMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cupmMemcpyHostToDevice));

380:     PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatSeqAIJCUSPARSE_CUPM::COOStructDestroy));
381:     PetscFunctionReturn(PETSC_SUCCESS);
382:   }

384:   /* MatSetValuesCOO: launch MatAddCOOValues kernel */
385:   static PetscErrorCode SetValuesCOO(Mat A, const PetscScalar v[], InsertMode imode) noexcept
386:   {
387:     Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
388:     MatStructType       *dev  = (MatStructType *)A->spptr;
389:     PetscCount           Annz = seq->nz;
390:     PetscMemType         memtype;
391:     const PetscScalar   *v1 = v;
392:     PetscScalar         *Aa = nullptr;
393:     PetscContainer       container;
394:     MatCOOStruct_SeqAIJ *coo;
395:     cupmStream_t         stream;

397:     PetscFunctionBegin;
398:     if (!dev->mat) PetscCall(Policy::CopyToGPU(A));

400:     PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
401:     PetscCall(PetscContainerGetPointer(container, (void **)&coo));

403:     PetscCall(PetscGetMemType(v, &memtype));
404:     if (PetscMemTypeHost(memtype)) { /* copy host values to device */
405:       PetscCallCUPM(cupmMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
406:       PetscCallCUPM(cupmMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cupmMemcpyHostToDevice));
407:     }

409:     if (imode == INSERT_VALUES) PetscCall(GetArrayWrite(A, &Aa));
410:     else PetscCall(GetArray(A, &Aa));

412:     PetscCall(GetHandles_(&stream));
413:     PetscCall(PetscLogGpuTimeBegin());
414:     if (Annz) {
415:       PetscCallCUPM(cupmLaunchKernel(MatAddCOOValues, (unsigned int)((Annz + 255) / 256), 256u, (size_t)0, stream, v1, Annz, coo->jmap, coo->perm, imode, Aa));
416:       PetscCallCUPM(cupmGetLastError());
417:     }
418:     PetscCall(PetscLogGpuTimeEnd());

420:     if (imode == INSERT_VALUES) PetscCall(RestoreArrayWrite(A, &Aa));
421:     else PetscCall(RestoreArray(A, &Aa));

423:     if (PetscMemTypeHost(memtype)) {
424:       void *v1_device = (void *)v1;
425:       PetscCallCUPM(cupmFree(v1_device));
426:     }
427:     PetscFunctionReturn(PETSC_SUCCESS);
428:   }

430:   /* MatSeqAIJCopySubArray: scatter-gather a sub-array of CSR values */
431:   static PetscErrorCode CopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) noexcept
432:   {
433:     const PetscScalar *av = nullptr;
434:     PetscMemType       mtype;
435:     PetscBool          dmem;

437:     PetscFunctionBegin;
438:     PetscCall(PetscCUPMGetMemType(v, &mtype));
439:     dmem = PetscMemTypeDevice(mtype);
440:     PetscCall(GetArrayRead(A, &av));
441:     if (n && idx) {
442:       THRUSTINTARRAY widx(n);
443:       widx.assign(idx, idx + n);
444:       PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

446:       THRUSTARRAY                    *w = NULL;
447:       thrust::device_ptr<PetscScalar> dv;
448:       if (dmem) {
449:         dv = thrust::device_pointer_cast(v);
450:       } else {
451:         w  = new THRUSTARRAY(n);
452:         dv = w->data();
453:       }
454:       {
455:         thrust::device_ptr<const PetscScalar> dav   = thrust::device_pointer_cast(av);
456:         auto                                  zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
457:         auto                                  zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
458:         PetscCallThrust(thrust::for_each(zibit, zieit, VecCUPMEquals{}));
459:       }
460:       if (w) PetscCallCUPM(cupmMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cupmMemcpyDeviceToHost));
461:       delete w;
462:     } else {
463:       PetscCallCUPM(cupmMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost));
464:     }
465:     if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
466:     PetscCall(RestoreArrayRead(A, &av));
467:     PetscFunctionReturn(PETSC_SUCCESS);
468:   }

470:   /* -------------------------------------------------------------------
471:      Tier 3 - AXPY shared branches (SAME_NZ and DIFFERENT_NZ only).
472:      The SUBSET_NZ branch calls cuSPARSE/hipSPARSE and stays in the caller.
473:      ------------------------------------------------------------------- */

475:   /* AXPY SAME_NONZERO_PATTERN branch: cupmBlasXaxpy */
476:   static PetscErrorCode AXPY_SameNZ(Mat Y, PetscScalar a, Mat X) noexcept
477:   {
478:     Mat_SeqAIJ        *x  = (Mat_SeqAIJ *)X->data;
479:     const PetscScalar *ax = nullptr;
480:     PetscScalar       *ay = nullptr;
481:     cupmBlasHandle_t   blashandle;
482:     PetscBLASInt       one = 1, bnz = 1;

484:     PetscFunctionBegin;
485:     PetscCall(GetArrayRead(X, &ax));
486:     PetscCall(GetArray(Y, &ay));
487:     PetscCall(GetHandles_(&blashandle));
488:     PetscCall(PetscBLASIntCast(x->nz, &bnz));
489:     PetscCall(PetscLogGpuTimeBegin());
490:     PetscCallCUPMBLAS(cupmBlasXaxpy(blashandle, bnz, cupmScalarPtrCast(&a), cupmScalarPtrCast(ax), one, cupmScalarPtrCast(ay), one));
491:     PetscCall(PetscLogGpuFlops(2.0 * bnz));
492:     PetscCall(PetscLogGpuTimeEnd());
493:     PetscCall(RestoreArrayRead(X, &ax));
494:     PetscCall(RestoreArray(Y, &ay));
495:     PetscFunctionReturn(PETSC_SUCCESS);
496:   }

498:   /* GetDiagonal: kernel-based extraction of the CSR diagonal */
499:   static PetscErrorCode GetDiagonal(Mat A, Vec diag) noexcept
500:   {
501:     MatStructType  *devstruct = (MatStructType *)A->spptr;
502:     MultStructType *matstruct = (MultStructType *)devstruct->mat;
503:     PetscScalar    *darray;
504:     cupmStream_t    stream;

506:     PetscFunctionBegin;
507:     if (A->offloadmask == PETSC_OFFLOAD_BOTH || A->offloadmask == PETSC_OFFLOAD_GPU) {
508:       PetscInt   n   = A->rmap->n;
509:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;

511:       PetscCheck(devstruct->format == (decltype(devstruct->format))Policy::storage_format_csr(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
512:       if (n > 0) {
513:         PetscCall(Policy::VecGetArrayWrite(diag, &darray));
514:         PetscCall(GetHandles_(&stream));
515:         PetscCallCUPM(cupmLaunchKernel(GetDiagonal_CSR, (unsigned int)((n + 255) / 256), 256u, (size_t)0, stream, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), n, darray));
516:         PetscCallCUPM(cupmGetLastError());
517:         PetscCall(Policy::VecRestoreArrayWrite(diag, &darray));
518:       }
519:     } else {
520:       PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
521:     }
522:     PetscFunctionReturn(PETSC_SUCCESS);
523:   }

525:   /* -------------------------------------------------------------------
526:      Tier 4 - Device array access (moved here from vendor files so both
527:      SeqAIJCUSPARSE and SeqAIJHIPSPARSE share one implementation).
528:      ------------------------------------------------------------------- */

530:   /* GetArrayRead: read-only access to device CSR value array */
531:   static PetscErrorCode GetArrayRead(Mat A, const PetscScalar **a) noexcept
532:   {
533:     MatStructType *cusp = (MatStructType *)A->spptr;
534:     CsrMatrix     *csr;

536:     PetscFunctionBegin;
538:     PetscAssertPointer(a, 2);
539:     PetscCheckTypeName(A, Policy::mat_type_name);
540:     PetscCheck(cusp->format != (decltype(cusp->format))Policy::storage_format_ell() && cusp->format != (decltype(cusp->format))Policy::storage_format_hyb(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
541:     PetscCall(Policy::CopyToGPU(A));
542:     PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing MultStruct");
543:     csr = (CsrMatrix *)cusp->mat->mat;
544:     PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing device memory");
545:     *a = csr->values->data().get();
546:     PetscFunctionReturn(PETSC_SUCCESS);
547:   }

549:   /* RestoreArrayRead: release read-only access obtained from GetArrayRead */
550:   static PetscErrorCode RestoreArrayRead(Mat A, const PetscScalar **a) noexcept
551:   {
552:     PetscFunctionBegin;
554:     PetscAssertPointer(a, 2);
555:     PetscCheckTypeName(A, Policy::mat_type_name);
556:     *a = NULL;
557:     PetscFunctionReturn(PETSC_SUCCESS);
558:   }

560:   /* GetArray: read-write access to device CSR value array */
561:   static PetscErrorCode GetArray(Mat A, PetscScalar **a) noexcept
562:   {
563:     MatStructType *cusp = (MatStructType *)A->spptr;
564:     CsrMatrix     *csr;

566:     PetscFunctionBegin;
568:     PetscAssertPointer(a, 2);
569:     PetscCheckTypeName(A, Policy::mat_type_name);
570:     PetscCheck(cusp->format != (decltype(cusp->format))Policy::storage_format_ell() && cusp->format != (decltype(cusp->format))Policy::storage_format_hyb(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
571:     PetscCall(Policy::CopyToGPU(A));
572:     PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing MultStruct");
573:     csr = (CsrMatrix *)cusp->mat->mat;
574:     PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing device memory");
575:     *a             = csr->values->data().get();
576:     A->offloadmask = PETSC_OFFLOAD_GPU;
577:     PetscCall(Policy::InvalidateTranspose(A, PETSC_FALSE));
578:     PetscFunctionReturn(PETSC_SUCCESS);
579:   }

581:   /* RestoreArray: restore read-write access obtained from GetArray */
582:   static PetscErrorCode RestoreArray(Mat A, PetscScalar **a) noexcept
583:   {
584:     PetscFunctionBegin;
586:     PetscAssertPointer(a, 2);
587:     PetscCheckTypeName(A, Policy::mat_type_name);
588:     PetscCall(PetscObjectStateIncrease((PetscObject)A));
589:     *a = NULL;
590:     PetscFunctionReturn(PETSC_SUCCESS);
591:   }

593:   /* GetArrayWrite: write-only access to device CSR value array (no host-to-device copy) */
594:   static PetscErrorCode GetArrayWrite(Mat A, PetscScalar **a) noexcept
595:   {
596:     MatStructType *cusp = (MatStructType *)A->spptr;
597:     CsrMatrix     *csr;

599:     PetscFunctionBegin;
601:     PetscAssertPointer(a, 2);
602:     PetscCheckTypeName(A, Policy::mat_type_name);
603:     PetscCheck(cusp->format != (decltype(cusp->format))Policy::storage_format_ell() && cusp->format != (decltype(cusp->format))Policy::storage_format_hyb(), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
604:     PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing MultStruct");
605:     csr = (CsrMatrix *)cusp->mat->mat;
606:     PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing device memory");
607:     *a             = csr->values->data().get();
608:     A->offloadmask = PETSC_OFFLOAD_GPU;
609:     PetscCall(Policy::InvalidateTranspose(A, PETSC_FALSE));
610:     PetscFunctionReturn(PETSC_SUCCESS);
611:   }

613:   /* RestoreArrayWrite: restore write-only access obtained from GetArrayWrite */
614:   static PetscErrorCode RestoreArrayWrite(Mat A, PetscScalar **a) noexcept
615:   {
616:     PetscFunctionBegin;
618:     PetscAssertPointer(a, 2);
619:     PetscCheckTypeName(A, Policy::mat_type_name);
620:     PetscCall(PetscObjectStateIncrease((PetscObject)A));
621:     *a = NULL;
622:     PetscFunctionReturn(PETSC_SUCCESS);
623:   }

625:   /* SeqAIJGetArray: copy GPU-to-CPU then return host value array (ops->getarray) */
626:   static PetscErrorCode SeqAIJGetArray(Mat A, PetscScalar *array[]) noexcept
627:   {
628:     PetscFunctionBegin;
629:     PetscCall(Policy::CopyFromGPU(A));
630:     *array = ((Mat_SeqAIJ *)A->data)->a;
631:     PetscFunctionReturn(PETSC_SUCCESS);
632:   }

634:   /* SeqAIJRestoreArray: mark matrix data CPU-valid (ops->restorearray) */
635:   static PetscErrorCode SeqAIJRestoreArray(Mat A, PetscScalar *array[]) noexcept
636:   {
637:     PetscFunctionBegin;
638:     A->offloadmask = PETSC_OFFLOAD_CPU;
639:     *array         = NULL;
640:     PetscFunctionReturn(PETSC_SUCCESS);
641:   }

643:   /* SeqAIJGetArrayRead: copy GPU-to-CPU then return host value array read-only (ops->getarrayread) */
644:   static PetscErrorCode SeqAIJGetArrayRead(Mat A, const PetscScalar *array[]) noexcept
645:   {
646:     PetscFunctionBegin;
647:     PetscCall(Policy::CopyFromGPU(A));
648:     *array = ((Mat_SeqAIJ *)A->data)->a;
649:     PetscFunctionReturn(PETSC_SUCCESS);
650:   }

652:   /* SeqAIJRestoreArrayRead: release read-only host array (ops->restorearrayread) */
653:   static PetscErrorCode SeqAIJRestoreArrayRead(Mat /*A*/, const PetscScalar *array[]) noexcept
654:   {
655:     PetscFunctionBegin;
656:     *array = NULL;
657:     PetscFunctionReturn(PETSC_SUCCESS);
658:   }

660:   /* SeqAIJGetArrayWrite: return host value array for write-only access (ops->getarraywrite) */
661:   static PetscErrorCode SeqAIJGetArrayWrite(Mat A, PetscScalar *array[]) noexcept
662:   {
663:     PetscFunctionBegin;
664:     *array = ((Mat_SeqAIJ *)A->data)->a;
665:     PetscFunctionReturn(PETSC_SUCCESS);
666:   }

668:   /* SeqAIJRestoreArrayWrite: mark matrix data CPU-valid after write (ops->restorearraywrite) */
669:   static PetscErrorCode SeqAIJRestoreArrayWrite(Mat A, PetscScalar *array[]) noexcept
670:   {
671:     PetscFunctionBegin;
672:     A->offloadmask = PETSC_OFFLOAD_CPU;
673:     *array         = NULL;
674:     PetscFunctionReturn(PETSC_SUCCESS);
675:   }

677:   /* CreateSeqAIJ: allocate and preallocate a seq sparse matrix of this type */
678:   static PetscErrorCode CreateSeqAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) noexcept
679:   {
680:     PetscFunctionBegin;
681:     PetscCall(MatCreate(comm, A));
682:     PetscCall(MatSetSizes(*A, m, n, m, n));
683:     PetscCall(MatSetType(*A, Policy::mat_type_name));
684:     PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
685:     PetscFunctionReturn(PETSC_SUCCESS);
686:   }

688:   /* MatDestroy: free vendor-specific state, deregister composed functions */
689:   static PetscErrorCode Destroy(Mat A) noexcept
690:   {
691:     PetscFunctionBegin;
692:     if (A->factortype == MAT_FACTOR_NONE) PetscCall(Policy::Destroy(A));
693:     else PetscCall(Policy::TriFactorsDestroy(&A->spptr));
694:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
695:     PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::set_format_c, NULL));
696:     PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::set_use_cpu_solve_c, NULL));
697:     PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::product_seqdense_device_c, NULL));
698:     PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::product_seqdense_c, NULL));
699:     PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::product_self_c, NULL));
700:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
701:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
702:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
703:     PetscCall(PetscObjectComposeFunction((PetscObject)A, Policy::seq_convert_hypre_c, NULL));
704:     PetscCall(MatDestroy_SeqAIJ(A));
705:     PetscFunctionReturn(PETSC_SUCCESS);
706:   }
707: };

709: } // namespace impl

711: } // namespace cupm

713: } // namespace aij

715: } // namespace mat

717: } // namespace Petsc