Actual source code: aijkok.kokkos.cxx

  1: #include <petscvec_kokkos.hpp>
  2: #include <petscpkg_version.h>
  3: #include <petsc/private/petscimpl.h>
  4: #include <petsc/private/sfimpl.h>
  5: #include <petscsystypes.h>
  6: #include <petscerror.h>

  8: #include <Kokkos_Core.hpp>
  9: #include <KokkosBlas.hpp>
 10: #include <KokkosSparse_CrsMatrix.hpp>
 11: #include <KokkosSparse_spmv.hpp>
 12: #include <KokkosSparse_spiluk.hpp>
 13: #include <KokkosSparse_sptrsv.hpp>
 14: #include <KokkosSparse_spgemm.hpp>
 15: #include <KokkosSparse_spadd.hpp>

 17: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

 19: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99)
 20:   #include <KokkosSparse_Utils.hpp>
 21: using KokkosSparse::sort_crs_matrix;
 22: using KokkosSparse::Impl::transpose_matrix;
 23: #else
 24:   #include <KokkosKernels_Sorting.hpp>
 25: using KokkosKernels::sort_crs_matrix;
 26: using KokkosKernels::Impl::transpose_matrix;
 27: #endif

 29: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

 31: /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
 32:    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
 33:    In the latter case, it is important to set a_dual's sync state correctly.
 34:  */
 35: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
 36: {
 37:   Mat_SeqAIJ       *aijseq;
 38:   Mat_SeqAIJKokkos *aijkok;

 40:   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
 41:   MatAssemblyEnd_SeqAIJ(A, mode);

 43:   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
 44:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 46:   /* If aijkok does not exist, we just copy i, j to device.
 47:      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
 48:      In both cases, we build a new aijkok structure.
 49:   */
 50:   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
 51:     delete aijkok;
 52:     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
 53:     A->spptr = aijkok;
 54:   }

 56:   if (aijkok->device_mat_d.data()) {
 57:     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
 58:   }
 59:   return 0;
 60: }

 62: /* Sync CSR data to device if not yet */
 63: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
 64: {
 65:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 69:   if (aijkok->a_dual.need_sync_device()) {
 70:     aijkok->a_dual.sync_device();
 71:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
 72:     aijkok->hermitian_updated = PETSC_FALSE;
 73:   }
 74:   return 0;
 75: }

 77: /* Mark the CSR data on device as modified */
 78: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
 79: {
 80:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 83:   aijkok->a_dual.clear_sync_state();
 84:   aijkok->a_dual.modify_device();
 85:   aijkok->transpose_updated = PETSC_FALSE;
 86:   aijkok->hermitian_updated = PETSC_FALSE;
 87:   MatSeqAIJInvalidateDiagonal(A);
 88:   PetscObjectStateIncrease((PetscObject)A);
 89:   return 0;
 90: }

 92: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
 93: {
 94:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 97:   /* We do not expect one needs factors on host  */
100:   aijkok->a_dual.sync_host();
101:   return 0;
102: }

104: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
105: {
106:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

108:   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
109:     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
110:     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
111:     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
112:   */
113:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
114:     aijkok->a_dual.sync_host();
115:     *array = aijkok->a_dual.view_host().data();
116:   } else { /* Happens when calling MatSetValues on a newly created matrix */
117:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
118:   }
119:   return 0;
120: }

122: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
123: {
124:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

126:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
127:   return 0;
128: }

130: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
131: {
132:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

134:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
135:     aijkok->a_dual.sync_host();
136:     *array = aijkok->a_dual.view_host().data();
137:   } else {
138:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
139:   }
140:   return 0;
141: }

143: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144: {
145:   *array = NULL;
146:   return 0;
147: }

149: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
150: {
151:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

153:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154:     *array = aijkok->a_dual.view_host().data();
155:   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
156:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
157:   }
158:   return 0;
159: }

161: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
162: {
163:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

165:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
166:     aijkok->a_dual.clear_sync_state();
167:     aijkok->a_dual.modify_host();
168:   }
169:   return 0;
170: }

172: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
173: {
174:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);


178:   if (i) *i = aijkok->i_device_data();
179:   if (j) *j = aijkok->j_device_data();
180:   if (a) {
181:     aijkok->a_dual.sync_device();
182:     *a = aijkok->a_device_data();
183:   }
184:   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
185:   return 0;
186: }

188: // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
189: PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
190: {
191:   Mat_SeqAIJKokkos                            *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
192:   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);

195:   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
196:   Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
197:   return 0;
198: }

200: // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
201: PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
202: {
203:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

205:   if (aijkok && aijkok->device_mat_d.data()) {
206:     *d_mat = aijkok->device_mat_d.data();
207:   } else {
208:     MatSeqAIJKokkosSyncDevice(A); // create aijkok (we are making d_mat now so make a place for it)
209:     *d_mat = NULL;
210:   }
211:   return 0;
212: }

214: /* Generate the transpose on device and cache it internally */
215: static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
216: {
217:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

220:   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
221:     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
222:     aijkok->a_dual.sync_device();
223:     aijkok->csrmatT = transpose_matrix(aijkok->csrmat);
224:     sort_crs_matrix(aijkok->csrmatT);
225:     aijkok->transpose_updated = PETSC_TRUE;
226:   }
227:   *csrmatT = &aijkok->csrmatT;
228:   return 0;
229: }

231: /* Generate the Hermitian on device and cache it internally */
232: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
233: {
234:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

236:   PetscLogGpuTimeBegin();
238:   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
239:     aijkok->a_dual.sync_device();
240:     aijkok->csrmatH = transpose_matrix(aijkok->csrmat);
241:     sort_crs_matrix(aijkok->csrmatH);
242: #if defined(PETSC_USE_COMPLEX)
243:     const auto &a = aijkok->csrmatH.values;
244:     Kokkos::parallel_for(
245:       a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); });
246: #endif
247:     aijkok->hermitian_updated = PETSC_TRUE;
248:   }
249:   *csrmatH = &aijkok->csrmatH;
250:   PetscLogGpuTimeEnd();
251:   return 0;
252: }

254: /* y = A x */
255: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
256: {
257:   Mat_SeqAIJKokkos          *aijkok;
258:   ConstPetscScalarKokkosView xv;
259:   PetscScalarKokkosView      yv;

261:   PetscLogGpuTimeBegin();
262:   MatSeqAIJKokkosSyncDevice(A);
263:   VecGetKokkosView(xx, &xv);
264:   VecGetKokkosViewWrite(yy, &yv);
265:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
266:   KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A x + beta y */
267:   VecRestoreKokkosView(xx, &xv);
268:   VecRestoreKokkosViewWrite(yy, &yv);
269:   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
270:   PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz());
271:   PetscLogGpuTimeEnd();
272:   return 0;
273: }

275: /* y = A^T x */
276: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
277: {
278:   Mat_SeqAIJKokkos          *aijkok;
279:   const char                *mode;
280:   ConstPetscScalarKokkosView xv;
281:   PetscScalarKokkosView      yv;
282:   KokkosCsrMatrix           *csrmat;

284:   PetscLogGpuTimeBegin();
285:   MatSeqAIJKokkosSyncDevice(A);
286:   VecGetKokkosView(xx, &xv);
287:   VecGetKokkosViewWrite(yy, &yv);
288:   if (A->form_explicit_transpose) {
289:     MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat);
290:     mode = "N";
291:   } else {
292:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
293:     csrmat = &aijkok->csrmat;
294:     mode   = "T";
295:   }
296:   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^T x + beta y */
297:   VecRestoreKokkosView(xx, &xv);
298:   VecRestoreKokkosViewWrite(yy, &yv);
299:   PetscLogGpuFlops(2.0 * csrmat->nnz());
300:   PetscLogGpuTimeEnd();
301:   return 0;
302: }

304: /* y = A^H x */
305: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
306: {
307:   Mat_SeqAIJKokkos          *aijkok;
308:   const char                *mode;
309:   ConstPetscScalarKokkosView xv;
310:   PetscScalarKokkosView      yv;
311:   KokkosCsrMatrix           *csrmat;

313:   PetscLogGpuTimeBegin();
314:   MatSeqAIJKokkosSyncDevice(A);
315:   VecGetKokkosView(xx, &xv);
316:   VecGetKokkosViewWrite(yy, &yv);
317:   if (A->form_explicit_transpose) {
318:     MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat);
319:     mode = "N";
320:   } else {
321:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
322:     csrmat = &aijkok->csrmat;
323:     mode   = "C";
324:   }
325:   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^H x + beta y */
326:   VecRestoreKokkosView(xx, &xv);
327:   VecRestoreKokkosViewWrite(yy, &yv);
328:   PetscLogGpuFlops(2.0 * csrmat->nnz());
329:   PetscLogGpuTimeEnd();
330:   return 0;
331: }

333: /* z = A x + y */
334: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
335: {
336:   Mat_SeqAIJKokkos          *aijkok;
337:   ConstPetscScalarKokkosView xv, yv;
338:   PetscScalarKokkosView      zv;

340:   PetscLogGpuTimeBegin();
341:   MatSeqAIJKokkosSyncDevice(A);
342:   VecGetKokkosView(xx, &xv);
343:   VecGetKokkosView(yy, &yv);
344:   VecGetKokkosViewWrite(zz, &zv);
345:   if (zz != yy) Kokkos::deep_copy(zv, yv);
346:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
347:   KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A x + beta z */
348:   VecRestoreKokkosView(xx, &xv);
349:   VecRestoreKokkosView(yy, &yv);
350:   VecRestoreKokkosViewWrite(zz, &zv);
351:   PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz());
352:   PetscLogGpuTimeEnd();
353:   return 0;
354: }

356: /* z = A^T x + y */
357: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
358: {
359:   Mat_SeqAIJKokkos          *aijkok;
360:   const char                *mode;
361:   ConstPetscScalarKokkosView xv, yv;
362:   PetscScalarKokkosView      zv;
363:   KokkosCsrMatrix           *csrmat;

365:   PetscLogGpuTimeBegin();
366:   MatSeqAIJKokkosSyncDevice(A);
367:   VecGetKokkosView(xx, &xv);
368:   VecGetKokkosView(yy, &yv);
369:   VecGetKokkosViewWrite(zz, &zv);
370:   if (zz != yy) Kokkos::deep_copy(zv, yv);
371:   if (A->form_explicit_transpose) {
372:     MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat);
373:     mode = "N";
374:   } else {
375:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
376:     csrmat = &aijkok->csrmat;
377:     mode   = "T";
378:   }
379:   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^T x + beta z */
380:   VecRestoreKokkosView(xx, &xv);
381:   VecRestoreKokkosView(yy, &yv);
382:   VecRestoreKokkosViewWrite(zz, &zv);
383:   PetscLogGpuFlops(2.0 * csrmat->nnz());
384:   PetscLogGpuTimeEnd();
385:   return 0;
386: }

388: /* z = A^H x + y */
389: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
390: {
391:   Mat_SeqAIJKokkos          *aijkok;
392:   const char                *mode;
393:   ConstPetscScalarKokkosView xv, yv;
394:   PetscScalarKokkosView      zv;
395:   KokkosCsrMatrix           *csrmat;

397:   PetscLogGpuTimeBegin();
398:   MatSeqAIJKokkosSyncDevice(A);
399:   VecGetKokkosView(xx, &xv);
400:   VecGetKokkosView(yy, &yv);
401:   VecGetKokkosViewWrite(zz, &zv);
402:   if (zz != yy) Kokkos::deep_copy(zv, yv);
403:   if (A->form_explicit_transpose) {
404:     MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat);
405:     mode = "N";
406:   } else {
407:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
408:     csrmat = &aijkok->csrmat;
409:     mode   = "C";
410:   }
411:   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^H x + beta z */
412:   VecRestoreKokkosView(xx, &xv);
413:   VecRestoreKokkosView(yy, &yv);
414:   VecRestoreKokkosViewWrite(zz, &zv);
415:   PetscLogGpuFlops(2.0 * csrmat->nnz());
416:   PetscLogGpuTimeEnd();
417:   return 0;
418: }

420: PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
421: {
422:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

424:   switch (op) {
425:   case MAT_FORM_EXPLICIT_TRANSPOSE:
426:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
427:     if (A->form_explicit_transpose && !flg && aijkok) aijkok->DestroyMatTranspose();
428:     A->form_explicit_transpose = flg;
429:     break;
430:   default:
431:     MatSetOption_SeqAIJ(A, op, flg);
432:     break;
433:   }
434:   return 0;
435: }

437: /* Depending on reuse, either build a new mat, or use the existing mat */
438: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
439: {
440:   Mat_SeqAIJ *aseq;

442:   PetscKokkosInitializeCheck();
443:   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
444:     MatDuplicate(A, MAT_COPY_VALUES, newmat);  /* the returned newmat is a SeqAIJKokkos */
445:   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
446:     MatCopy(A, *newmat, SAME_NONZERO_PATTERN); /* newmat is already a SeqAIJKokkos */
447:   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
449:     PetscFree(A->defaultvectype);
450:     PetscStrallocpy(VECKOKKOS, &A->defaultvectype); /* Allocate and copy the string */
451:     PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS);
452:     MatSetOps_SeqAIJKokkos(A);
453:     aseq = static_cast<Mat_SeqAIJ *>(A->data);
454:     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
456:       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
457:     }
458:   }
459:   return 0;
460: }

462: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
463:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
464:  */
465: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
466: {
467:   Mat_SeqAIJ       *bseq;
468:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
469:   Mat               mat;

471:   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
472:   MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B);
473:   mat = *B;
474:   if (A->assembled) {
475:     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
476:     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
477:     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
478:     /* Now copy values to B if needed */
479:     if (dupOption == MAT_COPY_VALUES) {
480:       if (akok->a_dual.need_sync_device()) {
481:         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
482:         bkok->a_dual.modify_host();
483:       } else { /* If device has the latest data, we only copy data on device */
484:         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
485:         bkok->a_dual.modify_device();
486:       }
487:     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
488:       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
489:       bkok->a_dual.modify_host();
490:     }
491:     mat->spptr = bkok;
492:   }

494:   PetscFree(mat->defaultvectype);
495:   PetscStrallocpy(VECKOKKOS, &mat->defaultvectype); /* Allocate and copy the string */
496:   PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS);
497:   MatSetOps_SeqAIJKokkos(mat);
498:   return 0;
499: }

501: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
502: {
503:   Mat               At;
504:   KokkosCsrMatrix  *internT;
505:   Mat_SeqAIJKokkos *atkok, *bkok;

507:   if (reuse == MAT_REUSE_MATRIX) MatTransposeCheckNonzeroState_Private(A, *B);
508:   MatSeqAIJKokkosGenerateTranspose_Private(A, &internT); /* Generate a transpose internally */
509:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
510:     /* Deep copy internT, as we want to isolate the internal transpose */
511:     atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT));
512:     MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At);
513:     if (reuse == MAT_INITIAL_MATRIX) *B = At;
514:     else MatHeaderReplace(A, &At); /* Replace A with At inplace */
515:   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
516:     if ((*B)->assembled) {
517:       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
518:       Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values);
519:       MatSeqAIJKokkosModifyDevice(*B);
520:     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
521:       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
522:       MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */
523:       MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz());
524:       Kokkos::deep_copy(a_h, internT->values);
525:       Kokkos::deep_copy(j_h, internT->graph.entries);
526:     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
527:   }
528:   return 0;
529: }

531: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
532: {
533:   Mat_SeqAIJKokkos *aijkok;

535:   if (A->factortype == MAT_FACTOR_NONE) {
536:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
537:     delete aijkok;
538:   } else {
539:     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
540:   }
541:   A->spptr = NULL;
542:   PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL);
543:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
544:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
545:   MatDestroy_SeqAIJ(A);
546:   return 0;
547: }

549: /*MC
550:    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

552:    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types

554:    Options Database Keys:
555: .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

557:   Level: beginner

559: .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
560: M*/
561: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
562: {
563:   PetscKokkosInitializeCheck();
564:   MatCreate_SeqAIJ(A);
565:   MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A);
566:   return 0;
567: }

569: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
570: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
571: {
572:   Mat_SeqAIJ         *a, *b;
573:   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
574:   MatScalarKokkosView aa, ba, ca;
575:   MatRowMapKokkosView ai, bi, ci;
576:   MatColIdxKokkosView aj, bj, cj;
577:   PetscInt            m, n, nnz, aN;


587:   MatSeqAIJKokkosSyncDevice(A);
588:   MatSeqAIJKokkosSyncDevice(B);
589:   a    = static_cast<Mat_SeqAIJ *>(A->data);
590:   b    = static_cast<Mat_SeqAIJ *>(B->data);
591:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
592:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
593:   aa   = akok->a_dual.view_device();
594:   ai   = akok->i_dual.view_device();
595:   ba   = bkok->a_dual.view_device();
596:   bi   = bkok->i_dual.view_device();
597:   m    = A->rmap->n; /* M, N and nnz of C */
598:   n    = A->cmap->n + B->cmap->n;
599:   nnz  = a->nz + b->nz;
600:   aN   = A->cmap->n; /* N of A */
601:   if (reuse == MAT_INITIAL_MATRIX) {
602:     aj           = akok->j_dual.view_device();
603:     bj           = bkok->j_dual.view_device();
604:     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
605:     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
606:     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
607:     ca           = ca_dual.view_device();
608:     ci           = ci_dual.view_device();
609:     cj           = cj_dual.view_device();

611:     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
612:     Kokkos::parallel_for(
613:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
614:         PetscInt i       = t.league_rank(); /* row i */
615:         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

617:         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
618:                                                    ci(i) = coffset;
619:                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
620:         });

622:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
623:           if (k < alen) {
624:             ca(coffset + k) = aa(ai(i) + k);
625:             cj(coffset + k) = aj(ai(i) + k);
626:           } else {
627:             ca(coffset + k) = ba(bi(i) + k - alen);
628:             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
629:           }
630:         });
631:       });
632:     ca_dual.modify_device();
633:     ci_dual.modify_device();
634:     cj_dual.modify_device();
635:     ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual);
636:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C);
637:   } else if (reuse == MAT_REUSE_MATRIX) {
640:     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
641:     ca   = ckok->a_dual.view_device();
642:     ci   = ckok->i_dual.view_device();

644:     Kokkos::parallel_for(
645:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
646:         PetscInt i    = t.league_rank(); /* row i */
647:         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
648:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
649:           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
650:           else ca(ci(i) + k) = ba(bi(i) + k - alen);
651:         });
652:       });
653:     MatSeqAIJKokkosModifyDevice(*C);
654:   }
655:   return 0;
656: }

658: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
659: {
660:   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
661:   return 0;
662: }

664: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
665: {
666:   Mat_Product                 *product = C->product;
667:   Mat                          A, B;
668:   bool                         transA, transB; /* use bool, since KK needs this type */
669:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
670:   Mat_SeqAIJ                  *c;
671:   MatProductData_SeqAIJKokkos *pdata;
672:   KokkosCsrMatrix             *csrmatA, *csrmatB;

674:   MatCheckProduct(C, 1);
676:   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

678:   if (pdata->reusesym) {           /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
679:     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
680:     return 0;
681:   }

683:   switch (product->type) {
684:   case MATPRODUCT_AB:
685:     transA = false;
686:     transB = false;
687:     break;
688:   case MATPRODUCT_AtB:
689:     transA = true;
690:     transB = false;
691:     break;
692:   case MATPRODUCT_ABt:
693:     transA = false;
694:     transB = true;
695:     break;
696:   default:
697:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
698:   }

700:   A = product->A;
701:   B = product->B;
702:   MatSeqAIJKokkosSyncDevice(A);
703:   MatSeqAIJKokkosSyncDevice(B);
704:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
705:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
706:   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);


710:   csrmatA = &akok->csrmat;
711:   csrmatB = &bkok->csrmat;

713:   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
714:   if (transA) {
715:     MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA);
716:     transA = false;
717:   }

719:   if (transB) {
720:     MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB);
721:     transB = false;
722:   }
723:   PetscLogGpuTimeBegin();
724:   KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat);
725:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
726:   if (spgemmHandle->get_sort_option() != 1) sort_crs_matrix(ckok->csrmat); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */

728:   PetscLogGpuTimeEnd();
729:   MatSeqAIJKokkosModifyDevice(C);
730:   /* shorter version of MatAssemblyEnd_SeqAIJ */
731:   c = (Mat_SeqAIJ *)C->data;
732:   PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz);
733:   PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n");
734:   PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax);
735:   c->reallocs         = 0;
736:   C->info.mallocs     = 0;
737:   C->info.nz_unneeded = 0;
738:   C->assembled = C->was_assembled = PETSC_TRUE;
739:   C->num_ass++;
740:   return 0;
741: }

743: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
744: {
745:   Mat_Product                 *product = C->product;
746:   MatProductType               ptype;
747:   Mat                          A, B;
748:   bool                         transA, transB;
749:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
750:   MatProductData_SeqAIJKokkos *pdata;
751:   MPI_Comm                     comm;
752:   KokkosCsrMatrix             *csrmatA, *csrmatB, csrmatC;

754:   MatCheckProduct(C, 1);
755:   PetscObjectGetComm((PetscObject)C, &comm);
757:   A = product->A;
758:   B = product->B;
759:   MatSeqAIJKokkosSyncDevice(A);
760:   MatSeqAIJKokkosSyncDevice(B);
761:   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
762:   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
763:   csrmatA = &akok->csrmat;
764:   csrmatB = &bkok->csrmat;

766:   ptype = product->type;
767:   switch (ptype) {
768:   case MATPRODUCT_AB:
769:     transA = false;
770:     transB = false;
771:     break;
772:   case MATPRODUCT_AtB:
773:     transA = true;
774:     transB = false;
775:     break;
776:   case MATPRODUCT_ABt:
777:     transA = false;
778:     transB = true;
779:     break;
780:   default:
781:     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
782:   }

784:   product->data = pdata = new MatProductData_SeqAIJKokkos();
785:   pdata->kh.set_team_work_size(16);
786:   pdata->kh.set_dynamic_scheduling(true);
787:   pdata->reusesym = product->api_user;

789:   /* TODO: add command line options to select spgemm algorithms */
790:   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

792:   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
793: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
794:   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
795:   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
796:   #endif
797: #endif

799:   pdata->kh.create_spgemm_handle(spgemm_alg);

801:   PetscLogGpuTimeBegin();
802:   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
803:   if (transA) {
804:     MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA);
805:     transA = false;
806:   }

808:   if (transB) {
809:     MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB);
810:     transB = false;
811:   }

813:   KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC);

815:   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
816:     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
817:     calling new Mat_SeqAIJKokkos().
818:     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
819:   */
820:   KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC);
821:   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
822:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
823:   if (spgemmHandle->get_sort_option() != 1) sort_crs_matrix(csrmatC); /* sort_option defaults to -1 in KK!*/
824:   PetscLogGpuTimeEnd();

826:   ckok = new Mat_SeqAIJKokkos(csrmatC);
827:   MatSetSeqAIJKokkosWithCSRMatrix(C, ckok);
828:   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
829:   return 0;
830: }

832: /* handles sparse matrix matrix ops */
833: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
834: {
835:   Mat_Product *product = mat->product;
836:   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

838:   MatCheckProduct(mat, 1);
839:   PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok);
840:   if (product->type == MATPRODUCT_ABC) PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok);
841:   if (Biskok && Ciskok) {
842:     switch (product->type) {
843:     case MATPRODUCT_AB:
844:     case MATPRODUCT_AtB:
845:     case MATPRODUCT_ABt:
846:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
847:       break;
848:     case MATPRODUCT_PtAP:
849:     case MATPRODUCT_RARt:
850:     case MATPRODUCT_ABC:
851:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
852:       break;
853:     default:
854:       break;
855:     }
856:   } else { /* fallback for AIJ */
857:     MatProductSetFromOptions_SeqAIJ(mat);
858:   }
859:   return 0;
860: }

862: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
863: {
864:   Mat_SeqAIJKokkos *aijkok;

866:   PetscLogGpuTimeBegin();
867:   MatSeqAIJKokkosSyncDevice(A);
868:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
869:   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
870:   MatSeqAIJKokkosModifyDevice(A);
871:   PetscLogGpuFlops(aijkok->a_dual.extent(0));
872:   PetscLogGpuTimeEnd();
873:   return 0;
874: }

876: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
877: {
878:   Mat_SeqAIJKokkos *aijkok;

880:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
881:   if (aijkok) { /* Only zero the device if data is already there */
882:     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
883:     MatSeqAIJKokkosModifyDevice(A);
884:   } else { /* Might be preallocated but not assembled */
885:     MatZeroEntries_SeqAIJ(A);
886:   }
887:   return 0;
888: }

890: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
891: {
892:   Mat_SeqAIJ           *aijseq;
893:   Mat_SeqAIJKokkos     *aijkok;
894:   PetscInt              n;
895:   PetscScalarKokkosView xv;

897:   VecGetLocalSize(x, &n);

901:   MatSeqAIJKokkosSyncDevice(A);
902:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

904:   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
905:     MatMarkDiagonal_SeqAIJ(A);
906:     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
907:     aijkok->SetDiagonal(aijseq->diag);
908:   }

910:   const auto &Aa    = aijkok->a_dual.view_device();
911:   const auto &Ai    = aijkok->i_dual.view_device();
912:   const auto &Adiag = aijkok->diag_dual.view_device();

914:   VecGetKokkosViewWrite(x, &xv);
915:   Kokkos::parallel_for(
916:     n, KOKKOS_LAMBDA(const PetscInt i) {
917:       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
918:       else xv(i) = 0;
919:     });
920:   VecRestoreKokkosViewWrite(x, &xv);
921:   return 0;
922: }

924: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
925: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
926: {
927:   Mat_SeqAIJKokkos *aijkok;

932:   MatSeqAIJKokkosSyncDevice(A);
933:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
934:   *kv    = aijkok->a_dual.view_device();
935:   return 0;
936: }

938: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
939: {
943:   return 0;
944: }

946: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
947: {
948:   Mat_SeqAIJKokkos *aijkok;

953:   MatSeqAIJKokkosSyncDevice(A);
954:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
955:   *kv    = aijkok->a_dual.view_device();
956:   return 0;
957: }

959: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
960: {
964:   MatSeqAIJKokkosModifyDevice(A);
965:   return 0;
966: }

968: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
969: {
970:   Mat_SeqAIJKokkos *aijkok;

975:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
976:   *kv    = aijkok->a_dual.view_device();
977:   return 0;
978: }

980: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
981: {
985:   MatSeqAIJKokkosModifyDevice(A);
986:   return 0;
987: }

989: /* Computes Y += alpha X */
990: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
991: {
992:   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
993:   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
994:   ConstMatScalarKokkosView Xa;
995:   MatScalarKokkosView      Ya;

999:   MatSeqAIJKokkosSyncDevice(Y);
1000:   MatSeqAIJKokkosSyncDevice(X);
1001:   PetscLogGpuTimeBegin();

1003:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1004:     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1005:     PetscBool e;
1006:     PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e);
1007:     if (e) {
1008:       PetscArraycmp(x->j, y->j, y->nz, &e);
1009:       if (e) pattern = SAME_NONZERO_PATTERN;
1010:     }
1011:   }

1013:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1014:     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1015:     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1016:     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1017:   */
1018:   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1019:   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1020:   Xa   = xkok->a_dual.view_device();
1021:   Ya   = ykok->a_dual.view_device();

1023:   if (pattern == SAME_NONZERO_PATTERN) {
1024:     KokkosBlas::axpy(alpha, Xa, Ya);
1025:     MatSeqAIJKokkosModifyDevice(Y);
1026:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1027:     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1028:     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

1030:     Kokkos::parallel_for(
1031:       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1032:         PetscInt i = t.league_rank();              /* row i */
1033:         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
1034:                                                    PetscInt p, q = Yi(i);
1035:                                                    for (p = Xi(i); p < Xi(i + 1); p++) {          /* For each nonzero on row i of X */
1036:                                                      while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */
1037:                                                      if (Xj(p) == Yj(q)) {                        /* Found it */
1038:                                                        Ya(q) += alpha * Xa(p);
1039:                                                        q++;
1040:                                                      } else {
1041:                                                        /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1042:                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1043:             */
1044:                                                        if (Yi(i) != Yi(i + 1))
1045:                                                          Ya(Yi(i)) =
1046: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99)
1047:                                                            Kokkos::nan("1"); /* auto promote the double NaN if needed */
1048: #else
1049:               Kokkos::Experimental::nan("1");
1050: #endif
1051:                                                      }
1052:                                                    }
1053:         });
1054:       });
1055:     MatSeqAIJKokkosModifyDevice(Y);
1056:   } else { /* different nonzero patterns */
1057:     Mat             Z;
1058:     KokkosCsrMatrix zcsr;
1059:     KernelHandle    kh;
1060:     kh.create_spadd_handle(false);
1061:     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1062:     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1063:     zkok = new Mat_SeqAIJKokkos(zcsr);
1064:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z);
1065:     MatHeaderReplace(Y, &Z);
1066:     kh.destroy_spadd_handle();
1067:   }
1068:   PetscLogGpuTimeEnd();
1069:   PetscLogGpuFlops(xkok->a_dual.extent(0) * 2); /* Because we scaled X and then added it to Y */
1070:   return 0;
1071: }

1073: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1074: {
1075:   Mat_SeqAIJKokkos *akok;
1076:   Mat_SeqAIJ       *aseq;

1078:   MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j);
1079:   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1080:   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1081:   delete akok;
1082:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1083:   MatZeroEntries_SeqAIJKokkos(mat);
1084:   akok->SetUpCOO(aseq);
1085:   return 0;
1086: }

1088: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1089: {
1090:   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1091:   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1092:   PetscCount                  Annz = aseq->nz;
1093:   const PetscCountKokkosView &jmap = akok->jmap_d;
1094:   const PetscCountKokkosView &perm = akok->perm_d;
1095:   MatScalarKokkosView         Aa;
1096:   ConstMatScalarKokkosView    kv;
1097:   PetscMemType                memtype;

1099:   PetscGetMemType(v, &memtype);
1100:   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1101:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1102:   } else {
1103:     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1104:   }

1106:   if (imode == INSERT_VALUES) MatSeqAIJGetKokkosViewWrite(A, &Aa); /* write matrix values */
1107:   else MatSeqAIJGetKokkosView(A, &Aa);                             /* read & write matrix values */

1109:   Kokkos::parallel_for(
1110:     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1111:       PetscScalar sum = 0.0;
1112:       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1113:       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1114:     });

1116:   if (imode == INSERT_VALUES) MatSeqAIJRestoreKokkosViewWrite(A, &Aa);
1117:   else MatSeqAIJRestoreKokkosView(A, &Aa);
1118:   return 0;
1119: }

1121: PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1122: {
1123:   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1124:   MatScalarKokkosView        Aa;
1125:   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1126:   PetscInt                   m  = A->rmap->n;
1127:   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */

1129:   MatSeqAIJGetKokkosViewWrite(A, &Aa);
1130:   Kokkos::parallel_for(
1131:     m, KOKKOS_LAMBDA(const PetscInt i) {
1132:       PetscScalar tmp;
1133:       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1134:         tmp          = Aa(Ai(i));
1135:         Aa(Ai(i))    = Aa(Adiag(i));
1136:         Aa(Adiag(i)) = tmp;
1137:       }
1138:     });
1139:   MatSeqAIJRestoreKokkosViewWrite(A, &Aa);
1140:   return 0;
1141: }

1143: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1144: {
1145:   MatSeqAIJKokkosSyncHost(A);
1146:   MatLUFactorNumeric_SeqAIJ(B, A, info);
1147:   B->offloadmask = PETSC_OFFLOAD_CPU;
1148:   return 0;
1149: }

1151: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1152: {
1153:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

1155:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1156:   A->boundtocpu  = PETSC_FALSE;

1158:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1159:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1160:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1161:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1162:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1163:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1164:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1165:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1166:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1167:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1168:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1169:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1170:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1171:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1172:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1173:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1174:   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1175:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1176:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1177:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1178:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1179:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1180:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1181:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1183:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos);
1184:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);
1185:   return 0;
1186: }

1188: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1189: {
1190:   Mat_SeqAIJ *aseq;
1191:   PetscInt    i, m, n;


1195:   m = akok->nrows();
1196:   n = akok->ncols();
1197:   MatSetSizes(A, m, n, m, n);
1198:   MatSetType(A, MATSEQAIJKOKKOS);

1200:   /* Set up data structures of A as a MATSEQAIJ */
1201:   MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL);
1202:   aseq = (Mat_SeqAIJ *)(A)->data;

1204:   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1205:   akok->j_dual.sync_host();

1207:   aseq->i            = akok->i_host_data();
1208:   aseq->j            = akok->j_host_data();
1209:   aseq->a            = akok->a_host_data();
1210:   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1211:   aseq->singlemalloc = PETSC_FALSE;
1212:   aseq->free_a       = PETSC_FALSE;
1213:   aseq->free_ij      = PETSC_FALSE;
1214:   aseq->nz           = akok->nnz();
1215:   aseq->maxnz        = aseq->nz;

1217:   PetscMalloc1(m, &aseq->imax);
1218:   PetscMalloc1(m, &aseq->ilen);
1219:   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

1221:   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1222:   akok->nonzerostate = A->nonzerostate;
1223:   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1224:   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
1225:   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
1226:   return 0;
1227: }

1229: /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1231:    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1232:  */
1233: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1234: {
1235:   MatCreate(comm, A);
1236:   MatSetSeqAIJKokkosWithCSRMatrix(*A, akok);
1237:   return 0;
1238: }

1240: /* --------------------------------------------------------------------------------*/
1241: /*@C
1242:    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1243:    (the default parallel PETSc format). This matrix will ultimately be handled by
1244:    Kokkos for calculations. For good matrix
1245:    assembly performance the user should preallocate the matrix storage by setting
1246:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1247:    performance during matrix assembly can be increased by more than a factor of 50.

1249:    Collective

1251:    Input Parameters:
1252: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1253: .  m - number of rows
1254: .  n - number of columns
1255: .  nz - number of nonzeros per row (same for all rows)
1256: -  nnz - array containing the number of nonzeros in the various rows
1257:          (possibly different for each row) or NULL

1259:    Output Parameter:
1260: .  A - the matrix

1262:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1263:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1264:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1266:    Notes:
1267:    If nnz is given then nz is ignored

1269:    The AIJ format, also called
1270:    compressed row storage, is fully compatible with standard Fortran 77
1271:    storage.  That is, the stored row and column indices can begin at
1272:    either one (as in Fortran) or zero.  See the users' manual for details.

1274:    Specify the preallocated storage with either nz or nnz (not both).
1275:    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
1276:    allocation.  For large problems you MUST preallocate memory or you
1277:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1279:    By default, this format uses inodes (identical nodes) when possible, to
1280:    improve numerical efficiency of matrix-vector products and solves. We
1281:    search for consecutive rows with the same nonzero structure, thereby
1282:    reusing matrix information to achieve increased efficiency.

1284:    Level: intermediate

1286: .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1287: @*/
1288: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1289: {
1290:   PetscKokkosInitializeCheck();
1291:   MatCreate(comm, A);
1292:   MatSetSizes(*A, m, n, m, n);
1293:   MatSetType(*A, MATSEQAIJKOKKOS);
1294:   MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz);
1295:   return 0;
1296: }

1298: typedef Kokkos::TeamPolicy<>::member_type team_member;
1299: //
1300: // This factorization exploits block diagonal matrices with "Nf" (not used).
1301: // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1302: //
1303: static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1304: {
1305:   Mat_SeqAIJ       *b      = (Mat_SeqAIJ *)B->data;
1306:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1307:   IS                isrow = b->row, isicol = b->icol;
1308:   const PetscInt   *r_h, *ic_h;
1309:   const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1310:   const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1311:   PetscScalar       *ba_d = baijkok->a_dual.view_device().data();
1312:   PetscBool          row_identity, col_identity;
1313:   PetscInt           nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used

1316:   MatIsStructurallySymmetric(A, &row_identity);
1318:   ISGetIndices(isrow, &r_h);
1319:   ISGetIndices(isicol, &ic_h);
1320:   ISGetSize(isicol, &nc);
1321:   PetscLogGpuTimeBegin();
1322:   MatSeqAIJKokkosSyncDevice(A);
1323:   {
1324: #define KOKKOS_SHARED_LEVEL 1
1325:     using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1326:     using sizet_scr_t  = Kokkos::View<size_t, scr_mem_t>;
1327:     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1328:     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1329:     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_r_k("r", n);
1330:     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1331:     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_ic_k("ic", nc);
1332:     size_t                                                                                                               flops_h = 0.0;
1333:     Kokkos::View<size_t, Kokkos::HostSpace>                                                                              h_flops_k(&flops_h);
1334:     Kokkos::View<size_t>                                                                                                 d_flops_k("flops");
1335:     const int                                                                                                            conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1336:     const int                                                                                                            nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1;
1337:     Kokkos::deep_copy(d_flops_k, h_flops_k);
1338:     Kokkos::deep_copy(d_r_k, h_r_k);
1339:     Kokkos::deep_copy(d_ic_k, h_ic_k);
1340:     // Fill A --> fact
1341:     Kokkos::parallel_for(
1342:       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1343:         const PetscInt  field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1344:         const PetscInt  nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1345:         const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1346:         // zero rows of B
1347:         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1348:           PetscInt     nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1349:           PetscScalar *baL = ba_d + bi_d[rowb];
1350:           PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1351:           /* zero (unfactored row) */
1352:           for (int j = 0; j < nzbL; j++) baL[j] = 0;
1353:           for (int j = 0; j < nzbU; j++) baU[j] = 0;
1354:         });
1355:         // copy A into B
1356:         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1357:           PetscInt           rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1358:           const PetscScalar *av    = aa_d + ai_d[rowa];
1359:           const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1360:           /* load in initial (unfactored row) */
1361:           for (int j = 0; j < nza; j++) {
1362:             PetscInt    colb = ic[ajtmp[j]];
1363:             PetscScalar vala = av[j];
1364:             if (colb == rowb) {
1365:               *(ba_d + bdiag_d[rowb]) = vala;
1366:             } else {
1367:               const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1368:               PetscScalar    *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1369:               PetscInt        nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1370:               for (int j = 0; j < nz; j++) {
1371:                 if (pbj[j] == colb) {
1372:                   pba[j] = vala;
1373:                   set++;
1374:                   break;
1375:                 }
1376:               }
1377: #if !defined(PETSC_HAVE_SYCL)
1378:               if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1379: #endif
1380:             }
1381:           }
1382:         });
1383:       });
1384:     Kokkos::fence();

1386:     Kokkos::parallel_for(
1387:       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1388:         sizet_scr_t    colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1389:         scalar_scr_t   L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1390:         sizet_scr_t    flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1391:         const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1392:         const PetscInt start = field * nloc, end = start + nloc;
1393:         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1394:         // A22 panel update for each row A(1,:) and col A(:,1)
1395:         for (int ii = start; ii < end - 1; ii++) {
1396:           const PetscInt    *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1397:           const PetscScalar *baUi    = ba_d + bdiag_d[ii + 1] + 1;                                          // vector of data  U(i,i+1:end)
1398:           const PetscInt     nUi_its = nzUi / Ni + !!(nzUi % Ni);
1399:           const PetscScalar  Bii     = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1400:           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1401:             PetscInt kIdx = j * Ni + field_block_idx;
1402:             if (kIdx >= nzUi) /* void */
1403:               ;
1404:             else {
1405:               const PetscInt  myk = bjUi[kIdx];                // assume symmetric structure, need a transposed meta-data here in general
1406:               const PetscInt *pjL = bj_d + bi_d[myk];          // look for L(myk,ii) in start of row
1407:               const PetscInt  nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1408:               size_t          st_idx;
1409:               // find and do L(k,i) = A(:k,i) / A(i,i)
1410:               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1411:               // get column, there has got to be a better way
1412:               Kokkos::parallel_reduce(
1413:                 Kokkos::ThreadVectorRange(team, nzL),
1414:                 [&](const int &j, size_t &idx) {
1415:                   if (pjL[j] == ii) {
1416:                     PetscScalar *pLki = ba_d + bi_d[myk] + j;
1417:                     idx               = j;           // output
1418:                     *pLki             = *pLki / Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1419:                   }
1420:                 },
1421:                 st_idx);
1422:               Kokkos::single(Kokkos::PerThread(team), [=]() {
1423:                 colkIdx() = st_idx;
1424:                 L_ki()    = *(ba_d + bi_d[myk] + st_idx);
1425:               });
1426: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1427:               if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1428: #endif
1429:               // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1430:               // U(i+1,:end)
1431:               Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1432:                 PetscScalar Uij = baUi[uiIdx];
1433:                 PetscInt    col = bjUi[uiIdx];
1434:                 if (col == myk) {
1435:                   // A_kk = A_kk - L_ki * U_ij(k)
1436:                   PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1437:                   *Akkv             = *Akkv - L_ki() * Uij;  // UiK
1438:                 } else {
1439:                   PetscScalar    *start, *end, *pAkjv = NULL;
1440:                   PetscInt        high, low;
1441:                   const PetscInt *startj;
1442:                   if (col < myk) { // L
1443:                     PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1444:                     PetscInt     idx  = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1445:                     start             = pLki + 1;                        // start at pLki+1, A22(myk,1)
1446:                     startj            = bj_d + bi_d[myk] + idx;
1447:                     end               = ba_d + bi_d[myk + 1];
1448:                   } else {
1449:                     PetscInt idx = bdiag_d[myk + 1] + 1;
1450:                     start        = ba_d + idx;
1451:                     startj       = bj_d + idx;
1452:                     end          = ba_d + bdiag_d[myk];
1453:                   }
1454:                   // search for 'col', use bisection search - TODO
1455:                   low  = 0;
1456:                   high = (PetscInt)(end - start);
1457:                   while (high - low > 5) {
1458:                     int t = (low + high) / 2;
1459:                     if (startj[t] > col) high = t;
1460:                     else low = t;
1461:                   }
1462:                   for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1463:                     if (startj[pAkjv - start] == col) break;
1464:                   }
1465: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1466:                   if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1467: #endif
1468:                   *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1469:                 }
1470:               });
1471:             }
1472:           });
1473:           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1474:           if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1475:         } /* endof for (i=0; i<n; i++) { */
1476:         Kokkos::single(Kokkos::PerTeam(team), [=]() {
1477:           Kokkos::atomic_add(&d_flops_k(), flops());
1478:           flops() = 0;
1479:         });
1480:       });
1481:     Kokkos::fence();
1482:     Kokkos::deep_copy(h_flops_k, d_flops_k);
1483:     PetscLogGpuFlops((PetscLogDouble)h_flops_k());
1484:     Kokkos::parallel_for(
1485:       Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1486:         const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni;                            //, field_offset = lg_rank%Ni;
1487:         const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1488:         /* Invert diagonal for simpler triangular solves */
1489:         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1490:           int i = start + outer_index * Ni + lg_rank % Ni;
1491:           if (i < end) {
1492:             PetscScalar *pv = ba_d + bdiag_d[i];
1493:             *pv             = 1.0 / (*pv);
1494:           }
1495:         });
1496:       });
1497:   }
1498:   PetscLogGpuTimeEnd();
1499:   ISRestoreIndices(isicol, &ic_h);
1500:   ISRestoreIndices(isrow, &r_h);

1502:   ISIdentity(isrow, &row_identity);
1503:   ISIdentity(isicol, &col_identity);
1504:   if (b->inode.size) {
1505:     B->ops->solve = MatSolve_SeqAIJ_Inode;
1506:   } else if (row_identity && col_identity) {
1507:     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1508:   } else {
1509:     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1510:   }
1511:   B->offloadmask = PETSC_OFFLOAD_GPU;
1512:   MatSeqAIJKokkosSyncHost(B);          // solve on CPU
1513:   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1514:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1515:   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1516:   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1517:   B->assembled              = PETSC_TRUE;
1518:   B->preallocated           = PETSC_TRUE;

1520:   return 0;
1521: }

1523: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1524: {
1525:   MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1526:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1527:   return 0;
1528: }

1530: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1531: {
1532:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1534:   if (!factors->sptrsv_symbolic_completed) {
1535:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1536:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1537:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1538:   }
1539:   return 0;
1540: }

1542: /* Check if we need to update factors etc for transpose solve */
1543: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1544: {
1545:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1546:   MatColIdxType               n       = A->rmap->n;

1548:   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1549:     /* Update L^T and do sptrsv symbolic */
1550:     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1551:     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1552:     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1553:     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));

1555:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1556:                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);

1558:     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1559:       We have to sort the indices, until KK provides finer control options.
1560:     */
1561:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);

1563:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);

1565:     /* Update U^T and do sptrsv symbolic */
1566:     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1567:     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1568:     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1569:     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));

1571:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1572:                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);

1574:     /* Sort indices. See comments above */
1575:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);

1577:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1578:     factors->transpose_updated = PETSC_TRUE;
1579:   }
1580:   return 0;
1581: }

1583: /* Solve Ax = b, with A = LU */
1584: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1585: {
1586:   ConstPetscScalarKokkosView  bv;
1587:   PetscScalarKokkosView       xv;
1588:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1590:   PetscLogGpuTimeBegin();
1591:   MatSeqAIJKokkosSymbolicSolveCheck(A);
1592:   VecGetKokkosView(b, &bv);
1593:   VecGetKokkosViewWrite(x, &xv);
1594:   /* Solve L tmpv = b */
1595:   KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector);
1596:   /* Solve Ux = tmpv */
1597:   KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv);
1598:   VecRestoreKokkosView(b, &bv);
1599:   VecRestoreKokkosViewWrite(x, &xv);
1600:   PetscLogGpuTimeEnd();
1601:   return 0;
1602: }

1604: /* Solve A^T x = b, where A^T = U^T L^T */
1605: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1606: {
1607:   ConstPetscScalarKokkosView  bv;
1608:   PetscScalarKokkosView       xv;
1609:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1611:   PetscLogGpuTimeBegin();
1612:   MatSeqAIJKokkosTransposeSolveCheck(A);
1613:   VecGetKokkosView(b, &bv);
1614:   VecGetKokkosViewWrite(x, &xv);
1615:   /* Solve U^T tmpv = b */
1616:   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);

1618:   /* Solve L^T x = tmpv */
1619:   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1620:   VecRestoreKokkosView(b, &bv);
1621:   VecRestoreKokkosViewWrite(x, &xv);
1622:   PetscLogGpuTimeEnd();
1623:   return 0;
1624: }

1626: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1627: {
1628:   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1629:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1630:   PetscInt                    fill_lev = info->levels;

1632:   PetscLogGpuTimeBegin();
1633:   MatSeqAIJKokkosSyncDevice(A);

1635:   auto a_d = aijkok->a_dual.view_device();
1636:   auto i_d = aijkok->i_dual.view_device();
1637:   auto j_d = aijkok->j_dual.view_device();

1639:   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);

1641:   B->assembled              = PETSC_TRUE;
1642:   B->preallocated           = PETSC_TRUE;
1643:   B->ops->solve             = MatSolve_SeqAIJKokkos;
1644:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1645:   B->ops->matsolve          = NULL;
1646:   B->ops->matsolvetranspose = NULL;
1647:   B->offloadmask            = PETSC_OFFLOAD_GPU;

1649:   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1650:   factors->transpose_updated         = PETSC_FALSE;
1651:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1652:   /* TODO: log flops, but how to know that? */
1653:   PetscLogGpuTimeEnd();
1654:   return 0;
1655: }

1657: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1658: {
1659:   Mat_SeqAIJKokkos           *aijkok;
1660:   Mat_SeqAIJ                 *b;
1661:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1662:   PetscInt                    fill_lev = info->levels;
1663:   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1664:   PetscInt                    n        = A->rmap->n;

1666:   MatSeqAIJKokkosSyncDevice(A);
1667:   /* Rebuild factors */
1668:   if (factors) {
1669:     factors->Destroy();
1670:   } /* Destroy the old if it exists */
1671:   else {
1672:     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1673:   }

1675:   /* Create a spiluk handle and then do symbolic factorization */
1676:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1677:   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

1679:   auto spiluk_handle = factors->kh.get_spiluk_handle();

1681:   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1682:   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1683:   Kokkos::realloc(factors->iU_d, n + 1);
1684:   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

1686:   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1687:   auto i_d = aijkok->i_dual.view_device();
1688:   auto j_d = aijkok->j_dual.view_device();
1689:   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1690:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

1692:   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1693:   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1694:   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1695:   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

1697:   /* TODO: add options to select sptrsv algorithms */
1698:   /* Create sptrsv handles for L, U and their transpose */
1699: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1700:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1701: #else
1702:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1703: #endif

1705:   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1706:   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1707:   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1708:   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

1710:   /* Fill fields of the factor matrix B */
1711:   MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL);
1712:   b     = (Mat_SeqAIJ *)B->data;
1713:   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1714:   B->info.fill_ratio_given  = info->fill;
1715:   B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA);

1717:   B->offloadmask          = PETSC_OFFLOAD_GPU;
1718:   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1719:   return 0;
1720: }

1722: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1723: {
1724:   Mat_SeqAIJ    *b     = (Mat_SeqAIJ *)B->data;
1725:   const PetscInt nrows = A->rmap->n;

1727:   MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1728:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1729:   // move B data into Kokkos
1730:   MatSeqAIJKokkosSyncDevice(B); // create aijkok
1731:   MatSeqAIJKokkosSyncDevice(A); // create aijkok
1732:   {
1733:     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1734:     if (!baijkok->diag_d.extent(0)) {
1735:       const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1736:       baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1737:       Kokkos::deep_copy(baijkok->diag_d, h_diag);
1738:     }
1739:   }
1740:   return 0;
1741: }

1743: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1744: {
1745:   *type = MATSOLVERKOKKOS;
1746:   return 0;
1747: }

1749: static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1750: {
1751:   *type = MATSOLVERKOKKOSDEVICE;
1752:   return 0;
1753: }

1755: /*MC
1756:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1757:   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.

1759:   Level: beginner

1761: .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1762: M*/
1763: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1764: {
1765:   PetscInt n = A->rmap->n;

1767:   MatCreate(PetscObjectComm((PetscObject)A), B);
1768:   MatSetSizes(*B, n, n, n, n);
1769:   (*B)->factortype = ftype;
1770:   PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
1771:   MatSetType(*B, MATSEQAIJKOKKOS);

1773:   if (ftype == MAT_FACTOR_LU) {
1774:     MatSetBlockSizesFromMats(*B, A, A);
1775:     (*B)->canuseordering        = PETSC_TRUE;
1776:     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1777:   } else if (ftype == MAT_FACTOR_ILU) {
1778:     MatSetBlockSizesFromMats(*B, A, A);
1779:     (*B)->canuseordering         = PETSC_FALSE;
1780:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1781:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

1783:   MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL);
1784:   PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos);
1785:   return 0;
1786: }

1788: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1789: {
1790:   PetscInt n = A->rmap->n;

1792:   MatCreate(PetscObjectComm((PetscObject)A), B);
1793:   MatSetSizes(*B, n, n, n, n);
1794:   (*B)->factortype     = ftype;
1795:   (*B)->canuseordering = PETSC_TRUE;
1796:   PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
1797:   MatSetType(*B, MATSEQAIJKOKKOS);

1799:   if (ftype == MAT_FACTOR_LU) {
1800:     MatSetBlockSizesFromMats(*B, A, A);
1801:     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1802:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");

1804:   MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL);
1805:   PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device);
1806:   return 0;
1807: }

1809: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1810: {
1811:   MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos);
1812:   MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos);
1813:   MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device);
1814:   return 0;
1815: }

1817: /* Utility to print out a KokkosCsrMatrix for debugging */
1818: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1819: {
1820:   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1821:   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1822:   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1823:   const PetscInt    *i  = iv.data();
1824:   const PetscInt    *j  = jv.data();
1825:   const PetscScalar *a  = av.data();
1826:   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();

1828:   PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz);
1829:   for (PetscInt k = 0; k < m; k++) {
1830:     PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k);
1831:     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]));
1832:     PetscPrintf(PETSC_COMM_SELF, "\n");
1833:   }
1834:   return 0;
1835: }