Actual source code: aijkok.kokkos.cxx

  1: #include <petsc_kokkos.hpp>
  2: #include <petscvec_kokkos.hpp>
  3: #include <petscpkg_version.h>
  4: #include <petsc/private/petscimpl.h>
  5: #include <petsc/private/sfimpl.h>
  6: #include <petscsystypes.h>
  7: #include <petscerror.h>

  9: #include <Kokkos_Core.hpp>
 10: #include <KokkosBlas.hpp>
 11: #include <KokkosSparse_CrsMatrix.hpp>
 12: #include <KokkosSparse_spmv.hpp>
 13: #include <KokkosSparse_spiluk.hpp>
 14: #include <KokkosSparse_sptrsv.hpp>
 15: #include <KokkosSparse_spgemm.hpp>
 16: #include <KokkosSparse_spadd.hpp>
 17: #include <KokkosBatched_LU_Decl.hpp>
 18: #include <KokkosBatched_InverseLU_Decl.hpp>

 20: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

 22: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
 23:   #include <KokkosSparse_Utils.hpp>
 24: using KokkosSparse::sort_crs_matrix;
 25: using KokkosSparse::Impl::transpose_matrix;
 26: #else
 27:   #include <KokkosKernels_Sorting.hpp>
 28: using KokkosKernels::sort_crs_matrix;
 29: using KokkosKernels::Impl::transpose_matrix;
 30: #endif

 32: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

 34: /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
 35:    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
 36:    In the latter case, it is important to set a_dual's sync state correctly.
 37:  */
 38: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
 39: {
 40:   Mat_SeqAIJ       *aijseq;
 41:   Mat_SeqAIJKokkos *aijkok;

 43:   PetscFunctionBegin;
 44:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
 45:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

 47:   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
 48:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 50:   /* If aijkok does not exist, we just copy i, j to device.
 51:      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
 52:      In both cases, we build a new aijkok structure.
 53:   */
 54:   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
 55:     delete aijkok;
 56:     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
 57:     A->spptr = aijkok;
 58:   }

 60:   PetscFunctionReturn(PETSC_SUCCESS);
 61: }

 63: /* Sync CSR data to device if not yet */
 64: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
 65: {
 66:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 68:   PetscFunctionBegin;
 69:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
 70:   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
 71:   if (aijkok->a_dual.need_sync_device()) {
 72:     aijkok->a_dual.sync_device();
 73:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
 74:     aijkok->hermitian_updated = PETSC_FALSE;
 75:   }
 76:   PetscFunctionReturn(PETSC_SUCCESS);
 77: }

 79: /* Mark the CSR data on device as modified */
 80: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
 81: {
 82:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 84:   PetscFunctionBegin;
 85:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
 86:   aijkok->a_dual.clear_sync_state();
 87:   aijkok->a_dual.modify_device();
 88:   aijkok->transpose_updated = PETSC_FALSE;
 89:   aijkok->hermitian_updated = PETSC_FALSE;
 90:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
 91:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
 92:   PetscFunctionReturn(PETSC_SUCCESS);
 93: }

 95: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
 96: {
 97:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
 98:   auto             &exec   = PetscGetKokkosExecutionSpace();

100:   PetscFunctionBegin;
101:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
102:   /* We do not expect one needs factors on host  */
103:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
104:   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
105:   PetscCallCXX(aijkok->a_dual.sync_host(exec));
106:   PetscCallCXX(exec.fence());
107:   PetscFunctionReturn(PETSC_SUCCESS);
108: }

110: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
111: {
112:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

114:   PetscFunctionBegin;
115:   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
116:     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
117:     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
118:     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
119:   */
120:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
121:     auto &exec = PetscGetKokkosExecutionSpace();
122:     PetscCallCXX(aijkok->a_dual.sync_host(exec));
123:     PetscCallCXX(exec.fence());
124:     *array = aijkok->a_dual.view_host().data();
125:   } else { /* Happens when calling MatSetValues on a newly created matrix */
126:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
127:   }
128:   PetscFunctionReturn(PETSC_SUCCESS);
129: }

131: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
132: {
133:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

135:   PetscFunctionBegin;
136:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
137:   PetscFunctionReturn(PETSC_SUCCESS);
138: }

140: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
141: {
142:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

144:   PetscFunctionBegin;
145:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
146:     auto &exec = PetscGetKokkosExecutionSpace();
147:     PetscCallCXX(aijkok->a_dual.sync_host(exec));
148:     PetscCallCXX(exec.fence());
149:     *array = aijkok->a_dual.view_host().data();
150:   } else {
151:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
152:   }
153:   PetscFunctionReturn(PETSC_SUCCESS);
154: }

156: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
157: {
158:   PetscFunctionBegin;
159:   *array = NULL;
160:   PetscFunctionReturn(PETSC_SUCCESS);
161: }

163: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
164: {
165:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

167:   PetscFunctionBegin;
168:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
169:     *array = aijkok->a_dual.view_host().data();
170:   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
171:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
172:   }
173:   PetscFunctionReturn(PETSC_SUCCESS);
174: }

176: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
177: {
178:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

180:   PetscFunctionBegin;
181:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
182:     aijkok->a_dual.clear_sync_state();
183:     aijkok->a_dual.modify_host();
184:   }
185:   PetscFunctionReturn(PETSC_SUCCESS);
186: }

188: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
189: {
190:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

192:   PetscFunctionBegin;
193:   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

195:   if (i) *i = aijkok->i_device_data();
196:   if (j) *j = aijkok->j_device_data();
197:   if (a) {
198:     aijkok->a_dual.sync_device();
199:     *a = aijkok->a_device_data();
200:   }
201:   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
202:   PetscFunctionReturn(PETSC_SUCCESS);
203: }

205: /*
206:   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

208:   Input Parameter:
209: .  A       - the MATSEQAIJKOKKOS matrix

211:   Output Parameters:
212: +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
213: -  T_d    - the transpose on device, whose value array is allocated but not initialized
214: */
215: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
216: {
217:   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
218:   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
219:   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
220:   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
221:   MatRowMapType          *Ti = Ti_h.data();
222:   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
223:   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
224:   PetscInt               *Tj   = Tj_h.data();
225:   PetscInt               *perm = perm_h.data();
226:   PetscInt               *offset;

228:   PetscFunctionBegin;
229:   // Populate Ti
230:   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
231:   Ti++;
232:   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
233:   Ti--;
234:   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

236:   // Populate Tj and the permutation array
237:   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
238:   for (PetscInt i = 0; i < m; i++) {
239:     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
240:       PetscInt r    = Aj[j];                       // row r of T
241:       PetscInt disp = Ti[r] + offset[r];

243:       Tj[disp]   = i; // col i of T
244:       perm[disp] = j;
245:       offset[r]++;
246:     }
247:   }
248:   PetscCall(PetscFree(offset));

250:   // Sort each row of T, along with the permutation array
251:   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

253:   // Output perm and T on device
254:   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
255:   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
256:   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
257:   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
258:   PetscFunctionReturn(PETSC_SUCCESS);
259: }

261: // Generate the transpose on device and cache it internally
262: // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
263: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
264: {
265:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
266:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
267:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
268:   KokkosCsrMatrix  &T = akok->csrmatT;

270:   PetscFunctionBegin;
271:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
272:   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

274:   const auto &Aa = akok->a_dual.view_device();

276:   if (A->symmetric == PETSC_BOOL3_TRUE) {
277:     *csrmatT = akok->csrmat;
278:   } else {
279:     // See if we already have a cached transpose and its value is up to date
280:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
281:       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
282:         const auto &perm = akok->transpose_perm; // get the permutation array
283:         auto       &Ta   = T.values;

285:         PetscCallCXX(Kokkos::parallel_for(
286:           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
287:       }
288:     } else { // Generate T of size n x m for the first time
289:       MatRowMapKokkosView perm;

291:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
292:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
293:       PetscCallCXX(Kokkos::parallel_for(
294:         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
295:     }
296:     akok->transpose_updated = PETSC_TRUE;
297:     *csrmatT                = akok->csrmatT;
298:   }
299:   PetscFunctionReturn(PETSC_SUCCESS);
300: }

302: // Generate the Hermitian on device and cache it internally
303: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
304: {
305:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
306:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
307:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
308:   KokkosCsrMatrix  &T = akok->csrmatH;

310:   PetscFunctionBegin;
311:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
312:   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

314:   const auto &Aa = akok->a_dual.view_device();

316:   if (A->hermitian == PETSC_BOOL3_TRUE) {
317:     *csrmatH = akok->csrmat;
318:   } else {
319:     // See if we already have a cached hermitian and its value is up to date
320:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
321:       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
322:         const auto &perm = akok->transpose_perm; // get the permutation array
323:         auto       &Ta   = T.values;

325:         PetscCallCXX(Kokkos::parallel_for(
326:           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
327:       }
328:     } else { // Generate T of size n x m for the first time
329:       MatRowMapKokkosView perm;

331:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
332:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333:       PetscCallCXX(Kokkos::parallel_for(
334:         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
335:     }
336:     akok->hermitian_updated = PETSC_TRUE;
337:     *csrmatH                = akok->csrmatH;
338:   }
339:   PetscFunctionReturn(PETSC_SUCCESS);
340: }

342: /* y = A x */
343: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
344: {
345:   Mat_SeqAIJKokkos          *aijkok;
346:   ConstPetscScalarKokkosView xv;
347:   PetscScalarKokkosView      yv;

349:   PetscFunctionBegin;
350:   PetscCall(PetscLogGpuTimeBegin());
351:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
352:   PetscCall(VecGetKokkosView(xx, &xv));
353:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
354:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
355:   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
356:   PetscCall(VecRestoreKokkosView(xx, &xv));
357:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
358:   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
359:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
360:   PetscCall(PetscLogGpuTimeEnd());
361:   PetscFunctionReturn(PETSC_SUCCESS);
362: }

364: /* y = A^T x */
365: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
366: {
367:   Mat_SeqAIJKokkos          *aijkok;
368:   const char                *mode;
369:   ConstPetscScalarKokkosView xv;
370:   PetscScalarKokkosView      yv;
371:   KokkosCsrMatrix            csrmat;

373:   PetscFunctionBegin;
374:   PetscCall(PetscLogGpuTimeBegin());
375:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
376:   PetscCall(VecGetKokkosView(xx, &xv));
377:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
378:   if (A->form_explicit_transpose) {
379:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
380:     mode = "N";
381:   } else {
382:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
383:     csrmat = aijkok->csrmat;
384:     mode   = "T";
385:   }
386:   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
387:   PetscCall(VecRestoreKokkosView(xx, &xv));
388:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
389:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
390:   PetscCall(PetscLogGpuTimeEnd());
391:   PetscFunctionReturn(PETSC_SUCCESS);
392: }

394: /* y = A^H x */
395: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
396: {
397:   Mat_SeqAIJKokkos          *aijkok;
398:   const char                *mode;
399:   ConstPetscScalarKokkosView xv;
400:   PetscScalarKokkosView      yv;
401:   KokkosCsrMatrix            csrmat;

403:   PetscFunctionBegin;
404:   PetscCall(PetscLogGpuTimeBegin());
405:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
406:   PetscCall(VecGetKokkosView(xx, &xv));
407:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
408:   if (A->form_explicit_transpose) {
409:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
410:     mode = "N";
411:   } else {
412:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
413:     csrmat = aijkok->csrmat;
414:     mode   = "C";
415:   }
416:   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
417:   PetscCall(VecRestoreKokkosView(xx, &xv));
418:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
419:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
420:   PetscCall(PetscLogGpuTimeEnd());
421:   PetscFunctionReturn(PETSC_SUCCESS);
422: }

424: /* z = A x + y */
425: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
426: {
427:   Mat_SeqAIJKokkos          *aijkok;
428:   ConstPetscScalarKokkosView xv, yv;
429:   PetscScalarKokkosView      zv;

431:   PetscFunctionBegin;
432:   PetscCall(PetscLogGpuTimeBegin());
433:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
434:   PetscCall(VecGetKokkosView(xx, &xv));
435:   PetscCall(VecGetKokkosView(yy, &yv));
436:   PetscCall(VecGetKokkosViewWrite(zz, &zv));
437:   if (zz != yy) Kokkos::deep_copy(zv, yv);
438:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
439:   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
440:   PetscCall(VecRestoreKokkosView(xx, &xv));
441:   PetscCall(VecRestoreKokkosView(yy, &yv));
442:   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
443:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
444:   PetscCall(PetscLogGpuTimeEnd());
445:   PetscFunctionReturn(PETSC_SUCCESS);
446: }

448: /* z = A^T x + y */
449: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
450: {
451:   Mat_SeqAIJKokkos          *aijkok;
452:   const char                *mode;
453:   ConstPetscScalarKokkosView xv, yv;
454:   PetscScalarKokkosView      zv;
455:   KokkosCsrMatrix            csrmat;

457:   PetscFunctionBegin;
458:   PetscCall(PetscLogGpuTimeBegin());
459:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
460:   PetscCall(VecGetKokkosView(xx, &xv));
461:   PetscCall(VecGetKokkosView(yy, &yv));
462:   PetscCall(VecGetKokkosViewWrite(zz, &zv));
463:   if (zz != yy) Kokkos::deep_copy(zv, yv);
464:   if (A->form_explicit_transpose) {
465:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
466:     mode = "N";
467:   } else {
468:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
469:     csrmat = aijkok->csrmat;
470:     mode   = "T";
471:   }
472:   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
473:   PetscCall(VecRestoreKokkosView(xx, &xv));
474:   PetscCall(VecRestoreKokkosView(yy, &yv));
475:   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
476:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
477:   PetscCall(PetscLogGpuTimeEnd());
478:   PetscFunctionReturn(PETSC_SUCCESS);
479: }

481: /* z = A^H x + y */
482: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
483: {
484:   Mat_SeqAIJKokkos          *aijkok;
485:   const char                *mode;
486:   ConstPetscScalarKokkosView xv, yv;
487:   PetscScalarKokkosView      zv;
488:   KokkosCsrMatrix            csrmat;

490:   PetscFunctionBegin;
491:   PetscCall(PetscLogGpuTimeBegin());
492:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
493:   PetscCall(VecGetKokkosView(xx, &xv));
494:   PetscCall(VecGetKokkosView(yy, &yv));
495:   PetscCall(VecGetKokkosViewWrite(zz, &zv));
496:   if (zz != yy) Kokkos::deep_copy(zv, yv);
497:   if (A->form_explicit_transpose) {
498:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
499:     mode = "N";
500:   } else {
501:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
502:     csrmat = aijkok->csrmat;
503:     mode   = "C";
504:   }
505:   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
506:   PetscCall(VecRestoreKokkosView(xx, &xv));
507:   PetscCall(VecRestoreKokkosView(yy, &yv));
508:   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
509:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
510:   PetscCall(PetscLogGpuTimeEnd());
511:   PetscFunctionReturn(PETSC_SUCCESS);
512: }

514: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
515: {
516:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

518:   PetscFunctionBegin;
519:   switch (op) {
520:   case MAT_FORM_EXPLICIT_TRANSPOSE:
521:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
522:     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
523:     A->form_explicit_transpose = flg;
524:     break;
525:   default:
526:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
527:     break;
528:   }
529:   PetscFunctionReturn(PETSC_SUCCESS);
530: }

532: /* Depending on reuse, either build a new mat, or use the existing mat */
533: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
534: {
535:   Mat_SeqAIJ *aseq;

537:   PetscFunctionBegin;
538:   PetscCall(PetscKokkosInitializeCheck());
539:   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
540:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
541:   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
542:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
543:   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
544:     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
545:     PetscCall(PetscFree(A->defaultvectype));
546:     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
547:     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
548:     PetscCall(MatSetOps_SeqAIJKokkos(A));
549:     aseq = static_cast<Mat_SeqAIJ *>(A->data);
550:     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
551:       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
552:       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
553:     }
554:   }
555:   PetscFunctionReturn(PETSC_SUCCESS);
556: }

558: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
559:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
560:  */
561: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
562: {
563:   Mat_SeqAIJ       *bseq;
564:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
565:   Mat               mat;

567:   PetscFunctionBegin;
568:   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
569:   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
570:   mat = *B;
571:   if (A->preallocated) {
572:     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
573:     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
574:     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
575:     /* Now copy values to B if needed */
576:     if (dupOption == MAT_COPY_VALUES) {
577:       if (akok->a_dual.need_sync_device()) {
578:         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
579:         bkok->a_dual.modify_host();
580:       } else { /* If device has the latest data, we only copy data on device */
581:         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
582:         bkok->a_dual.modify_device();
583:       }
584:     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
585:       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
586:       bkok->a_dual.modify_host();
587:     }
588:     mat->spptr = bkok;
589:   }

591:   PetscCall(PetscFree(mat->defaultvectype));
592:   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
593:   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
594:   PetscCall(MatSetOps_SeqAIJKokkos(mat));
595:   PetscFunctionReturn(PETSC_SUCCESS);
596: }

598: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
599: {
600:   Mat               At;
601:   KokkosCsrMatrix   internT;
602:   Mat_SeqAIJKokkos *atkok, *bkok;

604:   PetscFunctionBegin;
605:   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
606:   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
607:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
608:     /* Deep copy internT, as we want to isolate the internal transpose */
609:     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
610:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
611:     if (reuse == MAT_INITIAL_MATRIX) *B = At;
612:     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
613:   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
614:     if ((*B)->assembled) {
615:       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
616:       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
617:       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
618:     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
619:       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
620:       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
621:       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
622:       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
623:       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
624:     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
625:   }
626:   PetscFunctionReturn(PETSC_SUCCESS);
627: }

629: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
630: {
631:   Mat_SeqAIJKokkos *aijkok;

633:   PetscFunctionBegin;
634:   if (A->factortype == MAT_FACTOR_NONE) {
635:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
636:     delete aijkok;
637:   } else {
638:     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
639:   }
640:   A->spptr = NULL;
641:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
642:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
643:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
644:   PetscCall(MatDestroy_SeqAIJ(A));
645:   PetscFunctionReturn(PETSC_SUCCESS);
646: }

648: /*MC
649:    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

651:    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types

653:    Options Database Key:
654: .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

656:   Level: beginner

658: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
659: M*/
660: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
661: {
662:   PetscFunctionBegin;
663:   PetscCall(PetscKokkosInitializeCheck());
664:   PetscCall(MatCreate_SeqAIJ(A));
665:   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
666:   PetscFunctionReturn(PETSC_SUCCESS);
667: }

669: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
670: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
671: {
672:   Mat_SeqAIJ         *a, *b;
673:   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
674:   MatScalarKokkosView aa, ba, ca;
675:   MatRowMapKokkosView ai, bi, ci;
676:   MatColIdxKokkosView aj, bj, cj;
677:   PetscInt            m, n, nnz, aN;

679:   PetscFunctionBegin;
682:   PetscAssertPointer(C, 4);
683:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
684:   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
685:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
686:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

688:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
689:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
690:   a    = static_cast<Mat_SeqAIJ *>(A->data);
691:   b    = static_cast<Mat_SeqAIJ *>(B->data);
692:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
693:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
694:   aa   = akok->a_dual.view_device();
695:   ai   = akok->i_dual.view_device();
696:   ba   = bkok->a_dual.view_device();
697:   bi   = bkok->i_dual.view_device();
698:   m    = A->rmap->n; /* M, N and nnz of C */
699:   n    = A->cmap->n + B->cmap->n;
700:   nnz  = a->nz + b->nz;
701:   aN   = A->cmap->n; /* N of A */
702:   if (reuse == MAT_INITIAL_MATRIX) {
703:     aj           = akok->j_dual.view_device();
704:     bj           = bkok->j_dual.view_device();
705:     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
706:     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
707:     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
708:     ca           = ca_dual.view_device();
709:     ci           = ci_dual.view_device();
710:     cj           = cj_dual.view_device();

712:     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
713:     Kokkos::parallel_for(
714:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
715:         PetscInt i       = t.league_rank(); /* row i */
716:         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

718:         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
719:                                                    ci(i) = coffset;
720:                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
721:         });

723:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
724:           if (k < alen) {
725:             ca(coffset + k) = aa(ai(i) + k);
726:             cj(coffset + k) = aj(ai(i) + k);
727:           } else {
728:             ca(coffset + k) = ba(bi(i) + k - alen);
729:             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
730:           }
731:         });
732:       });
733:     ca_dual.modify_device();
734:     ci_dual.modify_device();
735:     cj_dual.modify_device();
736:     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
737:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
738:   } else if (reuse == MAT_REUSE_MATRIX) {
740:     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
741:     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
742:     ca   = ckok->a_dual.view_device();
743:     ci   = ckok->i_dual.view_device();

745:     Kokkos::parallel_for(
746:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
747:         PetscInt i    = t.league_rank(); /* row i */
748:         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
749:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
750:           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
751:           else ca(ci(i) + k) = ba(bi(i) + k - alen);
752:         });
753:       });
754:     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
755:   }
756:   PetscFunctionReturn(PETSC_SUCCESS);
757: }

759: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
760: {
761:   PetscFunctionBegin;
762:   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
763:   PetscFunctionReturn(PETSC_SUCCESS);
764: }

766: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
767: {
768:   Mat_Product                 *product = C->product;
769:   Mat                          A, B;
770:   bool                         transA, transB; /* use bool, since KK needs this type */
771:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
772:   Mat_SeqAIJ                  *c;
773:   MatProductData_SeqAIJKokkos *pdata;
774:   KokkosCsrMatrix              csrmatA, csrmatB;

776:   PetscFunctionBegin;
777:   MatCheckProduct(C, 1);
778:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
779:   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

781:   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
782:   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
783:   // we still do numeric.
784:   if (pdata->reusesym) { // numeric reuses results from symbolic
785:     pdata->reusesym = PETSC_FALSE;
786:     PetscFunctionReturn(PETSC_SUCCESS);
787:   }

789:   switch (product->type) {
790:   case MATPRODUCT_AB:
791:     transA = false;
792:     transB = false;
793:     break;
794:   case MATPRODUCT_AtB:
795:     transA = true;
796:     transB = false;
797:     break;
798:   case MATPRODUCT_ABt:
799:     transA = false;
800:     transB = true;
801:     break;
802:   default:
803:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
804:   }

806:   A = product->A;
807:   B = product->B;
808:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
809:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
810:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
811:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
812:   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

814:   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

816:   csrmatA = akok->csrmat;
817:   csrmatB = bkok->csrmat;

819:   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
820:   if (transA) {
821:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
822:     transA = false;
823:   }

825:   if (transB) {
826:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
827:     transB = false;
828:   }
829:   PetscCall(PetscLogGpuTimeBegin());
830:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
831: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
832:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
833:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
834: #endif

836:   PetscCall(PetscLogGpuTimeEnd());
837:   PetscCall(MatSeqAIJKokkosModifyDevice(C));
838:   /* shorter version of MatAssemblyEnd_SeqAIJ */
839:   c = (Mat_SeqAIJ *)C->data;
840:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
841:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
842:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
843:   c->reallocs         = 0;
844:   C->info.mallocs     = 0;
845:   C->info.nz_unneeded = 0;
846:   C->assembled = C->was_assembled = PETSC_TRUE;
847:   C->num_ass++;
848:   PetscFunctionReturn(PETSC_SUCCESS);
849: }

851: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
852: {
853:   Mat_Product                 *product = C->product;
854:   MatProductType               ptype;
855:   Mat                          A, B;
856:   bool                         transA, transB;
857:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
858:   MatProductData_SeqAIJKokkos *pdata;
859:   MPI_Comm                     comm;
860:   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

862:   PetscFunctionBegin;
863:   MatCheckProduct(C, 1);
864:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
865:   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
866:   A = product->A;
867:   B = product->B;
868:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
869:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
870:   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
871:   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
872:   csrmatA = akok->csrmat;
873:   csrmatB = bkok->csrmat;

875:   ptype = product->type;
876:   // Take advantage of the symmetry if true
877:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
878:     ptype                                          = MATPRODUCT_AB;
879:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
880:   }
881:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
882:     ptype                                          = MATPRODUCT_AB;
883:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
884:   }

886:   switch (ptype) {
887:   case MATPRODUCT_AB:
888:     transA = false;
889:     transB = false;
890:     break;
891:   case MATPRODUCT_AtB:
892:     transA = true;
893:     transB = false;
894:     break;
895:   case MATPRODUCT_ABt:
896:     transA = false;
897:     transB = true;
898:     break;
899:   default:
900:     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
901:   }
902:   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
903:   pdata->reusesym = product->api_user;

905:   /* TODO: add command line options to select spgemm algorithms */
906:   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

908:   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
909: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
910:   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
911:   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
912:   #endif
913: #endif
914:   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

916:   PetscCall(PetscLogGpuTimeBegin());
917:   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
918:   if (transA) {
919:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
920:     transA = false;
921:   }

923:   if (transB) {
924:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
925:     transB = false;
926:   }

928:   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
929:   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
930:     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
931:     calling new Mat_SeqAIJKokkos().
932:     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
933:   */
934:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
935: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
936:   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
937:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
938:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
939: #endif
940:   PetscCall(PetscLogGpuTimeEnd());

942:   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
943:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
944:   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
945:   PetscFunctionReturn(PETSC_SUCCESS);
946: }

948: /* handles sparse matrix matrix ops */
949: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
950: {
951:   Mat_Product *product = mat->product;
952:   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

954:   PetscFunctionBegin;
955:   MatCheckProduct(mat, 1);
956:   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
957:   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
958:   if (Biskok && Ciskok) {
959:     switch (product->type) {
960:     case MATPRODUCT_AB:
961:     case MATPRODUCT_AtB:
962:     case MATPRODUCT_ABt:
963:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
964:       break;
965:     case MATPRODUCT_PtAP:
966:     case MATPRODUCT_RARt:
967:     case MATPRODUCT_ABC:
968:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
969:       break;
970:     default:
971:       break;
972:     }
973:   } else { /* fallback for AIJ */
974:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
975:   }
976:   PetscFunctionReturn(PETSC_SUCCESS);
977: }

979: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
980: {
981:   Mat_SeqAIJKokkos *aijkok;

983:   PetscFunctionBegin;
984:   PetscCall(PetscLogGpuTimeBegin());
985:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
986:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
987:   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
988:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
989:   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
990:   PetscCall(PetscLogGpuTimeEnd());
991:   PetscFunctionReturn(PETSC_SUCCESS);
992: }

994: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
995: {
996:   Mat_SeqAIJKokkos *aijkok;

998:   PetscFunctionBegin;
999:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1000:   if (aijkok) { /* Only zero the device if data is already there */
1001:     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
1002:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1003:   } else { /* Might be preallocated but not assembled */
1004:     PetscCall(MatZeroEntries_SeqAIJ(A));
1005:   }
1006:   PetscFunctionReturn(PETSC_SUCCESS);
1007: }

1009: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1010: {
1011:   Mat_SeqAIJ           *aijseq;
1012:   Mat_SeqAIJKokkos     *aijkok;
1013:   PetscInt              n;
1014:   PetscScalarKokkosView xv;

1016:   PetscFunctionBegin;
1017:   PetscCall(VecGetLocalSize(x, &n));
1018:   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1019:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

1021:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1022:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

1024:   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1025:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1026:     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1027:     aijkok->SetDiagonal(aijseq->diag);
1028:   }

1030:   const auto &Aa    = aijkok->a_dual.view_device();
1031:   const auto &Ai    = aijkok->i_dual.view_device();
1032:   const auto &Adiag = aijkok->diag_dual.view_device();

1034:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1035:   Kokkos::parallel_for(
1036:     n, KOKKOS_LAMBDA(const PetscInt i) {
1037:       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1038:       else xv(i) = 0;
1039:     });
1040:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1041:   PetscFunctionReturn(PETSC_SUCCESS);
1042: }

1044: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1045: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1046: {
1047:   Mat_SeqAIJKokkos *aijkok;

1049:   PetscFunctionBegin;
1051:   PetscAssertPointer(kv, 2);
1052:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1053:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1054:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1055:   *kv    = aijkok->a_dual.view_device();
1056:   PetscFunctionReturn(PETSC_SUCCESS);
1057: }

1059: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1060: {
1061:   PetscFunctionBegin;
1063:   PetscAssertPointer(kv, 2);
1064:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1065:   PetscFunctionReturn(PETSC_SUCCESS);
1066: }

1068: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1069: {
1070:   Mat_SeqAIJKokkos *aijkok;

1072:   PetscFunctionBegin;
1074:   PetscAssertPointer(kv, 2);
1075:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1076:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1077:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1078:   *kv    = aijkok->a_dual.view_device();
1079:   PetscFunctionReturn(PETSC_SUCCESS);
1080: }

1082: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1083: {
1084:   PetscFunctionBegin;
1086:   PetscAssertPointer(kv, 2);
1087:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1088:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1089:   PetscFunctionReturn(PETSC_SUCCESS);
1090: }

1092: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1093: {
1094:   Mat_SeqAIJKokkos *aijkok;

1096:   PetscFunctionBegin;
1098:   PetscAssertPointer(kv, 2);
1099:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1100:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1101:   *kv    = aijkok->a_dual.view_device();
1102:   PetscFunctionReturn(PETSC_SUCCESS);
1103: }

1105: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1106: {
1107:   PetscFunctionBegin;
1109:   PetscAssertPointer(kv, 2);
1110:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1111:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1112:   PetscFunctionReturn(PETSC_SUCCESS);
1113: }

1115: /* Computes Y += alpha X */
1116: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1117: {
1118:   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1119:   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1120:   ConstMatScalarKokkosView Xa;
1121:   MatScalarKokkosView      Ya;

1123:   PetscFunctionBegin;
1124:   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1125:   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1126:   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1127:   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1128:   PetscCall(PetscLogGpuTimeBegin());

1130:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1131:     PetscBool e;
1132:     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1133:     if (e) {
1134:       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1135:       if (e) pattern = SAME_NONZERO_PATTERN;
1136:     }
1137:   }

1139:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1140:     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1141:     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1142:     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1143:   */
1144:   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1145:   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1146:   Xa   = xkok->a_dual.view_device();
1147:   Ya   = ykok->a_dual.view_device();

1149:   if (pattern == SAME_NONZERO_PATTERN) {
1150:     KokkosBlas::axpy(alpha, Xa, Ya);
1151:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1152:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1153:     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1154:     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

1156:     Kokkos::parallel_for(
1157:       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1158:         PetscInt i = t.league_rank(); // row i
1159:         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1160:           // Only one thread works in a team
1161:           PetscInt p, q = Yi(i);
1162:           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1163:             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1164:             if (Xj(p) == Yj(q)) {                        // Found it
1165:               Ya(q) += alpha * Xa(p);
1166:               q++;
1167:             } else {
1168:             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1169:             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1170: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1171:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1172: #else
1173:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1174: #endif
1175:             }
1176:           }
1177:         });
1178:       });
1179:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1180:   } else { // different nonzero patterns
1181:     Mat             Z;
1182:     KokkosCsrMatrix zcsr;
1183:     KernelHandle    kh;
1184:     kh.create_spadd_handle(true); // X, Y are sorted
1185:     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1186:     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1187:     zkok = new Mat_SeqAIJKokkos(zcsr);
1188:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1189:     PetscCall(MatHeaderReplace(Y, &Z));
1190:     kh.destroy_spadd_handle();
1191:   }
1192:   PetscCall(PetscLogGpuTimeEnd());
1193:   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1194:   PetscFunctionReturn(PETSC_SUCCESS);
1195: }

1197: struct MatCOOStruct_SeqAIJKokkos {
1198:   PetscCount           n;
1199:   PetscCount           Atot;
1200:   PetscInt             nz;
1201:   PetscCountKokkosView jmap;
1202:   PetscCountKokkosView perm;

1204:   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1205:   {
1206:     nz   = coo_h->nz;
1207:     n    = coo_h->n;
1208:     Atot = coo_h->Atot;
1209:     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1210:     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1211:   }
1212: };

1214: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1215: {
1216:   PetscFunctionBegin;
1217:   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1218:   PetscFunctionReturn(PETSC_SUCCESS);
1219: }

1221: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1222: {
1223:   Mat_SeqAIJKokkos          *akok;
1224:   Mat_SeqAIJ                *aseq;
1225:   PetscContainer             container_h, container_d;
1226:   MatCOOStruct_SeqAIJ       *coo_h;
1227:   MatCOOStruct_SeqAIJKokkos *coo_d;

1229:   PetscFunctionBegin;
1230:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1231:   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1232:   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1233:   delete akok;
1234:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1235:   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

1237:   // Copy the COO struct to device
1238:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1239:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1240:   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

1242:   // Put the COO struct in a container and then attach that to the matrix
1243:   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
1244:   PetscCall(PetscContainerSetPointer(container_d, coo_d));
1245:   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
1246:   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
1247:   PetscCall(PetscContainerDestroy(&container_d));
1248:   PetscFunctionReturn(PETSC_SUCCESS);
1249: }

1251: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1252: {
1253:   MatScalarKokkosView        Aa;
1254:   ConstMatScalarKokkosView   kv;
1255:   PetscMemType               memtype;
1256:   PetscContainer             container;
1257:   MatCOOStruct_SeqAIJKokkos *coo;

1259:   PetscFunctionBegin;
1260:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1261:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

1263:   const auto &n    = coo->n;
1264:   const auto &Annz = coo->nz;
1265:   const auto &jmap = coo->jmap;
1266:   const auto &perm = coo->perm;

1268:   PetscCall(PetscGetMemType(v, &memtype));
1269:   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1270:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1271:   } else {
1272:     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1273:   }

1275:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1276:   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

1278:   PetscCall(PetscLogGpuTimeBegin());
1279:   Kokkos::parallel_for(
1280:     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1281:       PetscScalar sum = 0.0;
1282:       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1283:       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1284:     });
1285:   PetscCall(PetscLogGpuTimeEnd());

1287:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1288:   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1289:   PetscFunctionReturn(PETSC_SUCCESS);
1290: }

1292: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1293: {
1294:   PetscFunctionBegin;
1295:   PetscCall(MatSeqAIJKokkosSyncHost(A));
1296:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1297:   B->offloadmask = PETSC_OFFLOAD_CPU;
1298:   PetscFunctionReturn(PETSC_SUCCESS);
1299: }

1301: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1302: {
1303:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

1305:   PetscFunctionBegin;
1306:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1307:   A->boundtocpu  = PETSC_FALSE;

1309:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1310:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1311:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1312:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1313:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1314:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1315:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1316:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1317:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1318:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1319:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1320:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1321:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1322:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1323:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1324:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1325:   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1326:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1327:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1328:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1329:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1330:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1331:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1332:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1334:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1335:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1336:   PetscFunctionReturn(PETSC_SUCCESS);
1337: }

1339: /*
1340:    Extract the (prescribled) diagonal blocks of the matrix and then invert them

1342:   Input Parameters:
1343: +  A       - the MATSEQAIJKOKKOS matrix
1344: .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1345: .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1346: .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1347: -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine

1349:   Output Parameter:
1350: .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1351: */
1352: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1353: {
1354:   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1355:   PetscInt          N       = A->rmap->n;
1356:   PetscInt          nblocks = bs.extent(0) - 1;

1358:   PetscFunctionBegin;
1359:   // Set the diagonal pointer on device if not already
1360:   if (N && akok->diag_dual.extent(0) == 0) {
1361:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1362:     akok->SetDiagonal(static_cast<Mat_SeqAIJ *>(A->data)->diag);
1363:   }

1365:   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

1367:   // Pull out the diagonal blocks of the matrix and then invert the blocks
1368:   auto Aa    = akok->a_dual.view_device();
1369:   auto Ai    = akok->i_dual.view_device();
1370:   auto Aj    = akok->j_dual.view_device();
1371:   auto Adiag = akok->diag_dual.view_device();
1372:   // TODO: how to tune the team size?
1373: #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1374:   auto ts = Kokkos::AUTO();
1375: #else
1376:   auto ts         = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1377: #endif
1378:   PetscCallCXX(Kokkos::parallel_for(
1379:     Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1380:       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1381:       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1382:       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1383:       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1384:       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);

1386:       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1387:         PetscInt i = rstart + r;                                                            // i-th row in A

1389:         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1390:           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row

1392:           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1393:             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1394:               B(r, c) = 0.0;
1395:             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1396:               B(r, c) = Aa(first + c);
1397:             } else { // this entry does not show up in the CSR
1398:               B(r, c) = 0.0;
1399:             }
1400:           }
1401:         } else { // rare case that the diagonal does not exist
1402:           const PetscInt begin = Ai(i);
1403:           const PetscInt end   = Ai(i + 1);
1404:           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1405:           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1406:             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1407:             else if (Aj(j) >= rstart + m) break;
1408:           }
1409:         }
1410:       });

1412:       // LU-decompose B (w/o pivoting) and then invert B
1413:       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1414:       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1415:     }));
1416:   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1417:   PetscFunctionReturn(PETSC_SUCCESS);
1418: }

1420: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1421: {
1422:   Mat_SeqAIJ *aseq;
1423:   PetscInt    i, m, n;
1424:   auto       &exec = PetscGetKokkosExecutionSpace();

1426:   PetscFunctionBegin;
1427:   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");

1429:   m = akok->nrows();
1430:   n = akok->ncols();
1431:   PetscCall(MatSetSizes(A, m, n, m, n));
1432:   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));

1434:   /* Set up data structures of A as a MATSEQAIJ */
1435:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1436:   aseq = (Mat_SeqAIJ *)(A)->data;

1438:   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1439:   PetscCallCXX(akok->j_dual.sync_host(exec));
1440:   PetscCallCXX(exec.fence());

1442:   aseq->i            = akok->i_host_data();
1443:   aseq->j            = akok->j_host_data();
1444:   aseq->a            = akok->a_host_data();
1445:   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1446:   aseq->singlemalloc = PETSC_FALSE;
1447:   aseq->free_a       = PETSC_FALSE;
1448:   aseq->free_ij      = PETSC_FALSE;
1449:   aseq->nz           = akok->nnz();
1450:   aseq->maxnz        = aseq->nz;

1452:   PetscCall(PetscMalloc1(m, &aseq->imax));
1453:   PetscCall(PetscMalloc1(m, &aseq->ilen));
1454:   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

1456:   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1457:   akok->nonzerostate = A->nonzerostate;
1458:   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1459:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1460:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1461:   PetscFunctionReturn(PETSC_SUCCESS);
1462: }

1464: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1465: {
1466:   PetscFunctionBegin;
1467:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1468:   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1469:   PetscFunctionReturn(PETSC_SUCCESS);
1470: }

1472: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1473: {
1474:   Mat_SeqAIJKokkos *akok;
1475:   PetscFunctionBegin;
1476:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1477:   PetscCall(MatCreate(comm, A));
1478:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1479:   PetscFunctionReturn(PETSC_SUCCESS);
1480: }

1482: /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1484:    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1485:  */
1486: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1487: {
1488:   PetscFunctionBegin;
1489:   PetscCall(MatCreate(comm, A));
1490:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1491:   PetscFunctionReturn(PETSC_SUCCESS);
1492: }

1494: /*@C
1495:   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1496:   (the default parallel PETSc format). This matrix will ultimately be handled by
1497:   Kokkos for calculations.

1499:   Collective

1501:   Input Parameters:
1502: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1503: . m    - number of rows
1504: . n    - number of columns
1505: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1506: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

1508:   Output Parameter:
1509: . A - the matrix

1511:   Level: intermediate

1513:   Notes:
1514:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1515:   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1516:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1518:   The AIJ format, also called
1519:   compressed row storage, is fully compatible with standard Fortran
1520:   storage.  That is, the stored row and column indices can begin at
1521:   either one (as in Fortran) or zero.

1523:   Specify the preallocated storage with either `nz` or `nnz` (not both).
1524:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1525:   allocation.

1527: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1528: @*/
1529: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1530: {
1531:   PetscFunctionBegin;
1532:   PetscCall(PetscKokkosInitializeCheck());
1533:   PetscCall(MatCreate(comm, A));
1534:   PetscCall(MatSetSizes(*A, m, n, m, n));
1535:   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1536:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1537:   PetscFunctionReturn(PETSC_SUCCESS);
1538: }

1540: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1541: {
1542:   PetscFunctionBegin;
1543:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1544:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1545:   PetscFunctionReturn(PETSC_SUCCESS);
1546: }

1548: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1549: {
1550:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1552:   PetscFunctionBegin;
1553:   if (!factors->sptrsv_symbolic_completed) {
1554:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1555:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1556:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1557:   }
1558:   PetscFunctionReturn(PETSC_SUCCESS);
1559: }

1561: /* Check if we need to update factors etc for transpose solve */
1562: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1563: {
1564:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1565:   MatColIdxType               n       = A->rmap->n;

1567:   PetscFunctionBegin;
1568:   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1569:     /* Update L^T and do sptrsv symbolic */
1570:     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1571:     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1572:     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));

1574:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1575:                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);

1577:     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1578:       We have to sort the indices, until KK provides finer control options.
1579:     */
1580:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);

1582:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);

1584:     /* Update U^T and do sptrsv symbolic */
1585:     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1586:     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1587:     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));

1589:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1590:                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);

1592:     /* Sort indices. See comments above */
1593:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);

1595:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1596:     factors->transpose_updated = PETSC_TRUE;
1597:   }
1598:   PetscFunctionReturn(PETSC_SUCCESS);
1599: }

1601: /* Solve Ax = b, with A = LU */
1602: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1603: {
1604:   ConstPetscScalarKokkosView  bv;
1605:   PetscScalarKokkosView       xv;
1606:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1608:   PetscFunctionBegin;
1609:   PetscCall(PetscLogGpuTimeBegin());
1610:   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1611:   PetscCall(VecGetKokkosView(b, &bv));
1612:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1613:   /* Solve L tmpv = b */
1614:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1615:   /* Solve Ux = tmpv */
1616:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1617:   PetscCall(VecRestoreKokkosView(b, &bv));
1618:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1619:   PetscCall(PetscLogGpuTimeEnd());
1620:   PetscFunctionReturn(PETSC_SUCCESS);
1621: }

1623: /* Solve A^T x = b, where A^T = U^T L^T */
1624: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1625: {
1626:   ConstPetscScalarKokkosView  bv;
1627:   PetscScalarKokkosView       xv;
1628:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1630:   PetscFunctionBegin;
1631:   PetscCall(PetscLogGpuTimeBegin());
1632:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1633:   PetscCall(VecGetKokkosView(b, &bv));
1634:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1635:   /* Solve U^T tmpv = b */
1636:   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);

1638:   /* Solve L^T x = tmpv */
1639:   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1640:   PetscCall(VecRestoreKokkosView(b, &bv));
1641:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1642:   PetscCall(PetscLogGpuTimeEnd());
1643:   PetscFunctionReturn(PETSC_SUCCESS);
1644: }

1646: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1647: {
1648:   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1649:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1650:   PetscInt                    fill_lev = info->levels;

1652:   PetscFunctionBegin;
1653:   PetscCall(PetscLogGpuTimeBegin());
1654:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

1656:   auto a_d = aijkok->a_dual.view_device();
1657:   auto i_d = aijkok->i_dual.view_device();
1658:   auto j_d = aijkok->j_dual.view_device();

1660:   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);

1662:   B->assembled              = PETSC_TRUE;
1663:   B->preallocated           = PETSC_TRUE;
1664:   B->ops->solve             = MatSolve_SeqAIJKokkos;
1665:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1666:   B->ops->matsolve          = NULL;
1667:   B->ops->matsolvetranspose = NULL;
1668:   B->offloadmask            = PETSC_OFFLOAD_GPU;

1670:   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1671:   factors->transpose_updated         = PETSC_FALSE;
1672:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1673:   /* TODO: log flops, but how to know that? */
1674:   PetscCall(PetscLogGpuTimeEnd());
1675:   PetscFunctionReturn(PETSC_SUCCESS);
1676: }

1678: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1679: {
1680:   Mat_SeqAIJKokkos           *aijkok;
1681:   Mat_SeqAIJ                 *b;
1682:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1683:   PetscInt                    fill_lev = info->levels;
1684:   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1685:   PetscInt                    n        = A->rmap->n;

1687:   PetscFunctionBegin;
1688:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1689:   /* Rebuild factors */
1690:   if (factors) {
1691:     factors->Destroy();
1692:   } /* Destroy the old if it exists */
1693:   else {
1694:     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1695:   }

1697:   /* Create a spiluk handle and then do symbolic factorization */
1698:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1699:   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

1701:   auto spiluk_handle = factors->kh.get_spiluk_handle();

1703:   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1704:   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1705:   Kokkos::realloc(factors->iU_d, n + 1);
1706:   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

1708:   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1709:   auto i_d = aijkok->i_dual.view_device();
1710:   auto j_d = aijkok->j_dual.view_device();
1711:   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1712:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

1714:   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1715:   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1716:   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1717:   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

1719:   /* TODO: add options to select sptrsv algorithms */
1720:   /* Create sptrsv handles for L, U and their transpose */
1721: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1722:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1723: #else
1724:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1725: #endif

1727:   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1728:   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1729:   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1730:   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

1732:   /* Fill fields of the factor matrix B */
1733:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1734:   b     = (Mat_SeqAIJ *)B->data;
1735:   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1736:   B->info.fill_ratio_given  = info->fill;
1737:   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;

1739:   B->offloadmask          = PETSC_OFFLOAD_GPU;
1740:   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1741:   PetscFunctionReturn(PETSC_SUCCESS);
1742: }

1744: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1745: {
1746:   PetscFunctionBegin;
1747:   *type = MATSOLVERKOKKOS;
1748:   PetscFunctionReturn(PETSC_SUCCESS);
1749: }

1751: /*MC
1752:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1753:   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.

1755:   Level: beginner

1757: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1758: M*/
1759: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1760: {
1761:   PetscInt n = A->rmap->n;

1763:   PetscFunctionBegin;
1764:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1765:   PetscCall(MatSetSizes(*B, n, n, n, n));
1766:   (*B)->factortype = ftype;
1767:   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1768:   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));

1770:   if (ftype == MAT_FACTOR_LU) {
1771:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1772:     (*B)->canuseordering        = PETSC_TRUE;
1773:     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1774:   } else if (ftype == MAT_FACTOR_ILU) {
1775:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1776:     (*B)->canuseordering         = PETSC_FALSE;
1777:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1778:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

1780:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1781:   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1782:   PetscFunctionReturn(PETSC_SUCCESS);
1783: }

1785: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1786: {
1787:   PetscFunctionBegin;
1788:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1789:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1790:   PetscFunctionReturn(PETSC_SUCCESS);
1791: }

1793: /* Utility to print out a KokkosCsrMatrix for debugging */
1794: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1795: {
1796:   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1797:   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1798:   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1799:   const PetscInt    *i  = iv.data();
1800:   const PetscInt    *j  = jv.data();
1801:   const PetscScalar *a  = av.data();
1802:   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();

1804:   PetscFunctionBegin;
1805:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1806:   for (PetscInt k = 0; k < m; k++) {
1807:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1808:     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1809:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1810:   }
1811:   PetscFunctionReturn(PETSC_SUCCESS);
1812: }