Actual source code: aijkok.kokkos.cxx

  1: #include <petsc_kokkos.hpp>
  2: #include <petscvec_kokkos.hpp>
  3: #include <petscpkg_version.h>
  4: #include <petsc/private/petscimpl.h>
  5: #include <petsc/private/sfimpl.h>
  6: #include <petscsystypes.h>
  7: #include <petscerror.h>

  9: #include <Kokkos_Core.hpp>
 10: #include <KokkosBlas.hpp>
 11: #include <KokkosSparse_CrsMatrix.hpp>
 12: #include <KokkosSparse_spmv.hpp>
 13: #include <KokkosSparse_spiluk.hpp>
 14: #include <KokkosSparse_sptrsv.hpp>
 15: #include <KokkosSparse_spgemm.hpp>
 16: #include <KokkosSparse_spadd.hpp>
 17: #include <KokkosBatched_LU_Decl.hpp>
 18: #include <KokkosBatched_InverseLU_Decl.hpp>

 20: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

 22: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
 23:   #include <KokkosSparse_Utils.hpp>
 24: using KokkosSparse::sort_crs_matrix;
 25: using KokkosSparse::Impl::transpose_matrix;
 26: #else
 27:   #include <KokkosKernels_Sorting.hpp>
 28: using KokkosKernels::sort_crs_matrix;
 29: using KokkosKernels::Impl::transpose_matrix;
 30: #endif

 32: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

 34: /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
 35:    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
 36:    In the latter case, it is important to set a_dual's sync state correctly.
 37:  */
 38: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
 39: {
 40:   Mat_SeqAIJ       *aijseq;
 41:   Mat_SeqAIJKokkos *aijkok;

 43:   PetscFunctionBegin;
 44:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
 45:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

 47:   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
 48:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 50:   /* If aijkok does not exist, we just copy i, j to device.
 51:      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
 52:      In both cases, we build a new aijkok structure.
 53:   */
 54:   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
 55:     delete aijkok;
 56:     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
 57:     A->spptr = aijkok;
 58:   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
 59:     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
 60:     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
 61:     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
 62:   }
 63:   PetscFunctionReturn(PETSC_SUCCESS);
 64: }

 66: /* Sync CSR data to device if not yet */
 67: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
 68: {
 69:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 71:   PetscFunctionBegin;
 72:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
 73:   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
 74:   if (aijkok->a_dual.need_sync_device()) {
 75:     aijkok->a_dual.sync_device();
 76:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
 77:     aijkok->hermitian_updated = PETSC_FALSE;
 78:   }
 79:   PetscFunctionReturn(PETSC_SUCCESS);
 80: }

 82: /* Mark the CSR data on device as modified */
 83: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
 84: {
 85:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 87:   PetscFunctionBegin;
 88:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
 89:   aijkok->a_dual.clear_sync_state();
 90:   aijkok->a_dual.modify_device();
 91:   aijkok->transpose_updated = PETSC_FALSE;
 92:   aijkok->hermitian_updated = PETSC_FALSE;
 93:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
 94:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
 95:   PetscFunctionReturn(PETSC_SUCCESS);
 96: }

 98: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
 99: {
100:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
101:   auto             &exec   = PetscGetKokkosExecutionSpace();

103:   PetscFunctionBegin;
104:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
105:   /* We do not expect one needs factors on host  */
106:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
107:   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
108:   PetscCallCXX(aijkok->a_dual.sync_host(exec));
109:   PetscCallCXX(exec.fence());
110:   PetscFunctionReturn(PETSC_SUCCESS);
111: }

113: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
114: {
115:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

117:   PetscFunctionBegin;
118:   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
119:     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
120:     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
121:     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
122:   */
123:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
124:     auto &exec = PetscGetKokkosExecutionSpace();
125:     PetscCallCXX(aijkok->a_dual.sync_host(exec));
126:     PetscCallCXX(exec.fence());
127:     *array = aijkok->a_dual.view_host().data();
128:   } else { /* Happens when calling MatSetValues on a newly created matrix */
129:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
130:   }
131:   PetscFunctionReturn(PETSC_SUCCESS);
132: }

134: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135: {
136:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

138:   PetscFunctionBegin;
139:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
140:   PetscFunctionReturn(PETSC_SUCCESS);
141: }

143: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144: {
145:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

147:   PetscFunctionBegin;
148:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149:     auto &exec = PetscGetKokkosExecutionSpace();
150:     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151:     PetscCallCXX(exec.fence());
152:     *array = aijkok->a_dual.view_host().data();
153:   } else {
154:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155:   }
156:   PetscFunctionReturn(PETSC_SUCCESS);
157: }

159: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160: {
161:   PetscFunctionBegin;
162:   *array = NULL;
163:   PetscFunctionReturn(PETSC_SUCCESS);
164: }

166: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167: {
168:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

170:   PetscFunctionBegin;
171:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172:     *array = aijkok->a_dual.view_host().data();
173:   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
174:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
175:   }
176:   PetscFunctionReturn(PETSC_SUCCESS);
177: }

179: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180: {
181:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

183:   PetscFunctionBegin;
184:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185:     aijkok->a_dual.clear_sync_state();
186:     aijkok->a_dual.modify_host();
187:   }
188:   PetscFunctionReturn(PETSC_SUCCESS);
189: }

191: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
192: {
193:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

195:   PetscFunctionBegin;
196:   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

198:   if (i) *i = aijkok->i_device_data();
199:   if (j) *j = aijkok->j_device_data();
200:   if (a) {
201:     aijkok->a_dual.sync_device();
202:     *a = aijkok->a_device_data();
203:   }
204:   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
205:   PetscFunctionReturn(PETSC_SUCCESS);
206: }

208: /*
209:   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

211:   Input Parameter:
212: .  A       - the MATSEQAIJKOKKOS matrix

214:   Output Parameters:
215: +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216: -  T_d    - the transpose on device, whose value array is allocated but not initialized
217: */
218: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219: {
220:   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
221:   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
222:   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
223:   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
224:   MatRowMapType          *Ti = Ti_h.data();
225:   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
226:   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
227:   PetscInt               *Tj   = Tj_h.data();
228:   PetscInt               *perm = perm_h.data();
229:   PetscInt               *offset;

231:   PetscFunctionBegin;
232:   // Populate Ti
233:   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
234:   Ti++;
235:   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
236:   Ti--;
237:   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

239:   // Populate Tj and the permutation array
240:   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
241:   for (PetscInt i = 0; i < m; i++) {
242:     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
243:       PetscInt r    = Aj[j];                       // row r of T
244:       PetscInt disp = Ti[r] + offset[r];

246:       Tj[disp]   = i; // col i of T
247:       perm[disp] = j;
248:       offset[r]++;
249:     }
250:   }
251:   PetscCall(PetscFree(offset));

253:   // Sort each row of T, along with the permutation array
254:   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

256:   // Output perm and T on device
257:   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
258:   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
259:   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
260:   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
261:   PetscFunctionReturn(PETSC_SUCCESS);
262: }

264: // Generate the transpose on device and cache it internally
265: // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
266: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267: {
268:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
269:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
270:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
271:   KokkosCsrMatrix  &T = akok->csrmatT;

273:   PetscFunctionBegin;
274:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275:   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

277:   const auto &Aa = akok->a_dual.view_device();

279:   if (A->symmetric == PETSC_BOOL3_TRUE) {
280:     *csrmatT = akok->csrmat;
281:   } else {
282:     // See if we already have a cached transpose and its value is up to date
283:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
284:       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
285:         const auto &perm = akok->transpose_perm; // get the permutation array
286:         auto       &Ta   = T.values;

288:         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
289:       }
290:     } else { // Generate T of size n x m for the first time
291:       MatRowMapKokkosView perm;

293:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
294:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
295:       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
296:     }
297:     akok->transpose_updated = PETSC_TRUE;
298:     *csrmatT                = akok->csrmatT;
299:   }
300:   PetscFunctionReturn(PETSC_SUCCESS);
301: }

303: // Generate the Hermitian on device and cache it internally
304: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
305: {
306:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
307:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
308:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
309:   KokkosCsrMatrix  &T = akok->csrmatH;

311:   PetscFunctionBegin;
312:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
313:   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

315:   const auto &Aa = akok->a_dual.view_device();

317:   if (A->hermitian == PETSC_BOOL3_TRUE) {
318:     *csrmatH = akok->csrmat;
319:   } else {
320:     // See if we already have a cached hermitian and its value is up to date
321:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
322:       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
323:         const auto &perm = akok->transpose_perm; // get the permutation array
324:         auto       &Ta   = T.values;

326:         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
327:       }
328:     } else { // Generate T of size n x m for the first time
329:       MatRowMapKokkosView perm;

331:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
332:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333:       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
334:     }
335:     akok->hermitian_updated = PETSC_TRUE;
336:     *csrmatH                = akok->csrmatH;
337:   }
338:   PetscFunctionReturn(PETSC_SUCCESS);
339: }

341: /* y = A x */
342: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
343: {
344:   Mat_SeqAIJKokkos          *aijkok;
345:   ConstPetscScalarKokkosView xv;
346:   PetscScalarKokkosView      yv;

348:   PetscFunctionBegin;
349:   PetscCall(PetscLogGpuTimeBegin());
350:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351:   PetscCall(VecGetKokkosView(xx, &xv));
352:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
353:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
354:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
355:   PetscCall(VecRestoreKokkosView(xx, &xv));
356:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
357:   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
358:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
359:   PetscCall(PetscLogGpuTimeEnd());
360:   PetscFunctionReturn(PETSC_SUCCESS);
361: }

363: /* y = A^T x */
364: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
365: {
366:   Mat_SeqAIJKokkos          *aijkok;
367:   const char                *mode;
368:   ConstPetscScalarKokkosView xv;
369:   PetscScalarKokkosView      yv;
370:   KokkosCsrMatrix            csrmat;

372:   PetscFunctionBegin;
373:   PetscCall(PetscLogGpuTimeBegin());
374:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
375:   PetscCall(VecGetKokkosView(xx, &xv));
376:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
377:   if (A->form_explicit_transpose) {
378:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
379:     mode = "N";
380:   } else {
381:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
382:     csrmat = aijkok->csrmat;
383:     mode   = "T";
384:   }
385:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
386:   PetscCall(VecRestoreKokkosView(xx, &xv));
387:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
388:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
389:   PetscCall(PetscLogGpuTimeEnd());
390:   PetscFunctionReturn(PETSC_SUCCESS);
391: }

393: /* y = A^H x */
394: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
395: {
396:   Mat_SeqAIJKokkos          *aijkok;
397:   const char                *mode;
398:   ConstPetscScalarKokkosView xv;
399:   PetscScalarKokkosView      yv;
400:   KokkosCsrMatrix            csrmat;

402:   PetscFunctionBegin;
403:   PetscCall(PetscLogGpuTimeBegin());
404:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
405:   PetscCall(VecGetKokkosView(xx, &xv));
406:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
407:   if (A->form_explicit_transpose) {
408:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
409:     mode = "N";
410:   } else {
411:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
412:     csrmat = aijkok->csrmat;
413:     mode   = "C";
414:   }
415:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
416:   PetscCall(VecRestoreKokkosView(xx, &xv));
417:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
418:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
419:   PetscCall(PetscLogGpuTimeEnd());
420:   PetscFunctionReturn(PETSC_SUCCESS);
421: }

423: /* z = A x + y */
424: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
425: {
426:   Mat_SeqAIJKokkos          *aijkok;
427:   ConstPetscScalarKokkosView xv;
428:   PetscScalarKokkosView      zv;

430:   PetscFunctionBegin;
431:   PetscCall(PetscLogGpuTimeBegin());
432:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
433:   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
434:   PetscCall(VecGetKokkosView(xx, &xv));
435:   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
436:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
437:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
438:   PetscCall(VecRestoreKokkosView(xx, &xv));
439:   PetscCall(VecRestoreKokkosView(zz, &zv));
440:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
441:   PetscCall(PetscLogGpuTimeEnd());
442:   PetscFunctionReturn(PETSC_SUCCESS);
443: }

445: /* z = A^T x + y */
446: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
447: {
448:   Mat_SeqAIJKokkos          *aijkok;
449:   const char                *mode;
450:   ConstPetscScalarKokkosView xv;
451:   PetscScalarKokkosView      zv;
452:   KokkosCsrMatrix            csrmat;

454:   PetscFunctionBegin;
455:   PetscCall(PetscLogGpuTimeBegin());
456:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
457:   if (zz != yy) PetscCall(VecCopy(yy, zz));
458:   PetscCall(VecGetKokkosView(xx, &xv));
459:   PetscCall(VecGetKokkosView(zz, &zv));
460:   if (A->form_explicit_transpose) {
461:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
462:     mode = "N";
463:   } else {
464:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
465:     csrmat = aijkok->csrmat;
466:     mode   = "T";
467:   }
468:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
469:   PetscCall(VecRestoreKokkosView(xx, &xv));
470:   PetscCall(VecRestoreKokkosView(zz, &zv));
471:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
472:   PetscCall(PetscLogGpuTimeEnd());
473:   PetscFunctionReturn(PETSC_SUCCESS);
474: }

476: /* z = A^H x + y */
477: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
478: {
479:   Mat_SeqAIJKokkos          *aijkok;
480:   const char                *mode;
481:   ConstPetscScalarKokkosView xv;
482:   PetscScalarKokkosView      zv;
483:   KokkosCsrMatrix            csrmat;

485:   PetscFunctionBegin;
486:   PetscCall(PetscLogGpuTimeBegin());
487:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
488:   if (zz != yy) PetscCall(VecCopy(yy, zz));
489:   PetscCall(VecGetKokkosView(xx, &xv));
490:   PetscCall(VecGetKokkosView(zz, &zv));
491:   if (A->form_explicit_transpose) {
492:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
493:     mode = "N";
494:   } else {
495:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
496:     csrmat = aijkok->csrmat;
497:     mode   = "C";
498:   }
499:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
500:   PetscCall(VecRestoreKokkosView(xx, &xv));
501:   PetscCall(VecRestoreKokkosView(zz, &zv));
502:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
503:   PetscCall(PetscLogGpuTimeEnd());
504:   PetscFunctionReturn(PETSC_SUCCESS);
505: }

507: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
508: {
509:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

511:   PetscFunctionBegin;
512:   switch (op) {
513:   case MAT_FORM_EXPLICIT_TRANSPOSE:
514:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
515:     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
516:     A->form_explicit_transpose = flg;
517:     break;
518:   default:
519:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
520:     break;
521:   }
522:   PetscFunctionReturn(PETSC_SUCCESS);
523: }

525: /* Depending on reuse, either build a new mat, or use the existing mat */
526: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
527: {
528:   Mat_SeqAIJ *aseq;

530:   PetscFunctionBegin;
531:   PetscCall(PetscKokkosInitializeCheck());
532:   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
533:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
534:   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
535:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
536:   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
537:     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
538:     PetscCall(PetscFree(A->defaultvectype));
539:     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
540:     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
541:     PetscCall(MatSetOps_SeqAIJKokkos(A));
542:     aseq = static_cast<Mat_SeqAIJ *>(A->data);
543:     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
544:       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
545:       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
546:     }
547:   }
548:   PetscFunctionReturn(PETSC_SUCCESS);
549: }

551: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
552:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
553:  */
554: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
555: {
556:   Mat_SeqAIJ       *bseq;
557:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
558:   Mat               mat;

560:   PetscFunctionBegin;
561:   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
562:   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
563:   mat = *B;
564:   if (A->assembled) {
565:     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
566:     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
567:     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
568:     /* Now copy values to B if needed */
569:     if (dupOption == MAT_COPY_VALUES) {
570:       if (akok->a_dual.need_sync_device()) {
571:         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
572:         bkok->a_dual.modify_host();
573:       } else { /* If device has the latest data, we only copy data on device */
574:         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
575:         bkok->a_dual.modify_device();
576:       }
577:     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
578:       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
579:       bkok->a_dual.modify_host();
580:     }
581:     mat->spptr = bkok;
582:   }

584:   PetscCall(PetscFree(mat->defaultvectype));
585:   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
586:   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
587:   PetscCall(MatSetOps_SeqAIJKokkos(mat));
588:   PetscFunctionReturn(PETSC_SUCCESS);
589: }

591: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
592: {
593:   Mat               At;
594:   KokkosCsrMatrix   internT;
595:   Mat_SeqAIJKokkos *atkok, *bkok;

597:   PetscFunctionBegin;
598:   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
599:   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
600:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
601:     /* Deep copy internT, as we want to isolate the internal transpose */
602:     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
603:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
604:     if (reuse == MAT_INITIAL_MATRIX) *B = At;
605:     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
606:   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
607:     if ((*B)->assembled) {
608:       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
609:       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
610:       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
611:     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
612:       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
613:       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
614:       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
615:       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
616:       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
617:     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
618:   }
619:   PetscFunctionReturn(PETSC_SUCCESS);
620: }

622: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
623: {
624:   Mat_SeqAIJKokkos *aijkok;

626:   PetscFunctionBegin;
627:   if (A->factortype == MAT_FACTOR_NONE) {
628:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
629:     delete aijkok;
630:   } else {
631:     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
632:   }
633:   A->spptr = NULL;
634:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
635:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
636:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
637:   PetscCall(MatDestroy_SeqAIJ(A));
638:   PetscFunctionReturn(PETSC_SUCCESS);
639: }

641: /*MC
642:    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

644:    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types

646:    Options Database Key:
647: .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

649:   Level: beginner

651: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
652: M*/
653: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
654: {
655:   PetscFunctionBegin;
656:   PetscCall(PetscKokkosInitializeCheck());
657:   PetscCall(MatCreate_SeqAIJ(A));
658:   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
659:   PetscFunctionReturn(PETSC_SUCCESS);
660: }

662: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
663: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
664: {
665:   Mat_SeqAIJ         *a, *b;
666:   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
667:   MatScalarKokkosView aa, ba, ca;
668:   MatRowMapKokkosView ai, bi, ci;
669:   MatColIdxKokkosView aj, bj, cj;
670:   PetscInt            m, n, nnz, aN;

672:   PetscFunctionBegin;
675:   PetscAssertPointer(C, 4);
676:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
677:   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
678:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
679:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

681:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
682:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
683:   a    = static_cast<Mat_SeqAIJ *>(A->data);
684:   b    = static_cast<Mat_SeqAIJ *>(B->data);
685:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
686:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
687:   aa   = akok->a_dual.view_device();
688:   ai   = akok->i_dual.view_device();
689:   ba   = bkok->a_dual.view_device();
690:   bi   = bkok->i_dual.view_device();
691:   m    = A->rmap->n; /* M, N and nnz of C */
692:   n    = A->cmap->n + B->cmap->n;
693:   nnz  = a->nz + b->nz;
694:   aN   = A->cmap->n; /* N of A */
695:   if (reuse == MAT_INITIAL_MATRIX) {
696:     aj           = akok->j_dual.view_device();
697:     bj           = bkok->j_dual.view_device();
698:     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
699:     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
700:     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
701:     ca           = ca_dual.view_device();
702:     ci           = ci_dual.view_device();
703:     cj           = cj_dual.view_device();

705:     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
706:     Kokkos::parallel_for(
707:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
708:         PetscInt i       = t.league_rank(); /* row i */
709:         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

711:         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
712:                                                    ci(i) = coffset;
713:                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
714:         });

716:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
717:           if (k < alen) {
718:             ca(coffset + k) = aa(ai(i) + k);
719:             cj(coffset + k) = aj(ai(i) + k);
720:           } else {
721:             ca(coffset + k) = ba(bi(i) + k - alen);
722:             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
723:           }
724:         });
725:       });
726:     ca_dual.modify_device();
727:     ci_dual.modify_device();
728:     cj_dual.modify_device();
729:     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
730:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
731:   } else if (reuse == MAT_REUSE_MATRIX) {
733:     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
734:     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
735:     ca   = ckok->a_dual.view_device();
736:     ci   = ckok->i_dual.view_device();

738:     Kokkos::parallel_for(
739:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
740:         PetscInt i    = t.league_rank(); /* row i */
741:         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
742:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
743:           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
744:           else ca(ci(i) + k) = ba(bi(i) + k - alen);
745:         });
746:       });
747:     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
748:   }
749:   PetscFunctionReturn(PETSC_SUCCESS);
750: }

752: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
753: {
754:   PetscFunctionBegin;
755:   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
756:   PetscFunctionReturn(PETSC_SUCCESS);
757: }

759: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
760: {
761:   Mat_Product                 *product = C->product;
762:   Mat                          A, B;
763:   bool                         transA, transB; /* use bool, since KK needs this type */
764:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
765:   Mat_SeqAIJ                  *c;
766:   MatProductData_SeqAIJKokkos *pdata;
767:   KokkosCsrMatrix              csrmatA, csrmatB;

769:   PetscFunctionBegin;
770:   MatCheckProduct(C, 1);
771:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
772:   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

774:   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
775:   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
776:   // we still do numeric.
777:   if (pdata->reusesym) { // numeric reuses results from symbolic
778:     pdata->reusesym = PETSC_FALSE;
779:     PetscFunctionReturn(PETSC_SUCCESS);
780:   }

782:   switch (product->type) {
783:   case MATPRODUCT_AB:
784:     transA = false;
785:     transB = false;
786:     break;
787:   case MATPRODUCT_AtB:
788:     transA = true;
789:     transB = false;
790:     break;
791:   case MATPRODUCT_ABt:
792:     transA = false;
793:     transB = true;
794:     break;
795:   default:
796:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
797:   }

799:   A = product->A;
800:   B = product->B;
801:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
802:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
803:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
804:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
805:   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

807:   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

809:   csrmatA = akok->csrmat;
810:   csrmatB = bkok->csrmat;

812:   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
813:   if (transA) {
814:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
815:     transA = false;
816:   }

818:   if (transB) {
819:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
820:     transB = false;
821:   }
822:   PetscCall(PetscLogGpuTimeBegin());
823:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
824: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
825:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
826:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
827: #endif

829:   PetscCall(PetscLogGpuTimeEnd());
830:   PetscCall(MatSeqAIJKokkosModifyDevice(C));
831:   /* shorter version of MatAssemblyEnd_SeqAIJ */
832:   c = (Mat_SeqAIJ *)C->data;
833:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
834:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
835:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
836:   c->reallocs         = 0;
837:   C->info.mallocs     = 0;
838:   C->info.nz_unneeded = 0;
839:   C->assembled = C->was_assembled = PETSC_TRUE;
840:   C->num_ass++;
841:   PetscFunctionReturn(PETSC_SUCCESS);
842: }

844: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
845: {
846:   Mat_Product                 *product = C->product;
847:   MatProductType               ptype;
848:   Mat                          A, B;
849:   bool                         transA, transB;
850:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
851:   MatProductData_SeqAIJKokkos *pdata;
852:   MPI_Comm                     comm;
853:   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

855:   PetscFunctionBegin;
856:   MatCheckProduct(C, 1);
857:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
858:   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
859:   A = product->A;
860:   B = product->B;
861:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
862:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
863:   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
864:   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
865:   csrmatA = akok->csrmat;
866:   csrmatB = bkok->csrmat;

868:   ptype = product->type;
869:   // Take advantage of the symmetry if true
870:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
871:     ptype                                          = MATPRODUCT_AB;
872:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
873:   }
874:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
875:     ptype                                          = MATPRODUCT_AB;
876:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
877:   }

879:   switch (ptype) {
880:   case MATPRODUCT_AB:
881:     transA = false;
882:     transB = false;
883:     break;
884:   case MATPRODUCT_AtB:
885:     transA = true;
886:     transB = false;
887:     break;
888:   case MATPRODUCT_ABt:
889:     transA = false;
890:     transB = true;
891:     break;
892:   default:
893:     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
894:   }
895:   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
896:   pdata->reusesym = product->api_user;

898:   /* TODO: add command line options to select spgemm algorithms */
899:   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

901:   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
902: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
903:   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
904:   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
905:   #endif
906: #endif
907:   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

909:   PetscCall(PetscLogGpuTimeBegin());
910:   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
911:   if (transA) {
912:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
913:     transA = false;
914:   }

916:   if (transB) {
917:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
918:     transB = false;
919:   }

921:   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
922:   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
923:     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
924:     calling new Mat_SeqAIJKokkos().
925:     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
926:   */
927:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
928: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
929:   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
930:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
931:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
932: #endif
933:   PetscCall(PetscLogGpuTimeEnd());

935:   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
936:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
937:   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
938:   PetscFunctionReturn(PETSC_SUCCESS);
939: }

941: /* handles sparse matrix matrix ops */
942: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
943: {
944:   Mat_Product *product = mat->product;
945:   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

947:   PetscFunctionBegin;
948:   MatCheckProduct(mat, 1);
949:   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
950:   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
951:   if (Biskok && Ciskok) {
952:     switch (product->type) {
953:     case MATPRODUCT_AB:
954:     case MATPRODUCT_AtB:
955:     case MATPRODUCT_ABt:
956:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
957:       break;
958:     case MATPRODUCT_PtAP:
959:     case MATPRODUCT_RARt:
960:     case MATPRODUCT_ABC:
961:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
962:       break;
963:     default:
964:       break;
965:     }
966:   } else { /* fallback for AIJ */
967:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
968:   }
969:   PetscFunctionReturn(PETSC_SUCCESS);
970: }

972: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
973: {
974:   Mat_SeqAIJKokkos *aijkok;

976:   PetscFunctionBegin;
977:   PetscCall(PetscLogGpuTimeBegin());
978:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
979:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
980:   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
981:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
982:   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
983:   PetscCall(PetscLogGpuTimeEnd());
984:   PetscFunctionReturn(PETSC_SUCCESS);
985: }

987: // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
988: static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
989: {
990:   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

992:   PetscFunctionBegin;
993:   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
994:     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);

996:     PetscCall(PetscLogGpuTimeBegin());
997:     PetscCall(MatSeqAIJKokkosSyncDevice(A));
998:     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
999:     const auto &Aa     = aijkok->a_dual.view_device();
1000:     const auto &Adiag  = aijkok->diag_dual.view_device();
1001:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1002:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1003:     PetscCall(PetscLogGpuFlops(n));
1004:     PetscCall(PetscLogGpuTimeEnd());
1005:   } else { // need reassembly, very slow!
1006:     PetscCall(MatShift_Basic(A, a));
1007:   }
1008:   PetscFunctionReturn(PETSC_SUCCESS);
1009: }

1011: static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1012: {
1013:   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

1015:   PetscFunctionBegin;
1016:   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1017:     ConstPetscScalarKokkosView dv;
1018:     PetscInt                   n, nv;

1020:     PetscCall(PetscLogGpuTimeBegin());
1021:     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1022:     PetscCall(VecGetKokkosView(D, &dv));
1023:     PetscCall(VecGetLocalSize(D, &nv));
1024:     n = PetscMin(Y->rmap->n, Y->cmap->n);
1025:     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

1027:     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1028:     const auto &Aa     = aijkok->a_dual.view_device();
1029:     const auto &Adiag  = aijkok->diag_dual.view_device();
1030:     PetscCallCXX(Kokkos::parallel_for(
1031:       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1032:         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1033:         else Aa(Adiag(i)) += dv(i);
1034:       }));
1035:     PetscCall(VecRestoreKokkosView(D, &dv));
1036:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1037:     PetscCall(PetscLogGpuFlops(n));
1038:     PetscCall(PetscLogGpuTimeEnd());
1039:   } else { // need reassembly, very slow!
1040:     PetscCall(MatDiagonalSet_Default(Y, D, is));
1041:   }
1042:   PetscFunctionReturn(PETSC_SUCCESS);
1043: }

1045: static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1046: {
1047:   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1048:   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1049:   ConstPetscScalarKokkosView lv, rv;

1051:   PetscFunctionBegin;
1052:   PetscCall(PetscLogGpuTimeBegin());
1053:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1054:   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1055:   const auto &Aa     = aijkok->a_dual.view_device();
1056:   const auto &Ai     = aijkok->i_dual.view_device();
1057:   const auto &Aj     = aijkok->j_dual.view_device();
1058:   if (ll) {
1059:     PetscCall(VecGetLocalSize(ll, &m));
1060:     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1061:     PetscCall(VecGetKokkosView(ll, &lv));
1062:     PetscCallCXX(Kokkos::parallel_for( // for each row
1063:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1064:         PetscInt i   = t.league_rank(); // row i
1065:         PetscInt len = Ai(i + 1) - Ai(i);
1066:         // scale entries on the row
1067:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1068:       }));
1069:     PetscCall(VecRestoreKokkosView(ll, &lv));
1070:     PetscCall(PetscLogGpuFlops(nz));
1071:   }
1072:   if (rr) {
1073:     PetscCall(VecGetLocalSize(rr, &n));
1074:     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1075:     PetscCall(VecGetKokkosView(rr, &rv));
1076:     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1077:       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1078:     PetscCall(VecRestoreKokkosView(rr, &lv));
1079:     PetscCall(PetscLogGpuFlops(nz));
1080:   }
1081:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1082:   PetscCall(PetscLogGpuTimeEnd());
1083:   PetscFunctionReturn(PETSC_SUCCESS);
1084: }

1086: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1087: {
1088:   Mat_SeqAIJKokkos *aijkok;

1090:   PetscFunctionBegin;
1091:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1092:   if (aijkok) { /* Only zero the device if data is already there */
1093:     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1094:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1095:   } else { /* Might be preallocated but not assembled */
1096:     PetscCall(MatZeroEntries_SeqAIJ(A));
1097:   }
1098:   PetscFunctionReturn(PETSC_SUCCESS);
1099: }

1101: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1102: {
1103:   Mat_SeqAIJKokkos     *aijkok;
1104:   PetscInt              n;
1105:   PetscScalarKokkosView xv;

1107:   PetscFunctionBegin;
1108:   PetscCall(VecGetLocalSize(x, &n));
1109:   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1110:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

1112:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1113:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

1115:   const auto &Aa    = aijkok->a_dual.view_device();
1116:   const auto &Ai    = aijkok->i_dual.view_device();
1117:   const auto &Adiag = aijkok->diag_dual.view_device();

1119:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1120:   Kokkos::parallel_for(
1121:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1122:       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1123:       else xv(i) = 0;
1124:     });
1125:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1126:   PetscFunctionReturn(PETSC_SUCCESS);
1127: }

1129: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1130: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1131: {
1132:   Mat_SeqAIJKokkos *aijkok;

1134:   PetscFunctionBegin;
1136:   PetscAssertPointer(kv, 2);
1137:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1138:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1139:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1140:   *kv    = aijkok->a_dual.view_device();
1141:   PetscFunctionReturn(PETSC_SUCCESS);
1142: }

1144: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1145: {
1146:   PetscFunctionBegin;
1148:   PetscAssertPointer(kv, 2);
1149:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1150:   PetscFunctionReturn(PETSC_SUCCESS);
1151: }

1153: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1154: {
1155:   Mat_SeqAIJKokkos *aijkok;

1157:   PetscFunctionBegin;
1159:   PetscAssertPointer(kv, 2);
1160:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1161:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1162:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1163:   *kv    = aijkok->a_dual.view_device();
1164:   PetscFunctionReturn(PETSC_SUCCESS);
1165: }

1167: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1168: {
1169:   PetscFunctionBegin;
1171:   PetscAssertPointer(kv, 2);
1172:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1173:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1174:   PetscFunctionReturn(PETSC_SUCCESS);
1175: }

1177: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1178: {
1179:   Mat_SeqAIJKokkos *aijkok;

1181:   PetscFunctionBegin;
1183:   PetscAssertPointer(kv, 2);
1184:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1185:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1186:   *kv    = aijkok->a_dual.view_device();
1187:   PetscFunctionReturn(PETSC_SUCCESS);
1188: }

1190: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1191: {
1192:   PetscFunctionBegin;
1194:   PetscAssertPointer(kv, 2);
1195:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1196:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1197:   PetscFunctionReturn(PETSC_SUCCESS);
1198: }

1200: /* Computes Y += alpha X */
1201: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1202: {
1203:   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1204:   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1205:   ConstMatScalarKokkosView Xa;
1206:   MatScalarKokkosView      Ya;
1207:   auto                    &exec = PetscGetKokkosExecutionSpace();

1209:   PetscFunctionBegin;
1210:   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1211:   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1212:   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1213:   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1214:   PetscCall(PetscLogGpuTimeBegin());

1216:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1217:     PetscBool e;
1218:     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1219:     if (e) {
1220:       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1221:       if (e) pattern = SAME_NONZERO_PATTERN;
1222:     }
1223:   }

1225:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1226:     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1227:     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1228:     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1229:   */
1230:   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1231:   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1232:   Xa   = xkok->a_dual.view_device();
1233:   Ya   = ykok->a_dual.view_device();

1235:   if (pattern == SAME_NONZERO_PATTERN) {
1236:     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1237:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1238:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1239:     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1240:     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

1242:     Kokkos::parallel_for(
1243:       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1244:         PetscInt i = t.league_rank(); // row i
1245:         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1246:           // Only one thread works in a team
1247:           PetscInt p, q = Yi(i);
1248:           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1249:             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1250:             if (Xj(p) == Yj(q)) {                        // Found it
1251:               Ya(q) += alpha * Xa(p);
1252:               q++;
1253:             } else {
1254:             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1255:             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1256: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1257:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1258: #else
1259:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1260: #endif
1261:             }
1262:           }
1263:         });
1264:       });
1265:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1266:   } else { // different nonzero patterns
1267:     Mat             Z;
1268:     KokkosCsrMatrix zcsr;
1269:     KernelHandle    kh;
1270:     kh.create_spadd_handle(true); // X, Y are sorted
1271:     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1272:     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1273:     zkok = new Mat_SeqAIJKokkos(zcsr);
1274:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1275:     PetscCall(MatHeaderReplace(Y, &Z));
1276:     kh.destroy_spadd_handle();
1277:   }
1278:   PetscCall(PetscLogGpuTimeEnd());
1279:   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1280:   PetscFunctionReturn(PETSC_SUCCESS);
1281: }

1283: struct MatCOOStruct_SeqAIJKokkos {
1284:   PetscCount           n;
1285:   PetscCount           Atot;
1286:   PetscInt             nz;
1287:   PetscCountKokkosView jmap;
1288:   PetscCountKokkosView perm;

1290:   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1291:   {
1292:     nz   = coo_h->nz;
1293:     n    = coo_h->n;
1294:     Atot = coo_h->Atot;
1295:     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1296:     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1297:   }
1298: };

1300: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1301: {
1302:   PetscFunctionBegin;
1303:   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1304:   PetscFunctionReturn(PETSC_SUCCESS);
1305: }

1307: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1308: {
1309:   Mat_SeqAIJKokkos          *akok;
1310:   Mat_SeqAIJ                *aseq;
1311:   PetscContainer             container_h;
1312:   MatCOOStruct_SeqAIJ       *coo_h;
1313:   MatCOOStruct_SeqAIJKokkos *coo_d;

1315:   PetscFunctionBegin;
1316:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1317:   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1318:   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1319:   delete akok;
1320:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1321:   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

1323:   // Copy the COO struct to device
1324:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1325:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1326:   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

1328:   // Put the COO struct in a container and then attach that to the matrix
1329:   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1330:   PetscFunctionReturn(PETSC_SUCCESS);
1331: }

1333: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1334: {
1335:   MatScalarKokkosView        Aa;
1336:   ConstMatScalarKokkosView   kv;
1337:   PetscMemType               memtype;
1338:   PetscContainer             container;
1339:   MatCOOStruct_SeqAIJKokkos *coo;

1341:   PetscFunctionBegin;
1342:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1343:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

1345:   const auto &n    = coo->n;
1346:   const auto &Annz = coo->nz;
1347:   const auto &jmap = coo->jmap;
1348:   const auto &perm = coo->perm;

1350:   PetscCall(PetscGetMemType(v, &memtype));
1351:   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1352:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1353:   } else {
1354:     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1355:   }

1357:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1358:   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

1360:   PetscCall(PetscLogGpuTimeBegin());
1361:   Kokkos::parallel_for(
1362:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1363:       PetscScalar sum = 0.0;
1364:       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1365:       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1366:     });
1367:   PetscCall(PetscLogGpuTimeEnd());

1369:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1370:   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1371:   PetscFunctionReturn(PETSC_SUCCESS);
1372: }

1374: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1375: {
1376:   PetscFunctionBegin;
1377:   PetscCall(MatSeqAIJKokkosSyncHost(A));
1378:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1379:   B->offloadmask = PETSC_OFFLOAD_CPU;
1380:   PetscFunctionReturn(PETSC_SUCCESS);
1381: }

1383: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1384: {
1385:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

1387:   PetscFunctionBegin;
1388:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1389:   A->boundtocpu  = PETSC_FALSE;

1391:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1392:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1393:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1394:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1395:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1396:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1397:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1398:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1399:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1400:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1401:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1402:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1403:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1404:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1405:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1406:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1407:   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1408:   A->ops->shift                     = MatShift_SeqAIJKokkos;
1409:   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1410:   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1411:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1412:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1413:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1414:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1415:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1416:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1417:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1419:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1420:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1421:   PetscFunctionReturn(PETSC_SUCCESS);
1422: }

1424: /*
1425:    Extract the (prescribled) diagonal blocks of the matrix and then invert them

1427:   Input Parameters:
1428: +  A       - the MATSEQAIJKOKKOS matrix
1429: .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1430: .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1431: .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1432: -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine

1434:   Output Parameter:
1435: .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1436: */
1437: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1438: {
1439:   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1440:   PetscInt          nblocks = bs.extent(0) - 1;

1442:   PetscFunctionBegin;
1443:   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

1445:   // Pull out the diagonal blocks of the matrix and then invert the blocks
1446:   auto Aa    = akok->a_dual.view_device();
1447:   auto Ai    = akok->i_dual.view_device();
1448:   auto Aj    = akok->j_dual.view_device();
1449:   auto Adiag = akok->diag_dual.view_device();
1450:   // TODO: how to tune the team size?
1451: #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1452:   auto ts = Kokkos::AUTO();
1453: #else
1454:   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1455: #endif
1456:   PetscCallCXX(Kokkos::parallel_for(
1457:     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1458:       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1459:       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1460:       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1461:       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1462:       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);

1464:       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1465:         PetscInt i = rstart + r;                                                            // i-th row in A

1467:         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1468:           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row

1470:           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1471:             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1472:               B(r, c) = 0.0;
1473:             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1474:               B(r, c) = Aa(first + c);
1475:             } else { // this entry does not show up in the CSR
1476:               B(r, c) = 0.0;
1477:             }
1478:           }
1479:         } else { // rare case that the diagonal does not exist
1480:           const PetscInt begin = Ai(i);
1481:           const PetscInt end   = Ai(i + 1);
1482:           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1483:           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1484:             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1485:             else if (Aj(j) >= rstart + m) break;
1486:           }
1487:         }
1488:       });

1490:       // LU-decompose B (w/o pivoting) and then invert B
1491:       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1492:       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1493:     }));
1494:   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1495:   PetscFunctionReturn(PETSC_SUCCESS);
1496: }

1498: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1499: {
1500:   Mat_SeqAIJ *aseq;
1501:   PetscInt    i, m, n;
1502:   auto       &exec = PetscGetKokkosExecutionSpace();

1504:   PetscFunctionBegin;
1505:   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");

1507:   m = akok->nrows();
1508:   n = akok->ncols();
1509:   PetscCall(MatSetSizes(A, m, n, m, n));
1510:   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));

1512:   /* Set up data structures of A as a MATSEQAIJ */
1513:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1514:   aseq = (Mat_SeqAIJ *)A->data;

1516:   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1517:   PetscCallCXX(akok->j_dual.sync_host(exec));
1518:   PetscCallCXX(exec.fence());

1520:   aseq->i       = akok->i_host_data();
1521:   aseq->j       = akok->j_host_data();
1522:   aseq->a       = akok->a_host_data();
1523:   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1524:   aseq->free_a  = PETSC_FALSE;
1525:   aseq->free_ij = PETSC_FALSE;
1526:   aseq->nz      = akok->nnz();
1527:   aseq->maxnz   = aseq->nz;

1529:   PetscCall(PetscMalloc1(m, &aseq->imax));
1530:   PetscCall(PetscMalloc1(m, &aseq->ilen));
1531:   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

1533:   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1534:   akok->nonzerostate = A->nonzerostate;
1535:   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1536:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1537:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1538:   PetscFunctionReturn(PETSC_SUCCESS);
1539: }

1541: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1542: {
1543:   PetscFunctionBegin;
1544:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1545:   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1546:   PetscFunctionReturn(PETSC_SUCCESS);
1547: }

1549: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1550: {
1551:   Mat_SeqAIJKokkos *akok;

1553:   PetscFunctionBegin;
1554:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1555:   PetscCall(MatCreate(comm, A));
1556:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1557:   PetscFunctionReturn(PETSC_SUCCESS);
1558: }

1560: /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1562:    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1563:  */
1564: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1565: {
1566:   PetscFunctionBegin;
1567:   PetscCall(MatCreate(comm, A));
1568:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1569:   PetscFunctionReturn(PETSC_SUCCESS);
1570: }

1572: /*@C
1573:   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1574:   (the default parallel PETSc format). This matrix will ultimately be handled by
1575:   Kokkos for calculations.

1577:   Collective

1579:   Input Parameters:
1580: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1581: . m    - number of rows
1582: . n    - number of columns
1583: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1584: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

1586:   Output Parameter:
1587: . A - the matrix

1589:   Level: intermediate

1591:   Notes:
1592:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1593:   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1594:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1596:   The AIJ format, also called
1597:   compressed row storage, is fully compatible with standard Fortran
1598:   storage.  That is, the stored row and column indices can begin at
1599:   either one (as in Fortran) or zero.

1601:   Specify the preallocated storage with either `nz` or `nnz` (not both).
1602:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1603:   allocation.

1605: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1606: @*/
1607: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1608: {
1609:   PetscFunctionBegin;
1610:   PetscCall(PetscKokkosInitializeCheck());
1611:   PetscCall(MatCreate(comm, A));
1612:   PetscCall(MatSetSizes(*A, m, n, m, n));
1613:   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1614:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1615:   PetscFunctionReturn(PETSC_SUCCESS);
1616: }

1618: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1619: {
1620:   PetscFunctionBegin;
1621:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1622:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1623:   PetscFunctionReturn(PETSC_SUCCESS);
1624: }

1626: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1627: {
1628:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1630:   PetscFunctionBegin;
1631:   if (!factors->sptrsv_symbolic_completed) {
1632:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1633:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1634:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1635:   }
1636:   PetscFunctionReturn(PETSC_SUCCESS);
1637: }

1639: /* Check if we need to update factors etc for transpose solve */
1640: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1641: {
1642:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1643:   MatColIdxType               n       = A->rmap->n;

1645:   PetscFunctionBegin;
1646:   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1647:     /* Update L^T and do sptrsv symbolic */
1648:     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1649:     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1650:     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));

1652:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1653:                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);

1655:     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1656:       We have to sort the indices, until KK provides finer control options.
1657:     */
1658:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);

1660:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);

1662:     /* Update U^T and do sptrsv symbolic */
1663:     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1664:     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1665:     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));

1667:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1668:                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);

1670:     /* Sort indices. See comments above */
1671:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);

1673:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1674:     factors->transpose_updated = PETSC_TRUE;
1675:   }
1676:   PetscFunctionReturn(PETSC_SUCCESS);
1677: }

1679: /* Solve Ax = b, with A = LU */
1680: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1681: {
1682:   ConstPetscScalarKokkosView  bv;
1683:   PetscScalarKokkosView       xv;
1684:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1686:   PetscFunctionBegin;
1687:   PetscCall(PetscLogGpuTimeBegin());
1688:   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1689:   PetscCall(VecGetKokkosView(b, &bv));
1690:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1691:   /* Solve L tmpv = b */
1692:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1693:   /* Solve Ux = tmpv */
1694:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1695:   PetscCall(VecRestoreKokkosView(b, &bv));
1696:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1697:   PetscCall(PetscLogGpuTimeEnd());
1698:   PetscFunctionReturn(PETSC_SUCCESS);
1699: }

1701: /* Solve A^T x = b, where A^T = U^T L^T */
1702: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1703: {
1704:   ConstPetscScalarKokkosView  bv;
1705:   PetscScalarKokkosView       xv;
1706:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1708:   PetscFunctionBegin;
1709:   PetscCall(PetscLogGpuTimeBegin());
1710:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1711:   PetscCall(VecGetKokkosView(b, &bv));
1712:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1713:   /* Solve U^T tmpv = b */
1714:   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);

1716:   /* Solve L^T x = tmpv */
1717:   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1718:   PetscCall(VecRestoreKokkosView(b, &bv));
1719:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1720:   PetscCall(PetscLogGpuTimeEnd());
1721:   PetscFunctionReturn(PETSC_SUCCESS);
1722: }

1724: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1725: {
1726:   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1727:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1728:   PetscInt                    fill_lev = info->levels;

1730:   PetscFunctionBegin;
1731:   PetscCall(PetscLogGpuTimeBegin());
1732:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

1734:   auto a_d = aijkok->a_dual.view_device();
1735:   auto i_d = aijkok->i_dual.view_device();
1736:   auto j_d = aijkok->j_dual.view_device();

1738:   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);

1740:   B->assembled              = PETSC_TRUE;
1741:   B->preallocated           = PETSC_TRUE;
1742:   B->ops->solve             = MatSolve_SeqAIJKokkos;
1743:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1744:   B->ops->matsolve          = NULL;
1745:   B->ops->matsolvetranspose = NULL;
1746:   B->offloadmask            = PETSC_OFFLOAD_GPU;

1748:   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1749:   factors->transpose_updated         = PETSC_FALSE;
1750:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1751:   /* TODO: log flops, but how to know that? */
1752:   PetscCall(PetscLogGpuTimeEnd());
1753:   PetscFunctionReturn(PETSC_SUCCESS);
1754: }

1756: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1757: {
1758:   Mat_SeqAIJKokkos           *aijkok;
1759:   Mat_SeqAIJ                 *b;
1760:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1761:   PetscInt                    fill_lev = info->levels;
1762:   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1763:   PetscInt                    n        = A->rmap->n;

1765:   PetscFunctionBegin;
1766:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1767:   /* Rebuild factors */
1768:   if (factors) {
1769:     factors->Destroy();
1770:   } /* Destroy the old if it exists */
1771:   else {
1772:     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1773:   }

1775:   /* Create a spiluk handle and then do symbolic factorization */
1776:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1777:   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

1779:   auto spiluk_handle = factors->kh.get_spiluk_handle();

1781:   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1782:   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1783:   Kokkos::realloc(factors->iU_d, n + 1);
1784:   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

1786:   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1787:   auto i_d = aijkok->i_dual.view_device();
1788:   auto j_d = aijkok->j_dual.view_device();
1789:   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1790:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

1792:   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1793:   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1794:   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1795:   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

1797:   /* TODO: add options to select sptrsv algorithms */
1798:   /* Create sptrsv handles for L, U and their transpose */
1799: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1800:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1801: #else
1802:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1803: #endif

1805:   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1806:   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1807:   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1808:   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

1810:   /* Fill fields of the factor matrix B */
1811:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1812:   b     = (Mat_SeqAIJ *)B->data;
1813:   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1814:   B->info.fill_ratio_given  = info->fill;
1815:   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;

1817:   B->offloadmask          = PETSC_OFFLOAD_GPU;
1818:   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1819:   PetscFunctionReturn(PETSC_SUCCESS);
1820: }

1822: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1823: {
1824:   PetscFunctionBegin;
1825:   *type = MATSOLVERKOKKOS;
1826:   PetscFunctionReturn(PETSC_SUCCESS);
1827: }

1829: /*MC
1830:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1831:   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.

1833:   Level: beginner

1835: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1836: M*/
1837: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1838: {
1839:   PetscInt n = A->rmap->n;

1841:   PetscFunctionBegin;
1842:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1843:   PetscCall(MatSetSizes(*B, n, n, n, n));
1844:   (*B)->factortype = ftype;
1845:   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1846:   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));

1848:   if (ftype == MAT_FACTOR_LU) {
1849:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1850:     (*B)->canuseordering        = PETSC_TRUE;
1851:     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1852:   } else if (ftype == MAT_FACTOR_ILU) {
1853:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1854:     (*B)->canuseordering         = PETSC_FALSE;
1855:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1856:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

1858:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1859:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1860:   PetscFunctionReturn(PETSC_SUCCESS);
1861: }

1863: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1864: {
1865:   PetscFunctionBegin;
1866:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1867:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1868:   PetscFunctionReturn(PETSC_SUCCESS);
1869: }

1871: /* Utility to print out a KokkosCsrMatrix for debugging */
1872: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1873: {
1874:   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1875:   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1876:   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1877:   const PetscInt    *i  = iv.data();
1878:   const PetscInt    *j  = jv.data();
1879:   const PetscScalar *a  = av.data();
1880:   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();

1882:   PetscFunctionBegin;
1883:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1884:   for (PetscInt k = 0; k < m; k++) {
1885:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1886:     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1887:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1888:   }
1889:   PetscFunctionReturn(PETSC_SUCCESS);
1890: }