Actual source code: aijkok.kokkos.cxx

#include <petsc_kokkos.hpp>
#include <petscvec_kokkos.hpp>
#include <petscmat_kokkos.hpp>
#include <petscpkg_version.h>
#include <petsc/private/petscimpl.h>
#include <petsc/private/sfimpl.h>
#include <petsc/private/kokkosimpl.hpp>
#include <petscsys.h>

#include <Kokkos_Core.hpp>
#include <KokkosBlas.hpp>
#include <KokkosSparse_CrsMatrix.hpp>

// To suppress compiler warnings:
// /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
// warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
// cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
// int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
#define DISABLE_CUSPARSE_DEPRECATED
#include <KokkosSparse_spmv.hpp>

#include <KokkosSparse_spiluk.hpp>
#include <KokkosSparse_sptrsv.hpp>
#include <KokkosSparse_spgemm.hpp>
#include <KokkosSparse_spadd.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_InverseLU_Decl.hpp>

#include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
  #include <KokkosSparse_Utils.hpp>
using KokkosSparse::sort_crs_matrix;
using KokkosSparse::Impl::transpose_matrix;
#else
  #include <KokkosKernels_Sorting.hpp>
using KokkosKernels::sort_crs_matrix;
using KokkosKernels::Impl::transpose_matrix;
#endif

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
using KokkosSparse::spiluk_symbolic;
using KokkosSparse::spiluk_numeric;
using KokkosSparse::sptrsv_symbolic;
using KokkosSparse::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#else
using KokkosSparse::Experimental::spiluk_symbolic;
using KokkosSparse::Experimental::spiluk_numeric;
using KokkosSparse::Experimental::sptrsv_symbolic;
using KokkosSparse::Experimental::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#endif

static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

/* MatAssemblyEnd_SeqAIJKokkos() is called when the nonzeros of the matrix are finalized, either after
   we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
   In the latter case, it is important to set a_dual's sync state correctly.
 */
static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ       *aijseq;
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

  aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  /* If aijkok does not exist, we just copy i, j to device.
     If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
     In both cases, we build a new aijkok structure.
  */
  if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
    if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing the much-needed a,i,j on host when deleting aijkok */
      PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
      PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
      PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscScalar), (void **)&aijseq->a));
      PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
      PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
      PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
      aijseq->free_a  = PETSC_TRUE;
      aijseq->free_ij = PETSC_TRUE;
      /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
         we have the CsrMatrix on device first and then copy to host, followed by
         MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
         One could improve it by not using a NULL garray.
      */
    }
    delete aijkok;
    aijkok   = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /* don't copy mat values to device */);
    A->spptr = aijkok;
  } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
    const PetscInt *adiag;
    /* aijseq->diag is created at assembly here because the rest of the Kokkos AIJ code assumes it always exists. This needs to be fixed since it is now only created when needed! */
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
    MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
    auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
    aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
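
/* Mat_SeqAIJKokkos keeps the matrix values in a Kokkos::DualView (a_dual), whose modify/sync
   flags record which side (host or device) holds the latest data. The helpers below sync the
   data or mark one side as modified accordingly. */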

/* Sync CSR data to device if not yet */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
  PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  if (aijkok->a_dual.need_sync_device()) {
    PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
    aijkok->hermitian_updated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Mark the CSR data on device as modified */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
  aijkok->a_dual.clear_sync_state();
  aijkok->a_dual.modify_device();
  aijkok->transpose_updated = PETSC_FALSE;
  aijkok->hermitian_updated = PETSC_FALSE;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  auto              exec   = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  /* We do not expect factors to be needed on host */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
  PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
  PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
    Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
    reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
    must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
  */
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *array = aijkok->a_dual.view_host().data();
  } else { /* Happens when calling MatSetValues on a newly created matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *array = aijkok->a_dual.view_host().data();
  } else {
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    *array = aijkok->a_dual.view_host().data();
  } else { /* e.g., happens with MatZeroEntries on a preallocated but not assembled matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    aijkok->a_dual.clear_sync_state();
    aijkok->a_dual.modify_host();
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

  if (i) *i = aijkok->i_device_data();
  if (j) *j = aijkok->j_device_data();
  if (a) {
    PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *a = aijkok->a_device_data();
  }
  if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
{
  PetscFunctionBegin;
  *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

  Input Parameter:
.  A       - the MATSEQAIJKOKKOS matrix

  Output Parameters:
+  perm_d - the permutation array on device, which satisfies Ta(i) = Aa(perm(i))
-  T_d    - the transpose on device, whose value array is allocated but not initialized
*/
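/* For example, for the 2x3 CSR matrix
     A = [1 0 2; 0 3 0]  with  Ai = {0,2,3}, Aj = {0,2,1}, Aa = {1,2,3},
   the 3x2 transpose T has Ti = {0,1,2,3}, Tj = {0,1,0} and perm = {0,2,1},
   so that Ta(i) = Aa(perm(i)) yields Ta = {1,3,2}. */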
static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
{
  Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                nz = aseq->nz, m = A->rmap->n, n = A->cmap->n;
  const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
  MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
  MatRowMapType          *Ti = Ti_h.data();
  MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
  MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
  PetscInt               *Tj   = Tj_h.data();
  PetscInt               *perm = perm_h.data();
  PetscInt               *offset;

  PetscFunctionBegin;
  // Populate Ti
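  // Count A's nonzeros per column (the temporary Ti++/Ti-- shift below makes Ti[c+1]
  // accumulate the count for column c), then prefix-sum so that Ti becomes T's rowmap.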
  PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
  Ti++;
  for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
  Ti--;
  for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

  // Populate Tj and the permutation array
  PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
  for (PetscInt i = 0; i < m; i++) {
    for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
      PetscInt r    = Aj[j];                       // row r of T
      PetscInt disp = Ti[r] + offset[r];

      Tj[disp]   = i; // col i of T
      perm[disp] = j;
      offset[r]++;
    }
  }
  PetscCall(PetscFree(offset));

  // Sort each row of T, along with the permutation array
  for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

  // Output perm and T on device
  auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
  auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
  PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView(NoInit("Ta"), nz), Ti_d, Tj_d));
  PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Generate the transpose on device and cache it internally
// Note: KK's transpose_matrix() does not support separate symbolic/numeric phases for the transpose, so we do it on our own
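// The first call builds the transpose's sparsity structure and the permutation array perm (both
// cached in Mat_SeqAIJKokkos); later calls only refresh the cached values with Ta(i) = Aa(perm(i)).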
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->n, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatT;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->symmetric == PETSC_BOOL3_TRUE) {
    *csrmatT = akok->csrmat;
  } else {
    // See if we already have a cached transpose and its values are up to date
    if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT has been generated before; otherwise T has 0 rows/cols after construction
      if (!akok->transpose_updated) {            // if the values are out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
    }
    akok->transpose_updated = PETSC_TRUE;
    *csrmatT                = akok->csrmatT;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Generate the Hermitian transpose on device and cache it internally
static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->n, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatH;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->hermitian == PETSC_BOOL3_TRUE) {
    *csrmatH = akok->csrmat;
  } else {
    // See if we already have a cached Hermitian transpose and its values are up to date
    if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatH has been generated before; otherwise T has 0 rows/cols after construction
      if (!akok->hermitian_updated) {            // if the values are out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
    }
    akok->hermitian_updated = PETSC_TRUE;
    *csrmatH                = akok->csrmatH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  /* 2.0*nnz - numRows would be more accurate, but that assumes there are no empty rows; we are a little sloppy here. */
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^H x */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^T x + y */
static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Depending on reuse, either build a new mat, or use the existing mat */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_SeqAIJ *aseq;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    PetscCall(MatSetType(*newmat, mtype));
  } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
  } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
    PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
    PetscCall(PetscFree(A->defaultvectype));
    PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
    PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
    PetscCall(MatSetOps_SeqAIJKokkos(A));
    aseq = static_cast<Mat_SeqAIJ *>(A->data);
    if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
      PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
      A->spptr = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
   an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
 */
static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
{
  Mat_SeqAIJ       *bseq;
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
  Mat               mat;

  PetscFunctionBegin;
  /* Do not copy values on host as A's latest values might be on device. We don't want to sync blindly */
  PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
  mat = *B;
  if (A->assembled) {
    bseq = static_cast<Mat_SeqAIJ *>(mat->data);
    bkok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
    bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
    /* Now copy values to B if needed */
    if (dupOption == MAT_COPY_VALUES) {
      if (akok->a_dual.need_sync_device()) {
        Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
        bkok->a_dual.modify_host();
      } else { /* If device has the latest data, we only copy data on device */
        Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
        bkok->a_dual.modify_device();
      }
    } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
      /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
      bkok->a_dual.modify_host();
    }
    mat->spptr = bkok;
  }

  PetscCall(PetscFree(mat->defaultvectype));
  PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
  PetscCall(MatSetOps_SeqAIJKokkos(mat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
{
  Mat               At;
  KokkosCsrMatrix   internT;
  Mat_SeqAIJKokkos *atkok, *bkok;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
  PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    /* Deep copy internT, as we want to isolate the internal transpose */
    PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
    if (reuse == MAT_INITIAL_MATRIX) *B = At;
    else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
  } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
    if ((*B)->assembled) {
      bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
      PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
      PetscCall(MatSeqAIJKokkosModifyDevice(*B));
    } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
      Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
      MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
      MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
      PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
      PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
    } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    delete aijkok;
  } else {
    delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
  }
  A->spptr = NULL;
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
#endif
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

   A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types

   Options Database Key:
.  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
M*/
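/* A minimal usage sketch (illustrative only; the sizes and the assembly sequence below are
   assumptions, not taken from this file):

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 10, 10, 10, 10));
     PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
     PetscCall(MatSetUp(A));
     // ... MatSetValues() on host, then MatAssemblyBegin()/MatAssemblyEnd();
     // subsequent MatMult() etc. then run on device through Kokkos.
*/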
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  PetscCall(MatCreate_SeqAIJ(A));
  PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Merge A, B into a matrix C. A is put before B. C's size is A->rmap->n by (A->cmap->n + B->cmap->n) */
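/* That is, C = [A B]: row i of C holds row i of A followed by row i of B, with B's column
   indices shifted by A->cmap->n. */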
PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ         *a, *b;
  Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
  MatScalarKokkosView aa, ba, ca;
  MatRowMapKokkosView ai, bi, ci;
  MatColIdxKokkosView aj, bj, cj;
  PetscInt            m, n, nnz, aN;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCheckTypeName(B, MATSEQAIJKOKKOS);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  a    = static_cast<Mat_SeqAIJ *>(A->data);
  b    = static_cast<Mat_SeqAIJ *>(B->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  aa   = akok->a_dual.view_device();
  ai   = akok->i_dual.view_device();
  ba   = bkok->a_dual.view_device();
  bi   = bkok->i_dual.view_device();
  m    = A->rmap->n; /* M, N and nnz of C */
  n    = A->cmap->n + B->cmap->n;
  nnz  = a->nz + b->nz;
  aN   = A->cmap->n; /* N of A */
  if (reuse == MAT_INITIAL_MATRIX) {
    aj      = akok->j_dual.view_device();
    bj      = bkok->j_dual.view_device();
    auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
    auto ci = MatRowMapKokkosView("i", ai.extent(0));
    auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));

    /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i       = t.league_rank(); /* row i */
        PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
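        // Rows 0..i-1 of C contain ai(i) entries from A plus bi(i) entries from B,
        // so row i of C starts at offset ai(i) + bi(i) and has alen + blen entries.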

        Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
                                                   ci(i) = coffset;
                                                   if (i == m - 1) ci(m) = ai(m) + bi(m);
        });

        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) {
            ca(coffset + k) = aa(ai(i) + k);
            cj(coffset + k) = aj(ai(i) + k);
          } else {
            ca(coffset + k) = ba(bi(i) + k - alen);
            cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
          }
        });
      });
    PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
    PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
    ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
    ca   = ckok->a_dual.view_device();
    ci   = ckok->i_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i    = t.league_rank(); /* row i */
        PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
          else ca(ci(i) + k) = ba(bi(i) + k - alen);
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(*C));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(void **pdata)
{
  PetscFunctionBegin;
  delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata);
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                *product = C->product;
  Mat                         A, B;
  bool                        transA, transB; /* use bool, since KK needs this type */
  Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
  Mat_SeqAIJ                 *c;
  MatProductCtx_SeqAIJKokkos *pdata;
  KokkosCsrMatrix             csrmatA, csrmatB;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data);

  // See if the numeric phase has already been done in the symbolic phase (e.g., the user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
  // If yes, skip the numeric, but reset the flag so that next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
  // we still do the numeric.
  if (pdata->reusesym) { // numeric reuses results from symbolic
    pdata->reusesym = PETSC_FALSE;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  switch (product->type) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

  PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
#endif

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJKokkosModifyDevice(C));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  c = (Mat_SeqAIJ *)C->data;
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs     = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                *product = C->product;
  MatProductType              ptype;
  Mat                         A, B;
  bool                        transA, transB;
  Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
  MatProductCtx_SeqAIJKokkos *pdata;
  MPI_Comm                    comm;
  KokkosCsrMatrix             csrmatA, csrmatB, csrmatC;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  ptype = product->type;
  // Take advantage of the symmetry if true
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }

  switch (ptype) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos());
  pdata->reusesym = product->api_user;

  /* TODO: add command line options to select spgemm algorithms */
  auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

  /* CUDA 10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in CUDA 11.4 */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
  spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
  #endif
#endif
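  /* SPGEMM_KK selects KokkosKernels' native spgemm implementation, bypassing the cuSPARSE TPL */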
  PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

  PetscCall(PetscLogGpuTimeBegin());
  /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }

  PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
  /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
    So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
    calling new Mat_SeqAIJKokkos().
    TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
  */
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  /* Query if KK outputs a sorted matrix. If not, we need to sort it */
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK! */
#endif
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
  C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* handles sparse matrix-matrix ops */
static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
  if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
  if (Biskok && Ciskok) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

// add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  if (A->assembled && aijseq->diagDense) { // no missing diagonals
    PetscInt n = PetscMin(A->rmap->n, A->cmap->n);

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(A));
    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatShift_Basic(A, a));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

  PetscFunctionBegin;
  if (Y->assembled && aijseq->diagDense) { // no missing diagonals
    ConstPetscScalarKokkosView dv;
    PetscInt                   n, nv;

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(Y));
    PetscCall(VecGetKokkosView(D, &dv));
    PetscCall(VecGetLocalSize(D, &nv));
    n = PetscMin(Y->rmap->n, Y->cmap->n);
    PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
        if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
        else Aa(Adiag(i)) += dv(i);
      }));
    PetscCall(VecRestoreKokkosView(D, &dv));
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatDiagonalSet_Default(Y, D, is));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
{
  Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
  ConstPetscScalarKokkosView lv, rv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  const auto &Aa     = aijkok->a_dual.view_device();
  const auto &Ai     = aijkok->i_dual.view_device();
  const auto &Aj     = aijkok->j_dual.view_device();
  if (ll) {
    PetscCall(VecGetLocalSize(ll, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
    PetscCall(VecGetKokkosView(ll, &lv));
    PetscCallCXX(Kokkos::parallel_for( // for each row
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i   = t.league_rank(); // row i
        PetscInt len = Ai(i + 1) - Ai(i);
        // scale entries on the row
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
      }));
    PetscCall(VecRestoreKokkosView(ll, &lv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
    PetscCall(VecGetKokkosView(rr, &rv));
    PetscCallCXX(Kokkos::parallel_for( // for each nonzero
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
    PetscCall(VecRestoreKokkosView(rr, &rv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  if (aijkok) { /* Only zero the device if data is already there */
    KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
  } else { /* Might be preallocated but not assembled */
    PetscCall(MatZeroEntries_SeqAIJ(A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
{
  Mat_SeqAIJKokkos     *aijkok;
  PetscInt              n;
  PetscScalarKokkosView xv;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  const auto &Aa    = aijkok->a_dual.view_device();
  const auto &Ai    = aijkok->i_dual.view_device();
  const auto &Adiag = aijkok->diag_dual.view_device();

  PetscCall(VecGetKokkosViewWrite(x, &xv));
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
      if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
      else xv(i) = 0;
    });
  PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

1237: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1238: {
1239:   PetscFunctionBegin;
1241:   PetscAssertPointer(kv, 2);
1242:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1243:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1244:   PetscFunctionReturn(PETSC_SUCCESS);
1245: }
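
/* A minimal usage sketch of the accessors above (for illustration; assumes A is an assembled
   MATSEQAIJKOKKOS matrix). The three Get/Restore pairs differ only in sync semantics: the const
   overload syncs to device for reading; the non-const overload syncs for read-write and marks
   the device data modified on restore; the Write variant skips the sync since the caller
   promises to overwrite every value.

     MatScalarKokkosView aa;
     PetscCall(MatSeqAIJGetKokkosView(A, &aa));     // sync'ed device view of A's values
     KokkosBlas::scal(aa, 2.0, aa);                 // e.g., scale A by 2 in place on device
     PetscCall(MatSeqAIJRestoreKokkosView(A, &aa)); // marks device data as modified
*/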

1247: PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1248: {
1249:   Mat_SeqAIJKokkos *akok;

1251:   PetscFunctionBegin;
1252:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1253:   PetscCall(MatCreate(comm, A));
1254:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1255:   PetscFunctionReturn(PETSC_SUCCESS);
1256: }
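
/* Sketch of creating a matrix directly from device CSR data (a hypothetical 2x2 identity).
   The views must live in the default Kokkos memory space; ownership passes to the new matrix
   through the Mat_SeqAIJKokkos built above.

     Kokkos::View<PetscInt *>    i_d("i", 3), j_d("j", 2);
     Kokkos::View<PetscScalar *> a_d("a", 2);
     // ... fill i_d = {0, 1, 2}, j_d = {0, 1}, a_d = {1.0, 1.0} on device ...
     Mat B;
     PetscCall(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, 2, 2, i_d, j_d, a_d, &B));
*/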

1258: /* Computes Y += alpha X */
1259: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1260: {
1261:   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1262:   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1263:   ConstMatScalarKokkosView Xa;
1264:   MatScalarKokkosView      Ya;
1265:   auto                     exec = PetscGetKokkosExecutionSpace();

1267:   PetscFunctionBegin;
1268:   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1269:   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1270:   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1271:   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1272:   PetscCall(PetscLogGpuTimeBegin());

1274:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1275:     PetscBool e;
1276:     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1277:     if (e) {
1278:       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1279:       if (e) pattern = SAME_NONZERO_PATTERN;
1280:     }
1281:   }

1283:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
1284:     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz() and directly call cusparseScsrgeam2().
1285:     If X's pattern is a SUBSET_NONZERO_PATTERN of Y's, we could take advantage of this cusparse feature. However,
1286:     KokkosSparse::spadd(alpha,A,beta,B,C) has separate symbolic and numeric phases, while MatAXPY does not.
1287:   */
1288:   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1289:   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1290:   Xa   = xkok->a_dual.view_device();
1291:   Ya   = ykok->a_dual.view_device();

1293:   if (pattern == SAME_NONZERO_PATTERN) {
1294:     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1295:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1296:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1297:     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1298:     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

1300:     Kokkos::parallel_for(
1301:       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1302:         PetscInt i = t.league_rank(); // row i
1303:         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1304:           // Only one thread works in a team
1305:           PetscInt p, q = Yi(i);
1306:           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1307:             while (q < Yi(i + 1) && Xj(p) != Yj(q)) q++; // find the matching nonzero on row i of Y (check bounds before reading Yj)
1308:             if (q < Yi(i + 1) && Xj(p) == Yj(q)) {       // Found it
1309:               Ya(q) += alpha * Xa(p);
1310:               q++;
1311:             } else {
1312:             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1313:             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1314: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1315:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1316: #else
1317:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1318: #endif
1319:             }
1320:           }
1321:         });
1322:       });
1323:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1324:   } else { // different nonzero patterns
1325:     Mat             Z;
1326:     KokkosCsrMatrix zcsr;
1327:     KernelHandle    kh;
1328:     kh.create_spadd_handle(true); // X, Y are sorted
1329:     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1330:     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1331:     zkok = new Mat_SeqAIJKokkos(zcsr);
1332:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1333:     PetscCall(MatHeaderReplace(Y, &Z));
1334:     kh.destroy_spadd_handle();
1335:   }
1336:   PetscCall(PetscLogGpuTimeEnd());
1337:   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1338:   PetscFunctionReturn(PETSC_SUCCESS);
1339: }
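
/* Usage sketch for the dispatch above (illustration): with SAME_NONZERO_PATTERN the call reduces
   to a device axpy on the value arrays; with SUBSET_NONZERO_PATTERN each row of X is merged into
   the matching entries of Y; with DIFFERENT_NONZERO_PATTERN a full spadd builds a new matrix.

     PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN)); // fastest path: Ya += 2.0 * Xa
*/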

1341: struct MatCOOStruct_SeqAIJKokkos {
1342:   PetscCount           n;
1343:   PetscCount           Atot;
1344:   PetscInt             nz;
1345:   PetscCountKokkosView jmap;
1346:   PetscCountKokkosView perm;

1348:   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1349:   {
1350:     nz   = coo_h->nz;
1351:     n    = coo_h->n;
1352:     Atot = coo_h->Atot;
1353:     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1354:     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1355:   }
1356: };
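
/* A worked example of the jmap/perm encoding (hypothetical input): suppose the user supplied the
   COO triplets (0,0), (0,1), (0,0) in that order, so the repeated (0,0) entries must be summed
   into nonzero 0. Then nz = 2, jmap = {0, 2, 3} and perm = {0, 2, 1}: nonzero 0 gathers v[0] and
   v[2], and nonzero 1 gathers v[1].
*/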

1358: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1359: {
1360:   PetscFunctionBegin;
1361:   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1362:   PetscFunctionReturn(PETSC_SUCCESS);
1363: }

1365: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1366: {
1367:   Mat_SeqAIJKokkos          *akok;
1368:   Mat_SeqAIJ                *aseq;
1369:   PetscContainer             container_h;
1370:   MatCOOStruct_SeqAIJ       *coo_h;
1371:   MatCOOStruct_SeqAIJKokkos *coo_d;

1373:   PetscFunctionBegin;
1374:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1375:   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1376:   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1377:   delete akok;
1378:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1379:   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

1381:   // Copy the COO struct to device
1382:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1383:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1384:   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

1386:   // Put the COO struct in a container and then attach that to the matrix
1387:   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1388:   PetscFunctionReturn(PETSC_SUCCESS);
1389: }

1391: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1392: {
1393:   MatScalarKokkosView        Aa;
1394:   ConstMatScalarKokkosView   kv;
1395:   PetscMemType               memtype;
1396:   PetscContainer             container;
1397:   MatCOOStruct_SeqAIJKokkos *coo;

1399:   PetscFunctionBegin;
1400:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1401:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

1403:   const auto &n    = coo->n;
1404:   const auto &Annz = coo->nz;
1405:   const auto &jmap = coo->jmap;
1406:   const auto &perm = coo->perm;

1408:   PetscCall(PetscGetMemType(v, &memtype));
1409:   if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on host, we may need to copy it to device */
1410:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1411:   } else {
1412:     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1413:   }

1415:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1416:   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

1418:   PetscCall(PetscLogGpuTimeBegin());
1419:   Kokkos::parallel_for(
1420:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1421:       PetscScalar sum = 0.0;
1422:       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1423:       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1424:     });
1425:   PetscCall(PetscLogGpuTimeEnd());

1427:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1428:   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1429:   PetscFunctionReturn(PETSC_SUCCESS);
1430: }
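
/* A typical COO assembly sequence exercising the two callbacks above (sketch; the arrays are
   hypothetical):

     PetscInt    coo_i[] = {0, 0, 1}, coo_j[] = {0, 1, 1};
     PetscScalar v[]     = {1.0, 2.0, 3.0};
     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // builds the host and device COO structs
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES));       // v may be a host or device pointer
*/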

1432: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1433: {
1434:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

1436:   PetscFunctionBegin;
1437:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1438:   A->boundtocpu  = PETSC_FALSE;

1440:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1441:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1442:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1443:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1444:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1445:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1446:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1447:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1448:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1449:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1450:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1451:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1452:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1453:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1454:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1455:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1456:   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1457:   A->ops->shift                     = MatShift_SeqAIJKokkos;
1458:   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1459:   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1460:   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1461:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1462:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1463:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1464:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1465:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1466:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1467:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1469:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1470:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1471: #if defined(PETSC_HAVE_HYPRE)
1472:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1473: #endif
1474:   PetscFunctionReturn(PETSC_SUCCESS);
1475: }

1477: /*
1478:    Extract the (prescribed) diagonal blocks of the matrix and then invert them

1480:   Input Parameters:
1481: +  A       - the MATSEQAIJKOKKOS matrix
1482: .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1483: .  bs2     - prefix sums of the squared block sizes in 'csr' format, i.e., the i-th block is stored at offset bs2(i) in diagVal[]
1484: .  blkMap  - maps row ids to block ids, i.e., row i belongs to the block blkMap(i)
1485: -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine

1487:   Output Parameter:
1488: .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1489: */
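/* A worked example of the input format (hypothetical): two blocks of sizes 2 and 3 give
   bs = {0, 2, 5}, bs2 = {0, 4, 13} (since 2^2 = 4 and 2^2 + 3^2 = 13), and blkMap = {0, 0, 1, 1, 1};
   diagVal[] and work[] then need 13 entries each, with block 0 at offset 0 and block 1 at offset 4.
*/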
1490: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1491: {
1492:   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1493:   PetscInt          nblocks = bs.extent(0) - 1;

1495:   PetscFunctionBegin;
1496:   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

1498:   // Pull out the diagonal blocks of the matrix and then invert the blocks
1499:   auto Aa    = akok->a_dual.view_device();
1500:   auto Ai    = akok->i_dual.view_device();
1501:   auto Aj    = akok->j_dual.view_device();
1502:   auto Adiag = akok->diag_dual.view_device();
1503:   // TODO: how to tune the team size?
1504: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1505:   auto ts = Kokkos::AUTO();
1506: #else
1507:   auto ts = 16; // improved performance by 30% over Kokkos::AUTO() with CUDA, but fails with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1508: #endif
1509:   PetscCallCXX(Kokkos::parallel_for(
1510:     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1511:       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1512:       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1513:       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1514:       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1515:       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);

1517:       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1518:         PetscInt i = rstart + r;                                                            // i-th row in A

1520:         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1521:           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row

1523:             for (PetscInt c = 0; c < m; c++) {                   // walk m steps to see what column indices we will meet
1524:             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1525:               B(r, c) = 0.0;
1526:             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1527:               B(r, c) = Aa(first + c);
1528:             } else { // this entry does not show up in the CSR
1529:               B(r, c) = 0.0;
1530:             }
1531:           }
1532:         } else { // rare case that the diagonal does not exist
1533:           const PetscInt begin = Ai(i);
1534:           const PetscInt end   = Ai(i + 1);
1535:           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1536:           for (PetscInt j = begin; j < end; j++) { // scan the whole row; we could use binary search, but this is a rare case so we do not.
1537:             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1538:             else if (Aj(j) >= rstart + m) break;
1539:           }
1540:         }
1541:       });

1543:       // LU-decompose B (w/o pivoting) and then invert B
1544:       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1545:       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1546:     }));
1547:   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1548:   PetscFunctionReturn(PETSC_SUCCESS);
1549: }

1551: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1552: {
1553:   Mat_SeqAIJ *aseq;
1554:   PetscInt    i, m, n;
1555:   auto        exec = PetscGetKokkosExecutionSpace();

1557:   PetscFunctionBegin;
1558:   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");

1560:   m = akok->nrows();
1561:   n = akok->ncols();
1562:   PetscCall(MatSetSizes(A, m, n, m, n));
1563:   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));

1565:   /* Set up data structures of A as a MATSEQAIJ */
1566:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1567:   aseq = (Mat_SeqAIJ *)A->data;

1569:   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1570:   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));

1572:   aseq->i       = akok->i_host_data();
1573:   aseq->j       = akok->j_host_data();
1574:   aseq->a       = akok->a_host_data();
1575:   aseq->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
1576:   aseq->free_a  = PETSC_FALSE;
1577:   aseq->free_ij = PETSC_FALSE;
1578:   aseq->nz      = akok->nnz();
1579:   aseq->maxnz   = aseq->nz;

1581:   PetscCall(PetscMalloc1(m, &aseq->imax));
1582:   PetscCall(PetscMalloc1(m, &aseq->ilen));
1583:   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

1585:   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1586:   akok->nonzerostate = A->nonzerostate;
1587:   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1588:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1589:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1590:   PetscFunctionReturn(PETSC_SUCCESS);
1591: }

1593: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1594: {
1595:   PetscFunctionBegin;
1596:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1597:   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1598:   PetscFunctionReturn(PETSC_SUCCESS);
1599: }

1601: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1602: {
1603:   Mat_SeqAIJKokkos *akok;

1605:   PetscFunctionBegin;
1606:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1607:   PetscCall(MatCreate(comm, A));
1608:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1609:   PetscFunctionReturn(PETSC_SUCCESS);
1610: }

1612: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1614:    Note PETSc already has names like MatSeqAIJSetPreallocationCSR, so we use capitalized CSR here
1615:  */
1616: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1617: {
1618:   PetscFunctionBegin;
1619:   PetscCall(MatCreate(comm, A));
1620:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1621:   PetscFunctionReturn(PETSC_SUCCESS);
1622: }

1624: /*@C
1625:   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1626:   (the default sequential PETSc format). This matrix will ultimately be handled by
1627:   Kokkos for calculations.

1629:   Collective

1631:   Input Parameters:
1632: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1633: . m    - number of rows
1634: . n    - number of columns
1635: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1636: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

1638:   Output Parameter:
1639: . A - the matrix

1641:   Level: intermediate

1643:   Notes:
1644:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1645:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1646:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1648:   The AIJ format, also called
1649:   compressed row storage, is fully compatible with standard Fortran
1650:   storage.  That is, the stored row and column indices can begin at
1651:   either one (as in Fortran) or zero.

1653:   Specify the preallocated storage with either `nz` or `nnz` (not both).
1654:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1655:   allocation.

1657: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1658: @*/
1659: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1660: {
1661:   PetscFunctionBegin;
1662:   PetscCall(PetscKokkosInitializeCheck());
1663:   PetscCall(MatCreate(comm, A));
1664:   PetscCall(MatSetSizes(*A, m, n, m, n));
1665:   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1666:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1667:   PetscFunctionReturn(PETSC_SUCCESS);
1668: }
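
/* Sketch of the recommended creation paradigm from the notes above (illustration, for some m, n):

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, m, n, m, n));
     PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
     PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL)); // e.g., at most 5 nonzeros per row
*/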

1670: // After matrix numeric factorization, there are still steps to perform before the triangular solves can be called.
1671: // For example, for a transpose solve, we might need to compute the transpose matrices explicitly if the solver does not support transpose solves (KK does not, while cusparse does).
1672: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1673: // Similarly, in KK, sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1674: static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1675: {
1676:   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1677:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1678:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky

1680:   PetscFunctionBegin;
1681:   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1682:     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1683:     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1684:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1685:   }
1686:   PetscFunctionReturn(PETSC_SUCCESS);
1687: }

1689: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1690: {
1691:   const PetscInt              n         = A->rmap->n;
1692:   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1693:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1694:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky

1696:   PetscFunctionBegin;
1697:   if (!factors->transpose_updated) {
1698:     if (has_upper) {
1699:       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not done yet
1700:         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1701:         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1702:         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1703:       }

1705:       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1706:         if (!factors->U) {
1707:           Mat_SeqAIJ *seq;

1709:           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1710:           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));

1712:           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1713:           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1714:           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1715:           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1716:         } else {
1717:           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut's data is aliased with {i, j, a}Ut_h
1718:         }
1719:         // Copy Ut from host to device
1720:         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1721:         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1722:         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1723:       } else { // If U was computed on device, we also compute the transpose there
1724:         // TODO: KK transpose_matrix() does not sort column indices, but cusparse requires sorted indices. We have to sort them until KK provides finer control options.
1725:         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1726:                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1727:                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1728:                                                                                                                                                                                                                                factors->aUt_d));
1729:         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1730:       }
1731:       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1732:     }

1734:     // do the same for L with LU
1735:     if (has_lower) {
1736:       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not done yet
1737:         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1738:         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1739:         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1740:       }

1742:       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1743:         if (!factors->L) {
1744:           Mat_SeqAIJ *seq;

1746:           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1747:           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));

1749:           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1750:           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1751:           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1752:           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1753:         } else {
1754:           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt's data is aliased with {i, j, a}Lt_h
1755:         }
1756:         // Copy Lt from host to device
1757:         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1758:         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1759:         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1760:       } else { // If L was computed on device, we also compute the transpose there
1761:         // TODO: KK transpose_matrix() does not sort column indices, but cusparse requires sorted indices. We have to sort them until KK provides finer control options.
1762:         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1763:                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1764:                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1765:                                                                                                                                                                                                                                factors->aLt_d));
1766:         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1767:       }
1768:       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1769:     }

1771:     factors->transpose_updated = PETSC_TRUE;
1772:   }
1773:   PetscFunctionReturn(PETSC_SUCCESS);
1774: }

1776: // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1777: // R is represented by rowperm in factors. If R is identity (i.e., no reordering), then rowperm is empty.
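// Since permutation matrices are orthogonal (R^-1 = R^T), A = R^T (U^T D U) R, and hence
// x = A^-1 b = R^T U^-1 D^-1 U^-T (R b): permute b, solve U^T Y = B, apply the (already
// inverted) D, solve U X = Y, and permute X back; that is the sequence below.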
1778: static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1779: {
1780:   auto                        exec    = PetscGetKokkosExecutionSpace();
1781:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1782:   PetscInt                    m       = A->rmap->n;
1783:   PetscScalarKokkosView       D       = factors->D_d;
1784:   PetscScalarKokkosView       X, Y, B; // alias
1785:   ConstPetscScalarKokkosView  b;
1786:   PetscScalarKokkosView       x;
1787:   PetscIntKokkosView         &rowperm  = factors->rowperm;
1788:   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1790:   PetscFunctionBegin;
1791:   PetscCall(PetscLogGpuTimeBegin());
1792:   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1793:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1794:   PetscCall(VecGetKokkosView(bb, &b));
1795:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1797:   // Solve U^T Y = B
1798:   if (identity) { // b needs no reordering; use it directly as B
1799:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1800:     Y = factors->workVector;
1801:   } else {
1802:     B = factors->workVector;
1803:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1804:     Y = x;
1805:   }
1806:   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

1808:   // Solve diag(D) Y' = Y.
1809:   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1810:   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));

1812:   // Solve UX = Y
1813:   if (identity) {
1814:     X = x;
1815:   } else {
1816:     X = factors->workVector; // B is not needed anymore
1817:   }
1818:   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

1820:   // Reorder X with the inverse column (row) permutation
1821:   if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));

1823:   PetscCall(VecRestoreKokkosView(bb, &b));
1824:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1825:   PetscCall(PetscLogGpuTimeEnd());
1826:   PetscFunctionReturn(PETSC_SUCCESS);
1827: }

1829: // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1830: // R and C are represented by rowperm and colperm in factors.
1831: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
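// Since R^-1 = R^T and C^-1 = C^T, A = R^T L U C^T, and hence
// x = A^-1 b = C U^-1 L^-1 (R b): permute b, solve L Y = B, solve U X = Y, and permute X back.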
1832: static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1833: {
1834:   auto                        exec    = PetscGetKokkosExecutionSpace();
1835:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1836:   PetscInt                    m       = A->rmap->n;
1837:   PetscScalarKokkosView       X, Y, B; // alias
1838:   ConstPetscScalarKokkosView  b;
1839:   PetscScalarKokkosView       x;
1840:   PetscIntKokkosView         &rowperm      = factors->rowperm;
1841:   PetscIntKokkosView         &colperm      = factors->colperm;
1842:   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1843:   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1845:   PetscFunctionBegin;
1846:   PetscCall(PetscLogGpuTimeBegin());
1847:   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1848:   PetscCall(VecGetKokkosView(bb, &b));
1849:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1851:   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1852:   if (row_identity) {
1853:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1854:     Y = factors->workVector;
1855:   } else {
1856:     B = factors->workVector;
1857:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1858:     Y = x;
1859:   }
1860:   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));

1862:   // Solve U C^- x = Y
1863:   if (col_identity) {
1864:     X = x;
1865:   } else {
1866:     X = factors->workVector;
1867:   }
1868:   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

1870:   // x = C X; Reorder X with the inverse col permutation
1871:   if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));

1873:   PetscCall(VecRestoreKokkosView(bb, &b));
1874:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1875:   PetscCall(PetscLogGpuTimeEnd());
1876:   PetscFunctionReturn(PETSC_SUCCESS);
1877: }

1879: // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1880: // R and C are represented by rowperm and colperm in factors.
1881: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1882: // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1883: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1884: {
1885:   auto                        exec    = PetscGetKokkosExecutionSpace();
1886:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1887:   PetscInt                    m       = A->rmap->n;
1888:   PetscScalarKokkosView       X, Y, B; // alias
1889:   ConstPetscScalarKokkosView  b;
1890:   PetscScalarKokkosView       x;
1891:   PetscIntKokkosView         &rowperm      = factors->rowperm;
1892:   PetscIntKokkosView         &colperm      = factors->colperm;
1893:   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1894:   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1896:   PetscFunctionBegin;
1897:   PetscCall(PetscLogGpuTimeBegin());
1898:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1899:   PetscCall(VecGetKokkosView(bb, &b));
1900:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1902:   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1903:   if (col_identity) { // b needs no reordering; use it directly as B
1904:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1905:     Y = factors->workVector;
1906:   } else {
1907:     B = factors->workVector;
1908:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1909:     Y = x;
1910:   }
1911:   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

1913:   // Solve L^T X = Y
1914:   if (row_identity) {
1915:     X = x;
1916:   } else {
1917:     X = factors->workVector;
1918:   }
1919:   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));

1921:   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1922:   if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));

1924:   PetscCall(VecRestoreKokkosView(bb, &b));
1925:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1926:   PetscCall(PetscLogGpuTimeEnd());
1927:   PetscFunctionReturn(PETSC_SUCCESS);
1928: }

1930: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1931: {
1932:   PetscFunctionBegin;
1933:   PetscCall(MatSeqAIJKokkosSyncHost(A));
1934:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));

1936:   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1937:     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1938:     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1939:     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1940:     const MatScalar            *Ba = b->a;
1941:     PetscInt                    m = B->rmap->n, n = B->cmap->n;

1943:     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1944:       // Allocate memory and copy the structure
1945:       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1946:       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1947:       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1948:       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1949:       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1950:       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));

1952:       PetscInt *Li = factors->iL_h.data();
1953:       PetscInt *Lj = factors->jL_h.data();
1954:       PetscInt *Ui = factors->iU_h.data();
1955:       PetscInt *Uj = factors->jU_h.data();

1957:       Li[0] = Ui[0] = 0;
1958:       for (PetscInt i = 0; i < m; i++) {
1959:         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1960:         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry

1962:         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1963:         Lj[Li[i] + llen] = i;                                   // diagonal entry of L

1965:         Uj[Ui[i]] = i;                                                             // diagonal entry of U
1966:         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal

1968:         Li[i + 1] = Li[i] + llen + 1;
1969:         Ui[i + 1] = Ui[i] + ulen;
1970:       }

1972:       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1973:       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1974:       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1975:       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1976:       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1977:       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);

1979:       // Copy row/col permutation to device
1980:       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1981:       PetscBool row_identity;
1982:       PetscCall(ISIdentity(rowperm, &row_identity));
1983:       if (!row_identity) {
1984:         const PetscInt *ip;

1986:         PetscCall(ISGetIndices(rowperm, &ip));
1987:         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1988:         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1989:         PetscCall(ISRestoreIndices(rowperm, &ip));
1990:         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1991:       }

1993:       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1994:       PetscBool col_identity;
1995:       PetscCall(ISIdentity(colperm, &col_identity));
1996:       if (!col_identity) {
1997:         const PetscInt *ip;

1999:         PetscCall(ISGetIndices(colperm, &ip));
2000:         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2001:         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2002:         PetscCall(ISRestoreIndices(colperm, &ip));
2003:         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2004:       }

2006:       /* Create sptrsv handles for L, U and their transpose */
2007: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2008:       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2009: #else
2010:       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2011: #endif
2012:       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2013:       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2014:       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2015:       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2016:     }

2018:     // Copy the values
2019:     for (PetscInt i = 0; i < m; i++) {
2020:       PetscInt        llen = Bi[i + 1] - Bi[i];
2021:       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2022:       const PetscInt *Li   = factors->iL_h.data();
2023:       const PetscInt *Ui   = factors->iU_h.data();

2025:       PetscScalar *La = factors->aL_h.data();
2026:       PetscScalar *Ua = factors->aU_h.data();

2028:       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2029:       La[Li[i] + llen] = 1.0;                                 // diagonal entry

2031:       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry (MatLUFactorNumeric_SeqAIJ stores its inverse, so invert it back)
2032:       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2033:     }

2035:     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2036:     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2037:     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2038:     factors->transpose_updated         = PETSC_FALSE;
2039:     factors->sptrsv_symbolic_completed = PETSC_FALSE;

2041:     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2042:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2043:   }

2045:   B->ops->matsolve          = NULL;
2046:   B->ops->matsolvetranspose = NULL;
2047:   PetscFunctionReturn(PETSC_SUCCESS);
2048: }

2050: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2051: {
2052:   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2053:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2054:   PetscInt                    fill_lev = info->levels;

2056:   PetscFunctionBegin;
2057:   PetscCall(PetscLogGpuTimeBegin());
2058:   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2059:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

2061:   auto a_d = aijkok->a_dual.view_device();
2062:   auto i_d = aijkok->i_dual.view_device();
2063:   auto j_d = aijkok->j_dual.view_device();

2065:   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));

2067:   B->assembled              = PETSC_TRUE;
2068:   B->preallocated           = PETSC_TRUE;
2069:   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2070:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2071:   B->ops->matsolve          = NULL;
2072:   B->ops->matsolvetranspose = NULL;

2074:   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2075:   factors->transpose_updated         = PETSC_FALSE;
2076:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2077:   /* TODO: log flops, but how to know that? */
2078:   PetscCall(PetscLogGpuTimeEnd());
2079:   PetscFunctionReturn(PETSC_SUCCESS);
2080: }

2082: // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2083: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2084: {
2085:   Mat_SeqAIJKokkos           *aijkok;
2086:   Mat_SeqAIJ                 *b;
2087:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2088:   PetscInt                    fill_lev = info->levels;
2089:   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2090:   PetscInt                    n        = A->rmap->n;

2092:   PetscFunctionBegin;
2093:   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2094:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

2096:   /* Create a spiluk handle and then do symbolic factorization */
2097:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2098:   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

2100:   auto spiluk_handle = factors->kh.get_spiluk_handle();

2102:   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2103:   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2104:   Kokkos::realloc(factors->iU_d, n + 1);
2105:   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

2107:   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2108:   auto i_d = aijkok->i_dual.view_device();
2109:   auto j_d = aijkok->j_dual.view_device();
2110:   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2111:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

2113:   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2114:   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2115:   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2116:   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

2118:   /* TODO: add options to select sptrsv algorithms */
2119:   /* Create sptrsv handles for L, U and their transpose */
2120: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2121:   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2122: #else
2123:   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2124: #endif

2126:   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2127:   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2128:   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2129:   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

2131:   /* Fill fields of the factor matrix B */
2132:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2133:   b     = (Mat_SeqAIJ *)B->data;
2134:   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2135:   B->info.fill_ratio_given  = info->fill;
2136:   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;

2138:   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2139:   PetscFunctionReturn(PETSC_SUCCESS);
2140: }

2142: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2143: {
2144:   PetscFunctionBegin;
2145:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2146:   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2147:   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2148:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2149:   PetscFunctionReturn(PETSC_SUCCESS);
2150: }

2152: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2153: {
2154:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;

2156:   PetscFunctionBegin;
2157:   if (!info->factoronhost) {
2158:     PetscCall(ISIdentity(isrow, &row_identity));
2159:     PetscCall(ISIdentity(iscol, &col_identity));
2160:   }

2162:   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2163:   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));

2165:   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2166:     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2167:   } else {
2168:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2169:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2170:   }
2171:   PetscFunctionReturn(PETSC_SUCCESS);
2172: }
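
// In practice (sketch): e.g., -pc_type ilu with the default zero fill level and natural ordering
// takes the device ILU0 path above; any nonzero -pc_factor_levels or a genuine reordering falls
// back to PETSc's host ILU, with the triangular solves still done on device (unless solveonhost is set).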

2174: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2175: {
2176:   PetscFunctionBegin;
2177:   PetscCall(MatSeqAIJKokkosSyncHost(A));
2178:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));

2180:   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2181:     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2182:     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2183:     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2184:     const MatScalar            *Ba = b->a;
2185:     PetscInt                    m  = B->rmap->n;

2187:     if (factors->iU_h.extent(0) == 0) { // First numeric factorization
2188:       // Allocate memory and copy the structure
2189:       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2190:       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2191:       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2192:       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2193:       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2194:       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);

2196:       // Build jU_h from the skewed Bj (each row of Bj omits the diagonal entry, which we prepend here)
2197:       PetscInt *Uj = factors->jU_h.data();
2198:       for (PetscInt i = 0; i < m; i++) {
2199:         PetscInt ulen = Bi[i + 1] - Bi[i];
2200:         Uj[Bi[i]]     = i;                                              // diagonal entry
2201:         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2202:       }

2204:       // Copy iU, jU to device
2205:       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2206:       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));

2208:       // Copy row/col permutation to device
2209:       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2210:       PetscBool row_identity;
2211:       PetscCall(ISIdentity(rowperm, &row_identity));
2212:       if (!row_identity) {
2213:         const PetscInt *ip;

2215:         PetscCall(ISGetIndices(rowperm, &ip));
2216:         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2217:         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2218:         PetscCall(ISRestoreIndices(rowperm, &ip));
2219:         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2220:       }

2222:       // Create sptrsv handles for U and U^T
2223: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2224:       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2225: #else
2226:       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2227: #endif
2228:       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2229:       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2230:     }
2231:     // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2232:     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2233:     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;

2235:     // Copy the values
2236:     PetscScalar *Ua = factors->aU_h.data();
2237:     PetscScalar *D  = factors->D_h.data();
2238:     for (PetscInt i = 0; i < m; i++) {
2239:       D[i]      = Ba[Bdiag[i]];     // Ba[Bdiag[i]] actually stores the inverse of the diagonal
2240:       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2241:       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2242:     }
2243:     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2244:     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));

2246:     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2247:     factors->transpose_updated         = PETSC_FALSE;
2248:   }

2250:   B->ops->matsolve          = NULL;
2251:   B->ops->matsolvetranspose = NULL;
2252:   PetscFunctionReturn(PETSC_SUCCESS);
2253: }

2255: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2256: {
2257:   PetscFunctionBegin;
2258:   if (info->solveonhost) {
2259:     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2260:     PetscCall(MatSetType(B, MATSEQSBAIJ));
2261:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2262:   }

2264:   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));

2266:   if (!info->solveonhost) {
2267:     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2268:     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2269:     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2270:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2271:   }
2272:   PetscFunctionReturn(PETSC_SUCCESS);
2273: }

2275: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2276: {
2277:   PetscFunctionBegin;
2278:   if (info->solveonhost) {
2279:     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2280:     PetscCall(MatSetType(B, MATSEQSBAIJ));
2281:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2282:   }

2284:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm

2286:   if (!info->solveonhost) {
2287:     // If solving on device, B is still a MATSEQAIJKOKKOS, so it is safe to allocate B->spptr
2288:     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2289:     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2290:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2291:   }
2292:   PetscFunctionReturn(PETSC_SUCCESS);
2293: }

2295: // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2296: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2297: {
2298:   PetscFunctionBegin;
2299:   *type = MATSOLVERKOKKOS;
2300:   PetscFunctionReturn(PETSC_SUCCESS);
2301: }

2303: /*MC
2304:   MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers on a single GPU for sequential matrices
2305:   of type `MATSEQAIJKOKKOS` or `MATAIJKOKKOS`.

2307:   Level: beginner

2309: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`
2310: M*/
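// A typical way to select this solver from application code (a sketch; ksp is assumed to
// be an existing KSP):
//
//   PC pc;
//   PetscCall(KSPGetPC(ksp, &pc));
//   PetscCall(PCSetType(pc, PCLU));
//   PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS));
//
// or, equivalently from the command line: -pc_type lu -pc_factor_mat_solver_type kokkos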
2311: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2312: {
2313:   PetscInt n = A->rmap->n;
2314:   MPI_Comm comm;

2316:   PetscFunctionBegin;
2317:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2318:   PetscCall(MatCreate(comm, B));
2319:   PetscCall(MatSetSizes(*B, n, n, n, n));
2320:   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2321:   (*B)->factortype = ftype;
2322:   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2323:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2324:   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");

2326:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2327:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2328:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2329:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2330:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2331:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2332:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2333:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2334:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2335:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2336:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2337:   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

2339:   // The factorization can use the ordering provided to MatLUFactorSymbolic(), MatCholeskyFactorSymbolic(), etc., though the factorization itself is performed on host
2340:   (*B)->canuseordering = PETSC_TRUE;
2341:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2342:   PetscFunctionReturn(PETSC_SUCCESS);
2343: }
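// Direct (KSP-free) use of the factory above would look like this sketch, where A is a
// MATSEQAIJKOKKOS matrix and isrow/iscol/info/b/x are assumed to be set up by the caller:
//
//   Mat F;
//   PetscCall(MatGetFactor(A, MATSOLVERKOKKOS, MAT_FACTOR_LU, &F));
//   PetscCall(MatLUFactorSymbolic(F, A, isrow, iscol, &info));
//   PetscCall(MatLUFactorNumeric(F, A, &info));
//   PetscCall(MatSolve(F, b, x));
//   PetscCall(MatDestroy(&F));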

2345: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2346: {
2347:   PetscFunctionBegin;
2348:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2349:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2350:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2351:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2352:   PetscFunctionReturn(PETSC_SUCCESS);
2353: }
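// Once registered, these constructors are looked up by the (MATSOLVERKOKKOS, MATSEQAIJKOKKOS,
// factor-type) triple, e.g. by MatGetFactor() or via -pc_factor_mat_solver_type kokkos.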

2355: /* Utility to print out a KokkosCsrMatrix for debugging */
2356: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2357: {
2358:   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2359:   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2360:   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2361:   const PetscInt    *i  = iv.data();
2362:   const PetscInt    *j  = jv.data();
2363:   const PetscScalar *a  = av.data();
2364:   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();

2366:   PetscFunctionBegin;
2367:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " KokkosCsrMatrix, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2368:   for (PetscInt k = 0; k < m; k++) {
2369:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2370:     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2371:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2372:   }
2373:   PetscFunctionReturn(PETSC_SUCCESS);
2374: }
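// Debugging usage sketch (assuming aijkok points to a valid Mat_SeqAIJKokkos whose csrmat
// member is up to date on device):
//
//   PetscCall(PrintCsrMatrix(aijkok->csrmat));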