Actual source code: aijkok.kokkos.cxx

  1: #include <petsc_kokkos.hpp>
  2: #include <petscvec_kokkos.hpp>
  3: #include <petscmat_kokkos.hpp>
  4: #include <petscpkg_version.h>
  5: #include <petsc/private/petscimpl.h>
  6: #include <petsc/private/sfimpl.h>
  7: #include <petsc/private/kokkosimpl.hpp>
  8: #include <petscsys.h>

 10: #include <Kokkos_Core.hpp>
 11: #include <KokkosBlas.hpp>
 12: #include <KokkosSparse_CrsMatrix.hpp>

 14: // To suppress compiler warnings:
 15: // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
 16: // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
 17: // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
 18: // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
 19: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
 20: #include <KokkosSparse_spmv.hpp>
 21: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()

 23: #include <KokkosSparse_spiluk.hpp>
 24: #include <KokkosSparse_sptrsv.hpp>
 25: #include <KokkosSparse_spgemm.hpp>
 26: #include <KokkosSparse_spadd.hpp>
 27: #include <KokkosBatched_LU_Decl.hpp>
 28: #include <KokkosBatched_InverseLU_Decl.hpp>

 30: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

 32: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
 33:   #include <KokkosSparse_Utils.hpp>
 34: using KokkosSparse::sort_crs_matrix;
 35: using KokkosSparse::Impl::transpose_matrix;
 36: #else
 37:   #include <KokkosKernels_Sorting.hpp>
 38: using KokkosKernels::sort_crs_matrix;
 39: using KokkosKernels::Impl::transpose_matrix;
 40: #endif

 42: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
 43: using KokkosSparse::spiluk_symbolic;
 44: using KokkosSparse::spiluk_numeric;
 45: using KokkosSparse::sptrsv_symbolic;
 46: using KokkosSparse::sptrsv_solve;
 47: using KokkosSparse::Experimental::SPTRSVAlgorithm;
 48: using KokkosSparse::Experimental::SPILUKAlgorithm;
 49: #else
 50: using KokkosSparse::Experimental::spiluk_symbolic;
 51: using KokkosSparse::Experimental::spiluk_numeric;
 52: using KokkosSparse::Experimental::sptrsv_symbolic;
 53: using KokkosSparse::Experimental::sptrsv_solve;
 54: using KokkosSparse::Experimental::SPTRSVAlgorithm;
 55: using KokkosSparse::Experimental::SPILUKAlgorithm;
 56: #endif

 58: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

 60: /* MatAssemblyEnd_SeqAIJKokkos() is called once the nonzeros of the matrix have been finalized, either after
 61:    we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
 62:    In the latter case, it is important to set a_dual's sync state correctly.
 63:  */
 64: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
 65: {
 66:   Mat_SeqAIJ       *aijseq;
 67:   Mat_SeqAIJKokkos *aijkok;

 69:   PetscFunctionBegin;
 70:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
 71:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

 73:   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
 74:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 76:   /* If aijkok does not exist, we just copy i, j to device.
 77:      If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
 78:      In both cases, we build a new aijkok structure.
 79:   */
 80:   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
 81:     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing the still-needed a,i,j on host when deleting aijkok */
 82:       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
 83:       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
 84:       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscScalar), (void **)&aijseq->a));
 85:       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
 86:       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
 87:       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
 88:       aijseq->free_a  = PETSC_TRUE;
 89:       aijseq->free_ij = PETSC_TRUE;
 90:       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
 91:          we have the CsrMatrix on device first and then copy to host, followed by
 92:          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
 93:          One could improve it by not using a NULL garray.
 94:       */
 95:     }
 96:     delete aijkok;
 97:     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
 98:     A->spptr = aijkok;
 99:   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100:     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101:     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102:     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103:   }
104:   PetscFunctionReturn(PETSC_SUCCESS);
105: }

107: /* Sync the CSR data to device if not yet done */
108: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109: {
110:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

112:   PetscFunctionBegin;
113:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114:   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115:   if (aijkok->a_dual.need_sync_device()) {
116:     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
118:     aijkok->hermitian_updated = PETSC_FALSE;
119:   }
120:   PetscFunctionReturn(PETSC_SUCCESS);
121: }

123: /* Mark the CSR data on device as modified */
124: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125: {
126:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

128:   PetscFunctionBegin;
129:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
130:   aijkok->a_dual.clear_sync_state();
131:   aijkok->a_dual.modify_device();
132:   aijkok->transpose_updated = PETSC_FALSE;
133:   aijkok->hermitian_updated = PETSC_FALSE;
134:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136:   PetscFunctionReturn(PETSC_SUCCESS);
137: }
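
Together, the Sync/Modify pair above brackets any device-side update of the matrix values: sync first so the device view is current, write on device, then mark the device copy as modified. A minimal sketch of the internal pattern (the kernel launch is hypothetical, for illustration only):

    PetscCall(MatSeqAIJKokkosSyncDevice(A));                       /* make a_dual's device view current */
    Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    auto              aa   = akok->a_dual.view_device();
    /* ... launch a kernel that writes aa(...) on device ... */
    PetscCall(MatSeqAIJKokkosModifyDevice(A));                     /* host copy is now stale; cached transposes invalidated */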

139: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140: {
141:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142:   auto              exec   = PetscGetKokkosExecutionSpace();

144:   PetscFunctionBegin;
145:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146:   /* We do not expect factors to be needed on host */
147:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148:   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149:   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150:   PetscFunctionReturn(PETSC_SUCCESS);
151: }

153: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154: {
155:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

157:   PetscFunctionBegin;
158:   /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
159:     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160:     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
161:     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162:   */
163:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164:     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165:     *array = aijkok->a_dual.view_host().data();
166:   } else { /* Happens when calling MatSetValues on a newly created matrix */
167:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168:   }
169:   PetscFunctionReturn(PETSC_SUCCESS);
170: }

172: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173: {
174:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

176:   PetscFunctionBegin;
177:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178:   PetscFunctionReturn(PETSC_SUCCESS);
179: }

181: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182: {
183:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

185:   PetscFunctionBegin;
186:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187:     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188:     *array = aijkok->a_dual.view_host().data();
189:   } else {
190:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191:   }
192:   PetscFunctionReturn(PETSC_SUCCESS);
193: }

195: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196: {
197:   PetscFunctionBegin;
198:   *array = NULL;
199:   PetscFunctionReturn(PETSC_SUCCESS);
200: }

202: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203: {
204:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

206:   PetscFunctionBegin;
207:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208:     *array = aijkok->a_dual.view_host().data();
209:   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
210:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211:   }
212:   PetscFunctionReturn(PETSC_SUCCESS);
213: }

215: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216: {
217:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

219:   PetscFunctionBegin;
220:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221:     aijkok->a_dual.clear_sync_state();
222:     aijkok->a_dual.modify_host();
223:   }
224:   PetscFunctionReturn(PETSC_SUCCESS);
225: }
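
The four Get/Restore pairs above implement the host-side access protocol: the Get variants sync values to host when the device copy is newer (except the write-only variant, which skips the sync), and the Restore variants mark the host copy as modified so the next device use re-syncs. A minimal usage sketch, assuming A is an assembled MATSEQAIJKOKKOS:

    PetscScalar *a;
    PetscCall(MatSeqAIJGetArray(A, &a));     /* values synced to host if needed */
    a[0] += 1.0;                             /* modify on host */
    PetscCall(MatSeqAIJRestoreArray(A, &a)); /* host copy marked as modified */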

227: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228: {
229:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

231:   PetscFunctionBegin;
232:   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

234:   if (i) *i = aijkok->i_device_data();
235:   if (j) *j = aijkok->j_device_data();
236:   if (a) {
237:     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238:     *a = aijkok->a_device_data();
239:   }
240:   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241:   PetscFunctionReturn(PETSC_SUCCESS);
242: }
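
A minimal sketch of retrieving the device CSR arrays through this interface, assuming A is an assembled MATSEQAIJKOKKOS:

    const PetscInt *i, *j;
    PetscScalar    *a;
    PetscMemType    mtype;
    PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));
    /* mtype is PETSC_MEMTYPE_KOKKOS; i, j, a point into the Kokkos default memory space */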

244: /*
245:   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

247:   Input Parameter:
248: .  A       - the MATSEQAIJKOKKOS matrix

250:   Output Parameters:
251: +  perm_d - the permutation array on device, such that Ta(i) = Aa(perm(i))
252: -  T_d    - the transpose on device, whose value array is allocated but not initialized
253: */
254: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
255: {
256:   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
257:   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
258:   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
259:   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
260:   MatRowMapType          *Ti = Ti_h.data();
261:   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
262:   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
263:   PetscInt               *Tj   = Tj_h.data();
264:   PetscInt               *perm = perm_h.data();
265:   PetscInt               *offset;

267:   PetscFunctionBegin;
268:   // Populate Ti
269:   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
270:   Ti++;
271:   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
272:   Ti--;
273:   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

275:   // Populate Tj and the permutation array
276:   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
277:   for (PetscInt i = 0; i < m; i++) {
278:     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
279:       PetscInt r    = Aj[j];                       // row r of T
280:       PetscInt disp = Ti[r] + offset[r];

282:       Tj[disp]   = i; // col i of T
283:       perm[disp] = j;
284:       offset[r]++;
285:     }
286:   }
287:   PetscCall(PetscFree(offset));

289:   // Sort each row of T, along with the permutation array
290:   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

292:   // Output perm and T on device
293:   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
294:   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
295:   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
296:   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
297:   PetscFunctionReturn(PETSC_SUCCESS);
298: }
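
To make the counting/permutation scheme above concrete, here is a host-only sketch on a tiny 2x3 CSR matrix; all names are local to the example, not part of PETSc. In this small case each row of T holds at most one entry, so the per-row sort is a no-op:

    #include <cstdio>
    #include <vector>

    int main()
    {
      // A = [1 0 2; 0 3 0]: m = 2 rows, n = 3 cols, nz = 3
      const int           m = 2, n = 3, nz = 3;
      std::vector<int>    Ai{0, 2, 3}, Aj{0, 2, 1};
      std::vector<double> Aa{1, 2, 3};

      std::vector<int> Ti(n + 1, 0), Tj(nz), perm(nz);
      for (int k = 0; k < nz; k++) Ti[Aj[k] + 1]++;   // count nonzeros per column of A (= per row of T)
      for (int r = 0; r < n; r++) Ti[r + 1] += Ti[r]; // prefix-sum into T's row pointers

      std::vector<int> offset(n, 0);
      for (int i = 0; i < m; i++) {
        for (int k = Ai[i]; k < Ai[i + 1]; k++) {
          int r      = Aj[k];               // A's (i, r) becomes T's (r, i)
          int disp   = Ti[r] + offset[r]++;
          Tj[disp]   = i;
          perm[disp] = k;                   // so that Ta[disp] = Aa[perm[disp]]
        }
      }
      for (int k = 0; k < nz; k++) printf("Ta[%d] = Aa[%d] = %g\n", k, perm[k], Aa[perm[k]]);
      return 0; // prints Ta = {1, 3, 2}, the values of A^T in CSR order
    }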

300: // Generate the transpose on device and cache it internally
301: // Note: KK transpose_matrix() does not support separate symbolic/numeric transpose phases, so we do it on our own
302: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
303: {
304:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
305:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
306:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
307:   KokkosCsrMatrix  &T = akok->csrmatT;

309:   PetscFunctionBegin;
310:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
311:   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

313:   const auto &Aa = akok->a_dual.view_device();

315:   if (A->symmetric == PETSC_BOOL3_TRUE) {
316:     *csrmatT = akok->csrmat;
317:   } else {
318:     // See if we already have a cached transpose and its values are up to date
319:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
320:       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
321:         const auto &perm = akok->transpose_perm; // get the permutation array
322:         auto       &Ta   = T.values;

324:         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
325:       }
326:     } else { // Generate T of size n x m for the first time
327:       MatRowMapKokkosView perm;

329:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
330:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
331:       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
332:     }
333:     akok->transpose_updated = PETSC_TRUE;
334:     *csrmatT                = akok->csrmatT;
335:   }
336:   PetscFunctionReturn(PETSC_SUCCESS);
337: }

339: // Generate the Hermitian transpose on device and cache it internally
340: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
341: {
342:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
343:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
344:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
345:   KokkosCsrMatrix  &T = akok->csrmatH;

347:   PetscFunctionBegin;
348:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
349:   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

351:   const auto &Aa = akok->a_dual.view_device();

353:   if (A->hermitian == PETSC_BOOL3_TRUE) {
354:     *csrmatH = akok->csrmat;
355:   } else {
356:     // See if we already have a cached Hermitian transpose and its values are up to date
357:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
358:       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
359:         const auto &perm = akok->transpose_perm; // get the permutation array
360:         auto       &Ta   = T.values;

362:         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
363:       }
364:     } else { // Generate T of size n x m for the first time
365:       MatRowMapKokkosView perm;

367:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
368:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
369:       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
370:     }
371:     akok->hermitian_updated = PETSC_TRUE;
372:     *csrmatH                = akok->csrmatH;
373:   }
374:   PetscFunctionReturn(PETSC_SUCCESS);
375: }

377: /* y = A x */
378: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
379: {
380:   Mat_SeqAIJKokkos          *aijkok;
381:   ConstPetscScalarKokkosView xv;
382:   PetscScalarKokkosView      yv;

384:   PetscFunctionBegin;
385:   PetscCall(PetscLogGpuTimeBegin());
386:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
387:   PetscCall(VecGetKokkosView(xx, &xv));
388:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
389:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
390:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
391:   PetscCall(VecRestoreKokkosView(xx, &xv));
392:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
393:   /* 2.0*nnz - numRows would be more accurate here, but that assumes there are no empty rows. So we are a little sloppy. */
394:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
395:   PetscCall(PetscLogGpuTimeEnd());
396:   PetscFunctionReturn(PETSC_SUCCESS);
397: }

399: /* y = A^T x */
400: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
401: {
402:   Mat_SeqAIJKokkos          *aijkok;
403:   const char                *mode;
404:   ConstPetscScalarKokkosView xv;
405:   PetscScalarKokkosView      yv;
406:   KokkosCsrMatrix            csrmat;

408:   PetscFunctionBegin;
409:   PetscCall(PetscLogGpuTimeBegin());
410:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
411:   PetscCall(VecGetKokkosView(xx, &xv));
412:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
413:   if (A->form_explicit_transpose) {
414:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
415:     mode = "N";
416:   } else {
417:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
418:     csrmat = aijkok->csrmat;
419:     mode   = "T";
420:   }
421:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
422:   PetscCall(VecRestoreKokkosView(xx, &xv));
423:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
424:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
425:   PetscCall(PetscLogGpuTimeEnd());
426:   PetscFunctionReturn(PETSC_SUCCESS);
427: }

429: /* y = A^H x */
430: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
431: {
432:   Mat_SeqAIJKokkos          *aijkok;
433:   const char                *mode;
434:   ConstPetscScalarKokkosView xv;
435:   PetscScalarKokkosView      yv;
436:   KokkosCsrMatrix            csrmat;

438:   PetscFunctionBegin;
439:   PetscCall(PetscLogGpuTimeBegin());
440:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
441:   PetscCall(VecGetKokkosView(xx, &xv));
442:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
443:   if (A->form_explicit_transpose) {
444:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
445:     mode = "N";
446:   } else {
447:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
448:     csrmat = aijkok->csrmat;
449:     mode   = "C";
450:   }
451:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
452:   PetscCall(VecRestoreKokkosView(xx, &xv));
453:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
454:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
455:   PetscCall(PetscLogGpuTimeEnd());
456:   PetscFunctionReturn(PETSC_SUCCESS);
457: }

459: /* z = A x + y */
460: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
461: {
462:   Mat_SeqAIJKokkos          *aijkok;
463:   ConstPetscScalarKokkosView xv;
464:   PetscScalarKokkosView      zv;

466:   PetscFunctionBegin;
467:   PetscCall(PetscLogGpuTimeBegin());
468:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
469:   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
470:   PetscCall(VecGetKokkosView(xx, &xv));
471:   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
472:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
473:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
474:   PetscCall(VecRestoreKokkosView(xx, &xv));
475:   PetscCall(VecRestoreKokkosView(zz, &zv));
476:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
477:   PetscCall(PetscLogGpuTimeEnd());
478:   PetscFunctionReturn(PETSC_SUCCESS);
479: }

481: /* z = A^T x + y */
482: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
483: {
484:   Mat_SeqAIJKokkos          *aijkok;
485:   const char                *mode;
486:   ConstPetscScalarKokkosView xv;
487:   PetscScalarKokkosView      zv;
488:   KokkosCsrMatrix            csrmat;

490:   PetscFunctionBegin;
491:   PetscCall(PetscLogGpuTimeBegin());
492:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
493:   if (zz != yy) PetscCall(VecCopy(yy, zz));
494:   PetscCall(VecGetKokkosView(xx, &xv));
495:   PetscCall(VecGetKokkosView(zz, &zv));
496:   if (A->form_explicit_transpose) {
497:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
498:     mode = "N";
499:   } else {
500:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
501:     csrmat = aijkok->csrmat;
502:     mode   = "T";
503:   }
504:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
505:   PetscCall(VecRestoreKokkosView(xx, &xv));
506:   PetscCall(VecRestoreKokkosView(zz, &zv));
507:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
508:   PetscCall(PetscLogGpuTimeEnd());
509:   PetscFunctionReturn(PETSC_SUCCESS);
510: }

512: /* z = A^H x + y */
513: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
514: {
515:   Mat_SeqAIJKokkos          *aijkok;
516:   const char                *mode;
517:   ConstPetscScalarKokkosView xv;
518:   PetscScalarKokkosView      zv;
519:   KokkosCsrMatrix            csrmat;

521:   PetscFunctionBegin;
522:   PetscCall(PetscLogGpuTimeBegin());
523:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
524:   if (zz != yy) PetscCall(VecCopy(yy, zz));
525:   PetscCall(VecGetKokkosView(xx, &xv));
526:   PetscCall(VecGetKokkosView(zz, &zv));
527:   if (A->form_explicit_transpose) {
528:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
529:     mode = "N";
530:   } else {
531:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
532:     csrmat = aijkok->csrmat;
533:     mode   = "C";
534:   }
535:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
536:   PetscCall(VecRestoreKokkosView(xx, &xv));
537:   PetscCall(VecRestoreKokkosView(zz, &zv));
538:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
539:   PetscCall(PetscLogGpuTimeEnd());
540:   PetscFunctionReturn(PETSC_SUCCESS);
541: }
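
All six products above route through KokkosSparse::spmv(); when form_explicit_transpose is set, the transpose/Hermitian variants apply the cached explicit transpose in "N" mode instead of running A in "T"/"C" mode. A minimal usage sketch, assuming A is a square assembled MATSEQAIJKOKKOS and x, y, z are conforming VECKOKKOS vectors:

    PetscCall(MatMult(A, x, y));                /* y = A x, computed on device */
    PetscCall(MatMultTranspose(A, x, y));       /* y = A^T x */
    PetscCall(MatMultTransposeAdd(A, x, y, z)); /* z = A^T x + y */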

543: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
544: {
545:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

547:   PetscFunctionBegin;
548:   switch (op) {
549:   case MAT_FORM_EXPLICIT_TRANSPOSE:
550:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
551:     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
552:     A->form_explicit_transpose = flg;
553:     break;
554:   default:
555:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
556:     break;
557:   }
558:   PetscFunctionReturn(PETSC_SUCCESS);
559: }

561: /* Depending on reuse, either build a new mat, or use the existing mat */
562: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
563: {
564:   Mat_SeqAIJ *aseq;

566:   PetscFunctionBegin;
567:   PetscCall(PetscKokkosInitializeCheck());
568:   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
569:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
570:     PetscCall(MatSetType(*newmat, mtype));
571:   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
572:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
573:   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
574:     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
575:     PetscCall(PetscFree(A->defaultvectype));
576:     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
577:     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
578:     PetscCall(MatSetOps_SeqAIJKokkos(A));
579:     aseq = static_cast<Mat_SeqAIJ *>(A->data);
580:     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
581:       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
582:       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
583:     }
584:   }
585:   PetscFunctionReturn(PETSC_SUCCESS);
586: }
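
A minimal usage sketch of the three reuse modes, assuming A is an assembled MATSEQAIJ:

    Mat Akok;
    PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &Akok)); /* build a new Kokkos copy of A */
    PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_REUSE_MATRIX, &Akok));   /* refresh Akok's values from A */
    PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));    /* convert A itself */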

588: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
589:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
590:  */
591: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
592: {
593:   Mat_SeqAIJ       *bseq;
594:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
595:   Mat               mat;

597:   PetscFunctionBegin;
598:   /* Do not copy values on host as A's latest values might be on device. We don't want to sync blindly */
599:   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
600:   mat = *B;
601:   if (A->assembled) {
602:     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
603:     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
604:     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
605:     /* Now copy values to B if needed */
606:     if (dupOption == MAT_COPY_VALUES) {
607:       if (akok->a_dual.need_sync_device()) {
608:         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
609:         bkok->a_dual.modify_host();
610:       } else { /* If device has the latest data, we only copy data on device */
611:         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
612:         bkok->a_dual.modify_device();
613:       }
614:     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
615:       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
616:       bkok->a_dual.modify_host();
617:     }
618:     mat->spptr = bkok;
619:   }

621:   PetscCall(PetscFree(mat->defaultvectype));
622:   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
623:   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
624:   PetscCall(MatSetOps_SeqAIJKokkos(mat));
625:   PetscFunctionReturn(PETSC_SUCCESS);
626: }

628: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
629: {
630:   Mat               At;
631:   KokkosCsrMatrix   internT;
632:   Mat_SeqAIJKokkos *atkok, *bkok;

634:   PetscFunctionBegin;
635:   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
636:   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
637:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
638:     /* Deep copy internT, as we want to isolate the internal transpose */
639:     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
640:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
641:     if (reuse == MAT_INITIAL_MATRIX) *B = At;
642:     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
643:   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
644:     if ((*B)->assembled) {
645:       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
646:       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
647:       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
648:     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
649:       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
650:       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
651:       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
652:       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
653:       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
654:     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
655:   }
656:   PetscFunctionReturn(PETSC_SUCCESS);
657: }

659: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
660: {
661:   Mat_SeqAIJKokkos *aijkok;

663:   PetscFunctionBegin;
664:   if (A->factortype == MAT_FACTOR_NONE) {
665:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
666:     delete aijkok;
667:   } else {
668:     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
669:   }
670:   A->spptr = NULL;
671:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
672:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
673:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
674: #if defined(PETSC_HAVE_HYPRE)
675:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
676: #endif
677:   PetscCall(MatDestroy_SeqAIJ(A));
678:   PetscFunctionReturn(PETSC_SUCCESS);
679: }

681: /*MC
682:    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

684:    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types

686:    Options Database Key:
687: .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

689:   Level: beginner

691: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
692: M*/
693: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
694: {
695:   PetscFunctionBegin;
696:   PetscCall(PetscKokkosInitializeCheck());
697:   PetscCall(MatCreate_SeqAIJ(A));
698:   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
699:   PetscFunctionReturn(PETSC_SUCCESS);
700: }
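
A minimal sketch of creating such a matrix through the options database; m, n and the assembly loop are placeholders:

    Mat A;
    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, m, n, m, n));
    PetscCall(MatSetFromOptions(A)); /* picks up -mat_type aijkokkos */
    /* ... MatSetValues() / MatAssemblyBegin() / MatAssemblyEnd() as with any AIJ matrix ... */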

702: /* Merge A, B into a matrix C. A is put before B. C's size will be A->rmap->n by (A->cmap->n + B->cmap->n) */
703: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
704: {
705:   Mat_SeqAIJ         *a, *b;
706:   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
707:   MatScalarKokkosView aa, ba, ca;
708:   MatRowMapKokkosView ai, bi, ci;
709:   MatColIdxKokkosView aj, bj, cj;
710:   PetscInt            m, n, nnz, aN;

712:   PetscFunctionBegin;
715:   PetscAssertPointer(C, 4);
716:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
717:   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
718:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
719:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

721:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
722:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
723:   a    = static_cast<Mat_SeqAIJ *>(A->data);
724:   b    = static_cast<Mat_SeqAIJ *>(B->data);
725:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
726:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
727:   aa   = akok->a_dual.view_device();
728:   ai   = akok->i_dual.view_device();
729:   ba   = bkok->a_dual.view_device();
730:   bi   = bkok->i_dual.view_device();
731:   m    = A->rmap->n; /* M, N and nnz of C */
732:   n    = A->cmap->n + B->cmap->n;
733:   nnz  = a->nz + b->nz;
734:   aN   = A->cmap->n; /* N of A */
735:   if (reuse == MAT_INITIAL_MATRIX) {
736:     aj      = akok->j_dual.view_device();
737:     bj      = bkok->j_dual.view_device();
738:     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
739:     auto ci = MatRowMapKokkosView("i", ai.extent(0));
740:     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));

742:     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
743:     Kokkos::parallel_for(
744:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
745:         PetscInt i       = t.league_rank(); /* row i */
746:         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

748:         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
749:                                                    ci(i) = coffset;
750:                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
751:         });

753:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
754:           if (k < alen) {
755:             ca(coffset + k) = aa(ai(i) + k);
756:             cj(coffset + k) = aj(ai(i) + k);
757:           } else {
758:             ca(coffset + k) = ba(bi(i) + k - alen);
759:             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
760:           }
761:         });
762:       });
763:     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
764:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
765:   } else if (reuse == MAT_REUSE_MATRIX) {
767:     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
768:     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
769:     ca   = ckok->a_dual.view_device();
770:     ci   = ckok->i_dual.view_device();

772:     Kokkos::parallel_for(
773:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
774:         PetscInt i    = t.league_rank(); /* row i */
775:         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
776:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
777:           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
778:           else ca(ci(i) + k) = ba(bi(i) + k - alen);
779:         });
780:       });
781:     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
782:   }
783:   PetscFunctionReturn(PETSC_SUCCESS);
784: }
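
For example, merging a 2x2 A with a 2x3 B yields a 2x5 C: row i of C holds row i of A followed by row i of B, and an entry in column j of B lands in column j + 2 of C (the aN shift in the kernels above).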

786: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
787: {
788:   PetscFunctionBegin;
789:   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
790:   PetscFunctionReturn(PETSC_SUCCESS);
791: }

793: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
794: {
795:   Mat_Product                 *product = C->product;
796:   Mat                          A, B;
797:   bool                         transA, transB; /* use bool, since KK needs this type */
798:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
799:   Mat_SeqAIJ                  *c;
800:   MatProductData_SeqAIJKokkos *pdata;
801:   KokkosCsrMatrix              csrmatA, csrmatB;

803:   PetscFunctionBegin;
804:   MatCheckProduct(C, 1);
805:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
806:   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

808:   // See if numeric has already been done in symbolic (e.g., the user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
809:   // If yes, skip the numeric phase, but reset the flag so that the next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
810:   // we still do numeric.
811:   if (pdata->reusesym) { // numeric reuses results from symbolic
812:     pdata->reusesym = PETSC_FALSE;
813:     PetscFunctionReturn(PETSC_SUCCESS);
814:   }

816:   switch (product->type) {
817:   case MATPRODUCT_AB:
818:     transA = false;
819:     transB = false;
820:     break;
821:   case MATPRODUCT_AtB:
822:     transA = true;
823:     transB = false;
824:     break;
825:   case MATPRODUCT_ABt:
826:     transA = false;
827:     transB = true;
828:     break;
829:   default:
830:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
831:   }

833:   A = product->A;
834:   B = product->B;
835:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
836:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
837:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
838:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
839:   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

841:   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

843:   csrmatA = akok->csrmat;
844:   csrmatB = bkok->csrmat;

846:   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
847:   if (transA) {
848:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
849:     transA = false;
850:   }

852:   if (transB) {
853:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
854:     transB = false;
855:   }
856:   PetscCall(PetscLogGpuTimeBegin());
857:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
858: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
859:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
860:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
861: #endif

863:   PetscCall(PetscLogGpuTimeEnd());
864:   PetscCall(MatSeqAIJKokkosModifyDevice(C));
865:   /* shorter version of MatAssemblyEnd_SeqAIJ */
866:   c = (Mat_SeqAIJ *)C->data;
867:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
868:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
869:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
870:   c->reallocs         = 0;
871:   C->info.mallocs     = 0;
872:   C->info.nz_unneeded = 0;
873:   C->assembled = C->was_assembled = PETSC_TRUE;
874:   C->num_ass++;
875:   PetscFunctionReturn(PETSC_SUCCESS);
876: }

878: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
879: {
880:   Mat_Product                 *product = C->product;
881:   MatProductType               ptype;
882:   Mat                          A, B;
883:   bool                         transA, transB;
884:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
885:   MatProductData_SeqAIJKokkos *pdata;
886:   MPI_Comm                     comm;
887:   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

889:   PetscFunctionBegin;
890:   MatCheckProduct(C, 1);
891:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
892:   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
893:   A = product->A;
894:   B = product->B;
895:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
896:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
897:   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
898:   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
899:   csrmatA = akok->csrmat;
900:   csrmatB = bkok->csrmat;

902:   ptype = product->type;
903:   // Take advantage of the symmetry if true
904:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
905:     ptype                                          = MATPRODUCT_AB;
906:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
907:   }
908:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
909:     ptype                                          = MATPRODUCT_AB;
910:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
911:   }

913:   switch (ptype) {
914:   case MATPRODUCT_AB:
915:     transA = false;
916:     transB = false;
917:     PetscCall(MatSetBlockSizesFromMats(C, A, B));
918:     break;
919:   case MATPRODUCT_AtB:
920:     transA = true;
921:     transB = false;
922:     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
923:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
924:     break;
925:   case MATPRODUCT_ABt:
926:     transA = false;
927:     transB = true;
928:     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
929:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
930:     break;
931:   default:
932:     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
933:   }
934:   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
935:   pdata->reusesym = product->api_user;

937:   /* TODO: add command line options to select spgemm algorithms */
938:   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

940:   /* CUDA 10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in CUDA 11.4 */
941: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
942:   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
943:   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
944:   #endif
945: #endif
946:   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

948:   PetscCall(PetscLogGpuTimeBegin());
949:   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
950:   if (transA) {
951:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
952:     transA = false;
953:   }

955:   if (transB) {
956:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
957:     transB = false;
958:   }

960:   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
961:   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
962:     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
963:     calling new Mat_SeqAIJKokkos().
964:     TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
965:   */
966:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
967: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
968:   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
969:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
970:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK! */
971: #endif
972:   PetscCall(PetscLogGpuTimeEnd());

974:   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
975:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
976:   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
977:   PetscFunctionReturn(PETSC_SUCCESS);
978: }
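
A minimal sketch of how the reusesym flag plays out for an API user, assuming A and B are assembled MATSEQAIJKOKKOS matrices:

    Mat C;
    PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); /* symbolic already produced numeric; reusesym skips the next numeric */
    /* ... update values of A or B ... */
    PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   /* this numeric phase actually recomputes C */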

980: /* handles sparse matrix matrix ops */
981: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
982: {
983:   Mat_Product *product = mat->product;
984:   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

986:   PetscFunctionBegin;
987:   MatCheckProduct(mat, 1);
988:   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
989:   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
990:   if (Biskok && Ciskok) {
991:     switch (product->type) {
992:     case MATPRODUCT_AB:
993:     case MATPRODUCT_AtB:
994:     case MATPRODUCT_ABt:
995:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
996:       break;
997:     case MATPRODUCT_PtAP:
998:     case MATPRODUCT_RARt:
999:     case MATPRODUCT_ABC:
1000:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1001:       break;
1002:     default:
1003:       break;
1004:     }
1005:   } else { /* fallback for AIJ */
1006:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1007:   }
1008:   PetscFunctionReturn(PETSC_SUCCESS);
1009: }

1011: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1012: {
1013:   Mat_SeqAIJKokkos *aijkok;

1015:   PetscFunctionBegin;
1016:   PetscCall(PetscLogGpuTimeBegin());
1017:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1018:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1019:   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1020:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1021:   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1022:   PetscCall(PetscLogGpuTimeEnd());
1023:   PetscFunctionReturn(PETSC_SUCCESS);
1024: }

1026: // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1027: static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1028: {
1029:   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

1031:   PetscFunctionBegin;
1032:   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1033:     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);

1035:     PetscCall(PetscLogGpuTimeBegin());
1036:     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1037:     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1038:     const auto &Aa     = aijkok->a_dual.view_device();
1039:     const auto &Adiag  = aijkok->diag_dual.view_device();
1040:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1041:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1042:     PetscCall(PetscLogGpuFlops(n));
1043:     PetscCall(PetscLogGpuTimeEnd());
1044:   } else { // need reassembly, very slow!
1045:     PetscCall(MatShift_Basic(A, a));
1046:   }
1047:   PetscFunctionReturn(PETSC_SUCCESS);
1048: }

1050: static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1051: {
1052:   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

1054:   PetscFunctionBegin;
1055:   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1056:     ConstPetscScalarKokkosView dv;
1057:     PetscInt                   n, nv;

1059:     PetscCall(PetscLogGpuTimeBegin());
1060:     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1061:     PetscCall(VecGetKokkosView(D, &dv));
1062:     PetscCall(VecGetLocalSize(D, &nv));
1063:     n = PetscMin(Y->rmap->n, Y->cmap->n);
1064:     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

1066:     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1067:     const auto &Aa     = aijkok->a_dual.view_device();
1068:     const auto &Adiag  = aijkok->diag_dual.view_device();
1069:     PetscCallCXX(Kokkos::parallel_for(
1070:       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1071:         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1072:         else Aa(Adiag(i)) += dv(i);
1073:       }));
1074:     PetscCall(VecRestoreKokkosView(D, &dv));
1075:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1076:     PetscCall(PetscLogGpuFlops(n));
1077:     PetscCall(PetscLogGpuTimeEnd());
1078:   } else { // need reassembly, very slow!
1079:     PetscCall(MatDiagonalSet_Default(Y, D, is));
1080:   }
1081:   PetscFunctionReturn(PETSC_SUCCESS);
1082: }

1084: static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1085: {
1086:   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1087:   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1088:   ConstPetscScalarKokkosView lv, rv;

1090:   PetscFunctionBegin;
1091:   PetscCall(PetscLogGpuTimeBegin());
1092:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1093:   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1094:   const auto &Aa     = aijkok->a_dual.view_device();
1095:   const auto &Ai     = aijkok->i_dual.view_device();
1096:   const auto &Aj     = aijkok->j_dual.view_device();
1097:   if (ll) {
1098:     PetscCall(VecGetLocalSize(ll, &m));
1099:     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1100:     PetscCall(VecGetKokkosView(ll, &lv));
1101:     PetscCallCXX(Kokkos::parallel_for( // for each row
1102:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1103:         PetscInt i   = t.league_rank(); // row i
1104:         PetscInt len = Ai(i + 1) - Ai(i);
1105:         // scale entries on the row
1106:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1107:       }));
1108:     PetscCall(VecRestoreKokkosView(ll, &lv));
1109:     PetscCall(PetscLogGpuFlops(nz));
1110:   }
1111:   if (rr) {
1112:     PetscCall(VecGetLocalSize(rr, &n));
1113:     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1114:     PetscCall(VecGetKokkosView(rr, &rv));
1115:     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1116:       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1117:     PetscCall(VecRestoreKokkosView(rr, &rv));
1118:     PetscCall(PetscLogGpuFlops(nz));
1119:   }
1120:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1121:   PetscCall(PetscLogGpuTimeEnd());
1122:   PetscFunctionReturn(PETSC_SUCCESS);
1123: }

1125: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1126: {
1127:   Mat_SeqAIJKokkos *aijkok;

1129:   PetscFunctionBegin;
1130:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1131:   if (aijkok) { /* Only zero the device if data is already there */
1132:     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1133:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1134:   } else { /* Might be preallocated but not assembled */
1135:     PetscCall(MatZeroEntries_SeqAIJ(A));
1136:   }
1137:   PetscFunctionReturn(PETSC_SUCCESS);
1138: }

1140: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1141: {
1142:   Mat_SeqAIJKokkos     *aijkok;
1143:   PetscInt              n;
1144:   PetscScalarKokkosView xv;

1146:   PetscFunctionBegin;
1147:   PetscCall(VecGetLocalSize(x, &n));
1148:   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1149:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

1151:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1152:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

1154:   const auto &Aa    = aijkok->a_dual.view_device();
1155:   const auto &Ai    = aijkok->i_dual.view_device();
1156:   const auto &Adiag = aijkok->diag_dual.view_device();

1158:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1159:   Kokkos::parallel_for(
1160:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1161:       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1162:       else xv(i) = 0;
1163:     });
1164:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1165:   PetscFunctionReturn(PETSC_SUCCESS);
1166: }

1168: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1169: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1170: {
1171:   Mat_SeqAIJKokkos *aijkok;

1173:   PetscFunctionBegin;
1175:   PetscAssertPointer(kv, 2);
1176:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1177:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1178:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1179:   *kv    = aijkok->a_dual.view_device();
1180:   PetscFunctionReturn(PETSC_SUCCESS);
1181: }

1183: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1184: {
1185:   PetscFunctionBegin;
1187:   PetscAssertPointer(kv, 2);
1188:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1189:   PetscFunctionReturn(PETSC_SUCCESS);
1190: }

1192: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1193: {
1194:   Mat_SeqAIJKokkos *aijkok;

1196:   PetscFunctionBegin;
1198:   PetscAssertPointer(kv, 2);
1199:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1200:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1201:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1202:   *kv    = aijkok->a_dual.view_device();
1203:   PetscFunctionReturn(PETSC_SUCCESS);
1204: }

1206: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1207: {
1208:   PetscFunctionBegin;
1210:   PetscAssertPointer(kv, 2);
1211:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1212:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1213:   PetscFunctionReturn(PETSC_SUCCESS);
1214: }

1216: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1217: {
1218:   Mat_SeqAIJKokkos *aijkok;

1220:   PetscFunctionBegin;
1222:   PetscAssertPointer(kv, 2);
1223:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1224:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1225:   *kv    = aijkok->a_dual.view_device();
1226:   PetscFunctionReturn(PETSC_SUCCESS);
1227: }

1229: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1230: {
1231:   PetscFunctionBegin;
1233:   PetscAssertPointer(kv, 2);
1234:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1235:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1236:   PetscFunctionReturn(PETSC_SUCCESS);
1237: }
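
A minimal sketch of a device-side update through these accessors, in the same style the file itself uses (the scaling kernel is hypothetical):

    MatScalarKokkosView av;
    PetscCall(MatSeqAIJGetKokkosView(A, &av)); /* synced device view of the values */
    PetscCallCXX(Kokkos::parallel_for(
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, av.extent(0)),
      KOKKOS_LAMBDA(const PetscInt k) { av(k) *= 2.0; }));
    PetscCall(MatSeqAIJRestoreKokkosView(A, &av)); /* marks the device copy as modified */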

1239: PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1240: {
1241:   Mat_SeqAIJKokkos *akok;

1243:   PetscFunctionBegin;
1244:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1245:   PetscCall(MatCreate(comm, A));
1246:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1247:   PetscFunctionReturn(PETSC_SUCCESS);
1248: }
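
/* A minimal sketch (illustrative): wrap device-resident CSR data of a 2x2 identity matrix as a
   MATSEQAIJKOKKOS without any host staging. Filling the views (via a small kernel, or
   Kokkos::deep_copy() from host mirrors) is elided here.

     Kokkos::View<PetscInt *>    i_d("i_d", 3), j_d("j_d", 2); // rowptr {0,1,2}, colidx {0,1}
     Kokkos::View<PetscScalar *> a_d("a_d", 2);                // values {1,1}
     Mat A;
     PetscCall(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, 2, 2, i_d, j_d, a_d, &A));
*/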

1250: /* Computes Y += alpha X */
1251: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1252: {
1253:   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1254:   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1255:   ConstMatScalarKokkosView Xa;
1256:   MatScalarKokkosView      Ya;
1257:   auto                     exec = PetscGetKokkosExecutionSpace();

1259:   PetscFunctionBegin;
1260:   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1261:   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1262:   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1263:   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1264:   PetscCall(PetscLogGpuTimeBegin());

1266:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1267:     PetscBool e;
1268:     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1269:     if (e) {
1270:       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1271:       if (e) pattern = SAME_NONZERO_PATTERN;
1272:     }
1273:   }

1275:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knows the sparsity pattern of C, one can skip
1276:     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz() and call cusparseScsrgeam2() directly.
1277:     If X were a SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1278:     KokkosSparse::spadd(alpha,A,beta,B,C) has separate symbolic and numeric phases, while MatAXPY does not.
1279:   */
1280:   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1281:   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1282:   Xa   = xkok->a_dual.view_device();
1283:   Ya   = ykok->a_dual.view_device();

1285:   if (pattern == SAME_NONZERO_PATTERN) {
1286:     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1287:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1288:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1289:     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1290:     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

1292:     Kokkos::parallel_for(
1293:       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1294:         PetscInt i = t.league_rank(); // row i
1295:         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1296:           // Only one thread works in a team
1297:           PetscInt p, q = Yi(i);
1298:           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1299:             while (q < Yi(i + 1) && Xj(p) != Yj(q)) q++; // find the matching nonzero on row i of Y (check bounds before reading Yj(q))
1300:             if (q < Yi(i + 1) && Xj(p) == Yj(q)) {       // Found it
1301:               Ya(q) += alpha * Xa(p);
1302:               q++;
1303:             } else {
1304:             // If not found, the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1305:             // Insert a NaN at the beginning of row i (if the row is not empty) so the invalid result is detectable.
1306: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1307:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1308: #else
1309:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1310: #endif
1311:             }
1312:           }
1313:         });
1314:       });
1315:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1316:   } else { // different nonzero patterns
1317:     Mat             Z;
1318:     KokkosCsrMatrix zcsr;
1319:     KernelHandle    kh;
1320:     kh.create_spadd_handle(true); // X, Y are sorted
1321:     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1322:     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1323:     zkok = new Mat_SeqAIJKokkos(zcsr);
1324:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1325:     PetscCall(MatHeaderReplace(Y, &Z));
1326:     kh.destroy_spadd_handle();
1327:   }
1328:   PetscCall(PetscLogGpuTimeEnd());
1329:   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1330:   PetscFunctionReturn(PETSC_SUCCESS);
1331: }
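
/* A typical call (illustrative): Y += 2 X. Promising SAME_NONZERO_PATTERN lets the cheap
   KokkosBlas::axpy() branch above run; with DIFFERENT_NONZERO_PATTERN the spadd path
   rebuilds Y's structure instead.

     PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN));
*/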

1333: struct MatCOOStruct_SeqAIJKokkos {
1334:   PetscCount           n;
1335:   PetscCount           Atot;
1336:   PetscInt             nz;
1337:   PetscCountKokkosView jmap;
1338:   PetscCountKokkosView perm;

1340:   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1341:   {
1342:     nz   = coo_h->nz;
1343:     n    = coo_h->n;
1344:     Atot = coo_h->Atot;
1345:     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1346:     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1347:   }
1348: };

1350: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1351: {
1352:   PetscFunctionBegin;
1353:   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1354:   PetscFunctionReturn(PETSC_SUCCESS);
1355: }

1357: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1358: {
1359:   Mat_SeqAIJKokkos          *akok;
1360:   Mat_SeqAIJ                *aseq;
1361:   PetscContainer             container_h;
1362:   MatCOOStruct_SeqAIJ       *coo_h;
1363:   MatCOOStruct_SeqAIJKokkos *coo_d;

1365:   PetscFunctionBegin;
1366:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1367:   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1368:   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1369:   delete akok;
1370:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1371:   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

1373:   // Copy the COO struct to device
1374:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1375:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1376:   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

1378:   // Put the COO struct in a container and then attach that to the matrix
1379:   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1380:   PetscFunctionReturn(PETSC_SUCCESS);
1381: }

1383: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1384: {
1385:   MatScalarKokkosView        Aa;
1386:   ConstMatScalarKokkosView   kv;
1387:   PetscMemType               memtype;
1388:   PetscContainer             container;
1389:   MatCOOStruct_SeqAIJKokkos *coo;

1391:   PetscFunctionBegin;
1392:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1393:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

1395:   const auto &n    = coo->n;
1396:   const auto &Annz = coo->nz;
1397:   const auto &jmap = coo->jmap;
1398:   const auto &perm = coo->perm;

1400:   PetscCall(PetscGetMemType(v, &memtype));
1401:   if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on host, we need to copy it to device */
1402:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1403:   } else {
1404:     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1405:   }

1407:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1408:   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

1410:   PetscCall(PetscLogGpuTimeBegin());
1411:   Kokkos::parallel_for(
1412:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1413:       PetscScalar sum = 0.0;
1414:       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1415:       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1416:     });
1417:   PetscCall(PetscLogGpuTimeEnd());

1419:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1420:   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1421:   PetscFunctionReturn(PETSC_SUCCESS);
1422: }
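
/* A minimal COO assembly sketch (illustrative), driving the two routines above through their
   public entry points. Repeated (i,j) entries are summed; with INSERT_VALUES the previous matrix
   values are discarded first (see the kernel above).

     PetscInt    coo_i[] = {0, 1, 1};
     PetscInt    coo_j[] = {0, 1, 1};
     PetscScalar v[]     = {1.0, 1.0, 2.0}; // the two (1,1) contributions sum to 3
     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j));
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES));
*/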

1424: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1425: {
1426:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

1428:   PetscFunctionBegin;
1429:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1430:   A->boundtocpu  = PETSC_FALSE;

1432:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1433:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1434:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1435:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1436:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1437:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1438:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1439:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1440:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1441:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1442:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1443:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1444:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1445:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1446:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1447:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1448:   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1449:   A->ops->shift                     = MatShift_SeqAIJKokkos;
1450:   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1451:   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1452:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1453:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1454:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1455:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1456:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1457:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1458:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1460:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1461:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1462: #if defined(PETSC_HAVE_HYPRE)
1463:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1464: #endif
1465:   PetscFunctionReturn(PETSC_SUCCESS);
1466: }

1468: /*
1469:    Extract the (prescribed) diagonal blocks of the matrix and then invert them

1471:   Input Parameters:
1472: +  A       - the MATSEQAIJKOKKOS matrix
1473: .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1474: .  bs2     - squares of the block sizes in 'csr' format, i.e., the i-th block is stored at offset bs2(i) in diagVal[]
1475: .  blkMap  - map from row ids to block ids, i.e., row i belongs to block blkMap(i)
1476: -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine

1478:   Output Parameter:
1479: .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1480: */
1481: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1482: {
1483:   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1484:   PetscInt          nblocks = bs.extent(0) - 1;

1486:   PetscFunctionBegin;
1487:   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

1489:   // Pull out the diagonal blocks of the matrix and then invert the blocks
1490:   auto Aa    = akok->a_dual.view_device();
1491:   auto Ai    = akok->i_dual.view_device();
1492:   auto Aj    = akok->j_dual.view_device();
1493:   auto Adiag = akok->diag_dual.view_device();
1494:   // TODO: how to tune the team size?
1495: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1496:   auto ts = Kokkos::AUTO();
1497: #else
1498:   auto ts = 16; // improved performance by ~30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1499: #endif
1500:   PetscCallCXX(Kokkos::parallel_for(
1501:     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1502:       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1503:       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1504:       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1505:       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1506:       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);

1508:       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1509:         PetscInt i = rstart + r;                                                            // i-th row in A

1511:         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1512:           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row

1514:           for (PetscInt c = 0; c < m; c++) {                   // walk m steps to see what column indices we will meet
1515:             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1516:               B(r, c) = 0.0;
1517:             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1518:               B(r, c) = Aa(first + c);
1519:             } else { // this entry does not show up in the CSR
1520:               B(r, c) = 0.0;
1521:             }
1522:           }
1523:         } else { // rare case that the diagonal does not exist
1524:           const PetscInt begin = Ai(i);
1525:           const PetscInt end   = Ai(i + 1);
1526:           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1527:           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1528:             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1529:             else if (Aj(j) >= rstart + m) break;
1530:           }
1531:         }
1532:       });

1534:       // LU-decompose B (w/o pivoting) and then invert B
1535:       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1536:       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1537:     }));
1538:   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1539:   PetscFunctionReturn(PETSC_SUCCESS);
1540: }
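
/* Worked example of the block encoding (illustrative): for a 5x5 matrix with two diagonal blocks
   of sizes 2 and 3, one would pass bs = {0, 2, 5}, bs2 = {0, 4, 13} (4 = 2*2 and 13 = 4 + 3*3),
   blkMap = {0, 0, 1, 1, 1}, and work/diagVal each of length 13. */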

1542: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1543: {
1544:   Mat_SeqAIJ *aseq;
1545:   PetscInt    i, m, n;
1546:   auto        exec = PetscGetKokkosExecutionSpace();

1548:   PetscFunctionBegin;
1549:   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");

1551:   m = akok->nrows();
1552:   n = akok->ncols();
1553:   PetscCall(MatSetSizes(A, m, n, m, n));
1554:   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));

1556:   /* Set up data structures of A as a MATSEQAIJ */
1557:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1558:   aseq = (Mat_SeqAIJ *)A->data;

1560:   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1561:   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));

1563:   aseq->i       = akok->i_host_data();
1564:   aseq->j       = akok->j_host_data();
1565:   aseq->a       = akok->a_host_data();
1566:   aseq->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
1567:   aseq->free_a  = PETSC_FALSE;
1568:   aseq->free_ij = PETSC_FALSE;
1569:   aseq->nz      = akok->nnz();
1570:   aseq->maxnz   = aseq->nz;

1572:   PetscCall(PetscMalloc1(m, &aseq->imax));
1573:   PetscCall(PetscMalloc1(m, &aseq->ilen));
1574:   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

1576:   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1577:   akok->nonzerostate = A->nonzerostate;
1578:   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1579:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1580:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1581:   PetscFunctionReturn(PETSC_SUCCESS);
1582: }

1584: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1585: {
1586:   PetscFunctionBegin;
1587:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1588:   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1589:   PetscFunctionReturn(PETSC_SUCCESS);
1590: }

1592: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1593: {
1594:   Mat_SeqAIJKokkos *akok;

1596:   PetscFunctionBegin;
1597:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1598:   PetscCall(MatCreate(comm, A));
1599:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1600:   PetscFunctionReturn(PETSC_SUCCESS);
1601: }

1603: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1605:    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1606:  */
1607: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1608: {
1609:   PetscFunctionBegin;
1610:   PetscCall(MatCreate(comm, A));
1611:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1612:   PetscFunctionReturn(PETSC_SUCCESS);
1613: }

1615: /*@C
1616:   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1617:   (the Kokkos version of the default sequential PETSc format). This matrix will ultimately be handled by
1618:   Kokkos for calculations.

1620:   Collective

1622:   Input Parameters:
1623: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1624: . m    - number of rows
1625: . n    - number of columns
1626: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1627: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

1629:   Output Parameter:
1630: . A - the matrix

1632:   Level: intermediate

1634:   Notes:
1635:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1636:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1637:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1639:   The AIJ format, also called
1640:   compressed row storage, is fully compatible with standard Fortran
1641:   storage.  That is, the stored row and column indices can begin at
1642:   either one (as in Fortran) or zero.

1644:   Specify the preallocated storage with either `nz` or `nnz` (not both).
1645:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1646:   allocation.

1648: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1649: @*/
1650: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1651: {
1652:   PetscFunctionBegin;
1653:   PetscCall(PetscKokkosInitializeCheck());
1654:   PetscCall(MatCreate(comm, A));
1655:   PetscCall(MatSetSizes(*A, m, n, m, n));
1656:   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1657:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1658:   PetscFunctionReturn(PETSC_SUCCESS);
1659: }
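
/* A minimal usage sketch (illustrative):

     Mat A;
     PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, 10, 10, 3, NULL, &A)); // ~3 nonzeros per row
     // ... set values and assemble as with any MATSEQAIJ matrix ...
     PetscCall(MatDestroy(&A));
*/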

1661: // After matrix numeric factorization, there are still steps to do before triangular solves can be called.
1662: // For example, for transpose solve, we might need to compute the transpose matrices explicitly if the solver does not support transpose solve (KK does not, while cusparse does).
1663: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1664: // Similarly, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1665: static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1666: {
1667:   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1668:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1669:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky

1671:   PetscFunctionBegin;
1672:   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1673:     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1674:     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1675:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1676:   }
1677:   PetscFunctionReturn(PETSC_SUCCESS);
1678: }

1680: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1681: {
1682:   const PetscInt              n         = A->rmap->n;
1683:   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1684:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1685:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky

1687:   PetscFunctionBegin;
1688:   if (!factors->transpose_updated) {
1689:     if (has_upper) {
1690:       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1691:         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1692:         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1693:         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1694:       }

1696:       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1697:         if (!factors->U) {
1698:           Mat_SeqAIJ *seq;

1700:           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1701:           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));

1703:           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1704:           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1705:           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1706:           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1707:         } else {
1708:           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Ut's data is aliased with {i, j, a}Ut_h
1709:         }
1710:         // Copy Ut from host to device
1711:         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1712:         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1713:         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1714:       } else { // If U was computed on device, we also compute the transpose there
1715:         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1716:         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1717:                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1718:                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1719:                                                                                                                                                                                                                                factors->aUt_d));
1720:         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1721:       }
1722:       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1723:     }

1725:     // do the same for L with LU
1726:     if (has_lower) {
1727:       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1728:         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1729:         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1730:         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1731:       }

1733:       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1734:         if (!factors->L) {
1735:           Mat_SeqAIJ *seq;

1737:           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1738:           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));

1740:           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1741:           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1742:           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1743:           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1744:         } else {
1745:           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Lt's data is aliased with {i, j, a}Lt_h
1746:         }
1747:         // Copy Lt from host to device
1748:         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1749:         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1750:         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1751:       } else { // If L was computed on device, we also compute the transpose there
1752:         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1753:         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1754:                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1755:                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1756:                                                                                                                                                                                                                                factors->aLt_d));
1757:         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1758:       }
1759:       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1760:     }

1762:     factors->transpose_updated = PETSC_TRUE;
1763:   }
1764:   PetscFunctionReturn(PETSC_SUCCESS);
1765: }

1767: // Solve Ax = b, with R A R^T = U^T D U, where R is the row (and column) permutation matrix on A.
1768: // R is represented by rowperm in factors. If R is identity (i.e., no reordering), then rowperm is empty.
1769: static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1770: {
1771:   auto                        exec    = PetscGetKokkosExecutionSpace();
1772:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1773:   PetscInt                    m       = A->rmap->n;
1774:   PetscScalarKokkosView       D       = factors->D_d;
1775:   PetscScalarKokkosView       X, Y, B; // alias
1776:   ConstPetscScalarKokkosView  b;
1777:   PetscScalarKokkosView       x;
1778:   PetscIntKokkosView         &rowperm  = factors->rowperm;
1779:   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1781:   PetscFunctionBegin;
1782:   PetscCall(PetscLogGpuTimeBegin());
1783:   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1784:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1785:   PetscCall(VecGetKokkosView(bb, &b));
1786:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1788:   // Solve U^T Y = B
1789:   if (identity) { // b can be used as is
1790:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1791:     Y = factors->workVector;
1792:   } else { // Reorder b with the row permutation
1793:     B = factors->workVector;
1794:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1795:     Y = x;
1796:   }
1797:   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

1799:   // Solve diag(D) Y' = Y.
1800:   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1801:   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));

1803:   // Solve UX = Y
1804:   if (identity) {
1805:     X = x;
1806:   } else {
1807:     X = factors->workVector; // B is not needed anymore
1808:   }
1809:   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

1811:   // Reorder X with the inverse column (row) permutation
1812:   if (!identity) {
1813:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1814:   }

1816:   PetscCall(VecRestoreKokkosView(bb, &b));
1817:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1818:   PetscCall(PetscLogGpuTimeEnd());
1819:   PetscFunctionReturn(PETSC_SUCCESS);
1820: }

1822: // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1823: // R and C are represented by rowperm and colperm in factors.
1824: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1825: static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1826: {
1827:   auto                        exec    = PetscGetKokkosExecutionSpace();
1828:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1829:   PetscInt                    m       = A->rmap->n;
1830:   PetscScalarKokkosView       X, Y, B; // alias
1831:   ConstPetscScalarKokkosView  b;
1832:   PetscScalarKokkosView       x;
1833:   PetscIntKokkosView         &rowperm      = factors->rowperm;
1834:   PetscIntKokkosView         &colperm      = factors->colperm;
1835:   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1836:   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1838:   PetscFunctionBegin;
1839:   PetscCall(PetscLogGpuTimeBegin());
1840:   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1841:   PetscCall(VecGetKokkosView(bb, &b));
1842:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1844:   // Solve L Y = B (i.e., L (U C^- x) = R b). R b means applying the row permutation to b.
1845:   if (row_identity) {
1846:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1847:     Y = factors->workVector;
1848:   } else {
1849:     B = factors->workVector;
1850:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1851:     Y = x;
1852:   }
1853:   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));

1855:   // Solve U C^- x = Y
1856:   if (col_identity) {
1857:     X = x;
1858:   } else {
1859:     X = factors->workVector;
1860:   }
1861:   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

1863:   // x = C X; Reorder X with the inverse col permutation
1864:   if (!col_identity) {
1865:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1866:   }

1868:   PetscCall(VecRestoreKokkosView(bb, &b));
1869:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1870:   PetscCall(PetscLogGpuTimeEnd());
1871:   PetscFunctionReturn(PETSC_SUCCESS);
1872: }

1874: // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1875: // R and C are represented by rowperm and colperm in factors.
1876: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1877: // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1878: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1879: {
1880:   auto                        exec    = PetscGetKokkosExecutionSpace();
1881:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1882:   PetscInt                    m       = A->rmap->n;
1883:   PetscScalarKokkosView       X, Y, B; // alias
1884:   ConstPetscScalarKokkosView  b;
1885:   PetscScalarKokkosView       x;
1886:   PetscIntKokkosView         &rowperm      = factors->rowperm;
1887:   PetscIntKokkosView         &colperm      = factors->colperm;
1888:   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1889:   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1891:   PetscFunctionBegin;
1892:   PetscCall(PetscLogGpuTimeBegin());
1893:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1894:   PetscCall(VecGetKokkosView(bb, &b));
1895:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1897:   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, i.e., applying the column permutation to b.
1898:   if (col_identity) { // b can be used as is
1899:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1900:     Y = factors->workVector;
1901:   } else { // Reorder b with the col permutation
1902:     B = factors->workVector;
1903:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1904:     Y = x;
1905:   }
1906:   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

1908:   // Solve L^T X = Y
1909:   if (row_identity) {
1910:     X = x;
1911:   } else {
1912:     X = factors->workVector;
1913:   }
1914:   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));

1916:   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1917:   if (!row_identity) {
1918:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1919:   }

1921:   PetscCall(VecRestoreKokkosView(bb, &b));
1922:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1923:   PetscCall(PetscLogGpuTimeEnd());
1924:   PetscFunctionReturn(PETSC_SUCCESS);
1925: }

1927: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1928: {
1929:   PetscFunctionBegin;
1930:   PetscCall(MatSeqAIJKokkosSyncHost(A));
1931:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));

1933:   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1934:     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1935:     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1936:     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1937:     const MatScalar            *Ba = b->a;
1938:     PetscInt                    m = B->rmap->n, n = B->cmap->n;

1940:     if (factors->iL_h.extent(0) == 0) { // First numeric factorization
1941:       // Allocate memory and copy the L, U structure
1942:       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1943:       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1944:       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1945:       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1946:       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1947:       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));

1949:       PetscInt *Li = factors->iL_h.data();
1950:       PetscInt *Lj = factors->jL_h.data();
1951:       PetscInt *Ui = factors->iU_h.data();
1952:       PetscInt *Uj = factors->jU_h.data();

1954:       Li[0] = Ui[0] = 0;
1955:       for (PetscInt i = 0; i < m; i++) {
1956:         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1957:         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry

1959:         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1960:         Lj[Li[i] + llen] = i;                        // diagonal entry of L

1962:         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1963:         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal

1965:         Li[i + 1] = Li[i] + llen + 1;
1966:         Ui[i + 1] = Ui[i] + ulen;
1967:       }

1969:       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1970:       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1971:       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1972:       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1973:       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1974:       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);

1976:       // Copy row/col permutation to device
1977:       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1978:       PetscBool row_identity;
1979:       PetscCall(ISIdentity(rowperm, &row_identity));
1980:       if (!row_identity) {
1981:         const PetscInt *ip;

1983:         PetscCall(ISGetIndices(rowperm, &ip));
1984:         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1985:         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1986:         PetscCall(ISRestoreIndices(rowperm, &ip));
1987:         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1988:       }

1990:       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1991:       PetscBool col_identity;
1992:       PetscCall(ISIdentity(colperm, &col_identity));
1993:       if (!col_identity) {
1994:         const PetscInt *ip;

1996:         PetscCall(ISGetIndices(colperm, &ip));
1997:         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
1998:         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
1999:         PetscCall(ISRestoreIndices(colperm, &ip));
2000:         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2001:       }

2003:       /* Create sptrsv handles for L, U and their transpose */
2004: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2005:       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2006: #else
2007:       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2008: #endif
2009:       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2010:       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2011:       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2012:       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2013:     }

2015:     // Copy the value
2016:     for (PetscInt i = 0; i < m; i++) {
2017:       PetscInt        llen = Bi[i + 1] - Bi[i];
2018:       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2019:       const PetscInt *Li   = factors->iL_h.data();
2020:       const PetscInt *Ui   = factors->iU_h.data();

2022:       PetscScalar *La = factors->aL_h.data();
2023:       PetscScalar *Ua = factors->aU_h.data();

2025:       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2026:       La[Li[i] + llen] = 1.0;                      // diagonal entry

2028:       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry; MatLUFactorNumeric_SeqAIJ() stores U's diagonal inverted, so invert it back
2029:       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2030:     }

2032:     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2033:     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2034:     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2035:     factors->transpose_updated         = PETSC_FALSE;
2036:     factors->sptrsv_symbolic_completed = PETSC_FALSE;

2038:     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2039:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2040:   }

2042:   B->ops->matsolve          = NULL;
2043:   B->ops->matsolvetranspose = NULL;
2044:   PetscFunctionReturn(PETSC_SUCCESS);
2045: }
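
/* Storage recap (derived from the loops above): MatLUFactorNumeric_SeqAIJ() keeps both factors in
   one SeqAIJ matrix. Row i of L (unit diagonal omitted) lives at positions Bi[i]..Bi[i+1]-1, while
   row i of U lives at Bdiag[i+1]+1..Bdiag[i], with its (inverted) diagonal at Bdiag[i] and Bdiag
   decreasing. The code above unpacks this into separate L and U CSR arrays, inserting L's unit
   diagonal and restoring U's actual diagonal, before mirroring them to device. */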

2047: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2048: {
2049:   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2050:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2051:   PetscInt                    fill_lev = info->levels;

2053:   PetscFunctionBegin;
2054:   PetscCall(PetscLogGpuTimeBegin());
2055:   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2056:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

2058:   auto a_d = aijkok->a_dual.view_device();
2059:   auto i_d = aijkok->i_dual.view_device();
2060:   auto j_d = aijkok->j_dual.view_device();

2062:   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));

2064:   B->assembled              = PETSC_TRUE;
2065:   B->preallocated           = PETSC_TRUE;
2066:   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2067:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2068:   B->ops->matsolve          = NULL;
2069:   B->ops->matsolvetranspose = NULL;

2071:   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2072:   factors->transpose_updated         = PETSC_FALSE;
2073:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2074:   /* TODO: log flops, but how to know that? */
2075:   PetscCall(PetscLogGpuTimeEnd());
2076:   PetscFunctionReturn(PETSC_SUCCESS);
2077: }

2079: // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2080: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2081: {
2082:   Mat_SeqAIJKokkos           *aijkok;
2083:   Mat_SeqAIJ                 *b;
2084:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2085:   PetscInt                    fill_lev = info->levels;
2086:   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2087:   PetscInt                    n        = A->rmap->n;

2089:   PetscFunctionBegin;
2090:   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2091:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

2093:   /* Create a spiluk handle and then do symbolic factorization */
2094:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2095:   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

2097:   auto spiluk_handle = factors->kh.get_spiluk_handle();

2099:   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2100:   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2101:   Kokkos::realloc(factors->iU_d, n + 1);
2102:   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

2104:   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2105:   auto i_d = aijkok->i_dual.view_device();
2106:   auto j_d = aijkok->j_dual.view_device();
2107:   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2108:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

2110:   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2111:   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2112:   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2113:   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

2115:   /* TODO: add options to select sptrsv algorithms */
2116:   /* Create sptrsv handles for L, U and their transpose */
2117: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2118:   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2119: #else
2120:   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2121: #endif

2123:   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2124:   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2125:   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2126:   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

2128:   /* Fill fields of the factor matrix B */
2129:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2130:   b     = (Mat_SeqAIJ *)B->data;
2131:   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2132:   B->info.fill_ratio_given  = info->fill;
2133:   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;

2135:   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2136:   PetscFunctionReturn(PETSC_SUCCESS);
2137: }

2139: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2140: {
2141:   PetscFunctionBegin;
2142:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2143:   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2144:   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2145:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2146:   PetscFunctionReturn(PETSC_SUCCESS);
2147: }

2149: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2150: {
2151:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;

2153:   PetscFunctionBegin;
2154:   if (!info->factoronhost) {
2155:     PetscCall(ISIdentity(isrow, &row_identity));
2156:     PetscCall(ISIdentity(iscol, &col_identity));
2157:   }

2159:   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2160:   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));

2162:   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2163:     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2164:   } else {
2165:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2166:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2167:   }
2168:   PetscFunctionReturn(PETSC_SUCCESS);
2169: }
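
/* Selection note (illustrative): the device ILU(0) branch above is what one typically gets with,
   e.g., -pc_type ilu -pc_factor_mat_solver_type kokkos on a MATSEQAIJKOKKOS matrix, since PETSc's
   ILU defaults to zero fill levels and natural (identity) ordering. */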

2171: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2172: {
2173:   PetscFunctionBegin;
2174:   PetscCall(MatSeqAIJKokkosSyncHost(A));
2175:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));

2177:   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2178:     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2179:     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2180:     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2181:     const MatScalar            *Ba = b->a;
2182:     PetscInt                    m  = B->rmap->n;

2184:     if (factors->iU_h.extent(0) == 0) { // First numeric factorization
2185:       // Allocate memory and copy the structure
2186:       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2187:       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2188:       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2189:       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2190:       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2191:       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);

2193:       // Build jU_h from the shifted Bj (each row of Bj holds only the entries right of the diagonal; the diagonal is implicit)
2194:       PetscInt *Uj = factors->jU_h.data();
2195:       for (PetscInt i = 0; i < m; i++) {
2196:         PetscInt ulen = Bi[i + 1] - Bi[i];
2197:         Uj[Bi[i]]     = i;                                              // diagonal entry
2198:         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2199:       }

2201:       // Copy iU, jU to device
2202:       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2203:       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));

2205:       // Copy row/col permutation to device
2206:       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2207:       PetscBool row_identity;
2208:       PetscCall(ISIdentity(rowperm, &row_identity));
2209:       if (!row_identity) {
2210:         const PetscInt *ip;

2212:         PetscCall(ISGetIndices(rowperm, &ip));
2213:         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2214:         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2215:         PetscCall(ISRestoreIndices(rowperm, &ip));
2216:         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2217:       }

2219:       // Create sptrsv handles for U and U^T
2220: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2221:       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2222: #else
2223:       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2224: #endif
2225:       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2226:       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2227:     }
2228:     // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2229:     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2230:     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;

2232:     // Copy the value
2233:     PetscScalar *Ua = factors->aU_h.data();
2234:     PetscScalar *D  = factors->D_h.data();
2235:     for (PetscInt i = 0; i < m; i++) {
2236:       D[i]      = Ba[Bdiag[i]];     // Ba[Bdiag[i]] holds the inverse of the diagonal, computed in MatCholeskyFactorNumeric_SeqAIJ()
2237:       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2238:       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2239:     }
2240:     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2241:     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));

2243:     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2244:     factors->transpose_updated         = PETSC_FALSE;
2245:   }

2247:   B->ops->matsolve          = NULL;
2248:   B->ops->matsolvetranspose = NULL;
2249:   PetscFunctionReturn(PETSC_SUCCESS);
2250: }

2252: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2253: {
2254:   PetscFunctionBegin;
2255:   if (info->solveonhost) {
2256:     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2257:     PetscCall(MatSetType(B, MATSEQSBAIJ));
2258:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2259:   }

2261:   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));

2263:   if (!info->solveonhost) {
2264:     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2265:     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2266:     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2267:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2268:   }
2269:   PetscFunctionReturn(PETSC_SUCCESS);
2270: }

2272: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2273: {
2274:   PetscFunctionBegin;
2275:   if (info->solveonhost) {
2276:     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2277:     PetscCall(MatSetType(B, MATSEQSBAIJ));
2278:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2279:   }

2281:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm

2283:   if (!info->solveonhost) {
2284:     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2285:     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2286:     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2287:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2288:   }
2289:   PetscFunctionReturn(PETSC_SUCCESS);
2290: }

2292: // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2293: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2294: {
2295:   PetscFunctionBegin;
2296:   *type = MATSOLVERKOKKOS;
2297:   PetscFunctionReturn(PETSC_SUCCESS);
2298: }

2300: /*MC
2301:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers on a single GPU for sequential
2302:   matrices of type `MATSEQAIJKOKKOS` and `MATAIJKOKKOS`.
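
  Options Database Key:
. -pc_factor_mat_solver_type kokkos - selects this solver type during `PCSetFromOptions()`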

2304:   Level: beginner

2306: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2307: M*/
2308: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2309: {
2310:   PetscInt n = A->rmap->n;
2311:   MPI_Comm comm;

2313:   PetscFunctionBegin;
2314:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2315:   PetscCall(MatCreate(comm, B));
2316:   PetscCall(MatSetSizes(*B, n, n, n, n));
2317:   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2318:   (*B)->factortype = ftype;
2319:   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2320:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2321:   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");

2323:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2324:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2325:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2326:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2327:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2328:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2329:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2330:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2331:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2332:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2333:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2334:   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

2336:   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc., though the factorization itself is done on host
2337:   (*B)->canuseordering = PETSC_TRUE;
2338:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2339:   PetscFunctionReturn(PETSC_SUCCESS);
2340: }
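
// A minimal usage sketch of the factorization sequence this getter enables (the matrix A and
// vectors b, x are assumed to preexist; the ND ordering choice is illustrative; all calls are
// standard PETSc API):
//
//   Mat           F;
//   IS            rowperm, colperm;
//   MatFactorInfo info;
//   PetscCall(MatFactorInfoInitialize(&info));
//   PetscCall(MatGetFactor(A, MATSOLVERKOKKOS, MAT_FACTOR_LU, &F));
//   PetscCall(MatGetOrdering(A, MATORDERINGND, &rowperm, &colperm));
//   PetscCall(MatLUFactorSymbolic(F, A, rowperm, colperm, &info));
//   PetscCall(MatLUFactorNumeric(F, A, &info));
//   PetscCall(MatSolve(F, b, x)); // the triangular solves run on device through Kokkos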

2342: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2343: {
2344:   PetscFunctionBegin;
2345:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2346:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2347:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2348:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2349:   PetscFunctionReturn(PETSC_SUCCESS);
2350: }
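
// Once registered, MatGetFactor(A, MATSOLVERKOKKOS, MAT_FACTOR_LU, &F) and its Cholesky/ILU/ICC
// variants dispatch to MatGetFactor_SeqAIJKokkos_Kokkos() above for MATSEQAIJKOKKOS matrices.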

2352: /* Utility to print out a KokkosCsrMatrix for debugging */
2353: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2354: {
2355:   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2356:   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2357:   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2358:   const PetscInt    *i  = iv.data();
2359:   const PetscInt    *j  = jv.data();
2360:   const PetscScalar *a  = av.data();
2361:   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();

2363:   PetscFunctionBegin;
2364:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2365:   for (PetscInt k = 0; k < m; k++) {
2366:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2367:     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2368:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2369:   }
2370:   PetscFunctionReturn(PETSC_SUCCESS);
2371: }
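
// A debugging usage sketch (a hypothetical call site; csrmat is assumed to be the KokkosCsrMatrix
// member of Mat_SeqAIJKokkos declared in aijkok.hpp):
//   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
//   PetscCall(PrintCsrMatrix(aijkok->csrmat));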