Actual source code: aijkok.kokkos.cxx

  1: #include <petsc_kokkos.hpp>
  2: #include <petscvec_kokkos.hpp>
  3: #include <petscmat_kokkos.hpp>
  4: #include <petscpkg_version.h>
  5: #include <petsc/private/petscimpl.h>
  6: #include <petsc/private/sfimpl.h>
  7: #include <petsc/private/kokkosimpl.hpp>
  8: #include <petscsys.h>

 10: #include <Kokkos_Core.hpp>
 11: #include <KokkosBlas.hpp>
 12: #include <KokkosSparse_CrsMatrix.hpp>

 14: // To suppress compiler warnings:
 15: // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
 16: // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
 17: // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
 18: // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
 19: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
 20: #include <KokkosSparse_spmv.hpp>
 21: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()

 23: #include <KokkosSparse_spiluk.hpp>
 24: #include <KokkosSparse_sptrsv.hpp>
 25: #include <KokkosSparse_spgemm.hpp>
 26: #include <KokkosSparse_spadd.hpp>
 27: #include <KokkosBatched_LU_Decl.hpp>
 28: #include <KokkosBatched_InverseLU_Decl.hpp>

 30: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

 32: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
 33:   #include <KokkosSparse_Utils.hpp>
 34: using KokkosSparse::sort_crs_matrix;
 35: using KokkosSparse::Impl::transpose_matrix;
 36: #else
 37:   #include <KokkosKernels_Sorting.hpp>
 38: using KokkosKernels::sort_crs_matrix;
 39: using KokkosKernels::Impl::transpose_matrix;
 40: #endif

 42: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
 43: using KokkosSparse::spiluk_symbolic;
 44: using KokkosSparse::spiluk_numeric;
 45: using KokkosSparse::sptrsv_symbolic;
 46: using KokkosSparse::sptrsv_solve;
 47: using KokkosSparse::Experimental::SPTRSVAlgorithm;
 48: using KokkosSparse::Experimental::SPILUKAlgorithm;
 49: #else
 50: using KokkosSparse::Experimental::spiluk_symbolic;
 51: using KokkosSparse::Experimental::spiluk_numeric;
 52: using KokkosSparse::Experimental::sptrsv_symbolic;
 53: using KokkosSparse::Experimental::sptrsv_solve;
 54: using KokkosSparse::Experimental::SPTRSVAlgorithm;
 55: using KokkosSparse::Experimental::SPILUKAlgorithm;
 56: #endif

 58: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

 60: /* MatAssemblyEnd_SeqAIJKokkos() is called once the nonzeros of the matrix have been finalized, either after
 61:    we assembled the matrix on host, or after we produced the matrix data directly on device (e.g., through MatMatMult).
 62:    In the latter case, it is important to set a_dual's sync state correctly.
 63:  */
 64: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
 65: {
 66:   Mat_SeqAIJ       *aijseq;
 67:   Mat_SeqAIJKokkos *aijkok;

 69:   PetscFunctionBegin;
 70:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
 71:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

 73:   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
 74:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 76:   /* If aijkok does not exist, we just copy i, j to device.
 77:      If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
 78:      In both cases, we build a new aijkok structure.
 79:   */
 80:   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
 81:     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing the much-needed a,i,j on host when deleting aijkok */
 82:       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
 83:       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
 84:       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscScalar), (void **)&aijseq->a));
 85:       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
 86:       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
 87:       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
 88:       aijseq->free_a  = PETSC_TRUE;
 89:       aijseq->free_ij = PETSC_TRUE;
 90:       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
 91:          we have the CsrMatrix on device first and then copy to host, followed by
 92:          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
 93:          One could improve it by not using NULL garray.
 94:       */
 95:     }
 96:     delete aijkok;
 97:     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
 98:     A->spptr = aijkok;
 99:   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100:     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101:     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102:     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103:   }
104:   PetscFunctionReturn(PETSC_SUCCESS);
105: }

107: /* Sync CSR data to device if not yet */
108: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109: {
110:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

112:   PetscFunctionBegin;
113:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114:   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115:   if (aijkok->a_dual.need_sync_device()) {
116:     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
118:     aijkok->hermitian_updated = PETSC_FALSE;
119:   }
120:   PetscFunctionReturn(PETSC_SUCCESS);
121: }

123: /* Mark the CSR data on device as modified */
124: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125: {
126:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

128:   PetscFunctionBegin;
129:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
130:   aijkok->a_dual.clear_sync_state();
131:   aijkok->a_dual.modify_device();
132:   aijkok->transpose_updated = PETSC_FALSE;
133:   aijkok->hermitian_updated = PETSC_FALSE;
134:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136:   PetscFunctionReturn(PETSC_SUCCESS);
137: }
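
/* A minimal sketch (not part of this file) of how the two helpers above cooperate: code that
   writes matrix values on device should sync first and mark the device copy modified afterwards,
   so the DualView's lazy host/device mirroring stays consistent. MyDeviceScale() is a
   hypothetical user function, not a PETSc API.

     static PetscErrorCode MyDeviceScale(Mat A, PetscScalar alpha)
     {
       PetscFunctionBegin;
       PetscCall(MatSeqAIJKokkosSyncDevice(A)); // make the device values current
       auto akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
       auto a    = akok->a_dual.view_device();
       Kokkos::parallel_for( // scale every stored value on device
         Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, a.extent(0)),
         KOKKOS_LAMBDA(const PetscInt i) { a(i) *= alpha; });
       PetscCall(MatSeqAIJKokkosModifyDevice(A)); // mark the device copy as the newest
       PetscFunctionReturn(PETSC_SUCCESS);
     }
*/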

139: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140: {
141:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142:   auto              exec   = PetscGetKokkosExecutionSpace();

144:   PetscFunctionBegin;
145:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146:   /* We do not expect anyone to need factors on host */
147:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148:   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149:   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150:   PetscFunctionReturn(PETSC_SUCCESS);
151: }

153: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154: {
155:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

157:   PetscFunctionBegin;
158:   /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
159:     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160:     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
161:     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162:   */
163:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164:     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165:     *array = aijkok->a_dual.view_host().data();
166:   } else { /* Happens when calling MatSetValues on a newly created matrix */
167:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168:   }
169:   PetscFunctionReturn(PETSC_SUCCESS);
170: }

172: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173: {
174:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

176:   PetscFunctionBegin;
177:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178:   PetscFunctionReturn(PETSC_SUCCESS);
179: }

181: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182: {
183:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

185:   PetscFunctionBegin;
186:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187:     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188:     *array = aijkok->a_dual.view_host().data();
189:   } else {
190:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191:   }
192:   PetscFunctionReturn(PETSC_SUCCESS);
193: }

195: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196: {
197:   PetscFunctionBegin;
198:   *array = NULL;
199:   PetscFunctionReturn(PETSC_SUCCESS);
200: }

202: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203: {
204:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

206:   PetscFunctionBegin;
207:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208:     *array = aijkok->a_dual.view_host().data();
209:   } else { /* E.g., this happens with MatZeroEntries on a preallocated but not assembled matrix */
210:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211:   }
212:   PetscFunctionReturn(PETSC_SUCCESS);
213: }

215: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216: {
217:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

219:   PetscFunctionBegin;
220:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221:     aijkok->a_dual.clear_sync_state();
222:     aijkok->a_dual.modify_host();
223:   }
224:   PetscFunctionReturn(PETSC_SUCCESS);
225: }
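
/* A minimal usage sketch of the Get/Restore pairs above (assuming A is an assembled
   MATSEQAIJKOKKOS matrix); the DualView bookkeeping keeps the host and device copies consistent.

     PetscScalar *a;
     PetscCall(MatSeqAIJGetArray(A, &a));     // syncs device -> host if needed
     a[0] += 1.0;                             // modify values on host
     PetscCall(MatSeqAIJRestoreArray(A, &a)); // marks the host copy as modified
*/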

227: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228: {
229:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

231:   PetscFunctionBegin;
232:   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

234:   if (i) *i = aijkok->i_device_data();
235:   if (j) *j = aijkok->j_device_data();
236:   if (a) {
237:     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238:     *a = aijkok->a_device_data();
239:   }
240:   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241:   PetscFunctionReturn(PETSC_SUCCESS);
242: }

244: /*
245:   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

247:   Input Parameter:
248: .  A       - the MATSEQAIJKOKKOS matrix

250:   Output Parameters:
251: +  perm_d - the permutation array on device, such that Ta(i) = Aa(perm(i))
252: -  T_d    - the transpose on device, whose value array is allocated but not initialized
253: */
254: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
255: {
256:   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
257:   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
258:   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
259:   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
260:   MatRowMapType          *Ti = Ti_h.data();
261:   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
262:   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
263:   PetscInt               *Tj   = Tj_h.data();
264:   PetscInt               *perm = perm_h.data();
265:   PetscInt               *offset;

267:   PetscFunctionBegin;
268:   // Populate Ti
269:   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
270:   Ti++;
271:   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
272:   Ti--;
273:   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

275:   // Populate Tj and the permutation array
276:   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
277:   for (PetscInt i = 0; i < m; i++) {
278:     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
279:       PetscInt r    = Aj[j];                       // row r of T
280:       PetscInt disp = Ti[r] + offset[r];

282:       Tj[disp]   = i; // col i of T
283:       perm[disp] = j;
284:       offset[r]++;
285:     }
286:   }
287:   PetscCall(PetscFree(offset));

289:   // Sort each row of T, along with the permutation array
290:   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

292:   // Output perm and T on device
293:   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
294:   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
295:   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
296:   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
297:   PetscFunctionReturn(PETSC_SUCCESS);
298: }
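
/* A small worked example of the structure built above. For the 2x3 matrix

     A = [1 0 2]   with Ai = {0,2,3}, Aj = {0,2,1}, Aa = {1,2,3},
         [0 3 0]

   the transpose T is 3x2 with Ti = {0,1,2,3}, Tj = {0,1,0} and perm = {0,2,1}, so a numeric
   transpose is just Ta(i) = Aa(perm(i)) for i = 0..nz-1, as done in the callers below. */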

300: // Generate the transpose on device and cache it internally
301: // Note: KK's transpose_matrix() does not support separate symbolic/numeric phases, so we do it on our own
302: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
303: {
304:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
305:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
306:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
307:   KokkosCsrMatrix  &T = akok->csrmatT;

309:   PetscFunctionBegin;
310:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
311:   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

313:   const auto &Aa = akok->a_dual.view_device();

315:   if (A->symmetric == PETSC_BOOL3_TRUE) {
316:     *csrmatT = akok->csrmat;
317:   } else {
318:     // See if we already have a cached transpose and its value is up to date
319:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
320:       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
321:         const auto &perm = akok->transpose_perm; // get the permutation array
322:         auto       &Ta   = T.values;

324:         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
325:       }
326:     } else { // Generate T of size n x m for the first time
327:       MatRowMapKokkosView perm;

329:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
330:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
331:       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
332:     }
333:     akok->transpose_updated = PETSC_TRUE;
334:     *csrmatT                = akok->csrmatT;
335:   }
336:   PetscFunctionReturn(PETSC_SUCCESS);
337: }

339: // Generate the Hermitian on device and cache it internally
340: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
341: {
342:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
343:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
344:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
345:   KokkosCsrMatrix  &T = akok->csrmatH;

347:   PetscFunctionBegin;
348:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
349:   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

351:   const auto &Aa = akok->a_dual.view_device();

353:   if (A->hermitian == PETSC_BOOL3_TRUE) {
354:     *csrmatH = akok->csrmat;
355:   } else {
356:     // See if we already have a cached Hermitian and its value is up to date
357:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
358:       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
359:         const auto &perm = akok->transpose_perm; // get the permutation array
360:         auto       &Ta   = T.values;

362:         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
363:       }
364:     } else { // Generate T of size n x m for the first time
365:       MatRowMapKokkosView perm;

367:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
368:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
369:       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
370:     }
371:     akok->hermitian_updated = PETSC_TRUE;
372:     *csrmatH                = akok->csrmatH;
373:   }
374:   PetscFunctionReturn(PETSC_SUCCESS);
375: }

377: /* y = A x */
378: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
379: {
380:   Mat_SeqAIJKokkos          *aijkok;
381:   ConstPetscScalarKokkosView xv;
382:   PetscScalarKokkosView      yv;

384:   PetscFunctionBegin;
385:   PetscCall(PetscLogGpuTimeBegin());
386:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
387:   PetscCall(VecGetKokkosView(xx, &xv));
388:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
389:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
390:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
391:   PetscCall(VecRestoreKokkosView(xx, &xv));
392:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
393:   /* 2.0*nnz - numRows would be more accurate, but it assumes there are no zero rows. So we are a little sloppy here. */
394:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
395:   PetscCall(PetscLogGpuTimeEnd());
396:   PetscFunctionReturn(PETSC_SUCCESS);
397: }
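
/* A minimal usage sketch (assuming A is MATSEQAIJKOKKOS and x, y are VECKOKKOS vectors of
   matching sizes); MatMult() dispatches to the routine above and the spmv runs on device:

     PetscCall(MatMult(A, x, y)); // y = A x

   With -log_view, the GPU time and the 2*nnz flops logged above should appear in the GPU columns. */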

399: /* y = A^T x */
400: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
401: {
402:   Mat_SeqAIJKokkos          *aijkok;
403:   const char                *mode;
404:   ConstPetscScalarKokkosView xv;
405:   PetscScalarKokkosView      yv;
406:   KokkosCsrMatrix            csrmat;

408:   PetscFunctionBegin;
409:   PetscCall(PetscLogGpuTimeBegin());
410:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
411:   PetscCall(VecGetKokkosView(xx, &xv));
412:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
413:   if (A->form_explicit_transpose) {
414:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
415:     mode = "N";
416:   } else {
417:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
418:     csrmat = aijkok->csrmat;
419:     mode   = "T";
420:   }
421:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
422:   PetscCall(VecRestoreKokkosView(xx, &xv));
423:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
424:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
425:   PetscCall(PetscLogGpuTimeEnd());
426:   PetscFunctionReturn(PETSC_SUCCESS);
427: }

429: /* y = A^H x */
430: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
431: {
432:   Mat_SeqAIJKokkos          *aijkok;
433:   const char                *mode;
434:   ConstPetscScalarKokkosView xv;
435:   PetscScalarKokkosView      yv;
436:   KokkosCsrMatrix            csrmat;

438:   PetscFunctionBegin;
439:   PetscCall(PetscLogGpuTimeBegin());
440:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
441:   PetscCall(VecGetKokkosView(xx, &xv));
442:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
443:   if (A->form_explicit_transpose) {
444:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
445:     mode = "N";
446:   } else {
447:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
448:     csrmat = aijkok->csrmat;
449:     mode   = "C";
450:   }
451:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
452:   PetscCall(VecRestoreKokkosView(xx, &xv));
453:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
454:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
455:   PetscCall(PetscLogGpuTimeEnd());
456:   PetscFunctionReturn(PETSC_SUCCESS);
457: }

459: /* z = A x + y */
460: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
461: {
462:   Mat_SeqAIJKokkos          *aijkok;
463:   ConstPetscScalarKokkosView xv;
464:   PetscScalarKokkosView      zv;

466:   PetscFunctionBegin;
467:   PetscCall(PetscLogGpuTimeBegin());
468:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
469:   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
470:   PetscCall(VecGetKokkosView(xx, &xv));
471:   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
472:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
473:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
474:   PetscCall(VecRestoreKokkosView(xx, &xv));
475:   PetscCall(VecRestoreKokkosView(zz, &zv));
476:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
477:   PetscCall(PetscLogGpuTimeEnd());
478:   PetscFunctionReturn(PETSC_SUCCESS);
479: }

481: /* z = A^T x + y */
482: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
483: {
484:   Mat_SeqAIJKokkos          *aijkok;
485:   const char                *mode;
486:   ConstPetscScalarKokkosView xv;
487:   PetscScalarKokkosView      zv;
488:   KokkosCsrMatrix            csrmat;

490:   PetscFunctionBegin;
491:   PetscCall(PetscLogGpuTimeBegin());
492:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
493:   if (zz != yy) PetscCall(VecCopy(yy, zz));
494:   PetscCall(VecGetKokkosView(xx, &xv));
495:   PetscCall(VecGetKokkosView(zz, &zv));
496:   if (A->form_explicit_transpose) {
497:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
498:     mode = "N";
499:   } else {
500:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
501:     csrmat = aijkok->csrmat;
502:     mode   = "T";
503:   }
504:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
505:   PetscCall(VecRestoreKokkosView(xx, &xv));
506:   PetscCall(VecRestoreKokkosView(zz, &zv));
507:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
508:   PetscCall(PetscLogGpuTimeEnd());
509:   PetscFunctionReturn(PETSC_SUCCESS);
510: }

512: /* z = A^H x + y */
513: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
514: {
515:   Mat_SeqAIJKokkos          *aijkok;
516:   const char                *mode;
517:   ConstPetscScalarKokkosView xv;
518:   PetscScalarKokkosView      zv;
519:   KokkosCsrMatrix            csrmat;

521:   PetscFunctionBegin;
522:   PetscCall(PetscLogGpuTimeBegin());
523:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
524:   if (zz != yy) PetscCall(VecCopy(yy, zz));
525:   PetscCall(VecGetKokkosView(xx, &xv));
526:   PetscCall(VecGetKokkosView(zz, &zv));
527:   if (A->form_explicit_transpose) {
528:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
529:     mode = "N";
530:   } else {
531:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
532:     csrmat = aijkok->csrmat;
533:     mode   = "C";
534:   }
535:   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
536:   PetscCall(VecRestoreKokkosView(xx, &xv));
537:   PetscCall(VecRestoreKokkosView(zz, &zv));
538:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
539:   PetscCall(PetscLogGpuTimeEnd());
540:   PetscFunctionReturn(PETSC_SUCCESS);
541: }

543: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
544: {
545:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

547:   PetscFunctionBegin;
548:   switch (op) {
549:   case MAT_FORM_EXPLICIT_TRANSPOSE:
550:     /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
551:     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
552:     A->form_explicit_transpose = flg;
553:     break;
554:   default:
555:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
556:     break;
557:   }
558:   PetscFunctionReturn(PETSC_SUCCESS);
559: }

561: /* Depending on reuse, either build a new mat, or use the existing mat */
562: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
563: {
564:   Mat_SeqAIJ *aseq;

566:   PetscFunctionBegin;
567:   PetscCall(PetscKokkosInitializeCheck());
568:   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
569:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
570:     PetscCall(MatSetType(*newmat, mtype));
571:   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
572:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
573:   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
574:     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
575:     PetscCall(PetscFree(A->defaultvectype));
576:     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
577:     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
578:     PetscCall(MatSetOps_SeqAIJKokkos(A));
579:     aseq = static_cast<Mat_SeqAIJ *>(A->data);
580:     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
581:       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
582:       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
583:     }
584:   }
585:   PetscFunctionReturn(PETSC_SUCCESS);
586: }
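
/* A minimal usage sketch of this conversion (assuming A is an assembled MATSEQAIJ matrix):

     Mat Ak;
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &Ak)); // build a new Kokkos mat
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));  // or convert A in place
*/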

588: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
589:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
590:  */
591: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
592: {
593:   Mat_SeqAIJ       *bseq;
594:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
595:   Mat               mat;

597:   PetscFunctionBegin;
598:   /* Do not copy values on host, as A's latest values might be on device. We don't want to sync blindly */
599:   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
600:   mat = *B;
601:   if (A->assembled) {
602:     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
603:     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
604:     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
605:     /* Now copy values to B if needed */
606:     if (dupOption == MAT_COPY_VALUES) {
607:       if (akok->a_dual.need_sync_device()) {
608:         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
609:         bkok->a_dual.modify_host();
610:       } else { /* If device has the latest data, we only copy data on device */
611:         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
612:         bkok->a_dual.modify_device();
613:       }
614:     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
615:       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
616:       bkok->a_dual.modify_host();
617:     }
618:     mat->spptr = bkok;
619:   }

621:   PetscCall(PetscFree(mat->defaultvectype));
622:   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
623:   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
624:   PetscCall(MatSetOps_SeqAIJKokkos(mat));
625:   PetscFunctionReturn(PETSC_SUCCESS);
626: }

628: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
629: {
630:   Mat               At;
631:   KokkosCsrMatrix   internT;
632:   Mat_SeqAIJKokkos *atkok, *bkok;

634:   PetscFunctionBegin;
635:   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
636:   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
637:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
638:     /* Deep copy internT, as we want to isolate the internal transpose */
639:     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
640:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
641:     if (reuse == MAT_INITIAL_MATRIX) *B = At;
642:     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
643:   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
644:     if ((*B)->assembled) {
645:       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
646:       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
647:       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
648:     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
649:       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
650:       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
651:       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
652:       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
653:       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
654:     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
655:   }
656:   PetscFunctionReturn(PETSC_SUCCESS);
657: }

659: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
660: {
661:   Mat_SeqAIJKokkos *aijkok;

663:   PetscFunctionBegin;
664:   if (A->factortype == MAT_FACTOR_NONE) {
665:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
666:     delete aijkok;
667:   } else {
668:     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
669:   }
670:   A->spptr = NULL;
671:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
672:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
673:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
674: #if defined(PETSC_HAVE_HYPRE)
675:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
676: #endif
677:   PetscCall(MatDestroy_SeqAIJ(A));
678:   PetscFunctionReturn(PETSC_SUCCESS);
679: }

681: /*MC
682:    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

684:    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types

686:    Options Database Key:
687: .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

689:   Level: beginner

691: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
692: M*/
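
/* Example usage (a minimal sketch; n is a placeholder size): the type is normally chosen at
   runtime via the options database, or set explicitly in code.

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetFromOptions(A)); // picks up -mat_type aijkokkos
     // or: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
*/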
693: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
694: {
695:   PetscFunctionBegin;
696:   PetscCall(PetscKokkosInitializeCheck());
697:   PetscCall(MatCreate_SeqAIJ(A));
698:   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
699:   PetscFunctionReturn(PETSC_SUCCESS);
700: }

702: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
703: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
704: {
705:   Mat_SeqAIJ         *a, *b;
706:   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
707:   MatScalarKokkosView aa, ba, ca;
708:   MatRowMapKokkosView ai, bi, ci;
709:   MatColIdxKokkosView aj, bj, cj;
710:   PetscInt            m, n, nnz, aN;

712:   PetscFunctionBegin;
713:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
714:   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
715:   PetscAssertPointer(C, 4);
716:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
717:   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
718:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
719:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

721:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
722:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
723:   a    = static_cast<Mat_SeqAIJ *>(A->data);
724:   b    = static_cast<Mat_SeqAIJ *>(B->data);
725:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
726:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
727:   aa   = akok->a_dual.view_device();
728:   ai   = akok->i_dual.view_device();
729:   ba   = bkok->a_dual.view_device();
730:   bi   = bkok->i_dual.view_device();
731:   m    = A->rmap->n; /* M, N and nnz of C */
732:   n    = A->cmap->n + B->cmap->n;
733:   nnz  = a->nz + b->nz;
734:   aN   = A->cmap->n; /* N of A */
735:   if (reuse == MAT_INITIAL_MATRIX) {
736:     aj           = akok->j_dual.view_device();
737:     bj           = bkok->j_dual.view_device();
738:     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
739:     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
740:     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
741:     ca           = ca_dual.view_device();
742:     ci           = ci_dual.view_device();
743:     cj           = cj_dual.view_device();

745:     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
746:     Kokkos::parallel_for(
747:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
748:         PetscInt i       = t.league_rank(); /* row i */
749:         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

751:         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
752:                                                    ci(i) = coffset;
753:                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
754:         });

756:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
757:           if (k < alen) {
758:             ca(coffset + k) = aa(ai(i) + k);
759:             cj(coffset + k) = aj(ai(i) + k);
760:           } else {
761:             ca(coffset + k) = ba(bi(i) + k - alen);
762:             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
763:           }
764:         });
765:       });
766:     ca_dual.modify_device();
767:     ci_dual.modify_device();
768:     cj_dual.modify_device();
769:     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
770:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
771:   } else if (reuse == MAT_REUSE_MATRIX) {
772:     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
773:     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
774:     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
775:     ca   = ckok->a_dual.view_device();
776:     ci   = ckok->i_dual.view_device();

778:     Kokkos::parallel_for(
779:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
780:         PetscInt i    = t.league_rank(); /* row i */
781:         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
782:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
783:           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
784:           else ca(ci(i) + k) = ba(bi(i) + k - alen);
785:         });
786:       });
787:     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
788:   }
789:   PetscFunctionReturn(PETSC_SUCCESS);
790: }
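
/* A small worked example of the merge above: with m = 1, A = [a0 a1] (1x2) and B = [b0] (1x1),
   C is the 1x3 matrix [a0 a1 b0]; an entry in column j of B lands in column j + A->cmap->n of C. */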

792: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
793: {
794:   PetscFunctionBegin;
795:   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
796:   PetscFunctionReturn(PETSC_SUCCESS);
797: }

799: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
800: {
801:   Mat_Product                 *product = C->product;
802:   Mat                          A, B;
803:   bool                         transA, transB; /* use bool, since KK needs this type */
804:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
805:   Mat_SeqAIJ                  *c;
806:   MatProductData_SeqAIJKokkos *pdata;
807:   KokkosCsrMatrix              csrmatA, csrmatB;

809:   PetscFunctionBegin;
810:   MatCheckProduct(C, 1);
811:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
812:   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

814:   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
815:   // If yes, skip the numeric phase, but reset the flag so that the next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
816:   // we still do the numeric phase.
817:   if (pdata->reusesym) { // numeric reuses results from symbolic
818:     pdata->reusesym = PETSC_FALSE;
819:     PetscFunctionReturn(PETSC_SUCCESS);
820:   }

822:   switch (product->type) {
823:   case MATPRODUCT_AB:
824:     transA = false;
825:     transB = false;
826:     break;
827:   case MATPRODUCT_AtB:
828:     transA = true;
829:     transB = false;
830:     break;
831:   case MATPRODUCT_ABt:
832:     transA = false;
833:     transB = true;
834:     break;
835:   default:
836:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
837:   }

839:   A = product->A;
840:   B = product->B;
841:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
842:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
843:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
844:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
845:   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

847:   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

849:   csrmatA = akok->csrmat;
850:   csrmatB = bkok->csrmat;

852:   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
853:   if (transA) {
854:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
855:     transA = false;
856:   }

858:   if (transB) {
859:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
860:     transB = false;
861:   }
862:   PetscCall(PetscLogGpuTimeBegin());
863:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
864: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
865:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
866:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
867: #endif

869:   PetscCall(PetscLogGpuTimeEnd());
870:   PetscCall(MatSeqAIJKokkosModifyDevice(C));
871:   /* shorter version of MatAssemblyEnd_SeqAIJ */
872:   c = (Mat_SeqAIJ *)C->data;
873:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
874:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
875:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
876:   c->reallocs         = 0;
877:   C->info.mallocs     = 0;
878:   C->info.nz_unneeded = 0;
879:   C->assembled = C->was_assembled = PETSC_TRUE;
880:   C->num_ass++;
881:   PetscFunctionReturn(PETSC_SUCCESS);
882: }
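
/* A minimal usage sketch that exercises the reusesym logic above (assuming A and B are
   MATSEQAIJKOKKOS matrices of compatible sizes):

     Mat C;
     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // symbolic (+ numeric)
     // ... change values (but not the nonzero pattern) of A or B ...
     PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   // numeric phase only
*/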

884: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
885: {
886:   Mat_Product                 *product = C->product;
887:   MatProductType               ptype;
888:   Mat                          A, B;
889:   bool                         transA, transB;
890:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
891:   MatProductData_SeqAIJKokkos *pdata;
892:   MPI_Comm                     comm;
893:   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

895:   PetscFunctionBegin;
896:   MatCheckProduct(C, 1);
897:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
898:   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
899:   A = product->A;
900:   B = product->B;
901:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
902:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
903:   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
904:   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
905:   csrmatA = akok->csrmat;
906:   csrmatB = bkok->csrmat;

908:   ptype = product->type;
909:   // Take advantage of the symmetry if true
910:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
911:     ptype                                          = MATPRODUCT_AB;
912:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
913:   }
914:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
915:     ptype                                          = MATPRODUCT_AB;
916:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
917:   }

919:   switch (ptype) {
920:   case MATPRODUCT_AB:
921:     transA = false;
922:     transB = false;
923:     PetscCall(MatSetBlockSizesFromMats(C, A, B));
924:     break;
925:   case MATPRODUCT_AtB:
926:     transA = true;
927:     transB = false;
928:     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
929:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
930:     break;
931:   case MATPRODUCT_ABt:
932:     transA = false;
933:     transB = true;
934:     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
935:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
936:     break;
937:   default:
938:     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
939:   }
940:   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
941:   pdata->reusesym = product->api_user;

943:   /* TODO: add command line options to select spgemm algorithms */
944:   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

946:   /* CUDA 10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in CUDA 11.4 */
947: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
948:   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
949:   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
950:   #endif
951: #endif
952:   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

954:   PetscCall(PetscLogGpuTimeBegin());
955:   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
956:   if (transA) {
957:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
958:     transA = false;
959:   }

961:   if (transB) {
962:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
963:     transB = false;
964:   }

966:   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
967:   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
968:     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
969:     calling new Mat_SeqAIJKokkos().
970:     TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
971:   */
972:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
973: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
974:   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
975:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
976:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
977: #endif
978:   PetscCall(PetscLogGpuTimeEnd());

980:   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
981:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
982:   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
983:   PetscFunctionReturn(PETSC_SUCCESS);
984: }

986: /* handles sparse matrix-matrix ops */
987: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
988: {
989:   Mat_Product *product = mat->product;
990:   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

992:   PetscFunctionBegin;
993:   MatCheckProduct(mat, 1);
994:   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
995:   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
996:   if (Biskok && Ciskok) {
997:     switch (product->type) {
998:     case MATPRODUCT_AB:
999:     case MATPRODUCT_AtB:
1000:     case MATPRODUCT_ABt:
1001:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1002:       break;
1003:     case MATPRODUCT_PtAP:
1004:     case MATPRODUCT_RARt:
1005:     case MATPRODUCT_ABC:
1006:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1007:       break;
1008:     default:
1009:       break;
1010:     }
1011:   } else { /* fallback for AIJ */
1012:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1013:   }
1014:   PetscFunctionReturn(PETSC_SUCCESS);
1015: }

1017: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1018: {
1019:   Mat_SeqAIJKokkos *aijkok;

1021:   PetscFunctionBegin;
1022:   PetscCall(PetscLogGpuTimeBegin());
1023:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1024:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1025:   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1026:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1027:   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1028:   PetscCall(PetscLogGpuTimeEnd());
1029:   PetscFunctionReturn(PETSC_SUCCESS);
1030: }

1032: // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1033: static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1034: {
1035:   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

1037:   PetscFunctionBegin;
1038:   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1039:     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);

1041:     PetscCall(PetscLogGpuTimeBegin());
1042:     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1043:     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1044:     const auto &Aa     = aijkok->a_dual.view_device();
1045:     const auto &Adiag  = aijkok->diag_dual.view_device();
1046:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1047:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1048:     PetscCall(PetscLogGpuFlops(n));
1049:     PetscCall(PetscLogGpuTimeEnd());
1050:   } else { // need reassembly, very slow!
1051:     PetscCall(MatShift_Basic(A, a));
1052:   }
1053:   PetscFunctionReturn(PETSC_SUCCESS);
1054: }

1056: static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1057: {
1058:   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

1060:   PetscFunctionBegin;
1061:   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1062:     ConstPetscScalarKokkosView dv;
1063:     PetscInt                   n, nv;

1065:     PetscCall(PetscLogGpuTimeBegin());
1066:     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1067:     PetscCall(VecGetKokkosView(D, &dv));
1068:     PetscCall(VecGetLocalSize(D, &nv));
1069:     n = PetscMin(Y->rmap->n, Y->cmap->n);
1070:     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

1072:     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1073:     const auto &Aa     = aijkok->a_dual.view_device();
1074:     const auto &Adiag  = aijkok->diag_dual.view_device();
1075:     PetscCallCXX(Kokkos::parallel_for(
1076:       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1077:         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1078:         else Aa(Adiag(i)) += dv(i);
1079:       }));
1080:     PetscCall(VecRestoreKokkosView(D, &dv));
1081:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1082:     PetscCall(PetscLogGpuFlops(n));
1083:     PetscCall(PetscLogGpuTimeEnd());
1084:   } else { // need reassembly, very slow!
1085:     PetscCall(MatDiagonalSet_Default(Y, D, is));
1086:   }
1087:   PetscFunctionReturn(PETSC_SUCCESS);
1088: }

1090: static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1091: {
1092:   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1093:   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1094:   ConstPetscScalarKokkosView lv, rv;

1096:   PetscFunctionBegin;
1097:   PetscCall(PetscLogGpuTimeBegin());
1098:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099:   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100:   const auto &Aa     = aijkok->a_dual.view_device();
1101:   const auto &Ai     = aijkok->i_dual.view_device();
1102:   const auto &Aj     = aijkok->j_dual.view_device();
1103:   if (ll) {
1104:     PetscCall(VecGetLocalSize(ll, &m));
1105:     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1106:     PetscCall(VecGetKokkosView(ll, &lv));
1107:     PetscCallCXX(Kokkos::parallel_for( // for each row
1108:       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1109:         PetscInt i   = t.league_rank(); // row i
1110:         PetscInt len = Ai(i + 1) - Ai(i);
1111:         // scale entries on the row
1112:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1113:       }));
1114:     PetscCall(VecRestoreKokkosView(ll, &lv));
1115:     PetscCall(PetscLogGpuFlops(nz));
1116:   }
1117:   if (rr) {
1118:     PetscCall(VecGetLocalSize(rr, &n));
1119:     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1120:     PetscCall(VecGetKokkosView(rr, &rv));
1121:     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1122:       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1123:     PetscCall(VecRestoreKokkosView(rr, &rv));
1124:     PetscCall(PetscLogGpuFlops(nz));
1125:   }
1126:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1127:   PetscCall(PetscLogGpuTimeEnd());
1128:   PetscFunctionReturn(PETSC_SUCCESS);
1129: }

1131: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1132: {
1133:   Mat_SeqAIJKokkos *aijkok;

1135:   PetscFunctionBegin;
1136:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1137:   if (aijkok) { /* Only zero the device if data is already there */
1138:     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1139:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1140:   } else { /* Might be preallocated but not assembled */
1141:     PetscCall(MatZeroEntries_SeqAIJ(A));
1142:   }
1143:   PetscFunctionReturn(PETSC_SUCCESS);
1144: }

1146: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1147: {
1148:   Mat_SeqAIJKokkos     *aijkok;
1149:   PetscInt              n;
1150:   PetscScalarKokkosView xv;

1152:   PetscFunctionBegin;
1153:   PetscCall(VecGetLocalSize(x, &n));
1154:   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1155:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

1157:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1158:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

1160:   const auto &Aa    = aijkok->a_dual.view_device();
1161:   const auto &Ai    = aijkok->i_dual.view_device();
1162:   const auto &Adiag = aijkok->diag_dual.view_device();

1164:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1165:   Kokkos::parallel_for(
1166:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1167:       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1168:       else xv(i) = 0;
1169:     });
1170:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1171:   PetscFunctionReturn(PETSC_SUCCESS);
1172: }

1174: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1175: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1176: {
1177:   Mat_SeqAIJKokkos *aijkok;

1179:   PetscFunctionBegin;
1180:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1181:   PetscAssertPointer(kv, 2);
1182:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1183:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1184:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1185:   *kv    = aijkok->a_dual.view_device();
1186:   PetscFunctionReturn(PETSC_SUCCESS);
1187: }

1189: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1190: {
1191:   PetscFunctionBegin;
1192:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1193:   PetscAssertPointer(kv, 2);
1194:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1195:   PetscFunctionReturn(PETSC_SUCCESS);
1196: }

1198: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1199: {
1200:   Mat_SeqAIJKokkos *aijkok;

1202:   PetscFunctionBegin;
1203:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1204:   PetscAssertPointer(kv, 2);
1205:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1206:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1207:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1208:   *kv    = aijkok->a_dual.view_device();
1209:   PetscFunctionReturn(PETSC_SUCCESS);
1210: }

1212: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1213: {
1214:   PetscFunctionBegin;
1215:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1216:   PetscAssertPointer(kv, 2);
1217:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1218:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1219:   PetscFunctionReturn(PETSC_SUCCESS);
1220: }

1222: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1223: {
1224:   Mat_SeqAIJKokkos *aijkok;

1226:   PetscFunctionBegin;
1227:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1228:   PetscAssertPointer(kv, 2);
1229:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1230:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1231:   *kv    = aijkok->a_dual.view_device();
1232:   PetscFunctionReturn(PETSC_SUCCESS);
1233: }

1235: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1236: {
1237:   PetscFunctionBegin;
1238:   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1239:   PetscAssertPointer(kv, 2);
1240:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1241:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1242:   PetscFunctionReturn(PETSC_SUCCESS);
1243: }
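// The three accessor pairs above follow PETSc's get/restore convention: the read-only (const) pair
// syncs values to device; the read-write pair additionally marks them modified on restore; the
// write-only pair skips the sync since the caller is expected to overwrite every entry. A usage
// sketch (an illustration, not part of this file):
//
//   MatScalarKokkosView a;
//   PetscCall(MatSeqAIJGetKokkosView(A, &a)); // sync latest values to device
//   Kokkos::parallel_for(a.extent(0), KOKKOS_LAMBDA(const PetscInt k) { a(k) *= 2.0; }); // scale all nonzeros
//   PetscCall(MatSeqAIJRestoreKokkosView(A, &a)); // mark device data as modified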

1245: PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1246: {
1247:   Mat_SeqAIJKokkos *akok;

1249:   PetscFunctionBegin;
1250:   auto exec = PetscGetKokkosExecutionSpace();
1251:   // Create host copies of the input aij
1252:   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1253:   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1254:   // Don't copy the values to the host now
1255:   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);

1257:   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1258:   // Note we have modified device data so it will copy lazily
1259:   a_dual.modify_device();
1260:   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1261:   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);

1263:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1264:   PetscCall(MatCreate(comm, A));
1265:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1266:   PetscFunctionReturn(PETSC_SUCCESS);
1267: }
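// Usage sketch (an illustration, not part of this file): wrap device CSR arrays of a 2x2 identity
// in a Mat without copying the values off the device.
//
//   Kokkos::View<PetscInt *>    i_d("i", 3), j_d("j", 2); // fill on device with i = {0, 1, 2}, j = {0, 1}
//   Kokkos::View<PetscScalar *> a_d("a", 2);              // fill on device with a = {1.0, 1.0}
//   Mat B;
//   PetscCall(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, 2, 2, i_d, j_d, a_d, &B));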

1269: /* Computes Y += alpha X */
1270: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1271: {
1272:   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1273:   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1274:   ConstMatScalarKokkosView Xa;
1275:   MatScalarKokkosView      Ya;
1276:   auto                     exec = PetscGetKokkosExecutionSpace();

1278:   PetscFunctionBegin;
1279:   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1280:   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1281:   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1282:   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1283:   PetscCall(PetscLogGpuTimeBegin());

1285:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1286:     PetscBool e;
1287:     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1288:     if (e) {
1289:       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1290:       if (e) pattern = SAME_NONZERO_PATTERN;
1291:     }
1292:   }

1294:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knows the sparsity pattern of C, one can skip
1295:     cusparseDcsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz() and directly call cusparseDcsrgeam2().
1296:     If X is a SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1297:     KokkosSparse::spadd(alpha,A,beta,B,C) has separate symbolic and numeric phases, while MatAXPY does not.
1298:   */
1299:   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1300:   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1301:   Xa   = xkok->a_dual.view_device();
1302:   Ya   = ykok->a_dual.view_device();

1304:   if (pattern == SAME_NONZERO_PATTERN) {
1305:     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1306:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1307:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1308:     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1309:     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

1311:     Kokkos::parallel_for(
1312:       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1313:         PetscInt i = t.league_rank(); // row i
1314:         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1315:           // Only one thread works in a team
1316:           PetscInt p, q = Yi(i);
1317:           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1318:             while (q < Yi(i + 1) && Xj(p) != Yj(q)) q++; // find the matching nonzero on row i of Y; check the bound first to avoid reading past the row
1319:             if (q < Yi(i + 1) && Xj(p) == Yj(q)) {       // Found it
1320:               Ya(q) += alpha * Xa(p);
1321:               q++;
1322:             } else {
1323:             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1324:             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1325: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1326:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1327: #else
1328:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1329: #endif
1330:             }
1331:           }
1332:         });
1333:       });
1334:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1335:   } else { // different nonzero patterns
1336:     Mat             Z;
1337:     KokkosCsrMatrix zcsr;
1338:     KernelHandle    kh;
1339:     kh.create_spadd_handle(true); // X, Y are sorted
1340:     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1341:     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1342:     zkok = new Mat_SeqAIJKokkos(zcsr);
1343:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1344:     PetscCall(MatHeaderReplace(Y, &Z));
1345:     kh.destroy_spadd_handle();
1346:   }
1347:   PetscCall(PetscLogGpuTimeEnd());
1348:   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1349:   PetscFunctionReturn(PETSC_SUCCESS);
1350: }

1352: struct MatCOOStruct_SeqAIJKokkos {
1353:   PetscCount           n;
1354:   PetscCount           Atot;
1355:   PetscInt             nz;
1356:   PetscCountKokkosView jmap;
1357:   PetscCountKokkosView perm;

1359:   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1360:   {
1361:     nz   = coo_h->nz;
1362:     n    = coo_h->n;
1363:     Atot = coo_h->Atot;
1364:     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1365:     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1366:   }
1367: };
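// A worked example of the jmap/perm encoding (an illustration): suppose the user posted n = 3 COO
// entries {(0,0), (1,1), (0,0)} for a 2x2 matrix, so the assembled CSR has nz = 2 nonzeros. Then
//   jmap = {0, 2, 3} -- CSR nonzero 0 (the (0,0) entry) accumulates 2 inputs, nonzero 1 accumulates 1
//   perm = {0, 2, 1} -- the two (0,0) inputs are v[0] and v[2]; the (1,1) input is v[1]
// and MatSetValuesCOO below computes Aa(i) = sum over k in [jmap(i), jmap(i+1)) of v[perm(k)].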

1369: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1370: {
1371:   PetscFunctionBegin;
1372:   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1373:   PetscFunctionReturn(PETSC_SUCCESS);
1374: }

1376: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1377: {
1378:   Mat_SeqAIJKokkos          *akok;
1379:   Mat_SeqAIJ                *aseq;
1380:   PetscContainer             container_h;
1381:   MatCOOStruct_SeqAIJ       *coo_h;
1382:   MatCOOStruct_SeqAIJKokkos *coo_d;

1384:   PetscFunctionBegin;
1385:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1386:   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1387:   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1388:   delete akok;
1389:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1390:   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

1392:   // Copy the COO struct to device
1393:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1394:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1395:   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

1397:   // Put the COO struct in a container and then attach that to the matrix
1398:   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1399:   PetscFunctionReturn(PETSC_SUCCESS);
1400: }

1402: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1403: {
1404:   MatScalarKokkosView        Aa;
1405:   ConstMatScalarKokkosView   kv;
1406:   PetscMemType               memtype;
1407:   PetscContainer             container;
1408:   MatCOOStruct_SeqAIJKokkos *coo;

1410:   PetscFunctionBegin;
1411:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1412:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

1414:   const auto &n    = coo->n;
1415:   const auto &Annz = coo->nz;
1416:   const auto &jmap = coo->jmap;
1417:   const auto &perm = coo->perm;

1419:   PetscCall(PetscGetMemType(v, &memtype));
1420:   if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on host, copy it to device */
1421:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1422:   } else {
1423:     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1424:   }

1426:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1427:   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

1429:   PetscCall(PetscLogGpuTimeBegin());
1430:   Kokkos::parallel_for(
1431:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1432:       PetscScalar sum = 0.0;
1433:       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1434:       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1435:     });
1436:   PetscCall(PetscLogGpuTimeEnd());

1438:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1439:   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1440:   PetscFunctionReturn(PETSC_SUCCESS);
1441: }
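// Usage sketch (an illustration, not part of this file), matching the worked example above:
//
//   PetscInt    coo_i[] = {0, 1, 0};
//   PetscInt    coo_j[] = {0, 1, 0};
//   PetscScalar v[]     = {1.0, 2.0, 3.0}; // may live on host or device; PetscGetMemType() above picks the path
//   PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j));
//   PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // yields A(0,0) = 4.0, A(1,1) = 2.0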

1443: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1444: {
1445:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

1447:   PetscFunctionBegin;
1448:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1449:   A->boundtocpu  = PETSC_FALSE;

1451:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1452:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1453:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1454:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1455:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1456:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1457:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1458:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1459:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1460:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1461:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1462:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1463:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1464:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1465:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1466:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1467:   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1468:   A->ops->shift                     = MatShift_SeqAIJKokkos;
1469:   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1470:   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1471:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1472:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1473:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1474:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1475:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1476:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1477:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1479:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1480:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1481: #if defined(PETSC_HAVE_HYPRE)
1482:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1483: #endif
1484:   PetscFunctionReturn(PETSC_SUCCESS);
1485: }

1487: /*
1488:    Extract the (prescribed) diagonal blocks of the matrix and then invert them

1490:   Input Parameters:
1491: +  A       - the MATSEQAIJKOKKOS matrix
1492: .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1493: .  bs2     - squares of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1494: .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1495: -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine

1497:   Output Parameter:
1498: .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1499: */
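/* A worked example (an illustration): two diagonal blocks of sizes 2 and 3 give
     bs     = {0, 2, 5}       (block i spans rows bs(i) .. bs(i+1)-1)
     bs2    = {0, 4, 13}      (block i occupies (bs(i+1)-bs(i))^2 entries of diagVal[] starting at bs2(i))
     blkMap = {0, 0, 1, 1, 1} (rows 0-1 belong to block 0, rows 2-4 to block 1)
*/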
1500: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1501: {
1502:   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1503:   PetscInt          nblocks = bs.extent(0) - 1;

1505:   PetscFunctionBegin;
1506:   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

1508:   // Pull out the diagonal blocks of the matrix and then invert the blocks
1509:   auto Aa    = akok->a_dual.view_device();
1510:   auto Ai    = akok->i_dual.view_device();
1511:   auto Aj    = akok->j_dual.view_device();
1512:   auto Adiag = akok->diag_dual.view_device();
1513:   // TODO: how to tune the team size?
1514: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1515:   auto ts = Kokkos::AUTO();
1516: #else
1517:   auto ts = 16; // improved performance by 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1518: #endif
1519:   PetscCallCXX(Kokkos::parallel_for(
1520:     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1521:       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1522:       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1523:       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1524:       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1525:       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);

1527:       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1528:         PetscInt i = rstart + r;                                                            // i-th row in A

1530:         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1531:           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row

1533:           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1534:             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1535:               B(r, c) = 0.0;
1536:             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1537:               B(r, c) = Aa(first + c);
1538:             } else { // this entry does not show up in the CSR
1539:               B(r, c) = 0.0;
1540:             }
1541:           }
1542:         } else { // rare case that the diagonal does not exist
1543:           const PetscInt begin = Ai(i);
1544:           const PetscInt end   = Ai(i + 1);
1545:           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1546:           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1547:             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1548:             else if (Aj(j) >= rstart + m) break;
1549:           }
1550:         }
1551:       });

1553:       // LU-decompose B (w/o pivoting) and then invert B
1554:       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1555:       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1556:     }));
1557:   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1558:   PetscFunctionReturn(PETSC_SUCCESS);
1559: }

1561: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1562: {
1563:   Mat_SeqAIJ *aseq;
1564:   PetscInt    i, m, n;
1565:   auto        exec = PetscGetKokkosExecutionSpace();

1567:   PetscFunctionBegin;
1568:   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");

1570:   m = akok->nrows();
1571:   n = akok->ncols();
1572:   PetscCall(MatSetSizes(A, m, n, m, n));
1573:   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));

1575:   /* Set up data structures of A as a MATSEQAIJ */
1576:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1577:   aseq = (Mat_SeqAIJ *)A->data;

1579:   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1580:   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));

1582:   aseq->i       = akok->i_host_data();
1583:   aseq->j       = akok->j_host_data();
1584:   aseq->a       = akok->a_host_data();
1585:   aseq->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
1586:   aseq->free_a  = PETSC_FALSE;
1587:   aseq->free_ij = PETSC_FALSE;
1588:   aseq->nz      = akok->nnz();
1589:   aseq->maxnz   = aseq->nz;

1591:   PetscCall(PetscMalloc1(m, &aseq->imax));
1592:   PetscCall(PetscMalloc1(m, &aseq->ilen));
1593:   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

1595:   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1596:   akok->nonzerostate = A->nonzerostate;
1597:   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1598:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1599:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1600:   PetscFunctionReturn(PETSC_SUCCESS);
1601: }

1603: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1604: {
1605:   PetscFunctionBegin;
1606:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1607:   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1608:   PetscFunctionReturn(PETSC_SUCCESS);
1609: }

1611: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1612: {
1613:   Mat_SeqAIJKokkos *akok;

1615:   PetscFunctionBegin;
1616:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1617:   PetscCall(MatCreate(comm, A));
1618:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1619:   PetscFunctionReturn(PETSC_SUCCESS);
1620: }

1622: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1624:    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1625:  */
1626: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1627: {
1628:   PetscFunctionBegin;
1629:   PetscCall(MatCreate(comm, A));
1630:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1631:   PetscFunctionReturn(PETSC_SUCCESS);
1632: }

1634: /*@C
1635:   MatCreateSeqAIJKokkos - Creates a sequential sparse matrix in `MATSEQAIJKOKKOS`
1636:   (compressed row) format. This matrix will ultimately be handled by
1637:   Kokkos for calculations.

1639:   Collective

1641:   Input Parameters:
1642: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1643: . m    - number of rows
1644: . n    - number of columns
1645: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1646: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

1648:   Output Parameter:
1649: . A - the matrix

1651:   Level: intermediate

1653:   Notes:
1654:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1655:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1656:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1658:   The AIJ format, also called
1659:   compressed row storage, is fully compatible with standard Fortran
1660:   storage.  That is, the stored row and column indices can begin at
1661:   either one (as in Fortran) or zero.

1663:   Specify the preallocated storage with either `nz` or `nnz` (not both).
1664:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1665:   allocation.

1667: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1668: @*/
1669: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1670: {
1671:   PetscFunctionBegin;
1672:   PetscCall(PetscKokkosInitializeCheck());
1673:   PetscCall(MatCreate(comm, A));
1674:   PetscCall(MatSetSizes(*A, m, n, m, n));
1675:   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1676:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1677:   PetscFunctionReturn(PETSC_SUCCESS);
1678: }
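// Usage sketch (an illustration, not part of this file) of the paradigm recommended in the notes above:
//
//   Mat A;
//   PetscCall(MatCreate(PETSC_COMM_SELF, &A));
//   PetscCall(MatSetSizes(A, 10, 10, 10, 10));
//   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
//   PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL)); // about 5 nonzeros per row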

1680: // After matrix numeric factorization, there are still steps to do before a triangular solve can be called.
1681: // For example, for transpose solves we might need to compute the transpose matrices explicitly if the solver does not support them (KokkosKernels does not, while cusparse does).
1682: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1683: // Similarly, in KokkosKernels sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1684: static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1685: {
1686:   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1687:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1688:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky

1690:   PetscFunctionBegin;
1691:   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1692:     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1693:     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1694:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1695:   }
1696:   PetscFunctionReturn(PETSC_SUCCESS);
1697: }

1699: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1700: {
1701:   const PetscInt              n         = A->rmap->n;
1702:   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1703:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1704:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky

1706:   PetscFunctionBegin;
1707:   if (!factors->transpose_updated) {
1708:     if (has_upper) {
1709:       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1710:         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1711:         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1712:         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1713:       }

1715:       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1716:         if (!factors->U) {
1717:           Mat_SeqAIJ *seq;

1719:           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1720:           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));

1722:           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1723:           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1724:           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1725:           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1726:         } else {
1727:           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut's data is aliased with {i, j, a}Ut_h
1728:         }
1729:         // Copy Ut from host to device
1730:         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1731:         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1732:         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1733:       } else { // If U was computed on device, we also compute the transpose there
1734:         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1735:         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1736:                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1737:                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1738:                                                                                                                                                                                                                                factors->aUt_d));
1739:         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1740:       }
1741:       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1742:     }

1744:     // do the same for L with LU
1745:     if (has_lower) {
1746:       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1747:         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1748:         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1749:         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1750:       }

1752:       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1753:         if (!factors->L) {
1754:           Mat_SeqAIJ *seq;

1756:           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1757:           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));

1759:           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1760:           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1761:           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1762:           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1763:         } else {
1764:           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt's data is aliased with {i, j, a}Lt_h
1765:         }
1766:         // Copy Lt from host to device
1767:         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1768:         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1769:         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1770:       } else { // If L was computed on device, we also compute the transpose there
1771:         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1772:         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1773:                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1774:                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1775:                                                                                                                                                                                                                                factors->aLt_d));
1776:         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1777:       }
1778:       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1779:     }

1781:     factors->transpose_updated = PETSC_TRUE;
1782:   }
1783:   PetscFunctionReturn(PETSC_SUCCESS);
1784: }

1786: // Solve Ax = b, with R A R^T = U^T D U, where R is the row (and column) permutation matrix on A.
1787: // R is represented by rowperm in factors. If R is identity (i.e., no reordering), then rowperm is empty.
1788: static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1789: {
1790:   auto                        exec    = PetscGetKokkosExecutionSpace();
1791:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1792:   PetscInt                    m       = A->rmap->n;
1793:   PetscScalarKokkosView       D       = factors->D_d;
1794:   PetscScalarKokkosView       X, Y, B; // alias
1795:   ConstPetscScalarKokkosView  b;
1796:   PetscScalarKokkosView       x;
1797:   PetscIntKokkosView         &rowperm  = factors->rowperm;
1798:   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1800:   PetscFunctionBegin;
1801:   PetscCall(PetscLogGpuTimeBegin());
1802:   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1803:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1804:   PetscCall(VecGetKokkosView(bb, &b));
1805:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1807:   // Solve U^T Y = B
1808:   if (identity) { // No reordering needed; use b directly
1809:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1810:     Y = factors->workVector;
1811:   } else {
1812:     B = factors->workVector;
1813:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1814:     Y = x;
1815:   }
1816:   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

1818:   // Solve diag(D) Y' = Y.
1819:   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1820:   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));

1822:   // Solve UX = Y
1823:   if (identity) {
1824:     X = x;
1825:   } else {
1826:     X = factors->workVector; // B is not needed anymore
1827:   }
1828:   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

1830:   // Reorder X with the inverse column (row) permutation
1831:   if (!identity) {
1832:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1833:   }

1835:   PetscCall(VecRestoreKokkosView(bb, &b));
1836:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1837:   PetscCall(PetscLogGpuTimeEnd());
1838:   PetscFunctionReturn(PETSC_SUCCESS);
1839: }

1841: // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1842: // R and C are represented by rowperm and colperm in factors.
1843: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1844: static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1845: {
1846:   auto                        exec    = PetscGetKokkosExecutionSpace();
1847:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1848:   PetscInt                    m       = A->rmap->n;
1849:   PetscScalarKokkosView       X, Y, B; // alias
1850:   ConstPetscScalarKokkosView  b;
1851:   PetscScalarKokkosView       x;
1852:   PetscIntKokkosView         &rowperm      = factors->rowperm;
1853:   PetscIntKokkosView         &colperm      = factors->colperm;
1854:   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1855:   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1857:   PetscFunctionBegin;
1858:   PetscCall(PetscLogGpuTimeBegin());
1859:   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1860:   PetscCall(VecGetKokkosView(bb, &b));
1861:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1863:   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1864:   if (row_identity) {
1865:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1866:     Y = factors->workVector;
1867:   } else {
1868:     B = factors->workVector;
1869:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1870:     Y = x;
1871:   }
1872:   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));

1874:   // Solve U C^- x = Y
1875:   if (col_identity) {
1876:     X = x;
1877:   } else {
1878:     X = factors->workVector;
1879:   }
1880:   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

1882:   // x = C X; Reorder X with the inverse col permutation
1883:   if (!col_identity) {
1884:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1885:   }

1887:   PetscCall(VecRestoreKokkosView(bb, &b));
1888:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1889:   PetscCall(PetscLogGpuTimeEnd());
1890:   PetscFunctionReturn(PETSC_SUCCESS);
1891: }

1893: // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1894: // R and C are represented by rowperm and colperm in factors.
1895: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1896: // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
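// In solve order this reads: apply the column permutation (B = C^T b), solve U^T Y = B, solve
// L^T X = Y, then apply the inverse row permutation (x = R^T X) -- exactly the sequence below.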
1897: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1898: {
1899:   auto                        exec    = PetscGetKokkosExecutionSpace();
1900:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1901:   PetscInt                    m       = A->rmap->n;
1902:   PetscScalarKokkosView       X, Y, B; // alias
1903:   ConstPetscScalarKokkosView  b;
1904:   PetscScalarKokkosView       x;
1905:   PetscIntKokkosView         &rowperm      = factors->rowperm;
1906:   PetscIntKokkosView         &colperm      = factors->colperm;
1907:   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1908:   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1910:   PetscFunctionBegin;
1911:   PetscCall(PetscLogGpuTimeBegin());
1912:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1913:   PetscCall(VecGetKokkosView(bb, &b));
1914:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1916:   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1917:   if (col_identity) { // No reordering needed; use b directly
1918:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1919:     Y = factors->workVector;
1920:   } else {
1921:     B = factors->workVector;
1922:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1923:     Y = x;
1924:   }
1925:   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

1927:   // Solve L^T X = Y
1928:   if (row_identity) {
1929:     X = x;
1930:   } else {
1931:     X = factors->workVector;
1932:   }
1933:   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));

1935:   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1936:   if (!row_identity) {
1937:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1938:   }

1940:   PetscCall(VecRestoreKokkosView(bb, &b));
1941:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1942:   PetscCall(PetscLogGpuTimeEnd());
1943:   PetscFunctionReturn(PETSC_SUCCESS);
1944: }

1946: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1947: {
1948:   PetscFunctionBegin;
1949:   PetscCall(MatSeqAIJKokkosSyncHost(A));
1950:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));

1952:   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1953:     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1954:     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1955:     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1956:     const MatScalar            *Ba = b->a;
1957:     PetscInt                    m = B->rmap->n, n = B->cmap->n;
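    // Layout note (derived from the copy loops below): PETSc's factored SeqAIJ stores L and U in one
    // matrix. Row i of L (without its unit diagonal) lives at Bj/Ba[Bi[i] .. Bi[i+1]), while row i of U
    // is addressed backwards through Bdiag: its diagonal (stored inverted) is at Bdiag[i] and its
    // off-diagonal entries at Bdiag[i+1]+1 .. Bdiag[i]-1. Below we split this into separate L (unit
    // diagonal added) and U (diagonal un-inverted) CSR factors that KokkosKernels sptrsv can consume.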

1959:     if (factors->iL_h.extent(0) == 0) { // First numeric factorization
1960:       // Allocate memory and copy the L, U structure
1961:       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1962:       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1963:       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1964:       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1965:       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1966:       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));

1968:       PetscInt *Li = factors->iL_h.data();
1969:       PetscInt *Lj = factors->jL_h.data();
1970:       PetscInt *Ui = factors->iU_h.data();
1971:       PetscInt *Uj = factors->jU_h.data();

1973:       Li[0] = Ui[0] = 0;
1974:       for (PetscInt i = 0; i < m; i++) {
1975:         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1976:         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry

1978:         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1979:         Lj[Li[i] + llen] = i;                                   // diagonal entry of L

1981:         Uj[Ui[i]] = i;                                                             // diagonal entry of U
1982:         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal

1984:         Li[i + 1] = Li[i] + llen + 1;
1985:         Ui[i + 1] = Ui[i] + ulen;
1986:       }

1988:       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1989:       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1990:       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1991:       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1992:       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1993:       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);

1995:       // Copy row/col permutation to device
1996:       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1997:       PetscBool row_identity;
1998:       PetscCall(ISIdentity(rowperm, &row_identity));
1999:       if (!row_identity) {
2000:         const PetscInt *ip;

2002:         PetscCall(ISGetIndices(rowperm, &ip));
2003:         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
2004:         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2005:         PetscCall(ISRestoreIndices(rowperm, &ip));
2006:         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2007:       }

2009:       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
2010:       PetscBool col_identity;
2011:       PetscCall(ISIdentity(colperm, &col_identity));
2012:       if (!col_identity) {
2013:         const PetscInt *ip;

2015:         PetscCall(ISGetIndices(colperm, &ip));
2016:         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2017:         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2018:         PetscCall(ISRestoreIndices(colperm, &ip));
2019:         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2020:       }

2022:       /* Create sptrsv handles for L, U and their transpose */
2023: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2024:       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2025: #else
2026:       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2027: #endif
2028:       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2029:       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2030:       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2031:       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2032:     }

2034:     // Copy the value
2035:     for (PetscInt i = 0; i < m; i++) {
2036:       PetscInt        llen = Bi[i + 1] - Bi[i];
2037:       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2038:       const PetscInt *Li   = factors->iL_h.data();
2039:       const PetscInt *Ui   = factors->iU_h.data();

2041:       PetscScalar *La = factors->aL_h.data();
2042:       PetscScalar *Ua = factors->aU_h.data();

2044:       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2045:       La[Li[i] + llen] = 1.0;                                 // unit diagonal entry of L

2047:       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                            // diagonal entry of U (Ba stores its inverse)
2048:       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2049:     }

2051:     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2052:     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2053:     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2054:     factors->transpose_updated         = PETSC_FALSE;
2055:     factors->sptrsv_symbolic_completed = PETSC_FALSE;

2057:     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2058:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2059:   }

2061:   B->ops->matsolve          = NULL;
2062:   B->ops->matsolvetranspose = NULL;
2063:   PetscFunctionReturn(PETSC_SUCCESS);
2064: }

2066: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2067: {
2068:   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2069:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2070:   PetscInt                    fill_lev = info->levels;

2072:   PetscFunctionBegin;
2073:   PetscCall(PetscLogGpuTimeBegin());
2074:   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2075:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

2077:   auto a_d = aijkok->a_dual.view_device();
2078:   auto i_d = aijkok->i_dual.view_device();
2079:   auto j_d = aijkok->j_dual.view_device();

2081:   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));

2083:   B->assembled              = PETSC_TRUE;
2084:   B->preallocated           = PETSC_TRUE;
2085:   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2086:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2087:   B->ops->matsolve          = NULL;
2088:   B->ops->matsolvetranspose = NULL;

2090:   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2091:   factors->transpose_updated         = PETSC_FALSE;
2092:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2093:   /* TODO: log flops, but how to know that? */
2094:   PetscCall(PetscLogGpuTimeEnd());
2095:   PetscFunctionReturn(PETSC_SUCCESS);
2096: }

2098: // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2099: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2100: {
2101:   Mat_SeqAIJKokkos           *aijkok;
2102:   Mat_SeqAIJ                 *b;
2103:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2104:   PetscInt                    fill_lev = info->levels;
2105:   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2106:   PetscInt                    n        = A->rmap->n;

2108:   PetscFunctionBegin;
2109:   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false since we are factoring on device");
2110:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

2112:   /* Create a spiluk handle and then do symbolic factorization */
2113:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2114:   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

2116:   auto spiluk_handle = factors->kh.get_spiluk_handle();

2118:   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2119:   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2120:   Kokkos::realloc(factors->iU_d, n + 1);
2121:   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

2123:   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2124:   auto i_d = aijkok->i_dual.view_device();
2125:   auto j_d = aijkok->j_dual.view_device();
2126:   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2127:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

2129:   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2130:   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2131:   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2132:   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

2134:   /* TODO: add options to select sptrsv algorithms */
2135:   /* Create sptrsv handles for L, U and their transpose */
2136: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2137:   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2138: #else
2139:   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2140: #endif

2142:   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2143:   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2144:   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2145:   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

2147:   /* Fill fields of the factor matrix B */
2148:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2149:   b     = (Mat_SeqAIJ *)B->data;
2150:   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2151:   B->info.fill_ratio_given  = info->fill;
2152:   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;

2154:   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2155:   PetscFunctionReturn(PETSC_SUCCESS);
2156: }

2158: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2159: {
2160:   PetscFunctionBegin;
2161:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2162:   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2163:   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2164:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2165:   PetscFunctionReturn(PETSC_SUCCESS);
2166: }

2168: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2169: {
2170:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;

2172:   PetscFunctionBegin;
2173:   if (!info->factoronhost) {
2174:     PetscCall(ISIdentity(isrow, &row_identity));
2175:     PetscCall(ISIdentity(iscol, &col_identity));
2176:   }

2178:   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2179:   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));

2181:   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2182:     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2183:   } else {
2184:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2185:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2186:   }
2187:   PetscFunctionReturn(PETSC_SUCCESS);
2188: }
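// Note on the dispatch above (an observation, not a new rule): the device ILU(0) path is taken only
// for zero fill levels with identity row/col orderings and device-side factorization; any other
// combination falls back to PETSc's host ILU, with the numeric phase done by MatLUFactorNumeric_SeqAIJKokkos.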

2190: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2191: {
2192:   PetscFunctionBegin;
2193:   PetscCall(MatSeqAIJKokkosSyncHost(A));
2194:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));

2196:   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2197:     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2198:     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2199:     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2200:     const MatScalar            *Ba = b->a;
2201:     PetscInt                    m  = B->rmap->n;

2203:     if (factors->iU_h.extent(0) == 0) { // First numeric factorization
2204:       // Allocate memory and copy the structure
2205:       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2206:       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2207:       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2208:       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2209:       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2210:       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);

2212:       // Build jU_h from the skewed Bj: row i of Bj stores U's off-diagonal column indices packed at the front, so prepend the diagonal index and shift the rest right by one
2213:       PetscInt *Uj = factors->jU_h.data();
2214:       for (PetscInt i = 0; i < m; i++) {
2215:         PetscInt ulen = Bi[i + 1] - Bi[i];
2216:         Uj[Bi[i]]     = i;                                              // diagonal entry
2217:         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2218:       }

2220:       // Copy iU, jU to device
2221:       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2222:       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));

2224:       // Copy row/col permutation to device
2225:       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2226:       PetscBool row_identity;
2227:       PetscCall(ISIdentity(rowperm, &row_identity));
2228:       if (!row_identity) {
2229:         const PetscInt *ip;

2231:         PetscCall(ISGetIndices(rowperm, &ip));
2232:         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2233:         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2234:         PetscCall(ISRestoreIndices(rowperm, &ip));
2235:         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2236:       }

2238:       // Create sptrsv handles for U and U^T
2239: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2240:       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2241: #else
2242:       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2243: #endif
2244:       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2245:       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2246:     }
2247:     // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2248:     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2249:     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;

2251:     // Copy the value
2252:     PetscScalar *Ua = factors->aU_h.data();
2253:     PetscScalar *D  = factors->D_h.data();
2254:     for (PetscInt i = 0; i < m; i++) {
2255:       D[i]      = Ba[Bdiag[i]];     // Ba[Bdiag[i]] actually stores the inverse of the diagonal
2256:       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2257:       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2258:     }
2259:     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2260:     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));

2262:     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2263:     factors->transpose_updated         = PETSC_FALSE;
2264:   }

2266:   B->ops->matsolve          = NULL;
2267:   B->ops->matsolvetranspose = NULL;
2268:   PetscFunctionReturn(PETSC_SUCCESS);
2269: }

2271: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2272: {
2273:   PetscFunctionBegin;
2274:   if (info->solveonhost) {
2275:     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2276:     PetscCall(MatSetType(B, MATSEQSBAIJ));
2277:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2278:   }

2280:   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));

2282:   if (!info->solveonhost) {
2283:     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2284:     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2285:     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2286:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2287:   }
2288:   PetscFunctionReturn(PETSC_SUCCESS);
2289: }

2291: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2292: {
2293:   PetscFunctionBegin;
2294:   if (info->solveonhost) {
2295:     // If solving on host, we have to change the matrix type, since eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2296:     PetscCall(MatSetType(B, MATSEQSBAIJ));
2297:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2298:   }

2300:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm

2302:   if (!info->solveonhost) {
2303:     // If solving on device, B is still a MATSEQAIJKOKKOS, so we are free to allocate B->spptr
2304:     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2305:     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2306:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2307:   }
2308:   PetscFunctionReturn(PETSC_SUCCESS);
2309: }
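      // A typical way to reach the Cholesky/ICC routines above from user code (a
      // sketch, assuming a standard KSP setup with an assembled matrix A):
      //
      //   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
      //   PetscCall(KSPGetPC(ksp, &pc));
      //   PetscCall(PCSetType(pc, PCCHOLESKY));
      //   PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS));
      //
      // or equivalently on the command line:
      //
      //   -mat_type aijkokkos -pc_type cholesky -pc_factor_mat_solver_type kokkos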

2311: // The _Kokkos suffix means we will use Kokkos as the solver for the SeqAIJKokkos matrix
2312: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2313: {
2314:   PetscFunctionBegin;
2315:   *type = MATSOLVERKOKKOS;
2316:   PetscFunctionReturn(PETSC_SUCCESS);
2317: }

2319: /*MC
2320:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2321:   of type `MATSEQAIJKOKKOS`, `MATAIJKOKKOS` on a single GPU.

2323:   Level: beginner

2325: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2326: M*/
2327: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2328: {
2329:   PetscInt n = A->rmap->n;
2330:   MPI_Comm comm;

2332:   PetscFunctionBegin;
2333:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2334:   PetscCall(MatCreate(comm, B));
2335:   PetscCall(MatSetSizes(*B, n, n, n, n));
2336:   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2337:   (*B)->factortype = ftype;
2338:   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2339:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2340:   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");

2342:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2343:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2344:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2345:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2346:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2347:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2348:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2349:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2350:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2351:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2352:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2353:   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

2355:   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic(), etc., though we do it on host
2356:   (*B)->canuseordering = PETSC_TRUE;
2357:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2358:   PetscFunctionReturn(PETSC_SUCCESS);
2359: }
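      // The factor interface can also be driven directly, bypassing PC. A sketch,
      // assuming the caller provides the ordering IS's and a MatFactorInfo:
      //
      //   Mat F;
      //   PetscCall(MatGetFactor(A, MATSOLVERKOKKOS, MAT_FACTOR_LU, &F));
      //   PetscCall(MatLUFactorSymbolic(F, A, rowperm, colperm, &info));
      //   PetscCall(MatLUFactorNumeric(F, A, &info));
      //   PetscCall(MatSolve(F, b, x)); // dispatches to the device solvers set up above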

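      // Each MatSolverTypeRegister() call below maps a (solver, matrix type, factor
      // type) triple to MatGetFactor_SeqAIJKokkos_Kokkos() above; this table is what
      // MatGetFactor() and PCFactorSetMatSolverType() consult at runtime to locate
      // the Kokkos factorizations.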
2361: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2362: {
2363:   PetscFunctionBegin;
2364:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2365:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2366:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2367:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2368:   PetscFunctionReturn(PETSC_SUCCESS);
2369: }

2371: /* Utility to print out a KokkosCsrMatrix for debugging */
2372: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2373: {
2374:   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2375:   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2376:   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2377:   const PetscInt    *i  = iv.data();
2378:   const PetscInt    *j  = jv.data();
2379:   const PetscScalar *a  = av.data();
2380:   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();

2382:   PetscFunctionBegin;
2383:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " KokkosCsrMatrix, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2384:   for (PetscInt k = 0; k < m; k++) {
2385:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2386:     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2387:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2388:   }
2389:   PetscFunctionReturn(PETSC_SUCCESS);
2390: }
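      // Illustrative output for a 2 x 2 identity matrix, given the format strings above:
      //
      //   2 x 2 KokkosCsrMatrix, with 2 nonzeros
      //   0: 0(1.0),
      //   1: 1(1.0),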