Actual source code: aijkok.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <petscpkg_version.h>
3: #include <petsc/private/petscimpl.h>
4: #include <petsc/private/sfimpl.h>
5: #include <petscsystypes.h>
6: #include <petscerror.h>
8: #include <Kokkos_Core.hpp>
9: #include <KokkosBlas.hpp>
10: #include <KokkosSparse_CrsMatrix.hpp>
11: #include <KokkosSparse_spmv.hpp>
12: #include <KokkosSparse_spiluk.hpp>
13: #include <KokkosSparse_sptrsv.hpp>
14: #include <KokkosSparse_spgemm.hpp>
15: #include <KokkosSparse_spadd.hpp>
17: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99)
20: #include <KokkosSparse_Utils.hpp>
21: using KokkosSparse::sort_crs_matrix;
22: using KokkosSparse::Impl::transpose_matrix;
23: #else
24: #include <KokkosKernels_Sorting.hpp>
25: using KokkosKernels::sort_crs_matrix;
26: using KokkosKernels::Impl::transpose_matrix;
27: #endif
29: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
31: /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32: we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33: In the latter case, it is important to set a_dual's sync state correctly.
34: */
35: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36: {
37: Mat_SeqAIJ *aijseq;
38: Mat_SeqAIJKokkos *aijkok;
40: if (mode == MAT_FLUSH_ASSEMBLY) return 0;
41: MatAssemblyEnd_SeqAIJ(A, mode);
43: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
44: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
46: /* If aijkok does not exist, we just copy i, j to device.
47: If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
48: In both cases, we build a new aijkok structure.
49: */
50: if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
51: delete aijkok;
52: aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
53: A->spptr = aijkok;
54: }
56: if (aijkok->device_mat_d.data()) {
57: A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
58: }
59: return 0;
60: }
62: /* Sync CSR data to device if not yet */
63: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
64: {
65: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
69: if (aijkok->a_dual.need_sync_device()) {
70: aijkok->a_dual.sync_device();
71: aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
72: aijkok->hermitian_updated = PETSC_FALSE;
73: }
74: return 0;
75: }
77: /* Mark the CSR data on device as modified */
78: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
79: {
80: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
83: aijkok->a_dual.clear_sync_state();
84: aijkok->a_dual.modify_device();
85: aijkok->transpose_updated = PETSC_FALSE;
86: aijkok->hermitian_updated = PETSC_FALSE;
87: MatSeqAIJInvalidateDiagonal(A);
88: PetscObjectStateIncrease((PetscObject)A);
89: return 0;
90: }
92: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
93: {
94: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
97: /* We do not expect one needs factors on host */
100: aijkok->a_dual.sync_host();
101: return 0;
102: }
104: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
105: {
106: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
108: /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
109: Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
110: reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
111: must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
112: */
113: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
114: aijkok->a_dual.sync_host();
115: *array = aijkok->a_dual.view_host().data();
116: } else { /* Happens when calling MatSetValues on a newly created matrix */
117: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
118: }
119: return 0;
120: }
122: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
123: {
124: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
126: if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
127: return 0;
128: }
130: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
131: {
132: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
134: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
135: aijkok->a_dual.sync_host();
136: *array = aijkok->a_dual.view_host().data();
137: } else {
138: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
139: }
140: return 0;
141: }
143: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144: {
145: *array = NULL;
146: return 0;
147: }
149: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
150: {
151: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
153: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154: *array = aijkok->a_dual.view_host().data();
155: } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
156: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
157: }
158: return 0;
159: }
161: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
162: {
163: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
165: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
166: aijkok->a_dual.clear_sync_state();
167: aijkok->a_dual.modify_host();
168: }
169: return 0;
170: }
172: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
173: {
174: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
178: if (i) *i = aijkok->i_device_data();
179: if (j) *j = aijkok->j_device_data();
180: if (a) {
181: aijkok->a_dual.sync_device();
182: *a = aijkok->a_device_data();
183: }
184: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
185: return 0;
186: }
188: // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
189: PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
190: {
191: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
192: Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
195: aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
196: Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
197: return 0;
198: }
200: // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
201: PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
202: {
203: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205: if (aijkok && aijkok->device_mat_d.data()) {
206: *d_mat = aijkok->device_mat_d.data();
207: } else {
208: MatSeqAIJKokkosSyncDevice(A); // create aijkok (we are making d_mat now so make a place for it)
209: *d_mat = NULL;
210: }
211: return 0;
212: }
214: /* Generate the transpose on device and cache it internally */
215: static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
216: {
217: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
220: if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
221: /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
222: aijkok->a_dual.sync_device();
223: aijkok->csrmatT = transpose_matrix(aijkok->csrmat);
224: sort_crs_matrix(aijkok->csrmatT);
225: aijkok->transpose_updated = PETSC_TRUE;
226: }
227: *csrmatT = &aijkok->csrmatT;
228: return 0;
229: }
231: /* Generate the Hermitian on device and cache it internally */
232: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
233: {
234: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
236: PetscLogGpuTimeBegin();
238: if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
239: aijkok->a_dual.sync_device();
240: aijkok->csrmatH = transpose_matrix(aijkok->csrmat);
241: sort_crs_matrix(aijkok->csrmatH);
242: #if defined(PETSC_USE_COMPLEX)
243: const auto &a = aijkok->csrmatH.values;
244: Kokkos::parallel_for(
245: a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); });
246: #endif
247: aijkok->hermitian_updated = PETSC_TRUE;
248: }
249: *csrmatH = &aijkok->csrmatH;
250: PetscLogGpuTimeEnd();
251: return 0;
252: }
254: /* y = A x */
255: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
256: {
257: Mat_SeqAIJKokkos *aijkok;
258: ConstPetscScalarKokkosView xv;
259: PetscScalarKokkosView yv;
261: PetscLogGpuTimeBegin();
262: MatSeqAIJKokkosSyncDevice(A);
263: VecGetKokkosView(xx, &xv);
264: VecGetKokkosViewWrite(yy, &yv);
265: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
266: KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A x + beta y */
267: VecRestoreKokkosView(xx, &xv);
268: VecRestoreKokkosViewWrite(yy, &yv);
269: /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
270: PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz());
271: PetscLogGpuTimeEnd();
272: return 0;
273: }
275: /* y = A^T x */
276: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
277: {
278: Mat_SeqAIJKokkos *aijkok;
279: const char *mode;
280: ConstPetscScalarKokkosView xv;
281: PetscScalarKokkosView yv;
282: KokkosCsrMatrix *csrmat;
284: PetscLogGpuTimeBegin();
285: MatSeqAIJKokkosSyncDevice(A);
286: VecGetKokkosView(xx, &xv);
287: VecGetKokkosViewWrite(yy, &yv);
288: if (A->form_explicit_transpose) {
289: MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat);
290: mode = "N";
291: } else {
292: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
293: csrmat = &aijkok->csrmat;
294: mode = "T";
295: }
296: KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^T x + beta y */
297: VecRestoreKokkosView(xx, &xv);
298: VecRestoreKokkosViewWrite(yy, &yv);
299: PetscLogGpuFlops(2.0 * csrmat->nnz());
300: PetscLogGpuTimeEnd();
301: return 0;
302: }
304: /* y = A^H x */
305: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
306: {
307: Mat_SeqAIJKokkos *aijkok;
308: const char *mode;
309: ConstPetscScalarKokkosView xv;
310: PetscScalarKokkosView yv;
311: KokkosCsrMatrix *csrmat;
313: PetscLogGpuTimeBegin();
314: MatSeqAIJKokkosSyncDevice(A);
315: VecGetKokkosView(xx, &xv);
316: VecGetKokkosViewWrite(yy, &yv);
317: if (A->form_explicit_transpose) {
318: MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat);
319: mode = "N";
320: } else {
321: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
322: csrmat = &aijkok->csrmat;
323: mode = "C";
324: }
325: KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^H x + beta y */
326: VecRestoreKokkosView(xx, &xv);
327: VecRestoreKokkosViewWrite(yy, &yv);
328: PetscLogGpuFlops(2.0 * csrmat->nnz());
329: PetscLogGpuTimeEnd();
330: return 0;
331: }
333: /* z = A x + y */
334: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
335: {
336: Mat_SeqAIJKokkos *aijkok;
337: ConstPetscScalarKokkosView xv, yv;
338: PetscScalarKokkosView zv;
340: PetscLogGpuTimeBegin();
341: MatSeqAIJKokkosSyncDevice(A);
342: VecGetKokkosView(xx, &xv);
343: VecGetKokkosView(yy, &yv);
344: VecGetKokkosViewWrite(zz, &zv);
345: if (zz != yy) Kokkos::deep_copy(zv, yv);
346: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
347: KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A x + beta z */
348: VecRestoreKokkosView(xx, &xv);
349: VecRestoreKokkosView(yy, &yv);
350: VecRestoreKokkosViewWrite(zz, &zv);
351: PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz());
352: PetscLogGpuTimeEnd();
353: return 0;
354: }
356: /* z = A^T x + y */
357: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
358: {
359: Mat_SeqAIJKokkos *aijkok;
360: const char *mode;
361: ConstPetscScalarKokkosView xv, yv;
362: PetscScalarKokkosView zv;
363: KokkosCsrMatrix *csrmat;
365: PetscLogGpuTimeBegin();
366: MatSeqAIJKokkosSyncDevice(A);
367: VecGetKokkosView(xx, &xv);
368: VecGetKokkosView(yy, &yv);
369: VecGetKokkosViewWrite(zz, &zv);
370: if (zz != yy) Kokkos::deep_copy(zv, yv);
371: if (A->form_explicit_transpose) {
372: MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat);
373: mode = "N";
374: } else {
375: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
376: csrmat = &aijkok->csrmat;
377: mode = "T";
378: }
379: KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^T x + beta z */
380: VecRestoreKokkosView(xx, &xv);
381: VecRestoreKokkosView(yy, &yv);
382: VecRestoreKokkosViewWrite(zz, &zv);
383: PetscLogGpuFlops(2.0 * csrmat->nnz());
384: PetscLogGpuTimeEnd();
385: return 0;
386: }
388: /* z = A^H x + y */
389: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
390: {
391: Mat_SeqAIJKokkos *aijkok;
392: const char *mode;
393: ConstPetscScalarKokkosView xv, yv;
394: PetscScalarKokkosView zv;
395: KokkosCsrMatrix *csrmat;
397: PetscLogGpuTimeBegin();
398: MatSeqAIJKokkosSyncDevice(A);
399: VecGetKokkosView(xx, &xv);
400: VecGetKokkosView(yy, &yv);
401: VecGetKokkosViewWrite(zz, &zv);
402: if (zz != yy) Kokkos::deep_copy(zv, yv);
403: if (A->form_explicit_transpose) {
404: MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat);
405: mode = "N";
406: } else {
407: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
408: csrmat = &aijkok->csrmat;
409: mode = "C";
410: }
411: KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^H x + beta z */
412: VecRestoreKokkosView(xx, &xv);
413: VecRestoreKokkosView(yy, &yv);
414: VecRestoreKokkosViewWrite(zz, &zv);
415: PetscLogGpuFlops(2.0 * csrmat->nnz());
416: PetscLogGpuTimeEnd();
417: return 0;
418: }
420: PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
421: {
422: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
424: switch (op) {
425: case MAT_FORM_EXPLICIT_TRANSPOSE:
426: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
427: if (A->form_explicit_transpose && !flg && aijkok) aijkok->DestroyMatTranspose();
428: A->form_explicit_transpose = flg;
429: break;
430: default:
431: MatSetOption_SeqAIJ(A, op, flg);
432: break;
433: }
434: return 0;
435: }
437: /* Depending on reuse, either build a new mat, or use the existing mat */
438: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
439: {
440: Mat_SeqAIJ *aseq;
442: PetscKokkosInitializeCheck();
443: if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
444: MatDuplicate(A, MAT_COPY_VALUES, newmat); /* the returned newmat is a SeqAIJKokkos */
445: } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
446: MatCopy(A, *newmat, SAME_NONZERO_PATTERN); /* newmat is already a SeqAIJKokkos */
447: } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
449: PetscFree(A->defaultvectype);
450: PetscStrallocpy(VECKOKKOS, &A->defaultvectype); /* Allocate and copy the string */
451: PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS);
452: MatSetOps_SeqAIJKokkos(A);
453: aseq = static_cast<Mat_SeqAIJ *>(A->data);
454: if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
456: A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
457: }
458: }
459: return 0;
460: }
462: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
463: an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
464: */
465: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
466: {
467: Mat_SeqAIJ *bseq;
468: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
469: Mat mat;
471: /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
472: MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B);
473: mat = *B;
474: if (A->assembled) {
475: bseq = static_cast<Mat_SeqAIJ *>(mat->data);
476: bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
477: bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
478: /* Now copy values to B if needed */
479: if (dupOption == MAT_COPY_VALUES) {
480: if (akok->a_dual.need_sync_device()) {
481: Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
482: bkok->a_dual.modify_host();
483: } else { /* If device has the latest data, we only copy data on device */
484: Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
485: bkok->a_dual.modify_device();
486: }
487: } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
488: /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
489: bkok->a_dual.modify_host();
490: }
491: mat->spptr = bkok;
492: }
494: PetscFree(mat->defaultvectype);
495: PetscStrallocpy(VECKOKKOS, &mat->defaultvectype); /* Allocate and copy the string */
496: PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS);
497: MatSetOps_SeqAIJKokkos(mat);
498: return 0;
499: }
501: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
502: {
503: Mat At;
504: KokkosCsrMatrix *internT;
505: Mat_SeqAIJKokkos *atkok, *bkok;
507: if (reuse == MAT_REUSE_MATRIX) MatTransposeCheckNonzeroState_Private(A, *B);
508: MatSeqAIJKokkosGenerateTranspose_Private(A, &internT); /* Generate a transpose internally */
509: if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
510: /* Deep copy internT, as we want to isolate the internal transpose */
511: atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT));
512: MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At);
513: if (reuse == MAT_INITIAL_MATRIX) *B = At;
514: else MatHeaderReplace(A, &At); /* Replace A with At inplace */
515: } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
516: if ((*B)->assembled) {
517: bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
518: Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values);
519: MatSeqAIJKokkosModifyDevice(*B);
520: } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
521: Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
522: MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */
523: MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz());
524: Kokkos::deep_copy(a_h, internT->values);
525: Kokkos::deep_copy(j_h, internT->graph.entries);
526: } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
527: }
528: return 0;
529: }
531: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
532: {
533: Mat_SeqAIJKokkos *aijkok;
535: if (A->factortype == MAT_FACTOR_NONE) {
536: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
537: delete aijkok;
538: } else {
539: delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
540: }
541: A->spptr = NULL;
542: PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL);
543: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
544: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
545: MatDestroy_SeqAIJ(A);
546: return 0;
547: }
549: /*MC
550: MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
552: A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
554: Options Database Keys:
555: . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
557: Level: beginner
559: .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
560: M*/
561: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
562: {
563: PetscKokkosInitializeCheck();
564: MatCreate_SeqAIJ(A);
565: MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A);
566: return 0;
567: }
569: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
570: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
571: {
572: Mat_SeqAIJ *a, *b;
573: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
574: MatScalarKokkosView aa, ba, ca;
575: MatRowMapKokkosView ai, bi, ci;
576: MatColIdxKokkosView aj, bj, cj;
577: PetscInt m, n, nnz, aN;
587: MatSeqAIJKokkosSyncDevice(A);
588: MatSeqAIJKokkosSyncDevice(B);
589: a = static_cast<Mat_SeqAIJ *>(A->data);
590: b = static_cast<Mat_SeqAIJ *>(B->data);
591: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
592: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
593: aa = akok->a_dual.view_device();
594: ai = akok->i_dual.view_device();
595: ba = bkok->a_dual.view_device();
596: bi = bkok->i_dual.view_device();
597: m = A->rmap->n; /* M, N and nnz of C */
598: n = A->cmap->n + B->cmap->n;
599: nnz = a->nz + b->nz;
600: aN = A->cmap->n; /* N of A */
601: if (reuse == MAT_INITIAL_MATRIX) {
602: aj = akok->j_dual.view_device();
603: bj = bkok->j_dual.view_device();
604: auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
605: auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
606: auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
607: ca = ca_dual.view_device();
608: ci = ci_dual.view_device();
609: cj = cj_dual.view_device();
611: /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
612: Kokkos::parallel_for(
613: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
614: PetscInt i = t.league_rank(); /* row i */
615: PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
617: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
618: ci(i) = coffset;
619: if (i == m - 1) ci(m) = ai(m) + bi(m);
620: });
622: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
623: if (k < alen) {
624: ca(coffset + k) = aa(ai(i) + k);
625: cj(coffset + k) = aj(ai(i) + k);
626: } else {
627: ca(coffset + k) = ba(bi(i) + k - alen);
628: cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
629: }
630: });
631: });
632: ca_dual.modify_device();
633: ci_dual.modify_device();
634: cj_dual.modify_device();
635: ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual);
636: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C);
637: } else if (reuse == MAT_REUSE_MATRIX) {
640: ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
641: ca = ckok->a_dual.view_device();
642: ci = ckok->i_dual.view_device();
644: Kokkos::parallel_for(
645: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
646: PetscInt i = t.league_rank(); /* row i */
647: PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
648: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
649: if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
650: else ca(ci(i) + k) = ba(bi(i) + k - alen);
651: });
652: });
653: MatSeqAIJKokkosModifyDevice(*C);
654: }
655: return 0;
656: }
658: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
659: {
660: delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
661: return 0;
662: }
664: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
665: {
666: Mat_Product *product = C->product;
667: Mat A, B;
668: bool transA, transB; /* use bool, since KK needs this type */
669: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
670: Mat_SeqAIJ *c;
671: MatProductData_SeqAIJKokkos *pdata;
672: KokkosCsrMatrix *csrmatA, *csrmatB;
674: MatCheckProduct(C, 1);
676: pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
678: if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
679: pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */
680: return 0;
681: }
683: switch (product->type) {
684: case MATPRODUCT_AB:
685: transA = false;
686: transB = false;
687: break;
688: case MATPRODUCT_AtB:
689: transA = true;
690: transB = false;
691: break;
692: case MATPRODUCT_ABt:
693: transA = false;
694: transB = true;
695: break;
696: default:
697: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
698: }
700: A = product->A;
701: B = product->B;
702: MatSeqAIJKokkosSyncDevice(A);
703: MatSeqAIJKokkosSyncDevice(B);
704: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
705: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
706: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
710: csrmatA = &akok->csrmat;
711: csrmatB = &bkok->csrmat;
713: /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
714: if (transA) {
715: MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA);
716: transA = false;
717: }
719: if (transB) {
720: MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB);
721: transB = false;
722: }
723: PetscLogGpuTimeBegin();
724: KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat);
725: auto spgemmHandle = pdata->kh.get_spgemm_handle();
726: if (spgemmHandle->get_sort_option() != 1) sort_crs_matrix(ckok->csrmat); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
728: PetscLogGpuTimeEnd();
729: MatSeqAIJKokkosModifyDevice(C);
730: /* shorter version of MatAssemblyEnd_SeqAIJ */
731: c = (Mat_SeqAIJ *)C->data;
732: PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz);
733: PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n");
734: PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax);
735: c->reallocs = 0;
736: C->info.mallocs = 0;
737: C->info.nz_unneeded = 0;
738: C->assembled = C->was_assembled = PETSC_TRUE;
739: C->num_ass++;
740: return 0;
741: }
743: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
744: {
745: Mat_Product *product = C->product;
746: MatProductType ptype;
747: Mat A, B;
748: bool transA, transB;
749: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
750: MatProductData_SeqAIJKokkos *pdata;
751: MPI_Comm comm;
752: KokkosCsrMatrix *csrmatA, *csrmatB, csrmatC;
754: MatCheckProduct(C, 1);
755: PetscObjectGetComm((PetscObject)C, &comm);
757: A = product->A;
758: B = product->B;
759: MatSeqAIJKokkosSyncDevice(A);
760: MatSeqAIJKokkosSyncDevice(B);
761: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
762: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
763: csrmatA = &akok->csrmat;
764: csrmatB = &bkok->csrmat;
766: ptype = product->type;
767: switch (ptype) {
768: case MATPRODUCT_AB:
769: transA = false;
770: transB = false;
771: break;
772: case MATPRODUCT_AtB:
773: transA = true;
774: transB = false;
775: break;
776: case MATPRODUCT_ABt:
777: transA = false;
778: transB = true;
779: break;
780: default:
781: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
782: }
784: product->data = pdata = new MatProductData_SeqAIJKokkos();
785: pdata->kh.set_team_work_size(16);
786: pdata->kh.set_dynamic_scheduling(true);
787: pdata->reusesym = product->api_user;
789: /* TODO: add command line options to select spgemm algorithms */
790: auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
792: /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
793: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
794: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
795: spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
796: #endif
797: #endif
799: pdata->kh.create_spgemm_handle(spgemm_alg);
801: PetscLogGpuTimeBegin();
802: /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
803: if (transA) {
804: MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA);
805: transA = false;
806: }
808: if (transB) {
809: MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB);
810: transB = false;
811: }
813: KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC);
815: /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
816: So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
817: calling new Mat_SeqAIJKokkos().
818: TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
819: */
820: KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC);
821: /* Query if KK outputs a sorted matrix. If not, we need to sort it */
822: auto spgemmHandle = pdata->kh.get_spgemm_handle();
823: if (spgemmHandle->get_sort_option() != 1) sort_crs_matrix(csrmatC); /* sort_option defaults to -1 in KK!*/
824: PetscLogGpuTimeEnd();
826: ckok = new Mat_SeqAIJKokkos(csrmatC);
827: MatSetSeqAIJKokkosWithCSRMatrix(C, ckok);
828: C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
829: return 0;
830: }
832: /* handles sparse matrix matrix ops */
833: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
834: {
835: Mat_Product *product = mat->product;
836: PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
838: MatCheckProduct(mat, 1);
839: PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok);
840: if (product->type == MATPRODUCT_ABC) PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok);
841: if (Biskok && Ciskok) {
842: switch (product->type) {
843: case MATPRODUCT_AB:
844: case MATPRODUCT_AtB:
845: case MATPRODUCT_ABt:
846: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
847: break;
848: case MATPRODUCT_PtAP:
849: case MATPRODUCT_RARt:
850: case MATPRODUCT_ABC:
851: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
852: break;
853: default:
854: break;
855: }
856: } else { /* fallback for AIJ */
857: MatProductSetFromOptions_SeqAIJ(mat);
858: }
859: return 0;
860: }
862: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
863: {
864: Mat_SeqAIJKokkos *aijkok;
866: PetscLogGpuTimeBegin();
867: MatSeqAIJKokkosSyncDevice(A);
868: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
869: KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
870: MatSeqAIJKokkosModifyDevice(A);
871: PetscLogGpuFlops(aijkok->a_dual.extent(0));
872: PetscLogGpuTimeEnd();
873: return 0;
874: }
876: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
877: {
878: Mat_SeqAIJKokkos *aijkok;
880: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
881: if (aijkok) { /* Only zero the device if data is already there */
882: KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
883: MatSeqAIJKokkosModifyDevice(A);
884: } else { /* Might be preallocated but not assembled */
885: MatZeroEntries_SeqAIJ(A);
886: }
887: return 0;
888: }
890: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
891: {
892: Mat_SeqAIJ *aijseq;
893: Mat_SeqAIJKokkos *aijkok;
894: PetscInt n;
895: PetscScalarKokkosView xv;
897: VecGetLocalSize(x, &n);
901: MatSeqAIJKokkosSyncDevice(A);
902: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
904: if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
905: MatMarkDiagonal_SeqAIJ(A);
906: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
907: aijkok->SetDiagonal(aijseq->diag);
908: }
910: const auto &Aa = aijkok->a_dual.view_device();
911: const auto &Ai = aijkok->i_dual.view_device();
912: const auto &Adiag = aijkok->diag_dual.view_device();
914: VecGetKokkosViewWrite(x, &xv);
915: Kokkos::parallel_for(
916: n, KOKKOS_LAMBDA(const PetscInt i) {
917: if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
918: else xv(i) = 0;
919: });
920: VecRestoreKokkosViewWrite(x, &xv);
921: return 0;
922: }
924: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
925: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
926: {
927: Mat_SeqAIJKokkos *aijkok;
932: MatSeqAIJKokkosSyncDevice(A);
933: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
934: *kv = aijkok->a_dual.view_device();
935: return 0;
936: }
938: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
939: {
943: return 0;
944: }
946: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
947: {
948: Mat_SeqAIJKokkos *aijkok;
953: MatSeqAIJKokkosSyncDevice(A);
954: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
955: *kv = aijkok->a_dual.view_device();
956: return 0;
957: }
959: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
960: {
964: MatSeqAIJKokkosModifyDevice(A);
965: return 0;
966: }
968: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
969: {
970: Mat_SeqAIJKokkos *aijkok;
975: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
976: *kv = aijkok->a_dual.view_device();
977: return 0;
978: }
980: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
981: {
985: MatSeqAIJKokkosModifyDevice(A);
986: return 0;
987: }
989: /* Computes Y += alpha X */
990: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
991: {
992: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
993: Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
994: ConstMatScalarKokkosView Xa;
995: MatScalarKokkosView Ya;
999: MatSeqAIJKokkosSyncDevice(Y);
1000: MatSeqAIJKokkosSyncDevice(X);
1001: PetscLogGpuTimeBegin();
1003: if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1004: /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1005: PetscBool e;
1006: PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e);
1007: if (e) {
1008: PetscArraycmp(x->j, y->j, y->nz, &e);
1009: if (e) pattern = SAME_NONZERO_PATTERN;
1010: }
1011: }
1013: /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1014: cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1015: If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1016: KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1017: */
1018: ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1019: xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1020: Xa = xkok->a_dual.view_device();
1021: Ya = ykok->a_dual.view_device();
1023: if (pattern == SAME_NONZERO_PATTERN) {
1024: KokkosBlas::axpy(alpha, Xa, Ya);
1025: MatSeqAIJKokkosModifyDevice(Y);
1026: } else if (pattern == SUBSET_NONZERO_PATTERN) {
1027: MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1028: MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1030: Kokkos::parallel_for(
1031: Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1032: PetscInt i = t.league_rank(); /* row i */
1033: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
1034: PetscInt p, q = Yi(i);
1035: for (p = Xi(i); p < Xi(i + 1); p++) { /* For each nonzero on row i of X */
1036: while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */
1037: if (Xj(p) == Yj(q)) { /* Found it */
1038: Ya(q) += alpha * Xa(p);
1039: q++;
1040: } else {
1041: /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1042: Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1043: */
1044: if (Yi(i) != Yi(i + 1))
1045: Ya(Yi(i)) =
1046: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99)
1047: Kokkos::nan("1"); /* auto promote the double NaN if needed */
1048: #else
1049: Kokkos::Experimental::nan("1");
1050: #endif
1051: }
1052: }
1053: });
1054: });
1055: MatSeqAIJKokkosModifyDevice(Y);
1056: } else { /* different nonzero patterns */
1057: Mat Z;
1058: KokkosCsrMatrix zcsr;
1059: KernelHandle kh;
1060: kh.create_spadd_handle(false);
1061: KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1062: KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1063: zkok = new Mat_SeqAIJKokkos(zcsr);
1064: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z);
1065: MatHeaderReplace(Y, &Z);
1066: kh.destroy_spadd_handle();
1067: }
1068: PetscLogGpuTimeEnd();
1069: PetscLogGpuFlops(xkok->a_dual.extent(0) * 2); /* Because we scaled X and then added it to Y */
1070: return 0;
1071: }
1073: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1074: {
1075: Mat_SeqAIJKokkos *akok;
1076: Mat_SeqAIJ *aseq;
1078: MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j);
1079: aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1080: akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1081: delete akok;
1082: mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1083: MatZeroEntries_SeqAIJKokkos(mat);
1084: akok->SetUpCOO(aseq);
1085: return 0;
1086: }
1088: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1089: {
1090: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1091: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1092: PetscCount Annz = aseq->nz;
1093: const PetscCountKokkosView &jmap = akok->jmap_d;
1094: const PetscCountKokkosView &perm = akok->perm_d;
1095: MatScalarKokkosView Aa;
1096: ConstMatScalarKokkosView kv;
1097: PetscMemType memtype;
1099: PetscGetMemType(v, &memtype);
1100: if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1101: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1102: } else {
1103: kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1104: }
1106: if (imode == INSERT_VALUES) MatSeqAIJGetKokkosViewWrite(A, &Aa); /* write matrix values */
1107: else MatSeqAIJGetKokkosView(A, &Aa); /* read & write matrix values */
1109: Kokkos::parallel_for(
1110: Annz, KOKKOS_LAMBDA(const PetscCount i) {
1111: PetscScalar sum = 0.0;
1112: for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1113: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1114: });
1116: if (imode == INSERT_VALUES) MatSeqAIJRestoreKokkosViewWrite(A, &Aa);
1117: else MatSeqAIJRestoreKokkosView(A, &Aa);
1118: return 0;
1119: }
1121: PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1122: {
1123: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1124: MatScalarKokkosView Aa;
1125: const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1126: PetscInt m = A->rmap->n;
1127: ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */
1129: MatSeqAIJGetKokkosViewWrite(A, &Aa);
1130: Kokkos::parallel_for(
1131: m, KOKKOS_LAMBDA(const PetscInt i) {
1132: PetscScalar tmp;
1133: if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1134: tmp = Aa(Ai(i));
1135: Aa(Ai(i)) = Aa(Adiag(i));
1136: Aa(Adiag(i)) = tmp;
1137: }
1138: });
1139: MatSeqAIJRestoreKokkosViewWrite(A, &Aa);
1140: return 0;
1141: }
1143: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1144: {
1145: MatSeqAIJKokkosSyncHost(A);
1146: MatLUFactorNumeric_SeqAIJ(B, A, info);
1147: B->offloadmask = PETSC_OFFLOAD_CPU;
1148: return 0;
1149: }
1151: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1152: {
1153: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1155: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1156: A->boundtocpu = PETSC_FALSE;
1158: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1159: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1160: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1161: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1162: A->ops->scale = MatScale_SeqAIJKokkos;
1163: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1164: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1165: A->ops->mult = MatMult_SeqAIJKokkos;
1166: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1167: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1168: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1169: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1170: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1171: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1172: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1173: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1174: A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
1175: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1176: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1177: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1178: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1179: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1180: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1181: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1183: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos);
1184: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);
1185: return 0;
1186: }
1188: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1189: {
1190: Mat_SeqAIJ *aseq;
1191: PetscInt i, m, n;
1195: m = akok->nrows();
1196: n = akok->ncols();
1197: MatSetSizes(A, m, n, m, n);
1198: MatSetType(A, MATSEQAIJKOKKOS);
1200: /* Set up data structures of A as a MATSEQAIJ */
1201: MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL);
1202: aseq = (Mat_SeqAIJ *)(A)->data;
1204: akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1205: akok->j_dual.sync_host();
1207: aseq->i = akok->i_host_data();
1208: aseq->j = akok->j_host_data();
1209: aseq->a = akok->a_host_data();
1210: aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1211: aseq->singlemalloc = PETSC_FALSE;
1212: aseq->free_a = PETSC_FALSE;
1213: aseq->free_ij = PETSC_FALSE;
1214: aseq->nz = akok->nnz();
1215: aseq->maxnz = aseq->nz;
1217: PetscMalloc1(m, &aseq->imax);
1218: PetscMalloc1(m, &aseq->ilen);
1219: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1221: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1222: akok->nonzerostate = A->nonzerostate;
1223: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1224: MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
1225: MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
1226: return 0;
1227: }
1229: /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1231: Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1232: */
1233: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1234: {
1235: MatCreate(comm, A);
1236: MatSetSeqAIJKokkosWithCSRMatrix(*A, akok);
1237: return 0;
1238: }
1240: /* --------------------------------------------------------------------------------*/
1241: /*@C
1242: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1243: (the default parallel PETSc format). This matrix will ultimately be handled by
1244: Kokkos for calculations. For good matrix
1245: assembly performance the user should preallocate the matrix storage by setting
1246: the parameter nz (or the array nnz). By setting these parameters accurately,
1247: performance during matrix assembly can be increased by more than a factor of 50.
1249: Collective
1251: Input Parameters:
1252: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1253: . m - number of rows
1254: . n - number of columns
1255: . nz - number of nonzeros per row (same for all rows)
1256: - nnz - array containing the number of nonzeros in the various rows
1257: (possibly different for each row) or NULL
1259: Output Parameter:
1260: . A - the matrix
1262: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1263: MatXXXXSetPreallocation() paradgm instead of this routine directly.
1264: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1266: Notes:
1267: If nnz is given then nz is ignored
1269: The AIJ format, also called
1270: compressed row storage, is fully compatible with standard Fortran 77
1271: storage. That is, the stored row and column indices can begin at
1272: either one (as in Fortran) or zero. See the users' manual for details.
1274: Specify the preallocated storage with either nz or nnz (not both).
1275: Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
1276: allocation. For large problems you MUST preallocate memory or you
1277: will get TERRIBLE performance, see the users' manual chapter on matrices.
1279: By default, this format uses inodes (identical nodes) when possible, to
1280: improve numerical efficiency of matrix-vector products and solves. We
1281: search for consecutive rows with the same nonzero structure, thereby
1282: reusing matrix information to achieve increased efficiency.
1284: Level: intermediate
1286: .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1287: @*/
1288: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1289: {
1290: PetscKokkosInitializeCheck();
1291: MatCreate(comm, A);
1292: MatSetSizes(*A, m, n, m, n);
1293: MatSetType(*A, MATSEQAIJKOKKOS);
1294: MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz);
1295: return 0;
1296: }
1298: typedef Kokkos::TeamPolicy<>::member_type team_member;
1299: //
1300: // This factorization exploits block diagonal matrices with "Nf" (not used).
1301: // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1302: //
1303: static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1304: {
1305: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1306: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1307: IS isrow = b->row, isicol = b->icol;
1308: const PetscInt *r_h, *ic_h;
1309: const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1310: const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1311: PetscScalar *ba_d = baijkok->a_dual.view_device().data();
1312: PetscBool row_identity, col_identity;
1313: PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used
1316: MatIsStructurallySymmetric(A, &row_identity);
1318: ISGetIndices(isrow, &r_h);
1319: ISGetIndices(isicol, &ic_h);
1320: ISGetSize(isicol, &nc);
1321: PetscLogGpuTimeBegin();
1322: MatSeqAIJKokkosSyncDevice(A);
1323: {
1324: #define KOKKOS_SHARED_LEVEL 1
1325: using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1326: using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1327: using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1328: const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1329: Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n);
1330: const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1331: Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc);
1332: size_t flops_h = 0.0;
1333: Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h);
1334: Kokkos::View<size_t> d_flops_k("flops");
1335: const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1336: const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1;
1337: Kokkos::deep_copy(d_flops_k, h_flops_k);
1338: Kokkos::deep_copy(d_r_k, h_r_k);
1339: Kokkos::deep_copy(d_ic_k, h_ic_k);
1340: // Fill A --> fact
1341: Kokkos::parallel_for(
1342: Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1343: const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1344: const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1345: const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1346: // zero rows of B
1347: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1348: PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1349: PetscScalar *baL = ba_d + bi_d[rowb];
1350: PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1351: /* zero (unfactored row) */
1352: for (int j = 0; j < nzbL; j++) baL[j] = 0;
1353: for (int j = 0; j < nzbU; j++) baU[j] = 0;
1354: });
1355: // copy A into B
1356: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1357: PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1358: const PetscScalar *av = aa_d + ai_d[rowa];
1359: const PetscInt *ajtmp = aj_d + ai_d[rowa];
1360: /* load in initial (unfactored row) */
1361: for (int j = 0; j < nza; j++) {
1362: PetscInt colb = ic[ajtmp[j]];
1363: PetscScalar vala = av[j];
1364: if (colb == rowb) {
1365: *(ba_d + bdiag_d[rowb]) = vala;
1366: } else {
1367: const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1368: PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1369: PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1370: for (int j = 0; j < nz; j++) {
1371: if (pbj[j] == colb) {
1372: pba[j] = vala;
1373: set++;
1374: break;
1375: }
1376: }
1377: #if !defined(PETSC_HAVE_SYCL)
1378: if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1379: #endif
1380: }
1381: }
1382: });
1383: });
1384: Kokkos::fence();
1386: Kokkos::parallel_for(
1387: Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1388: sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1389: scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1390: sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1391: const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1392: const PetscInt start = field * nloc, end = start + nloc;
1393: Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1394: // A22 panel update for each row A(1,:) and col A(:,1)
1395: for (int ii = start; ii < end - 1; ii++) {
1396: const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1397: const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end)
1398: const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni);
1399: const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1400: Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1401: PetscInt kIdx = j * Ni + field_block_idx;
1402: if (kIdx >= nzUi) /* void */
1403: ;
1404: else {
1405: const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1406: const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1407: const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1408: size_t st_idx;
1409: // find and do L(k,i) = A(:k,i) / A(i,i)
1410: Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1411: // get column, there has got to be a better way
1412: Kokkos::parallel_reduce(
1413: Kokkos::ThreadVectorRange(team, nzL),
1414: [&](const int &j, size_t &idx) {
1415: if (pjL[j] == ii) {
1416: PetscScalar *pLki = ba_d + bi_d[myk] + j;
1417: idx = j; // output
1418: *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i)
1419: }
1420: },
1421: st_idx);
1422: Kokkos::single(Kokkos::PerThread(team), [=]() {
1423: colkIdx() = st_idx;
1424: L_ki() = *(ba_d + bi_d[myk] + st_idx);
1425: });
1426: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1427: if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1428: #endif
1429: // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i
1430: // U(i+1,:end)
1431: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1432: PetscScalar Uij = baUi[uiIdx];
1433: PetscInt col = bjUi[uiIdx];
1434: if (col == myk) {
1435: // A_kk = A_kk - L_ki * U_ij(k)
1436: PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1437: *Akkv = *Akkv - L_ki() * Uij; // UiK
1438: } else {
1439: PetscScalar *start, *end, *pAkjv = NULL;
1440: PetscInt high, low;
1441: const PetscInt *startj;
1442: if (col < myk) { // L
1443: PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1444: PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1445: start = pLki + 1; // start at pLki+1, A22(myk,1)
1446: startj = bj_d + bi_d[myk] + idx;
1447: end = ba_d + bi_d[myk + 1];
1448: } else {
1449: PetscInt idx = bdiag_d[myk + 1] + 1;
1450: start = ba_d + idx;
1451: startj = bj_d + idx;
1452: end = ba_d + bdiag_d[myk];
1453: }
1454: // search for 'col', use bisection search - TODO
1455: low = 0;
1456: high = (PetscInt)(end - start);
1457: while (high - low > 5) {
1458: int t = (low + high) / 2;
1459: if (startj[t] > col) high = t;
1460: else low = t;
1461: }
1462: for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1463: if (startj[pAkjv - start] == col) break;
1464: }
1465: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1466: if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1467: #endif
1468: *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1469: }
1470: });
1471: }
1472: });
1473: team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1474: if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1475: } /* endof for (i=0; i<n; i++) { */
1476: Kokkos::single(Kokkos::PerTeam(team), [=]() {
1477: Kokkos::atomic_add(&d_flops_k(), flops());
1478: flops() = 0;
1479: });
1480: });
1481: Kokkos::fence();
1482: Kokkos::deep_copy(h_flops_k, d_flops_k);
1483: PetscLogGpuFlops((PetscLogDouble)h_flops_k());
1484: Kokkos::parallel_for(
1485: Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1486: const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni;
1487: const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1488: /* Invert diagonal for simpler triangular solves */
1489: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1490: int i = start + outer_index * Ni + lg_rank % Ni;
1491: if (i < end) {
1492: PetscScalar *pv = ba_d + bdiag_d[i];
1493: *pv = 1.0 / (*pv);
1494: }
1495: });
1496: });
1497: }
1498: PetscLogGpuTimeEnd();
1499: ISRestoreIndices(isicol, &ic_h);
1500: ISRestoreIndices(isrow, &r_h);
1502: ISIdentity(isrow, &row_identity);
1503: ISIdentity(isicol, &col_identity);
1504: if (b->inode.size) {
1505: B->ops->solve = MatSolve_SeqAIJ_Inode;
1506: } else if (row_identity && col_identity) {
1507: B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1508: } else {
1509: B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1510: }
1511: B->offloadmask = PETSC_OFFLOAD_GPU;
1512: MatSeqAIJKokkosSyncHost(B); // solve on CPU
1513: B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this
1514: B->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1515: B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1516: B->ops->matsolve = MatMatSolve_SeqAIJ;
1517: B->assembled = PETSC_TRUE;
1518: B->preallocated = PETSC_TRUE;
1520: return 0;
1521: }
1523: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1524: {
1525: MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1526: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1527: return 0;
1528: }
1530: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1531: {
1532: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1534: if (!factors->sptrsv_symbolic_completed) {
1535: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1536: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1537: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1538: }
1539: return 0;
1540: }
1542: /* Check if we need to update factors etc for transpose solve */
1543: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1544: {
1545: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1546: MatColIdxType n = A->rmap->n;
1548: if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1549: /* Update L^T and do sptrsv symbolic */
1550: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1551: Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1552: factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1553: factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1555: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1556: factors->iLt_d, factors->jLt_d, factors->aLt_d);
1558: /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1559: We have to sort the indices, until KK provides finer control options.
1560: */
1561: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1563: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1565: /* Update U^T and do sptrsv symbolic */
1566: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1567: Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1568: factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1569: factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1571: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1572: factors->iUt_d, factors->jUt_d, factors->aUt_d);
1574: /* Sort indices. See comments above */
1575: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1577: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1578: factors->transpose_updated = PETSC_TRUE;
1579: }
1580: return 0;
1581: }
1583: /* Solve Ax = b, with A = LU */
1584: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1585: {
1586: ConstPetscScalarKokkosView bv;
1587: PetscScalarKokkosView xv;
1588: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1590: PetscLogGpuTimeBegin();
1591: MatSeqAIJKokkosSymbolicSolveCheck(A);
1592: VecGetKokkosView(b, &bv);
1593: VecGetKokkosViewWrite(x, &xv);
1594: /* Solve L tmpv = b */
1595: KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector);
1596: /* Solve Ux = tmpv */
1597: KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv);
1598: VecRestoreKokkosView(b, &bv);
1599: VecRestoreKokkosViewWrite(x, &xv);
1600: PetscLogGpuTimeEnd();
1601: return 0;
1602: }
1604: /* Solve A^T x = b, where A^T = U^T L^T */
1605: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1606: {
1607: ConstPetscScalarKokkosView bv;
1608: PetscScalarKokkosView xv;
1609: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1611: PetscLogGpuTimeBegin();
1612: MatSeqAIJKokkosTransposeSolveCheck(A);
1613: VecGetKokkosView(b, &bv);
1614: VecGetKokkosViewWrite(x, &xv);
1615: /* Solve U^T tmpv = b */
1616: KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1618: /* Solve L^T x = tmpv */
1619: KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1620: VecRestoreKokkosView(b, &bv);
1621: VecRestoreKokkosViewWrite(x, &xv);
1622: PetscLogGpuTimeEnd();
1623: return 0;
1624: }
1626: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1627: {
1628: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1629: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1630: PetscInt fill_lev = info->levels;
1632: PetscLogGpuTimeBegin();
1633: MatSeqAIJKokkosSyncDevice(A);
1635: auto a_d = aijkok->a_dual.view_device();
1636: auto i_d = aijkok->i_dual.view_device();
1637: auto j_d = aijkok->j_dual.view_device();
1639: KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1641: B->assembled = PETSC_TRUE;
1642: B->preallocated = PETSC_TRUE;
1643: B->ops->solve = MatSolve_SeqAIJKokkos;
1644: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos;
1645: B->ops->matsolve = NULL;
1646: B->ops->matsolvetranspose = NULL;
1647: B->offloadmask = PETSC_OFFLOAD_GPU;
1649: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1650: factors->transpose_updated = PETSC_FALSE;
1651: factors->sptrsv_symbolic_completed = PETSC_FALSE;
1652: /* TODO: log flops, but how to know that? */
1653: PetscLogGpuTimeEnd();
1654: return 0;
1655: }
1657: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1658: {
1659: Mat_SeqAIJKokkos *aijkok;
1660: Mat_SeqAIJ *b;
1661: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1662: PetscInt fill_lev = info->levels;
1663: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1664: PetscInt n = A->rmap->n;
1666: MatSeqAIJKokkosSyncDevice(A);
1667: /* Rebuild factors */
1668: if (factors) {
1669: factors->Destroy();
1670: } /* Destroy the old if it exists */
1671: else {
1672: B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1673: }
1675: /* Create a spiluk handle and then do symbolic factorization */
1676: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1677: factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1679: auto spiluk_handle = factors->kh.get_spiluk_handle();
1681: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1682: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1683: Kokkos::realloc(factors->iU_d, n + 1);
1684: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1686: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1687: auto i_d = aijkok->i_dual.view_device();
1688: auto j_d = aijkok->j_dual.view_device();
1689: KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1690: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1692: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1693: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1694: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1695: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1697: /* TODO: add options to select sptrsv algorithms */
1698: /* Create sptrsv handles for L, U and their transpose */
1699: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1700: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1701: #else
1702: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1703: #endif
1705: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1706: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1707: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1708: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1710: /* Fill fields of the factor matrix B */
1711: MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL);
1712: b = (Mat_SeqAIJ *)B->data;
1713: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1714: B->info.fill_ratio_given = info->fill;
1715: B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA);
1717: B->offloadmask = PETSC_OFFLOAD_GPU;
1718: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1719: return 0;
1720: }
1722: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1723: {
1724: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1725: const PetscInt nrows = A->rmap->n;
1727: MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1728: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1729: // move B data into Kokkos
1730: MatSeqAIJKokkosSyncDevice(B); // create aijkok
1731: MatSeqAIJKokkosSyncDevice(A); // create aijkok
1732: {
1733: Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1734: if (!baijkok->diag_d.extent(0)) {
1735: const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1736: baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1737: Kokkos::deep_copy(baijkok->diag_d, h_diag);
1738: }
1739: }
1740: return 0;
1741: }
1743: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1744: {
1745: *type = MATSOLVERKOKKOS;
1746: return 0;
1747: }
1749: static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1750: {
1751: *type = MATSOLVERKOKKOSDEVICE;
1752: return 0;
1753: }
1755: /*MC
1756: MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1757: on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1759: Level: beginner
1761: .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1762: M*/
1763: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1764: {
1765: PetscInt n = A->rmap->n;
1767: MatCreate(PetscObjectComm((PetscObject)A), B);
1768: MatSetSizes(*B, n, n, n, n);
1769: (*B)->factortype = ftype;
1770: PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
1771: MatSetType(*B, MATSEQAIJKOKKOS);
1773: if (ftype == MAT_FACTOR_LU) {
1774: MatSetBlockSizesFromMats(*B, A, A);
1775: (*B)->canuseordering = PETSC_TRUE;
1776: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1777: } else if (ftype == MAT_FACTOR_ILU) {
1778: MatSetBlockSizesFromMats(*B, A, A);
1779: (*B)->canuseordering = PETSC_FALSE;
1780: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1781: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1783: MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL);
1784: PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos);
1785: return 0;
1786: }
1788: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1789: {
1790: PetscInt n = A->rmap->n;
1792: MatCreate(PetscObjectComm((PetscObject)A), B);
1793: MatSetSizes(*B, n, n, n, n);
1794: (*B)->factortype = ftype;
1795: (*B)->canuseordering = PETSC_TRUE;
1796: PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
1797: MatSetType(*B, MATSEQAIJKOKKOS);
1799: if (ftype == MAT_FACTOR_LU) {
1800: MatSetBlockSizesFromMats(*B, A, A);
1801: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1802: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");
1804: MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL);
1805: PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device);
1806: return 0;
1807: }
1809: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1810: {
1811: MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos);
1812: MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos);
1813: MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device);
1814: return 0;
1815: }
1817: /* Utility to print out a KokkosCsrMatrix for debugging */
1818: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1819: {
1820: const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1821: const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1822: const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1823: const PetscInt *i = iv.data();
1824: const PetscInt *j = jv.data();
1825: const PetscScalar *a = av.data();
1826: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1828: PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz);
1829: for (PetscInt k = 0; k < m; k++) {
1830: PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k);
1831: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]));
1832: PetscPrintf(PETSC_COMM_SELF, "\n");
1833: }
1834: return 0;
1835: }