Actual source code: bjkokkos.kokkos.cxx

  1: #include <petsc/private/pcbjkokkosimpl.h>

  3: #include <petsc/private/kspimpl.h>
  4: #include <petscksp.h>
  5: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  6: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
  7: #include <petscsection.h>
  8: #include <petscdmcomposite.h>

 10: #include <../src/mat/impls/aij/seq/aij.h>
 11: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

 13: #include <petscdevice_cupm.h>

 15: static PetscErrorCode PCBJKOKKOSCreateKSP_BJKOKKOS(PC pc)
 16: {
 17:   const char    *prefix;
 18:   PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
 19:   DM             dm;

 21:   PetscFunctionBegin;
 22:   PetscCall(KSPCreate(PetscObjectComm((PetscObject)pc), &jac->ksp));
 23:   PetscCall(KSPSetNestLevel(jac->ksp, pc->kspnestlevel));
 24:   PetscCall(KSPSetErrorIfNotConverged(jac->ksp, pc->erroriffailure));
 25:   PetscCall(PetscObjectIncrementTabLevel((PetscObject)jac->ksp, (PetscObject)pc, 1));
 26:   PetscCall(PCGetOptionsPrefix(pc, &prefix));
 27:   PetscCall(KSPSetOptionsPrefix(jac->ksp, prefix));
 28:   PetscCall(KSPAppendOptionsPrefix(jac->ksp, "pc_bjkokkos_"));
 29:   PetscCall(PCGetDM(pc, &dm));
 30:   if (dm) {
 31:     PetscCall(KSPSetDM(jac->ksp, dm));
 32:     PetscCall(KSPSetDMActive(jac->ksp, PETSC_FALSE));
 33:   }
 34:   jac->reason       = PETSC_FALSE;
 35:   jac->monitor      = PETSC_FALSE;
 36:   jac->batch_target = 0;
 37:   jac->rank_target  = 0;
 38:   jac->nsolves_team = 1;
 39:   jac->ksp->max_it  = 50; // this is really for GMRES w/o restarts
 40:   PetscFunctionReturn(PETSC_SUCCESS);
 41: }
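// Note: the inner KSP inherits the PC's options prefix with "pc_bjkokkos_" appended, so it is
// configured with options such as -pc_bjkokkos_ksp_type and -pc_bjkokkos_ksp_rtol (an
// illustrative example is given near the end of this file).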

 43: // y <-- Ax
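// One team handles one block of rows [start, end): rows are spread over the team's threads and
// each row's nonzeros are summed with a vector-lane reduction. ic[] maps a block-diagonal row
// index to the original (unpermuted) row; r[] maps an original column index back to
// block-diagonal order, so x_loc and y_loc are indexed in block-local coordinates.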
 44: KOKKOS_INLINE_FUNCTION PetscErrorCode MatMult(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, const PetscInt start, const PetscInt end, const PetscScalar *x_loc, PetscScalar *y_loc)
 45: {
 46:   Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) {
 47:     int                rowa = ic[rowb];
 48:     int                n    = glb_Aai[rowa + 1] - glb_Aai[rowa];
 49:     const PetscInt    *aj   = glb_Aaj + glb_Aai[rowa]; // global
 50:     const PetscScalar *aa   = glb_Aaa + glb_Aai[rowa];
 51:     PetscScalar        sum;
 52:     Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, n), [=](const int i, PetscScalar &lsum) { lsum += aa[i] * x_loc[r[aj[i]] - start]; }, sum);
 53:     Kokkos::single(Kokkos::PerThread(team), [=]() { y_loc[rowb - start] = sum; });
 54:   });
 55:   team.team_barrier();
 56:   return PETSC_SUCCESS;
 57: }

 59: // temp buffer per thread with reduction at end?
 60: KOKKOS_INLINE_FUNCTION PetscErrorCode MatMultTranspose(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, const PetscInt start, const PetscInt end, const PetscScalar *x_loc, PetscScalar *y_loc)
 61: {
 62:   Kokkos::parallel_for(Kokkos::TeamVectorRange(team, end - start), [=](int i) { y_loc[i] = 0; });
 63:   team.team_barrier();
 64:   Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) {
 65:     int                rowa = ic[rowb];
 66:     int                n    = glb_Aai[rowa + 1] - glb_Aai[rowa];
 67:     const PetscInt    *aj   = glb_Aaj + glb_Aai[rowa]; // global
 68:     const PetscScalar *aa   = glb_Aaa + glb_Aai[rowa];
 69:     const PetscScalar  xx   = x_loc[rowb - start]; // rowb = ic[rowa] = ic[r[rowb]]
 70:     Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, n), [=](const int &i) {
 71:       PetscScalar val = aa[i] * xx;
 72:       Kokkos::atomic_fetch_add(&y_loc[r[aj[i]] - start], val);
 73:     });
 74:   });
 75:   team.team_barrier();
 76:   return PETSC_SUCCESS;
 77: }
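// The transpose product above is formed by scattering: each row's entries, scaled by x_loc[row],
// are accumulated into y_loc at their (permuted) column positions with atomic adds, which is why
// y_loc is zeroed first.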

 79: typedef struct Batch_MetaData_TAG {
 80:   PetscInt           flops;
 81:   PetscInt           its;
 82:   KSPConvergedReason reason;
 83: } Batch_MetaData;
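// Per-block solver results (iteration count, flop estimate, convergence reason), filled on the
// device by the block solvers and copied back to the host for logging and reporting.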

 85: // Solve Ax = b with TFQMR, right preconditioned (A(B^-1(Bx)) = b) so the monitored residual is the un-preconditioned residual
 86: static KOKKOS_INLINE_FUNCTION PetscErrorCode BJSolve_TFQMR(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, PetscScalar *work_space_global, const int stride_global, const int nShareVec, PetscScalar *work_space_shared, const int stride_shared, PetscReal rtol, PetscReal atol, PetscReal dtol, PetscInt maxit, Batch_MetaData *metad, const PetscInt start, const PetscInt end, const PetscScalar glb_idiag[], const PetscScalar *glb_b, PetscScalar *glb_x, bool monitor)
 87: {
 88:   using Kokkos::parallel_for;
 89:   using Kokkos::parallel_reduce;
 90:   int                Nblk = end - start, it, m, stride = stride_shared, idx = 0;
 91:   PetscReal          dp, dpold, w, dpest, tau, psi, cm, r0;
 92:   const PetscScalar *Diag = &glb_idiag[start];
 93:   PetscScalar       *ptr  = work_space_shared, rho, rhoold, a, s, b, eta, etaold, psiold, cf, dpi;

 95:   if (idx++ == nShareVec) {
 96:     ptr    = work_space_global;
 97:     stride = stride_global;
 98:   }
 99:   PetscScalar *XX = ptr;
100:   ptr += stride;
101:   if (idx++ == nShareVec) {
102:     ptr    = work_space_global;
103:     stride = stride_global;
104:   }
105:   PetscScalar *R = ptr;
106:   ptr += stride;
107:   if (idx++ == nShareVec) {
108:     ptr    = work_space_global;
109:     stride = stride_global;
110:   }
111:   PetscScalar *RP = ptr;
112:   ptr += stride;
113:   if (idx++ == nShareVec) {
114:     ptr    = work_space_global;
115:     stride = stride_global;
116:   }
117:   PetscScalar *V = ptr;
118:   ptr += stride;
119:   if (idx++ == nShareVec) {
120:     ptr    = work_space_global;
121:     stride = stride_global;
122:   }
123:   PetscScalar *T = ptr;
124:   ptr += stride;
125:   if (idx++ == nShareVec) {
126:     ptr    = work_space_global;
127:     stride = stride_global;
128:   }
129:   PetscScalar *Q = ptr;
130:   ptr += stride;
131:   if (idx++ == nShareVec) {
132:     ptr    = work_space_global;
133:     stride = stride_global;
134:   }
135:   PetscScalar *P = ptr;
136:   ptr += stride;
137:   if (idx++ == nShareVec) {
138:     ptr    = work_space_global;
139:     stride = stride_global;
140:   }
141:   PetscScalar *U = ptr;
142:   ptr += stride;
143:   if (idx++ == nShareVec) {
144:     ptr    = work_space_global;
145:     stride = stride_global;
146:   }
147:   PetscScalar *D = ptr;
148:   ptr += stride;
149:   if (idx++ == nShareVec) {
150:     ptr    = work_space_global;
151:     stride = stride_global;
152:   }
153:   PetscScalar *AUQ = V;
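  // The work vectors above are carved out of team shared memory first; after nShareVec of them
  // have been taken, the rest spill into this block's slice of the global buffer
  // (work_space_global, advancing by stride_global). AUQ aliases V to save one work vector.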

155:   // init: get b, zero x
156:   parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
157:     int rowa         = ic[rowb];
158:     R[rowb - start]  = glb_b[rowa];
159:     XX[rowb - start] = 0;
160:   });
161:   team.team_barrier();
162:   parallel_reduce(Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += R[idx] * PetscConj(R[idx]); }, dpi);
163:   team.team_barrier();
164:   r0 = dp = PetscSqrtReal(PetscRealPart(dpi));
165:   // diagnostics
166: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
167:   if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e\n", 0, (double)dp); });
168: #endif
169:   if (dp < atol) {
170:     metad->reason = KSP_CONVERGED_ATOL_NORMAL;
171:     it            = 0;
172:     goto done;
173:   }
174:   if (0 == maxit) {
175:     metad->reason = KSP_CONVERGED_ITS;
176:     it            = 0;
177:     goto done;
178:   }

180:   /* Make the initial Rp = R */
181:   parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { RP[idx] = R[idx]; });
182:   team.team_barrier();
183:   /* Set the initial conditions */
184:   etaold = 0.0;
185:   psiold = 0.0;
186:   tau    = dp;
187:   dpold  = dp;

189:   /* rhoold = (r,rp)     */
190:   parallel_reduce(Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += R[idx] * PetscConj(RP[idx]); }, rhoold);
191:   team.team_barrier();
192:   parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
193:     U[idx] = R[idx];
194:     P[idx] = R[idx];
195:     T[idx] = Diag[idx] * P[idx];
196:     D[idx] = 0;
197:   });
198:   team.team_barrier();
199:   static_cast<void>(MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, V));

201:   it = 0;
202:   do {
203:     /* s <- (v,rp)          */
204:     parallel_reduce(Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += V[idx] * PetscConj(RP[idx]); }, s);
205:     team.team_barrier();
206:     if (s == 0) {
207:       metad->reason = KSP_CONVERGED_HAPPY_BREAKDOWN;
208:       goto done;
209:     }
210:     a = rhoold / s; /* a <- rho / s         */
211:     /* q <- u - a v    VecWAXPY(w,alpha,x,y): w = alpha x + y.     */
212:     /* t <- u + q           */
213:     parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
214:       Q[idx] = U[idx] - a * V[idx];
215:       T[idx] = U[idx] + Q[idx];
216:     });
217:     team.team_barrier();
218:     // KSP_PCApplyBAorAB
219:     parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { T[idx] = Diag[idx] * T[idx]; });
220:     team.team_barrier();
221:     static_cast<void>(MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, AUQ));
222:     /* r <- r - a K (u + q) */
223:     parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { R[idx] = R[idx] - a * AUQ[idx]; });
224:     team.team_barrier();
225:     parallel_reduce(Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += R[idx] * PetscConj(R[idx]); }, dpi);
226:     team.team_barrier();
227:     dp = PetscSqrtReal(PetscRealPart(dpi));
228:     for (m = 0; m < 2; m++) {
229:       if (!m) w = PetscSqrtReal(dp * dpold);
230:       else w = dp;
231:       psi = w / tau;
232:       cm  = 1.0 / PetscSqrtReal(1.0 + psi * psi);
233:       tau = tau * psi * cm;
234:       eta = cm * cm * a;
235:       cf  = psiold * psiold * etaold / a;
236:       if (!m) {
237:         /* D = U + cf D */
238:         parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { D[idx] = U[idx] + cf * D[idx]; });
239:       } else {
240:         /* D = Q + cf D */
241:         parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { D[idx] = Q[idx] + cf * D[idx]; });
242:       }
243:       team.team_barrier();
244:       parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { XX[idx] = XX[idx] + eta * D[idx]; });
245:       team.team_barrier();
246:       dpest = PetscSqrtReal(2 * it + m + 2.0) * tau;
247: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
248:       if (monitor && m == 1) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e\n", it + 1, (double)dpest); });
249: #endif
250:       if (dpest < atol) {
251:         metad->reason = KSP_CONVERGED_ATOL_NORMAL;
252:         goto done;
253:       }
254:       if (dpest / r0 < rtol) {
255:         metad->reason = KSP_CONVERGED_RTOL_NORMAL;
256:         goto done;
257:       }
258: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
259:       if (dpest / r0 > dtol) {
260:         metad->reason = KSP_DIVERGED_DTOL;
261:         Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: %d it, res=%e, r_0=%e\n", team.league_rank(), it, (double)dpest, (double)r0); });
262:         goto done;
263:       }
264: #else
265:       if (dpest / r0 > dtol) {
266:         metad->reason = KSP_DIVERGED_DTOL;
267:         goto done;
268:       }
269: #endif
270:       if (it + 1 == maxit) {
271:         metad->reason = KSP_CONVERGED_ITS;
272: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
273:         Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: TFQMR %d:%d it, res=%e, r_0=%e r_res=%e\n", team.league_rank(), it, m, (double)dpest, (double)r0, (double)(dpest / r0)); });
274: #endif
275:         goto done;
276:       }
277:       etaold = eta;
278:       psiold = psi;
279:     }

281:     /* rho <- (r,rp)       */
282:     parallel_reduce(Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += R[idx] * PetscConj(RP[idx]); }, rho);
283:     team.team_barrier();
284:     if (rho == 0) {
285:       metad->reason = KSP_CONVERGED_HAPPY_BREAKDOWN;
286:       goto done;
287:     }
288:     b = rho / rhoold; /* b <- rho / rhoold   */
289:     /* u <- r + b q        */
290:     /* p <- u + b(q + b p) */
291:     parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
292:       U[idx] = R[idx] + b * Q[idx];
293:       Q[idx] = Q[idx] + b * P[idx];
294:       P[idx] = U[idx] + b * Q[idx];
295:     });
296:     /* v <- K p  */
297:     team.team_barrier();
298:     parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { T[idx] = Diag[idx] * P[idx]; });
299:     team.team_barrier();
300:     static_cast<void>(MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, V));

302:     rhoold = rho;
303:     dpold  = dp;

305:     it++;
306:   } while (it < maxit);
307: done:
308:   // KSPUnwindPreconditioner
309:   parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { XX[idx] = Diag[idx] * XX[idx]; });
310:   team.team_barrier();
311:   // put x into Plex order
312:   parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
313:     int rowa    = ic[rowb];
314:     glb_x[rowa] = XX[rowb - start];
315:   });
316:   metad->its = it;
317:   if (1) {
318:     int nnz;
319:     parallel_reduce(Kokkos::TeamVectorRange(team, start, end), [=](const int idx, int &lsum) { lsum += (glb_Aai[idx + 1] - glb_Aai[idx]); }, nnz);
320:     metad->flops = 2 * (metad->its * (10 * Nblk + 2 * nnz) + 5 * Nblk);
321:   } else {
322:     metad->flops = 2 * (metad->its * (10 * Nblk + 2 * 50 * Nblk) + 5 * Nblk); // guess
323:   }
324:   return PETSC_SUCCESS;
325: }

327: // Solve Ax = b with BiCG
328: static KOKKOS_INLINE_FUNCTION PetscErrorCode BJSolve_BICG(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, PetscScalar *work_space_global, const int stride_global, const int nShareVec, PetscScalar *work_space_shared, const int stride_shared, PetscReal rtol, PetscReal atol, PetscReal dtol, PetscInt maxit, Batch_MetaData *metad, const PetscInt start, const PetscInt end, const PetscScalar glb_idiag[], const PetscScalar *glb_b, PetscScalar *glb_x, bool monitor)
329: {
330:   using Kokkos::parallel_for;
331:   using Kokkos::parallel_reduce;
332:   int                Nblk = end - start, it, stride = stride_shared, idx = 0; // start in shared mem
333:   PetscReal          dp, r0;
334:   const PetscScalar *Di  = &glb_idiag[start];
335:   PetscScalar       *ptr = work_space_shared, dpi, a = 1.0, beta, betaold = 1.0, t1, t2;

337:   if (idx++ == nShareVec) {
338:     ptr    = work_space_global;
339:     stride = stride_global;
340:   }
341:   PetscScalar *XX = ptr;
342:   ptr += stride;
343:   if (idx++ == nShareVec) {
344:     ptr    = work_space_global;
345:     stride = stride_global;
346:   }
347:   PetscScalar *Rl = ptr;
348:   ptr += stride;
349:   if (idx++ == nShareVec) {
350:     ptr    = work_space_global;
351:     stride = stride_global;
352:   }
353:   PetscScalar *Zl = ptr;
354:   ptr += stride;
355:   if (idx++ == nShareVec) {
356:     ptr    = work_space_global;
357:     stride = stride_global;
358:   }
359:   PetscScalar *Pl = ptr;
360:   ptr += stride;
361:   if (idx++ == nShareVec) {
362:     ptr    = work_space_global;
363:     stride = stride_global;
364:   }
365:   PetscScalar *Rr = ptr;
366:   ptr += stride;
367:   if (idx++ == nShareVec) {
368:     ptr    = work_space_global;
369:     stride = stride_global;
370:   }
371:   PetscScalar *Zr = ptr;
372:   ptr += stride;
373:   if (idx++ == nShareVec) {
374:     ptr    = work_space_global;
375:     stride = stride_global;
376:   }
377:   PetscScalar *Pr = ptr;
378:   ptr += stride;
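  // XX: solution, Rr/Rl: right/left residuals, Zr/Zl: preconditioned residuals, Pr/Pl: search
  // directions; carved from shared memory first and then the global buffer, exactly as in
  // BJSolve_TFQMR above.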

380:   /*     r <- b (x is 0) */
381:   parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
382:     int rowa         = ic[rowb];
383:     Rl[rowb - start] = Rr[rowb - start] = glb_b[rowa];
384:     XX[rowb - start]                    = 0;
385:   });
386:   team.team_barrier();
387:   /*     z <- Br         */
388:   parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
389:     Zr[idx] = Di[idx] * Rr[idx];
390:     Zl[idx] = Di[idx] * Rl[idx];
391:   });
392:   team.team_barrier();
393:   /*    dp <- r'*r       */
394:   parallel_reduce(Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Rr[idx] * PetscConj(Rr[idx]); }, dpi);
395:   team.team_barrier();
396:   r0 = dp = PetscSqrtReal(PetscRealPart(dpi));
397: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
398:   if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e\n", 0, (double)dp); });
399: #endif
400:   if (dp < atol) {
401:     metad->reason = KSP_CONVERGED_ATOL_NORMAL;
402:     it            = 0;
403:     goto done;
404:   }
405:   if (0 == maxit) {
406:     metad->reason = KSP_CONVERGED_ITS;
407:     it            = 0;
408:     goto done;
409:   }

411:   it = 0;
412:   do {
413:     /*     beta <- r'z     */
414:     parallel_reduce(Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += Zr[idx] * PetscConj(Rl[idx]); }, beta);
415:     team.team_barrier();
416: #if PCBJKOKKOS_VERBOSE_LEVEL >= 6
417:   #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
418:     Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%7d beta = Z.R = %22.14e \n", it, (double)PetscRealPart(beta)); });
419:   #endif
420: #endif
421:     if (beta == 0.0) {
422:       metad->reason = KSP_CONVERGED_HAPPY_BREAKDOWN;
423:       goto done;
424:     }
425:     if (it == 0) {
426:       /*     p <- z          */
427:       parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
428:         Pr[idx] = Zr[idx];
429:         Pl[idx] = Zl[idx];
430:       });
431:     } else {
432:       t1 = beta / betaold;
433:       /*     p <- z + b* p   */
434:       t2 = PetscConj(t1);
435:       parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
436:         Pr[idx] = t1 * Pr[idx] + Zr[idx];
437:         Pl[idx] = t2 * Pl[idx] + Zl[idx];
438:       });
439:     }
440:     team.team_barrier();
441:     betaold = beta;
442:     /*     z <- Kp         */
443:     static_cast<void>(MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, Pr, Zr));
444:     static_cast<void>(MatMultTranspose(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, Pl, Zl));
445:     /*     dpi <- z'p      */
446:     parallel_reduce(Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Zr[idx] * PetscConj(Pl[idx]); }, dpi);
447:     team.team_barrier();
448:     if (dpi == 0) {
449:       metad->reason = KSP_CONVERGED_HAPPY_BREAKDOWN;
450:       goto done;
451:     }
452:     //
453:     a  = beta / dpi; /*     a = beta/p'z    */
454:     t1 = -a;
455:     t2 = PetscConj(t1);
456:     /*     x <- x + ap     */
457:     parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
458:       XX[idx] = XX[idx] + a * Pr[idx];
459:       Rr[idx] = Rr[idx] + t1 * Zr[idx];
460:       Rl[idx] = Rl[idx] + t2 * Zl[idx];
461:     });
462:     team.team_barrier();
463:     team.team_barrier();
464:     /*    dp <- r'*r       */
465:     parallel_reduce(Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Rr[idx] * PetscConj(Rr[idx]); }, dpi);
466:     team.team_barrier();
467:     dp = PetscSqrtReal(PetscRealPart(dpi));
468: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
469:     if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e\n", it + 1, (double)dp); });
470: #endif
471:     if (dp < atol) {
472:       metad->reason = KSP_CONVERGED_ATOL_NORMAL;
473:       goto done;
474:     }
475:     if (dp / r0 < rtol) {
476:       metad->reason = KSP_CONVERGED_RTOL_NORMAL;
477:       goto done;
478:     }
479: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
480:     if (dp / r0 > dtol) {
481:       metad->reason = KSP_DIVERGED_DTOL;
482:       Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: %d it, res=%e, r_0=%e (BICG does this)\n", team.league_rank(), it, (double)dp, (double)r0); });
483:       goto done;
484:     }
485: #else
486:     if (dp / r0 > dtol) {
487:       metad->reason = KSP_DIVERGED_DTOL;
488:       goto done;
489:     }
490: #endif
491:     if (it + 1 == maxit) {
492:       metad->reason = KSP_CONVERGED_ITS; // don't worry about hitting max iterations
493: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
494:       Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: BICG %d it, res=%e, r_0=%e r_res=%e\n", team.league_rank(), it, (double)dp, (double)r0, (double)(dp / r0)); });
495: #endif
496:       goto done;
497:     }
498:     /* z <- Br  */
499:     parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
500:       Zr[idx] = Di[idx] * Rr[idx];
501:       Zl[idx] = Di[idx] * Rl[idx];
502:     });

504:     it++;
505:   } while (it < maxit);
506: done:
507:   // put x back into Plex order
508:   parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
509:     int rowa    = ic[rowb];
510:     glb_x[rowa] = XX[rowb - start];
511:   });
512:   metad->its = it;
513:   if (1) {
514:     int nnz;
515:     parallel_reduce(Kokkos::TeamVectorRange(team, start, end), [=](const int idx, int &lsum) { lsum += (glb_Aai[idx + 1] - glb_Aai[idx]); }, nnz);
516:     metad->flops = 2 * (metad->its * (10 * Nblk + 2 * nnz) + 5 * Nblk);
517:   } else {
518:     metad->flops = 2 * (metad->its * (10 * Nblk + 2 * 50 * Nblk) + 5 * Nblk); // guess
519:   }
520:   return PETSC_SUCCESS;
521: }

523: // KSP solver: solve Ax = b; xout is the output, bin is the input
524: static PetscErrorCode PCApply_BJKOKKOS(PC pc, Vec bin, Vec xout)
525: {
526:   PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
527:   Mat            A = pc->pmat, Aseq = A;
528:   PetscMPIInt    rank;

530:   PetscFunctionBegin;
531:   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
532:   if (!A->spptr) {
533:     Aseq = ((Mat_MPIAIJ *)A->data)->A; // MPI
534:   }
535:   PetscCall(MatSeqAIJKokkosSyncDevice(Aseq));
536:   {
537:     PetscInt           maxit = jac->ksp->max_it;
538:     const PetscInt     conc = Kokkos::DefaultExecutionSpace().concurrency(), openmp = !!(conc < 1000), team_size = (openmp == 0 && PCBJKOKKOS_VEC_SIZE != 1) ? PCBJKOKKOS_TEAM_SIZE : 1;
539:     const PetscInt     nwork = jac->nwork, nBlk = jac->nBlocks;
540:     PetscScalar       *glb_xdata = NULL, *dummy;
541:     PetscReal          rtol = jac->ksp->rtol, atol = jac->ksp->abstol, dtol = jac->ksp->divtol;
542:     const PetscScalar *glb_idiag = jac->d_idiag_k->data(), *glb_bdata = NULL;
543:     const PetscInt    *glb_Aai, *glb_Aaj, *d_bid_eqOffset = jac->d_bid_eqOffset_k->data();
544:     const PetscScalar *glb_Aaa;
545:     const PetscInt    *d_isicol = jac->d_isicol_k->data(), *d_isrow = jac->d_isrow_k->data();
546:     PCFailedReason     pcreason;
547:     KSPIndex           ksp_type_idx = jac->ksp_type_idx;
548:     PetscMemType       mtype;
549:     PetscContainer     container;
550:     PetscInt           batch_sz;                // the number of repeated DMs, [DM_e_1, DM_e_2, DM_e_batch_sz, DM_i_1, ...]
551:     VecScatter         plex_batch = NULL;       // not used
552:     Vec                bvec;                    // a copy of b for scatter (just alias to bin now)
553:     PetscBool          monitor  = jac->monitor; // captured
554:     PetscInt           view_bid = jac->batch_target;
555:     MatInfo            info;

557:     PetscCall(MatSeqAIJGetCSRAndMemType(Aseq, &glb_Aai, &glb_Aaj, &dummy, &mtype));
558:     jac->max_nits = 0;
559:     glb_Aaa       = dummy;
560:     if (jac->rank_target != rank) view_bid = -1; // turn off all but one process
561:     PetscCall(MatGetInfo(A, MAT_LOCAL, &info));
562:     // the "plex_batch_is" container, if attached, maps plex I/O to/from block/field-major ordering
563:     PetscCall(PetscObjectQuery((PetscObject)A, "plex_batch_is", (PetscObject *)&container));
564:     if (container) {
565:       PetscCall(VecDuplicate(bin, &bvec));
566:       PetscCall(PetscContainerGetPointer(container, (void **)&plex_batch));
567:       PetscCall(VecScatterBegin(plex_batch, bin, bvec, INSERT_VALUES, SCATTER_FORWARD));
568:       PetscCall(VecScatterEnd(plex_batch, bin, bvec, INSERT_VALUES, SCATTER_FORWARD));
569:       SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "plex_batch_is found -- field major ordering is not supported for now");
570:     } else {
571:       bvec = bin;
572:     }
573:     // get x
574:     PetscCall(VecGetArrayAndMemType(xout, &glb_xdata, &mtype));
575: #if defined(PETSC_HAVE_CUDA)
576:     PetscCheck(PetscMemTypeDevice(mtype), PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "No GPU data for x %" PetscInt_FMT " != %" PetscInt_FMT, mtype, PETSC_MEMTYPE_DEVICE);
577: #endif
578:     PetscCall(VecGetArrayReadAndMemType(bvec, &glb_bdata, &mtype));
579: #if defined(PETSC_HAVE_CUDA)
580:     PetscCheck(PetscMemTypeDevice(mtype), PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "No GPU data for b");
581: #endif
582:     // get batch size
583:     PetscCall(PetscObjectQuery((PetscObject)A, "batch size", (PetscObject *)&container));
584:     if (container) {
585:       PetscInt *pNf = NULL;
586:       PetscCall(PetscContainerGetPointer(container, (void **)&pNf));
587:       batch_sz = *pNf; // number of times to repeat the DMs
588:     } else batch_sz = 1;
589:     PetscCheck(nBlk % batch_sz == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "batch_sz = %" PetscInt_FMT ", nBlk = %" PetscInt_FMT, batch_sz, nBlk);
590:     if (ksp_type_idx == BATCH_KSP_GMRESKK_IDX) {
591:       // KK solver - move PETSc data into Kokkos Views, setup solver, solve, move data out of Kokkos, process metadata (convergence tests, etc.)
592: #if defined(PETSC_HAVE_KOKKOS_KERNELS_BATCH)
593:       PetscCall(PCApply_BJKOKKOSKERNELS(pc, glb_bdata, glb_xdata, glb_Aai, glb_Aaj, glb_Aaa, team_size, info, batch_sz, &pcreason));
594: #else
595:       PetscCheck(ksp_type_idx != BATCH_KSP_GMRESKK_IDX, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Type: BATCH_KSP_GMRES not supported for complex");
596: #endif
597:     } else { // Kokkos Krylov
598:       using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
599:       using vect2D_scr_t = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, scr_mem_t>;
600:       Kokkos::View<Batch_MetaData *, Kokkos::DefaultExecutionSpace> d_metadata("solver meta data", nBlk);
601:       int                                                           stride_shared, stride_global, global_buff_words;
602:       d_bid_eqOffset = jac->d_bid_eqOffset_k->data();
603:       // solve each block independently
604:       int scr_bytes_team_shared = 0, nShareVec = 0, nGlobBVec = 0;
605:       if (jac->const_block_size) { // use shared memory for work vectors only if constant block size - todo: test efficiency loss
606:         size_t      maximum_shared_mem_size = 64000;
607:         PetscDevice device;
608:         PetscCall(PetscDeviceGetDefault_Internal(&device));
609:         PetscCall(PetscDeviceGetAttribute(device, PETSC_DEVICE_ATTR_SIZE_T_SHARED_MEM_PER_BLOCK, &maximum_shared_mem_size));
610:         stride_shared = jac->const_block_size;                                                   // captured
611:         nShareVec     = maximum_shared_mem_size / (jac->const_block_size * sizeof(PetscScalar)); // integer floor, number of vectors that fit in shared
612:         if (nShareVec > nwork) nShareVec = nwork;
613:         else nGlobBVec = nwork - nShareVec;
614:         global_buff_words     = jac->n * nGlobBVec;
615:         scr_bytes_team_shared = jac->const_block_size * nShareVec * sizeof(PetscScalar);
616:       } else {
617:         scr_bytes_team_shared = 0;
618:         stride_shared         = 0;
619:         global_buff_words     = jac->n * nwork;
620:         nGlobBVec             = nwork; // not needed == fix
621:       }
622:       stride_global = jac->n; // captured
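      // Layout: the global buffer holds nGlobBVec vectors of length jac->n back to back; a block
      // uses the [start, end) sub-range of each vector and steps by stride_global (= jac->n) to
      // move from one work vector to the next.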
623: #if defined(PETSC_HAVE_CUDA)
624:       nvtxRangePushA("batch-kokkos-solve");
625: #endif
626:       Kokkos::View<PetscScalar *, Kokkos::DefaultExecutionSpace> d_work_vecs_k("workvectors", global_buff_words); // global work vectors
627: #if PCBJKOKKOS_VERBOSE_LEVEL > 1
628:       PetscCall(PetscInfo(pc, "\tn = %d. %d shared bytes/team, %d global mem bytes, rtol=%e, num blocks %d, team_size=%d, %d vector threads, %d shared vectors, %d global vectors\n", (int)jac->n, scr_bytes_team_shared, global_buff_words, rtol, (int)nBlk, (int)team_size, PCBJKOKKOS_VEC_SIZE, nShareVec, nGlobBVec));
629: #endif
630:       PetscScalar *d_work_vecs = d_work_vecs_k.data();
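      // Launch one team per block: the team's league rank selects the block, its threads and
      // vector lanes run the block solve, and each team gets scr_bytes_team_shared bytes of
      // scratch for the shared work vectors.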
631:       Kokkos::parallel_for(
632:         "Solve", Kokkos::TeamPolicy<Kokkos::LaunchBounds<256, 4>>(nBlk, team_size, PCBJKOKKOS_VEC_SIZE).set_scratch_size(PCBJKOKKOS_SHARED_LEVEL, Kokkos::PerTeam(scr_bytes_team_shared)), KOKKOS_LAMBDA(const team_member team) {
633:           const int    blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1];
634:           vect2D_scr_t work_vecs_shared(team.team_scratch(PCBJKOKKOS_SHARED_LEVEL), end - start, nShareVec);
635:           PetscScalar *work_buff_shared = work_vecs_shared.data();
636:           PetscScalar *work_buff_global = &d_work_vecs[start]; // global buffer, offset by this block's start row
637:           bool         print            = monitor && (blkID == view_bid);
638:           switch (ksp_type_idx) {
639:           case BATCH_KSP_BICG_IDX:
640:             static_cast<void>(BJSolve_BICG(team, glb_Aai, glb_Aaj, glb_Aaa, d_isrow, d_isicol, work_buff_global, stride_global, nShareVec, work_buff_shared, stride_shared, rtol, atol, dtol, maxit, &d_metadata[blkID], start, end, glb_idiag, glb_bdata, glb_xdata, print));
641:             break;
642:           case BATCH_KSP_TFQMR_IDX:
643:             static_cast<void>(BJSolve_TFQMR(team, glb_Aai, glb_Aaj, glb_Aaa, d_isrow, d_isicol, work_buff_global, stride_global, nShareVec, work_buff_shared, stride_shared, rtol, atol, dtol, maxit, &d_metadata[blkID], start, end, glb_idiag, glb_bdata, glb_xdata, print));
644:             break;
645:           default:
646: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
647:             printf("Unknown KSP type %d\n", ksp_type_idx);
648: #else
649:             /* void */;
650: #endif
651:           }
652:         });
653:       Kokkos::fence();
654: #if defined(PETSC_HAVE_CUDA)
655:       nvtxRangePop();
656:       nvtxRangePushA("Post-solve-metadata");
657: #endif
658:       auto h_metadata = Kokkos::create_mirror(Kokkos::HostSpace::memory_space(), d_metadata);
659:       Kokkos::deep_copy(h_metadata, d_metadata);
660:       PetscInt count = -1, mbid = 0;
661:       int      in[2], out[2];
662:       if (jac->reason) { // -pc_bjkokkos_ksp_converged_reason
663: #if PCBJKOKKOS_VERBOSE_LEVEL >= 3
664:   #if PCBJKOKKOS_VERBOSE_LEVEL >= 4
665:         PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Iterations\n"));
666:   #endif
667:         // assume species major
668:   #if PCBJKOKKOS_VERBOSE_LEVEL < 4
669:         if (batch_sz != 1) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%s: max iterations per species:", ksp_type_idx == BATCH_KSP_BICG_IDX ? "bicg" : "tfqmr"));
670:         else PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "    Linear solve converged due to %s iterations ", ksp_type_idx == BATCH_KSP_BICG_IDX ? "bicg" : "tfqmr"));
671:   #endif
672:         for (PetscInt dmIdx = 0, s = 0, head = 0; dmIdx < jac->num_dms; dmIdx += batch_sz) {
673:           for (PetscInt f = 0, idx = head; f < jac->dm_Nf[dmIdx]; f++, s++, idx++) {
674:   #if PCBJKOKKOS_VERBOSE_LEVEL >= 4
675:             PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%2" PetscInt_FMT ":", s));
676:             for (int bid = 0; bid < batch_sz; bid++) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3" PetscInt_FMT " ", h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its));
677:             PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n"));
678:   #else
679:             for (int bid = 0; bid < batch_sz; bid++) {
680:               if (h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its > count) {
681:                 count = h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its;
682:                 mbid  = bid;
683:               }
684:             }
685:             PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3" PetscInt_FMT " ", count));
686:   #endif
687:           }
688:           head += batch_sz * jac->dm_Nf[dmIdx];
689:         }
690:   #if PCBJKOKKOS_VERBOSE_LEVEL == 3
691:         PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n"));
692:   #endif
693: #endif
694:         if (count == -1) {
695:           for (int blkID = 0; blkID < nBlk; blkID++) {
696:             if (h_metadata[blkID].its > count) {
697:               jac->max_nits = count = h_metadata[blkID].its;
698:               mbid                  = blkID;
699:             }
700: #if PCBJKOKKOS_VERBOSE_LEVEL > 0
701:             if (h_metadata[blkID].reason < 0) {
702:               PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR reason=%s, its=%" PetscInt_FMT ". species %" PetscInt_FMT ", batch %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID / batch_sz, blkID % batch_sz));
703:             }
704: #endif
705:             PetscCall(PetscLogGpuFlops((PetscLogDouble)h_metadata[blkID].flops));
706:           }
707:         }
708:         in[0] = count;
709:         in[1] = rank;
710:         PetscCallMPI(MPI_Allreduce(in, out, 1, MPI_2INT, MPI_MAXLOC, PetscObjectComm((PetscObject)A)));
711:         if (0 == rank) {
712:           if (batch_sz != 1)
713:             PetscCall(PetscPrintf(PETSC_COMM_SELF, "[%d] Linear solve converged due to %s iterations %d, batch %" PetscInt_FMT ", species %" PetscInt_FMT " (max)\n", out[1], KSPConvergedReasons[h_metadata[mbid].reason], out[0], mbid % batch_sz, mbid / batch_sz));
714:           else PetscCall(PetscPrintf(PETSC_COMM_SELF, "[%d] Linear solve converged due to %s iterations %d, block %" PetscInt_FMT " (max)\n", out[1], KSPConvergedReasons[h_metadata[mbid].reason], out[0], mbid));
715:         }
716:       }
717:       for (int blkID = 0; blkID < nBlk; blkID++) {
718:         PetscCall(PetscLogGpuFlops((PetscLogDouble)h_metadata[blkID].flops));
719: #if PCBJKOKKOS_VERBOSE_LEVEL > 0
720:         if (h_metadata[blkID].reason < 0) {
721:           PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR reason=%s, its=%" PetscInt_FMT ". species %" PetscInt_FMT ", batch %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID / batch_sz, blkID % batch_sz));
722:         }
723: #endif
724:       }
725:       {
726:         int errsum;
727:         Kokkos::parallel_reduce(
728:           nBlk,
729:           KOKKOS_LAMBDA(const int idx, int &lsum) {
730:             if (d_metadata[idx].reason < 0) ++lsum;
731:           },
732:           errsum);
733:         pcreason = errsum ? PC_SUBPC_ERROR : PC_NOERROR;
734:         if (!errsum && !jac->max_nits) { // set max its to give back to top KSP
735:           for (int blkID = 0; blkID < nBlk; blkID++) {
736:             if (h_metadata[blkID].its > jac->max_nits) jac->max_nits = h_metadata[blkID].its;
737:           }
738:         } else if (errsum) {
739:           PetscCall(PetscPrintf(PETSC_COMM_SELF, "[%d] ERROR Kokkos batch solver did not converge in all solves\n", (int)rank));
740:         }
741:       }
742: #if defined(PETSC_HAVE_CUDA)
743:       nvtxRangePop();
744: #endif
745:     } // end of Kokkos (not Kernels) solvers block
746:     PetscCall(VecRestoreArrayAndMemType(xout, &glb_xdata));
747:     PetscCall(VecRestoreArrayReadAndMemType(bvec, &glb_bdata));
748:     PetscCall(PCSetFailedReason(pc, pcreason));
749:     // map back to Plex space - not used
750:     if (plex_batch) {
751:       PetscCall(VecCopy(xout, bvec));
752:       PetscCall(VecScatterBegin(plex_batch, bvec, xout, INSERT_VALUES, SCATTER_REVERSE));
753:       PetscCall(VecScatterEnd(plex_batch, bvec, xout, INSERT_VALUES, SCATTER_REVERSE));
754:       PetscCall(VecDestroy(&bvec));
755:     }
756:   }
757:   PetscFunctionReturn(PETSC_SUCCESS);
758: }

760: static PetscErrorCode PCSetUp_BJKOKKOS(PC pc)
761: {
762:   PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
763:   Mat            A = pc->pmat, Aseq = A; // use filtered block matrix, really "P"
764:   PetscBool      flg;

766:   PetscFunctionBegin;
767:   //PetscCheck(!pc->useAmat, PetscObjectComm((PetscObject)pc), PETSC_ERR_SUP, "No support for using 'use_amat'");
768:   PetscCheck(A, PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No matrix - A is used above");
769:   PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &flg, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, MATAIJKOKKOS, ""));
770:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "must use '-[dm_]mat_type aijkokkos -[dm_]vec_type kokkos' for -pc_type bjkokkos");
771:   if (!A->spptr) {
772:     Aseq = ((Mat_MPIAIJ *)A->data)->A; // MPI
773:   }
774:   PetscCall(MatSeqAIJKokkosSyncDevice(Aseq));
775:   {
776:     PetscInt    Istart, Iend;
777:     PetscMPIInt rank;
778:     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
779:     PetscCall(MatGetOwnershipRange(A, &Istart, &Iend));
780:     if (!jac->vec_diag) {
781:       Vec     *subX = NULL;
782:       DM       pack, *subDM = NULL;
783:       PetscInt nDMs, n, *block_sizes = NULL;
784:       IS       isrow, isicol;
785:       { // Permute the matrix to get a block diagonal system: d_isrow_k, d_isicol_k
786:         MatOrderingType rtype;
787:         const PetscInt *rowindices, *icolindices;
788:         rtype = MATORDERINGRCM;
789:         // get permutation. And invert. should we convert to local indices?
790:         PetscCall(MatGetOrdering(Aseq, rtype, &isrow, &isicol)); // only seems to work for seq matrix
791:         PetscCall(ISDestroy(&isrow));
792:         PetscCall(ISInvertPermutation(isicol, PETSC_DECIDE, &isrow)); // THIS IS BACKWARD -- isrow is inverse
793:         // if (rank==1) PetscCall(ISView(isicol, PETSC_VIEWER_STDOUT_SELF));
794:         if (0) {
795:           Mat mat_block_order; // debug
796:           PetscCall(ISShift(isicol, Istart, isicol));
797:           PetscCall(MatCreateSubMatrix(A, isicol, isicol, MAT_INITIAL_MATRIX, &mat_block_order));
798:           PetscCall(ISShift(isicol, -Istart, isicol));
799:           PetscCall(MatViewFromOptions(mat_block_order, NULL, "-ksp_batch_reorder_view"));
800:           PetscCall(MatDestroy(&mat_block_order));
801:         }
802:         PetscCall(ISGetIndices(isrow, &rowindices)); // local idx
803:         PetscCall(ISGetIndices(isicol, &icolindices));
804:         const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_isrow_k((PetscInt *)rowindices, A->rmap->n);
805:         const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_isicol_k((PetscInt *)icolindices, A->rmap->n);
806:         jac->d_isrow_k  = new Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_isrow_k));
807:         jac->d_isicol_k = new Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_isicol_k));
808:         Kokkos::deep_copy(*jac->d_isrow_k, h_isrow_k);
809:         Kokkos::deep_copy(*jac->d_isicol_k, h_isicol_k);
810:         PetscCall(ISRestoreIndices(isrow, &rowindices));
811:         PetscCall(ISRestoreIndices(isicol, &icolindices));
812:         // if (rank==1) PetscCall(ISView(isicol, PETSC_VIEWER_STDOUT_SELF));
813:       }
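      // jac->d_isrow_k / jac->d_isicol_k now hold the (RCM-based) permutation and its inverse on
      // the device; the solvers use them to translate between the original (plex) ordering and
      // the block-diagonal ordering.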
814:       // get block sizes & allocate vec_diag
815:       PetscCall(PCGetDM(pc, &pack));
816:       if (pack) {
817:         PetscCall(PetscObjectTypeCompare((PetscObject)pack, DMCOMPOSITE, &flg));
818:         if (flg) {
819:           PetscCall(DMCompositeGetNumberDM(pack, &nDMs));
820:           PetscCall(DMCreateGlobalVector(pack, &jac->vec_diag));
821:         } else pack = NULL; // flag for no DM
822:       }
823:       if (!jac->vec_diag) { // get 'nDMs' and sizes 'block_sizes' w/o DMComposite. User could provide ISs (todo)
824:         PetscInt        bsrt, bend, ncols, ntot = 0;
825:         const PetscInt *colsA, nloc = Iend - Istart;
826:         const PetscInt *rowindices, *icolindices;
827:         PetscCall(PetscMalloc1(nloc, &block_sizes)); // very inefficient, too big
828:         PetscCall(ISGetIndices(isrow, &rowindices));
829:         PetscCall(ISGetIndices(isicol, &icolindices));
830:         nDMs = 0;
831:         bsrt = 0;
832:         bend = 1;
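        // Scan the rows in block-diagonal (permuted) order, tracking the column extent of the
        // current block; when a row's smallest column lands at or past the current extent
        // (minj >= bend), the previous block is complete: record its size and start a new one.
        // The final block is closed after the loop.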
833:         for (PetscInt row_B = 0; row_B < nloc; row_B++) { // for all rows in block diagonal space
834:           PetscInt rowA = icolindices[row_B], minj = PETSC_MAX_INT, maxj = 0;
835:           //PetscCall(PetscPrintf(PETSC_COMM_SELF, "\t[%d] rowA = %d\n",rank,rowA));
836:           PetscCall(MatGetRow(Aseq, rowA, &ncols, &colsA, NULL)); // not sorted in permutation
837:           PetscCheck(ncols, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Empty row not supported: %" PetscInt_FMT, row_B);
838:           for (PetscInt colj = 0; colj < ncols; colj++) {
839:             PetscInt colB = rowindices[colsA[colj]]; // use local idx
840:             //PetscCall(PetscPrintf(PETSC_COMM_SELF, "\t\t[%d] colB = %d\n",rank,colB));
841:             PetscCheck(colB >= 0 && colB < nloc, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "colB < 0: %" PetscInt_FMT, colB);
842:             if (colB > maxj) maxj = colB;
843:             if (colB < minj) minj = colB;
844:           }
845:           PetscCall(MatRestoreRow(Aseq, rowA, &ncols, &colsA, NULL));
846:           if (minj >= bend) { // smallest column is at/past the current block's extent -- finish that block and start a new one
847:             //PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\t\t finish block %d, N loc = %d (%d,%d)\n", nDMs+1, bend - bsrt,bsrt,bend));
848:             block_sizes[nDMs] = bend - bsrt;
849:             ntot += block_sizes[nDMs];
850:             PetscCheck(minj == bend, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "minj != bend: %" PetscInt_FMT " != %" PetscInt_FMT, minj, bend);
851:             bsrt = bend;
852:             bend++; // start with size 1 in new block
853:             nDMs++;
854:           }
855:           if (maxj + 1 > bend) bend = maxj + 1;
856:           PetscCheck(minj >= bsrt || row_B == Iend - 1, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "%" PetscInt_FMT ") minj < bsrt: %" PetscInt_FMT " != %" PetscInt_FMT, rowA, minj, bsrt);
857:           //PetscCall(PetscPrintf(PETSC_COMM_SELF, "[%d] %d) row %d.%d) cols %d : %d ; bsrt = %d, bend = %d\n",rank,row_B,nDMs,rowA,minj,maxj,bsrt,bend));
858:         }
859:         // do last block
860:         //PetscCall(PetscPrintf(PETSC_COMM_SELF, "\t\t\t [%d] finish block %d, N loc = %d (%d,%d)\n", rank, nDMs+1, bend - bsrt,bsrt,bend));
861:         block_sizes[nDMs] = bend - bsrt;
862:         ntot += block_sizes[nDMs];
863:         nDMs++;
864:         // cleanup
865:         PetscCheck(ntot == nloc, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "n total != n local: %" PetscInt_FMT " != %" PetscInt_FMT, ntot, nloc);
866:         PetscCall(ISRestoreIndices(isrow, &rowindices));
867:         PetscCall(ISRestoreIndices(isicol, &icolindices));
868:         PetscCall(PetscRealloc(sizeof(PetscInt) * nDMs, &block_sizes));
869:         PetscCall(MatCreateVecs(A, &jac->vec_diag, NULL));
870:         PetscCall(PetscInfo(pc, "Setup matrix-based metadata (no DMComposite attached to the PC), %" PetscInt_FMT " sub domains\n", nDMs));
871:       }
872:       PetscCall(ISDestroy(&isrow));
873:       PetscCall(ISDestroy(&isicol));
874:       jac->num_dms = nDMs;
875:       PetscCall(VecGetLocalSize(jac->vec_diag, &n));
876:       jac->n         = n;
877:       jac->d_idiag_k = new Kokkos::View<PetscScalar *, Kokkos::LayoutRight>("idiag", n);
878:       // options
879:       PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc));
880:       PetscCall(KSPSetFromOptions(jac->ksp));
881:       PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPBICG, ""));
882:       if (flg) {
883:         jac->ksp_type_idx = BATCH_KSP_BICG_IDX;
884:         jac->nwork        = 7;
885:       } else {
886:         PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPTFQMR, ""));
887:         if (flg) {
888:           jac->ksp_type_idx = BATCH_KSP_TFQMR_IDX;
889:           jac->nwork        = 10;
890:         } else {
891: #if defined(PETSC_HAVE_KOKKOS_KERNELS_BATCH)
892:           PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPGMRES, ""));
893:           PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unsupported batch ksp type");
894:           jac->ksp_type_idx = BATCH_KSP_GMRESKK_IDX;
895:           jac->nwork        = 0;
896: #else
897:           KSPType ksptype;
898:           PetscCall(KSPGetType(jac->ksp, &ksptype));
899:           PetscCheck(flg, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Type: %s not supported in complex", ksptype);
900: #endif
901:         }
902:       }
903:       PetscOptionsBegin(PetscObjectComm((PetscObject)jac->ksp), ((PetscObject)jac->ksp)->prefix, "Options for Kokkos batch solver", "none");
904:       PetscCall(PetscOptionsBool("-ksp_converged_reason", "", "bjkokkos.kokkos.cxx.c", jac->reason, &jac->reason, NULL));
905:       PetscCall(PetscOptionsBool("-ksp_monitor", "", "bjkokkos.kokkos.cxx.c", jac->monitor, &jac->monitor, NULL));
906:       PetscCall(PetscOptionsInt("-ksp_batch_target", "", "bjkokkos.kokkos.cxx.c", jac->batch_target, &jac->batch_target, NULL));
907:       PetscCall(PetscOptionsInt("-ksp_rank_target", "", "bjkokkos.kokkos.cxx.c", jac->rank_target, &jac->rank_target, NULL));
908:       PetscCall(PetscOptionsInt("-ksp_batch_nsolves_team", "", "bjkokkos.kokkos.cxx.c", jac->nsolves_team, &jac->nsolves_team, NULL));
909:       PetscCheck(jac->batch_target < jac->num_dms, PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "-ksp_batch_target (%" PetscInt_FMT ") >= number of DMs (%" PetscInt_FMT ")", jac->batch_target, jac->num_dms);
910:       PetscOptionsEnd();
911:       // get blocks - jac->d_bid_eqOffset_k
912:       if (pack) {
913:         PetscCall(PetscMalloc(sizeof(*subX) * nDMs, &subX));
914:         PetscCall(PetscMalloc(sizeof(*subDM) * nDMs, &subDM));
915:       }
916:       PetscCall(PetscMalloc(sizeof(*jac->dm_Nf) * nDMs, &jac->dm_Nf));
917:       PetscCall(PetscInfo(pc, "Have %" PetscInt_FMT " blocks, n=%" PetscInt_FMT " rtol=%g type = %s\n", nDMs, n, (double)jac->ksp->rtol, ((PetscObject)jac->ksp)->type_name));
918:       if (pack) PetscCall(DMCompositeGetEntriesArray(pack, subDM));
919:       jac->nBlocks = 0;
920:       for (PetscInt ii = 0; ii < nDMs; ii++) {
921:         PetscInt Nf;
922:         if (subDM) {
923:           DM           dm = subDM[ii];
924:           PetscSection section;
925:           PetscCall(DMGetLocalSection(dm, &section));
926:           PetscCall(PetscSectionGetNumFields(section, &Nf));
927:         } else Nf = 1;
928:         jac->nBlocks += Nf;
929: #if PCBJKOKKOS_VERBOSE_LEVEL <= 2
930:         if (ii == 0) PetscCall(PetscInfo(pc, "%" PetscInt_FMT ") %" PetscInt_FMT " blocks (%" PetscInt_FMT " total)\n", ii, Nf, jac->nBlocks));
931: #else
932:         PetscCall(PetscInfo(pc, "%" PetscInt_FMT ") %" PetscInt_FMT " blocks (%" PetscInt_FMT " total)\n", ii, Nf, jac->nBlocks));
933: #endif
934:         jac->dm_Nf[ii] = Nf;
935:       }
936:       { // d_bid_eqOffset_k
937:         Kokkos::View<PetscInt *, Kokkos::LayoutRight, Kokkos::HostSpace> h_block_offsets("block_offsets", jac->nBlocks + 1);
938:         if (pack) PetscCall(DMCompositeGetAccessArray(pack, jac->vec_diag, nDMs, NULL, subX));
939:         h_block_offsets[0]    = 0;
940:         jac->const_block_size = -1;
941:         for (PetscInt ii = 0, idx = 0; ii < nDMs; ii++) {
942:           PetscInt nloc, nblk;
943:           if (pack) PetscCall(VecGetSize(subX[ii], &nloc));
944:           else nloc = block_sizes[ii];
945:           nblk = nloc / jac->dm_Nf[ii];
946:           PetscCheck(nloc % jac->dm_Nf[ii] == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_USER, "nloc%%jac->dm_Nf[ii] (%" PetscInt_FMT ") != 0 DMs", nloc % jac->dm_Nf[ii]);
947:           for (PetscInt jj = 0; jj < jac->dm_Nf[ii]; jj++, idx++) {
948:             h_block_offsets[idx + 1] = h_block_offsets[idx] + nblk;
949: #if PCBJKOKKOS_VERBOSE_LEVEL <= 2
950:             if (idx == 0) PetscCall(PetscInfo(pc, "Add first of %" PetscInt_FMT " blocks with %" PetscInt_FMT " equations\n", jac->nBlocks, nblk));
951: #else
952:             PetscCall(PetscInfo(pc, "\t%" PetscInt_FMT ") Add block with %" PetscInt_FMT " equations of %" PetscInt_FMT "\n", idx + 1, nblk, jac->nBlocks));
953: #endif
954:             if (jac->const_block_size == -1) jac->const_block_size = nblk;
955:             else if (jac->const_block_size > 0 && jac->const_block_size != nblk) jac->const_block_size = 0;
956:           }
957:         }
958:         if (pack) {
959:           PetscCall(DMCompositeRestoreAccessArray(pack, jac->vec_diag, nDMs, NULL, subX)); // restore with the same count used in DMCompositeGetAccessArray()
960:           PetscCall(PetscFree(subX));
961:           PetscCall(PetscFree(subDM));
962:         }
963:         jac->d_bid_eqOffset_k = new Kokkos::View<PetscInt *, Kokkos::LayoutRight>(Kokkos::create_mirror(Kokkos::DefaultExecutionSpace::memory_space(), h_block_offsets));
964:         Kokkos::deep_copy(*jac->d_bid_eqOffset_k, h_block_offsets);
965:       }
966:       if (!pack) PetscCall(PetscFree(block_sizes));
967:     }
968:     { // get jac->d_idiag_k (PC setup),
969:       const PetscInt    *d_ai, *d_aj;
970:       const PetscScalar *d_aa;
971:       const PetscInt     conc = Kokkos::DefaultExecutionSpace().concurrency(), openmp = !!(conc < 1000), team_size = (openmp == 0 && PCBJKOKKOS_VEC_SIZE != 1) ? PCBJKOKKOS_TEAM_SIZE : 1;
972:       const PetscInt    *d_bid_eqOffset = jac->d_bid_eqOffset_k->data(), *r = jac->d_isrow_k->data(), *ic = jac->d_isicol_k->data();
973:       PetscScalar       *d_idiag = jac->d_idiag_k->data(), *dummy;
974:       PetscMemType       mtype;
975:       PetscCall(MatSeqAIJGetCSRAndMemType(Aseq, &d_ai, &d_aj, &dummy, &mtype));
976:       d_aa = dummy;
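      // Compute the inverted diagonal in block-diagonal ordering: for each permuted row, vector
      // lanes scan that row's nonzeros for the entry whose permuted column equals the row and
      // store its reciprocal in d_idiag.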
977:       Kokkos::parallel_for(
978:         "Diag", Kokkos::TeamPolicy<>(jac->nBlocks, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) {
979:           const PetscInt blkID = team.league_rank();
980:           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, d_bid_eqOffset[blkID], d_bid_eqOffset[blkID + 1]), [=](const int rowb) {
981:             const PetscInt     rowa = ic[rowb], ai = d_ai[rowa], *aj = d_aj + ai; // grab original data
982:             const PetscScalar *aa   = d_aa + ai;
983:             const PetscInt     nrow = d_ai[rowa + 1] - ai;
984:             int                found;
985:             Kokkos::parallel_reduce(
986:               Kokkos::ThreadVectorRange(team, nrow),
987:               [=](const int &j, int &count) {
988:                 const PetscInt colb = r[aj[j]];
989:                 if (colb == rowb) {
990:                   d_idiag[rowb] = 1. / aa[j];
991:                   count++;
992:                 }
993:               },
994:               found);
995: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
996:             if (found != 1) Kokkos::single(Kokkos::PerThread(team), [=]() { printf("ERROR row %d) found = %d\n", rowb, found); });
997: #endif
998:           });
999:         });
1000:     }
1001:   }
1002:   PetscFunctionReturn(PETSC_SUCCESS);
1003: }

1005: /* Reset the preconditioner (also used by destroy); safe to call even if it has never been set up */
1006: static PetscErrorCode PCReset_BJKOKKOS(PC pc)
1007: {
1008:   PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;

1010:   PetscFunctionBegin;
1011:   PetscCall(KSPDestroy(&jac->ksp));
1012:   PetscCall(VecDestroy(&jac->vec_diag));
1013:   if (jac->d_bid_eqOffset_k) delete jac->d_bid_eqOffset_k;
1014:   if (jac->d_idiag_k) delete jac->d_idiag_k;
1015:   if (jac->d_isrow_k) delete jac->d_isrow_k;
1016:   if (jac->d_isicol_k) delete jac->d_isicol_k;
1017:   jac->d_bid_eqOffset_k = NULL;
1018:   jac->d_idiag_k        = NULL;
1019:   jac->d_isrow_k        = NULL;
1020:   jac->d_isicol_k       = NULL;
1021:   PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSGetKSP_C", NULL)); // not published now (causes configure errors)
1022:   PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSSetKSP_C", NULL));
1023:   PetscCall(PetscFree(jac->dm_Nf));
1024:   jac->dm_Nf = NULL;
1025:   if (jac->rowOffsets) delete jac->rowOffsets;
1026:   if (jac->colIndices) delete jac->colIndices;
1027:   if (jac->batch_b) delete jac->batch_b;
1028:   if (jac->batch_x) delete jac->batch_x;
1029:   if (jac->batch_values) delete jac->batch_values;
1030:   jac->rowOffsets   = NULL;
1031:   jac->colIndices   = NULL;
1032:   jac->batch_b      = NULL;
1033:   jac->batch_x      = NULL;
1034:   jac->batch_values = NULL;
1035:   PetscFunctionReturn(PETSC_SUCCESS);
1036: }

1038: static PetscErrorCode PCDestroy_BJKOKKOS(PC pc)
1039: {
1040:   PetscFunctionBegin;
1041:   PetscCall(PCReset_BJKOKKOS(pc));
1042:   PetscCall(PetscFree(pc->data));
1043:   PetscFunctionReturn(PETSC_SUCCESS);
1044: }

1046: static PetscErrorCode PCView_BJKOKKOS(PC pc, PetscViewer viewer)
1047: {
1048:   PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
1049:   PetscBool      iascii;

1051:   PetscFunctionBegin;
1052:   if (!jac->ksp) PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc));
1053:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1054:   if (iascii) {
1055:     PetscCall(PetscViewerASCIIPrintf(viewer, "  Batched device linear solver: Krylov (KSP) method with Jacobi preconditioning\n"));
1056:     PetscCall(PetscViewerASCIIPrintf(viewer, "\t\tnwork = %" PetscInt_FMT ", rel tol = %e, abs tol = %e, div tol = %e, max it = %" PetscInt_FMT ", type = %s\n", jac->nwork, (double)jac->ksp->rtol, (double)jac->ksp->abstol, (double)jac->ksp->divtol, jac->ksp->max_it,
1057:                                      ((PetscObject)jac->ksp)->type_name));
1058:   }
1059:   PetscFunctionReturn(PETSC_SUCCESS);
1060: }

1062: static PetscErrorCode PCSetFromOptions_BJKOKKOS(PC pc, PetscOptionItems *PetscOptionsObject)
1063: {
1064:   PetscFunctionBegin;
1065:   PetscOptionsHeadBegin(PetscOptionsObject, "PC BJKOKKOS options");
1066:   PetscOptionsHeadEnd();
1067:   PetscFunctionReturn(PETSC_SUCCESS);
1068: }

1070: static PetscErrorCode PCBJKOKKOSSetKSP_BJKOKKOS(PC pc, KSP ksp)
1071: {
1072:   PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;

1074:   PetscFunctionBegin;
1075:   PetscCall(PetscObjectReference((PetscObject)ksp));
1076:   PetscCall(KSPDestroy(&jac->ksp));
1077:   jac->ksp = ksp;
1078:   PetscFunctionReturn(PETSC_SUCCESS);
1079: }

1081: /*@C
1082:   PCBJKOKKOSSetKSP - Sets the `KSP` context for `PCBJKOKKOS`

1084:   Collective

1086:   Input Parameters:
1087: + pc  - the `PCBJKOKKOS` preconditioner context
1088: - ksp - the `KSP` solver

1090:   Level: advanced

1092:   Notes:
1093:   The `PC` and the `KSP` must have the same communicator

1095:   If the `PC` is not `PCBJKOKKOS` this function returns without doing anything

1097: .seealso: [](ch_ksp), `PCBJKOKKOSGetKSP()`, `PCBJKOKKOS`
1098: @*/
1099: PetscErrorCode PCBJKOKKOSSetKSP(PC pc, KSP ksp)
1100: {
1101:   PetscFunctionBegin;
1104:   PetscCheckSameComm(pc, 1, ksp, 2);
1105:   PetscTryMethod(pc, "PCBJKOKKOSSetKSP_C", (PC, KSP), (pc, ksp));
1106:   PetscFunctionReturn(PETSC_SUCCESS);
1107: }

1109: static PetscErrorCode PCBJKOKKOSGetKSP_BJKOKKOS(PC pc, KSP *ksp)
1110: {
1111:   PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;

1113:   PetscFunctionBegin;
1114:   if (!jac->ksp) PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc));
1115:   *ksp = jac->ksp;
1116:   PetscFunctionReturn(PETSC_SUCCESS);
1117: }

1119: /*@C
1120:   PCBJKOKKOSGetKSP - Gets the `KSP` context for the `PCBJKOKKOS` preconditioner

1122:   Not Collective but `KSP` returned is parallel if `PC` was parallel

1124:   Input Parameter:
1125: . pc - the preconditioner context

1127:   Output Parameter:
1128: . ksp - the `KSP` solver

1130:   Level: advanced

1132:   Notes:
1133:   You must call `KSPSetUp()` before calling `PCBJKOKKOSGetKSP()`.

1135:   If the `PC` is not a `PCBJKOKKOS` object it raises an error

1137: .seealso: [](ch_ksp), `PCBJKOKKOS`, `PCBJKOKKOSSetKSP()`
1138: @*/
1139: PetscErrorCode PCBJKOKKOSGetKSP(PC pc, KSP *ksp)
1140: {
1141:   PetscFunctionBegin;
1143:   PetscAssertPointer(ksp, 2);
1144:   PetscUseMethod(pc, "PCBJKOKKOSGetKSP_C", (PC, KSP *), (pc, ksp));
1145:   PetscFunctionReturn(PETSC_SUCCESS);
1146: }

1148: /*MC
1149:      PCBJKOKKOS -  Defines a preconditioner that applies a Krylov solver and preconditioner to the blocks in a `MATSEQAIJ` matrix on the GPU using Kokkos

1151:    Options Database Key:
1152: .     -pc_bjkokkos_ - options prefix for its `KSP` options

1154:    Level: intermediate

1156:    Note:
1157:     For use with -ksp_type preonly to bypass any computation on the CPU

1159:    Developer Notes:
1160:    The documentation is incomplete. Is this a block Jacobi preconditioner?

1162:    Why does it have its own `KSP`? Where is the `KSP` run if used with -ksp_type preonly?

1164: .seealso: [](ch_ksp), `PCCreate()`, `PCSetType()`, `PCType`, `PC`, `PCBJACOBI`,
1165:           `PCSHELL`, `PCCOMPOSITE`, `PCSetUseAmat()`, `PCBJKOKKOSGetKSP()`
1166: M*/
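/*
   Illustrative command-line usage (a sketch, not taken from this file's tests): run a
   device-resident batched solve with, for example,

     -dm_mat_type aijkokkos -dm_vec_type kokkos \
     -ksp_type preonly -pc_type bjkokkos \
     -pc_bjkokkos_ksp_type tfqmr -pc_bjkokkos_ksp_rtol 1e-6 \
     -pc_bjkokkos_ksp_converged_reason

   Programmatically, a minimal sketch is KSPGetPC(ksp, &pc); PCSetType(pc, PCBJKOKKOS); with the
   operator stored as an AIJKOKKOS matrix; the inner KSP can also be supplied directly with
   PCBJKOKKOSSetKSP().
*/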

1168: PETSC_EXTERN PetscErrorCode PCCreate_BJKOKKOS(PC pc)
1169: {
1170:   PC_PCBJKOKKOS *jac;

1172:   PetscFunctionBegin;
1173:   PetscCall(PetscNew(&jac));
1174:   pc->data = (void *)jac;

1176:   jac->ksp              = NULL;
1177:   jac->vec_diag         = NULL;
1178:   jac->d_bid_eqOffset_k = NULL;
1179:   jac->d_idiag_k        = NULL;
1180:   jac->d_isrow_k        = NULL;
1181:   jac->d_isicol_k       = NULL;
1182:   jac->nBlocks          = 1;
1183:   jac->max_nits         = 0;

1185:   PetscCall(PetscMemzero(pc->ops, sizeof(struct _PCOps)));
1186:   pc->ops->apply          = PCApply_BJKOKKOS;
1187:   pc->ops->applytranspose = NULL;
1188:   pc->ops->setup          = PCSetUp_BJKOKKOS;
1189:   pc->ops->reset          = PCReset_BJKOKKOS;
1190:   pc->ops->destroy        = PCDestroy_BJKOKKOS;
1191:   pc->ops->setfromoptions = PCSetFromOptions_BJKOKKOS;
1192:   pc->ops->view           = PCView_BJKOKKOS;

1194:   jac->rowOffsets   = NULL;
1195:   jac->colIndices   = NULL;
1196:   jac->batch_b      = NULL;
1197:   jac->batch_x      = NULL;
1198:   jac->batch_values = NULL;

1200:   PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSGetKSP_C", PCBJKOKKOSGetKSP_BJKOKKOS));
1201:   PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSSetKSP_C", PCBJKOKKOSSetKSP_BJKOKKOS));
1202:   PetscFunctionReturn(PETSC_SUCCESS);
1203: }