Actual source code: baij.c

  1: /*
  2:     Defines the basic matrix operations for the BAIJ (compressed row)
  3:   matrix storage format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petscblaslapack.h>
  7: #include <petsc/private/kernels/blockinvert.h>
  8: #include <petsc/private/kernels/blockmatmult.h>

 10: /* defines MatSetValues_Seq_Hash(), MatAssemblyEnd_Seq_Hash(), MatSetUp_Seq_Hash() */
 11: #define TYPE BAIJ
 12: #define TYPE_BS
 13: #include "../src/mat/impls/aij/seq/seqhashmatsetvalues.h"
 14: #undef TYPE_BS
 15: #define TYPE_BS _BS
 16: #define TYPE_BS_ON
 17: #include "../src/mat/impls/aij/seq/seqhashmatsetvalues.h"
 18: #undef TYPE_BS
 19: #include "../src/mat/impls/aij/seq/seqhashmat.h"
 20: #undef TYPE
 21: #undef TYPE_BS_ON

 23: #if defined(PETSC_HAVE_HYPRE)
 24: PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
 25: #endif

 27: #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
 28: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat, MatType, MatReuse, Mat *);
 29: #endif
 30: PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);

 32: static PetscErrorCode MatGetColumnReductions_SeqBAIJ(Mat A, PetscInt type, PetscReal *reductions)
 33: {
 34:   Mat_SeqBAIJ *a_aij = (Mat_SeqBAIJ *)A->data;
 35:   PetscInt     m, n, ib, jb, bs = A->rmap->bs;
 36:   MatScalar   *a_val = a_aij->a;

 38:   PetscFunctionBegin;
 39:   PetscCall(MatGetSize(A, &m, &n));
 40:   PetscCall(PetscArrayzero(reductions, n));
 41:   if (type == NORM_2) {
 42:     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 43:       for (jb = 0; jb < bs; jb++) {
 44:         for (ib = 0; ib < bs; ib++) {
 45:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val * *a_val);
 46:           a_val++;
 47:         }
 48:       }
 49:     }
 50:   } else if (type == NORM_1) {
 51:     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 52:       for (jb = 0; jb < bs; jb++) {
 53:         for (ib = 0; ib < bs; ib++) {
 54:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val);
 55:           a_val++;
 56:         }
 57:       }
 58:     }
 59:   } else if (type == NORM_INFINITY) {
 60:     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 61:       for (jb = 0; jb < bs; jb++) {
 62:         for (ib = 0; ib < bs; ib++) {
 63:           PetscInt col    = A->cmap->rstart + a_aij->j[i] * bs + jb;
 64:           reductions[col] = PetscMax(PetscAbsScalar(*a_val), reductions[col]);
 65:           a_val++;
 66:         }
 67:       }
 68:     }
 69:   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
 70:     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 71:       for (jb = 0; jb < bs; jb++) {
 72:         for (ib = 0; ib < bs; ib++) {
 73:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscRealPart(*a_val);
 74:           a_val++;
 75:         }
 76:       }
 77:     }
 78:   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
 79:     for (PetscInt i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 80:       for (jb = 0; jb < bs; jb++) {
 81:         for (ib = 0; ib < bs; ib++) {
 82:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscImaginaryPart(*a_val);
 83:           a_val++;
 84:         }
 85:       }
 86:     }
 87:   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
 88:   if (type == NORM_2) {
 89:     for (PetscInt i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
 90:   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
 91:     for (PetscInt i = 0; i < n; i++) reductions[i] /= m;
 92:   }
 93:   PetscFunctionReturn(PETSC_SUCCESS);
 94: }

 96: static PetscErrorCode MatInvertBlockDiagonal_SeqBAIJ(Mat A, const PetscScalar **values)
 97: {
 98:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
 99:   PetscInt    *diag_offset, i, bs = A->rmap->bs, mbs = a->mbs, ipvt[5], bs2 = bs * bs, *v_pivots;
100:   MatScalar   *v     = a->a, *odiag, *diag, work[25], *v_work;
101:   PetscReal    shift = 0.0;
102:   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

104:   PetscFunctionBegin;
105:   allowzeropivot = PetscNot(A->erroriffailure);

107:   if (a->idiagvalid) {
108:     if (values) *values = a->idiag;
109:     PetscFunctionReturn(PETSC_SUCCESS);
110:   }
111:   PetscCall(MatMarkDiagonal_SeqBAIJ(A));
112:   diag_offset = a->diag;
113:   if (!a->idiag) PetscCall(PetscMalloc1(bs2 * mbs, &a->idiag));
114:   diag = a->idiag;
115:   if (values) *values = a->idiag;
116:   /* factor and invert each block */
117:   switch (bs) {
118:   case 1:
119:     for (i = 0; i < mbs; i++) {
120:       odiag   = v + 1 * diag_offset[i];
121:       diag[0] = odiag[0];

123:       if (PetscAbsScalar(diag[0] + shift) < PETSC_MACHINE_EPSILON) {
124:         PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot, row %" PetscInt_FMT " pivot value %g tolerance %g", i, (double)PetscAbsScalar(diag[0]), (double)PETSC_MACHINE_EPSILON);
125:         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
126:         A->factorerror_zeropivot_value = PetscAbsScalar(diag[0]);
127:         A->factorerror_zeropivot_row   = i;
128:         PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", i));
129:       }

131:       diag[0] = (PetscScalar)1.0 / (diag[0] + shift);
132:       diag += 1;
133:     }
134:     break;
135:   case 2:
136:     for (i = 0; i < mbs; i++) {
137:       odiag   = v + 4 * diag_offset[i];
138:       diag[0] = odiag[0];
139:       diag[1] = odiag[1];
140:       diag[2] = odiag[2];
141:       diag[3] = odiag[3];
142:       PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
143:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
144:       diag += 4;
145:     }
146:     break;
147:   case 3:
148:     for (i = 0; i < mbs; i++) {
149:       odiag   = v + 9 * diag_offset[i];
150:       diag[0] = odiag[0];
151:       diag[1] = odiag[1];
152:       diag[2] = odiag[2];
153:       diag[3] = odiag[3];
154:       diag[4] = odiag[4];
155:       diag[5] = odiag[5];
156:       diag[6] = odiag[6];
157:       diag[7] = odiag[7];
158:       diag[8] = odiag[8];
159:       PetscCall(PetscKernel_A_gets_inverse_A_3(diag, shift, allowzeropivot, &zeropivotdetected));
160:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
161:       diag += 9;
162:     }
163:     break;
164:   case 4:
165:     for (i = 0; i < mbs; i++) {
166:       odiag = v + 16 * diag_offset[i];
167:       PetscCall(PetscArraycpy(diag, odiag, 16));
168:       PetscCall(PetscKernel_A_gets_inverse_A_4(diag, shift, allowzeropivot, &zeropivotdetected));
169:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
170:       diag += 16;
171:     }
172:     break;
173:   case 5:
174:     for (i = 0; i < mbs; i++) {
175:       odiag = v + 25 * diag_offset[i];
176:       PetscCall(PetscArraycpy(diag, odiag, 25));
177:       PetscCall(PetscKernel_A_gets_inverse_A_5(diag, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
178:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
179:       diag += 25;
180:     }
181:     break;
182:   case 6:
183:     for (i = 0; i < mbs; i++) {
184:       odiag = v + 36 * diag_offset[i];
185:       PetscCall(PetscArraycpy(diag, odiag, 36));
186:       PetscCall(PetscKernel_A_gets_inverse_A_6(diag, shift, allowzeropivot, &zeropivotdetected));
187:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
188:       diag += 36;
189:     }
190:     break;
191:   case 7:
192:     for (i = 0; i < mbs; i++) {
193:       odiag = v + 49 * diag_offset[i];
194:       PetscCall(PetscArraycpy(diag, odiag, 49));
195:       PetscCall(PetscKernel_A_gets_inverse_A_7(diag, shift, allowzeropivot, &zeropivotdetected));
196:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
197:       diag += 49;
198:     }
199:     break;
200:   default:
201:     PetscCall(PetscMalloc2(bs, &v_work, bs, &v_pivots));
202:     for (i = 0; i < mbs; i++) {
203:       odiag = v + bs2 * diag_offset[i];
204:       PetscCall(PetscArraycpy(diag, odiag, bs2));
205:       PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
206:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
207:       diag += bs2;
208:     }
209:     PetscCall(PetscFree2(v_work, v_pivots));
210:   }
211:   a->idiagvalid = PETSC_TRUE;
212:   PetscFunctionReturn(PETSC_SUCCESS);
213: }

215: static PetscErrorCode MatSOR_SeqBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
216: {
217:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
218:   PetscScalar       *x, *work, *w, *workt, *t;
219:   const MatScalar   *v, *aa = a->a, *idiag;
220:   const PetscScalar *b, *xb;
221:   PetscScalar        s[7], xw[7] = {0}; /* avoid some compilers thinking xw is uninitialized */
222:   PetscInt           m = a->mbs, i, i2, nz, bs = A->rmap->bs, bs2 = bs * bs, k, j, idx, it;
223:   const PetscInt    *diag, *ai = a->i, *aj = a->j, *vi;

225:   PetscFunctionBegin;
226:   its = its * lits;
227:   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
228:   PetscCheck(its > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Relaxation requires global its %" PetscInt_FMT " and local its %" PetscInt_FMT " both positive", its, lits);
229:   PetscCheck(!fshift, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for diagonal shift");
230:   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for non-trivial relaxation factor");
231:   PetscCheck(!(flag & SOR_APPLY_UPPER) && !(flag & SOR_APPLY_LOWER), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for applying upper or lower triangular parts");

233:   if (!a->idiagvalid) PetscCall(MatInvertBlockDiagonal(A, NULL));

235:   if (!m) PetscFunctionReturn(PETSC_SUCCESS);
236:   diag  = a->diag;
237:   idiag = a->idiag;
238:   k     = PetscMax(A->rmap->n, A->cmap->n);
239:   if (!a->mult_work) PetscCall(PetscMalloc1(k + 1, &a->mult_work));
240:   if (!a->sor_workt) PetscCall(PetscMalloc1(k, &a->sor_workt));
241:   if (!a->sor_work) PetscCall(PetscMalloc1(bs, &a->sor_work));
242:   work = a->mult_work;
243:   t    = a->sor_workt;
244:   w    = a->sor_work;

246:   PetscCall(VecGetArray(xx, &x));
247:   PetscCall(VecGetArrayRead(bb, &b));

249:   if (flag & SOR_ZERO_INITIAL_GUESS) {
250:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
251:       switch (bs) {
252:       case 1:
253:         PetscKernel_v_gets_A_times_w_1(x, idiag, b);
254:         t[0] = b[0];
255:         i2   = 1;
256:         idiag += 1;
257:         for (i = 1; i < m; i++) {
258:           v    = aa + ai[i];
259:           vi   = aj + ai[i];
260:           nz   = diag[i] - ai[i];
261:           s[0] = b[i2];
262:           for (j = 0; j < nz; j++) {
263:             xw[0] = x[vi[j]];
264:             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
265:           }
266:           t[i2] = s[0];
267:           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
268:           x[i2] = xw[0];
269:           idiag += 1;
270:           i2 += 1;
271:         }
272:         break;
273:       case 2:
274:         PetscKernel_v_gets_A_times_w_2(x, idiag, b);
275:         t[0] = b[0];
276:         t[1] = b[1];
277:         i2   = 2;
278:         idiag += 4;
279:         for (i = 1; i < m; i++) {
280:           v    = aa + 4 * ai[i];
281:           vi   = aj + ai[i];
282:           nz   = diag[i] - ai[i];
283:           s[0] = b[i2];
284:           s[1] = b[i2 + 1];
285:           for (j = 0; j < nz; j++) {
286:             idx   = 2 * vi[j];
287:             it    = 4 * j;
288:             xw[0] = x[idx];
289:             xw[1] = x[1 + idx];
290:             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
291:           }
292:           t[i2]     = s[0];
293:           t[i2 + 1] = s[1];
294:           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
295:           x[i2]     = xw[0];
296:           x[i2 + 1] = xw[1];
297:           idiag += 4;
298:           i2 += 2;
299:         }
300:         break;
301:       case 3:
302:         PetscKernel_v_gets_A_times_w_3(x, idiag, b);
303:         t[0] = b[0];
304:         t[1] = b[1];
305:         t[2] = b[2];
306:         i2   = 3;
307:         idiag += 9;
308:         for (i = 1; i < m; i++) {
309:           v    = aa + 9 * ai[i];
310:           vi   = aj + ai[i];
311:           nz   = diag[i] - ai[i];
312:           s[0] = b[i2];
313:           s[1] = b[i2 + 1];
314:           s[2] = b[i2 + 2];
315:           while (nz--) {
316:             idx   = 3 * (*vi++);
317:             xw[0] = x[idx];
318:             xw[1] = x[1 + idx];
319:             xw[2] = x[2 + idx];
320:             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
321:             v += 9;
322:           }
323:           t[i2]     = s[0];
324:           t[i2 + 1] = s[1];
325:           t[i2 + 2] = s[2];
326:           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
327:           x[i2]     = xw[0];
328:           x[i2 + 1] = xw[1];
329:           x[i2 + 2] = xw[2];
330:           idiag += 9;
331:           i2 += 3;
332:         }
333:         break;
334:       case 4:
335:         PetscKernel_v_gets_A_times_w_4(x, idiag, b);
336:         t[0] = b[0];
337:         t[1] = b[1];
338:         t[2] = b[2];
339:         t[3] = b[3];
340:         i2   = 4;
341:         idiag += 16;
342:         for (i = 1; i < m; i++) {
343:           v    = aa + 16 * ai[i];
344:           vi   = aj + ai[i];
345:           nz   = diag[i] - ai[i];
346:           s[0] = b[i2];
347:           s[1] = b[i2 + 1];
348:           s[2] = b[i2 + 2];
349:           s[3] = b[i2 + 3];
350:           while (nz--) {
351:             idx   = 4 * (*vi++);
352:             xw[0] = x[idx];
353:             xw[1] = x[1 + idx];
354:             xw[2] = x[2 + idx];
355:             xw[3] = x[3 + idx];
356:             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
357:             v += 16;
358:           }
359:           t[i2]     = s[0];
360:           t[i2 + 1] = s[1];
361:           t[i2 + 2] = s[2];
362:           t[i2 + 3] = s[3];
363:           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
364:           x[i2]     = xw[0];
365:           x[i2 + 1] = xw[1];
366:           x[i2 + 2] = xw[2];
367:           x[i2 + 3] = xw[3];
368:           idiag += 16;
369:           i2 += 4;
370:         }
371:         break;
372:       case 5:
373:         PetscKernel_v_gets_A_times_w_5(x, idiag, b);
374:         t[0] = b[0];
375:         t[1] = b[1];
376:         t[2] = b[2];
377:         t[3] = b[3];
378:         t[4] = b[4];
379:         i2   = 5;
380:         idiag += 25;
381:         for (i = 1; i < m; i++) {
382:           v    = aa + 25 * ai[i];
383:           vi   = aj + ai[i];
384:           nz   = diag[i] - ai[i];
385:           s[0] = b[i2];
386:           s[1] = b[i2 + 1];
387:           s[2] = b[i2 + 2];
388:           s[3] = b[i2 + 3];
389:           s[4] = b[i2 + 4];
390:           while (nz--) {
391:             idx   = 5 * (*vi++);
392:             xw[0] = x[idx];
393:             xw[1] = x[1 + idx];
394:             xw[2] = x[2 + idx];
395:             xw[3] = x[3 + idx];
396:             xw[4] = x[4 + idx];
397:             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
398:             v += 25;
399:           }
400:           t[i2]     = s[0];
401:           t[i2 + 1] = s[1];
402:           t[i2 + 2] = s[2];
403:           t[i2 + 3] = s[3];
404:           t[i2 + 4] = s[4];
405:           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
406:           x[i2]     = xw[0];
407:           x[i2 + 1] = xw[1];
408:           x[i2 + 2] = xw[2];
409:           x[i2 + 3] = xw[3];
410:           x[i2 + 4] = xw[4];
411:           idiag += 25;
412:           i2 += 5;
413:         }
414:         break;
415:       case 6:
416:         PetscKernel_v_gets_A_times_w_6(x, idiag, b);
417:         t[0] = b[0];
418:         t[1] = b[1];
419:         t[2] = b[2];
420:         t[3] = b[3];
421:         t[4] = b[4];
422:         t[5] = b[5];
423:         i2   = 6;
424:         idiag += 36;
425:         for (i = 1; i < m; i++) {
426:           v    = aa + 36 * ai[i];
427:           vi   = aj + ai[i];
428:           nz   = diag[i] - ai[i];
429:           s[0] = b[i2];
430:           s[1] = b[i2 + 1];
431:           s[2] = b[i2 + 2];
432:           s[3] = b[i2 + 3];
433:           s[4] = b[i2 + 4];
434:           s[5] = b[i2 + 5];
435:           while (nz--) {
436:             idx   = 6 * (*vi++);
437:             xw[0] = x[idx];
438:             xw[1] = x[1 + idx];
439:             xw[2] = x[2 + idx];
440:             xw[3] = x[3 + idx];
441:             xw[4] = x[4 + idx];
442:             xw[5] = x[5 + idx];
443:             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
444:             v += 36;
445:           }
446:           t[i2]     = s[0];
447:           t[i2 + 1] = s[1];
448:           t[i2 + 2] = s[2];
449:           t[i2 + 3] = s[3];
450:           t[i2 + 4] = s[4];
451:           t[i2 + 5] = s[5];
452:           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
453:           x[i2]     = xw[0];
454:           x[i2 + 1] = xw[1];
455:           x[i2 + 2] = xw[2];
456:           x[i2 + 3] = xw[3];
457:           x[i2 + 4] = xw[4];
458:           x[i2 + 5] = xw[5];
459:           idiag += 36;
460:           i2 += 6;
461:         }
462:         break;
463:       case 7:
464:         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
465:         t[0] = b[0];
466:         t[1] = b[1];
467:         t[2] = b[2];
468:         t[3] = b[3];
469:         t[4] = b[4];
470:         t[5] = b[5];
471:         t[6] = b[6];
472:         i2   = 7;
473:         idiag += 49;
474:         for (i = 1; i < m; i++) {
475:           v    = aa + 49 * ai[i];
476:           vi   = aj + ai[i];
477:           nz   = diag[i] - ai[i];
478:           s[0] = b[i2];
479:           s[1] = b[i2 + 1];
480:           s[2] = b[i2 + 2];
481:           s[3] = b[i2 + 3];
482:           s[4] = b[i2 + 4];
483:           s[5] = b[i2 + 5];
484:           s[6] = b[i2 + 6];
485:           while (nz--) {
486:             idx   = 7 * (*vi++);
487:             xw[0] = x[idx];
488:             xw[1] = x[1 + idx];
489:             xw[2] = x[2 + idx];
490:             xw[3] = x[3 + idx];
491:             xw[4] = x[4 + idx];
492:             xw[5] = x[5 + idx];
493:             xw[6] = x[6 + idx];
494:             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
495:             v += 49;
496:           }
497:           t[i2]     = s[0];
498:           t[i2 + 1] = s[1];
499:           t[i2 + 2] = s[2];
500:           t[i2 + 3] = s[3];
501:           t[i2 + 4] = s[4];
502:           t[i2 + 5] = s[5];
503:           t[i2 + 6] = s[6];
504:           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
505:           x[i2]     = xw[0];
506:           x[i2 + 1] = xw[1];
507:           x[i2 + 2] = xw[2];
508:           x[i2 + 3] = xw[3];
509:           x[i2 + 4] = xw[4];
510:           x[i2 + 5] = xw[5];
511:           x[i2 + 6] = xw[6];
512:           idiag += 49;
513:           i2 += 7;
514:         }
515:         break;
516:       default:
517:         PetscKernel_w_gets_Ar_times_v(bs, bs, b, idiag, x);
518:         PetscCall(PetscArraycpy(t, b, bs));
519:         i2 = bs;
520:         idiag += bs2;
521:         for (i = 1; i < m; i++) {
522:           v  = aa + bs2 * ai[i];
523:           vi = aj + ai[i];
524:           nz = diag[i] - ai[i];

526:           PetscCall(PetscArraycpy(w, b + i2, bs));
527:           /* copy all rows of x that are needed into contiguous space */
528:           workt = work;
529:           for (j = 0; j < nz; j++) {
530:             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
531:             workt += bs;
532:           }
533:           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
534:           PetscCall(PetscArraycpy(t + i2, w, bs));
535:           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);

537:           idiag += bs2;
538:           i2 += bs;
539:         }
540:         break;
541:       }
542:       /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
543:       PetscCall(PetscLogFlops(1.0 * bs2 * a->nz));
544:       xb = t;
545:     } else xb = b;
546:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
547:       idiag = a->idiag + bs2 * (a->mbs - 1);
548:       i2    = bs * (m - 1);
549:       switch (bs) {
550:       case 1:
551:         s[0] = xb[i2];
552:         PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
553:         x[i2] = xw[0];
554:         i2 -= 1;
555:         for (i = m - 2; i >= 0; i--) {
556:           v    = aa + (diag[i] + 1);
557:           vi   = aj + diag[i] + 1;
558:           nz   = ai[i + 1] - diag[i] - 1;
559:           s[0] = xb[i2];
560:           for (j = 0; j < nz; j++) {
561:             xw[0] = x[vi[j]];
562:             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
563:           }
564:           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
565:           x[i2] = xw[0];
566:           idiag -= 1;
567:           i2 -= 1;
568:         }
569:         break;
570:       case 2:
571:         s[0] = xb[i2];
572:         s[1] = xb[i2 + 1];
573:         PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
574:         x[i2]     = xw[0];
575:         x[i2 + 1] = xw[1];
576:         i2 -= 2;
577:         idiag -= 4;
578:         for (i = m - 2; i >= 0; i--) {
579:           v    = aa + 4 * (diag[i] + 1);
580:           vi   = aj + diag[i] + 1;
581:           nz   = ai[i + 1] - diag[i] - 1;
582:           s[0] = xb[i2];
583:           s[1] = xb[i2 + 1];
584:           for (j = 0; j < nz; j++) {
585:             idx   = 2 * vi[j];
586:             it    = 4 * j;
587:             xw[0] = x[idx];
588:             xw[1] = x[1 + idx];
589:             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
590:           }
591:           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
592:           x[i2]     = xw[0];
593:           x[i2 + 1] = xw[1];
594:           idiag -= 4;
595:           i2 -= 2;
596:         }
597:         break;
598:       case 3:
599:         s[0] = xb[i2];
600:         s[1] = xb[i2 + 1];
601:         s[2] = xb[i2 + 2];
602:         PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
603:         x[i2]     = xw[0];
604:         x[i2 + 1] = xw[1];
605:         x[i2 + 2] = xw[2];
606:         i2 -= 3;
607:         idiag -= 9;
608:         for (i = m - 2; i >= 0; i--) {
609:           v    = aa + 9 * (diag[i] + 1);
610:           vi   = aj + diag[i] + 1;
611:           nz   = ai[i + 1] - diag[i] - 1;
612:           s[0] = xb[i2];
613:           s[1] = xb[i2 + 1];
614:           s[2] = xb[i2 + 2];
615:           while (nz--) {
616:             idx   = 3 * (*vi++);
617:             xw[0] = x[idx];
618:             xw[1] = x[1 + idx];
619:             xw[2] = x[2 + idx];
620:             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
621:             v += 9;
622:           }
623:           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
624:           x[i2]     = xw[0];
625:           x[i2 + 1] = xw[1];
626:           x[i2 + 2] = xw[2];
627:           idiag -= 9;
628:           i2 -= 3;
629:         }
630:         break;
631:       case 4:
632:         s[0] = xb[i2];
633:         s[1] = xb[i2 + 1];
634:         s[2] = xb[i2 + 2];
635:         s[3] = xb[i2 + 3];
636:         PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
637:         x[i2]     = xw[0];
638:         x[i2 + 1] = xw[1];
639:         x[i2 + 2] = xw[2];
640:         x[i2 + 3] = xw[3];
641:         i2 -= 4;
642:         idiag -= 16;
643:         for (i = m - 2; i >= 0; i--) {
644:           v    = aa + 16 * (diag[i] + 1);
645:           vi   = aj + diag[i] + 1;
646:           nz   = ai[i + 1] - diag[i] - 1;
647:           s[0] = xb[i2];
648:           s[1] = xb[i2 + 1];
649:           s[2] = xb[i2 + 2];
650:           s[3] = xb[i2 + 3];
651:           while (nz--) {
652:             idx   = 4 * (*vi++);
653:             xw[0] = x[idx];
654:             xw[1] = x[1 + idx];
655:             xw[2] = x[2 + idx];
656:             xw[3] = x[3 + idx];
657:             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
658:             v += 16;
659:           }
660:           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
661:           x[i2]     = xw[0];
662:           x[i2 + 1] = xw[1];
663:           x[i2 + 2] = xw[2];
664:           x[i2 + 3] = xw[3];
665:           idiag -= 16;
666:           i2 -= 4;
667:         }
668:         break;
669:       case 5:
670:         s[0] = xb[i2];
671:         s[1] = xb[i2 + 1];
672:         s[2] = xb[i2 + 2];
673:         s[3] = xb[i2 + 3];
674:         s[4] = xb[i2 + 4];
675:         PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
676:         x[i2]     = xw[0];
677:         x[i2 + 1] = xw[1];
678:         x[i2 + 2] = xw[2];
679:         x[i2 + 3] = xw[3];
680:         x[i2 + 4] = xw[4];
681:         i2 -= 5;
682:         idiag -= 25;
683:         for (i = m - 2; i >= 0; i--) {
684:           v    = aa + 25 * (diag[i] + 1);
685:           vi   = aj + diag[i] + 1;
686:           nz   = ai[i + 1] - diag[i] - 1;
687:           s[0] = xb[i2];
688:           s[1] = xb[i2 + 1];
689:           s[2] = xb[i2 + 2];
690:           s[3] = xb[i2 + 3];
691:           s[4] = xb[i2 + 4];
692:           while (nz--) {
693:             idx   = 5 * (*vi++);
694:             xw[0] = x[idx];
695:             xw[1] = x[1 + idx];
696:             xw[2] = x[2 + idx];
697:             xw[3] = x[3 + idx];
698:             xw[4] = x[4 + idx];
699:             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
700:             v += 25;
701:           }
702:           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
703:           x[i2]     = xw[0];
704:           x[i2 + 1] = xw[1];
705:           x[i2 + 2] = xw[2];
706:           x[i2 + 3] = xw[3];
707:           x[i2 + 4] = xw[4];
708:           idiag -= 25;
709:           i2 -= 5;
710:         }
711:         break;
712:       case 6:
713:         s[0] = xb[i2];
714:         s[1] = xb[i2 + 1];
715:         s[2] = xb[i2 + 2];
716:         s[3] = xb[i2 + 3];
717:         s[4] = xb[i2 + 4];
718:         s[5] = xb[i2 + 5];
719:         PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
720:         x[i2]     = xw[0];
721:         x[i2 + 1] = xw[1];
722:         x[i2 + 2] = xw[2];
723:         x[i2 + 3] = xw[3];
724:         x[i2 + 4] = xw[4];
725:         x[i2 + 5] = xw[5];
726:         i2 -= 6;
727:         idiag -= 36;
728:         for (i = m - 2; i >= 0; i--) {
729:           v    = aa + 36 * (diag[i] + 1);
730:           vi   = aj + diag[i] + 1;
731:           nz   = ai[i + 1] - diag[i] - 1;
732:           s[0] = xb[i2];
733:           s[1] = xb[i2 + 1];
734:           s[2] = xb[i2 + 2];
735:           s[3] = xb[i2 + 3];
736:           s[4] = xb[i2 + 4];
737:           s[5] = xb[i2 + 5];
738:           while (nz--) {
739:             idx   = 6 * (*vi++);
740:             xw[0] = x[idx];
741:             xw[1] = x[1 + idx];
742:             xw[2] = x[2 + idx];
743:             xw[3] = x[3 + idx];
744:             xw[4] = x[4 + idx];
745:             xw[5] = x[5 + idx];
746:             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
747:             v += 36;
748:           }
749:           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
750:           x[i2]     = xw[0];
751:           x[i2 + 1] = xw[1];
752:           x[i2 + 2] = xw[2];
753:           x[i2 + 3] = xw[3];
754:           x[i2 + 4] = xw[4];
755:           x[i2 + 5] = xw[5];
756:           idiag -= 36;
757:           i2 -= 6;
758:         }
759:         break;
760:       case 7:
761:         s[0] = xb[i2];
762:         s[1] = xb[i2 + 1];
763:         s[2] = xb[i2 + 2];
764:         s[3] = xb[i2 + 3];
765:         s[4] = xb[i2 + 4];
766:         s[5] = xb[i2 + 5];
767:         s[6] = xb[i2 + 6];
768:         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
769:         x[i2]     = xw[0];
770:         x[i2 + 1] = xw[1];
771:         x[i2 + 2] = xw[2];
772:         x[i2 + 3] = xw[3];
773:         x[i2 + 4] = xw[4];
774:         x[i2 + 5] = xw[5];
775:         x[i2 + 6] = xw[6];
776:         i2 -= 7;
777:         idiag -= 49;
778:         for (i = m - 2; i >= 0; i--) {
779:           v    = aa + 49 * (diag[i] + 1);
780:           vi   = aj + diag[i] + 1;
781:           nz   = ai[i + 1] - diag[i] - 1;
782:           s[0] = xb[i2];
783:           s[1] = xb[i2 + 1];
784:           s[2] = xb[i2 + 2];
785:           s[3] = xb[i2 + 3];
786:           s[4] = xb[i2 + 4];
787:           s[5] = xb[i2 + 5];
788:           s[6] = xb[i2 + 6];
789:           while (nz--) {
790:             idx   = 7 * (*vi++);
791:             xw[0] = x[idx];
792:             xw[1] = x[1 + idx];
793:             xw[2] = x[2 + idx];
794:             xw[3] = x[3 + idx];
795:             xw[4] = x[4 + idx];
796:             xw[5] = x[5 + idx];
797:             xw[6] = x[6 + idx];
798:             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
799:             v += 49;
800:           }
801:           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
802:           x[i2]     = xw[0];
803:           x[i2 + 1] = xw[1];
804:           x[i2 + 2] = xw[2];
805:           x[i2 + 3] = xw[3];
806:           x[i2 + 4] = xw[4];
807:           x[i2 + 5] = xw[5];
808:           x[i2 + 6] = xw[6];
809:           idiag -= 49;
810:           i2 -= 7;
811:         }
812:         break;
813:       default:
814:         PetscCall(PetscArraycpy(w, xb + i2, bs));
815:         PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
816:         i2 -= bs;
817:         idiag -= bs2;
818:         for (i = m - 2; i >= 0; i--) {
819:           v  = aa + bs2 * (diag[i] + 1);
820:           vi = aj + diag[i] + 1;
821:           nz = ai[i + 1] - diag[i] - 1;

823:           PetscCall(PetscArraycpy(w, xb + i2, bs));
824:           /* copy all rows of x that are needed into contiguous space */
825:           workt = work;
826:           for (j = 0; j < nz; j++) {
827:             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
828:             workt += bs;
829:           }
830:           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
831:           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);

833:           idiag -= bs2;
834:           i2 -= bs;
835:         }
836:         break;
837:       }
838:       PetscCall(PetscLogFlops(1.0 * bs2 * (a->nz)));
839:     }
840:     its--;
841:   }
842:   while (its--) {
843:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
844:       idiag = a->idiag;
845:       i2    = 0;
846:       switch (bs) {
847:       case 1:
848:         for (i = 0; i < m; i++) {
849:           v    = aa + ai[i];
850:           vi   = aj + ai[i];
851:           nz   = ai[i + 1] - ai[i];
852:           s[0] = b[i2];
853:           for (j = 0; j < nz; j++) {
854:             xw[0] = x[vi[j]];
855:             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
856:           }
857:           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
858:           x[i2] += xw[0];
859:           idiag += 1;
860:           i2 += 1;
861:         }
862:         break;
863:       case 2:
864:         for (i = 0; i < m; i++) {
865:           v    = aa + 4 * ai[i];
866:           vi   = aj + ai[i];
867:           nz   = ai[i + 1] - ai[i];
868:           s[0] = b[i2];
869:           s[1] = b[i2 + 1];
870:           for (j = 0; j < nz; j++) {
871:             idx   = 2 * vi[j];
872:             it    = 4 * j;
873:             xw[0] = x[idx];
874:             xw[1] = x[1 + idx];
875:             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
876:           }
877:           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
878:           x[i2] += xw[0];
879:           x[i2 + 1] += xw[1];
880:           idiag += 4;
881:           i2 += 2;
882:         }
883:         break;
884:       case 3:
885:         for (i = 0; i < m; i++) {
886:           v    = aa + 9 * ai[i];
887:           vi   = aj + ai[i];
888:           nz   = ai[i + 1] - ai[i];
889:           s[0] = b[i2];
890:           s[1] = b[i2 + 1];
891:           s[2] = b[i2 + 2];
892:           while (nz--) {
893:             idx   = 3 * (*vi++);
894:             xw[0] = x[idx];
895:             xw[1] = x[1 + idx];
896:             xw[2] = x[2 + idx];
897:             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
898:             v += 9;
899:           }
900:           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
901:           x[i2] += xw[0];
902:           x[i2 + 1] += xw[1];
903:           x[i2 + 2] += xw[2];
904:           idiag += 9;
905:           i2 += 3;
906:         }
907:         break;
908:       case 4:
909:         for (i = 0; i < m; i++) {
910:           v    = aa + 16 * ai[i];
911:           vi   = aj + ai[i];
912:           nz   = ai[i + 1] - ai[i];
913:           s[0] = b[i2];
914:           s[1] = b[i2 + 1];
915:           s[2] = b[i2 + 2];
916:           s[3] = b[i2 + 3];
917:           while (nz--) {
918:             idx   = 4 * (*vi++);
919:             xw[0] = x[idx];
920:             xw[1] = x[1 + idx];
921:             xw[2] = x[2 + idx];
922:             xw[3] = x[3 + idx];
923:             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
924:             v += 16;
925:           }
926:           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
927:           x[i2] += xw[0];
928:           x[i2 + 1] += xw[1];
929:           x[i2 + 2] += xw[2];
930:           x[i2 + 3] += xw[3];
931:           idiag += 16;
932:           i2 += 4;
933:         }
934:         break;
935:       case 5:
936:         for (i = 0; i < m; i++) {
937:           v    = aa + 25 * ai[i];
938:           vi   = aj + ai[i];
939:           nz   = ai[i + 1] - ai[i];
940:           s[0] = b[i2];
941:           s[1] = b[i2 + 1];
942:           s[2] = b[i2 + 2];
943:           s[3] = b[i2 + 3];
944:           s[4] = b[i2 + 4];
945:           while (nz--) {
946:             idx   = 5 * (*vi++);
947:             xw[0] = x[idx];
948:             xw[1] = x[1 + idx];
949:             xw[2] = x[2 + idx];
950:             xw[3] = x[3 + idx];
951:             xw[4] = x[4 + idx];
952:             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
953:             v += 25;
954:           }
955:           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
956:           x[i2] += xw[0];
957:           x[i2 + 1] += xw[1];
958:           x[i2 + 2] += xw[2];
959:           x[i2 + 3] += xw[3];
960:           x[i2 + 4] += xw[4];
961:           idiag += 25;
962:           i2 += 5;
963:         }
964:         break;
965:       case 6:
966:         for (i = 0; i < m; i++) {
967:           v    = aa + 36 * ai[i];
968:           vi   = aj + ai[i];
969:           nz   = ai[i + 1] - ai[i];
970:           s[0] = b[i2];
971:           s[1] = b[i2 + 1];
972:           s[2] = b[i2 + 2];
973:           s[3] = b[i2 + 3];
974:           s[4] = b[i2 + 4];
975:           s[5] = b[i2 + 5];
976:           while (nz--) {
977:             idx   = 6 * (*vi++);
978:             xw[0] = x[idx];
979:             xw[1] = x[1 + idx];
980:             xw[2] = x[2 + idx];
981:             xw[3] = x[3 + idx];
982:             xw[4] = x[4 + idx];
983:             xw[5] = x[5 + idx];
984:             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
985:             v += 36;
986:           }
987:           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
988:           x[i2] += xw[0];
989:           x[i2 + 1] += xw[1];
990:           x[i2 + 2] += xw[2];
991:           x[i2 + 3] += xw[3];
992:           x[i2 + 4] += xw[4];
993:           x[i2 + 5] += xw[5];
994:           idiag += 36;
995:           i2 += 6;
996:         }
997:         break;
998:       case 7:
999:         for (i = 0; i < m; i++) {
1000:           v    = aa + 49 * ai[i];
1001:           vi   = aj + ai[i];
1002:           nz   = ai[i + 1] - ai[i];
1003:           s[0] = b[i2];
1004:           s[1] = b[i2 + 1];
1005:           s[2] = b[i2 + 2];
1006:           s[3] = b[i2 + 3];
1007:           s[4] = b[i2 + 4];
1008:           s[5] = b[i2 + 5];
1009:           s[6] = b[i2 + 6];
1010:           while (nz--) {
1011:             idx   = 7 * (*vi++);
1012:             xw[0] = x[idx];
1013:             xw[1] = x[1 + idx];
1014:             xw[2] = x[2 + idx];
1015:             xw[3] = x[3 + idx];
1016:             xw[4] = x[4 + idx];
1017:             xw[5] = x[5 + idx];
1018:             xw[6] = x[6 + idx];
1019:             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1020:             v += 49;
1021:           }
1022:           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1023:           x[i2] += xw[0];
1024:           x[i2 + 1] += xw[1];
1025:           x[i2 + 2] += xw[2];
1026:           x[i2 + 3] += xw[3];
1027:           x[i2 + 4] += xw[4];
1028:           x[i2 + 5] += xw[5];
1029:           x[i2 + 6] += xw[6];
1030:           idiag += 49;
1031:           i2 += 7;
1032:         }
1033:         break;
1034:       default:
1035:         for (i = 0; i < m; i++) {
1036:           v  = aa + bs2 * ai[i];
1037:           vi = aj + ai[i];
1038:           nz = ai[i + 1] - ai[i];

1040:           PetscCall(PetscArraycpy(w, b + i2, bs));
1041:           /* copy all rows of x that are needed into contiguous space */
1042:           workt = work;
1043:           for (j = 0; j < nz; j++) {
1044:             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
1045:             workt += bs;
1046:           }
1047:           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1048:           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);

1050:           idiag += bs2;
1051:           i2 += bs;
1052:         }
1053:         break;
1054:       }
1055:       PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1056:     }
1057:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
1058:       idiag = a->idiag + bs2 * (a->mbs - 1);
1059:       i2    = bs * (m - 1);
1060:       switch (bs) {
1061:       case 1:
1062:         for (i = m - 1; i >= 0; i--) {
1063:           v    = aa + ai[i];
1064:           vi   = aj + ai[i];
1065:           nz   = ai[i + 1] - ai[i];
1066:           s[0] = b[i2];
1067:           for (j = 0; j < nz; j++) {
1068:             xw[0] = x[vi[j]];
1069:             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
1070:           }
1071:           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
1072:           x[i2] += xw[0];
1073:           idiag -= 1;
1074:           i2 -= 1;
1075:         }
1076:         break;
1077:       case 2:
1078:         for (i = m - 1; i >= 0; i--) {
1079:           v    = aa + 4 * ai[i];
1080:           vi   = aj + ai[i];
1081:           nz   = ai[i + 1] - ai[i];
1082:           s[0] = b[i2];
1083:           s[1] = b[i2 + 1];
1084:           for (j = 0; j < nz; j++) {
1085:             idx   = 2 * vi[j];
1086:             it    = 4 * j;
1087:             xw[0] = x[idx];
1088:             xw[1] = x[1 + idx];
1089:             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
1090:           }
1091:           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
1092:           x[i2] += xw[0];
1093:           x[i2 + 1] += xw[1];
1094:           idiag -= 4;
1095:           i2 -= 2;
1096:         }
1097:         break;
1098:       case 3:
1099:         for (i = m - 1; i >= 0; i--) {
1100:           v    = aa + 9 * ai[i];
1101:           vi   = aj + ai[i];
1102:           nz   = ai[i + 1] - ai[i];
1103:           s[0] = b[i2];
1104:           s[1] = b[i2 + 1];
1105:           s[2] = b[i2 + 2];
1106:           while (nz--) {
1107:             idx   = 3 * (*vi++);
1108:             xw[0] = x[idx];
1109:             xw[1] = x[1 + idx];
1110:             xw[2] = x[2 + idx];
1111:             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
1112:             v += 9;
1113:           }
1114:           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
1115:           x[i2] += xw[0];
1116:           x[i2 + 1] += xw[1];
1117:           x[i2 + 2] += xw[2];
1118:           idiag -= 9;
1119:           i2 -= 3;
1120:         }
1121:         break;
1122:       case 4:
1123:         for (i = m - 1; i >= 0; i--) {
1124:           v    = aa + 16 * ai[i];
1125:           vi   = aj + ai[i];
1126:           nz   = ai[i + 1] - ai[i];
1127:           s[0] = b[i2];
1128:           s[1] = b[i2 + 1];
1129:           s[2] = b[i2 + 2];
1130:           s[3] = b[i2 + 3];
1131:           while (nz--) {
1132:             idx   = 4 * (*vi++);
1133:             xw[0] = x[idx];
1134:             xw[1] = x[1 + idx];
1135:             xw[2] = x[2 + idx];
1136:             xw[3] = x[3 + idx];
1137:             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
1138:             v += 16;
1139:           }
1140:           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
1141:           x[i2] += xw[0];
1142:           x[i2 + 1] += xw[1];
1143:           x[i2 + 2] += xw[2];
1144:           x[i2 + 3] += xw[3];
1145:           idiag -= 16;
1146:           i2 -= 4;
1147:         }
1148:         break;
1149:       case 5:
1150:         for (i = m - 1; i >= 0; i--) {
1151:           v    = aa + 25 * ai[i];
1152:           vi   = aj + ai[i];
1153:           nz   = ai[i + 1] - ai[i];
1154:           s[0] = b[i2];
1155:           s[1] = b[i2 + 1];
1156:           s[2] = b[i2 + 2];
1157:           s[3] = b[i2 + 3];
1158:           s[4] = b[i2 + 4];
1159:           while (nz--) {
1160:             idx   = 5 * (*vi++);
1161:             xw[0] = x[idx];
1162:             xw[1] = x[1 + idx];
1163:             xw[2] = x[2 + idx];
1164:             xw[3] = x[3 + idx];
1165:             xw[4] = x[4 + idx];
1166:             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
1167:             v += 25;
1168:           }
1169:           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
1170:           x[i2] += xw[0];
1171:           x[i2 + 1] += xw[1];
1172:           x[i2 + 2] += xw[2];
1173:           x[i2 + 3] += xw[3];
1174:           x[i2 + 4] += xw[4];
1175:           idiag -= 25;
1176:           i2 -= 5;
1177:         }
1178:         break;
1179:       case 6:
1180:         for (i = m - 1; i >= 0; i--) {
1181:           v    = aa + 36 * ai[i];
1182:           vi   = aj + ai[i];
1183:           nz   = ai[i + 1] - ai[i];
1184:           s[0] = b[i2];
1185:           s[1] = b[i2 + 1];
1186:           s[2] = b[i2 + 2];
1187:           s[3] = b[i2 + 3];
1188:           s[4] = b[i2 + 4];
1189:           s[5] = b[i2 + 5];
1190:           while (nz--) {
1191:             idx   = 6 * (*vi++);
1192:             xw[0] = x[idx];
1193:             xw[1] = x[1 + idx];
1194:             xw[2] = x[2 + idx];
1195:             xw[3] = x[3 + idx];
1196:             xw[4] = x[4 + idx];
1197:             xw[5] = x[5 + idx];
1198:             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
1199:             v += 36;
1200:           }
1201:           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
1202:           x[i2] += xw[0];
1203:           x[i2 + 1] += xw[1];
1204:           x[i2 + 2] += xw[2];
1205:           x[i2 + 3] += xw[3];
1206:           x[i2 + 4] += xw[4];
1207:           x[i2 + 5] += xw[5];
1208:           idiag -= 36;
1209:           i2 -= 6;
1210:         }
1211:         break;
1212:       case 7:
1213:         for (i = m - 1; i >= 0; i--) {
1214:           v    = aa + 49 * ai[i];
1215:           vi   = aj + ai[i];
1216:           nz   = ai[i + 1] - ai[i];
1217:           s[0] = b[i2];
1218:           s[1] = b[i2 + 1];
1219:           s[2] = b[i2 + 2];
1220:           s[3] = b[i2 + 3];
1221:           s[4] = b[i2 + 4];
1222:           s[5] = b[i2 + 5];
1223:           s[6] = b[i2 + 6];
1224:           while (nz--) {
1225:             idx   = 7 * (*vi++);
1226:             xw[0] = x[idx];
1227:             xw[1] = x[1 + idx];
1228:             xw[2] = x[2 + idx];
1229:             xw[3] = x[3 + idx];
1230:             xw[4] = x[4 + idx];
1231:             xw[5] = x[5 + idx];
1232:             xw[6] = x[6 + idx];
1233:             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1234:             v += 49;
1235:           }
1236:           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1237:           x[i2] += xw[0];
1238:           x[i2 + 1] += xw[1];
1239:           x[i2 + 2] += xw[2];
1240:           x[i2 + 3] += xw[3];
1241:           x[i2 + 4] += xw[4];
1242:           x[i2 + 5] += xw[5];
1243:           x[i2 + 6] += xw[6];
1244:           idiag -= 49;
1245:           i2 -= 7;
1246:         }
1247:         break;
1248:       default:
1249:         for (i = m - 1; i >= 0; i--) {
1250:           v  = aa + bs2 * ai[i];
1251:           vi = aj + ai[i];
1252:           nz = ai[i + 1] - ai[i];

1254:           PetscCall(PetscArraycpy(w, b + i2, bs));
1255:           /* copy all rows of x that are needed into contiguous space */
1256:           workt = work;
1257:           for (j = 0; j < nz; j++) {
1258:             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
1259:             workt += bs;
1260:           }
1261:           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1262:           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);

1264:           idiag -= bs2;
1265:           i2 -= bs;
1266:         }
1267:         break;
1268:       }
1269:       PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz)));
1270:     }
1271:   }
1272:   PetscCall(VecRestoreArray(xx, &x));
1273:   PetscCall(VecRestoreArrayRead(bb, &b));
1274:   PetscFunctionReturn(PETSC_SUCCESS);
1275: }

1277: /*
1278:     Special version for direct calls from Fortran (Used in PETSc-fun3d)
1279: */
1280: #if defined(PETSC_HAVE_FORTRAN_CAPS)
1281:   #define matsetvaluesblocked4_ MATSETVALUESBLOCKED4
1282: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1283:   #define matsetvaluesblocked4_ matsetvaluesblocked4
1284: #endif

1286: PETSC_EXTERN void matsetvaluesblocked4_(Mat *AA, PetscInt *mm, const PetscInt im[], PetscInt *nn, const PetscInt in[], const PetscScalar v[])
1287: {
1288:   Mat                A = *AA;
1289:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1290:   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, N, m = *mm, n = *nn;
1291:   PetscInt          *ai = a->i, *ailen = a->ilen;
1292:   PetscInt          *aj = a->j, stepval, lastcol = -1;
1293:   const PetscScalar *value = v;
1294:   MatScalar         *ap, *aa = a->a, *bap;

1296:   PetscFunctionBegin;
1297:   if (A->rmap->bs != 4) SETERRABORT(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Can only be called with a block size of 4");
1298:   stepval = (n - 1) * 4;
1299:   for (k = 0; k < m; k++) { /* loop over added rows */
1300:     row  = im[k];
1301:     rp   = aj + ai[row];
1302:     ap   = aa + 16 * ai[row];
1303:     nrow = ailen[row];
1304:     low  = 0;
1305:     high = nrow;
1306:     for (l = 0; l < n; l++) { /* loop over added columns */
1307:       col = in[l];
1308:       if (col <= lastcol) low = 0;
1309:       else high = nrow;
1310:       lastcol = col;
1311:       value   = v + k * (stepval + 4 + l) * 4;
1312:       while (high - low > 7) {
1313:         t = (low + high) / 2;
1314:         if (rp[t] > col) high = t;
1315:         else low = t;
1316:       }
1317:       for (i = low; i < high; i++) {
1318:         if (rp[i] > col) break;
1319:         if (rp[i] == col) {
1320:           bap = ap + 16 * i;
1321:           for (ii = 0; ii < 4; ii++, value += stepval) {
1322:             for (jj = ii; jj < 16; jj += 4) bap[jj] += *value++;
1323:           }
1324:           goto noinsert2;
1325:         }
1326:       }
1327:       N = nrow++ - 1;
1328:       high++; /* added new column index thus must search to one higher than before */
1329:       /* shift up all the later entries in this row */
1330:       for (ii = N; ii >= i; ii--) {
1331:         rp[ii + 1] = rp[ii];
1332:         PetscCallVoid(PetscArraycpy(ap + 16 * (ii + 1), ap + 16 * (ii), 16));
1333:       }
1334:       if (N >= i) PetscCallVoid(PetscArrayzero(ap + 16 * i, 16));
1335:       rp[i] = col;
1336:       bap   = ap + 16 * i;
1337:       for (ii = 0; ii < 4; ii++, value += stepval) {
1338:         for (jj = ii; jj < 16; jj += 4) bap[jj] = *value++;
1339:       }
1340:     noinsert2:;
1341:       low = i;
1342:     }
1343:     ailen[row] = nrow;
1344:   }
1345:   PetscFunctionReturnVoid();
1346: }

1348: #if defined(PETSC_HAVE_FORTRAN_CAPS)
1349:   #define matsetvalues4_ MATSETVALUES4
1350: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1351:   #define matsetvalues4_ matsetvalues4
1352: #endif

1354: PETSC_EXTERN void matsetvalues4_(Mat *AA, PetscInt *mm, PetscInt *im, PetscInt *nn, PetscInt *in, PetscScalar *v)
1355: {
1356:   Mat          A = *AA;
1357:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1358:   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, N, n = *nn, m = *mm;
1359:   PetscInt    *ai = a->i, *ailen = a->ilen;
1360:   PetscInt    *aj = a->j, brow, bcol;
1361:   PetscInt     ridx, cidx, lastcol = -1;
1362:   MatScalar   *ap, value, *aa      = a->a, *bap;

1364:   PetscFunctionBegin;
1365:   for (k = 0; k < m; k++) { /* loop over added rows */
1366:     row  = im[k];
1367:     brow = row / 4;
1368:     rp   = aj + ai[brow];
1369:     ap   = aa + 16 * ai[brow];
1370:     nrow = ailen[brow];
1371:     low  = 0;
1372:     high = nrow;
1373:     for (l = 0; l < n; l++) { /* loop over added columns */
1374:       col   = in[l];
1375:       bcol  = col / 4;
1376:       ridx  = row % 4;
1377:       cidx  = col % 4;
1378:       value = v[l + k * n];
1379:       if (col <= lastcol) low = 0;
1380:       else high = nrow;
1381:       lastcol = col;
1382:       while (high - low > 7) {
1383:         t = (low + high) / 2;
1384:         if (rp[t] > bcol) high = t;
1385:         else low = t;
1386:       }
1387:       for (i = low; i < high; i++) {
1388:         if (rp[i] > bcol) break;
1389:         if (rp[i] == bcol) {
1390:           bap = ap + 16 * i + 4 * cidx + ridx;
1391:           *bap += value;
1392:           goto noinsert1;
1393:         }
1394:       }
1395:       N = nrow++ - 1;
1396:       high++; /* added new column thus must search to one higher than before */
1397:       /* shift up all the later entries in this row */
1398:       PetscCallVoid(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
1399:       PetscCallVoid(PetscArraymove(ap + 16 * i + 16, ap + 16 * i, 16 * (N - i + 1)));
1400:       PetscCallVoid(PetscArrayzero(ap + 16 * i, 16));
1401:       rp[i]                        = bcol;
1402:       ap[16 * i + 4 * cidx + ridx] = value;
1403:     noinsert1:;
1404:       low = i;
1405:     }
1406:     ailen[brow] = nrow;
1407:   }
1408:   PetscFunctionReturnVoid();
1409: }

1411: /*
1412:      Checks for missing diagonals
1413: */
1414: PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat A, PetscBool *missing, PetscInt *d)
1415: {
1416:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1417:   PetscInt    *diag, *ii = a->i, i;

1419:   PetscFunctionBegin;
1420:   PetscCall(MatMarkDiagonal_SeqBAIJ(A));
1421:   *missing = PETSC_FALSE;
1422:   if (A->rmap->n > 0 && !ii) {
1423:     *missing = PETSC_TRUE;
1424:     if (d) *d = 0;
1425:     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
1426:   } else {
1427:     PetscInt n;
1428:     n    = PetscMin(a->mbs, a->nbs);
1429:     diag = a->diag;
1430:     for (i = 0; i < n; i++) {
1431:       if (diag[i] >= ii[i + 1]) {
1432:         *missing = PETSC_TRUE;
1433:         if (d) *d = i;
1434:         PetscCall(PetscInfo(A, "Matrix is missing block diagonal number %" PetscInt_FMT "\n", i));
1435:         break;
1436:       }
1437:     }
1438:   }
1439:   PetscFunctionReturn(PETSC_SUCCESS);
1440: }

1442: PetscErrorCode MatMarkDiagonal_SeqBAIJ(Mat A)
1443: {
1444:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1445:   PetscInt     i, j, m = a->mbs;

1447:   PetscFunctionBegin;
1448:   if (!a->diag) {
1449:     PetscCall(PetscMalloc1(m, &a->diag));
1450:     a->free_diag = PETSC_TRUE;
1451:   }
1452:   for (i = 0; i < m; i++) {
1453:     a->diag[i] = a->i[i + 1];
1454:     for (j = a->i[i]; j < a->i[i + 1]; j++) {
1455:       if (a->j[j] == i) {
1456:         a->diag[i] = j;
1457:         break;
1458:       }
1459:     }
1460:   }
1461:   PetscFunctionReturn(PETSC_SUCCESS);
1462: }

1464: static PetscErrorCode MatGetRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *inia[], const PetscInt *inja[], PetscBool *done)
1465: {
1466:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1467:   PetscInt     i, j, n = a->mbs, nz = a->i[n], *tia, *tja, bs = A->rmap->bs, k, l, cnt;
1468:   PetscInt   **ia = (PetscInt **)inia, **ja = (PetscInt **)inja;

1470:   PetscFunctionBegin;
1471:   *nn = n;
1472:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
1473:   if (symmetric) {
1474:     PetscCall(MatToSymmetricIJ_SeqAIJ(n, a->i, a->j, PETSC_TRUE, 0, 0, &tia, &tja));
1475:     nz = tia[n];
1476:   } else {
1477:     tia = a->i;
1478:     tja = a->j;
1479:   }

1481:   if (!blockcompressed && bs > 1) {
1482:     (*nn) *= bs;
1483:     /* malloc & create the natural set of indices */
1484:     PetscCall(PetscMalloc1((n + 1) * bs, ia));
1485:     if (n) {
1486:       (*ia)[0] = oshift;
1487:       for (j = 1; j < bs; j++) (*ia)[j] = (tia[1] - tia[0]) * bs + (*ia)[j - 1];
1488:     }

1490:     for (i = 1; i < n; i++) {
1491:       (*ia)[i * bs] = (tia[i] - tia[i - 1]) * bs + (*ia)[i * bs - 1];
1492:       for (j = 1; j < bs; j++) (*ia)[i * bs + j] = (tia[i + 1] - tia[i]) * bs + (*ia)[i * bs + j - 1];
1493:     }
1494:     if (n) (*ia)[n * bs] = (tia[n] - tia[n - 1]) * bs + (*ia)[n * bs - 1];

1496:     if (inja) {
1497:       PetscCall(PetscMalloc1(nz * bs * bs, ja));
1498:       cnt = 0;
1499:       for (i = 0; i < n; i++) {
1500:         for (j = 0; j < bs; j++) {
1501:           for (k = tia[i]; k < tia[i + 1]; k++) {
1502:             for (l = 0; l < bs; l++) (*ja)[cnt++] = bs * tja[k] + l;
1503:           }
1504:         }
1505:       }
1506:     }

1508:     if (symmetric) { /* deallocate memory allocated in MatToSymmetricIJ_SeqAIJ() */
1509:       PetscCall(PetscFree(tia));
1510:       PetscCall(PetscFree(tja));
1511:     }
1512:   } else if (oshift == 1) {
1513:     if (symmetric) {
1514:       nz = tia[A->rmap->n / bs];
1515:       /*  add 1 to i and j indices */
1516:       for (i = 0; i < A->rmap->n / bs + 1; i++) tia[i] = tia[i] + 1;
1517:       *ia = tia;
1518:       if (ja) {
1519:         for (i = 0; i < nz; i++) tja[i] = tja[i] + 1;
1520:         *ja = tja;
1521:       }
1522:     } else {
1523:       nz = a->i[A->rmap->n / bs];
1524:       /* malloc space and  add 1 to i and j indices */
1525:       PetscCall(PetscMalloc1(A->rmap->n / bs + 1, ia));
1526:       for (i = 0; i < A->rmap->n / bs + 1; i++) (*ia)[i] = a->i[i] + 1;
1527:       if (ja) {
1528:         PetscCall(PetscMalloc1(nz, ja));
1529:         for (i = 0; i < nz; i++) (*ja)[i] = a->j[i] + 1;
1530:       }
1531:     }
1532:   } else {
1533:     *ia = tia;
1534:     if (ja) *ja = tja;
1535:   }
1536:   PetscFunctionReturn(PETSC_SUCCESS);
1537: }

1539: static PetscErrorCode MatRestoreRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
1540: {
1541:   PetscFunctionBegin;
1542:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
1543:   if ((!blockcompressed && A->rmap->bs > 1) || (symmetric || oshift == 1)) {
1544:     PetscCall(PetscFree(*ia));
1545:     if (ja) PetscCall(PetscFree(*ja));
1546:   }
1547:   PetscFunctionReturn(PETSC_SUCCESS);
1548: }

1550: PetscErrorCode MatDestroy_SeqBAIJ(Mat A)
1551: {
1552:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

1554:   PetscFunctionBegin;
1555:   if (A->hash_active) {
1556:     PetscInt bs;
1557:     A->ops[0] = a->cops;
1558:     PetscCall(PetscHMapIJVDestroy(&a->ht));
1559:     PetscCall(MatGetBlockSize(A, &bs));
1560:     if (bs > 1) PetscCall(PetscHSetIJDestroy(&a->bht));
1561:     PetscCall(PetscFree(a->dnz));
1562:     PetscCall(PetscFree(a->bdnz));
1563:     A->hash_active = PETSC_FALSE;
1564:   }
1565:   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->N, A->cmap->n, a->nz));
1566:   PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
1567:   PetscCall(ISDestroy(&a->row));
1568:   PetscCall(ISDestroy(&a->col));
1569:   if (a->free_diag) PetscCall(PetscFree(a->diag));
1570:   PetscCall(PetscFree(a->idiag));
1571:   if (a->free_imax_ilen) PetscCall(PetscFree2(a->imax, a->ilen));
1572:   PetscCall(PetscFree(a->solve_work));
1573:   PetscCall(PetscFree(a->mult_work));
1574:   PetscCall(PetscFree(a->sor_workt));
1575:   PetscCall(PetscFree(a->sor_work));
1576:   PetscCall(ISDestroy(&a->icol));
1577:   PetscCall(PetscFree(a->saved_values));
1578:   PetscCall(PetscFree2(a->compressedrow.i, a->compressedrow.rindex));

1580:   PetscCall(MatDestroy(&a->sbaijMat));
1581:   PetscCall(MatDestroy(&a->parent));
1582:   PetscCall(PetscFree(A->data));

1584:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
1585:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJGetArray_C", NULL));
1586:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJRestoreArray_C", NULL));
1587:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
1588:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
1589:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetColumnIndices_C", NULL));
1590:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqaij_C", NULL));
1591:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqsbaij_C", NULL));
1592:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocation_C", NULL));
1593:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocationCSR_C", NULL));
1594:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqbstrm_C", NULL));
1595:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatIsTranspose_C", NULL));
1596: #if defined(PETSC_HAVE_HYPRE)
1597:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_hypre_C", NULL));
1598: #endif
1599:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_is_C", NULL));
1600:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
1601:   PetscFunctionReturn(PETSC_SUCCESS);
1602: }

1604: static PetscErrorCode MatSetOption_SeqBAIJ(Mat A, MatOption op, PetscBool flg)
1605: {
1606:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

1608:   PetscFunctionBegin;
1609:   switch (op) {
1610:   case MAT_ROW_ORIENTED:
1611:     a->roworiented = flg;
1612:     break;
1613:   case MAT_KEEP_NONZERO_PATTERN:
1614:     a->keepnonzeropattern = flg;
1615:     break;
1616:   case MAT_NEW_NONZERO_LOCATIONS:
1617:     a->nonew = (flg ? 0 : 1);
1618:     break;
1619:   case MAT_NEW_NONZERO_LOCATION_ERR:
1620:     a->nonew = (flg ? -1 : 0);
1621:     break;
1622:   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1623:     a->nonew = (flg ? -2 : 0);
1624:     break;
1625:   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1626:     a->nounused = (flg ? -1 : 0);
1627:     break;
1628:   default:
1629:     break;
1630:   }
1631:   PetscFunctionReturn(PETSC_SUCCESS);
1632: }

1634: /* used for both SeqBAIJ and SeqSBAIJ matrices */
1635: PetscErrorCode MatGetRow_SeqBAIJ_private(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v, PetscInt *ai, PetscInt *aj, PetscScalar *aa)
1636: {
1637:   PetscInt     itmp, i, j, k, M, bn, bp, *idx_i, bs, bs2;
1638:   MatScalar   *aa_i;
1639:   PetscScalar *v_i;

1641:   PetscFunctionBegin;
1642:   bs  = A->rmap->bs;
1643:   bs2 = bs * bs;
1644:   PetscCheck(row >= 0 && row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);

1646:   bn  = row / bs; /* Block number */
1647:   bp  = row % bs; /* Block Position */
1648:   M   = ai[bn + 1] - ai[bn];
1649:   *nz = bs * M;

1651:   if (v) {
1652:     *v = NULL;
1653:     if (*nz) {
1654:       PetscCall(PetscMalloc1(*nz, v));
1655:       for (i = 0; i < M; i++) { /* for each block in the block row */
1656:         v_i  = *v + i * bs;
1657:         aa_i = aa + bs2 * (ai[bn] + i);
1658:         for (j = bp, k = 0; j < bs2; j += bs, k++) v_i[k] = aa_i[j];
1659:       }
1660:     }
1661:   }

1663:   if (idx) {
1664:     *idx = NULL;
1665:     if (*nz) {
1666:       PetscCall(PetscMalloc1(*nz, idx));
1667:       for (i = 0; i < M; i++) { /* for each block in the block row */
1668:         idx_i = *idx + i * bs;
1669:         itmp  = bs * aj[ai[bn] + i];
1670:         for (j = 0; j < bs; j++) idx_i[j] = itmp++;
1671:       }
1672:     }
1673:   }
1674:   PetscFunctionReturn(PETSC_SUCCESS);
1675: }

1677: PetscErrorCode MatGetRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1678: {
1679:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

1681:   PetscFunctionBegin;
1682:   PetscCall(MatGetRow_SeqBAIJ_private(A, row, nz, idx, v, a->i, a->j, a->a));
1683:   PetscFunctionReturn(PETSC_SUCCESS);
1684: }

1686: PetscErrorCode MatRestoreRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1687: {
1688:   PetscFunctionBegin;
1689:   if (idx) PetscCall(PetscFree(*idx));
1690:   if (v) PetscCall(PetscFree(*v));
1691:   PetscFunctionReturn(PETSC_SUCCESS);
1692: }

1694: static PetscErrorCode MatTranspose_SeqBAIJ(Mat A, MatReuse reuse, Mat *B)
1695: {
1696:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *at;
1697:   Mat          C;
1698:   PetscInt     i, j, k, *aj = a->j, *ai = a->i, bs = A->rmap->bs, mbs = a->mbs, nbs = a->nbs, *atfill;
1699:   PetscInt     bs2 = a->bs2, *ati, *atj, anzj, kr;
1700:   MatScalar   *ata, *aa = a->a;

1702:   PetscFunctionBegin;
1703:   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
1704:   PetscCall(PetscCalloc1(1 + nbs, &atfill));
1705:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
1706:     for (i = 0; i < ai[mbs]; i++) atfill[aj[i]] += 1; /* count num of non-zeros in row aj[i] */

1708:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1709:     PetscCall(MatSetSizes(C, A->cmap->n, A->rmap->N, A->cmap->n, A->rmap->N));
1710:     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1711:     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, atfill));

1713:     at  = (Mat_SeqBAIJ *)C->data;
1714:     ati = at->i;
1715:     for (i = 0; i < nbs; i++) at->ilen[i] = at->imax[i] = ati[i + 1] - ati[i];
1716:   } else {
1717:     C   = *B;
1718:     at  = (Mat_SeqBAIJ *)C->data;
1719:     ati = at->i;
1720:   }

1722:   atj = at->j;
1723:   ata = at->a;

1725:   /* Copy ati into atfill so we have locations of the next free space in atj */
1726:   PetscCall(PetscArraycpy(atfill, ati, nbs));

1728:   /* Walk through A row-wise and mark nonzero entries of A^T. */
1729:   for (i = 0; i < mbs; i++) {
1730:     anzj = ai[i + 1] - ai[i];
1731:     for (j = 0; j < anzj; j++) {
1732:       atj[atfill[*aj]] = i;
1733:       for (kr = 0; kr < bs; kr++) {
1734:         for (k = 0; k < bs; k++) ata[bs2 * atfill[*aj] + k * bs + kr] = *aa++;
1735:       }
1736:       atfill[*aj++] += 1;
1737:     }
1738:   }
1739:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1740:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

1742:   /* Clean up temporary space and complete requests. */
1743:   PetscCall(PetscFree(atfill));

1745:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1746:     PetscCall(MatSetBlockSizes(C, A->cmap->bs, A->rmap->bs));
1747:     *B = C;
1748:   } else {
1749:     PetscCall(MatHeaderMerge(A, &C));
1750:   }
1751:   PetscFunctionReturn(PETSC_SUCCESS);
1752: }

1754: static PetscErrorCode MatCompare_SeqBAIJ_Private(Mat A, Mat B, PetscReal tol, PetscBool *flg)
1755: {
1756:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;

1758:   PetscFunctionBegin;
1759:   /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1760:   if (A->rmap->N != B->rmap->N || A->cmap->n != B->cmap->n || A->rmap->bs != B->rmap->bs || a->nz != b->nz) {
1761:     *flg = PETSC_FALSE;
1762:     PetscFunctionReturn(PETSC_SUCCESS);
1763:   }

1765:   /* if the a->i are the same */
1766:   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1767:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1769:   /* if a->j are the same */
1770:   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1771:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1773:   if (tol == 0.0) PetscCall(PetscArraycmp(a->a, b->a, a->nz * A->rmap->bs * A->rmap->bs, flg)); /* if a->a are the same */
1774:   else {
1775:     *flg = PETSC_TRUE;
1776:     for (PetscInt i = 0; (i < a->nz * A->rmap->bs * A->rmap->bs) && *flg; ++i)
1777:       if (PetscAbsScalar(a->a[i] - b->a[i]) > tol) *flg = PETSC_FALSE;
1778:   }
1779:   PetscFunctionReturn(PETSC_SUCCESS);
1780: }

1782: static PetscErrorCode MatIsTranspose_SeqBAIJ(Mat A, Mat B, PetscReal tol, PetscBool *f)
1783: {
1784:   Mat Btrans;

1786:   PetscFunctionBegin;
1787:   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &Btrans));
1788:   PetscCall(MatCompare_SeqBAIJ_Private(A, Btrans, tol, f));
1789:   PetscCall(MatDestroy(&Btrans));
1790:   PetscFunctionReturn(PETSC_SUCCESS);
1791: }

1793: static PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
1794: {
1795:   PetscFunctionBegin;
1796:   PetscCall(MatCompare_SeqBAIJ_Private(A, B, 0.0, flg));
1797:   PetscFunctionReturn(PETSC_SUCCESS);
1798: }

1800: /* Used for both SeqBAIJ and SeqSBAIJ matrices */
1801: PetscErrorCode MatView_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
1802: {
1803:   Mat_SeqBAIJ *A = (Mat_SeqBAIJ *)mat->data;
1804:   PetscInt     header[4], M, N, m, bs, nz, cnt, i, j, k, l;
1805:   PetscInt    *rowlens, *colidxs;
1806:   PetscScalar *matvals;

1808:   PetscFunctionBegin;
1809:   PetscCall(PetscViewerSetUp(viewer));

1811:   M  = mat->rmap->N;
1812:   N  = mat->cmap->N;
1813:   m  = mat->rmap->n;
1814:   bs = mat->rmap->bs;
1815:   nz = bs * bs * A->nz;

1817:   /* write matrix header */
1818:   header[0] = MAT_FILE_CLASSID;
1819:   header[1] = M;
1820:   header[2] = N;
1821:   header[3] = nz;
1822:   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));

1824:   /* store row lengths */
1825:   PetscCall(PetscMalloc1(m, &rowlens));
1826:   for (cnt = 0, i = 0; i < A->mbs; i++)
1827:     for (j = 0; j < bs; j++) rowlens[cnt++] = bs * (A->i[i + 1] - A->i[i]);
1828:   PetscCall(PetscViewerBinaryWrite(viewer, rowlens, m, PETSC_INT));
1829:   PetscCall(PetscFree(rowlens));

1831:   /* store column indices  */
1832:   PetscCall(PetscMalloc1(nz, &colidxs));
1833:   for (cnt = 0, i = 0; i < A->mbs; i++)
1834:     for (k = 0; k < bs; k++)
1835:       for (j = A->i[i]; j < A->i[i + 1]; j++)
1836:         for (l = 0; l < bs; l++) colidxs[cnt++] = bs * A->j[j] + l;
1837:   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz);
1838:   PetscCall(PetscViewerBinaryWrite(viewer, colidxs, nz, PETSC_INT));
1839:   PetscCall(PetscFree(colidxs));

1841:   /* store nonzero values */
1842:   PetscCall(PetscMalloc1(nz, &matvals));
1843:   for (cnt = 0, i = 0; i < A->mbs; i++)
1844:     for (k = 0; k < bs; k++)
1845:       for (j = A->i[i]; j < A->i[i + 1]; j++)
1846:         for (l = 0; l < bs; l++) matvals[cnt++] = A->a[bs * (bs * j + l) + k];
1847:   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz);
1848:   PetscCall(PetscViewerBinaryWrite(viewer, matvals, nz, PETSC_SCALAR));
1849:   PetscCall(PetscFree(matvals));

1851:   /* write block size option to the viewer's .info file */
1852:   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1853:   PetscFunctionReturn(PETSC_SUCCESS);
1854: }

1856: static PetscErrorCode MatView_SeqBAIJ_ASCII_structonly(Mat A, PetscViewer viewer)
1857: {
1858:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1859:   PetscInt     i, bs = A->rmap->bs, k;

1861:   PetscFunctionBegin;
1862:   PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1863:   for (i = 0; i < a->mbs; i++) {
1864:     PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT "-%" PetscInt_FMT ":", i * bs, i * bs + bs - 1));
1865:     for (k = a->i[i]; k < a->i[i + 1]; k++) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT "-%" PetscInt_FMT ") ", bs * a->j[k], bs * a->j[k] + bs - 1));
1866:     PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1867:   }
1868:   PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1869:   PetscFunctionReturn(PETSC_SUCCESS);
1870: }

1872: static PetscErrorCode MatView_SeqBAIJ_ASCII(Mat A, PetscViewer viewer)
1873: {
1874:   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1875:   PetscInt          i, j, bs = A->rmap->bs, k, l, bs2 = a->bs2;
1876:   PetscViewerFormat format;

1878:   PetscFunctionBegin;
1879:   if (A->structure_only) {
1880:     PetscCall(MatView_SeqBAIJ_ASCII_structonly(A, viewer));
1881:     PetscFunctionReturn(PETSC_SUCCESS);
1882:   }

1884:   PetscCall(PetscViewerGetFormat(viewer, &format));
1885:   if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1886:     PetscCall(PetscViewerASCIIPrintf(viewer, "  block size is %" PetscInt_FMT "\n", bs));
1887:   } else if (format == PETSC_VIEWER_ASCII_MATLAB) {
1888:     const char *matname;
1889:     Mat         aij;
1890:     PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &aij));
1891:     PetscCall(PetscObjectGetName((PetscObject)A, &matname));
1892:     PetscCall(PetscObjectSetName((PetscObject)aij, matname));
1893:     PetscCall(MatView(aij, viewer));
1894:     PetscCall(MatDestroy(&aij));
1895:   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1896:     PetscFunctionReturn(PETSC_SUCCESS);
1897:   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1898:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1899:     for (i = 0; i < a->mbs; i++) {
1900:       for (j = 0; j < bs; j++) {
1901:         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j));
1902:         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1903:           for (l = 0; l < bs; l++) {
1904: #if defined(PETSC_USE_COMPLEX)
1905:             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1906:               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1907:             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1908:               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1909:             } else if (PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1910:               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j])));
1911:             }
1912: #else
1913:             if (a->a[bs2 * k + l * bs + j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]));
1914: #endif
1915:           }
1916:         }
1917:         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1918:       }
1919:     }
1920:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1921:   } else {
1922:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1923:     for (i = 0; i < a->mbs; i++) {
1924:       for (j = 0; j < bs; j++) {
1925:         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j));
1926:         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1927:           for (l = 0; l < bs; l++) {
1928: #if defined(PETSC_USE_COMPLEX)
1929:             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0) {
1930:               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1931:             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0) {
1932:               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1933:             } else {
1934:               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j])));
1935:             }
1936: #else
1937:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]));
1938: #endif
1939:           }
1940:         }
1941:         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1942:       }
1943:     }
1944:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1945:   }
1946:   PetscCall(PetscViewerFlush(viewer));
1947:   PetscFunctionReturn(PETSC_SUCCESS);
1948: }

1950: #include <petscdraw.h>
1951: static PetscErrorCode MatView_SeqBAIJ_Draw_Zoom(PetscDraw draw, void *Aa)
1952: {
1953:   Mat               A = (Mat)Aa;
1954:   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1955:   PetscInt          row, i, j, k, l, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2;
1956:   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1957:   MatScalar        *aa;
1958:   PetscViewer       viewer;
1959:   PetscViewerFormat format;
1960:   int               color;

1962:   PetscFunctionBegin;
1963:   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1964:   PetscCall(PetscViewerGetFormat(viewer, &format));
1965:   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));

1967:   /* loop over matrix elements drawing boxes */

1969:   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1970:     PetscDrawCollectiveBegin(draw);
1971:     /* Blue for negative, Cyan for zero and  Red for positive */
1972:     color = PETSC_DRAW_BLUE;
1973:     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1974:       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1975:         y_l = A->rmap->N - row - 1.0;
1976:         y_r = y_l + 1.0;
1977:         x_l = a->j[j] * bs;
1978:         x_r = x_l + 1.0;
1979:         aa  = a->a + j * bs2;
1980:         for (k = 0; k < bs; k++) {
1981:           for (l = 0; l < bs; l++) {
1982:             if (PetscRealPart(*aa++) >= 0.) continue;
1983:             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1984:           }
1985:         }
1986:       }
1987:     }
1988:     color = PETSC_DRAW_CYAN;
1989:     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1990:       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1991:         y_l = A->rmap->N - row - 1.0;
1992:         y_r = y_l + 1.0;
1993:         x_l = a->j[j] * bs;
1994:         x_r = x_l + 1.0;
1995:         aa  = a->a + j * bs2;
1996:         for (k = 0; k < bs; k++) {
1997:           for (l = 0; l < bs; l++) {
1998:             if (PetscRealPart(*aa++) != 0.) continue;
1999:             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
2000:           }
2001:         }
2002:       }
2003:     }
2004:     color = PETSC_DRAW_RED;
2005:     for (i = 0, row = 0; i < mbs; i++, row += bs) {
2006:       for (j = a->i[i]; j < a->i[i + 1]; j++) {
2007:         y_l = A->rmap->N - row - 1.0;
2008:         y_r = y_l + 1.0;
2009:         x_l = a->j[j] * bs;
2010:         x_r = x_l + 1.0;
2011:         aa  = a->a + j * bs2;
2012:         for (k = 0; k < bs; k++) {
2013:           for (l = 0; l < bs; l++) {
2014:             if (PetscRealPart(*aa++) <= 0.) continue;
2015:             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
2016:           }
2017:         }
2018:       }
2019:     }
2020:     PetscDrawCollectiveEnd(draw);
2021:   } else {
2022:     /* use contour shading to indicate magnitude of values */
2023:     /* first determine max of all nonzero values */
2024:     PetscReal minv = 0.0, maxv = 0.0;
2025:     PetscDraw popup;

2027:     for (i = 0; i < a->nz * a->bs2; i++) {
2028:       if (PetscAbsScalar(a->a[i]) > maxv) maxv = PetscAbsScalar(a->a[i]);
2029:     }
2030:     if (minv >= maxv) maxv = minv + PETSC_SMALL;
2031:     PetscCall(PetscDrawGetPopup(draw, &popup));
2032:     PetscCall(PetscDrawScalePopup(popup, 0.0, maxv));

2034:     PetscDrawCollectiveBegin(draw);
2035:     for (i = 0, row = 0; i < mbs; i++, row += bs) {
2036:       for (j = a->i[i]; j < a->i[i + 1]; j++) {
2037:         y_l = A->rmap->N - row - 1.0;
2038:         y_r = y_l + 1.0;
2039:         x_l = a->j[j] * bs;
2040:         x_r = x_l + 1.0;
2041:         aa  = a->a + j * bs2;
2042:         for (k = 0; k < bs; k++) {
2043:           for (l = 0; l < bs; l++) {
2044:             MatScalar v = *aa++;
2045:             color       = PetscDrawRealToColor(PetscAbsScalar(v), minv, maxv);
2046:             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
2047:           }
2048:         }
2049:       }
2050:     }
2051:     PetscDrawCollectiveEnd(draw);
2052:   }
2053:   PetscFunctionReturn(PETSC_SUCCESS);
2054: }

2056: static PetscErrorCode MatView_SeqBAIJ_Draw(Mat A, PetscViewer viewer)
2057: {
2058:   PetscReal xl, yl, xr, yr, w, h;
2059:   PetscDraw draw;
2060:   PetscBool isnull;

2062:   PetscFunctionBegin;
2063:   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
2064:   PetscCall(PetscDrawIsNull(draw, &isnull));
2065:   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);

2067:   xr = A->cmap->n;
2068:   yr = A->rmap->N;
2069:   h  = yr / 10.0;
2070:   w  = xr / 10.0;
2071:   xr += w;
2072:   yr += h;
2073:   xl = -w;
2074:   yl = -h;
2075:   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
2076:   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
2077:   PetscCall(PetscDrawZoom(draw, MatView_SeqBAIJ_Draw_Zoom, A));
2078:   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
2079:   PetscCall(PetscDrawSave(draw));
2080:   PetscFunctionReturn(PETSC_SUCCESS);
2081: }

2083: PetscErrorCode MatView_SeqBAIJ(Mat A, PetscViewer viewer)
2084: {
2085:   PetscBool isascii, isbinary, isdraw;

2087:   PetscFunctionBegin;
2088:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
2089:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
2090:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
2091:   if (isascii) {
2092:     PetscCall(MatView_SeqBAIJ_ASCII(A, viewer));
2093:   } else if (isbinary) {
2094:     PetscCall(MatView_SeqBAIJ_Binary(A, viewer));
2095:   } else if (isdraw) {
2096:     PetscCall(MatView_SeqBAIJ_Draw(A, viewer));
2097:   } else {
2098:     Mat B;
2099:     PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &B));
2100:     PetscCall(MatView(B, viewer));
2101:     PetscCall(MatDestroy(&B));
2102:   }
2103:   PetscFunctionReturn(PETSC_SUCCESS);
2104: }

2106: PetscErrorCode MatGetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
2107: {
2108:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2109:   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
2110:   PetscInt    *ai = a->i, *ailen = a->ilen;
2111:   PetscInt     brow, bcol, ridx, cidx, bs = A->rmap->bs, bs2 = a->bs2;
2112:   MatScalar   *ap, *aa = a->a;

2114:   PetscFunctionBegin;
2115:   for (k = 0; k < m; k++) { /* loop over rows */
2116:     row  = im[k];
2117:     brow = row / bs;
2118:     if (row < 0) {
2119:       v += n;
2120:       continue;
2121:     } /* negative row */
2122:     PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " too large", row);
2123:     rp   = PetscSafePointerPlusOffset(aj, ai[brow]);
2124:     ap   = PetscSafePointerPlusOffset(aa, bs2 * ai[brow]);
2125:     nrow = ailen[brow];
2126:     for (l = 0; l < n; l++) { /* loop over columns */
2127:       if (in[l] < 0) {
2128:         v++;
2129:         continue;
2130:       } /* negative column */
2131:       PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column %" PetscInt_FMT " too large", in[l]);
2132:       col  = in[l];
2133:       bcol = col / bs;
2134:       cidx = col % bs;
2135:       ridx = row % bs;
2136:       high = nrow;
2137:       low  = 0; /* assume unsorted */
2138:       while (high - low > 5) {
2139:         t = (low + high) / 2;
2140:         if (rp[t] > bcol) high = t;
2141:         else low = t;
2142:       }
2143:       for (i = low; i < high; i++) {
2144:         if (rp[i] > bcol) break;
2145:         if (rp[i] == bcol) {
2146:           *v++ = ap[bs2 * i + bs * cidx + ridx];
2147:           goto finished;
2148:         }
2149:       }
2150:       *v++ = 0.0;
2151:     finished:;
2152:     }
2153:   }
2154:   PetscFunctionReturn(PETSC_SUCCESS);
2155: }

2157: PetscErrorCode MatSetValuesBlocked_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2158: {
2159:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2160:   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, rmax, N, lastcol = -1;
2161:   PetscInt          *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2162:   PetscInt          *aj = a->j, nonew = a->nonew, bs2 = a->bs2, bs = A->rmap->bs, stepval;
2163:   PetscBool          roworiented = a->roworiented;
2164:   const PetscScalar *value       = v;
2165:   MatScalar         *ap = NULL, *aa = a->a, *bap;

2167:   PetscFunctionBegin;
2168:   if (roworiented) {
2169:     stepval = (n - 1) * bs;
2170:   } else {
2171:     stepval = (m - 1) * bs;
2172:   }
2173:   for (k = 0; k < m; k++) { /* loop over added rows */
2174:     row = im[k];
2175:     if (row < 0) continue;
2176:     PetscCheck(row < a->mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block row index too large %" PetscInt_FMT " max %" PetscInt_FMT, row, a->mbs - 1);
2177:     rp = aj + ai[row];
2178:     if (!A->structure_only) ap = aa + bs2 * ai[row];
2179:     rmax = imax[row];
2180:     nrow = ailen[row];
2181:     low  = 0;
2182:     high = nrow;
2183:     for (l = 0; l < n; l++) { /* loop over added columns */
2184:       if (in[l] < 0) continue;
2185:       PetscCheck(in[l] < a->nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block column index too large %" PetscInt_FMT " max %" PetscInt_FMT, in[l], a->nbs - 1);
2186:       col = in[l];
2187:       if (!A->structure_only) {
2188:         if (roworiented) {
2189:           value = v + (k * (stepval + bs) + l) * bs;
2190:         } else {
2191:           value = v + (l * (stepval + bs) + k) * bs;
2192:         }
2193:       }
2194:       if (col <= lastcol) low = 0;
2195:       else high = nrow;
2196:       lastcol = col;
2197:       while (high - low > 7) {
2198:         t = (low + high) / 2;
2199:         if (rp[t] > col) high = t;
2200:         else low = t;
2201:       }
2202:       for (i = low; i < high; i++) {
2203:         if (rp[i] > col) break;
2204:         if (rp[i] == col) {
2205:           if (A->structure_only) goto noinsert2;
2206:           bap = ap + bs2 * i;
2207:           if (roworiented) {
2208:             if (is == ADD_VALUES) {
2209:               for (ii = 0; ii < bs; ii++, value += stepval) {
2210:                 for (jj = ii; jj < bs2; jj += bs) bap[jj] += *value++;
2211:               }
2212:             } else {
2213:               for (ii = 0; ii < bs; ii++, value += stepval) {
2214:                 for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2215:               }
2216:             }
2217:           } else {
2218:             if (is == ADD_VALUES) {
2219:               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2220:                 for (jj = 0; jj < bs; jj++) bap[jj] += value[jj];
2221:                 bap += bs;
2222:               }
2223:             } else {
2224:               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2225:                 for (jj = 0; jj < bs; jj++) bap[jj] = value[jj];
2226:                 bap += bs;
2227:               }
2228:             }
2229:           }
2230:           goto noinsert2;
2231:         }
2232:       }
2233:       if (nonew == 1) goto noinsert2;
2234:       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new blocked index new nonzero block (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
2235:       if (A->structure_only) {
2236:         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, row, col, rmax, ai, aj, rp, imax, nonew, MatScalar);
2237:       } else {
2238:         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, row, col, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2239:       }
2240:       N = nrow++ - 1;
2241:       high++;
2242:       /* shift up all the later entries in this row */
2243:       PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
2244:       rp[i] = col;
2245:       if (!A->structure_only) {
2246:         PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1)));
2247:         bap = ap + bs2 * i;
2248:         if (roworiented) {
2249:           for (ii = 0; ii < bs; ii++, value += stepval) {
2250:             for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2251:           }
2252:         } else {
2253:           for (ii = 0; ii < bs; ii++, value += stepval) {
2254:             for (jj = 0; jj < bs; jj++) *bap++ = *value++;
2255:           }
2256:         }
2257:       }
2258:     noinsert2:;
2259:       low = i;
2260:     }
2261:     ailen[row] = nrow;
2262:   }
2263:   PetscFunctionReturn(PETSC_SUCCESS);
2264: }

2266: PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat A, MatAssemblyType mode)
2267: {
2268:   Mat_SeqBAIJ *a      = (Mat_SeqBAIJ *)A->data;
2269:   PetscInt     fshift = 0, i, *ai = a->i, *aj = a->j, *imax = a->imax;
2270:   PetscInt     m = A->rmap->N, *ip, N, *ailen = a->ilen;
2271:   PetscInt     mbs = a->mbs, bs2 = a->bs2, rmax = 0;
2272:   MatScalar   *aa    = a->a, *ap;
2273:   PetscReal    ratio = 0.6;

2275:   PetscFunctionBegin;
2276:   if (mode == MAT_FLUSH_ASSEMBLY || (A->was_assembled && A->ass_nonzerostate == A->nonzerostate)) PetscFunctionReturn(PETSC_SUCCESS);

2278:   if (m) rmax = ailen[0];
2279:   for (i = 1; i < mbs; i++) {
2280:     /* move each row back by the amount of empty slots (fshift) before it*/
2281:     fshift += imax[i - 1] - ailen[i - 1];
2282:     rmax = PetscMax(rmax, ailen[i]);
2283:     if (fshift) {
2284:       ip = aj + ai[i];
2285:       ap = aa + bs2 * ai[i];
2286:       N  = ailen[i];
2287:       PetscCall(PetscArraymove(ip - fshift, ip, N));
2288:       if (!A->structure_only) PetscCall(PetscArraymove(ap - bs2 * fshift, ap, bs2 * N));
2289:     }
2290:     ai[i] = ai[i - 1] + ailen[i - 1];
2291:   }
2292:   if (mbs) {
2293:     fshift += imax[mbs - 1] - ailen[mbs - 1];
2294:     ai[mbs] = ai[mbs - 1] + ailen[mbs - 1];
2295:   }

2297:   /* reset ilen and imax for each row */
2298:   a->nonzerorowcnt = 0;
2299:   if (A->structure_only) {
2300:     PetscCall(PetscFree2(a->imax, a->ilen));
2301:   } else { /* !A->structure_only */
2302:     for (i = 0; i < mbs; i++) {
2303:       ailen[i] = imax[i] = ai[i + 1] - ai[i];
2304:       a->nonzerorowcnt += ((ai[i + 1] - ai[i]) > 0);
2305:     }
2306:   }
2307:   a->nz = ai[mbs];

2309:   /* diagonals may have moved, so kill the diagonal pointers */
2310:   a->idiagvalid = PETSC_FALSE;
2311:   if (fshift && a->diag) PetscCall(PetscFree(a->diag));
2312:   if (fshift) PetscCheck(a->nounused != -1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unused space detected in matrix: %" PetscInt_FMT " X %" PetscInt_FMT " block size %" PetscInt_FMT ", %" PetscInt_FMT " unneeded", m, A->cmap->n, A->rmap->bs, fshift * bs2);
2313:   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT ", block size %" PetscInt_FMT "; storage space: %" PetscInt_FMT " unneeded, %" PetscInt_FMT " used\n", m, A->cmap->n, A->rmap->bs, fshift * bs2, a->nz * bs2));
2314:   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues is %" PetscInt_FMT "\n", a->reallocs));
2315:   PetscCall(PetscInfo(A, "Most nonzeros blocks in any row is %" PetscInt_FMT "\n", rmax));

2317:   A->info.mallocs += a->reallocs;
2318:   a->reallocs         = 0;
2319:   A->info.nz_unneeded = (PetscReal)fshift * bs2;
2320:   a->rmax             = rmax;

2322:   if (!A->structure_only) PetscCall(MatCheckCompressedRow(A, a->nonzerorowcnt, &a->compressedrow, a->i, mbs, ratio));
2323:   PetscFunctionReturn(PETSC_SUCCESS);
2324: }

2326: /*
2327:    This function returns an array of flags which indicate the locations of contiguous
2328:    blocks that should be zeroed. for eg: if bs = 3  and is = [0,1,2,3,5,6,7,8,9]
2329:    then the resulting sizes = [3,1,1,3,1] corresponding to sets [(0,1,2),(3),(5),(6,7,8),(9)]
2330:    Assume: sizes should be long enough to hold all the values.
2331: */
2332: static PetscErrorCode MatZeroRows_SeqBAIJ_Check_Blocks(PetscInt idx[], PetscInt n, PetscInt bs, PetscInt sizes[], PetscInt *bs_max)
2333: {
2334:   PetscInt j = 0;

2336:   PetscFunctionBegin;
2337:   for (PetscInt i = 0; i < n; j++) {
2338:     PetscInt row = idx[i];
2339:     if (row % bs != 0) { /* Not the beginning of a block */
2340:       sizes[j] = 1;
2341:       i++;
2342:     } else if (i + bs > n) { /* complete block doesn't exist (at idx end) */
2343:       sizes[j] = 1;          /* Also makes sure at least 'bs' values exist for next else */
2344:       i++;
2345:     } else { /* Beginning of the block, so check if the complete block exists */
2346:       PetscBool flg = PETSC_TRUE;
2347:       for (PetscInt k = 1; k < bs; k++) {
2348:         if (row + k != idx[i + k]) { /* break in the block */
2349:           flg = PETSC_FALSE;
2350:           break;
2351:         }
2352:       }
2353:       if (flg) { /* No break in the bs */
2354:         sizes[j] = bs;
2355:         i += bs;
2356:       } else {
2357:         sizes[j] = 1;
2358:         i++;
2359:       }
2360:     }
2361:   }
2362:   *bs_max = j;
2363:   PetscFunctionReturn(PETSC_SUCCESS);
2364: }

2366: PetscErrorCode MatZeroRows_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2367: {
2368:   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2369:   PetscInt           i, j, k, count, *rows;
2370:   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, *sizes, row, bs_max;
2371:   PetscScalar        zero = 0.0;
2372:   MatScalar         *aa;
2373:   const PetscScalar *xx;
2374:   PetscScalar       *bb;

2376:   PetscFunctionBegin;
2377:   /* fix right-hand side if needed */
2378:   if (x && b) {
2379:     PetscCall(VecGetArrayRead(x, &xx));
2380:     PetscCall(VecGetArray(b, &bb));
2381:     for (i = 0; i < is_n; i++) bb[is_idx[i]] = diag * xx[is_idx[i]];
2382:     PetscCall(VecRestoreArrayRead(x, &xx));
2383:     PetscCall(VecRestoreArray(b, &bb));
2384:   }

2386:   /* Make a copy of the IS and  sort it */
2387:   /* allocate memory for rows,sizes */
2388:   PetscCall(PetscMalloc2(is_n, &rows, 2 * is_n, &sizes));

2390:   /* copy IS values to rows, and sort them */
2391:   for (i = 0; i < is_n; i++) rows[i] = is_idx[i];
2392:   PetscCall(PetscSortInt(is_n, rows));

2394:   if (baij->keepnonzeropattern) {
2395:     for (i = 0; i < is_n; i++) sizes[i] = 1;
2396:     bs_max = is_n;
2397:   } else {
2398:     PetscCall(MatZeroRows_SeqBAIJ_Check_Blocks(rows, is_n, bs, sizes, &bs_max));
2399:     A->nonzerostate++;
2400:   }

2402:   for (i = 0, j = 0; i < bs_max; j += sizes[i], i++) {
2403:     row = rows[j];
2404:     PetscCheck(row >= 0 && row <= A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", row);
2405:     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2406:     aa    = baij->a + baij->i[row / bs] * bs2 + (row % bs);
2407:     if (sizes[i] == bs && !baij->keepnonzeropattern) {
2408:       if (diag != (PetscScalar)0.0) {
2409:         if (baij->ilen[row / bs] > 0) {
2410:           baij->ilen[row / bs]       = 1;
2411:           baij->j[baij->i[row / bs]] = row / bs;

2413:           PetscCall(PetscArrayzero(aa, count * bs));
2414:         }
2415:         /* Now insert all the diagonal values for this bs */
2416:         for (k = 0; k < bs; k++) PetscUseTypeMethod(A, setvalues, 1, rows + j + k, 1, rows + j + k, &diag, INSERT_VALUES);
2417:       } else { /* (diag == 0.0) */
2418:         baij->ilen[row / bs] = 0;
2419:       } /* end (diag == 0.0) */
2420:     } else { /* (sizes[i] != bs) */
2421:       PetscAssert(sizes[i] == 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal Error. Value should be 1");
2422:       for (k = 0; k < count; k++) {
2423:         aa[0] = zero;
2424:         aa += bs;
2425:       }
2426:       if (diag != (PetscScalar)0.0) PetscUseTypeMethod(A, setvalues, 1, rows + j, 1, rows + j, &diag, INSERT_VALUES);
2427:     }
2428:   }

2430:   PetscCall(PetscFree2(rows, sizes));
2431:   PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY));
2432:   PetscFunctionReturn(PETSC_SUCCESS);
2433: }

2435: static PetscErrorCode MatZeroRowsColumns_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2436: {
2437:   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2438:   PetscInt           i, j, k, count;
2439:   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, row, col;
2440:   PetscScalar        zero = 0.0;
2441:   MatScalar         *aa;
2442:   const PetscScalar *xx;
2443:   PetscScalar       *bb;
2444:   PetscBool         *zeroed, vecs = PETSC_FALSE;

2446:   PetscFunctionBegin;
2447:   /* fix right-hand side if needed */
2448:   if (x && b) {
2449:     PetscCall(VecGetArrayRead(x, &xx));
2450:     PetscCall(VecGetArray(b, &bb));
2451:     vecs = PETSC_TRUE;
2452:   }

2454:   /* zero the columns */
2455:   PetscCall(PetscCalloc1(A->rmap->n, &zeroed));
2456:   for (i = 0; i < is_n; i++) {
2457:     PetscCheck(is_idx[i] >= 0 && is_idx[i] < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", is_idx[i]);
2458:     zeroed[is_idx[i]] = PETSC_TRUE;
2459:   }
2460:   for (i = 0; i < A->rmap->N; i++) {
2461:     if (!zeroed[i]) {
2462:       row = i / bs;
2463:       for (j = baij->i[row]; j < baij->i[row + 1]; j++) {
2464:         for (k = 0; k < bs; k++) {
2465:           col = bs * baij->j[j] + k;
2466:           if (zeroed[col]) {
2467:             aa = baij->a + j * bs2 + (i % bs) + bs * k;
2468:             if (vecs) bb[i] -= aa[0] * xx[col];
2469:             aa[0] = 0.0;
2470:           }
2471:         }
2472:       }
2473:     } else if (vecs) bb[i] = diag * xx[i];
2474:   }
2475:   PetscCall(PetscFree(zeroed));
2476:   if (vecs) {
2477:     PetscCall(VecRestoreArrayRead(x, &xx));
2478:     PetscCall(VecRestoreArray(b, &bb));
2479:   }

2481:   /* zero the rows */
2482:   for (i = 0; i < is_n; i++) {
2483:     row   = is_idx[i];
2484:     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2485:     aa    = baij->a + baij->i[row / bs] * bs2 + (row % bs);
2486:     for (k = 0; k < count; k++) {
2487:       aa[0] = zero;
2488:       aa += bs;
2489:     }
2490:     if (diag != (PetscScalar)0.0) PetscUseTypeMethod(A, setvalues, 1, &row, 1, &row, &diag, INSERT_VALUES);
2491:   }
2492:   PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY));
2493:   PetscFunctionReturn(PETSC_SUCCESS);
2494: }

2496: PetscErrorCode MatSetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2497: {
2498:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2499:   PetscInt    *rp, k, low, high, t, ii, row, nrow, i, col, l, rmax, N, lastcol = -1;
2500:   PetscInt    *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2501:   PetscInt    *aj = a->j, nonew = a->nonew, bs = A->rmap->bs, brow, bcol;
2502:   PetscInt     ridx, cidx, bs2                 = a->bs2;
2503:   PetscBool    roworiented = a->roworiented;
2504:   MatScalar   *ap = NULL, value = 0.0, *aa = a->a, *bap;

2506:   PetscFunctionBegin;
2507:   for (k = 0; k < m; k++) { /* loop over added rows */
2508:     row  = im[k];
2509:     brow = row / bs;
2510:     if (row < 0) continue;
2511:     PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->N - 1);
2512:     rp = PetscSafePointerPlusOffset(aj, ai[brow]);
2513:     if (!A->structure_only) ap = PetscSafePointerPlusOffset(aa, bs2 * ai[brow]);
2514:     rmax = imax[brow];
2515:     nrow = ailen[brow];
2516:     low  = 0;
2517:     high = nrow;
2518:     for (l = 0; l < n; l++) { /* loop over added columns */
2519:       if (in[l] < 0) continue;
2520:       PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[l], A->cmap->n - 1);
2521:       col  = in[l];
2522:       bcol = col / bs;
2523:       ridx = row % bs;
2524:       cidx = col % bs;
2525:       if (!A->structure_only) {
2526:         if (roworiented) {
2527:           value = v[l + k * n];
2528:         } else {
2529:           value = v[k + l * m];
2530:         }
2531:       }
2532:       if (col <= lastcol) low = 0;
2533:       else high = nrow;
2534:       lastcol = col;
2535:       while (high - low > 7) {
2536:         t = (low + high) / 2;
2537:         if (rp[t] > bcol) high = t;
2538:         else low = t;
2539:       }
2540:       for (i = low; i < high; i++) {
2541:         if (rp[i] > bcol) break;
2542:         if (rp[i] == bcol) {
2543:           bap = PetscSafePointerPlusOffset(ap, bs2 * i + bs * cidx + ridx);
2544:           if (!A->structure_only) {
2545:             if (is == ADD_VALUES) *bap += value;
2546:             else *bap = value;
2547:           }
2548:           goto noinsert1;
2549:         }
2550:       }
2551:       if (nonew == 1) goto noinsert1;
2552:       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
2553:       if (A->structure_only) {
2554:         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, brow, bcol, rmax, ai, aj, rp, imax, nonew, MatScalar);
2555:       } else {
2556:         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, brow, bcol, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2557:       }
2558:       N = nrow++ - 1;
2559:       high++;
2560:       /* shift up all the later entries in this row */
2561:       PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
2562:       rp[i] = bcol;
2563:       if (!A->structure_only) {
2564:         PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1)));
2565:         PetscCall(PetscArrayzero(ap + bs2 * i, bs2));
2566:         ap[bs2 * i + bs * cidx + ridx] = value;
2567:       }
2568:       a->nz++;
2569:     noinsert1:;
2570:       low = i;
2571:     }
2572:     ailen[brow] = nrow;
2573:   }
2574:   PetscFunctionReturn(PETSC_SUCCESS);
2575: }

2577: static PetscErrorCode MatILUFactor_SeqBAIJ(Mat inA, IS row, IS col, const MatFactorInfo *info)
2578: {
2579:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2580:   Mat          outA;
2581:   PetscBool    row_identity, col_identity;

2583:   PetscFunctionBegin;
2584:   PetscCheck(info->levels == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only levels = 0 supported for in-place ILU");
2585:   PetscCall(ISIdentity(row, &row_identity));
2586:   PetscCall(ISIdentity(col, &col_identity));
2587:   PetscCheck(row_identity && col_identity, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column permutations must be identity for in-place ILU");

2589:   outA            = inA;
2590:   inA->factortype = MAT_FACTOR_LU;
2591:   PetscCall(PetscFree(inA->solvertype));
2592:   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &inA->solvertype));

2594:   PetscCall(MatMarkDiagonal_SeqBAIJ(inA));

2596:   PetscCall(PetscObjectReference((PetscObject)row));
2597:   PetscCall(ISDestroy(&a->row));
2598:   a->row = row;
2599:   PetscCall(PetscObjectReference((PetscObject)col));
2600:   PetscCall(ISDestroy(&a->col));
2601:   a->col = col;

2603:   /* Create the invert permutation so that it can be used in MatLUFactorNumeric() */
2604:   PetscCall(ISDestroy(&a->icol));
2605:   PetscCall(ISInvertPermutation(col, PETSC_DECIDE, &a->icol));

2607:   PetscCall(MatSeqBAIJSetNumericFactorization_inplace(inA, (PetscBool)(row_identity && col_identity)));
2608:   if (!a->solve_work) PetscCall(PetscMalloc1(inA->rmap->N + inA->rmap->bs, &a->solve_work));
2609:   PetscCall(MatLUFactorNumeric(outA, inA, info));
2610:   PetscFunctionReturn(PETSC_SUCCESS);
2611: }

2613: static PetscErrorCode MatSeqBAIJSetColumnIndices_SeqBAIJ(Mat mat, const PetscInt *indices)
2614: {
2615:   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)mat->data;

2617:   PetscFunctionBegin;
2618:   baij->nz = baij->maxnz;
2619:   PetscCall(PetscArraycpy(baij->j, indices, baij->nz));
2620:   PetscCall(PetscArraycpy(baij->ilen, baij->imax, baij->mbs));
2621:   PetscFunctionReturn(PETSC_SUCCESS);
2622: }

2624: /*@
2625:   MatSeqBAIJSetColumnIndices - Set the column indices for all the block rows in the matrix.

2627:   Input Parameters:
2628: + mat     - the `MATSEQBAIJ` matrix
2629: - indices - the block column indices

2631:   Level: advanced

2633:   Notes:
2634:   This can be called if you have precomputed the nonzero structure of the
2635:   matrix and want to provide it to the matrix object to improve the performance
2636:   of the `MatSetValues()` operation.

2638:   You MUST have set the correct numbers of nonzeros per row in the call to
2639:   `MatCreateSeqBAIJ()`, and the columns indices MUST be sorted.

2641:   MUST be called before any calls to `MatSetValues()`

2643: .seealso: [](ch_matrices), `Mat`, `MATSEQBAIJ`, `MatSetValues()`
2644: @*/
2645: PetscErrorCode MatSeqBAIJSetColumnIndices(Mat mat, PetscInt *indices)
2646: {
2647:   PetscFunctionBegin;
2649:   PetscAssertPointer(indices, 2);
2650:   PetscUseMethod(mat, "MatSeqBAIJSetColumnIndices_C", (Mat, const PetscInt *), (mat, (const PetscInt *)indices));
2651:   PetscFunctionReturn(PETSC_SUCCESS);
2652: }

2654: static PetscErrorCode MatGetRowMaxAbs_SeqBAIJ(Mat A, Vec v, PetscInt idx[])
2655: {
2656:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2657:   PetscInt     i, j, n, row, bs, *ai, *aj, mbs;
2658:   PetscReal    atmp;
2659:   PetscScalar *x, zero = 0.0;
2660:   MatScalar   *aa;
2661:   PetscInt     ncols, brow, krow, kcol;

2663:   PetscFunctionBegin;
2664:   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2665:   bs  = A->rmap->bs;
2666:   aa  = a->a;
2667:   ai  = a->i;
2668:   aj  = a->j;
2669:   mbs = a->mbs;

2671:   PetscCall(VecSet(v, zero));
2672:   PetscCall(VecGetArray(v, &x));
2673:   PetscCall(VecGetLocalSize(v, &n));
2674:   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
2675:   for (i = 0; i < mbs; i++) {
2676:     ncols = ai[1] - ai[0];
2677:     ai++;
2678:     brow = bs * i;
2679:     for (j = 0; j < ncols; j++) {
2680:       for (kcol = 0; kcol < bs; kcol++) {
2681:         for (krow = 0; krow < bs; krow++) {
2682:           atmp = PetscAbsScalar(*aa);
2683:           aa++;
2684:           row = brow + krow; /* row index */
2685:           if (PetscAbsScalar(x[row]) < atmp) {
2686:             x[row] = atmp;
2687:             if (idx) idx[row] = bs * (*aj) + kcol;
2688:           }
2689:         }
2690:       }
2691:       aj++;
2692:     }
2693:   }
2694:   PetscCall(VecRestoreArray(v, &x));
2695:   PetscFunctionReturn(PETSC_SUCCESS);
2696: }

2698: static PetscErrorCode MatGetRowSumAbs_SeqBAIJ(Mat A, Vec v)
2699: {
2700:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2701:   PetscInt     i, j, n, row, bs, *ai, mbs;
2702:   PetscReal    atmp;
2703:   PetscScalar *x, zero = 0.0;
2704:   MatScalar   *aa;
2705:   PetscInt     ncols, brow, krow, kcol;

2707:   PetscFunctionBegin;
2708:   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2709:   bs  = A->rmap->bs;
2710:   aa  = a->a;
2711:   ai  = a->i;
2712:   mbs = a->mbs;

2714:   PetscCall(VecSet(v, zero));
2715:   PetscCall(VecGetArrayWrite(v, &x));
2716:   PetscCall(VecGetLocalSize(v, &n));
2717:   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
2718:   for (i = 0; i < mbs; i++) {
2719:     ncols = ai[1] - ai[0];
2720:     ai++;
2721:     brow = bs * i;
2722:     for (j = 0; j < ncols; j++) {
2723:       for (kcol = 0; kcol < bs; kcol++) {
2724:         for (krow = 0; krow < bs; krow++) {
2725:           atmp = PetscAbsScalar(*aa);
2726:           aa++;
2727:           row = brow + krow; /* row index */
2728:           x[row] += atmp;
2729:         }
2730:       }
2731:     }
2732:   }
2733:   PetscCall(VecRestoreArrayWrite(v, &x));
2734:   PetscFunctionReturn(PETSC_SUCCESS);
2735: }

2737: static PetscErrorCode MatCopy_SeqBAIJ(Mat A, Mat B, MatStructure str)
2738: {
2739:   PetscFunctionBegin;
2740:   /* If the two matrices have the same copy implementation, use fast copy. */
2741:   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
2742:     Mat_SeqBAIJ *a    = (Mat_SeqBAIJ *)A->data;
2743:     Mat_SeqBAIJ *b    = (Mat_SeqBAIJ *)B->data;
2744:     PetscInt     ambs = a->mbs, bmbs = b->mbs, abs = A->rmap->bs, bbs = B->rmap->bs, bs2 = abs * abs;

2746:     PetscCheck(a->i[ambs] == b->i[bmbs], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzero blocks in matrices A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", a->i[ambs], b->i[bmbs]);
2747:     PetscCheck(abs == bbs, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Block size A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", abs, bbs);
2748:     PetscCall(PetscArraycpy(b->a, a->a, bs2 * a->i[ambs]));
2749:     PetscCall(PetscObjectStateIncrease((PetscObject)B));
2750:   } else {
2751:     PetscCall(MatCopy_Basic(A, B, str));
2752:   }
2753:   PetscFunctionReturn(PETSC_SUCCESS);
2754: }

2756: static PetscErrorCode MatSeqBAIJGetArray_SeqBAIJ(Mat A, PetscScalar *array[])
2757: {
2758:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

2760:   PetscFunctionBegin;
2761:   *array = a->a;
2762:   PetscFunctionReturn(PETSC_SUCCESS);
2763: }

2765: static PetscErrorCode MatSeqBAIJRestoreArray_SeqBAIJ(Mat A, PetscScalar *array[])
2766: {
2767:   PetscFunctionBegin;
2768:   *array = NULL;
2769:   PetscFunctionReturn(PETSC_SUCCESS);
2770: }

2772: PetscErrorCode MatAXPYGetPreallocation_SeqBAIJ(Mat Y, Mat X, PetscInt *nnz)
2773: {
2774:   PetscInt     bs = Y->rmap->bs, mbs = Y->rmap->N / bs;
2775:   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data;
2776:   Mat_SeqBAIJ *y = (Mat_SeqBAIJ *)Y->data;

2778:   PetscFunctionBegin;
2779:   /* Set the number of nonzeros in the new matrix */
2780:   PetscCall(MatAXPYGetPreallocation_SeqX_private(mbs, x->i, x->j, y->i, y->j, nnz));
2781:   PetscFunctionReturn(PETSC_SUCCESS);
2782: }

2784: PetscErrorCode MatAXPY_SeqBAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2785: {
2786:   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data, *y = (Mat_SeqBAIJ *)Y->data;
2787:   PetscInt     bs = Y->rmap->bs, bs2 = bs * bs;
2788:   PetscBLASInt one = 1;

2790:   PetscFunctionBegin;
2791:   if (str == UNKNOWN_NONZERO_PATTERN || (PetscDefined(USE_DEBUG) && str == SAME_NONZERO_PATTERN)) {
2792:     PetscBool e = x->nz == y->nz && x->mbs == y->mbs && bs == X->rmap->bs ? PETSC_TRUE : PETSC_FALSE;
2793:     if (e) {
2794:       PetscCall(PetscArraycmp(x->i, y->i, x->mbs + 1, &e));
2795:       if (e) {
2796:         PetscCall(PetscArraycmp(x->j, y->j, x->i[x->mbs], &e));
2797:         if (e) str = SAME_NONZERO_PATTERN;
2798:       }
2799:     }
2800:     if (!e) PetscCheck(str != SAME_NONZERO_PATTERN, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "MatStructure is not SAME_NONZERO_PATTERN");
2801:   }
2802:   if (str == SAME_NONZERO_PATTERN) {
2803:     PetscScalar  alpha = a;
2804:     PetscBLASInt bnz;
2805:     PetscCall(PetscBLASIntCast(x->nz * bs2, &bnz));
2806:     PetscCallBLAS("BLASaxpy", BLASaxpy_(&bnz, &alpha, x->a, &one, y->a, &one));
2807:     PetscCall(PetscObjectStateIncrease((PetscObject)Y));
2808:   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2809:     PetscCall(MatAXPY_Basic(Y, a, X, str));
2810:   } else {
2811:     Mat       B;
2812:     PetscInt *nnz;
2813:     PetscCheck(bs == X->rmap->bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrices must have same block size");
2814:     PetscCall(PetscMalloc1(Y->rmap->N, &nnz));
2815:     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2816:     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2817:     PetscCall(MatSetSizes(B, Y->rmap->n, Y->cmap->n, Y->rmap->N, Y->cmap->N));
2818:     PetscCall(MatSetBlockSizesFromMats(B, Y, Y));
2819:     PetscCall(MatSetType(B, (MatType)((PetscObject)Y)->type_name));
2820:     PetscCall(MatAXPYGetPreallocation_SeqBAIJ(Y, X, nnz));
2821:     PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz));
2822:     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2823:     PetscCall(MatHeaderMerge(Y, &B));
2824:     PetscCall(PetscFree(nnz));
2825:   }
2826:   PetscFunctionReturn(PETSC_SUCCESS);
2827: }

2829: PETSC_INTERN PetscErrorCode MatConjugate_SeqBAIJ(Mat A)
2830: {
2831: #if PetscDefined(USE_COMPLEX)
2832:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2833:   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2834:   MatScalar   *aa = a->a;

2836:   PetscFunctionBegin;
2837:   for (i = 0; i < nz; i++) aa[i] = PetscConj(aa[i]);
2838:   PetscFunctionReturn(PETSC_SUCCESS);
2839: #else
2840:   (void)A;
2841:   return PETSC_SUCCESS;
2842: #endif
2843: }

2845: static PetscErrorCode MatRealPart_SeqBAIJ(Mat A)
2846: {
2847: #if PetscDefined(USE_COMPLEX)
2848:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2849:   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2850:   MatScalar   *aa = a->a;

2852:   PetscFunctionBegin;
2853:   for (i = 0; i < nz; i++) aa[i] = PetscRealPart(aa[i]);
2854:   PetscFunctionReturn(PETSC_SUCCESS);
2855: #else
2856:   (void)A;
2857:   return PETSC_SUCCESS;
2858: #endif
2859: }

2861: static PetscErrorCode MatImaginaryPart_SeqBAIJ(Mat A)
2862: {
2863: #if PetscDefined(USE_COMPLEX)
2864:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2865:   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2866:   MatScalar   *aa = a->a;

2868:   PetscFunctionBegin;
2869:   for (i = 0; i < nz; i++) aa[i] = PetscImaginaryPart(aa[i]);
2870:   PetscFunctionReturn(PETSC_SUCCESS);
2871: #else
2872:   (void)A;
2873:   return PETSC_SUCCESS;
2874: #endif
2875: }

2877: /*
2878:     Code almost identical to MatGetColumnIJ_SeqAIJ() should share common code
2879: */
2880: static PetscErrorCode MatGetColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2881: {
2882:   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2883:   PetscInt     bs = A->rmap->bs, i, *collengths, *cia, *cja, n = A->cmap->n / bs, m = A->rmap->n / bs;
2884:   PetscInt     nz = a->i[m], row, *jj, mr, col;

2886:   PetscFunctionBegin;
2887:   *nn = n;
2888:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2889:   PetscCheck(!symmetric, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not for BAIJ matrices");
2890:   PetscCall(PetscCalloc1(n, &collengths));
2891:   PetscCall(PetscMalloc1(n + 1, &cia));
2892:   PetscCall(PetscMalloc1(nz, &cja));
2893:   jj = a->j;
2894:   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2895:   cia[0] = oshift;
2896:   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2897:   PetscCall(PetscArrayzero(collengths, n));
2898:   jj = a->j;
2899:   for (row = 0; row < m; row++) {
2900:     mr = a->i[row + 1] - a->i[row];
2901:     for (i = 0; i < mr; i++) {
2902:       col = *jj++;

2904:       cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
2905:     }
2906:   }
2907:   PetscCall(PetscFree(collengths));
2908:   *ia = cia;
2909:   *ja = cja;
2910:   PetscFunctionReturn(PETSC_SUCCESS);
2911: }

2913: static PetscErrorCode MatRestoreColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2914: {
2915:   PetscFunctionBegin;
2916:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2917:   PetscCall(PetscFree(*ia));
2918:   PetscCall(PetscFree(*ja));
2919:   PetscFunctionReturn(PETSC_SUCCESS);
2920: }

2922: /*
2923:  MatGetColumnIJ_SeqBAIJ_Color() and MatRestoreColumnIJ_SeqBAIJ_Color() are customized from
2924:  MatGetColumnIJ_SeqBAIJ() and MatRestoreColumnIJ_SeqBAIJ() by adding an output
2925:  spidx[], index of a->a, to be used in MatTransposeColoringCreate() and MatFDColoringCreate()
2926:  */
2927: PetscErrorCode MatGetColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2928: {
2929:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2930:   PetscInt     i, *collengths, *cia, *cja, n = a->nbs, m = a->mbs;
2931:   PetscInt     nz = a->i[m], row, *jj, mr, col;
2932:   PetscInt    *cspidx;

2934:   PetscFunctionBegin;
2935:   *nn = n;
2936:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);

2938:   PetscCall(PetscCalloc1(n, &collengths));
2939:   PetscCall(PetscMalloc1(n + 1, &cia));
2940:   PetscCall(PetscMalloc1(nz, &cja));
2941:   PetscCall(PetscMalloc1(nz, &cspidx));
2942:   jj = a->j;
2943:   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2944:   cia[0] = oshift;
2945:   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2946:   PetscCall(PetscArrayzero(collengths, n));
2947:   jj = a->j;
2948:   for (row = 0; row < m; row++) {
2949:     mr = a->i[row + 1] - a->i[row];
2950:     for (i = 0; i < mr; i++) {
2951:       col                                         = *jj++;
2952:       cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
2953:       cja[cia[col] + collengths[col]++ - oshift]  = row + oshift;
2954:     }
2955:   }
2956:   PetscCall(PetscFree(collengths));
2957:   *ia    = cia;
2958:   *ja    = cja;
2959:   *spidx = cspidx;
2960:   PetscFunctionReturn(PETSC_SUCCESS);
2961: }

2963: PetscErrorCode MatRestoreColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2964: {
2965:   PetscFunctionBegin;
2966:   PetscCall(MatRestoreColumnIJ_SeqBAIJ(A, oshift, symmetric, inodecompressed, n, ia, ja, done));
2967:   PetscCall(PetscFree(*spidx));
2968:   PetscFunctionReturn(PETSC_SUCCESS);
2969: }

2971: static PetscErrorCode MatShift_SeqBAIJ(Mat Y, PetscScalar a)
2972: {
2973:   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)Y->data;

2975:   PetscFunctionBegin;
2976:   if (!Y->preallocated || !aij->nz) PetscCall(MatSeqBAIJSetPreallocation(Y, Y->rmap->bs, 1, NULL));
2977:   PetscCall(MatShift_Basic(Y, a));
2978:   PetscFunctionReturn(PETSC_SUCCESS);
2979: }

2981: PetscErrorCode MatEliminateZeros_SeqBAIJ(Mat A, PetscBool keep)
2982: {
2983:   Mat_SeqBAIJ *a      = (Mat_SeqBAIJ *)A->data;
2984:   PetscInt     fshift = 0, fshift_prev = 0, i, *ai = a->i, *aj = a->j, *imax = a->imax, j, k;
2985:   PetscInt     m = A->rmap->N, *ailen = a->ilen;
2986:   PetscInt     mbs = a->mbs, bs2 = a->bs2, rmax = 0;
2987:   MatScalar   *aa = a->a, *ap;
2988:   PetscBool    zero;

2990:   PetscFunctionBegin;
2991:   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot eliminate zeros for unassembled matrix");
2992:   if (m) rmax = ailen[0];
2993:   for (i = 1; i <= mbs; i++) {
2994:     for (k = ai[i - 1]; k < ai[i]; k++) {
2995:       zero = PETSC_TRUE;
2996:       ap   = aa + bs2 * k;
2997:       for (j = 0; j < bs2 && zero; j++) {
2998:         if (ap[j] != 0.0) zero = PETSC_FALSE;
2999:       }
3000:       if (zero && (aj[k] != i - 1 || !keep)) fshift++;
3001:       else {
3002:         if (zero && aj[k] == i - 1) PetscCall(PetscInfo(A, "Keep the diagonal block at row %" PetscInt_FMT "\n", i - 1));
3003:         aj[k - fshift] = aj[k];
3004:         PetscCall(PetscArraymove(ap - bs2 * fshift, ap, bs2));
3005:       }
3006:     }
3007:     ai[i - 1] -= fshift_prev;
3008:     fshift_prev  = fshift;
3009:     ailen[i - 1] = imax[i - 1] = ai[i] - fshift - ai[i - 1];
3010:     a->nonzerorowcnt += ((ai[i] - fshift - ai[i - 1]) > 0);
3011:     rmax = PetscMax(rmax, ailen[i - 1]);
3012:   }
3013:   if (fshift) {
3014:     if (mbs) {
3015:       ai[mbs] -= fshift;
3016:       a->nz = ai[mbs];
3017:     }
3018:     PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; zeros eliminated: %" PetscInt_FMT "; nonzeros left: %" PetscInt_FMT "\n", m, A->cmap->n, fshift, a->nz));
3019:     A->nonzerostate++;
3020:     A->info.nz_unneeded += (PetscReal)fshift;
3021:     a->rmax = rmax;
3022:     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
3023:     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
3024:   }
3025:   PetscFunctionReturn(PETSC_SUCCESS);
3026: }

3028: static struct _MatOps MatOps_Values = {MatSetValues_SeqBAIJ,
3029:                                        MatGetRow_SeqBAIJ,
3030:                                        MatRestoreRow_SeqBAIJ,
3031:                                        MatMult_SeqBAIJ_N,
3032:                                        /* 4*/ MatMultAdd_SeqBAIJ_N,
3033:                                        MatMultTranspose_SeqBAIJ,
3034:                                        MatMultTransposeAdd_SeqBAIJ,
3035:                                        NULL,
3036:                                        NULL,
3037:                                        NULL,
3038:                                        /* 10*/ NULL,
3039:                                        MatLUFactor_SeqBAIJ,
3040:                                        NULL,
3041:                                        NULL,
3042:                                        MatTranspose_SeqBAIJ,
3043:                                        /* 15*/ MatGetInfo_SeqBAIJ,
3044:                                        MatEqual_SeqBAIJ,
3045:                                        MatGetDiagonal_SeqBAIJ,
3046:                                        MatDiagonalScale_SeqBAIJ,
3047:                                        MatNorm_SeqBAIJ,
3048:                                        /* 20*/ NULL,
3049:                                        MatAssemblyEnd_SeqBAIJ,
3050:                                        MatSetOption_SeqBAIJ,
3051:                                        MatZeroEntries_SeqBAIJ,
3052:                                        /* 24*/ MatZeroRows_SeqBAIJ,
3053:                                        NULL,
3054:                                        NULL,
3055:                                        NULL,
3056:                                        NULL,
3057:                                        /* 29*/ MatSetUp_Seq_Hash,
3058:                                        NULL,
3059:                                        NULL,
3060:                                        NULL,
3061:                                        NULL,
3062:                                        /* 34*/ MatDuplicate_SeqBAIJ,
3063:                                        NULL,
3064:                                        NULL,
3065:                                        MatILUFactor_SeqBAIJ,
3066:                                        NULL,
3067:                                        /* 39*/ MatAXPY_SeqBAIJ,
3068:                                        MatCreateSubMatrices_SeqBAIJ,
3069:                                        MatIncreaseOverlap_SeqBAIJ,
3070:                                        MatGetValues_SeqBAIJ,
3071:                                        MatCopy_SeqBAIJ,
3072:                                        /* 44*/ NULL,
3073:                                        MatScale_SeqBAIJ,
3074:                                        MatShift_SeqBAIJ,
3075:                                        NULL,
3076:                                        MatZeroRowsColumns_SeqBAIJ,
3077:                                        /* 49*/ NULL,
3078:                                        MatGetRowIJ_SeqBAIJ,
3079:                                        MatRestoreRowIJ_SeqBAIJ,
3080:                                        MatGetColumnIJ_SeqBAIJ,
3081:                                        MatRestoreColumnIJ_SeqBAIJ,
3082:                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
3083:                                        NULL,
3084:                                        NULL,
3085:                                        NULL,
3086:                                        MatSetValuesBlocked_SeqBAIJ,
3087:                                        /* 59*/ MatCreateSubMatrix_SeqBAIJ,
3088:                                        MatDestroy_SeqBAIJ,
3089:                                        MatView_SeqBAIJ,
3090:                                        NULL,
3091:                                        NULL,
3092:                                        /* 64*/ NULL,
3093:                                        NULL,
3094:                                        NULL,
3095:                                        NULL,
3096:                                        MatGetRowMaxAbs_SeqBAIJ,
3097:                                        /* 69*/ NULL,
3098:                                        MatConvert_Basic,
3099:                                        NULL,
3100:                                        MatFDColoringApply_BAIJ,
3101:                                        NULL,
3102:                                        /* 74*/ NULL,
3103:                                        NULL,
3104:                                        NULL,
3105:                                        NULL,
3106:                                        MatLoad_SeqBAIJ,
3107:                                        /* 79*/ NULL,
3108:                                        NULL,
3109:                                        NULL,
3110:                                        NULL,
3111:                                        NULL,
3112:                                        /* 84*/ NULL,
3113:                                        NULL,
3114:                                        NULL,
3115:                                        NULL,
3116:                                        NULL,
3117:                                        /* 89*/ NULL,
3118:                                        NULL,
3119:                                        NULL,
3120:                                        NULL,
3121:                                        MatConjugate_SeqBAIJ,
3122:                                        /* 94*/ NULL,
3123:                                        NULL,
3124:                                        MatRealPart_SeqBAIJ,
3125:                                        MatImaginaryPart_SeqBAIJ,
3126:                                        NULL,
3127:                                        /* 99*/ NULL,
3128:                                        NULL,
3129:                                        NULL,
3130:                                        NULL,
3131:                                        NULL,
3132:                                        /*104*/ MatMissingDiagonal_SeqBAIJ,
3133:                                        NULL,
3134:                                        NULL,
3135:                                        NULL,
3136:                                        NULL,
3137:                                        /*109*/ NULL,
3138:                                        NULL,
3139:                                        NULL,
3140:                                        MatMultHermitianTranspose_SeqBAIJ,
3141:                                        MatMultHermitianTransposeAdd_SeqBAIJ,
3142:                                        /*114*/ NULL,
3143:                                        NULL,
3144:                                        MatGetColumnReductions_SeqBAIJ,
3145:                                        MatInvertBlockDiagonal_SeqBAIJ,
3146:                                        NULL,
3147:                                        /*119*/ NULL,
3148:                                        NULL,
3149:                                        NULL,
3150:                                        NULL,
3151:                                        NULL,
3152:                                        /*124*/ NULL,
3153:                                        NULL,
3154:                                        NULL,
3155:                                        MatSetBlockSizes_Default,
3156:                                        NULL,
3157:                                        /*129*/ MatFDColoringSetUp_SeqXAIJ,
3158:                                        NULL,
3159:                                        MatCreateMPIMatConcatenateSeqMat_SeqBAIJ,
3160:                                        MatDestroySubMatrices_SeqBAIJ,
3161:                                        NULL,
3162:                                        /*134*/ NULL,
3163:                                        NULL,
3164:                                        NULL,
3165:                                        MatEliminateZeros_SeqBAIJ,
3166:                                        MatGetRowSumAbs_SeqBAIJ,
3167:                                        /*139*/ NULL,
3168:                                        NULL,
3169:                                        NULL,
3170:                                        MatCopyHashToXAIJ_Seq_Hash,
3171:                                        NULL,
3172:                                        NULL};

3174: static PetscErrorCode MatStoreValues_SeqBAIJ(Mat mat)
3175: {
3176:   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3177:   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;

3179:   PetscFunctionBegin;
3180:   PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");

3182:   /* allocate space for values if not already there */
3183:   if (!aij->saved_values) PetscCall(PetscMalloc1(nz + 1, &aij->saved_values));

3185:   /* copy values over */
3186:   PetscCall(PetscArraycpy(aij->saved_values, aij->a, nz));
3187:   PetscFunctionReturn(PETSC_SUCCESS);
3188: }

3190: static PetscErrorCode MatRetrieveValues_SeqBAIJ(Mat mat)
3191: {
3192:   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3193:   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;

3195:   PetscFunctionBegin;
3196:   PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
3197:   PetscCheck(aij->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");

3199:   /* copy values over */
3200:   PetscCall(PetscArraycpy(aij->a, aij->saved_values, nz));
3201:   PetscFunctionReturn(PETSC_SUCCESS);
3202: }

3204: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqAIJ(Mat, MatType, MatReuse, Mat *);
3205: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqSBAIJ(Mat, MatType, MatReuse, Mat *);

3207: PetscErrorCode MatSeqBAIJSetPreallocation_SeqBAIJ(Mat B, PetscInt bs, PetscInt nz, const PetscInt nnz[])
3208: {
3209:   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *)B->data;
3210:   PetscInt     i, mbs, nbs, bs2;
3211:   PetscBool    flg = PETSC_FALSE, skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;

3213:   PetscFunctionBegin;
3214:   if (B->hash_active) {
3215:     PetscInt bs;
3216:     B->ops[0] = b->cops;
3217:     PetscCall(PetscHMapIJVDestroy(&b->ht));
3218:     PetscCall(MatGetBlockSize(B, &bs));
3219:     if (bs > 1) PetscCall(PetscHSetIJDestroy(&b->bht));
3220:     PetscCall(PetscFree(b->dnz));
3221:     PetscCall(PetscFree(b->bdnz));
3222:     B->hash_active = PETSC_FALSE;
3223:   }
3224:   if (nz >= 0 || nnz) realalloc = PETSC_TRUE;
3225:   if (nz == MAT_SKIP_ALLOCATION) {
3226:     skipallocation = PETSC_TRUE;
3227:     nz             = 0;
3228:   }

3230:   PetscCall(MatSetBlockSize(B, bs));
3231:   PetscCall(PetscLayoutSetUp(B->rmap));
3232:   PetscCall(PetscLayoutSetUp(B->cmap));
3233:   PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs));

3235:   B->preallocated = PETSC_TRUE;

3237:   mbs = B->rmap->n / bs;
3238:   nbs = B->cmap->n / bs;
3239:   bs2 = bs * bs;

3241:   PetscCheck(mbs * bs == B->rmap->n && nbs * bs == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows %" PetscInt_FMT ", cols %" PetscInt_FMT " must be divisible by blocksize %" PetscInt_FMT, B->rmap->N, B->cmap->n, bs);

3243:   if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3244:   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nz cannot be less than 0: value %" PetscInt_FMT, nz);
3245:   if (nnz) {
3246:     for (i = 0; i < mbs; i++) {
3247:       PetscCheck(nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, nnz[i]);
3248:       PetscCheck(nnz[i] <= nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be greater than block row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, nnz[i], nbs);
3249:     }
3250:   }

3252:   PetscOptionsBegin(PetscObjectComm((PetscObject)B), NULL, "Optimize options for SEQBAIJ matrix 2 ", "Mat");
3253:   PetscCall(PetscOptionsBool("-mat_no_unroll", "Do not optimize for block size (slow)", NULL, flg, &flg, NULL));
3254:   PetscOptionsEnd();

3256:   if (!flg) {
3257:     switch (bs) {
3258:     case 1:
3259:       B->ops->mult    = MatMult_SeqBAIJ_1;
3260:       B->ops->multadd = MatMultAdd_SeqBAIJ_1;
3261:       break;
3262:     case 2:
3263:       B->ops->mult    = MatMult_SeqBAIJ_2;
3264:       B->ops->multadd = MatMultAdd_SeqBAIJ_2;
3265:       break;
3266:     case 3:
3267:       B->ops->mult    = MatMult_SeqBAIJ_3;
3268:       B->ops->multadd = MatMultAdd_SeqBAIJ_3;
3269:       break;
3270:     case 4:
3271:       B->ops->mult    = MatMult_SeqBAIJ_4;
3272:       B->ops->multadd = MatMultAdd_SeqBAIJ_4;
3273:       break;
3274:     case 5:
3275:       B->ops->mult    = MatMult_SeqBAIJ_5;
3276:       B->ops->multadd = MatMultAdd_SeqBAIJ_5;
3277:       break;
3278:     case 6:
3279:       B->ops->mult    = MatMult_SeqBAIJ_6;
3280:       B->ops->multadd = MatMultAdd_SeqBAIJ_6;
3281:       break;
3282:     case 7:
3283:       B->ops->mult    = MatMult_SeqBAIJ_7;
3284:       B->ops->multadd = MatMultAdd_SeqBAIJ_7;
3285:       break;
3286:     case 9: {
3287:       PetscInt version = 1;
3288:       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3289:       switch (version) {
3290: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3291:       case 1:
3292:         B->ops->mult    = MatMult_SeqBAIJ_9_AVX2;
3293:         B->ops->multadd = MatMultAdd_SeqBAIJ_9_AVX2;
3294:         PetscCall(PetscInfo(B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3295:         break;
3296: #endif
3297:       default:
3298:         B->ops->mult    = MatMult_SeqBAIJ_N;
3299:         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3300:         PetscCall(PetscInfo(B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3301:         break;
3302:       }
3303:       break;
3304:     }
3305:     case 11:
3306:       B->ops->mult    = MatMult_SeqBAIJ_11;
3307:       B->ops->multadd = MatMultAdd_SeqBAIJ_11;
3308:       break;
3309:     case 12: {
3310:       PetscInt version = 1;
3311:       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3312:       switch (version) {
3313:       case 1:
3314:         B->ops->mult    = MatMult_SeqBAIJ_12_ver1;
3315:         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3316:         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3317:         break;
3318:       case 2:
3319:         B->ops->mult    = MatMult_SeqBAIJ_12_ver2;
3320:         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver2;
3321:         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3322:         break;
3323: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3324:       case 3:
3325:         B->ops->mult    = MatMult_SeqBAIJ_12_AVX2;
3326:         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3327:         PetscCall(PetscInfo(B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3328:         break;
3329: #endif
3330:       default:
3331:         B->ops->mult    = MatMult_SeqBAIJ_N;
3332:         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3333:         PetscCall(PetscInfo(B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3334:         break;
3335:       }
3336:       break;
3337:     }
3338:     case 15: {
3339:       PetscInt version = 1;
3340:       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3341:       switch (version) {
3342:       case 1:
3343:         B->ops->mult = MatMult_SeqBAIJ_15_ver1;
3344:         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3345:         break;
3346:       case 2:
3347:         B->ops->mult = MatMult_SeqBAIJ_15_ver2;
3348:         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3349:         break;
3350:       case 3:
3351:         B->ops->mult = MatMult_SeqBAIJ_15_ver3;
3352:         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3353:         break;
3354:       case 4:
3355:         B->ops->mult = MatMult_SeqBAIJ_15_ver4;
3356:         PetscCall(PetscInfo(B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3357:         break;
3358:       default:
3359:         B->ops->mult = MatMult_SeqBAIJ_N;
3360:         PetscCall(PetscInfo(B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3361:         break;
3362:       }
3363:       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3364:       break;
3365:     }
3366:     default:
3367:       B->ops->mult    = MatMult_SeqBAIJ_N;
3368:       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3369:       PetscCall(PetscInfo(B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3370:       break;
3371:     }
3372:   }
3373:   B->ops->sor = MatSOR_SeqBAIJ;
3374:   b->mbs      = mbs;
3375:   b->nbs      = nbs;
3376:   if (!skipallocation) {
3377:     if (!b->imax) {
3378:       PetscCall(PetscMalloc2(mbs, &b->imax, mbs, &b->ilen));

3380:       b->free_imax_ilen = PETSC_TRUE;
3381:     }
3382:     /* b->ilen will count nonzeros in each block row so far. */
3383:     for (i = 0; i < mbs; i++) b->ilen[i] = 0;
3384:     if (!nnz) {
3385:       if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3386:       else if (nz < 0) nz = 1;
3387:       nz = PetscMin(nz, nbs);
3388:       for (i = 0; i < mbs; i++) b->imax[i] = nz;
3389:       PetscCall(PetscIntMultError(nz, mbs, &nz));
3390:     } else {
3391:       PetscInt64 nz64 = 0;
3392:       for (i = 0; i < mbs; i++) {
3393:         b->imax[i] = nnz[i];
3394:         nz64 += nnz[i];
3395:       }
3396:       PetscCall(PetscIntCast(nz64, &nz));
3397:     }

3399:     /* allocate the matrix space */
3400:     PetscCall(MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i));
3401:     PetscCall(PetscShmgetAllocateArray(nz, sizeof(PetscInt), (void **)&b->j));
3402:     PetscCall(PetscShmgetAllocateArray(B->rmap->N + 1, sizeof(PetscInt), (void **)&b->i));
3403:     if (B->structure_only) {
3404:       b->free_a = PETSC_FALSE;
3405:     } else {
3406:       PetscInt nzbs2 = 0;
3407:       PetscCall(PetscIntMultError(nz, bs2, &nzbs2));
3408:       PetscCall(PetscShmgetAllocateArray(nzbs2, sizeof(PetscScalar), (void **)&b->a));
3409:       b->free_a = PETSC_TRUE;
3410:       PetscCall(PetscArrayzero(b->a, nzbs2));
3411:     }
3412:     b->free_ij = PETSC_TRUE;
3413:     PetscCall(PetscArrayzero(b->j, nz));

3415:     b->i[0] = 0;
3416:     for (i = 1; i < mbs + 1; i++) b->i[i] = b->i[i - 1] + b->imax[i - 1];
3417:   } else {
3418:     b->free_a  = PETSC_FALSE;
3419:     b->free_ij = PETSC_FALSE;
3420:   }

3422:   b->bs2              = bs2;
3423:   b->mbs              = mbs;
3424:   b->nz               = 0;
3425:   b->maxnz            = nz;
3426:   B->info.nz_unneeded = (PetscReal)b->maxnz * bs2;
3427:   B->was_assembled    = PETSC_FALSE;
3428:   B->assembled        = PETSC_FALSE;
3429:   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
3430:   PetscFunctionReturn(PETSC_SUCCESS);
3431: }

3433: static PetscErrorCode MatSeqBAIJSetPreallocationCSR_SeqBAIJ(Mat B, PetscInt bs, const PetscInt ii[], const PetscInt jj[], const PetscScalar V[])
3434: {
3435:   PetscInt     i, m, nz, nz_max = 0, *nnz;
3436:   PetscScalar *values      = NULL;
3437:   PetscBool    roworiented = ((Mat_SeqBAIJ *)B->data)->roworiented;

3439:   PetscFunctionBegin;
3440:   PetscCheck(bs >= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Invalid block size specified, must be positive but it is %" PetscInt_FMT, bs);
3441:   PetscCall(PetscLayoutSetBlockSize(B->rmap, bs));
3442:   PetscCall(PetscLayoutSetBlockSize(B->cmap, bs));
3443:   PetscCall(PetscLayoutSetUp(B->rmap));
3444:   PetscCall(PetscLayoutSetUp(B->cmap));
3445:   PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs));
3446:   m = B->rmap->n / bs;

3448:   PetscCheck(ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "ii[0] must be 0 but it is %" PetscInt_FMT, ii[0]);
3449:   PetscCall(PetscMalloc1(m + 1, &nnz));
3450:   for (i = 0; i < m; i++) {
3451:     nz = ii[i + 1] - ii[i];
3452:     PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative number of columns %" PetscInt_FMT, i, nz);
3453:     nz_max = PetscMax(nz_max, nz);
3454:     nnz[i] = nz;
3455:   }
3456:   PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz));
3457:   PetscCall(PetscFree(nnz));

3459:   values = (PetscScalar *)V;
3460:   if (!values) PetscCall(PetscCalloc1(bs * bs * (nz_max + 1), &values));
3461:   for (i = 0; i < m; i++) {
3462:     PetscInt        ncols = ii[i + 1] - ii[i];
3463:     const PetscInt *icols = jj + ii[i];
3464:     if (bs == 1 || !roworiented) {
3465:       const PetscScalar *svals = values + (V ? (bs * bs * ii[i]) : 0);
3466:       PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, ncols, icols, svals, INSERT_VALUES));
3467:     } else {
3468:       PetscInt j;
3469:       for (j = 0; j < ncols; j++) {
3470:         const PetscScalar *svals = values + (V ? (bs * bs * (ii[i] + j)) : 0);
3471:         PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, 1, &icols[j], svals, INSERT_VALUES));
3472:       }
3473:     }
3474:   }
3475:   if (!V) PetscCall(PetscFree(values));
3476:   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3477:   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3478:   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3479:   PetscFunctionReturn(PETSC_SUCCESS);
3480: }

3482: /*@C
3483:   MatSeqBAIJGetArray - gives read/write access to the array where the data for a `MATSEQBAIJ` matrix is stored

3485:   Not Collective

3487:   Input Parameter:
3488: . A - a `MATSEQBAIJ` matrix

3490:   Output Parameter:
3491: . array - pointer to the data

3493:   Level: intermediate

3495: .seealso: [](ch_matrices), `Mat`, `MATSEQBAIJ`, `MatSeqBAIJRestoreArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3496: @*/
3497: PetscErrorCode MatSeqBAIJGetArray(Mat A, PetscScalar *array[])
3498: {
3499:   PetscFunctionBegin;
3500:   PetscUseMethod(A, "MatSeqBAIJGetArray_C", (Mat, PetscScalar **), (A, array));
3501:   PetscFunctionReturn(PETSC_SUCCESS);
3502: }

3504: /*@C
3505:   MatSeqBAIJRestoreArray - returns access to the array where the data for a `MATSEQBAIJ` matrix is stored obtained by `MatSeqBAIJGetArray()`

3507:   Not Collective

3509:   Input Parameters:
3510: + A     - a `MATSEQBAIJ` matrix
3511: - array - pointer to the data

3513:   Level: intermediate

3515: .seealso: [](ch_matrices), `Mat`, `MatSeqBAIJGetArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3516: @*/
3517: PetscErrorCode MatSeqBAIJRestoreArray(Mat A, PetscScalar *array[])
3518: {
3519:   PetscFunctionBegin;
3520:   PetscUseMethod(A, "MatSeqBAIJRestoreArray_C", (Mat, PetscScalar **), (A, array));
3521:   PetscFunctionReturn(PETSC_SUCCESS);
3522: }

3524: /*MC
3525:    MATSEQBAIJ - MATSEQBAIJ = "seqbaij" - A matrix type to be used for sequential block sparse matrices, based on
3526:    block sparse compressed row format.

3528:    Options Database Keys:
3529: + -mat_type seqbaij - sets the matrix type to `MATSEQBAIJ` during a call to `MatSetFromOptions()`
3530: - -mat_baij_mult_version version - indicate the version of the matrix-vector product to use (0 often indicates using BLAS)

3532:    Level: beginner

3534:    Notes:
3535:    `MatSetOptions`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this no
3536:    space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored

3538:    Run with `-info` to see what version of the matrix-vector product is being used

3540: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqBAIJ()`
3541: M*/

3543: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat, MatType, MatReuse, Mat *);

3545: PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJ(Mat B)
3546: {
3547:   PetscMPIInt  size;
3548:   Mat_SeqBAIJ *b;

3550:   PetscFunctionBegin;
3551:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
3552:   PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Comm must be of size 1");

3554:   PetscCall(PetscNew(&b));
3555:   B->data   = (void *)b;
3556:   B->ops[0] = MatOps_Values;

3558:   b->row          = NULL;
3559:   b->col          = NULL;
3560:   b->icol         = NULL;
3561:   b->reallocs     = 0;
3562:   b->saved_values = NULL;

3564:   b->roworiented        = PETSC_TRUE;
3565:   b->nonew              = 0;
3566:   b->diag               = NULL;
3567:   B->spptr              = NULL;
3568:   B->info.nz_unneeded   = (PetscReal)b->maxnz * b->bs2;
3569:   b->keepnonzeropattern = PETSC_FALSE;

3571:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJGetArray_C", MatSeqBAIJGetArray_SeqBAIJ));
3572:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJRestoreArray_C", MatSeqBAIJRestoreArray_SeqBAIJ));
3573:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqBAIJ));
3574:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqBAIJ));
3575:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetColumnIndices_C", MatSeqBAIJSetColumnIndices_SeqBAIJ));
3576:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqaij_C", MatConvert_SeqBAIJ_SeqAIJ));
3577:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqsbaij_C", MatConvert_SeqBAIJ_SeqSBAIJ));
3578:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocation_C", MatSeqBAIJSetPreallocation_SeqBAIJ));
3579:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocationCSR_C", MatSeqBAIJSetPreallocationCSR_SeqBAIJ));
3580:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_SeqBAIJ));
3581: #if defined(PETSC_HAVE_HYPRE)
3582:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_hypre_C", MatConvert_AIJ_HYPRE));
3583: #endif
3584:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_is_C", MatConvert_XAIJ_IS));
3585:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ));
3586:   PetscFunctionReturn(PETSC_SUCCESS);
3587: }

3589: PETSC_INTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
3590: {
3591:   Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data, *a = (Mat_SeqBAIJ *)A->data;
3592:   PetscInt     i, mbs = a->mbs, nz = a->nz, bs2 = a->bs2;

3594:   PetscFunctionBegin;
3595:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Cannot duplicate unassembled matrix");
3596:   PetscCheck(a->i[mbs] == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupt matrix");

3598:   if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3599:     c->imax           = a->imax;
3600:     c->ilen           = a->ilen;
3601:     c->free_imax_ilen = PETSC_FALSE;
3602:   } else {
3603:     PetscCall(PetscMalloc2(mbs, &c->imax, mbs, &c->ilen));
3604:     for (i = 0; i < mbs; i++) {
3605:       c->imax[i] = a->imax[i];
3606:       c->ilen[i] = a->ilen[i];
3607:     }
3608:     c->free_imax_ilen = PETSC_TRUE;
3609:   }

3611:   /* allocate the matrix space */
3612:   if (mallocmatspace) {
3613:     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3614:       PetscCall(PetscShmgetAllocateArray(bs2 * nz, sizeof(PetscScalar), (void **)&c->a));
3615:       PetscCall(PetscArrayzero(c->a, bs2 * nz));
3616:       c->free_a       = PETSC_TRUE;
3617:       c->i            = a->i;
3618:       c->j            = a->j;
3619:       c->free_ij      = PETSC_FALSE;
3620:       c->parent       = A;
3621:       C->preallocated = PETSC_TRUE;
3622:       C->assembled    = PETSC_TRUE;

3624:       PetscCall(PetscObjectReference((PetscObject)A));
3625:       PetscCall(MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3626:       PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3627:     } else {
3628:       PetscCall(PetscShmgetAllocateArray(bs2 * nz, sizeof(PetscScalar), (void **)&c->a));
3629:       PetscCall(PetscShmgetAllocateArray(nz, sizeof(PetscInt), (void **)&c->j));
3630:       PetscCall(PetscShmgetAllocateArray(mbs + 1, sizeof(PetscInt), (void **)&c->i));
3631:       c->free_a  = PETSC_TRUE;
3632:       c->free_ij = PETSC_TRUE;

3634:       PetscCall(PetscArraycpy(c->i, a->i, mbs + 1));
3635:       if (mbs > 0) {
3636:         PetscCall(PetscArraycpy(c->j, a->j, nz));
3637:         if (cpvalues == MAT_COPY_VALUES) {
3638:           PetscCall(PetscArraycpy(c->a, a->a, bs2 * nz));
3639:         } else {
3640:           PetscCall(PetscArrayzero(c->a, bs2 * nz));
3641:         }
3642:       }
3643:       C->preallocated = PETSC_TRUE;
3644:       C->assembled    = PETSC_TRUE;
3645:     }
3646:   }

3648:   c->roworiented = a->roworiented;
3649:   c->nonew       = a->nonew;

3651:   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
3652:   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));

3654:   c->bs2 = a->bs2;
3655:   c->mbs = a->mbs;
3656:   c->nbs = a->nbs;

3658:   if (a->diag) {
3659:     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3660:       c->diag      = a->diag;
3661:       c->free_diag = PETSC_FALSE;
3662:     } else {
3663:       PetscCall(PetscMalloc1(mbs + 1, &c->diag));
3664:       for (i = 0; i < mbs; i++) c->diag[i] = a->diag[i];
3665:       c->free_diag = PETSC_TRUE;
3666:     }
3667:   } else c->diag = NULL;

3669:   c->nz         = a->nz;
3670:   c->maxnz      = a->nz; /* Since we allocate exactly the right amount */
3671:   c->solve_work = NULL;
3672:   c->mult_work  = NULL;
3673:   c->sor_workt  = NULL;
3674:   c->sor_work   = NULL;

3676:   c->compressedrow.use   = a->compressedrow.use;
3677:   c->compressedrow.nrows = a->compressedrow.nrows;
3678:   if (a->compressedrow.use) {
3679:     i = a->compressedrow.nrows;
3680:     PetscCall(PetscMalloc2(i + 1, &c->compressedrow.i, i + 1, &c->compressedrow.rindex));
3681:     PetscCall(PetscArraycpy(c->compressedrow.i, a->compressedrow.i, i + 1));
3682:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, i));
3683:   } else {
3684:     c->compressedrow.use    = PETSC_FALSE;
3685:     c->compressedrow.i      = NULL;
3686:     c->compressedrow.rindex = NULL;
3687:   }
3688:   c->nonzerorowcnt = a->nonzerorowcnt;
3689:   C->nonzerostate  = A->nonzerostate;

3691:   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
3692:   PetscFunctionReturn(PETSC_SUCCESS);
3693: }

3695: PetscErrorCode MatDuplicate_SeqBAIJ(Mat A, MatDuplicateOption cpvalues, Mat *B)
3696: {
3697:   PetscFunctionBegin;
3698:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
3699:   PetscCall(MatSetSizes(*B, A->rmap->N, A->cmap->n, A->rmap->N, A->cmap->n));
3700:   PetscCall(MatSetType(*B, MATSEQBAIJ));
3701:   PetscCall(MatDuplicateNoCreate_SeqBAIJ(*B, A, cpvalues, PETSC_TRUE));
3702:   PetscFunctionReturn(PETSC_SUCCESS);
3703: }

3705: /* Used for both SeqBAIJ and SeqSBAIJ matrices */
3706: PetscErrorCode MatLoad_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
3707: {
3708:   PetscInt     header[4], M, N, nz, bs, m, n, mbs, nbs, rows, cols, sum, i, j, k;
3709:   PetscInt    *rowidxs, *colidxs;
3710:   PetscScalar *matvals;

3712:   PetscFunctionBegin;
3713:   PetscCall(PetscViewerSetUp(viewer));

3715:   /* read matrix header */
3716:   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3717:   PetscCheck(header[0] == MAT_FILE_CLASSID, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3718:   M  = header[1];
3719:   N  = header[2];
3720:   nz = header[3];
3721:   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3722:   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3723:   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as SeqBAIJ");

3725:   /* set block sizes from the viewer's .info file */
3726:   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3727:   /* set local and global sizes if not set already */
3728:   if (mat->rmap->n < 0) mat->rmap->n = M;
3729:   if (mat->cmap->n < 0) mat->cmap->n = N;
3730:   if (mat->rmap->N < 0) mat->rmap->N = M;
3731:   if (mat->cmap->N < 0) mat->cmap->N = N;
3732:   PetscCall(PetscLayoutSetUp(mat->rmap));
3733:   PetscCall(PetscLayoutSetUp(mat->cmap));

3735:   /* check if the matrix sizes are correct */
3736:   PetscCall(MatGetSize(mat, &rows, &cols));
3737:   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3738:   PetscCall(MatGetBlockSize(mat, &bs));
3739:   PetscCall(MatGetLocalSize(mat, &m, &n));
3740:   mbs = m / bs;
3741:   nbs = n / bs;

3743:   /* read in row lengths, column indices and nonzero values */
3744:   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3745:   PetscCall(PetscViewerBinaryRead(viewer, rowidxs + 1, m, NULL, PETSC_INT));
3746:   rowidxs[0] = 0;
3747:   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3748:   sum = rowidxs[m];
3749:   PetscCheck(sum == nz, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);

3751:   /* read in column indices and nonzero values */
3752:   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, nz, &matvals));
3753:   PetscCall(PetscViewerBinaryRead(viewer, colidxs, rowidxs[m], NULL, PETSC_INT));
3754:   PetscCall(PetscViewerBinaryRead(viewer, matvals, rowidxs[m], NULL, PETSC_SCALAR));

3756:   {               /* preallocate matrix storage */
3757:     PetscBT   bt; /* helper bit set to count nonzeros */
3758:     PetscInt *nnz;
3759:     PetscBool sbaij;

3761:     PetscCall(PetscBTCreate(nbs, &bt));
3762:     PetscCall(PetscCalloc1(mbs, &nnz));
3763:     PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATSEQSBAIJ, &sbaij));
3764:     for (i = 0; i < mbs; i++) {
3765:       PetscCall(PetscBTMemzero(nbs, bt));
3766:       for (k = 0; k < bs; k++) {
3767:         PetscInt row = bs * i + k;
3768:         for (j = rowidxs[row]; j < rowidxs[row + 1]; j++) {
3769:           PetscInt col = colidxs[j];
3770:           if (!sbaij || col >= row)
3771:             if (!PetscBTLookupSet(bt, col / bs)) nnz[i]++;
3772:         }
3773:       }
3774:     }
3775:     PetscCall(PetscBTDestroy(&bt));
3776:     PetscCall(MatSeqBAIJSetPreallocation(mat, bs, 0, nnz));
3777:     PetscCall(MatSeqSBAIJSetPreallocation(mat, bs, 0, nnz));
3778:     PetscCall(PetscFree(nnz));
3779:   }

3781:   /* store matrix values */
3782:   for (i = 0; i < m; i++) {
3783:     PetscInt row = i, s = rowidxs[i], e = rowidxs[i + 1];
3784:     PetscUseTypeMethod(mat, setvalues, 1, &row, e - s, colidxs + s, matvals + s, INSERT_VALUES);
3785:   }

3787:   PetscCall(PetscFree(rowidxs));
3788:   PetscCall(PetscFree2(colidxs, matvals));
3789:   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
3790:   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
3791:   PetscFunctionReturn(PETSC_SUCCESS);
3792: }

3794: PetscErrorCode MatLoad_SeqBAIJ(Mat mat, PetscViewer viewer)
3795: {
3796:   PetscBool isbinary;

3798:   PetscFunctionBegin;
3799:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3800:   PetscCheck(isbinary, PetscObjectComm((PetscObject)viewer), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)mat)->type_name);
3801:   PetscCall(MatLoad_SeqBAIJ_Binary(mat, viewer));
3802:   PetscFunctionReturn(PETSC_SUCCESS);
3803: }

3805: /*@
3806:   MatCreateSeqBAIJ - Creates a sparse matrix in `MATSEQAIJ` (block
3807:   compressed row) format.  For good matrix assembly performance the
3808:   user should preallocate the matrix storage by setting the parameter `nz`
3809:   (or the array `nnz`).

3811:   Collective

3813:   Input Parameters:
3814: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3815: . bs   - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3816:          blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3817: . m    - number of rows
3818: . n    - number of columns
3819: . nz   - number of nonzero blocks  per block row (same for all rows)
3820: - nnz  - array containing the number of nonzero blocks in the various block rows
3821:          (possibly different for each block row) or `NULL`

3823:   Output Parameter:
3824: . A - the matrix

3826:   Options Database Keys:
3827: + -mat_no_unroll  - uses code that does not unroll the loops in the block calculations (much slower)
3828: - -mat_block_size - size of the blocks to use

3830:   Level: intermediate

3832:   Notes:
3833:   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3834:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3835:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3837:   The number of rows and columns must be divisible by blocksize.

3839:   If the `nnz` parameter is given then the `nz` parameter is ignored

3841:   A nonzero block is any block that as 1 or more nonzeros in it

3843:   The `MATSEQBAIJ` format is fully compatible with standard Fortran
3844:   storage.  That is, the stored row and column indices can begin at
3845:   either one (as in Fortran) or zero.

3847:   Specify the preallocated storage with either `nz` or `nnz` (not both).
3848:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3849:   allocation.  See [Sparse Matrices](sec_matsparse) for details.
3850:   matrices.

3852: .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`
3853: @*/
3854: PetscErrorCode MatCreateSeqBAIJ(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3855: {
3856:   PetscFunctionBegin;
3857:   PetscCall(MatCreate(comm, A));
3858:   PetscCall(MatSetSizes(*A, m, n, m, n));
3859:   PetscCall(MatSetType(*A, MATSEQBAIJ));
3860:   PetscCall(MatSeqBAIJSetPreallocation(*A, bs, nz, (PetscInt *)nnz));
3861:   PetscFunctionReturn(PETSC_SUCCESS);
3862: }

3864: /*@
3865:   MatSeqBAIJSetPreallocation - Sets the block size and expected nonzeros
3866:   per row in the matrix. For good matrix assembly performance the
3867:   user should preallocate the matrix storage by setting the parameter `nz`
3868:   (or the array `nnz`).

3870:   Collective

3872:   Input Parameters:
3873: + B   - the matrix
3874: . bs  - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3875:         blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3876: . nz  - number of block nonzeros per block row (same for all rows)
3877: - nnz - array containing the number of block nonzeros in the various block rows
3878:         (possibly different for each block row) or `NULL`

3880:   Options Database Keys:
3881: + -mat_no_unroll  - uses code that does not unroll the loops in the block calculations (much slower)
3882: - -mat_block_size - size of the blocks to use

3884:   Level: intermediate

3886:   Notes:
3887:   If the `nnz` parameter is given then the `nz` parameter is ignored

3889:   You can call `MatGetInfo()` to get information on how effective the preallocation was;
3890:   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
3891:   You can also run with the option `-info` and look for messages with the string
3892:   malloc in them to see if additional memory allocation was needed.

3894:   The `MATSEQBAIJ` format is fully compatible with standard Fortran
3895:   storage.  That is, the stored row and column indices can begin at
3896:   either one (as in Fortran) or zero.

3898:   Specify the preallocated storage with either `nz` or `nnz` (not both).
3899:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3900:   allocation.  See [Sparse Matrices](sec_matsparse) for details.

3902: .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`, `MatGetInfo()`
3903: @*/
3904: PetscErrorCode MatSeqBAIJSetPreallocation(Mat B, PetscInt bs, PetscInt nz, const PetscInt nnz[])
3905: {
3906:   PetscFunctionBegin;
3910:   PetscTryMethod(B, "MatSeqBAIJSetPreallocation_C", (Mat, PetscInt, PetscInt, const PetscInt[]), (B, bs, nz, nnz));
3911:   PetscFunctionReturn(PETSC_SUCCESS);
3912: }

3914: /*@C
3915:   MatSeqBAIJSetPreallocationCSR - Creates a sparse sequential matrix in `MATSEQBAIJ` format using the given nonzero structure and (optional) numerical values

3917:   Collective

3919:   Input Parameters:
3920: + B  - the matrix
3921: . bs - the blocksize
3922: . i  - the indices into `j` for the start of each local row (indices start with zero)
3923: . j  - the column indices for each local row (indices start with zero) these must be sorted for each row
3924: - v  - optional values in the matrix, use `NULL` if not provided

3926:   Level: advanced

3928:   Notes:
3929:   The `i`,`j`,`v` values are COPIED with this routine; to avoid the copy use `MatCreateSeqBAIJWithArrays()`

3931:   The order of the entries in values is specified by the `MatOption` `MAT_ROW_ORIENTED`.  For example, C programs
3932:   may want to use the default `MAT_ROW_ORIENTED` of `PETSC_TRUE` and use an array v[nnz][bs][bs] where the second index is
3933:   over rows within a block and the last index is over columns within a block row.  Fortran programs will likely set
3934:   `MAT_ROW_ORIENTED` of `PETSC_FALSE` and use a Fortran array v(bs,bs,nnz) in which the first index is over rows within a
3935:   block column and the second index is over columns within a block.

3937:   Though this routine has Preallocation() in the name it also sets the exact nonzero locations of the matrix entries and usually the numerical values as well

3939: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqBAIJ()`, `MatSetValues()`, `MatSeqBAIJSetPreallocation()`, `MATSEQBAIJ`
3940: @*/
3941: PetscErrorCode MatSeqBAIJSetPreallocationCSR(Mat B, PetscInt bs, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
3942: {
3943:   PetscFunctionBegin;
3947:   PetscTryMethod(B, "MatSeqBAIJSetPreallocationCSR_C", (Mat, PetscInt, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, bs, i, j, v));
3948:   PetscFunctionReturn(PETSC_SUCCESS);
3949: }

3951: /*@
3952:   MatCreateSeqBAIJWithArrays - Creates a `MATSEQBAIJ` matrix using matrix elements provided by the user.

3954:   Collective

3956:   Input Parameters:
3957: + comm - must be an MPI communicator of size 1
3958: . bs   - size of block
3959: . m    - number of rows
3960: . n    - number of columns
3961: . i    - row indices; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row block row of the matrix
3962: . j    - column indices
3963: - a    - matrix values

3965:   Output Parameter:
3966: . mat - the matrix

3968:   Level: advanced

3970:   Notes:
3971:   The `i`, `j`, and `a` arrays are not copied by this routine, the user must free these arrays
3972:   once the matrix is destroyed

3974:   You cannot set new nonzero locations into this matrix, that will generate an error.

3976:   The `i` and `j` indices are 0 based

3978:   When block size is greater than 1 the matrix values must be stored using the `MATSEQBAIJ` storage format

3980:   The order of the entries in values is the same as the block compressed sparse row storage format; that is, it is
3981:   the same as a three dimensional array in Fortran values(bs,bs,nnz) that contains the first column of the first
3982:   block, followed by the second column of the first block etc etc.  That is, the blocks are contiguous in memory
3983:   with column-major ordering within blocks.

3985: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateBAIJ()`, `MatCreateSeqBAIJ()`
3986: @*/
3987: PetscErrorCode MatCreateSeqBAIJWithArrays(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], Mat *mat)
3988: {
3989:   Mat_SeqBAIJ *baij;

3991:   PetscFunctionBegin;
3992:   PetscCheck(bs == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "block size %" PetscInt_FMT " > 1 is not supported yet", bs);
3993:   if (m > 0) PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");

3995:   PetscCall(MatCreate(comm, mat));
3996:   PetscCall(MatSetSizes(*mat, m, n, m, n));
3997:   PetscCall(MatSetType(*mat, MATSEQBAIJ));
3998:   PetscCall(MatSeqBAIJSetPreallocation(*mat, bs, MAT_SKIP_ALLOCATION, NULL));
3999:   baij = (Mat_SeqBAIJ *)(*mat)->data;
4000:   PetscCall(PetscMalloc2(m, &baij->imax, m, &baij->ilen));

4002:   baij->i = i;
4003:   baij->j = j;
4004:   baij->a = a;

4006:   baij->nonew          = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
4007:   baij->free_a         = PETSC_FALSE;
4008:   baij->free_ij        = PETSC_FALSE;
4009:   baij->free_imax_ilen = PETSC_TRUE;

4011:   for (PetscInt ii = 0; ii < m; ii++) {
4012:     const PetscInt row_len = i[ii + 1] - i[ii];

4014:     baij->ilen[ii] = baij->imax[ii] = row_len;
4015:     PetscCheck(row_len >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative row length in i (row indices) row = %" PetscInt_FMT " length = %" PetscInt_FMT, ii, row_len);
4016:   }
4017:   if (PetscDefined(USE_DEBUG)) {
4018:     for (PetscInt ii = 0; ii < baij->i[m]; ii++) {
4019:       PetscCheck(j[ii] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative column index at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]);
4020:       PetscCheck(j[ii] <= n - 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index to large at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]);
4021:     }
4022:   }

4024:   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
4025:   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
4026:   PetscFunctionReturn(PETSC_SUCCESS);
4027: }

4029: PetscErrorCode MatCreateMPIMatConcatenateSeqMat_SeqBAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
4030: {
4031:   PetscFunctionBegin;
4032:   PetscCall(MatCreateMPIMatConcatenateSeqMat_MPIBAIJ(comm, inmat, n, scall, outmat));
4033:   PetscFunctionReturn(PETSC_SUCCESS);
4034: }