Actual source code: baij.c


  2: /*
  3:     Defines the basic matrix operations for the BAIJ (compressed row)
  4:   matrix storage format.
  5: */
  6: #include <../src/mat/impls/baij/seq/baij.h>
  7: #include <petscblaslapack.h>
  8: #include <petsc/private/kernels/blockinvert.h>
  9: #include <petsc/private/kernels/blockmatmult.h>

 11: #if defined(PETSC_HAVE_HYPRE)
 12: PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
 13: #endif

 15: #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
 16: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat, MatType, MatReuse, Mat *);
 17: #endif
 18: PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);

 20: PetscErrorCode MatGetColumnReductions_SeqBAIJ(Mat A, PetscInt type, PetscReal *reductions)
 21: {
 22:   Mat_SeqBAIJ *a_aij = (Mat_SeqBAIJ *)A->data;
 23:   PetscInt     m, n, i;
 24:   PetscInt     ib, jb, bs = A->rmap->bs;
 25:   MatScalar   *a_val = a_aij->a;

 27:   MatGetSize(A, &m, &n);
 28:   for (i = 0; i < n; i++) reductions[i] = 0.0;
 29:   if (type == NORM_2) {
 30:     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 31:       for (jb = 0; jb < bs; jb++) {
 32:         for (ib = 0; ib < bs; ib++) {
 33:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val * *a_val);
 34:           a_val++;
 35:         }
 36:       }
 37:     }
 38:   } else if (type == NORM_1) {
 39:     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 40:       for (jb = 0; jb < bs; jb++) {
 41:         for (ib = 0; ib < bs; ib++) {
 42:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val);
 43:           a_val++;
 44:         }
 45:       }
 46:     }
 47:   } else if (type == NORM_INFINITY) {
 48:     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 49:       for (jb = 0; jb < bs; jb++) {
 50:         for (ib = 0; ib < bs; ib++) {
 51:           int col         = A->cmap->rstart + a_aij->j[i] * bs + jb;
 52:           reductions[col] = PetscMax(PetscAbsScalar(*a_val), reductions[col]);
 53:           a_val++;
 54:         }
 55:       }
 56:     }
 57:   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
 58:     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 59:       for (jb = 0; jb < bs; jb++) {
 60:         for (ib = 0; ib < bs; ib++) {
 61:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscRealPart(*a_val);
 62:           a_val++;
 63:         }
 64:       }
 65:     }
 66:   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
 67:     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
 68:       for (jb = 0; jb < bs; jb++) {
 69:         for (ib = 0; ib < bs; ib++) {
 70:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscImaginaryPart(*a_val);
 71:           a_val++;
 72:         }
 73:       }
 74:     }
 75:   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
 76:   if (type == NORM_2) {
 77:     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
 78:   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
 79:     for (i = 0; i < n; i++) reductions[i] /= m;
 80:   }
 81:   return 0;
 82: }

 84: PetscErrorCode MatInvertBlockDiagonal_SeqBAIJ(Mat A, const PetscScalar **values)
 85: {
 86:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
 87:   PetscInt    *diag_offset, i, bs = A->rmap->bs, mbs = a->mbs, ipvt[5], bs2 = bs * bs, *v_pivots;
 88:   MatScalar   *v     = a->a, *odiag, *diag, work[25], *v_work;
 89:   PetscReal    shift = 0.0;
 90:   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

 92:   allowzeropivot = PetscNot(A->erroriffailure);

 94:   if (a->idiagvalid) {
 95:     if (values) *values = a->idiag;
 96:     return 0;
 97:   }
 98:   MatMarkDiagonal_SeqBAIJ(A);
 99:   diag_offset = a->diag;
100:   if (!a->idiag) { PetscMalloc1(bs2 * mbs, &a->idiag); }
101:   diag = a->idiag;
102:   if (values) *values = a->idiag;
103:   /* factor and invert each block */
104:   switch (bs) {
105:   case 1:
106:     for (i = 0; i < mbs; i++) {
107:       odiag   = v + 1 * diag_offset[i];
108:       diag[0] = odiag[0];

110:       if (PetscAbsScalar(diag[0] + shift) < PETSC_MACHINE_EPSILON) {
111:         if (allowzeropivot) {
112:           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
113:           A->factorerror_zeropivot_value = PetscAbsScalar(diag[0]);
114:           A->factorerror_zeropivot_row   = i;
115:           PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", i);
116:         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot, row %" PetscInt_FMT " pivot value %g tolerance %g", i, (double)PetscAbsScalar(diag[0]), (double)PETSC_MACHINE_EPSILON);
117:       }

119:       diag[0] = (PetscScalar)1.0 / (diag[0] + shift);
120:       diag += 1;
121:     }
122:     break;
123:   case 2:
124:     for (i = 0; i < mbs; i++) {
125:       odiag   = v + 4 * diag_offset[i];
126:       diag[0] = odiag[0];
127:       diag[1] = odiag[1];
128:       diag[2] = odiag[2];
129:       diag[3] = odiag[3];
130:       PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected);
131:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
132:       diag += 4;
133:     }
134:     break;
135:   case 3:
136:     for (i = 0; i < mbs; i++) {
137:       odiag   = v + 9 * diag_offset[i];
138:       diag[0] = odiag[0];
139:       diag[1] = odiag[1];
140:       diag[2] = odiag[2];
141:       diag[3] = odiag[3];
142:       diag[4] = odiag[4];
143:       diag[5] = odiag[5];
144:       diag[6] = odiag[6];
145:       diag[7] = odiag[7];
146:       diag[8] = odiag[8];
147:       PetscKernel_A_gets_inverse_A_3(diag, shift, allowzeropivot, &zeropivotdetected);
148:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
149:       diag += 9;
150:     }
151:     break;
152:   case 4:
153:     for (i = 0; i < mbs; i++) {
154:       odiag = v + 16 * diag_offset[i];
155:       PetscArraycpy(diag, odiag, 16);
156:       PetscKernel_A_gets_inverse_A_4(diag, shift, allowzeropivot, &zeropivotdetected);
157:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
158:       diag += 16;
159:     }
160:     break;
161:   case 5:
162:     for (i = 0; i < mbs; i++) {
163:       odiag = v + 25 * diag_offset[i];
164:       PetscArraycpy(diag, odiag, 25);
165:       PetscKernel_A_gets_inverse_A_5(diag, ipvt, work, shift, allowzeropivot, &zeropivotdetected);
166:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
167:       diag += 25;
168:     }
169:     break;
170:   case 6:
171:     for (i = 0; i < mbs; i++) {
172:       odiag = v + 36 * diag_offset[i];
173:       PetscArraycpy(diag, odiag, 36);
174:       PetscKernel_A_gets_inverse_A_6(diag, shift, allowzeropivot, &zeropivotdetected);
175:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
176:       diag += 36;
177:     }
178:     break;
179:   case 7:
180:     for (i = 0; i < mbs; i++) {
181:       odiag = v + 49 * diag_offset[i];
182:       PetscArraycpy(diag, odiag, 49);
183:       PetscKernel_A_gets_inverse_A_7(diag, shift, allowzeropivot, &zeropivotdetected);
184:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
185:       diag += 49;
186:     }
187:     break;
188:   default:
189:     PetscMalloc2(bs, &v_work, bs, &v_pivots);
190:     for (i = 0; i < mbs; i++) {
191:       odiag = v + bs2 * diag_offset[i];
192:       PetscArraycpy(diag, odiag, bs2);
193:       PetscKernel_A_gets_inverse_A(bs, diag, v_pivots, v_work, allowzeropivot, &zeropivotdetected);
194:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
195:       diag += bs2;
196:     }
197:     PetscFree2(v_work, v_pivots);
198:   }
199:   a->idiagvalid = PETSC_TRUE;
200:   return 0;
201: }

203: PetscErrorCode MatSOR_SeqBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
204: {
205:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
206:   PetscScalar       *x, *work, *w, *workt, *t;
207:   const MatScalar   *v, *aa = a->a, *idiag;
208:   const PetscScalar *b, *xb;
209:   PetscScalar        s[7], xw[7] = {0}; /* avoid some compilers thinking xw is uninitialized */
210:   PetscInt           m = a->mbs, i, i2, nz, bs = A->rmap->bs, bs2 = bs * bs, k, j, idx, it;
211:   const PetscInt    *diag, *ai = a->i, *aj = a->j, *vi;

213:   its = its * lits;

220:   if (!a->idiagvalid) MatInvertBlockDiagonal(A, NULL);

222:   if (!m) return 0;
223:   diag  = a->diag;
224:   idiag = a->idiag;
225:   k     = PetscMax(A->rmap->n, A->cmap->n);
226:   if (!a->mult_work) PetscMalloc1(k + 1, &a->mult_work);
227:   if (!a->sor_workt) PetscMalloc1(k, &a->sor_workt);
228:   if (!a->sor_work) PetscMalloc1(bs, &a->sor_work);
229:   work = a->mult_work;
230:   t    = a->sor_workt;
231:   w    = a->sor_work;

233:   VecGetArray(xx, &x);
234:   VecGetArrayRead(bb, &b);

236:   if (flag & SOR_ZERO_INITIAL_GUESS) {
237:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
238:       switch (bs) {
239:       case 1:
240:         PetscKernel_v_gets_A_times_w_1(x, idiag, b);
241:         t[0] = b[0];
242:         i2   = 1;
243:         idiag += 1;
244:         for (i = 1; i < m; i++) {
245:           v    = aa + ai[i];
246:           vi   = aj + ai[i];
247:           nz   = diag[i] - ai[i];
248:           s[0] = b[i2];
249:           for (j = 0; j < nz; j++) {
250:             xw[0] = x[vi[j]];
251:             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
252:           }
253:           t[i2] = s[0];
254:           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
255:           x[i2] = xw[0];
256:           idiag += 1;
257:           i2 += 1;
258:         }
259:         break;
260:       case 2:
261:         PetscKernel_v_gets_A_times_w_2(x, idiag, b);
262:         t[0] = b[0];
263:         t[1] = b[1];
264:         i2   = 2;
265:         idiag += 4;
266:         for (i = 1; i < m; i++) {
267:           v    = aa + 4 * ai[i];
268:           vi   = aj + ai[i];
269:           nz   = diag[i] - ai[i];
270:           s[0] = b[i2];
271:           s[1] = b[i2 + 1];
272:           for (j = 0; j < nz; j++) {
273:             idx   = 2 * vi[j];
274:             it    = 4 * j;
275:             xw[0] = x[idx];
276:             xw[1] = x[1 + idx];
277:             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
278:           }
279:           t[i2]     = s[0];
280:           t[i2 + 1] = s[1];
281:           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
282:           x[i2]     = xw[0];
283:           x[i2 + 1] = xw[1];
284:           idiag += 4;
285:           i2 += 2;
286:         }
287:         break;
288:       case 3:
289:         PetscKernel_v_gets_A_times_w_3(x, idiag, b);
290:         t[0] = b[0];
291:         t[1] = b[1];
292:         t[2] = b[2];
293:         i2   = 3;
294:         idiag += 9;
295:         for (i = 1; i < m; i++) {
296:           v    = aa + 9 * ai[i];
297:           vi   = aj + ai[i];
298:           nz   = diag[i] - ai[i];
299:           s[0] = b[i2];
300:           s[1] = b[i2 + 1];
301:           s[2] = b[i2 + 2];
302:           while (nz--) {
303:             idx   = 3 * (*vi++);
304:             xw[0] = x[idx];
305:             xw[1] = x[1 + idx];
306:             xw[2] = x[2 + idx];
307:             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
308:             v += 9;
309:           }
310:           t[i2]     = s[0];
311:           t[i2 + 1] = s[1];
312:           t[i2 + 2] = s[2];
313:           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
314:           x[i2]     = xw[0];
315:           x[i2 + 1] = xw[1];
316:           x[i2 + 2] = xw[2];
317:           idiag += 9;
318:           i2 += 3;
319:         }
320:         break;
321:       case 4:
322:         PetscKernel_v_gets_A_times_w_4(x, idiag, b);
323:         t[0] = b[0];
324:         t[1] = b[1];
325:         t[2] = b[2];
326:         t[3] = b[3];
327:         i2   = 4;
328:         idiag += 16;
329:         for (i = 1; i < m; i++) {
330:           v    = aa + 16 * ai[i];
331:           vi   = aj + ai[i];
332:           nz   = diag[i] - ai[i];
333:           s[0] = b[i2];
334:           s[1] = b[i2 + 1];
335:           s[2] = b[i2 + 2];
336:           s[3] = b[i2 + 3];
337:           while (nz--) {
338:             idx   = 4 * (*vi++);
339:             xw[0] = x[idx];
340:             xw[1] = x[1 + idx];
341:             xw[2] = x[2 + idx];
342:             xw[3] = x[3 + idx];
343:             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
344:             v += 16;
345:           }
346:           t[i2]     = s[0];
347:           t[i2 + 1] = s[1];
348:           t[i2 + 2] = s[2];
349:           t[i2 + 3] = s[3];
350:           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
351:           x[i2]     = xw[0];
352:           x[i2 + 1] = xw[1];
353:           x[i2 + 2] = xw[2];
354:           x[i2 + 3] = xw[3];
355:           idiag += 16;
356:           i2 += 4;
357:         }
358:         break;
359:       case 5:
360:         PetscKernel_v_gets_A_times_w_5(x, idiag, b);
361:         t[0] = b[0];
362:         t[1] = b[1];
363:         t[2] = b[2];
364:         t[3] = b[3];
365:         t[4] = b[4];
366:         i2   = 5;
367:         idiag += 25;
368:         for (i = 1; i < m; i++) {
369:           v    = aa + 25 * ai[i];
370:           vi   = aj + ai[i];
371:           nz   = diag[i] - ai[i];
372:           s[0] = b[i2];
373:           s[1] = b[i2 + 1];
374:           s[2] = b[i2 + 2];
375:           s[3] = b[i2 + 3];
376:           s[4] = b[i2 + 4];
377:           while (nz--) {
378:             idx   = 5 * (*vi++);
379:             xw[0] = x[idx];
380:             xw[1] = x[1 + idx];
381:             xw[2] = x[2 + idx];
382:             xw[3] = x[3 + idx];
383:             xw[4] = x[4 + idx];
384:             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
385:             v += 25;
386:           }
387:           t[i2]     = s[0];
388:           t[i2 + 1] = s[1];
389:           t[i2 + 2] = s[2];
390:           t[i2 + 3] = s[3];
391:           t[i2 + 4] = s[4];
392:           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
393:           x[i2]     = xw[0];
394:           x[i2 + 1] = xw[1];
395:           x[i2 + 2] = xw[2];
396:           x[i2 + 3] = xw[3];
397:           x[i2 + 4] = xw[4];
398:           idiag += 25;
399:           i2 += 5;
400:         }
401:         break;
402:       case 6:
403:         PetscKernel_v_gets_A_times_w_6(x, idiag, b);
404:         t[0] = b[0];
405:         t[1] = b[1];
406:         t[2] = b[2];
407:         t[3] = b[3];
408:         t[4] = b[4];
409:         t[5] = b[5];
410:         i2   = 6;
411:         idiag += 36;
412:         for (i = 1; i < m; i++) {
413:           v    = aa + 36 * ai[i];
414:           vi   = aj + ai[i];
415:           nz   = diag[i] - ai[i];
416:           s[0] = b[i2];
417:           s[1] = b[i2 + 1];
418:           s[2] = b[i2 + 2];
419:           s[3] = b[i2 + 3];
420:           s[4] = b[i2 + 4];
421:           s[5] = b[i2 + 5];
422:           while (nz--) {
423:             idx   = 6 * (*vi++);
424:             xw[0] = x[idx];
425:             xw[1] = x[1 + idx];
426:             xw[2] = x[2 + idx];
427:             xw[3] = x[3 + idx];
428:             xw[4] = x[4 + idx];
429:             xw[5] = x[5 + idx];
430:             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
431:             v += 36;
432:           }
433:           t[i2]     = s[0];
434:           t[i2 + 1] = s[1];
435:           t[i2 + 2] = s[2];
436:           t[i2 + 3] = s[3];
437:           t[i2 + 4] = s[4];
438:           t[i2 + 5] = s[5];
439:           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
440:           x[i2]     = xw[0];
441:           x[i2 + 1] = xw[1];
442:           x[i2 + 2] = xw[2];
443:           x[i2 + 3] = xw[3];
444:           x[i2 + 4] = xw[4];
445:           x[i2 + 5] = xw[5];
446:           idiag += 36;
447:           i2 += 6;
448:         }
449:         break;
450:       case 7:
451:         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
452:         t[0] = b[0];
453:         t[1] = b[1];
454:         t[2] = b[2];
455:         t[3] = b[3];
456:         t[4] = b[4];
457:         t[5] = b[5];
458:         t[6] = b[6];
459:         i2   = 7;
460:         idiag += 49;
461:         for (i = 1; i < m; i++) {
462:           v    = aa + 49 * ai[i];
463:           vi   = aj + ai[i];
464:           nz   = diag[i] - ai[i];
465:           s[0] = b[i2];
466:           s[1] = b[i2 + 1];
467:           s[2] = b[i2 + 2];
468:           s[3] = b[i2 + 3];
469:           s[4] = b[i2 + 4];
470:           s[5] = b[i2 + 5];
471:           s[6] = b[i2 + 6];
472:           while (nz--) {
473:             idx   = 7 * (*vi++);
474:             xw[0] = x[idx];
475:             xw[1] = x[1 + idx];
476:             xw[2] = x[2 + idx];
477:             xw[3] = x[3 + idx];
478:             xw[4] = x[4 + idx];
479:             xw[5] = x[5 + idx];
480:             xw[6] = x[6 + idx];
481:             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
482:             v += 49;
483:           }
484:           t[i2]     = s[0];
485:           t[i2 + 1] = s[1];
486:           t[i2 + 2] = s[2];
487:           t[i2 + 3] = s[3];
488:           t[i2 + 4] = s[4];
489:           t[i2 + 5] = s[5];
490:           t[i2 + 6] = s[6];
491:           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
492:           x[i2]     = xw[0];
493:           x[i2 + 1] = xw[1];
494:           x[i2 + 2] = xw[2];
495:           x[i2 + 3] = xw[3];
496:           x[i2 + 4] = xw[4];
497:           x[i2 + 5] = xw[5];
498:           x[i2 + 6] = xw[6];
499:           idiag += 49;
500:           i2 += 7;
501:         }
502:         break;
503:       default:
504:         PetscKernel_w_gets_Ar_times_v(bs, bs, b, idiag, x);
505:         PetscArraycpy(t, b, bs);
506:         i2 = bs;
507:         idiag += bs2;
508:         for (i = 1; i < m; i++) {
509:           v  = aa + bs2 * ai[i];
510:           vi = aj + ai[i];
511:           nz = diag[i] - ai[i];

513:           PetscArraycpy(w, b + i2, bs);
514:           /* copy all rows of x that are needed into contiguous space */
515:           workt = work;
516:           for (j = 0; j < nz; j++) {
517:             PetscArraycpy(workt, x + bs * (*vi++), bs);
518:             workt += bs;
519:           }
520:           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
521:           PetscArraycpy(t + i2, w, bs);
522:           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);

524:           idiag += bs2;
525:           i2 += bs;
526:         }
527:         break;
528:       }
529:       /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
530:       PetscLogFlops(1.0 * bs2 * a->nz);
531:       xb = t;
532:     } else xb = b;
533:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
534:       idiag = a->idiag + bs2 * (a->mbs - 1);
535:       i2    = bs * (m - 1);
536:       switch (bs) {
537:       case 1:
538:         s[0] = xb[i2];
539:         PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
540:         x[i2] = xw[0];
541:         i2 -= 1;
542:         for (i = m - 2; i >= 0; i--) {
543:           v    = aa + (diag[i] + 1);
544:           vi   = aj + diag[i] + 1;
545:           nz   = ai[i + 1] - diag[i] - 1;
546:           s[0] = xb[i2];
547:           for (j = 0; j < nz; j++) {
548:             xw[0] = x[vi[j]];
549:             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
550:           }
551:           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
552:           x[i2] = xw[0];
553:           idiag -= 1;
554:           i2 -= 1;
555:         }
556:         break;
557:       case 2:
558:         s[0] = xb[i2];
559:         s[1] = xb[i2 + 1];
560:         PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
561:         x[i2]     = xw[0];
562:         x[i2 + 1] = xw[1];
563:         i2 -= 2;
564:         idiag -= 4;
565:         for (i = m - 2; i >= 0; i--) {
566:           v    = aa + 4 * (diag[i] + 1);
567:           vi   = aj + diag[i] + 1;
568:           nz   = ai[i + 1] - diag[i] - 1;
569:           s[0] = xb[i2];
570:           s[1] = xb[i2 + 1];
571:           for (j = 0; j < nz; j++) {
572:             idx   = 2 * vi[j];
573:             it    = 4 * j;
574:             xw[0] = x[idx];
575:             xw[1] = x[1 + idx];
576:             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
577:           }
578:           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
579:           x[i2]     = xw[0];
580:           x[i2 + 1] = xw[1];
581:           idiag -= 4;
582:           i2 -= 2;
583:         }
584:         break;
585:       case 3:
586:         s[0] = xb[i2];
587:         s[1] = xb[i2 + 1];
588:         s[2] = xb[i2 + 2];
589:         PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
590:         x[i2]     = xw[0];
591:         x[i2 + 1] = xw[1];
592:         x[i2 + 2] = xw[2];
593:         i2 -= 3;
594:         idiag -= 9;
595:         for (i = m - 2; i >= 0; i--) {
596:           v    = aa + 9 * (diag[i] + 1);
597:           vi   = aj + diag[i] + 1;
598:           nz   = ai[i + 1] - diag[i] - 1;
599:           s[0] = xb[i2];
600:           s[1] = xb[i2 + 1];
601:           s[2] = xb[i2 + 2];
602:           while (nz--) {
603:             idx   = 3 * (*vi++);
604:             xw[0] = x[idx];
605:             xw[1] = x[1 + idx];
606:             xw[2] = x[2 + idx];
607:             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
608:             v += 9;
609:           }
610:           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
611:           x[i2]     = xw[0];
612:           x[i2 + 1] = xw[1];
613:           x[i2 + 2] = xw[2];
614:           idiag -= 9;
615:           i2 -= 3;
616:         }
617:         break;
618:       case 4:
619:         s[0] = xb[i2];
620:         s[1] = xb[i2 + 1];
621:         s[2] = xb[i2 + 2];
622:         s[3] = xb[i2 + 3];
623:         PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
624:         x[i2]     = xw[0];
625:         x[i2 + 1] = xw[1];
626:         x[i2 + 2] = xw[2];
627:         x[i2 + 3] = xw[3];
628:         i2 -= 4;
629:         idiag -= 16;
630:         for (i = m - 2; i >= 0; i--) {
631:           v    = aa + 16 * (diag[i] + 1);
632:           vi   = aj + diag[i] + 1;
633:           nz   = ai[i + 1] - diag[i] - 1;
634:           s[0] = xb[i2];
635:           s[1] = xb[i2 + 1];
636:           s[2] = xb[i2 + 2];
637:           s[3] = xb[i2 + 3];
638:           while (nz--) {
639:             idx   = 4 * (*vi++);
640:             xw[0] = x[idx];
641:             xw[1] = x[1 + idx];
642:             xw[2] = x[2 + idx];
643:             xw[3] = x[3 + idx];
644:             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
645:             v += 16;
646:           }
647:           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
648:           x[i2]     = xw[0];
649:           x[i2 + 1] = xw[1];
650:           x[i2 + 2] = xw[2];
651:           x[i2 + 3] = xw[3];
652:           idiag -= 16;
653:           i2 -= 4;
654:         }
655:         break;
656:       case 5:
657:         s[0] = xb[i2];
658:         s[1] = xb[i2 + 1];
659:         s[2] = xb[i2 + 2];
660:         s[3] = xb[i2 + 3];
661:         s[4] = xb[i2 + 4];
662:         PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
663:         x[i2]     = xw[0];
664:         x[i2 + 1] = xw[1];
665:         x[i2 + 2] = xw[2];
666:         x[i2 + 3] = xw[3];
667:         x[i2 + 4] = xw[4];
668:         i2 -= 5;
669:         idiag -= 25;
670:         for (i = m - 2; i >= 0; i--) {
671:           v    = aa + 25 * (diag[i] + 1);
672:           vi   = aj + diag[i] + 1;
673:           nz   = ai[i + 1] - diag[i] - 1;
674:           s[0] = xb[i2];
675:           s[1] = xb[i2 + 1];
676:           s[2] = xb[i2 + 2];
677:           s[3] = xb[i2 + 3];
678:           s[4] = xb[i2 + 4];
679:           while (nz--) {
680:             idx   = 5 * (*vi++);
681:             xw[0] = x[idx];
682:             xw[1] = x[1 + idx];
683:             xw[2] = x[2 + idx];
684:             xw[3] = x[3 + idx];
685:             xw[4] = x[4 + idx];
686:             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
687:             v += 25;
688:           }
689:           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
690:           x[i2]     = xw[0];
691:           x[i2 + 1] = xw[1];
692:           x[i2 + 2] = xw[2];
693:           x[i2 + 3] = xw[3];
694:           x[i2 + 4] = xw[4];
695:           idiag -= 25;
696:           i2 -= 5;
697:         }
698:         break;
699:       case 6:
700:         s[0] = xb[i2];
701:         s[1] = xb[i2 + 1];
702:         s[2] = xb[i2 + 2];
703:         s[3] = xb[i2 + 3];
704:         s[4] = xb[i2 + 4];
705:         s[5] = xb[i2 + 5];
706:         PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
707:         x[i2]     = xw[0];
708:         x[i2 + 1] = xw[1];
709:         x[i2 + 2] = xw[2];
710:         x[i2 + 3] = xw[3];
711:         x[i2 + 4] = xw[4];
712:         x[i2 + 5] = xw[5];
713:         i2 -= 6;
714:         idiag -= 36;
715:         for (i = m - 2; i >= 0; i--) {
716:           v    = aa + 36 * (diag[i] + 1);
717:           vi   = aj + diag[i] + 1;
718:           nz   = ai[i + 1] - diag[i] - 1;
719:           s[0] = xb[i2];
720:           s[1] = xb[i2 + 1];
721:           s[2] = xb[i2 + 2];
722:           s[3] = xb[i2 + 3];
723:           s[4] = xb[i2 + 4];
724:           s[5] = xb[i2 + 5];
725:           while (nz--) {
726:             idx   = 6 * (*vi++);
727:             xw[0] = x[idx];
728:             xw[1] = x[1 + idx];
729:             xw[2] = x[2 + idx];
730:             xw[3] = x[3 + idx];
731:             xw[4] = x[4 + idx];
732:             xw[5] = x[5 + idx];
733:             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
734:             v += 36;
735:           }
736:           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
737:           x[i2]     = xw[0];
738:           x[i2 + 1] = xw[1];
739:           x[i2 + 2] = xw[2];
740:           x[i2 + 3] = xw[3];
741:           x[i2 + 4] = xw[4];
742:           x[i2 + 5] = xw[5];
743:           idiag -= 36;
744:           i2 -= 6;
745:         }
746:         break;
747:       case 7:
748:         s[0] = xb[i2];
749:         s[1] = xb[i2 + 1];
750:         s[2] = xb[i2 + 2];
751:         s[3] = xb[i2 + 3];
752:         s[4] = xb[i2 + 4];
753:         s[5] = xb[i2 + 5];
754:         s[6] = xb[i2 + 6];
755:         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
756:         x[i2]     = xw[0];
757:         x[i2 + 1] = xw[1];
758:         x[i2 + 2] = xw[2];
759:         x[i2 + 3] = xw[3];
760:         x[i2 + 4] = xw[4];
761:         x[i2 + 5] = xw[5];
762:         x[i2 + 6] = xw[6];
763:         i2 -= 7;
764:         idiag -= 49;
765:         for (i = m - 2; i >= 0; i--) {
766:           v    = aa + 49 * (diag[i] + 1);
767:           vi   = aj + diag[i] + 1;
768:           nz   = ai[i + 1] - diag[i] - 1;
769:           s[0] = xb[i2];
770:           s[1] = xb[i2 + 1];
771:           s[2] = xb[i2 + 2];
772:           s[3] = xb[i2 + 3];
773:           s[4] = xb[i2 + 4];
774:           s[5] = xb[i2 + 5];
775:           s[6] = xb[i2 + 6];
776:           while (nz--) {
777:             idx   = 7 * (*vi++);
778:             xw[0] = x[idx];
779:             xw[1] = x[1 + idx];
780:             xw[2] = x[2 + idx];
781:             xw[3] = x[3 + idx];
782:             xw[4] = x[4 + idx];
783:             xw[5] = x[5 + idx];
784:             xw[6] = x[6 + idx];
785:             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
786:             v += 49;
787:           }
788:           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
789:           x[i2]     = xw[0];
790:           x[i2 + 1] = xw[1];
791:           x[i2 + 2] = xw[2];
792:           x[i2 + 3] = xw[3];
793:           x[i2 + 4] = xw[4];
794:           x[i2 + 5] = xw[5];
795:           x[i2 + 6] = xw[6];
796:           idiag -= 49;
797:           i2 -= 7;
798:         }
799:         break;
800:       default:
801:         PetscArraycpy(w, xb + i2, bs);
802:         PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
803:         i2 -= bs;
804:         idiag -= bs2;
805:         for (i = m - 2; i >= 0; i--) {
806:           v  = aa + bs2 * (diag[i] + 1);
807:           vi = aj + diag[i] + 1;
808:           nz = ai[i + 1] - diag[i] - 1;

810:           PetscArraycpy(w, xb + i2, bs);
811:           /* copy all rows of x that are needed into contiguous space */
812:           workt = work;
813:           for (j = 0; j < nz; j++) {
814:             PetscArraycpy(workt, x + bs * (*vi++), bs);
815:             workt += bs;
816:           }
817:           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
818:           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);

820:           idiag -= bs2;
821:           i2 -= bs;
822:         }
823:         break;
824:       }
825:       PetscLogFlops(1.0 * bs2 * (a->nz));
826:     }
827:     its--;
828:   }
829:   while (its--) {
830:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
831:       idiag = a->idiag;
832:       i2    = 0;
833:       switch (bs) {
834:       case 1:
835:         for (i = 0; i < m; i++) {
836:           v    = aa + ai[i];
837:           vi   = aj + ai[i];
838:           nz   = ai[i + 1] - ai[i];
839:           s[0] = b[i2];
840:           for (j = 0; j < nz; j++) {
841:             xw[0] = x[vi[j]];
842:             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
843:           }
844:           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
845:           x[i2] += xw[0];
846:           idiag += 1;
847:           i2 += 1;
848:         }
849:         break;
850:       case 2:
851:         for (i = 0; i < m; i++) {
852:           v    = aa + 4 * ai[i];
853:           vi   = aj + ai[i];
854:           nz   = ai[i + 1] - ai[i];
855:           s[0] = b[i2];
856:           s[1] = b[i2 + 1];
857:           for (j = 0; j < nz; j++) {
858:             idx   = 2 * vi[j];
859:             it    = 4 * j;
860:             xw[0] = x[idx];
861:             xw[1] = x[1 + idx];
862:             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
863:           }
864:           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
865:           x[i2] += xw[0];
866:           x[i2 + 1] += xw[1];
867:           idiag += 4;
868:           i2 += 2;
869:         }
870:         break;
871:       case 3:
872:         for (i = 0; i < m; i++) {
873:           v    = aa + 9 * ai[i];
874:           vi   = aj + ai[i];
875:           nz   = ai[i + 1] - ai[i];
876:           s[0] = b[i2];
877:           s[1] = b[i2 + 1];
878:           s[2] = b[i2 + 2];
879:           while (nz--) {
880:             idx   = 3 * (*vi++);
881:             xw[0] = x[idx];
882:             xw[1] = x[1 + idx];
883:             xw[2] = x[2 + idx];
884:             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
885:             v += 9;
886:           }
887:           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
888:           x[i2] += xw[0];
889:           x[i2 + 1] += xw[1];
890:           x[i2 + 2] += xw[2];
891:           idiag += 9;
892:           i2 += 3;
893:         }
894:         break;
895:       case 4:
896:         for (i = 0; i < m; i++) {
897:           v    = aa + 16 * ai[i];
898:           vi   = aj + ai[i];
899:           nz   = ai[i + 1] - ai[i];
900:           s[0] = b[i2];
901:           s[1] = b[i2 + 1];
902:           s[2] = b[i2 + 2];
903:           s[3] = b[i2 + 3];
904:           while (nz--) {
905:             idx   = 4 * (*vi++);
906:             xw[0] = x[idx];
907:             xw[1] = x[1 + idx];
908:             xw[2] = x[2 + idx];
909:             xw[3] = x[3 + idx];
910:             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
911:             v += 16;
912:           }
913:           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
914:           x[i2] += xw[0];
915:           x[i2 + 1] += xw[1];
916:           x[i2 + 2] += xw[2];
917:           x[i2 + 3] += xw[3];
918:           idiag += 16;
919:           i2 += 4;
920:         }
921:         break;
922:       case 5:
923:         for (i = 0; i < m; i++) {
924:           v    = aa + 25 * ai[i];
925:           vi   = aj + ai[i];
926:           nz   = ai[i + 1] - ai[i];
927:           s[0] = b[i2];
928:           s[1] = b[i2 + 1];
929:           s[2] = b[i2 + 2];
930:           s[3] = b[i2 + 3];
931:           s[4] = b[i2 + 4];
932:           while (nz--) {
933:             idx   = 5 * (*vi++);
934:             xw[0] = x[idx];
935:             xw[1] = x[1 + idx];
936:             xw[2] = x[2 + idx];
937:             xw[3] = x[3 + idx];
938:             xw[4] = x[4 + idx];
939:             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
940:             v += 25;
941:           }
942:           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
943:           x[i2] += xw[0];
944:           x[i2 + 1] += xw[1];
945:           x[i2 + 2] += xw[2];
946:           x[i2 + 3] += xw[3];
947:           x[i2 + 4] += xw[4];
948:           idiag += 25;
949:           i2 += 5;
950:         }
951:         break;
952:       case 6:
953:         for (i = 0; i < m; i++) {
954:           v    = aa + 36 * ai[i];
955:           vi   = aj + ai[i];
956:           nz   = ai[i + 1] - ai[i];
957:           s[0] = b[i2];
958:           s[1] = b[i2 + 1];
959:           s[2] = b[i2 + 2];
960:           s[3] = b[i2 + 3];
961:           s[4] = b[i2 + 4];
962:           s[5] = b[i2 + 5];
963:           while (nz--) {
964:             idx   = 6 * (*vi++);
965:             xw[0] = x[idx];
966:             xw[1] = x[1 + idx];
967:             xw[2] = x[2 + idx];
968:             xw[3] = x[3 + idx];
969:             xw[4] = x[4 + idx];
970:             xw[5] = x[5 + idx];
971:             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
972:             v += 36;
973:           }
974:           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
975:           x[i2] += xw[0];
976:           x[i2 + 1] += xw[1];
977:           x[i2 + 2] += xw[2];
978:           x[i2 + 3] += xw[3];
979:           x[i2 + 4] += xw[4];
980:           x[i2 + 5] += xw[5];
981:           idiag += 36;
982:           i2 += 6;
983:         }
984:         break;
985:       case 7:
986:         for (i = 0; i < m; i++) {
987:           v    = aa + 49 * ai[i];
988:           vi   = aj + ai[i];
989:           nz   = ai[i + 1] - ai[i];
990:           s[0] = b[i2];
991:           s[1] = b[i2 + 1];
992:           s[2] = b[i2 + 2];
993:           s[3] = b[i2 + 3];
994:           s[4] = b[i2 + 4];
995:           s[5] = b[i2 + 5];
996:           s[6] = b[i2 + 6];
997:           while (nz--) {
998:             idx   = 7 * (*vi++);
999:             xw[0] = x[idx];
1000:             xw[1] = x[1 + idx];
1001:             xw[2] = x[2 + idx];
1002:             xw[3] = x[3 + idx];
1003:             xw[4] = x[4 + idx];
1004:             xw[5] = x[5 + idx];
1005:             xw[6] = x[6 + idx];
1006:             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1007:             v += 49;
1008:           }
1009:           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1010:           x[i2] += xw[0];
1011:           x[i2 + 1] += xw[1];
1012:           x[i2 + 2] += xw[2];
1013:           x[i2 + 3] += xw[3];
1014:           x[i2 + 4] += xw[4];
1015:           x[i2 + 5] += xw[5];
1016:           x[i2 + 6] += xw[6];
1017:           idiag += 49;
1018:           i2 += 7;
1019:         }
1020:         break;
1021:       default:
1022:         for (i = 0; i < m; i++) {
1023:           v  = aa + bs2 * ai[i];
1024:           vi = aj + ai[i];
1025:           nz = ai[i + 1] - ai[i];

1027:           PetscArraycpy(w, b + i2, bs);
1028:           /* copy all rows of x that are needed into contiguous space */
1029:           workt = work;
1030:           for (j = 0; j < nz; j++) {
1031:             PetscArraycpy(workt, x + bs * (*vi++), bs);
1032:             workt += bs;
1033:           }
1034:           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1035:           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);

1037:           idiag += bs2;
1038:           i2 += bs;
1039:         }
1040:         break;
1041:       }
1042:       PetscLogFlops(2.0 * bs2 * a->nz);
1043:     }
1044:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
1045:       idiag = a->idiag + bs2 * (a->mbs - 1);
1046:       i2    = bs * (m - 1);
1047:       switch (bs) {
1048:       case 1:
1049:         for (i = m - 1; i >= 0; i--) {
1050:           v    = aa + ai[i];
1051:           vi   = aj + ai[i];
1052:           nz   = ai[i + 1] - ai[i];
1053:           s[0] = b[i2];
1054:           for (j = 0; j < nz; j++) {
1055:             xw[0] = x[vi[j]];
1056:             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
1057:           }
1058:           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
1059:           x[i2] += xw[0];
1060:           idiag -= 1;
1061:           i2 -= 1;
1062:         }
1063:         break;
1064:       case 2:
1065:         for (i = m - 1; i >= 0; i--) {
1066:           v    = aa + 4 * ai[i];
1067:           vi   = aj + ai[i];
1068:           nz   = ai[i + 1] - ai[i];
1069:           s[0] = b[i2];
1070:           s[1] = b[i2 + 1];
1071:           for (j = 0; j < nz; j++) {
1072:             idx   = 2 * vi[j];
1073:             it    = 4 * j;
1074:             xw[0] = x[idx];
1075:             xw[1] = x[1 + idx];
1076:             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
1077:           }
1078:           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
1079:           x[i2] += xw[0];
1080:           x[i2 + 1] += xw[1];
1081:           idiag -= 4;
1082:           i2 -= 2;
1083:         }
1084:         break;
1085:       case 3:
1086:         for (i = m - 1; i >= 0; i--) {
1087:           v    = aa + 9 * ai[i];
1088:           vi   = aj + ai[i];
1089:           nz   = ai[i + 1] - ai[i];
1090:           s[0] = b[i2];
1091:           s[1] = b[i2 + 1];
1092:           s[2] = b[i2 + 2];
1093:           while (nz--) {
1094:             idx   = 3 * (*vi++);
1095:             xw[0] = x[idx];
1096:             xw[1] = x[1 + idx];
1097:             xw[2] = x[2 + idx];
1098:             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
1099:             v += 9;
1100:           }
1101:           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
1102:           x[i2] += xw[0];
1103:           x[i2 + 1] += xw[1];
1104:           x[i2 + 2] += xw[2];
1105:           idiag -= 9;
1106:           i2 -= 3;
1107:         }
1108:         break;
1109:       case 4:
1110:         for (i = m - 1; i >= 0; i--) {
1111:           v    = aa + 16 * ai[i];
1112:           vi   = aj + ai[i];
1113:           nz   = ai[i + 1] - ai[i];
1114:           s[0] = b[i2];
1115:           s[1] = b[i2 + 1];
1116:           s[2] = b[i2 + 2];
1117:           s[3] = b[i2 + 3];
1118:           while (nz--) {
1119:             idx   = 4 * (*vi++);
1120:             xw[0] = x[idx];
1121:             xw[1] = x[1 + idx];
1122:             xw[2] = x[2 + idx];
1123:             xw[3] = x[3 + idx];
1124:             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
1125:             v += 16;
1126:           }
1127:           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
1128:           x[i2] += xw[0];
1129:           x[i2 + 1] += xw[1];
1130:           x[i2 + 2] += xw[2];
1131:           x[i2 + 3] += xw[3];
1132:           idiag -= 16;
1133:           i2 -= 4;
1134:         }
1135:         break;
1136:       case 5:
1137:         for (i = m - 1; i >= 0; i--) {
1138:           v    = aa + 25 * ai[i];
1139:           vi   = aj + ai[i];
1140:           nz   = ai[i + 1] - ai[i];
1141:           s[0] = b[i2];
1142:           s[1] = b[i2 + 1];
1143:           s[2] = b[i2 + 2];
1144:           s[3] = b[i2 + 3];
1145:           s[4] = b[i2 + 4];
1146:           while (nz--) {
1147:             idx   = 5 * (*vi++);
1148:             xw[0] = x[idx];
1149:             xw[1] = x[1 + idx];
1150:             xw[2] = x[2 + idx];
1151:             xw[3] = x[3 + idx];
1152:             xw[4] = x[4 + idx];
1153:             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
1154:             v += 25;
1155:           }
1156:           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
1157:           x[i2] += xw[0];
1158:           x[i2 + 1] += xw[1];
1159:           x[i2 + 2] += xw[2];
1160:           x[i2 + 3] += xw[3];
1161:           x[i2 + 4] += xw[4];
1162:           idiag -= 25;
1163:           i2 -= 5;
1164:         }
1165:         break;
1166:       case 6:
1167:         for (i = m - 1; i >= 0; i--) {
1168:           v    = aa + 36 * ai[i];
1169:           vi   = aj + ai[i];
1170:           nz   = ai[i + 1] - ai[i];
1171:           s[0] = b[i2];
1172:           s[1] = b[i2 + 1];
1173:           s[2] = b[i2 + 2];
1174:           s[3] = b[i2 + 3];
1175:           s[4] = b[i2 + 4];
1176:           s[5] = b[i2 + 5];
1177:           while (nz--) {
1178:             idx   = 6 * (*vi++);
1179:             xw[0] = x[idx];
1180:             xw[1] = x[1 + idx];
1181:             xw[2] = x[2 + idx];
1182:             xw[3] = x[3 + idx];
1183:             xw[4] = x[4 + idx];
1184:             xw[5] = x[5 + idx];
1185:             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
1186:             v += 36;
1187:           }
1188:           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
1189:           x[i2] += xw[0];
1190:           x[i2 + 1] += xw[1];
1191:           x[i2 + 2] += xw[2];
1192:           x[i2 + 3] += xw[3];
1193:           x[i2 + 4] += xw[4];
1194:           x[i2 + 5] += xw[5];
1195:           idiag -= 36;
1196:           i2 -= 6;
1197:         }
1198:         break;
1199:       case 7:
1200:         for (i = m - 1; i >= 0; i--) {
1201:           v    = aa + 49 * ai[i];
1202:           vi   = aj + ai[i];
1203:           nz   = ai[i + 1] - ai[i];
1204:           s[0] = b[i2];
1205:           s[1] = b[i2 + 1];
1206:           s[2] = b[i2 + 2];
1207:           s[3] = b[i2 + 3];
1208:           s[4] = b[i2 + 4];
1209:           s[5] = b[i2 + 5];
1210:           s[6] = b[i2 + 6];
1211:           while (nz--) {
1212:             idx   = 7 * (*vi++);
1213:             xw[0] = x[idx];
1214:             xw[1] = x[1 + idx];
1215:             xw[2] = x[2 + idx];
1216:             xw[3] = x[3 + idx];
1217:             xw[4] = x[4 + idx];
1218:             xw[5] = x[5 + idx];
1219:             xw[6] = x[6 + idx];
1220:             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1221:             v += 49;
1222:           }
1223:           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1224:           x[i2] += xw[0];
1225:           x[i2 + 1] += xw[1];
1226:           x[i2 + 2] += xw[2];
1227:           x[i2 + 3] += xw[3];
1228:           x[i2 + 4] += xw[4];
1229:           x[i2 + 5] += xw[5];
1230:           x[i2 + 6] += xw[6];
1231:           idiag -= 49;
1232:           i2 -= 7;
1233:         }
1234:         break;
1235:       default:
1236:         for (i = m - 1; i >= 0; i--) {
1237:           v  = aa + bs2 * ai[i];
1238:           vi = aj + ai[i];
1239:           nz = ai[i + 1] - ai[i];

1241:           PetscArraycpy(w, b + i2, bs);
1242:           /* copy all rows of x that are needed into contiguous space */
1243:           workt = work;
1244:           for (j = 0; j < nz; j++) {
1245:             PetscArraycpy(workt, x + bs * (*vi++), bs);
1246:             workt += bs;
1247:           }
1248:           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1249:           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);

1251:           idiag -= bs2;
1252:           i2 -= bs;
1253:         }
1254:         break;
1255:       }
1256:       PetscLogFlops(2.0 * bs2 * (a->nz));
1257:     }
1258:   }
1259:   VecRestoreArray(xx, &x);
1260:   VecRestoreArrayRead(bb, &b);
1261:   return 0;
1262: }

1264: /*
1265:     Special version for direct calls from Fortran (Used in PETSc-fun3d)
1266: */
1267: #if defined(PETSC_HAVE_FORTRAN_CAPS)
1268:   #define matsetvaluesblocked4_ MATSETVALUESBLOCKED4
1269: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1270:   #define matsetvaluesblocked4_ matsetvaluesblocked4
1271: #endif

1273: PETSC_EXTERN void matsetvaluesblocked4_(Mat *AA, PetscInt *mm, const PetscInt im[], PetscInt *nn, const PetscInt in[], const PetscScalar v[])
1274: {
1275:   Mat                A = *AA;
1276:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1277:   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, N, m = *mm, n = *nn;
1278:   PetscInt          *ai = a->i, *ailen = a->ilen;
1279:   PetscInt          *aj = a->j, stepval, lastcol = -1;
1280:   const PetscScalar *value = v;
1281:   MatScalar         *ap, *aa = a->a, *bap;

1283:   if (A->rmap->bs != 4) SETERRABORT(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Can only be called with a block size of 4");
1284:   stepval = (n - 1) * 4;
1285:   for (k = 0; k < m; k++) { /* loop over added rows */
1286:     row  = im[k];
1287:     rp   = aj + ai[row];
1288:     ap   = aa + 16 * ai[row];
1289:     nrow = ailen[row];
1290:     low  = 0;
1291:     high = nrow;
1292:     for (l = 0; l < n; l++) { /* loop over added columns */
1293:       col = in[l];
1294:       if (col <= lastcol) low = 0;
1295:       else high = nrow;
1296:       lastcol = col;
1297:       value   = v + k * (stepval + 4 + l) * 4;
1298:       while (high - low > 7) {
1299:         t = (low + high) / 2;
1300:         if (rp[t] > col) high = t;
1301:         else low = t;
1302:       }
1303:       for (i = low; i < high; i++) {
1304:         if (rp[i] > col) break;
1305:         if (rp[i] == col) {
1306:           bap = ap + 16 * i;
1307:           for (ii = 0; ii < 4; ii++, value += stepval) {
1308:             for (jj = ii; jj < 16; jj += 4) bap[jj] += *value++;
1309:           }
1310:           goto noinsert2;
1311:         }
1312:       }
1313:       N = nrow++ - 1;
1314:       high++; /* added new column index thus must search to one higher than before */
1315:       /* shift up all the later entries in this row */
1316:       for (ii = N; ii >= i; ii--) {
1317:         rp[ii + 1] = rp[ii];
1318:         PetscArraycpy(ap + 16 * (ii + 1), ap + 16 * (ii), 16);
1319:       }
1320:       if (N >= i) PetscArrayzero(ap + 16 * i, 16);
1321:       rp[i] = col;
1322:       bap   = ap + 16 * i;
1323:       for (ii = 0; ii < 4; ii++, value += stepval) {
1324:         for (jj = ii; jj < 16; jj += 4) bap[jj] = *value++;
1325:       }
1326:     noinsert2:;
1327:       low = i;
1328:     }
1329:     ailen[row] = nrow;
1330:   }
1331:   return;
1332: }

1334: #if defined(PETSC_HAVE_FORTRAN_CAPS)
1335:   #define matsetvalues4_ MATSETVALUES4
1336: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1337:   #define matsetvalues4_ matsetvalues4
1338: #endif

1340: PETSC_EXTERN void matsetvalues4_(Mat *AA, PetscInt *mm, PetscInt *im, PetscInt *nn, PetscInt *in, PetscScalar *v)
1341: {
1342:   Mat          A = *AA;
1343:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1344:   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, N, n = *nn, m = *mm;
1345:   PetscInt    *ai = a->i, *ailen = a->ilen;
1346:   PetscInt    *aj = a->j, brow, bcol;
1347:   PetscInt     ridx, cidx, lastcol = -1;
1348:   MatScalar   *ap, value, *aa      = a->a, *bap;

1350:   for (k = 0; k < m; k++) { /* loop over added rows */
1351:     row  = im[k];
1352:     brow = row / 4;
1353:     rp   = aj + ai[brow];
1354:     ap   = aa + 16 * ai[brow];
1355:     nrow = ailen[brow];
1356:     low  = 0;
1357:     high = nrow;
1358:     for (l = 0; l < n; l++) { /* loop over added columns */
1359:       col   = in[l];
1360:       bcol  = col / 4;
1361:       ridx  = row % 4;
1362:       cidx  = col % 4;
1363:       value = v[l + k * n];
1364:       if (col <= lastcol) low = 0;
1365:       else high = nrow;
1366:       lastcol = col;
1367:       while (high - low > 7) {
1368:         t = (low + high) / 2;
1369:         if (rp[t] > bcol) high = t;
1370:         else low = t;
1371:       }
1372:       for (i = low; i < high; i++) {
1373:         if (rp[i] > bcol) break;
1374:         if (rp[i] == bcol) {
1375:           bap = ap + 16 * i + 4 * cidx + ridx;
1376:           *bap += value;
1377:           goto noinsert1;
1378:         }
1379:       }
1380:       N = nrow++ - 1;
1381:       high++; /* added new column thus must search to one higher than before */
1382:       /* shift up all the later entries in this row */
1383:       PetscArraymove(rp + i + 1, rp + i, N - i + 1);
1384:       PetscArraymove(ap + 16 * i + 16, ap + 16 * i, 16 * (N - i + 1));
1385:       PetscArrayzero(ap + 16 * i, 16);
1386:       rp[i]                        = bcol;
1387:       ap[16 * i + 4 * cidx + ridx] = value;
1388:     noinsert1:;
1389:       low = i;
1390:     }
1391:     ailen[brow] = nrow;
1392:   }
1393:   return;
1394: }

1396: /*
1397:      Checks for missing diagonals
1398: */
1399: PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat A, PetscBool *missing, PetscInt *d)
1400: {
1401:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1402:   PetscInt    *diag, *ii = a->i, i;

1404:   MatMarkDiagonal_SeqBAIJ(A);
1405:   *missing = PETSC_FALSE;
1406:   if (A->rmap->n > 0 && !ii) {
1407:     *missing = PETSC_TRUE;
1408:     if (d) *d = 0;
1409:     PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n");
1410:   } else {
1411:     PetscInt n;
1412:     n    = PetscMin(a->mbs, a->nbs);
1413:     diag = a->diag;
1414:     for (i = 0; i < n; i++) {
1415:       if (diag[i] >= ii[i + 1]) {
1416:         *missing = PETSC_TRUE;
1417:         if (d) *d = i;
1418:         PetscInfo(A, "Matrix is missing block diagonal number %" PetscInt_FMT "\n", i);
1419:         break;
1420:       }
1421:     }
1422:   }
1423:   return 0;
1424: }

1426: PetscErrorCode MatMarkDiagonal_SeqBAIJ(Mat A)
1427: {
1428:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1429:   PetscInt     i, j, m = a->mbs;

1431:   if (!a->diag) {
1432:     PetscMalloc1(m, &a->diag);
1433:     a->free_diag = PETSC_TRUE;
1434:   }
1435:   for (i = 0; i < m; i++) {
1436:     a->diag[i] = a->i[i + 1];
1437:     for (j = a->i[i]; j < a->i[i + 1]; j++) {
1438:       if (a->j[j] == i) {
1439:         a->diag[i] = j;
1440:         break;
1441:       }
1442:     }
1443:   }
1444:   return 0;
1445: }

1447: static PetscErrorCode MatGetRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *inia[], const PetscInt *inja[], PetscBool *done)
1448: {
1449:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1450:   PetscInt     i, j, n = a->mbs, nz = a->i[n], *tia, *tja, bs = A->rmap->bs, k, l, cnt;
1451:   PetscInt   **ia = (PetscInt **)inia, **ja = (PetscInt **)inja;

1453:   *nn = n;
1454:   if (!ia) return 0;
1455:   if (symmetric) {
1456:     MatToSymmetricIJ_SeqAIJ(n, a->i, a->j, PETSC_TRUE, 0, 0, &tia, &tja);
1457:     nz = tia[n];
1458:   } else {
1459:     tia = a->i;
1460:     tja = a->j;
1461:   }

1463:   if (!blockcompressed && bs > 1) {
1464:     (*nn) *= bs;
1465:     /* malloc & create the natural set of indices */
1466:     PetscMalloc1((n + 1) * bs, ia);
1467:     if (n) {
1468:       (*ia)[0] = oshift;
1469:       for (j = 1; j < bs; j++) (*ia)[j] = (tia[1] - tia[0]) * bs + (*ia)[j - 1];
1470:     }

1472:     for (i = 1; i < n; i++) {
1473:       (*ia)[i * bs] = (tia[i] - tia[i - 1]) * bs + (*ia)[i * bs - 1];
1474:       for (j = 1; j < bs; j++) (*ia)[i * bs + j] = (tia[i + 1] - tia[i]) * bs + (*ia)[i * bs + j - 1];
1475:     }
1476:     if (n) (*ia)[n * bs] = (tia[n] - tia[n - 1]) * bs + (*ia)[n * bs - 1];

1478:     if (inja) {
1479:       PetscMalloc1(nz * bs * bs, ja);
1480:       cnt = 0;
1481:       for (i = 0; i < n; i++) {
1482:         for (j = 0; j < bs; j++) {
1483:           for (k = tia[i]; k < tia[i + 1]; k++) {
1484:             for (l = 0; l < bs; l++) (*ja)[cnt++] = bs * tja[k] + l;
1485:           }
1486:         }
1487:       }
1488:     }

1490:     if (symmetric) { /* deallocate memory allocated in MatToSymmetricIJ_SeqAIJ() */
1491:       PetscFree(tia);
1492:       PetscFree(tja);
1493:     }
1494:   } else if (oshift == 1) {
1495:     if (symmetric) {
1496:       nz = tia[A->rmap->n / bs];
1497:       /*  add 1 to i and j indices */
1498:       for (i = 0; i < A->rmap->n / bs + 1; i++) tia[i] = tia[i] + 1;
1499:       *ia = tia;
1500:       if (ja) {
1501:         for (i = 0; i < nz; i++) tja[i] = tja[i] + 1;
1502:         *ja = tja;
1503:       }
1504:     } else {
1505:       nz = a->i[A->rmap->n / bs];
1506:       /* malloc space and  add 1 to i and j indices */
1507:       PetscMalloc1(A->rmap->n / bs + 1, ia);
1508:       for (i = 0; i < A->rmap->n / bs + 1; i++) (*ia)[i] = a->i[i] + 1;
1509:       if (ja) {
1510:         PetscMalloc1(nz, ja);
1511:         for (i = 0; i < nz; i++) (*ja)[i] = a->j[i] + 1;
1512:       }
1513:     }
1514:   } else {
1515:     *ia = tia;
1516:     if (ja) *ja = tja;
1517:   }
1518:   return 0;
1519: }

1521: static PetscErrorCode MatRestoreRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
1522: {
1523:   if (!ia) return 0;
1524:   if ((!blockcompressed && A->rmap->bs > 1) || (symmetric || oshift == 1)) {
1525:     PetscFree(*ia);
1526:     if (ja) PetscFree(*ja);
1527:   }
1528:   return 0;
1529: }

1531: PetscErrorCode MatDestroy_SeqBAIJ(Mat A)
1532: {
1533:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

1535: #if defined(PETSC_USE_LOG)
1536:   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->N, A->cmap->n, a->nz);
1537: #endif
1538:   MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i);
1539:   ISDestroy(&a->row);
1540:   ISDestroy(&a->col);
1541:   if (a->free_diag) PetscFree(a->diag);
1542:   PetscFree(a->idiag);
1543:   if (a->free_imax_ilen) PetscFree2(a->imax, a->ilen);
1544:   PetscFree(a->solve_work);
1545:   PetscFree(a->mult_work);
1546:   PetscFree(a->sor_workt);
1547:   PetscFree(a->sor_work);
1548:   ISDestroy(&a->icol);
1549:   PetscFree(a->saved_values);
1550:   PetscFree2(a->compressedrow.i, a->compressedrow.rindex);

1552:   MatDestroy(&a->sbaijMat);
1553:   MatDestroy(&a->parent);
1554:   PetscFree(A->data);

1556:   PetscObjectChangeTypeName((PetscObject)A, NULL);
1557:   PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJGetArray_C", NULL);
1558:   PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJRestoreArray_C", NULL);
1559:   PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL);
1560:   PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL);
1561:   PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetColumnIndices_C", NULL);
1562:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqaij_C", NULL);
1563:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqsbaij_C", NULL);
1564:   PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocation_C", NULL);
1565:   PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocationCSR_C", NULL);
1566:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqbstrm_C", NULL);
1567:   PetscObjectComposeFunction((PetscObject)A, "MatIsTranspose_C", NULL);
1568: #if defined(PETSC_HAVE_HYPRE)
1569:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_hypre_C", NULL);
1570: #endif
1571:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_is_C", NULL);
1572:   PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL);
1573:   return 0;
1574: }

1576: PetscErrorCode MatSetOption_SeqBAIJ(Mat A, MatOption op, PetscBool flg)
1577: {
1578:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

1580:   switch (op) {
1581:   case MAT_ROW_ORIENTED:
1582:     a->roworiented = flg;
1583:     break;
1584:   case MAT_KEEP_NONZERO_PATTERN:
1585:     a->keepnonzeropattern = flg;
1586:     break;
1587:   case MAT_NEW_NONZERO_LOCATIONS:
1588:     a->nonew = (flg ? 0 : 1);
1589:     break;
1590:   case MAT_NEW_NONZERO_LOCATION_ERR:
1591:     a->nonew = (flg ? -1 : 0);
1592:     break;
1593:   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1594:     a->nonew = (flg ? -2 : 0);
1595:     break;
1596:   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1597:     a->nounused = (flg ? -1 : 0);
1598:     break;
1599:   case MAT_FORCE_DIAGONAL_ENTRIES:
1600:   case MAT_IGNORE_OFF_PROC_ENTRIES:
1601:   case MAT_USE_HASH_TABLE:
1602:   case MAT_SORTED_FULL:
1603:     PetscInfo(A, "Option %s ignored\n", MatOptions[op]);
1604:     break;
1605:   case MAT_SPD:
1606:   case MAT_SYMMETRIC:
1607:   case MAT_STRUCTURALLY_SYMMETRIC:
1608:   case MAT_HERMITIAN:
1609:   case MAT_SYMMETRY_ETERNAL:
1610:   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1611:   case MAT_SUBMAT_SINGLEIS:
1612:   case MAT_STRUCTURE_ONLY:
1613:   case MAT_SPD_ETERNAL:
1614:     /* if the diagonal matrix is square it inherits some of the properties above */
1615:     break;
1616:   default:
1617:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
1618:   }
1619:   return 0;
1620: }

1622: /* used for both SeqBAIJ and SeqSBAIJ matrices */
1623: PetscErrorCode MatGetRow_SeqBAIJ_private(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v, PetscInt *ai, PetscInt *aj, PetscScalar *aa)
1624: {
1625:   PetscInt     itmp, i, j, k, M, bn, bp, *idx_i, bs, bs2;
1626:   MatScalar   *aa_i;
1627:   PetscScalar *v_i;

1629:   bs  = A->rmap->bs;
1630:   bs2 = bs * bs;

1633:   bn  = row / bs; /* Block number */
1634:   bp  = row % bs; /* Block Position */
1635:   M   = ai[bn + 1] - ai[bn];
1636:   *nz = bs * M;

1638:   if (v) {
1639:     *v = NULL;
1640:     if (*nz) {
1641:       PetscMalloc1(*nz, v);
1642:       for (i = 0; i < M; i++) { /* for each block in the block row */
1643:         v_i  = *v + i * bs;
1644:         aa_i = aa + bs2 * (ai[bn] + i);
1645:         for (j = bp, k = 0; j < bs2; j += bs, k++) v_i[k] = aa_i[j];
1646:       }
1647:     }
1648:   }

1650:   if (idx) {
1651:     *idx = NULL;
1652:     if (*nz) {
1653:       PetscMalloc1(*nz, idx);
1654:       for (i = 0; i < M; i++) { /* for each block in the block row */
1655:         idx_i = *idx + i * bs;
1656:         itmp  = bs * aj[ai[bn] + i];
1657:         for (j = 0; j < bs; j++) idx_i[j] = itmp++;
1658:       }
1659:     }
1660:   }
1661:   return 0;
1662: }

1664: PetscErrorCode MatGetRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1665: {
1666:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

1668:   MatGetRow_SeqBAIJ_private(A, row, nz, idx, v, a->i, a->j, a->a);
1669:   return 0;
1670: }

1672: PetscErrorCode MatRestoreRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1673: {
1674:   if (nz) *nz = 0;
1675:   if (idx) PetscFree(*idx);
1676:   if (v) PetscFree(*v);
1677:   return 0;
1678: }

1680: PetscErrorCode MatTranspose_SeqBAIJ(Mat A, MatReuse reuse, Mat *B)
1681: {
1682:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *at;
1683:   Mat          C;
1684:   PetscInt     i, j, k, *aj = a->j, *ai = a->i, bs = A->rmap->bs, mbs = a->mbs, nbs = a->nbs, *atfill;
1685:   PetscInt     bs2 = a->bs2, *ati, *atj, anzj, kr;
1686:   MatScalar   *ata, *aa = a->a;

1688:   if (reuse == MAT_REUSE_MATRIX) MatTransposeCheckNonzeroState_Private(A, *B);
1689:   PetscCalloc1(1 + nbs, &atfill);
1690:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
1691:     for (i = 0; i < ai[mbs]; i++) atfill[aj[i]] += 1; /* count num of non-zeros in row aj[i] */

1693:     MatCreate(PetscObjectComm((PetscObject)A), &C);
1694:     MatSetSizes(C, A->cmap->n, A->rmap->N, A->cmap->n, A->rmap->N);
1695:     MatSetType(C, ((PetscObject)A)->type_name);
1696:     MatSeqBAIJSetPreallocation(C, bs, 0, atfill);

1698:     at  = (Mat_SeqBAIJ *)C->data;
1699:     ati = at->i;
1700:     for (i = 0; i < nbs; i++) at->ilen[i] = at->imax[i] = ati[i + 1] - ati[i];
1701:   } else {
1702:     C   = *B;
1703:     at  = (Mat_SeqBAIJ *)C->data;
1704:     ati = at->i;
1705:   }

1707:   atj = at->j;
1708:   ata = at->a;

1710:   /* Copy ati into atfill so we have locations of the next free space in atj */
1711:   PetscArraycpy(atfill, ati, nbs);

1713:   /* Walk through A row-wise and mark nonzero entries of A^T. */
1714:   for (i = 0; i < mbs; i++) {
1715:     anzj = ai[i + 1] - ai[i];
1716:     for (j = 0; j < anzj; j++) {
1717:       atj[atfill[*aj]] = i;
1718:       for (kr = 0; kr < bs; kr++) {
1719:         for (k = 0; k < bs; k++) ata[bs2 * atfill[*aj] + k * bs + kr] = *aa++;
1720:       }
1721:       atfill[*aj++] += 1;
1722:     }
1723:   }
1724:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
1725:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);

1727:   /* Clean up temporary space and complete requests. */
1728:   PetscFree(atfill);

1730:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1731:     MatSetBlockSizes(C, PetscAbs(A->cmap->bs), PetscAbs(A->rmap->bs));
1732:     *B = C;
1733:   } else {
1734:     MatHeaderMerge(A, &C);
1735:   }
1736:   return 0;
1737: }

1739: PetscErrorCode MatIsTranspose_SeqBAIJ(Mat A, Mat B, PetscReal tol, PetscBool *f)
1740: {
1741:   Mat Btrans;

1743:   *f = PETSC_FALSE;
1744:   MatTranspose(A, MAT_INITIAL_MATRIX, &Btrans);
1745:   MatEqual_SeqBAIJ(B, Btrans, f);
1746:   MatDestroy(&Btrans);
1747:   return 0;
1748: }

1750: /* Used for both SeqBAIJ and SeqSBAIJ matrices */
1751: PetscErrorCode MatView_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
1752: {
1753:   Mat_SeqBAIJ *A = (Mat_SeqBAIJ *)mat->data;
1754:   PetscInt     header[4], M, N, m, bs, nz, cnt, i, j, k, l;
1755:   PetscInt    *rowlens, *colidxs;
1756:   PetscScalar *matvals;

1758:   PetscViewerSetUp(viewer);

1760:   M  = mat->rmap->N;
1761:   N  = mat->cmap->N;
1762:   m  = mat->rmap->n;
1763:   bs = mat->rmap->bs;
1764:   nz = bs * bs * A->nz;

1766:   /* write matrix header */
1767:   header[0] = MAT_FILE_CLASSID;
1768:   header[1] = M;
1769:   header[2] = N;
1770:   header[3] = nz;
1771:   PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT);

1773:   /* store row lengths */
1774:   PetscMalloc1(m, &rowlens);
1775:   for (cnt = 0, i = 0; i < A->mbs; i++)
1776:     for (j = 0; j < bs; j++) rowlens[cnt++] = bs * (A->i[i + 1] - A->i[i]);
1777:   PetscViewerBinaryWrite(viewer, rowlens, m, PETSC_INT);
1778:   PetscFree(rowlens);

1780:   /* store column indices  */
1781:   PetscMalloc1(nz, &colidxs);
1782:   for (cnt = 0, i = 0; i < A->mbs; i++)
1783:     for (k = 0; k < bs; k++)
1784:       for (j = A->i[i]; j < A->i[i + 1]; j++)
1785:         for (l = 0; l < bs; l++) colidxs[cnt++] = bs * A->j[j] + l;
1787:   PetscViewerBinaryWrite(viewer, colidxs, nz, PETSC_INT);
1788:   PetscFree(colidxs);

1790:   /* store nonzero values */
1791:   PetscMalloc1(nz, &matvals);
1792:   for (cnt = 0, i = 0; i < A->mbs; i++)
1793:     for (k = 0; k < bs; k++)
1794:       for (j = A->i[i]; j < A->i[i + 1]; j++)
1795:         for (l = 0; l < bs; l++) matvals[cnt++] = A->a[bs * (bs * j + l) + k];
1797:   PetscViewerBinaryWrite(viewer, matvals, nz, PETSC_SCALAR);
1798:   PetscFree(matvals);

1800:   /* write block size option to the viewer's .info file */
1801:   MatView_Binary_BlockSizes(mat, viewer);
1802:   return 0;
1803: }

1805: static PetscErrorCode MatView_SeqBAIJ_ASCII_structonly(Mat A, PetscViewer viewer)
1806: {
1807:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1808:   PetscInt     i, bs = A->rmap->bs, k;

1810:   PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1811:   for (i = 0; i < a->mbs; i++) {
1812:     PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT "-%" PetscInt_FMT ":", i * bs, i * bs + bs - 1);
1813:     for (k = a->i[i]; k < a->i[i + 1]; k++) PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT "-%" PetscInt_FMT ") ", bs * a->j[k], bs * a->j[k] + bs - 1);
1814:     PetscViewerASCIIPrintf(viewer, "\n");
1815:   }
1816:   PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1817:   return 0;
1818: }

1820: static PetscErrorCode MatView_SeqBAIJ_ASCII(Mat A, PetscViewer viewer)
1821: {
1822:   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1823:   PetscInt          i, j, bs = A->rmap->bs, k, l, bs2 = a->bs2;
1824:   PetscViewerFormat format;

1826:   if (A->structure_only) {
1827:     MatView_SeqBAIJ_ASCII_structonly(A, viewer);
1828:     return 0;
1829:   }

1831:   PetscViewerGetFormat(viewer, &format);
1832:   if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1833:     PetscViewerASCIIPrintf(viewer, "  block size is %" PetscInt_FMT "\n", bs);
1834:   } else if (format == PETSC_VIEWER_ASCII_MATLAB) {
1835:     const char *matname;
1836:     Mat         aij;
1837:     MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &aij);
1838:     PetscObjectGetName((PetscObject)A, &matname);
1839:     PetscObjectSetName((PetscObject)aij, matname);
1840:     MatView(aij, viewer);
1841:     MatDestroy(&aij);
1842:   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1843:     return 0;
1844:   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1845:     PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1846:     for (i = 0; i < a->mbs; i++) {
1847:       for (j = 0; j < bs; j++) {
1848:         PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j);
1849:         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1850:           for (l = 0; l < bs; l++) {
1851: #if defined(PETSC_USE_COMPLEX)
1852:             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1853:               PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]));
1854:             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1855:               PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]));
1856:             } else if (PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1857:               PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]));
1858:             }
1859: #else
1860:             if (a->a[bs2 * k + l * bs + j] != 0.0) PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]);
1861: #endif
1862:           }
1863:         }
1864:         PetscViewerASCIIPrintf(viewer, "\n");
1865:       }
1866:     }
1867:     PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1868:   } else {
1869:     PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1870:     for (i = 0; i < a->mbs; i++) {
1871:       for (j = 0; j < bs; j++) {
1872:         PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j);
1873:         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1874:           for (l = 0; l < bs; l++) {
1875: #if defined(PETSC_USE_COMPLEX)
1876:             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0) {
1877:               PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]));
1878:             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0) {
1879:               PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]));
1880:             } else {
1881:               PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]));
1882:             }
1883: #else
1884:             PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]);
1885: #endif
1886:           }
1887:         }
1888:         PetscViewerASCIIPrintf(viewer, "\n");
1889:       }
1890:     }
1891:     PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1892:   }
1893:   PetscViewerFlush(viewer);
1894:   return 0;
1895: }

1897: #include <petscdraw.h>
1898: static PetscErrorCode MatView_SeqBAIJ_Draw_Zoom(PetscDraw draw, void *Aa)
1899: {
1900:   Mat               A = (Mat)Aa;
1901:   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1902:   PetscInt          row, i, j, k, l, mbs = a->mbs, color, bs = A->rmap->bs, bs2 = a->bs2;
1903:   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1904:   MatScalar        *aa;
1905:   PetscViewer       viewer;
1906:   PetscViewerFormat format;

1908:   PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer);
1909:   PetscViewerGetFormat(viewer, &format);
1910:   PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr);

1912:   /* loop over matrix elements drawing boxes */

1914:   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1915:     PetscDrawCollectiveBegin(draw);
1916:     /* Blue for negative, Cyan for zero and  Red for positive */
1917:     color = PETSC_DRAW_BLUE;
1918:     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1919:       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1920:         y_l = A->rmap->N - row - 1.0;
1921:         y_r = y_l + 1.0;
1922:         x_l = a->j[j] * bs;
1923:         x_r = x_l + 1.0;
1924:         aa  = a->a + j * bs2;
1925:         for (k = 0; k < bs; k++) {
1926:           for (l = 0; l < bs; l++) {
1927:             if (PetscRealPart(*aa++) >= 0.) continue;
1928:             PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color);
1929:           }
1930:         }
1931:       }
1932:     }
1933:     color = PETSC_DRAW_CYAN;
1934:     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1935:       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1936:         y_l = A->rmap->N - row - 1.0;
1937:         y_r = y_l + 1.0;
1938:         x_l = a->j[j] * bs;
1939:         x_r = x_l + 1.0;
1940:         aa  = a->a + j * bs2;
1941:         for (k = 0; k < bs; k++) {
1942:           for (l = 0; l < bs; l++) {
1943:             if (PetscRealPart(*aa++) != 0.) continue;
1944:             PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color);
1945:           }
1946:         }
1947:       }
1948:     }
1949:     color = PETSC_DRAW_RED;
1950:     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1951:       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1952:         y_l = A->rmap->N - row - 1.0;
1953:         y_r = y_l + 1.0;
1954:         x_l = a->j[j] * bs;
1955:         x_r = x_l + 1.0;
1956:         aa  = a->a + j * bs2;
1957:         for (k = 0; k < bs; k++) {
1958:           for (l = 0; l < bs; l++) {
1959:             if (PetscRealPart(*aa++) <= 0.) continue;
1960:             PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color);
1961:           }
1962:         }
1963:       }
1964:     }
1965:     PetscDrawCollectiveEnd(draw);
1966:   } else {
1967:     /* use contour shading to indicate magnitude of values */
1968:     /* first determine max of all nonzero values */
1969:     PetscReal minv = 0.0, maxv = 0.0;
1970:     PetscDraw popup;

1972:     for (i = 0; i < a->nz * a->bs2; i++) {
1973:       if (PetscAbsScalar(a->a[i]) > maxv) maxv = PetscAbsScalar(a->a[i]);
1974:     }
1975:     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1976:     PetscDrawGetPopup(draw, &popup);
1977:     PetscDrawScalePopup(popup, 0.0, maxv);

1979:     PetscDrawCollectiveBegin(draw);
1980:     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1981:       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1982:         y_l = A->rmap->N - row - 1.0;
1983:         y_r = y_l + 1.0;
1984:         x_l = a->j[j] * bs;
1985:         x_r = x_l + 1.0;
1986:         aa  = a->a + j * bs2;
1987:         for (k = 0; k < bs; k++) {
1988:           for (l = 0; l < bs; l++) {
1989:             MatScalar v = *aa++;
1990:             color       = PetscDrawRealToColor(PetscAbsScalar(v), minv, maxv);
1991:             PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color);
1992:           }
1993:         }
1994:       }
1995:     }
1996:     PetscDrawCollectiveEnd(draw);
1997:   }
1998:   return 0;
1999: }

2001: static PetscErrorCode MatView_SeqBAIJ_Draw(Mat A, PetscViewer viewer)
2002: {
2003:   PetscReal xl, yl, xr, yr, w, h;
2004:   PetscDraw draw;
2005:   PetscBool isnull;

2007:   PetscViewerDrawGetDraw(viewer, 0, &draw);
2008:   PetscDrawIsNull(draw, &isnull);
2009:   if (isnull) return 0;

2011:   xr = A->cmap->n;
2012:   yr = A->rmap->N;
2013:   h  = yr / 10.0;
2014:   w  = xr / 10.0;
2015:   xr += w;
2016:   yr += h;
2017:   xl = -w;
2018:   yl = -h;
2019:   PetscDrawSetCoordinates(draw, xl, yl, xr, yr);
2020:   PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer);
2021:   PetscDrawZoom(draw, MatView_SeqBAIJ_Draw_Zoom, A);
2022:   PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL);
2023:   PetscDrawSave(draw);
2024:   return 0;
2025: }

2027: PetscErrorCode MatView_SeqBAIJ(Mat A, PetscViewer viewer)
2028: {
2029:   PetscBool iascii, isbinary, isdraw;

2031:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
2032:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary);
2033:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw);
2034:   if (iascii) {
2035:     MatView_SeqBAIJ_ASCII(A, viewer);
2036:   } else if (isbinary) {
2037:     MatView_SeqBAIJ_Binary(A, viewer);
2038:   } else if (isdraw) {
2039:     MatView_SeqBAIJ_Draw(A, viewer);
2040:   } else {
2041:     Mat B;
2042:     MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &B);
2043:     MatView(B, viewer);
2044:     MatDestroy(&B);
2045:   }
2046:   return 0;
2047: }

2049: PetscErrorCode MatGetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
2050: {
2051:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2052:   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
2053:   PetscInt    *ai = a->i, *ailen = a->ilen;
2054:   PetscInt     brow, bcol, ridx, cidx, bs = A->rmap->bs, bs2 = a->bs2;
2055:   MatScalar   *ap, *aa = a->a;

2057:   for (k = 0; k < m; k++) { /* loop over rows */
2058:     row  = im[k];
2059:     brow = row / bs;
2060:     if (row < 0) {
2061:       v += n;
2062:       continue;
2063:     } /* negative row */
2065:     rp   = aj ? aj + ai[brow] : NULL;       /* mustn't add to NULL, that is UB */
2066:     ap   = aa ? aa + bs2 * ai[brow] : NULL; /* mustn't add to NULL, that is UB */
2067:     nrow = ailen[brow];
2068:     for (l = 0; l < n; l++) { /* loop over columns */
2069:       if (in[l] < 0) {
2070:         v++;
2071:         continue;
2072:       } /* negative column */
2074:       col  = in[l];
2075:       bcol = col / bs;
2076:       cidx = col % bs;
2077:       ridx = row % bs;
2078:       high = nrow;
2079:       low  = 0; /* assume unsorted */
2080:       while (high - low > 5) {
2081:         t = (low + high) / 2;
2082:         if (rp[t] > bcol) high = t;
2083:         else low = t;
2084:       }
2085:       for (i = low; i < high; i++) {
2086:         if (rp[i] > bcol) break;
2087:         if (rp[i] == bcol) {
2088:           *v++ = ap[bs2 * i + bs * cidx + ridx];
2089:           goto finished;
2090:         }
2091:       }
2092:       *v++ = 0.0;
2093:     finished:;
2094:     }
2095:   }
2096:   return 0;
2097: }

2099: PetscErrorCode MatSetValuesBlocked_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2100: {
2101:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2102:   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, rmax, N, lastcol = -1;
2103:   PetscInt          *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2104:   PetscInt          *aj = a->j, nonew = a->nonew, bs2 = a->bs2, bs = A->rmap->bs, stepval;
2105:   PetscBool          roworiented = a->roworiented;
2106:   const PetscScalar *value       = v;
2107:   MatScalar         *ap = NULL, *aa = a->a, *bap;

2109:   if (roworiented) {
2110:     stepval = (n - 1) * bs;
2111:   } else {
2112:     stepval = (m - 1) * bs;
2113:   }
2114:   for (k = 0; k < m; k++) { /* loop over added rows */
2115:     row = im[k];
2116:     if (row < 0) continue;
2118:     rp = aj + ai[row];
2119:     if (!A->structure_only) ap = aa + bs2 * ai[row];
2120:     rmax = imax[row];
2121:     nrow = ailen[row];
2122:     low  = 0;
2123:     high = nrow;
2124:     for (l = 0; l < n; l++) { /* loop over added columns */
2125:       if (in[l] < 0) continue;
2127:       col = in[l];
2128:       if (!A->structure_only) {
2129:         if (roworiented) {
2130:           value = v + (k * (stepval + bs) + l) * bs;
2131:         } else {
2132:           value = v + (l * (stepval + bs) + k) * bs;
2133:         }
2134:       }
2135:       if (col <= lastcol) low = 0;
2136:       else high = nrow;
2137:       lastcol = col;
2138:       while (high - low > 7) {
2139:         t = (low + high) / 2;
2140:         if (rp[t] > col) high = t;
2141:         else low = t;
2142:       }
2143:       for (i = low; i < high; i++) {
2144:         if (rp[i] > col) break;
2145:         if (rp[i] == col) {
2146:           if (A->structure_only) goto noinsert2;
2147:           bap = ap + bs2 * i;
2148:           if (roworiented) {
2149:             if (is == ADD_VALUES) {
2150:               for (ii = 0; ii < bs; ii++, value += stepval) {
2151:                 for (jj = ii; jj < bs2; jj += bs) bap[jj] += *value++;
2152:               }
2153:             } else {
2154:               for (ii = 0; ii < bs; ii++, value += stepval) {
2155:                 for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2156:               }
2157:             }
2158:           } else {
2159:             if (is == ADD_VALUES) {
2160:               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2161:                 for (jj = 0; jj < bs; jj++) bap[jj] += value[jj];
2162:                 bap += bs;
2163:               }
2164:             } else {
2165:               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2166:                 for (jj = 0; jj < bs; jj++) bap[jj] = value[jj];
2167:                 bap += bs;
2168:               }
2169:             }
2170:           }
2171:           goto noinsert2;
2172:         }
2173:       }
2174:       if (nonew == 1) goto noinsert2;
2176:       if (A->structure_only) {
2177:         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, row, col, rmax, ai, aj, rp, imax, nonew, MatScalar);
2178:       } else {
2179:         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, row, col, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2180:       }
2181:       N = nrow++ - 1;
2182:       high++;
2183:       /* shift up all the later entries in this row */
2184:       PetscArraymove(rp + i + 1, rp + i, N - i + 1);
2185:       rp[i] = col;
2186:       if (!A->structure_only) {
2187:         PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1));
2188:         bap = ap + bs2 * i;
2189:         if (roworiented) {
2190:           for (ii = 0; ii < bs; ii++, value += stepval) {
2191:             for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2192:           }
2193:         } else {
2194:           for (ii = 0; ii < bs; ii++, value += stepval) {
2195:             for (jj = 0; jj < bs; jj++) *bap++ = *value++;
2196:           }
2197:         }
2198:       }
2199:     noinsert2:;
2200:       low = i;
2201:     }
2202:     ailen[row] = nrow;
2203:   }
2204:   return 0;
2205: }

2207: PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat A, MatAssemblyType mode)
2208: {
2209:   Mat_SeqBAIJ *a      = (Mat_SeqBAIJ *)A->data;
2210:   PetscInt     fshift = 0, i, *ai = a->i, *aj = a->j, *imax = a->imax;
2211:   PetscInt     m = A->rmap->N, *ip, N, *ailen = a->ilen;
2212:   PetscInt     mbs = a->mbs, bs2 = a->bs2, rmax = 0;
2213:   MatScalar   *aa    = a->a, *ap;
2214:   PetscReal    ratio = 0.6;

2216:   if (mode == MAT_FLUSH_ASSEMBLY) return 0;

2218:   if (m) rmax = ailen[0];
2219:   for (i = 1; i < mbs; i++) {
2220:     /* move each row back by the amount of empty slots (fshift) before it*/
2221:     fshift += imax[i - 1] - ailen[i - 1];
2222:     rmax = PetscMax(rmax, ailen[i]);
2223:     if (fshift) {
2224:       ip = aj + ai[i];
2225:       ap = aa + bs2 * ai[i];
2226:       N  = ailen[i];
2227:       PetscArraymove(ip - fshift, ip, N);
2228:       if (!A->structure_only) PetscArraymove(ap - bs2 * fshift, ap, bs2 * N);
2229:     }
2230:     ai[i] = ai[i - 1] + ailen[i - 1];
2231:   }
2232:   if (mbs) {
2233:     fshift += imax[mbs - 1] - ailen[mbs - 1];
2234:     ai[mbs] = ai[mbs - 1] + ailen[mbs - 1];
2235:   }

2237:   /* reset ilen and imax for each row */
2238:   a->nonzerorowcnt = 0;
2239:   if (A->structure_only) {
2240:     PetscFree2(a->imax, a->ilen);
2241:   } else { /* !A->structure_only */
2242:     for (i = 0; i < mbs; i++) {
2243:       ailen[i] = imax[i] = ai[i + 1] - ai[i];
2244:       a->nonzerorowcnt += ((ai[i + 1] - ai[i]) > 0);
2245:     }
2246:   }
2247:   a->nz = ai[mbs];

2249:   /* diagonals may have moved, so kill the diagonal pointers */
2250:   a->idiagvalid = PETSC_FALSE;
2251:   if (fshift && a->diag) {
2252:     PetscFree(a->diag);
2253:     a->diag = NULL;
2254:   }
2256:   PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT ", block size %" PetscInt_FMT "; storage space: %" PetscInt_FMT " unneeded, %" PetscInt_FMT " used\n", m, A->cmap->n, A->rmap->bs, fshift * bs2, a->nz * bs2);
2257:   PetscInfo(A, "Number of mallocs during MatSetValues is %" PetscInt_FMT "\n", a->reallocs);
2258:   PetscInfo(A, "Most nonzeros blocks in any row is %" PetscInt_FMT "\n", rmax);

2260:   A->info.mallocs += a->reallocs;
2261:   a->reallocs         = 0;
2262:   A->info.nz_unneeded = (PetscReal)fshift * bs2;
2263:   a->rmax             = rmax;

2265:   if (!A->structure_only) MatCheckCompressedRow(A, a->nonzerorowcnt, &a->compressedrow, a->i, mbs, ratio);
2266:   return 0;
2267: }

2269: /*
2270:    This function returns an array of flags which indicate the locations of contiguous
2271:    blocks that should be zeroed. for eg: if bs = 3  and is = [0,1,2,3,5,6,7,8,9]
2272:    then the resulting sizes = [3,1,1,3,1] corresponding to sets [(0,1,2),(3),(5),(6,7,8),(9)]
2273:    Assume: sizes should be long enough to hold all the values.
2274: */
2275: static PetscErrorCode MatZeroRows_SeqBAIJ_Check_Blocks(PetscInt idx[], PetscInt n, PetscInt bs, PetscInt sizes[], PetscInt *bs_max)
2276: {
2277:   PetscInt  i, j, k, row;
2278:   PetscBool flg;

2280:   for (i = 0, j = 0; i < n; j++) {
2281:     row = idx[i];
2282:     if (row % bs != 0) { /* Not the beginning of a block */
2283:       sizes[j] = 1;
2284:       i++;
2285:     } else if (i + bs > n) { /* complete block doesn't exist (at idx end) */
2286:       sizes[j] = 1;          /* Also makes sure at least 'bs' values exist for next else */
2287:       i++;
2288:     } else { /* Beginning of the block, so check if the complete block exists */
2289:       flg = PETSC_TRUE;
2290:       for (k = 1; k < bs; k++) {
2291:         if (row + k != idx[i + k]) { /* break in the block */
2292:           flg = PETSC_FALSE;
2293:           break;
2294:         }
2295:       }
2296:       if (flg) { /* No break in the bs */
2297:         sizes[j] = bs;
2298:         i += bs;
2299:       } else {
2300:         sizes[j] = 1;
2301:         i++;
2302:       }
2303:     }
2304:   }
2305:   *bs_max = j;
2306:   return 0;
2307: }

2309: PetscErrorCode MatZeroRows_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2310: {
2311:   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2312:   PetscInt           i, j, k, count, *rows;
2313:   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, *sizes, row, bs_max;
2314:   PetscScalar        zero = 0.0;
2315:   MatScalar         *aa;
2316:   const PetscScalar *xx;
2317:   PetscScalar       *bb;

2319:   /* fix right hand side if needed */
2320:   if (x && b) {
2321:     VecGetArrayRead(x, &xx);
2322:     VecGetArray(b, &bb);
2323:     for (i = 0; i < is_n; i++) bb[is_idx[i]] = diag * xx[is_idx[i]];
2324:     VecRestoreArrayRead(x, &xx);
2325:     VecRestoreArray(b, &bb);
2326:   }

2328:   /* Make a copy of the IS and  sort it */
2329:   /* allocate memory for rows,sizes */
2330:   PetscMalloc2(is_n, &rows, 2 * is_n, &sizes);

2332:   /* copy IS values to rows, and sort them */
2333:   for (i = 0; i < is_n; i++) rows[i] = is_idx[i];
2334:   PetscSortInt(is_n, rows);

2336:   if (baij->keepnonzeropattern) {
2337:     for (i = 0; i < is_n; i++) sizes[i] = 1;
2338:     bs_max = is_n;
2339:   } else {
2340:     MatZeroRows_SeqBAIJ_Check_Blocks(rows, is_n, bs, sizes, &bs_max);
2341:     A->nonzerostate++;
2342:   }

2344:   for (i = 0, j = 0; i < bs_max; j += sizes[i], i++) {
2345:     row = rows[j];
2347:     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2348:     aa    = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs);
2349:     if (sizes[i] == bs && !baij->keepnonzeropattern) {
2350:       if (diag != (PetscScalar)0.0) {
2351:         if (baij->ilen[row / bs] > 0) {
2352:           baij->ilen[row / bs]       = 1;
2353:           baij->j[baij->i[row / bs]] = row / bs;

2355:           PetscArrayzero(aa, count * bs);
2356:         }
2357:         /* Now insert all the diagonal values for this bs */
2358:         for (k = 0; k < bs; k++) (*A->ops->setvalues)(A, 1, rows + j + k, 1, rows + j + k, &diag, INSERT_VALUES);
2359:       } else { /* (diag == 0.0) */
2360:         baij->ilen[row / bs] = 0;
2361:       }      /* end (diag == 0.0) */
2362:     } else { /* (sizes[i] != bs) */
2363:       PetscAssert(sizes[i] == 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal Error. Value should be 1");
2364:       for (k = 0; k < count; k++) {
2365:         aa[0] = zero;
2366:         aa += bs;
2367:       }
2368:       if (diag != (PetscScalar)0.0) (*A->ops->setvalues)(A, 1, rows + j, 1, rows + j, &diag, INSERT_VALUES);
2369:     }
2370:   }

2372:   PetscFree2(rows, sizes);
2373:   MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY);
2374:   return 0;
2375: }

2377: PetscErrorCode MatZeroRowsColumns_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2378: {
2379:   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2380:   PetscInt           i, j, k, count;
2381:   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, row, col;
2382:   PetscScalar        zero = 0.0;
2383:   MatScalar         *aa;
2384:   const PetscScalar *xx;
2385:   PetscScalar       *bb;
2386:   PetscBool         *zeroed, vecs = PETSC_FALSE;

2388:   /* fix right hand side if needed */
2389:   if (x && b) {
2390:     VecGetArrayRead(x, &xx);
2391:     VecGetArray(b, &bb);
2392:     vecs = PETSC_TRUE;
2393:   }

2395:   /* zero the columns */
2396:   PetscCalloc1(A->rmap->n, &zeroed);
2397:   for (i = 0; i < is_n; i++) {
2399:     zeroed[is_idx[i]] = PETSC_TRUE;
2400:   }
2401:   for (i = 0; i < A->rmap->N; i++) {
2402:     if (!zeroed[i]) {
2403:       row = i / bs;
2404:       for (j = baij->i[row]; j < baij->i[row + 1]; j++) {
2405:         for (k = 0; k < bs; k++) {
2406:           col = bs * baij->j[j] + k;
2407:           if (zeroed[col]) {
2408:             aa = ((MatScalar *)(baij->a)) + j * bs2 + (i % bs) + bs * k;
2409:             if (vecs) bb[i] -= aa[0] * xx[col];
2410:             aa[0] = 0.0;
2411:           }
2412:         }
2413:       }
2414:     } else if (vecs) bb[i] = diag * xx[i];
2415:   }
2416:   PetscFree(zeroed);
2417:   if (vecs) {
2418:     VecRestoreArrayRead(x, &xx);
2419:     VecRestoreArray(b, &bb);
2420:   }

2422:   /* zero the rows */
2423:   for (i = 0; i < is_n; i++) {
2424:     row   = is_idx[i];
2425:     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2426:     aa    = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs);
2427:     for (k = 0; k < count; k++) {
2428:       aa[0] = zero;
2429:       aa += bs;
2430:     }
2431:     if (diag != (PetscScalar)0.0) PetscUseTypeMethod(A, setvalues, 1, &row, 1, &row, &diag, INSERT_VALUES);
2432:   }
2433:   MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY);
2434:   return 0;
2435: }

2437: PetscErrorCode MatSetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2438: {
2439:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2440:   PetscInt    *rp, k, low, high, t, ii, row, nrow, i, col, l, rmax, N, lastcol = -1;
2441:   PetscInt    *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2442:   PetscInt    *aj = a->j, nonew = a->nonew, bs = A->rmap->bs, brow, bcol;
2443:   PetscInt     ridx, cidx, bs2                 = a->bs2;
2444:   PetscBool    roworiented = a->roworiented;
2445:   MatScalar   *ap = NULL, value = 0.0, *aa = a->a, *bap;

2447:   for (k = 0; k < m; k++) { /* loop over added rows */
2448:     row  = im[k];
2449:     brow = row / bs;
2450:     if (row < 0) continue;
2452:     rp = aj + ai[brow];
2453:     if (!A->structure_only) ap = aa + bs2 * ai[brow];
2454:     rmax = imax[brow];
2455:     nrow = ailen[brow];
2456:     low  = 0;
2457:     high = nrow;
2458:     for (l = 0; l < n; l++) { /* loop over added columns */
2459:       if (in[l] < 0) continue;
2461:       col  = in[l];
2462:       bcol = col / bs;
2463:       ridx = row % bs;
2464:       cidx = col % bs;
2465:       if (!A->structure_only) {
2466:         if (roworiented) {
2467:           value = v[l + k * n];
2468:         } else {
2469:           value = v[k + l * m];
2470:         }
2471:       }
2472:       if (col <= lastcol) low = 0;
2473:       else high = nrow;
2474:       lastcol = col;
2475:       while (high - low > 7) {
2476:         t = (low + high) / 2;
2477:         if (rp[t] > bcol) high = t;
2478:         else low = t;
2479:       }
2480:       for (i = low; i < high; i++) {
2481:         if (rp[i] > bcol) break;
2482:         if (rp[i] == bcol) {
2483:           bap = ap + bs2 * i + bs * cidx + ridx;
2484:           if (!A->structure_only) {
2485:             if (is == ADD_VALUES) *bap += value;
2486:             else *bap = value;
2487:           }
2488:           goto noinsert1;
2489:         }
2490:       }
2491:       if (nonew == 1) goto noinsert1;
2493:       if (A->structure_only) {
2494:         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, brow, bcol, rmax, ai, aj, rp, imax, nonew, MatScalar);
2495:       } else {
2496:         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, brow, bcol, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2497:       }
2498:       N = nrow++ - 1;
2499:       high++;
2500:       /* shift up all the later entries in this row */
2501:       PetscArraymove(rp + i + 1, rp + i, N - i + 1);
2502:       rp[i] = bcol;
2503:       if (!A->structure_only) {
2504:         PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1));
2505:         PetscArrayzero(ap + bs2 * i, bs2);
2506:         ap[bs2 * i + bs * cidx + ridx] = value;
2507:       }
2508:       a->nz++;
2509:       A->nonzerostate++;
2510:     noinsert1:;
2511:       low = i;
2512:     }
2513:     ailen[brow] = nrow;
2514:   }
2515:   return 0;
2516: }

2518: PetscErrorCode MatILUFactor_SeqBAIJ(Mat inA, IS row, IS col, const MatFactorInfo *info)
2519: {
2520:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2521:   Mat          outA;
2522:   PetscBool    row_identity, col_identity;

2525:   ISIdentity(row, &row_identity);
2526:   ISIdentity(col, &col_identity);

2529:   outA            = inA;
2530:   inA->factortype = MAT_FACTOR_LU;
2531:   PetscFree(inA->solvertype);
2532:   PetscStrallocpy(MATSOLVERPETSC, &inA->solvertype);

2534:   MatMarkDiagonal_SeqBAIJ(inA);

2536:   PetscObjectReference((PetscObject)row);
2537:   ISDestroy(&a->row);
2538:   a->row = row;
2539:   PetscObjectReference((PetscObject)col);
2540:   ISDestroy(&a->col);
2541:   a->col = col;

2543:   /* Create the invert permutation so that it can be used in MatLUFactorNumeric() */
2544:   ISDestroy(&a->icol);
2545:   ISInvertPermutation(col, PETSC_DECIDE, &a->icol);

2547:   MatSeqBAIJSetNumericFactorization_inplace(inA, (PetscBool)(row_identity && col_identity));
2548:   if (!a->solve_work) { PetscMalloc1(inA->rmap->N + inA->rmap->bs, &a->solve_work); }
2549:   MatLUFactorNumeric(outA, inA, info);
2550:   return 0;
2551: }

2553: PetscErrorCode MatSeqBAIJSetColumnIndices_SeqBAIJ(Mat mat, PetscInt *indices)
2554: {
2555:   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)mat->data;
2556:   PetscInt     i, nz, mbs;

2558:   nz  = baij->maxnz;
2559:   mbs = baij->mbs;
2560:   for (i = 0; i < nz; i++) baij->j[i] = indices[i];
2561:   baij->nz = nz;
2562:   for (i = 0; i < mbs; i++) baij->ilen[i] = baij->imax[i];
2563:   return 0;
2564: }

2566: /*@
2567:     MatSeqBAIJSetColumnIndices - Set the column indices for all the rows in the matrix.

2569:   Input Parameters:
2570: +  mat - the `MATSEQBAIJ` matrix
2571: -  indices - the column indices

2573:   Level: advanced

2575:   Notes:
2576:     This can be called if you have precomputed the nonzero structure of the
2577:   matrix and want to provide it to the matrix object to improve the performance
2578:   of the `MatSetValues()` operation.

2580:     You MUST have set the correct numbers of nonzeros per row in the call to
2581:   `MatCreateSeqBAIJ()`, and the columns indices MUST be sorted.

2583:     MUST be called before any calls to `MatSetValues()`

2585: .seealso: `MATSEQBAIJ`, `MatSetValues()`
2586: @*/
2587: PetscErrorCode MatSeqBAIJSetColumnIndices(Mat mat, PetscInt *indices)
2588: {
2591:   PetscUseMethod(mat, "MatSeqBAIJSetColumnIndices_C", (Mat, PetscInt *), (mat, indices));
2592:   return 0;
2593: }

2595: PetscErrorCode MatGetRowMaxAbs_SeqBAIJ(Mat A, Vec v, PetscInt idx[])
2596: {
2597:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2598:   PetscInt     i, j, n, row, bs, *ai, *aj, mbs;
2599:   PetscReal    atmp;
2600:   PetscScalar *x, zero = 0.0;
2601:   MatScalar   *aa;
2602:   PetscInt     ncols, brow, krow, kcol;

2604:   /* why is this not a macro???????????????????????????????????????????????????????????????? */
2606:   bs  = A->rmap->bs;
2607:   aa  = a->a;
2608:   ai  = a->i;
2609:   aj  = a->j;
2610:   mbs = a->mbs;

2612:   VecSet(v, zero);
2613:   VecGetArray(v, &x);
2614:   VecGetLocalSize(v, &n);
2616:   for (i = 0; i < mbs; i++) {
2617:     ncols = ai[1] - ai[0];
2618:     ai++;
2619:     brow = bs * i;
2620:     for (j = 0; j < ncols; j++) {
2621:       for (kcol = 0; kcol < bs; kcol++) {
2622:         for (krow = 0; krow < bs; krow++) {
2623:           atmp = PetscAbsScalar(*aa);
2624:           aa++;
2625:           row = brow + krow; /* row index */
2626:           if (PetscAbsScalar(x[row]) < atmp) {
2627:             x[row] = atmp;
2628:             if (idx) idx[row] = bs * (*aj) + kcol;
2629:           }
2630:         }
2631:       }
2632:       aj++;
2633:     }
2634:   }
2635:   VecRestoreArray(v, &x);
2636:   return 0;
2637: }

2639: PetscErrorCode MatCopy_SeqBAIJ(Mat A, Mat B, MatStructure str)
2640: {
2641:   /* If the two matrices have the same copy implementation, use fast copy. */
2642:   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
2643:     Mat_SeqBAIJ *a    = (Mat_SeqBAIJ *)A->data;
2644:     Mat_SeqBAIJ *b    = (Mat_SeqBAIJ *)B->data;
2645:     PetscInt     ambs = a->mbs, bmbs = b->mbs, abs = A->rmap->bs, bbs = B->rmap->bs, bs2 = abs * abs;

2649:     PetscArraycpy(b->a, a->a, bs2 * a->i[ambs]);
2650:     PetscObjectStateIncrease((PetscObject)B);
2651:   } else {
2652:     MatCopy_Basic(A, B, str);
2653:   }
2654:   return 0;
2655: }

2657: PetscErrorCode MatSetUp_SeqBAIJ(Mat A)
2658: {
2659:   MatSeqBAIJSetPreallocation(A, A->rmap->bs, PETSC_DEFAULT, NULL);
2660:   return 0;
2661: }

2663: static PetscErrorCode MatSeqBAIJGetArray_SeqBAIJ(Mat A, PetscScalar *array[])
2664: {
2665:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

2667:   *array = a->a;
2668:   return 0;
2669: }

2671: static PetscErrorCode MatSeqBAIJRestoreArray_SeqBAIJ(Mat A, PetscScalar *array[])
2672: {
2673:   *array = NULL;
2674:   return 0;
2675: }

2677: PetscErrorCode MatAXPYGetPreallocation_SeqBAIJ(Mat Y, Mat X, PetscInt *nnz)
2678: {
2679:   PetscInt     bs = Y->rmap->bs, mbs = Y->rmap->N / bs;
2680:   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data;
2681:   Mat_SeqBAIJ *y = (Mat_SeqBAIJ *)Y->data;

2683:   /* Set the number of nonzeros in the new matrix */
2684:   MatAXPYGetPreallocation_SeqX_private(mbs, x->i, x->j, y->i, y->j, nnz);
2685:   return 0;
2686: }

2688: PetscErrorCode MatAXPY_SeqBAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2689: {
2690:   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data, *y = (Mat_SeqBAIJ *)Y->data;
2691:   PetscInt     bs = Y->rmap->bs, bs2 = bs * bs;
2692:   PetscBLASInt one = 1;

2694:   if (str == UNKNOWN_NONZERO_PATTERN || (PetscDefined(USE_DEBUG) && str == SAME_NONZERO_PATTERN)) {
2695:     PetscBool e = x->nz == y->nz && x->mbs == y->mbs && bs == X->rmap->bs ? PETSC_TRUE : PETSC_FALSE;
2696:     if (e) {
2697:       PetscArraycmp(x->i, y->i, x->mbs + 1, &e);
2698:       if (e) {
2699:         PetscArraycmp(x->j, y->j, x->i[x->mbs], &e);
2700:         if (e) str = SAME_NONZERO_PATTERN;
2701:       }
2702:     }
2704:   }
2705:   if (str == SAME_NONZERO_PATTERN) {
2706:     PetscScalar  alpha = a;
2707:     PetscBLASInt bnz;
2708:     PetscBLASIntCast(x->nz * bs2, &bnz);
2709:     PetscCallBLAS("BLASaxpy", BLASaxpy_(&bnz, &alpha, x->a, &one, y->a, &one));
2710:     PetscObjectStateIncrease((PetscObject)Y);
2711:   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2712:     MatAXPY_Basic(Y, a, X, str);
2713:   } else {
2714:     Mat       B;
2715:     PetscInt *nnz;
2717:     PetscMalloc1(Y->rmap->N, &nnz);
2718:     MatCreate(PetscObjectComm((PetscObject)Y), &B);
2719:     PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name);
2720:     MatSetSizes(B, Y->rmap->n, Y->cmap->n, Y->rmap->N, Y->cmap->N);
2721:     MatSetBlockSizesFromMats(B, Y, Y);
2722:     MatSetType(B, (MatType)((PetscObject)Y)->type_name);
2723:     MatAXPYGetPreallocation_SeqBAIJ(Y, X, nnz);
2724:     MatSeqBAIJSetPreallocation(B, bs, 0, nnz);
2725:     MatAXPY_BasicWithPreallocation(B, Y, a, X, str);
2726:     MatHeaderMerge(Y, &B);
2727:     PetscFree(nnz);
2728:   }
2729:   return 0;
2730: }

2732: PETSC_INTERN PetscErrorCode MatConjugate_SeqBAIJ(Mat A)
2733: {
2734: #if defined(PETSC_USE_COMPLEX)
2735:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2736:   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2737:   MatScalar   *aa = a->a;

2739:   for (i = 0; i < nz; i++) aa[i] = PetscConj(aa[i]);
2740: #else
2741: #endif
2742:   return 0;
2743: }

2745: PetscErrorCode MatRealPart_SeqBAIJ(Mat A)
2746: {
2747:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2748:   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2749:   MatScalar   *aa = a->a;

2751:   for (i = 0; i < nz; i++) aa[i] = PetscRealPart(aa[i]);
2752:   return 0;
2753: }

2755: PetscErrorCode MatImaginaryPart_SeqBAIJ(Mat A)
2756: {
2757:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2758:   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2759:   MatScalar   *aa = a->a;

2761:   for (i = 0; i < nz; i++) aa[i] = PetscImaginaryPart(aa[i]);
2762:   return 0;
2763: }

2765: /*
2766:     Code almost identical to MatGetColumnIJ_SeqAIJ() should share common code
2767: */
2768: PetscErrorCode MatGetColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2769: {
2770:   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2771:   PetscInt     bs = A->rmap->bs, i, *collengths, *cia, *cja, n = A->cmap->n / bs, m = A->rmap->n / bs;
2772:   PetscInt     nz = a->i[m], row, *jj, mr, col;

2774:   *nn = n;
2775:   if (!ia) return 0;
2777:   PetscCalloc1(n, &collengths);
2778:   PetscMalloc1(n + 1, &cia);
2779:   PetscMalloc1(nz, &cja);
2780:   jj = a->j;
2781:   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2782:   cia[0] = oshift;
2783:   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2784:   PetscArrayzero(collengths, n);
2785:   jj = a->j;
2786:   for (row = 0; row < m; row++) {
2787:     mr = a->i[row + 1] - a->i[row];
2788:     for (i = 0; i < mr; i++) {
2789:       col = *jj++;

2791:       cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
2792:     }
2793:   }
2794:   PetscFree(collengths);
2795:   *ia = cia;
2796:   *ja = cja;
2797:   return 0;
2798: }

2800: PetscErrorCode MatRestoreColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2801: {
2802:   if (!ia) return 0;
2803:   PetscFree(*ia);
2804:   PetscFree(*ja);
2805:   return 0;
2806: }

2808: /*
2809:  MatGetColumnIJ_SeqBAIJ_Color() and MatRestoreColumnIJ_SeqBAIJ_Color() are customized from
2810:  MatGetColumnIJ_SeqBAIJ() and MatRestoreColumnIJ_SeqBAIJ() by adding an output
2811:  spidx[], index of a->a, to be used in MatTransposeColoringCreate() and MatFDColoringCreate()
2812:  */
2813: PetscErrorCode MatGetColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2814: {
2815:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2816:   PetscInt     i, *collengths, *cia, *cja, n = a->nbs, m = a->mbs;
2817:   PetscInt     nz = a->i[m], row, *jj, mr, col;
2818:   PetscInt    *cspidx;

2820:   *nn = n;
2821:   if (!ia) return 0;

2823:   PetscCalloc1(n, &collengths);
2824:   PetscMalloc1(n + 1, &cia);
2825:   PetscMalloc1(nz, &cja);
2826:   PetscMalloc1(nz, &cspidx);
2827:   jj = a->j;
2828:   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2829:   cia[0] = oshift;
2830:   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2831:   PetscArrayzero(collengths, n);
2832:   jj = a->j;
2833:   for (row = 0; row < m; row++) {
2834:     mr = a->i[row + 1] - a->i[row];
2835:     for (i = 0; i < mr; i++) {
2836:       col                                         = *jj++;
2837:       cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
2838:       cja[cia[col] + collengths[col]++ - oshift]  = row + oshift;
2839:     }
2840:   }
2841:   PetscFree(collengths);
2842:   *ia    = cia;
2843:   *ja    = cja;
2844:   *spidx = cspidx;
2845:   return 0;
2846: }

2848: PetscErrorCode MatRestoreColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2849: {
2850:   MatRestoreColumnIJ_SeqBAIJ(A, oshift, symmetric, inodecompressed, n, ia, ja, done);
2851:   PetscFree(*spidx);
2852:   return 0;
2853: }

2855: PetscErrorCode MatShift_SeqBAIJ(Mat Y, PetscScalar a)
2856: {
2857:   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)Y->data;

2859:   if (!Y->preallocated || !aij->nz) MatSeqBAIJSetPreallocation(Y, Y->rmap->bs, 1, NULL);
2860:   MatShift_Basic(Y, a);
2861:   return 0;
2862: }

2864: /* -------------------------------------------------------------------*/
2865: static struct _MatOps MatOps_Values = {
2866:   MatSetValues_SeqBAIJ,
2867:   MatGetRow_SeqBAIJ,
2868:   MatRestoreRow_SeqBAIJ,
2869:   MatMult_SeqBAIJ_N,
2870:   /* 4*/ MatMultAdd_SeqBAIJ_N,
2871:   MatMultTranspose_SeqBAIJ,
2872:   MatMultTransposeAdd_SeqBAIJ,
2873:   NULL,
2874:   NULL,
2875:   NULL,
2876:   /* 10*/ NULL,
2877:   MatLUFactor_SeqBAIJ,
2878:   NULL,
2879:   NULL,
2880:   MatTranspose_SeqBAIJ,
2881:   /* 15*/ MatGetInfo_SeqBAIJ,
2882:   MatEqual_SeqBAIJ,
2883:   MatGetDiagonal_SeqBAIJ,
2884:   MatDiagonalScale_SeqBAIJ,
2885:   MatNorm_SeqBAIJ,
2886:   /* 20*/ NULL,
2887:   MatAssemblyEnd_SeqBAIJ,
2888:   MatSetOption_SeqBAIJ,
2889:   MatZeroEntries_SeqBAIJ,
2890:   /* 24*/ MatZeroRows_SeqBAIJ,
2891:   NULL,
2892:   NULL,
2893:   NULL,
2894:   NULL,
2895:   /* 29*/ MatSetUp_SeqBAIJ,
2896:   NULL,
2897:   NULL,
2898:   NULL,
2899:   NULL,
2900:   /* 34*/ MatDuplicate_SeqBAIJ,
2901:   NULL,
2902:   NULL,
2903:   MatILUFactor_SeqBAIJ,
2904:   NULL,
2905:   /* 39*/ MatAXPY_SeqBAIJ,
2906:   MatCreateSubMatrices_SeqBAIJ,
2907:   MatIncreaseOverlap_SeqBAIJ,
2908:   MatGetValues_SeqBAIJ,
2909:   MatCopy_SeqBAIJ,
2910:   /* 44*/ NULL,
2911:   MatScale_SeqBAIJ,
2912:   MatShift_SeqBAIJ,
2913:   NULL,
2914:   MatZeroRowsColumns_SeqBAIJ,
2915:   /* 49*/ NULL,
2916:   MatGetRowIJ_SeqBAIJ,
2917:   MatRestoreRowIJ_SeqBAIJ,
2918:   MatGetColumnIJ_SeqBAIJ,
2919:   MatRestoreColumnIJ_SeqBAIJ,
2920:   /* 54*/ MatFDColoringCreate_SeqXAIJ,
2921:   NULL,
2922:   NULL,
2923:   NULL,
2924:   MatSetValuesBlocked_SeqBAIJ,
2925:   /* 59*/ MatCreateSubMatrix_SeqBAIJ,
2926:   MatDestroy_SeqBAIJ,
2927:   MatView_SeqBAIJ,
2928:   NULL,
2929:   NULL,
2930:   /* 64*/ NULL,
2931:   NULL,
2932:   NULL,
2933:   NULL,
2934:   NULL,
2935:   /* 69*/ MatGetRowMaxAbs_SeqBAIJ,
2936:   NULL,
2937:   MatConvert_Basic,
2938:   NULL,
2939:   NULL,
2940:   /* 74*/ NULL,
2941:   MatFDColoringApply_BAIJ,
2942:   NULL,
2943:   NULL,
2944:   NULL,
2945:   /* 79*/ NULL,
2946:   NULL,
2947:   NULL,
2948:   NULL,
2949:   MatLoad_SeqBAIJ,
2950:   /* 84*/ NULL,
2951:   NULL,
2952:   NULL,
2953:   NULL,
2954:   NULL,
2955:   /* 89*/ NULL,
2956:   NULL,
2957:   NULL,
2958:   NULL,
2959:   NULL,
2960:   /* 94*/ NULL,
2961:   NULL,
2962:   NULL,
2963:   NULL,
2964:   NULL,
2965:   /* 99*/ NULL,
2966:   NULL,
2967:   NULL,
2968:   MatConjugate_SeqBAIJ,
2969:   NULL,
2970:   /*104*/ NULL,
2971:   MatRealPart_SeqBAIJ,
2972:   MatImaginaryPart_SeqBAIJ,
2973:   NULL,
2974:   NULL,
2975:   /*109*/ NULL,
2976:   NULL,
2977:   NULL,
2978:   NULL,
2979:   MatMissingDiagonal_SeqBAIJ,
2980:   /*114*/ NULL,
2981:   NULL,
2982:   NULL,
2983:   NULL,
2984:   NULL,
2985:   /*119*/ NULL,
2986:   NULL,
2987:   MatMultHermitianTranspose_SeqBAIJ,
2988:   MatMultHermitianTransposeAdd_SeqBAIJ,
2989:   NULL,
2990:   /*124*/ NULL,
2991:   MatGetColumnReductions_SeqBAIJ,
2992:   MatInvertBlockDiagonal_SeqBAIJ,
2993:   NULL,
2994:   NULL,
2995:   /*129*/ NULL,
2996:   NULL,
2997:   NULL,
2998:   NULL,
2999:   NULL,
3000:   /*134*/ NULL,
3001:   NULL,
3002:   NULL,
3003:   NULL,
3004:   NULL,
3005:   /*139*/ MatSetBlockSizes_Default,
3006:   NULL,
3007:   NULL,
3008:   MatFDColoringSetUp_SeqXAIJ,
3009:   NULL,
3010:   /*144*/ MatCreateMPIMatConcatenateSeqMat_SeqBAIJ,
3011:   MatDestroySubMatrices_SeqBAIJ,
3012:   NULL,
3013:   NULL,
3014:   NULL,
3015:   NULL,
3016:   /*150*/ NULL,
3017: };

3019: PetscErrorCode MatStoreValues_SeqBAIJ(Mat mat)
3020: {
3021:   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3022:   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;


3026:   /* allocate space for values if not already there */
3027:   if (!aij->saved_values) { PetscMalloc1(nz + 1, &aij->saved_values); }

3029:   /* copy values over */
3030:   PetscArraycpy(aij->saved_values, aij->a, nz);
3031:   return 0;
3032: }

3034: PetscErrorCode MatRetrieveValues_SeqBAIJ(Mat mat)
3035: {
3036:   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3037:   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;


3042:   /* copy values over */
3043:   PetscArraycpy(aij->a, aij->saved_values, nz);
3044:   return 0;
3045: }

3047: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqAIJ(Mat, MatType, MatReuse, Mat *);
3048: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqSBAIJ(Mat, MatType, MatReuse, Mat *);

3050: PetscErrorCode MatSeqBAIJSetPreallocation_SeqBAIJ(Mat B, PetscInt bs, PetscInt nz, PetscInt *nnz)
3051: {
3052:   Mat_SeqBAIJ *b;
3053:   PetscInt     i, mbs, nbs, bs2;
3054:   PetscBool    flg = PETSC_FALSE, skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;

3056:   if (nz >= 0 || nnz) realalloc = PETSC_TRUE;
3057:   if (nz == MAT_SKIP_ALLOCATION) {
3058:     skipallocation = PETSC_TRUE;
3059:     nz             = 0;
3060:   }

3062:   MatSetBlockSize(B, PetscAbs(bs));
3063:   PetscLayoutSetUp(B->rmap);
3064:   PetscLayoutSetUp(B->cmap);
3065:   PetscLayoutGetBlockSize(B->rmap, &bs);

3067:   B->preallocated = PETSC_TRUE;

3069:   mbs = B->rmap->n / bs;
3070:   nbs = B->cmap->n / bs;
3071:   bs2 = bs * bs;


3075:   if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3077:   if (nnz) {
3078:     for (i = 0; i < mbs; i++) {
3081:     }
3082:   }

3084:   b = (Mat_SeqBAIJ *)B->data;
3085:   PetscOptionsBegin(PetscObjectComm((PetscObject)B), NULL, "Optimize options for SEQBAIJ matrix 2 ", "Mat");
3086:   PetscOptionsBool("-mat_no_unroll", "Do not optimize for block size (slow)", NULL, flg, &flg, NULL);
3087:   PetscOptionsEnd();

3089:   if (!flg) {
3090:     switch (bs) {
3091:     case 1:
3092:       B->ops->mult    = MatMult_SeqBAIJ_1;
3093:       B->ops->multadd = MatMultAdd_SeqBAIJ_1;
3094:       break;
3095:     case 2:
3096:       B->ops->mult    = MatMult_SeqBAIJ_2;
3097:       B->ops->multadd = MatMultAdd_SeqBAIJ_2;
3098:       break;
3099:     case 3:
3100:       B->ops->mult    = MatMult_SeqBAIJ_3;
3101:       B->ops->multadd = MatMultAdd_SeqBAIJ_3;
3102:       break;
3103:     case 4:
3104:       B->ops->mult    = MatMult_SeqBAIJ_4;
3105:       B->ops->multadd = MatMultAdd_SeqBAIJ_4;
3106:       break;
3107:     case 5:
3108:       B->ops->mult    = MatMult_SeqBAIJ_5;
3109:       B->ops->multadd = MatMultAdd_SeqBAIJ_5;
3110:       break;
3111:     case 6:
3112:       B->ops->mult    = MatMult_SeqBAIJ_6;
3113:       B->ops->multadd = MatMultAdd_SeqBAIJ_6;
3114:       break;
3115:     case 7:
3116:       B->ops->mult    = MatMult_SeqBAIJ_7;
3117:       B->ops->multadd = MatMultAdd_SeqBAIJ_7;
3118:       break;
3119:     case 9: {
3120:       PetscInt version = 1;
3121:       PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL);
3122:       switch (version) {
3123: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3124:       case 1:
3125:         B->ops->mult    = MatMult_SeqBAIJ_9_AVX2;
3126:         B->ops->multadd = MatMultAdd_SeqBAIJ_9_AVX2;
3127:         PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3128:         break;
3129: #endif
3130:       default:
3131:         B->ops->mult    = MatMult_SeqBAIJ_N;
3132:         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3133:         PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3134:         break;
3135:       }
3136:       break;
3137:     }
3138:     case 11:
3139:       B->ops->mult    = MatMult_SeqBAIJ_11;
3140:       B->ops->multadd = MatMultAdd_SeqBAIJ_11;
3141:       break;
3142:     case 12: {
3143:       PetscInt version = 1;
3144:       PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL);
3145:       switch (version) {
3146:       case 1:
3147:         B->ops->mult    = MatMult_SeqBAIJ_12_ver1;
3148:         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3149:         PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3150:         break;
3151:       case 2:
3152:         B->ops->mult    = MatMult_SeqBAIJ_12_ver2;
3153:         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver2;
3154:         PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3155:         break;
3156: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3157:       case 3:
3158:         B->ops->mult    = MatMult_SeqBAIJ_12_AVX2;
3159:         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3160:         PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3161:         break;
3162: #endif
3163:       default:
3164:         B->ops->mult    = MatMult_SeqBAIJ_N;
3165:         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3166:         PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3167:         break;
3168:       }
3169:       break;
3170:     }
3171:     case 15: {
3172:       PetscInt version = 1;
3173:       PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL);
3174:       switch (version) {
3175:       case 1:
3176:         B->ops->mult = MatMult_SeqBAIJ_15_ver1;
3177:         PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3178:         break;
3179:       case 2:
3180:         B->ops->mult = MatMult_SeqBAIJ_15_ver2;
3181:         PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3182:         break;
3183:       case 3:
3184:         B->ops->mult = MatMult_SeqBAIJ_15_ver3;
3185:         PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3186:         break;
3187:       case 4:
3188:         B->ops->mult = MatMult_SeqBAIJ_15_ver4;
3189:         PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3190:         break;
3191:       default:
3192:         B->ops->mult = MatMult_SeqBAIJ_N;
3193:         PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3194:         break;
3195:       }
3196:       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3197:       break;
3198:     }
3199:     default:
3200:       B->ops->mult    = MatMult_SeqBAIJ_N;
3201:       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3202:       PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3203:       break;
3204:     }
3205:   }
3206:   B->ops->sor = MatSOR_SeqBAIJ;
3207:   b->mbs      = mbs;
3208:   b->nbs      = nbs;
3209:   if (!skipallocation) {
3210:     if (!b->imax) {
3211:       PetscMalloc2(mbs, &b->imax, mbs, &b->ilen);

3213:       b->free_imax_ilen = PETSC_TRUE;
3214:     }
3215:     /* b->ilen will count nonzeros in each block row so far. */
3216:     for (i = 0; i < mbs; i++) b->ilen[i] = 0;
3217:     if (!nnz) {
3218:       if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3219:       else if (nz < 0) nz = 1;
3220:       nz = PetscMin(nz, nbs);
3221:       for (i = 0; i < mbs; i++) b->imax[i] = nz;
3222:       PetscIntMultError(nz, mbs, &nz);
3223:     } else {
3224:       PetscInt64 nz64 = 0;
3225:       for (i = 0; i < mbs; i++) {
3226:         b->imax[i] = nnz[i];
3227:         nz64 += nnz[i];
3228:       }
3229:       PetscIntCast(nz64, &nz);
3230:     }

3232:     /* allocate the matrix space */
3233:     MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i);
3234:     if (B->structure_only) {
3235:       PetscMalloc1(nz, &b->j);
3236:       PetscMalloc1(B->rmap->N + 1, &b->i);
3237:     } else {
3238:       PetscInt nzbs2 = 0;
3239:       PetscIntMultError(nz, bs2, &nzbs2);
3240:       PetscMalloc3(nzbs2, &b->a, nz, &b->j, B->rmap->N + 1, &b->i);
3241:       PetscArrayzero(b->a, nz * bs2);
3242:     }
3243:     PetscArrayzero(b->j, nz);

3245:     if (B->structure_only) {
3246:       b->singlemalloc = PETSC_FALSE;
3247:       b->free_a       = PETSC_FALSE;
3248:     } else {
3249:       b->singlemalloc = PETSC_TRUE;
3250:       b->free_a       = PETSC_TRUE;
3251:     }
3252:     b->free_ij = PETSC_TRUE;

3254:     b->i[0] = 0;
3255:     for (i = 1; i < mbs + 1; i++) b->i[i] = b->i[i - 1] + b->imax[i - 1];

3257:   } else {
3258:     b->free_a  = PETSC_FALSE;
3259:     b->free_ij = PETSC_FALSE;
3260:   }

3262:   b->bs2              = bs2;
3263:   b->mbs              = mbs;
3264:   b->nz               = 0;
3265:   b->maxnz            = nz;
3266:   B->info.nz_unneeded = (PetscReal)b->maxnz * bs2;
3267:   B->was_assembled    = PETSC_FALSE;
3268:   B->assembled        = PETSC_FALSE;
3269:   if (realalloc) MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE);
3270:   return 0;
3271: }

3273: PetscErrorCode MatSeqBAIJSetPreallocationCSR_SeqBAIJ(Mat B, PetscInt bs, const PetscInt ii[], const PetscInt jj[], const PetscScalar V[])
3274: {
3275:   PetscInt     i, m, nz, nz_max = 0, *nnz;
3276:   PetscScalar *values      = NULL;
3277:   PetscBool    roworiented = ((Mat_SeqBAIJ *)B->data)->roworiented;

3280:   PetscLayoutSetBlockSize(B->rmap, bs);
3281:   PetscLayoutSetBlockSize(B->cmap, bs);
3282:   PetscLayoutSetUp(B->rmap);
3283:   PetscLayoutSetUp(B->cmap);
3284:   PetscLayoutGetBlockSize(B->rmap, &bs);
3285:   m = B->rmap->n / bs;

3288:   PetscMalloc1(m + 1, &nnz);
3289:   for (i = 0; i < m; i++) {
3290:     nz = ii[i + 1] - ii[i];
3292:     nz_max = PetscMax(nz_max, nz);
3293:     nnz[i] = nz;
3294:   }
3295:   MatSeqBAIJSetPreallocation(B, bs, 0, nnz);
3296:   PetscFree(nnz);

3298:   values = (PetscScalar *)V;
3299:   if (!values) PetscCalloc1(bs * bs * (nz_max + 1), &values);
3300:   for (i = 0; i < m; i++) {
3301:     PetscInt        ncols = ii[i + 1] - ii[i];
3302:     const PetscInt *icols = jj + ii[i];
3303:     if (bs == 1 || !roworiented) {
3304:       const PetscScalar *svals = values + (V ? (bs * bs * ii[i]) : 0);
3305:       MatSetValuesBlocked_SeqBAIJ(B, 1, &i, ncols, icols, svals, INSERT_VALUES);
3306:     } else {
3307:       PetscInt j;
3308:       for (j = 0; j < ncols; j++) {
3309:         const PetscScalar *svals = values + (V ? (bs * bs * (ii[i] + j)) : 0);
3310:         MatSetValuesBlocked_SeqBAIJ(B, 1, &i, 1, &icols[j], svals, INSERT_VALUES);
3311:       }
3312:     }
3313:   }
3314:   if (!V) PetscFree(values);
3315:   MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
3316:   MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
3317:   MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);
3318:   return 0;
3319: }

3321: /*@C
3322:    MatSeqBAIJGetArray - gives read/write access to the array where the data for a `MATSEQBAIJ` matrix is stored

3324:    Not Collective

3326:    Input Parameter:
3327: .  mat - a `MATSEQBAIJ` matrix

3329:    Output Parameter:
3330: .   array - pointer to the data

3332:    Level: intermediate

3334: .seealso: `MATSEQBAIJ`, `MatSeqBAIJRestoreArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3335: @*/
3336: PetscErrorCode MatSeqBAIJGetArray(Mat A, PetscScalar **array)
3337: {
3338:   PetscUseMethod(A, "MatSeqBAIJGetArray_C", (Mat, PetscScalar **), (A, array));
3339:   return 0;
3340: }

3342: /*@C
3343:    MatSeqBAIJRestoreArray - returns access to the array where the data for a `MATSEQBAIJ` matrix is stored obtained by `MatSeqBAIJGetArray()`

3345:    Not Collective

3347:    Input Parameters:
3348: +  mat - a `MATSEQBAIJ` matrix
3349: -  array - pointer to the data

3351:    Level: intermediate

3353: .seealso: `MatSeqBAIJGetArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3354: @*/
3355: PetscErrorCode MatSeqBAIJRestoreArray(Mat A, PetscScalar **array)
3356: {
3357:   PetscUseMethod(A, "MatSeqBAIJRestoreArray_C", (Mat, PetscScalar **), (A, array));
3358:   return 0;
3359: }

3361: /*MC
3362:    MATSEQBAIJ - MATSEQBAIJ = "seqbaij" - A matrix type to be used for sequential block sparse matrices, based on
3363:    block sparse compressed row format.

3365:    Options Database Keys:
3366: + -mat_type seqbaij - sets the matrix type to "seqbaij" during a call to MatSetFromOptions()
3367: - -mat_baij_mult_version version - indicate the version of the matrix-vector product to use (0 often indicates using BLAS)

3369:    Level: beginner

3371:    Notes:
3372:     `MatSetOptions`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this no
3373:     space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored

3375:    Run with -info to see what version of the matrix-vector product is being used

3377: .seealso: `MatCreateSeqBAIJ()`
3378: M*/

3380: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat, MatType, MatReuse, Mat *);

3382: PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJ(Mat B)
3383: {
3384:   PetscMPIInt  size;
3385:   Mat_SeqBAIJ *b;

3387:   MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);

3390:   PetscNew(&b);
3391:   B->data = (void *)b;
3392:   PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps));

3394:   b->row          = NULL;
3395:   b->col          = NULL;
3396:   b->icol         = NULL;
3397:   b->reallocs     = 0;
3398:   b->saved_values = NULL;

3400:   b->roworiented        = PETSC_TRUE;
3401:   b->nonew              = 0;
3402:   b->diag               = NULL;
3403:   B->spptr              = NULL;
3404:   B->info.nz_unneeded   = (PetscReal)b->maxnz * b->bs2;
3405:   b->keepnonzeropattern = PETSC_FALSE;

3407:   PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJGetArray_C", MatSeqBAIJGetArray_SeqBAIJ);
3408:   PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJRestoreArray_C", MatSeqBAIJRestoreArray_SeqBAIJ);
3409:   PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqBAIJ);
3410:   PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqBAIJ);
3411:   PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetColumnIndices_C", MatSeqBAIJSetColumnIndices_SeqBAIJ);
3412:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqaij_C", MatConvert_SeqBAIJ_SeqAIJ);
3413:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqsbaij_C", MatConvert_SeqBAIJ_SeqSBAIJ);
3414:   PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocation_C", MatSeqBAIJSetPreallocation_SeqBAIJ);
3415:   PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocationCSR_C", MatSeqBAIJSetPreallocationCSR_SeqBAIJ);
3416:   PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_SeqBAIJ);
3417: #if defined(PETSC_HAVE_HYPRE)
3418:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_hypre_C", MatConvert_AIJ_HYPRE);
3419: #endif
3420:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_is_C", MatConvert_XAIJ_IS);
3421:   PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ);
3422:   return 0;
3423: }

3425: PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
3426: {
3427:   Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data, *a = (Mat_SeqBAIJ *)A->data;
3428:   PetscInt     i, mbs = a->mbs, nz = a->nz, bs2 = a->bs2;


3432:   if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3433:     c->imax           = a->imax;
3434:     c->ilen           = a->ilen;
3435:     c->free_imax_ilen = PETSC_FALSE;
3436:   } else {
3437:     PetscMalloc2(mbs, &c->imax, mbs, &c->ilen);
3438:     for (i = 0; i < mbs; i++) {
3439:       c->imax[i] = a->imax[i];
3440:       c->ilen[i] = a->ilen[i];
3441:     }
3442:     c->free_imax_ilen = PETSC_TRUE;
3443:   }

3445:   /* allocate the matrix space */
3446:   if (mallocmatspace) {
3447:     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3448:       PetscCalloc1(bs2 * nz, &c->a);

3450:       c->i            = a->i;
3451:       c->j            = a->j;
3452:       c->singlemalloc = PETSC_FALSE;
3453:       c->free_a       = PETSC_TRUE;
3454:       c->free_ij      = PETSC_FALSE;
3455:       c->parent       = A;
3456:       C->preallocated = PETSC_TRUE;
3457:       C->assembled    = PETSC_TRUE;

3459:       PetscObjectReference((PetscObject)A);
3460:       MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);
3461:       MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);
3462:     } else {
3463:       PetscMalloc3(bs2 * nz, &c->a, nz, &c->j, mbs + 1, &c->i);

3465:       c->singlemalloc = PETSC_TRUE;
3466:       c->free_a       = PETSC_TRUE;
3467:       c->free_ij      = PETSC_TRUE;

3469:       PetscArraycpy(c->i, a->i, mbs + 1);
3470:       if (mbs > 0) {
3471:         PetscArraycpy(c->j, a->j, nz);
3472:         if (cpvalues == MAT_COPY_VALUES) {
3473:           PetscArraycpy(c->a, a->a, bs2 * nz);
3474:         } else {
3475:           PetscArrayzero(c->a, bs2 * nz);
3476:         }
3477:       }
3478:       C->preallocated = PETSC_TRUE;
3479:       C->assembled    = PETSC_TRUE;
3480:     }
3481:   }

3483:   c->roworiented = a->roworiented;
3484:   c->nonew       = a->nonew;

3486:   PetscLayoutReference(A->rmap, &C->rmap);
3487:   PetscLayoutReference(A->cmap, &C->cmap);

3489:   c->bs2 = a->bs2;
3490:   c->mbs = a->mbs;
3491:   c->nbs = a->nbs;

3493:   if (a->diag) {
3494:     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3495:       c->diag      = a->diag;
3496:       c->free_diag = PETSC_FALSE;
3497:     } else {
3498:       PetscMalloc1(mbs + 1, &c->diag);
3499:       for (i = 0; i < mbs; i++) c->diag[i] = a->diag[i];
3500:       c->free_diag = PETSC_TRUE;
3501:     }
3502:   } else c->diag = NULL;

3504:   c->nz         = a->nz;
3505:   c->maxnz      = a->nz; /* Since we allocate exactly the right amount */
3506:   c->solve_work = NULL;
3507:   c->mult_work  = NULL;
3508:   c->sor_workt  = NULL;
3509:   c->sor_work   = NULL;

3511:   c->compressedrow.use   = a->compressedrow.use;
3512:   c->compressedrow.nrows = a->compressedrow.nrows;
3513:   if (a->compressedrow.use) {
3514:     i = a->compressedrow.nrows;
3515:     PetscMalloc2(i + 1, &c->compressedrow.i, i + 1, &c->compressedrow.rindex);
3516:     PetscArraycpy(c->compressedrow.i, a->compressedrow.i, i + 1);
3517:     PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, i);
3518:   } else {
3519:     c->compressedrow.use    = PETSC_FALSE;
3520:     c->compressedrow.i      = NULL;
3521:     c->compressedrow.rindex = NULL;
3522:   }
3523:   C->nonzerostate = A->nonzerostate;

3525:   PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist);
3526:   return 0;
3527: }

3529: PetscErrorCode MatDuplicate_SeqBAIJ(Mat A, MatDuplicateOption cpvalues, Mat *B)
3530: {
3531:   MatCreate(PetscObjectComm((PetscObject)A), B);
3532:   MatSetSizes(*B, A->rmap->N, A->cmap->n, A->rmap->N, A->cmap->n);
3533:   MatSetType(*B, MATSEQBAIJ);
3534:   MatDuplicateNoCreate_SeqBAIJ(*B, A, cpvalues, PETSC_TRUE);
3535:   return 0;
3536: }

3538: /* Used for both SeqBAIJ and SeqSBAIJ matrices */
3539: PetscErrorCode MatLoad_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
3540: {
3541:   PetscInt     header[4], M, N, nz, bs, m, n, mbs, nbs, rows, cols, sum, i, j, k;
3542:   PetscInt    *rowidxs, *colidxs;
3543:   PetscScalar *matvals;

3545:   PetscViewerSetUp(viewer);

3547:   /* read matrix header */
3548:   PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT);
3550:   M  = header[1];
3551:   N  = header[2];
3552:   nz = header[3];

3557:   /* set block sizes from the viewer's .info file */
3558:   MatLoad_Binary_BlockSizes(mat, viewer);
3559:   /* set local and global sizes if not set already */
3560:   if (mat->rmap->n < 0) mat->rmap->n = M;
3561:   if (mat->cmap->n < 0) mat->cmap->n = N;
3562:   if (mat->rmap->N < 0) mat->rmap->N = M;
3563:   if (mat->cmap->N < 0) mat->cmap->N = N;
3564:   PetscLayoutSetUp(mat->rmap);
3565:   PetscLayoutSetUp(mat->cmap);

3567:   /* check if the matrix sizes are correct */
3568:   MatGetSize(mat, &rows, &cols);
3570:   MatGetBlockSize(mat, &bs);
3571:   MatGetLocalSize(mat, &m, &n);
3572:   mbs = m / bs;
3573:   nbs = n / bs;

3575:   /* read in row lengths, column indices and nonzero values */
3576:   PetscMalloc1(m + 1, &rowidxs);
3577:   PetscViewerBinaryRead(viewer, rowidxs + 1, m, NULL, PETSC_INT);
3578:   rowidxs[0] = 0;
3579:   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3580:   sum = rowidxs[m];

3583:   /* read in column indices and nonzero values */
3584:   PetscMalloc2(rowidxs[m], &colidxs, nz, &matvals);
3585:   PetscViewerBinaryRead(viewer, colidxs, rowidxs[m], NULL, PETSC_INT);
3586:   PetscViewerBinaryRead(viewer, matvals, rowidxs[m], NULL, PETSC_SCALAR);

3588:   {               /* preallocate matrix storage */
3589:     PetscBT   bt; /* helper bit set to count nonzeros */
3590:     PetscInt *nnz;
3591:     PetscBool sbaij;

3593:     PetscBTCreate(nbs, &bt);
3594:     PetscCalloc1(mbs, &nnz);
3595:     PetscObjectTypeCompare((PetscObject)mat, MATSEQSBAIJ, &sbaij);
3596:     for (i = 0; i < mbs; i++) {
3597:       PetscBTMemzero(nbs, bt);
3598:       for (k = 0; k < bs; k++) {
3599:         PetscInt row = bs * i + k;
3600:         for (j = rowidxs[row]; j < rowidxs[row + 1]; j++) {
3601:           PetscInt col = colidxs[j];
3602:           if (!sbaij || col >= row)
3603:             if (!PetscBTLookupSet(bt, col / bs)) nnz[i]++;
3604:         }
3605:       }
3606:     }
3607:     PetscBTDestroy(&bt);
3608:     MatSeqBAIJSetPreallocation(mat, bs, 0, nnz);
3609:     MatSeqSBAIJSetPreallocation(mat, bs, 0, nnz);
3610:     PetscFree(nnz);
3611:   }

3613:   /* store matrix values */
3614:   for (i = 0; i < m; i++) {
3615:     PetscInt row = i, s = rowidxs[i], e = rowidxs[i + 1];
3616:     (*mat->ops->setvalues)(mat, 1, &row, e - s, colidxs + s, matvals + s, INSERT_VALUES);
3617:   }

3619:   PetscFree(rowidxs);
3620:   PetscFree2(colidxs, matvals);
3621:   MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
3622:   MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
3623:   return 0;
3624: }

3626: PetscErrorCode MatLoad_SeqBAIJ(Mat mat, PetscViewer viewer)
3627: {
3628:   PetscBool isbinary;

3630:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary);
3632:   MatLoad_SeqBAIJ_Binary(mat, viewer);
3633:   return 0;
3634: }

3636: /*@C
3637:    MatCreateSeqBAIJ - Creates a sparse matrix in `MATSEQAIJ` (block
3638:    compressed row) format.  For good matrix assembly performance the
3639:    user should preallocate the matrix storage by setting the parameter nz
3640:    (or the array nnz).  By setting these parameters accurately, performance
3641:    during matrix assembly can be increased by more than a factor of 50.

3643:    Collective

3645:    Input Parameters:
3646: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3647: .  bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3648:           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3649: .  m - number of rows
3650: .  n - number of columns
3651: .  nz - number of nonzero blocks  per block row (same for all rows)
3652: -  nnz - array containing the number of nonzero blocks in the various block rows
3653:          (possibly different for each block row) or NULL

3655:    Output Parameter:
3656: .  A - the matrix

3658:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3659:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3660:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3662:    Options Database Keys:
3663: +   -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower)
3664: -   -mat_block_size - size of the blocks to use

3666:    Level: intermediate

3668:    Notes:
3669:    The number of rows and columns must be divisible by blocksize.

3671:    If the nnz parameter is given then the nz parameter is ignored

3673:    A nonzero block is any block that as 1 or more nonzeros in it

3675:    The `MATSEQBAIJ` format is fully compatible with standard Fortran 77
3676:    storage.  That is, the stored row and column indices can begin at
3677:    either one (as in Fortran) or zero.  See the users' manual for details.

3679:    Specify the preallocated storage with either nz or nnz (not both).
3680:    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3681:    allocation.  See [Sparse Matrices](sec_matsparse) for details.
3682:    matrices.

3684: .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`
3685: @*/
3686: PetscErrorCode MatCreateSeqBAIJ(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3687: {
3688:   MatCreate(comm, A);
3689:   MatSetSizes(*A, m, n, m, n);
3690:   MatSetType(*A, MATSEQBAIJ);
3691:   MatSeqBAIJSetPreallocation(*A, bs, nz, (PetscInt *)nnz);
3692:   return 0;
3693: }

3695: /*@C
3696:    MatSeqBAIJSetPreallocation - Sets the block size and expected nonzeros
3697:    per row in the matrix. For good matrix assembly performance the
3698:    user should preallocate the matrix storage by setting the parameter nz
3699:    (or the array nnz).  By setting these parameters accurately, performance
3700:    during matrix assembly can be increased by more than a factor of 50.

3702:    Collective

3704:    Input Parameters:
3705: +  B - the matrix
3706: .  bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3707:           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3708: .  nz - number of block nonzeros per block row (same for all rows)
3709: -  nnz - array containing the number of block nonzeros in the various block rows
3710:          (possibly different for each block row) or NULL

3712:    Options Database Keys:
3713: +   -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower)
3714: -   -mat_block_size - size of the blocks to use

3716:    Level: intermediate

3718:    Notes:
3719:    If the nnz parameter is given then the nz parameter is ignored

3721:    You can call `MatGetInfo()` to get information on how effective the preallocation was;
3722:    for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
3723:    You can also run with the option -info and look for messages with the string
3724:    malloc in them to see if additional memory allocation was needed.

3726:    The `MATSEQBAIJ` format is fully compatible with standard Fortran 77
3727:    storage.  That is, the stored row and column indices can begin at
3728:    either one (as in Fortran) or zero.  See the users' manual for details.

3730:    Specify the preallocated storage with either nz or nnz (not both).
3731:    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3732:    allocation.  See [Sparse Matrices](sec_matsparse) for details.

3734: .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`, `MatGetInfo()`
3735: @*/
3736: PetscErrorCode MatSeqBAIJSetPreallocation(Mat B, PetscInt bs, PetscInt nz, const PetscInt nnz[])
3737: {
3741:   PetscTryMethod(B, "MatSeqBAIJSetPreallocation_C", (Mat, PetscInt, PetscInt, const PetscInt[]), (B, bs, nz, nnz));
3742:   return 0;
3743: }

3745: /*@C
3746:    MatSeqBAIJSetPreallocationCSR - Creates a sparse sequential matrix in `MATSEQBAIJ` format using the given nonzero structure and (optional) numerical values

3748:    Collective

3750:    Input Parameters:
3751: +  B - the matrix
3752: .  i - the indices into j for the start of each local row (starts with zero)
3753: .  j - the column indices for each local row (starts with zero) these must be sorted for each row
3754: -  v - optional values in the matrix

3756:    Level: advanced

3758:    Notes:
3759:    The order of the entries in values is specified by the `MatOption` `MAT_ROW_ORIENTED`.  For example, C programs
3760:    may want to use the default `MAT_ROW_ORIENTED` of `PETSC_TRUE` and use an array v[nnz][bs][bs] where the second index is
3761:    over rows within a block and the last index is over columns within a block row.  Fortran programs will likely set
3762:    `MAT_ROW_ORIENTED` of `PETSC_FALSE` and use a Fortran array v(bs,bs,nnz) in which the first index is over rows within a
3763:    block column and the second index is over columns within a block.

3765:    Though this routine has Preallocation() in the name it also sets the exact nonzero locations of the matrix entries and usually the numerical values as well

3767: .seealso: `MatCreate()`, `MatCreateSeqBAIJ()`, `MatSetValues()`, `MatSeqBAIJSetPreallocation()`, `MATSEQBAIJ`
3768: @*/
3769: PetscErrorCode MatSeqBAIJSetPreallocationCSR(Mat B, PetscInt bs, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
3770: {
3774:   PetscTryMethod(B, "MatSeqBAIJSetPreallocationCSR_C", (Mat, PetscInt, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, bs, i, j, v));
3775:   return 0;
3776: }

3778: /*@
3779:      MatCreateSeqBAIJWithArrays - Creates a `MATSEQBAIJ` matrix using matrix elements provided by the user.

3781:      Collective

3783:    Input Parameters:
3784: +  comm - must be an MPI communicator of size 1
3785: .  bs - size of block
3786: .  m - number of rows
3787: .  n - number of columns
3788: .  i - row indices; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row block row of the matrix
3789: .  j - column indices
3790: -  a - matrix values

3792:    Output Parameter:
3793: .  mat - the matrix

3795:    Level: advanced

3797:    Notes:
3798:        The i, j, and a arrays are not copied by this routine, the user must free these arrays
3799:     once the matrix is destroyed

3801:        You cannot set new nonzero locations into this matrix, that will generate an error.

3803:        The i and j indices are 0 based

3805:        When block size is greater than 1 the matrix values must be stored using the `MATSEQBAIJ` storage format

3807:       The order of the entries in values is the same as the block compressed sparse row storage format; that is, it is
3808:       the same as a three dimensional array in Fortran values(bs,bs,nnz) that contains the first column of the first
3809:       block, followed by the second column of the first block etc etc.  That is, the blocks are contiguous in memory
3810:       with column-major ordering within blocks.

3812: .seealso: `MatCreate()`, `MatCreateBAIJ()`, `MatCreateSeqBAIJ()`
3813: @*/
3814: PetscErrorCode MatCreateSeqBAIJWithArrays(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], Mat *mat)
3815: {
3816:   PetscInt     ii;
3817:   Mat_SeqBAIJ *baij;


3822:   MatCreate(comm, mat);
3823:   MatSetSizes(*mat, m, n, m, n);
3824:   MatSetType(*mat, MATSEQBAIJ);
3825:   MatSeqBAIJSetPreallocation(*mat, bs, MAT_SKIP_ALLOCATION, NULL);
3826:   baij = (Mat_SeqBAIJ *)(*mat)->data;
3827:   PetscMalloc2(m, &baij->imax, m, &baij->ilen);

3829:   baij->i = i;
3830:   baij->j = j;
3831:   baij->a = a;

3833:   baij->singlemalloc = PETSC_FALSE;
3834:   baij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
3835:   baij->free_a       = PETSC_FALSE;
3836:   baij->free_ij      = PETSC_FALSE;

3838:   for (ii = 0; ii < m; ii++) {
3839:     baij->ilen[ii] = baij->imax[ii] = i[ii + 1] - i[ii];
3841:   }
3842:   if (PetscDefined(USE_DEBUG)) {
3843:     for (ii = 0; ii < baij->i[m]; ii++) {
3846:     }
3847:   }

3849:   MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY);
3850:   MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY);
3851:   return 0;
3852: }

3854: PetscErrorCode MatCreateMPIMatConcatenateSeqMat_SeqBAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
3855: {
3856:   MatCreateMPIMatConcatenateSeqMat_MPIBAIJ(comm, inmat, n, scall, outmat);
3857:   return 0;
3858: }