Actual source code: inode.c

  1: /*
  2:   This file provides high performance routines for the Inode format (compressed sparse row)
  3:   by taking advantage of rows with identical nonzero structure (I-nodes).
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #if defined(PETSC_HAVE_XMMINTRIN_H)
  7:   #include <xmmintrin.h>
  8: #endif

 10: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
 11: {
 12:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
 13:   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;

 15:   PetscFunctionBegin;
 16:   n = A->cmap->n;
 17:   m = A->rmap->n;
 18:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
 19:   ns_row = a->inode.size;

 21:   min_mn = (m < n) ? m : n;
 22:   if (!ns) {
 23:     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++);
 24:     for (; count + 1 < n; count++, i++);
 25:     if (count < n) i++;
 26:     *size = i;
 27:     PetscFunctionReturn(PETSC_SUCCESS);
 28:   }
 29:   PetscCall(PetscMalloc1(n + 1, &ns_col));

 31:   /* Use the same row structure wherever feasible. */
 32:   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];

 34:   /* if m < n; pad up the remainder with inode_limit */
 35:   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
 36:   /* The last node is the odd ball. pad it up with the remaining rows; */
 37:   if (count < n) {
 38:     ns_col[i] = n - count;
 39:     i++;
 40:   } else if (count > n) {
 41:     /* Adjust for the over estimation */
 42:     ns_col[i - 1] += n - count;
 43:   }
 44:   *size = i;
 45:   *ns   = ns_col;
 46:   PetscFunctionReturn(PETSC_SUCCESS);
 47: }

 49: /*
 50:       This builds symmetric version of nonzero structure,
 51: */
 52: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
 53: {
 54:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
 55:   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
 56:   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
 57:   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;

 59:   PetscFunctionBegin;
 60:   nslim_row = a->inode.node_count;
 61:   m         = A->rmap->n;
 62:   n         = A->cmap->n;
 63:   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
 64:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");

 66:   /* Use the row_inode as column_inode */
 67:   nslim_col = nslim_row;
 68:   ns_col    = ns_row;

 70:   /* allocate space for reformatted inode structure */
 71:   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
 72:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];

 74:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
 75:     nsz = ns_col[i1];
 76:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
 77:   }
 78:   /* allocate space for row pointers */
 79:   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
 80:   *iia = ia;
 81:   PetscCall(PetscMalloc1(nslim_row + 1, &work));

 83:   /* determine the number of columns in each row */
 84:   ia[0] = oshift;
 85:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
 86:     j    = aj + ai[row] + ishift;
 87:     jmax = aj + ai[row + 1] + ishift;
 88:     if (j == jmax) continue; /* empty row */
 89:     col = *j++ + ishift;
 90:     i2  = tvc[col];
 91:     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
 92:       ia[i1 + 1]++;
 93:       ia[i2 + 1]++;
 94:       i2++; /* Start col of next node */
 95:       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
 96:       i2 = tvc[col];
 97:     }
 98:     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
 99:   }

101:   /* shift ia[i] to point to next row */
102:   for (i1 = 1; i1 < nslim_row + 1; i1++) {
103:     row = ia[i1 - 1];
104:     ia[i1] += row;
105:     work[i1 - 1] = row - oshift;
106:   }

108:   /* allocate space for column pointers */
109:   nz = ia[nslim_row] + (!ishift);
110:   PetscCall(PetscMalloc1(nz, &ja));
111:   *jja = ja;

113:   /* loop over lower triangular part putting into ja */
114:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
115:     j    = aj + ai[row] + ishift;
116:     jmax = aj + ai[row + 1] + ishift;
117:     if (j == jmax) continue; /* empty row */
118:     col = *j++ + ishift;
119:     i2  = tvc[col];
120:     while (i2 < i1 && j < jmax) {
121:       ja[work[i2]++] = i1 + oshift;
122:       ja[work[i1]++] = i2 + oshift;
123:       ++i2;
124:       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
125:       i2 = tvc[col];
126:     }
127:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
128:   }
129:   PetscCall(PetscFree(work));
130:   PetscCall(PetscFree2(tns, tvc));
131:   PetscFunctionReturn(PETSC_SUCCESS);
132: }

134: /*
135:       This builds nonsymmetric version of nonzero structure,
136: */
137: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
138: {
139:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
140:   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
141:   PetscInt       *tns, *tvc, nsz, i1, i2;
142:   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;

144:   PetscFunctionBegin;
145:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
146:   nslim_row = a->inode.node_count;
147:   n         = A->cmap->n;

149:   /* Create The column_inode for this matrix */
150:   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));

152:   /* allocate space for reformatted column_inode structure */
153:   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
154:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];

156:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
157:     nsz = ns_col[i1];
158:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
159:   }
160:   /* allocate space for row pointers */
161:   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
162:   *iia = ia;
163:   PetscCall(PetscMalloc1(nslim_row + 1, &work));

165:   /* determine the number of columns in each row */
166:   ia[0] = oshift;
167:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
168:     j  = aj + ai[row] + ishift;
169:     nz = ai[row + 1] - ai[row];
170:     if (!nz) continue; /* empty row */
171:     col = *j++ + ishift;
172:     i2  = tvc[col];
173:     while (nz-- > 0) { /* off-diagonal elements */
174:       ia[i1 + 1]++;
175:       i2++; /* Start col of next node */
176:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
177:       if (nz > 0) i2 = tvc[col];
178:     }
179:   }

181:   /* shift ia[i] to point to next row */
182:   for (i1 = 1; i1 < nslim_row + 1; i1++) {
183:     row = ia[i1 - 1];
184:     ia[i1] += row;
185:     work[i1 - 1] = row - oshift;
186:   }

188:   /* allocate space for column pointers */
189:   nz = ia[nslim_row] + (!ishift);
190:   PetscCall(PetscMalloc1(nz, &ja));
191:   *jja = ja;

193:   /* loop over matrix putting into ja */
194:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
195:     j  = aj + ai[row] + ishift;
196:     nz = ai[row + 1] - ai[row];
197:     if (!nz) continue; /* empty row */
198:     col = *j++ + ishift;
199:     i2  = tvc[col];
200:     while (nz-- > 0) {
201:       ja[work[i1]++] = i2 + oshift;
202:       ++i2;
203:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
204:       if (nz > 0) i2 = tvc[col];
205:     }
206:   }
207:   PetscCall(PetscFree(ns_col));
208:   PetscCall(PetscFree(work));
209:   PetscCall(PetscFree2(tns, tvc));
210:   PetscFunctionReturn(PETSC_SUCCESS);
211: }

213: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
214: {
215:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

217:   PetscFunctionBegin;
218:   if (n) *n = a->inode.node_count;
219:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
220:   if (!blockcompressed) {
221:     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
222:   } else if (symmetric) {
223:     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
224:   } else {
225:     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
226:   }
227:   PetscFunctionReturn(PETSC_SUCCESS);
228: }

230: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
231: {
232:   PetscFunctionBegin;
233:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);

235:   if (!blockcompressed) {
236:     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
237:   } else {
238:     PetscCall(PetscFree(*ia));
239:     PetscCall(PetscFree(*ja));
240:   }
241:   PetscFunctionReturn(PETSC_SUCCESS);
242: }

244: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
245: {
246:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
247:   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
248:   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;

250:   PetscFunctionBegin;
251:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
252:   nslim_row = a->inode.node_count;
253:   n         = A->cmap->n;

255:   /* Create The column_inode for this matrix */
256:   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));

258:   /* allocate space for reformatted column_inode structure */
259:   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
260:   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];

262:   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
263:     nsz = ns_col[i1];
264:     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
265:   }
266:   /* allocate space for column pointers */
267:   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
268:   *iia = ia;
269:   PetscCall(PetscMalloc1(nslim_col + 1, &work));

271:   /* determine the number of columns in each row */
272:   ia[0] = oshift;
273:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
274:     j   = aj + ai[row] + ishift;
275:     col = *j++ + ishift;
276:     i2  = tvc[col];
277:     nz  = ai[row + 1] - ai[row];
278:     while (nz-- > 0) { /* off-diagonal elements */
279:       /* ia[i1+1]++; */
280:       ia[i2 + 1]++;
281:       i2++;
282:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
283:       if (nz > 0) i2 = tvc[col];
284:     }
285:   }

287:   /* shift ia[i] to point to next col */
288:   for (i1 = 1; i1 < nslim_col + 1; i1++) {
289:     col = ia[i1 - 1];
290:     ia[i1] += col;
291:     work[i1 - 1] = col - oshift;
292:   }

294:   /* allocate space for column pointers */
295:   nz = ia[nslim_col] + (!ishift);
296:   PetscCall(PetscMalloc1(nz, &ja));
297:   *jja = ja;

299:   /* loop over matrix putting into ja */
300:   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
301:     j   = aj + ai[row] + ishift;
302:     col = *j++ + ishift;
303:     i2  = tvc[col];
304:     nz  = ai[row + 1] - ai[row];
305:     while (nz-- > 0) {
306:       /* ja[work[i1]++] = i2 + oshift; */
307:       ja[work[i2]++] = i1 + oshift;
308:       i2++;
309:       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
310:       if (nz > 0) i2 = tvc[col];
311:     }
312:   }
313:   PetscCall(PetscFree(ns_col));
314:   PetscCall(PetscFree(work));
315:   PetscCall(PetscFree2(tns, tvc));
316:   PetscFunctionReturn(PETSC_SUCCESS);
317: }

319: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
320: {
321:   PetscFunctionBegin;
322:   PetscCall(MatCreateColInode_Private(A, n, NULL));
323:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);

325:   if (!blockcompressed) {
326:     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
327:   } else if (symmetric) {
328:     /* Since the indices are symmetric it doesn't matter */
329:     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
330:   } else {
331:     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
332:   }
333:   PetscFunctionReturn(PETSC_SUCCESS);
334: }

336: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
337: {
338:   PetscFunctionBegin;
339:   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
340:   if (!blockcompressed) {
341:     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
342:   } else {
343:     PetscCall(PetscFree(*ia));
344:     PetscCall(PetscFree(*ja));
345:   }
346:   PetscFunctionReturn(PETSC_SUCCESS);
347: }

349: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
350: {
351:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
352:   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
353:   PetscScalar       *y;
354:   const PetscScalar *x;
355:   const MatScalar   *v1, *v2, *v3, *v4, *v5;
356:   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
357:   const PetscInt    *idx, *ns, *ii;

359: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
360:   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
361: #endif

363:   PetscFunctionBegin;
364:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
365:   node_max = a->inode.node_count;
366:   ns       = a->inode.size; /* Node Size array */
367:   PetscCall(VecGetArrayRead(xx, &x));
368:   PetscCall(VecGetArray(yy, &y));
369:   idx = a->j;
370:   v1  = a->a;
371:   ii  = a->i;

373:   for (i = 0, row = 0; i < node_max; ++i) {
374:     nsz = ns[i];
375:     n   = ii[1] - ii[0];
376:     nonzerorow += (n > 0) * nsz;
377:     ii += nsz;
378:     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
379:     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
380:     sz = n;                                                                /* No of non zeros in this row */
381:                                                                            /* Switch on the size of Node */
382:     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
383:     case 1:
384:       sum1 = 0.;

386:       for (n = 0; n < sz - 1; n += 2) {
387:         i1 = idx[0]; /* The instructions are ordered to */
388:         i2 = idx[1]; /* make the compiler's job easy */
389:         idx += 2;
390:         tmp0 = x[i1];
391:         tmp1 = x[i2];
392:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
393:         v1 += 2;
394:       }

396:       if (n == sz - 1) { /* Take care of the last nonzero  */
397:         tmp0 = x[*idx++];
398:         sum1 += *v1++ * tmp0;
399:       }
400:       y[row++] = sum1;
401:       break;
402:     case 2:
403:       sum1 = 0.;
404:       sum2 = 0.;
405:       v2   = v1 + n;

407:       for (n = 0; n < sz - 1; n += 2) {
408:         i1 = idx[0];
409:         i2 = idx[1];
410:         idx += 2;
411:         tmp0 = x[i1];
412:         tmp1 = x[i2];
413:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
414:         v1 += 2;
415:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
416:         v2 += 2;
417:       }
418:       if (n == sz - 1) {
419:         tmp0 = x[*idx++];
420:         sum1 += *v1++ * tmp0;
421:         sum2 += *v2++ * tmp0;
422:       }
423:       y[row++] = sum1;
424:       y[row++] = sum2;
425:       v1       = v2; /* Since the next block to be processed starts there*/
426:       idx += sz;
427:       break;
428:     case 3:
429:       sum1 = 0.;
430:       sum2 = 0.;
431:       sum3 = 0.;
432:       v2   = v1 + n;
433:       v3   = v2 + n;

435:       for (n = 0; n < sz - 1; n += 2) {
436:         i1 = idx[0];
437:         i2 = idx[1];
438:         idx += 2;
439:         tmp0 = x[i1];
440:         tmp1 = x[i2];
441:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
442:         v1 += 2;
443:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
444:         v2 += 2;
445:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
446:         v3 += 2;
447:       }
448:       if (n == sz - 1) {
449:         tmp0 = x[*idx++];
450:         sum1 += *v1++ * tmp0;
451:         sum2 += *v2++ * tmp0;
452:         sum3 += *v3++ * tmp0;
453:       }
454:       y[row++] = sum1;
455:       y[row++] = sum2;
456:       y[row++] = sum3;
457:       v1       = v3; /* Since the next block to be processed starts there*/
458:       idx += 2 * sz;
459:       break;
460:     case 4:
461:       sum1 = 0.;
462:       sum2 = 0.;
463:       sum3 = 0.;
464:       sum4 = 0.;
465:       v2   = v1 + n;
466:       v3   = v2 + n;
467:       v4   = v3 + n;

469:       for (n = 0; n < sz - 1; n += 2) {
470:         i1 = idx[0];
471:         i2 = idx[1];
472:         idx += 2;
473:         tmp0 = x[i1];
474:         tmp1 = x[i2];
475:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
476:         v1 += 2;
477:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
478:         v2 += 2;
479:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
480:         v3 += 2;
481:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
482:         v4 += 2;
483:       }
484:       if (n == sz - 1) {
485:         tmp0 = x[*idx++];
486:         sum1 += *v1++ * tmp0;
487:         sum2 += *v2++ * tmp0;
488:         sum3 += *v3++ * tmp0;
489:         sum4 += *v4++ * tmp0;
490:       }
491:       y[row++] = sum1;
492:       y[row++] = sum2;
493:       y[row++] = sum3;
494:       y[row++] = sum4;
495:       v1       = v4; /* Since the next block to be processed starts there*/
496:       idx += 3 * sz;
497:       break;
498:     case 5:
499:       sum1 = 0.;
500:       sum2 = 0.;
501:       sum3 = 0.;
502:       sum4 = 0.;
503:       sum5 = 0.;
504:       v2   = v1 + n;
505:       v3   = v2 + n;
506:       v4   = v3 + n;
507:       v5   = v4 + n;

509:       for (n = 0; n < sz - 1; n += 2) {
510:         i1 = idx[0];
511:         i2 = idx[1];
512:         idx += 2;
513:         tmp0 = x[i1];
514:         tmp1 = x[i2];
515:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
516:         v1 += 2;
517:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
518:         v2 += 2;
519:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
520:         v3 += 2;
521:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
522:         v4 += 2;
523:         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
524:         v5 += 2;
525:       }
526:       if (n == sz - 1) {
527:         tmp0 = x[*idx++];
528:         sum1 += *v1++ * tmp0;
529:         sum2 += *v2++ * tmp0;
530:         sum3 += *v3++ * tmp0;
531:         sum4 += *v4++ * tmp0;
532:         sum5 += *v5++ * tmp0;
533:       }
534:       y[row++] = sum1;
535:       y[row++] = sum2;
536:       y[row++] = sum3;
537:       y[row++] = sum4;
538:       y[row++] = sum5;
539:       v1       = v5; /* Since the next block to be processed starts there */
540:       idx += 4 * sz;
541:       break;
542:     default:
543:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
544:     }
545:   }
546:   PetscCall(VecRestoreArrayRead(xx, &x));
547:   PetscCall(VecRestoreArray(yy, &y));
548:   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
549:   PetscFunctionReturn(PETSC_SUCCESS);
550: }

552: /* Almost same code as the MatMult_SeqAIJ_Inode() */
553: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
554: {
555:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
556:   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
557:   const MatScalar   *v1, *v2, *v3, *v4, *v5;
558:   const PetscScalar *x;
559:   PetscScalar       *y, *z, *zt;
560:   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
561:   const PetscInt    *idx, *ns, *ii;

563:   PetscFunctionBegin;
564:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
565:   node_max = a->inode.node_count;
566:   ns       = a->inode.size; /* Node Size array */

568:   PetscCall(VecGetArrayRead(xx, &x));
569:   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
570:   zt = z;

572:   idx = a->j;
573:   v1  = a->a;
574:   ii  = a->i;

576:   for (i = 0, row = 0; i < node_max; ++i) {
577:     nsz = ns[i];
578:     n   = ii[1] - ii[0];
579:     ii += nsz;
580:     sz = n;        /* No of non zeros in this row */
581:                    /* Switch on the size of Node */
582:     switch (nsz) { /* Each loop in 'case' is unrolled */
583:     case 1:
584:       sum1 = *zt++;

586:       for (n = 0; n < sz - 1; n += 2) {
587:         i1 = idx[0]; /* The instructions are ordered to */
588:         i2 = idx[1]; /* make the compiler's job easy */
589:         idx += 2;
590:         tmp0 = x[i1];
591:         tmp1 = x[i2];
592:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
593:         v1 += 2;
594:       }

596:       if (n == sz - 1) { /* Take care of the last nonzero  */
597:         tmp0 = x[*idx++];
598:         sum1 += *v1++ * tmp0;
599:       }
600:       y[row++] = sum1;
601:       break;
602:     case 2:
603:       sum1 = *zt++;
604:       sum2 = *zt++;
605:       v2   = v1 + n;

607:       for (n = 0; n < sz - 1; n += 2) {
608:         i1 = idx[0];
609:         i2 = idx[1];
610:         idx += 2;
611:         tmp0 = x[i1];
612:         tmp1 = x[i2];
613:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
614:         v1 += 2;
615:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
616:         v2 += 2;
617:       }
618:       if (n == sz - 1) {
619:         tmp0 = x[*idx++];
620:         sum1 += *v1++ * tmp0;
621:         sum2 += *v2++ * tmp0;
622:       }
623:       y[row++] = sum1;
624:       y[row++] = sum2;
625:       v1       = v2; /* Since the next block to be processed starts there*/
626:       idx += sz;
627:       break;
628:     case 3:
629:       sum1 = *zt++;
630:       sum2 = *zt++;
631:       sum3 = *zt++;
632:       v2   = v1 + n;
633:       v3   = v2 + n;

635:       for (n = 0; n < sz - 1; n += 2) {
636:         i1 = idx[0];
637:         i2 = idx[1];
638:         idx += 2;
639:         tmp0 = x[i1];
640:         tmp1 = x[i2];
641:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
642:         v1 += 2;
643:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
644:         v2 += 2;
645:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
646:         v3 += 2;
647:       }
648:       if (n == sz - 1) {
649:         tmp0 = x[*idx++];
650:         sum1 += *v1++ * tmp0;
651:         sum2 += *v2++ * tmp0;
652:         sum3 += *v3++ * tmp0;
653:       }
654:       y[row++] = sum1;
655:       y[row++] = sum2;
656:       y[row++] = sum3;
657:       v1       = v3; /* Since the next block to be processed starts there*/
658:       idx += 2 * sz;
659:       break;
660:     case 4:
661:       sum1 = *zt++;
662:       sum2 = *zt++;
663:       sum3 = *zt++;
664:       sum4 = *zt++;
665:       v2   = v1 + n;
666:       v3   = v2 + n;
667:       v4   = v3 + n;

669:       for (n = 0; n < sz - 1; n += 2) {
670:         i1 = idx[0];
671:         i2 = idx[1];
672:         idx += 2;
673:         tmp0 = x[i1];
674:         tmp1 = x[i2];
675:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
676:         v1 += 2;
677:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
678:         v2 += 2;
679:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
680:         v3 += 2;
681:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
682:         v4 += 2;
683:       }
684:       if (n == sz - 1) {
685:         tmp0 = x[*idx++];
686:         sum1 += *v1++ * tmp0;
687:         sum2 += *v2++ * tmp0;
688:         sum3 += *v3++ * tmp0;
689:         sum4 += *v4++ * tmp0;
690:       }
691:       y[row++] = sum1;
692:       y[row++] = sum2;
693:       y[row++] = sum3;
694:       y[row++] = sum4;
695:       v1       = v4; /* Since the next block to be processed starts there*/
696:       idx += 3 * sz;
697:       break;
698:     case 5:
699:       sum1 = *zt++;
700:       sum2 = *zt++;
701:       sum3 = *zt++;
702:       sum4 = *zt++;
703:       sum5 = *zt++;
704:       v2   = v1 + n;
705:       v3   = v2 + n;
706:       v4   = v3 + n;
707:       v5   = v4 + n;

709:       for (n = 0; n < sz - 1; n += 2) {
710:         i1 = idx[0];
711:         i2 = idx[1];
712:         idx += 2;
713:         tmp0 = x[i1];
714:         tmp1 = x[i2];
715:         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
716:         v1 += 2;
717:         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
718:         v2 += 2;
719:         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
720:         v3 += 2;
721:         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
722:         v4 += 2;
723:         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
724:         v5 += 2;
725:       }
726:       if (n == sz - 1) {
727:         tmp0 = x[*idx++];
728:         sum1 += *v1++ * tmp0;
729:         sum2 += *v2++ * tmp0;
730:         sum3 += *v3++ * tmp0;
731:         sum4 += *v4++ * tmp0;
732:         sum5 += *v5++ * tmp0;
733:       }
734:       y[row++] = sum1;
735:       y[row++] = sum2;
736:       y[row++] = sum3;
737:       y[row++] = sum4;
738:       y[row++] = sum5;
739:       v1       = v5; /* Since the next block to be processed starts there */
740:       idx += 4 * sz;
741:       break;
742:     default:
743:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
744:     }
745:   }
746:   PetscCall(VecRestoreArrayRead(xx, &x));
747:   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
748:   PetscCall(PetscLogFlops(2.0 * a->nz));
749:   PetscFunctionReturn(PETSC_SUCCESS);
750: }

752: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
753: {
754:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
755:   IS                 iscol = a->col, isrow = a->row;
756:   const PetscInt    *r, *c, *rout, *cout;
757:   PetscInt           i, j, n = A->rmap->n, nz;
758:   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
759:   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
760:   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
761:   PetscScalar        sum1, sum2, sum3, sum4, sum5;
762:   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
763:   const PetscScalar *b;

765:   PetscFunctionBegin;
766:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
767:   node_max = a->inode.node_count;
768:   ns       = a->inode.size; /* Node Size array */

770:   PetscCall(VecGetArrayRead(bb, &b));
771:   PetscCall(VecGetArrayWrite(xx, &x));
772:   tmp = a->solve_work;

774:   PetscCall(ISGetIndices(isrow, &rout));
775:   r = rout;
776:   PetscCall(ISGetIndices(iscol, &cout));
777:   c = cout + (n - 1);

779:   /* forward solve the lower triangular */
780:   tmps = tmp;
781:   aa   = a_a;
782:   aj   = a_j;
783:   ad   = a->diag;

785:   for (i = 0, row = 0; i < node_max; ++i) {
786:     nsz = ns[i];
787:     aii = ai[row];
788:     v1  = aa + aii;
789:     vi  = aj + aii;
790:     nz  = ad[row] - aii;
791:     if (i < node_max - 1) {
792:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
793:       * but our indexing to determine its size could. */
794:       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
795:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
796:       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
797:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
798:     }

800:     switch (nsz) { /* Each loop in 'case' is unrolled */
801:     case 1:
802:       sum1 = b[*r++];
803:       for (j = 0; j < nz - 1; j += 2) {
804:         i0 = vi[0];
805:         i1 = vi[1];
806:         vi += 2;
807:         tmp0 = tmps[i0];
808:         tmp1 = tmps[i1];
809:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
810:         v1 += 2;
811:       }
812:       if (j == nz - 1) {
813:         tmp0 = tmps[*vi++];
814:         sum1 -= *v1++ * tmp0;
815:       }
816:       tmp[row++] = sum1;
817:       break;
818:     case 2:
819:       sum1 = b[*r++];
820:       sum2 = b[*r++];
821:       v2   = aa + ai[row + 1];

823:       for (j = 0; j < nz - 1; j += 2) {
824:         i0 = vi[0];
825:         i1 = vi[1];
826:         vi += 2;
827:         tmp0 = tmps[i0];
828:         tmp1 = tmps[i1];
829:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
830:         v1 += 2;
831:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
832:         v2 += 2;
833:       }
834:       if (j == nz - 1) {
835:         tmp0 = tmps[*vi++];
836:         sum1 -= *v1++ * tmp0;
837:         sum2 -= *v2++ * tmp0;
838:       }
839:       sum2 -= *v2++ * sum1;
840:       tmp[row++] = sum1;
841:       tmp[row++] = sum2;
842:       break;
843:     case 3:
844:       sum1 = b[*r++];
845:       sum2 = b[*r++];
846:       sum3 = b[*r++];
847:       v2   = aa + ai[row + 1];
848:       v3   = aa + ai[row + 2];

850:       for (j = 0; j < nz - 1; j += 2) {
851:         i0 = vi[0];
852:         i1 = vi[1];
853:         vi += 2;
854:         tmp0 = tmps[i0];
855:         tmp1 = tmps[i1];
856:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
857:         v1 += 2;
858:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
859:         v2 += 2;
860:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
861:         v3 += 2;
862:       }
863:       if (j == nz - 1) {
864:         tmp0 = tmps[*vi++];
865:         sum1 -= *v1++ * tmp0;
866:         sum2 -= *v2++ * tmp0;
867:         sum3 -= *v3++ * tmp0;
868:       }
869:       sum2 -= *v2++ * sum1;
870:       sum3 -= *v3++ * sum1;
871:       sum3 -= *v3++ * sum2;

873:       tmp[row++] = sum1;
874:       tmp[row++] = sum2;
875:       tmp[row++] = sum3;
876:       break;

878:     case 4:
879:       sum1 = b[*r++];
880:       sum2 = b[*r++];
881:       sum3 = b[*r++];
882:       sum4 = b[*r++];
883:       v2   = aa + ai[row + 1];
884:       v3   = aa + ai[row + 2];
885:       v4   = aa + ai[row + 3];

887:       for (j = 0; j < nz - 1; j += 2) {
888:         i0 = vi[0];
889:         i1 = vi[1];
890:         vi += 2;
891:         tmp0 = tmps[i0];
892:         tmp1 = tmps[i1];
893:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
894:         v1 += 2;
895:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
896:         v2 += 2;
897:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
898:         v3 += 2;
899:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
900:         v4 += 2;
901:       }
902:       if (j == nz - 1) {
903:         tmp0 = tmps[*vi++];
904:         sum1 -= *v1++ * tmp0;
905:         sum2 -= *v2++ * tmp0;
906:         sum3 -= *v3++ * tmp0;
907:         sum4 -= *v4++ * tmp0;
908:       }
909:       sum2 -= *v2++ * sum1;
910:       sum3 -= *v3++ * sum1;
911:       sum4 -= *v4++ * sum1;
912:       sum3 -= *v3++ * sum2;
913:       sum4 -= *v4++ * sum2;
914:       sum4 -= *v4++ * sum3;

916:       tmp[row++] = sum1;
917:       tmp[row++] = sum2;
918:       tmp[row++] = sum3;
919:       tmp[row++] = sum4;
920:       break;
921:     case 5:
922:       sum1 = b[*r++];
923:       sum2 = b[*r++];
924:       sum3 = b[*r++];
925:       sum4 = b[*r++];
926:       sum5 = b[*r++];
927:       v2   = aa + ai[row + 1];
928:       v3   = aa + ai[row + 2];
929:       v4   = aa + ai[row + 3];
930:       v5   = aa + ai[row + 4];

932:       for (j = 0; j < nz - 1; j += 2) {
933:         i0 = vi[0];
934:         i1 = vi[1];
935:         vi += 2;
936:         tmp0 = tmps[i0];
937:         tmp1 = tmps[i1];
938:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
939:         v1 += 2;
940:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
941:         v2 += 2;
942:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
943:         v3 += 2;
944:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
945:         v4 += 2;
946:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
947:         v5 += 2;
948:       }
949:       if (j == nz - 1) {
950:         tmp0 = tmps[*vi++];
951:         sum1 -= *v1++ * tmp0;
952:         sum2 -= *v2++ * tmp0;
953:         sum3 -= *v3++ * tmp0;
954:         sum4 -= *v4++ * tmp0;
955:         sum5 -= *v5++ * tmp0;
956:       }

958:       sum2 -= *v2++ * sum1;
959:       sum3 -= *v3++ * sum1;
960:       sum4 -= *v4++ * sum1;
961:       sum5 -= *v5++ * sum1;
962:       sum3 -= *v3++ * sum2;
963:       sum4 -= *v4++ * sum2;
964:       sum5 -= *v5++ * sum2;
965:       sum4 -= *v4++ * sum3;
966:       sum5 -= *v5++ * sum3;
967:       sum5 -= *v5++ * sum4;

969:       tmp[row++] = sum1;
970:       tmp[row++] = sum2;
971:       tmp[row++] = sum3;
972:       tmp[row++] = sum4;
973:       tmp[row++] = sum5;
974:       break;
975:     default:
976:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
977:     }
978:   }
979:   /* backward solve the upper triangular */
980:   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
981:     nsz = ns[i];
982:     aii = ai[row + 1] - 1;
983:     v1  = aa + aii;
984:     vi  = aj + aii;
985:     nz  = aii - ad[row];
986:     switch (nsz) { /* Each loop in 'case' is unrolled */
987:     case 1:
988:       sum1 = tmp[row];

990:       for (j = nz; j > 1; j -= 2) {
991:         vi -= 2;
992:         i0   = vi[2];
993:         i1   = vi[1];
994:         tmp0 = tmps[i0];
995:         tmp1 = tmps[i1];
996:         v1 -= 2;
997:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
998:       }
999:       if (j == 1) {
1000:         tmp0 = tmps[*vi--];
1001:         sum1 -= *v1-- * tmp0;
1002:       }
1003:       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1004:       row--;
1005:       break;
1006:     case 2:
1007:       sum1 = tmp[row];
1008:       sum2 = tmp[row - 1];
1009:       v2   = aa + ai[row] - 1;
1010:       for (j = nz; j > 1; j -= 2) {
1011:         vi -= 2;
1012:         i0   = vi[2];
1013:         i1   = vi[1];
1014:         tmp0 = tmps[i0];
1015:         tmp1 = tmps[i1];
1016:         v1 -= 2;
1017:         v2 -= 2;
1018:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1019:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1020:       }
1021:       if (j == 1) {
1022:         tmp0 = tmps[*vi--];
1023:         sum1 -= *v1-- * tmp0;
1024:         sum2 -= *v2-- * tmp0;
1025:       }

1027:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1028:       row--;
1029:       sum2 -= *v2-- * tmp0;
1030:       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1031:       row--;
1032:       break;
1033:     case 3:
1034:       sum1 = tmp[row];
1035:       sum2 = tmp[row - 1];
1036:       sum3 = tmp[row - 2];
1037:       v2   = aa + ai[row] - 1;
1038:       v3   = aa + ai[row - 1] - 1;
1039:       for (j = nz; j > 1; j -= 2) {
1040:         vi -= 2;
1041:         i0   = vi[2];
1042:         i1   = vi[1];
1043:         tmp0 = tmps[i0];
1044:         tmp1 = tmps[i1];
1045:         v1 -= 2;
1046:         v2 -= 2;
1047:         v3 -= 2;
1048:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1049:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1050:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1051:       }
1052:       if (j == 1) {
1053:         tmp0 = tmps[*vi--];
1054:         sum1 -= *v1-- * tmp0;
1055:         sum2 -= *v2-- * tmp0;
1056:         sum3 -= *v3-- * tmp0;
1057:       }
1058:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1059:       row--;
1060:       sum2 -= *v2-- * tmp0;
1061:       sum3 -= *v3-- * tmp0;
1062:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1063:       row--;
1064:       sum3 -= *v3-- * tmp0;
1065:       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1066:       row--;

1068:       break;
1069:     case 4:
1070:       sum1 = tmp[row];
1071:       sum2 = tmp[row - 1];
1072:       sum3 = tmp[row - 2];
1073:       sum4 = tmp[row - 3];
1074:       v2   = aa + ai[row] - 1;
1075:       v3   = aa + ai[row - 1] - 1;
1076:       v4   = aa + ai[row - 2] - 1;

1078:       for (j = nz; j > 1; j -= 2) {
1079:         vi -= 2;
1080:         i0   = vi[2];
1081:         i1   = vi[1];
1082:         tmp0 = tmps[i0];
1083:         tmp1 = tmps[i1];
1084:         v1 -= 2;
1085:         v2 -= 2;
1086:         v3 -= 2;
1087:         v4 -= 2;
1088:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092:       }
1093:       if (j == 1) {
1094:         tmp0 = tmps[*vi--];
1095:         sum1 -= *v1-- * tmp0;
1096:         sum2 -= *v2-- * tmp0;
1097:         sum3 -= *v3-- * tmp0;
1098:         sum4 -= *v4-- * tmp0;
1099:       }

1101:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1102:       row--;
1103:       sum2 -= *v2-- * tmp0;
1104:       sum3 -= *v3-- * tmp0;
1105:       sum4 -= *v4-- * tmp0;
1106:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1107:       row--;
1108:       sum3 -= *v3-- * tmp0;
1109:       sum4 -= *v4-- * tmp0;
1110:       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1111:       row--;
1112:       sum4 -= *v4-- * tmp0;
1113:       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1114:       row--;
1115:       break;
1116:     case 5:
1117:       sum1 = tmp[row];
1118:       sum2 = tmp[row - 1];
1119:       sum3 = tmp[row - 2];
1120:       sum4 = tmp[row - 3];
1121:       sum5 = tmp[row - 4];
1122:       v2   = aa + ai[row] - 1;
1123:       v3   = aa + ai[row - 1] - 1;
1124:       v4   = aa + ai[row - 2] - 1;
1125:       v5   = aa + ai[row - 3] - 1;
1126:       for (j = nz; j > 1; j -= 2) {
1127:         vi -= 2;
1128:         i0   = vi[2];
1129:         i1   = vi[1];
1130:         tmp0 = tmps[i0];
1131:         tmp1 = tmps[i1];
1132:         v1 -= 2;
1133:         v2 -= 2;
1134:         v3 -= 2;
1135:         v4 -= 2;
1136:         v5 -= 2;
1137:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142:       }
1143:       if (j == 1) {
1144:         tmp0 = tmps[*vi--];
1145:         sum1 -= *v1-- * tmp0;
1146:         sum2 -= *v2-- * tmp0;
1147:         sum3 -= *v3-- * tmp0;
1148:         sum4 -= *v4-- * tmp0;
1149:         sum5 -= *v5-- * tmp0;
1150:       }

1152:       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1153:       row--;
1154:       sum2 -= *v2-- * tmp0;
1155:       sum3 -= *v3-- * tmp0;
1156:       sum4 -= *v4-- * tmp0;
1157:       sum5 -= *v5-- * tmp0;
1158:       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1159:       row--;
1160:       sum3 -= *v3-- * tmp0;
1161:       sum4 -= *v4-- * tmp0;
1162:       sum5 -= *v5-- * tmp0;
1163:       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1164:       row--;
1165:       sum4 -= *v4-- * tmp0;
1166:       sum5 -= *v5-- * tmp0;
1167:       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1168:       row--;
1169:       sum5 -= *v5-- * tmp0;
1170:       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1171:       row--;
1172:       break;
1173:     default:
1174:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1175:     }
1176:   }
1177:   PetscCall(ISRestoreIndices(isrow, &rout));
1178:   PetscCall(ISRestoreIndices(iscol, &cout));
1179:   PetscCall(VecRestoreArrayRead(bb, &b));
1180:   PetscCall(VecRestoreArrayWrite(xx, &x));
1181:   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1182:   PetscFunctionReturn(PETSC_SUCCESS);
1183: }

1185: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1186: {
1187:   Mat              C = B;
1188:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1189:   IS               isrow = b->row, isicol = b->icol;
1190:   const PetscInt  *r, *ic, *ics;
1191:   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1192:   PetscInt         i, j, k, nz, nzL, row, *pj;
1193:   const PetscInt  *ajtmp, *bjtmp;
1194:   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1195:   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1196:   FactorShiftCtx   sctx;
1197:   const PetscInt  *ddiag;
1198:   PetscReal        rs;
1199:   MatScalar        d;
1200:   PetscInt         inod, nodesz, node_max, col;
1201:   const PetscInt  *ns;
1202:   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;

1204:   PetscFunctionBegin;
1205:   /* MatPivotSetUp(): initialize shift context sctx */
1206:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

1208:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1209:     ddiag          = a->diag;
1210:     sctx.shift_top = info->zeropivot;
1211:     for (i = 0; i < n; i++) {
1212:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1213:       d  = (aa)[ddiag[i]];
1214:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1215:       v  = aa + ai[i];
1216:       nz = ai[i + 1] - ai[i];
1217:       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1218:       if (rs > sctx.shift_top) sctx.shift_top = rs;
1219:     }
1220:     sctx.shift_top *= 1.1;
1221:     sctx.nshift_max = 5;
1222:     sctx.shift_lo   = 0.;
1223:     sctx.shift_hi   = 1.;
1224:   }

1226:   PetscCall(ISGetIndices(isrow, &r));
1227:   PetscCall(ISGetIndices(isicol, &ic));

1229:   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1230:   ics = ic;

1232:   node_max = a->inode.node_count;
1233:   ns       = a->inode.size;
1234:   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");

1236:   /* If max inode size > 4, split it into two inodes.*/
1237:   /* also map the inode sizes according to the ordering */
1238:   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1239:   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1240:     if (ns[i] > 4) {
1241:       tmp_vec1[j] = 4;
1242:       ++j;
1243:       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1244:     } else {
1245:       tmp_vec1[j] = ns[i];
1246:     }
1247:   }
1248:   /* Use the correct node_max */
1249:   node_max = j;

1251:   /* Now reorder the inode info based on mat re-ordering info */
1252:   /* First create a row -> inode_size_array_index map */
1253:   PetscCall(PetscMalloc1(n + 1, &nsmap));
1254:   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1255:   for (i = 0, row = 0; i < node_max; i++) {
1256:     nodesz = tmp_vec1[i];
1257:     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1258:   }
1259:   /* Using nsmap, create a reordered ns structure */
1260:   for (i = 0, j = 0; i < node_max; i++) {
1261:     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1262:     tmp_vec2[i] = nodesz;
1263:     j += nodesz;
1264:   }
1265:   PetscCall(PetscFree(nsmap));
1266:   PetscCall(PetscFree(tmp_vec1));

1268:   /* Now use the correct ns */
1269:   ns = tmp_vec2;

1271:   do {
1272:     sctx.newshift = PETSC_FALSE;
1273:     /* Now loop over each block-row, and do the factorization */
1274:     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1275:       nodesz = ns[inod];

1277:       switch (nodesz) {
1278:       case 1:
1279:         /* zero rtmp1 */
1280:         /* L part */
1281:         nz    = bi[i + 1] - bi[i];
1282:         bjtmp = bj + bi[i];
1283:         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;

1285:         /* U part */
1286:         nz    = bdiag[i] - bdiag[i + 1];
1287:         bjtmp = bj + bdiag[i + 1] + 1;
1288:         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;

1290:         /* load in initial (unfactored row) */
1291:         nz    = ai[r[i] + 1] - ai[r[i]];
1292:         ajtmp = aj + ai[r[i]];
1293:         v     = aa + ai[r[i]];
1294:         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];

1296:         /* ZeropivotApply() */
1297:         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

1299:         /* elimination */
1300:         bjtmp = bj + bi[i];
1301:         row   = *bjtmp++;
1302:         nzL   = bi[i + 1] - bi[i];
1303:         for (k = 0; k < nzL; k++) {
1304:           pc = rtmp1 + row;
1305:           if (*pc != 0.0) {
1306:             pv   = b->a + bdiag[row];
1307:             mul1 = *pc * (*pv);
1308:             *pc  = mul1;
1309:             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1310:             pv   = b->a + bdiag[row + 1] + 1;
1311:             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1312:             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1313:             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1314:           }
1315:           row = *bjtmp++;
1316:         }

1318:         /* finished row so stick it into b->a */
1319:         rs = 0.0;
1320:         /* L part */
1321:         pv = b->a + bi[i];
1322:         pj = b->j + bi[i];
1323:         nz = bi[i + 1] - bi[i];
1324:         for (j = 0; j < nz; j++) {
1325:           pv[j] = rtmp1[pj[j]];
1326:           rs += PetscAbsScalar(pv[j]);
1327:         }

1329:         /* U part */
1330:         pv = b->a + bdiag[i + 1] + 1;
1331:         pj = b->j + bdiag[i + 1] + 1;
1332:         nz = bdiag[i] - bdiag[i + 1] - 1;
1333:         for (j = 0; j < nz; j++) {
1334:           pv[j] = rtmp1[pj[j]];
1335:           rs += PetscAbsScalar(pv[j]);
1336:         }

1338:         /* Check zero pivot */
1339:         sctx.rs = rs;
1340:         sctx.pv = rtmp1[i];
1341:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1342:         if (sctx.newshift) break;

1344:         /* Mark diagonal and invert diagonal for simpler triangular solves */
1345:         pv  = b->a + bdiag[i];
1346:         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1347:         break;

1349:       case 2:
1350:         /* zero rtmp1 and rtmp2 */
1351:         /* L part */
1352:         nz    = bi[i + 1] - bi[i];
1353:         bjtmp = bj + bi[i];
1354:         for (j = 0; j < nz; j++) {
1355:           col        = bjtmp[j];
1356:           rtmp1[col] = 0.0;
1357:           rtmp2[col] = 0.0;
1358:         }

1360:         /* U part */
1361:         nz    = bdiag[i] - bdiag[i + 1];
1362:         bjtmp = bj + bdiag[i + 1] + 1;
1363:         for (j = 0; j < nz; j++) {
1364:           col        = bjtmp[j];
1365:           rtmp1[col] = 0.0;
1366:           rtmp2[col] = 0.0;
1367:         }

1369:         /* load in initial (unfactored row) */
1370:         nz    = ai[r[i] + 1] - ai[r[i]];
1371:         ajtmp = aj + ai[r[i]];
1372:         v1    = aa + ai[r[i]];
1373:         v2    = aa + ai[r[i] + 1];
1374:         for (j = 0; j < nz; j++) {
1375:           col        = ics[ajtmp[j]];
1376:           rtmp1[col] = v1[j];
1377:           rtmp2[col] = v2[j];
1378:         }
1379:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1380:         rtmp1[i] += sctx.shift_amount;
1381:         rtmp2[i + 1] += sctx.shift_amount;

1383:         /* elimination */
1384:         bjtmp = bj + bi[i];
1385:         row   = *bjtmp++; /* pivot row */
1386:         nzL   = bi[i + 1] - bi[i];
1387:         for (k = 0; k < nzL; k++) {
1388:           pc1 = rtmp1 + row;
1389:           pc2 = rtmp2 + row;
1390:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1391:             pv   = b->a + bdiag[row];
1392:             mul1 = *pc1 * (*pv);
1393:             mul2 = *pc2 * (*pv);
1394:             *pc1 = mul1;
1395:             *pc2 = mul2;

1397:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1398:             pv = b->a + bdiag[row + 1] + 1;
1399:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1400:             for (j = 0; j < nz; j++) {
1401:               col = pj[j];
1402:               rtmp1[col] -= mul1 * pv[j];
1403:               rtmp2[col] -= mul2 * pv[j];
1404:             }
1405:             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1406:           }
1407:           row = *bjtmp++;
1408:         }

1410:         /* finished row i; check zero pivot, then stick row i into b->a */
1411:         rs = 0.0;
1412:         /* L part */
1413:         pc1 = b->a + bi[i];
1414:         pj  = b->j + bi[i];
1415:         nz  = bi[i + 1] - bi[i];
1416:         for (j = 0; j < nz; j++) {
1417:           col    = pj[j];
1418:           pc1[j] = rtmp1[col];
1419:           rs += PetscAbsScalar(pc1[j]);
1420:         }
1421:         /* U part */
1422:         pc1 = b->a + bdiag[i + 1] + 1;
1423:         pj  = b->j + bdiag[i + 1] + 1;
1424:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1425:         for (j = 0; j < nz; j++) {
1426:           col    = pj[j];
1427:           pc1[j] = rtmp1[col];
1428:           rs += PetscAbsScalar(pc1[j]);
1429:         }

1431:         sctx.rs = rs;
1432:         sctx.pv = rtmp1[i];
1433:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1434:         if (sctx.newshift) break;
1435:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1436:         *pc1 = 1.0 / sctx.pv;

1438:         /* Now take care of diagonal 2x2 block. */
1439:         pc2 = rtmp2 + i;
1440:         if (*pc2 != 0.0) {
1441:           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1442:           *pc2 = mul1;                        /* insert L entry */
1443:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1444:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1445:           for (j = 0; j < nz; j++) {
1446:             col = pj[j];
1447:             rtmp2[col] -= mul1 * rtmp1[col];
1448:           }
1449:           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1450:         }

1452:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1453:         rs = 0.0;
1454:         /* L part */
1455:         pc2 = b->a + bi[i + 1];
1456:         pj  = b->j + bi[i + 1];
1457:         nz  = bi[i + 2] - bi[i + 1];
1458:         for (j = 0; j < nz; j++) {
1459:           col    = pj[j];
1460:           pc2[j] = rtmp2[col];
1461:           rs += PetscAbsScalar(pc2[j]);
1462:         }
1463:         /* U part */
1464:         pc2 = b->a + bdiag[i + 2] + 1;
1465:         pj  = b->j + bdiag[i + 2] + 1;
1466:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1467:         for (j = 0; j < nz; j++) {
1468:           col    = pj[j];
1469:           pc2[j] = rtmp2[col];
1470:           rs += PetscAbsScalar(pc2[j]);
1471:         }

1473:         sctx.rs = rs;
1474:         sctx.pv = rtmp2[i + 1];
1475:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1476:         if (sctx.newshift) break;
1477:         pc2  = b->a + bdiag[i + 1];
1478:         *pc2 = 1.0 / sctx.pv;
1479:         break;

1481:       case 3:
1482:         /* zero rtmp */
1483:         /* L part */
1484:         nz    = bi[i + 1] - bi[i];
1485:         bjtmp = bj + bi[i];
1486:         for (j = 0; j < nz; j++) {
1487:           col        = bjtmp[j];
1488:           rtmp1[col] = 0.0;
1489:           rtmp2[col] = 0.0;
1490:           rtmp3[col] = 0.0;
1491:         }

1493:         /* U part */
1494:         nz    = bdiag[i] - bdiag[i + 1];
1495:         bjtmp = bj + bdiag[i + 1] + 1;
1496:         for (j = 0; j < nz; j++) {
1497:           col        = bjtmp[j];
1498:           rtmp1[col] = 0.0;
1499:           rtmp2[col] = 0.0;
1500:           rtmp3[col] = 0.0;
1501:         }

1503:         /* load in initial (unfactored row) */
1504:         nz    = ai[r[i] + 1] - ai[r[i]];
1505:         ajtmp = aj + ai[r[i]];
1506:         v1    = aa + ai[r[i]];
1507:         v2    = aa + ai[r[i] + 1];
1508:         v3    = aa + ai[r[i] + 2];
1509:         for (j = 0; j < nz; j++) {
1510:           col        = ics[ajtmp[j]];
1511:           rtmp1[col] = v1[j];
1512:           rtmp2[col] = v2[j];
1513:           rtmp3[col] = v3[j];
1514:         }
1515:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1516:         rtmp1[i] += sctx.shift_amount;
1517:         rtmp2[i + 1] += sctx.shift_amount;
1518:         rtmp3[i + 2] += sctx.shift_amount;

1520:         /* elimination */
1521:         bjtmp = bj + bi[i];
1522:         row   = *bjtmp++; /* pivot row */
1523:         nzL   = bi[i + 1] - bi[i];
1524:         for (k = 0; k < nzL; k++) {
1525:           pc1 = rtmp1 + row;
1526:           pc2 = rtmp2 + row;
1527:           pc3 = rtmp3 + row;
1528:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1529:             pv   = b->a + bdiag[row];
1530:             mul1 = *pc1 * (*pv);
1531:             mul2 = *pc2 * (*pv);
1532:             mul3 = *pc3 * (*pv);
1533:             *pc1 = mul1;
1534:             *pc2 = mul2;
1535:             *pc3 = mul3;

1537:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1538:             pv = b->a + bdiag[row + 1] + 1;
1539:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1540:             for (j = 0; j < nz; j++) {
1541:               col = pj[j];
1542:               rtmp1[col] -= mul1 * pv[j];
1543:               rtmp2[col] -= mul2 * pv[j];
1544:               rtmp3[col] -= mul3 * pv[j];
1545:             }
1546:             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1547:           }
1548:           row = *bjtmp++;
1549:         }

1551:         /* finished row i; check zero pivot, then stick row i into b->a */
1552:         rs = 0.0;
1553:         /* L part */
1554:         pc1 = b->a + bi[i];
1555:         pj  = b->j + bi[i];
1556:         nz  = bi[i + 1] - bi[i];
1557:         for (j = 0; j < nz; j++) {
1558:           col    = pj[j];
1559:           pc1[j] = rtmp1[col];
1560:           rs += PetscAbsScalar(pc1[j]);
1561:         }
1562:         /* U part */
1563:         pc1 = b->a + bdiag[i + 1] + 1;
1564:         pj  = b->j + bdiag[i + 1] + 1;
1565:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1566:         for (j = 0; j < nz; j++) {
1567:           col    = pj[j];
1568:           pc1[j] = rtmp1[col];
1569:           rs += PetscAbsScalar(pc1[j]);
1570:         }

1572:         sctx.rs = rs;
1573:         sctx.pv = rtmp1[i];
1574:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1575:         if (sctx.newshift) break;
1576:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1577:         *pc1 = 1.0 / sctx.pv;

1579:         /* Now take care of 1st column of diagonal 3x3 block. */
1580:         pc2 = rtmp2 + i;
1581:         pc3 = rtmp3 + i;
1582:         if (*pc2 != 0.0 || *pc3 != 0.0) {
1583:           mul2 = (*pc2) * (*pc1);
1584:           *pc2 = mul2;
1585:           mul3 = (*pc3) * (*pc1);
1586:           *pc3 = mul3;
1587:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1588:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1589:           for (j = 0; j < nz; j++) {
1590:             col = pj[j];
1591:             rtmp2[col] -= mul2 * rtmp1[col];
1592:             rtmp3[col] -= mul3 * rtmp1[col];
1593:           }
1594:           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1595:         }

1597:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1598:         rs = 0.0;
1599:         /* L part */
1600:         pc2 = b->a + bi[i + 1];
1601:         pj  = b->j + bi[i + 1];
1602:         nz  = bi[i + 2] - bi[i + 1];
1603:         for (j = 0; j < nz; j++) {
1604:           col    = pj[j];
1605:           pc2[j] = rtmp2[col];
1606:           rs += PetscAbsScalar(pc2[j]);
1607:         }
1608:         /* U part */
1609:         pc2 = b->a + bdiag[i + 2] + 1;
1610:         pj  = b->j + bdiag[i + 2] + 1;
1611:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1612:         for (j = 0; j < nz; j++) {
1613:           col    = pj[j];
1614:           pc2[j] = rtmp2[col];
1615:           rs += PetscAbsScalar(pc2[j]);
1616:         }

1618:         sctx.rs = rs;
1619:         sctx.pv = rtmp2[i + 1];
1620:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1621:         if (sctx.newshift) break;
1622:         pc2  = b->a + bdiag[i + 1];
1623:         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */

1625:         /* Now take care of 2nd column of diagonal 3x3 block. */
1626:         pc3 = rtmp3 + i + 1;
1627:         if (*pc3 != 0.0) {
1628:           mul3 = (*pc3) * (*pc2);
1629:           *pc3 = mul3;
1630:           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1631:           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1632:           for (j = 0; j < nz; j++) {
1633:             col = pj[j];
1634:             rtmp3[col] -= mul3 * rtmp2[col];
1635:           }
1636:           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1637:         }

1639:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1640:         rs = 0.0;
1641:         /* L part */
1642:         pc3 = b->a + bi[i + 2];
1643:         pj  = b->j + bi[i + 2];
1644:         nz  = bi[i + 3] - bi[i + 2];
1645:         for (j = 0; j < nz; j++) {
1646:           col    = pj[j];
1647:           pc3[j] = rtmp3[col];
1648:           rs += PetscAbsScalar(pc3[j]);
1649:         }
1650:         /* U part */
1651:         pc3 = b->a + bdiag[i + 3] + 1;
1652:         pj  = b->j + bdiag[i + 3] + 1;
1653:         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1654:         for (j = 0; j < nz; j++) {
1655:           col    = pj[j];
1656:           pc3[j] = rtmp3[col];
1657:           rs += PetscAbsScalar(pc3[j]);
1658:         }

1660:         sctx.rs = rs;
1661:         sctx.pv = rtmp3[i + 2];
1662:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1663:         if (sctx.newshift) break;
1664:         pc3  = b->a + bdiag[i + 2];
1665:         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1666:         break;
1667:       case 4:
1668:         /* zero rtmp */
1669:         /* L part */
1670:         nz    = bi[i + 1] - bi[i];
1671:         bjtmp = bj + bi[i];
1672:         for (j = 0; j < nz; j++) {
1673:           col        = bjtmp[j];
1674:           rtmp1[col] = 0.0;
1675:           rtmp2[col] = 0.0;
1676:           rtmp3[col] = 0.0;
1677:           rtmp4[col] = 0.0;
1678:         }

1680:         /* U part */
1681:         nz    = bdiag[i] - bdiag[i + 1];
1682:         bjtmp = bj + bdiag[i + 1] + 1;
1683:         for (j = 0; j < nz; j++) {
1684:           col        = bjtmp[j];
1685:           rtmp1[col] = 0.0;
1686:           rtmp2[col] = 0.0;
1687:           rtmp3[col] = 0.0;
1688:           rtmp4[col] = 0.0;
1689:         }

1691:         /* load in initial (unfactored row) */
1692:         nz    = ai[r[i] + 1] - ai[r[i]];
1693:         ajtmp = aj + ai[r[i]];
1694:         v1    = aa + ai[r[i]];
1695:         v2    = aa + ai[r[i] + 1];
1696:         v3    = aa + ai[r[i] + 2];
1697:         v4    = aa + ai[r[i] + 3];
1698:         for (j = 0; j < nz; j++) {
1699:           col        = ics[ajtmp[j]];
1700:           rtmp1[col] = v1[j];
1701:           rtmp2[col] = v2[j];
1702:           rtmp3[col] = v3[j];
1703:           rtmp4[col] = v4[j];
1704:         }
1705:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1706:         rtmp1[i] += sctx.shift_amount;
1707:         rtmp2[i + 1] += sctx.shift_amount;
1708:         rtmp3[i + 2] += sctx.shift_amount;
1709:         rtmp4[i + 3] += sctx.shift_amount;

1711:         /* elimination */
1712:         bjtmp = bj + bi[i];
1713:         row   = *bjtmp++; /* pivot row */
1714:         nzL   = bi[i + 1] - bi[i];
1715:         for (k = 0; k < nzL; k++) {
1716:           pc1 = rtmp1 + row;
1717:           pc2 = rtmp2 + row;
1718:           pc3 = rtmp3 + row;
1719:           pc4 = rtmp4 + row;
1720:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721:             pv   = b->a + bdiag[row];
1722:             mul1 = *pc1 * (*pv);
1723:             mul2 = *pc2 * (*pv);
1724:             mul3 = *pc3 * (*pv);
1725:             mul4 = *pc4 * (*pv);
1726:             *pc1 = mul1;
1727:             *pc2 = mul2;
1728:             *pc3 = mul3;
1729:             *pc4 = mul4;

1731:             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1732:             pv = b->a + bdiag[row + 1] + 1;
1733:             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1734:             for (j = 0; j < nz; j++) {
1735:               col = pj[j];
1736:               rtmp1[col] -= mul1 * pv[j];
1737:               rtmp2[col] -= mul2 * pv[j];
1738:               rtmp3[col] -= mul3 * pv[j];
1739:               rtmp4[col] -= mul4 * pv[j];
1740:             }
1741:             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1742:           }
1743:           row = *bjtmp++;
1744:         }

1746:         /* finished row i; check zero pivot, then stick row i into b->a */
1747:         rs = 0.0;
1748:         /* L part */
1749:         pc1 = b->a + bi[i];
1750:         pj  = b->j + bi[i];
1751:         nz  = bi[i + 1] - bi[i];
1752:         for (j = 0; j < nz; j++) {
1753:           col    = pj[j];
1754:           pc1[j] = rtmp1[col];
1755:           rs += PetscAbsScalar(pc1[j]);
1756:         }
1757:         /* U part */
1758:         pc1 = b->a + bdiag[i + 1] + 1;
1759:         pj  = b->j + bdiag[i + 1] + 1;
1760:         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1761:         for (j = 0; j < nz; j++) {
1762:           col    = pj[j];
1763:           pc1[j] = rtmp1[col];
1764:           rs += PetscAbsScalar(pc1[j]);
1765:         }

1767:         sctx.rs = rs;
1768:         sctx.pv = rtmp1[i];
1769:         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1770:         if (sctx.newshift) break;
1771:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1772:         *pc1 = 1.0 / sctx.pv;

1774:         /* Now take care of 1st column of diagonal 4x4 block. */
1775:         pc2 = rtmp2 + i;
1776:         pc3 = rtmp3 + i;
1777:         pc4 = rtmp4 + i;
1778:         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1779:           mul2 = (*pc2) * (*pc1);
1780:           *pc2 = mul2;
1781:           mul3 = (*pc3) * (*pc1);
1782:           *pc3 = mul3;
1783:           mul4 = (*pc4) * (*pc1);
1784:           *pc4 = mul4;
1785:           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1786:           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1787:           for (j = 0; j < nz; j++) {
1788:             col = pj[j];
1789:             rtmp2[col] -= mul2 * rtmp1[col];
1790:             rtmp3[col] -= mul3 * rtmp1[col];
1791:             rtmp4[col] -= mul4 * rtmp1[col];
1792:           }
1793:           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1794:         }

1796:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1797:         rs = 0.0;
1798:         /* L part */
1799:         pc2 = b->a + bi[i + 1];
1800:         pj  = b->j + bi[i + 1];
1801:         nz  = bi[i + 2] - bi[i + 1];
1802:         for (j = 0; j < nz; j++) {
1803:           col    = pj[j];
1804:           pc2[j] = rtmp2[col];
1805:           rs += PetscAbsScalar(pc2[j]);
1806:         }
1807:         /* U part */
1808:         pc2 = b->a + bdiag[i + 2] + 1;
1809:         pj  = b->j + bdiag[i + 2] + 1;
1810:         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1811:         for (j = 0; j < nz; j++) {
1812:           col    = pj[j];
1813:           pc2[j] = rtmp2[col];
1814:           rs += PetscAbsScalar(pc2[j]);
1815:         }

1817:         sctx.rs = rs;
1818:         sctx.pv = rtmp2[i + 1];
1819:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1820:         if (sctx.newshift) break;
1821:         pc2  = b->a + bdiag[i + 1];
1822:         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */

1824:         /* Now take care of 2nd column of diagonal 4x4 block. */
1825:         pc3 = rtmp3 + i + 1;
1826:         pc4 = rtmp4 + i + 1;
1827:         if (*pc3 != 0.0 || *pc4 != 0.0) {
1828:           mul3 = (*pc3) * (*pc2);
1829:           *pc3 = mul3;
1830:           mul4 = (*pc4) * (*pc2);
1831:           *pc4 = mul4;
1832:           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1833:           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1834:           for (j = 0; j < nz; j++) {
1835:             col = pj[j];
1836:             rtmp3[col] -= mul3 * rtmp2[col];
1837:             rtmp4[col] -= mul4 * rtmp2[col];
1838:           }
1839:           PetscCall(PetscLogFlops(4.0 * nz));
1840:         }

1842:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1843:         rs = 0.0;
1844:         /* L part */
1845:         pc3 = b->a + bi[i + 2];
1846:         pj  = b->j + bi[i + 2];
1847:         nz  = bi[i + 3] - bi[i + 2];
1848:         for (j = 0; j < nz; j++) {
1849:           col    = pj[j];
1850:           pc3[j] = rtmp3[col];
1851:           rs += PetscAbsScalar(pc3[j]);
1852:         }
1853:         /* U part */
1854:         pc3 = b->a + bdiag[i + 3] + 1;
1855:         pj  = b->j + bdiag[i + 3] + 1;
1856:         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1857:         for (j = 0; j < nz; j++) {
1858:           col    = pj[j];
1859:           pc3[j] = rtmp3[col];
1860:           rs += PetscAbsScalar(pc3[j]);
1861:         }

1863:         sctx.rs = rs;
1864:         sctx.pv = rtmp3[i + 2];
1865:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1866:         if (sctx.newshift) break;
1867:         pc3  = b->a + bdiag[i + 2];
1868:         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */

1870:         /* Now take care of 3rd column of diagonal 4x4 block. */
1871:         pc4 = rtmp4 + i + 2;
1872:         if (*pc4 != 0.0) {
1873:           mul4 = (*pc4) * (*pc3);
1874:           *pc4 = mul4;
1875:           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1876:           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1877:           for (j = 0; j < nz; j++) {
1878:             col = pj[j];
1879:             rtmp4[col] -= mul4 * rtmp3[col];
1880:           }
1881:           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1882:         }

1884:         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1885:         rs = 0.0;
1886:         /* L part */
1887:         pc4 = b->a + bi[i + 3];
1888:         pj  = b->j + bi[i + 3];
1889:         nz  = bi[i + 4] - bi[i + 3];
1890:         for (j = 0; j < nz; j++) {
1891:           col    = pj[j];
1892:           pc4[j] = rtmp4[col];
1893:           rs += PetscAbsScalar(pc4[j]);
1894:         }
1895:         /* U part */
1896:         pc4 = b->a + bdiag[i + 4] + 1;
1897:         pj  = b->j + bdiag[i + 4] + 1;
1898:         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1899:         for (j = 0; j < nz; j++) {
1900:           col    = pj[j];
1901:           pc4[j] = rtmp4[col];
1902:           rs += PetscAbsScalar(pc4[j]);
1903:         }

1905:         sctx.rs = rs;
1906:         sctx.pv = rtmp4[i + 3];
1907:         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1908:         if (sctx.newshift) break;
1909:         pc4  = b->a + bdiag[i + 3];
1910:         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1911:         break;

1913:       default:
1914:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1915:       }
1916:       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1917:       i += nodesz;              /* Update the row */
1918:     }

1920:     /* MatPivotRefine() */
1921:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1922:       /*
1923:        * if no shift in this attempt & shifting & started shifting & can refine,
1924:        * then try lower shift
1925:        */
1926:       sctx.shift_hi       = sctx.shift_fraction;
1927:       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1928:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1929:       sctx.newshift       = PETSC_TRUE;
1930:       sctx.nshift++;
1931:     }
1932:   } while (sctx.newshift);

1934:   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1935:   PetscCall(PetscFree(tmp_vec2));
1936:   PetscCall(ISRestoreIndices(isicol, &ic));
1937:   PetscCall(ISRestoreIndices(isrow, &r));

1939:   if (b->inode.size) {
1940:     C->ops->solve = MatSolve_SeqAIJ_Inode;
1941:   } else {
1942:     C->ops->solve = MatSolve_SeqAIJ;
1943:   }
1944:   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1945:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1946:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1947:   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1948:   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1949:   C->assembled              = PETSC_TRUE;
1950:   C->preallocated           = PETSC_TRUE;

1952:   PetscCall(PetscLogFlops(C->cmap->n));

1954:   /* MatShiftView(A,info,&sctx) */
1955:   if (sctx.nshift) {
1956:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1957:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1958:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1959:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1960:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1961:       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1962:     }
1963:   }
1964:   PetscFunctionReturn(PETSC_SUCCESS);
1965: }

1967: #if 0
1968: // unused
1969: static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1970: {
1971:   Mat              C = B;
1972:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1973:   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1974:   const PetscInt  *r, *ic, *c, *ics;
1975:   PetscInt         n = A->rmap->n, *bi = b->i;
1976:   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1977:   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1978:   PetscInt        *ai = a->i, *aj = a->j;
1979:   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1980:   PetscScalar      mul1, mul2, mul3, tmp;
1981:   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1982:   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1983:   PetscReal        rs = 0.0;
1984:   FactorShiftCtx   sctx;

1986:   PetscFunctionBegin;
1987:   sctx.shift_top      = 0;
1988:   sctx.nshift_max     = 0;
1989:   sctx.shift_lo       = 0;
1990:   sctx.shift_hi       = 0;
1991:   sctx.shift_fraction = 0;

1993:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1994:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1995:     sctx.shift_top = 0;
1996:     for (i = 0; i < n; i++) {
1997:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1998:       rs    = 0.0;
1999:       ajtmp = aj + ai[i];
2000:       rtmp1 = aa + ai[i];
2001:       nz    = ai[i + 1] - ai[i];
2002:       for (j = 0; j < nz; j++) {
2003:         if (*ajtmp != i) {
2004:           rs += PetscAbsScalar(*rtmp1++);
2005:         } else {
2006:           rs -= PetscRealPart(*rtmp1++);
2007:         }
2008:         ajtmp++;
2009:       }
2010:       if (rs > sctx.shift_top) sctx.shift_top = rs;
2011:     }
2012:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2013:     sctx.shift_top *= 1.1;
2014:     sctx.nshift_max = 5;
2015:     sctx.shift_lo   = 0.;
2016:     sctx.shift_hi   = 1.;
2017:   }
2018:   sctx.shift_amount = 0;
2019:   sctx.nshift       = 0;

2021:   PetscCall(ISGetIndices(isrow, &r));
2022:   PetscCall(ISGetIndices(iscol, &c));
2023:   PetscCall(ISGetIndices(isicol, &ic));
2024:   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2025:   ics = ic;

2027:   node_max = a->inode.node_count;
2028:   ns       = a->inode.size;
2029:   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");

2031:   /* If max inode size > 3, split it into two inodes.*/
2032:   /* also map the inode sizes according to the ordering */
2033:   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2034:   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2035:     if (ns[i] > 3) {
2036:       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2037:       ++j;
2038:       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2039:     } else {
2040:       tmp_vec1[j] = ns[i];
2041:     }
2042:   }
2043:   /* Use the correct node_max */
2044:   node_max = j;

2046:   /* Now reorder the inode info based on mat re-ordering info */
2047:   /* First create a row -> inode_size_array_index map */
2048:   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2049:   for (i = 0, row = 0; i < node_max; i++) {
2050:     nodesz = tmp_vec1[i];
2051:     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2052:   }
2053:   /* Using nsmap, create a reordered ns structure */
2054:   for (i = 0, j = 0; i < node_max; i++) {
2055:     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2056:     tmp_vec2[i] = nodesz;
2057:     j += nodesz;
2058:   }
2059:   PetscCall(PetscFree2(nsmap, tmp_vec1));
2060:   /* Now use the correct ns */
2061:   ns = tmp_vec2;

2063:   do {
2064:     sctx.newshift = PETSC_FALSE;
2065:     /* Now loop over each block-row, and do the factorization */
2066:     for (i = 0, row = 0; i < node_max; i++) {
2067:       nodesz = ns[i];
2068:       nz     = bi[row + 1] - bi[row];
2069:       bjtmp  = bj + bi[row];

2071:       switch (nodesz) {
2072:       case 1:
2073:         for (j = 0; j < nz; j++) {
2074:           idx         = bjtmp[j];
2075:           rtmp11[idx] = 0.0;
2076:         }

2078:         /* load in initial (unfactored row) */
2079:         idx    = r[row];
2080:         nz_tmp = ai[idx + 1] - ai[idx];
2081:         ajtmp  = aj + ai[idx];
2082:         v1     = aa + ai[idx];

2084:         for (j = 0; j < nz_tmp; j++) {
2085:           idx         = ics[ajtmp[j]];
2086:           rtmp11[idx] = v1[j];
2087:         }
2088:         rtmp11[ics[r[row]]] += sctx.shift_amount;

2090:         prow = *bjtmp++;
2091:         while (prow < row) {
2092:           pc1 = rtmp11 + prow;
2093:           if (*pc1 != 0.0) {
2094:             pv     = ba + bd[prow];
2095:             pj     = nbj + bd[prow];
2096:             mul1   = *pc1 * *pv++;
2097:             *pc1   = mul1;
2098:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2099:             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2100:             for (j = 0; j < nz_tmp; j++) {
2101:               tmp = pv[j];
2102:               idx = pj[j];
2103:               rtmp11[idx] -= mul1 * tmp;
2104:             }
2105:           }
2106:           prow = *bjtmp++;
2107:         }
2108:         pj  = bj + bi[row];
2109:         pc1 = ba + bi[row];

2111:         sctx.pv     = rtmp11[row];
2112:         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2113:         rs          = 0.0;
2114:         for (j = 0; j < nz; j++) {
2115:           idx    = pj[j];
2116:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2117:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2118:         }
2119:         sctx.rs = rs;
2120:         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2121:         if (sctx.newshift) goto endofwhile;
2122:         break;

2124:       case 2:
2125:         for (j = 0; j < nz; j++) {
2126:           idx         = bjtmp[j];
2127:           rtmp11[idx] = 0.0;
2128:           rtmp22[idx] = 0.0;
2129:         }

2131:         /* load in initial (unfactored row) */
2132:         idx    = r[row];
2133:         nz_tmp = ai[idx + 1] - ai[idx];
2134:         ajtmp  = aj + ai[idx];
2135:         v1     = aa + ai[idx];
2136:         v2     = aa + ai[idx + 1];
2137:         for (j = 0; j < nz_tmp; j++) {
2138:           idx         = ics[ajtmp[j]];
2139:           rtmp11[idx] = v1[j];
2140:           rtmp22[idx] = v2[j];
2141:         }
2142:         rtmp11[ics[r[row]]] += sctx.shift_amount;
2143:         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;

2145:         prow = *bjtmp++;
2146:         while (prow < row) {
2147:           pc1 = rtmp11 + prow;
2148:           pc2 = rtmp22 + prow;
2149:           if (*pc1 != 0.0 || *pc2 != 0.0) {
2150:             pv   = ba + bd[prow];
2151:             pj   = nbj + bd[prow];
2152:             mul1 = *pc1 * *pv;
2153:             mul2 = *pc2 * *pv;
2154:             ++pv;
2155:             *pc1 = mul1;
2156:             *pc2 = mul2;

2158:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2159:             for (j = 0; j < nz_tmp; j++) {
2160:               tmp = pv[j];
2161:               idx = pj[j];
2162:               rtmp11[idx] -= mul1 * tmp;
2163:               rtmp22[idx] -= mul2 * tmp;
2164:             }
2165:             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2166:           }
2167:           prow = *bjtmp++;
2168:         }

2170:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2171:         pc1 = rtmp11 + prow;
2172:         pc2 = rtmp22 + prow;

2174:         sctx.pv = *pc1;
2175:         pj      = bj + bi[prow];
2176:         rs      = 0.0;
2177:         for (j = 0; j < nz; j++) {
2178:           idx = pj[j];
2179:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2180:         }
2181:         sctx.rs = rs;
2182:         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2183:         if (sctx.newshift) goto endofwhile;

2185:         if (*pc2 != 0.0) {
2186:           pj     = nbj + bd[prow];
2187:           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2188:           *pc2   = mul2;
2189:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2190:           for (j = 0; j < nz_tmp; j++) {
2191:             idx = pj[j];
2192:             tmp = rtmp11[idx];
2193:             rtmp22[idx] -= mul2 * tmp;
2194:           }
2195:           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2196:         }

2198:         pj  = bj + bi[row];
2199:         pc1 = ba + bi[row];
2200:         pc2 = ba + bi[row + 1];

2202:         sctx.pv         = rtmp22[row + 1];
2203:         rs              = 0.0;
2204:         rtmp11[row]     = 1.0 / rtmp11[row];
2205:         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2206:         /* copy row entries from dense representation to sparse */
2207:         for (j = 0; j < nz; j++) {
2208:           idx    = pj[j];
2209:           pc1[j] = rtmp11[idx];
2210:           pc2[j] = rtmp22[idx];
2211:           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2212:         }
2213:         sctx.rs = rs;
2214:         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2215:         if (sctx.newshift) goto endofwhile;
2216:         break;

2218:       case 3:
2219:         for (j = 0; j < nz; j++) {
2220:           idx         = bjtmp[j];
2221:           rtmp11[idx] = 0.0;
2222:           rtmp22[idx] = 0.0;
2223:           rtmp33[idx] = 0.0;
2224:         }
2225:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2226:         idx    = r[row];
2227:         nz_tmp = ai[idx + 1] - ai[idx];
2228:         ajtmp  = aj + ai[idx];
2229:         v1     = aa + ai[idx];
2230:         v2     = aa + ai[idx + 1];
2231:         v3     = aa + ai[idx + 2];
2232:         for (j = 0; j < nz_tmp; j++) {
2233:           idx         = ics[ajtmp[j]];
2234:           rtmp11[idx] = v1[j];
2235:           rtmp22[idx] = v2[j];
2236:           rtmp33[idx] = v3[j];
2237:         }
2238:         rtmp11[ics[r[row]]] += sctx.shift_amount;
2239:         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2240:         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;

2242:         /* loop over all pivot row blocks above this row block */
2243:         prow = *bjtmp++;
2244:         while (prow < row) {
2245:           pc1 = rtmp11 + prow;
2246:           pc2 = rtmp22 + prow;
2247:           pc3 = rtmp33 + prow;
2248:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2249:             pv   = ba + bd[prow];
2250:             pj   = nbj + bd[prow];
2251:             mul1 = *pc1 * *pv;
2252:             mul2 = *pc2 * *pv;
2253:             mul3 = *pc3 * *pv;
2254:             ++pv;
2255:             *pc1 = mul1;
2256:             *pc2 = mul2;
2257:             *pc3 = mul3;

2259:             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2260:             /* update this row based on pivot row */
2261:             for (j = 0; j < nz_tmp; j++) {
2262:               tmp = pv[j];
2263:               idx = pj[j];
2264:               rtmp11[idx] -= mul1 * tmp;
2265:               rtmp22[idx] -= mul2 * tmp;
2266:               rtmp33[idx] -= mul3 * tmp;
2267:             }
2268:             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2269:           }
2270:           prow = *bjtmp++;
2271:         }

2273:         /* Now take care of diagonal 3x3 block in this set of rows */
2274:         /* note: prow = row here */
2275:         pc1 = rtmp11 + prow;
2276:         pc2 = rtmp22 + prow;
2277:         pc3 = rtmp33 + prow;

2279:         sctx.pv = *pc1;
2280:         pj      = bj + bi[prow];
2281:         rs      = 0.0;
2282:         for (j = 0; j < nz; j++) {
2283:           idx = pj[j];
2284:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2285:         }
2286:         sctx.rs = rs;
2287:         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2288:         if (sctx.newshift) goto endofwhile;

2290:         if (*pc2 != 0.0 || *pc3 != 0.0) {
2291:           mul2   = (*pc2) / (*pc1);
2292:           mul3   = (*pc3) / (*pc1);
2293:           *pc2   = mul2;
2294:           *pc3   = mul3;
2295:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2296:           pj     = nbj + bd[prow];
2297:           for (j = 0; j < nz_tmp; j++) {
2298:             idx = pj[j];
2299:             tmp = rtmp11[idx];
2300:             rtmp22[idx] -= mul2 * tmp;
2301:             rtmp33[idx] -= mul3 * tmp;
2302:           }
2303:           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2304:         }
2305:         ++prow;

2307:         pc2     = rtmp22 + prow;
2308:         pc3     = rtmp33 + prow;
2309:         sctx.pv = *pc2;
2310:         pj      = bj + bi[prow];
2311:         rs      = 0.0;
2312:         for (j = 0; j < nz; j++) {
2313:           idx = pj[j];
2314:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2315:         }
2316:         sctx.rs = rs;
2317:         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2318:         if (sctx.newshift) goto endofwhile;

2320:         if (*pc3 != 0.0) {
2321:           mul3   = (*pc3) / (*pc2);
2322:           *pc3   = mul3;
2323:           pj     = nbj + bd[prow];
2324:           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2325:           for (j = 0; j < nz_tmp; j++) {
2326:             idx = pj[j];
2327:             tmp = rtmp22[idx];
2328:             rtmp33[idx] -= mul3 * tmp;
2329:           }
2330:           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2331:         }

2333:         pj  = bj + bi[row];
2334:         pc1 = ba + bi[row];
2335:         pc2 = ba + bi[row + 1];
2336:         pc3 = ba + bi[row + 2];

2338:         sctx.pv         = rtmp33[row + 2];
2339:         rs              = 0.0;
2340:         rtmp11[row]     = 1.0 / rtmp11[row];
2341:         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2342:         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2343:         /* copy row entries from dense representation to sparse */
2344:         for (j = 0; j < nz; j++) {
2345:           idx    = pj[j];
2346:           pc1[j] = rtmp11[idx];
2347:           pc2[j] = rtmp22[idx];
2348:           pc3[j] = rtmp33[idx];
2349:           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2350:         }

2352:         sctx.rs = rs;
2353:         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2354:         if (sctx.newshift) goto endofwhile;
2355:         break;

2357:       default:
2358:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2359:       }
2360:       row += nodesz; /* Update the row */
2361:     }
2362:   endofwhile:;
2363:   } while (sctx.newshift);
2364:   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2365:   PetscCall(PetscFree(tmp_vec2));
2366:   PetscCall(ISRestoreIndices(isicol, &ic));
2367:   PetscCall(ISRestoreIndices(isrow, &r));
2368:   PetscCall(ISRestoreIndices(iscol, &c));

2370:   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2371:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2372:   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2373:   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2374:   C->assembled              = PETSC_TRUE;
2375:   C->preallocated           = PETSC_TRUE;
2376:   if (sctx.nshift) {
2377:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2378:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2379:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2380:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2381:     }
2382:   }
2383:   PetscCall(PetscLogFlops(C->cmap->n));
2384:   PetscCall(MatSeqAIJCheckInode(C));
2385:   PetscFunctionReturn(PETSC_SUCCESS);
2386: }
2387: #endif

2389: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2390: {
2391:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2392:   IS                 iscol = a->col, isrow = a->row;
2393:   const PetscInt    *r, *c, *rout, *cout;
2394:   PetscInt           i, j, n = A->rmap->n;
2395:   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2396:   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2397:   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2398:   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2399:   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2400:   const PetscScalar *b;

2402:   PetscFunctionBegin;
2403:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2404:   node_max = a->inode.node_count;
2405:   ns       = a->inode.size; /* Node Size array */

2407:   PetscCall(VecGetArrayRead(bb, &b));
2408:   PetscCall(VecGetArrayWrite(xx, &x));
2409:   tmp = a->solve_work;

2411:   PetscCall(ISGetIndices(isrow, &rout));
2412:   r = rout;
2413:   PetscCall(ISGetIndices(iscol, &cout));
2414:   c = cout;

2416:   /* forward solve the lower triangular */
2417:   tmps = tmp;
2418:   aa   = a_a;
2419:   aj   = a_j;
2420:   ad   = a->diag;

2422:   for (i = 0, row = 0; i < node_max; ++i) {
2423:     nsz = ns[i];
2424:     aii = ai[row];
2425:     v1  = aa + aii;
2426:     vi  = aj + aii;
2427:     nz  = ai[row + 1] - ai[row];

2429:     if (i < node_max - 1) {
2430:       /* Prefetch the indices for the next block */
2431:       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2432:       /* Prefetch the data for the next block */
2433:       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2434:     }

2436:     switch (nsz) { /* Each loop in 'case' is unrolled */
2437:     case 1:
2438:       sum1 = b[r[row]];
2439:       for (j = 0; j < nz - 1; j += 2) {
2440:         i0   = vi[j];
2441:         i1   = vi[j + 1];
2442:         tmp0 = tmps[i0];
2443:         tmp1 = tmps[i1];
2444:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2445:       }
2446:       if (j == nz - 1) {
2447:         tmp0 = tmps[vi[j]];
2448:         sum1 -= v1[j] * tmp0;
2449:       }
2450:       tmp[row++] = sum1;
2451:       break;
2452:     case 2:
2453:       sum1 = b[r[row]];
2454:       sum2 = b[r[row + 1]];
2455:       v2   = aa + ai[row + 1];

2457:       for (j = 0; j < nz - 1; j += 2) {
2458:         i0   = vi[j];
2459:         i1   = vi[j + 1];
2460:         tmp0 = tmps[i0];
2461:         tmp1 = tmps[i1];
2462:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2463:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2464:       }
2465:       if (j == nz - 1) {
2466:         tmp0 = tmps[vi[j]];
2467:         sum1 -= v1[j] * tmp0;
2468:         sum2 -= v2[j] * tmp0;
2469:       }
2470:       sum2 -= v2[nz] * sum1;
2471:       tmp[row++] = sum1;
2472:       tmp[row++] = sum2;
2473:       break;
2474:     case 3:
2475:       sum1 = b[r[row]];
2476:       sum2 = b[r[row + 1]];
2477:       sum3 = b[r[row + 2]];
2478:       v2   = aa + ai[row + 1];
2479:       v3   = aa + ai[row + 2];

2481:       for (j = 0; j < nz - 1; j += 2) {
2482:         i0   = vi[j];
2483:         i1   = vi[j + 1];
2484:         tmp0 = tmps[i0];
2485:         tmp1 = tmps[i1];
2486:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2487:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2488:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2489:       }
2490:       if (j == nz - 1) {
2491:         tmp0 = tmps[vi[j]];
2492:         sum1 -= v1[j] * tmp0;
2493:         sum2 -= v2[j] * tmp0;
2494:         sum3 -= v3[j] * tmp0;
2495:       }
2496:       sum2 -= v2[nz] * sum1;
2497:       sum3 -= v3[nz] * sum1;
2498:       sum3 -= v3[nz + 1] * sum2;
2499:       tmp[row++] = sum1;
2500:       tmp[row++] = sum2;
2501:       tmp[row++] = sum3;
2502:       break;

2504:     case 4:
2505:       sum1 = b[r[row]];
2506:       sum2 = b[r[row + 1]];
2507:       sum3 = b[r[row + 2]];
2508:       sum4 = b[r[row + 3]];
2509:       v2   = aa + ai[row + 1];
2510:       v3   = aa + ai[row + 2];
2511:       v4   = aa + ai[row + 3];

2513:       for (j = 0; j < nz - 1; j += 2) {
2514:         i0   = vi[j];
2515:         i1   = vi[j + 1];
2516:         tmp0 = tmps[i0];
2517:         tmp1 = tmps[i1];
2518:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2519:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2520:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2521:         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2522:       }
2523:       if (j == nz - 1) {
2524:         tmp0 = tmps[vi[j]];
2525:         sum1 -= v1[j] * tmp0;
2526:         sum2 -= v2[j] * tmp0;
2527:         sum3 -= v3[j] * tmp0;
2528:         sum4 -= v4[j] * tmp0;
2529:       }
2530:       sum2 -= v2[nz] * sum1;
2531:       sum3 -= v3[nz] * sum1;
2532:       sum4 -= v4[nz] * sum1;
2533:       sum3 -= v3[nz + 1] * sum2;
2534:       sum4 -= v4[nz + 1] * sum2;
2535:       sum4 -= v4[nz + 2] * sum3;

2537:       tmp[row++] = sum1;
2538:       tmp[row++] = sum2;
2539:       tmp[row++] = sum3;
2540:       tmp[row++] = sum4;
2541:       break;
2542:     case 5:
2543:       sum1 = b[r[row]];
2544:       sum2 = b[r[row + 1]];
2545:       sum3 = b[r[row + 2]];
2546:       sum4 = b[r[row + 3]];
2547:       sum5 = b[r[row + 4]];
2548:       v2   = aa + ai[row + 1];
2549:       v3   = aa + ai[row + 2];
2550:       v4   = aa + ai[row + 3];
2551:       v5   = aa + ai[row + 4];

2553:       for (j = 0; j < nz - 1; j += 2) {
2554:         i0   = vi[j];
2555:         i1   = vi[j + 1];
2556:         tmp0 = tmps[i0];
2557:         tmp1 = tmps[i1];
2558:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2559:         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2560:         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2561:         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2562:         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2563:       }
2564:       if (j == nz - 1) {
2565:         tmp0 = tmps[vi[j]];
2566:         sum1 -= v1[j] * tmp0;
2567:         sum2 -= v2[j] * tmp0;
2568:         sum3 -= v3[j] * tmp0;
2569:         sum4 -= v4[j] * tmp0;
2570:         sum5 -= v5[j] * tmp0;
2571:       }

2573:       sum2 -= v2[nz] * sum1;
2574:       sum3 -= v3[nz] * sum1;
2575:       sum4 -= v4[nz] * sum1;
2576:       sum5 -= v5[nz] * sum1;
2577:       sum3 -= v3[nz + 1] * sum2;
2578:       sum4 -= v4[nz + 1] * sum2;
2579:       sum5 -= v5[nz + 1] * sum2;
2580:       sum4 -= v4[nz + 2] * sum3;
2581:       sum5 -= v5[nz + 2] * sum3;
2582:       sum5 -= v5[nz + 3] * sum4;

2584:       tmp[row++] = sum1;
2585:       tmp[row++] = sum2;
2586:       tmp[row++] = sum3;
2587:       tmp[row++] = sum4;
2588:       tmp[row++] = sum5;
2589:       break;
2590:     default:
2591:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2592:     }
2593:   }
2594:   /* backward solve the upper triangular */
2595:   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2596:     nsz = ns[i];
2597:     aii = ad[row + 1] + 1;
2598:     v1  = aa + aii;
2599:     vi  = aj + aii;
2600:     nz  = ad[row] - ad[row + 1] - 1;

2602:     if (i > 0) {
2603:       /* Prefetch the indices for the next block */
2604:       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2605:       /* Prefetch the data for the next block */
2606:       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2607:     }

2609:     switch (nsz) { /* Each loop in 'case' is unrolled */
2610:     case 1:
2611:       sum1 = tmp[row];

2613:       for (j = 0; j < nz - 1; j += 2) {
2614:         i0   = vi[j];
2615:         i1   = vi[j + 1];
2616:         tmp0 = tmps[i0];
2617:         tmp1 = tmps[i1];
2618:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2619:       }
2620:       if (j == nz - 1) {
2621:         tmp0 = tmps[vi[j]];
2622:         sum1 -= v1[j] * tmp0;
2623:       }
2624:       x[c[row]] = tmp[row] = sum1 * v1[nz];
2625:       row--;
2626:       break;
2627:     case 2:
2628:       sum1 = tmp[row];
2629:       sum2 = tmp[row - 1];
2630:       v2   = aa + ad[row] + 1;
2631:       for (j = 0; j < nz - 1; j += 2) {
2632:         i0   = vi[j];
2633:         i1   = vi[j + 1];
2634:         tmp0 = tmps[i0];
2635:         tmp1 = tmps[i1];
2636:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2637:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2638:       }
2639:       if (j == nz - 1) {
2640:         tmp0 = tmps[vi[j]];
2641:         sum1 -= v1[j] * tmp0;
2642:         sum2 -= v2[j + 1] * tmp0;
2643:       }

2645:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2646:       row--;
2647:       sum2 -= v2[0] * tmp0;
2648:       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2649:       row--;
2650:       break;
2651:     case 3:
2652:       sum1 = tmp[row];
2653:       sum2 = tmp[row - 1];
2654:       sum3 = tmp[row - 2];
2655:       v2   = aa + ad[row] + 1;
2656:       v3   = aa + ad[row - 1] + 1;
2657:       for (j = 0; j < nz - 1; j += 2) {
2658:         i0   = vi[j];
2659:         i1   = vi[j + 1];
2660:         tmp0 = tmps[i0];
2661:         tmp1 = tmps[i1];
2662:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2663:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2664:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2665:       }
2666:       if (j == nz - 1) {
2667:         tmp0 = tmps[vi[j]];
2668:         sum1 -= v1[j] * tmp0;
2669:         sum2 -= v2[j + 1] * tmp0;
2670:         sum3 -= v3[j + 2] * tmp0;
2671:       }
2672:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2673:       row--;
2674:       sum2 -= v2[0] * tmp0;
2675:       sum3 -= v3[1] * tmp0;
2676:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2677:       row--;
2678:       sum3 -= v3[0] * tmp0;
2679:       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2680:       row--;

2682:       break;
2683:     case 4:
2684:       sum1 = tmp[row];
2685:       sum2 = tmp[row - 1];
2686:       sum3 = tmp[row - 2];
2687:       sum4 = tmp[row - 3];
2688:       v2   = aa + ad[row] + 1;
2689:       v3   = aa + ad[row - 1] + 1;
2690:       v4   = aa + ad[row - 2] + 1;

2692:       for (j = 0; j < nz - 1; j += 2) {
2693:         i0   = vi[j];
2694:         i1   = vi[j + 1];
2695:         tmp0 = tmps[i0];
2696:         tmp1 = tmps[i1];
2697:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2698:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2699:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2700:         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2701:       }
2702:       if (j == nz - 1) {
2703:         tmp0 = tmps[vi[j]];
2704:         sum1 -= v1[j] * tmp0;
2705:         sum2 -= v2[j + 1] * tmp0;
2706:         sum3 -= v3[j + 2] * tmp0;
2707:         sum4 -= v4[j + 3] * tmp0;
2708:       }

2710:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2711:       row--;
2712:       sum2 -= v2[0] * tmp0;
2713:       sum3 -= v3[1] * tmp0;
2714:       sum4 -= v4[2] * tmp0;
2715:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2716:       row--;
2717:       sum3 -= v3[0] * tmp0;
2718:       sum4 -= v4[1] * tmp0;
2719:       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2720:       row--;
2721:       sum4 -= v4[0] * tmp0;
2722:       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2723:       row--;
2724:       break;
2725:     case 5:
2726:       sum1 = tmp[row];
2727:       sum2 = tmp[row - 1];
2728:       sum3 = tmp[row - 2];
2729:       sum4 = tmp[row - 3];
2730:       sum5 = tmp[row - 4];
2731:       v2   = aa + ad[row] + 1;
2732:       v3   = aa + ad[row - 1] + 1;
2733:       v4   = aa + ad[row - 2] + 1;
2734:       v5   = aa + ad[row - 3] + 1;
2735:       for (j = 0; j < nz - 1; j += 2) {
2736:         i0   = vi[j];
2737:         i1   = vi[j + 1];
2738:         tmp0 = tmps[i0];
2739:         tmp1 = tmps[i1];
2740:         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2741:         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2742:         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2743:         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2744:         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2745:       }
2746:       if (j == nz - 1) {
2747:         tmp0 = tmps[vi[j]];
2748:         sum1 -= v1[j] * tmp0;
2749:         sum2 -= v2[j + 1] * tmp0;
2750:         sum3 -= v3[j + 2] * tmp0;
2751:         sum4 -= v4[j + 3] * tmp0;
2752:         sum5 -= v5[j + 4] * tmp0;
2753:       }

2755:       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2756:       row--;
2757:       sum2 -= v2[0] * tmp0;
2758:       sum3 -= v3[1] * tmp0;
2759:       sum4 -= v4[2] * tmp0;
2760:       sum5 -= v5[3] * tmp0;
2761:       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2762:       row--;
2763:       sum3 -= v3[0] * tmp0;
2764:       sum4 -= v4[1] * tmp0;
2765:       sum5 -= v5[2] * tmp0;
2766:       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2767:       row--;
2768:       sum4 -= v4[0] * tmp0;
2769:       sum5 -= v5[1] * tmp0;
2770:       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2771:       row--;
2772:       sum5 -= v5[0] * tmp0;
2773:       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2774:       row--;
2775:       break;
2776:     default:
2777:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2778:     }
2779:   }
2780:   PetscCall(ISRestoreIndices(isrow, &rout));
2781:   PetscCall(ISRestoreIndices(iscol, &cout));
2782:   PetscCall(VecRestoreArrayRead(bb, &b));
2783:   PetscCall(VecRestoreArrayWrite(xx, &x));
2784:   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2785:   PetscFunctionReturn(PETSC_SUCCESS);
2786: }

2788: /*
2789:      Makes a longer coloring[] array and calls the usual code with that
2790: */
2791: static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2792: {
2793:   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2794:   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2795:   PetscInt        *colorused, i;
2796:   ISColoringValue *newcolor;

2798:   PetscFunctionBegin;
2799:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2800:   PetscCall(PetscMalloc1(n + 1, &newcolor));
2801:   /* loop over inodes, marking a color for each column*/
2802:   row = 0;
2803:   for (i = 0; i < m; i++) {
2804:     for (j = 0; j < ns[i]; j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2805:   }

2807:   /* eliminate unneeded colors */
2808:   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2809:   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;

2811:   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2812:   ncolors = colorused[5 * ncolors - 1];
2813:   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2814:   PetscCall(PetscFree(colorused));
2815:   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2816:   PetscCall(PetscFree(coloring));
2817:   PetscFunctionReturn(PETSC_SUCCESS);
2818: }

2820: #include <petsc/private/kernels/blockinvert.h>

2822: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2823: {
2824:   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2825:   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2826:   MatScalar         *ibdiag, *bdiag, work[25], *t;
2827:   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2828:   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2829:   const PetscScalar *xb, *b;
2830:   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2831:   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2832:   PetscInt           sz, k, ipvt[5];
2833:   PetscBool          allowzeropivot, zeropivotdetected;
2834:   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;

2836:   PetscFunctionBegin;
2837:   allowzeropivot = PetscNot(A->erroriffailure);
2838:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2839:   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2840:   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");

2842:   if (!a->inode.ibdiagvalid) {
2843:     if (!a->inode.ibdiag) {
2844:       /* calculate space needed for diagonal blocks */
2845:       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2846:       a->inode.bdiagsize = cnt;

2848:       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2849:     }

2851:     /* copy over the diagonal blocks and invert them */
2852:     ibdiag = a->inode.ibdiag;
2853:     bdiag  = a->inode.bdiag;
2854:     cnt    = 0;
2855:     for (i = 0, row = 0; i < m; i++) {
2856:       for (j = 0; j < sizes[i]; j++) {
2857:         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2858:       }
2859:       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));

2861:       switch (sizes[i]) {
2862:       case 1:
2863:         /* Create matrix data structure */
2864:         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2865:           if (allowzeropivot) {
2866:             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2867:             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2868:             A->factorerror_zeropivot_row   = row;
2869:             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2870:           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2871:         }
2872:         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2873:         break;
2874:       case 2:
2875:         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2876:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2877:         break;
2878:       case 3:
2879:         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2880:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2881:         break;
2882:       case 4:
2883:         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2884:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2885:         break;
2886:       case 5:
2887:         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2888:         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2889:         break;
2890:       default:
2891:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2892:       }
2893:       cnt += sizes[i] * sizes[i];
2894:       row += sizes[i];
2895:     }
2896:     a->inode.ibdiagvalid = PETSC_TRUE;
2897:   }
2898:   ibdiag = a->inode.ibdiag;
2899:   bdiag  = a->inode.bdiag;
2900:   t      = a->inode.ssor_work;

2902:   PetscCall(VecGetArray(xx, &x));
2903:   PetscCall(VecGetArrayRead(bb, &b));
2904:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2905:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2906:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2907:       for (i = 0, row = 0; i < m; i++) {
2908:         sz  = diag[row] - ii[row];
2909:         v1  = a->a + ii[row];
2910:         idx = a->j + ii[row];

2912:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2913:         switch (sizes[i]) {
2914:         case 1:

2916:           sum1 = b[row];
2917:           for (n = 0; n < sz - 1; n += 2) {
2918:             i1 = idx[0];
2919:             i2 = idx[1];
2920:             idx += 2;
2921:             tmp0 = x[i1];
2922:             tmp1 = x[i2];
2923:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2924:             v1 += 2;
2925:           }

2927:           if (n == sz - 1) {
2928:             tmp0 = x[*idx];
2929:             sum1 -= *v1 * tmp0;
2930:           }
2931:           t[row]   = sum1;
2932:           x[row++] = sum1 * (*ibdiag++);
2933:           break;
2934:         case 2:
2935:           v2   = a->a + ii[row + 1];
2936:           sum1 = b[row];
2937:           sum2 = b[row + 1];
2938:           for (n = 0; n < sz - 1; n += 2) {
2939:             i1 = idx[0];
2940:             i2 = idx[1];
2941:             idx += 2;
2942:             tmp0 = x[i1];
2943:             tmp1 = x[i2];
2944:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2945:             v1 += 2;
2946:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2947:             v2 += 2;
2948:           }

2950:           if (n == sz - 1) {
2951:             tmp0 = x[*idx];
2952:             sum1 -= v1[0] * tmp0;
2953:             sum2 -= v2[0] * tmp0;
2954:           }
2955:           t[row]     = sum1;
2956:           t[row + 1] = sum2;
2957:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2958:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2959:           ibdiag += 4;
2960:           break;
2961:         case 3:
2962:           v2   = a->a + ii[row + 1];
2963:           v3   = a->a + ii[row + 2];
2964:           sum1 = b[row];
2965:           sum2 = b[row + 1];
2966:           sum3 = b[row + 2];
2967:           for (n = 0; n < sz - 1; n += 2) {
2968:             i1 = idx[0];
2969:             i2 = idx[1];
2970:             idx += 2;
2971:             tmp0 = x[i1];
2972:             tmp1 = x[i2];
2973:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2974:             v1 += 2;
2975:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2976:             v2 += 2;
2977:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2978:             v3 += 2;
2979:           }

2981:           if (n == sz - 1) {
2982:             tmp0 = x[*idx];
2983:             sum1 -= v1[0] * tmp0;
2984:             sum2 -= v2[0] * tmp0;
2985:             sum3 -= v3[0] * tmp0;
2986:           }
2987:           t[row]     = sum1;
2988:           t[row + 1] = sum2;
2989:           t[row + 2] = sum3;
2990:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2991:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2992:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2993:           ibdiag += 9;
2994:           break;
2995:         case 4:
2996:           v2   = a->a + ii[row + 1];
2997:           v3   = a->a + ii[row + 2];
2998:           v4   = a->a + ii[row + 3];
2999:           sum1 = b[row];
3000:           sum2 = b[row + 1];
3001:           sum3 = b[row + 2];
3002:           sum4 = b[row + 3];
3003:           for (n = 0; n < sz - 1; n += 2) {
3004:             i1 = idx[0];
3005:             i2 = idx[1];
3006:             idx += 2;
3007:             tmp0 = x[i1];
3008:             tmp1 = x[i2];
3009:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3010:             v1 += 2;
3011:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3012:             v2 += 2;
3013:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3014:             v3 += 2;
3015:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3016:             v4 += 2;
3017:           }

3019:           if (n == sz - 1) {
3020:             tmp0 = x[*idx];
3021:             sum1 -= v1[0] * tmp0;
3022:             sum2 -= v2[0] * tmp0;
3023:             sum3 -= v3[0] * tmp0;
3024:             sum4 -= v4[0] * tmp0;
3025:           }
3026:           t[row]     = sum1;
3027:           t[row + 1] = sum2;
3028:           t[row + 2] = sum3;
3029:           t[row + 3] = sum4;
3030:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3031:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3032:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3033:           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3034:           ibdiag += 16;
3035:           break;
3036:         case 5:
3037:           v2   = a->a + ii[row + 1];
3038:           v3   = a->a + ii[row + 2];
3039:           v4   = a->a + ii[row + 3];
3040:           v5   = a->a + ii[row + 4];
3041:           sum1 = b[row];
3042:           sum2 = b[row + 1];
3043:           sum3 = b[row + 2];
3044:           sum4 = b[row + 3];
3045:           sum5 = b[row + 4];
3046:           for (n = 0; n < sz - 1; n += 2) {
3047:             i1 = idx[0];
3048:             i2 = idx[1];
3049:             idx += 2;
3050:             tmp0 = x[i1];
3051:             tmp1 = x[i2];
3052:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3053:             v1 += 2;
3054:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3055:             v2 += 2;
3056:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3057:             v3 += 2;
3058:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3059:             v4 += 2;
3060:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3061:             v5 += 2;
3062:           }

3064:           if (n == sz - 1) {
3065:             tmp0 = x[*idx];
3066:             sum1 -= v1[0] * tmp0;
3067:             sum2 -= v2[0] * tmp0;
3068:             sum3 -= v3[0] * tmp0;
3069:             sum4 -= v4[0] * tmp0;
3070:             sum5 -= v5[0] * tmp0;
3071:           }
3072:           t[row]     = sum1;
3073:           t[row + 1] = sum2;
3074:           t[row + 2] = sum3;
3075:           t[row + 3] = sum4;
3076:           t[row + 4] = sum5;
3077:           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3078:           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3079:           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3080:           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3081:           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3082:           ibdiag += 25;
3083:           break;
3084:         default:
3085:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3086:         }
3087:       }

3089:       xb = t;
3090:       PetscCall(PetscLogFlops(a->nz));
3091:     } else xb = b;
3092:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3093:       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3094:       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3095:         ibdiag -= sizes[i] * sizes[i];
3096:         sz  = ii[row + 1] - diag[row] - 1;
3097:         v1  = a->a + diag[row] + 1;
3098:         idx = a->j + diag[row] + 1;

3100:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3101:         switch (sizes[i]) {
3102:         case 1:

3104:           sum1 = xb[row];
3105:           for (n = 0; n < sz - 1; n += 2) {
3106:             i1 = idx[0];
3107:             i2 = idx[1];
3108:             idx += 2;
3109:             tmp0 = x[i1];
3110:             tmp1 = x[i2];
3111:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3112:             v1 += 2;
3113:           }

3115:           if (n == sz - 1) {
3116:             tmp0 = x[*idx];
3117:             sum1 -= *v1 * tmp0;
3118:           }
3119:           x[row--] = sum1 * (*ibdiag);
3120:           break;

3122:         case 2:

3124:           sum1 = xb[row];
3125:           sum2 = xb[row - 1];
3126:           /* note that sum1 is associated with the second of the two rows */
3127:           v2 = a->a + diag[row - 1] + 2;
3128:           for (n = 0; n < sz - 1; n += 2) {
3129:             i1 = idx[0];
3130:             i2 = idx[1];
3131:             idx += 2;
3132:             tmp0 = x[i1];
3133:             tmp1 = x[i2];
3134:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3135:             v1 += 2;
3136:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3137:             v2 += 2;
3138:           }

3140:           if (n == sz - 1) {
3141:             tmp0 = x[*idx];
3142:             sum1 -= *v1 * tmp0;
3143:             sum2 -= *v2 * tmp0;
3144:           }
3145:           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3146:           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3147:           break;
3148:         case 3:

3150:           sum1 = xb[row];
3151:           sum2 = xb[row - 1];
3152:           sum3 = xb[row - 2];
3153:           v2   = a->a + diag[row - 1] + 2;
3154:           v3   = a->a + diag[row - 2] + 3;
3155:           for (n = 0; n < sz - 1; n += 2) {
3156:             i1 = idx[0];
3157:             i2 = idx[1];
3158:             idx += 2;
3159:             tmp0 = x[i1];
3160:             tmp1 = x[i2];
3161:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3162:             v1 += 2;
3163:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3164:             v2 += 2;
3165:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3166:             v3 += 2;
3167:           }

3169:           if (n == sz - 1) {
3170:             tmp0 = x[*idx];
3171:             sum1 -= *v1 * tmp0;
3172:             sum2 -= *v2 * tmp0;
3173:             sum3 -= *v3 * tmp0;
3174:           }
3175:           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3176:           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3177:           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3178:           break;
3179:         case 4:

3181:           sum1 = xb[row];
3182:           sum2 = xb[row - 1];
3183:           sum3 = xb[row - 2];
3184:           sum4 = xb[row - 3];
3185:           v2   = a->a + diag[row - 1] + 2;
3186:           v3   = a->a + diag[row - 2] + 3;
3187:           v4   = a->a + diag[row - 3] + 4;
3188:           for (n = 0; n < sz - 1; n += 2) {
3189:             i1 = idx[0];
3190:             i2 = idx[1];
3191:             idx += 2;
3192:             tmp0 = x[i1];
3193:             tmp1 = x[i2];
3194:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3195:             v1 += 2;
3196:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3197:             v2 += 2;
3198:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3199:             v3 += 2;
3200:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3201:             v4 += 2;
3202:           }

3204:           if (n == sz - 1) {
3205:             tmp0 = x[*idx];
3206:             sum1 -= *v1 * tmp0;
3207:             sum2 -= *v2 * tmp0;
3208:             sum3 -= *v3 * tmp0;
3209:             sum4 -= *v4 * tmp0;
3210:           }
3211:           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3212:           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3213:           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3214:           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3215:           break;
3216:         case 5:

3218:           sum1 = xb[row];
3219:           sum2 = xb[row - 1];
3220:           sum3 = xb[row - 2];
3221:           sum4 = xb[row - 3];
3222:           sum5 = xb[row - 4];
3223:           v2   = a->a + diag[row - 1] + 2;
3224:           v3   = a->a + diag[row - 2] + 3;
3225:           v4   = a->a + diag[row - 3] + 4;
3226:           v5   = a->a + diag[row - 4] + 5;
3227:           for (n = 0; n < sz - 1; n += 2) {
3228:             i1 = idx[0];
3229:             i2 = idx[1];
3230:             idx += 2;
3231:             tmp0 = x[i1];
3232:             tmp1 = x[i2];
3233:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3234:             v1 += 2;
3235:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3236:             v2 += 2;
3237:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3238:             v3 += 2;
3239:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3240:             v4 += 2;
3241:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3242:             v5 += 2;
3243:           }

3245:           if (n == sz - 1) {
3246:             tmp0 = x[*idx];
3247:             sum1 -= *v1 * tmp0;
3248:             sum2 -= *v2 * tmp0;
3249:             sum3 -= *v3 * tmp0;
3250:             sum4 -= *v4 * tmp0;
3251:             sum5 -= *v5 * tmp0;
3252:           }
3253:           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3254:           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3255:           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3256:           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3257:           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3258:           break;
3259:         default:
3260:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3261:         }
3262:       }

3264:       PetscCall(PetscLogFlops(a->nz));
3265:     }
3266:     its--;
3267:   }
3268:   while (its--) {
3269:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3270:       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3271:         sz  = diag[row] - ii[row];
3272:         v1  = a->a + ii[row];
3273:         idx = a->j + ii[row];
3274:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3275:         switch (sizes[i]) {
3276:         case 1:
3277:           sum1 = b[row];
3278:           for (n = 0; n < sz - 1; n += 2) {
3279:             i1 = idx[0];
3280:             i2 = idx[1];
3281:             idx += 2;
3282:             tmp0 = x[i1];
3283:             tmp1 = x[i2];
3284:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3285:             v1 += 2;
3286:           }
3287:           if (n == sz - 1) {
3288:             tmp0 = x[*idx++];
3289:             sum1 -= *v1 * tmp0;
3290:             v1++;
3291:           }
3292:           t[row] = sum1;
3293:           sz     = ii[row + 1] - diag[row] - 1;
3294:           idx    = a->j + diag[row] + 1;
3295:           v1 += 1;
3296:           for (n = 0; n < sz - 1; n += 2) {
3297:             i1 = idx[0];
3298:             i2 = idx[1];
3299:             idx += 2;
3300:             tmp0 = x[i1];
3301:             tmp1 = x[i2];
3302:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3303:             v1 += 2;
3304:           }
3305:           if (n == sz - 1) {
3306:             tmp0 = x[*idx++];
3307:             sum1 -= *v1 * tmp0;
3308:           }
3309:           /* in MatSOR_SeqAIJ this line would be
3310:            *
3311:            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3312:            *
3313:            * but omega == 1, so this becomes
3314:            *
3315:            * x[row] = sum1*(*ibdiag++);
3316:            *
3317:            */
3318:           x[row] = sum1 * (*ibdiag);
3319:           break;
3320:         case 2:
3321:           v2   = a->a + ii[row + 1];
3322:           sum1 = b[row];
3323:           sum2 = b[row + 1];
3324:           for (n = 0; n < sz - 1; n += 2) {
3325:             i1 = idx[0];
3326:             i2 = idx[1];
3327:             idx += 2;
3328:             tmp0 = x[i1];
3329:             tmp1 = x[i2];
3330:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3331:             v1 += 2;
3332:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3333:             v2 += 2;
3334:           }
3335:           if (n == sz - 1) {
3336:             tmp0 = x[*idx++];
3337:             sum1 -= v1[0] * tmp0;
3338:             sum2 -= v2[0] * tmp0;
3339:             v1++;
3340:             v2++;
3341:           }
3342:           t[row]     = sum1;
3343:           t[row + 1] = sum2;
3344:           sz         = ii[row + 1] - diag[row] - 2;
3345:           idx        = a->j + diag[row] + 2;
3346:           v1 += 2;
3347:           v2 += 2;
3348:           for (n = 0; n < sz - 1; n += 2) {
3349:             i1 = idx[0];
3350:             i2 = idx[1];
3351:             idx += 2;
3352:             tmp0 = x[i1];
3353:             tmp1 = x[i2];
3354:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3355:             v1 += 2;
3356:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3357:             v2 += 2;
3358:           }
3359:           if (n == sz - 1) {
3360:             tmp0 = x[*idx];
3361:             sum1 -= v1[0] * tmp0;
3362:             sum2 -= v2[0] * tmp0;
3363:           }
3364:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3365:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3366:           break;
3367:         case 3:
3368:           v2   = a->a + ii[row + 1];
3369:           v3   = a->a + ii[row + 2];
3370:           sum1 = b[row];
3371:           sum2 = b[row + 1];
3372:           sum3 = b[row + 2];
3373:           for (n = 0; n < sz - 1; n += 2) {
3374:             i1 = idx[0];
3375:             i2 = idx[1];
3376:             idx += 2;
3377:             tmp0 = x[i1];
3378:             tmp1 = x[i2];
3379:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3380:             v1 += 2;
3381:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3382:             v2 += 2;
3383:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3384:             v3 += 2;
3385:           }
3386:           if (n == sz - 1) {
3387:             tmp0 = x[*idx++];
3388:             sum1 -= v1[0] * tmp0;
3389:             sum2 -= v2[0] * tmp0;
3390:             sum3 -= v3[0] * tmp0;
3391:             v1++;
3392:             v2++;
3393:             v3++;
3394:           }
3395:           t[row]     = sum1;
3396:           t[row + 1] = sum2;
3397:           t[row + 2] = sum3;
3398:           sz         = ii[row + 1] - diag[row] - 3;
3399:           idx        = a->j + diag[row] + 3;
3400:           v1 += 3;
3401:           v2 += 3;
3402:           v3 += 3;
3403:           for (n = 0; n < sz - 1; n += 2) {
3404:             i1 = idx[0];
3405:             i2 = idx[1];
3406:             idx += 2;
3407:             tmp0 = x[i1];
3408:             tmp1 = x[i2];
3409:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3410:             v1 += 2;
3411:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3412:             v2 += 2;
3413:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3414:             v3 += 2;
3415:           }
3416:           if (n == sz - 1) {
3417:             tmp0 = x[*idx];
3418:             sum1 -= v1[0] * tmp0;
3419:             sum2 -= v2[0] * tmp0;
3420:             sum3 -= v3[0] * tmp0;
3421:           }
3422:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3423:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3424:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3425:           break;
3426:         case 4:
3427:           v2   = a->a + ii[row + 1];
3428:           v3   = a->a + ii[row + 2];
3429:           v4   = a->a + ii[row + 3];
3430:           sum1 = b[row];
3431:           sum2 = b[row + 1];
3432:           sum3 = b[row + 2];
3433:           sum4 = b[row + 3];
3434:           for (n = 0; n < sz - 1; n += 2) {
3435:             i1 = idx[0];
3436:             i2 = idx[1];
3437:             idx += 2;
3438:             tmp0 = x[i1];
3439:             tmp1 = x[i2];
3440:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3441:             v1 += 2;
3442:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3443:             v2 += 2;
3444:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3445:             v3 += 2;
3446:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3447:             v4 += 2;
3448:           }
3449:           if (n == sz - 1) {
3450:             tmp0 = x[*idx++];
3451:             sum1 -= v1[0] * tmp0;
3452:             sum2 -= v2[0] * tmp0;
3453:             sum3 -= v3[0] * tmp0;
3454:             sum4 -= v4[0] * tmp0;
3455:             v1++;
3456:             v2++;
3457:             v3++;
3458:             v4++;
3459:           }
3460:           t[row]     = sum1;
3461:           t[row + 1] = sum2;
3462:           t[row + 2] = sum3;
3463:           t[row + 3] = sum4;
3464:           sz         = ii[row + 1] - diag[row] - 4;
3465:           idx        = a->j + diag[row] + 4;
3466:           v1 += 4;
3467:           v2 += 4;
3468:           v3 += 4;
3469:           v4 += 4;
3470:           for (n = 0; n < sz - 1; n += 2) {
3471:             i1 = idx[0];
3472:             i2 = idx[1];
3473:             idx += 2;
3474:             tmp0 = x[i1];
3475:             tmp1 = x[i2];
3476:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3477:             v1 += 2;
3478:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3479:             v2 += 2;
3480:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3481:             v3 += 2;
3482:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3483:             v4 += 2;
3484:           }
3485:           if (n == sz - 1) {
3486:             tmp0 = x[*idx];
3487:             sum1 -= v1[0] * tmp0;
3488:             sum2 -= v2[0] * tmp0;
3489:             sum3 -= v3[0] * tmp0;
3490:             sum4 -= v4[0] * tmp0;
3491:           }
3492:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3493:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3494:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3495:           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3496:           break;
3497:         case 5:
3498:           v2   = a->a + ii[row + 1];
3499:           v3   = a->a + ii[row + 2];
3500:           v4   = a->a + ii[row + 3];
3501:           v5   = a->a + ii[row + 4];
3502:           sum1 = b[row];
3503:           sum2 = b[row + 1];
3504:           sum3 = b[row + 2];
3505:           sum4 = b[row + 3];
3506:           sum5 = b[row + 4];
3507:           for (n = 0; n < sz - 1; n += 2) {
3508:             i1 = idx[0];
3509:             i2 = idx[1];
3510:             idx += 2;
3511:             tmp0 = x[i1];
3512:             tmp1 = x[i2];
3513:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3514:             v1 += 2;
3515:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3516:             v2 += 2;
3517:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3518:             v3 += 2;
3519:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3520:             v4 += 2;
3521:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3522:             v5 += 2;
3523:           }
3524:           if (n == sz - 1) {
3525:             tmp0 = x[*idx++];
3526:             sum1 -= v1[0] * tmp0;
3527:             sum2 -= v2[0] * tmp0;
3528:             sum3 -= v3[0] * tmp0;
3529:             sum4 -= v4[0] * tmp0;
3530:             sum5 -= v5[0] * tmp0;
3531:             v1++;
3532:             v2++;
3533:             v3++;
3534:             v4++;
3535:             v5++;
3536:           }
3537:           t[row]     = sum1;
3538:           t[row + 1] = sum2;
3539:           t[row + 2] = sum3;
3540:           t[row + 3] = sum4;
3541:           t[row + 4] = sum5;
3542:           sz         = ii[row + 1] - diag[row] - 5;
3543:           idx        = a->j + diag[row] + 5;
3544:           v1 += 5;
3545:           v2 += 5;
3546:           v3 += 5;
3547:           v4 += 5;
3548:           v5 += 5;
3549:           for (n = 0; n < sz - 1; n += 2) {
3550:             i1 = idx[0];
3551:             i2 = idx[1];
3552:             idx += 2;
3553:             tmp0 = x[i1];
3554:             tmp1 = x[i2];
3555:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3556:             v1 += 2;
3557:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3558:             v2 += 2;
3559:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3560:             v3 += 2;
3561:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3562:             v4 += 2;
3563:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3564:             v5 += 2;
3565:           }
3566:           if (n == sz - 1) {
3567:             tmp0 = x[*idx];
3568:             sum1 -= v1[0] * tmp0;
3569:             sum2 -= v2[0] * tmp0;
3570:             sum3 -= v3[0] * tmp0;
3571:             sum4 -= v4[0] * tmp0;
3572:             sum5 -= v5[0] * tmp0;
3573:           }
3574:           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3575:           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3576:           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3577:           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3578:           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3579:           break;
3580:         default:
3581:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3582:         }
3583:       }
3584:       xb = t;
3585:       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3586:     } else xb = b;

3588:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3589:       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3590:       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3591:         ibdiag -= sizes[i] * sizes[i];

3593:         /* set RHS */
3594:         if (xb == b) {
3595:           /* whole (old way) */
3596:           sz  = ii[row + 1] - ii[row];
3597:           idx = a->j + ii[row];
3598:           switch (sizes[i]) {
3599:           case 5:
3600:             v5 = a->a + ii[row - 4]; /* fall through */
3601:           case 4:
3602:             v4 = a->a + ii[row - 3]; /* fall through */
3603:           case 3:
3604:             v3 = a->a + ii[row - 2]; /* fall through */
3605:           case 2:
3606:             v2 = a->a + ii[row - 1]; /* fall through */
3607:           case 1:
3608:             v1 = a->a + ii[row];
3609:             break;
3610:           default:
3611:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3612:           }
3613:         } else {
3614:           /* upper, no diag */
3615:           sz  = ii[row + 1] - diag[row] - 1;
3616:           idx = a->j + diag[row] + 1;
3617:           switch (sizes[i]) {
3618:           case 5:
3619:             v5 = a->a + diag[row - 4] + 5; /* fall through */
3620:           case 4:
3621:             v4 = a->a + diag[row - 3] + 4; /* fall through */
3622:           case 3:
3623:             v3 = a->a + diag[row - 2] + 3; /* fall through */
3624:           case 2:
3625:             v2 = a->a + diag[row - 1] + 2; /* fall through */
3626:           case 1:
3627:             v1 = a->a + diag[row] + 1;
3628:           }
3629:         }
3630:         /* set sum */
3631:         switch (sizes[i]) {
3632:         case 5:
3633:           sum5 = xb[row - 4]; /* fall through */
3634:         case 4:
3635:           sum4 = xb[row - 3]; /* fall through */
3636:         case 3:
3637:           sum3 = xb[row - 2]; /* fall through */
3638:         case 2:
3639:           sum2 = xb[row - 1]; /* fall through */
3640:         case 1:
3641:           /* note that sum1 is associated with the last row */
3642:           sum1 = xb[row];
3643:         }
3644:         /* do sums */
3645:         for (n = 0; n < sz - 1; n += 2) {
3646:           i1 = idx[0];
3647:           i2 = idx[1];
3648:           idx += 2;
3649:           tmp0 = x[i1];
3650:           tmp1 = x[i2];
3651:           switch (sizes[i]) {
3652:           case 5:
3653:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3654:             v5 += 2; /* fall through */
3655:           case 4:
3656:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3657:             v4 += 2; /* fall through */
3658:           case 3:
3659:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3660:             v3 += 2; /* fall through */
3661:           case 2:
3662:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3663:             v2 += 2; /* fall through */
3664:           case 1:
3665:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3666:             v1 += 2;
3667:           }
3668:         }
3669:         /* ragged edge */
3670:         if (n == sz - 1) {
3671:           tmp0 = x[*idx];
3672:           switch (sizes[i]) {
3673:           case 5:
3674:             sum5 -= *v5 * tmp0; /* fall through */
3675:           case 4:
3676:             sum4 -= *v4 * tmp0; /* fall through */
3677:           case 3:
3678:             sum3 -= *v3 * tmp0; /* fall through */
3679:           case 2:
3680:             sum2 -= *v2 * tmp0; /* fall through */
3681:           case 1:
3682:             sum1 -= *v1 * tmp0;
3683:           }
3684:         }
3685:         /* update */
3686:         if (xb == b) {
3687:           /* whole (old way) w/ diag */
3688:           switch (sizes[i]) {
3689:           case 5:
3690:             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3691:             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3692:             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3693:             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3694:             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3695:             break;
3696:           case 4:
3697:             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3698:             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3699:             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3700:             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3701:             break;
3702:           case 3:
3703:             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3704:             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3705:             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3706:             break;
3707:           case 2:
3708:             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3709:             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3710:             break;
3711:           case 1:
3712:             x[row--] += sum1 * (*ibdiag);
3713:             break;
3714:           }
3715:         } else {
3716:           /* no diag so set =  */
3717:           switch (sizes[i]) {
3718:           case 5:
3719:             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3720:             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3721:             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3722:             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3723:             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3724:             break;
3725:           case 4:
3726:             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3727:             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3728:             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3729:             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3730:             break;
3731:           case 3:
3732:             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3733:             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3734:             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3735:             break;
3736:           case 2:
3737:             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3738:             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3739:             break;
3740:           case 1:
3741:             x[row--] = sum1 * (*ibdiag);
3742:             break;
3743:           }
3744:         }
3745:       }
3746:       if (xb == b) {
3747:         PetscCall(PetscLogFlops(2.0 * a->nz));
3748:       } else {
3749:         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3750:       }
3751:     }
3752:   }
3753:   if (flag & SOR_EISENSTAT) {
3754:     /*
3755:           Apply  (U + D)^-1  where D is now the block diagonal
3756:     */
3757:     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3758:     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3759:       ibdiag -= sizes[i] * sizes[i];
3760:       sz  = ii[row + 1] - diag[row] - 1;
3761:       v1  = a->a + diag[row] + 1;
3762:       idx = a->j + diag[row] + 1;
3763:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3764:       switch (sizes[i]) {
3765:       case 1:

3767:         sum1 = b[row];
3768:         for (n = 0; n < sz - 1; n += 2) {
3769:           i1 = idx[0];
3770:           i2 = idx[1];
3771:           idx += 2;
3772:           tmp0 = x[i1];
3773:           tmp1 = x[i2];
3774:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3775:           v1 += 2;
3776:         }

3778:         if (n == sz - 1) {
3779:           tmp0 = x[*idx];
3780:           sum1 -= *v1 * tmp0;
3781:         }
3782:         x[row] = sum1 * (*ibdiag);
3783:         row--;
3784:         break;

3786:       case 2:

3788:         sum1 = b[row];
3789:         sum2 = b[row - 1];
3790:         /* note that sum1 is associated with the second of the two rows */
3791:         v2 = a->a + diag[row - 1] + 2;
3792:         for (n = 0; n < sz - 1; n += 2) {
3793:           i1 = idx[0];
3794:           i2 = idx[1];
3795:           idx += 2;
3796:           tmp0 = x[i1];
3797:           tmp1 = x[i2];
3798:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3799:           v1 += 2;
3800:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3801:           v2 += 2;
3802:         }

3804:         if (n == sz - 1) {
3805:           tmp0 = x[*idx];
3806:           sum1 -= *v1 * tmp0;
3807:           sum2 -= *v2 * tmp0;
3808:         }
3809:         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3810:         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3811:         row -= 2;
3812:         break;
3813:       case 3:

3815:         sum1 = b[row];
3816:         sum2 = b[row - 1];
3817:         sum3 = b[row - 2];
3818:         v2   = a->a + diag[row - 1] + 2;
3819:         v3   = a->a + diag[row - 2] + 3;
3820:         for (n = 0; n < sz - 1; n += 2) {
3821:           i1 = idx[0];
3822:           i2 = idx[1];
3823:           idx += 2;
3824:           tmp0 = x[i1];
3825:           tmp1 = x[i2];
3826:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3827:           v1 += 2;
3828:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3829:           v2 += 2;
3830:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3831:           v3 += 2;
3832:         }

3834:         if (n == sz - 1) {
3835:           tmp0 = x[*idx];
3836:           sum1 -= *v1 * tmp0;
3837:           sum2 -= *v2 * tmp0;
3838:           sum3 -= *v3 * tmp0;
3839:         }
3840:         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3841:         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3842:         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3843:         row -= 3;
3844:         break;
3845:       case 4:

3847:         sum1 = b[row];
3848:         sum2 = b[row - 1];
3849:         sum3 = b[row - 2];
3850:         sum4 = b[row - 3];
3851:         v2   = a->a + diag[row - 1] + 2;
3852:         v3   = a->a + diag[row - 2] + 3;
3853:         v4   = a->a + diag[row - 3] + 4;
3854:         for (n = 0; n < sz - 1; n += 2) {
3855:           i1 = idx[0];
3856:           i2 = idx[1];
3857:           idx += 2;
3858:           tmp0 = x[i1];
3859:           tmp1 = x[i2];
3860:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3861:           v1 += 2;
3862:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3863:           v2 += 2;
3864:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3865:           v3 += 2;
3866:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3867:           v4 += 2;
3868:         }

3870:         if (n == sz - 1) {
3871:           tmp0 = x[*idx];
3872:           sum1 -= *v1 * tmp0;
3873:           sum2 -= *v2 * tmp0;
3874:           sum3 -= *v3 * tmp0;
3875:           sum4 -= *v4 * tmp0;
3876:         }
3877:         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3878:         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3879:         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3880:         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3881:         row -= 4;
3882:         break;
3883:       case 5:

3885:         sum1 = b[row];
3886:         sum2 = b[row - 1];
3887:         sum3 = b[row - 2];
3888:         sum4 = b[row - 3];
3889:         sum5 = b[row - 4];
3890:         v2   = a->a + diag[row - 1] + 2;
3891:         v3   = a->a + diag[row - 2] + 3;
3892:         v4   = a->a + diag[row - 3] + 4;
3893:         v5   = a->a + diag[row - 4] + 5;
3894:         for (n = 0; n < sz - 1; n += 2) {
3895:           i1 = idx[0];
3896:           i2 = idx[1];
3897:           idx += 2;
3898:           tmp0 = x[i1];
3899:           tmp1 = x[i2];
3900:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3901:           v1 += 2;
3902:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3903:           v2 += 2;
3904:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3905:           v3 += 2;
3906:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3907:           v4 += 2;
3908:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3909:           v5 += 2;
3910:         }

3912:         if (n == sz - 1) {
3913:           tmp0 = x[*idx];
3914:           sum1 -= *v1 * tmp0;
3915:           sum2 -= *v2 * tmp0;
3916:           sum3 -= *v3 * tmp0;
3917:           sum4 -= *v4 * tmp0;
3918:           sum5 -= *v5 * tmp0;
3919:         }
3920:         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3921:         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3922:         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3923:         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3924:         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3925:         row -= 5;
3926:         break;
3927:       default:
3928:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3929:       }
3930:     }
3931:     PetscCall(PetscLogFlops(a->nz));

3933:     /*
3934:            t = b - D x    where D is the block diagonal
3935:     */
3936:     cnt = 0;
3937:     for (i = 0, row = 0; i < m; i++) {
3938:       switch (sizes[i]) {
3939:       case 1:
3940:         t[row] = b[row] - bdiag[cnt++] * x[row];
3941:         row++;
3942:         break;
3943:       case 2:
3944:         x1         = x[row];
3945:         x2         = x[row + 1];
3946:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3947:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3948:         t[row]     = b[row] - tmp1;
3949:         t[row + 1] = b[row + 1] - tmp2;
3950:         row += 2;
3951:         cnt += 4;
3952:         break;
3953:       case 3:
3954:         x1         = x[row];
3955:         x2         = x[row + 1];
3956:         x3         = x[row + 2];
3957:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3958:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3959:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3960:         t[row]     = b[row] - tmp1;
3961:         t[row + 1] = b[row + 1] - tmp2;
3962:         t[row + 2] = b[row + 2] - tmp3;
3963:         row += 3;
3964:         cnt += 9;
3965:         break;
3966:       case 4:
3967:         x1         = x[row];
3968:         x2         = x[row + 1];
3969:         x3         = x[row + 2];
3970:         x4         = x[row + 3];
3971:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3972:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3973:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3974:         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3975:         t[row]     = b[row] - tmp1;
3976:         t[row + 1] = b[row + 1] - tmp2;
3977:         t[row + 2] = b[row + 2] - tmp3;
3978:         t[row + 3] = b[row + 3] - tmp4;
3979:         row += 4;
3980:         cnt += 16;
3981:         break;
3982:       case 5:
3983:         x1         = x[row];
3984:         x2         = x[row + 1];
3985:         x3         = x[row + 2];
3986:         x4         = x[row + 3];
3987:         x5         = x[row + 4];
3988:         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3989:         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3990:         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3991:         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3992:         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3993:         t[row]     = b[row] - tmp1;
3994:         t[row + 1] = b[row + 1] - tmp2;
3995:         t[row + 2] = b[row + 2] - tmp3;
3996:         t[row + 3] = b[row + 3] - tmp4;
3997:         t[row + 4] = b[row + 4] - tmp5;
3998:         row += 5;
3999:         cnt += 25;
4000:         break;
4001:       default:
4002:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4003:       }
4004:     }
4005:     PetscCall(PetscLogFlops(m));

4007:     /*
4008:           Apply (L + D)^-1 where D is the block diagonal
4009:     */
4010:     for (i = 0, row = 0; i < m; i++) {
4011:       sz  = diag[row] - ii[row];
4012:       v1  = a->a + ii[row];
4013:       idx = a->j + ii[row];
4014:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4015:       switch (sizes[i]) {
4016:       case 1:

4018:         sum1 = t[row];
4019:         for (n = 0; n < sz - 1; n += 2) {
4020:           i1 = idx[0];
4021:           i2 = idx[1];
4022:           idx += 2;
4023:           tmp0 = t[i1];
4024:           tmp1 = t[i2];
4025:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4026:           v1 += 2;
4027:         }

4029:         if (n == sz - 1) {
4030:           tmp0 = t[*idx];
4031:           sum1 -= *v1 * tmp0;
4032:         }
4033:         x[row] += t[row] = sum1 * (*ibdiag++);
4034:         row++;
4035:         break;
4036:       case 2:
4037:         v2   = a->a + ii[row + 1];
4038:         sum1 = t[row];
4039:         sum2 = t[row + 1];
4040:         for (n = 0; n < sz - 1; n += 2) {
4041:           i1 = idx[0];
4042:           i2 = idx[1];
4043:           idx += 2;
4044:           tmp0 = t[i1];
4045:           tmp1 = t[i2];
4046:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4047:           v1 += 2;
4048:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4049:           v2 += 2;
4050:         }

4052:         if (n == sz - 1) {
4053:           tmp0 = t[*idx];
4054:           sum1 -= v1[0] * tmp0;
4055:           sum2 -= v2[0] * tmp0;
4056:         }
4057:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4058:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4059:         ibdiag += 4;
4060:         row += 2;
4061:         break;
4062:       case 3:
4063:         v2   = a->a + ii[row + 1];
4064:         v3   = a->a + ii[row + 2];
4065:         sum1 = t[row];
4066:         sum2 = t[row + 1];
4067:         sum3 = t[row + 2];
4068:         for (n = 0; n < sz - 1; n += 2) {
4069:           i1 = idx[0];
4070:           i2 = idx[1];
4071:           idx += 2;
4072:           tmp0 = t[i1];
4073:           tmp1 = t[i2];
4074:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4075:           v1 += 2;
4076:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4077:           v2 += 2;
4078:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4079:           v3 += 2;
4080:         }

4082:         if (n == sz - 1) {
4083:           tmp0 = t[*idx];
4084:           sum1 -= v1[0] * tmp0;
4085:           sum2 -= v2[0] * tmp0;
4086:           sum3 -= v3[0] * tmp0;
4087:         }
4088:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4089:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4090:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4091:         ibdiag += 9;
4092:         row += 3;
4093:         break;
4094:       case 4:
4095:         v2   = a->a + ii[row + 1];
4096:         v3   = a->a + ii[row + 2];
4097:         v4   = a->a + ii[row + 3];
4098:         sum1 = t[row];
4099:         sum2 = t[row + 1];
4100:         sum3 = t[row + 2];
4101:         sum4 = t[row + 3];
4102:         for (n = 0; n < sz - 1; n += 2) {
4103:           i1 = idx[0];
4104:           i2 = idx[1];
4105:           idx += 2;
4106:           tmp0 = t[i1];
4107:           tmp1 = t[i2];
4108:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4109:           v1 += 2;
4110:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4111:           v2 += 2;
4112:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4113:           v3 += 2;
4114:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4115:           v4 += 2;
4116:         }

4118:         if (n == sz - 1) {
4119:           tmp0 = t[*idx];
4120:           sum1 -= v1[0] * tmp0;
4121:           sum2 -= v2[0] * tmp0;
4122:           sum3 -= v3[0] * tmp0;
4123:           sum4 -= v4[0] * tmp0;
4124:         }
4125:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4126:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4127:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4128:         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4129:         ibdiag += 16;
4130:         row += 4;
4131:         break;
4132:       case 5:
4133:         v2   = a->a + ii[row + 1];
4134:         v3   = a->a + ii[row + 2];
4135:         v4   = a->a + ii[row + 3];
4136:         v5   = a->a + ii[row + 4];
4137:         sum1 = t[row];
4138:         sum2 = t[row + 1];
4139:         sum3 = t[row + 2];
4140:         sum4 = t[row + 3];
4141:         sum5 = t[row + 4];
4142:         for (n = 0; n < sz - 1; n += 2) {
4143:           i1 = idx[0];
4144:           i2 = idx[1];
4145:           idx += 2;
4146:           tmp0 = t[i1];
4147:           tmp1 = t[i2];
4148:           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4149:           v1 += 2;
4150:           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4151:           v2 += 2;
4152:           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4153:           v3 += 2;
4154:           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4155:           v4 += 2;
4156:           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4157:           v5 += 2;
4158:         }

4160:         if (n == sz - 1) {
4161:           tmp0 = t[*idx];
4162:           sum1 -= v1[0] * tmp0;
4163:           sum2 -= v2[0] * tmp0;
4164:           sum3 -= v3[0] * tmp0;
4165:           sum4 -= v4[0] * tmp0;
4166:           sum5 -= v5[0] * tmp0;
4167:         }
4168:         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4169:         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4170:         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4171:         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4172:         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4173:         ibdiag += 25;
4174:         row += 5;
4175:         break;
4176:       default:
4177:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4178:       }
4179:     }
4180:     PetscCall(PetscLogFlops(a->nz));
4181:   }
4182:   PetscCall(VecRestoreArray(xx, &x));
4183:   PetscCall(VecRestoreArrayRead(bb, &b));
4184:   PetscFunctionReturn(PETSC_SUCCESS);
4185: }

4187: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4188: {
4189:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4190:   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4191:   const MatScalar   *bdiag = a->inode.bdiag;
4192:   const PetscScalar *b;
4193:   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4194:   const PetscInt    *sizes = a->inode.size;

4196:   PetscFunctionBegin;
4197:   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4198:   PetscCall(VecGetArray(xx, &x));
4199:   PetscCall(VecGetArrayRead(bb, &b));
4200:   cnt = 0;
4201:   for (i = 0, row = 0; i < m; i++) {
4202:     switch (sizes[i]) {
4203:     case 1:
4204:       x[row] = b[row] * bdiag[cnt++];
4205:       row++;
4206:       break;
4207:     case 2:
4208:       x1       = b[row];
4209:       x2       = b[row + 1];
4210:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4211:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4212:       x[row++] = tmp1;
4213:       x[row++] = tmp2;
4214:       cnt += 4;
4215:       break;
4216:     case 3:
4217:       x1       = b[row];
4218:       x2       = b[row + 1];
4219:       x3       = b[row + 2];
4220:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4221:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4222:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4223:       x[row++] = tmp1;
4224:       x[row++] = tmp2;
4225:       x[row++] = tmp3;
4226:       cnt += 9;
4227:       break;
4228:     case 4:
4229:       x1       = b[row];
4230:       x2       = b[row + 1];
4231:       x3       = b[row + 2];
4232:       x4       = b[row + 3];
4233:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4234:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4235:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4236:       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4237:       x[row++] = tmp1;
4238:       x[row++] = tmp2;
4239:       x[row++] = tmp3;
4240:       x[row++] = tmp4;
4241:       cnt += 16;
4242:       break;
4243:     case 5:
4244:       x1       = b[row];
4245:       x2       = b[row + 1];
4246:       x3       = b[row + 2];
4247:       x4       = b[row + 3];
4248:       x5       = b[row + 4];
4249:       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4250:       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4251:       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4252:       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4253:       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4254:       x[row++] = tmp1;
4255:       x[row++] = tmp2;
4256:       x[row++] = tmp3;
4257:       x[row++] = tmp4;
4258:       x[row++] = tmp5;
4259:       cnt += 25;
4260:       break;
4261:     default:
4262:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4263:     }
4264:   }
4265:   PetscCall(PetscLogFlops(2.0 * cnt));
4266:   PetscCall(VecRestoreArray(xx, &x));
4267:   PetscCall(VecRestoreArrayRead(bb, &b));
4268:   PetscFunctionReturn(PETSC_SUCCESS);
4269: }

4271: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4272: {
4273:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4275:   PetscFunctionBegin;
4276:   a->inode.node_count       = 0;
4277:   a->inode.use              = PETSC_FALSE;
4278:   a->inode.checked          = PETSC_FALSE;
4279:   a->inode.mat_nonzerostate = -1;
4280:   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4281:   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4282:   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4283:   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4284:   A->ops->coloringpatch     = NULL;
4285:   A->ops->multdiagonalblock = NULL;
4286:   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4287:   PetscFunctionReturn(PETSC_SUCCESS);
4288: }

4290: /*
4291:     samestructure indicates that the matrix has not changed its nonzero structure so we
4292:     do not need to recompute the inodes
4293: */
4294: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4295: {
4296:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4297:   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4298:   PetscBool       flag;
4299:   const PetscInt *idx, *idy, *ii;

4301:   PetscFunctionBegin;
4302:   if (!a->inode.use) {
4303:     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4304:     PetscCall(PetscFree(a->inode.size));
4305:     PetscFunctionReturn(PETSC_SUCCESS);
4306:   }
4307:   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);

4309:   m = A->rmap->n;
4310:   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4311:   ns = a->inode.size;

4313:   i          = 0;
4314:   node_count = 0;
4315:   idx        = a->j;
4316:   ii         = a->i;
4317:   if (idx) {
4318:     while (i < m) {            /* For each row */
4319:       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4320:       /* Limits the number of elements in a node to 'a->inode.limit' */
4321:       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4322:         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4323:         if (nzy != nzx) break;
4324:         idy += nzx; /* Same nonzero pattern */
4325:         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4326:         if (!flag) break;
4327:       }
4328:       ns[node_count++] = blk_size;
4329:       idx += blk_size * nzx;
4330:       i = j;
4331:     }
4332:   }
4333:   /* If not enough inodes found,, do not use inode version of the routines */
4334:   if (!m || !idx || node_count > .8 * m) {
4335:     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4336:     PetscCall(PetscFree(a->inode.size));
4337:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4338:   } else {
4339:     if (!A->factortype) {
4340:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4341:       if (A->rmap->n == A->cmap->n) {
4342:         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4343:         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4344:         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4345:         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4346:         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4347:       }
4348:     } else {
4349:       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4350:     }
4351:     a->inode.node_count = node_count;
4352:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4353:   }
4354:   a->inode.checked          = PETSC_TRUE;
4355:   a->inode.mat_nonzerostate = A->nonzerostate;
4356:   PetscFunctionReturn(PETSC_SUCCESS);
4357: }

4359: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4360: {
4361:   Mat         B = *C;
4362:   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4363:   PetscInt    m = A->rmap->n;

4365:   PetscFunctionBegin;
4366:   c->inode.use              = a->inode.use;
4367:   c->inode.limit            = a->inode.limit;
4368:   c->inode.max_limit        = a->inode.max_limit;
4369:   c->inode.checked          = PETSC_FALSE;
4370:   c->inode.size             = NULL;
4371:   c->inode.node_count       = 0;
4372:   c->inode.ibdiagvalid      = PETSC_FALSE;
4373:   c->inode.ibdiag           = NULL;
4374:   c->inode.bdiag            = NULL;
4375:   c->inode.mat_nonzerostate = -1;
4376:   if (a->inode.use) {
4377:     if (a->inode.checked && a->inode.size) {
4378:       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4379:       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));

4381:       c->inode.checked          = PETSC_TRUE;
4382:       c->inode.node_count       = a->inode.node_count;
4383:       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4384:     }
4385:     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4386:     if (!B->factortype) {
4387:       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4388:       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4389:       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4390:       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4391:       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4392:       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4393:     } else {
4394:       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4395:     }
4396:   }
4397:   PetscFunctionReturn(PETSC_SUCCESS);
4398: }

4400: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4401: {
4402:   PetscInt        k;
4403:   const PetscInt *vi;

4405:   PetscFunctionBegin;
4406:   vi = aj + ai[row];
4407:   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4408:   vi        = aj + adiag[row];
4409:   cols[nzl] = vi[0];
4410:   vi        = aj + adiag[row + 1] + 1;
4411:   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4412:   PetscFunctionReturn(PETSC_SUCCESS);
4413: }
4414: /*
4415:    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4416:    Modified from MatSeqAIJCheckInode().

4418:    Input Parameters:
4419: .  Mat A - ILU or LU matrix factor

4421: */
4422: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4423: {
4424:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4425:   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4426:   PetscInt       *cols1, *cols2, *ns;
4427:   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4428:   PetscBool       flag;

4430:   PetscFunctionBegin;
4431:   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4432:   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);

4434:   m = A->rmap->n;
4435:   if (a->inode.size) ns = a->inode.size;
4436:   else PetscCall(PetscMalloc1(m + 1, &ns));

4438:   i          = 0;
4439:   node_count = 0;
4440:   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4441:   while (i < m) {                       /* For each row */
4442:     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4443:     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4444:     nzx  = nzl1 + nzu1 + 1;
4445:     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));

4447:     /* Limits the number of elements in a node to 'a->inode.limit' */
4448:     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4449:       nzl2 = ai[j + 1] - ai[j];
4450:       nzu2 = adiag[j] - adiag[j + 1] - 1;
4451:       nzy  = nzl2 + nzu2 + 1;
4452:       if (nzy != nzx) break;
4453:       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4454:       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4455:       if (!flag) break;
4456:     }
4457:     ns[node_count++] = blk_size;
4458:     i                = j;
4459:   }
4460:   PetscCall(PetscFree2(cols1, cols2));
4461:   /* If not enough inodes found,, do not use inode version of the routines */
4462:   if (!m || node_count > .8 * m) {
4463:     PetscCall(PetscFree(ns));

4465:     a->inode.node_count = 0;
4466:     a->inode.size       = NULL;
4467:     a->inode.use        = PETSC_FALSE;

4469:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4470:   } else {
4471:     A->ops->mult              = NULL;
4472:     A->ops->sor               = NULL;
4473:     A->ops->multadd           = NULL;
4474:     A->ops->getrowij          = NULL;
4475:     A->ops->restorerowij      = NULL;
4476:     A->ops->getcolumnij       = NULL;
4477:     A->ops->restorecolumnij   = NULL;
4478:     A->ops->coloringpatch     = NULL;
4479:     A->ops->multdiagonalblock = NULL;
4480:     a->inode.node_count       = node_count;
4481:     a->inode.size             = ns;

4483:     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4484:   }
4485:   a->inode.checked = PETSC_TRUE;
4486:   PetscFunctionReturn(PETSC_SUCCESS);
4487: }

4489: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4490: {
4491:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4493:   PetscFunctionBegin;
4494:   a->inode.ibdiagvalid = PETSC_FALSE;
4495:   PetscFunctionReturn(PETSC_SUCCESS);
4496: }

4498: /*
4499:      This is really ugly. if inodes are used this replaces the
4500:   permutations with ones that correspond to rows/cols of the matrix
4501:   rather than inode blocks
4502: */
4503: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4504: {
4505:   PetscFunctionBegin;
4506:   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4507:   PetscFunctionReturn(PETSC_SUCCESS);
4508: }

4510: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4511: {
4512:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4513:   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4514:   const PetscInt *ridx, *cidx;
4515:   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4516:   PetscInt        nslim_col, *ns_col;
4517:   IS              ris = *rperm, cis = *cperm;

4519:   PetscFunctionBegin;
4520:   if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS);           /* no inodes so return */
4521:   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */

4523:   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4524:   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4525:   PetscCall(PetscMalloc2(m, &permr, n, &permc));

4527:   PetscCall(ISGetIndices(ris, &ridx));
4528:   PetscCall(ISGetIndices(cis, &cidx));

4530:   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4531:   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];

4533:   /* Construct the permutations for rows*/
4534:   for (i = 0, row = 0; i < nslim_row; ++i) {
4535:     indx      = ridx[i];
4536:     start_val = tns[indx];
4537:     end_val   = tns[indx + 1];
4538:     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4539:   }

4541:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4542:   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];

4544:   /* Construct permutations for columns */
4545:   for (i = 0, col = 0; i < nslim_col; ++i) {
4546:     indx      = cidx[i];
4547:     start_val = tns[indx];
4548:     end_val   = tns[indx + 1];
4549:     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4550:   }

4552:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4553:   PetscCall(ISSetPermutation(*rperm));
4554:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4555:   PetscCall(ISSetPermutation(*cperm));

4557:   PetscCall(ISRestoreIndices(ris, &ridx));
4558:   PetscCall(ISRestoreIndices(cis, &cidx));

4560:   PetscCall(PetscFree(ns_col));
4561:   PetscCall(PetscFree2(permr, permc));
4562:   PetscCall(ISDestroy(&cis));
4563:   PetscCall(ISDestroy(&ris));
4564:   PetscCall(PetscFree(tns));
4565:   PetscFunctionReturn(PETSC_SUCCESS);
4566: }

4568: /*@C
4569:   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes

4571:   Not Collective

4573:   Input Parameter:
4574: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`

4576:   Output Parameters:
4577: + node_count - no of inodes present in the matrix.
4578: . sizes      - an array of size `node_count`, with the sizes of each inode.
4579: - limit      - the max size used to generate the inodes.

4581:   Level: advanced

4583:   Note:
4584:   It should be called after the matrix is assembled.
4585:   The contents of the sizes[] array should not be changed.
4586:   `NULL` may be passed for information not needed

4588: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4589: @*/
4590: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4591: {
4592:   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);

4594:   PetscFunctionBegin;
4595:   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4596:   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4597:   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4598:   PetscFunctionReturn(PETSC_SUCCESS);
4599: }

4601: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4602: {
4603:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4605:   PetscFunctionBegin;
4606:   if (node_count) *node_count = a->inode.node_count;
4607:   if (sizes) *sizes = a->inode.size;
4608:   if (limit) *limit = a->inode.limit;
4609:   PetscFunctionReturn(PETSC_SUCCESS);
4610: }