Actual source code: baijfact.c

  1: /*
  2:     Factorization code for BAIJ format.
  3: */
  4: #include <../src/mat/impls/baij/seq/baij.h>
  5: #include <petsc/private/kernels/blockinvert.h>

  7: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
  8: {
  9:   Mat             C = B;
 10:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 11:   IS              isrow = b->row, isicol = b->icol;
 12:   const PetscInt *r, *ic;
 13:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
 14:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
 15:   MatScalar      *rtmp, *pc, *mwork, *pv;
 16:   MatScalar      *aa             = a->a, *v;
 17:   PetscReal       shift          = info->shiftamount;
 18:   const PetscBool allowzeropivot = PetscNot(A->erroriffailure);

 20:   PetscFunctionBegin;
 21:   PetscCall(ISGetIndices(isrow, &r));
 22:   PetscCall(ISGetIndices(isicol, &ic));

 24:   /* generate work space needed by the factorization */
 25:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
 26:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

 28:   for (PetscInt i = 0; i < n; i++) {
 29:     /* zero rtmp */
 30:     /* L part */
 31:     PetscInt *pj;
 32:     PetscInt  nzL, nz = bi[i + 1] - bi[i];
 33:     bjtmp = bj + bi[i];
 34:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

 36:     /* U part */
 37:     nz    = bdiag[i] - bdiag[i + 1];
 38:     bjtmp = bj + bdiag[i + 1] + 1;
 39:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

 41:     /* load in initial (unfactored row) */
 42:     nz    = ai[r[i] + 1] - ai[r[i]];
 43:     ajtmp = aj + ai[r[i]];
 44:     v     = aa + bs2 * ai[r[i]];
 45:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

 47:     /* elimination */
 48:     bjtmp = bj + bi[i];
 49:     nzL   = bi[i + 1] - bi[i];
 50:     for (PetscInt k = 0; k < nzL; k++) {
 51:       PetscBool flg = PETSC_FALSE;
 52:       PetscInt  row = bjtmp[k];

 54:       pc = rtmp + bs2 * row;
 55:       for (PetscInt j = 0; j < bs2; j++) {
 56:         if (pc[j] != (PetscScalar)0.0) {
 57:           flg = PETSC_TRUE;
 58:           break;
 59:         }
 60:       }
 61:       if (flg) {
 62:         pv = b->a + bs2 * bdiag[row];
 63:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
 64:         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));

 66:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
 67:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
 68:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
 69:         for (PetscInt j = 0; j < nz; j++) {
 70:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
 71:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
 72:           v = rtmp + 4 * pj[j];
 73:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
 74:           pv += 4;
 75:         }
 76:         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
 77:       }
 78:     }

 80:     /* finished row so stick it into b->a */
 81:     /* L part */
 82:     pv = b->a + bs2 * bi[i];
 83:     pj = b->j + bi[i];
 84:     nz = bi[i + 1] - bi[i];
 85:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

 87:     /* Mark diagonal and invert diagonal for simpler triangular solves */
 88:     pv = b->a + bs2 * bdiag[i];
 89:     pj = b->j + bdiag[i];
 90:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
 91:     {
 92:       PetscBool zeropivotdetected;

 94:       PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
 95:       if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
 96:     }

 98:     /* U part */
 99:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
100:     pj = b->j + bdiag[i + 1] + 1;
101:     nz = bdiag[i] - bdiag[i + 1] - 1;
102:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103:   }

105:   PetscCall(PetscFree2(rtmp, mwork));
106:   PetscCall(ISRestoreIndices(isicol, &ic));
107:   PetscCall(ISRestoreIndices(isrow, &r));

109:   C->ops->solve          = MatSolve_SeqBAIJ_2;
110:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111:   C->assembled           = PETSC_TRUE;

113:   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114:   PetscFunctionReturn(PETSC_SUCCESS);
115: }

117: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118: {
119:   Mat             C = B;
120:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121:   PetscInt        i, j, k, nz, nzL, row, *pj;
122:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124:   MatScalar      *rtmp, *pc, *mwork, *pv;
125:   MatScalar      *aa = a->a, *v;
126:   PetscInt        flg;
127:   PetscReal       shift = info->shiftamount;
128:   PetscBool       allowzeropivot, zeropivotdetected;

130:   PetscFunctionBegin;
131:   allowzeropivot = PetscNot(A->erroriffailure);

133:   /* generate work space needed by the factorization */
134:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

137:   for (i = 0; i < n; i++) {
138:     /* zero rtmp */
139:     /* L part */
140:     nz    = bi[i + 1] - bi[i];
141:     bjtmp = bj + bi[i];
142:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

144:     /* U part */
145:     nz    = bdiag[i] - bdiag[i + 1];
146:     bjtmp = bj + bdiag[i + 1] + 1;
147:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

149:     /* load in initial (unfactored row) */
150:     nz    = ai[i + 1] - ai[i];
151:     ajtmp = aj + ai[i];
152:     v     = aa + bs2 * ai[i];
153:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

155:     /* elimination */
156:     bjtmp = bj + bi[i];
157:     nzL   = bi[i + 1] - bi[i];
158:     for (k = 0; k < nzL; k++) {
159:       row = bjtmp[k];
160:       pc  = rtmp + bs2 * row;
161:       for (flg = 0, j = 0; j < bs2; j++) {
162:         if (pc[j] != (PetscScalar)0.0) {
163:           flg = 1;
164:           break;
165:         }
166:       }
167:       if (flg) {
168:         pv = b->a + bs2 * bdiag[row];
169:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170:         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));

172:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
174:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175:         for (j = 0; j < nz; j++) {
176:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178:           v = rtmp + 4 * pj[j];
179:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180:           pv += 4;
181:         }
182:         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183:       }
184:     }

186:     /* finished row so stick it into b->a */
187:     /* L part */
188:     pv = b->a + bs2 * bi[i];
189:     pj = b->j + bi[i];
190:     nz = bi[i + 1] - bi[i];
191:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

193:     /* Mark diagonal and invert diagonal for simpler triangular solves */
194:     pv = b->a + bs2 * bdiag[i];
195:     pj = b->j + bdiag[i];
196:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197:     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

200:     /* U part */
201:     /*
202:     pv = b->a + bs2*bi[2*n-i];
203:     pj = b->j + bi[2*n-i];
204:     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205:     */
206:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
207:     pj = b->j + bdiag[i + 1] + 1;
208:     nz = bdiag[i] - bdiag[i + 1] - 1;
209:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210:   }
211:   PetscCall(PetscFree2(rtmp, mwork));

213:   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
214:   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215:   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217:   C->assembled           = PETSC_TRUE;

219:   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220:   PetscFunctionReturn(PETSC_SUCCESS);
221: }

223: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224: {
225:   Mat             C = B;
226:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227:   IS              isrow = b->row, isicol = b->icol;
228:   const PetscInt *r, *ic;
229:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230:   PetscInt       *ajtmpold, *ajtmp, nz, row;
231:   PetscInt        idx, *ai = a->i, *aj = a->j, *pj;
232:   const PetscInt *diag_offset;
233:   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234:   MatScalar       p1, p2, p3, p4;
235:   MatScalar      *ba = b->a, *aa = a->a;
236:   PetscReal       shift = info->shiftamount;
237:   PetscBool       allowzeropivot, zeropivotdetected;

239:   PetscFunctionBegin;
240:   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
241:   A->factortype = MAT_FACTOR_NONE;
242:   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
243:   A->factortype  = MAT_FACTOR_ILU;
244:   allowzeropivot = PetscNot(A->erroriffailure);
245:   PetscCall(ISGetIndices(isrow, &r));
246:   PetscCall(ISGetIndices(isicol, &ic));
247:   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));

249:   for (i = 0; i < n; i++) {
250:     nz    = bi[i + 1] - bi[i];
251:     ajtmp = bj + bi[i];
252:     for (j = 0; j < nz; j++) {
253:       x    = rtmp + 4 * ajtmp[j];
254:       x[0] = x[1] = x[2] = x[3] = 0.0;
255:     }
256:     /* load in initial (unfactored row) */
257:     idx      = r[i];
258:     nz       = ai[idx + 1] - ai[idx];
259:     ajtmpold = aj + ai[idx];
260:     v        = aa + 4 * ai[idx];
261:     for (j = 0; j < nz; j++) {
262:       x    = rtmp + 4 * ic[ajtmpold[j]];
263:       x[0] = v[0];
264:       x[1] = v[1];
265:       x[2] = v[2];
266:       x[3] = v[3];
267:       v += 4;
268:     }
269:     row = *ajtmp++;
270:     while (row < i) {
271:       pc = rtmp + 4 * row;
272:       p1 = pc[0];
273:       p2 = pc[1];
274:       p3 = pc[2];
275:       p4 = pc[3];
276:       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
277:         pv    = ba + 4 * diag_offset[row];
278:         pj    = bj + diag_offset[row] + 1;
279:         x1    = pv[0];
280:         x2    = pv[1];
281:         x3    = pv[2];
282:         x4    = pv[3];
283:         pc[0] = m1 = p1 * x1 + p3 * x2;
284:         pc[1] = m2 = p2 * x1 + p4 * x2;
285:         pc[2] = m3 = p1 * x3 + p3 * x4;
286:         pc[3] = m4 = p2 * x3 + p4 * x4;
287:         nz         = bi[row + 1] - diag_offset[row] - 1;
288:         pv += 4;
289:         for (j = 0; j < nz; j++) {
290:           x1 = pv[0];
291:           x2 = pv[1];
292:           x3 = pv[2];
293:           x4 = pv[3];
294:           x  = rtmp + 4 * pj[j];
295:           x[0] -= m1 * x1 + m3 * x2;
296:           x[1] -= m2 * x1 + m4 * x2;
297:           x[2] -= m1 * x3 + m3 * x4;
298:           x[3] -= m2 * x3 + m4 * x4;
299:           pv += 4;
300:         }
301:         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
302:       }
303:       row = *ajtmp++;
304:     }
305:     /* finished row so stick it into b->a */
306:     pv = ba + 4 * bi[i];
307:     pj = bj + bi[i];
308:     nz = bi[i + 1] - bi[i];
309:     for (j = 0; j < nz; j++) {
310:       x     = rtmp + 4 * pj[j];
311:       pv[0] = x[0];
312:       pv[1] = x[1];
313:       pv[2] = x[2];
314:       pv[3] = x[3];
315:       pv += 4;
316:     }
317:     /* invert diagonal block */
318:     w = ba + 4 * diag_offset[i];
319:     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
320:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
321:   }

323:   PetscCall(PetscFree(rtmp));
324:   PetscCall(ISRestoreIndices(isicol, &ic));
325:   PetscCall(ISRestoreIndices(isrow, &r));

327:   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
328:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
329:   C->assembled           = PETSC_TRUE;

331:   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
332:   PetscFunctionReturn(PETSC_SUCCESS);
333: }
334: /*
335:       Version for when blocks are 2 by 2 Using natural ordering
336: */
337: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
338: {
339:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
340:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
341:   PetscInt       *ajtmpold, *ajtmp, nz, row;
342:   PetscInt       *ai = a->i, *aj = a->j, *pj;
343:   const PetscInt *diag_offset;
344:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
345:   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
346:   MatScalar      *ba = b->a, *aa = a->a;
347:   PetscReal       shift = info->shiftamount;
348:   PetscBool       allowzeropivot, zeropivotdetected;

350:   PetscFunctionBegin;
351:   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
352:   A->factortype = MAT_FACTOR_NONE;
353:   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
354:   A->factortype  = MAT_FACTOR_ILU;
355:   allowzeropivot = PetscNot(A->erroriffailure);
356:   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
357:   for (i = 0; i < n; i++) {
358:     nz    = bi[i + 1] - bi[i];
359:     ajtmp = bj + bi[i];
360:     for (j = 0; j < nz; j++) {
361:       x    = rtmp + 4 * ajtmp[j];
362:       x[0] = x[1] = x[2] = x[3] = 0.0;
363:     }
364:     /* load in initial (unfactored row) */
365:     nz       = ai[i + 1] - ai[i];
366:     ajtmpold = aj + ai[i];
367:     v        = aa + 4 * ai[i];
368:     for (j = 0; j < nz; j++) {
369:       x    = rtmp + 4 * ajtmpold[j];
370:       x[0] = v[0];
371:       x[1] = v[1];
372:       x[2] = v[2];
373:       x[3] = v[3];
374:       v += 4;
375:     }
376:     row = *ajtmp++;
377:     while (row < i) {
378:       pc = rtmp + 4 * row;
379:       p1 = pc[0];
380:       p2 = pc[1];
381:       p3 = pc[2];
382:       p4 = pc[3];
383:       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
384:         pv    = ba + 4 * diag_offset[row];
385:         pj    = bj + diag_offset[row] + 1;
386:         x1    = pv[0];
387:         x2    = pv[1];
388:         x3    = pv[2];
389:         x4    = pv[3];
390:         pc[0] = m1 = p1 * x1 + p3 * x2;
391:         pc[1] = m2 = p2 * x1 + p4 * x2;
392:         pc[2] = m3 = p1 * x3 + p3 * x4;
393:         pc[3] = m4 = p2 * x3 + p4 * x4;
394:         nz         = bi[row + 1] - diag_offset[row] - 1;
395:         pv += 4;
396:         for (j = 0; j < nz; j++) {
397:           x1 = pv[0];
398:           x2 = pv[1];
399:           x3 = pv[2];
400:           x4 = pv[3];
401:           x  = rtmp + 4 * pj[j];
402:           x[0] -= m1 * x1 + m3 * x2;
403:           x[1] -= m2 * x1 + m4 * x2;
404:           x[2] -= m1 * x3 + m3 * x4;
405:           x[3] -= m2 * x3 + m4 * x4;
406:           pv += 4;
407:         }
408:         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
409:       }
410:       row = *ajtmp++;
411:     }
412:     /* finished row so stick it into b->a */
413:     pv = ba + 4 * bi[i];
414:     pj = bj + bi[i];
415:     nz = bi[i + 1] - bi[i];
416:     for (j = 0; j < nz; j++) {
417:       x     = rtmp + 4 * pj[j];
418:       pv[0] = x[0];
419:       pv[1] = x[1];
420:       pv[2] = x[2];
421:       pv[3] = x[3];
422:       /*
423:       printf(" col %d:",pj[j]);
424:       PetscInt j1;
425:       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
426:       printf("\n");
427:       */
428:       pv += 4;
429:     }
430:     /* invert diagonal block */
431:     w = ba + 4 * diag_offset[i];
432:     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
433:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
434:   }

436:   PetscCall(PetscFree(rtmp));

438:   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
439:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
440:   C->assembled           = PETSC_TRUE;

442:   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
443:   PetscFunctionReturn(PETSC_SUCCESS);
444: }

446: /*
447:      Version for when blocks are 1 by 1.
448: */
449: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
450: {
451:   Mat              C = B;
452:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
453:   IS               isrow = b->row, isicol = b->icol;
454:   const PetscInt  *r, *ic, *ics;
455:   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
456:   PetscInt         i, j, k, nz, nzL, row, *pj;
457:   const PetscInt  *ajtmp, *bjtmp;
458:   MatScalar       *rtmp, *pc, multiplier, *pv;
459:   const MatScalar *aa = a->a, *v;
460:   PetscBool        row_identity, col_identity;
461:   FactorShiftCtx   sctx;
462:   const PetscInt  *ddiag;
463:   PetscReal        rs;
464:   MatScalar        d;

466:   PetscFunctionBegin;
467:   /* MatPivotSetUp(): initialize shift context sctx */
468:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

470:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
471:     PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL));
472:     sctx.shift_top = info->zeropivot;
473:     for (i = 0; i < n; i++) {
474:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
475:       d  = (aa)[ddiag[i]];
476:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
477:       v  = aa + ai[i];
478:       nz = ai[i + 1] - ai[i];
479:       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
480:       if (rs > sctx.shift_top) sctx.shift_top = rs;
481:     }
482:     sctx.shift_top *= 1.1;
483:     sctx.nshift_max = 5;
484:     sctx.shift_lo   = 0.;
485:     sctx.shift_hi   = 1.;
486:   }

488:   PetscCall(ISGetIndices(isrow, &r));
489:   PetscCall(ISGetIndices(isicol, &ic));
490:   PetscCall(PetscMalloc1(n + 1, &rtmp));
491:   ics = ic;

493:   do {
494:     sctx.newshift = PETSC_FALSE;
495:     for (i = 0; i < n; i++) {
496:       /* zero rtmp */
497:       /* L part */
498:       nz    = bi[i + 1] - bi[i];
499:       bjtmp = bj + bi[i];
500:       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

502:       /* U part */
503:       nz    = bdiag[i] - bdiag[i + 1];
504:       bjtmp = bj + bdiag[i + 1] + 1;
505:       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

507:       /* load in initial (unfactored row) */
508:       nz    = ai[r[i] + 1] - ai[r[i]];
509:       ajtmp = aj + ai[r[i]];
510:       v     = aa + ai[r[i]];
511:       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];

513:       /* ZeropivotApply() */
514:       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

516:       /* elimination */
517:       bjtmp = bj + bi[i];
518:       row   = *bjtmp++;
519:       nzL   = bi[i + 1] - bi[i];
520:       for (k = 0; k < nzL; k++) {
521:         pc = rtmp + row;
522:         if (*pc != (PetscScalar)0.0) {
523:           pv         = b->a + bdiag[row];
524:           multiplier = *pc * (*pv);
525:           *pc        = multiplier;

527:           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
528:           pv = b->a + bdiag[row + 1] + 1;
529:           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
530:           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
531:           PetscCall(PetscLogFlops(2.0 * nz));
532:         }
533:         row = *bjtmp++;
534:       }

536:       /* finished row so stick it into b->a */
537:       rs = 0.0;
538:       /* L part */
539:       pv = b->a + bi[i];
540:       pj = b->j + bi[i];
541:       nz = bi[i + 1] - bi[i];
542:       for (j = 0; j < nz; j++) {
543:         pv[j] = rtmp[pj[j]];
544:         rs += PetscAbsScalar(pv[j]);
545:       }

547:       /* U part */
548:       pv = b->a + bdiag[i + 1] + 1;
549:       pj = b->j + bdiag[i + 1] + 1;
550:       nz = bdiag[i] - bdiag[i + 1] - 1;
551:       for (j = 0; j < nz; j++) {
552:         pv[j] = rtmp[pj[j]];
553:         rs += PetscAbsScalar(pv[j]);
554:       }

556:       sctx.rs = rs;
557:       sctx.pv = rtmp[i];
558:       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
559:       if (sctx.newshift) break; /* break for-loop */
560:       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */

562:       /* Mark diagonal and invert diagonal for simpler triangular solves */
563:       pv  = b->a + bdiag[i];
564:       *pv = (PetscScalar)1.0 / rtmp[i];

566:     } /* endof for (i=0; i<n; i++) { */

568:     /* MatPivotRefine() */
569:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
570:       /*
571:        * if no shift in this attempt & shifting & started shifting & can refine,
572:        * then try lower shift
573:        */
574:       sctx.shift_hi       = sctx.shift_fraction;
575:       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
576:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
577:       sctx.newshift       = PETSC_TRUE;
578:       sctx.nshift++;
579:     }
580:   } while (sctx.newshift);

582:   PetscCall(PetscFree(rtmp));
583:   PetscCall(ISRestoreIndices(isicol, &ic));
584:   PetscCall(ISRestoreIndices(isrow, &r));

586:   PetscCall(ISIdentity(isrow, &row_identity));
587:   PetscCall(ISIdentity(isicol, &col_identity));
588:   if (row_identity && col_identity) {
589:     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
590:     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
591:     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
592:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
593:   } else {
594:     C->ops->solve          = MatSolve_SeqBAIJ_1;
595:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
596:   }
597:   C->assembled = PETSC_TRUE;
598:   PetscCall(PetscLogFlops(C->cmap->n));

600:   /* MatShiftView(A,info,&sctx) */
601:   if (sctx.nshift) {
602:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
603:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
604:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
605:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
606:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
607:       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
608:     }
609:   }
610:   PetscFunctionReturn(PETSC_SUCCESS);
611: }

613: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
614: {
615:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
616:   IS              isrow = b->row, isicol = b->icol;
617:   const PetscInt *r, *ic;
618:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
619:   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
620:   PetscInt        diag, *pj;
621:   const PetscInt *diag_offset;
622:   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
623:   MatScalar      *ba = b->a, *aa = a->a;
624:   PetscBool       row_identity, col_identity;

626:   PetscFunctionBegin;
627:   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
628:   A->factortype = MAT_FACTOR_NONE;
629:   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
630:   A->factortype = MAT_FACTOR_ILU;
631:   PetscCall(ISGetIndices(isrow, &r));
632:   PetscCall(ISGetIndices(isicol, &ic));
633:   PetscCall(PetscMalloc1(n + 1, &rtmp));

635:   for (i = 0; i < n; i++) {
636:     nz    = bi[i + 1] - bi[i];
637:     ajtmp = bj + bi[i];
638:     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;

640:     /* load in initial (unfactored row) */
641:     nz       = ai[r[i] + 1] - ai[r[i]];
642:     ajtmpold = aj + ai[r[i]];
643:     v        = aa + ai[r[i]];
644:     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];

646:     row = *ajtmp++;
647:     while (row < i) {
648:       pc = rtmp + row;
649:       if (*pc != 0.0) {
650:         pv         = ba + diag_offset[row];
651:         pj         = bj + diag_offset[row] + 1;
652:         multiplier = *pc * *pv++;
653:         *pc        = multiplier;
654:         nz         = bi[row + 1] - diag_offset[row] - 1;
655:         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
656:         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
657:       }
658:       row = *ajtmp++;
659:     }
660:     /* finished row so stick it into b->a */
661:     pv = ba + bi[i];
662:     pj = bj + bi[i];
663:     nz = bi[i + 1] - bi[i];
664:     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
665:     diag = diag_offset[i] - bi[i];
666:     /* check pivot entry for current row */
667:     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
668:     pv[diag] = 1.0 / pv[diag];
669:   }

671:   PetscCall(PetscFree(rtmp));
672:   PetscCall(ISRestoreIndices(isicol, &ic));
673:   PetscCall(ISRestoreIndices(isrow, &r));
674:   PetscCall(ISIdentity(isrow, &row_identity));
675:   PetscCall(ISIdentity(isicol, &col_identity));
676:   if (row_identity && col_identity) {
677:     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
678:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
679:   } else {
680:     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
681:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
682:   }
683:   C->assembled = PETSC_TRUE;
684:   PetscCall(PetscLogFlops(C->cmap->n));
685:   PetscFunctionReturn(PETSC_SUCCESS);
686: }

688: static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
689: {
690:   PetscFunctionBegin;
691:   *type = MATSOLVERPETSC;
692:   PetscFunctionReturn(PETSC_SUCCESS);
693: }

695: PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
696: {
697:   PetscInt n = A->rmap->n;

699:   PetscFunctionBegin;
700: #if defined(PETSC_USE_COMPLEX)
701:   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
702: #endif
703:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
704:   PetscCall(MatSetSizes(*B, n, n, n, n));
705:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
706:     PetscCall(MatSetType(*B, MATSEQBAIJ));

708:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
709:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
710:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
711:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
712:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
713:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
714:     PetscCall(MatSetType(*B, MATSEQSBAIJ));
715:     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));

717:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
718:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
719:     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
720:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
721:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
722:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
723:   (*B)->factortype     = ftype;
724:   (*B)->canuseordering = PETSC_TRUE;

726:   PetscCall(PetscFree((*B)->solvertype));
727:   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
728:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
729:   PetscFunctionReturn(PETSC_SUCCESS);
730: }

732: PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
733: {
734:   Mat C;

736:   PetscFunctionBegin;
737:   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
738:   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
739:   PetscCall(MatLUFactorNumeric(C, A, info));

741:   A->ops->solve          = C->ops->solve;
742:   A->ops->solvetranspose = C->ops->solvetranspose;

744:   PetscCall(MatHeaderMerge(A, &C));
745:   PetscFunctionReturn(PETSC_SUCCESS);
746: }

748: #include <../src/mat/impls/sbaij/seq/sbaij.h>
749: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
750: {
751:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
752:   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
753:   IS              ip = b->row;
754:   const PetscInt *rip;
755:   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
756:   PetscInt       *ai = a->i, *aj = a->j;
757:   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
758:   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
759:   PetscReal       rs;
760:   FactorShiftCtx  sctx;

762:   PetscFunctionBegin;
763:   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
764:     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
765:     PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
766:     PetscCall(MatDestroy(&a->sbaijMat));
767:     PetscFunctionReturn(PETSC_SUCCESS);
768:   }

770:   /* MatPivotSetUp(): initialize shift context sctx */
771:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

773:   PetscCall(ISGetIndices(ip, &rip));
774:   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));

776:   sctx.shift_amount = 0.;
777:   sctx.nshift       = 0;
778:   do {
779:     sctx.newshift = PETSC_FALSE;
780:     for (i = 0; i < mbs; i++) {
781:       rtmp[i] = 0.0;
782:       jl[i]   = mbs;
783:       il[0]   = 0;
784:     }

786:     for (k = 0; k < mbs; k++) {
787:       bval = ba + bi[k];
788:       /* initialize k-th row by the perm[k]-th row of A */
789:       jmin = ai[rip[k]];
790:       jmax = ai[rip[k] + 1];
791:       for (j = jmin; j < jmax; j++) {
792:         col = rip[aj[j]];
793:         if (col >= k) { /* only take upper triangular entry */
794:           rtmp[col] = aa[j];
795:           *bval++   = 0.0; /* for in-place factorization */
796:         }
797:       }

799:       /* shift the diagonal of the matrix */
800:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

802:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
803:       dk = rtmp[k];
804:       i  = jl[k]; /* first row to be added to k_th row  */

806:       while (i < k) {
807:         nexti = jl[i]; /* next row to be added to k_th row */

809:         /* compute multiplier, update diag(k) and U(i,k) */
810:         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
811:         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
812:         dk += uikdi * ba[ili];
813:         ba[ili] = uikdi; /* -U(i,k) */

815:         /* add multiple of row i to k-th row */
816:         jmin = ili + 1;
817:         jmax = bi[i + 1];
818:         if (jmin < jmax) {
819:           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
820:           /* update il and jl for row i */
821:           il[i] = jmin;
822:           j     = bj[jmin];
823:           jl[i] = jl[j];
824:           jl[j] = i;
825:         }
826:         i = nexti;
827:       }

829:       /* shift the diagonals when zero pivot is detected */
830:       /* compute rs=sum of abs(off-diagonal) */
831:       rs   = 0.0;
832:       jmin = bi[k] + 1;
833:       nz   = bi[k + 1] - jmin;
834:       if (nz) {
835:         bcol = bj + jmin;
836:         while (nz--) {
837:           rs += PetscAbsScalar(rtmp[*bcol]);
838:           bcol++;
839:         }
840:       }

842:       sctx.rs = rs;
843:       sctx.pv = dk;
844:       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
845:       if (sctx.newshift) break;
846:       dk = sctx.pv;

848:       /* copy data into U(k,:) */
849:       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
850:       jmin      = bi[k] + 1;
851:       jmax      = bi[k + 1];
852:       if (jmin < jmax) {
853:         for (j = jmin; j < jmax; j++) {
854:           col       = bj[j];
855:           ba[j]     = rtmp[col];
856:           rtmp[col] = 0.0;
857:         }
858:         /* add the k-th row into il and jl */
859:         il[k] = jmin;
860:         i     = bj[jmin];
861:         jl[k] = jl[i];
862:         jl[i] = k;
863:       }
864:     }
865:   } while (sctx.newshift);
866:   PetscCall(PetscFree3(rtmp, il, jl));

868:   PetscCall(ISRestoreIndices(ip, &rip));

870:   C->assembled    = PETSC_TRUE;
871:   C->preallocated = PETSC_TRUE;

873:   PetscCall(PetscLogFlops(C->rmap->N));
874:   if (sctx.nshift) {
875:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
876:       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
877:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
878:       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
879:     }
880:   }
881:   PetscFunctionReturn(PETSC_SUCCESS);
882: }

884: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
885: {
886:   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
887:   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
888:   PetscInt       i, j, am = a->mbs;
889:   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
890:   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
891:   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
892:   PetscReal      rs;
893:   FactorShiftCtx sctx;

895:   PetscFunctionBegin;
896:   /* MatPivotSetUp(): initialize shift context sctx */
897:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

899:   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));

901:   do {
902:     sctx.newshift = PETSC_FALSE;
903:     for (i = 0; i < am; i++) {
904:       rtmp[i] = 0.0;
905:       jl[i]   = am;
906:       il[0]   = 0;
907:     }

909:     for (k = 0; k < am; k++) {
910:       /* initialize k-th row with elements nonzero in row perm(k) of A */
911:       nz   = ai[k + 1] - ai[k];
912:       acol = aj + ai[k];
913:       aval = aa + ai[k];
914:       bval = ba + bi[k];
915:       while (nz--) {
916:         if (*acol < k) { /* skip lower triangular entries */
917:           acol++;
918:           aval++;
919:         } else {
920:           rtmp[*acol++] = *aval++;
921:           *bval++       = 0.0; /* for in-place factorization */
922:         }
923:       }

925:       /* shift the diagonal of the matrix */
926:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

928:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
929:       dk = rtmp[k];
930:       i  = jl[k]; /* first row to be added to k_th row  */

932:       while (i < k) {
933:         nexti = jl[i]; /* next row to be added to k_th row */
934:         /* compute multiplier, update D(k) and U(i,k) */
935:         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
936:         uikdi = -ba[ili] * ba[bi[i]];
937:         dk += uikdi * ba[ili];
938:         ba[ili] = uikdi; /* -U(i,k) */

940:         /* add multiple of row i to k-th row ... */
941:         jmin = ili + 1;
942:         nz   = bi[i + 1] - jmin;
943:         if (nz > 0) {
944:           bcol = bj + jmin;
945:           bval = ba + jmin;
946:           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
947:           /* update il and jl for i-th row */
948:           il[i] = jmin;
949:           j     = bj[jmin];
950:           jl[i] = jl[j];
951:           jl[j] = i;
952:         }
953:         i = nexti;
954:       }

956:       /* shift the diagonals when zero pivot is detected */
957:       /* compute rs=sum of abs(off-diagonal) */
958:       rs   = 0.0;
959:       jmin = bi[k] + 1;
960:       nz   = bi[k + 1] - jmin;
961:       if (nz) {
962:         bcol = bj + jmin;
963:         while (nz--) {
964:           rs += PetscAbsScalar(rtmp[*bcol]);
965:           bcol++;
966:         }
967:       }

969:       sctx.rs = rs;
970:       sctx.pv = dk;
971:       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
972:       if (sctx.newshift) break; /* sctx.shift_amount is updated */
973:       dk = sctx.pv;

975:       /* copy data into U(k,:) */
976:       ba[bi[k]] = 1.0 / dk;
977:       jmin      = bi[k] + 1;
978:       nz        = bi[k + 1] - jmin;
979:       if (nz) {
980:         bcol = bj + jmin;
981:         bval = ba + jmin;
982:         while (nz--) {
983:           *bval++       = rtmp[*bcol];
984:           rtmp[*bcol++] = 0.0;
985:         }
986:         /* add k-th row into il and jl */
987:         il[k] = jmin;
988:         i     = bj[jmin];
989:         jl[k] = jl[i];
990:         jl[i] = k;
991:       }
992:     }
993:   } while (sctx.newshift);
994:   PetscCall(PetscFree3(rtmp, il, jl));

996:   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
997:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
998:   C->assembled           = PETSC_TRUE;
999:   C->preallocated        = PETSC_TRUE;

1001:   PetscCall(PetscLogFlops(C->rmap->N));
1002:   if (sctx.nshift) {
1003:     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1004:       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1005:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1006:       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1007:     }
1008:   }
1009:   PetscFunctionReturn(PETSC_SUCCESS);
1010: }

1012: #include <petscbt.h>
1013: #include <../src/mat/utils/freespace.h>
1014: PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1015: {
1016:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1017:   Mat_SeqSBAIJ      *b;
1018:   Mat                B;
1019:   PetscBool          perm_identity;
1020:   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1021:   const PetscInt    *rip;
1022:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1023:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1024:   PetscReal          fill = info->fill, levels = info->levels;
1025:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1026:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1027:   PetscBT            lnkbt;
1028:   PetscBool          diagDense;
1029:   const PetscInt    *adiag;

1031:   PetscFunctionBegin;
1032:   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
1033:   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");

1035:   if (bs > 1) {
1036:     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1037:     fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

1039:     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1040:     PetscFunctionReturn(PETSC_SUCCESS);
1041:   }

1043:   PetscCall(ISIdentity(perm, &perm_identity));
1044:   PetscCall(ISGetIndices(perm, &rip));

1046:   /* special case that simply copies fill pattern */
1047:   if (!levels && perm_identity) {
1048:     PetscCall(PetscMalloc1(am + 1, &ui));
1049:     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */
1050:     B = fact;
1051:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));

1053:     b  = (Mat_SeqSBAIJ *)B->data;
1054:     uj = b->j;
1055:     for (i = 0; i < am; i++) {
1056:       aj = a->j + adiag[i];
1057:       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1058:       b->ilen[i] = ui[i];
1059:     }
1060:     PetscCall(PetscFree(ui));

1062:     B->factortype = MAT_FACTOR_NONE;

1064:     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1065:     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1066:     B->factortype = MAT_FACTOR_ICC;

1068:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1069:     PetscFunctionReturn(PETSC_SUCCESS);
1070:   }

1072:   /* initialization */
1073:   PetscCall(PetscMalloc1(am + 1, &ui));
1074:   ui[0] = 0;
1075:   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));

1077:   /* jl: linked list for storing indices of the pivot rows
1078:      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1079:   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1080:   for (i = 0; i < am; i++) {
1081:     jl[i] = am;
1082:     il[i] = 0;
1083:   }

1085:   /* create and initialize a linked list for storing column indices of the active row k */
1086:   nlnk = am + 1;
1087:   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));

1089:   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1090:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));

1092:   current_space = free_space;

1094:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1095:   current_space_lvl = free_space_lvl;

1097:   for (k = 0; k < am; k++) { /* for each active row k */
1098:     /* initialize lnk by the column indices of row rip[k] of A */
1099:     nzk         = 0;
1100:     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1101:     ncols_upper = 0;
1102:     cols        = cols_lvl + am;
1103:     for (j = 0; j < ncols; j++) {
1104:       i = rip[*(aj + ai[rip[k]] + j)];
1105:       if (i >= k) { /* only take upper triangular entry */
1106:         cols[ncols_upper]     = i;
1107:         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1108:         ncols_upper++;
1109:       }
1110:     }
1111:     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1112:     nzk += nlnk;

1114:     /* update lnk by computing fill-in for each pivot row to be merged in */
1115:     prow = jl[k]; /* 1st pivot row */

1117:     while (prow < k) {
1118:       nextprow = jl[prow];

1120:       /* merge prow into k-th row */
1121:       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1122:       jmax  = ui[prow + 1];
1123:       ncols = jmax - jmin;
1124:       i     = jmin - ui[prow];
1125:       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1126:       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1127:       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1128:       nzk += nlnk;

1130:       /* update il and jl for prow */
1131:       if (jmin < jmax) {
1132:         il[prow] = jmin;

1134:         j        = *cols;
1135:         jl[prow] = jl[j];
1136:         jl[j]    = prow;
1137:       }
1138:       prow = nextprow;
1139:     }

1141:     /* if free space is not available, make more free space */
1142:     if (current_space->local_remaining < nzk) {
1143:       i = am - k + 1;                                                             /* num of unfactored rows */
1144:       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1145:       PetscCall(PetscFreeSpaceGet(i, &current_space));
1146:       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1147:       reallocs++;
1148:     }

1150:     /* copy data into free_space and free_space_lvl, then initialize lnk */
1151:     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));

1153:     /* add the k-th row into il and jl */
1154:     if (nzk - 1 > 0) {
1155:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1156:       jl[k] = jl[i];
1157:       jl[i] = k;
1158:       il[k] = ui[k] + 1;
1159:     }
1160:     uj_ptr[k]     = current_space->array;
1161:     uj_lvl_ptr[k] = current_space_lvl->array;

1163:     current_space->array += nzk;
1164:     current_space->local_used += nzk;
1165:     current_space->local_remaining -= nzk;

1167:     current_space_lvl->array += nzk;
1168:     current_space_lvl->local_used += nzk;
1169:     current_space_lvl->local_remaining -= nzk;

1171:     ui[k + 1] = ui[k] + nzk;
1172:   }

1174:   PetscCall(ISRestoreIndices(perm, &rip));
1175:   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1176:   PetscCall(PetscFree(cols_lvl));

1178:   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1179:   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1180:   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1181:   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1182:   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));

1184:   /* put together the new matrix in MATSEQSBAIJ format */
1185:   B = fact;
1186:   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));

1188:   b          = (Mat_SeqSBAIJ *)B->data;
1189:   b->free_a  = PETSC_TRUE;
1190:   b->free_ij = PETSC_TRUE;

1192:   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));

1194:   b->j             = uj;
1195:   b->i             = ui;
1196:   b->diag          = NULL;
1197:   b->ilen          = NULL;
1198:   b->imax          = NULL;
1199:   b->row           = perm;
1200:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

1202:   PetscCall(PetscObjectReference((PetscObject)perm));

1204:   b->icol = perm;

1206:   PetscCall(PetscObjectReference((PetscObject)perm));
1207:   PetscCall(PetscMalloc1(am + 1, &b->solve_work));

1209:   b->maxnz = b->nz = ui[am];

1211:   B->info.factor_mallocs   = reallocs;
1212:   B->info.fill_ratio_given = fill;
1213:   if (ai[am] != 0.) {
1214:     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1215:     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1216:   } else {
1217:     B->info.fill_ratio_needed = 0.0;
1218:   }
1219: #if defined(PETSC_USE_INFO)
1220:   if (ai[am] != 0) {
1221:     PetscReal af = B->info.fill_ratio_needed;
1222:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1223:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1224:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1225:   } else {
1226:     PetscCall(PetscInfo(A, "Empty matrix\n"));
1227:   }
1228: #endif
1229:   if (perm_identity) {
1230:     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1231:     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1232:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1233:   } else {
1234:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1235:   }
1236:   PetscFunctionReturn(PETSC_SUCCESS);
1237: }

1239: PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1240: {
1241:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1242:   Mat_SeqSBAIJ      *b;
1243:   Mat                B;
1244:   PetscBool          perm_identity;
1245:   PetscReal          fill = info->fill;
1246:   const PetscInt    *rip;
1247:   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1248:   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1249:   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1250:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1251:   PetscBT            lnkbt;
1252:   PetscBool          diagDense;

1254:   PetscFunctionBegin;
1255:   if (bs > 1) { /* convert to seqsbaij */
1256:     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1257:     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

1259:     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1260:     PetscFunctionReturn(PETSC_SUCCESS);
1261:   }
1262:   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense));
1263:   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");

1265:   /* check whether perm is the identity mapping */
1266:   PetscCall(ISIdentity(perm, &perm_identity));
1267:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1268:   PetscCall(ISGetIndices(perm, &rip));

1270:   /* initialization */
1271:   PetscCall(PetscMalloc1(mbs + 1, &ui));
1272:   ui[0] = 0;

1274:   /* jl: linked list for storing indices of the pivot rows
1275:      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1276:   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1277:   for (i = 0; i < mbs; i++) {
1278:     jl[i] = mbs;
1279:     il[i] = 0;
1280:   }

1282:   /* create and initialize a linked list for storing column indices of the active row k */
1283:   nlnk = mbs + 1;
1284:   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));

1286:   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1287:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));

1289:   current_space = free_space;

1291:   for (k = 0; k < mbs; k++) { /* for each active row k */
1292:     /* initialize lnk by the column indices of row rip[k] of A */
1293:     nzk         = 0;
1294:     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1295:     ncols_upper = 0;
1296:     for (j = 0; j < ncols; j++) {
1297:       i = rip[*(aj + ai[rip[k]] + j)];
1298:       if (i >= k) { /* only take upper triangular entry */
1299:         cols[ncols_upper] = i;
1300:         ncols_upper++;
1301:       }
1302:     }
1303:     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1304:     nzk += nlnk;

1306:     /* update lnk by computing fill-in for each pivot row to be merged in */
1307:     prow = jl[k]; /* 1st pivot row */

1309:     while (prow < k) {
1310:       nextprow = jl[prow];
1311:       /* merge prow into k-th row */
1312:       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1313:       jmax   = ui[prow + 1];
1314:       ncols  = jmax - jmin;
1315:       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1316:       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1317:       nzk += nlnk;

1319:       /* update il and jl for prow */
1320:       if (jmin < jmax) {
1321:         il[prow] = jmin;
1322:         j        = *uj_ptr;
1323:         jl[prow] = jl[j];
1324:         jl[j]    = prow;
1325:       }
1326:       prow = nextprow;
1327:     }

1329:     /* if free space is not available, make more free space */
1330:     if (current_space->local_remaining < nzk) {
1331:       i = mbs - k + 1;                                                            /* num of unfactored rows */
1332:       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1333:       PetscCall(PetscFreeSpaceGet(i, &current_space));
1334:       reallocs++;
1335:     }

1337:     /* copy data into free space, then initialize lnk */
1338:     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));

1340:     /* add the k-th row into il and jl */
1341:     if (nzk - 1 > 0) {
1342:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1343:       jl[k] = jl[i];
1344:       jl[i] = k;
1345:       il[k] = ui[k] + 1;
1346:     }
1347:     ui_ptr[k] = current_space->array;
1348:     current_space->array += nzk;
1349:     current_space->local_used += nzk;
1350:     current_space->local_remaining -= nzk;

1352:     ui[k + 1] = ui[k] + nzk;
1353:   }

1355:   PetscCall(ISRestoreIndices(perm, &rip));
1356:   PetscCall(PetscFree4(ui_ptr, il, jl, cols));

1358:   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1359:   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1360:   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1361:   PetscCall(PetscLLDestroy(lnk, lnkbt));

1363:   /* put together the new matrix in MATSEQSBAIJ format */
1364:   B = fact;
1365:   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));

1367:   b          = (Mat_SeqSBAIJ *)B->data;
1368:   b->free_a  = PETSC_TRUE;
1369:   b->free_ij = PETSC_TRUE;

1371:   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));

1373:   b->j             = uj;
1374:   b->i             = ui;
1375:   b->diag          = NULL;
1376:   b->ilen          = NULL;
1377:   b->imax          = NULL;
1378:   b->row           = perm;
1379:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

1381:   PetscCall(PetscObjectReference((PetscObject)perm));
1382:   b->icol = perm;
1383:   PetscCall(PetscObjectReference((PetscObject)perm));
1384:   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1385:   b->maxnz = b->nz = ui[mbs];

1387:   B->info.factor_mallocs   = reallocs;
1388:   B->info.fill_ratio_given = fill;
1389:   if (ai[mbs] != 0.) {
1390:     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1391:     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1392:   } else {
1393:     B->info.fill_ratio_needed = 0.0;
1394:   }
1395: #if defined(PETSC_USE_INFO)
1396:   if (ai[mbs] != 0.) {
1397:     PetscReal af = B->info.fill_ratio_needed;
1398:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1399:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1400:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1401:   } else {
1402:     PetscCall(PetscInfo(A, "Empty matrix\n"));
1403:   }
1404: #endif
1405:   if (perm_identity) {
1406:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1407:   } else {
1408:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1409:   }
1410:   PetscFunctionReturn(PETSC_SUCCESS);
1411: }

1413: PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1414: {
1415:   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1416:   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1417:   PetscInt           i, k, n                       = a->mbs;
1418:   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1419:   const MatScalar   *aa = a->a, *v;
1420:   PetscScalar       *x, *s, *t, *ls;
1421:   const PetscScalar *b;

1423:   PetscFunctionBegin;
1424:   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1425:   PetscCall(VecGetArrayRead(bb, &b));
1426:   PetscCall(VecGetArray(xx, &x));
1427:   t = a->solve_work;

1429:   /* forward solve the lower triangular */
1430:   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */

1432:   for (i = 1; i < n; i++) {
1433:     v  = aa + bs2 * ai[i];
1434:     vi = aj + ai[i];
1435:     nz = ai[i + 1] - ai[i];
1436:     s  = t + bs * i;
1437:     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1438:     for (k = 0; k < nz; k++) {
1439:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1440:       v += bs2;
1441:     }
1442:   }

1444:   /* backward solve the upper triangular */
1445:   ls = a->solve_work + A->cmap->n;
1446:   for (i = n - 1; i >= 0; i--) {
1447:     v  = aa + bs2 * (adiag[i + 1] + 1);
1448:     vi = aj + adiag[i + 1] + 1;
1449:     nz = adiag[i] - adiag[i + 1] - 1;
1450:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1451:     for (k = 0; k < nz; k++) {
1452:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1453:       v += bs2;
1454:     }
1455:     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1456:     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1457:   }

1459:   PetscCall(VecRestoreArrayRead(bb, &b));
1460:   PetscCall(VecRestoreArray(xx, &x));
1461:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1462:   PetscFunctionReturn(PETSC_SUCCESS);
1463: }

1465: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1466: {
1467:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1468:   IS                 iscol = a->col, isrow = a->row;
1469:   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1470:   PetscInt           i, m, n = a->mbs;
1471:   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1472:   const MatScalar   *aa = a->a, *v;
1473:   PetscScalar       *x, *s, *t, *ls;
1474:   const PetscScalar *b;

1476:   PetscFunctionBegin;
1477:   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1478:   PetscCall(VecGetArrayRead(bb, &b));
1479:   PetscCall(VecGetArray(xx, &x));
1480:   t = a->solve_work;

1482:   PetscCall(ISGetIndices(isrow, &rout));
1483:   r = rout;
1484:   PetscCall(ISGetIndices(iscol, &cout));
1485:   c = cout;

1487:   /* forward solve the lower triangular */
1488:   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1489:   for (i = 1; i < n; i++) {
1490:     v  = aa + bs2 * ai[i];
1491:     vi = aj + ai[i];
1492:     nz = ai[i + 1] - ai[i];
1493:     s  = t + bs * i;
1494:     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1495:     for (m = 0; m < nz; m++) {
1496:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1497:       v += bs2;
1498:     }
1499:   }

1501:   /* backward solve the upper triangular */
1502:   ls = a->solve_work + A->cmap->n;
1503:   for (i = n - 1; i >= 0; i--) {
1504:     v  = aa + bs2 * (adiag[i + 1] + 1);
1505:     vi = aj + adiag[i + 1] + 1;
1506:     nz = adiag[i] - adiag[i + 1] - 1;
1507:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1508:     for (m = 0; m < nz; m++) {
1509:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1510:       v += bs2;
1511:     }
1512:     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1513:     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1514:   }
1515:   PetscCall(ISRestoreIndices(isrow, &rout));
1516:   PetscCall(ISRestoreIndices(iscol, &cout));
1517:   PetscCall(VecRestoreArrayRead(bb, &b));
1518:   PetscCall(VecRestoreArray(xx, &x));
1519:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1520:   PetscFunctionReturn(PETSC_SUCCESS);
1521: }