Actual source code: baijfact.c


  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc/private/kernels/blockinvert.h>

  8: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
  9: {
 10:   Mat             C = B;
 11:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 12:   IS              isrow = b->row, isicol = b->icol;
 13:   const PetscInt *r, *ic;
 14:   PetscInt        i, j, k, nz, nzL, row, *pj;
 15:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
 16:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
 17:   MatScalar      *rtmp, *pc, *mwork, *pv;
 18:   MatScalar      *aa = a->a, *v;
 19:   PetscInt        flg;
 20:   PetscReal       shift = info->shiftamount;
 21:   PetscBool       allowzeropivot, zeropivotdetected;

 23:   ISGetIndices(isrow, &r);
 24:   ISGetIndices(isicol, &ic);
 25:   allowzeropivot = PetscNot(A->erroriffailure);

 27:   /* generate work space needed by the factorization */
 28:   PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
 29:   PetscArrayzero(rtmp, bs2 * n);

 31:   for (i = 0; i < n; i++) {
 32:     /* zero rtmp */
 33:     /* L part */
 34:     nz    = bi[i + 1] - bi[i];
 35:     bjtmp = bj + bi[i];
 36:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

 38:     /* U part */
 39:     nz    = bdiag[i] - bdiag[i + 1];
 40:     bjtmp = bj + bdiag[i + 1] + 1;
 41:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

 43:     /* load in initial (unfactored row) */
 44:     nz    = ai[r[i] + 1] - ai[r[i]];
 45:     ajtmp = aj + ai[r[i]];
 46:     v     = aa + bs2 * ai[r[i]];
 47:     for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2);

 49:     /* elimination */
 50:     bjtmp = bj + bi[i];
 51:     nzL   = bi[i + 1] - bi[i];
 52:     for (k = 0; k < nzL; k++) {
 53:       row = bjtmp[k];
 54:       pc  = rtmp + bs2 * row;
 55:       for (flg = 0, j = 0; j < bs2; j++) {
 56:         if (pc[j] != (PetscScalar)0.0) {
 57:           flg = 1;
 58:           break;
 59:         }
 60:       }
 61:       if (flg) {
 62:         pv = b->a + bs2 * bdiag[row];
 63:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
 64:         PetscKernel_A_gets_A_times_B_2(pc, pv, mwork);

 66:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
 67:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
 68:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
 69:         for (j = 0; j < nz; j++) {
 70:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
 71:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
 72:           v = rtmp + 4 * pj[j];
 73:           PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv);
 74:           pv += 4;
 75:         }
 76:         PetscLogFlops(16.0 * nz + 12); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
 77:       }
 78:     }

 80:     /* finished row so stick it into b->a */
 81:     /* L part */
 82:     pv = b->a + bs2 * bi[i];
 83:     pj = b->j + bi[i];
 84:     nz = bi[i + 1] - bi[i];
 85:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);

 87:     /* Mark diagonal and invert diagonal for simpler triangular solves */
 88:     pv = b->a + bs2 * bdiag[i];
 89:     pj = b->j + bdiag[i];
 90:     PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
 91:     PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected);
 92:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

 94:     /* U part */
 95:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
 96:     pj = b->j + bdiag[i + 1] + 1;
 97:     nz = bdiag[i] - bdiag[i + 1] - 1;
 98:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
 99:   }

101:   PetscFree2(rtmp, mwork);
102:   ISRestoreIndices(isicol, &ic);
103:   ISRestoreIndices(isrow, &r);

105:   C->ops->solve          = MatSolve_SeqBAIJ_2;
106:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
107:   C->assembled           = PETSC_TRUE;

109:   PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n); /* from inverting diagonal blocks */
110:   return 0;
111: }

113: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
114: {
115:   Mat             C = B;
116:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
117:   PetscInt        i, j, k, nz, nzL, row, *pj;
118:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
119:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
120:   MatScalar      *rtmp, *pc, *mwork, *pv;
121:   MatScalar      *aa = a->a, *v;
122:   PetscInt        flg;
123:   PetscReal       shift = info->shiftamount;
124:   PetscBool       allowzeropivot, zeropivotdetected;

126:   allowzeropivot = PetscNot(A->erroriffailure);

128:   /* generate work space needed by the factorization */
129:   PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
130:   PetscArrayzero(rtmp, bs2 * n);

132:   for (i = 0; i < n; i++) {
133:     /* zero rtmp */
134:     /* L part */
135:     nz    = bi[i + 1] - bi[i];
136:     bjtmp = bj + bi[i];
137:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

139:     /* U part */
140:     nz    = bdiag[i] - bdiag[i + 1];
141:     bjtmp = bj + bdiag[i + 1] + 1;
142:     for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);

144:     /* load in initial (unfactored row) */
145:     nz    = ai[i + 1] - ai[i];
146:     ajtmp = aj + ai[i];
147:     v     = aa + bs2 * ai[i];
148:     for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2);

150:     /* elimination */
151:     bjtmp = bj + bi[i];
152:     nzL   = bi[i + 1] - bi[i];
153:     for (k = 0; k < nzL; k++) {
154:       row = bjtmp[k];
155:       pc  = rtmp + bs2 * row;
156:       for (flg = 0, j = 0; j < bs2; j++) {
157:         if (pc[j] != (PetscScalar)0.0) {
158:           flg = 1;
159:           break;
160:         }
161:       }
162:       if (flg) {
163:         pv = b->a + bs2 * bdiag[row];
164:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
165:         PetscKernel_A_gets_A_times_B_2(pc, pv, mwork);

167:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
168:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
169:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
170:         for (j = 0; j < nz; j++) {
171:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
172:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
173:           v = rtmp + 4 * pj[j];
174:           PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv);
175:           pv += 4;
176:         }
177:         PetscLogFlops(16.0 * nz + 12); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
178:       }
179:     }

181:     /* finished row so stick it into b->a */
182:     /* L part */
183:     pv = b->a + bs2 * bi[i];
184:     pj = b->j + bi[i];
185:     nz = bi[i + 1] - bi[i];
186:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);

188:     /* Mark diagonal and invert diagonal for simpler triangular solves */
189:     pv = b->a + bs2 * bdiag[i];
190:     pj = b->j + bdiag[i];
191:     PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
192:     PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected);
193:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

195:     /* U part */
196:     /*
197:     pv = b->a + bs2*bi[2*n-i];
198:     pj = b->j + bi[2*n-i];
199:     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
200:     */
201:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
202:     pj = b->j + bdiag[i + 1] + 1;
203:     nz = bdiag[i] - bdiag[i + 1] - 1;
204:     for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
205:   }
206:   PetscFree2(rtmp, mwork);

208:   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
209:   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
210:   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
211:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
212:   C->assembled           = PETSC_TRUE;

214:   PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n); /* from inverting diagonal blocks */
215:   return 0;
216: }

218: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
219: {
220:   Mat             C = B;
221:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
222:   IS              isrow = b->row, isicol = b->icol;
223:   const PetscInt *r, *ic;
224:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
225:   PetscInt       *ajtmpold, *ajtmp, nz, row;
226:   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
227:   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
228:   MatScalar       p1, p2, p3, p4;
229:   MatScalar      *ba = b->a, *aa = a->a;
230:   PetscReal       shift = info->shiftamount;
231:   PetscBool       allowzeropivot, zeropivotdetected;

233:   allowzeropivot = PetscNot(A->erroriffailure);
234:   ISGetIndices(isrow, &r);
235:   ISGetIndices(isicol, &ic);
236:   PetscMalloc1(4 * (n + 1), &rtmp);

238:   for (i = 0; i < n; i++) {
239:     nz    = bi[i + 1] - bi[i];
240:     ajtmp = bj + bi[i];
241:     for (j = 0; j < nz; j++) {
242:       x    = rtmp + 4 * ajtmp[j];
243:       x[0] = x[1] = x[2] = x[3] = 0.0;
244:     }
245:     /* load in initial (unfactored row) */
246:     idx      = r[i];
247:     nz       = ai[idx + 1] - ai[idx];
248:     ajtmpold = aj + ai[idx];
249:     v        = aa + 4 * ai[idx];
250:     for (j = 0; j < nz; j++) {
251:       x    = rtmp + 4 * ic[ajtmpold[j]];
252:       x[0] = v[0];
253:       x[1] = v[1];
254:       x[2] = v[2];
255:       x[3] = v[3];
256:       v += 4;
257:     }
258:     row = *ajtmp++;
259:     while (row < i) {
260:       pc = rtmp + 4 * row;
261:       p1 = pc[0];
262:       p2 = pc[1];
263:       p3 = pc[2];
264:       p4 = pc[3];
265:       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
266:         pv    = ba + 4 * diag_offset[row];
267:         pj    = bj + diag_offset[row] + 1;
268:         x1    = pv[0];
269:         x2    = pv[1];
270:         x3    = pv[2];
271:         x4    = pv[3];
272:         pc[0] = m1 = p1 * x1 + p3 * x2;
273:         pc[1] = m2 = p2 * x1 + p4 * x2;
274:         pc[2] = m3 = p1 * x3 + p3 * x4;
275:         pc[3] = m4 = p2 * x3 + p4 * x4;
276:         nz         = bi[row + 1] - diag_offset[row] - 1;
277:         pv += 4;
278:         for (j = 0; j < nz; j++) {
279:           x1 = pv[0];
280:           x2 = pv[1];
281:           x3 = pv[2];
282:           x4 = pv[3];
283:           x  = rtmp + 4 * pj[j];
284:           x[0] -= m1 * x1 + m3 * x2;
285:           x[1] -= m2 * x1 + m4 * x2;
286:           x[2] -= m1 * x3 + m3 * x4;
287:           x[3] -= m2 * x3 + m4 * x4;
288:           pv += 4;
289:         }
290:         PetscLogFlops(16.0 * nz + 12.0);
291:       }
292:       row = *ajtmp++;
293:     }
294:     /* finished row so stick it into b->a */
295:     pv = ba + 4 * bi[i];
296:     pj = bj + bi[i];
297:     nz = bi[i + 1] - bi[i];
298:     for (j = 0; j < nz; j++) {
299:       x     = rtmp + 4 * pj[j];
300:       pv[0] = x[0];
301:       pv[1] = x[1];
302:       pv[2] = x[2];
303:       pv[3] = x[3];
304:       pv += 4;
305:     }
306:     /* invert diagonal block */
307:     w = ba + 4 * diag_offset[i];
308:     PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected);
309:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
310:   }

312:   PetscFree(rtmp);
313:   ISRestoreIndices(isicol, &ic);
314:   ISRestoreIndices(isrow, &r);

316:   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
317:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
318:   C->assembled           = PETSC_TRUE;

320:   PetscLogFlops(1.333333333333 * 8 * b->mbs); /* from inverting diagonal blocks */
321:   return 0;
322: }
323: /*
324:       Version for when blocks are 2 by 2 Using natural ordering
325: */
326: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
327: {
328:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
329:   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
330:   PetscInt    *ajtmpold, *ajtmp, nz, row;
331:   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
332:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
333:   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
334:   MatScalar   *ba = b->a, *aa = a->a;
335:   PetscReal    shift = info->shiftamount;
336:   PetscBool    allowzeropivot, zeropivotdetected;

338:   allowzeropivot = PetscNot(A->erroriffailure);
339:   PetscMalloc1(4 * (n + 1), &rtmp);
340:   for (i = 0; i < n; i++) {
341:     nz    = bi[i + 1] - bi[i];
342:     ajtmp = bj + bi[i];
343:     for (j = 0; j < nz; j++) {
344:       x    = rtmp + 4 * ajtmp[j];
345:       x[0] = x[1] = x[2] = x[3] = 0.0;
346:     }
347:     /* load in initial (unfactored row) */
348:     nz       = ai[i + 1] - ai[i];
349:     ajtmpold = aj + ai[i];
350:     v        = aa + 4 * ai[i];
351:     for (j = 0; j < nz; j++) {
352:       x    = rtmp + 4 * ajtmpold[j];
353:       x[0] = v[0];
354:       x[1] = v[1];
355:       x[2] = v[2];
356:       x[3] = v[3];
357:       v += 4;
358:     }
359:     row = *ajtmp++;
360:     while (row < i) {
361:       pc = rtmp + 4 * row;
362:       p1 = pc[0];
363:       p2 = pc[1];
364:       p3 = pc[2];
365:       p4 = pc[3];
366:       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
367:         pv    = ba + 4 * diag_offset[row];
368:         pj    = bj + diag_offset[row] + 1;
369:         x1    = pv[0];
370:         x2    = pv[1];
371:         x3    = pv[2];
372:         x4    = pv[3];
373:         pc[0] = m1 = p1 * x1 + p3 * x2;
374:         pc[1] = m2 = p2 * x1 + p4 * x2;
375:         pc[2] = m3 = p1 * x3 + p3 * x4;
376:         pc[3] = m4 = p2 * x3 + p4 * x4;
377:         nz         = bi[row + 1] - diag_offset[row] - 1;
378:         pv += 4;
379:         for (j = 0; j < nz; j++) {
380:           x1 = pv[0];
381:           x2 = pv[1];
382:           x3 = pv[2];
383:           x4 = pv[3];
384:           x  = rtmp + 4 * pj[j];
385:           x[0] -= m1 * x1 + m3 * x2;
386:           x[1] -= m2 * x1 + m4 * x2;
387:           x[2] -= m1 * x3 + m3 * x4;
388:           x[3] -= m2 * x3 + m4 * x4;
389:           pv += 4;
390:         }
391:         PetscLogFlops(16.0 * nz + 12.0);
392:       }
393:       row = *ajtmp++;
394:     }
395:     /* finished row so stick it into b->a */
396:     pv = ba + 4 * bi[i];
397:     pj = bj + bi[i];
398:     nz = bi[i + 1] - bi[i];
399:     for (j = 0; j < nz; j++) {
400:       x     = rtmp + 4 * pj[j];
401:       pv[0] = x[0];
402:       pv[1] = x[1];
403:       pv[2] = x[2];
404:       pv[3] = x[3];
405:       /*
406:       printf(" col %d:",pj[j]);
407:       PetscInt j1;
408:       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
409:       printf("\n");
410:       */
411:       pv += 4;
412:     }
413:     /* invert diagonal block */
414:     w = ba + 4 * diag_offset[i];
415:     PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected);
416:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
417:   }

419:   PetscFree(rtmp);

421:   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
422:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
423:   C->assembled           = PETSC_TRUE;

425:   PetscLogFlops(1.333333333333 * 8 * b->mbs); /* from inverting diagonal blocks */
426:   return 0;
427: }

429: /* ----------------------------------------------------------- */
430: /*
431:      Version for when blocks are 1 by 1.
432: */
433: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
434: {
435:   Mat              C = B;
436:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
437:   IS               isrow = b->row, isicol = b->icol;
438:   const PetscInt  *r, *ic, *ics;
439:   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
440:   PetscInt         i, j, k, nz, nzL, row, *pj;
441:   const PetscInt  *ajtmp, *bjtmp;
442:   MatScalar       *rtmp, *pc, multiplier, *pv;
443:   const MatScalar *aa = a->a, *v;
444:   PetscBool        row_identity, col_identity;
445:   FactorShiftCtx   sctx;
446:   const PetscInt  *ddiag;
447:   PetscReal        rs;
448:   MatScalar        d;

450:   /* MatPivotSetUp(): initialize shift context sctx */
451:   PetscMemzero(&sctx, sizeof(FactorShiftCtx));

453:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
454:     ddiag          = a->diag;
455:     sctx.shift_top = info->zeropivot;
456:     for (i = 0; i < n; i++) {
457:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
458:       d  = (aa)[ddiag[i]];
459:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
460:       v  = aa + ai[i];
461:       nz = ai[i + 1] - ai[i];
462:       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
463:       if (rs > sctx.shift_top) sctx.shift_top = rs;
464:     }
465:     sctx.shift_top *= 1.1;
466:     sctx.nshift_max = 5;
467:     sctx.shift_lo   = 0.;
468:     sctx.shift_hi   = 1.;
469:   }

471:   ISGetIndices(isrow, &r);
472:   ISGetIndices(isicol, &ic);
473:   PetscMalloc1(n + 1, &rtmp);
474:   ics = ic;

476:   do {
477:     sctx.newshift = PETSC_FALSE;
478:     for (i = 0; i < n; i++) {
479:       /* zero rtmp */
480:       /* L part */
481:       nz    = bi[i + 1] - bi[i];
482:       bjtmp = bj + bi[i];
483:       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

485:       /* U part */
486:       nz    = bdiag[i] - bdiag[i + 1];
487:       bjtmp = bj + bdiag[i + 1] + 1;
488:       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

490:       /* load in initial (unfactored row) */
491:       nz    = ai[r[i] + 1] - ai[r[i]];
492:       ajtmp = aj + ai[r[i]];
493:       v     = aa + ai[r[i]];
494:       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];

496:       /* ZeropivotApply() */
497:       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

499:       /* elimination */
500:       bjtmp = bj + bi[i];
501:       row   = *bjtmp++;
502:       nzL   = bi[i + 1] - bi[i];
503:       for (k = 0; k < nzL; k++) {
504:         pc = rtmp + row;
505:         if (*pc != (PetscScalar)0.0) {
506:           pv         = b->a + bdiag[row];
507:           multiplier = *pc * (*pv);
508:           *pc        = multiplier;

510:           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
511:           pv = b->a + bdiag[row + 1] + 1;
512:           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
513:           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
514:           PetscLogFlops(2.0 * nz);
515:         }
516:         row = *bjtmp++;
517:       }

519:       /* finished row so stick it into b->a */
520:       rs = 0.0;
521:       /* L part */
522:       pv = b->a + bi[i];
523:       pj = b->j + bi[i];
524:       nz = bi[i + 1] - bi[i];
525:       for (j = 0; j < nz; j++) {
526:         pv[j] = rtmp[pj[j]];
527:         rs += PetscAbsScalar(pv[j]);
528:       }

530:       /* U part */
531:       pv = b->a + bdiag[i + 1] + 1;
532:       pj = b->j + bdiag[i + 1] + 1;
533:       nz = bdiag[i] - bdiag[i + 1] - 1;
534:       for (j = 0; j < nz; j++) {
535:         pv[j] = rtmp[pj[j]];
536:         rs += PetscAbsScalar(pv[j]);
537:       }

539:       sctx.rs = rs;
540:       sctx.pv = rtmp[i];
541:       MatPivotCheck(B, A, info, &sctx, i);
542:       if (sctx.newshift) break; /* break for-loop */
543:       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */

545:       /* Mark diagonal and invert diagonal for simpler triangular solves */
546:       pv  = b->a + bdiag[i];
547:       *pv = (PetscScalar)1.0 / rtmp[i];

549:     } /* endof for (i=0; i<n; i++) { */

551:     /* MatPivotRefine() */
552:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
553:       /*
554:        * if no shift in this attempt & shifting & started shifting & can refine,
555:        * then try lower shift
556:        */
557:       sctx.shift_hi       = sctx.shift_fraction;
558:       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
559:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
560:       sctx.newshift       = PETSC_TRUE;
561:       sctx.nshift++;
562:     }
563:   } while (sctx.newshift);

565:   PetscFree(rtmp);
566:   ISRestoreIndices(isicol, &ic);
567:   ISRestoreIndices(isrow, &r);

569:   ISIdentity(isrow, &row_identity);
570:   ISIdentity(isicol, &col_identity);
571:   if (row_identity && col_identity) {
572:     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
573:     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
574:     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
575:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
576:   } else {
577:     C->ops->solve          = MatSolve_SeqBAIJ_1;
578:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
579:   }
580:   C->assembled = PETSC_TRUE;
581:   PetscLogFlops(C->cmap->n);

583:   /* MatShiftView(A,info,&sctx) */
584:   if (sctx.nshift) {
585:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
586:       PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top);
587:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
588:       PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
589:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
590:       PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount);
591:     }
592:   }
593:   return 0;
594: }

596: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
597: {
598:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
599:   IS              isrow = b->row, isicol = b->icol;
600:   const PetscInt *r, *ic;
601:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
602:   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
603:   PetscInt       *diag_offset = b->diag, diag, *pj;
604:   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
605:   MatScalar      *ba = b->a, *aa = a->a;
606:   PetscBool       row_identity, col_identity;

608:   ISGetIndices(isrow, &r);
609:   ISGetIndices(isicol, &ic);
610:   PetscMalloc1(n + 1, &rtmp);

612:   for (i = 0; i < n; i++) {
613:     nz    = bi[i + 1] - bi[i];
614:     ajtmp = bj + bi[i];
615:     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;

617:     /* load in initial (unfactored row) */
618:     nz       = ai[r[i] + 1] - ai[r[i]];
619:     ajtmpold = aj + ai[r[i]];
620:     v        = aa + ai[r[i]];
621:     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];

623:     row = *ajtmp++;
624:     while (row < i) {
625:       pc = rtmp + row;
626:       if (*pc != 0.0) {
627:         pv         = ba + diag_offset[row];
628:         pj         = bj + diag_offset[row] + 1;
629:         multiplier = *pc * *pv++;
630:         *pc        = multiplier;
631:         nz         = bi[row + 1] - diag_offset[row] - 1;
632:         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
633:         PetscLogFlops(1.0 + 2.0 * nz);
634:       }
635:       row = *ajtmp++;
636:     }
637:     /* finished row so stick it into b->a */
638:     pv = ba + bi[i];
639:     pj = bj + bi[i];
640:     nz = bi[i + 1] - bi[i];
641:     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
642:     diag = diag_offset[i] - bi[i];
643:     /* check pivot entry for current row */
645:     pv[diag] = 1.0 / pv[diag];
646:   }

648:   PetscFree(rtmp);
649:   ISRestoreIndices(isicol, &ic);
650:   ISRestoreIndices(isrow, &r);
651:   ISIdentity(isrow, &row_identity);
652:   ISIdentity(isicol, &col_identity);
653:   if (row_identity && col_identity) {
654:     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
655:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
656:   } else {
657:     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
658:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
659:   }
660:   C->assembled = PETSC_TRUE;
661:   PetscLogFlops(C->cmap->n);
662:   return 0;
663: }

665: static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
666: {
667:   *type = MATSOLVERPETSC;
668:   return 0;
669: }

671: PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
672: {
673:   PetscInt n = A->rmap->n;

675: #if defined(PETSC_USE_COMPLEX)
677: #endif
678:   MatCreate(PetscObjectComm((PetscObject)A), B);
679:   MatSetSizes(*B, n, n, n, n);
680:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
681:     MatSetType(*B, MATSEQBAIJ);

683:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
684:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
685:     PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
686:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]);
687:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
688:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
689:     MatSetType(*B, MATSEQSBAIJ);
690:     MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL);

692:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
693:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
694:     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
695:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
696:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]);
697:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
698:   (*B)->factortype     = ftype;
699:   (*B)->canuseordering = PETSC_TRUE;

701:   PetscFree((*B)->solvertype);
702:   PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype);
703:   PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc);
704:   return 0;
705: }

707: /* ----------------------------------------------------------- */
708: PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
709: {
710:   Mat C;

712:   MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C);
713:   MatLUFactorSymbolic(C, A, row, col, info);
714:   MatLUFactorNumeric(C, A, info);

716:   A->ops->solve          = C->ops->solve;
717:   A->ops->solvetranspose = C->ops->solvetranspose;

719:   MatHeaderMerge(A, &C);
720:   return 0;
721: }

723: #include <../src/mat/impls/sbaij/seq/sbaij.h>
724: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
725: {
726:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
727:   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
728:   IS              ip = b->row;
729:   const PetscInt *rip;
730:   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
731:   PetscInt       *ai = a->i, *aj = a->j;
732:   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
733:   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
734:   PetscReal       rs;
735:   FactorShiftCtx  sctx;

737:   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
738:     if (!a->sbaijMat) MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat);
739:     (a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info);
740:     MatDestroy(&a->sbaijMat);
741:     return 0;
742:   }

744:   /* MatPivotSetUp(): initialize shift context sctx */
745:   PetscMemzero(&sctx, sizeof(FactorShiftCtx));

747:   ISGetIndices(ip, &rip);
748:   PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl);

750:   sctx.shift_amount = 0.;
751:   sctx.nshift       = 0;
752:   do {
753:     sctx.newshift = PETSC_FALSE;
754:     for (i = 0; i < mbs; i++) {
755:       rtmp[i] = 0.0;
756:       jl[i]   = mbs;
757:       il[0]   = 0;
758:     }

760:     for (k = 0; k < mbs; k++) {
761:       bval = ba + bi[k];
762:       /* initialize k-th row by the perm[k]-th row of A */
763:       jmin = ai[rip[k]];
764:       jmax = ai[rip[k] + 1];
765:       for (j = jmin; j < jmax; j++) {
766:         col = rip[aj[j]];
767:         if (col >= k) { /* only take upper triangular entry */
768:           rtmp[col] = aa[j];
769:           *bval++   = 0.0; /* for in-place factorization */
770:         }
771:       }

773:       /* shift the diagonal of the matrix */
774:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

776:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
777:       dk = rtmp[k];
778:       i  = jl[k]; /* first row to be added to k_th row  */

780:       while (i < k) {
781:         nexti = jl[i]; /* next row to be added to k_th row */

783:         /* compute multiplier, update diag(k) and U(i,k) */
784:         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
785:         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
786:         dk += uikdi * ba[ili];
787:         ba[ili] = uikdi; /* -U(i,k) */

789:         /* add multiple of row i to k-th row */
790:         jmin = ili + 1;
791:         jmax = bi[i + 1];
792:         if (jmin < jmax) {
793:           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
794:           /* update il and jl for row i */
795:           il[i] = jmin;
796:           j     = bj[jmin];
797:           jl[i] = jl[j];
798:           jl[j] = i;
799:         }
800:         i = nexti;
801:       }

803:       /* shift the diagonals when zero pivot is detected */
804:       /* compute rs=sum of abs(off-diagonal) */
805:       rs   = 0.0;
806:       jmin = bi[k] + 1;
807:       nz   = bi[k + 1] - jmin;
808:       if (nz) {
809:         bcol = bj + jmin;
810:         while (nz--) {
811:           rs += PetscAbsScalar(rtmp[*bcol]);
812:           bcol++;
813:         }
814:       }

816:       sctx.rs = rs;
817:       sctx.pv = dk;
818:       MatPivotCheck(C, A, info, &sctx, k);
819:       if (sctx.newshift) break;
820:       dk = sctx.pv;

822:       /* copy data into U(k,:) */
823:       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
824:       jmin      = bi[k] + 1;
825:       jmax      = bi[k + 1];
826:       if (jmin < jmax) {
827:         for (j = jmin; j < jmax; j++) {
828:           col       = bj[j];
829:           ba[j]     = rtmp[col];
830:           rtmp[col] = 0.0;
831:         }
832:         /* add the k-th row into il and jl */
833:         il[k] = jmin;
834:         i     = bj[jmin];
835:         jl[k] = jl[i];
836:         jl[i] = k;
837:       }
838:     }
839:   } while (sctx.newshift);
840:   PetscFree3(rtmp, il, jl);

842:   ISRestoreIndices(ip, &rip);

844:   C->assembled    = PETSC_TRUE;
845:   C->preallocated = PETSC_TRUE;

847:   PetscLogFlops(C->rmap->N);
848:   if (sctx.nshift) {
849:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
850:       PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
851:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
852:       PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
853:     }
854:   }
855:   return 0;
856: }

858: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
859: {
860:   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
861:   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
862:   PetscInt       i, j, am = a->mbs;
863:   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
864:   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
865:   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
866:   PetscReal      rs;
867:   FactorShiftCtx sctx;

869:   /* MatPivotSetUp(): initialize shift context sctx */
870:   PetscMemzero(&sctx, sizeof(FactorShiftCtx));

872:   PetscMalloc3(am, &rtmp, am, &il, am, &jl);

874:   do {
875:     sctx.newshift = PETSC_FALSE;
876:     for (i = 0; i < am; i++) {
877:       rtmp[i] = 0.0;
878:       jl[i]   = am;
879:       il[0]   = 0;
880:     }

882:     for (k = 0; k < am; k++) {
883:       /* initialize k-th row with elements nonzero in row perm(k) of A */
884:       nz   = ai[k + 1] - ai[k];
885:       acol = aj + ai[k];
886:       aval = aa + ai[k];
887:       bval = ba + bi[k];
888:       while (nz--) {
889:         if (*acol < k) { /* skip lower triangular entries */
890:           acol++;
891:           aval++;
892:         } else {
893:           rtmp[*acol++] = *aval++;
894:           *bval++       = 0.0; /* for in-place factorization */
895:         }
896:       }

898:       /* shift the diagonal of the matrix */
899:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

901:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
902:       dk = rtmp[k];
903:       i  = jl[k]; /* first row to be added to k_th row  */

905:       while (i < k) {
906:         nexti = jl[i]; /* next row to be added to k_th row */
907:         /* compute multiplier, update D(k) and U(i,k) */
908:         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
909:         uikdi = -ba[ili] * ba[bi[i]];
910:         dk += uikdi * ba[ili];
911:         ba[ili] = uikdi; /* -U(i,k) */

913:         /* add multiple of row i to k-th row ... */
914:         jmin = ili + 1;
915:         nz   = bi[i + 1] - jmin;
916:         if (nz > 0) {
917:           bcol = bj + jmin;
918:           bval = ba + jmin;
919:           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
920:           /* update il and jl for i-th row */
921:           il[i] = jmin;
922:           j     = bj[jmin];
923:           jl[i] = jl[j];
924:           jl[j] = i;
925:         }
926:         i = nexti;
927:       }

929:       /* shift the diagonals when zero pivot is detected */
930:       /* compute rs=sum of abs(off-diagonal) */
931:       rs   = 0.0;
932:       jmin = bi[k] + 1;
933:       nz   = bi[k + 1] - jmin;
934:       if (nz) {
935:         bcol = bj + jmin;
936:         while (nz--) {
937:           rs += PetscAbsScalar(rtmp[*bcol]);
938:           bcol++;
939:         }
940:       }

942:       sctx.rs = rs;
943:       sctx.pv = dk;
944:       MatPivotCheck(C, A, info, &sctx, k);
945:       if (sctx.newshift) break; /* sctx.shift_amount is updated */
946:       dk = sctx.pv;

948:       /* copy data into U(k,:) */
949:       ba[bi[k]] = 1.0 / dk;
950:       jmin      = bi[k] + 1;
951:       nz        = bi[k + 1] - jmin;
952:       if (nz) {
953:         bcol = bj + jmin;
954:         bval = ba + jmin;
955:         while (nz--) {
956:           *bval++       = rtmp[*bcol];
957:           rtmp[*bcol++] = 0.0;
958:         }
959:         /* add k-th row into il and jl */
960:         il[k] = jmin;
961:         i     = bj[jmin];
962:         jl[k] = jl[i];
963:         jl[i] = k;
964:       }
965:     }
966:   } while (sctx.newshift);
967:   PetscFree3(rtmp, il, jl);

969:   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
970:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
971:   C->assembled           = PETSC_TRUE;
972:   C->preallocated        = PETSC_TRUE;

974:   PetscLogFlops(C->rmap->N);
975:   if (sctx.nshift) {
976:     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
977:       PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
978:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
979:       PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
980:     }
981:   }
982:   return 0;
983: }

985: #include <petscbt.h>
986: #include <../src/mat/utils/freespace.h>
987: PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
988: {
989:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
990:   Mat_SeqSBAIJ      *b;
991:   Mat                B;
992:   PetscBool          perm_identity, missing;
993:   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
994:   const PetscInt    *rip;
995:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
996:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
997:   PetscReal          fill = info->fill, levels = info->levels;
998:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
999:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1000:   PetscBT            lnkbt;

1002:   MatMissingDiagonal(A, &missing, &i);

1005:   if (bs > 1) {
1006:     if (!a->sbaijMat) MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat);
1007:     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

1009:     MatICCFactorSymbolic(fact, a->sbaijMat, perm, info);
1010:     return 0;
1011:   }

1013:   ISIdentity(perm, &perm_identity);
1014:   ISGetIndices(perm, &rip);

1016:   /* special case that simply copies fill pattern */
1017:   if (!levels && perm_identity) {
1018:     PetscMalloc1(am + 1, &ui);
1019:     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1020:     B = fact;
1021:     MatSeqSBAIJSetPreallocation(B, 1, 0, ui);

1023:     b  = (Mat_SeqSBAIJ *)B->data;
1024:     uj = b->j;
1025:     for (i = 0; i < am; i++) {
1026:       aj = a->j + a->diag[i];
1027:       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1028:       b->ilen[i] = ui[i];
1029:     }
1030:     PetscFree(ui);

1032:     B->factortype = MAT_FACTOR_NONE;

1034:     MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
1035:     MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
1036:     B->factortype = MAT_FACTOR_ICC;

1038:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1039:     return 0;
1040:   }

1042:   /* initialization */
1043:   PetscMalloc1(am + 1, &ui);
1044:   ui[0] = 0;
1045:   PetscMalloc1(2 * am + 1, &cols_lvl);

1047:   /* jl: linked list for storing indices of the pivot rows
1048:      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1049:   PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl);
1050:   for (i = 0; i < am; i++) {
1051:     jl[i] = am;
1052:     il[i] = 0;
1053:   }

1055:   /* create and initialize a linked list for storing column indices of the active row k */
1056:   nlnk = am + 1;
1057:   PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt);

1059:   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1060:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space);

1062:   current_space = free_space;

1064:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl);
1065:   current_space_lvl = free_space_lvl;

1067:   for (k = 0; k < am; k++) { /* for each active row k */
1068:     /* initialize lnk by the column indices of row rip[k] of A */
1069:     nzk         = 0;
1070:     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1071:     ncols_upper = 0;
1072:     cols        = cols_lvl + am;
1073:     for (j = 0; j < ncols; j++) {
1074:       i = rip[*(aj + ai[rip[k]] + j)];
1075:       if (i >= k) { /* only take upper triangular entry */
1076:         cols[ncols_upper]     = i;
1077:         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1078:         ncols_upper++;
1079:       }
1080:     }
1081:     PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt);
1082:     nzk += nlnk;

1084:     /* update lnk by computing fill-in for each pivot row to be merged in */
1085:     prow = jl[k]; /* 1st pivot row */

1087:     while (prow < k) {
1088:       nextprow = jl[prow];

1090:       /* merge prow into k-th row */
1091:       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1092:       jmax  = ui[prow + 1];
1093:       ncols = jmax - jmin;
1094:       i     = jmin - ui[prow];
1095:       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1096:       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1097:       PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt);
1098:       nzk += nlnk;

1100:       /* update il and jl for prow */
1101:       if (jmin < jmax) {
1102:         il[prow] = jmin;

1104:         j        = *cols;
1105:         jl[prow] = jl[j];
1106:         jl[j]    = prow;
1107:       }
1108:       prow = nextprow;
1109:     }

1111:     /* if free space is not available, make more free space */
1112:     if (current_space->local_remaining < nzk) {
1113:       i = am - k + 1;                                                             /* num of unfactored rows */
1114:       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1115:       PetscFreeSpaceGet(i, &current_space);
1116:       PetscFreeSpaceGet(i, &current_space_lvl);
1117:       reallocs++;
1118:     }

1120:     /* copy data into free_space and free_space_lvl, then initialize lnk */
1121:     PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt);

1123:     /* add the k-th row into il and jl */
1124:     if (nzk - 1 > 0) {
1125:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1126:       jl[k] = jl[i];
1127:       jl[i] = k;
1128:       il[k] = ui[k] + 1;
1129:     }
1130:     uj_ptr[k]     = current_space->array;
1131:     uj_lvl_ptr[k] = current_space_lvl->array;

1133:     current_space->array += nzk;
1134:     current_space->local_used += nzk;
1135:     current_space->local_remaining -= nzk;

1137:     current_space_lvl->array += nzk;
1138:     current_space_lvl->local_used += nzk;
1139:     current_space_lvl->local_remaining -= nzk;

1141:     ui[k + 1] = ui[k] + nzk;
1142:   }

1144:   ISRestoreIndices(perm, &rip);
1145:   PetscFree4(uj_ptr, uj_lvl_ptr, il, jl);
1146:   PetscFree(cols_lvl);

1148:   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1149:   PetscMalloc1(ui[am] + 1, &uj);
1150:   PetscFreeSpaceContiguous(&free_space, uj);
1151:   PetscIncompleteLLDestroy(lnk, lnkbt);
1152:   PetscFreeSpaceDestroy(free_space_lvl);

1154:   /* put together the new matrix in MATSEQSBAIJ format */
1155:   B = fact;
1156:   MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL);

1158:   b               = (Mat_SeqSBAIJ *)B->data;
1159:   b->singlemalloc = PETSC_FALSE;
1160:   b->free_a       = PETSC_TRUE;
1161:   b->free_ij      = PETSC_TRUE;

1163:   PetscMalloc1(ui[am] + 1, &b->a);

1165:   b->j             = uj;
1166:   b->i             = ui;
1167:   b->diag          = NULL;
1168:   b->ilen          = NULL;
1169:   b->imax          = NULL;
1170:   b->row           = perm;
1171:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

1173:   PetscObjectReference((PetscObject)perm);

1175:   b->icol = perm;

1177:   PetscObjectReference((PetscObject)perm);
1178:   PetscMalloc1(am + 1, &b->solve_work);

1180:   b->maxnz = b->nz = ui[am];

1182:   B->info.factor_mallocs   = reallocs;
1183:   B->info.fill_ratio_given = fill;
1184:   if (ai[am] != 0.) {
1185:     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1186:     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1187:   } else {
1188:     B->info.fill_ratio_needed = 0.0;
1189:   }
1190: #if defined(PETSC_USE_INFO)
1191:   if (ai[am] != 0) {
1192:     PetscReal af = B->info.fill_ratio_needed;
1193:     PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
1194:     PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
1195:     PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
1196:   } else {
1197:     PetscInfo(A, "Empty matrix\n");
1198:   }
1199: #endif
1200:   if (perm_identity) {
1201:     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1202:     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1203:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1204:   } else {
1205:     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1206:   }
1207:   return 0;
1208: }

1210: PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1211: {
1212:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1213:   Mat_SeqSBAIJ      *b;
1214:   Mat                B;
1215:   PetscBool          perm_identity, missing;
1216:   PetscReal          fill = info->fill;
1217:   const PetscInt    *rip;
1218:   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1219:   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1220:   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1221:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1222:   PetscBT            lnkbt;

1224:   if (bs > 1) { /* convert to seqsbaij */
1225:     if (!a->sbaijMat) MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat);
1226:     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

1228:     MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info);
1229:     return 0;
1230:   }

1232:   MatMissingDiagonal(A, &missing, &i);

1235:   /* check whether perm is the identity mapping */
1236:   ISIdentity(perm, &perm_identity);
1238:   ISGetIndices(perm, &rip);

1240:   /* initialization */
1241:   PetscMalloc1(mbs + 1, &ui);
1242:   ui[0] = 0;

1244:   /* jl: linked list for storing indices of the pivot rows
1245:      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1246:   PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols);
1247:   for (i = 0; i < mbs; i++) {
1248:     jl[i] = mbs;
1249:     il[i] = 0;
1250:   }

1252:   /* create and initialize a linked list for storing column indices of the active row k */
1253:   nlnk = mbs + 1;
1254:   PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt);

1256:   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1257:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space);

1259:   current_space = free_space;

1261:   for (k = 0; k < mbs; k++) { /* for each active row k */
1262:     /* initialize lnk by the column indices of row rip[k] of A */
1263:     nzk         = 0;
1264:     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1265:     ncols_upper = 0;
1266:     for (j = 0; j < ncols; j++) {
1267:       i = rip[*(aj + ai[rip[k]] + j)];
1268:       if (i >= k) { /* only take upper triangular entry */
1269:         cols[ncols_upper] = i;
1270:         ncols_upper++;
1271:       }
1272:     }
1273:     PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt);
1274:     nzk += nlnk;

1276:     /* update lnk by computing fill-in for each pivot row to be merged in */
1277:     prow = jl[k]; /* 1st pivot row */

1279:     while (prow < k) {
1280:       nextprow = jl[prow];
1281:       /* merge prow into k-th row */
1282:       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1283:       jmax   = ui[prow + 1];
1284:       ncols  = jmax - jmin;
1285:       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1286:       PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt);
1287:       nzk += nlnk;

1289:       /* update il and jl for prow */
1290:       if (jmin < jmax) {
1291:         il[prow] = jmin;
1292:         j        = *uj_ptr;
1293:         jl[prow] = jl[j];
1294:         jl[j]    = prow;
1295:       }
1296:       prow = nextprow;
1297:     }

1299:     /* if free space is not available, make more free space */
1300:     if (current_space->local_remaining < nzk) {
1301:       i = mbs - k + 1;                                                            /* num of unfactored rows */
1302:       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1303:       PetscFreeSpaceGet(i, &current_space);
1304:       reallocs++;
1305:     }

1307:     /* copy data into free space, then initialize lnk */
1308:     PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt);

1310:     /* add the k-th row into il and jl */
1311:     if (nzk - 1 > 0) {
1312:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1313:       jl[k] = jl[i];
1314:       jl[i] = k;
1315:       il[k] = ui[k] + 1;
1316:     }
1317:     ui_ptr[k] = current_space->array;
1318:     current_space->array += nzk;
1319:     current_space->local_used += nzk;
1320:     current_space->local_remaining -= nzk;

1322:     ui[k + 1] = ui[k] + nzk;
1323:   }

1325:   ISRestoreIndices(perm, &rip);
1326:   PetscFree4(ui_ptr, il, jl, cols);

1328:   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1329:   PetscMalloc1(ui[mbs] + 1, &uj);
1330:   PetscFreeSpaceContiguous(&free_space, uj);
1331:   PetscLLDestroy(lnk, lnkbt);

1333:   /* put together the new matrix in MATSEQSBAIJ format */
1334:   B = fact;
1335:   MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL);

1337:   b               = (Mat_SeqSBAIJ *)B->data;
1338:   b->singlemalloc = PETSC_FALSE;
1339:   b->free_a       = PETSC_TRUE;
1340:   b->free_ij      = PETSC_TRUE;

1342:   PetscMalloc1(ui[mbs] + 1, &b->a);

1344:   b->j             = uj;
1345:   b->i             = ui;
1346:   b->diag          = NULL;
1347:   b->ilen          = NULL;
1348:   b->imax          = NULL;
1349:   b->row           = perm;
1350:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

1352:   PetscObjectReference((PetscObject)perm);
1353:   b->icol = perm;
1354:   PetscObjectReference((PetscObject)perm);
1355:   PetscMalloc1(mbs + 1, &b->solve_work);
1356:   b->maxnz = b->nz = ui[mbs];

1358:   B->info.factor_mallocs   = reallocs;
1359:   B->info.fill_ratio_given = fill;
1360:   if (ai[mbs] != 0.) {
1361:     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1362:     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1363:   } else {
1364:     B->info.fill_ratio_needed = 0.0;
1365:   }
1366: #if defined(PETSC_USE_INFO)
1367:   if (ai[mbs] != 0.) {
1368:     PetscReal af = B->info.fill_ratio_needed;
1369:     PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
1370:     PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
1371:     PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
1372:   } else {
1373:     PetscInfo(A, "Empty matrix\n");
1374:   }
1375: #endif
1376:   if (perm_identity) {
1377:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1378:   } else {
1379:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1380:   }
1381:   return 0;
1382: }

1384: PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1385: {
1386:   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1387:   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1388:   PetscInt           i, k, n                       = a->mbs;
1389:   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1390:   const MatScalar   *aa = a->a, *v;
1391:   PetscScalar       *x, *s, *t, *ls;
1392:   const PetscScalar *b;

1394:   VecGetArrayRead(bb, &b);
1395:   VecGetArray(xx, &x);
1396:   t = a->solve_work;

1398:   /* forward solve the lower triangular */
1399:   PetscArraycpy(t, b, bs); /* copy 1st block of b to t */

1401:   for (i = 1; i < n; i++) {
1402:     v  = aa + bs2 * ai[i];
1403:     vi = aj + ai[i];
1404:     nz = ai[i + 1] - ai[i];
1405:     s  = t + bs * i;
1406:     PetscArraycpy(s, b + bs * i, bs); /* copy i_th block of b to t */
1407:     for (k = 0; k < nz; k++) {
1408:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1409:       v += bs2;
1410:     }
1411:   }

1413:   /* backward solve the upper triangular */
1414:   ls = a->solve_work + A->cmap->n;
1415:   for (i = n - 1; i >= 0; i--) {
1416:     v  = aa + bs2 * (adiag[i + 1] + 1);
1417:     vi = aj + adiag[i + 1] + 1;
1418:     nz = adiag[i] - adiag[i + 1] - 1;
1419:     PetscArraycpy(ls, t + i * bs, bs);
1420:     for (k = 0; k < nz; k++) {
1421:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1422:       v += bs2;
1423:     }
1424:     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1425:     PetscArraycpy(x + i * bs, t + i * bs, bs);
1426:   }

1428:   VecRestoreArrayRead(bb, &b);
1429:   VecRestoreArray(xx, &x);
1430:   PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n);
1431:   return 0;
1432: }

1434: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1435: {
1436:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1437:   IS                 iscol = a->col, isrow = a->row;
1438:   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1439:   PetscInt           i, m, n = a->mbs;
1440:   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1441:   const MatScalar   *aa = a->a, *v;
1442:   PetscScalar       *x, *s, *t, *ls;
1443:   const PetscScalar *b;

1445:   VecGetArrayRead(bb, &b);
1446:   VecGetArray(xx, &x);
1447:   t = a->solve_work;

1449:   ISGetIndices(isrow, &rout);
1450:   r = rout;
1451:   ISGetIndices(iscol, &cout);
1452:   c = cout;

1454:   /* forward solve the lower triangular */
1455:   PetscArraycpy(t, b + bs * r[0], bs);
1456:   for (i = 1; i < n; i++) {
1457:     v  = aa + bs2 * ai[i];
1458:     vi = aj + ai[i];
1459:     nz = ai[i + 1] - ai[i];
1460:     s  = t + bs * i;
1461:     PetscArraycpy(s, b + bs * r[i], bs);
1462:     for (m = 0; m < nz; m++) {
1463:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1464:       v += bs2;
1465:     }
1466:   }

1468:   /* backward solve the upper triangular */
1469:   ls = a->solve_work + A->cmap->n;
1470:   for (i = n - 1; i >= 0; i--) {
1471:     v  = aa + bs2 * (adiag[i + 1] + 1);
1472:     vi = aj + adiag[i + 1] + 1;
1473:     nz = adiag[i] - adiag[i + 1] - 1;
1474:     PetscArraycpy(ls, t + i * bs, bs);
1475:     for (m = 0; m < nz; m++) {
1476:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1477:       v += bs2;
1478:     }
1479:     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1480:     PetscArraycpy(x + bs * c[i], t + i * bs, bs);
1481:   }
1482:   ISRestoreIndices(isrow, &rout);
1483:   ISRestoreIndices(iscol, &cout);
1484:   VecRestoreArrayRead(bb, &b);
1485:   VecRestoreArray(xx, &x);
1486:   PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n);
1487:   return 0;
1488: }

1490: /*
1491:     For each block in an block array saves the largest absolute value in the block into another array
1492: */
1493: static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1494: {
1495:   PetscInt i, j;

1497:   PetscArrayzero(absarray, nbs + 1);
1498:   for (i = 0; i < nbs; i++) {
1499:     for (j = 0; j < bs2; j++) {
1500:       if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1501:     }
1502:   }
1503:   return 0;
1504: }

1506: /*
1507:      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1508: */
1509: PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1510: {
1511:   Mat             B = *fact;
1512:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
1513:   IS              isicol;
1514:   const PetscInt *r, *ic;
1515:   PetscInt        i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1516:   PetscInt       *bi, *bj, *bdiag;

1518:   PetscInt   row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1519:   PetscInt   nlnk, *lnk;
1520:   PetscBT    lnkbt;
1521:   PetscBool  row_identity, icol_identity;
1522:   MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1523:   PetscInt   j, nz, *pj, *bjtmp, k, ncut, *jtmp;

1525:   PetscReal  dt = info->dt; /* shift=info->shiftamount; */
1526:   PetscInt   nnz_max;
1527:   PetscBool  missing;
1528:   PetscReal *vtmp_abs;
1529:   MatScalar *v_work;
1530:   PetscInt  *v_pivots;
1531:   PetscBool  allowzeropivot, zeropivotdetected = PETSC_FALSE;

1533:   /* ------- symbolic factorization, can be reused ---------*/
1534:   MatMissingDiagonal(A, &missing, &i);
1536:   adiag = a->diag;

1538:   ISInvertPermutation(iscol, PETSC_DECIDE, &isicol);

1540:   /* bdiag is location of diagonal in factor */
1541:   PetscMalloc1(mbs + 1, &bdiag);

1543:   /* allocate row pointers bi */
1544:   PetscMalloc1(2 * mbs + 2, &bi);

1546:   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1547:   dtcount = (PetscInt)info->dtcount;
1548:   if (dtcount > mbs - 1) dtcount = mbs - 1;
1549:   nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1550:   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1551:   PetscMalloc1(nnz_max, &bj);
1552:   nnz_max = nnz_max * bs2;
1553:   PetscMalloc1(nnz_max, &ba);

1555:   /* put together the new matrix */
1556:   MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL);

1558:   b               = (Mat_SeqBAIJ *)(B)->data;
1559:   b->free_a       = PETSC_TRUE;
1560:   b->free_ij      = PETSC_TRUE;
1561:   b->singlemalloc = PETSC_FALSE;

1563:   b->a    = ba;
1564:   b->j    = bj;
1565:   b->i    = bi;
1566:   b->diag = bdiag;
1567:   b->ilen = NULL;
1568:   b->imax = NULL;
1569:   b->row  = isrow;
1570:   b->col  = iscol;

1572:   PetscObjectReference((PetscObject)isrow);
1573:   PetscObjectReference((PetscObject)iscol);

1575:   b->icol = isicol;
1576:   PetscMalloc1(bs * (mbs + 1), &b->solve_work);
1577:   b->maxnz = nnz_max / bs2;

1579:   (B)->factortype            = MAT_FACTOR_ILUDT;
1580:   (B)->info.factor_mallocs   = 0;
1581:   (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1582:   /* ------- end of symbolic factorization ---------*/
1583:   ISGetIndices(isrow, &r);
1584:   ISGetIndices(isicol, &ic);

1586:   /* linked list for storing column indices of the active row */
1587:   nlnk = mbs + 1;
1588:   PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt);

1590:   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1591:   PetscMalloc2(mbs, &im, mbs, &jtmp);
1592:   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1593:   PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp);
1594:   PetscMalloc1(mbs + 1, &vtmp_abs);
1595:   PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots);

1597:   allowzeropivot  = PetscNot(A->erroriffailure);
1598:   bi[0]           = 0;
1599:   bdiag[0]        = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1600:   bi[2 * mbs + 1] = bdiag[0] + 1;        /* endof bj and ba array */
1601:   for (i = 0; i < mbs; i++) {
1602:     /* copy initial fill into linked list */
1603:     nzi = ai[r[i] + 1] - ai[r[i]];
1605:     nzi_al = adiag[r[i]] - ai[r[i]];
1606:     nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;

1608:     /* load in initial unfactored row */
1609:     ajtmp = aj + ai[r[i]];
1610:     PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt);
1611:     PetscArrayzero(rtmp, mbs * bs2);
1612:     aatmp = a->a + bs2 * ai[r[i]];
1613:     for (j = 0; j < nzi; j++) PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2);

1615:     /* add pivot rows into linked list */
1616:     row = lnk[mbs];
1617:     while (row < i) {
1618:       nzi_bl = bi[row + 1] - bi[row] + 1;
1619:       bjtmp  = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1620:       PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im);
1621:       nzi += nlnk;
1622:       row = lnk[row];
1623:     }

1625:     /* copy data from lnk into jtmp, then initialize lnk */
1626:     PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt);

1628:     /* numerical factorization */
1629:     bjtmp = jtmp;
1630:     row   = *bjtmp++; /* 1st pivot row */

1632:     while (row < i) {
1633:       pc = rtmp + bs2 * row;
1634:       pv = ba + bs2 * bdiag[row];                           /* inv(diag) of the pivot row */
1635:       PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1636:       MatBlockAbs_private(1, bs2, pc, vtmp_abs);
1637:       if (vtmp_abs[0] > dt) {         /* apply tolerance dropping rule */
1638:         pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1639:         pv = ba + bs2 * (bdiag[row + 1] + 1);
1640:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1641:         for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1642:         /* PetscLogFlops(bslog*(nz+1.0)-bs); */
1643:       }
1644:       row = *bjtmp++;
1645:     }

1647:     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1648:     nzi_bl = 0;
1649:     j      = 0;
1650:     while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1651:       PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2);
1652:       nzi_bl++;
1653:       j++;
1654:     }
1655:     nzi_bu = nzi - nzi_bl - 1;

1657:     while (j < nzi) { /* U-part */
1658:       PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2);
1659:       j++;
1660:     }

1662:     MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs);

1664:     bjtmp = bj + bi[i];
1665:     batmp = ba + bs2 * bi[i];
1666:     /* apply level dropping rule to L part */
1667:     ncut = nzi_al + dtcount;
1668:     if (ncut < nzi_bl) {
1669:       PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp);
1670:       PetscSortIntWithScalarArray(ncut, jtmp, vtmp);
1671:     } else {
1672:       ncut = nzi_bl;
1673:     }
1674:     for (j = 0; j < ncut; j++) {
1675:       bjtmp[j] = jtmp[j];
1676:       PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2);
1677:     }
1678:     bi[i + 1] = bi[i] + ncut;
1679:     nzi       = ncut + 1;

1681:     /* apply level dropping rule to U part */
1682:     ncut = nzi_au + dtcount;
1683:     if (ncut < nzi_bu) {
1684:       PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1);
1685:       PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1);
1686:     } else {
1687:       ncut = nzi_bu;
1688:     }
1689:     nzi += ncut;

1691:     /* mark bdiagonal */
1692:     bdiag[i + 1]    = bdiag[i] - (ncut + 1);
1693:     bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);

1695:     bjtmp = bj + bdiag[i];
1696:     batmp = ba + bs2 * bdiag[i];
1697:     PetscArraycpy(batmp, rtmp + bs2 * i, bs2);
1698:     *bjtmp = i;

1700:     bjtmp = bj + bdiag[i + 1] + 1;
1701:     batmp = ba + (bdiag[i + 1] + 1) * bs2;

1703:     for (k = 0; k < ncut; k++) {
1704:       bjtmp[k] = jtmp[nzi_bl + 1 + k];
1705:       PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2);
1706:     }

1708:     im[i] = nzi; /* used by PetscLLAddSortedLU() */

1710:     /* invert diagonal block for simpler triangular solves - add shift??? */
1711:     batmp = ba + bs2 * bdiag[i];

1713:     PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected);
1714:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1715:   } /* for (i=0; i<mbs; i++) */
1716:   PetscFree3(v_work, multiplier, v_pivots);

1718:   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */

1721:   ISRestoreIndices(isrow, &r);
1722:   ISRestoreIndices(isicol, &ic);

1724:   PetscLLDestroy(lnk, lnkbt);

1726:   PetscFree2(im, jtmp);
1727:   PetscFree2(rtmp, vtmp);

1729:   PetscLogFlops(bs2 * B->cmap->n);
1730:   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];

1732:   ISIdentity(isrow, &row_identity);
1733:   ISIdentity(isicol, &icol_identity);
1734:   if (row_identity && icol_identity) {
1735:     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1736:   } else {
1737:     B->ops->solve = MatSolve_SeqBAIJ_N;
1738:   }

1740:   B->ops->solveadd          = NULL;
1741:   B->ops->solvetranspose    = NULL;
1742:   B->ops->solvetransposeadd = NULL;
1743:   B->ops->matsolve          = NULL;
1744:   B->assembled              = PETSC_TRUE;
1745:   B->preallocated           = PETSC_TRUE;
1746:   return 0;
1747: }