Actual source code: sbaijfact.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscis.h>

  6: PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F, PetscInt *nneg, PetscInt *nzero, PetscInt *npos)
  7: {
  8:   Mat_SeqSBAIJ *fact = (Mat_SeqSBAIJ *)F->data;
  9:   MatScalar    *dd   = fact->a;
 10:   PetscInt      mbs = fact->mbs, bs = F->rmap->bs, i, nneg_tmp, npos_tmp, *fi = fact->diag;

 12:   PetscFunctionBegin;
 13:   PetscCheck(bs == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for bs: %" PetscInt_FMT " >1 yet", bs);

 15:   nneg_tmp = 0;
 16:   npos_tmp = 0;
 17:   if (fi) {
 18:     for (i = 0; i < mbs; i++) {
 19:       if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
 20:       else if (PetscRealPart(dd[*fi]) < 0.0) nneg_tmp++;
 21:       fi++;
 22:     }
 23:   } else {
 24:     for (i = 0; i < mbs; i++) {
 25:       if (PetscRealPart(dd[fact->i[i]]) > 0.0) npos_tmp++;
 26:       else if (PetscRealPart(dd[fact->i[i]]) < 0.0) nneg_tmp++;
 27:     }
 28:   }
 29:   if (nneg) *nneg = nneg_tmp;
 30:   if (npos) *npos = npos_tmp;
 31:   if (nzero) *nzero = mbs - nneg_tmp - npos_tmp;
 32:   PetscFunctionReturn(PETSC_SUCCESS);
 33: }

 35: /*
 36:   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
 37:   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
 38: */
 39: static PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F, Mat A, IS perm, const MatFactorInfo *info)
 40: {
 41:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
 42:   const PetscInt *rip, *ai, *aj;
 43:   PetscInt        i, mbs = a->mbs, *jutmp, bs = A->rmap->bs, bs2 = a->bs2;
 44:   PetscInt        m, reallocs = 0, prow;
 45:   PetscInt       *jl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
 46:   PetscReal       f = info->fill;
 47:   PetscBool       perm_identity;

 49:   PetscFunctionBegin;
 50:   /* check whether perm is the identity mapping */
 51:   PetscCall(ISIdentity(perm, &perm_identity));
 52:   PetscCall(ISGetIndices(perm, &rip));

 54:   if (perm_identity) { /* without permutation */
 55:     a->permute = PETSC_FALSE;

 57:     ai = a->i;
 58:     aj = a->j;
 59:   } else { /* non-trivial permutation */
 60:     a->permute = PETSC_TRUE;

 62:     PetscCall(MatReorderingSeqSBAIJ(A, perm));

 64:     ai = a->inew;
 65:     aj = a->jnew;
 66:   }

 68:   /* initialization */
 69:   PetscCall(PetscMalloc1(mbs + 1, &iu));
 70:   umax = (PetscInt)(f * ai[mbs] + 1);
 71:   umax += mbs + 1;
 72:   PetscCall(PetscMalloc1(umax, &ju));
 73:   iu[0] = mbs + 1;
 74:   juidx = mbs + 1; /* index for ju */
 75:   /* jl linked list for pivot row -- linked list for col index */
 76:   PetscCall(PetscMalloc2(mbs, &jl, mbs, &q));
 77:   for (i = 0; i < mbs; i++) {
 78:     jl[i] = mbs;
 79:     q[i]  = 0;
 80:   }

 82:   /* for each row k */
 83:   for (k = 0; k < mbs; k++) {
 84:     for (i = 0; i < mbs; i++) q[i] = 0; /* to be removed! */
 85:     nzk  = 0;                           /* num. of nz blocks in k-th block row with diagonal block excluded */
 86:     q[k] = mbs;
 87:     /* initialize nonzero structure of k-th row to row rip[k] of A */
 88:     jmin = ai[rip[k]] + 1; /* exclude diag[k] */
 89:     jmax = ai[rip[k] + 1];
 90:     for (j = jmin; j < jmax; j++) {
 91:       vj = rip[aj[j]]; /* col. value */
 92:       if (vj > k) {
 93:         qm = k;
 94:         do {
 95:           m  = qm;
 96:           qm = q[m];
 97:         } while (qm < vj);
 98:         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
 99:         nzk++;
100:         q[m]  = vj;
101:         q[vj] = qm;
102:       } /* if (vj > k) */
103:     } /* for (j=jmin; j<jmax; j++) */

105:     /* modify nonzero structure of k-th row by computing fill-in
106:        for each row i to be merged in */
107:     prow = k;
108:     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */

110:     while (prow < k) {
111:       /* merge row prow into k-th row */
112:       jmin = iu[prow] + 1;
113:       jmax = iu[prow + 1];
114:       qm   = k;
115:       for (j = jmin; j < jmax; j++) {
116:         vj = ju[j];
117:         do {
118:           m  = qm;
119:           qm = q[m];
120:         } while (qm < vj);
121:         if (qm != vj) {
122:           nzk++;
123:           q[m]  = vj;
124:           q[vj] = qm;
125:           qm    = vj;
126:         }
127:       }
128:       prow = jl[prow]; /* next pivot row */
129:     }

131:     /* add k to row list for first nonzero element in k-th row */
132:     if (nzk > 0) {
133:       i     = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
134:       jl[k] = jl[i];
135:       jl[i] = k;
136:     }
137:     iu[k + 1] = iu[k] + nzk;

139:     /* allocate more space to ju if needed */
140:     if (iu[k + 1] > umax) {
141:       /* estimate how much additional space we will need */
142:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
143:       /* just double the memory each time */
144:       maxadd = umax;
145:       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
146:       umax += maxadd;

148:       /* allocate a longer ju */
149:       PetscCall(PetscMalloc1(umax, &jutmp));
150:       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
151:       PetscCall(PetscFree(ju));
152:       ju = jutmp;
153:       reallocs++; /* count how many times we realloc */
154:     }

156:     /* save nonzero structure of k-th row in ju */
157:     i = k;
158:     while (nzk--) {
159:       i           = q[i];
160:       ju[juidx++] = i;
161:     }
162:   }

164: #if defined(PETSC_USE_INFO)
165:   if (ai[mbs] != 0) {
166:     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
167:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
168:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
169:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
170:     PetscCall(PetscInfo(A, "for best performance.\n"));
171:   } else {
172:     PetscCall(PetscInfo(A, "Empty matrix\n"));
173:   }
174: #endif

176:   PetscCall(ISRestoreIndices(perm, &rip));
177:   PetscCall(PetscFree2(jl, q));

179:   /* put together the new matrix */
180:   PetscCall(MatSeqSBAIJSetPreallocation(F, bs, MAT_SKIP_ALLOCATION, NULL));

182:   b               = (Mat_SeqSBAIJ *)(F)->data;
183:   b->singlemalloc = PETSC_FALSE;
184:   b->free_a       = PETSC_TRUE;
185:   b->free_ij      = PETSC_TRUE;

187:   PetscCall(PetscMalloc1((iu[mbs] + 1) * bs2, &b->a));
188:   b->j    = ju;
189:   b->i    = iu;
190:   b->diag = NULL;
191:   b->ilen = NULL;
192:   b->imax = NULL;
193:   b->row  = perm;

195:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

197:   PetscCall(PetscObjectReference((PetscObject)perm));

199:   b->icol = perm;
200:   PetscCall(PetscObjectReference((PetscObject)perm));
201:   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
202:   /* In b structure:  Free imax, ilen, old a, old j.
203:      Allocate idnew, solve_work, new a, new j */
204:   b->maxnz = b->nz = iu[mbs];

206:   (F)->info.factor_mallocs   = reallocs;
207:   (F)->info.fill_ratio_given = f;
208:   if (ai[mbs] != 0) {
209:     (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
210:   } else {
211:     (F)->info.fill_ratio_needed = 0.0;
212:   }
213:   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(F, perm_identity));
214:   PetscFunctionReturn(PETSC_SUCCESS);
215: }
216: /*
217:     Symbolic U^T*D*U factorization for SBAIJ format.
218:     See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
219: */
220: #include <petscbt.h>
221: #include <../src/mat/utils/freespace.h>
222: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
223: {
224:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
225:   Mat_SeqSBAIJ      *b;
226:   PetscBool          perm_identity, missing;
227:   PetscReal          fill = info->fill;
228:   const PetscInt    *rip, *ai = a->i, *aj = a->j;
229:   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow;
230:   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
231:   PetscInt           nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr, *udiag;
232:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
233:   PetscBT            lnkbt;

235:   PetscFunctionBegin;
236:   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
237:   PetscCall(MatMissingDiagonal(A, &missing, &i));
238:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
239:   if (bs > 1) {
240:     PetscCall(MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
241:     PetscFunctionReturn(PETSC_SUCCESS);
242:   }

244:   /* check whether perm is the identity mapping */
245:   PetscCall(ISIdentity(perm, &perm_identity));
246:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
247:   a->permute = PETSC_FALSE;
248:   PetscCall(ISGetIndices(perm, &rip));

250:   /* initialization */
251:   PetscCall(PetscMalloc1(mbs + 1, &ui));
252:   PetscCall(PetscMalloc1(mbs + 1, &udiag));
253:   ui[0] = 0;

255:   /* jl: linked list for storing indices of the pivot rows
256:      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
257:   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
258:   for (i = 0; i < mbs; i++) {
259:     jl[i] = mbs;
260:     il[i] = 0;
261:   }

263:   /* create and initialize a linked list for storing column indices of the active row k */
264:   nlnk = mbs + 1;
265:   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));

267:   /* initial FreeSpace size is fill*(ai[mbs]+1) */
268:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space));
269:   current_space = free_space;

271:   for (k = 0; k < mbs; k++) { /* for each active row k */
272:     /* initialize lnk by the column indices of row rip[k] of A */
273:     nzk   = 0;
274:     ncols = ai[k + 1] - ai[k];
275:     PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
276:     for (j = 0; j < ncols; j++) {
277:       i       = *(aj + ai[k] + j);
278:       cols[j] = i;
279:     }
280:     PetscCall(PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt));
281:     nzk += nlnk;

283:     /* update lnk by computing fill-in for each pivot row to be merged in */
284:     prow = jl[k]; /* 1st pivot row */

286:     while (prow < k) {
287:       nextprow = jl[prow];
288:       /* merge prow into k-th row */
289:       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
290:       jmax   = ui[prow + 1];
291:       ncols  = jmax - jmin;
292:       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
293:       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
294:       nzk += nlnk;

296:       /* update il and jl for prow */
297:       if (jmin < jmax) {
298:         il[prow] = jmin;
299:         j        = *uj_ptr;
300:         jl[prow] = jl[j];
301:         jl[j]    = prow;
302:       }
303:       prow = nextprow;
304:     }

306:     /* if free space is not available, make more free space */
307:     if (current_space->local_remaining < nzk) {
308:       i = mbs - k + 1;                                   /* num of unfactored rows */
309:       i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
310:       PetscCall(PetscFreeSpaceGet(i, &current_space));
311:       reallocs++;
312:     }

314:     /* copy data into free space, then initialize lnk */
315:     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));

317:     /* add the k-th row into il and jl */
318:     if (nzk > 1) {
319:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
320:       jl[k] = jl[i];
321:       jl[i] = k;
322:       il[k] = ui[k] + 1;
323:     }
324:     ui_ptr[k] = current_space->array;

326:     current_space->array += nzk;
327:     current_space->local_used += nzk;
328:     current_space->local_remaining -= nzk;

330:     ui[k + 1] = ui[k] + nzk;
331:   }

333:   PetscCall(ISRestoreIndices(perm, &rip));
334:   PetscCall(PetscFree4(ui_ptr, il, jl, cols));

336:   /* destroy list of free space and other temporary array(s) */
337:   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
338:   PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, mbs, ui, udiag)); /* store matrix factor */
339:   PetscCall(PetscLLDestroy(lnk, lnkbt));

341:   /* put together the new matrix in MATSEQSBAIJ format */
342:   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));

344:   b               = (Mat_SeqSBAIJ *)fact->data;
345:   b->singlemalloc = PETSC_FALSE;
346:   b->free_a       = PETSC_TRUE;
347:   b->free_ij      = PETSC_TRUE;

349:   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));

351:   b->j         = uj;
352:   b->i         = ui;
353:   b->diag      = udiag;
354:   b->free_diag = PETSC_TRUE;
355:   b->ilen      = NULL;
356:   b->imax      = NULL;
357:   b->row       = perm;
358:   b->icol      = perm;

360:   PetscCall(PetscObjectReference((PetscObject)perm));
361:   PetscCall(PetscObjectReference((PetscObject)perm));

363:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

365:   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));

367:   b->maxnz = b->nz = ui[mbs];

369:   fact->info.factor_mallocs   = reallocs;
370:   fact->info.fill_ratio_given = fill;
371:   if (ai[mbs] != 0) {
372:     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
373:   } else {
374:     fact->info.fill_ratio_needed = 0.0;
375:   }
376: #if defined(PETSC_USE_INFO)
377:   if (ai[mbs] != 0) {
378:     PetscReal af = fact->info.fill_ratio_needed;
379:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
380:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
381:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
382:   } else {
383:     PetscCall(PetscInfo(A, "Empty matrix\n"));
384:   }
385: #endif
386:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
387:   PetscFunctionReturn(PETSC_SUCCESS);
388: }

390: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
391: {
392:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
393:   Mat_SeqSBAIJ      *b;
394:   PetscBool          perm_identity, missing;
395:   PetscReal          fill = info->fill;
396:   const PetscInt    *rip, *ai, *aj;
397:   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow, d;
398:   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
399:   PetscInt           nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr;
400:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
401:   PetscBT            lnkbt;

403:   PetscFunctionBegin;
404:   PetscCall(MatMissingDiagonal(A, &missing, &d));
405:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);

407:   /*
408:    This code originally uses Modified Sparse Row (MSR) storage
409:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
410:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
411:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
412:    thus the original code in MSR format is still used for these cases.
413:    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
414:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
415:   */
416:   if (bs > 1) {
417:     PetscCall(MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
418:     PetscFunctionReturn(PETSC_SUCCESS);
419:   }

421:   /* check whether perm is the identity mapping */
422:   PetscCall(ISIdentity(perm, &perm_identity));
423:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
424:   a->permute = PETSC_FALSE;
425:   ai         = a->i;
426:   aj         = a->j;
427:   PetscCall(ISGetIndices(perm, &rip));

429:   /* initialization */
430:   PetscCall(PetscMalloc1(mbs + 1, &ui));
431:   ui[0] = 0;

433:   /* jl: linked list for storing indices of the pivot rows
434:      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
435:   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
436:   for (i = 0; i < mbs; i++) {
437:     jl[i] = mbs;
438:     il[i] = 0;
439:   }

441:   /* create and initialize a linked list for storing column indices of the active row k */
442:   nlnk = mbs + 1;
443:   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));

445:   /* initial FreeSpace size is fill*(ai[mbs]+1) */
446:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space));
447:   current_space = free_space;

449:   for (k = 0; k < mbs; k++) { /* for each active row k */
450:     /* initialize lnk by the column indices of row rip[k] of A */
451:     nzk   = 0;
452:     ncols = ai[rip[k] + 1] - ai[rip[k]];
453:     for (j = 0; j < ncols; j++) {
454:       i       = *(aj + ai[rip[k]] + j);
455:       cols[j] = rip[i];
456:     }
457:     PetscCall(PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt));
458:     nzk += nlnk;

460:     /* update lnk by computing fill-in for each pivot row to be merged in */
461:     prow = jl[k]; /* 1st pivot row */

463:     while (prow < k) {
464:       nextprow = jl[prow];
465:       /* merge prow into k-th row */
466:       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
467:       jmax   = ui[prow + 1];
468:       ncols  = jmax - jmin;
469:       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
470:       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
471:       nzk += nlnk;

473:       /* update il and jl for prow */
474:       if (jmin < jmax) {
475:         il[prow] = jmin;

477:         j        = *uj_ptr;
478:         jl[prow] = jl[j];
479:         jl[j]    = prow;
480:       }
481:       prow = nextprow;
482:     }

484:     /* if free space is not available, make more free space */
485:     if (current_space->local_remaining < nzk) {
486:       i = mbs - k + 1;                                                            /* num of unfactored rows */
487:       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
488:       PetscCall(PetscFreeSpaceGet(i, &current_space));
489:       reallocs++;
490:     }

492:     /* copy data into free space, then initialize lnk */
493:     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));

495:     /* add the k-th row into il and jl */
496:     if (nzk - 1 > 0) {
497:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
498:       jl[k] = jl[i];
499:       jl[i] = k;
500:       il[k] = ui[k] + 1;
501:     }
502:     ui_ptr[k] = current_space->array;

504:     current_space->array += nzk;
505:     current_space->local_used += nzk;
506:     current_space->local_remaining -= nzk;

508:     ui[k + 1] = ui[k] + nzk;
509:   }

511:   PetscCall(ISRestoreIndices(perm, &rip));
512:   PetscCall(PetscFree4(ui_ptr, il, jl, cols));

514:   /* destroy list of free space and other temporary array(s) */
515:   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
516:   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
517:   PetscCall(PetscLLDestroy(lnk, lnkbt));

519:   /* put together the new matrix in MATSEQSBAIJ format */
520:   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));

522:   b               = (Mat_SeqSBAIJ *)fact->data;
523:   b->singlemalloc = PETSC_FALSE;
524:   b->free_a       = PETSC_TRUE;
525:   b->free_ij      = PETSC_TRUE;

527:   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));

529:   b->j    = uj;
530:   b->i    = ui;
531:   b->diag = NULL;
532:   b->ilen = NULL;
533:   b->imax = NULL;
534:   b->row  = perm;

536:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

538:   PetscCall(PetscObjectReference((PetscObject)perm));
539:   b->icol = perm;
540:   PetscCall(PetscObjectReference((PetscObject)perm));
541:   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
542:   b->maxnz = b->nz = ui[mbs];

544:   fact->info.factor_mallocs   = reallocs;
545:   fact->info.fill_ratio_given = fill;
546:   if (ai[mbs] != 0) {
547:     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
548:   } else {
549:     fact->info.fill_ratio_needed = 0.0;
550:   }
551: #if defined(PETSC_USE_INFO)
552:   if (ai[mbs] != 0) {
553:     PetscReal af = fact->info.fill_ratio_needed;
554:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
555:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
556:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
557:   } else {
558:     PetscCall(PetscInfo(A, "Empty matrix\n"));
559:   }
560: #endif
561:   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(fact, perm_identity));
562:   PetscFunctionReturn(PETSC_SUCCESS);
563: }

565: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
566: {
567:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
568:   IS              perm = b->row;
569:   const PetscInt *ai, *aj, *perm_ptr, mbs = a->mbs, *bi = b->i, *bj = b->j;
570:   PetscInt        i, j;
571:   PetscInt       *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
572:   PetscInt        bs = A->rmap->bs, bs2 = a->bs2;
573:   MatScalar      *ba = b->a, *aa, *ap, *dk, *uik;
574:   MatScalar      *u, *diag, *rtmp, *rtmp_ptr;
575:   MatScalar      *work;
576:   PetscInt       *pivots;
577:   PetscBool       allowzeropivot, zeropivotdetected;

579:   PetscFunctionBegin;
580:   /* initialization */
581:   PetscCall(PetscCalloc1(bs2 * mbs, &rtmp));
582:   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
583:   allowzeropivot = PetscNot(A->erroriffailure);

585:   il[0] = 0;
586:   for (i = 0; i < mbs; i++) jl[i] = mbs;

588:   PetscCall(PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work));
589:   PetscCall(PetscMalloc1(bs, &pivots));

591:   PetscCall(ISGetIndices(perm, &perm_ptr));

593:   /* check permutation */
594:   if (!a->permute) {
595:     ai = a->i;
596:     aj = a->j;
597:     aa = a->a;
598:   } else {
599:     ai = a->inew;
600:     aj = a->jnew;
601:     PetscCall(PetscMalloc1(bs2 * ai[mbs], &aa));
602:     PetscCall(PetscArraycpy(aa, a->a, bs2 * ai[mbs]));
603:     PetscCall(PetscMalloc1(ai[mbs], &a2anew));
604:     PetscCall(PetscArraycpy(a2anew, a->a2anew, ai[mbs]));

606:     for (i = 0; i < mbs; i++) {
607:       jmin = ai[i];
608:       jmax = ai[i + 1];
609:       for (j = jmin; j < jmax; j++) {
610:         while (a2anew[j] != j) {
611:           k         = a2anew[j];
612:           a2anew[j] = a2anew[k];
613:           a2anew[k] = k;
614:           for (k1 = 0; k1 < bs2; k1++) {
615:             dk[k1]           = aa[k * bs2 + k1];
616:             aa[k * bs2 + k1] = aa[j * bs2 + k1];
617:             aa[j * bs2 + k1] = dk[k1];
618:           }
619:         }
620:         /* transform column-oriented blocks that lie in the lower triangle to row-oriented blocks */
621:         if (i > aj[j]) {
622:           ap = aa + j * bs2;                       /* ptr to the beginning of j-th block of aa */
623:           for (k = 0; k < bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
624:           for (k = 0; k < bs; k++) {               /* j-th block of aa <- dk^T */
625:             for (k1 = 0; k1 < bs; k1++) *ap++ = dk[k + bs * k1];
626:           }
627:         }
628:       }
629:     }
630:     PetscCall(PetscFree(a2anew));
631:   }

633:   /* for each row k */
634:   for (k = 0; k < mbs; k++) {
635:     /*initialize k-th row with elements nonzero in row perm(k) of A */
636:     jmin = ai[perm_ptr[k]];
637:     jmax = ai[perm_ptr[k] + 1];

639:     ap = aa + jmin * bs2;
640:     for (j = jmin; j < jmax; j++) {
641:       vj       = perm_ptr[aj[j]]; /* block col. index */
642:       rtmp_ptr = rtmp + vj * bs2;
643:       for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
644:     }

646:     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
647:     PetscCall(PetscArraycpy(dk, rtmp + k * bs2, bs2));
648:     i = jl[k]; /* first row to be added to k_th row  */

650:     while (i < k) {
651:       nexti = jl[i]; /* next row to be added to k_th row */

653:       /* compute multiplier */
654:       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */

656:       /* uik = -inv(Di)*U_bar(i,k) */
657:       diag = ba + i * bs2;
658:       u    = ba + ili * bs2;
659:       PetscCall(PetscArrayzero(uik, bs2));
660:       PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);

662:       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
663:       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
664:       PetscCall(PetscLogFlops(4.0 * bs * bs2));

666:       /* update -U(i,k) */
667:       PetscCall(PetscArraycpy(ba + ili * bs2, uik, bs2));

669:       /* add multiple of row i to k-th row ... */
670:       jmin = ili + 1;
671:       jmax = bi[i + 1];
672:       if (jmin < jmax) {
673:         for (j = jmin; j < jmax; j++) {
674:           /* rtmp += -U(i,k)^T * U_bar(i,j) */
675:           rtmp_ptr = rtmp + bj[j] * bs2;
676:           u        = ba + j * bs2;
677:           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
678:         }
679:         PetscCall(PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin)));

681:         /* ... add i to row list for next nonzero entry */
682:         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
683:         j     = bj[jmin];
684:         jl[i] = jl[j];
685:         jl[j] = i; /* update jl */
686:       }
687:       i = nexti;
688:     }

690:     /* save nonzero entries in k-th row of U ... */

692:     /* invert diagonal block */
693:     diag = ba + k * bs2;
694:     PetscCall(PetscArraycpy(diag, dk, bs2));

696:     PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected));
697:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

699:     jmin = bi[k];
700:     jmax = bi[k + 1];
701:     if (jmin < jmax) {
702:       for (j = jmin; j < jmax; j++) {
703:         vj       = bj[j]; /* block col. index of U */
704:         u        = ba + j * bs2;
705:         rtmp_ptr = rtmp + vj * bs2;
706:         for (k1 = 0; k1 < bs2; k1++) {
707:           *u++        = *rtmp_ptr;
708:           *rtmp_ptr++ = 0.0;
709:         }
710:       }

712:       /* ... add k to row list for first nonzero entry in k-th row */
713:       il[k] = jmin;
714:       i     = bj[jmin];
715:       jl[k] = jl[i];
716:       jl[i] = k;
717:     }
718:   }

720:   PetscCall(PetscFree(rtmp));
721:   PetscCall(PetscFree2(il, jl));
722:   PetscCall(PetscFree3(dk, uik, work));
723:   PetscCall(PetscFree(pivots));
724:   if (a->permute) PetscCall(PetscFree(aa));

726:   PetscCall(ISRestoreIndices(perm, &perm_ptr));

728:   C->ops->solve          = MatSolve_SeqSBAIJ_N_inplace;
729:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
730:   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_inplace;
731:   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_inplace;

733:   C->assembled    = PETSC_TRUE;
734:   C->preallocated = PETSC_TRUE;

736:   PetscCall(PetscLogFlops(1.3333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
737:   PetscFunctionReturn(PETSC_SUCCESS);
738: }

740: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
741: {
742:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
743:   PetscInt      i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
744:   PetscInt     *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
745:   PetscInt      bs = A->rmap->bs, bs2 = a->bs2;
746:   MatScalar    *ba = b->a, *aa, *ap, *dk, *uik;
747:   MatScalar    *u, *diag, *rtmp, *rtmp_ptr;
748:   MatScalar    *work;
749:   PetscInt     *pivots;
750:   PetscBool     allowzeropivot, zeropivotdetected;

752:   PetscFunctionBegin;
753:   PetscCall(PetscCalloc1(bs2 * mbs, &rtmp));
754:   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
755:   il[0] = 0;
756:   for (i = 0; i < mbs; i++) jl[i] = mbs;

758:   PetscCall(PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work));
759:   PetscCall(PetscMalloc1(bs, &pivots));
760:   allowzeropivot = PetscNot(A->erroriffailure);

762:   ai = a->i;
763:   aj = a->j;
764:   aa = a->a;

766:   /* for each row k */
767:   for (k = 0; k < mbs; k++) {
768:     /*initialize k-th row with elements nonzero in row k of A */
769:     jmin = ai[k];
770:     jmax = ai[k + 1];
771:     ap   = aa + jmin * bs2;
772:     for (j = jmin; j < jmax; j++) {
773:       vj       = aj[j]; /* block col. index */
774:       rtmp_ptr = rtmp + vj * bs2;
775:       for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
776:     }

778:     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
779:     PetscCall(PetscArraycpy(dk, rtmp + k * bs2, bs2));
780:     i = jl[k]; /* first row to be added to k_th row  */

782:     while (i < k) {
783:       nexti = jl[i]; /* next row to be added to k_th row */

785:       /* compute multiplier */
786:       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */

788:       /* uik = -inv(Di)*U_bar(i,k) */
789:       diag = ba + i * bs2;
790:       u    = ba + ili * bs2;
791:       PetscCall(PetscArrayzero(uik, bs2));
792:       PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);

794:       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
795:       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
796:       PetscCall(PetscLogFlops(2.0 * bs * bs2));

798:       /* update -U(i,k) */
799:       PetscCall(PetscArraycpy(ba + ili * bs2, uik, bs2));

801:       /* add multiple of row i to k-th row ... */
802:       jmin = ili + 1;
803:       jmax = bi[i + 1];
804:       if (jmin < jmax) {
805:         for (j = jmin; j < jmax; j++) {
806:           /* rtmp += -U(i,k)^T * U_bar(i,j) */
807:           rtmp_ptr = rtmp + bj[j] * bs2;
808:           u        = ba + j * bs2;
809:           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
810:         }
811:         PetscCall(PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin)));

813:         /* ... add i to row list for next nonzero entry */
814:         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
815:         j     = bj[jmin];
816:         jl[i] = jl[j];
817:         jl[j] = i; /* update jl */
818:       }
819:       i = nexti;
820:     }

822:     /* save nonzero entries in k-th row of U ... */

824:     /* invert diagonal block */
825:     diag = ba + k * bs2;
826:     PetscCall(PetscArraycpy(diag, dk, bs2));

828:     PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected));
829:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

831:     jmin = bi[k];
832:     jmax = bi[k + 1];
833:     if (jmin < jmax) {
834:       for (j = jmin; j < jmax; j++) {
835:         vj       = bj[j]; /* block col. index of U */
836:         u        = ba + j * bs2;
837:         rtmp_ptr = rtmp + vj * bs2;
838:         for (k1 = 0; k1 < bs2; k1++) {
839:           *u++        = *rtmp_ptr;
840:           *rtmp_ptr++ = 0.0;
841:         }
842:       }

844:       /* ... add k to row list for first nonzero entry in k-th row */
845:       il[k] = jmin;
846:       i     = bj[jmin];
847:       jl[k] = jl[i];
848:       jl[i] = k;
849:     }
850:   }

852:   PetscCall(PetscFree(rtmp));
853:   PetscCall(PetscFree2(il, jl));
854:   PetscCall(PetscFree3(dk, uik, work));
855:   PetscCall(PetscFree(pivots));

857:   C->ops->solve          = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
858:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
859:   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
860:   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
861:   C->assembled           = PETSC_TRUE;
862:   C->preallocated        = PETSC_TRUE;

864:   PetscCall(PetscLogFlops(1.3333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
865:   PetscFunctionReturn(PETSC_SUCCESS);
866: }

868: /*
869:     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
870:     Version for blocks 2 by 2.
871: */
872: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C, Mat A, const MatFactorInfo *info)
873: {
874:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
875:   IS              perm = b->row;
876:   const PetscInt *ai, *aj, *perm_ptr;
877:   PetscInt        i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
878:   PetscInt       *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
879:   MatScalar      *ba = b->a, *aa, *ap;
880:   MatScalar      *u, *diag, *rtmp, *rtmp_ptr, dk[4], uik[4];
881:   PetscReal       shift = info->shiftamount;
882:   PetscBool       allowzeropivot, zeropivotdetected;

884:   PetscFunctionBegin;
885:   allowzeropivot = PetscNot(A->erroriffailure);

887:   /* initialization */
888:   /* il and jl record the first nonzero element in each row of the accessing
889:      window U(0:k, k:mbs-1).
890:      jl:    list of rows to be added to uneliminated rows
891:             i>= k: jl(i) is the first row to be added to row i
892:             i<  k: jl(i) is the row following row i in some list of rows
893:             jl(i) = mbs indicates the end of a list
894:      il(i): points to the first nonzero element in columns k,...,mbs-1 of
895:             row i of U */
896:   PetscCall(PetscCalloc1(4 * mbs, &rtmp));
897:   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
898:   il[0] = 0;
899:   for (i = 0; i < mbs; i++) jl[i] = mbs;

901:   PetscCall(ISGetIndices(perm, &perm_ptr));

903:   /* check permutation */
904:   if (!a->permute) {
905:     ai = a->i;
906:     aj = a->j;
907:     aa = a->a;
908:   } else {
909:     ai = a->inew;
910:     aj = a->jnew;
911:     PetscCall(PetscMalloc1(4 * ai[mbs], &aa));
912:     PetscCall(PetscArraycpy(aa, a->a, 4 * ai[mbs]));
913:     PetscCall(PetscMalloc1(ai[mbs], &a2anew));
914:     PetscCall(PetscArraycpy(a2anew, a->a2anew, ai[mbs]));

916:     for (i = 0; i < mbs; i++) {
917:       jmin = ai[i];
918:       jmax = ai[i + 1];
919:       for (j = jmin; j < jmax; j++) {
920:         while (a2anew[j] != j) {
921:           k         = a2anew[j];
922:           a2anew[j] = a2anew[k];
923:           a2anew[k] = k;
924:           for (k1 = 0; k1 < 4; k1++) {
925:             dk[k1]         = aa[k * 4 + k1];
926:             aa[k * 4 + k1] = aa[j * 4 + k1];
927:             aa[j * 4 + k1] = dk[k1];
928:           }
929:         }
930:         /* transform column-oriented blocks that lie in the lower triangle to row-oriented blocks */
931:         if (i > aj[j]) {
932:           ap    = aa + j * 4; /* ptr to the beginning of the block */
933:           dk[1] = ap[1];      /* swap ap[1] and ap[2] */
934:           ap[1] = ap[2];
935:           ap[2] = dk[1];
936:         }
937:       }
938:     }
939:     PetscCall(PetscFree(a2anew));
940:   }

942:   /* for each row k */
943:   for (k = 0; k < mbs; k++) {
944:     /*initialize k-th row with elements nonzero in row perm(k) of A */
945:     jmin = ai[perm_ptr[k]];
946:     jmax = ai[perm_ptr[k] + 1];
947:     ap   = aa + jmin * 4;
948:     for (j = jmin; j < jmax; j++) {
949:       vj       = perm_ptr[aj[j]]; /* block col. index */
950:       rtmp_ptr = rtmp + vj * 4;
951:       for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
952:     }

954:     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
955:     PetscCall(PetscArraycpy(dk, rtmp + k * 4, 4));
956:     i = jl[k]; /* first row to be added to k_th row  */

958:     while (i < k) {
959:       nexti = jl[i]; /* next row to be added to k_th row */

961:       /* compute multiplier */
962:       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */

964:       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
965:       diag   = ba + i * 4;
966:       u      = ba + ili * 4;
967:       uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
968:       uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
969:       uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
970:       uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);

972:       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
973:       dk[0] += uik[0] * u[0] + uik[1] * u[1];
974:       dk[1] += uik[2] * u[0] + uik[3] * u[1];
975:       dk[2] += uik[0] * u[2] + uik[1] * u[3];
976:       dk[3] += uik[2] * u[2] + uik[3] * u[3];

978:       PetscCall(PetscLogFlops(16.0 * 2.0));

980:       /* update -U(i,k): ba[ili] = uik */
981:       PetscCall(PetscArraycpy(ba + ili * 4, uik, 4));

983:       /* add multiple of row i to k-th row ... */
984:       jmin = ili + 1;
985:       jmax = bi[i + 1];
986:       if (jmin < jmax) {
987:         for (j = jmin; j < jmax; j++) {
988:           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
989:           rtmp_ptr = rtmp + bj[j] * 4;
990:           u        = ba + j * 4;
991:           rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
992:           rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
993:           rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
994:           rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
995:         }
996:         PetscCall(PetscLogFlops(16.0 * (jmax - jmin)));

998:         /* ... add i to row list for next nonzero entry */
999:         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1000:         j     = bj[jmin];
1001:         jl[i] = jl[j];
1002:         jl[j] = i; /* update jl */
1003:       }
1004:       i = nexti;
1005:     }

1007:     /* save nonzero entries in k-th row of U ... */

1009:     /* invert diagonal block */
1010:     diag = ba + k * 4;
1011:     PetscCall(PetscArraycpy(diag, dk, 4));
1012:     PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
1013:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

1015:     jmin = bi[k];
1016:     jmax = bi[k + 1];
1017:     if (jmin < jmax) {
1018:       for (j = jmin; j < jmax; j++) {
1019:         vj       = bj[j]; /* block col. index of U */
1020:         u        = ba + j * 4;
1021:         rtmp_ptr = rtmp + vj * 4;
1022:         for (k1 = 0; k1 < 4; k1++) {
1023:           *u++        = *rtmp_ptr;
1024:           *rtmp_ptr++ = 0.0;
1025:         }
1026:       }

1028:       /* ... add k to row list for first nonzero entry in k-th row */
1029:       il[k] = jmin;
1030:       i     = bj[jmin];
1031:       jl[k] = jl[i];
1032:       jl[i] = k;
1033:     }
1034:   }

1036:   PetscCall(PetscFree(rtmp));
1037:   PetscCall(PetscFree2(il, jl));
1038:   if (a->permute) PetscCall(PetscFree(aa));
1039:   PetscCall(ISRestoreIndices(perm, &perm_ptr));

1041:   C->ops->solve          = MatSolve_SeqSBAIJ_2_inplace;
1042:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1043:   C->assembled           = PETSC_TRUE;
1044:   C->preallocated        = PETSC_TRUE;

1046:   PetscCall(PetscLogFlops(1.3333 * 8 * b->mbs)); /* from inverting diagonal blocks */
1047:   PetscFunctionReturn(PETSC_SUCCESS);
1048: }

1050: /*
1051:       Version for when blocks are 2 by 2 Using natural ordering
1052: */
1053: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
1054: {
1055:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1056:   PetscInt      i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
1057:   PetscInt     *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
1058:   MatScalar    *ba = b->a, *aa, *ap, dk[8], uik[8];
1059:   MatScalar    *u, *diag, *rtmp, *rtmp_ptr;
1060:   PetscReal     shift = info->shiftamount;
1061:   PetscBool     allowzeropivot, zeropivotdetected;

1063:   PetscFunctionBegin;
1064:   allowzeropivot = PetscNot(A->erroriffailure);

1066:   /* initialization */
1067:   /* il and jl record the first nonzero element in each row of the accessing
1068:      window U(0:k, k:mbs-1).
1069:      jl:    list of rows to be added to uneliminated rows
1070:             i>= k: jl(i) is the first row to be added to row i
1071:             i<  k: jl(i) is the row following row i in some list of rows
1072:             jl(i) = mbs indicates the end of a list
1073:      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1074:             row i of U */
1075:   PetscCall(PetscCalloc1(4 * mbs, &rtmp));
1076:   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
1077:   il[0] = 0;
1078:   for (i = 0; i < mbs; i++) jl[i] = mbs;

1080:   ai = a->i;
1081:   aj = a->j;
1082:   aa = a->a;

1084:   /* for each row k */
1085:   for (k = 0; k < mbs; k++) {
1086:     /*initialize k-th row with elements nonzero in row k of A */
1087:     jmin = ai[k];
1088:     jmax = ai[k + 1];
1089:     ap   = aa + jmin * 4;
1090:     for (j = jmin; j < jmax; j++) {
1091:       vj       = aj[j]; /* block col. index */
1092:       rtmp_ptr = rtmp + vj * 4;
1093:       for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
1094:     }

1096:     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1097:     PetscCall(PetscArraycpy(dk, rtmp + k * 4, 4));
1098:     i = jl[k]; /* first row to be added to k_th row  */

1100:     while (i < k) {
1101:       nexti = jl[i]; /* next row to be added to k_th row */

1103:       /* compute multiplier */
1104:       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */

1106:       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1107:       diag   = ba + i * 4;
1108:       u      = ba + ili * 4;
1109:       uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
1110:       uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
1111:       uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
1112:       uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);

1114:       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1115:       dk[0] += uik[0] * u[0] + uik[1] * u[1];
1116:       dk[1] += uik[2] * u[0] + uik[3] * u[1];
1117:       dk[2] += uik[0] * u[2] + uik[1] * u[3];
1118:       dk[3] += uik[2] * u[2] + uik[3] * u[3];

1120:       PetscCall(PetscLogFlops(16.0 * 2.0));

1122:       /* update -U(i,k): ba[ili] = uik */
1123:       PetscCall(PetscArraycpy(ba + ili * 4, uik, 4));

1125:       /* add multiple of row i to k-th row ... */
1126:       jmin = ili + 1;
1127:       jmax = bi[i + 1];
1128:       if (jmin < jmax) {
1129:         for (j = jmin; j < jmax; j++) {
1130:           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1131:           rtmp_ptr = rtmp + bj[j] * 4;
1132:           u        = ba + j * 4;
1133:           rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
1134:           rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
1135:           rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
1136:           rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
1137:         }
1138:         PetscCall(PetscLogFlops(16.0 * (jmax - jmin)));

1140:         /* ... add i to row list for next nonzero entry */
1141:         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1142:         j     = bj[jmin];
1143:         jl[i] = jl[j];
1144:         jl[j] = i; /* update jl */
1145:       }
1146:       i = nexti;
1147:     }

1149:     /* save nonzero entries in k-th row of U ... */

1151:     /* invert diagonal block */
1152:     diag = ba + k * 4;
1153:     PetscCall(PetscArraycpy(diag, dk, 4));
1154:     PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
1155:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

1157:     jmin = bi[k];
1158:     jmax = bi[k + 1];
1159:     if (jmin < jmax) {
1160:       for (j = jmin; j < jmax; j++) {
1161:         vj       = bj[j]; /* block col. index of U */
1162:         u        = ba + j * 4;
1163:         rtmp_ptr = rtmp + vj * 4;
1164:         for (k1 = 0; k1 < 4; k1++) {
1165:           *u++        = *rtmp_ptr;
1166:           *rtmp_ptr++ = 0.0;
1167:         }
1168:       }

1170:       /* ... add k to row list for first nonzero entry in k-th row */
1171:       il[k] = jmin;
1172:       i     = bj[jmin];
1173:       jl[k] = jl[i];
1174:       jl[i] = k;
1175:     }
1176:   }

1178:   PetscCall(PetscFree(rtmp));
1179:   PetscCall(PetscFree2(il, jl));

1181:   C->ops->solve          = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1182:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1183:   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1184:   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1185:   C->assembled           = PETSC_TRUE;
1186:   C->preallocated        = PETSC_TRUE;

1188:   PetscCall(PetscLogFlops(1.3333 * 8 * b->mbs)); /* from inverting diagonal blocks */
1189:   PetscFunctionReturn(PETSC_SUCCESS);
1190: }

1192: /*
1193:     Numeric U^T*D*U factorization for SBAIJ format.
1194:     Version for blocks are 1 by 1.
1195: */
1196: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
1197: {
1198:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1199:   IS              ip = b->row;
1200:   const PetscInt *ai, *aj, *rip;
1201:   PetscInt       *a2anew, i, j, mbs = a->mbs, *bi = b->i, *bj = b->j, *bcol;
1202:   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
1203:   MatScalar      *rtmp, *ba = b->a, *bval, *aa, dk, uikdi;
1204:   PetscReal       rs;
1205:   FactorShiftCtx  sctx;

1207:   PetscFunctionBegin;
1208:   /* MatPivotSetUp(): initialize shift context sctx */
1209:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

1211:   PetscCall(ISGetIndices(ip, &rip));
1212:   if (!a->permute) {
1213:     ai = a->i;
1214:     aj = a->j;
1215:     aa = a->a;
1216:   } else {
1217:     ai = a->inew;
1218:     aj = a->jnew;
1219:     nz = ai[mbs];
1220:     PetscCall(PetscMalloc1(nz, &aa));
1221:     a2anew = a->a2anew;
1222:     bval   = a->a;
1223:     for (j = 0; j < nz; j++) aa[a2anew[j]] = *(bval++);
1224:   }

1226:   /* initialization */
1227:   /* il and jl record the first nonzero element in each row of the accessing
1228:      window U(0:k, k:mbs-1).
1229:      jl:    list of rows to be added to uneliminated rows
1230:             i>= k: jl(i) is the first row to be added to row i
1231:             i<  k: jl(i) is the row following row i in some list of rows
1232:             jl(i) = mbs indicates the end of a list
1233:      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1234:             row i of U */
1235:   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));

1237:   do {
1238:     sctx.newshift = PETSC_FALSE;
1239:     il[0]         = 0;
1240:     for (i = 0; i < mbs; i++) {
1241:       rtmp[i] = 0.0;
1242:       jl[i]   = mbs;
1243:     }

1245:     for (k = 0; k < mbs; k++) {
1246:       /*initialize k-th row by the perm[k]-th row of A */
1247:       jmin = ai[rip[k]];
1248:       jmax = ai[rip[k] + 1];
1249:       bval = ba + bi[k];
1250:       for (j = jmin; j < jmax; j++) {
1251:         col       = rip[aj[j]];
1252:         rtmp[col] = aa[j];
1253:         *bval++   = 0.0; /* for in-place factorization */
1254:       }

1256:       /* shift the diagonal of the matrix */
1257:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

1259:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1260:       dk = rtmp[k];
1261:       i  = jl[k]; /* first row to be added to k_th row  */

1263:       while (i < k) {
1264:         nexti = jl[i]; /* next row to be added to k_th row */

1266:         /* compute multiplier, update diag(k) and U(i,k) */
1267:         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
1268:         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
1269:         dk += uikdi * ba[ili];
1270:         ba[ili] = uikdi; /* -U(i,k) */

1272:         /* add multiple of row i to k-th row */
1273:         jmin = ili + 1;
1274:         jmax = bi[i + 1];
1275:         if (jmin < jmax) {
1276:           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1277:           PetscCall(PetscLogFlops(2.0 * (jmax - jmin)));

1279:           /* update il and jl for row i */
1280:           il[i] = jmin;
1281:           j     = bj[jmin];
1282:           jl[i] = jl[j];
1283:           jl[j] = i;
1284:         }
1285:         i = nexti;
1286:       }

1288:       /* shift the diagonals when zero pivot is detected */
1289:       /* compute rs=sum of abs(off-diagonal) */
1290:       rs   = 0.0;
1291:       jmin = bi[k] + 1;
1292:       nz   = bi[k + 1] - jmin;
1293:       if (nz) {
1294:         bcol = bj + jmin;
1295:         while (nz--) {
1296:           rs += PetscAbsScalar(rtmp[*bcol]);
1297:           bcol++;
1298:         }
1299:       }

1301:       sctx.rs = rs;
1302:       sctx.pv = dk;
1303:       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
1304:       if (sctx.newshift) break; /* sctx.shift_amount is updated */
1305:       dk = sctx.pv;

1307:       /* copy data into U(k,:) */
1308:       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
1309:       jmin      = bi[k] + 1;
1310:       jmax      = bi[k + 1];
1311:       if (jmin < jmax) {
1312:         for (j = jmin; j < jmax; j++) {
1313:           col       = bj[j];
1314:           ba[j]     = rtmp[col];
1315:           rtmp[col] = 0.0;
1316:         }
1317:         /* add the k-th row into il and jl */
1318:         il[k] = jmin;
1319:         i     = bj[jmin];
1320:         jl[k] = jl[i];
1321:         jl[i] = k;
1322:       }
1323:     }
1324:   } while (sctx.newshift);
1325:   PetscCall(PetscFree3(rtmp, il, jl));
1326:   if (a->permute) PetscCall(PetscFree(aa));

1328:   PetscCall(ISRestoreIndices(ip, &rip));

1330:   C->ops->solve          = MatSolve_SeqSBAIJ_1_inplace;
1331:   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1332:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1333:   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_inplace;
1334:   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_inplace;
1335:   C->assembled           = PETSC_TRUE;
1336:   C->preallocated        = PETSC_TRUE;

1338:   PetscCall(PetscLogFlops(C->rmap->N));
1339:   if (sctx.nshift) {
1340:     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1341:       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1342:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1343:       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1344:     }
1345:   }
1346:   PetscFunctionReturn(PETSC_SUCCESS);
1347: }

1349: /*
1350:   Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1351:   Modified from MatCholeskyFactorNumeric_SeqAIJ()
1352: */
1353: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
1354: {
1355:   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ *)A->data;
1356:   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)B->data;
1357:   PetscInt       i, j, mbs = A->rmap->n, *bi = b->i, *bj = b->j, *bdiag = b->diag, *bjtmp;
1358:   PetscInt      *ai = a->i, *aj = a->j, *ajtmp;
1359:   PetscInt       k, jmin, jmax, *c2r, *il, col, nexti, ili, nz;
1360:   MatScalar     *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
1361:   FactorShiftCtx sctx;
1362:   PetscReal      rs;
1363:   MatScalar      d, *v;

1365:   PetscFunctionBegin;
1366:   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &c2r));

1368:   /* MatPivotSetUp(): initialize shift context sctx */
1369:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

1371:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1372:     sctx.shift_top = info->zeropivot;

1374:     PetscCall(PetscArrayzero(rtmp, mbs));

1376:     for (i = 0; i < mbs; i++) {
1377:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1378:       d = (aa)[a->diag[i]];
1379:       rtmp[i] += -PetscRealPart(d); /* diagonal entry */
1380:       ajtmp = aj + ai[i] + 1;       /* exclude diagonal */
1381:       v     = aa + ai[i] + 1;
1382:       nz    = ai[i + 1] - ai[i] - 1;
1383:       for (j = 0; j < nz; j++) {
1384:         rtmp[i] += PetscAbsScalar(v[j]);
1385:         rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1386:       }
1387:       if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1388:     }
1389:     sctx.shift_top *= 1.1;
1390:     sctx.nshift_max = 5;
1391:     sctx.shift_lo   = 0.;
1392:     sctx.shift_hi   = 1.;
1393:   }

1395:   /* allocate working arrays
1396:      c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1397:      il:  for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1398:   */
1399:   do {
1400:     sctx.newshift = PETSC_FALSE;

1402:     for (i = 0; i < mbs; i++) c2r[i] = mbs;
1403:     if (mbs) il[0] = 0;

1405:     for (k = 0; k < mbs; k++) {
1406:       /* zero rtmp */
1407:       nz    = bi[k + 1] - bi[k];
1408:       bjtmp = bj + bi[k];
1409:       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

1411:       /* load in initial unfactored row */
1412:       bval = ba + bi[k];
1413:       jmin = ai[k];
1414:       jmax = ai[k + 1];
1415:       for (j = jmin; j < jmax; j++) {
1416:         col       = aj[j];
1417:         rtmp[col] = aa[j];
1418:         *bval++   = 0.0; /* for in-place factorization */
1419:       }
1420:       /* shift the diagonal of the matrix: ZeropivotApply() */
1421:       rtmp[k] += sctx.shift_amount; /* shift the diagonal of the matrix */

1423:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1424:       dk = rtmp[k];
1425:       i  = c2r[k]; /* first row to be added to k_th row  */

1427:       while (i < k) {
1428:         nexti = c2r[i]; /* next row to be added to k_th row */

1430:         /* compute multiplier, update diag(k) and U(i,k) */
1431:         ili   = il[i];                   /* index of first nonzero element in U(i,k:bms-1) */
1432:         uikdi = -ba[ili] * ba[bdiag[i]]; /* diagonal(k) */
1433:         dk += uikdi * ba[ili];           /* update diag[k] */
1434:         ba[ili] = uikdi;                 /* -U(i,k) */

1436:         /* add multiple of row i to k-th row */
1437:         jmin = ili + 1;
1438:         jmax = bi[i + 1];
1439:         if (jmin < jmax) {
1440:           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1441:           /* update il and c2r for row i */
1442:           il[i]  = jmin;
1443:           j      = bj[jmin];
1444:           c2r[i] = c2r[j];
1445:           c2r[j] = i;
1446:         }
1447:         i = nexti;
1448:       }

1450:       /* copy data into U(k,:) */
1451:       rs   = 0.0;
1452:       jmin = bi[k];
1453:       jmax = bi[k + 1] - 1;
1454:       if (jmin < jmax) {
1455:         for (j = jmin; j < jmax; j++) {
1456:           col   = bj[j];
1457:           ba[j] = rtmp[col];
1458:           rs += PetscAbsScalar(ba[j]);
1459:         }
1460:         /* add the k-th row into il and c2r */
1461:         il[k]  = jmin;
1462:         i      = bj[jmin];
1463:         c2r[k] = c2r[i];
1464:         c2r[i] = k;
1465:       }

1467:       sctx.rs = rs;
1468:       sctx.pv = dk;
1469:       PetscCall(MatPivotCheck(B, A, info, &sctx, k));
1470:       if (sctx.newshift) break;
1471:       dk = sctx.pv;

1473:       ba[bdiag[k]] = 1.0 / dk; /* U(k,k) */
1474:     }
1475:   } while (sctx.newshift);

1477:   PetscCall(PetscFree3(rtmp, il, c2r));

1479:   B->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1480:   B->ops->solves         = MatSolves_SeqSBAIJ_1;
1481:   B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1482:   B->ops->matsolve       = MatMatSolve_SeqSBAIJ_1_NaturalOrdering;
1483:   B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1484:   B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;

1486:   B->assembled    = PETSC_TRUE;
1487:   B->preallocated = PETSC_TRUE;

1489:   PetscCall(PetscLogFlops(B->rmap->n));

1491:   /* MatPivotView() */
1492:   if (sctx.nshift) {
1493:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1494:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1495:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1496:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1497:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1498:       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1499:     }
1500:   }
1501:   PetscFunctionReturn(PETSC_SUCCESS);
1502: }

1504: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
1505: {
1506:   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1507:   PetscInt       i, j, mbs = a->mbs;
1508:   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
1509:   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
1510:   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
1511:   PetscReal      rs;
1512:   FactorShiftCtx sctx;

1514:   PetscFunctionBegin;
1515:   /* MatPivotSetUp(): initialize shift context sctx */
1516:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

1518:   /* initialization */
1519:   /* il and jl record the first nonzero element in each row of the accessing
1520:      window U(0:k, k:mbs-1).
1521:      jl:    list of rows to be added to uneliminated rows
1522:             i>= k: jl(i) is the first row to be added to row i
1523:             i<  k: jl(i) is the row following row i in some list of rows
1524:             jl(i) = mbs indicates the end of a list
1525:      il(i): points to the first nonzero element in U(i,k:mbs-1)
1526:   */
1527:   PetscCall(PetscMalloc1(mbs, &rtmp));
1528:   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));

1530:   do {
1531:     sctx.newshift = PETSC_FALSE;
1532:     il[0]         = 0;
1533:     for (i = 0; i < mbs; i++) {
1534:       rtmp[i] = 0.0;
1535:       jl[i]   = mbs;
1536:     }

1538:     for (k = 0; k < mbs; k++) {
1539:       /*initialize k-th row with elements nonzero in row perm(k) of A */
1540:       nz   = ai[k + 1] - ai[k];
1541:       acol = aj + ai[k];
1542:       aval = aa + ai[k];
1543:       bval = ba + bi[k];
1544:       while (nz--) {
1545:         rtmp[*acol++] = *aval++;
1546:         *bval++       = 0.0; /* for in-place factorization */
1547:       }

1549:       /* shift the diagonal of the matrix */
1550:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

1552:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1553:       dk = rtmp[k];
1554:       i  = jl[k]; /* first row to be added to k_th row  */

1556:       while (i < k) {
1557:         nexti = jl[i]; /* next row to be added to k_th row */
1558:         /* compute multiplier, update D(k) and U(i,k) */
1559:         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1560:         uikdi = -ba[ili] * ba[bi[i]];
1561:         dk += uikdi * ba[ili];
1562:         ba[ili] = uikdi; /* -U(i,k) */

1564:         /* add multiple of row i to k-th row ... */
1565:         jmin = ili + 1;
1566:         nz   = bi[i + 1] - jmin;
1567:         if (nz > 0) {
1568:           bcol = bj + jmin;
1569:           bval = ba + jmin;
1570:           PetscCall(PetscLogFlops(2.0 * nz));
1571:           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);

1573:           /* update il and jl for i-th row */
1574:           il[i] = jmin;
1575:           j     = bj[jmin];
1576:           jl[i] = jl[j];
1577:           jl[j] = i;
1578:         }
1579:         i = nexti;
1580:       }

1582:       /* shift the diagonals when zero pivot is detected */
1583:       /* compute rs=sum of abs(off-diagonal) */
1584:       rs   = 0.0;
1585:       jmin = bi[k] + 1;
1586:       nz   = bi[k + 1] - jmin;
1587:       if (nz) {
1588:         bcol = bj + jmin;
1589:         while (nz--) {
1590:           rs += PetscAbsScalar(rtmp[*bcol]);
1591:           bcol++;
1592:         }
1593:       }

1595:       sctx.rs = rs;
1596:       sctx.pv = dk;
1597:       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
1598:       if (sctx.newshift) break; /* sctx.shift_amount is updated */
1599:       dk = sctx.pv;

1601:       /* copy data into U(k,:) */
1602:       ba[bi[k]] = 1.0 / dk;
1603:       jmin      = bi[k] + 1;
1604:       nz        = bi[k + 1] - jmin;
1605:       if (nz) {
1606:         bcol = bj + jmin;
1607:         bval = ba + jmin;
1608:         while (nz--) {
1609:           *bval++       = rtmp[*bcol];
1610:           rtmp[*bcol++] = 0.0;
1611:         }
1612:         /* add k-th row into il and jl */
1613:         il[k] = jmin;
1614:         i     = bj[jmin];
1615:         jl[k] = jl[i];
1616:         jl[i] = k;
1617:       }
1618:     } /* end of for (k = 0; k<mbs; k++) */
1619:   } while (sctx.newshift);
1620:   PetscCall(PetscFree(rtmp));
1621:   PetscCall(PetscFree2(il, jl));

1623:   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1624:   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1625:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1626:   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1627:   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;

1629:   C->assembled    = PETSC_TRUE;
1630:   C->preallocated = PETSC_TRUE;

1632:   PetscCall(PetscLogFlops(C->rmap->N));
1633:   if (sctx.nshift) {
1634:     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1635:       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1636:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1637:       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1638:     }
1639:   }
1640:   PetscFunctionReturn(PETSC_SUCCESS);
1641: }

1643: PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A, IS perm, const MatFactorInfo *info)
1644: {
1645:   Mat C;

1647:   PetscFunctionBegin;
1648:   PetscCall(MatGetFactor(A, "petsc", MAT_FACTOR_CHOLESKY, &C));
1649:   PetscCall(MatCholeskyFactorSymbolic(C, A, perm, info));
1650:   PetscCall(MatCholeskyFactorNumeric(C, A, info));

1652:   A->ops->solve          = C->ops->solve;
1653:   A->ops->solvetranspose = C->ops->solvetranspose;

1655:   PetscCall(MatHeaderMerge(A, &C));
1656:   PetscFunctionReturn(PETSC_SUCCESS);
1657: }