Actual source code: baijfact.c
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
9: {
10: Mat C = B;
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
12: IS isrow = b->row, isicol = b->icol;
13: const PetscInt *r, *ic;
14: PetscInt i, j, k, nz, nzL, row, *pj;
15: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
16: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
17: MatScalar *rtmp, *pc, *mwork, *pv;
18: MatScalar *aa = a->a, *v;
19: PetscInt flg;
20: PetscReal shift = info->shiftamount;
21: PetscBool allowzeropivot, zeropivotdetected;
23: ISGetIndices(isrow, &r);
24: ISGetIndices(isicol, &ic);
25: allowzeropivot = PetscNot(A->erroriffailure);
27: /* generate work space needed by the factorization */
28: PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
29: PetscArrayzero(rtmp, bs2 * n);
31: for (i = 0; i < n; i++) {
32: /* zero rtmp */
33: /* L part */
34: nz = bi[i + 1] - bi[i];
35: bjtmp = bj + bi[i];
36: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
38: /* U part */
39: nz = bdiag[i] - bdiag[i + 1];
40: bjtmp = bj + bdiag[i + 1] + 1;
41: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
43: /* load in initial (unfactored row) */
44: nz = ai[r[i] + 1] - ai[r[i]];
45: ajtmp = aj + ai[r[i]];
46: v = aa + bs2 * ai[r[i]];
47: for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2);
49: /* elimination */
50: bjtmp = bj + bi[i];
51: nzL = bi[i + 1] - bi[i];
52: for (k = 0; k < nzL; k++) {
53: row = bjtmp[k];
54: pc = rtmp + bs2 * row;
55: for (flg = 0, j = 0; j < bs2; j++) {
56: if (pc[j] != (PetscScalar)0.0) {
57: flg = 1;
58: break;
59: }
60: }
61: if (flg) {
62: pv = b->a + bs2 * bdiag[row];
63: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64: PetscKernel_A_gets_A_times_B_2(pc, pv, mwork);
66: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67: pv = b->a + bs2 * (bdiag[row + 1] + 1);
68: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69: for (j = 0; j < nz; j++) {
70: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72: v = rtmp + 4 * pj[j];
73: PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv);
74: pv += 4;
75: }
76: PetscLogFlops(16.0 * nz + 12); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77: }
78: }
80: /* finished row so stick it into b->a */
81: /* L part */
82: pv = b->a + bs2 * bi[i];
83: pj = b->j + bi[i];
84: nz = bi[i + 1] - bi[i];
85: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
87: /* Mark diagonal and invert diagonal for simpler triangular solves */
88: pv = b->a + bs2 * bdiag[i];
89: pj = b->j + bdiag[i];
90: PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
91: PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected);
92: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
94: /* U part */
95: pv = b->a + bs2 * (bdiag[i + 1] + 1);
96: pj = b->j + bdiag[i + 1] + 1;
97: nz = bdiag[i] - bdiag[i + 1] - 1;
98: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
99: }
101: PetscFree2(rtmp, mwork);
102: ISRestoreIndices(isicol, &ic);
103: ISRestoreIndices(isrow, &r);
105: C->ops->solve = MatSolve_SeqBAIJ_2;
106: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
107: C->assembled = PETSC_TRUE;
109: PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n); /* from inverting diagonal blocks */
110: return 0;
111: }
113: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
114: {
115: Mat C = B;
116: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
117: PetscInt i, j, k, nz, nzL, row, *pj;
118: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
119: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
120: MatScalar *rtmp, *pc, *mwork, *pv;
121: MatScalar *aa = a->a, *v;
122: PetscInt flg;
123: PetscReal shift = info->shiftamount;
124: PetscBool allowzeropivot, zeropivotdetected;
126: allowzeropivot = PetscNot(A->erroriffailure);
128: /* generate work space needed by the factorization */
129: PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork);
130: PetscArrayzero(rtmp, bs2 * n);
132: for (i = 0; i < n; i++) {
133: /* zero rtmp */
134: /* L part */
135: nz = bi[i + 1] - bi[i];
136: bjtmp = bj + bi[i];
137: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
139: /* U part */
140: nz = bdiag[i] - bdiag[i + 1];
141: bjtmp = bj + bdiag[i + 1] + 1;
142: for (j = 0; j < nz; j++) PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2);
144: /* load in initial (unfactored row) */
145: nz = ai[i + 1] - ai[i];
146: ajtmp = aj + ai[i];
147: v = aa + bs2 * ai[i];
148: for (j = 0; j < nz; j++) PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2);
150: /* elimination */
151: bjtmp = bj + bi[i];
152: nzL = bi[i + 1] - bi[i];
153: for (k = 0; k < nzL; k++) {
154: row = bjtmp[k];
155: pc = rtmp + bs2 * row;
156: for (flg = 0, j = 0; j < bs2; j++) {
157: if (pc[j] != (PetscScalar)0.0) {
158: flg = 1;
159: break;
160: }
161: }
162: if (flg) {
163: pv = b->a + bs2 * bdiag[row];
164: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
165: PetscKernel_A_gets_A_times_B_2(pc, pv, mwork);
167: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
168: pv = b->a + bs2 * (bdiag[row + 1] + 1);
169: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
170: for (j = 0; j < nz; j++) {
171: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
172: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
173: v = rtmp + 4 * pj[j];
174: PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv);
175: pv += 4;
176: }
177: PetscLogFlops(16.0 * nz + 12); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
178: }
179: }
181: /* finished row so stick it into b->a */
182: /* L part */
183: pv = b->a + bs2 * bi[i];
184: pj = b->j + bi[i];
185: nz = bi[i + 1] - bi[i];
186: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
188: /* Mark diagonal and invert diagonal for simpler triangular solves */
189: pv = b->a + bs2 * bdiag[i];
190: pj = b->j + bdiag[i];
191: PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2);
192: PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected);
193: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
195: /* U part */
196: /*
197: pv = b->a + bs2*bi[2*n-i];
198: pj = b->j + bi[2*n-i];
199: nz = bi[2*n-i+1] - bi[2*n-i] - 1;
200: */
201: pv = b->a + bs2 * (bdiag[i + 1] + 1);
202: pj = b->j + bdiag[i + 1] + 1;
203: nz = bdiag[i] - bdiag[i + 1] - 1;
204: for (j = 0; j < nz; j++) PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2);
205: }
206: PetscFree2(rtmp, mwork);
208: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
209: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
210: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
211: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
212: C->assembled = PETSC_TRUE;
214: PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n); /* from inverting diagonal blocks */
215: return 0;
216: }
218: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
219: {
220: Mat C = B;
221: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
222: IS isrow = b->row, isicol = b->icol;
223: const PetscInt *r, *ic;
224: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
225: PetscInt *ajtmpold, *ajtmp, nz, row;
226: PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
227: MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
228: MatScalar p1, p2, p3, p4;
229: MatScalar *ba = b->a, *aa = a->a;
230: PetscReal shift = info->shiftamount;
231: PetscBool allowzeropivot, zeropivotdetected;
233: allowzeropivot = PetscNot(A->erroriffailure);
234: ISGetIndices(isrow, &r);
235: ISGetIndices(isicol, &ic);
236: PetscMalloc1(4 * (n + 1), &rtmp);
238: for (i = 0; i < n; i++) {
239: nz = bi[i + 1] - bi[i];
240: ajtmp = bj + bi[i];
241: for (j = 0; j < nz; j++) {
242: x = rtmp + 4 * ajtmp[j];
243: x[0] = x[1] = x[2] = x[3] = 0.0;
244: }
245: /* load in initial (unfactored row) */
246: idx = r[i];
247: nz = ai[idx + 1] - ai[idx];
248: ajtmpold = aj + ai[idx];
249: v = aa + 4 * ai[idx];
250: for (j = 0; j < nz; j++) {
251: x = rtmp + 4 * ic[ajtmpold[j]];
252: x[0] = v[0];
253: x[1] = v[1];
254: x[2] = v[2];
255: x[3] = v[3];
256: v += 4;
257: }
258: row = *ajtmp++;
259: while (row < i) {
260: pc = rtmp + 4 * row;
261: p1 = pc[0];
262: p2 = pc[1];
263: p3 = pc[2];
264: p4 = pc[3];
265: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
266: pv = ba + 4 * diag_offset[row];
267: pj = bj + diag_offset[row] + 1;
268: x1 = pv[0];
269: x2 = pv[1];
270: x3 = pv[2];
271: x4 = pv[3];
272: pc[0] = m1 = p1 * x1 + p3 * x2;
273: pc[1] = m2 = p2 * x1 + p4 * x2;
274: pc[2] = m3 = p1 * x3 + p3 * x4;
275: pc[3] = m4 = p2 * x3 + p4 * x4;
276: nz = bi[row + 1] - diag_offset[row] - 1;
277: pv += 4;
278: for (j = 0; j < nz; j++) {
279: x1 = pv[0];
280: x2 = pv[1];
281: x3 = pv[2];
282: x4 = pv[3];
283: x = rtmp + 4 * pj[j];
284: x[0] -= m1 * x1 + m3 * x2;
285: x[1] -= m2 * x1 + m4 * x2;
286: x[2] -= m1 * x3 + m3 * x4;
287: x[3] -= m2 * x3 + m4 * x4;
288: pv += 4;
289: }
290: PetscLogFlops(16.0 * nz + 12.0);
291: }
292: row = *ajtmp++;
293: }
294: /* finished row so stick it into b->a */
295: pv = ba + 4 * bi[i];
296: pj = bj + bi[i];
297: nz = bi[i + 1] - bi[i];
298: for (j = 0; j < nz; j++) {
299: x = rtmp + 4 * pj[j];
300: pv[0] = x[0];
301: pv[1] = x[1];
302: pv[2] = x[2];
303: pv[3] = x[3];
304: pv += 4;
305: }
306: /* invert diagonal block */
307: w = ba + 4 * diag_offset[i];
308: PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected);
309: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
310: }
312: PetscFree(rtmp);
313: ISRestoreIndices(isicol, &ic);
314: ISRestoreIndices(isrow, &r);
316: C->ops->solve = MatSolve_SeqBAIJ_2_inplace;
317: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
318: C->assembled = PETSC_TRUE;
320: PetscLogFlops(1.333333333333 * 8 * b->mbs); /* from inverting diagonal blocks */
321: return 0;
322: }
323: /*
324: Version for when blocks are 2 by 2 Using natural ordering
325: */
326: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
327: {
328: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
329: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
330: PetscInt *ajtmpold, *ajtmp, nz, row;
331: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
332: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
333: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
334: MatScalar *ba = b->a, *aa = a->a;
335: PetscReal shift = info->shiftamount;
336: PetscBool allowzeropivot, zeropivotdetected;
338: allowzeropivot = PetscNot(A->erroriffailure);
339: PetscMalloc1(4 * (n + 1), &rtmp);
340: for (i = 0; i < n; i++) {
341: nz = bi[i + 1] - bi[i];
342: ajtmp = bj + bi[i];
343: for (j = 0; j < nz; j++) {
344: x = rtmp + 4 * ajtmp[j];
345: x[0] = x[1] = x[2] = x[3] = 0.0;
346: }
347: /* load in initial (unfactored row) */
348: nz = ai[i + 1] - ai[i];
349: ajtmpold = aj + ai[i];
350: v = aa + 4 * ai[i];
351: for (j = 0; j < nz; j++) {
352: x = rtmp + 4 * ajtmpold[j];
353: x[0] = v[0];
354: x[1] = v[1];
355: x[2] = v[2];
356: x[3] = v[3];
357: v += 4;
358: }
359: row = *ajtmp++;
360: while (row < i) {
361: pc = rtmp + 4 * row;
362: p1 = pc[0];
363: p2 = pc[1];
364: p3 = pc[2];
365: p4 = pc[3];
366: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
367: pv = ba + 4 * diag_offset[row];
368: pj = bj + diag_offset[row] + 1;
369: x1 = pv[0];
370: x2 = pv[1];
371: x3 = pv[2];
372: x4 = pv[3];
373: pc[0] = m1 = p1 * x1 + p3 * x2;
374: pc[1] = m2 = p2 * x1 + p4 * x2;
375: pc[2] = m3 = p1 * x3 + p3 * x4;
376: pc[3] = m4 = p2 * x3 + p4 * x4;
377: nz = bi[row + 1] - diag_offset[row] - 1;
378: pv += 4;
379: for (j = 0; j < nz; j++) {
380: x1 = pv[0];
381: x2 = pv[1];
382: x3 = pv[2];
383: x4 = pv[3];
384: x = rtmp + 4 * pj[j];
385: x[0] -= m1 * x1 + m3 * x2;
386: x[1] -= m2 * x1 + m4 * x2;
387: x[2] -= m1 * x3 + m3 * x4;
388: x[3] -= m2 * x3 + m4 * x4;
389: pv += 4;
390: }
391: PetscLogFlops(16.0 * nz + 12.0);
392: }
393: row = *ajtmp++;
394: }
395: /* finished row so stick it into b->a */
396: pv = ba + 4 * bi[i];
397: pj = bj + bi[i];
398: nz = bi[i + 1] - bi[i];
399: for (j = 0; j < nz; j++) {
400: x = rtmp + 4 * pj[j];
401: pv[0] = x[0];
402: pv[1] = x[1];
403: pv[2] = x[2];
404: pv[3] = x[3];
405: /*
406: printf(" col %d:",pj[j]);
407: PetscInt j1;
408: for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
409: printf("\n");
410: */
411: pv += 4;
412: }
413: /* invert diagonal block */
414: w = ba + 4 * diag_offset[i];
415: PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected);
416: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
417: }
419: PetscFree(rtmp);
421: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
422: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
423: C->assembled = PETSC_TRUE;
425: PetscLogFlops(1.333333333333 * 8 * b->mbs); /* from inverting diagonal blocks */
426: return 0;
427: }
429: /* ----------------------------------------------------------- */
430: /*
431: Version for when blocks are 1 by 1.
432: */
433: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
434: {
435: Mat C = B;
436: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
437: IS isrow = b->row, isicol = b->icol;
438: const PetscInt *r, *ic, *ics;
439: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
440: PetscInt i, j, k, nz, nzL, row, *pj;
441: const PetscInt *ajtmp, *bjtmp;
442: MatScalar *rtmp, *pc, multiplier, *pv;
443: const MatScalar *aa = a->a, *v;
444: PetscBool row_identity, col_identity;
445: FactorShiftCtx sctx;
446: const PetscInt *ddiag;
447: PetscReal rs;
448: MatScalar d;
450: /* MatPivotSetUp(): initialize shift context sctx */
451: PetscMemzero(&sctx, sizeof(FactorShiftCtx));
453: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
454: ddiag = a->diag;
455: sctx.shift_top = info->zeropivot;
456: for (i = 0; i < n; i++) {
457: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
458: d = (aa)[ddiag[i]];
459: rs = -PetscAbsScalar(d) - PetscRealPart(d);
460: v = aa + ai[i];
461: nz = ai[i + 1] - ai[i];
462: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
463: if (rs > sctx.shift_top) sctx.shift_top = rs;
464: }
465: sctx.shift_top *= 1.1;
466: sctx.nshift_max = 5;
467: sctx.shift_lo = 0.;
468: sctx.shift_hi = 1.;
469: }
471: ISGetIndices(isrow, &r);
472: ISGetIndices(isicol, &ic);
473: PetscMalloc1(n + 1, &rtmp);
474: ics = ic;
476: do {
477: sctx.newshift = PETSC_FALSE;
478: for (i = 0; i < n; i++) {
479: /* zero rtmp */
480: /* L part */
481: nz = bi[i + 1] - bi[i];
482: bjtmp = bj + bi[i];
483: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
485: /* U part */
486: nz = bdiag[i] - bdiag[i + 1];
487: bjtmp = bj + bdiag[i + 1] + 1;
488: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
490: /* load in initial (unfactored row) */
491: nz = ai[r[i] + 1] - ai[r[i]];
492: ajtmp = aj + ai[r[i]];
493: v = aa + ai[r[i]];
494: for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
496: /* ZeropivotApply() */
497: rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
499: /* elimination */
500: bjtmp = bj + bi[i];
501: row = *bjtmp++;
502: nzL = bi[i + 1] - bi[i];
503: for (k = 0; k < nzL; k++) {
504: pc = rtmp + row;
505: if (*pc != (PetscScalar)0.0) {
506: pv = b->a + bdiag[row];
507: multiplier = *pc * (*pv);
508: *pc = multiplier;
510: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
511: pv = b->a + bdiag[row + 1] + 1;
512: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
513: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
514: PetscLogFlops(2.0 * nz);
515: }
516: row = *bjtmp++;
517: }
519: /* finished row so stick it into b->a */
520: rs = 0.0;
521: /* L part */
522: pv = b->a + bi[i];
523: pj = b->j + bi[i];
524: nz = bi[i + 1] - bi[i];
525: for (j = 0; j < nz; j++) {
526: pv[j] = rtmp[pj[j]];
527: rs += PetscAbsScalar(pv[j]);
528: }
530: /* U part */
531: pv = b->a + bdiag[i + 1] + 1;
532: pj = b->j + bdiag[i + 1] + 1;
533: nz = bdiag[i] - bdiag[i + 1] - 1;
534: for (j = 0; j < nz; j++) {
535: pv[j] = rtmp[pj[j]];
536: rs += PetscAbsScalar(pv[j]);
537: }
539: sctx.rs = rs;
540: sctx.pv = rtmp[i];
541: MatPivotCheck(B, A, info, &sctx, i);
542: if (sctx.newshift) break; /* break for-loop */
543: rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
545: /* Mark diagonal and invert diagonal for simpler triangular solves */
546: pv = b->a + bdiag[i];
547: *pv = (PetscScalar)1.0 / rtmp[i];
549: } /* endof for (i=0; i<n; i++) { */
551: /* MatPivotRefine() */
552: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
553: /*
554: * if no shift in this attempt & shifting & started shifting & can refine,
555: * then try lower shift
556: */
557: sctx.shift_hi = sctx.shift_fraction;
558: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
559: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
560: sctx.newshift = PETSC_TRUE;
561: sctx.nshift++;
562: }
563: } while (sctx.newshift);
565: PetscFree(rtmp);
566: ISRestoreIndices(isicol, &ic);
567: ISRestoreIndices(isrow, &r);
569: ISIdentity(isrow, &row_identity);
570: ISIdentity(isicol, &col_identity);
571: if (row_identity && col_identity) {
572: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
573: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
574: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
575: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
576: } else {
577: C->ops->solve = MatSolve_SeqBAIJ_1;
578: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
579: }
580: C->assembled = PETSC_TRUE;
581: PetscLogFlops(C->cmap->n);
583: /* MatShiftView(A,info,&sctx) */
584: if (sctx.nshift) {
585: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
586: PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top);
587: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
588: PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
589: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
590: PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount);
591: }
592: }
593: return 0;
594: }
596: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
597: {
598: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
599: IS isrow = b->row, isicol = b->icol;
600: const PetscInt *r, *ic;
601: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
602: PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
603: PetscInt *diag_offset = b->diag, diag, *pj;
604: MatScalar *pv, *v, *rtmp, multiplier, *pc;
605: MatScalar *ba = b->a, *aa = a->a;
606: PetscBool row_identity, col_identity;
608: ISGetIndices(isrow, &r);
609: ISGetIndices(isicol, &ic);
610: PetscMalloc1(n + 1, &rtmp);
612: for (i = 0; i < n; i++) {
613: nz = bi[i + 1] - bi[i];
614: ajtmp = bj + bi[i];
615: for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
617: /* load in initial (unfactored row) */
618: nz = ai[r[i] + 1] - ai[r[i]];
619: ajtmpold = aj + ai[r[i]];
620: v = aa + ai[r[i]];
621: for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
623: row = *ajtmp++;
624: while (row < i) {
625: pc = rtmp + row;
626: if (*pc != 0.0) {
627: pv = ba + diag_offset[row];
628: pj = bj + diag_offset[row] + 1;
629: multiplier = *pc * *pv++;
630: *pc = multiplier;
631: nz = bi[row + 1] - diag_offset[row] - 1;
632: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
633: PetscLogFlops(1.0 + 2.0 * nz);
634: }
635: row = *ajtmp++;
636: }
637: /* finished row so stick it into b->a */
638: pv = ba + bi[i];
639: pj = bj + bi[i];
640: nz = bi[i + 1] - bi[i];
641: for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
642: diag = diag_offset[i] - bi[i];
643: /* check pivot entry for current row */
645: pv[diag] = 1.0 / pv[diag];
646: }
648: PetscFree(rtmp);
649: ISRestoreIndices(isicol, &ic);
650: ISRestoreIndices(isrow, &r);
651: ISIdentity(isrow, &row_identity);
652: ISIdentity(isicol, &col_identity);
653: if (row_identity && col_identity) {
654: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
655: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
656: } else {
657: C->ops->solve = MatSolve_SeqBAIJ_1_inplace;
658: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
659: }
660: C->assembled = PETSC_TRUE;
661: PetscLogFlops(C->cmap->n);
662: return 0;
663: }
665: static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
666: {
667: *type = MATSOLVERPETSC;
668: return 0;
669: }
671: PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
672: {
673: PetscInt n = A->rmap->n;
675: #if defined(PETSC_USE_COMPLEX)
677: #endif
678: MatCreate(PetscObjectComm((PetscObject)A), B);
679: MatSetSizes(*B, n, n, n, n);
680: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
681: MatSetType(*B, MATSEQBAIJ);
683: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ;
684: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
685: PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
686: PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]);
687: PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
688: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
689: MatSetType(*B, MATSEQSBAIJ);
690: MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL);
692: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ;
693: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
694: /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
695: PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
696: PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]);
697: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
698: (*B)->factortype = ftype;
699: (*B)->canuseordering = PETSC_TRUE;
701: PetscFree((*B)->solvertype);
702: PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype);
703: PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc);
704: return 0;
705: }
707: /* ----------------------------------------------------------- */
708: PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
709: {
710: Mat C;
712: MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C);
713: MatLUFactorSymbolic(C, A, row, col, info);
714: MatLUFactorNumeric(C, A, info);
716: A->ops->solve = C->ops->solve;
717: A->ops->solvetranspose = C->ops->solvetranspose;
719: MatHeaderMerge(A, &C);
720: return 0;
721: }
723: #include <../src/mat/impls/sbaij/seq/sbaij.h>
724: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
725: {
726: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
727: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
728: IS ip = b->row;
729: const PetscInt *rip;
730: PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
731: PetscInt *ai = a->i, *aj = a->j;
732: PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
733: MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
734: PetscReal rs;
735: FactorShiftCtx sctx;
737: if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
738: if (!a->sbaijMat) MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat);
739: (a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info);
740: MatDestroy(&a->sbaijMat);
741: return 0;
742: }
744: /* MatPivotSetUp(): initialize shift context sctx */
745: PetscMemzero(&sctx, sizeof(FactorShiftCtx));
747: ISGetIndices(ip, &rip);
748: PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl);
750: sctx.shift_amount = 0.;
751: sctx.nshift = 0;
752: do {
753: sctx.newshift = PETSC_FALSE;
754: for (i = 0; i < mbs; i++) {
755: rtmp[i] = 0.0;
756: jl[i] = mbs;
757: il[0] = 0;
758: }
760: for (k = 0; k < mbs; k++) {
761: bval = ba + bi[k];
762: /* initialize k-th row by the perm[k]-th row of A */
763: jmin = ai[rip[k]];
764: jmax = ai[rip[k] + 1];
765: for (j = jmin; j < jmax; j++) {
766: col = rip[aj[j]];
767: if (col >= k) { /* only take upper triangular entry */
768: rtmp[col] = aa[j];
769: *bval++ = 0.0; /* for in-place factorization */
770: }
771: }
773: /* shift the diagonal of the matrix */
774: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
776: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
777: dk = rtmp[k];
778: i = jl[k]; /* first row to be added to k_th row */
780: while (i < k) {
781: nexti = jl[i]; /* next row to be added to k_th row */
783: /* compute multiplier, update diag(k) and U(i,k) */
784: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
785: uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
786: dk += uikdi * ba[ili];
787: ba[ili] = uikdi; /* -U(i,k) */
789: /* add multiple of row i to k-th row */
790: jmin = ili + 1;
791: jmax = bi[i + 1];
792: if (jmin < jmax) {
793: for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
794: /* update il and jl for row i */
795: il[i] = jmin;
796: j = bj[jmin];
797: jl[i] = jl[j];
798: jl[j] = i;
799: }
800: i = nexti;
801: }
803: /* shift the diagonals when zero pivot is detected */
804: /* compute rs=sum of abs(off-diagonal) */
805: rs = 0.0;
806: jmin = bi[k] + 1;
807: nz = bi[k + 1] - jmin;
808: if (nz) {
809: bcol = bj + jmin;
810: while (nz--) {
811: rs += PetscAbsScalar(rtmp[*bcol]);
812: bcol++;
813: }
814: }
816: sctx.rs = rs;
817: sctx.pv = dk;
818: MatPivotCheck(C, A, info, &sctx, k);
819: if (sctx.newshift) break;
820: dk = sctx.pv;
822: /* copy data into U(k,:) */
823: ba[bi[k]] = 1.0 / dk; /* U(k,k) */
824: jmin = bi[k] + 1;
825: jmax = bi[k + 1];
826: if (jmin < jmax) {
827: for (j = jmin; j < jmax; j++) {
828: col = bj[j];
829: ba[j] = rtmp[col];
830: rtmp[col] = 0.0;
831: }
832: /* add the k-th row into il and jl */
833: il[k] = jmin;
834: i = bj[jmin];
835: jl[k] = jl[i];
836: jl[i] = k;
837: }
838: }
839: } while (sctx.newshift);
840: PetscFree3(rtmp, il, jl);
842: ISRestoreIndices(ip, &rip);
844: C->assembled = PETSC_TRUE;
845: C->preallocated = PETSC_TRUE;
847: PetscLogFlops(C->rmap->N);
848: if (sctx.nshift) {
849: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
850: PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
851: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
852: PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
853: }
854: }
855: return 0;
856: }
858: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
859: {
860: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
861: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
862: PetscInt i, j, am = a->mbs;
863: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
864: PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
865: MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
866: PetscReal rs;
867: FactorShiftCtx sctx;
869: /* MatPivotSetUp(): initialize shift context sctx */
870: PetscMemzero(&sctx, sizeof(FactorShiftCtx));
872: PetscMalloc3(am, &rtmp, am, &il, am, &jl);
874: do {
875: sctx.newshift = PETSC_FALSE;
876: for (i = 0; i < am; i++) {
877: rtmp[i] = 0.0;
878: jl[i] = am;
879: il[0] = 0;
880: }
882: for (k = 0; k < am; k++) {
883: /* initialize k-th row with elements nonzero in row perm(k) of A */
884: nz = ai[k + 1] - ai[k];
885: acol = aj + ai[k];
886: aval = aa + ai[k];
887: bval = ba + bi[k];
888: while (nz--) {
889: if (*acol < k) { /* skip lower triangular entries */
890: acol++;
891: aval++;
892: } else {
893: rtmp[*acol++] = *aval++;
894: *bval++ = 0.0; /* for in-place factorization */
895: }
896: }
898: /* shift the diagonal of the matrix */
899: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
901: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
902: dk = rtmp[k];
903: i = jl[k]; /* first row to be added to k_th row */
905: while (i < k) {
906: nexti = jl[i]; /* next row to be added to k_th row */
907: /* compute multiplier, update D(k) and U(i,k) */
908: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
909: uikdi = -ba[ili] * ba[bi[i]];
910: dk += uikdi * ba[ili];
911: ba[ili] = uikdi; /* -U(i,k) */
913: /* add multiple of row i to k-th row ... */
914: jmin = ili + 1;
915: nz = bi[i + 1] - jmin;
916: if (nz > 0) {
917: bcol = bj + jmin;
918: bval = ba + jmin;
919: while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
920: /* update il and jl for i-th row */
921: il[i] = jmin;
922: j = bj[jmin];
923: jl[i] = jl[j];
924: jl[j] = i;
925: }
926: i = nexti;
927: }
929: /* shift the diagonals when zero pivot is detected */
930: /* compute rs=sum of abs(off-diagonal) */
931: rs = 0.0;
932: jmin = bi[k] + 1;
933: nz = bi[k + 1] - jmin;
934: if (nz) {
935: bcol = bj + jmin;
936: while (nz--) {
937: rs += PetscAbsScalar(rtmp[*bcol]);
938: bcol++;
939: }
940: }
942: sctx.rs = rs;
943: sctx.pv = dk;
944: MatPivotCheck(C, A, info, &sctx, k);
945: if (sctx.newshift) break; /* sctx.shift_amount is updated */
946: dk = sctx.pv;
948: /* copy data into U(k,:) */
949: ba[bi[k]] = 1.0 / dk;
950: jmin = bi[k] + 1;
951: nz = bi[k + 1] - jmin;
952: if (nz) {
953: bcol = bj + jmin;
954: bval = ba + jmin;
955: while (nz--) {
956: *bval++ = rtmp[*bcol];
957: rtmp[*bcol++] = 0.0;
958: }
959: /* add k-th row into il and jl */
960: il[k] = jmin;
961: i = bj[jmin];
962: jl[k] = jl[i];
963: jl[i] = k;
964: }
965: }
966: } while (sctx.newshift);
967: PetscFree3(rtmp, il, jl);
969: C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
970: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
971: C->assembled = PETSC_TRUE;
972: C->preallocated = PETSC_TRUE;
974: PetscLogFlops(C->rmap->N);
975: if (sctx.nshift) {
976: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
977: PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
978: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
979: PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount);
980: }
981: }
982: return 0;
983: }
985: #include <petscbt.h>
986: #include <../src/mat/utils/freespace.h>
987: PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
988: {
989: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
990: Mat_SeqSBAIJ *b;
991: Mat B;
992: PetscBool perm_identity, missing;
993: PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
994: const PetscInt *rip;
995: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
996: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
997: PetscReal fill = info->fill, levels = info->levels;
998: PetscFreeSpaceList free_space = NULL, current_space = NULL;
999: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1000: PetscBT lnkbt;
1002: MatMissingDiagonal(A, &missing, &i);
1005: if (bs > 1) {
1006: if (!a->sbaijMat) MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat);
1007: (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1009: MatICCFactorSymbolic(fact, a->sbaijMat, perm, info);
1010: return 0;
1011: }
1013: ISIdentity(perm, &perm_identity);
1014: ISGetIndices(perm, &rip);
1016: /* special case that simply copies fill pattern */
1017: if (!levels && perm_identity) {
1018: PetscMalloc1(am + 1, &ui);
1019: for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1020: B = fact;
1021: MatSeqSBAIJSetPreallocation(B, 1, 0, ui);
1023: b = (Mat_SeqSBAIJ *)B->data;
1024: uj = b->j;
1025: for (i = 0; i < am; i++) {
1026: aj = a->j + a->diag[i];
1027: for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1028: b->ilen[i] = ui[i];
1029: }
1030: PetscFree(ui);
1032: B->factortype = MAT_FACTOR_NONE;
1034: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
1035: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
1036: B->factortype = MAT_FACTOR_ICC;
1038: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1039: return 0;
1040: }
1042: /* initialization */
1043: PetscMalloc1(am + 1, &ui);
1044: ui[0] = 0;
1045: PetscMalloc1(2 * am + 1, &cols_lvl);
1047: /* jl: linked list for storing indices of the pivot rows
1048: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1049: PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl);
1050: for (i = 0; i < am; i++) {
1051: jl[i] = am;
1052: il[i] = 0;
1053: }
1055: /* create and initialize a linked list for storing column indices of the active row k */
1056: nlnk = am + 1;
1057: PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt);
1059: /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1060: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space);
1062: current_space = free_space;
1064: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl);
1065: current_space_lvl = free_space_lvl;
1067: for (k = 0; k < am; k++) { /* for each active row k */
1068: /* initialize lnk by the column indices of row rip[k] of A */
1069: nzk = 0;
1070: ncols = ai[rip[k] + 1] - ai[rip[k]];
1071: ncols_upper = 0;
1072: cols = cols_lvl + am;
1073: for (j = 0; j < ncols; j++) {
1074: i = rip[*(aj + ai[rip[k]] + j)];
1075: if (i >= k) { /* only take upper triangular entry */
1076: cols[ncols_upper] = i;
1077: cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1078: ncols_upper++;
1079: }
1080: }
1081: PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt);
1082: nzk += nlnk;
1084: /* update lnk by computing fill-in for each pivot row to be merged in */
1085: prow = jl[k]; /* 1st pivot row */
1087: while (prow < k) {
1088: nextprow = jl[prow];
1090: /* merge prow into k-th row */
1091: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1092: jmax = ui[prow + 1];
1093: ncols = jmax - jmin;
1094: i = jmin - ui[prow];
1095: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1096: for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1097: PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt);
1098: nzk += nlnk;
1100: /* update il and jl for prow */
1101: if (jmin < jmax) {
1102: il[prow] = jmin;
1104: j = *cols;
1105: jl[prow] = jl[j];
1106: jl[j] = prow;
1107: }
1108: prow = nextprow;
1109: }
1111: /* if free space is not available, make more free space */
1112: if (current_space->local_remaining < nzk) {
1113: i = am - k + 1; /* num of unfactored rows */
1114: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1115: PetscFreeSpaceGet(i, ¤t_space);
1116: PetscFreeSpaceGet(i, ¤t_space_lvl);
1117: reallocs++;
1118: }
1120: /* copy data into free_space and free_space_lvl, then initialize lnk */
1121: PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt);
1123: /* add the k-th row into il and jl */
1124: if (nzk - 1 > 0) {
1125: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1126: jl[k] = jl[i];
1127: jl[i] = k;
1128: il[k] = ui[k] + 1;
1129: }
1130: uj_ptr[k] = current_space->array;
1131: uj_lvl_ptr[k] = current_space_lvl->array;
1133: current_space->array += nzk;
1134: current_space->local_used += nzk;
1135: current_space->local_remaining -= nzk;
1137: current_space_lvl->array += nzk;
1138: current_space_lvl->local_used += nzk;
1139: current_space_lvl->local_remaining -= nzk;
1141: ui[k + 1] = ui[k] + nzk;
1142: }
1144: ISRestoreIndices(perm, &rip);
1145: PetscFree4(uj_ptr, uj_lvl_ptr, il, jl);
1146: PetscFree(cols_lvl);
1148: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1149: PetscMalloc1(ui[am] + 1, &uj);
1150: PetscFreeSpaceContiguous(&free_space, uj);
1151: PetscIncompleteLLDestroy(lnk, lnkbt);
1152: PetscFreeSpaceDestroy(free_space_lvl);
1154: /* put together the new matrix in MATSEQSBAIJ format */
1155: B = fact;
1156: MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL);
1158: b = (Mat_SeqSBAIJ *)B->data;
1159: b->singlemalloc = PETSC_FALSE;
1160: b->free_a = PETSC_TRUE;
1161: b->free_ij = PETSC_TRUE;
1163: PetscMalloc1(ui[am] + 1, &b->a);
1165: b->j = uj;
1166: b->i = ui;
1167: b->diag = NULL;
1168: b->ilen = NULL;
1169: b->imax = NULL;
1170: b->row = perm;
1171: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1173: PetscObjectReference((PetscObject)perm);
1175: b->icol = perm;
1177: PetscObjectReference((PetscObject)perm);
1178: PetscMalloc1(am + 1, &b->solve_work);
1180: b->maxnz = b->nz = ui[am];
1182: B->info.factor_mallocs = reallocs;
1183: B->info.fill_ratio_given = fill;
1184: if (ai[am] != 0.) {
1185: /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1186: B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1187: } else {
1188: B->info.fill_ratio_needed = 0.0;
1189: }
1190: #if defined(PETSC_USE_INFO)
1191: if (ai[am] != 0) {
1192: PetscReal af = B->info.fill_ratio_needed;
1193: PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
1194: PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
1195: PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
1196: } else {
1197: PetscInfo(A, "Empty matrix\n");
1198: }
1199: #endif
1200: if (perm_identity) {
1201: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1202: B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1203: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1204: } else {
1205: (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1206: }
1207: return 0;
1208: }
1210: PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1211: {
1212: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1213: Mat_SeqSBAIJ *b;
1214: Mat B;
1215: PetscBool perm_identity, missing;
1216: PetscReal fill = info->fill;
1217: const PetscInt *rip;
1218: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1219: PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1220: PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1221: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1222: PetscBT lnkbt;
1224: if (bs > 1) { /* convert to seqsbaij */
1225: if (!a->sbaijMat) MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat);
1226: (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1228: MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info);
1229: return 0;
1230: }
1232: MatMissingDiagonal(A, &missing, &i);
1235: /* check whether perm is the identity mapping */
1236: ISIdentity(perm, &perm_identity);
1238: ISGetIndices(perm, &rip);
1240: /* initialization */
1241: PetscMalloc1(mbs + 1, &ui);
1242: ui[0] = 0;
1244: /* jl: linked list for storing indices of the pivot rows
1245: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1246: PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols);
1247: for (i = 0; i < mbs; i++) {
1248: jl[i] = mbs;
1249: il[i] = 0;
1250: }
1252: /* create and initialize a linked list for storing column indices of the active row k */
1253: nlnk = mbs + 1;
1254: PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt);
1256: /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1257: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space);
1259: current_space = free_space;
1261: for (k = 0; k < mbs; k++) { /* for each active row k */
1262: /* initialize lnk by the column indices of row rip[k] of A */
1263: nzk = 0;
1264: ncols = ai[rip[k] + 1] - ai[rip[k]];
1265: ncols_upper = 0;
1266: for (j = 0; j < ncols; j++) {
1267: i = rip[*(aj + ai[rip[k]] + j)];
1268: if (i >= k) { /* only take upper triangular entry */
1269: cols[ncols_upper] = i;
1270: ncols_upper++;
1271: }
1272: }
1273: PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt);
1274: nzk += nlnk;
1276: /* update lnk by computing fill-in for each pivot row to be merged in */
1277: prow = jl[k]; /* 1st pivot row */
1279: while (prow < k) {
1280: nextprow = jl[prow];
1281: /* merge prow into k-th row */
1282: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1283: jmax = ui[prow + 1];
1284: ncols = jmax - jmin;
1285: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1286: PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt);
1287: nzk += nlnk;
1289: /* update il and jl for prow */
1290: if (jmin < jmax) {
1291: il[prow] = jmin;
1292: j = *uj_ptr;
1293: jl[prow] = jl[j];
1294: jl[j] = prow;
1295: }
1296: prow = nextprow;
1297: }
1299: /* if free space is not available, make more free space */
1300: if (current_space->local_remaining < nzk) {
1301: i = mbs - k + 1; /* num of unfactored rows */
1302: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1303: PetscFreeSpaceGet(i, ¤t_space);
1304: reallocs++;
1305: }
1307: /* copy data into free space, then initialize lnk */
1308: PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt);
1310: /* add the k-th row into il and jl */
1311: if (nzk - 1 > 0) {
1312: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1313: jl[k] = jl[i];
1314: jl[i] = k;
1315: il[k] = ui[k] + 1;
1316: }
1317: ui_ptr[k] = current_space->array;
1318: current_space->array += nzk;
1319: current_space->local_used += nzk;
1320: current_space->local_remaining -= nzk;
1322: ui[k + 1] = ui[k] + nzk;
1323: }
1325: ISRestoreIndices(perm, &rip);
1326: PetscFree4(ui_ptr, il, jl, cols);
1328: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1329: PetscMalloc1(ui[mbs] + 1, &uj);
1330: PetscFreeSpaceContiguous(&free_space, uj);
1331: PetscLLDestroy(lnk, lnkbt);
1333: /* put together the new matrix in MATSEQSBAIJ format */
1334: B = fact;
1335: MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL);
1337: b = (Mat_SeqSBAIJ *)B->data;
1338: b->singlemalloc = PETSC_FALSE;
1339: b->free_a = PETSC_TRUE;
1340: b->free_ij = PETSC_TRUE;
1342: PetscMalloc1(ui[mbs] + 1, &b->a);
1344: b->j = uj;
1345: b->i = ui;
1346: b->diag = NULL;
1347: b->ilen = NULL;
1348: b->imax = NULL;
1349: b->row = perm;
1350: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1352: PetscObjectReference((PetscObject)perm);
1353: b->icol = perm;
1354: PetscObjectReference((PetscObject)perm);
1355: PetscMalloc1(mbs + 1, &b->solve_work);
1356: b->maxnz = b->nz = ui[mbs];
1358: B->info.factor_mallocs = reallocs;
1359: B->info.fill_ratio_given = fill;
1360: if (ai[mbs] != 0.) {
1361: /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1362: B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1363: } else {
1364: B->info.fill_ratio_needed = 0.0;
1365: }
1366: #if defined(PETSC_USE_INFO)
1367: if (ai[mbs] != 0.) {
1368: PetscReal af = B->info.fill_ratio_needed;
1369: PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af);
1370: PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af);
1371: PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af);
1372: } else {
1373: PetscInfo(A, "Empty matrix\n");
1374: }
1375: #endif
1376: if (perm_identity) {
1377: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1378: } else {
1379: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1380: }
1381: return 0;
1382: }
1384: PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1385: {
1386: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1387: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1388: PetscInt i, k, n = a->mbs;
1389: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1390: const MatScalar *aa = a->a, *v;
1391: PetscScalar *x, *s, *t, *ls;
1392: const PetscScalar *b;
1394: VecGetArrayRead(bb, &b);
1395: VecGetArray(xx, &x);
1396: t = a->solve_work;
1398: /* forward solve the lower triangular */
1399: PetscArraycpy(t, b, bs); /* copy 1st block of b to t */
1401: for (i = 1; i < n; i++) {
1402: v = aa + bs2 * ai[i];
1403: vi = aj + ai[i];
1404: nz = ai[i + 1] - ai[i];
1405: s = t + bs * i;
1406: PetscArraycpy(s, b + bs * i, bs); /* copy i_th block of b to t */
1407: for (k = 0; k < nz; k++) {
1408: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1409: v += bs2;
1410: }
1411: }
1413: /* backward solve the upper triangular */
1414: ls = a->solve_work + A->cmap->n;
1415: for (i = n - 1; i >= 0; i--) {
1416: v = aa + bs2 * (adiag[i + 1] + 1);
1417: vi = aj + adiag[i + 1] + 1;
1418: nz = adiag[i] - adiag[i + 1] - 1;
1419: PetscArraycpy(ls, t + i * bs, bs);
1420: for (k = 0; k < nz; k++) {
1421: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1422: v += bs2;
1423: }
1424: PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1425: PetscArraycpy(x + i * bs, t + i * bs, bs);
1426: }
1428: VecRestoreArrayRead(bb, &b);
1429: VecRestoreArray(xx, &x);
1430: PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n);
1431: return 0;
1432: }
1434: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1435: {
1436: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1437: IS iscol = a->col, isrow = a->row;
1438: const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1439: PetscInt i, m, n = a->mbs;
1440: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1441: const MatScalar *aa = a->a, *v;
1442: PetscScalar *x, *s, *t, *ls;
1443: const PetscScalar *b;
1445: VecGetArrayRead(bb, &b);
1446: VecGetArray(xx, &x);
1447: t = a->solve_work;
1449: ISGetIndices(isrow, &rout);
1450: r = rout;
1451: ISGetIndices(iscol, &cout);
1452: c = cout;
1454: /* forward solve the lower triangular */
1455: PetscArraycpy(t, b + bs * r[0], bs);
1456: for (i = 1; i < n; i++) {
1457: v = aa + bs2 * ai[i];
1458: vi = aj + ai[i];
1459: nz = ai[i + 1] - ai[i];
1460: s = t + bs * i;
1461: PetscArraycpy(s, b + bs * r[i], bs);
1462: for (m = 0; m < nz; m++) {
1463: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1464: v += bs2;
1465: }
1466: }
1468: /* backward solve the upper triangular */
1469: ls = a->solve_work + A->cmap->n;
1470: for (i = n - 1; i >= 0; i--) {
1471: v = aa + bs2 * (adiag[i + 1] + 1);
1472: vi = aj + adiag[i + 1] + 1;
1473: nz = adiag[i] - adiag[i + 1] - 1;
1474: PetscArraycpy(ls, t + i * bs, bs);
1475: for (m = 0; m < nz; m++) {
1476: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1477: v += bs2;
1478: }
1479: PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1480: PetscArraycpy(x + bs * c[i], t + i * bs, bs);
1481: }
1482: ISRestoreIndices(isrow, &rout);
1483: ISRestoreIndices(iscol, &cout);
1484: VecRestoreArrayRead(bb, &b);
1485: VecRestoreArray(xx, &x);
1486: PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n);
1487: return 0;
1488: }
1490: /*
1491: For each block in an block array saves the largest absolute value in the block into another array
1492: */
1493: static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1494: {
1495: PetscInt i, j;
1497: PetscArrayzero(absarray, nbs + 1);
1498: for (i = 0; i < nbs; i++) {
1499: for (j = 0; j < bs2; j++) {
1500: if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1501: }
1502: }
1503: return 0;
1504: }
1506: /*
1507: This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1508: */
1509: PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1510: {
1511: Mat B = *fact;
1512: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b;
1513: IS isicol;
1514: const PetscInt *r, *ic;
1515: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1516: PetscInt *bi, *bj, *bdiag;
1518: PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1519: PetscInt nlnk, *lnk;
1520: PetscBT lnkbt;
1521: PetscBool row_identity, icol_identity;
1522: MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1523: PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp;
1525: PetscReal dt = info->dt; /* shift=info->shiftamount; */
1526: PetscInt nnz_max;
1527: PetscBool missing;
1528: PetscReal *vtmp_abs;
1529: MatScalar *v_work;
1530: PetscInt *v_pivots;
1531: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
1533: /* ------- symbolic factorization, can be reused ---------*/
1534: MatMissingDiagonal(A, &missing, &i);
1536: adiag = a->diag;
1538: ISInvertPermutation(iscol, PETSC_DECIDE, &isicol);
1540: /* bdiag is location of diagonal in factor */
1541: PetscMalloc1(mbs + 1, &bdiag);
1543: /* allocate row pointers bi */
1544: PetscMalloc1(2 * mbs + 2, &bi);
1546: /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1547: dtcount = (PetscInt)info->dtcount;
1548: if (dtcount > mbs - 1) dtcount = mbs - 1;
1549: nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1550: /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1551: PetscMalloc1(nnz_max, &bj);
1552: nnz_max = nnz_max * bs2;
1553: PetscMalloc1(nnz_max, &ba);
1555: /* put together the new matrix */
1556: MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL);
1558: b = (Mat_SeqBAIJ *)(B)->data;
1559: b->free_a = PETSC_TRUE;
1560: b->free_ij = PETSC_TRUE;
1561: b->singlemalloc = PETSC_FALSE;
1563: b->a = ba;
1564: b->j = bj;
1565: b->i = bi;
1566: b->diag = bdiag;
1567: b->ilen = NULL;
1568: b->imax = NULL;
1569: b->row = isrow;
1570: b->col = iscol;
1572: PetscObjectReference((PetscObject)isrow);
1573: PetscObjectReference((PetscObject)iscol);
1575: b->icol = isicol;
1576: PetscMalloc1(bs * (mbs + 1), &b->solve_work);
1577: b->maxnz = nnz_max / bs2;
1579: (B)->factortype = MAT_FACTOR_ILUDT;
1580: (B)->info.factor_mallocs = 0;
1581: (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1582: /* ------- end of symbolic factorization ---------*/
1583: ISGetIndices(isrow, &r);
1584: ISGetIndices(isicol, &ic);
1586: /* linked list for storing column indices of the active row */
1587: nlnk = mbs + 1;
1588: PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt);
1590: /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1591: PetscMalloc2(mbs, &im, mbs, &jtmp);
1592: /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1593: PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp);
1594: PetscMalloc1(mbs + 1, &vtmp_abs);
1595: PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots);
1597: allowzeropivot = PetscNot(A->erroriffailure);
1598: bi[0] = 0;
1599: bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1600: bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */
1601: for (i = 0; i < mbs; i++) {
1602: /* copy initial fill into linked list */
1603: nzi = ai[r[i] + 1] - ai[r[i]];
1605: nzi_al = adiag[r[i]] - ai[r[i]];
1606: nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;
1608: /* load in initial unfactored row */
1609: ajtmp = aj + ai[r[i]];
1610: PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt);
1611: PetscArrayzero(rtmp, mbs * bs2);
1612: aatmp = a->a + bs2 * ai[r[i]];
1613: for (j = 0; j < nzi; j++) PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2);
1615: /* add pivot rows into linked list */
1616: row = lnk[mbs];
1617: while (row < i) {
1618: nzi_bl = bi[row + 1] - bi[row] + 1;
1619: bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1620: PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im);
1621: nzi += nlnk;
1622: row = lnk[row];
1623: }
1625: /* copy data from lnk into jtmp, then initialize lnk */
1626: PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt);
1628: /* numerical factorization */
1629: bjtmp = jtmp;
1630: row = *bjtmp++; /* 1st pivot row */
1632: while (row < i) {
1633: pc = rtmp + bs2 * row;
1634: pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */
1635: PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1636: MatBlockAbs_private(1, bs2, pc, vtmp_abs);
1637: if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */
1638: pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1639: pv = ba + bs2 * (bdiag[row + 1] + 1);
1640: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1641: for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1642: /* PetscLogFlops(bslog*(nz+1.0)-bs); */
1643: }
1644: row = *bjtmp++;
1645: }
1647: /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1648: nzi_bl = 0;
1649: j = 0;
1650: while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1651: PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2);
1652: nzi_bl++;
1653: j++;
1654: }
1655: nzi_bu = nzi - nzi_bl - 1;
1657: while (j < nzi) { /* U-part */
1658: PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2);
1659: j++;
1660: }
1662: MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs);
1664: bjtmp = bj + bi[i];
1665: batmp = ba + bs2 * bi[i];
1666: /* apply level dropping rule to L part */
1667: ncut = nzi_al + dtcount;
1668: if (ncut < nzi_bl) {
1669: PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp);
1670: PetscSortIntWithScalarArray(ncut, jtmp, vtmp);
1671: } else {
1672: ncut = nzi_bl;
1673: }
1674: for (j = 0; j < ncut; j++) {
1675: bjtmp[j] = jtmp[j];
1676: PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2);
1677: }
1678: bi[i + 1] = bi[i] + ncut;
1679: nzi = ncut + 1;
1681: /* apply level dropping rule to U part */
1682: ncut = nzi_au + dtcount;
1683: if (ncut < nzi_bu) {
1684: PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1);
1685: PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1);
1686: } else {
1687: ncut = nzi_bu;
1688: }
1689: nzi += ncut;
1691: /* mark bdiagonal */
1692: bdiag[i + 1] = bdiag[i] - (ncut + 1);
1693: bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);
1695: bjtmp = bj + bdiag[i];
1696: batmp = ba + bs2 * bdiag[i];
1697: PetscArraycpy(batmp, rtmp + bs2 * i, bs2);
1698: *bjtmp = i;
1700: bjtmp = bj + bdiag[i + 1] + 1;
1701: batmp = ba + (bdiag[i + 1] + 1) * bs2;
1703: for (k = 0; k < ncut; k++) {
1704: bjtmp[k] = jtmp[nzi_bl + 1 + k];
1705: PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2);
1706: }
1708: im[i] = nzi; /* used by PetscLLAddSortedLU() */
1710: /* invert diagonal block for simpler triangular solves - add shift??? */
1711: batmp = ba + bs2 * bdiag[i];
1713: PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected);
1714: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1715: } /* for (i=0; i<mbs; i++) */
1716: PetscFree3(v_work, multiplier, v_pivots);
1718: /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1721: ISRestoreIndices(isrow, &r);
1722: ISRestoreIndices(isicol, &ic);
1724: PetscLLDestroy(lnk, lnkbt);
1726: PetscFree2(im, jtmp);
1727: PetscFree2(rtmp, vtmp);
1729: PetscLogFlops(bs2 * B->cmap->n);
1730: b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1732: ISIdentity(isrow, &row_identity);
1733: ISIdentity(isicol, &icol_identity);
1734: if (row_identity && icol_identity) {
1735: B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1736: } else {
1737: B->ops->solve = MatSolve_SeqBAIJ_N;
1738: }
1740: B->ops->solveadd = NULL;
1741: B->ops->solvetranspose = NULL;
1742: B->ops->solvetransposeadd = NULL;
1743: B->ops->matsolve = NULL;
1744: B->assembled = PETSC_TRUE;
1745: B->preallocated = PETSC_TRUE;
1746: return 0;
1747: }