Actual source code: baijfact.c
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include <../src/mat/impls/baij/seq/baij.h>
5: #include <petsc/private/kernels/blockinvert.h>
7: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
8: {
9: Mat C = B;
10: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11: IS isrow = b->row, isicol = b->icol;
12: const PetscInt *r, *ic;
13: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
14: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
15: MatScalar *rtmp, *pc, *mwork, *pv;
16: MatScalar *aa = a->a, *v;
17: PetscReal shift = info->shiftamount;
18: const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
20: PetscFunctionBegin;
21: PetscCall(ISGetIndices(isrow, &r));
22: PetscCall(ISGetIndices(isicol, &ic));
24: /* generate work space needed by the factorization */
25: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
26: PetscCall(PetscArrayzero(rtmp, bs2 * n));
28: for (PetscInt i = 0; i < n; i++) {
29: /* zero rtmp */
30: /* L part */
31: PetscInt *pj;
32: PetscInt nzL, nz = bi[i + 1] - bi[i];
33: bjtmp = bj + bi[i];
34: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
36: /* U part */
37: nz = bdiag[i] - bdiag[i + 1];
38: bjtmp = bj + bdiag[i + 1] + 1;
39: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
41: /* load in initial (unfactored row) */
42: nz = ai[r[i] + 1] - ai[r[i]];
43: ajtmp = aj + ai[r[i]];
44: v = aa + bs2 * ai[r[i]];
45: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
47: /* elimination */
48: bjtmp = bj + bi[i];
49: nzL = bi[i + 1] - bi[i];
50: for (PetscInt k = 0; k < nzL; k++) {
51: PetscBool flg = PETSC_FALSE;
52: PetscInt row = bjtmp[k];
54: pc = rtmp + bs2 * row;
55: for (PetscInt j = 0; j < bs2; j++) {
56: if (pc[j] != (PetscScalar)0.0) {
57: flg = PETSC_TRUE;
58: break;
59: }
60: }
61: if (flg) {
62: pv = b->a + bs2 * bdiag[row];
63: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64: PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
66: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67: pv = b->a + bs2 * (bdiag[row + 1] + 1);
68: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69: for (PetscInt j = 0; j < nz; j++) {
70: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72: v = rtmp + 4 * pj[j];
73: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74: pv += 4;
75: }
76: PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77: }
78: }
80: /* finished row so stick it into b->a */
81: /* L part */
82: pv = b->a + bs2 * bi[i];
83: pj = b->j + bi[i];
84: nz = bi[i + 1] - bi[i];
85: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
87: /* Mark diagonal and invert diagonal for simpler triangular solves */
88: pv = b->a + bs2 * bdiag[i];
89: pj = b->j + bdiag[i];
90: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91: {
92: PetscBool zeropivotdetected;
94: PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
95: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
96: }
98: /* U part */
99: pv = b->a + bs2 * (bdiag[i + 1] + 1);
100: pj = b->j + bdiag[i + 1] + 1;
101: nz = bdiag[i] - bdiag[i + 1] - 1;
102: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103: }
105: PetscCall(PetscFree2(rtmp, mwork));
106: PetscCall(ISRestoreIndices(isicol, &ic));
107: PetscCall(ISRestoreIndices(isrow, &r));
109: C->ops->solve = MatSolve_SeqBAIJ_2;
110: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111: C->assembled = PETSC_TRUE;
113: PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114: PetscFunctionReturn(PETSC_SUCCESS);
115: }
117: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118: {
119: Mat C = B;
120: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121: PetscInt i, j, k, nz, nzL, row, *pj;
122: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124: MatScalar *rtmp, *pc, *mwork, *pv;
125: MatScalar *aa = a->a, *v;
126: PetscInt flg;
127: PetscReal shift = info->shiftamount;
128: PetscBool allowzeropivot, zeropivotdetected;
130: PetscFunctionBegin;
131: allowzeropivot = PetscNot(A->erroriffailure);
133: /* generate work space needed by the factorization */
134: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135: PetscCall(PetscArrayzero(rtmp, bs2 * n));
137: for (i = 0; i < n; i++) {
138: /* zero rtmp */
139: /* L part */
140: nz = bi[i + 1] - bi[i];
141: bjtmp = bj + bi[i];
142: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
144: /* U part */
145: nz = bdiag[i] - bdiag[i + 1];
146: bjtmp = bj + bdiag[i + 1] + 1;
147: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
149: /* load in initial (unfactored row) */
150: nz = ai[i + 1] - ai[i];
151: ajtmp = aj + ai[i];
152: v = aa + bs2 * ai[i];
153: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
155: /* elimination */
156: bjtmp = bj + bi[i];
157: nzL = bi[i + 1] - bi[i];
158: for (k = 0; k < nzL; k++) {
159: row = bjtmp[k];
160: pc = rtmp + bs2 * row;
161: for (flg = 0, j = 0; j < bs2; j++) {
162: if (pc[j] != (PetscScalar)0.0) {
163: flg = 1;
164: break;
165: }
166: }
167: if (flg) {
168: pv = b->a + bs2 * bdiag[row];
169: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170: PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
172: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173: pv = b->a + bs2 * (bdiag[row + 1] + 1);
174: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175: for (j = 0; j < nz; j++) {
176: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178: v = rtmp + 4 * pj[j];
179: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180: pv += 4;
181: }
182: PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183: }
184: }
186: /* finished row so stick it into b->a */
187: /* L part */
188: pv = b->a + bs2 * bi[i];
189: pj = b->j + bi[i];
190: nz = bi[i + 1] - bi[i];
191: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
193: /* Mark diagonal and invert diagonal for simpler triangular solves */
194: pv = b->a + bs2 * bdiag[i];
195: pj = b->j + bdiag[i];
196: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197: PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
200: /* U part */
201: /*
202: pv = b->a + bs2*bi[2*n-i];
203: pj = b->j + bi[2*n-i];
204: nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205: */
206: pv = b->a + bs2 * (bdiag[i + 1] + 1);
207: pj = b->j + bdiag[i + 1] + 1;
208: nz = bdiag[i] - bdiag[i + 1] - 1;
209: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210: }
211: PetscCall(PetscFree2(rtmp, mwork));
213: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
214: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217: C->assembled = PETSC_TRUE;
219: PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220: PetscFunctionReturn(PETSC_SUCCESS);
221: }
223: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224: {
225: Mat C = B;
226: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227: IS isrow = b->row, isicol = b->icol;
228: const PetscInt *r, *ic;
229: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230: PetscInt *ajtmpold, *ajtmp, nz, row;
231: PetscInt idx, *ai = a->i, *aj = a->j, *pj;
232: const PetscInt *diag_offset;
233: MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234: MatScalar p1, p2, p3, p4;
235: MatScalar *ba = b->a, *aa = a->a;
236: PetscReal shift = info->shiftamount;
237: PetscBool allowzeropivot, zeropivotdetected;
239: PetscFunctionBegin;
240: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
241: A->factortype = MAT_FACTOR_NONE;
242: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
243: A->factortype = MAT_FACTOR_ILU;
244: allowzeropivot = PetscNot(A->erroriffailure);
245: PetscCall(ISGetIndices(isrow, &r));
246: PetscCall(ISGetIndices(isicol, &ic));
247: PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
249: for (i = 0; i < n; i++) {
250: nz = bi[i + 1] - bi[i];
251: ajtmp = bj + bi[i];
252: for (j = 0; j < nz; j++) {
253: x = rtmp + 4 * ajtmp[j];
254: x[0] = x[1] = x[2] = x[3] = 0.0;
255: }
256: /* load in initial (unfactored row) */
257: idx = r[i];
258: nz = ai[idx + 1] - ai[idx];
259: ajtmpold = aj + ai[idx];
260: v = aa + 4 * ai[idx];
261: for (j = 0; j < nz; j++) {
262: x = rtmp + 4 * ic[ajtmpold[j]];
263: x[0] = v[0];
264: x[1] = v[1];
265: x[2] = v[2];
266: x[3] = v[3];
267: v += 4;
268: }
269: row = *ajtmp++;
270: while (row < i) {
271: pc = rtmp + 4 * row;
272: p1 = pc[0];
273: p2 = pc[1];
274: p3 = pc[2];
275: p4 = pc[3];
276: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
277: pv = ba + 4 * diag_offset[row];
278: pj = bj + diag_offset[row] + 1;
279: x1 = pv[0];
280: x2 = pv[1];
281: x3 = pv[2];
282: x4 = pv[3];
283: pc[0] = m1 = p1 * x1 + p3 * x2;
284: pc[1] = m2 = p2 * x1 + p4 * x2;
285: pc[2] = m3 = p1 * x3 + p3 * x4;
286: pc[3] = m4 = p2 * x3 + p4 * x4;
287: nz = bi[row + 1] - diag_offset[row] - 1;
288: pv += 4;
289: for (j = 0; j < nz; j++) {
290: x1 = pv[0];
291: x2 = pv[1];
292: x3 = pv[2];
293: x4 = pv[3];
294: x = rtmp + 4 * pj[j];
295: x[0] -= m1 * x1 + m3 * x2;
296: x[1] -= m2 * x1 + m4 * x2;
297: x[2] -= m1 * x3 + m3 * x4;
298: x[3] -= m2 * x3 + m4 * x4;
299: pv += 4;
300: }
301: PetscCall(PetscLogFlops(16.0 * nz + 12.0));
302: }
303: row = *ajtmp++;
304: }
305: /* finished row so stick it into b->a */
306: pv = ba + 4 * bi[i];
307: pj = bj + bi[i];
308: nz = bi[i + 1] - bi[i];
309: for (j = 0; j < nz; j++) {
310: x = rtmp + 4 * pj[j];
311: pv[0] = x[0];
312: pv[1] = x[1];
313: pv[2] = x[2];
314: pv[3] = x[3];
315: pv += 4;
316: }
317: /* invert diagonal block */
318: w = ba + 4 * diag_offset[i];
319: PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
320: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
321: }
323: PetscCall(PetscFree(rtmp));
324: PetscCall(ISRestoreIndices(isicol, &ic));
325: PetscCall(ISRestoreIndices(isrow, &r));
327: C->ops->solve = MatSolve_SeqBAIJ_2_inplace;
328: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
329: C->assembled = PETSC_TRUE;
331: PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
332: PetscFunctionReturn(PETSC_SUCCESS);
333: }
334: /*
335: Version for when blocks are 2 by 2 Using natural ordering
336: */
337: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
338: {
339: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
340: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
341: PetscInt *ajtmpold, *ajtmp, nz, row;
342: PetscInt *ai = a->i, *aj = a->j, *pj;
343: const PetscInt *diag_offset;
344: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
345: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
346: MatScalar *ba = b->a, *aa = a->a;
347: PetscReal shift = info->shiftamount;
348: PetscBool allowzeropivot, zeropivotdetected;
350: PetscFunctionBegin;
351: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
352: A->factortype = MAT_FACTOR_NONE;
353: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
354: A->factortype = MAT_FACTOR_ILU;
355: allowzeropivot = PetscNot(A->erroriffailure);
356: PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
357: for (i = 0; i < n; i++) {
358: nz = bi[i + 1] - bi[i];
359: ajtmp = bj + bi[i];
360: for (j = 0; j < nz; j++) {
361: x = rtmp + 4 * ajtmp[j];
362: x[0] = x[1] = x[2] = x[3] = 0.0;
363: }
364: /* load in initial (unfactored row) */
365: nz = ai[i + 1] - ai[i];
366: ajtmpold = aj + ai[i];
367: v = aa + 4 * ai[i];
368: for (j = 0; j < nz; j++) {
369: x = rtmp + 4 * ajtmpold[j];
370: x[0] = v[0];
371: x[1] = v[1];
372: x[2] = v[2];
373: x[3] = v[3];
374: v += 4;
375: }
376: row = *ajtmp++;
377: while (row < i) {
378: pc = rtmp + 4 * row;
379: p1 = pc[0];
380: p2 = pc[1];
381: p3 = pc[2];
382: p4 = pc[3];
383: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
384: pv = ba + 4 * diag_offset[row];
385: pj = bj + diag_offset[row] + 1;
386: x1 = pv[0];
387: x2 = pv[1];
388: x3 = pv[2];
389: x4 = pv[3];
390: pc[0] = m1 = p1 * x1 + p3 * x2;
391: pc[1] = m2 = p2 * x1 + p4 * x2;
392: pc[2] = m3 = p1 * x3 + p3 * x4;
393: pc[3] = m4 = p2 * x3 + p4 * x4;
394: nz = bi[row + 1] - diag_offset[row] - 1;
395: pv += 4;
396: for (j = 0; j < nz; j++) {
397: x1 = pv[0];
398: x2 = pv[1];
399: x3 = pv[2];
400: x4 = pv[3];
401: x = rtmp + 4 * pj[j];
402: x[0] -= m1 * x1 + m3 * x2;
403: x[1] -= m2 * x1 + m4 * x2;
404: x[2] -= m1 * x3 + m3 * x4;
405: x[3] -= m2 * x3 + m4 * x4;
406: pv += 4;
407: }
408: PetscCall(PetscLogFlops(16.0 * nz + 12.0));
409: }
410: row = *ajtmp++;
411: }
412: /* finished row so stick it into b->a */
413: pv = ba + 4 * bi[i];
414: pj = bj + bi[i];
415: nz = bi[i + 1] - bi[i];
416: for (j = 0; j < nz; j++) {
417: x = rtmp + 4 * pj[j];
418: pv[0] = x[0];
419: pv[1] = x[1];
420: pv[2] = x[2];
421: pv[3] = x[3];
422: /*
423: printf(" col %d:",pj[j]);
424: PetscInt j1;
425: for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
426: printf("\n");
427: */
428: pv += 4;
429: }
430: /* invert diagonal block */
431: w = ba + 4 * diag_offset[i];
432: PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
433: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
434: }
436: PetscCall(PetscFree(rtmp));
438: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
439: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
440: C->assembled = PETSC_TRUE;
442: PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
443: PetscFunctionReturn(PETSC_SUCCESS);
444: }
446: /*
447: Version for when blocks are 1 by 1.
448: */
449: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
450: {
451: Mat C = B;
452: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
453: IS isrow = b->row, isicol = b->icol;
454: const PetscInt *r, *ic, *ics;
455: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
456: PetscInt i, j, k, nz, nzL, row, *pj;
457: const PetscInt *ajtmp, *bjtmp;
458: MatScalar *rtmp, *pc, multiplier, *pv;
459: const MatScalar *aa = a->a, *v;
460: PetscBool row_identity, col_identity;
461: FactorShiftCtx sctx;
462: const PetscInt *ddiag;
463: PetscReal rs;
464: MatScalar d;
466: PetscFunctionBegin;
467: /* MatPivotSetUp(): initialize shift context sctx */
468: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
470: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
471: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL));
472: sctx.shift_top = info->zeropivot;
473: for (i = 0; i < n; i++) {
474: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
475: d = (aa)[ddiag[i]];
476: rs = -PetscAbsScalar(d) - PetscRealPart(d);
477: v = aa + ai[i];
478: nz = ai[i + 1] - ai[i];
479: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
480: if (rs > sctx.shift_top) sctx.shift_top = rs;
481: }
482: sctx.shift_top *= 1.1;
483: sctx.nshift_max = 5;
484: sctx.shift_lo = 0.;
485: sctx.shift_hi = 1.;
486: }
488: PetscCall(ISGetIndices(isrow, &r));
489: PetscCall(ISGetIndices(isicol, &ic));
490: PetscCall(PetscMalloc1(n + 1, &rtmp));
491: ics = ic;
493: do {
494: sctx.newshift = PETSC_FALSE;
495: for (i = 0; i < n; i++) {
496: /* zero rtmp */
497: /* L part */
498: nz = bi[i + 1] - bi[i];
499: bjtmp = bj + bi[i];
500: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
502: /* U part */
503: nz = bdiag[i] - bdiag[i + 1];
504: bjtmp = bj + bdiag[i + 1] + 1;
505: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
507: /* load in initial (unfactored row) */
508: nz = ai[r[i] + 1] - ai[r[i]];
509: ajtmp = aj + ai[r[i]];
510: v = aa + ai[r[i]];
511: for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
513: /* ZeropivotApply() */
514: rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
516: /* elimination */
517: bjtmp = bj + bi[i];
518: row = *bjtmp++;
519: nzL = bi[i + 1] - bi[i];
520: for (k = 0; k < nzL; k++) {
521: pc = rtmp + row;
522: if (*pc != (PetscScalar)0.0) {
523: pv = b->a + bdiag[row];
524: multiplier = *pc * (*pv);
525: *pc = multiplier;
527: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
528: pv = b->a + bdiag[row + 1] + 1;
529: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
530: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
531: PetscCall(PetscLogFlops(2.0 * nz));
532: }
533: row = *bjtmp++;
534: }
536: /* finished row so stick it into b->a */
537: rs = 0.0;
538: /* L part */
539: pv = b->a + bi[i];
540: pj = b->j + bi[i];
541: nz = bi[i + 1] - bi[i];
542: for (j = 0; j < nz; j++) {
543: pv[j] = rtmp[pj[j]];
544: rs += PetscAbsScalar(pv[j]);
545: }
547: /* U part */
548: pv = b->a + bdiag[i + 1] + 1;
549: pj = b->j + bdiag[i + 1] + 1;
550: nz = bdiag[i] - bdiag[i + 1] - 1;
551: for (j = 0; j < nz; j++) {
552: pv[j] = rtmp[pj[j]];
553: rs += PetscAbsScalar(pv[j]);
554: }
556: sctx.rs = rs;
557: sctx.pv = rtmp[i];
558: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
559: if (sctx.newshift) break; /* break for-loop */
560: rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
562: /* Mark diagonal and invert diagonal for simpler triangular solves */
563: pv = b->a + bdiag[i];
564: *pv = (PetscScalar)1.0 / rtmp[i];
566: } /* endof for (i=0; i<n; i++) { */
568: /* MatPivotRefine() */
569: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
570: /*
571: * if no shift in this attempt & shifting & started shifting & can refine,
572: * then try lower shift
573: */
574: sctx.shift_hi = sctx.shift_fraction;
575: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
576: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
577: sctx.newshift = PETSC_TRUE;
578: sctx.nshift++;
579: }
580: } while (sctx.newshift);
582: PetscCall(PetscFree(rtmp));
583: PetscCall(ISRestoreIndices(isicol, &ic));
584: PetscCall(ISRestoreIndices(isrow, &r));
586: PetscCall(ISIdentity(isrow, &row_identity));
587: PetscCall(ISIdentity(isicol, &col_identity));
588: if (row_identity && col_identity) {
589: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
590: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
591: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
592: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
593: } else {
594: C->ops->solve = MatSolve_SeqBAIJ_1;
595: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
596: }
597: C->assembled = PETSC_TRUE;
598: PetscCall(PetscLogFlops(C->cmap->n));
600: /* MatShiftView(A,info,&sctx) */
601: if (sctx.nshift) {
602: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
603: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
604: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
605: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
606: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
607: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
608: }
609: }
610: PetscFunctionReturn(PETSC_SUCCESS);
611: }
613: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
614: {
615: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
616: IS isrow = b->row, isicol = b->icol;
617: const PetscInt *r, *ic;
618: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
619: PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
620: PetscInt diag, *pj;
621: const PetscInt *diag_offset;
622: MatScalar *pv, *v, *rtmp, multiplier, *pc;
623: MatScalar *ba = b->a, *aa = a->a;
624: PetscBool row_identity, col_identity;
626: PetscFunctionBegin;
627: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
628: A->factortype = MAT_FACTOR_NONE;
629: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
630: A->factortype = MAT_FACTOR_ILU;
631: PetscCall(ISGetIndices(isrow, &r));
632: PetscCall(ISGetIndices(isicol, &ic));
633: PetscCall(PetscMalloc1(n + 1, &rtmp));
635: for (i = 0; i < n; i++) {
636: nz = bi[i + 1] - bi[i];
637: ajtmp = bj + bi[i];
638: for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
640: /* load in initial (unfactored row) */
641: nz = ai[r[i] + 1] - ai[r[i]];
642: ajtmpold = aj + ai[r[i]];
643: v = aa + ai[r[i]];
644: for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
646: row = *ajtmp++;
647: while (row < i) {
648: pc = rtmp + row;
649: if (*pc != 0.0) {
650: pv = ba + diag_offset[row];
651: pj = bj + diag_offset[row] + 1;
652: multiplier = *pc * *pv++;
653: *pc = multiplier;
654: nz = bi[row + 1] - diag_offset[row] - 1;
655: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
656: PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
657: }
658: row = *ajtmp++;
659: }
660: /* finished row so stick it into b->a */
661: pv = ba + bi[i];
662: pj = bj + bi[i];
663: nz = bi[i + 1] - bi[i];
664: for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
665: diag = diag_offset[i] - bi[i];
666: /* check pivot entry for current row */
667: PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
668: pv[diag] = 1.0 / pv[diag];
669: }
671: PetscCall(PetscFree(rtmp));
672: PetscCall(ISRestoreIndices(isicol, &ic));
673: PetscCall(ISRestoreIndices(isrow, &r));
674: PetscCall(ISIdentity(isrow, &row_identity));
675: PetscCall(ISIdentity(isicol, &col_identity));
676: if (row_identity && col_identity) {
677: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
678: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
679: } else {
680: C->ops->solve = MatSolve_SeqBAIJ_1_inplace;
681: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
682: }
683: C->assembled = PETSC_TRUE;
684: PetscCall(PetscLogFlops(C->cmap->n));
685: PetscFunctionReturn(PETSC_SUCCESS);
686: }
688: static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
689: {
690: PetscFunctionBegin;
691: *type = MATSOLVERPETSC;
692: PetscFunctionReturn(PETSC_SUCCESS);
693: }
695: PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
696: {
697: PetscInt n = A->rmap->n;
699: PetscFunctionBegin;
700: #if defined(PETSC_USE_COMPLEX)
701: PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
702: #endif
703: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
704: PetscCall(MatSetSizes(*B, n, n, n, n));
705: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
706: PetscCall(MatSetType(*B, MATSEQBAIJ));
708: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ;
709: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
710: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
711: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
712: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
713: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
714: PetscCall(MatSetType(*B, MATSEQSBAIJ));
715: PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
717: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ;
718: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
719: /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
720: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
721: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
722: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
723: (*B)->factortype = ftype;
724: (*B)->canuseordering = PETSC_TRUE;
726: PetscCall(PetscFree((*B)->solvertype));
727: PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
728: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
729: PetscFunctionReturn(PETSC_SUCCESS);
730: }
732: PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
733: {
734: Mat C;
736: PetscFunctionBegin;
737: PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
738: PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
739: PetscCall(MatLUFactorNumeric(C, A, info));
741: A->ops->solve = C->ops->solve;
742: A->ops->solvetranspose = C->ops->solvetranspose;
744: PetscCall(MatHeaderMerge(A, &C));
745: PetscFunctionReturn(PETSC_SUCCESS);
746: }
748: #include <../src/mat/impls/sbaij/seq/sbaij.h>
749: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
750: {
751: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
752: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
753: IS ip = b->row;
754: const PetscInt *rip;
755: PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
756: PetscInt *ai = a->i, *aj = a->j;
757: PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
758: MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
759: PetscReal rs;
760: FactorShiftCtx sctx;
762: PetscFunctionBegin;
763: if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
764: if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
765: PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
766: PetscCall(MatDestroy(&a->sbaijMat));
767: PetscFunctionReturn(PETSC_SUCCESS);
768: }
770: /* MatPivotSetUp(): initialize shift context sctx */
771: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
773: PetscCall(ISGetIndices(ip, &rip));
774: PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
776: sctx.shift_amount = 0.;
777: sctx.nshift = 0;
778: do {
779: sctx.newshift = PETSC_FALSE;
780: for (i = 0; i < mbs; i++) {
781: rtmp[i] = 0.0;
782: jl[i] = mbs;
783: il[0] = 0;
784: }
786: for (k = 0; k < mbs; k++) {
787: bval = ba + bi[k];
788: /* initialize k-th row by the perm[k]-th row of A */
789: jmin = ai[rip[k]];
790: jmax = ai[rip[k] + 1];
791: for (j = jmin; j < jmax; j++) {
792: col = rip[aj[j]];
793: if (col >= k) { /* only take upper triangular entry */
794: rtmp[col] = aa[j];
795: *bval++ = 0.0; /* for in-place factorization */
796: }
797: }
799: /* shift the diagonal of the matrix */
800: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
802: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
803: dk = rtmp[k];
804: i = jl[k]; /* first row to be added to k_th row */
806: while (i < k) {
807: nexti = jl[i]; /* next row to be added to k_th row */
809: /* compute multiplier, update diag(k) and U(i,k) */
810: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
811: uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
812: dk += uikdi * ba[ili];
813: ba[ili] = uikdi; /* -U(i,k) */
815: /* add multiple of row i to k-th row */
816: jmin = ili + 1;
817: jmax = bi[i + 1];
818: if (jmin < jmax) {
819: for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
820: /* update il and jl for row i */
821: il[i] = jmin;
822: j = bj[jmin];
823: jl[i] = jl[j];
824: jl[j] = i;
825: }
826: i = nexti;
827: }
829: /* shift the diagonals when zero pivot is detected */
830: /* compute rs=sum of abs(off-diagonal) */
831: rs = 0.0;
832: jmin = bi[k] + 1;
833: nz = bi[k + 1] - jmin;
834: if (nz) {
835: bcol = bj + jmin;
836: while (nz--) {
837: rs += PetscAbsScalar(rtmp[*bcol]);
838: bcol++;
839: }
840: }
842: sctx.rs = rs;
843: sctx.pv = dk;
844: PetscCall(MatPivotCheck(C, A, info, &sctx, k));
845: if (sctx.newshift) break;
846: dk = sctx.pv;
848: /* copy data into U(k,:) */
849: ba[bi[k]] = 1.0 / dk; /* U(k,k) */
850: jmin = bi[k] + 1;
851: jmax = bi[k + 1];
852: if (jmin < jmax) {
853: for (j = jmin; j < jmax; j++) {
854: col = bj[j];
855: ba[j] = rtmp[col];
856: rtmp[col] = 0.0;
857: }
858: /* add the k-th row into il and jl */
859: il[k] = jmin;
860: i = bj[jmin];
861: jl[k] = jl[i];
862: jl[i] = k;
863: }
864: }
865: } while (sctx.newshift);
866: PetscCall(PetscFree3(rtmp, il, jl));
868: PetscCall(ISRestoreIndices(ip, &rip));
870: C->assembled = PETSC_TRUE;
871: C->preallocated = PETSC_TRUE;
873: PetscCall(PetscLogFlops(C->rmap->N));
874: if (sctx.nshift) {
875: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
876: PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
877: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
878: PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
879: }
880: }
881: PetscFunctionReturn(PETSC_SUCCESS);
882: }
884: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
885: {
886: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
887: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
888: PetscInt i, j, am = a->mbs;
889: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
890: PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
891: MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
892: PetscReal rs;
893: FactorShiftCtx sctx;
895: PetscFunctionBegin;
896: /* MatPivotSetUp(): initialize shift context sctx */
897: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
899: PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
901: do {
902: sctx.newshift = PETSC_FALSE;
903: for (i = 0; i < am; i++) {
904: rtmp[i] = 0.0;
905: jl[i] = am;
906: il[0] = 0;
907: }
909: for (k = 0; k < am; k++) {
910: /* initialize k-th row with elements nonzero in row perm(k) of A */
911: nz = ai[k + 1] - ai[k];
912: acol = aj + ai[k];
913: aval = aa + ai[k];
914: bval = ba + bi[k];
915: while (nz--) {
916: if (*acol < k) { /* skip lower triangular entries */
917: acol++;
918: aval++;
919: } else {
920: rtmp[*acol++] = *aval++;
921: *bval++ = 0.0; /* for in-place factorization */
922: }
923: }
925: /* shift the diagonal of the matrix */
926: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
928: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
929: dk = rtmp[k];
930: i = jl[k]; /* first row to be added to k_th row */
932: while (i < k) {
933: nexti = jl[i]; /* next row to be added to k_th row */
934: /* compute multiplier, update D(k) and U(i,k) */
935: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
936: uikdi = -ba[ili] * ba[bi[i]];
937: dk += uikdi * ba[ili];
938: ba[ili] = uikdi; /* -U(i,k) */
940: /* add multiple of row i to k-th row ... */
941: jmin = ili + 1;
942: nz = bi[i + 1] - jmin;
943: if (nz > 0) {
944: bcol = bj + jmin;
945: bval = ba + jmin;
946: while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
947: /* update il and jl for i-th row */
948: il[i] = jmin;
949: j = bj[jmin];
950: jl[i] = jl[j];
951: jl[j] = i;
952: }
953: i = nexti;
954: }
956: /* shift the diagonals when zero pivot is detected */
957: /* compute rs=sum of abs(off-diagonal) */
958: rs = 0.0;
959: jmin = bi[k] + 1;
960: nz = bi[k + 1] - jmin;
961: if (nz) {
962: bcol = bj + jmin;
963: while (nz--) {
964: rs += PetscAbsScalar(rtmp[*bcol]);
965: bcol++;
966: }
967: }
969: sctx.rs = rs;
970: sctx.pv = dk;
971: PetscCall(MatPivotCheck(C, A, info, &sctx, k));
972: if (sctx.newshift) break; /* sctx.shift_amount is updated */
973: dk = sctx.pv;
975: /* copy data into U(k,:) */
976: ba[bi[k]] = 1.0 / dk;
977: jmin = bi[k] + 1;
978: nz = bi[k + 1] - jmin;
979: if (nz) {
980: bcol = bj + jmin;
981: bval = ba + jmin;
982: while (nz--) {
983: *bval++ = rtmp[*bcol];
984: rtmp[*bcol++] = 0.0;
985: }
986: /* add k-th row into il and jl */
987: il[k] = jmin;
988: i = bj[jmin];
989: jl[k] = jl[i];
990: jl[i] = k;
991: }
992: }
993: } while (sctx.newshift);
994: PetscCall(PetscFree3(rtmp, il, jl));
996: C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
997: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
998: C->assembled = PETSC_TRUE;
999: C->preallocated = PETSC_TRUE;
1001: PetscCall(PetscLogFlops(C->rmap->N));
1002: if (sctx.nshift) {
1003: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1004: PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1005: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1006: PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1007: }
1008: }
1009: PetscFunctionReturn(PETSC_SUCCESS);
1010: }
1012: #include <petscbt.h>
1013: #include <../src/mat/utils/freespace.h>
1014: PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1015: {
1016: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1017: Mat_SeqSBAIJ *b;
1018: Mat B;
1019: PetscBool perm_identity;
1020: PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1021: const PetscInt *rip;
1022: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1023: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1024: PetscReal fill = info->fill, levels = info->levels;
1025: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1026: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1027: PetscBT lnkbt;
1028: PetscBool diagDense;
1029: const PetscInt *adiag;
1031: PetscFunctionBegin;
1032: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
1033: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1035: if (bs > 1) {
1036: if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1037: fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1039: PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1040: PetscFunctionReturn(PETSC_SUCCESS);
1041: }
1043: PetscCall(ISIdentity(perm, &perm_identity));
1044: PetscCall(ISGetIndices(perm, &rip));
1046: /* special case that simply copies fill pattern */
1047: if (!levels && perm_identity) {
1048: PetscCall(PetscMalloc1(am + 1, &ui));
1049: for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */
1050: B = fact;
1051: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1053: b = (Mat_SeqSBAIJ *)B->data;
1054: uj = b->j;
1055: for (i = 0; i < am; i++) {
1056: aj = a->j + adiag[i];
1057: for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1058: b->ilen[i] = ui[i];
1059: }
1060: PetscCall(PetscFree(ui));
1062: B->factortype = MAT_FACTOR_NONE;
1064: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1065: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1066: B->factortype = MAT_FACTOR_ICC;
1068: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1069: PetscFunctionReturn(PETSC_SUCCESS);
1070: }
1072: /* initialization */
1073: PetscCall(PetscMalloc1(am + 1, &ui));
1074: ui[0] = 0;
1075: PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1077: /* jl: linked list for storing indices of the pivot rows
1078: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1079: PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1080: for (i = 0; i < am; i++) {
1081: jl[i] = am;
1082: il[i] = 0;
1083: }
1085: /* create and initialize a linked list for storing column indices of the active row k */
1086: nlnk = am + 1;
1087: PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1089: /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1090: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1092: current_space = free_space;
1094: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1095: current_space_lvl = free_space_lvl;
1097: for (k = 0; k < am; k++) { /* for each active row k */
1098: /* initialize lnk by the column indices of row rip[k] of A */
1099: nzk = 0;
1100: ncols = ai[rip[k] + 1] - ai[rip[k]];
1101: ncols_upper = 0;
1102: cols = cols_lvl + am;
1103: for (j = 0; j < ncols; j++) {
1104: i = rip[*(aj + ai[rip[k]] + j)];
1105: if (i >= k) { /* only take upper triangular entry */
1106: cols[ncols_upper] = i;
1107: cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1108: ncols_upper++;
1109: }
1110: }
1111: PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1112: nzk += nlnk;
1114: /* update lnk by computing fill-in for each pivot row to be merged in */
1115: prow = jl[k]; /* 1st pivot row */
1117: while (prow < k) {
1118: nextprow = jl[prow];
1120: /* merge prow into k-th row */
1121: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1122: jmax = ui[prow + 1];
1123: ncols = jmax - jmin;
1124: i = jmin - ui[prow];
1125: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1126: for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1127: PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1128: nzk += nlnk;
1130: /* update il and jl for prow */
1131: if (jmin < jmax) {
1132: il[prow] = jmin;
1134: j = *cols;
1135: jl[prow] = jl[j];
1136: jl[j] = prow;
1137: }
1138: prow = nextprow;
1139: }
1141: /* if free space is not available, make more free space */
1142: if (current_space->local_remaining < nzk) {
1143: i = am - k + 1; /* num of unfactored rows */
1144: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1145: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1146: PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
1147: reallocs++;
1148: }
1150: /* copy data into free_space and free_space_lvl, then initialize lnk */
1151: PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1153: /* add the k-th row into il and jl */
1154: if (nzk - 1 > 0) {
1155: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1156: jl[k] = jl[i];
1157: jl[i] = k;
1158: il[k] = ui[k] + 1;
1159: }
1160: uj_ptr[k] = current_space->array;
1161: uj_lvl_ptr[k] = current_space_lvl->array;
1163: current_space->array += nzk;
1164: current_space->local_used += nzk;
1165: current_space->local_remaining -= nzk;
1167: current_space_lvl->array += nzk;
1168: current_space_lvl->local_used += nzk;
1169: current_space_lvl->local_remaining -= nzk;
1171: ui[k + 1] = ui[k] + nzk;
1172: }
1174: PetscCall(ISRestoreIndices(perm, &rip));
1175: PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1176: PetscCall(PetscFree(cols_lvl));
1178: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1179: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1180: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1181: PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1182: PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1184: /* put together the new matrix in MATSEQSBAIJ format */
1185: B = fact;
1186: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1188: b = (Mat_SeqSBAIJ *)B->data;
1189: b->free_a = PETSC_TRUE;
1190: b->free_ij = PETSC_TRUE;
1192: PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1194: b->j = uj;
1195: b->i = ui;
1196: b->diag = NULL;
1197: b->ilen = NULL;
1198: b->imax = NULL;
1199: b->row = perm;
1200: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1202: PetscCall(PetscObjectReference((PetscObject)perm));
1204: b->icol = perm;
1206: PetscCall(PetscObjectReference((PetscObject)perm));
1207: PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1209: b->maxnz = b->nz = ui[am];
1211: B->info.factor_mallocs = reallocs;
1212: B->info.fill_ratio_given = fill;
1213: if (ai[am] != 0.) {
1214: /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1215: B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1216: } else {
1217: B->info.fill_ratio_needed = 0.0;
1218: }
1219: #if defined(PETSC_USE_INFO)
1220: if (ai[am] != 0) {
1221: PetscReal af = B->info.fill_ratio_needed;
1222: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1223: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1224: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1225: } else {
1226: PetscCall(PetscInfo(A, "Empty matrix\n"));
1227: }
1228: #endif
1229: if (perm_identity) {
1230: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1231: B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1232: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1233: } else {
1234: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1235: }
1236: PetscFunctionReturn(PETSC_SUCCESS);
1237: }
1239: PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1240: {
1241: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1242: Mat_SeqSBAIJ *b;
1243: Mat B;
1244: PetscBool perm_identity;
1245: PetscReal fill = info->fill;
1246: const PetscInt *rip;
1247: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1248: PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1249: PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1250: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1251: PetscBT lnkbt;
1252: PetscBool diagDense;
1254: PetscFunctionBegin;
1255: if (bs > 1) { /* convert to seqsbaij */
1256: if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1257: fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1259: PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1260: PetscFunctionReturn(PETSC_SUCCESS);
1261: }
1262: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense));
1263: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1265: /* check whether perm is the identity mapping */
1266: PetscCall(ISIdentity(perm, &perm_identity));
1267: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1268: PetscCall(ISGetIndices(perm, &rip));
1270: /* initialization */
1271: PetscCall(PetscMalloc1(mbs + 1, &ui));
1272: ui[0] = 0;
1274: /* jl: linked list for storing indices of the pivot rows
1275: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1276: PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1277: for (i = 0; i < mbs; i++) {
1278: jl[i] = mbs;
1279: il[i] = 0;
1280: }
1282: /* create and initialize a linked list for storing column indices of the active row k */
1283: nlnk = mbs + 1;
1284: PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1286: /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1287: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1289: current_space = free_space;
1291: for (k = 0; k < mbs; k++) { /* for each active row k */
1292: /* initialize lnk by the column indices of row rip[k] of A */
1293: nzk = 0;
1294: ncols = ai[rip[k] + 1] - ai[rip[k]];
1295: ncols_upper = 0;
1296: for (j = 0; j < ncols; j++) {
1297: i = rip[*(aj + ai[rip[k]] + j)];
1298: if (i >= k) { /* only take upper triangular entry */
1299: cols[ncols_upper] = i;
1300: ncols_upper++;
1301: }
1302: }
1303: PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1304: nzk += nlnk;
1306: /* update lnk by computing fill-in for each pivot row to be merged in */
1307: prow = jl[k]; /* 1st pivot row */
1309: while (prow < k) {
1310: nextprow = jl[prow];
1311: /* merge prow into k-th row */
1312: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1313: jmax = ui[prow + 1];
1314: ncols = jmax - jmin;
1315: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1316: PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1317: nzk += nlnk;
1319: /* update il and jl for prow */
1320: if (jmin < jmax) {
1321: il[prow] = jmin;
1322: j = *uj_ptr;
1323: jl[prow] = jl[j];
1324: jl[j] = prow;
1325: }
1326: prow = nextprow;
1327: }
1329: /* if free space is not available, make more free space */
1330: if (current_space->local_remaining < nzk) {
1331: i = mbs - k + 1; /* num of unfactored rows */
1332: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1333: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1334: reallocs++;
1335: }
1337: /* copy data into free space, then initialize lnk */
1338: PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1340: /* add the k-th row into il and jl */
1341: if (nzk - 1 > 0) {
1342: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1343: jl[k] = jl[i];
1344: jl[i] = k;
1345: il[k] = ui[k] + 1;
1346: }
1347: ui_ptr[k] = current_space->array;
1348: current_space->array += nzk;
1349: current_space->local_used += nzk;
1350: current_space->local_remaining -= nzk;
1352: ui[k + 1] = ui[k] + nzk;
1353: }
1355: PetscCall(ISRestoreIndices(perm, &rip));
1356: PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1358: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1359: PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1360: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1361: PetscCall(PetscLLDestroy(lnk, lnkbt));
1363: /* put together the new matrix in MATSEQSBAIJ format */
1364: B = fact;
1365: PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1367: b = (Mat_SeqSBAIJ *)B->data;
1368: b->free_a = PETSC_TRUE;
1369: b->free_ij = PETSC_TRUE;
1371: PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1373: b->j = uj;
1374: b->i = ui;
1375: b->diag = NULL;
1376: b->ilen = NULL;
1377: b->imax = NULL;
1378: b->row = perm;
1379: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1381: PetscCall(PetscObjectReference((PetscObject)perm));
1382: b->icol = perm;
1383: PetscCall(PetscObjectReference((PetscObject)perm));
1384: PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1385: b->maxnz = b->nz = ui[mbs];
1387: B->info.factor_mallocs = reallocs;
1388: B->info.fill_ratio_given = fill;
1389: if (ai[mbs] != 0.) {
1390: /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1391: B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1392: } else {
1393: B->info.fill_ratio_needed = 0.0;
1394: }
1395: #if defined(PETSC_USE_INFO)
1396: if (ai[mbs] != 0.) {
1397: PetscReal af = B->info.fill_ratio_needed;
1398: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1399: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1400: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1401: } else {
1402: PetscCall(PetscInfo(A, "Empty matrix\n"));
1403: }
1404: #endif
1405: if (perm_identity) {
1406: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1407: } else {
1408: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1409: }
1410: PetscFunctionReturn(PETSC_SUCCESS);
1411: }
1413: PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1414: {
1415: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1416: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1417: PetscInt i, k, n = a->mbs;
1418: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1419: const MatScalar *aa = a->a, *v;
1420: PetscScalar *x, *s, *t, *ls;
1421: const PetscScalar *b;
1423: PetscFunctionBegin;
1424: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1425: PetscCall(VecGetArrayRead(bb, &b));
1426: PetscCall(VecGetArray(xx, &x));
1427: t = a->solve_work;
1429: /* forward solve the lower triangular */
1430: PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1432: for (i = 1; i < n; i++) {
1433: v = aa + bs2 * ai[i];
1434: vi = aj + ai[i];
1435: nz = ai[i + 1] - ai[i];
1436: s = t + bs * i;
1437: PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1438: for (k = 0; k < nz; k++) {
1439: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1440: v += bs2;
1441: }
1442: }
1444: /* backward solve the upper triangular */
1445: ls = a->solve_work + A->cmap->n;
1446: for (i = n - 1; i >= 0; i--) {
1447: v = aa + bs2 * (adiag[i + 1] + 1);
1448: vi = aj + adiag[i + 1] + 1;
1449: nz = adiag[i] - adiag[i + 1] - 1;
1450: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1451: for (k = 0; k < nz; k++) {
1452: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1453: v += bs2;
1454: }
1455: PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1456: PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1457: }
1459: PetscCall(VecRestoreArrayRead(bb, &b));
1460: PetscCall(VecRestoreArray(xx, &x));
1461: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1462: PetscFunctionReturn(PETSC_SUCCESS);
1463: }
1465: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1466: {
1467: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1468: IS iscol = a->col, isrow = a->row;
1469: const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1470: PetscInt i, m, n = a->mbs;
1471: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1472: const MatScalar *aa = a->a, *v;
1473: PetscScalar *x, *s, *t, *ls;
1474: const PetscScalar *b;
1476: PetscFunctionBegin;
1477: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1478: PetscCall(VecGetArrayRead(bb, &b));
1479: PetscCall(VecGetArray(xx, &x));
1480: t = a->solve_work;
1482: PetscCall(ISGetIndices(isrow, &rout));
1483: r = rout;
1484: PetscCall(ISGetIndices(iscol, &cout));
1485: c = cout;
1487: /* forward solve the lower triangular */
1488: PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1489: for (i = 1; i < n; i++) {
1490: v = aa + bs2 * ai[i];
1491: vi = aj + ai[i];
1492: nz = ai[i + 1] - ai[i];
1493: s = t + bs * i;
1494: PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1495: for (m = 0; m < nz; m++) {
1496: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1497: v += bs2;
1498: }
1499: }
1501: /* backward solve the upper triangular */
1502: ls = a->solve_work + A->cmap->n;
1503: for (i = n - 1; i >= 0; i--) {
1504: v = aa + bs2 * (adiag[i + 1] + 1);
1505: vi = aj + adiag[i + 1] + 1;
1506: nz = adiag[i] - adiag[i + 1] - 1;
1507: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1508: for (m = 0; m < nz; m++) {
1509: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1510: v += bs2;
1511: }
1512: PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1513: PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1514: }
1515: PetscCall(ISRestoreIndices(isrow, &rout));
1516: PetscCall(ISRestoreIndices(iscol, &cout));
1517: PetscCall(VecRestoreArrayRead(bb, &b));
1518: PetscCall(VecRestoreArray(xx, &x));
1519: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1520: PetscFunctionReturn(PETSC_SUCCESS);
1521: }