Actual source code: baijfact.c
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include <../src/mat/impls/baij/seq/baij.h>
5: #include <petsc/private/kernels/blockinvert.h>
7: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
8: {
9: Mat C = B;
10: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11: IS isrow = b->row, isicol = b->icol;
12: const PetscInt *r, *ic;
13: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
14: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
15: MatScalar *rtmp, *pc, *mwork, *pv;
16: MatScalar *aa = a->a, *v;
17: PetscReal shift = info->shiftamount;
18: const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
20: PetscFunctionBegin;
21: PetscCall(ISGetIndices(isrow, &r));
22: PetscCall(ISGetIndices(isicol, &ic));
24: /* generate work space needed by the factorization */
25: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
26: PetscCall(PetscArrayzero(rtmp, bs2 * n));
28: for (PetscInt i = 0; i < n; i++) {
29: /* zero rtmp */
30: /* L part */
31: PetscInt *pj;
32: PetscInt nzL, nz = bi[i + 1] - bi[i];
33: bjtmp = bj + bi[i];
34: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
36: /* U part */
37: nz = bdiag[i] - bdiag[i + 1];
38: bjtmp = bj + bdiag[i + 1] + 1;
39: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
41: /* load in initial (unfactored row) */
42: nz = ai[r[i] + 1] - ai[r[i]];
43: ajtmp = aj + ai[r[i]];
44: v = aa + bs2 * ai[r[i]];
45: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
47: /* elimination */
48: bjtmp = bj + bi[i];
49: nzL = bi[i + 1] - bi[i];
50: for (PetscInt k = 0; k < nzL; k++) {
51: PetscBool flg = PETSC_FALSE;
52: PetscInt row = bjtmp[k];
54: pc = rtmp + bs2 * row;
55: for (PetscInt j = 0; j < bs2; j++) {
56: if (pc[j] != (PetscScalar)0.0) {
57: flg = PETSC_TRUE;
58: break;
59: }
60: }
61: if (flg) {
62: pv = b->a + bs2 * bdiag[row];
63: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64: PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
66: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67: pv = b->a + bs2 * (bdiag[row + 1] + 1);
68: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69: for (PetscInt j = 0; j < nz; j++) {
70: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72: v = rtmp + 4 * pj[j];
73: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74: pv += 4;
75: }
76: PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77: }
78: }
80: /* finished row so stick it into b->a */
81: /* L part */
82: pv = b->a + bs2 * bi[i];
83: pj = b->j + bi[i];
84: nz = bi[i + 1] - bi[i];
85: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
87: /* Mark diagonal and invert diagonal for simpler triangular solves */
88: pv = b->a + bs2 * bdiag[i];
89: pj = b->j + bdiag[i];
90: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91: {
92: PetscBool zeropivotdetected;
94: PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
95: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
96: }
98: /* U part */
99: pv = b->a + bs2 * (bdiag[i + 1] + 1);
100: pj = b->j + bdiag[i + 1] + 1;
101: nz = bdiag[i] - bdiag[i + 1] - 1;
102: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103: }
105: PetscCall(PetscFree2(rtmp, mwork));
106: PetscCall(ISRestoreIndices(isicol, &ic));
107: PetscCall(ISRestoreIndices(isrow, &r));
109: C->ops->solve = MatSolve_SeqBAIJ_2;
110: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111: C->assembled = PETSC_TRUE;
113: PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114: PetscFunctionReturn(PETSC_SUCCESS);
115: }
117: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118: {
119: Mat C = B;
120: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121: PetscInt i, j, k, nz, nzL, row, *pj;
122: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124: MatScalar *rtmp, *pc, *mwork, *pv;
125: MatScalar *aa = a->a, *v;
126: PetscInt flg;
127: PetscReal shift = info->shiftamount;
128: PetscBool allowzeropivot, zeropivotdetected;
130: PetscFunctionBegin;
131: allowzeropivot = PetscNot(A->erroriffailure);
133: /* generate work space needed by the factorization */
134: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135: PetscCall(PetscArrayzero(rtmp, bs2 * n));
137: for (i = 0; i < n; i++) {
138: /* zero rtmp */
139: /* L part */
140: nz = bi[i + 1] - bi[i];
141: bjtmp = bj + bi[i];
142: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
144: /* U part */
145: nz = bdiag[i] - bdiag[i + 1];
146: bjtmp = bj + bdiag[i + 1] + 1;
147: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
149: /* load in initial (unfactored row) */
150: nz = ai[i + 1] - ai[i];
151: ajtmp = aj + ai[i];
152: v = aa + bs2 * ai[i];
153: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
155: /* elimination */
156: bjtmp = bj + bi[i];
157: nzL = bi[i + 1] - bi[i];
158: for (k = 0; k < nzL; k++) {
159: row = bjtmp[k];
160: pc = rtmp + bs2 * row;
161: for (flg = 0, j = 0; j < bs2; j++) {
162: if (pc[j] != (PetscScalar)0.0) {
163: flg = 1;
164: break;
165: }
166: }
167: if (flg) {
168: pv = b->a + bs2 * bdiag[row];
169: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170: PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
172: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173: pv = b->a + bs2 * (bdiag[row + 1] + 1);
174: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175: for (j = 0; j < nz; j++) {
176: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178: v = rtmp + 4 * pj[j];
179: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180: pv += 4;
181: }
182: PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183: }
184: }
186: /* finished row so stick it into b->a */
187: /* L part */
188: pv = b->a + bs2 * bi[i];
189: pj = b->j + bi[i];
190: nz = bi[i + 1] - bi[i];
191: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
193: /* Mark diagonal and invert diagonal for simpler triangular solves */
194: pv = b->a + bs2 * bdiag[i];
195: pj = b->j + bdiag[i];
196: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197: PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
200: /* U part */
201: /*
202: pv = b->a + bs2*bi[2*n-i];
203: pj = b->j + bi[2*n-i];
204: nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205: */
206: pv = b->a + bs2 * (bdiag[i + 1] + 1);
207: pj = b->j + bdiag[i + 1] + 1;
208: nz = bdiag[i] - bdiag[i + 1] - 1;
209: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210: }
211: PetscCall(PetscFree2(rtmp, mwork));
213: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
214: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217: C->assembled = PETSC_TRUE;
219: PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220: PetscFunctionReturn(PETSC_SUCCESS);
221: }
223: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224: {
225: Mat C = B;
226: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227: IS isrow = b->row, isicol = b->icol;
228: const PetscInt *r, *ic;
229: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230: PetscInt *ajtmpold, *ajtmp, nz, row;
231: PetscInt idx, *ai = a->i, *aj = a->j, *pj;
232: const PetscInt *diag_offset;
233: MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234: MatScalar p1, p2, p3, p4;
235: MatScalar *ba = b->a, *aa = a->a;
236: PetscReal shift = info->shiftamount;
237: PetscBool allowzeropivot, zeropivotdetected;
239: PetscFunctionBegin;
240: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
241: A->factortype = MAT_FACTOR_NONE;
242: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
243: A->factortype = MAT_FACTOR_ILU;
244: allowzeropivot = PetscNot(A->erroriffailure);
245: PetscCall(ISGetIndices(isrow, &r));
246: PetscCall(ISGetIndices(isicol, &ic));
247: PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
249: for (i = 0; i < n; i++) {
250: nz = bi[i + 1] - bi[i];
251: ajtmp = bj + bi[i];
252: for (j = 0; j < nz; j++) {
253: x = rtmp + 4 * ajtmp[j];
254: x[0] = x[1] = x[2] = x[3] = 0.0;
255: }
256: /* load in initial (unfactored row) */
257: idx = r[i];
258: nz = ai[idx + 1] - ai[idx];
259: ajtmpold = aj + ai[idx];
260: v = aa + 4 * ai[idx];
261: for (j = 0; j < nz; j++) {
262: x = rtmp + 4 * ic[ajtmpold[j]];
263: x[0] = v[0];
264: x[1] = v[1];
265: x[2] = v[2];
266: x[3] = v[3];
267: v += 4;
268: }
269: row = *ajtmp++;
270: while (row < i) {
271: pc = rtmp + 4 * row;
272: p1 = pc[0];
273: p2 = pc[1];
274: p3 = pc[2];
275: p4 = pc[3];
276: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
277: pv = ba + 4 * diag_offset[row];
278: pj = bj + diag_offset[row] + 1;
279: x1 = pv[0];
280: x2 = pv[1];
281: x3 = pv[2];
282: x4 = pv[3];
283: pc[0] = m1 = p1 * x1 + p3 * x2;
284: pc[1] = m2 = p2 * x1 + p4 * x2;
285: pc[2] = m3 = p1 * x3 + p3 * x4;
286: pc[3] = m4 = p2 * x3 + p4 * x4;
287: nz = bi[row + 1] - diag_offset[row] - 1;
288: pv += 4;
289: for (j = 0; j < nz; j++) {
290: x1 = pv[0];
291: x2 = pv[1];
292: x3 = pv[2];
293: x4 = pv[3];
294: x = rtmp + 4 * pj[j];
295: x[0] -= m1 * x1 + m3 * x2;
296: x[1] -= m2 * x1 + m4 * x2;
297: x[2] -= m1 * x3 + m3 * x4;
298: x[3] -= m2 * x3 + m4 * x4;
299: pv += 4;
300: }
301: PetscCall(PetscLogFlops(16.0 * nz + 12.0));
302: }
303: row = *ajtmp++;
304: }
305: /* finished row so stick it into b->a */
306: pv = ba + 4 * bi[i];
307: pj = bj + bi[i];
308: nz = bi[i + 1] - bi[i];
309: for (j = 0; j < nz; j++) {
310: x = rtmp + 4 * pj[j];
311: pv[0] = x[0];
312: pv[1] = x[1];
313: pv[2] = x[2];
314: pv[3] = x[3];
315: pv += 4;
316: }
317: /* invert diagonal block */
318: w = ba + 4 * diag_offset[i];
319: PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
320: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
321: }
323: PetscCall(PetscFree(rtmp));
324: PetscCall(ISRestoreIndices(isicol, &ic));
325: PetscCall(ISRestoreIndices(isrow, &r));
327: C->ops->solve = MatSolve_SeqBAIJ_2_inplace;
328: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
329: C->assembled = PETSC_TRUE;
331: PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
332: PetscFunctionReturn(PETSC_SUCCESS);
333: }
334: /*
335: Version for when blocks are 2 by 2 Using natural ordering
336: */
337: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
338: {
339: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
340: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
341: PetscInt *ajtmpold, *ajtmp, nz, row;
342: PetscInt *ai = a->i, *aj = a->j, *pj;
343: const PetscInt *diag_offset;
344: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
345: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
346: MatScalar *ba = b->a, *aa = a->a;
347: PetscReal shift = info->shiftamount;
348: PetscBool allowzeropivot, zeropivotdetected;
350: PetscFunctionBegin;
351: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
352: A->factortype = MAT_FACTOR_NONE;
353: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
354: A->factortype = MAT_FACTOR_ILU;
355: allowzeropivot = PetscNot(A->erroriffailure);
356: PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
357: for (i = 0; i < n; i++) {
358: nz = bi[i + 1] - bi[i];
359: ajtmp = bj + bi[i];
360: for (j = 0; j < nz; j++) {
361: x = rtmp + 4 * ajtmp[j];
362: x[0] = x[1] = x[2] = x[3] = 0.0;
363: }
364: /* load in initial (unfactored row) */
365: nz = ai[i + 1] - ai[i];
366: ajtmpold = aj + ai[i];
367: v = aa + 4 * ai[i];
368: for (j = 0; j < nz; j++) {
369: x = rtmp + 4 * ajtmpold[j];
370: x[0] = v[0];
371: x[1] = v[1];
372: x[2] = v[2];
373: x[3] = v[3];
374: v += 4;
375: }
376: row = *ajtmp++;
377: while (row < i) {
378: pc = rtmp + 4 * row;
379: p1 = pc[0];
380: p2 = pc[1];
381: p3 = pc[2];
382: p4 = pc[3];
383: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
384: pv = ba + 4 * diag_offset[row];
385: pj = bj + diag_offset[row] + 1;
386: x1 = pv[0];
387: x2 = pv[1];
388: x3 = pv[2];
389: x4 = pv[3];
390: pc[0] = m1 = p1 * x1 + p3 * x2;
391: pc[1] = m2 = p2 * x1 + p4 * x2;
392: pc[2] = m3 = p1 * x3 + p3 * x4;
393: pc[3] = m4 = p2 * x3 + p4 * x4;
394: nz = bi[row + 1] - diag_offset[row] - 1;
395: pv += 4;
396: for (j = 0; j < nz; j++) {
397: x1 = pv[0];
398: x2 = pv[1];
399: x3 = pv[2];
400: x4 = pv[3];
401: x = rtmp + 4 * pj[j];
402: x[0] -= m1 * x1 + m3 * x2;
403: x[1] -= m2 * x1 + m4 * x2;
404: x[2] -= m1 * x3 + m3 * x4;
405: x[3] -= m2 * x3 + m4 * x4;
406: pv += 4;
407: }
408: PetscCall(PetscLogFlops(16.0 * nz + 12.0));
409: }
410: row = *ajtmp++;
411: }
412: /* finished row so stick it into b->a */
413: pv = ba + 4 * bi[i];
414: pj = bj + bi[i];
415: nz = bi[i + 1] - bi[i];
416: for (j = 0; j < nz; j++) {
417: x = rtmp + 4 * pj[j];
418: pv[0] = x[0];
419: pv[1] = x[1];
420: pv[2] = x[2];
421: pv[3] = x[3];
422: /*
423: printf(" col %d:",pj[j]);
424: for (PetscInt j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
425: printf("\n");
426: */
427: pv += 4;
428: }
429: /* invert diagonal block */
430: w = ba + 4 * diag_offset[i];
431: PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
432: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
433: }
435: PetscCall(PetscFree(rtmp));
437: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
438: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
439: C->assembled = PETSC_TRUE;
441: PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
442: PetscFunctionReturn(PETSC_SUCCESS);
443: }
445: /*
446: Version for when blocks are 1 by 1.
447: */
448: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
449: {
450: Mat C = B;
451: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
452: IS isrow = b->row, isicol = b->icol;
453: const PetscInt *r, *ic, *ics;
454: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
455: PetscInt i, j, k, nz, nzL, row, *pj;
456: const PetscInt *ajtmp, *bjtmp;
457: MatScalar *rtmp, *pc, multiplier, *pv;
458: const MatScalar *aa = a->a, *v;
459: PetscBool row_identity, col_identity;
460: FactorShiftCtx sctx;
461: const PetscInt *ddiag;
462: PetscReal rs;
463: MatScalar d;
465: PetscFunctionBegin;
466: /* MatPivotSetUp(): initialize shift context sctx */
467: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
469: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
470: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL));
471: sctx.shift_top = info->zeropivot;
472: for (i = 0; i < n; i++) {
473: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
474: d = (aa)[ddiag[i]];
475: rs = -PetscAbsScalar(d) - PetscRealPart(d);
476: v = aa + ai[i];
477: nz = ai[i + 1] - ai[i];
478: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
479: if (rs > sctx.shift_top) sctx.shift_top = rs;
480: }
481: sctx.shift_top *= 1.1;
482: sctx.nshift_max = 5;
483: sctx.shift_lo = 0.;
484: sctx.shift_hi = 1.;
485: }
487: PetscCall(ISGetIndices(isrow, &r));
488: PetscCall(ISGetIndices(isicol, &ic));
489: PetscCall(PetscMalloc1(n + 1, &rtmp));
490: ics = ic;
492: do {
493: sctx.newshift = PETSC_FALSE;
494: for (i = 0; i < n; i++) {
495: /* zero rtmp */
496: /* L part */
497: nz = bi[i + 1] - bi[i];
498: bjtmp = bj + bi[i];
499: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
501: /* U part */
502: nz = bdiag[i] - bdiag[i + 1];
503: bjtmp = bj + bdiag[i + 1] + 1;
504: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
506: /* load in initial (unfactored row) */
507: nz = ai[r[i] + 1] - ai[r[i]];
508: ajtmp = aj + ai[r[i]];
509: v = aa + ai[r[i]];
510: for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
512: /* ZeropivotApply() */
513: rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
515: /* elimination */
516: bjtmp = bj + bi[i];
517: row = *bjtmp++;
518: nzL = bi[i + 1] - bi[i];
519: for (k = 0; k < nzL; k++) {
520: pc = rtmp + row;
521: if (*pc != (PetscScalar)0.0) {
522: pv = b->a + bdiag[row];
523: multiplier = *pc * (*pv);
524: *pc = multiplier;
526: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
527: pv = b->a + bdiag[row + 1] + 1;
528: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
529: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
530: PetscCall(PetscLogFlops(2.0 * nz));
531: }
532: row = *bjtmp++;
533: }
535: /* finished row so stick it into b->a */
536: rs = 0.0;
537: /* L part */
538: pv = b->a + bi[i];
539: pj = b->j + bi[i];
540: nz = bi[i + 1] - bi[i];
541: for (j = 0; j < nz; j++) {
542: pv[j] = rtmp[pj[j]];
543: rs += PetscAbsScalar(pv[j]);
544: }
546: /* U part */
547: pv = b->a + bdiag[i + 1] + 1;
548: pj = b->j + bdiag[i + 1] + 1;
549: nz = bdiag[i] - bdiag[i + 1] - 1;
550: for (j = 0; j < nz; j++) {
551: pv[j] = rtmp[pj[j]];
552: rs += PetscAbsScalar(pv[j]);
553: }
555: sctx.rs = rs;
556: sctx.pv = rtmp[i];
557: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
558: if (sctx.newshift) break; /* break for-loop */
559: rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
561: /* Mark diagonal and invert diagonal for simpler triangular solves */
562: pv = b->a + bdiag[i];
563: *pv = (PetscScalar)1.0 / rtmp[i];
565: } /* endof for (i=0; i<n; i++) { */
567: /* MatPivotRefine() */
568: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
569: /*
570: * if no shift in this attempt & shifting & started shifting & can refine,
571: * then try lower shift
572: */
573: sctx.shift_hi = sctx.shift_fraction;
574: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
575: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
576: sctx.newshift = PETSC_TRUE;
577: sctx.nshift++;
578: }
579: } while (sctx.newshift);
581: PetscCall(PetscFree(rtmp));
582: PetscCall(ISRestoreIndices(isicol, &ic));
583: PetscCall(ISRestoreIndices(isrow, &r));
585: PetscCall(ISIdentity(isrow, &row_identity));
586: PetscCall(ISIdentity(isicol, &col_identity));
587: if (row_identity && col_identity) {
588: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
589: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
590: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
591: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
592: } else {
593: C->ops->solve = MatSolve_SeqBAIJ_1;
594: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
595: }
596: C->assembled = PETSC_TRUE;
597: PetscCall(PetscLogFlops(C->cmap->n));
599: /* MatShiftView(A,info,&sctx) */
600: if (sctx.nshift) {
601: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
602: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
603: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
604: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
605: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
606: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
607: }
608: }
609: PetscFunctionReturn(PETSC_SUCCESS);
610: }
612: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
613: {
614: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
615: IS isrow = b->row, isicol = b->icol;
616: const PetscInt *r, *ic;
617: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
618: PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
619: PetscInt diag, *pj;
620: const PetscInt *diag_offset;
621: MatScalar *pv, *v, *rtmp, multiplier, *pc;
622: MatScalar *ba = b->a, *aa = a->a;
623: PetscBool row_identity, col_identity;
625: PetscFunctionBegin;
626: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
627: A->factortype = MAT_FACTOR_NONE;
628: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
629: A->factortype = MAT_FACTOR_ILU;
630: PetscCall(ISGetIndices(isrow, &r));
631: PetscCall(ISGetIndices(isicol, &ic));
632: PetscCall(PetscMalloc1(n + 1, &rtmp));
634: for (i = 0; i < n; i++) {
635: nz = bi[i + 1] - bi[i];
636: ajtmp = bj + bi[i];
637: for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
639: /* load in initial (unfactored row) */
640: nz = ai[r[i] + 1] - ai[r[i]];
641: ajtmpold = aj + ai[r[i]];
642: v = aa + ai[r[i]];
643: for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
645: row = *ajtmp++;
646: while (row < i) {
647: pc = rtmp + row;
648: if (*pc != 0.0) {
649: pv = ba + diag_offset[row];
650: pj = bj + diag_offset[row] + 1;
651: multiplier = *pc * *pv++;
652: *pc = multiplier;
653: nz = bi[row + 1] - diag_offset[row] - 1;
654: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
655: PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
656: }
657: row = *ajtmp++;
658: }
659: /* finished row so stick it into b->a */
660: pv = ba + bi[i];
661: pj = bj + bi[i];
662: nz = bi[i + 1] - bi[i];
663: for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
664: diag = diag_offset[i] - bi[i];
665: /* check pivot entry for current row */
666: PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
667: pv[diag] = 1.0 / pv[diag];
668: }
670: PetscCall(PetscFree(rtmp));
671: PetscCall(ISRestoreIndices(isicol, &ic));
672: PetscCall(ISRestoreIndices(isrow, &r));
673: PetscCall(ISIdentity(isrow, &row_identity));
674: PetscCall(ISIdentity(isicol, &col_identity));
675: if (row_identity && col_identity) {
676: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
677: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
678: } else {
679: C->ops->solve = MatSolve_SeqBAIJ_1_inplace;
680: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
681: }
682: C->assembled = PETSC_TRUE;
683: PetscCall(PetscLogFlops(C->cmap->n));
684: PetscFunctionReturn(PETSC_SUCCESS);
685: }
687: static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
688: {
689: PetscFunctionBegin;
690: *type = MATSOLVERPETSC;
691: PetscFunctionReturn(PETSC_SUCCESS);
692: }
694: PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
695: {
696: PetscInt n = A->rmap->n;
698: PetscFunctionBegin;
699: PetscCheck(!PetscDefined(USE_COMPLEX) || A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
700: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
701: PetscCall(MatSetSizes(*B, n, n, n, n));
702: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
703: PetscCall(MatSetType(*B, MATSEQBAIJ));
705: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ;
706: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
707: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
708: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
709: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
710: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
711: PetscCall(MatSetType(*B, MATSEQSBAIJ));
712: PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
714: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ;
715: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
716: /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
717: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
718: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
719: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
720: (*B)->factortype = ftype;
721: (*B)->canuseordering = PETSC_TRUE;
723: PetscCall(PetscFree((*B)->solvertype));
724: PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
725: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
726: PetscFunctionReturn(PETSC_SUCCESS);
727: }
729: PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
730: {
731: Mat C;
733: PetscFunctionBegin;
734: PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
735: PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
736: PetscCall(MatLUFactorNumeric(C, A, info));
738: A->ops->solve = C->ops->solve;
739: A->ops->solvetranspose = C->ops->solvetranspose;
741: PetscCall(MatHeaderMerge(A, &C));
742: PetscFunctionReturn(PETSC_SUCCESS);
743: }
745: #include <../src/mat/impls/sbaij/seq/sbaij.h>
746: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
747: {
748: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
749: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
750: IS ip = b->row;
751: const PetscInt *rip;
752: PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
753: PetscInt *ai = a->i, *aj = a->j;
754: PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
755: MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
756: PetscReal rs;
757: FactorShiftCtx sctx;
759: PetscFunctionBegin;
760: if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
761: if (!a->sbaijMat) {
762: A->symmetric = PETSC_BOOL3_TRUE;
763: if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
764: PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
765: }
766: PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
767: PetscCall(MatDestroy(&a->sbaijMat));
768: PetscFunctionReturn(PETSC_SUCCESS);
769: }
771: /* MatPivotSetUp(): initialize shift context sctx */
772: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
774: PetscCall(ISGetIndices(ip, &rip));
775: PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
777: sctx.shift_amount = 0.;
778: sctx.nshift = 0;
779: do {
780: sctx.newshift = PETSC_FALSE;
781: for (i = 0; i < mbs; i++) {
782: rtmp[i] = 0.0;
783: jl[i] = mbs;
784: il[0] = 0;
785: }
787: for (k = 0; k < mbs; k++) {
788: bval = ba + bi[k];
789: /* initialize k-th row by the perm[k]-th row of A */
790: jmin = ai[rip[k]];
791: jmax = ai[rip[k] + 1];
792: for (j = jmin; j < jmax; j++) {
793: col = rip[aj[j]];
794: if (col >= k) { /* only take upper triangular entry */
795: rtmp[col] = aa[j];
796: *bval++ = 0.0; /* for in-place factorization */
797: }
798: }
800: /* shift the diagonal of the matrix */
801: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
803: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
804: dk = rtmp[k];
805: i = jl[k]; /* first row to be added to k_th row */
807: while (i < k) {
808: nexti = jl[i]; /* next row to be added to k_th row */
810: /* compute multiplier, update diag(k) and U(i,k) */
811: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
812: uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
813: dk += uikdi * ba[ili];
814: ba[ili] = uikdi; /* -U(i,k) */
816: /* add multiple of row i to k-th row */
817: jmin = ili + 1;
818: jmax = bi[i + 1];
819: if (jmin < jmax) {
820: for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
821: /* update il and jl for row i */
822: il[i] = jmin;
823: j = bj[jmin];
824: jl[i] = jl[j];
825: jl[j] = i;
826: }
827: i = nexti;
828: }
830: /* shift the diagonals when zero pivot is detected */
831: /* compute rs=sum of abs(off-diagonal) */
832: rs = 0.0;
833: jmin = bi[k] + 1;
834: nz = bi[k + 1] - jmin;
835: if (nz) {
836: bcol = bj + jmin;
837: while (nz--) {
838: rs += PetscAbsScalar(rtmp[*bcol]);
839: bcol++;
840: }
841: }
843: sctx.rs = rs;
844: sctx.pv = dk;
845: PetscCall(MatPivotCheck(C, A, info, &sctx, k));
846: if (sctx.newshift) break;
847: dk = sctx.pv;
849: /* copy data into U(k,:) */
850: ba[bi[k]] = 1.0 / dk; /* U(k,k) */
851: jmin = bi[k] + 1;
852: jmax = bi[k + 1];
853: if (jmin < jmax) {
854: for (j = jmin; j < jmax; j++) {
855: col = bj[j];
856: ba[j] = rtmp[col];
857: rtmp[col] = 0.0;
858: }
859: /* add the k-th row into il and jl */
860: il[k] = jmin;
861: i = bj[jmin];
862: jl[k] = jl[i];
863: jl[i] = k;
864: }
865: }
866: } while (sctx.newshift);
867: PetscCall(PetscFree3(rtmp, il, jl));
869: PetscCall(ISRestoreIndices(ip, &rip));
871: C->assembled = PETSC_TRUE;
872: C->preallocated = PETSC_TRUE;
874: PetscCall(PetscLogFlops(C->rmap->N));
875: if (sctx.nshift) {
876: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
877: PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
878: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
879: PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
880: }
881: }
882: PetscFunctionReturn(PETSC_SUCCESS);
883: }
885: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
886: {
887: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
888: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
889: PetscInt i, j, am = a->mbs;
890: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
891: PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
892: MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
893: PetscReal rs;
894: FactorShiftCtx sctx;
896: PetscFunctionBegin;
897: /* MatPivotSetUp(): initialize shift context sctx */
898: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
900: PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
902: do {
903: sctx.newshift = PETSC_FALSE;
904: for (i = 0; i < am; i++) {
905: rtmp[i] = 0.0;
906: jl[i] = am;
907: il[0] = 0;
908: }
910: for (k = 0; k < am; k++) {
911: /* initialize k-th row with elements nonzero in row perm(k) of A */
912: nz = ai[k + 1] - ai[k];
913: acol = aj + ai[k];
914: aval = aa + ai[k];
915: bval = ba + bi[k];
916: while (nz--) {
917: if (*acol < k) { /* skip lower triangular entries */
918: acol++;
919: aval++;
920: } else {
921: rtmp[*acol++] = *aval++;
922: *bval++ = 0.0; /* for in-place factorization */
923: }
924: }
926: /* shift the diagonal of the matrix */
927: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
929: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
930: dk = rtmp[k];
931: i = jl[k]; /* first row to be added to k_th row */
933: while (i < k) {
934: nexti = jl[i]; /* next row to be added to k_th row */
935: /* compute multiplier, update D(k) and U(i,k) */
936: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
937: uikdi = -ba[ili] * ba[bi[i]];
938: dk += uikdi * ba[ili];
939: ba[ili] = uikdi; /* -U(i,k) */
941: /* add multiple of row i to k-th row ... */
942: jmin = ili + 1;
943: nz = bi[i + 1] - jmin;
944: if (nz > 0) {
945: bcol = bj + jmin;
946: bval = ba + jmin;
947: while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
948: /* update il and jl for i-th row */
949: il[i] = jmin;
950: j = bj[jmin];
951: jl[i] = jl[j];
952: jl[j] = i;
953: }
954: i = nexti;
955: }
957: /* shift the diagonals when zero pivot is detected */
958: /* compute rs=sum of abs(off-diagonal) */
959: rs = 0.0;
960: jmin = bi[k] + 1;
961: nz = bi[k + 1] - jmin;
962: if (nz) {
963: bcol = bj + jmin;
964: while (nz--) {
965: rs += PetscAbsScalar(rtmp[*bcol]);
966: bcol++;
967: }
968: }
970: sctx.rs = rs;
971: sctx.pv = dk;
972: PetscCall(MatPivotCheck(C, A, info, &sctx, k));
973: if (sctx.newshift) break; /* sctx.shift_amount is updated */
974: dk = sctx.pv;
976: /* copy data into U(k,:) */
977: ba[bi[k]] = 1.0 / dk;
978: jmin = bi[k] + 1;
979: nz = bi[k + 1] - jmin;
980: if (nz) {
981: bcol = bj + jmin;
982: bval = ba + jmin;
983: while (nz--) {
984: *bval++ = rtmp[*bcol];
985: rtmp[*bcol++] = 0.0;
986: }
987: /* add k-th row into il and jl */
988: il[k] = jmin;
989: i = bj[jmin];
990: jl[k] = jl[i];
991: jl[i] = k;
992: }
993: }
994: } while (sctx.newshift);
995: PetscCall(PetscFree3(rtmp, il, jl));
997: C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
998: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
999: C->assembled = PETSC_TRUE;
1000: C->preallocated = PETSC_TRUE;
1002: PetscCall(PetscLogFlops(C->rmap->N));
1003: if (sctx.nshift) {
1004: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1005: PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1006: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1007: PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1008: }
1009: }
1010: PetscFunctionReturn(PETSC_SUCCESS);
1011: }
1013: #include <petscbt.h>
1014: #include <../src/mat/utils/freespace.h>
1015: PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1016: {
1017: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1018: Mat_SeqSBAIJ *b;
1019: Mat B;
1020: PetscBool perm_identity;
1021: PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1022: const PetscInt *rip;
1023: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1024: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1025: PetscReal fill = info->fill, levels = info->levels;
1026: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1027: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1028: PetscBT lnkbt;
1029: PetscBool diagDense;
1030: const PetscInt *adiag;
1032: PetscFunctionBegin;
1033: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
1034: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1036: if (bs > 1) {
1037: if (!a->sbaijMat) {
1038: A->symmetric = PETSC_BOOL3_TRUE;
1039: if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
1040: PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1041: }
1042: fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1044: PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1045: PetscFunctionReturn(PETSC_SUCCESS);
1046: }
1048: PetscCall(ISIdentity(perm, &perm_identity));
1049: PetscCall(ISGetIndices(perm, &rip));
1051: /* special case that simply copies fill pattern */
1052: if (!levels && perm_identity) {
1053: PetscCall(PetscMalloc1(am + 1, &ui));
1054: for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */
1055: B = fact;
1056: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1058: b = (Mat_SeqSBAIJ *)B->data;
1059: uj = b->j;
1060: for (i = 0; i < am; i++) {
1061: aj = a->j + adiag[i];
1062: for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1063: b->ilen[i] = ui[i];
1064: }
1065: PetscCall(PetscFree(ui));
1067: B->factortype = MAT_FACTOR_NONE;
1069: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1070: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1071: B->factortype = MAT_FACTOR_ICC;
1073: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1074: PetscFunctionReturn(PETSC_SUCCESS);
1075: }
1077: /* initialization */
1078: PetscCall(PetscMalloc1(am + 1, &ui));
1079: ui[0] = 0;
1080: PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1082: /* jl: linked list for storing indices of the pivot rows
1083: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1084: PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1085: for (i = 0; i < am; i++) {
1086: jl[i] = am;
1087: il[i] = 0;
1088: }
1090: /* create and initialize a linked list for storing column indices of the active row k */
1091: nlnk = am + 1;
1092: PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1094: /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1095: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1097: current_space = free_space;
1099: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1100: current_space_lvl = free_space_lvl;
1102: for (k = 0; k < am; k++) { /* for each active row k */
1103: /* initialize lnk by the column indices of row rip[k] of A */
1104: nzk = 0;
1105: ncols = ai[rip[k] + 1] - ai[rip[k]];
1106: ncols_upper = 0;
1107: cols = cols_lvl + am;
1108: for (j = 0; j < ncols; j++) {
1109: i = rip[*(aj + ai[rip[k]] + j)];
1110: if (i >= k) { /* only take upper triangular entry */
1111: cols[ncols_upper] = i;
1112: cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1113: ncols_upper++;
1114: }
1115: }
1116: PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1117: nzk += nlnk;
1119: /* update lnk by computing fill-in for each pivot row to be merged in */
1120: prow = jl[k]; /* 1st pivot row */
1122: while (prow < k) {
1123: nextprow = jl[prow];
1125: /* merge prow into k-th row */
1126: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1127: jmax = ui[prow + 1];
1128: ncols = jmax - jmin;
1129: i = jmin - ui[prow];
1130: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1131: for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1132: PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1133: nzk += nlnk;
1135: /* update il and jl for prow */
1136: if (jmin < jmax) {
1137: il[prow] = jmin;
1139: j = *cols;
1140: jl[prow] = jl[j];
1141: jl[j] = prow;
1142: }
1143: prow = nextprow;
1144: }
1146: /* if free space is not available, make more free space */
1147: if (current_space->local_remaining < nzk) {
1148: i = am - k + 1; /* num of unfactored rows */
1149: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1150: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1151: PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
1152: reallocs++;
1153: }
1155: /* copy data into free_space and free_space_lvl, then initialize lnk */
1156: PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1158: /* add the k-th row into il and jl */
1159: if (nzk - 1 > 0) {
1160: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1161: jl[k] = jl[i];
1162: jl[i] = k;
1163: il[k] = ui[k] + 1;
1164: }
1165: uj_ptr[k] = current_space->array;
1166: uj_lvl_ptr[k] = current_space_lvl->array;
1168: current_space->array += nzk;
1169: current_space->local_used += nzk;
1170: current_space->local_remaining -= nzk;
1172: current_space_lvl->array += nzk;
1173: current_space_lvl->local_used += nzk;
1174: current_space_lvl->local_remaining -= nzk;
1176: ui[k + 1] = ui[k] + nzk;
1177: }
1179: PetscCall(ISRestoreIndices(perm, &rip));
1180: PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1181: PetscCall(PetscFree(cols_lvl));
1183: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1184: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1185: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1186: PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1187: PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1189: /* put together the new matrix in MATSEQSBAIJ format */
1190: B = fact;
1191: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1193: b = (Mat_SeqSBAIJ *)B->data;
1194: b->free_a = PETSC_TRUE;
1195: b->free_ij = PETSC_TRUE;
1197: PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1199: b->j = uj;
1200: b->i = ui;
1201: b->diag = NULL;
1202: b->ilen = NULL;
1203: b->imax = NULL;
1204: b->row = perm;
1205: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1207: PetscCall(PetscObjectReference((PetscObject)perm));
1209: b->icol = perm;
1211: PetscCall(PetscObjectReference((PetscObject)perm));
1212: PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1214: b->maxnz = b->nz = ui[am];
1216: B->info.factor_mallocs = reallocs;
1217: B->info.fill_ratio_given = fill;
1218: if (ai[am] != 0.) {
1219: /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1220: B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1221: } else {
1222: B->info.fill_ratio_needed = 0.0;
1223: }
1224: #if defined(PETSC_USE_INFO)
1225: if (ai[am] != 0) {
1226: PetscReal af = B->info.fill_ratio_needed;
1227: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1228: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1229: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1230: } else {
1231: PetscCall(PetscInfo(A, "Empty matrix\n"));
1232: }
1233: #endif
1234: if (perm_identity) {
1235: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1236: B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1237: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1238: } else {
1239: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1240: }
1241: PetscFunctionReturn(PETSC_SUCCESS);
1242: }
1244: PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1245: {
1246: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1247: Mat_SeqSBAIJ *b;
1248: Mat B;
1249: PetscBool perm_identity;
1250: PetscReal fill = info->fill;
1251: const PetscInt *rip;
1252: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1253: PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1254: PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1255: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1256: PetscBT lnkbt;
1257: PetscBool diagDense;
1259: PetscFunctionBegin;
1260: if (bs > 1) { /* convert to seqsbaij */
1261: if (!a->sbaijMat) {
1262: A->symmetric = PETSC_BOOL3_TRUE;
1263: if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
1264: PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1265: }
1266: fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1268: PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1269: PetscFunctionReturn(PETSC_SUCCESS);
1270: }
1271: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense));
1272: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1274: /* check whether perm is the identity mapping */
1275: PetscCall(ISIdentity(perm, &perm_identity));
1276: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1277: PetscCall(ISGetIndices(perm, &rip));
1279: /* initialization */
1280: PetscCall(PetscMalloc1(mbs + 1, &ui));
1281: ui[0] = 0;
1283: /* jl: linked list for storing indices of the pivot rows
1284: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1285: PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1286: for (i = 0; i < mbs; i++) {
1287: jl[i] = mbs;
1288: il[i] = 0;
1289: }
1291: /* create and initialize a linked list for storing column indices of the active row k */
1292: nlnk = mbs + 1;
1293: PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1295: /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1296: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1298: current_space = free_space;
1300: for (k = 0; k < mbs; k++) { /* for each active row k */
1301: /* initialize lnk by the column indices of row rip[k] of A */
1302: nzk = 0;
1303: ncols = ai[rip[k] + 1] - ai[rip[k]];
1304: ncols_upper = 0;
1305: for (j = 0; j < ncols; j++) {
1306: i = rip[*(aj + ai[rip[k]] + j)];
1307: if (i >= k) { /* only take upper triangular entry */
1308: cols[ncols_upper] = i;
1309: ncols_upper++;
1310: }
1311: }
1312: PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1313: nzk += nlnk;
1315: /* update lnk by computing fill-in for each pivot row to be merged in */
1316: prow = jl[k]; /* 1st pivot row */
1318: while (prow < k) {
1319: nextprow = jl[prow];
1320: /* merge prow into k-th row */
1321: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1322: jmax = ui[prow + 1];
1323: ncols = jmax - jmin;
1324: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1325: PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1326: nzk += nlnk;
1328: /* update il and jl for prow */
1329: if (jmin < jmax) {
1330: il[prow] = jmin;
1331: j = *uj_ptr;
1332: jl[prow] = jl[j];
1333: jl[j] = prow;
1334: }
1335: prow = nextprow;
1336: }
1338: /* if free space is not available, make more free space */
1339: if (current_space->local_remaining < nzk) {
1340: i = mbs - k + 1; /* num of unfactored rows */
1341: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1342: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1343: reallocs++;
1344: }
1346: /* copy data into free space, then initialize lnk */
1347: PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1349: /* add the k-th row into il and jl */
1350: if (nzk - 1 > 0) {
1351: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1352: jl[k] = jl[i];
1353: jl[i] = k;
1354: il[k] = ui[k] + 1;
1355: }
1356: ui_ptr[k] = current_space->array;
1357: current_space->array += nzk;
1358: current_space->local_used += nzk;
1359: current_space->local_remaining -= nzk;
1361: ui[k + 1] = ui[k] + nzk;
1362: }
1364: PetscCall(ISRestoreIndices(perm, &rip));
1365: PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1367: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1368: PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1369: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1370: PetscCall(PetscLLDestroy(lnk, lnkbt));
1372: /* put together the new matrix in MATSEQSBAIJ format */
1373: B = fact;
1374: PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1376: b = (Mat_SeqSBAIJ *)B->data;
1377: b->free_a = PETSC_TRUE;
1378: b->free_ij = PETSC_TRUE;
1380: PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1382: b->j = uj;
1383: b->i = ui;
1384: b->diag = NULL;
1385: b->ilen = NULL;
1386: b->imax = NULL;
1387: b->row = perm;
1388: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1390: PetscCall(PetscObjectReference((PetscObject)perm));
1391: b->icol = perm;
1392: PetscCall(PetscObjectReference((PetscObject)perm));
1393: PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1394: b->maxnz = b->nz = ui[mbs];
1396: B->info.factor_mallocs = reallocs;
1397: B->info.fill_ratio_given = fill;
1398: if (ai[mbs] != 0.) {
1399: /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1400: B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1401: } else {
1402: B->info.fill_ratio_needed = 0.0;
1403: }
1404: #if defined(PETSC_USE_INFO)
1405: if (ai[mbs] != 0.) {
1406: PetscReal af = B->info.fill_ratio_needed;
1407: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1408: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1409: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1410: } else {
1411: PetscCall(PetscInfo(A, "Empty matrix\n"));
1412: }
1413: #endif
1414: if (perm_identity) {
1415: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1416: } else {
1417: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1418: }
1419: PetscFunctionReturn(PETSC_SUCCESS);
1420: }
1422: PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1423: {
1424: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1425: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1426: PetscInt i, k, n = a->mbs;
1427: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1428: const MatScalar *aa = a->a, *v;
1429: PetscScalar *x, *s, *t, *ls;
1430: const PetscScalar *b;
1432: PetscFunctionBegin;
1433: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1434: PetscCall(VecGetArrayRead(bb, &b));
1435: PetscCall(VecGetArray(xx, &x));
1436: t = a->solve_work;
1438: /* forward solve the lower triangular */
1439: PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1441: for (i = 1; i < n; i++) {
1442: v = aa + bs2 * ai[i];
1443: vi = aj + ai[i];
1444: nz = ai[i + 1] - ai[i];
1445: s = t + bs * i;
1446: PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1447: for (k = 0; k < nz; k++) {
1448: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1449: v += bs2;
1450: }
1451: }
1453: /* backward solve the upper triangular */
1454: ls = a->solve_work + A->cmap->n;
1455: for (i = n - 1; i >= 0; i--) {
1456: v = aa + bs2 * (adiag[i + 1] + 1);
1457: vi = aj + adiag[i + 1] + 1;
1458: nz = adiag[i] - adiag[i + 1] - 1;
1459: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1460: for (k = 0; k < nz; k++) {
1461: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1462: v += bs2;
1463: }
1464: PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1465: PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1466: }
1468: PetscCall(VecRestoreArrayRead(bb, &b));
1469: PetscCall(VecRestoreArray(xx, &x));
1470: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1471: PetscFunctionReturn(PETSC_SUCCESS);
1472: }
1474: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1475: {
1476: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1477: IS iscol = a->col, isrow = a->row;
1478: const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1479: PetscInt i, m, n = a->mbs;
1480: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1481: const MatScalar *aa = a->a, *v;
1482: PetscScalar *x, *s, *t, *ls;
1483: const PetscScalar *b;
1485: PetscFunctionBegin;
1486: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1487: PetscCall(VecGetArrayRead(bb, &b));
1488: PetscCall(VecGetArray(xx, &x));
1489: t = a->solve_work;
1491: PetscCall(ISGetIndices(isrow, &rout));
1492: r = rout;
1493: PetscCall(ISGetIndices(iscol, &cout));
1494: c = cout;
1496: /* forward solve the lower triangular */
1497: PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1498: for (i = 1; i < n; i++) {
1499: v = aa + bs2 * ai[i];
1500: vi = aj + ai[i];
1501: nz = ai[i + 1] - ai[i];
1502: s = t + bs * i;
1503: PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1504: for (m = 0; m < nz; m++) {
1505: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1506: v += bs2;
1507: }
1508: }
1510: /* backward solve the upper triangular */
1511: ls = a->solve_work + A->cmap->n;
1512: for (i = n - 1; i >= 0; i--) {
1513: v = aa + bs2 * (adiag[i + 1] + 1);
1514: vi = aj + adiag[i + 1] + 1;
1515: nz = adiag[i] - adiag[i + 1] - 1;
1516: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1517: for (m = 0; m < nz; m++) {
1518: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1519: v += bs2;
1520: }
1521: PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1522: PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1523: }
1524: PetscCall(ISRestoreIndices(isrow, &rout));
1525: PetscCall(ISRestoreIndices(iscol, &cout));
1526: PetscCall(VecRestoreArrayRead(bb, &b));
1527: PetscCall(VecRestoreArray(xx, &x));
1528: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1529: PetscFunctionReturn(PETSC_SUCCESS);
1530: }