Actual source code: sbaijfact2.c
1: /*
2: Factorization code for SBAIJ format.
3: */
5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
6: #include <../src/mat/impls/baij/seq/baij.h>
7: #include <petsc/private/kernels/blockinvert.h>
9: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
10: {
11: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
12: IS isrow = a->row;
13: PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
14: const PetscInt *r;
15: PetscInt nz, *vj, k, idx, k1;
16: PetscInt bs = A->rmap->bs, bs2 = a->bs2;
17: const MatScalar *aa = a->a, *v, *diag;
18: PetscScalar *x, *xk, *xj, *xk_tmp, *t;
19: const PetscScalar *b;
21: PetscFunctionBegin;
22: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
23: PetscCall(VecGetArrayRead(bb, &b));
24: PetscCall(VecGetArray(xx, &x));
25: t = a->solve_work;
26: PetscCall(ISGetIndices(isrow, &r));
27: PetscCall(PetscMalloc1(bs, &xk_tmp));
29: /* solve U^T * D * y = b by forward substitution */
30: xk = t;
31: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
32: idx = bs * r[k];
33: for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
34: }
35: for (k = 0; k < mbs; k++) {
36: v = aa + bs2 * ai[k];
37: xk = t + k * bs; /* Dk*xk = k-th block of x */
38: PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
39: nz = ai[k + 1] - ai[k];
40: vj = aj + ai[k];
41: xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
42: while (nz--) {
43: /* x(:) += U(k,:)^T*(Dk*xk) */
44: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
45: vj++;
46: xj = t + (*vj) * bs;
47: v += bs2;
48: }
49: /* xk = inv(Dk)*(Dk*xk) */
50: diag = aa + k * bs2; /* ptr to inv(Dk) */
51: PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
52: }
54: /* solve U*x = y by back substitution */
55: for (k = mbs - 1; k >= 0; k--) {
56: v = aa + bs2 * ai[k];
57: xk = t + k * bs; /* xk */
58: nz = ai[k + 1] - ai[k];
59: vj = aj + ai[k];
60: xj = t + (*vj) * bs;
61: while (nz--) {
62: /* xk += U(k,:)*x(:) */
63: PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
64: vj++;
65: v += bs2;
66: xj = t + (*vj) * bs;
67: }
68: idx = bs * r[k];
69: for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
70: }
72: PetscCall(PetscFree(xk_tmp));
73: PetscCall(ISRestoreIndices(isrow, &r));
74: PetscCall(VecRestoreArrayRead(bb, &b));
75: PetscCall(VecRestoreArray(xx, &x));
76: PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs));
77: PetscFunctionReturn(PETSC_SUCCESS);
78: }
80: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
81: {
82: PetscFunctionBegin;
83: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
84: }
86: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
87: {
88: PetscFunctionBegin;
89: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
90: }
92: static PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
93: {
94: PetscInt nz, k;
95: const PetscInt *vj, bs2 = bs * bs;
96: const MatScalar *v, *diag;
97: PetscScalar *xk, *xj, *xk_tmp;
99: PetscFunctionBegin;
100: PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
101: PetscCall(PetscMalloc1(bs, &xk_tmp));
102: for (k = 0; k < mbs; k++) {
103: v = aa + bs2 * ai[k];
104: xk = x + k * bs; /* Dk*xk = k-th block of x */
105: PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
106: nz = ai[k + 1] - ai[k];
107: vj = aj + ai[k];
108: xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */
109: while (nz--) {
110: /* x(:) += U(k,:)^T*(Dk*xk) */
111: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
112: vj++;
113: xj = x + (size_t)(*vj) * bs;
114: v += bs2;
115: }
116: /* xk = inv(Dk)*(Dk*xk) */
117: diag = aa + k * bs2; /* ptr to inv(Dk) */
118: PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
119: }
120: PetscCall(PetscFree(xk_tmp));
121: PetscFunctionReturn(PETSC_SUCCESS);
122: }
124: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
125: {
126: PetscInt nz, k;
127: const PetscInt *vj, bs2 = bs * bs;
128: const MatScalar *v;
129: PetscScalar *xk, *xj;
131: PetscFunctionBegin;
132: for (k = mbs - 1; k >= 0; k--) {
133: v = aa + bs2 * ai[k];
134: xk = x + k * bs; /* xk */
135: nz = ai[k + 1] - ai[k];
136: vj = aj + ai[k];
137: xj = x + (size_t)(*vj) * bs;
138: while (nz--) {
139: /* xk += U(k,:)*x(:) */
140: PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
141: vj++;
142: v += bs2;
143: xj = x + (size_t)(*vj) * bs;
144: }
145: }
146: PetscFunctionReturn(PETSC_SUCCESS);
147: }
149: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
150: {
151: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
152: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
153: PetscInt bs = A->rmap->bs;
154: const MatScalar *aa = a->a;
155: PetscScalar *x;
156: const PetscScalar *b;
158: PetscFunctionBegin;
159: PetscCall(VecGetArrayRead(bb, &b));
160: PetscCall(VecGetArray(xx, &x));
162: /* solve U^T * D * y = b by forward substitution */
163: PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
164: PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
166: /* solve U*x = y by back substitution */
167: PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
169: PetscCall(VecRestoreArrayRead(bb, &b));
170: PetscCall(VecRestoreArray(xx, &x));
171: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs));
172: PetscFunctionReturn(PETSC_SUCCESS);
173: }
175: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
176: {
177: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
178: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
179: PetscInt bs = A->rmap->bs;
180: const MatScalar *aa = a->a;
181: const PetscScalar *b;
182: PetscScalar *x;
184: PetscFunctionBegin;
185: PetscCall(VecGetArrayRead(bb, &b));
186: PetscCall(VecGetArray(xx, &x));
187: PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
188: PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
189: PetscCall(VecRestoreArrayRead(bb, &b));
190: PetscCall(VecRestoreArray(xx, &x));
191: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs));
192: PetscFunctionReturn(PETSC_SUCCESS);
193: }
195: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
196: {
197: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
198: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
199: PetscInt bs = A->rmap->bs;
200: const MatScalar *aa = a->a;
201: const PetscScalar *b;
202: PetscScalar *x;
204: PetscFunctionBegin;
205: PetscCall(VecGetArrayRead(bb, &b));
206: PetscCall(VecGetArray(xx, &x));
207: PetscCall(PetscArraycpy(x, b, bs * mbs));
208: PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
209: PetscCall(VecRestoreArrayRead(bb, &b));
210: PetscCall(VecRestoreArray(xx, &x));
211: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
212: PetscFunctionReturn(PETSC_SUCCESS);
213: }
215: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
216: {
217: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
218: IS isrow = a->row;
219: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
220: PetscInt nz, k, idx;
221: const MatScalar *aa = a->a, *v, *d;
222: PetscScalar *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
223: const PetscScalar *b;
225: PetscFunctionBegin;
226: PetscCall(VecGetArrayRead(bb, &b));
227: PetscCall(VecGetArray(xx, &x));
228: t = a->solve_work;
229: PetscCall(ISGetIndices(isrow, &r));
231: /* solve U^T * D * y = b by forward substitution */
232: tp = t;
233: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
234: idx = 7 * r[k];
235: tp[0] = b[idx];
236: tp[1] = b[idx + 1];
237: tp[2] = b[idx + 2];
238: tp[3] = b[idx + 3];
239: tp[4] = b[idx + 4];
240: tp[5] = b[idx + 5];
241: tp[6] = b[idx + 6];
242: tp += 7;
243: }
245: for (k = 0; k < mbs; k++) {
246: v = aa + 49 * ai[k];
247: vj = aj + ai[k];
248: tp = t + k * 7;
249: x0 = tp[0];
250: x1 = tp[1];
251: x2 = tp[2];
252: x3 = tp[3];
253: x4 = tp[4];
254: x5 = tp[5];
255: x6 = tp[6];
256: nz = ai[k + 1] - ai[k];
257: tp = t + (*vj) * 7;
258: while (nz--) {
259: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
260: tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
261: tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
262: tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
263: tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
264: tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
265: tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
266: vj++;
267: tp = t + (*vj) * 7;
268: v += 49;
269: }
271: /* xk = inv(Dk)*(Dk*xk) */
272: d = aa + k * 49; /* ptr to inv(Dk) */
273: tp = t + k * 7;
274: tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
275: tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
276: tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
277: tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
278: tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
279: tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
280: tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
281: }
283: /* solve U*x = y by back substitution */
284: for (k = mbs - 1; k >= 0; k--) {
285: v = aa + 49 * ai[k];
286: vj = aj + ai[k];
287: tp = t + k * 7;
288: x0 = tp[0];
289: x1 = tp[1];
290: x2 = tp[2];
291: x3 = tp[3];
292: x4 = tp[4];
293: x5 = tp[5];
294: x6 = tp[6]; /* xk */
295: nz = ai[k + 1] - ai[k];
297: tp = t + (*vj) * 7;
298: while (nz--) {
299: /* xk += U(k,:)*x(:) */
300: x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
301: x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
302: x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
303: x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
304: x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
305: x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
306: x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
307: vj++;
308: tp = t + (*vj) * 7;
309: v += 49;
310: }
311: tp = t + k * 7;
312: tp[0] = x0;
313: tp[1] = x1;
314: tp[2] = x2;
315: tp[3] = x3;
316: tp[4] = x4;
317: tp[5] = x5;
318: tp[6] = x6;
319: idx = 7 * r[k];
320: x[idx] = x0;
321: x[idx + 1] = x1;
322: x[idx + 2] = x2;
323: x[idx + 3] = x3;
324: x[idx + 4] = x4;
325: x[idx + 5] = x5;
326: x[idx + 6] = x6;
327: }
329: PetscCall(ISRestoreIndices(isrow, &r));
330: PetscCall(VecRestoreArrayRead(bb, &b));
331: PetscCall(VecRestoreArray(xx, &x));
332: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
333: PetscFunctionReturn(PETSC_SUCCESS);
334: }
336: static PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
337: {
338: const MatScalar *v, *d;
339: PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6;
340: PetscInt nz, k;
341: const PetscInt *vj;
343: PetscFunctionBegin;
344: for (k = 0; k < mbs; k++) {
345: v = aa + 49 * ai[k];
346: xp = x + k * 7;
347: x0 = xp[0];
348: x1 = xp[1];
349: x2 = xp[2];
350: x3 = xp[3];
351: x4 = xp[4];
352: x5 = xp[5];
353: x6 = xp[6]; /* Dk*xk = k-th block of x */
354: nz = ai[k + 1] - ai[k];
355: vj = aj + ai[k];
356: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
357: PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358: while (nz--) {
359: xp = x + (*vj) * 7;
360: /* x(:) += U(k,:)^T*(Dk*xk) */
361: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
362: xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
363: xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
364: xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
365: xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
366: xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
367: xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
368: vj++;
369: v += 49;
370: }
371: /* xk = inv(Dk)*(Dk*xk) */
372: d = aa + k * 49; /* ptr to inv(Dk) */
373: xp = x + k * 7;
374: xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
375: xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
376: xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
377: xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
378: xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
379: xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
380: xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
381: }
382: PetscFunctionReturn(PETSC_SUCCESS);
383: }
385: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
386: {
387: const MatScalar *v;
388: PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6;
389: PetscInt nz, k;
390: const PetscInt *vj;
392: PetscFunctionBegin;
393: for (k = mbs - 1; k >= 0; k--) {
394: v = aa + 49 * ai[k];
395: xp = x + k * 7;
396: x0 = xp[0];
397: x1 = xp[1];
398: x2 = xp[2];
399: x3 = xp[3];
400: x4 = xp[4];
401: x5 = xp[5];
402: x6 = xp[6]; /* xk */
403: nz = ai[k + 1] - ai[k];
404: vj = aj + ai[k];
405: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
406: PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
407: while (nz--) {
408: xp = x + (*vj) * 7;
409: /* xk += U(k,:)*x(:) */
410: x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
411: x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
412: x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
413: x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
414: x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
415: x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
416: x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
417: vj++;
418: v += 49;
419: }
420: xp = x + k * 7;
421: xp[0] = x0;
422: xp[1] = x1;
423: xp[2] = x2;
424: xp[3] = x3;
425: xp[4] = x4;
426: xp[5] = x5;
427: xp[6] = x6;
428: }
429: PetscFunctionReturn(PETSC_SUCCESS);
430: }
432: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
433: {
434: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
435: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
436: const MatScalar *aa = a->a;
437: PetscScalar *x;
438: const PetscScalar *b;
440: PetscFunctionBegin;
441: PetscCall(VecGetArrayRead(bb, &b));
442: PetscCall(VecGetArray(xx, &x));
444: /* solve U^T * D * y = b by forward substitution */
445: PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */
446: PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
448: /* solve U*x = y by back substitution */
449: PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
451: PetscCall(VecRestoreArrayRead(bb, &b));
452: PetscCall(VecRestoreArray(xx, &x));
453: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
454: PetscFunctionReturn(PETSC_SUCCESS);
455: }
457: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
458: {
459: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
460: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
461: const MatScalar *aa = a->a;
462: PetscScalar *x;
463: const PetscScalar *b;
465: PetscFunctionBegin;
466: PetscCall(VecGetArrayRead(bb, &b));
467: PetscCall(VecGetArray(xx, &x));
468: PetscCall(PetscArraycpy(x, b, 7 * mbs));
469: PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
470: PetscCall(VecRestoreArrayRead(bb, &b));
471: PetscCall(VecRestoreArray(xx, &x));
472: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
473: PetscFunctionReturn(PETSC_SUCCESS);
474: }
476: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
477: {
478: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
479: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
480: const MatScalar *aa = a->a;
481: PetscScalar *x;
482: const PetscScalar *b;
484: PetscFunctionBegin;
485: PetscCall(VecGetArrayRead(bb, &b));
486: PetscCall(VecGetArray(xx, &x));
487: PetscCall(PetscArraycpy(x, b, 7 * mbs));
488: PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
489: PetscCall(VecRestoreArrayRead(bb, &b));
490: PetscCall(VecRestoreArray(xx, &x));
491: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
492: PetscFunctionReturn(PETSC_SUCCESS);
493: }
495: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
496: {
497: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
498: IS isrow = a->row;
499: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
500: PetscInt nz, k, idx;
501: const MatScalar *aa = a->a, *v, *d;
502: PetscScalar *x, x0, x1, x2, x3, x4, x5, *t, *tp;
503: const PetscScalar *b;
505: PetscFunctionBegin;
506: PetscCall(VecGetArrayRead(bb, &b));
507: PetscCall(VecGetArray(xx, &x));
508: t = a->solve_work;
509: PetscCall(ISGetIndices(isrow, &r));
511: /* solve U^T * D * y = b by forward substitution */
512: tp = t;
513: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
514: idx = 6 * r[k];
515: tp[0] = b[idx];
516: tp[1] = b[idx + 1];
517: tp[2] = b[idx + 2];
518: tp[3] = b[idx + 3];
519: tp[4] = b[idx + 4];
520: tp[5] = b[idx + 5];
521: tp += 6;
522: }
524: for (k = 0; k < mbs; k++) {
525: v = aa + 36 * ai[k];
526: vj = aj + ai[k];
527: tp = t + k * 6;
528: x0 = tp[0];
529: x1 = tp[1];
530: x2 = tp[2];
531: x3 = tp[3];
532: x4 = tp[4];
533: x5 = tp[5];
534: nz = ai[k + 1] - ai[k];
535: tp = t + (*vj) * 6;
536: while (nz--) {
537: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
538: tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
539: tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
540: tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
541: tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
542: tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
543: vj++;
544: tp = t + (*vj) * 6;
545: v += 36;
546: }
548: /* xk = inv(Dk)*(Dk*xk) */
549: d = aa + k * 36; /* ptr to inv(Dk) */
550: tp = t + k * 6;
551: tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
552: tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
553: tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
554: tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
555: tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
556: tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
557: }
559: /* solve U*x = y by back substitution */
560: for (k = mbs - 1; k >= 0; k--) {
561: v = aa + 36 * ai[k];
562: vj = aj + ai[k];
563: tp = t + k * 6;
564: x0 = tp[0];
565: x1 = tp[1];
566: x2 = tp[2];
567: x3 = tp[3];
568: x4 = tp[4];
569: x5 = tp[5]; /* xk */
570: nz = ai[k + 1] - ai[k];
572: tp = t + (*vj) * 6;
573: while (nz--) {
574: /* xk += U(k,:)*x(:) */
575: x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
576: x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
577: x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
578: x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
579: x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
580: x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
581: vj++;
582: tp = t + (*vj) * 6;
583: v += 36;
584: }
585: tp = t + k * 6;
586: tp[0] = x0;
587: tp[1] = x1;
588: tp[2] = x2;
589: tp[3] = x3;
590: tp[4] = x4;
591: tp[5] = x5;
592: idx = 6 * r[k];
593: x[idx] = x0;
594: x[idx + 1] = x1;
595: x[idx + 2] = x2;
596: x[idx + 3] = x3;
597: x[idx + 4] = x4;
598: x[idx + 5] = x5;
599: }
601: PetscCall(ISRestoreIndices(isrow, &r));
602: PetscCall(VecRestoreArrayRead(bb, &b));
603: PetscCall(VecRestoreArray(xx, &x));
604: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
605: PetscFunctionReturn(PETSC_SUCCESS);
606: }
608: static PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
609: {
610: const MatScalar *v, *d;
611: PetscScalar *xp, x0, x1, x2, x3, x4, x5;
612: PetscInt nz, k;
613: const PetscInt *vj;
615: PetscFunctionBegin;
616: for (k = 0; k < mbs; k++) {
617: v = aa + 36 * ai[k];
618: xp = x + k * 6;
619: x0 = xp[0];
620: x1 = xp[1];
621: x2 = xp[2];
622: x3 = xp[3];
623: x4 = xp[4];
624: x5 = xp[5]; /* Dk*xk = k-th block of x */
625: nz = ai[k + 1] - ai[k];
626: vj = aj + ai[k];
627: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
628: PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
629: while (nz--) {
630: xp = x + (*vj) * 6;
631: /* x(:) += U(k,:)^T*(Dk*xk) */
632: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
633: xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
634: xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
635: xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
636: xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
637: xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
638: vj++;
639: v += 36;
640: }
641: /* xk = inv(Dk)*(Dk*xk) */
642: d = aa + k * 36; /* ptr to inv(Dk) */
643: xp = x + k * 6;
644: xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
645: xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
646: xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
647: xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
648: xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
649: xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
650: }
651: PetscFunctionReturn(PETSC_SUCCESS);
652: }
653: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
654: {
655: const MatScalar *v;
656: PetscScalar *xp, x0, x1, x2, x3, x4, x5;
657: PetscInt nz, k;
658: const PetscInt *vj;
660: PetscFunctionBegin;
661: for (k = mbs - 1; k >= 0; k--) {
662: v = aa + 36 * ai[k];
663: xp = x + k * 6;
664: x0 = xp[0];
665: x1 = xp[1];
666: x2 = xp[2];
667: x3 = xp[3];
668: x4 = xp[4];
669: x5 = xp[5]; /* xk */
670: nz = ai[k + 1] - ai[k];
671: vj = aj + ai[k];
672: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
673: PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
674: while (nz--) {
675: xp = x + (*vj) * 6;
676: /* xk += U(k,:)*x(:) */
677: x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
678: x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
679: x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
680: x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
681: x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
682: x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
683: vj++;
684: v += 36;
685: }
686: xp = x + k * 6;
687: xp[0] = x0;
688: xp[1] = x1;
689: xp[2] = x2;
690: xp[3] = x3;
691: xp[4] = x4;
692: xp[5] = x5;
693: }
694: PetscFunctionReturn(PETSC_SUCCESS);
695: }
697: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
698: {
699: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
700: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
701: const MatScalar *aa = a->a;
702: PetscScalar *x;
703: const PetscScalar *b;
705: PetscFunctionBegin;
706: PetscCall(VecGetArrayRead(bb, &b));
707: PetscCall(VecGetArray(xx, &x));
709: /* solve U^T * D * y = b by forward substitution */
710: PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
711: PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
713: /* solve U*x = y by back substitution */
714: PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
716: PetscCall(VecRestoreArrayRead(bb, &b));
717: PetscCall(VecRestoreArray(xx, &x));
718: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
719: PetscFunctionReturn(PETSC_SUCCESS);
720: }
722: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
723: {
724: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
725: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
726: const MatScalar *aa = a->a;
727: PetscScalar *x;
728: const PetscScalar *b;
730: PetscFunctionBegin;
731: PetscCall(VecGetArrayRead(bb, &b));
732: PetscCall(VecGetArray(xx, &x));
733: PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
734: PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
735: PetscCall(VecRestoreArrayRead(bb, &b));
736: PetscCall(VecRestoreArray(xx, &x));
737: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
738: PetscFunctionReturn(PETSC_SUCCESS);
739: }
741: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
742: {
743: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
744: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
745: const MatScalar *aa = a->a;
746: PetscScalar *x;
747: const PetscScalar *b;
749: PetscFunctionBegin;
750: PetscCall(VecGetArrayRead(bb, &b));
751: PetscCall(VecGetArray(xx, &x));
752: PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
753: PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
754: PetscCall(VecRestoreArrayRead(bb, &b));
755: PetscCall(VecRestoreArray(xx, &x));
756: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
757: PetscFunctionReturn(PETSC_SUCCESS);
758: }
760: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
761: {
762: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
763: IS isrow = a->row;
764: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
765: const PetscInt *r, *vj;
766: PetscInt nz, k, idx;
767: const MatScalar *aa = a->a, *v, *diag;
768: PetscScalar *x, x0, x1, x2, x3, x4, *t, *tp;
769: const PetscScalar *b;
771: PetscFunctionBegin;
772: PetscCall(VecGetArrayRead(bb, &b));
773: PetscCall(VecGetArray(xx, &x));
774: t = a->solve_work;
775: PetscCall(ISGetIndices(isrow, &r));
777: /* solve U^T * D * y = b by forward substitution */
778: tp = t;
779: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
780: idx = 5 * r[k];
781: tp[0] = b[idx];
782: tp[1] = b[idx + 1];
783: tp[2] = b[idx + 2];
784: tp[3] = b[idx + 3];
785: tp[4] = b[idx + 4];
786: tp += 5;
787: }
789: for (k = 0; k < mbs; k++) {
790: v = aa + 25 * ai[k];
791: vj = aj + ai[k];
792: tp = t + k * 5;
793: x0 = tp[0];
794: x1 = tp[1];
795: x2 = tp[2];
796: x3 = tp[3];
797: x4 = tp[4];
798: nz = ai[k + 1] - ai[k];
800: tp = t + (*vj) * 5;
801: while (nz--) {
802: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
803: tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
804: tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
805: tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
806: tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
807: vj++;
808: tp = t + (*vj) * 5;
809: v += 25;
810: }
812: /* xk = inv(Dk)*(Dk*xk) */
813: diag = aa + k * 25; /* ptr to inv(Dk) */
814: tp = t + k * 5;
815: tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
816: tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
817: tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
818: tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
819: tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
820: }
822: /* solve U*x = y by back substitution */
823: for (k = mbs - 1; k >= 0; k--) {
824: v = aa + 25 * ai[k];
825: vj = aj + ai[k];
826: tp = t + k * 5;
827: x0 = tp[0];
828: x1 = tp[1];
829: x2 = tp[2];
830: x3 = tp[3];
831: x4 = tp[4]; /* xk */
832: nz = ai[k + 1] - ai[k];
834: tp = t + (*vj) * 5;
835: while (nz--) {
836: /* xk += U(k,:)*x(:) */
837: x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
838: x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
839: x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
840: x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
841: x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
842: vj++;
843: tp = t + (*vj) * 5;
844: v += 25;
845: }
846: tp = t + k * 5;
847: tp[0] = x0;
848: tp[1] = x1;
849: tp[2] = x2;
850: tp[3] = x3;
851: tp[4] = x4;
852: idx = 5 * r[k];
853: x[idx] = x0;
854: x[idx + 1] = x1;
855: x[idx + 2] = x2;
856: x[idx + 3] = x3;
857: x[idx + 4] = x4;
858: }
860: PetscCall(ISRestoreIndices(isrow, &r));
861: PetscCall(VecRestoreArrayRead(bb, &b));
862: PetscCall(VecRestoreArray(xx, &x));
863: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
864: PetscFunctionReturn(PETSC_SUCCESS);
865: }
867: static PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
868: {
869: const MatScalar *v, *diag;
870: PetscScalar *xp, x0, x1, x2, x3, x4;
871: PetscInt nz, k;
872: const PetscInt *vj;
874: PetscFunctionBegin;
875: for (k = 0; k < mbs; k++) {
876: v = aa + 25 * ai[k];
877: xp = x + k * 5;
878: x0 = xp[0];
879: x1 = xp[1];
880: x2 = xp[2];
881: x3 = xp[3];
882: x4 = xp[4]; /* Dk*xk = k-th block of x */
883: nz = ai[k + 1] - ai[k];
884: vj = aj + ai[k];
885: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
886: PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
887: while (nz--) {
888: xp = x + (*vj) * 5;
889: /* x(:) += U(k,:)^T*(Dk*xk) */
890: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
891: xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
892: xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
893: xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
894: xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
895: vj++;
896: v += 25;
897: }
898: /* xk = inv(Dk)*(Dk*xk) */
899: diag = aa + k * 25; /* ptr to inv(Dk) */
900: xp = x + k * 5;
901: xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
902: xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
903: xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
904: xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
905: xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
906: }
907: PetscFunctionReturn(PETSC_SUCCESS);
908: }
910: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
911: {
912: const MatScalar *v;
913: PetscScalar *xp, x0, x1, x2, x3, x4;
914: PetscInt nz, k;
915: const PetscInt *vj;
917: PetscFunctionBegin;
918: for (k = mbs - 1; k >= 0; k--) {
919: v = aa + 25 * ai[k];
920: xp = x + k * 5;
921: x0 = xp[0];
922: x1 = xp[1];
923: x2 = xp[2];
924: x3 = xp[3];
925: x4 = xp[4]; /* xk */
926: nz = ai[k + 1] - ai[k];
927: vj = aj + ai[k];
928: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
929: PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
930: while (nz--) {
931: xp = x + (*vj) * 5;
932: /* xk += U(k,:)*x(:) */
933: x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
934: x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
935: x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
936: x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
937: x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
938: vj++;
939: v += 25;
940: }
941: xp = x + k * 5;
942: xp[0] = x0;
943: xp[1] = x1;
944: xp[2] = x2;
945: xp[3] = x3;
946: xp[4] = x4;
947: }
948: PetscFunctionReturn(PETSC_SUCCESS);
949: }
951: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
952: {
953: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
954: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
955: const MatScalar *aa = a->a;
956: PetscScalar *x;
957: const PetscScalar *b;
959: PetscFunctionBegin;
960: PetscCall(VecGetArrayRead(bb, &b));
961: PetscCall(VecGetArray(xx, &x));
963: /* solve U^T * D * y = b by forward substitution */
964: PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
965: PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
967: /* solve U*x = y by back substitution */
968: PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
970: PetscCall(VecRestoreArrayRead(bb, &b));
971: PetscCall(VecRestoreArray(xx, &x));
972: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
973: PetscFunctionReturn(PETSC_SUCCESS);
974: }
976: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
977: {
978: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
979: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
980: const MatScalar *aa = a->a;
981: PetscScalar *x;
982: const PetscScalar *b;
984: PetscFunctionBegin;
985: PetscCall(VecGetArrayRead(bb, &b));
986: PetscCall(VecGetArray(xx, &x));
987: PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
988: PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
989: PetscCall(VecRestoreArrayRead(bb, &b));
990: PetscCall(VecRestoreArray(xx, &x));
991: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
992: PetscFunctionReturn(PETSC_SUCCESS);
993: }
995: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
996: {
997: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
998: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
999: const MatScalar *aa = a->a;
1000: PetscScalar *x;
1001: const PetscScalar *b;
1003: PetscFunctionBegin;
1004: PetscCall(VecGetArrayRead(bb, &b));
1005: PetscCall(VecGetArray(xx, &x));
1006: PetscCall(PetscArraycpy(x, b, 5 * mbs));
1007: PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
1008: PetscCall(VecRestoreArrayRead(bb, &b));
1009: PetscCall(VecRestoreArray(xx, &x));
1010: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1011: PetscFunctionReturn(PETSC_SUCCESS);
1012: }
1014: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
1015: {
1016: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1017: IS isrow = a->row;
1018: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1019: const PetscInt *r, *vj;
1020: PetscInt nz, k, idx;
1021: const MatScalar *aa = a->a, *v, *diag;
1022: PetscScalar *x, x0, x1, x2, x3, *t, *tp;
1023: const PetscScalar *b;
1025: PetscFunctionBegin;
1026: PetscCall(VecGetArrayRead(bb, &b));
1027: PetscCall(VecGetArray(xx, &x));
1028: t = a->solve_work;
1029: PetscCall(ISGetIndices(isrow, &r));
1031: /* solve U^T * D * y = b by forward substitution */
1032: tp = t;
1033: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1034: idx = 4 * r[k];
1035: tp[0] = b[idx];
1036: tp[1] = b[idx + 1];
1037: tp[2] = b[idx + 2];
1038: tp[3] = b[idx + 3];
1039: tp += 4;
1040: }
1042: for (k = 0; k < mbs; k++) {
1043: v = aa + 16 * ai[k];
1044: vj = aj + ai[k];
1045: tp = t + k * 4;
1046: x0 = tp[0];
1047: x1 = tp[1];
1048: x2 = tp[2];
1049: x3 = tp[3];
1050: nz = ai[k + 1] - ai[k];
1052: tp = t + (*vj) * 4;
1053: while (nz--) {
1054: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1055: tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1056: tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1057: tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1058: vj++;
1059: tp = t + (*vj) * 4;
1060: v += 16;
1061: }
1063: /* xk = inv(Dk)*(Dk*xk) */
1064: diag = aa + k * 16; /* ptr to inv(Dk) */
1065: tp = t + k * 4;
1066: tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1067: tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1068: tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1069: tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1070: }
1072: /* solve U*x = y by back substitution */
1073: for (k = mbs - 1; k >= 0; k--) {
1074: v = aa + 16 * ai[k];
1075: vj = aj + ai[k];
1076: tp = t + k * 4;
1077: x0 = tp[0];
1078: x1 = tp[1];
1079: x2 = tp[2];
1080: x3 = tp[3]; /* xk */
1081: nz = ai[k + 1] - ai[k];
1083: tp = t + (*vj) * 4;
1084: while (nz--) {
1085: /* xk += U(k,:)*x(:) */
1086: x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1087: x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1088: x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1089: x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1090: vj++;
1091: tp = t + (*vj) * 4;
1092: v += 16;
1093: }
1094: tp = t + k * 4;
1095: tp[0] = x0;
1096: tp[1] = x1;
1097: tp[2] = x2;
1098: tp[3] = x3;
1099: idx = 4 * r[k];
1100: x[idx] = x0;
1101: x[idx + 1] = x1;
1102: x[idx + 2] = x2;
1103: x[idx + 3] = x3;
1104: }
1106: PetscCall(ISRestoreIndices(isrow, &r));
1107: PetscCall(VecRestoreArrayRead(bb, &b));
1108: PetscCall(VecRestoreArray(xx, &x));
1109: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1110: PetscFunctionReturn(PETSC_SUCCESS);
1111: }
1113: static PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1114: {
1115: const MatScalar *v, *diag;
1116: PetscScalar *xp, x0, x1, x2, x3;
1117: PetscInt nz, k;
1118: const PetscInt *vj;
1120: PetscFunctionBegin;
1121: for (k = 0; k < mbs; k++) {
1122: v = aa + 16 * ai[k];
1123: xp = x + k * 4;
1124: x0 = xp[0];
1125: x1 = xp[1];
1126: x2 = xp[2];
1127: x3 = xp[3]; /* Dk*xk = k-th block of x */
1128: nz = ai[k + 1] - ai[k];
1129: vj = aj + ai[k];
1130: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1131: PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1132: while (nz--) {
1133: xp = x + (*vj) * 4;
1134: /* x(:) += U(k,:)^T*(Dk*xk) */
1135: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1136: xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1137: xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1138: xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1139: vj++;
1140: v += 16;
1141: }
1142: /* xk = inv(Dk)*(Dk*xk) */
1143: diag = aa + k * 16; /* ptr to inv(Dk) */
1144: xp = x + k * 4;
1145: xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1146: xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1147: xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1148: xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1149: }
1150: PetscFunctionReturn(PETSC_SUCCESS);
1151: }
1153: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1154: {
1155: const MatScalar *v;
1156: PetscScalar *xp, x0, x1, x2, x3;
1157: PetscInt nz, k;
1158: const PetscInt *vj;
1160: PetscFunctionBegin;
1161: for (k = mbs - 1; k >= 0; k--) {
1162: v = aa + 16 * ai[k];
1163: xp = x + k * 4;
1164: x0 = xp[0];
1165: x1 = xp[1];
1166: x2 = xp[2];
1167: x3 = xp[3]; /* xk */
1168: nz = ai[k + 1] - ai[k];
1169: vj = aj + ai[k];
1170: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1171: PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1172: while (nz--) {
1173: xp = x + (*vj) * 4;
1174: /* xk += U(k,:)*x(:) */
1175: x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1176: x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1177: x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1178: x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1179: vj++;
1180: v += 16;
1181: }
1182: xp = x + k * 4;
1183: xp[0] = x0;
1184: xp[1] = x1;
1185: xp[2] = x2;
1186: xp[3] = x3;
1187: }
1188: PetscFunctionReturn(PETSC_SUCCESS);
1189: }
1191: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1192: {
1193: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1194: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1195: const MatScalar *aa = a->a;
1196: PetscScalar *x;
1197: const PetscScalar *b;
1199: PetscFunctionBegin;
1200: PetscCall(VecGetArrayRead(bb, &b));
1201: PetscCall(VecGetArray(xx, &x));
1203: /* solve U^T * D * y = b by forward substitution */
1204: PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1205: PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1207: /* solve U*x = y by back substitution */
1208: PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1209: PetscCall(VecRestoreArrayRead(bb, &b));
1210: PetscCall(VecRestoreArray(xx, &x));
1211: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1212: PetscFunctionReturn(PETSC_SUCCESS);
1213: }
1215: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1216: {
1217: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1218: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1219: const MatScalar *aa = a->a;
1220: PetscScalar *x;
1221: const PetscScalar *b;
1223: PetscFunctionBegin;
1224: PetscCall(VecGetArrayRead(bb, &b));
1225: PetscCall(VecGetArray(xx, &x));
1226: PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1227: PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1228: PetscCall(VecRestoreArrayRead(bb, &b));
1229: PetscCall(VecRestoreArray(xx, &x));
1230: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1231: PetscFunctionReturn(PETSC_SUCCESS);
1232: }
1234: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1235: {
1236: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1237: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1238: const MatScalar *aa = a->a;
1239: PetscScalar *x;
1240: const PetscScalar *b;
1242: PetscFunctionBegin;
1243: PetscCall(VecGetArrayRead(bb, &b));
1244: PetscCall(VecGetArray(xx, &x));
1245: PetscCall(PetscArraycpy(x, b, 4 * mbs));
1246: PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1247: PetscCall(VecRestoreArrayRead(bb, &b));
1248: PetscCall(VecRestoreArray(xx, &x));
1249: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1250: PetscFunctionReturn(PETSC_SUCCESS);
1251: }
1253: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1254: {
1255: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1256: IS isrow = a->row;
1257: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1258: const PetscInt *r;
1259: PetscInt nz, k, idx;
1260: const PetscInt *vj;
1261: const MatScalar *aa = a->a, *v, *diag;
1262: PetscScalar *x, x0, x1, x2, *t, *tp;
1263: const PetscScalar *b;
1265: PetscFunctionBegin;
1266: PetscCall(VecGetArrayRead(bb, &b));
1267: PetscCall(VecGetArray(xx, &x));
1268: t = a->solve_work;
1269: PetscCall(ISGetIndices(isrow, &r));
1271: /* solve U^T * D * y = b by forward substitution */
1272: tp = t;
1273: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1274: idx = 3 * r[k];
1275: tp[0] = b[idx];
1276: tp[1] = b[idx + 1];
1277: tp[2] = b[idx + 2];
1278: tp += 3;
1279: }
1281: for (k = 0; k < mbs; k++) {
1282: v = aa + 9 * ai[k];
1283: vj = aj + ai[k];
1284: tp = t + k * 3;
1285: x0 = tp[0];
1286: x1 = tp[1];
1287: x2 = tp[2];
1288: nz = ai[k + 1] - ai[k];
1290: tp = t + (*vj) * 3;
1291: while (nz--) {
1292: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1293: tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1294: tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1295: vj++;
1296: tp = t + (*vj) * 3;
1297: v += 9;
1298: }
1300: /* xk = inv(Dk)*(Dk*xk) */
1301: diag = aa + k * 9; /* ptr to inv(Dk) */
1302: tp = t + k * 3;
1303: tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1304: tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1305: tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1306: }
1308: /* solve U*x = y by back substitution */
1309: for (k = mbs - 1; k >= 0; k--) {
1310: v = aa + 9 * ai[k];
1311: vj = aj + ai[k];
1312: tp = t + k * 3;
1313: x0 = tp[0];
1314: x1 = tp[1];
1315: x2 = tp[2]; /* xk */
1316: nz = ai[k + 1] - ai[k];
1318: tp = t + (*vj) * 3;
1319: while (nz--) {
1320: /* xk += U(k,:)*x(:) */
1321: x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1322: x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1323: x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1324: vj++;
1325: tp = t + (*vj) * 3;
1326: v += 9;
1327: }
1328: tp = t + k * 3;
1329: tp[0] = x0;
1330: tp[1] = x1;
1331: tp[2] = x2;
1332: idx = 3 * r[k];
1333: x[idx] = x0;
1334: x[idx + 1] = x1;
1335: x[idx + 2] = x2;
1336: }
1338: PetscCall(ISRestoreIndices(isrow, &r));
1339: PetscCall(VecRestoreArrayRead(bb, &b));
1340: PetscCall(VecRestoreArray(xx, &x));
1341: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1342: PetscFunctionReturn(PETSC_SUCCESS);
1343: }
1345: static PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1346: {
1347: const MatScalar *v, *diag;
1348: PetscScalar *xp, x0, x1, x2;
1349: PetscInt nz, k;
1350: const PetscInt *vj;
1352: PetscFunctionBegin;
1353: for (k = 0; k < mbs; k++) {
1354: v = aa + 9 * ai[k];
1355: xp = x + k * 3;
1356: x0 = xp[0];
1357: x1 = xp[1];
1358: x2 = xp[2]; /* Dk*xk = k-th block of x */
1359: nz = ai[k + 1] - ai[k];
1360: vj = aj + ai[k];
1361: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1362: PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1363: while (nz--) {
1364: xp = x + (*vj) * 3;
1365: /* x(:) += U(k,:)^T*(Dk*xk) */
1366: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1367: xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1368: xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1369: vj++;
1370: v += 9;
1371: }
1372: /* xk = inv(Dk)*(Dk*xk) */
1373: diag = aa + k * 9; /* ptr to inv(Dk) */
1374: xp = x + k * 3;
1375: xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1376: xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1377: xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1378: }
1379: PetscFunctionReturn(PETSC_SUCCESS);
1380: }
1382: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1383: {
1384: const MatScalar *v;
1385: PetscScalar *xp, x0, x1, x2;
1386: PetscInt nz, k;
1387: const PetscInt *vj;
1389: PetscFunctionBegin;
1390: for (k = mbs - 1; k >= 0; k--) {
1391: v = aa + 9 * ai[k];
1392: xp = x + k * 3;
1393: x0 = xp[0];
1394: x1 = xp[1];
1395: x2 = xp[2]; /* xk */
1396: nz = ai[k + 1] - ai[k];
1397: vj = aj + ai[k];
1398: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1399: PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1400: while (nz--) {
1401: xp = x + (*vj) * 3;
1402: /* xk += U(k,:)*x(:) */
1403: x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1404: x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1405: x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1406: vj++;
1407: v += 9;
1408: }
1409: xp = x + k * 3;
1410: xp[0] = x0;
1411: xp[1] = x1;
1412: xp[2] = x2;
1413: }
1414: PetscFunctionReturn(PETSC_SUCCESS);
1415: }
1417: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1418: {
1419: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1420: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1421: const MatScalar *aa = a->a;
1422: PetscScalar *x;
1423: const PetscScalar *b;
1425: PetscFunctionBegin;
1426: PetscCall(VecGetArrayRead(bb, &b));
1427: PetscCall(VecGetArray(xx, &x));
1429: /* solve U^T * D * y = b by forward substitution */
1430: PetscCall(PetscArraycpy(x, b, 3 * mbs));
1431: PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1433: /* solve U*x = y by back substitution */
1434: PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1436: PetscCall(VecRestoreArrayRead(bb, &b));
1437: PetscCall(VecRestoreArray(xx, &x));
1438: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1439: PetscFunctionReturn(PETSC_SUCCESS);
1440: }
1442: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1443: {
1444: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1445: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1446: const MatScalar *aa = a->a;
1447: PetscScalar *x;
1448: const PetscScalar *b;
1450: PetscFunctionBegin;
1451: PetscCall(VecGetArrayRead(bb, &b));
1452: PetscCall(VecGetArray(xx, &x));
1453: PetscCall(PetscArraycpy(x, b, 3 * mbs));
1454: PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1455: PetscCall(VecRestoreArrayRead(bb, &b));
1456: PetscCall(VecRestoreArray(xx, &x));
1457: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1458: PetscFunctionReturn(PETSC_SUCCESS);
1459: }
1461: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1462: {
1463: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1464: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1465: const MatScalar *aa = a->a;
1466: PetscScalar *x;
1467: const PetscScalar *b;
1469: PetscFunctionBegin;
1470: PetscCall(VecGetArrayRead(bb, &b));
1471: PetscCall(VecGetArray(xx, &x));
1472: PetscCall(PetscArraycpy(x, b, 3 * mbs));
1473: PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1474: PetscCall(VecRestoreArrayRead(bb, &b));
1475: PetscCall(VecRestoreArray(xx, &x));
1476: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1477: PetscFunctionReturn(PETSC_SUCCESS);
1478: }
1480: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1481: {
1482: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1483: IS isrow = a->row;
1484: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1485: const PetscInt *r, *vj;
1486: PetscInt nz, k, k2, idx;
1487: const MatScalar *aa = a->a, *v, *diag;
1488: PetscScalar *x, x0, x1, *t;
1489: const PetscScalar *b;
1491: PetscFunctionBegin;
1492: PetscCall(VecGetArrayRead(bb, &b));
1493: PetscCall(VecGetArray(xx, &x));
1494: t = a->solve_work;
1495: PetscCall(ISGetIndices(isrow, &r));
1497: /* solve U^T * D * y = perm(b) by forward substitution */
1498: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1499: idx = 2 * r[k];
1500: t[k * 2] = b[idx];
1501: t[k * 2 + 1] = b[idx + 1];
1502: }
1503: for (k = 0; k < mbs; k++) {
1504: v = aa + 4 * ai[k];
1505: vj = aj + ai[k];
1506: k2 = k * 2;
1507: x0 = t[k2];
1508: x1 = t[k2 + 1];
1509: nz = ai[k + 1] - ai[k];
1510: while (nz--) {
1511: t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1512: t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1513: vj++;
1514: v += 4;
1515: }
1516: diag = aa + k * 4; /* ptr to inv(Dk) */
1517: t[k2] = diag[0] * x0 + diag[2] * x1;
1518: t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1519: }
1521: /* solve U*x = y by back substitution */
1522: for (k = mbs - 1; k >= 0; k--) {
1523: v = aa + 4 * ai[k];
1524: vj = aj + ai[k];
1525: k2 = k * 2;
1526: x0 = t[k2];
1527: x1 = t[k2 + 1];
1528: nz = ai[k + 1] - ai[k];
1529: while (nz--) {
1530: x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1531: x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1532: vj++;
1533: v += 4;
1534: }
1535: t[k2] = x0;
1536: t[k2 + 1] = x1;
1537: idx = 2 * r[k];
1538: x[idx] = x0;
1539: x[idx + 1] = x1;
1540: }
1542: PetscCall(ISRestoreIndices(isrow, &r));
1543: PetscCall(VecRestoreArrayRead(bb, &b));
1544: PetscCall(VecRestoreArray(xx, &x));
1545: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1546: PetscFunctionReturn(PETSC_SUCCESS);
1547: }
1549: static PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1550: {
1551: const MatScalar *v, *diag;
1552: PetscScalar x0, x1;
1553: PetscInt nz, k, k2;
1554: const PetscInt *vj;
1556: PetscFunctionBegin;
1557: for (k = 0; k < mbs; k++) {
1558: v = aa + 4 * ai[k];
1559: vj = aj + ai[k];
1560: k2 = k * 2;
1561: x0 = x[k2];
1562: x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1563: nz = ai[k + 1] - ai[k];
1564: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1565: PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1566: while (nz--) {
1567: /* x(:) += U(k,:)^T*(Dk*xk) */
1568: x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1569: x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1570: vj++;
1571: v += 4;
1572: }
1573: /* xk = inv(Dk)*(Dk*xk) */
1574: diag = aa + k * 4; /* ptr to inv(Dk) */
1575: x[k2] = diag[0] * x0 + diag[2] * x1;
1576: x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1577: }
1578: PetscFunctionReturn(PETSC_SUCCESS);
1579: }
1581: static PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1582: {
1583: const MatScalar *v;
1584: PetscScalar x0, x1;
1585: PetscInt nz, k, k2;
1586: const PetscInt *vj;
1588: PetscFunctionBegin;
1589: for (k = mbs - 1; k >= 0; k--) {
1590: v = aa + 4 * ai[k];
1591: vj = aj + ai[k];
1592: k2 = k * 2;
1593: x0 = x[k2];
1594: x1 = x[k2 + 1]; /* xk */
1595: nz = ai[k + 1] - ai[k];
1596: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1597: PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1598: while (nz--) {
1599: /* xk += U(k,:)*x(:) */
1600: x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1601: x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1602: vj++;
1603: v += 4;
1604: }
1605: x[k2] = x0;
1606: x[k2 + 1] = x1;
1607: }
1608: PetscFunctionReturn(PETSC_SUCCESS);
1609: }
1611: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1612: {
1613: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1614: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1615: const MatScalar *aa = a->a;
1616: PetscScalar *x;
1617: const PetscScalar *b;
1619: PetscFunctionBegin;
1620: PetscCall(VecGetArrayRead(bb, &b));
1621: PetscCall(VecGetArray(xx, &x));
1623: /* solve U^T * D * y = b by forward substitution */
1624: PetscCall(PetscArraycpy(x, b, 2 * mbs));
1625: PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1627: /* solve U*x = y by back substitution */
1628: PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1630: PetscCall(VecRestoreArrayRead(bb, &b));
1631: PetscCall(VecRestoreArray(xx, &x));
1632: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1633: PetscFunctionReturn(PETSC_SUCCESS);
1634: }
1636: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1637: {
1638: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1639: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1640: const MatScalar *aa = a->a;
1641: PetscScalar *x;
1642: const PetscScalar *b;
1644: PetscFunctionBegin;
1645: PetscCall(VecGetArrayRead(bb, &b));
1646: PetscCall(VecGetArray(xx, &x));
1647: PetscCall(PetscArraycpy(x, b, 2 * mbs));
1648: PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1649: PetscCall(VecRestoreArrayRead(bb, &b));
1650: PetscCall(VecRestoreArray(xx, &x));
1651: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1652: PetscFunctionReturn(PETSC_SUCCESS);
1653: }
1655: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1656: {
1657: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1658: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1659: const MatScalar *aa = a->a;
1660: PetscScalar *x;
1661: const PetscScalar *b;
1663: PetscFunctionBegin;
1664: PetscCall(VecGetArrayRead(bb, &b));
1665: PetscCall(VecGetArray(xx, &x));
1666: PetscCall(PetscArraycpy(x, b, 2 * mbs));
1667: PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1668: PetscCall(VecRestoreArrayRead(bb, &b));
1669: PetscCall(VecRestoreArray(xx, &x));
1670: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1671: PetscFunctionReturn(PETSC_SUCCESS);
1672: }
1674: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1675: {
1676: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1677: IS isrow = a->row;
1678: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1679: const MatScalar *aa = a->a, *v;
1680: const PetscScalar *b;
1681: PetscScalar *x, xk, *t;
1682: PetscInt nz, k, j;
1684: PetscFunctionBegin;
1685: PetscCall(VecGetArrayRead(bb, &b));
1686: PetscCall(VecGetArray(xx, &x));
1687: t = a->solve_work;
1688: PetscCall(ISGetIndices(isrow, &rp));
1690: /* solve U^T*D*y = perm(b) by forward substitution */
1691: for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1692: for (k = 0; k < mbs; k++) {
1693: v = aa + ai[k];
1694: vj = aj + ai[k];
1695: xk = t[k];
1696: nz = ai[k + 1] - ai[k] - 1;
1697: for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1698: t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1699: }
1701: /* solve U*perm(x) = y by back substitution */
1702: for (k = mbs - 1; k >= 0; k--) {
1703: v = aa + adiag[k] - 1;
1704: vj = aj + adiag[k] - 1;
1705: nz = ai[k + 1] - ai[k] - 1;
1706: for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1707: x[rp[k]] = t[k];
1708: }
1710: PetscCall(ISRestoreIndices(isrow, &rp));
1711: PetscCall(VecRestoreArrayRead(bb, &b));
1712: PetscCall(VecRestoreArray(xx, &x));
1713: PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs));
1714: PetscFunctionReturn(PETSC_SUCCESS);
1715: }
1717: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1718: {
1719: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1720: IS isrow = a->row;
1721: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1722: const MatScalar *aa = a->a, *v;
1723: PetscScalar *x, xk, *t;
1724: const PetscScalar *b;
1725: PetscInt nz, k;
1727: PetscFunctionBegin;
1728: PetscCall(VecGetArrayRead(bb, &b));
1729: PetscCall(VecGetArray(xx, &x));
1730: t = a->solve_work;
1731: PetscCall(ISGetIndices(isrow, &rp));
1733: /* solve U^T*D*y = perm(b) by forward substitution */
1734: for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1735: for (k = 0; k < mbs; k++) {
1736: v = aa + ai[k] + 1;
1737: vj = aj + ai[k] + 1;
1738: xk = t[k];
1739: nz = ai[k + 1] - ai[k] - 1;
1740: while (nz--) t[*vj++] += (*v++) * xk;
1741: t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1742: }
1744: /* solve U*perm(x) = y by back substitution */
1745: for (k = mbs - 1; k >= 0; k--) {
1746: v = aa + ai[k] + 1;
1747: vj = aj + ai[k] + 1;
1748: nz = ai[k + 1] - ai[k] - 1;
1749: while (nz--) t[k] += (*v++) * t[*vj++];
1750: x[rp[k]] = t[k];
1751: }
1753: PetscCall(ISRestoreIndices(isrow, &rp));
1754: PetscCall(VecRestoreArrayRead(bb, &b));
1755: PetscCall(VecRestoreArray(xx, &x));
1756: PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
1757: PetscFunctionReturn(PETSC_SUCCESS);
1758: }
1760: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1761: {
1762: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1763: IS isrow = a->row;
1764: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1765: const MatScalar *aa = a->a, *v;
1766: PetscReal diagk;
1767: PetscScalar *x, xk;
1768: const PetscScalar *b;
1769: PetscInt nz, k;
1771: PetscFunctionBegin;
1772: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1773: PetscCall(VecGetArrayRead(bb, &b));
1774: PetscCall(VecGetArray(xx, &x));
1775: PetscCall(ISGetIndices(isrow, &rp));
1777: for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1778: for (k = 0; k < mbs; k++) {
1779: v = aa + ai[k];
1780: vj = aj + ai[k];
1781: xk = x[k];
1782: nz = ai[k + 1] - ai[k] - 1;
1783: while (nz--) x[*vj++] += (*v++) * xk;
1785: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1786: PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1787: x[k] = xk * PetscSqrtReal(diagk);
1788: }
1789: PetscCall(ISRestoreIndices(isrow, &rp));
1790: PetscCall(VecRestoreArrayRead(bb, &b));
1791: PetscCall(VecRestoreArray(xx, &x));
1792: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1793: PetscFunctionReturn(PETSC_SUCCESS);
1794: }
1796: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1797: {
1798: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1799: IS isrow = a->row;
1800: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1801: const MatScalar *aa = a->a, *v;
1802: PetscReal diagk;
1803: PetscScalar *x, xk;
1804: const PetscScalar *b;
1805: PetscInt nz, k;
1807: PetscFunctionBegin;
1808: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1809: PetscCall(VecGetArrayRead(bb, &b));
1810: PetscCall(VecGetArray(xx, &x));
1811: PetscCall(ISGetIndices(isrow, &rp));
1813: for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1814: for (k = 0; k < mbs; k++) {
1815: v = aa + ai[k] + 1;
1816: vj = aj + ai[k] + 1;
1817: xk = x[k];
1818: nz = ai[k + 1] - ai[k] - 1;
1819: while (nz--) x[*vj++] += (*v++) * xk;
1821: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1822: PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1823: x[k] = xk * PetscSqrtReal(diagk);
1824: }
1825: PetscCall(ISRestoreIndices(isrow, &rp));
1826: PetscCall(VecRestoreArrayRead(bb, &b));
1827: PetscCall(VecRestoreArray(xx, &x));
1828: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1829: PetscFunctionReturn(PETSC_SUCCESS);
1830: }
1832: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1833: {
1834: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1835: IS isrow = a->row;
1836: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1837: const MatScalar *aa = a->a, *v;
1838: PetscReal diagk;
1839: PetscScalar *x, *t;
1840: const PetscScalar *b;
1841: PetscInt nz, k;
1843: PetscFunctionBegin;
1844: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1845: PetscCall(VecGetArrayRead(bb, &b));
1846: PetscCall(VecGetArray(xx, &x));
1847: t = a->solve_work;
1848: PetscCall(ISGetIndices(isrow, &rp));
1850: for (k = mbs - 1; k >= 0; k--) {
1851: v = aa + ai[k];
1852: vj = aj + ai[k];
1853: diagk = PetscRealPart(aa[adiag[k]]);
1854: PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1855: t[k] = b[k] * PetscSqrtReal(diagk);
1856: nz = ai[k + 1] - ai[k] - 1;
1857: while (nz--) t[k] += (*v++) * t[*vj++];
1858: x[rp[k]] = t[k];
1859: }
1860: PetscCall(ISRestoreIndices(isrow, &rp));
1861: PetscCall(VecRestoreArrayRead(bb, &b));
1862: PetscCall(VecRestoreArray(xx, &x));
1863: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1864: PetscFunctionReturn(PETSC_SUCCESS);
1865: }
1867: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1868: {
1869: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1870: IS isrow = a->row;
1871: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1872: const MatScalar *aa = a->a, *v;
1873: PetscReal diagk;
1874: PetscScalar *x, *t;
1875: const PetscScalar *b;
1876: PetscInt nz, k;
1878: PetscFunctionBegin;
1879: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1880: PetscCall(VecGetArrayRead(bb, &b));
1881: PetscCall(VecGetArray(xx, &x));
1882: t = a->solve_work;
1883: PetscCall(ISGetIndices(isrow, &rp));
1885: for (k = mbs - 1; k >= 0; k--) {
1886: v = aa + ai[k] + 1;
1887: vj = aj + ai[k] + 1;
1888: diagk = PetscRealPart(aa[ai[k]]);
1889: PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1890: t[k] = b[k] * PetscSqrtReal(diagk);
1891: nz = ai[k + 1] - ai[k] - 1;
1892: while (nz--) t[k] += (*v++) * t[*vj++];
1893: x[rp[k]] = t[k];
1894: }
1895: PetscCall(ISRestoreIndices(isrow, &rp));
1896: PetscCall(VecRestoreArrayRead(bb, &b));
1897: PetscCall(VecRestoreArray(xx, &x));
1898: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1899: PetscFunctionReturn(PETSC_SUCCESS);
1900: }
1902: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1903: {
1904: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1906: PetscFunctionBegin;
1907: if (A->rmap->bs == 1) {
1908: PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v));
1909: } else {
1910: IS isrow = a->row;
1911: const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1912: const MatScalar *aa = a->a, *v;
1913: PetscScalar *x, *t;
1914: const PetscScalar *b;
1915: PetscInt nz, k, n, i, j;
1917: if (bb->n > a->solves_work_n) {
1918: PetscCall(PetscFree(a->solves_work));
1919: PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1920: a->solves_work_n = bb->n;
1921: }
1922: n = bb->n;
1923: PetscCall(VecGetArrayRead(bb->v, &b));
1924: PetscCall(VecGetArray(xx->v, &x));
1925: t = a->solves_work;
1927: PetscCall(ISGetIndices(isrow, &rp));
1929: /* solve U^T*D*y = perm(b) by forward substitution */
1930: for (k = 0; k < mbs; k++) {
1931: for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1932: }
1933: for (k = 0; k < mbs; k++) {
1934: v = aa + ai[k];
1935: vj = aj + ai[k];
1936: nz = ai[k + 1] - ai[k] - 1;
1937: for (j = 0; j < nz; j++) {
1938: for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1939: v++;
1940: vj++;
1941: }
1942: for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1943: }
1945: /* solve U*perm(x) = y by back substitution */
1946: for (k = mbs - 1; k >= 0; k--) {
1947: v = aa + ai[k] - 1;
1948: vj = aj + ai[k] - 1;
1949: nz = ai[k + 1] - ai[k] - 1;
1950: for (j = 0; j < nz; j++) {
1951: for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1952: v++;
1953: vj++;
1954: }
1955: for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1956: }
1958: PetscCall(ISRestoreIndices(isrow, &rp));
1959: PetscCall(VecRestoreArrayRead(bb->v, &b));
1960: PetscCall(VecRestoreArray(xx->v, &x));
1961: PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1962: }
1963: PetscFunctionReturn(PETSC_SUCCESS);
1964: }
1966: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1967: {
1968: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1970: PetscFunctionBegin;
1971: if (A->rmap->bs == 1) {
1972: PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v));
1973: } else {
1974: IS isrow = a->row;
1975: const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1976: const MatScalar *aa = a->a, *v;
1977: PetscScalar *x, *t;
1978: const PetscScalar *b;
1979: PetscInt nz, k, n, i;
1981: if (bb->n > a->solves_work_n) {
1982: PetscCall(PetscFree(a->solves_work));
1983: PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1984: a->solves_work_n = bb->n;
1985: }
1986: n = bb->n;
1987: PetscCall(VecGetArrayRead(bb->v, &b));
1988: PetscCall(VecGetArray(xx->v, &x));
1989: t = a->solves_work;
1991: PetscCall(ISGetIndices(isrow, &rp));
1993: /* solve U^T*D*y = perm(b) by forward substitution */
1994: for (k = 0; k < mbs; k++) {
1995: for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1996: }
1997: for (k = 0; k < mbs; k++) {
1998: v = aa + ai[k];
1999: vj = aj + ai[k];
2000: nz = ai[k + 1] - ai[k];
2001: while (nz--) {
2002: for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
2003: v++;
2004: vj++;
2005: }
2006: for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
2007: }
2009: /* solve U*perm(x) = y by back substitution */
2010: for (k = mbs - 1; k >= 0; k--) {
2011: v = aa + ai[k];
2012: vj = aj + ai[k];
2013: nz = ai[k + 1] - ai[k];
2014: while (nz--) {
2015: for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
2016: v++;
2017: vj++;
2018: }
2019: for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
2020: }
2022: PetscCall(ISRestoreIndices(isrow, &rp));
2023: PetscCall(VecRestoreArrayRead(bb->v, &b));
2024: PetscCall(VecRestoreArray(xx->v, &x));
2025: PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
2026: }
2027: PetscFunctionReturn(PETSC_SUCCESS);
2028: }
2030: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2031: {
2032: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2033: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2034: const MatScalar *aa = a->a, *v;
2035: const PetscScalar *b;
2036: PetscScalar *x, xi;
2037: PetscInt nz, i, j;
2039: PetscFunctionBegin;
2040: PetscCall(VecGetArrayRead(bb, &b));
2041: PetscCall(VecGetArray(xx, &x));
2042: /* solve U^T*D*y = b by forward substitution */
2043: PetscCall(PetscArraycpy(x, b, mbs));
2044: for (i = 0; i < mbs; i++) {
2045: v = aa + ai[i];
2046: vj = aj + ai[i];
2047: xi = x[i];
2048: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2049: for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2050: x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2051: }
2052: /* solve U*x = y by backward substitution */
2053: for (i = mbs - 2; i >= 0; i--) {
2054: xi = x[i];
2055: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2056: vj = aj + adiag[i] - 1;
2057: nz = ai[i + 1] - ai[i] - 1;
2058: for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2059: x[i] = xi;
2060: }
2061: PetscCall(VecRestoreArrayRead(bb, &b));
2062: PetscCall(VecRestoreArray(xx, &x));
2063: PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2064: PetscFunctionReturn(PETSC_SUCCESS);
2065: }
2067: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2068: {
2069: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2070: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2071: const MatScalar *aa = a->a, *v;
2072: const PetscScalar *b;
2073: PetscScalar *x, xi;
2074: PetscInt nz, i, j, neq, ldb, ldx;
2075: PetscBool isdense;
2077: PetscFunctionBegin;
2078: if (!mbs) PetscFunctionReturn(PETSC_SUCCESS);
2079: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
2080: PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
2081: if (X != B) {
2082: PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
2083: PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
2084: }
2085: PetscCall(MatDenseGetArrayRead(B, &b));
2086: PetscCall(MatDenseGetLDA(B, &ldb));
2087: PetscCall(MatDenseGetArray(X, &x));
2088: PetscCall(MatDenseGetLDA(X, &ldx));
2089: for (neq = 0; neq < B->cmap->n; neq++) {
2090: /* solve U^T*D*y = b by forward substitution */
2091: PetscCall(PetscArraycpy(x, b, mbs));
2092: for (i = 0; i < mbs; i++) {
2093: v = aa + ai[i];
2094: vj = aj + ai[i];
2095: xi = x[i];
2096: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2097: for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2098: x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2099: }
2100: /* solve U*x = y by backward substitution */
2101: for (i = mbs - 2; i >= 0; i--) {
2102: xi = x[i];
2103: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2104: vj = aj + adiag[i] - 1;
2105: nz = ai[i + 1] - ai[i] - 1;
2106: for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2107: x[i] = xi;
2108: }
2109: b += ldb;
2110: x += ldx;
2111: }
2112: PetscCall(MatDenseRestoreArrayRead(B, &b));
2113: PetscCall(MatDenseRestoreArray(X, &x));
2114: PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs)));
2115: PetscFunctionReturn(PETSC_SUCCESS);
2116: }
2118: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2119: {
2120: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2121: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2122: const MatScalar *aa = a->a, *v;
2123: PetscScalar *x, xk;
2124: const PetscScalar *b;
2125: PetscInt nz, k;
2127: PetscFunctionBegin;
2128: PetscCall(VecGetArrayRead(bb, &b));
2129: PetscCall(VecGetArray(xx, &x));
2131: /* solve U^T*D*y = b by forward substitution */
2132: PetscCall(PetscArraycpy(x, b, mbs));
2133: for (k = 0; k < mbs; k++) {
2134: v = aa + ai[k] + 1;
2135: vj = aj + ai[k] + 1;
2136: xk = x[k];
2137: nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2138: while (nz--) x[*vj++] += (*v++) * xk;
2139: x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2140: }
2142: /* solve U*x = y by back substitution */
2143: for (k = mbs - 2; k >= 0; k--) {
2144: v = aa + ai[k] + 1;
2145: vj = aj + ai[k] + 1;
2146: xk = x[k];
2147: nz = ai[k + 1] - ai[k] - 1;
2148: while (nz--) xk += (*v++) * x[*vj++];
2149: x[k] = xk;
2150: }
2152: PetscCall(VecRestoreArrayRead(bb, &b));
2153: PetscCall(VecRestoreArray(xx, &x));
2154: PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2155: PetscFunctionReturn(PETSC_SUCCESS);
2156: }
2158: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2159: {
2160: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2161: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2162: const MatScalar *aa = a->a, *v;
2163: PetscReal diagk;
2164: PetscScalar *x;
2165: const PetscScalar *b;
2166: PetscInt nz, k;
2168: PetscFunctionBegin;
2169: /* solve U^T*D^(1/2)*x = b by forward substitution */
2170: PetscCall(VecGetArrayRead(bb, &b));
2171: PetscCall(VecGetArray(xx, &x));
2172: PetscCall(PetscArraycpy(x, b, mbs));
2173: for (k = 0; k < mbs; k++) {
2174: v = aa + ai[k];
2175: vj = aj + ai[k];
2176: nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2177: while (nz--) x[*vj++] += (*v++) * x[k];
2178: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2179: PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]]));
2180: x[k] *= PetscSqrtReal(diagk);
2181: }
2182: PetscCall(VecRestoreArrayRead(bb, &b));
2183: PetscCall(VecRestoreArray(xx, &x));
2184: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2185: PetscFunctionReturn(PETSC_SUCCESS);
2186: }
2188: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2189: {
2190: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2191: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2192: const MatScalar *aa = a->a, *v;
2193: PetscReal diagk;
2194: PetscScalar *x;
2195: const PetscScalar *b;
2196: PetscInt nz, k;
2198: PetscFunctionBegin;
2199: /* solve U^T*D^(1/2)*x = b by forward substitution */
2200: PetscCall(VecGetArrayRead(bb, &b));
2201: PetscCall(VecGetArray(xx, &x));
2202: PetscCall(PetscArraycpy(x, b, mbs));
2203: for (k = 0; k < mbs; k++) {
2204: v = aa + ai[k] + 1;
2205: vj = aj + ai[k] + 1;
2206: nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2207: while (nz--) x[*vj++] += (*v++) * x[k];
2208: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2209: PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]]));
2210: x[k] *= PetscSqrtReal(diagk);
2211: }
2212: PetscCall(VecRestoreArrayRead(bb, &b));
2213: PetscCall(VecRestoreArray(xx, &x));
2214: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2215: PetscFunctionReturn(PETSC_SUCCESS);
2216: }
2218: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2219: {
2220: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2221: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2222: const MatScalar *aa = a->a, *v;
2223: PetscReal diagk;
2224: PetscScalar *x;
2225: const PetscScalar *b;
2226: PetscInt nz, k;
2228: PetscFunctionBegin;
2229: /* solve D^(1/2)*U*x = b by back substitution */
2230: PetscCall(VecGetArrayRead(bb, &b));
2231: PetscCall(VecGetArray(xx, &x));
2233: for (k = mbs - 1; k >= 0; k--) {
2234: v = aa + ai[k];
2235: vj = aj + ai[k];
2236: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2237: PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2238: x[k] = PetscSqrtReal(diagk) * b[k];
2239: nz = ai[k + 1] - ai[k] - 1;
2240: while (nz--) x[k] += (*v++) * x[*vj++];
2241: }
2242: PetscCall(VecRestoreArrayRead(bb, &b));
2243: PetscCall(VecRestoreArray(xx, &x));
2244: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2245: PetscFunctionReturn(PETSC_SUCCESS);
2246: }
2248: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2249: {
2250: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2251: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2252: const MatScalar *aa = a->a, *v;
2253: PetscReal diagk;
2254: PetscScalar *x;
2255: const PetscScalar *b;
2256: PetscInt nz, k;
2258: PetscFunctionBegin;
2259: /* solve D^(1/2)*U*x = b by back substitution */
2260: PetscCall(VecGetArrayRead(bb, &b));
2261: PetscCall(VecGetArray(xx, &x));
2263: for (k = mbs - 1; k >= 0; k--) {
2264: v = aa + ai[k] + 1;
2265: vj = aj + ai[k] + 1;
2266: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2267: PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2268: x[k] = PetscSqrtReal(diagk) * b[k];
2269: nz = ai[k + 1] - ai[k] - 1;
2270: while (nz--) x[k] += (*v++) * x[*vj++];
2271: }
2272: PetscCall(VecRestoreArrayRead(bb, &b));
2273: PetscCall(VecRestoreArray(xx, &x));
2274: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2275: PetscFunctionReturn(PETSC_SUCCESS);
2276: }
2278: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2279: static PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2280: {
2281: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b;
2282: const PetscInt *rip, mbs = a->mbs, *ai, *aj;
2283: PetscInt *jutmp, bs = A->rmap->bs, i;
2284: PetscInt m, reallocs = 0, *levtmp;
2285: PetscInt *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2286: PetscInt incrlev, *lev, shift, prow, nz;
2287: PetscReal f = info->fill, levels = info->levels;
2288: PetscBool perm_identity;
2290: PetscFunctionBegin;
2291: /* check whether perm is the identity mapping */
2292: PetscCall(ISIdentity(perm, &perm_identity));
2294: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2295: a->permute = PETSC_FALSE;
2296: ai = a->i;
2297: aj = a->j;
2299: /* initialization */
2300: PetscCall(ISGetIndices(perm, &rip));
2301: umax = (PetscInt)(f * ai[mbs] + 1);
2302: PetscCall(PetscMalloc1(umax, &lev));
2303: umax += mbs + 1;
2304: shift = mbs + 1;
2305: PetscCall(PetscMalloc1(mbs + 1, &iu));
2306: PetscCall(PetscMalloc1(umax, &ju));
2307: iu[0] = mbs + 1;
2308: juidx = mbs + 1;
2309: /* prowl: linked list for pivot row */
2310: PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp));
2311: /* q: linked list for col index */
2313: for (i = 0; i < mbs; i++) {
2314: prowl[i] = mbs;
2315: q[i] = 0;
2316: }
2318: /* for each row k */
2319: for (k = 0; k < mbs; k++) {
2320: nzk = 0;
2321: q[k] = mbs;
2322: /* copy current row into linked list */
2323: nz = ai[rip[k] + 1] - ai[rip[k]];
2324: j = ai[rip[k]];
2325: while (nz--) {
2326: vj = rip[aj[j++]];
2327: if (vj > k) {
2328: qm = k;
2329: do {
2330: m = qm;
2331: qm = q[m];
2332: } while (qm < vj);
2333: PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
2334: nzk++;
2335: q[m] = vj;
2336: q[vj] = qm;
2337: levtmp[vj] = 0; /* initialize lev for nonzero element */
2338: }
2339: }
2341: /* modify nonzero structure of k-th row by computing fill-in
2342: for each row prow to be merged in */
2343: prow = k;
2344: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2346: while (prow < k) {
2347: /* merge row prow into k-th row */
2348: jmin = iu[prow] + 1;
2349: jmax = iu[prow + 1];
2350: qm = k;
2351: for (j = jmin; j < jmax; j++) {
2352: incrlev = lev[j - shift] + 1;
2353: if (incrlev > levels) continue;
2354: vj = ju[j];
2355: do {
2356: m = qm;
2357: qm = q[m];
2358: } while (qm < vj);
2359: if (qm != vj) { /* a fill */
2360: nzk++;
2361: q[m] = vj;
2362: q[vj] = qm;
2363: qm = vj;
2364: levtmp[vj] = incrlev;
2365: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2366: }
2367: prow = prowl[prow]; /* next pivot row */
2368: }
2370: /* add k to row list for first nonzero element in k-th row */
2371: if (nzk > 1) {
2372: i = q[k]; /* col value of first nonzero element in k_th row of U */
2373: prowl[k] = prowl[i];
2374: prowl[i] = k;
2375: }
2376: iu[k + 1] = iu[k] + nzk;
2378: /* allocate more space to ju and lev if needed */
2379: if (iu[k + 1] > umax) {
2380: /* estimate how much additional space we will need */
2381: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2382: /* just double the memory each time */
2383: maxadd = umax;
2384: if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2385: umax += maxadd;
2387: /* allocate a longer ju */
2388: PetscCall(PetscMalloc1(umax, &jutmp));
2389: PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
2390: PetscCall(PetscFree(ju));
2391: ju = jutmp;
2393: PetscCall(PetscMalloc1(umax, &jutmp));
2394: PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift));
2395: PetscCall(PetscFree(lev));
2396: lev = jutmp;
2397: reallocs += 2; /* count how many times we realloc */
2398: }
2400: /* save nonzero structure of k-th row in ju */
2401: i = k;
2402: while (nzk--) {
2403: i = q[i];
2404: ju[juidx] = i;
2405: lev[juidx - shift] = levtmp[i];
2406: juidx++;
2407: }
2408: }
2410: #if defined(PETSC_USE_INFO)
2411: if (ai[mbs] != 0) {
2412: PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2413: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
2414: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2415: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
2416: PetscCall(PetscInfo(A, "for best performance.\n"));
2417: } else {
2418: PetscCall(PetscInfo(A, "Empty matrix\n"));
2419: }
2420: #endif
2422: PetscCall(ISRestoreIndices(perm, &rip));
2423: PetscCall(PetscFree3(prowl, q, levtmp));
2424: PetscCall(PetscFree(lev));
2426: /* put together the new matrix */
2427: PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL));
2429: b = (Mat_SeqSBAIJ *)B->data;
2430: PetscCall(PetscFree2(b->imax, b->ilen));
2432: /* the next line frees the default space generated by the Create() */
2433: PetscCall(MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i));
2434: PetscCall(PetscShmgetAllocateArray((iu[mbs] + 1) * a->bs2, sizeof(PetscScalar), (void **)&b->a));
2435: b->free_a = PETSC_TRUE;
2436: b->free_ij = PETSC_TRUE;
2437: b->j = ju;
2438: b->i = iu;
2439: b->diag = NULL;
2440: b->ilen = NULL;
2441: b->imax = NULL;
2443: PetscCall(ISDestroy(&b->row));
2444: PetscCall(ISDestroy(&b->icol));
2445: b->row = perm;
2446: b->icol = perm;
2447: PetscCall(PetscObjectReference((PetscObject)perm));
2448: PetscCall(PetscObjectReference((PetscObject)perm));
2449: PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
2450: /* In b structure: Free imax, ilen, old a, old j.
2451: Allocate idnew, solve_work, new a, new j */
2452: b->maxnz = b->nz = iu[mbs];
2454: B->info.factor_mallocs = reallocs;
2455: B->info.fill_ratio_given = f;
2456: if (ai[mbs] != 0) {
2457: B->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2458: } else {
2459: B->info.fill_ratio_needed = 0.0;
2460: }
2461: PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity));
2462: PetscFunctionReturn(PETSC_SUCCESS);
2463: }
2465: /*
2466: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2467: */
2468: #include <petscbt.h>
2469: #include <../src/mat/utils/freespace.h>
2470: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2471: {
2472: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b;
2473: PetscBool perm_identity, free_ij = PETSC_TRUE, missing;
2474: PetscInt bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j;
2475: const PetscInt *rip;
2476: PetscInt reallocs = 0, i, *ui, *udiag, *cols;
2477: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2478: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2479: PetscReal fill = info->fill, levels = info->levels;
2480: PetscFreeSpaceList free_space = NULL, current_space = NULL;
2481: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2482: PetscBT lnkbt;
2484: PetscFunctionBegin;
2485: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2486: PetscCall(MatMissingDiagonal(A, &missing, &d));
2487: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
2488: if (bs > 1) {
2489: PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
2490: PetscFunctionReturn(PETSC_SUCCESS);
2491: }
2493: /* check whether perm is the identity mapping */
2494: PetscCall(ISIdentity(perm, &perm_identity));
2495: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2496: a->permute = PETSC_FALSE;
2498: PetscCall(PetscMalloc1(am + 1, &ui));
2499: PetscCall(PetscMalloc1(am + 1, &udiag));
2500: ui[0] = 0;
2502: /* ICC(0) without matrix ordering: simply rearrange column indices */
2503: if (!levels) {
2504: /* reuse the column pointers and row offsets for memory savings */
2505: for (i = 0; i < am; i++) {
2506: ncols = ai[i + 1] - ai[i];
2507: ui[i + 1] = ui[i] + ncols;
2508: udiag[i] = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2509: }
2510: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2511: cols = uj;
2512: for (i = 0; i < am; i++) {
2513: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2514: ncols = ai[i + 1] - ai[i] - 1;
2515: for (j = 0; j < ncols; j++) *cols++ = aj[j];
2516: *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2517: }
2518: } else { /* case: levels>0 */
2519: PetscCall(ISGetIndices(perm, &rip));
2521: /* initialization */
2522: /* jl: linked list for storing indices of the pivot rows
2523: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2524: PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2525: for (i = 0; i < am; i++) {
2526: jl[i] = am;
2527: il[i] = 0;
2528: }
2530: /* create and initialize a linked list for storing column indices of the active row k */
2531: nlnk = am + 1;
2532: PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2534: /* initial FreeSpace size is fill*(ai[am]+1) */
2535: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2537: current_space = free_space;
2539: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2541: current_space_lvl = free_space_lvl;
2543: for (k = 0; k < am; k++) { /* for each active row k */
2544: /* initialize lnk by the column indices of row k */
2545: nzk = 0;
2546: ncols = ai[k + 1] - ai[k];
2547: PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
2548: cols = aj + ai[k];
2549: PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2550: nzk += nlnk;
2552: /* update lnk by computing fill-in for each pivot row to be merged in */
2553: prow = jl[k]; /* 1st pivot row */
2555: while (prow < k) {
2556: nextprow = jl[prow];
2558: /* merge prow into k-th row */
2559: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2560: jmax = ui[prow + 1];
2561: ncols = jmax - jmin;
2562: i = jmin - ui[prow];
2564: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2565: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2566: j = *(uj - 1);
2567: PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2568: nzk += nlnk;
2570: /* update il and jl for prow */
2571: if (jmin < jmax) {
2572: il[prow] = jmin;
2574: j = *cols;
2575: jl[prow] = jl[j];
2576: jl[j] = prow;
2577: }
2578: prow = nextprow;
2579: }
2581: /* if free space is not available, make more free space */
2582: if (current_space->local_remaining < nzk) {
2583: i = am - k + 1; /* num of unfactored rows */
2584: i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2585: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
2586: PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
2587: reallocs++;
2588: }
2590: /* copy data into free_space and free_space_lvl, then initialize lnk */
2591: PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
2592: PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2594: /* add the k-th row into il and jl */
2595: if (nzk > 1) {
2596: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2597: jl[k] = jl[i];
2598: jl[i] = k;
2599: il[k] = ui[k] + 1;
2600: }
2601: uj_ptr[k] = current_space->array;
2602: uj_lvl_ptr[k] = current_space_lvl->array;
2604: current_space->array += nzk;
2605: current_space->local_used += nzk;
2606: current_space->local_remaining -= nzk;
2607: current_space_lvl->array += nzk;
2608: current_space_lvl->local_used += nzk;
2609: current_space_lvl->local_remaining -= nzk;
2611: ui[k + 1] = ui[k] + nzk;
2612: }
2614: PetscCall(ISRestoreIndices(perm, &rip));
2615: PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2617: /* destroy list of free space and other temporary array(s) */
2618: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2619: PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor */
2620: PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2621: PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2623: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2625: /* put together the new matrix in MATSEQSBAIJ format */
2626: PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2628: b = (Mat_SeqSBAIJ *)fact->data;
2629: PetscCall(PetscFree2(b->imax, b->ilen));
2630: PetscCall(PetscShmgetAllocateArray(ui[am] + 1, sizeof(PetscScalar), (void **)&b->a));
2631: b->free_a = PETSC_TRUE;
2632: b->free_ij = free_ij;
2633: b->j = uj;
2634: b->i = ui;
2635: b->diag = udiag;
2636: b->free_diag = PETSC_TRUE;
2637: b->ilen = NULL;
2638: b->imax = NULL;
2639: b->row = perm;
2640: b->col = perm;
2642: PetscCall(PetscObjectReference((PetscObject)perm));
2643: PetscCall(PetscObjectReference((PetscObject)perm));
2645: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2647: PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2649: b->maxnz = b->nz = ui[am];
2651: fact->info.factor_mallocs = reallocs;
2652: fact->info.fill_ratio_given = fill;
2653: if (ai[am] != 0) {
2654: fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2655: } else {
2656: fact->info.fill_ratio_needed = 0.0;
2657: }
2658: #if defined(PETSC_USE_INFO)
2659: if (ai[am] != 0) {
2660: PetscReal af = fact->info.fill_ratio_needed;
2661: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2662: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2663: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2664: } else {
2665: PetscCall(PetscInfo(A, "Empty matrix\n"));
2666: }
2667: #endif
2668: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2669: PetscFunctionReturn(PETSC_SUCCESS);
2670: }
2672: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2673: {
2674: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2675: Mat_SeqSBAIJ *b;
2676: PetscBool perm_identity, free_ij = PETSC_TRUE;
2677: PetscInt bs = A->rmap->bs, am = a->mbs;
2678: const PetscInt *cols, *rip, *ai = a->i, *aj = a->j;
2679: PetscInt reallocs = 0, i, *ui;
2680: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2681: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2682: PetscReal fill = info->fill, levels = info->levels, ratio_needed;
2683: PetscFreeSpaceList free_space = NULL, current_space = NULL;
2684: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2685: PetscBT lnkbt;
2687: PetscFunctionBegin;
2688: /*
2689: This code originally uses Modified Sparse Row (MSR) storage
2690: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2691: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2692: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2693: thus the original code in MSR format is still used for these cases.
2694: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2695: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2696: */
2697: if (bs > 1) {
2698: PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
2699: PetscFunctionReturn(PETSC_SUCCESS);
2700: }
2702: /* check whether perm is the identity mapping */
2703: PetscCall(ISIdentity(perm, &perm_identity));
2704: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2705: a->permute = PETSC_FALSE;
2707: /* special case that simply copies fill pattern */
2708: if (!levels) {
2709: /* reuse the column pointers and row offsets for memory savings */
2710: ui = a->i;
2711: uj = a->j;
2712: free_ij = PETSC_FALSE;
2713: ratio_needed = 1.0;
2714: } else { /* case: levels>0 */
2715: PetscCall(ISGetIndices(perm, &rip));
2717: /* initialization */
2718: PetscCall(PetscMalloc1(am + 1, &ui));
2719: ui[0] = 0;
2721: /* jl: linked list for storing indices of the pivot rows
2722: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2723: PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2724: for (i = 0; i < am; i++) {
2725: jl[i] = am;
2726: il[i] = 0;
2727: }
2729: /* create and initialize a linked list for storing column indices of the active row k */
2730: nlnk = am + 1;
2731: PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2733: /* initial FreeSpace size is fill*(ai[am]+1) */
2734: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2736: current_space = free_space;
2738: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2740: current_space_lvl = free_space_lvl;
2742: for (k = 0; k < am; k++) { /* for each active row k */
2743: /* initialize lnk by the column indices of row rip[k] */
2744: nzk = 0;
2745: ncols = ai[rip[k] + 1] - ai[rip[k]];
2746: cols = aj + ai[rip[k]];
2747: PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2748: nzk += nlnk;
2750: /* update lnk by computing fill-in for each pivot row to be merged in */
2751: prow = jl[k]; /* 1st pivot row */
2753: while (prow < k) {
2754: nextprow = jl[prow];
2756: /* merge prow into k-th row */
2757: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2758: jmax = ui[prow + 1];
2759: ncols = jmax - jmin;
2760: i = jmin - ui[prow];
2761: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2762: j = *(uj_lvl_ptr[prow] + i - 1);
2763: cols_lvl = uj_lvl_ptr[prow] + i;
2764: PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2765: nzk += nlnk;
2767: /* update il and jl for prow */
2768: if (jmin < jmax) {
2769: il[prow] = jmin;
2770: j = *cols;
2771: jl[prow] = jl[j];
2772: jl[j] = prow;
2773: }
2774: prow = nextprow;
2775: }
2777: /* if free space is not available, make more free space */
2778: if (current_space->local_remaining < nzk) {
2779: i = am - k + 1; /* num of unfactored rows */
2780: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2781: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
2782: PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
2783: reallocs++;
2784: }
2786: /* copy data into free_space and free_space_lvl, then initialize lnk */
2787: PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2789: /* add the k-th row into il and jl */
2790: if (nzk - 1 > 0) {
2791: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2792: jl[k] = jl[i];
2793: jl[i] = k;
2794: il[k] = ui[k] + 1;
2795: }
2796: uj_ptr[k] = current_space->array;
2797: uj_lvl_ptr[k] = current_space_lvl->array;
2799: current_space->array += nzk;
2800: current_space->local_used += nzk;
2801: current_space->local_remaining -= nzk;
2802: current_space_lvl->array += nzk;
2803: current_space_lvl->local_used += nzk;
2804: current_space_lvl->local_remaining -= nzk;
2806: ui[k + 1] = ui[k] + nzk;
2807: }
2809: PetscCall(ISRestoreIndices(perm, &rip));
2810: PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2812: /* destroy list of free space and other temporary array(s) */
2813: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2814: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
2815: PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2816: PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2817: if (ai[am] != 0) {
2818: ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2819: } else {
2820: ratio_needed = 0.0;
2821: }
2822: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2824: /* put together the new matrix in MATSEQSBAIJ format */
2825: PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2827: b = (Mat_SeqSBAIJ *)fact->data;
2828: PetscCall(PetscFree2(b->imax, b->ilen));
2829: PetscCall(PetscShmgetAllocateArray(ui[am] + 1, sizeof(PetscScalar), (void **)&b->a));
2830: b->free_a = PETSC_TRUE;
2831: b->free_ij = free_ij;
2832: b->j = uj;
2833: b->i = ui;
2834: b->diag = NULL;
2835: b->ilen = NULL;
2836: b->imax = NULL;
2837: b->row = perm;
2838: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2840: PetscCall(PetscObjectReference((PetscObject)perm));
2841: b->icol = perm;
2842: PetscCall(PetscObjectReference((PetscObject)perm));
2843: PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2845: b->maxnz = b->nz = ui[am];
2847: fact->info.factor_mallocs = reallocs;
2848: fact->info.fill_ratio_given = fill;
2849: fact->info.fill_ratio_needed = ratio_needed;
2850: #if defined(PETSC_USE_INFO)
2851: if (ai[am] != 0) {
2852: PetscReal af = fact->info.fill_ratio_needed;
2853: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2854: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2855: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2856: } else {
2857: PetscCall(PetscInfo(A, "Empty matrix\n"));
2858: }
2859: #endif
2860: if (perm_identity) {
2861: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2862: } else {
2863: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2864: }
2865: PetscFunctionReturn(PETSC_SUCCESS);
2866: }