Actual source code: inode.c
1: /*
2: This file provides high performance routines for the Inode format (compressed sparse row)
3: by taking advantage of rows with identical nonzero structure (I-nodes).
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #if defined(PETSC_HAVE_XMMINTRIN_H)
7: #include <xmmintrin.h>
8: #endif
10: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13: PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
15: PetscFunctionBegin;
16: n = A->cmap->n;
17: m = A->rmap->n;
18: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19: ns_row = a->inode.size_csr;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24: for (; count + 1 < n; count++, i++);
25: if (count < n) i++;
26: *size = i;
27: PetscFunctionReturn(PETSC_SUCCESS);
28: }
29: PetscCall(PetscMalloc1(n + 1, &ns_col));
30: ns_col[0] = 0;
32: /* Use the same row structure wherever feasible. */
33: for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
35: /* if m < n; pad up the remainder with inode_limit */
36: for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37: /* The last node is the odd ball. pad it up with the remaining rows; */
38: if (count < n) {
39: ns_col[i + 1] = ns_col[i] + (n - count);
40: i++;
41: } else if (count > n) {
42: /* Adjust for the over estimation */
43: ns_col[i] += n - count;
44: }
45: *size = i;
46: *ns = ns_col;
47: PetscFunctionReturn(PETSC_SUCCESS);
48: }
50: /*
51: This builds symmetric version of nonzero structure,
52: */
53: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54: {
55: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
56: PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57: PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58: const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
60: PetscFunctionBegin;
61: nslim_row = a->inode.node_count;
62: m = A->rmap->n;
63: n = A->cmap->n;
64: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
67: /* Use the row_inode as column_inode */
68: nslim_col = nslim_row;
69: ns_col = ns_row;
71: /* allocate space for reformatted inode structure */
72: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
75: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76: nsz = ns_col[i1 + 1] - ns_col[i1];
77: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78: }
79: /* allocate space for row pointers */
80: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81: *iia = ia;
82: PetscCall(PetscMalloc1(nslim_row + 1, &work));
84: /* determine the number of columns in each row */
85: ia[0] = oshift;
86: for (i1 = 0; i1 < nslim_row; i1++) {
87: row = ns_row[i1];
88: j = aj + ai[row] + ishift;
89: jmax = aj + ai[row + 1] + ishift;
90: if (j == jmax) continue; /* empty row */
91: col = *j++ + ishift;
92: i2 = tvc[col];
93: while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94: ia[i1 + 1]++;
95: ia[i2 + 1]++;
96: i2++; /* Start col of next node */
97: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98: i2 = tvc[col];
99: }
100: if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101: }
103: /* shift ia[i] to point to next row */
104: for (i1 = 1; i1 < nslim_row + 1; i1++) {
105: row = ia[i1 - 1];
106: ia[i1] += row;
107: work[i1 - 1] = row - oshift;
108: }
110: /* allocate space for column pointers */
111: nz = ia[nslim_row] + (!ishift);
112: PetscCall(PetscMalloc1(nz, &ja));
113: *jja = ja;
115: /* loop over lower triangular part putting into ja */
116: for (i1 = 0; i1 < nslim_row; i1++) {
117: row = ns_row[i1];
118: j = aj + ai[row] + ishift;
119: jmax = aj + ai[row + 1] + ishift;
120: if (j == jmax) continue; /* empty row */
121: col = *j++ + ishift;
122: i2 = tvc[col];
123: while (i2 < i1 && j < jmax) {
124: ja[work[i2]++] = i1 + oshift;
125: ja[work[i1]++] = i2 + oshift;
126: ++i2;
127: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128: i2 = tvc[col];
129: }
130: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131: }
132: PetscCall(PetscFree(work));
133: PetscCall(PetscFree2(tns, tvc));
134: PetscFunctionReturn(PETSC_SUCCESS);
135: }
137: /*
138: This builds nonsymmetric version of nonzero structure,
139: */
140: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141: {
142: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
143: PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144: PetscInt *tns, *tvc, nsz, i1, i2;
145: const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
147: PetscFunctionBegin;
148: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149: nslim_row = a->inode.node_count;
150: n = A->cmap->n;
152: /* Create The column_inode for this matrix */
153: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
155: /* allocate space for reformatted column_inode structure */
156: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
159: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160: nsz = ns_col[i1 + 1] - ns_col[i1];
161: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162: }
163: /* allocate space for row pointers */
164: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165: *iia = ia;
166: PetscCall(PetscMalloc1(nslim_row + 1, &work));
168: /* determine the number of columns in each row */
169: ia[0] = oshift;
170: for (i1 = 0; i1 < nslim_row; i1++) {
171: row = ns_row[i1];
172: j = aj + ai[row] + ishift;
173: nz = ai[row + 1] - ai[row];
174: if (!nz) continue; /* empty row */
175: col = *j++ + ishift;
176: i2 = tvc[col];
177: while (nz-- > 0) { /* off-diagonal elements */
178: ia[i1 + 1]++;
179: i2++; /* Start col of next node */
180: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181: if (nz > 0) i2 = tvc[col];
182: }
183: }
185: /* shift ia[i] to point to next row */
186: for (i1 = 1; i1 < nslim_row + 1; i1++) {
187: row = ia[i1 - 1];
188: ia[i1] += row;
189: work[i1 - 1] = row - oshift;
190: }
192: /* allocate space for column pointers */
193: nz = ia[nslim_row] + (!ishift);
194: PetscCall(PetscMalloc1(nz, &ja));
195: *jja = ja;
197: /* loop over matrix putting into ja */
198: for (i1 = 0; i1 < nslim_row; i1++) {
199: row = ns_row[i1];
200: j = aj + ai[row] + ishift;
201: nz = ai[row + 1] - ai[row];
202: if (!nz) continue; /* empty row */
203: col = *j++ + ishift;
204: i2 = tvc[col];
205: while (nz-- > 0) {
206: ja[work[i1]++] = i2 + oshift;
207: ++i2;
208: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209: if (nz > 0) i2 = tvc[col];
210: }
211: }
212: PetscCall(PetscFree(ns_col));
213: PetscCall(PetscFree(work));
214: PetscCall(PetscFree2(tns, tvc));
215: PetscFunctionReturn(PETSC_SUCCESS);
216: }
218: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219: {
220: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
222: PetscFunctionBegin;
223: if (n) *n = a->inode.node_count;
224: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225: if (!blockcompressed) {
226: PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227: } else if (symmetric) {
228: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229: } else {
230: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231: }
232: PetscFunctionReturn(PETSC_SUCCESS);
233: }
235: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236: {
237: PetscFunctionBegin;
238: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
240: if (!blockcompressed) {
241: PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242: } else {
243: PetscCall(PetscFree(*ia));
244: PetscCall(PetscFree(*ja));
245: }
246: PetscFunctionReturn(PETSC_SUCCESS);
247: }
249: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250: {
251: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252: PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253: PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
255: PetscFunctionBegin;
256: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257: nslim_row = a->inode.node_count;
258: n = A->cmap->n;
260: /* Create The column_inode for this matrix */
261: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
263: /* allocate space for reformatted column_inode structure */
264: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
267: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268: nsz = ns_col[i1 + 1] - ns_col[i1];
269: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270: }
271: /* allocate space for column pointers */
272: PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273: *iia = ia;
274: PetscCall(PetscMalloc1(nslim_col + 1, &work));
276: /* determine the number of columns in each row */
277: ia[0] = oshift;
278: for (i1 = 0; i1 < nslim_row; i1++) {
279: row = ns_row[i1];
280: j = aj + ai[row] + ishift;
281: col = *j++ + ishift;
282: i2 = tvc[col];
283: nz = ai[row + 1] - ai[row];
284: while (nz-- > 0) { /* off-diagonal elements */
285: /* ia[i1+1]++; */
286: ia[i2 + 1]++;
287: i2++;
288: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289: if (nz > 0) i2 = tvc[col];
290: }
291: }
293: /* shift ia[i] to point to next col */
294: for (i1 = 1; i1 < nslim_col + 1; i1++) {
295: col = ia[i1 - 1];
296: ia[i1] += col;
297: work[i1 - 1] = col - oshift;
298: }
300: /* allocate space for column pointers */
301: nz = ia[nslim_col] + (!ishift);
302: PetscCall(PetscMalloc1(nz, &ja));
303: *jja = ja;
305: /* loop over matrix putting into ja */
306: for (i1 = 0; i1 < nslim_row; i1++) {
307: row = ns_row[i1];
308: j = aj + ai[row] + ishift;
309: col = *j++ + ishift;
310: i2 = tvc[col];
311: nz = ai[row + 1] - ai[row];
312: while (nz-- > 0) {
313: /* ja[work[i1]++] = i2 + oshift; */
314: ja[work[i2]++] = i1 + oshift;
315: i2++;
316: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317: if (nz > 0) i2 = tvc[col];
318: }
319: }
320: PetscCall(PetscFree(ns_col));
321: PetscCall(PetscFree(work));
322: PetscCall(PetscFree2(tns, tvc));
323: PetscFunctionReturn(PETSC_SUCCESS);
324: }
326: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327: {
328: PetscFunctionBegin;
329: PetscCall(MatCreateColInode_Private(A, n, NULL));
330: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
332: if (!blockcompressed) {
333: PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334: } else if (symmetric) {
335: /* Since the indices are symmetric it doesn't matter */
336: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337: } else {
338: PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339: }
340: PetscFunctionReturn(PETSC_SUCCESS);
341: }
343: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344: {
345: PetscFunctionBegin;
346: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347: if (!blockcompressed) {
348: PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349: } else {
350: PetscCall(PetscFree(*ia));
351: PetscCall(PetscFree(*ja));
352: }
353: PetscFunctionReturn(PETSC_SUCCESS);
354: }
356: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357: {
358: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
359: PetscScalar *y;
360: const PetscScalar *x;
361: PetscInt row, node_max, nonzerorow = 0;
362: PetscInt *ns;
364: PetscFunctionBegin;
365: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366: node_max = a->inode.node_count;
367: ns = a->inode.size_csr; /* Node Size array */
368: PetscCall(VecGetArrayRead(xx, &x));
369: PetscCall(VecGetArray(yy, &y));
371: PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372: for (PetscInt i = 0; i < node_max; ++i) {
373: PetscInt i1, i2, nsz, n, sz;
374: const MatScalar *v1, *v2, *v3, *v4, *v5;
375: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376: const PetscInt *idx;
378: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379: #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380: #endif
381: row = ns[i];
382: nsz = ns[i + 1] - ns[i];
383: n = a->i[row + 1] - a->i[row];
384: nonzerorow += (n > 0) * nsz;
386: idx = &a->j[a->i[row]];
387: v1 = &a->a[a->i[row]];
388: PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
389: PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
390: sz = n; /* No of non zeros in this row */
391: /* Switch on the size of Node */
392: switch (nsz) { /* Each loop in 'case' is unrolled */
393: case 1:
394: sum1 = 0.;
396: for (n = 0; n < sz - 1; n += 2) {
397: i1 = idx[0]; /* The instructions are ordered to */
398: i2 = idx[1]; /* make the compiler's job easy */
399: idx += 2;
400: tmp0 = x[i1];
401: tmp1 = x[i2];
402: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403: v1 += 2;
404: }
406: if (n == sz - 1) { /* Take care of the last nonzero */
407: tmp0 = x[*idx++];
408: sum1 += *v1++ * tmp0;
409: }
410: y[row++] = sum1;
411: break;
412: case 2:
413: sum1 = 0.;
414: sum2 = 0.;
415: v2 = v1 + n;
417: for (n = 0; n < sz - 1; n += 2) {
418: i1 = idx[0];
419: i2 = idx[1];
420: idx += 2;
421: tmp0 = x[i1];
422: tmp1 = x[i2];
423: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424: v1 += 2;
425: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426: v2 += 2;
427: }
428: if (n == sz - 1) {
429: tmp0 = x[*idx++];
430: sum1 += *v1++ * tmp0;
431: sum2 += *v2++ * tmp0;
432: }
433: y[row++] = sum1;
434: y[row++] = sum2;
435: v1 = v2; /* Since the next block to be processed starts there*/
436: idx += sz;
437: break;
438: case 3:
439: sum1 = 0.;
440: sum2 = 0.;
441: sum3 = 0.;
442: v2 = v1 + n;
443: v3 = v2 + n;
445: for (n = 0; n < sz - 1; n += 2) {
446: i1 = idx[0];
447: i2 = idx[1];
448: idx += 2;
449: tmp0 = x[i1];
450: tmp1 = x[i2];
451: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452: v1 += 2;
453: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454: v2 += 2;
455: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456: v3 += 2;
457: }
458: if (n == sz - 1) {
459: tmp0 = x[*idx++];
460: sum1 += *v1++ * tmp0;
461: sum2 += *v2++ * tmp0;
462: sum3 += *v3++ * tmp0;
463: }
464: y[row++] = sum1;
465: y[row++] = sum2;
466: y[row++] = sum3;
467: v1 = v3; /* Since the next block to be processed starts there*/
468: idx += 2 * sz;
469: break;
470: case 4:
471: sum1 = 0.;
472: sum2 = 0.;
473: sum3 = 0.;
474: sum4 = 0.;
475: v2 = v1 + n;
476: v3 = v2 + n;
477: v4 = v3 + n;
479: for (n = 0; n < sz - 1; n += 2) {
480: i1 = idx[0];
481: i2 = idx[1];
482: idx += 2;
483: tmp0 = x[i1];
484: tmp1 = x[i2];
485: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486: v1 += 2;
487: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488: v2 += 2;
489: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490: v3 += 2;
491: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492: v4 += 2;
493: }
494: if (n == sz - 1) {
495: tmp0 = x[*idx++];
496: sum1 += *v1++ * tmp0;
497: sum2 += *v2++ * tmp0;
498: sum3 += *v3++ * tmp0;
499: sum4 += *v4++ * tmp0;
500: }
501: y[row++] = sum1;
502: y[row++] = sum2;
503: y[row++] = sum3;
504: y[row++] = sum4;
505: v1 = v4; /* Since the next block to be processed starts there*/
506: idx += 3 * sz;
507: break;
508: case 5:
509: sum1 = 0.;
510: sum2 = 0.;
511: sum3 = 0.;
512: sum4 = 0.;
513: sum5 = 0.;
514: v2 = v1 + n;
515: v3 = v2 + n;
516: v4 = v3 + n;
517: v5 = v4 + n;
519: for (n = 0; n < sz - 1; n += 2) {
520: i1 = idx[0];
521: i2 = idx[1];
522: idx += 2;
523: tmp0 = x[i1];
524: tmp1 = x[i2];
525: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526: v1 += 2;
527: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528: v2 += 2;
529: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530: v3 += 2;
531: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532: v4 += 2;
533: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534: v5 += 2;
535: }
536: if (n == sz - 1) {
537: tmp0 = x[*idx++];
538: sum1 += *v1++ * tmp0;
539: sum2 += *v2++ * tmp0;
540: sum3 += *v3++ * tmp0;
541: sum4 += *v4++ * tmp0;
542: sum5 += *v5++ * tmp0;
543: }
544: y[row++] = sum1;
545: y[row++] = sum2;
546: y[row++] = sum3;
547: y[row++] = sum4;
548: y[row++] = sum5;
549: v1 = v5; /* Since the next block to be processed starts there */
550: idx += 4 * sz;
551: break;
552: default:
553: SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554: }
555: }
556: PetscCall(VecRestoreArrayRead(xx, &x));
557: PetscCall(VecRestoreArray(yy, &y));
558: PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559: PetscFunctionReturn(PETSC_SUCCESS);
560: }
562: /* Almost same code as the MatMult_SeqAIJ_Inode() */
563: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564: {
565: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
566: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567: const MatScalar *v1, *v2, *v3, *v4, *v5;
568: const PetscScalar *x;
569: PetscScalar *y, *z, *zt;
570: PetscInt i1, i2, n, i, row, node_max, nsz, sz;
571: const PetscInt *idx, *ns, *ii;
573: PetscFunctionBegin;
574: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575: node_max = a->inode.node_count;
576: ns = a->inode.size_csr; /* Node Size array */
578: PetscCall(VecGetArrayRead(xx, &x));
579: PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580: zt = z;
582: idx = a->j;
583: v1 = a->a;
584: ii = a->i;
586: for (i = 0; i < node_max; ++i) {
587: row = ns[i];
588: nsz = ns[i + 1] - ns[i];
589: n = ii[1] - ii[0];
590: ii += nsz;
591: sz = n; /* No of non zeros in this row */
592: /* Switch on the size of Node */
593: switch (nsz) { /* Each loop in 'case' is unrolled */
594: case 1:
595: sum1 = *zt++;
597: for (n = 0; n < sz - 1; n += 2) {
598: i1 = idx[0]; /* The instructions are ordered to */
599: i2 = idx[1]; /* make the compiler's job easy */
600: idx += 2;
601: tmp0 = x[i1];
602: tmp1 = x[i2];
603: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604: v1 += 2;
605: }
607: if (n == sz - 1) { /* Take care of the last nonzero */
608: tmp0 = x[*idx++];
609: sum1 += *v1++ * tmp0;
610: }
611: y[row++] = sum1;
612: break;
613: case 2:
614: sum1 = *zt++;
615: sum2 = *zt++;
616: v2 = v1 + n;
618: for (n = 0; n < sz - 1; n += 2) {
619: i1 = idx[0];
620: i2 = idx[1];
621: idx += 2;
622: tmp0 = x[i1];
623: tmp1 = x[i2];
624: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625: v1 += 2;
626: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627: v2 += 2;
628: }
629: if (n == sz - 1) {
630: tmp0 = x[*idx++];
631: sum1 += *v1++ * tmp0;
632: sum2 += *v2++ * tmp0;
633: }
634: y[row++] = sum1;
635: y[row++] = sum2;
636: v1 = v2; /* Since the next block to be processed starts there*/
637: idx += sz;
638: break;
639: case 3:
640: sum1 = *zt++;
641: sum2 = *zt++;
642: sum3 = *zt++;
643: v2 = v1 + n;
644: v3 = v2 + n;
646: for (n = 0; n < sz - 1; n += 2) {
647: i1 = idx[0];
648: i2 = idx[1];
649: idx += 2;
650: tmp0 = x[i1];
651: tmp1 = x[i2];
652: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653: v1 += 2;
654: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655: v2 += 2;
656: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657: v3 += 2;
658: }
659: if (n == sz - 1) {
660: tmp0 = x[*idx++];
661: sum1 += *v1++ * tmp0;
662: sum2 += *v2++ * tmp0;
663: sum3 += *v3++ * tmp0;
664: }
665: y[row++] = sum1;
666: y[row++] = sum2;
667: y[row++] = sum3;
668: v1 = v3; /* Since the next block to be processed starts there*/
669: idx += 2 * sz;
670: break;
671: case 4:
672: sum1 = *zt++;
673: sum2 = *zt++;
674: sum3 = *zt++;
675: sum4 = *zt++;
676: v2 = v1 + n;
677: v3 = v2 + n;
678: v4 = v3 + n;
680: for (n = 0; n < sz - 1; n += 2) {
681: i1 = idx[0];
682: i2 = idx[1];
683: idx += 2;
684: tmp0 = x[i1];
685: tmp1 = x[i2];
686: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687: v1 += 2;
688: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689: v2 += 2;
690: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691: v3 += 2;
692: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693: v4 += 2;
694: }
695: if (n == sz - 1) {
696: tmp0 = x[*idx++];
697: sum1 += *v1++ * tmp0;
698: sum2 += *v2++ * tmp0;
699: sum3 += *v3++ * tmp0;
700: sum4 += *v4++ * tmp0;
701: }
702: y[row++] = sum1;
703: y[row++] = sum2;
704: y[row++] = sum3;
705: y[row++] = sum4;
706: v1 = v4; /* Since the next block to be processed starts there*/
707: idx += 3 * sz;
708: break;
709: case 5:
710: sum1 = *zt++;
711: sum2 = *zt++;
712: sum3 = *zt++;
713: sum4 = *zt++;
714: sum5 = *zt++;
715: v2 = v1 + n;
716: v3 = v2 + n;
717: v4 = v3 + n;
718: v5 = v4 + n;
720: for (n = 0; n < sz - 1; n += 2) {
721: i1 = idx[0];
722: i2 = idx[1];
723: idx += 2;
724: tmp0 = x[i1];
725: tmp1 = x[i2];
726: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727: v1 += 2;
728: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729: v2 += 2;
730: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731: v3 += 2;
732: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733: v4 += 2;
734: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735: v5 += 2;
736: }
737: if (n == sz - 1) {
738: tmp0 = x[*idx++];
739: sum1 += *v1++ * tmp0;
740: sum2 += *v2++ * tmp0;
741: sum3 += *v3++ * tmp0;
742: sum4 += *v4++ * tmp0;
743: sum5 += *v5++ * tmp0;
744: }
745: y[row++] = sum1;
746: y[row++] = sum2;
747: y[row++] = sum3;
748: y[row++] = sum4;
749: y[row++] = sum5;
750: v1 = v5; /* Since the next block to be processed starts there */
751: idx += 4 * sz;
752: break;
753: default:
754: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755: }
756: }
757: PetscCall(VecRestoreArrayRead(xx, &x));
758: PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759: PetscCall(PetscLogFlops(2.0 * a->nz));
760: PetscFunctionReturn(PETSC_SUCCESS);
761: }
763: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764: {
765: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
766: IS iscol = a->col, isrow = a->row;
767: const PetscInt *r, *c, *rout, *cout;
768: PetscInt i, j, n = A->rmap->n, nz;
769: PetscInt node_max, *ns, row, nsz, aii, i0, i1;
770: const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
772: PetscScalar sum1, sum2, sum3, sum4, sum5;
773: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774: const PetscScalar *b;
776: PetscFunctionBegin;
777: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778: node_max = a->inode.node_count;
779: ns = a->inode.size_csr; /* Node Size array */
781: PetscCall(VecGetArrayRead(bb, &b));
782: PetscCall(VecGetArrayWrite(xx, &x));
783: tmp = a->solve_work;
785: PetscCall(ISGetIndices(isrow, &rout));
786: r = rout;
787: PetscCall(ISGetIndices(iscol, &cout));
788: c = cout + (n - 1);
790: /* forward solve the lower triangular */
791: tmps = tmp;
792: aa = a_a;
793: aj = a_j;
794: ad = a->diag;
796: for (i = 0, row = 0; i < node_max; ++i) {
797: row = ns[i];
798: nsz = ns[i + 1] - ns[i];
799: aii = ai[row];
800: v1 = aa + aii;
801: vi = aj + aii;
802: nz = ad[row] - aii;
803: if (i < node_max - 1) {
804: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805: * but our indexing to determine its size could. */
806: PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808: PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810: }
812: switch (nsz) { /* Each loop in 'case' is unrolled */
813: case 1:
814: sum1 = b[*r++];
815: for (j = 0; j < nz - 1; j += 2) {
816: i0 = vi[0];
817: i1 = vi[1];
818: vi += 2;
819: tmp0 = tmps[i0];
820: tmp1 = tmps[i1];
821: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822: v1 += 2;
823: }
824: if (j == nz - 1) {
825: tmp0 = tmps[*vi++];
826: sum1 -= *v1++ * tmp0;
827: }
828: tmp[row++] = sum1;
829: break;
830: case 2:
831: sum1 = b[*r++];
832: sum2 = b[*r++];
833: v2 = aa + ai[row + 1];
835: for (j = 0; j < nz - 1; j += 2) {
836: i0 = vi[0];
837: i1 = vi[1];
838: vi += 2;
839: tmp0 = tmps[i0];
840: tmp1 = tmps[i1];
841: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842: v1 += 2;
843: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844: v2 += 2;
845: }
846: if (j == nz - 1) {
847: tmp0 = tmps[*vi++];
848: sum1 -= *v1++ * tmp0;
849: sum2 -= *v2++ * tmp0;
850: }
851: sum2 -= *v2++ * sum1;
852: tmp[row++] = sum1;
853: tmp[row++] = sum2;
854: break;
855: case 3:
856: sum1 = b[*r++];
857: sum2 = b[*r++];
858: sum3 = b[*r++];
859: v2 = aa + ai[row + 1];
860: v3 = aa + ai[row + 2];
862: for (j = 0; j < nz - 1; j += 2) {
863: i0 = vi[0];
864: i1 = vi[1];
865: vi += 2;
866: tmp0 = tmps[i0];
867: tmp1 = tmps[i1];
868: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869: v1 += 2;
870: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871: v2 += 2;
872: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873: v3 += 2;
874: }
875: if (j == nz - 1) {
876: tmp0 = tmps[*vi++];
877: sum1 -= *v1++ * tmp0;
878: sum2 -= *v2++ * tmp0;
879: sum3 -= *v3++ * tmp0;
880: }
881: sum2 -= *v2++ * sum1;
882: sum3 -= *v3++ * sum1;
883: sum3 -= *v3++ * sum2;
885: tmp[row++] = sum1;
886: tmp[row++] = sum2;
887: tmp[row++] = sum3;
888: break;
890: case 4:
891: sum1 = b[*r++];
892: sum2 = b[*r++];
893: sum3 = b[*r++];
894: sum4 = b[*r++];
895: v2 = aa + ai[row + 1];
896: v3 = aa + ai[row + 2];
897: v4 = aa + ai[row + 3];
899: for (j = 0; j < nz - 1; j += 2) {
900: i0 = vi[0];
901: i1 = vi[1];
902: vi += 2;
903: tmp0 = tmps[i0];
904: tmp1 = tmps[i1];
905: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906: v1 += 2;
907: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908: v2 += 2;
909: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910: v3 += 2;
911: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912: v4 += 2;
913: }
914: if (j == nz - 1) {
915: tmp0 = tmps[*vi++];
916: sum1 -= *v1++ * tmp0;
917: sum2 -= *v2++ * tmp0;
918: sum3 -= *v3++ * tmp0;
919: sum4 -= *v4++ * tmp0;
920: }
921: sum2 -= *v2++ * sum1;
922: sum3 -= *v3++ * sum1;
923: sum4 -= *v4++ * sum1;
924: sum3 -= *v3++ * sum2;
925: sum4 -= *v4++ * sum2;
926: sum4 -= *v4++ * sum3;
928: tmp[row++] = sum1;
929: tmp[row++] = sum2;
930: tmp[row++] = sum3;
931: tmp[row++] = sum4;
932: break;
933: case 5:
934: sum1 = b[*r++];
935: sum2 = b[*r++];
936: sum3 = b[*r++];
937: sum4 = b[*r++];
938: sum5 = b[*r++];
939: v2 = aa + ai[row + 1];
940: v3 = aa + ai[row + 2];
941: v4 = aa + ai[row + 3];
942: v5 = aa + ai[row + 4];
944: for (j = 0; j < nz - 1; j += 2) {
945: i0 = vi[0];
946: i1 = vi[1];
947: vi += 2;
948: tmp0 = tmps[i0];
949: tmp1 = tmps[i1];
950: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951: v1 += 2;
952: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953: v2 += 2;
954: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955: v3 += 2;
956: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957: v4 += 2;
958: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959: v5 += 2;
960: }
961: if (j == nz - 1) {
962: tmp0 = tmps[*vi++];
963: sum1 -= *v1++ * tmp0;
964: sum2 -= *v2++ * tmp0;
965: sum3 -= *v3++ * tmp0;
966: sum4 -= *v4++ * tmp0;
967: sum5 -= *v5++ * tmp0;
968: }
970: sum2 -= *v2++ * sum1;
971: sum3 -= *v3++ * sum1;
972: sum4 -= *v4++ * sum1;
973: sum5 -= *v5++ * sum1;
974: sum3 -= *v3++ * sum2;
975: sum4 -= *v4++ * sum2;
976: sum5 -= *v5++ * sum2;
977: sum4 -= *v4++ * sum3;
978: sum5 -= *v5++ * sum3;
979: sum5 -= *v5++ * sum4;
981: tmp[row++] = sum1;
982: tmp[row++] = sum2;
983: tmp[row++] = sum3;
984: tmp[row++] = sum4;
985: tmp[row++] = sum5;
986: break;
987: default:
988: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989: }
990: }
991: /* backward solve the upper triangular */
992: for (i = node_max - 1; i >= 0; i--) {
993: row = ns[i + 1];
994: nsz = ns[i + 1] - ns[i];
995: aii = ai[row + 1] - 1;
996: v1 = aa + aii;
997: vi = aj + aii;
998: nz = aii - ad[row];
999: switch (nsz) { /* Each loop in 'case' is unrolled */
1000: case 1:
1001: sum1 = tmp[row];
1003: for (j = nz; j > 1; j -= 2) {
1004: vi -= 2;
1005: i0 = vi[2];
1006: i1 = vi[1];
1007: tmp0 = tmps[i0];
1008: tmp1 = tmps[i1];
1009: v1 -= 2;
1010: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011: }
1012: if (j == 1) {
1013: tmp0 = tmps[*vi--];
1014: sum1 -= *v1-- * tmp0;
1015: }
1016: x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017: row--;
1018: break;
1019: case 2:
1020: sum1 = tmp[row];
1021: sum2 = tmp[row - 1];
1022: v2 = aa + ai[row] - 1;
1023: for (j = nz; j > 1; j -= 2) {
1024: vi -= 2;
1025: i0 = vi[2];
1026: i1 = vi[1];
1027: tmp0 = tmps[i0];
1028: tmp1 = tmps[i1];
1029: v1 -= 2;
1030: v2 -= 2;
1031: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033: }
1034: if (j == 1) {
1035: tmp0 = tmps[*vi--];
1036: sum1 -= *v1-- * tmp0;
1037: sum2 -= *v2-- * tmp0;
1038: }
1040: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041: row--;
1042: sum2 -= *v2-- * tmp0;
1043: x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044: row--;
1045: break;
1046: case 3:
1047: sum1 = tmp[row];
1048: sum2 = tmp[row - 1];
1049: sum3 = tmp[row - 2];
1050: v2 = aa + ai[row] - 1;
1051: v3 = aa + ai[row - 1] - 1;
1052: for (j = nz; j > 1; j -= 2) {
1053: vi -= 2;
1054: i0 = vi[2];
1055: i1 = vi[1];
1056: tmp0 = tmps[i0];
1057: tmp1 = tmps[i1];
1058: v1 -= 2;
1059: v2 -= 2;
1060: v3 -= 2;
1061: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064: }
1065: if (j == 1) {
1066: tmp0 = tmps[*vi--];
1067: sum1 -= *v1-- * tmp0;
1068: sum2 -= *v2-- * tmp0;
1069: sum3 -= *v3-- * tmp0;
1070: }
1071: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072: row--;
1073: sum2 -= *v2-- * tmp0;
1074: sum3 -= *v3-- * tmp0;
1075: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076: row--;
1077: sum3 -= *v3-- * tmp0;
1078: x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079: row--;
1081: break;
1082: case 4:
1083: sum1 = tmp[row];
1084: sum2 = tmp[row - 1];
1085: sum3 = tmp[row - 2];
1086: sum4 = tmp[row - 3];
1087: v2 = aa + ai[row] - 1;
1088: v3 = aa + ai[row - 1] - 1;
1089: v4 = aa + ai[row - 2] - 1;
1091: for (j = nz; j > 1; j -= 2) {
1092: vi -= 2;
1093: i0 = vi[2];
1094: i1 = vi[1];
1095: tmp0 = tmps[i0];
1096: tmp1 = tmps[i1];
1097: v1 -= 2;
1098: v2 -= 2;
1099: v3 -= 2;
1100: v4 -= 2;
1101: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105: }
1106: if (j == 1) {
1107: tmp0 = tmps[*vi--];
1108: sum1 -= *v1-- * tmp0;
1109: sum2 -= *v2-- * tmp0;
1110: sum3 -= *v3-- * tmp0;
1111: sum4 -= *v4-- * tmp0;
1112: }
1114: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115: row--;
1116: sum2 -= *v2-- * tmp0;
1117: sum3 -= *v3-- * tmp0;
1118: sum4 -= *v4-- * tmp0;
1119: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120: row--;
1121: sum3 -= *v3-- * tmp0;
1122: sum4 -= *v4-- * tmp0;
1123: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124: row--;
1125: sum4 -= *v4-- * tmp0;
1126: x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127: row--;
1128: break;
1129: case 5:
1130: sum1 = tmp[row];
1131: sum2 = tmp[row - 1];
1132: sum3 = tmp[row - 2];
1133: sum4 = tmp[row - 3];
1134: sum5 = tmp[row - 4];
1135: v2 = aa + ai[row] - 1;
1136: v3 = aa + ai[row - 1] - 1;
1137: v4 = aa + ai[row - 2] - 1;
1138: v5 = aa + ai[row - 3] - 1;
1139: for (j = nz; j > 1; j -= 2) {
1140: vi -= 2;
1141: i0 = vi[2];
1142: i1 = vi[1];
1143: tmp0 = tmps[i0];
1144: tmp1 = tmps[i1];
1145: v1 -= 2;
1146: v2 -= 2;
1147: v3 -= 2;
1148: v4 -= 2;
1149: v5 -= 2;
1150: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155: }
1156: if (j == 1) {
1157: tmp0 = tmps[*vi--];
1158: sum1 -= *v1-- * tmp0;
1159: sum2 -= *v2-- * tmp0;
1160: sum3 -= *v3-- * tmp0;
1161: sum4 -= *v4-- * tmp0;
1162: sum5 -= *v5-- * tmp0;
1163: }
1165: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166: row--;
1167: sum2 -= *v2-- * tmp0;
1168: sum3 -= *v3-- * tmp0;
1169: sum4 -= *v4-- * tmp0;
1170: sum5 -= *v5-- * tmp0;
1171: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172: row--;
1173: sum3 -= *v3-- * tmp0;
1174: sum4 -= *v4-- * tmp0;
1175: sum5 -= *v5-- * tmp0;
1176: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177: row--;
1178: sum4 -= *v4-- * tmp0;
1179: sum5 -= *v5-- * tmp0;
1180: tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181: row--;
1182: sum5 -= *v5-- * tmp0;
1183: x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184: row--;
1185: break;
1186: default:
1187: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188: }
1189: }
1190: PetscCall(ISRestoreIndices(isrow, &rout));
1191: PetscCall(ISRestoreIndices(iscol, &cout));
1192: PetscCall(VecRestoreArrayRead(bb, &b));
1193: PetscCall(VecRestoreArrayWrite(xx, &x));
1194: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195: PetscFunctionReturn(PETSC_SUCCESS);
1196: }
1198: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199: {
1200: Mat C = B;
1201: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202: IS isrow = b->row, isicol = b->icol;
1203: const PetscInt *r, *ic, *ics;
1204: const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205: PetscInt i, j, k, nz, nzL, row, *pj;
1206: const PetscInt *ajtmp, *bjtmp;
1207: MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208: const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209: FactorShiftCtx sctx;
1210: const PetscInt *ddiag;
1211: PetscReal rs;
1212: MatScalar d;
1213: PetscInt inod, nodesz, node_max, col;
1214: const PetscInt *ns;
1215: PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
1217: PetscFunctionBegin;
1218: /* MatPivotSetUp(): initialize shift context sctx */
1219: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1221: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222: ddiag = a->diag;
1223: sctx.shift_top = info->zeropivot;
1224: for (i = 0; i < n; i++) {
1225: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226: d = (aa)[ddiag[i]];
1227: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228: v = aa + ai[i];
1229: nz = ai[i + 1] - ai[i];
1230: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231: if (rs > sctx.shift_top) sctx.shift_top = rs;
1232: }
1233: sctx.shift_top *= 1.1;
1234: sctx.nshift_max = 5;
1235: sctx.shift_lo = 0.;
1236: sctx.shift_hi = 1.;
1237: }
1239: PetscCall(ISGetIndices(isrow, &r));
1240: PetscCall(ISGetIndices(isicol, &ic));
1242: PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243: ics = ic;
1245: node_max = a->inode.node_count;
1246: ns = a->inode.size_csr;
1247: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1249: /* If max inode size > 4, split it into two inodes.*/
1250: /* also map the inode sizes according to the ordering */
1251: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252: for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253: nodesz = ns[i + 1] - ns[i];
1254: if (nodesz > 4) {
1255: tmp_vec1[j] = 4;
1256: ++j;
1257: tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258: } else {
1259: tmp_vec1[j] = nodesz;
1260: }
1261: }
1262: /* Use the correct node_max */
1263: node_max = j;
1265: /* Now reorder the inode info based on mat re-ordering info */
1266: /* First create a row -> inode_size_array_index map */
1267: PetscCall(PetscMalloc1(n + 1, &nsmap));
1268: PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269: tmp_vec2[0] = 0;
1270: for (i = 0, row = 0; i < node_max; i++) {
1271: nodesz = tmp_vec1[i];
1272: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273: }
1274: /* Using nsmap, create a reordered ns structure */
1275: for (i = 0, j = 0; i < node_max; i++) {
1276: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277: tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278: j += nodesz;
1279: }
1280: PetscCall(PetscFree(nsmap));
1281: PetscCall(PetscFree(tmp_vec1));
1283: /* Now use the correct ns */
1284: ns = tmp_vec2;
1286: do {
1287: sctx.newshift = PETSC_FALSE;
1288: /* Now loop over each block-row, and do the factorization */
1289: for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290: nodesz = ns[inod + 1] - ns[inod];
1292: switch (nodesz) {
1293: case 1:
1294: /* zero rtmp1 */
1295: /* L part */
1296: nz = bi[i + 1] - bi[i];
1297: bjtmp = bj + bi[i];
1298: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1300: /* U part */
1301: nz = bdiag[i] - bdiag[i + 1];
1302: bjtmp = bj + bdiag[i + 1] + 1;
1303: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1305: /* load in initial (unfactored row) */
1306: nz = ai[r[i] + 1] - ai[r[i]];
1307: ajtmp = aj + ai[r[i]];
1308: v = aa + ai[r[i]];
1309: for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1311: /* ZeropivotApply() */
1312: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1314: /* elimination */
1315: bjtmp = bj + bi[i];
1316: row = *bjtmp++;
1317: nzL = bi[i + 1] - bi[i];
1318: for (k = 0; k < nzL; k++) {
1319: pc = rtmp1 + row;
1320: if (*pc != 0.0) {
1321: pv = b->a + bdiag[row];
1322: mul1 = *pc * (*pv);
1323: *pc = mul1;
1324: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325: pv = b->a + bdiag[row + 1] + 1;
1326: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327: for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329: }
1330: row = *bjtmp++;
1331: }
1333: /* finished row so stick it into b->a */
1334: rs = 0.0;
1335: /* L part */
1336: pv = b->a + bi[i];
1337: pj = b->j + bi[i];
1338: nz = bi[i + 1] - bi[i];
1339: for (j = 0; j < nz; j++) {
1340: pv[j] = rtmp1[pj[j]];
1341: rs += PetscAbsScalar(pv[j]);
1342: }
1344: /* U part */
1345: pv = b->a + bdiag[i + 1] + 1;
1346: pj = b->j + bdiag[i + 1] + 1;
1347: nz = bdiag[i] - bdiag[i + 1] - 1;
1348: for (j = 0; j < nz; j++) {
1349: pv[j] = rtmp1[pj[j]];
1350: rs += PetscAbsScalar(pv[j]);
1351: }
1353: /* Check zero pivot */
1354: sctx.rs = rs;
1355: sctx.pv = rtmp1[i];
1356: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357: if (sctx.newshift) break;
1359: /* Mark diagonal and invert diagonal for simpler triangular solves */
1360: pv = b->a + bdiag[i];
1361: *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362: break;
1364: case 2:
1365: /* zero rtmp1 and rtmp2 */
1366: /* L part */
1367: nz = bi[i + 1] - bi[i];
1368: bjtmp = bj + bi[i];
1369: for (j = 0; j < nz; j++) {
1370: col = bjtmp[j];
1371: rtmp1[col] = 0.0;
1372: rtmp2[col] = 0.0;
1373: }
1375: /* U part */
1376: nz = bdiag[i] - bdiag[i + 1];
1377: bjtmp = bj + bdiag[i + 1] + 1;
1378: for (j = 0; j < nz; j++) {
1379: col = bjtmp[j];
1380: rtmp1[col] = 0.0;
1381: rtmp2[col] = 0.0;
1382: }
1384: /* load in initial (unfactored row) */
1385: nz = ai[r[i] + 1] - ai[r[i]];
1386: ajtmp = aj + ai[r[i]];
1387: v1 = aa + ai[r[i]];
1388: v2 = aa + ai[r[i + 1]];
1389: for (j = 0; j < nz; j++) {
1390: col = ics[ajtmp[j]];
1391: rtmp1[col] = v1[j];
1392: rtmp2[col] = v2[j];
1393: }
1394: /* ZeropivotApply(): shift the diagonal of the matrix */
1395: rtmp1[i] += sctx.shift_amount;
1396: rtmp2[i + 1] += sctx.shift_amount;
1398: /* elimination */
1399: bjtmp = bj + bi[i];
1400: row = *bjtmp++; /* pivot row */
1401: nzL = bi[i + 1] - bi[i];
1402: for (k = 0; k < nzL; k++) {
1403: pc1 = rtmp1 + row;
1404: pc2 = rtmp2 + row;
1405: if (*pc1 != 0.0 || *pc2 != 0.0) {
1406: pv = b->a + bdiag[row];
1407: mul1 = *pc1 * (*pv);
1408: mul2 = *pc2 * (*pv);
1409: *pc1 = mul1;
1410: *pc2 = mul2;
1412: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413: pv = b->a + bdiag[row + 1] + 1;
1414: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415: for (j = 0; j < nz; j++) {
1416: col = pj[j];
1417: rtmp1[col] -= mul1 * pv[j];
1418: rtmp2[col] -= mul2 * pv[j];
1419: }
1420: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421: }
1422: row = *bjtmp++;
1423: }
1425: /* finished row i; check zero pivot, then stick row i into b->a */
1426: rs = 0.0;
1427: /* L part */
1428: pc1 = b->a + bi[i];
1429: pj = b->j + bi[i];
1430: nz = bi[i + 1] - bi[i];
1431: for (j = 0; j < nz; j++) {
1432: col = pj[j];
1433: pc1[j] = rtmp1[col];
1434: rs += PetscAbsScalar(pc1[j]);
1435: }
1436: /* U part */
1437: pc1 = b->a + bdiag[i + 1] + 1;
1438: pj = b->j + bdiag[i + 1] + 1;
1439: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440: for (j = 0; j < nz; j++) {
1441: col = pj[j];
1442: pc1[j] = rtmp1[col];
1443: rs += PetscAbsScalar(pc1[j]);
1444: }
1446: sctx.rs = rs;
1447: sctx.pv = rtmp1[i];
1448: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449: if (sctx.newshift) break;
1450: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1451: *pc1 = 1.0 / sctx.pv;
1453: /* Now take care of diagonal 2x2 block. */
1454: pc2 = rtmp2 + i;
1455: if (*pc2 != 0.0) {
1456: mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1457: *pc2 = mul1; /* insert L entry */
1458: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1459: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460: for (j = 0; j < nz; j++) {
1461: col = pj[j];
1462: rtmp2[col] -= mul1 * rtmp1[col];
1463: }
1464: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465: }
1467: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468: rs = 0.0;
1469: /* L part */
1470: pc2 = b->a + bi[i + 1];
1471: pj = b->j + bi[i + 1];
1472: nz = bi[i + 2] - bi[i + 1];
1473: for (j = 0; j < nz; j++) {
1474: col = pj[j];
1475: pc2[j] = rtmp2[col];
1476: rs += PetscAbsScalar(pc2[j]);
1477: }
1478: /* U part */
1479: pc2 = b->a + bdiag[i + 2] + 1;
1480: pj = b->j + bdiag[i + 2] + 1;
1481: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482: for (j = 0; j < nz; j++) {
1483: col = pj[j];
1484: pc2[j] = rtmp2[col];
1485: rs += PetscAbsScalar(pc2[j]);
1486: }
1488: sctx.rs = rs;
1489: sctx.pv = rtmp2[i + 1];
1490: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491: if (sctx.newshift) break;
1492: pc2 = b->a + bdiag[i + 1];
1493: *pc2 = 1.0 / sctx.pv;
1494: break;
1496: case 3:
1497: /* zero rtmp */
1498: /* L part */
1499: nz = bi[i + 1] - bi[i];
1500: bjtmp = bj + bi[i];
1501: for (j = 0; j < nz; j++) {
1502: col = bjtmp[j];
1503: rtmp1[col] = 0.0;
1504: rtmp2[col] = 0.0;
1505: rtmp3[col] = 0.0;
1506: }
1508: /* U part */
1509: nz = bdiag[i] - bdiag[i + 1];
1510: bjtmp = bj + bdiag[i + 1] + 1;
1511: for (j = 0; j < nz; j++) {
1512: col = bjtmp[j];
1513: rtmp1[col] = 0.0;
1514: rtmp2[col] = 0.0;
1515: rtmp3[col] = 0.0;
1516: }
1518: /* load in initial (unfactored row) */
1519: nz = ai[r[i] + 1] - ai[r[i]];
1520: ajtmp = aj + ai[r[i]];
1521: v1 = aa + ai[r[i]];
1522: v2 = aa + ai[r[i + 1]];
1523: v3 = aa + ai[r[i + 2]];
1524: for (j = 0; j < nz; j++) {
1525: col = ics[ajtmp[j]];
1526: rtmp1[col] = v1[j];
1527: rtmp2[col] = v2[j];
1528: rtmp3[col] = v3[j];
1529: }
1530: /* ZeropivotApply(): shift the diagonal of the matrix */
1531: rtmp1[i] += sctx.shift_amount;
1532: rtmp2[i + 1] += sctx.shift_amount;
1533: rtmp3[i + 2] += sctx.shift_amount;
1535: /* elimination */
1536: bjtmp = bj + bi[i];
1537: row = *bjtmp++; /* pivot row */
1538: nzL = bi[i + 1] - bi[i];
1539: for (k = 0; k < nzL; k++) {
1540: pc1 = rtmp1 + row;
1541: pc2 = rtmp2 + row;
1542: pc3 = rtmp3 + row;
1543: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544: pv = b->a + bdiag[row];
1545: mul1 = *pc1 * (*pv);
1546: mul2 = *pc2 * (*pv);
1547: mul3 = *pc3 * (*pv);
1548: *pc1 = mul1;
1549: *pc2 = mul2;
1550: *pc3 = mul3;
1552: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553: pv = b->a + bdiag[row + 1] + 1;
1554: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555: for (j = 0; j < nz; j++) {
1556: col = pj[j];
1557: rtmp1[col] -= mul1 * pv[j];
1558: rtmp2[col] -= mul2 * pv[j];
1559: rtmp3[col] -= mul3 * pv[j];
1560: }
1561: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562: }
1563: row = *bjtmp++;
1564: }
1566: /* finished row i; check zero pivot, then stick row i into b->a */
1567: rs = 0.0;
1568: /* L part */
1569: pc1 = b->a + bi[i];
1570: pj = b->j + bi[i];
1571: nz = bi[i + 1] - bi[i];
1572: for (j = 0; j < nz; j++) {
1573: col = pj[j];
1574: pc1[j] = rtmp1[col];
1575: rs += PetscAbsScalar(pc1[j]);
1576: }
1577: /* U part */
1578: pc1 = b->a + bdiag[i + 1] + 1;
1579: pj = b->j + bdiag[i + 1] + 1;
1580: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581: for (j = 0; j < nz; j++) {
1582: col = pj[j];
1583: pc1[j] = rtmp1[col];
1584: rs += PetscAbsScalar(pc1[j]);
1585: }
1587: sctx.rs = rs;
1588: sctx.pv = rtmp1[i];
1589: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590: if (sctx.newshift) break;
1591: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1592: *pc1 = 1.0 / sctx.pv;
1594: /* Now take care of 1st column of diagonal 3x3 block. */
1595: pc2 = rtmp2 + i;
1596: pc3 = rtmp3 + i;
1597: if (*pc2 != 0.0 || *pc3 != 0.0) {
1598: mul2 = (*pc2) * (*pc1);
1599: *pc2 = mul2;
1600: mul3 = (*pc3) * (*pc1);
1601: *pc3 = mul3;
1602: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1603: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604: for (j = 0; j < nz; j++) {
1605: col = pj[j];
1606: rtmp2[col] -= mul2 * rtmp1[col];
1607: rtmp3[col] -= mul3 * rtmp1[col];
1608: }
1609: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610: }
1612: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613: rs = 0.0;
1614: /* L part */
1615: pc2 = b->a + bi[i + 1];
1616: pj = b->j + bi[i + 1];
1617: nz = bi[i + 2] - bi[i + 1];
1618: for (j = 0; j < nz; j++) {
1619: col = pj[j];
1620: pc2[j] = rtmp2[col];
1621: rs += PetscAbsScalar(pc2[j]);
1622: }
1623: /* U part */
1624: pc2 = b->a + bdiag[i + 2] + 1;
1625: pj = b->j + bdiag[i + 2] + 1;
1626: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627: for (j = 0; j < nz; j++) {
1628: col = pj[j];
1629: pc2[j] = rtmp2[col];
1630: rs += PetscAbsScalar(pc2[j]);
1631: }
1633: sctx.rs = rs;
1634: sctx.pv = rtmp2[i + 1];
1635: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636: if (sctx.newshift) break;
1637: pc2 = b->a + bdiag[i + 1];
1638: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1640: /* Now take care of 2nd column of diagonal 3x3 block. */
1641: pc3 = rtmp3 + i + 1;
1642: if (*pc3 != 0.0) {
1643: mul3 = (*pc3) * (*pc2);
1644: *pc3 = mul3;
1645: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1646: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647: for (j = 0; j < nz; j++) {
1648: col = pj[j];
1649: rtmp3[col] -= mul3 * rtmp2[col];
1650: }
1651: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652: }
1654: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655: rs = 0.0;
1656: /* L part */
1657: pc3 = b->a + bi[i + 2];
1658: pj = b->j + bi[i + 2];
1659: nz = bi[i + 3] - bi[i + 2];
1660: for (j = 0; j < nz; j++) {
1661: col = pj[j];
1662: pc3[j] = rtmp3[col];
1663: rs += PetscAbsScalar(pc3[j]);
1664: }
1665: /* U part */
1666: pc3 = b->a + bdiag[i + 3] + 1;
1667: pj = b->j + bdiag[i + 3] + 1;
1668: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669: for (j = 0; j < nz; j++) {
1670: col = pj[j];
1671: pc3[j] = rtmp3[col];
1672: rs += PetscAbsScalar(pc3[j]);
1673: }
1675: sctx.rs = rs;
1676: sctx.pv = rtmp3[i + 2];
1677: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678: if (sctx.newshift) break;
1679: pc3 = b->a + bdiag[i + 2];
1680: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681: break;
1682: case 4:
1683: /* zero rtmp */
1684: /* L part */
1685: nz = bi[i + 1] - bi[i];
1686: bjtmp = bj + bi[i];
1687: for (j = 0; j < nz; j++) {
1688: col = bjtmp[j];
1689: rtmp1[col] = 0.0;
1690: rtmp2[col] = 0.0;
1691: rtmp3[col] = 0.0;
1692: rtmp4[col] = 0.0;
1693: }
1695: /* U part */
1696: nz = bdiag[i] - bdiag[i + 1];
1697: bjtmp = bj + bdiag[i + 1] + 1;
1698: for (j = 0; j < nz; j++) {
1699: col = bjtmp[j];
1700: rtmp1[col] = 0.0;
1701: rtmp2[col] = 0.0;
1702: rtmp3[col] = 0.0;
1703: rtmp4[col] = 0.0;
1704: }
1706: /* load in initial (unfactored row) */
1707: nz = ai[r[i] + 1] - ai[r[i]];
1708: ajtmp = aj + ai[r[i]];
1709: v1 = aa + ai[r[i]];
1710: v2 = aa + ai[r[i + 1]];
1711: v3 = aa + ai[r[i + 2]];
1712: v4 = aa + ai[r[i + 3]];
1713: for (j = 0; j < nz; j++) {
1714: col = ics[ajtmp[j]];
1715: rtmp1[col] = v1[j];
1716: rtmp2[col] = v2[j];
1717: rtmp3[col] = v3[j];
1718: rtmp4[col] = v4[j];
1719: }
1720: /* ZeropivotApply(): shift the diagonal of the matrix */
1721: rtmp1[i] += sctx.shift_amount;
1722: rtmp2[i + 1] += sctx.shift_amount;
1723: rtmp3[i + 2] += sctx.shift_amount;
1724: rtmp4[i + 3] += sctx.shift_amount;
1726: /* elimination */
1727: bjtmp = bj + bi[i];
1728: row = *bjtmp++; /* pivot row */
1729: nzL = bi[i + 1] - bi[i];
1730: for (k = 0; k < nzL; k++) {
1731: pc1 = rtmp1 + row;
1732: pc2 = rtmp2 + row;
1733: pc3 = rtmp3 + row;
1734: pc4 = rtmp4 + row;
1735: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736: pv = b->a + bdiag[row];
1737: mul1 = *pc1 * (*pv);
1738: mul2 = *pc2 * (*pv);
1739: mul3 = *pc3 * (*pv);
1740: mul4 = *pc4 * (*pv);
1741: *pc1 = mul1;
1742: *pc2 = mul2;
1743: *pc3 = mul3;
1744: *pc4 = mul4;
1746: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747: pv = b->a + bdiag[row + 1] + 1;
1748: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749: for (j = 0; j < nz; j++) {
1750: col = pj[j];
1751: rtmp1[col] -= mul1 * pv[j];
1752: rtmp2[col] -= mul2 * pv[j];
1753: rtmp3[col] -= mul3 * pv[j];
1754: rtmp4[col] -= mul4 * pv[j];
1755: }
1756: PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757: }
1758: row = *bjtmp++;
1759: }
1761: /* finished row i; check zero pivot, then stick row i into b->a */
1762: rs = 0.0;
1763: /* L part */
1764: pc1 = b->a + bi[i];
1765: pj = b->j + bi[i];
1766: nz = bi[i + 1] - bi[i];
1767: for (j = 0; j < nz; j++) {
1768: col = pj[j];
1769: pc1[j] = rtmp1[col];
1770: rs += PetscAbsScalar(pc1[j]);
1771: }
1772: /* U part */
1773: pc1 = b->a + bdiag[i + 1] + 1;
1774: pj = b->j + bdiag[i + 1] + 1;
1775: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776: for (j = 0; j < nz; j++) {
1777: col = pj[j];
1778: pc1[j] = rtmp1[col];
1779: rs += PetscAbsScalar(pc1[j]);
1780: }
1782: sctx.rs = rs;
1783: sctx.pv = rtmp1[i];
1784: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785: if (sctx.newshift) break;
1786: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1787: *pc1 = 1.0 / sctx.pv;
1789: /* Now take care of 1st column of diagonal 4x4 block. */
1790: pc2 = rtmp2 + i;
1791: pc3 = rtmp3 + i;
1792: pc4 = rtmp4 + i;
1793: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794: mul2 = (*pc2) * (*pc1);
1795: *pc2 = mul2;
1796: mul3 = (*pc3) * (*pc1);
1797: *pc3 = mul3;
1798: mul4 = (*pc4) * (*pc1);
1799: *pc4 = mul4;
1800: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1801: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802: for (j = 0; j < nz; j++) {
1803: col = pj[j];
1804: rtmp2[col] -= mul2 * rtmp1[col];
1805: rtmp3[col] -= mul3 * rtmp1[col];
1806: rtmp4[col] -= mul4 * rtmp1[col];
1807: }
1808: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809: }
1811: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812: rs = 0.0;
1813: /* L part */
1814: pc2 = b->a + bi[i + 1];
1815: pj = b->j + bi[i + 1];
1816: nz = bi[i + 2] - bi[i + 1];
1817: for (j = 0; j < nz; j++) {
1818: col = pj[j];
1819: pc2[j] = rtmp2[col];
1820: rs += PetscAbsScalar(pc2[j]);
1821: }
1822: /* U part */
1823: pc2 = b->a + bdiag[i + 2] + 1;
1824: pj = b->j + bdiag[i + 2] + 1;
1825: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826: for (j = 0; j < nz; j++) {
1827: col = pj[j];
1828: pc2[j] = rtmp2[col];
1829: rs += PetscAbsScalar(pc2[j]);
1830: }
1832: sctx.rs = rs;
1833: sctx.pv = rtmp2[i + 1];
1834: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835: if (sctx.newshift) break;
1836: pc2 = b->a + bdiag[i + 1];
1837: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1839: /* Now take care of 2nd column of diagonal 4x4 block. */
1840: pc3 = rtmp3 + i + 1;
1841: pc4 = rtmp4 + i + 1;
1842: if (*pc3 != 0.0 || *pc4 != 0.0) {
1843: mul3 = (*pc3) * (*pc2);
1844: *pc3 = mul3;
1845: mul4 = (*pc4) * (*pc2);
1846: *pc4 = mul4;
1847: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1848: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849: for (j = 0; j < nz; j++) {
1850: col = pj[j];
1851: rtmp3[col] -= mul3 * rtmp2[col];
1852: rtmp4[col] -= mul4 * rtmp2[col];
1853: }
1854: PetscCall(PetscLogFlops(4.0 * nz));
1855: }
1857: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858: rs = 0.0;
1859: /* L part */
1860: pc3 = b->a + bi[i + 2];
1861: pj = b->j + bi[i + 2];
1862: nz = bi[i + 3] - bi[i + 2];
1863: for (j = 0; j < nz; j++) {
1864: col = pj[j];
1865: pc3[j] = rtmp3[col];
1866: rs += PetscAbsScalar(pc3[j]);
1867: }
1868: /* U part */
1869: pc3 = b->a + bdiag[i + 3] + 1;
1870: pj = b->j + bdiag[i + 3] + 1;
1871: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872: for (j = 0; j < nz; j++) {
1873: col = pj[j];
1874: pc3[j] = rtmp3[col];
1875: rs += PetscAbsScalar(pc3[j]);
1876: }
1878: sctx.rs = rs;
1879: sctx.pv = rtmp3[i + 2];
1880: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881: if (sctx.newshift) break;
1882: pc3 = b->a + bdiag[i + 2];
1883: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1885: /* Now take care of 3rd column of diagonal 4x4 block. */
1886: pc4 = rtmp4 + i + 2;
1887: if (*pc4 != 0.0) {
1888: mul4 = (*pc4) * (*pc3);
1889: *pc4 = mul4;
1890: pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
1891: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892: for (j = 0; j < nz; j++) {
1893: col = pj[j];
1894: rtmp4[col] -= mul4 * rtmp3[col];
1895: }
1896: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897: }
1899: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900: rs = 0.0;
1901: /* L part */
1902: pc4 = b->a + bi[i + 3];
1903: pj = b->j + bi[i + 3];
1904: nz = bi[i + 4] - bi[i + 3];
1905: for (j = 0; j < nz; j++) {
1906: col = pj[j];
1907: pc4[j] = rtmp4[col];
1908: rs += PetscAbsScalar(pc4[j]);
1909: }
1910: /* U part */
1911: pc4 = b->a + bdiag[i + 4] + 1;
1912: pj = b->j + bdiag[i + 4] + 1;
1913: nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914: for (j = 0; j < nz; j++) {
1915: col = pj[j];
1916: pc4[j] = rtmp4[col];
1917: rs += PetscAbsScalar(pc4[j]);
1918: }
1920: sctx.rs = rs;
1921: sctx.pv = rtmp4[i + 3];
1922: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923: if (sctx.newshift) break;
1924: pc4 = b->a + bdiag[i + 3];
1925: *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926: break;
1928: default:
1929: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930: }
1931: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932: i += nodesz; /* Update the row */
1933: }
1935: /* MatPivotRefine() */
1936: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937: /*
1938: * if no shift in this attempt & shifting & started shifting & can refine,
1939: * then try lower shift
1940: */
1941: sctx.shift_hi = sctx.shift_fraction;
1942: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1944: sctx.newshift = PETSC_TRUE;
1945: sctx.nshift++;
1946: }
1947: } while (sctx.newshift);
1949: PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950: PetscCall(PetscFree(tmp_vec2));
1951: PetscCall(ISRestoreIndices(isicol, &ic));
1952: PetscCall(ISRestoreIndices(isrow, &r));
1954: if (b->inode.size_csr) {
1955: C->ops->solve = MatSolve_SeqAIJ_Inode;
1956: } else {
1957: C->ops->solve = MatSolve_SeqAIJ;
1958: }
1959: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1960: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1961: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962: C->ops->matsolve = MatMatSolve_SeqAIJ;
1963: C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964: C->assembled = PETSC_TRUE;
1965: C->preallocated = PETSC_TRUE;
1967: PetscCall(PetscLogFlops(C->cmap->n));
1969: /* MatShiftView(A,info,&sctx) */
1970: if (sctx.nshift) {
1971: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977: }
1978: }
1979: PetscFunctionReturn(PETSC_SUCCESS);
1980: }
1982: #if 0
1983: // unused
1984: static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1985: {
1986: Mat C = B;
1987: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1988: IS iscol = b->col, isrow = b->row, isicol = b->icol;
1989: const PetscInt *r, *ic, *c, *ics;
1990: PetscInt n = A->rmap->n, *bi = b->i;
1991: PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1992: PetscInt i, j, idx, *bd = b->diag, node_max, nodesz;
1993: PetscInt *ai = a->i, *aj = a->j;
1994: PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1995: PetscScalar mul1, mul2, mul3, tmp;
1996: MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1997: const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1;
1998: PetscReal rs = 0.0;
1999: FactorShiftCtx sctx;
2001: PetscFunctionBegin;
2002: sctx.shift_top = 0;
2003: sctx.nshift_max = 0;
2004: sctx.shift_lo = 0;
2005: sctx.shift_hi = 0;
2006: sctx.shift_fraction = 0;
2008: /* if both shift schemes are chosen by user, only use info->shiftpd */
2009: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2010: sctx.shift_top = 0;
2011: for (i = 0; i < n; i++) {
2012: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2013: rs = 0.0;
2014: ajtmp = aj + ai[i];
2015: rtmp1 = aa + ai[i];
2016: nz = ai[i + 1] - ai[i];
2017: for (j = 0; j < nz; j++) {
2018: if (*ajtmp != i) {
2019: rs += PetscAbsScalar(*rtmp1++);
2020: } else {
2021: rs -= PetscRealPart(*rtmp1++);
2022: }
2023: ajtmp++;
2024: }
2025: if (rs > sctx.shift_top) sctx.shift_top = rs;
2026: }
2027: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2028: sctx.shift_top *= 1.1;
2029: sctx.nshift_max = 5;
2030: sctx.shift_lo = 0.;
2031: sctx.shift_hi = 1.;
2032: }
2033: sctx.shift_amount = 0;
2034: sctx.nshift = 0;
2036: PetscCall(ISGetIndices(isrow, &r));
2037: PetscCall(ISGetIndices(iscol, &c));
2038: PetscCall(ISGetIndices(isicol, &ic));
2039: PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2040: ics = ic;
2042: node_max = a->inode.node_count;
2043: ns = a->inode.size;
2044: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2046: /* If max inode size > 3, split it into two inodes.*/
2047: /* also map the inode sizes according to the ordering */
2048: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2049: for (i = 0, j = 0; i < node_max; ++i, ++j) {
2050: if (ns[i] > 3) {
2051: tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */
2052: ++j;
2053: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2054: } else {
2055: tmp_vec1[j] = ns[i];
2056: }
2057: }
2058: /* Use the correct node_max */
2059: node_max = j;
2061: /* Now reorder the inode info based on mat re-ordering info */
2062: /* First create a row -> inode_size_array_index map */
2063: PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2064: for (i = 0, row = 0; i < node_max; i++) {
2065: nodesz = tmp_vec1[i];
2066: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2067: }
2068: /* Using nsmap, create a reordered ns structure */
2069: for (i = 0, j = 0; i < node_max; i++) {
2070: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2071: tmp_vec2[i] = nodesz;
2072: j += nodesz;
2073: }
2074: PetscCall(PetscFree2(nsmap, tmp_vec1));
2075: /* Now use the correct ns */
2076: ns = tmp_vec2;
2078: do {
2079: sctx.newshift = PETSC_FALSE;
2080: /* Now loop over each block-row, and do the factorization */
2081: for (i = 0, row = 0; i < node_max; i++) {
2082: nodesz = ns[i];
2083: nz = bi[row + 1] - bi[row];
2084: bjtmp = bj + bi[row];
2086: switch (nodesz) {
2087: case 1:
2088: for (j = 0; j < nz; j++) {
2089: idx = bjtmp[j];
2090: rtmp11[idx] = 0.0;
2091: }
2093: /* load in initial (unfactored row) */
2094: idx = r[row];
2095: nz_tmp = ai[idx + 1] - ai[idx];
2096: ajtmp = aj + ai[idx];
2097: v1 = aa + ai[idx];
2099: for (j = 0; j < nz_tmp; j++) {
2100: idx = ics[ajtmp[j]];
2101: rtmp11[idx] = v1[j];
2102: }
2103: rtmp11[ics[r[row]]] += sctx.shift_amount;
2105: prow = *bjtmp++;
2106: while (prow < row) {
2107: pc1 = rtmp11 + prow;
2108: if (*pc1 != 0.0) {
2109: pv = ba + bd[prow];
2110: pj = nbj + bd[prow];
2111: mul1 = *pc1 * *pv++;
2112: *pc1 = mul1;
2113: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2114: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2115: for (j = 0; j < nz_tmp; j++) {
2116: tmp = pv[j];
2117: idx = pj[j];
2118: rtmp11[idx] -= mul1 * tmp;
2119: }
2120: }
2121: prow = *bjtmp++;
2122: }
2123: pj = bj + bi[row];
2124: pc1 = ba + bi[row];
2126: sctx.pv = rtmp11[row];
2127: rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2128: rs = 0.0;
2129: for (j = 0; j < nz; j++) {
2130: idx = pj[j];
2131: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2132: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2133: }
2134: sctx.rs = rs;
2135: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2136: if (sctx.newshift) goto endofwhile;
2137: break;
2139: case 2:
2140: for (j = 0; j < nz; j++) {
2141: idx = bjtmp[j];
2142: rtmp11[idx] = 0.0;
2143: rtmp22[idx] = 0.0;
2144: }
2146: /* load in initial (unfactored row) */
2147: idx = r[row];
2148: nz_tmp = ai[idx + 1] - ai[idx];
2149: ajtmp = aj + ai[idx];
2150: v1 = aa + ai[idx];
2151: v2 = aa + ai[idx + 1];
2152: for (j = 0; j < nz_tmp; j++) {
2153: idx = ics[ajtmp[j]];
2154: rtmp11[idx] = v1[j];
2155: rtmp22[idx] = v2[j];
2156: }
2157: rtmp11[ics[r[row]]] += sctx.shift_amount;
2158: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2160: prow = *bjtmp++;
2161: while (prow < row) {
2162: pc1 = rtmp11 + prow;
2163: pc2 = rtmp22 + prow;
2164: if (*pc1 != 0.0 || *pc2 != 0.0) {
2165: pv = ba + bd[prow];
2166: pj = nbj + bd[prow];
2167: mul1 = *pc1 * *pv;
2168: mul2 = *pc2 * *pv;
2169: ++pv;
2170: *pc1 = mul1;
2171: *pc2 = mul2;
2173: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2174: for (j = 0; j < nz_tmp; j++) {
2175: tmp = pv[j];
2176: idx = pj[j];
2177: rtmp11[idx] -= mul1 * tmp;
2178: rtmp22[idx] -= mul2 * tmp;
2179: }
2180: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2181: }
2182: prow = *bjtmp++;
2183: }
2185: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2186: pc1 = rtmp11 + prow;
2187: pc2 = rtmp22 + prow;
2189: sctx.pv = *pc1;
2190: pj = bj + bi[prow];
2191: rs = 0.0;
2192: for (j = 0; j < nz; j++) {
2193: idx = pj[j];
2194: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2195: }
2196: sctx.rs = rs;
2197: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2198: if (sctx.newshift) goto endofwhile;
2200: if (*pc2 != 0.0) {
2201: pj = nbj + bd[prow];
2202: mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2203: *pc2 = mul2;
2204: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2205: for (j = 0; j < nz_tmp; j++) {
2206: idx = pj[j];
2207: tmp = rtmp11[idx];
2208: rtmp22[idx] -= mul2 * tmp;
2209: }
2210: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2211: }
2213: pj = bj + bi[row];
2214: pc1 = ba + bi[row];
2215: pc2 = ba + bi[row + 1];
2217: sctx.pv = rtmp22[row + 1];
2218: rs = 0.0;
2219: rtmp11[row] = 1.0 / rtmp11[row];
2220: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2221: /* copy row entries from dense representation to sparse */
2222: for (j = 0; j < nz; j++) {
2223: idx = pj[j];
2224: pc1[j] = rtmp11[idx];
2225: pc2[j] = rtmp22[idx];
2226: if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2227: }
2228: sctx.rs = rs;
2229: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2230: if (sctx.newshift) goto endofwhile;
2231: break;
2233: case 3:
2234: for (j = 0; j < nz; j++) {
2235: idx = bjtmp[j];
2236: rtmp11[idx] = 0.0;
2237: rtmp22[idx] = 0.0;
2238: rtmp33[idx] = 0.0;
2239: }
2240: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2241: idx = r[row];
2242: nz_tmp = ai[idx + 1] - ai[idx];
2243: ajtmp = aj + ai[idx];
2244: v1 = aa + ai[idx];
2245: v2 = aa + ai[idx + 1];
2246: v3 = aa + ai[idx + 2];
2247: for (j = 0; j < nz_tmp; j++) {
2248: idx = ics[ajtmp[j]];
2249: rtmp11[idx] = v1[j];
2250: rtmp22[idx] = v2[j];
2251: rtmp33[idx] = v3[j];
2252: }
2253: rtmp11[ics[r[row]]] += sctx.shift_amount;
2254: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2255: rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2257: /* loop over all pivot row blocks above this row block */
2258: prow = *bjtmp++;
2259: while (prow < row) {
2260: pc1 = rtmp11 + prow;
2261: pc2 = rtmp22 + prow;
2262: pc3 = rtmp33 + prow;
2263: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2264: pv = ba + bd[prow];
2265: pj = nbj + bd[prow];
2266: mul1 = *pc1 * *pv;
2267: mul2 = *pc2 * *pv;
2268: mul3 = *pc3 * *pv;
2269: ++pv;
2270: *pc1 = mul1;
2271: *pc2 = mul2;
2272: *pc3 = mul3;
2274: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2275: /* update this row based on pivot row */
2276: for (j = 0; j < nz_tmp; j++) {
2277: tmp = pv[j];
2278: idx = pj[j];
2279: rtmp11[idx] -= mul1 * tmp;
2280: rtmp22[idx] -= mul2 * tmp;
2281: rtmp33[idx] -= mul3 * tmp;
2282: }
2283: PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2284: }
2285: prow = *bjtmp++;
2286: }
2288: /* Now take care of diagonal 3x3 block in this set of rows */
2289: /* note: prow = row here */
2290: pc1 = rtmp11 + prow;
2291: pc2 = rtmp22 + prow;
2292: pc3 = rtmp33 + prow;
2294: sctx.pv = *pc1;
2295: pj = bj + bi[prow];
2296: rs = 0.0;
2297: for (j = 0; j < nz; j++) {
2298: idx = pj[j];
2299: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2300: }
2301: sctx.rs = rs;
2302: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2303: if (sctx.newshift) goto endofwhile;
2305: if (*pc2 != 0.0 || *pc3 != 0.0) {
2306: mul2 = (*pc2) / (*pc1);
2307: mul3 = (*pc3) / (*pc1);
2308: *pc2 = mul2;
2309: *pc3 = mul3;
2310: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2311: pj = nbj + bd[prow];
2312: for (j = 0; j < nz_tmp; j++) {
2313: idx = pj[j];
2314: tmp = rtmp11[idx];
2315: rtmp22[idx] -= mul2 * tmp;
2316: rtmp33[idx] -= mul3 * tmp;
2317: }
2318: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2319: }
2320: ++prow;
2322: pc2 = rtmp22 + prow;
2323: pc3 = rtmp33 + prow;
2324: sctx.pv = *pc2;
2325: pj = bj + bi[prow];
2326: rs = 0.0;
2327: for (j = 0; j < nz; j++) {
2328: idx = pj[j];
2329: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2330: }
2331: sctx.rs = rs;
2332: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2333: if (sctx.newshift) goto endofwhile;
2335: if (*pc3 != 0.0) {
2336: mul3 = (*pc3) / (*pc2);
2337: *pc3 = mul3;
2338: pj = nbj + bd[prow];
2339: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2340: for (j = 0; j < nz_tmp; j++) {
2341: idx = pj[j];
2342: tmp = rtmp22[idx];
2343: rtmp33[idx] -= mul3 * tmp;
2344: }
2345: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2346: }
2348: pj = bj + bi[row];
2349: pc1 = ba + bi[row];
2350: pc2 = ba + bi[row + 1];
2351: pc3 = ba + bi[row + 2];
2353: sctx.pv = rtmp33[row + 2];
2354: rs = 0.0;
2355: rtmp11[row] = 1.0 / rtmp11[row];
2356: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2357: rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2358: /* copy row entries from dense representation to sparse */
2359: for (j = 0; j < nz; j++) {
2360: idx = pj[j];
2361: pc1[j] = rtmp11[idx];
2362: pc2[j] = rtmp22[idx];
2363: pc3[j] = rtmp33[idx];
2364: if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2365: }
2367: sctx.rs = rs;
2368: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2369: if (sctx.newshift) goto endofwhile;
2370: break;
2372: default:
2373: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2374: }
2375: row += nodesz; /* Update the row */
2376: }
2377: endofwhile:;
2378: } while (sctx.newshift);
2379: PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2380: PetscCall(PetscFree(tmp_vec2));
2381: PetscCall(ISRestoreIndices(isicol, &ic));
2382: PetscCall(ISRestoreIndices(isrow, &r));
2383: PetscCall(ISRestoreIndices(iscol, &c));
2385: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2386: /* do not set solve add, since MatSolve_Inode + Add is faster */
2387: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2388: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2389: C->assembled = PETSC_TRUE;
2390: C->preallocated = PETSC_TRUE;
2391: if (sctx.nshift) {
2392: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2393: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2394: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2395: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2396: }
2397: }
2398: PetscCall(PetscLogFlops(C->cmap->n));
2399: PetscCall(MatSeqAIJCheckInode(C));
2400: PetscFunctionReturn(PETSC_SUCCESS);
2401: }
2402: #endif
2404: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2405: {
2406: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2407: IS iscol = a->col, isrow = a->row;
2408: const PetscInt *r, *c, *rout, *cout;
2409: PetscInt i, j;
2410: PetscInt node_max, row, nsz, aii, i0, i1, nz;
2411: const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2412: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
2413: PetscScalar sum1, sum2, sum3, sum4, sum5;
2414: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2415: const PetscScalar *b;
2417: PetscFunctionBegin;
2418: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2419: node_max = a->inode.node_count;
2420: ns = a->inode.size_csr; /* Node Size array */
2422: PetscCall(VecGetArrayRead(bb, &b));
2423: PetscCall(VecGetArrayWrite(xx, &x));
2424: tmp = a->solve_work;
2426: PetscCall(ISGetIndices(isrow, &rout));
2427: r = rout;
2428: PetscCall(ISGetIndices(iscol, &cout));
2429: c = cout;
2431: /* forward solve the lower triangular */
2432: tmps = tmp;
2433: aa = a_a;
2434: aj = a_j;
2435: ad = a->diag;
2437: for (i = 0; i < node_max; ++i) {
2438: row = ns[i];
2439: nsz = ns[i + 1] - ns[i];
2440: aii = ai[row];
2441: v1 = aa + aii;
2442: vi = aj + aii;
2443: nz = ai[row + 1] - ai[row];
2445: if (i < node_max - 1) {
2446: /* Prefetch the indices for the next block */
2447: PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2448: /* Prefetch the data for the next block */
2449: PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2450: }
2452: switch (nsz) { /* Each loop in 'case' is unrolled */
2453: case 1:
2454: sum1 = b[r[row]];
2455: for (j = 0; j < nz - 1; j += 2) {
2456: i0 = vi[j];
2457: i1 = vi[j + 1];
2458: tmp0 = tmps[i0];
2459: tmp1 = tmps[i1];
2460: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2461: }
2462: if (j == nz - 1) {
2463: tmp0 = tmps[vi[j]];
2464: sum1 -= v1[j] * tmp0;
2465: }
2466: tmp[row++] = sum1;
2467: break;
2468: case 2:
2469: sum1 = b[r[row]];
2470: sum2 = b[r[row + 1]];
2471: v2 = aa + ai[row + 1];
2473: for (j = 0; j < nz - 1; j += 2) {
2474: i0 = vi[j];
2475: i1 = vi[j + 1];
2476: tmp0 = tmps[i0];
2477: tmp1 = tmps[i1];
2478: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2479: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2480: }
2481: if (j == nz - 1) {
2482: tmp0 = tmps[vi[j]];
2483: sum1 -= v1[j] * tmp0;
2484: sum2 -= v2[j] * tmp0;
2485: }
2486: sum2 -= v2[nz] * sum1;
2487: tmp[row++] = sum1;
2488: tmp[row++] = sum2;
2489: break;
2490: case 3:
2491: sum1 = b[r[row]];
2492: sum2 = b[r[row + 1]];
2493: sum3 = b[r[row + 2]];
2494: v2 = aa + ai[row + 1];
2495: v3 = aa + ai[row + 2];
2497: for (j = 0; j < nz - 1; j += 2) {
2498: i0 = vi[j];
2499: i1 = vi[j + 1];
2500: tmp0 = tmps[i0];
2501: tmp1 = tmps[i1];
2502: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2503: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2504: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2505: }
2506: if (j == nz - 1) {
2507: tmp0 = tmps[vi[j]];
2508: sum1 -= v1[j] * tmp0;
2509: sum2 -= v2[j] * tmp0;
2510: sum3 -= v3[j] * tmp0;
2511: }
2512: sum2 -= v2[nz] * sum1;
2513: sum3 -= v3[nz] * sum1;
2514: sum3 -= v3[nz + 1] * sum2;
2515: tmp[row++] = sum1;
2516: tmp[row++] = sum2;
2517: tmp[row++] = sum3;
2518: break;
2520: case 4:
2521: sum1 = b[r[row]];
2522: sum2 = b[r[row + 1]];
2523: sum3 = b[r[row + 2]];
2524: sum4 = b[r[row + 3]];
2525: v2 = aa + ai[row + 1];
2526: v3 = aa + ai[row + 2];
2527: v4 = aa + ai[row + 3];
2529: for (j = 0; j < nz - 1; j += 2) {
2530: i0 = vi[j];
2531: i1 = vi[j + 1];
2532: tmp0 = tmps[i0];
2533: tmp1 = tmps[i1];
2534: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2535: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2536: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2537: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2538: }
2539: if (j == nz - 1) {
2540: tmp0 = tmps[vi[j]];
2541: sum1 -= v1[j] * tmp0;
2542: sum2 -= v2[j] * tmp0;
2543: sum3 -= v3[j] * tmp0;
2544: sum4 -= v4[j] * tmp0;
2545: }
2546: sum2 -= v2[nz] * sum1;
2547: sum3 -= v3[nz] * sum1;
2548: sum4 -= v4[nz] * sum1;
2549: sum3 -= v3[nz + 1] * sum2;
2550: sum4 -= v4[nz + 1] * sum2;
2551: sum4 -= v4[nz + 2] * sum3;
2553: tmp[row++] = sum1;
2554: tmp[row++] = sum2;
2555: tmp[row++] = sum3;
2556: tmp[row++] = sum4;
2557: break;
2558: case 5:
2559: sum1 = b[r[row]];
2560: sum2 = b[r[row + 1]];
2561: sum3 = b[r[row + 2]];
2562: sum4 = b[r[row + 3]];
2563: sum5 = b[r[row + 4]];
2564: v2 = aa + ai[row + 1];
2565: v3 = aa + ai[row + 2];
2566: v4 = aa + ai[row + 3];
2567: v5 = aa + ai[row + 4];
2569: for (j = 0; j < nz - 1; j += 2) {
2570: i0 = vi[j];
2571: i1 = vi[j + 1];
2572: tmp0 = tmps[i0];
2573: tmp1 = tmps[i1];
2574: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2575: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2576: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2577: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2578: sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2579: }
2580: if (j == nz - 1) {
2581: tmp0 = tmps[vi[j]];
2582: sum1 -= v1[j] * tmp0;
2583: sum2 -= v2[j] * tmp0;
2584: sum3 -= v3[j] * tmp0;
2585: sum4 -= v4[j] * tmp0;
2586: sum5 -= v5[j] * tmp0;
2587: }
2589: sum2 -= v2[nz] * sum1;
2590: sum3 -= v3[nz] * sum1;
2591: sum4 -= v4[nz] * sum1;
2592: sum5 -= v5[nz] * sum1;
2593: sum3 -= v3[nz + 1] * sum2;
2594: sum4 -= v4[nz + 1] * sum2;
2595: sum5 -= v5[nz + 1] * sum2;
2596: sum4 -= v4[nz + 2] * sum3;
2597: sum5 -= v5[nz + 2] * sum3;
2598: sum5 -= v5[nz + 3] * sum4;
2600: tmp[row++] = sum1;
2601: tmp[row++] = sum2;
2602: tmp[row++] = sum3;
2603: tmp[row++] = sum4;
2604: tmp[row++] = sum5;
2605: break;
2606: default:
2607: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2608: }
2609: }
2610: /* backward solve the upper triangular */
2611: for (i = node_max - 1; i >= 0; i--) {
2612: row = ns[i + 1] - 1;
2613: nsz = ns[i + 1] - ns[i];
2614: aii = ad[row + 1] + 1;
2615: v1 = aa + aii;
2616: vi = aj + aii;
2617: nz = ad[row] - ad[row + 1] - 1;
2619: if (i > 0) {
2620: /* Prefetch the indices for the next block */
2621: PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2622: /* Prefetch the data for the next block */
2623: PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2624: }
2626: switch (nsz) { /* Each loop in 'case' is unrolled */
2627: case 1:
2628: sum1 = tmp[row];
2630: for (j = 0; j < nz - 1; j += 2) {
2631: i0 = vi[j];
2632: i1 = vi[j + 1];
2633: tmp0 = tmps[i0];
2634: tmp1 = tmps[i1];
2635: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2636: }
2637: if (j == nz - 1) {
2638: tmp0 = tmps[vi[j]];
2639: sum1 -= v1[j] * tmp0;
2640: }
2641: x[c[row]] = tmp[row] = sum1 * v1[nz];
2642: row--;
2643: break;
2644: case 2:
2645: sum1 = tmp[row];
2646: sum2 = tmp[row - 1];
2647: v2 = aa + ad[row] + 1;
2648: for (j = 0; j < nz - 1; j += 2) {
2649: i0 = vi[j];
2650: i1 = vi[j + 1];
2651: tmp0 = tmps[i0];
2652: tmp1 = tmps[i1];
2653: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2654: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2655: }
2656: if (j == nz - 1) {
2657: tmp0 = tmps[vi[j]];
2658: sum1 -= v1[j] * tmp0;
2659: sum2 -= v2[j + 1] * tmp0;
2660: }
2662: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2663: row--;
2664: sum2 -= v2[0] * tmp0;
2665: x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2666: row--;
2667: break;
2668: case 3:
2669: sum1 = tmp[row];
2670: sum2 = tmp[row - 1];
2671: sum3 = tmp[row - 2];
2672: v2 = aa + ad[row] + 1;
2673: v3 = aa + ad[row - 1] + 1;
2674: for (j = 0; j < nz - 1; j += 2) {
2675: i0 = vi[j];
2676: i1 = vi[j + 1];
2677: tmp0 = tmps[i0];
2678: tmp1 = tmps[i1];
2679: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2680: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2681: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2682: }
2683: if (j == nz - 1) {
2684: tmp0 = tmps[vi[j]];
2685: sum1 -= v1[j] * tmp0;
2686: sum2 -= v2[j + 1] * tmp0;
2687: sum3 -= v3[j + 2] * tmp0;
2688: }
2689: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2690: row--;
2691: sum2 -= v2[0] * tmp0;
2692: sum3 -= v3[1] * tmp0;
2693: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2694: row--;
2695: sum3 -= v3[0] * tmp0;
2696: x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2697: row--;
2699: break;
2700: case 4:
2701: sum1 = tmp[row];
2702: sum2 = tmp[row - 1];
2703: sum3 = tmp[row - 2];
2704: sum4 = tmp[row - 3];
2705: v2 = aa + ad[row] + 1;
2706: v3 = aa + ad[row - 1] + 1;
2707: v4 = aa + ad[row - 2] + 1;
2709: for (j = 0; j < nz - 1; j += 2) {
2710: i0 = vi[j];
2711: i1 = vi[j + 1];
2712: tmp0 = tmps[i0];
2713: tmp1 = tmps[i1];
2714: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2715: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2716: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2717: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2718: }
2719: if (j == nz - 1) {
2720: tmp0 = tmps[vi[j]];
2721: sum1 -= v1[j] * tmp0;
2722: sum2 -= v2[j + 1] * tmp0;
2723: sum3 -= v3[j + 2] * tmp0;
2724: sum4 -= v4[j + 3] * tmp0;
2725: }
2727: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2728: row--;
2729: sum2 -= v2[0] * tmp0;
2730: sum3 -= v3[1] * tmp0;
2731: sum4 -= v4[2] * tmp0;
2732: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2733: row--;
2734: sum3 -= v3[0] * tmp0;
2735: sum4 -= v4[1] * tmp0;
2736: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2737: row--;
2738: sum4 -= v4[0] * tmp0;
2739: x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2740: row--;
2741: break;
2742: case 5:
2743: sum1 = tmp[row];
2744: sum2 = tmp[row - 1];
2745: sum3 = tmp[row - 2];
2746: sum4 = tmp[row - 3];
2747: sum5 = tmp[row - 4];
2748: v2 = aa + ad[row] + 1;
2749: v3 = aa + ad[row - 1] + 1;
2750: v4 = aa + ad[row - 2] + 1;
2751: v5 = aa + ad[row - 3] + 1;
2752: for (j = 0; j < nz - 1; j += 2) {
2753: i0 = vi[j];
2754: i1 = vi[j + 1];
2755: tmp0 = tmps[i0];
2756: tmp1 = tmps[i1];
2757: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2758: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2759: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2760: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2761: sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2762: }
2763: if (j == nz - 1) {
2764: tmp0 = tmps[vi[j]];
2765: sum1 -= v1[j] * tmp0;
2766: sum2 -= v2[j + 1] * tmp0;
2767: sum3 -= v3[j + 2] * tmp0;
2768: sum4 -= v4[j + 3] * tmp0;
2769: sum5 -= v5[j + 4] * tmp0;
2770: }
2772: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2773: row--;
2774: sum2 -= v2[0] * tmp0;
2775: sum3 -= v3[1] * tmp0;
2776: sum4 -= v4[2] * tmp0;
2777: sum5 -= v5[3] * tmp0;
2778: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2779: row--;
2780: sum3 -= v3[0] * tmp0;
2781: sum4 -= v4[1] * tmp0;
2782: sum5 -= v5[2] * tmp0;
2783: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2784: row--;
2785: sum4 -= v4[0] * tmp0;
2786: sum5 -= v5[1] * tmp0;
2787: tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2788: row--;
2789: sum5 -= v5[0] * tmp0;
2790: x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2791: row--;
2792: break;
2793: default:
2794: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2795: }
2796: }
2797: PetscCall(ISRestoreIndices(isrow, &rout));
2798: PetscCall(ISRestoreIndices(iscol, &cout));
2799: PetscCall(VecRestoreArrayRead(bb, &b));
2800: PetscCall(VecRestoreArrayWrite(xx, &x));
2801: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2802: PetscFunctionReturn(PETSC_SUCCESS);
2803: }
2805: /*
2806: Makes a longer coloring[] array and calls the usual code with that
2807: */
2808: static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2809: {
2810: Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
2811: PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2812: PetscInt *colorused, i;
2813: ISColoringValue *newcolor;
2815: PetscFunctionBegin;
2816: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2817: PetscCall(PetscMalloc1(n + 1, &newcolor));
2818: /* loop over inodes, marking a color for each column*/
2819: row = 0;
2820: for (i = 0; i < m; i++) {
2821: for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2822: }
2824: /* eliminate unneeded colors */
2825: PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2826: for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2828: for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2829: ncolors = colorused[5 * ncolors - 1];
2830: for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2831: PetscCall(PetscFree(colorused));
2832: PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2833: PetscCall(PetscFree(coloring));
2834: PetscFunctionReturn(PETSC_SUCCESS);
2835: }
2837: #include <petsc/private/kernels/blockinvert.h>
2839: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2840: {
2841: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2842: PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2843: MatScalar *ibdiag, *bdiag, work[25], *t;
2844: PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2845: const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2846: const PetscScalar *xb, *b;
2847: PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2848: PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz;
2849: PetscInt sz, k, ipvt[5];
2850: PetscBool allowzeropivot, zeropivotdetected;
2851: const PetscInt *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i;
2853: PetscFunctionBegin;
2854: allowzeropivot = PetscNot(A->erroriffailure);
2855: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2856: PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2857: PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2859: if (!a->inode.ibdiagvalid) {
2860: if (!a->inode.ibdiag) {
2861: /* calculate space needed for diagonal blocks */
2862: for (i = 0; i < m; i++) {
2863: nodesz = sizes[i + 1] - sizes[i];
2864: cnt += nodesz * nodesz;
2865: }
2866: a->inode.bdiagsize = cnt;
2868: PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2869: }
2871: /* copy over the diagonal blocks and invert them */
2872: ibdiag = a->inode.ibdiag;
2873: bdiag = a->inode.bdiag;
2874: cnt = 0;
2875: for (i = 0, row = 0; i < m; i++) {
2876: nodesz = sizes[i + 1] - sizes[i];
2877: for (j = 0; j < nodesz; j++) {
2878: for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2879: }
2880: PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2882: switch (nodesz) {
2883: case 1:
2884: /* Create matrix data structure */
2885: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2886: PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2887: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2888: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2889: A->factorerror_zeropivot_row = row;
2890: PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2891: }
2892: ibdiag[cnt] = 1.0 / ibdiag[cnt];
2893: break;
2894: case 2:
2895: PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2896: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2897: break;
2898: case 3:
2899: PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2900: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2901: break;
2902: case 4:
2903: PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2904: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2905: break;
2906: case 5:
2907: PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2908: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2909: break;
2910: default:
2911: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2912: }
2913: cnt += nodesz * nodesz;
2914: row += nodesz;
2915: }
2916: a->inode.ibdiagvalid = PETSC_TRUE;
2917: }
2918: ibdiag = a->inode.ibdiag;
2919: bdiag = a->inode.bdiag;
2920: t = a->inode.ssor_work;
2922: PetscCall(VecGetArray(xx, &x));
2923: PetscCall(VecGetArrayRead(bb, &b));
2924: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2925: if (flag & SOR_ZERO_INITIAL_GUESS) {
2926: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2927: for (i = 0, row = 0; i < m; i++) {
2928: sz = diag[row] - ii[row];
2929: v1 = a->a + ii[row];
2930: idx = a->j + ii[row];
2932: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2933: nodesz = sizes[i + 1] - sizes[i];
2934: switch (nodesz) {
2935: case 1:
2937: sum1 = b[row];
2938: for (n = 0; n < sz - 1; n += 2) {
2939: i1 = idx[0];
2940: i2 = idx[1];
2941: idx += 2;
2942: tmp0 = x[i1];
2943: tmp1 = x[i2];
2944: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2945: v1 += 2;
2946: }
2948: if (n == sz - 1) {
2949: tmp0 = x[*idx];
2950: sum1 -= *v1 * tmp0;
2951: }
2952: t[row] = sum1;
2953: x[row++] = sum1 * (*ibdiag++);
2954: break;
2955: case 2:
2956: v2 = a->a + ii[row + 1];
2957: sum1 = b[row];
2958: sum2 = b[row + 1];
2959: for (n = 0; n < sz - 1; n += 2) {
2960: i1 = idx[0];
2961: i2 = idx[1];
2962: idx += 2;
2963: tmp0 = x[i1];
2964: tmp1 = x[i2];
2965: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2966: v1 += 2;
2967: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2968: v2 += 2;
2969: }
2971: if (n == sz - 1) {
2972: tmp0 = x[*idx];
2973: sum1 -= v1[0] * tmp0;
2974: sum2 -= v2[0] * tmp0;
2975: }
2976: t[row] = sum1;
2977: t[row + 1] = sum2;
2978: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2979: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2980: ibdiag += 4;
2981: break;
2982: case 3:
2983: v2 = a->a + ii[row + 1];
2984: v3 = a->a + ii[row + 2];
2985: sum1 = b[row];
2986: sum2 = b[row + 1];
2987: sum3 = b[row + 2];
2988: for (n = 0; n < sz - 1; n += 2) {
2989: i1 = idx[0];
2990: i2 = idx[1];
2991: idx += 2;
2992: tmp0 = x[i1];
2993: tmp1 = x[i2];
2994: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2995: v1 += 2;
2996: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2997: v2 += 2;
2998: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2999: v3 += 2;
3000: }
3002: if (n == sz - 1) {
3003: tmp0 = x[*idx];
3004: sum1 -= v1[0] * tmp0;
3005: sum2 -= v2[0] * tmp0;
3006: sum3 -= v3[0] * tmp0;
3007: }
3008: t[row] = sum1;
3009: t[row + 1] = sum2;
3010: t[row + 2] = sum3;
3011: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3012: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3013: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3014: ibdiag += 9;
3015: break;
3016: case 4:
3017: v2 = a->a + ii[row + 1];
3018: v3 = a->a + ii[row + 2];
3019: v4 = a->a + ii[row + 3];
3020: sum1 = b[row];
3021: sum2 = b[row + 1];
3022: sum3 = b[row + 2];
3023: sum4 = b[row + 3];
3024: for (n = 0; n < sz - 1; n += 2) {
3025: i1 = idx[0];
3026: i2 = idx[1];
3027: idx += 2;
3028: tmp0 = x[i1];
3029: tmp1 = x[i2];
3030: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3031: v1 += 2;
3032: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3033: v2 += 2;
3034: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3035: v3 += 2;
3036: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3037: v4 += 2;
3038: }
3040: if (n == sz - 1) {
3041: tmp0 = x[*idx];
3042: sum1 -= v1[0] * tmp0;
3043: sum2 -= v2[0] * tmp0;
3044: sum3 -= v3[0] * tmp0;
3045: sum4 -= v4[0] * tmp0;
3046: }
3047: t[row] = sum1;
3048: t[row + 1] = sum2;
3049: t[row + 2] = sum3;
3050: t[row + 3] = sum4;
3051: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3052: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3053: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3054: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3055: ibdiag += 16;
3056: break;
3057: case 5:
3058: v2 = a->a + ii[row + 1];
3059: v3 = a->a + ii[row + 2];
3060: v4 = a->a + ii[row + 3];
3061: v5 = a->a + ii[row + 4];
3062: sum1 = b[row];
3063: sum2 = b[row + 1];
3064: sum3 = b[row + 2];
3065: sum4 = b[row + 3];
3066: sum5 = b[row + 4];
3067: for (n = 0; n < sz - 1; n += 2) {
3068: i1 = idx[0];
3069: i2 = idx[1];
3070: idx += 2;
3071: tmp0 = x[i1];
3072: tmp1 = x[i2];
3073: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3074: v1 += 2;
3075: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3076: v2 += 2;
3077: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3078: v3 += 2;
3079: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3080: v4 += 2;
3081: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3082: v5 += 2;
3083: }
3085: if (n == sz - 1) {
3086: tmp0 = x[*idx];
3087: sum1 -= v1[0] * tmp0;
3088: sum2 -= v2[0] * tmp0;
3089: sum3 -= v3[0] * tmp0;
3090: sum4 -= v4[0] * tmp0;
3091: sum5 -= v5[0] * tmp0;
3092: }
3093: t[row] = sum1;
3094: t[row + 1] = sum2;
3095: t[row + 2] = sum3;
3096: t[row + 3] = sum4;
3097: t[row + 4] = sum5;
3098: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3099: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3100: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3101: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3102: x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3103: ibdiag += 25;
3104: break;
3105: default:
3106: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3107: }
3108: }
3110: xb = t;
3111: PetscCall(PetscLogFlops(a->nz));
3112: } else xb = b;
3113: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3114: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3115: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3116: nodesz = sizes[i + 1] - sizes[i];
3117: ibdiag -= nodesz * nodesz;
3118: sz = ii[row + 1] - diag[row] - 1;
3119: v1 = a->a + diag[row] + 1;
3120: idx = a->j + diag[row] + 1;
3122: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3123: switch (nodesz) {
3124: case 1:
3126: sum1 = xb[row];
3127: for (n = 0; n < sz - 1; n += 2) {
3128: i1 = idx[0];
3129: i2 = idx[1];
3130: idx += 2;
3131: tmp0 = x[i1];
3132: tmp1 = x[i2];
3133: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3134: v1 += 2;
3135: }
3137: if (n == sz - 1) {
3138: tmp0 = x[*idx];
3139: sum1 -= *v1 * tmp0;
3140: }
3141: x[row--] = sum1 * (*ibdiag);
3142: break;
3144: case 2:
3146: sum1 = xb[row];
3147: sum2 = xb[row - 1];
3148: /* note that sum1 is associated with the second of the two rows */
3149: v2 = a->a + diag[row - 1] + 2;
3150: for (n = 0; n < sz - 1; n += 2) {
3151: i1 = idx[0];
3152: i2 = idx[1];
3153: idx += 2;
3154: tmp0 = x[i1];
3155: tmp1 = x[i2];
3156: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3157: v1 += 2;
3158: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3159: v2 += 2;
3160: }
3162: if (n == sz - 1) {
3163: tmp0 = x[*idx];
3164: sum1 -= *v1 * tmp0;
3165: sum2 -= *v2 * tmp0;
3166: }
3167: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3168: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3169: break;
3170: case 3:
3172: sum1 = xb[row];
3173: sum2 = xb[row - 1];
3174: sum3 = xb[row - 2];
3175: v2 = a->a + diag[row - 1] + 2;
3176: v3 = a->a + diag[row - 2] + 3;
3177: for (n = 0; n < sz - 1; n += 2) {
3178: i1 = idx[0];
3179: i2 = idx[1];
3180: idx += 2;
3181: tmp0 = x[i1];
3182: tmp1 = x[i2];
3183: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3184: v1 += 2;
3185: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3186: v2 += 2;
3187: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3188: v3 += 2;
3189: }
3191: if (n == sz - 1) {
3192: tmp0 = x[*idx];
3193: sum1 -= *v1 * tmp0;
3194: sum2 -= *v2 * tmp0;
3195: sum3 -= *v3 * tmp0;
3196: }
3197: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3198: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3199: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3200: break;
3201: case 4:
3203: sum1 = xb[row];
3204: sum2 = xb[row - 1];
3205: sum3 = xb[row - 2];
3206: sum4 = xb[row - 3];
3207: v2 = a->a + diag[row - 1] + 2;
3208: v3 = a->a + diag[row - 2] + 3;
3209: v4 = a->a + diag[row - 3] + 4;
3210: for (n = 0; n < sz - 1; n += 2) {
3211: i1 = idx[0];
3212: i2 = idx[1];
3213: idx += 2;
3214: tmp0 = x[i1];
3215: tmp1 = x[i2];
3216: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3217: v1 += 2;
3218: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3219: v2 += 2;
3220: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3221: v3 += 2;
3222: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3223: v4 += 2;
3224: }
3226: if (n == sz - 1) {
3227: tmp0 = x[*idx];
3228: sum1 -= *v1 * tmp0;
3229: sum2 -= *v2 * tmp0;
3230: sum3 -= *v3 * tmp0;
3231: sum4 -= *v4 * tmp0;
3232: }
3233: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3234: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3235: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3236: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3237: break;
3238: case 5:
3240: sum1 = xb[row];
3241: sum2 = xb[row - 1];
3242: sum3 = xb[row - 2];
3243: sum4 = xb[row - 3];
3244: sum5 = xb[row - 4];
3245: v2 = a->a + diag[row - 1] + 2;
3246: v3 = a->a + diag[row - 2] + 3;
3247: v4 = a->a + diag[row - 3] + 4;
3248: v5 = a->a + diag[row - 4] + 5;
3249: for (n = 0; n < sz - 1; n += 2) {
3250: i1 = idx[0];
3251: i2 = idx[1];
3252: idx += 2;
3253: tmp0 = x[i1];
3254: tmp1 = x[i2];
3255: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3256: v1 += 2;
3257: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3258: v2 += 2;
3259: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3260: v3 += 2;
3261: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3262: v4 += 2;
3263: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3264: v5 += 2;
3265: }
3267: if (n == sz - 1) {
3268: tmp0 = x[*idx];
3269: sum1 -= *v1 * tmp0;
3270: sum2 -= *v2 * tmp0;
3271: sum3 -= *v3 * tmp0;
3272: sum4 -= *v4 * tmp0;
3273: sum5 -= *v5 * tmp0;
3274: }
3275: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3276: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3277: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3278: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3279: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3280: break;
3281: default:
3282: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3283: }
3284: }
3286: PetscCall(PetscLogFlops(a->nz));
3287: }
3288: its--;
3289: }
3290: while (its--) {
3291: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3292: for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
3293: nodesz = sizes[i + 1] - sizes[i];
3294: sz = diag[row] - ii[row];
3295: v1 = a->a + ii[row];
3296: idx = a->j + ii[row];
3297: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3298: switch (nodesz) {
3299: case 1:
3300: sum1 = b[row];
3301: for (n = 0; n < sz - 1; n += 2) {
3302: i1 = idx[0];
3303: i2 = idx[1];
3304: idx += 2;
3305: tmp0 = x[i1];
3306: tmp1 = x[i2];
3307: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3308: v1 += 2;
3309: }
3310: if (n == sz - 1) {
3311: tmp0 = x[*idx++];
3312: sum1 -= *v1 * tmp0;
3313: v1++;
3314: }
3315: t[row] = sum1;
3316: sz = ii[row + 1] - diag[row] - 1;
3317: idx = a->j + diag[row] + 1;
3318: v1 += 1;
3319: for (n = 0; n < sz - 1; n += 2) {
3320: i1 = idx[0];
3321: i2 = idx[1];
3322: idx += 2;
3323: tmp0 = x[i1];
3324: tmp1 = x[i2];
3325: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3326: v1 += 2;
3327: }
3328: if (n == sz - 1) {
3329: tmp0 = x[*idx++];
3330: sum1 -= *v1 * tmp0;
3331: }
3332: /* in MatSOR_SeqAIJ this line would be
3333: *
3334: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3335: *
3336: * but omega == 1, so this becomes
3337: *
3338: * x[row] = sum1*(*ibdiag++);
3339: *
3340: */
3341: x[row] = sum1 * (*ibdiag);
3342: break;
3343: case 2:
3344: v2 = a->a + ii[row + 1];
3345: sum1 = b[row];
3346: sum2 = b[row + 1];
3347: for (n = 0; n < sz - 1; n += 2) {
3348: i1 = idx[0];
3349: i2 = idx[1];
3350: idx += 2;
3351: tmp0 = x[i1];
3352: tmp1 = x[i2];
3353: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3354: v1 += 2;
3355: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3356: v2 += 2;
3357: }
3358: if (n == sz - 1) {
3359: tmp0 = x[*idx++];
3360: sum1 -= v1[0] * tmp0;
3361: sum2 -= v2[0] * tmp0;
3362: v1++;
3363: v2++;
3364: }
3365: t[row] = sum1;
3366: t[row + 1] = sum2;
3367: sz = ii[row + 1] - diag[row] - 2;
3368: idx = a->j + diag[row] + 2;
3369: v1 += 2;
3370: v2 += 2;
3371: for (n = 0; n < sz - 1; n += 2) {
3372: i1 = idx[0];
3373: i2 = idx[1];
3374: idx += 2;
3375: tmp0 = x[i1];
3376: tmp1 = x[i2];
3377: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3378: v1 += 2;
3379: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3380: v2 += 2;
3381: }
3382: if (n == sz - 1) {
3383: tmp0 = x[*idx];
3384: sum1 -= v1[0] * tmp0;
3385: sum2 -= v2[0] * tmp0;
3386: }
3387: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3388: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3389: break;
3390: case 3:
3391: v2 = a->a + ii[row + 1];
3392: v3 = a->a + ii[row + 2];
3393: sum1 = b[row];
3394: sum2 = b[row + 1];
3395: sum3 = b[row + 2];
3396: for (n = 0; n < sz - 1; n += 2) {
3397: i1 = idx[0];
3398: i2 = idx[1];
3399: idx += 2;
3400: tmp0 = x[i1];
3401: tmp1 = x[i2];
3402: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3403: v1 += 2;
3404: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3405: v2 += 2;
3406: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3407: v3 += 2;
3408: }
3409: if (n == sz - 1) {
3410: tmp0 = x[*idx++];
3411: sum1 -= v1[0] * tmp0;
3412: sum2 -= v2[0] * tmp0;
3413: sum3 -= v3[0] * tmp0;
3414: v1++;
3415: v2++;
3416: v3++;
3417: }
3418: t[row] = sum1;
3419: t[row + 1] = sum2;
3420: t[row + 2] = sum3;
3421: sz = ii[row + 1] - diag[row] - 3;
3422: idx = a->j + diag[row] + 3;
3423: v1 += 3;
3424: v2 += 3;
3425: v3 += 3;
3426: for (n = 0; n < sz - 1; n += 2) {
3427: i1 = idx[0];
3428: i2 = idx[1];
3429: idx += 2;
3430: tmp0 = x[i1];
3431: tmp1 = x[i2];
3432: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3433: v1 += 2;
3434: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3435: v2 += 2;
3436: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3437: v3 += 2;
3438: }
3439: if (n == sz - 1) {
3440: tmp0 = x[*idx];
3441: sum1 -= v1[0] * tmp0;
3442: sum2 -= v2[0] * tmp0;
3443: sum3 -= v3[0] * tmp0;
3444: }
3445: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3446: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3447: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3448: break;
3449: case 4:
3450: v2 = a->a + ii[row + 1];
3451: v3 = a->a + ii[row + 2];
3452: v4 = a->a + ii[row + 3];
3453: sum1 = b[row];
3454: sum2 = b[row + 1];
3455: sum3 = b[row + 2];
3456: sum4 = b[row + 3];
3457: for (n = 0; n < sz - 1; n += 2) {
3458: i1 = idx[0];
3459: i2 = idx[1];
3460: idx += 2;
3461: tmp0 = x[i1];
3462: tmp1 = x[i2];
3463: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3464: v1 += 2;
3465: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3466: v2 += 2;
3467: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3468: v3 += 2;
3469: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3470: v4 += 2;
3471: }
3472: if (n == sz - 1) {
3473: tmp0 = x[*idx++];
3474: sum1 -= v1[0] * tmp0;
3475: sum2 -= v2[0] * tmp0;
3476: sum3 -= v3[0] * tmp0;
3477: sum4 -= v4[0] * tmp0;
3478: v1++;
3479: v2++;
3480: v3++;
3481: v4++;
3482: }
3483: t[row] = sum1;
3484: t[row + 1] = sum2;
3485: t[row + 2] = sum3;
3486: t[row + 3] = sum4;
3487: sz = ii[row + 1] - diag[row] - 4;
3488: idx = a->j + diag[row] + 4;
3489: v1 += 4;
3490: v2 += 4;
3491: v3 += 4;
3492: v4 += 4;
3493: for (n = 0; n < sz - 1; n += 2) {
3494: i1 = idx[0];
3495: i2 = idx[1];
3496: idx += 2;
3497: tmp0 = x[i1];
3498: tmp1 = x[i2];
3499: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3500: v1 += 2;
3501: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3502: v2 += 2;
3503: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3504: v3 += 2;
3505: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3506: v4 += 2;
3507: }
3508: if (n == sz - 1) {
3509: tmp0 = x[*idx];
3510: sum1 -= v1[0] * tmp0;
3511: sum2 -= v2[0] * tmp0;
3512: sum3 -= v3[0] * tmp0;
3513: sum4 -= v4[0] * tmp0;
3514: }
3515: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3516: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3517: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3518: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3519: break;
3520: case 5:
3521: v2 = a->a + ii[row + 1];
3522: v3 = a->a + ii[row + 2];
3523: v4 = a->a + ii[row + 3];
3524: v5 = a->a + ii[row + 4];
3525: sum1 = b[row];
3526: sum2 = b[row + 1];
3527: sum3 = b[row + 2];
3528: sum4 = b[row + 3];
3529: sum5 = b[row + 4];
3530: for (n = 0; n < sz - 1; n += 2) {
3531: i1 = idx[0];
3532: i2 = idx[1];
3533: idx += 2;
3534: tmp0 = x[i1];
3535: tmp1 = x[i2];
3536: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3537: v1 += 2;
3538: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3539: v2 += 2;
3540: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3541: v3 += 2;
3542: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3543: v4 += 2;
3544: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3545: v5 += 2;
3546: }
3547: if (n == sz - 1) {
3548: tmp0 = x[*idx++];
3549: sum1 -= v1[0] * tmp0;
3550: sum2 -= v2[0] * tmp0;
3551: sum3 -= v3[0] * tmp0;
3552: sum4 -= v4[0] * tmp0;
3553: sum5 -= v5[0] * tmp0;
3554: v1++;
3555: v2++;
3556: v3++;
3557: v4++;
3558: v5++;
3559: }
3560: t[row] = sum1;
3561: t[row + 1] = sum2;
3562: t[row + 2] = sum3;
3563: t[row + 3] = sum4;
3564: t[row + 4] = sum5;
3565: sz = ii[row + 1] - diag[row] - 5;
3566: idx = a->j + diag[row] + 5;
3567: v1 += 5;
3568: v2 += 5;
3569: v3 += 5;
3570: v4 += 5;
3571: v5 += 5;
3572: for (n = 0; n < sz - 1; n += 2) {
3573: i1 = idx[0];
3574: i2 = idx[1];
3575: idx += 2;
3576: tmp0 = x[i1];
3577: tmp1 = x[i2];
3578: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3579: v1 += 2;
3580: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3581: v2 += 2;
3582: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3583: v3 += 2;
3584: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3585: v4 += 2;
3586: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3587: v5 += 2;
3588: }
3589: if (n == sz - 1) {
3590: tmp0 = x[*idx];
3591: sum1 -= v1[0] * tmp0;
3592: sum2 -= v2[0] * tmp0;
3593: sum3 -= v3[0] * tmp0;
3594: sum4 -= v4[0] * tmp0;
3595: sum5 -= v5[0] * tmp0;
3596: }
3597: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3598: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3599: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3600: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3601: x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3602: break;
3603: default:
3604: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3605: }
3606: }
3607: xb = t;
3608: PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3609: } else xb = b;
3611: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3612: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3613: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3614: nodesz = sizes[i + 1] - sizes[i];
3615: ibdiag -= nodesz * nodesz;
3617: /* set RHS */
3618: if (xb == b) {
3619: /* whole (old way) */
3620: sz = ii[row + 1] - ii[row];
3621: idx = a->j + ii[row];
3622: switch (nodesz) {
3623: case 5:
3624: v5 = a->a + ii[row - 4]; /* fall through */
3625: case 4:
3626: v4 = a->a + ii[row - 3]; /* fall through */
3627: case 3:
3628: v3 = a->a + ii[row - 2]; /* fall through */
3629: case 2:
3630: v2 = a->a + ii[row - 1]; /* fall through */
3631: case 1:
3632: v1 = a->a + ii[row];
3633: break;
3634: default:
3635: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3636: }
3637: } else {
3638: /* upper, no diag */
3639: sz = ii[row + 1] - diag[row] - 1;
3640: idx = a->j + diag[row] + 1;
3641: switch (nodesz) {
3642: case 5:
3643: v5 = a->a + diag[row - 4] + 5; /* fall through */
3644: case 4:
3645: v4 = a->a + diag[row - 3] + 4; /* fall through */
3646: case 3:
3647: v3 = a->a + diag[row - 2] + 3; /* fall through */
3648: case 2:
3649: v2 = a->a + diag[row - 1] + 2; /* fall through */
3650: case 1:
3651: v1 = a->a + diag[row] + 1;
3652: }
3653: }
3654: /* set sum */
3655: switch (nodesz) {
3656: case 5:
3657: sum5 = xb[row - 4]; /* fall through */
3658: case 4:
3659: sum4 = xb[row - 3]; /* fall through */
3660: case 3:
3661: sum3 = xb[row - 2]; /* fall through */
3662: case 2:
3663: sum2 = xb[row - 1]; /* fall through */
3664: case 1:
3665: /* note that sum1 is associated with the last row */
3666: sum1 = xb[row];
3667: }
3668: /* do sums */
3669: for (n = 0; n < sz - 1; n += 2) {
3670: i1 = idx[0];
3671: i2 = idx[1];
3672: idx += 2;
3673: tmp0 = x[i1];
3674: tmp1 = x[i2];
3675: switch (nodesz) {
3676: case 5:
3677: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3678: v5 += 2; /* fall through */
3679: case 4:
3680: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3681: v4 += 2; /* fall through */
3682: case 3:
3683: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3684: v3 += 2; /* fall through */
3685: case 2:
3686: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3687: v2 += 2; /* fall through */
3688: case 1:
3689: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3690: v1 += 2;
3691: }
3692: }
3693: /* ragged edge */
3694: if (n == sz - 1) {
3695: tmp0 = x[*idx];
3696: switch (nodesz) {
3697: case 5:
3698: sum5 -= *v5 * tmp0; /* fall through */
3699: case 4:
3700: sum4 -= *v4 * tmp0; /* fall through */
3701: case 3:
3702: sum3 -= *v3 * tmp0; /* fall through */
3703: case 2:
3704: sum2 -= *v2 * tmp0; /* fall through */
3705: case 1:
3706: sum1 -= *v1 * tmp0;
3707: }
3708: }
3709: /* update */
3710: if (xb == b) {
3711: /* whole (old way) w/ diag */
3712: switch (nodesz) {
3713: case 5:
3714: x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3715: x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3716: x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3717: x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3718: x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3719: break;
3720: case 4:
3721: x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3722: x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3723: x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3724: x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3725: break;
3726: case 3:
3727: x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3728: x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3729: x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3730: break;
3731: case 2:
3732: x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3733: x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3734: break;
3735: case 1:
3736: x[row--] += sum1 * (*ibdiag);
3737: break;
3738: }
3739: } else {
3740: /* no diag so set = */
3741: switch (nodesz) {
3742: case 5:
3743: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3744: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3745: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3746: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3747: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3748: break;
3749: case 4:
3750: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3751: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3752: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3753: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3754: break;
3755: case 3:
3756: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3757: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3758: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3759: break;
3760: case 2:
3761: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3762: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3763: break;
3764: case 1:
3765: x[row--] = sum1 * (*ibdiag);
3766: break;
3767: }
3768: }
3769: }
3770: if (xb == b) {
3771: PetscCall(PetscLogFlops(2.0 * a->nz));
3772: } else {
3773: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3774: }
3775: }
3776: }
3777: if (flag & SOR_EISENSTAT) {
3778: /*
3779: Apply (U + D)^-1 where D is now the block diagonal
3780: */
3781: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3782: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3783: nodesz = sizes[i + 1] - sizes[i];
3784: ibdiag -= nodesz * nodesz;
3785: sz = ii[row + 1] - diag[row] - 1;
3786: v1 = a->a + diag[row] + 1;
3787: idx = a->j + diag[row] + 1;
3788: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3789: switch (nodesz) {
3790: case 1:
3792: sum1 = b[row];
3793: for (n = 0; n < sz - 1; n += 2) {
3794: i1 = idx[0];
3795: i2 = idx[1];
3796: idx += 2;
3797: tmp0 = x[i1];
3798: tmp1 = x[i2];
3799: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3800: v1 += 2;
3801: }
3803: if (n == sz - 1) {
3804: tmp0 = x[*idx];
3805: sum1 -= *v1 * tmp0;
3806: }
3807: x[row] = sum1 * (*ibdiag);
3808: row--;
3809: break;
3811: case 2:
3813: sum1 = b[row];
3814: sum2 = b[row - 1];
3815: /* note that sum1 is associated with the second of the two rows */
3816: v2 = a->a + diag[row - 1] + 2;
3817: for (n = 0; n < sz - 1; n += 2) {
3818: i1 = idx[0];
3819: i2 = idx[1];
3820: idx += 2;
3821: tmp0 = x[i1];
3822: tmp1 = x[i2];
3823: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3824: v1 += 2;
3825: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3826: v2 += 2;
3827: }
3829: if (n == sz - 1) {
3830: tmp0 = x[*idx];
3831: sum1 -= *v1 * tmp0;
3832: sum2 -= *v2 * tmp0;
3833: }
3834: x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3835: x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3836: row -= 2;
3837: break;
3838: case 3:
3840: sum1 = b[row];
3841: sum2 = b[row - 1];
3842: sum3 = b[row - 2];
3843: v2 = a->a + diag[row - 1] + 2;
3844: v3 = a->a + diag[row - 2] + 3;
3845: for (n = 0; n < sz - 1; n += 2) {
3846: i1 = idx[0];
3847: i2 = idx[1];
3848: idx += 2;
3849: tmp0 = x[i1];
3850: tmp1 = x[i2];
3851: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3852: v1 += 2;
3853: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3854: v2 += 2;
3855: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3856: v3 += 2;
3857: }
3859: if (n == sz - 1) {
3860: tmp0 = x[*idx];
3861: sum1 -= *v1 * tmp0;
3862: sum2 -= *v2 * tmp0;
3863: sum3 -= *v3 * tmp0;
3864: }
3865: x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3866: x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3867: x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3868: row -= 3;
3869: break;
3870: case 4:
3872: sum1 = b[row];
3873: sum2 = b[row - 1];
3874: sum3 = b[row - 2];
3875: sum4 = b[row - 3];
3876: v2 = a->a + diag[row - 1] + 2;
3877: v3 = a->a + diag[row - 2] + 3;
3878: v4 = a->a + diag[row - 3] + 4;
3879: for (n = 0; n < sz - 1; n += 2) {
3880: i1 = idx[0];
3881: i2 = idx[1];
3882: idx += 2;
3883: tmp0 = x[i1];
3884: tmp1 = x[i2];
3885: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3886: v1 += 2;
3887: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3888: v2 += 2;
3889: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3890: v3 += 2;
3891: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3892: v4 += 2;
3893: }
3895: if (n == sz - 1) {
3896: tmp0 = x[*idx];
3897: sum1 -= *v1 * tmp0;
3898: sum2 -= *v2 * tmp0;
3899: sum3 -= *v3 * tmp0;
3900: sum4 -= *v4 * tmp0;
3901: }
3902: x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3903: x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3904: x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3905: x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3906: row -= 4;
3907: break;
3908: case 5:
3910: sum1 = b[row];
3911: sum2 = b[row - 1];
3912: sum3 = b[row - 2];
3913: sum4 = b[row - 3];
3914: sum5 = b[row - 4];
3915: v2 = a->a + diag[row - 1] + 2;
3916: v3 = a->a + diag[row - 2] + 3;
3917: v4 = a->a + diag[row - 3] + 4;
3918: v5 = a->a + diag[row - 4] + 5;
3919: for (n = 0; n < sz - 1; n += 2) {
3920: i1 = idx[0];
3921: i2 = idx[1];
3922: idx += 2;
3923: tmp0 = x[i1];
3924: tmp1 = x[i2];
3925: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3926: v1 += 2;
3927: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3928: v2 += 2;
3929: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3930: v3 += 2;
3931: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3932: v4 += 2;
3933: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3934: v5 += 2;
3935: }
3937: if (n == sz - 1) {
3938: tmp0 = x[*idx];
3939: sum1 -= *v1 * tmp0;
3940: sum2 -= *v2 * tmp0;
3941: sum3 -= *v3 * tmp0;
3942: sum4 -= *v4 * tmp0;
3943: sum5 -= *v5 * tmp0;
3944: }
3945: x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3946: x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3947: x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3948: x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3949: x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3950: row -= 5;
3951: break;
3952: default:
3953: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3954: }
3955: }
3956: PetscCall(PetscLogFlops(a->nz));
3958: /*
3959: t = b - D x where D is the block diagonal
3960: */
3961: cnt = 0;
3962: for (i = 0, row = 0; i < m; i++) {
3963: nodesz = sizes[i + 1] - sizes[i];
3964: switch (nodesz) {
3965: case 1:
3966: t[row] = b[row] - bdiag[cnt++] * x[row];
3967: row++;
3968: break;
3969: case 2:
3970: x1 = x[row];
3971: x2 = x[row + 1];
3972: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3973: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3974: t[row] = b[row] - tmp1;
3975: t[row + 1] = b[row + 1] - tmp2;
3976: row += 2;
3977: cnt += 4;
3978: break;
3979: case 3:
3980: x1 = x[row];
3981: x2 = x[row + 1];
3982: x3 = x[row + 2];
3983: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3984: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3985: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3986: t[row] = b[row] - tmp1;
3987: t[row + 1] = b[row + 1] - tmp2;
3988: t[row + 2] = b[row + 2] - tmp3;
3989: row += 3;
3990: cnt += 9;
3991: break;
3992: case 4:
3993: x1 = x[row];
3994: x2 = x[row + 1];
3995: x3 = x[row + 2];
3996: x4 = x[row + 3];
3997: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3998: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3999: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4000: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4001: t[row] = b[row] - tmp1;
4002: t[row + 1] = b[row + 1] - tmp2;
4003: t[row + 2] = b[row + 2] - tmp3;
4004: t[row + 3] = b[row + 3] - tmp4;
4005: row += 4;
4006: cnt += 16;
4007: break;
4008: case 5:
4009: x1 = x[row];
4010: x2 = x[row + 1];
4011: x3 = x[row + 2];
4012: x4 = x[row + 3];
4013: x5 = x[row + 4];
4014: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4015: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4016: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4017: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4018: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4019: t[row] = b[row] - tmp1;
4020: t[row + 1] = b[row + 1] - tmp2;
4021: t[row + 2] = b[row + 2] - tmp3;
4022: t[row + 3] = b[row + 3] - tmp4;
4023: t[row + 4] = b[row + 4] - tmp5;
4024: row += 5;
4025: cnt += 25;
4026: break;
4027: default:
4028: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4029: }
4030: }
4031: PetscCall(PetscLogFlops(m));
4033: /*
4034: Apply (L + D)^-1 where D is the block diagonal
4035: */
4036: for (i = 0, row = 0; i < m; i++) {
4037: nodesz = sizes[i + 1] - sizes[i];
4038: sz = diag[row] - ii[row];
4039: v1 = a->a + ii[row];
4040: idx = a->j + ii[row];
4041: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4042: switch (nodesz) {
4043: case 1:
4045: sum1 = t[row];
4046: for (n = 0; n < sz - 1; n += 2) {
4047: i1 = idx[0];
4048: i2 = idx[1];
4049: idx += 2;
4050: tmp0 = t[i1];
4051: tmp1 = t[i2];
4052: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4053: v1 += 2;
4054: }
4056: if (n == sz - 1) {
4057: tmp0 = t[*idx];
4058: sum1 -= *v1 * tmp0;
4059: }
4060: x[row] += t[row] = sum1 * (*ibdiag++);
4061: row++;
4062: break;
4063: case 2:
4064: v2 = a->a + ii[row + 1];
4065: sum1 = t[row];
4066: sum2 = t[row + 1];
4067: for (n = 0; n < sz - 1; n += 2) {
4068: i1 = idx[0];
4069: i2 = idx[1];
4070: idx += 2;
4071: tmp0 = t[i1];
4072: tmp1 = t[i2];
4073: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4074: v1 += 2;
4075: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4076: v2 += 2;
4077: }
4079: if (n == sz - 1) {
4080: tmp0 = t[*idx];
4081: sum1 -= v1[0] * tmp0;
4082: sum2 -= v2[0] * tmp0;
4083: }
4084: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4085: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4086: ibdiag += 4;
4087: row += 2;
4088: break;
4089: case 3:
4090: v2 = a->a + ii[row + 1];
4091: v3 = a->a + ii[row + 2];
4092: sum1 = t[row];
4093: sum2 = t[row + 1];
4094: sum3 = t[row + 2];
4095: for (n = 0; n < sz - 1; n += 2) {
4096: i1 = idx[0];
4097: i2 = idx[1];
4098: idx += 2;
4099: tmp0 = t[i1];
4100: tmp1 = t[i2];
4101: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4102: v1 += 2;
4103: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4104: v2 += 2;
4105: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4106: v3 += 2;
4107: }
4109: if (n == sz - 1) {
4110: tmp0 = t[*idx];
4111: sum1 -= v1[0] * tmp0;
4112: sum2 -= v2[0] * tmp0;
4113: sum3 -= v3[0] * tmp0;
4114: }
4115: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4116: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4117: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4118: ibdiag += 9;
4119: row += 3;
4120: break;
4121: case 4:
4122: v2 = a->a + ii[row + 1];
4123: v3 = a->a + ii[row + 2];
4124: v4 = a->a + ii[row + 3];
4125: sum1 = t[row];
4126: sum2 = t[row + 1];
4127: sum3 = t[row + 2];
4128: sum4 = t[row + 3];
4129: for (n = 0; n < sz - 1; n += 2) {
4130: i1 = idx[0];
4131: i2 = idx[1];
4132: idx += 2;
4133: tmp0 = t[i1];
4134: tmp1 = t[i2];
4135: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4136: v1 += 2;
4137: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4138: v2 += 2;
4139: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4140: v3 += 2;
4141: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4142: v4 += 2;
4143: }
4145: if (n == sz - 1) {
4146: tmp0 = t[*idx];
4147: sum1 -= v1[0] * tmp0;
4148: sum2 -= v2[0] * tmp0;
4149: sum3 -= v3[0] * tmp0;
4150: sum4 -= v4[0] * tmp0;
4151: }
4152: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4153: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4154: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4155: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4156: ibdiag += 16;
4157: row += 4;
4158: break;
4159: case 5:
4160: v2 = a->a + ii[row + 1];
4161: v3 = a->a + ii[row + 2];
4162: v4 = a->a + ii[row + 3];
4163: v5 = a->a + ii[row + 4];
4164: sum1 = t[row];
4165: sum2 = t[row + 1];
4166: sum3 = t[row + 2];
4167: sum4 = t[row + 3];
4168: sum5 = t[row + 4];
4169: for (n = 0; n < sz - 1; n += 2) {
4170: i1 = idx[0];
4171: i2 = idx[1];
4172: idx += 2;
4173: tmp0 = t[i1];
4174: tmp1 = t[i2];
4175: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4176: v1 += 2;
4177: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4178: v2 += 2;
4179: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4180: v3 += 2;
4181: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4182: v4 += 2;
4183: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4184: v5 += 2;
4185: }
4187: if (n == sz - 1) {
4188: tmp0 = t[*idx];
4189: sum1 -= v1[0] * tmp0;
4190: sum2 -= v2[0] * tmp0;
4191: sum3 -= v3[0] * tmp0;
4192: sum4 -= v4[0] * tmp0;
4193: sum5 -= v5[0] * tmp0;
4194: }
4195: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4196: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4197: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4198: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4199: x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4200: ibdiag += 25;
4201: row += 5;
4202: break;
4203: default:
4204: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4205: }
4206: }
4207: PetscCall(PetscLogFlops(a->nz));
4208: }
4209: PetscCall(VecRestoreArray(xx, &x));
4210: PetscCall(VecRestoreArrayRead(bb, &b));
4211: PetscFunctionReturn(PETSC_SUCCESS);
4212: }
4214: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4215: {
4216: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4217: PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4218: const MatScalar *bdiag = a->inode.bdiag;
4219: const PetscScalar *b;
4220: PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz;
4221: const PetscInt *sizes = a->inode.size_csr;
4223: PetscFunctionBegin;
4224: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4225: PetscCall(VecGetArray(xx, &x));
4226: PetscCall(VecGetArrayRead(bb, &b));
4227: cnt = 0;
4228: for (i = 0, row = 0; i < m; i++) {
4229: nodesz = sizes[i + 1] - sizes[i];
4230: switch (nodesz) {
4231: case 1:
4232: x[row] = b[row] * bdiag[cnt++];
4233: row++;
4234: break;
4235: case 2:
4236: x1 = b[row];
4237: x2 = b[row + 1];
4238: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4239: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4240: x[row++] = tmp1;
4241: x[row++] = tmp2;
4242: cnt += 4;
4243: break;
4244: case 3:
4245: x1 = b[row];
4246: x2 = b[row + 1];
4247: x3 = b[row + 2];
4248: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4249: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4250: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4251: x[row++] = tmp1;
4252: x[row++] = tmp2;
4253: x[row++] = tmp3;
4254: cnt += 9;
4255: break;
4256: case 4:
4257: x1 = b[row];
4258: x2 = b[row + 1];
4259: x3 = b[row + 2];
4260: x4 = b[row + 3];
4261: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4262: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4263: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4264: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4265: x[row++] = tmp1;
4266: x[row++] = tmp2;
4267: x[row++] = tmp3;
4268: x[row++] = tmp4;
4269: cnt += 16;
4270: break;
4271: case 5:
4272: x1 = b[row];
4273: x2 = b[row + 1];
4274: x3 = b[row + 2];
4275: x4 = b[row + 3];
4276: x5 = b[row + 4];
4277: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4278: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4279: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4280: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4281: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4282: x[row++] = tmp1;
4283: x[row++] = tmp2;
4284: x[row++] = tmp3;
4285: x[row++] = tmp4;
4286: x[row++] = tmp5;
4287: cnt += 25;
4288: break;
4289: default:
4290: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4291: }
4292: }
4293: PetscCall(PetscLogFlops(2.0 * cnt));
4294: PetscCall(VecRestoreArray(xx, &x));
4295: PetscCall(VecRestoreArrayRead(bb, &b));
4296: PetscFunctionReturn(PETSC_SUCCESS);
4297: }
4299: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4300: {
4301: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4303: PetscFunctionBegin;
4304: a->inode.node_count = 0;
4305: a->inode.use = PETSC_FALSE;
4306: a->inode.checked = PETSC_FALSE;
4307: a->inode.mat_nonzerostate = -1;
4308: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4309: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4310: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4311: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4312: A->ops->coloringpatch = NULL;
4313: A->ops->multdiagonalblock = NULL;
4314: if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4315: PetscFunctionReturn(PETSC_SUCCESS);
4316: }
4318: /*
4319: samestructure indicates that the matrix has not changed its nonzero structure so we
4320: do not need to recompute the inodes
4321: */
4322: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4323: {
4324: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4325: PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
4326: PetscBool flag;
4327: const PetscInt *idx, *idy, *ii;
4329: PetscFunctionBegin;
4330: if (!a->inode.use) {
4331: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4332: PetscCall(PetscFree(a->inode.size_csr));
4333: PetscFunctionReturn(PETSC_SUCCESS);
4334: }
4335: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4337: m = A->rmap->n;
4338: if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
4339: ns = a->inode.size_csr;
4340: ns[0] = 0;
4342: i = 0;
4343: node_count = 0;
4344: idx = a->j;
4345: ii = a->i;
4346: if (idx) {
4347: while (i < m) { /* For each row */
4348: nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4349: /* Limits the number of elements in a node to 'a->inode.limit' */
4350: for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4351: nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4352: if (nzy != nzx) break;
4353: idy += nzx; /* Same nonzero pattern */
4354: PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4355: if (!flag) break;
4356: }
4357: ns[node_count + 1] = ns[node_count] + blk_size;
4358: node_count++;
4359: idx += blk_size * nzx;
4360: i = j;
4361: }
4362: }
4363: /* If not enough inodes found,, do not use inode version of the routines */
4364: if (!m || !idx || node_count > .8 * m) {
4365: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4366: PetscCall(PetscFree(a->inode.size_csr));
4367: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4368: } else {
4369: if (!A->factortype) {
4370: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4371: if (A->rmap->n == A->cmap->n) {
4372: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4373: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4374: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4375: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4376: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4377: }
4378: } else {
4379: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4380: }
4381: a->inode.node_count = node_count;
4382: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4383: }
4384: a->inode.checked = PETSC_TRUE;
4385: a->inode.mat_nonzerostate = A->nonzerostate;
4386: PetscFunctionReturn(PETSC_SUCCESS);
4387: }
4389: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4390: {
4391: Mat B = *C;
4392: Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4393: PetscInt m = A->rmap->n;
4395: PetscFunctionBegin;
4396: c->inode.use = a->inode.use;
4397: c->inode.limit = a->inode.limit;
4398: c->inode.max_limit = a->inode.max_limit;
4399: c->inode.checked = PETSC_FALSE;
4400: c->inode.size_csr = NULL;
4401: c->inode.node_count = 0;
4402: c->inode.ibdiagvalid = PETSC_FALSE;
4403: c->inode.ibdiag = NULL;
4404: c->inode.bdiag = NULL;
4405: c->inode.mat_nonzerostate = -1;
4406: if (a->inode.use) {
4407: if (a->inode.checked && a->inode.size_csr) {
4408: PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4409: PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4411: c->inode.checked = PETSC_TRUE;
4412: c->inode.node_count = a->inode.node_count;
4413: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4414: }
4415: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4416: if (!B->factortype) {
4417: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4418: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4419: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4420: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4421: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4422: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4423: } else {
4424: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4425: }
4426: }
4427: PetscFunctionReturn(PETSC_SUCCESS);
4428: }
4430: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4431: {
4432: PetscInt k;
4433: const PetscInt *vi;
4435: PetscFunctionBegin;
4436: vi = aj + ai[row];
4437: for (k = 0; k < nzl; k++) cols[k] = vi[k];
4438: vi = aj + adiag[row];
4439: cols[nzl] = vi[0];
4440: vi = aj + adiag[row + 1] + 1;
4441: for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4442: PetscFunctionReturn(PETSC_SUCCESS);
4443: }
4444: /*
4445: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4446: Modified from MatSeqAIJCheckInode().
4448: Input Parameters:
4449: . Mat A - ILU or LU matrix factor
4451: */
4452: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4453: {
4454: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4455: PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4456: PetscInt *cols1, *cols2, *ns;
4457: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4458: PetscBool flag;
4460: PetscFunctionBegin;
4461: if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4462: if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4464: m = A->rmap->n;
4465: if (a->inode.size_csr) ns = a->inode.size_csr;
4466: else PetscCall(PetscMalloc1(m + 1, &ns));
4467: ns[0] = 0;
4469: i = 0;
4470: node_count = 0;
4471: PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4472: while (i < m) { /* For each row */
4473: nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4474: nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4475: nzx = nzl1 + nzu1 + 1;
4476: PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4478: /* Limits the number of elements in a node to 'a->inode.limit' */
4479: for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4480: nzl2 = ai[j + 1] - ai[j];
4481: nzu2 = adiag[j] - adiag[j + 1] - 1;
4482: nzy = nzl2 + nzu2 + 1;
4483: if (nzy != nzx) break;
4484: PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4485: PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4486: if (!flag) break;
4487: }
4488: ns[node_count + 1] = ns[node_count] + blk_size;
4489: node_count++;
4490: i = j;
4491: }
4492: PetscCall(PetscFree2(cols1, cols2));
4493: /* If not enough inodes found,, do not use inode version of the routines */
4494: if (!m || node_count > .8 * m) {
4495: PetscCall(PetscFree(ns));
4497: a->inode.node_count = 0;
4498: a->inode.size_csr = NULL;
4499: a->inode.use = PETSC_FALSE;
4501: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4502: } else {
4503: A->ops->mult = NULL;
4504: A->ops->sor = NULL;
4505: A->ops->multadd = NULL;
4506: A->ops->getrowij = NULL;
4507: A->ops->restorerowij = NULL;
4508: A->ops->getcolumnij = NULL;
4509: A->ops->restorecolumnij = NULL;
4510: A->ops->coloringpatch = NULL;
4511: A->ops->multdiagonalblock = NULL;
4512: a->inode.node_count = node_count;
4513: a->inode.size_csr = ns;
4514: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4515: }
4516: a->inode.checked = PETSC_TRUE;
4517: PetscFunctionReturn(PETSC_SUCCESS);
4518: }
4520: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4521: {
4522: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4524: PetscFunctionBegin;
4525: a->inode.ibdiagvalid = PETSC_FALSE;
4526: PetscFunctionReturn(PETSC_SUCCESS);
4527: }
4529: /*
4530: This is really ugly. if inodes are used this replaces the
4531: permutations with ones that correspond to rows/cols of the matrix
4532: rather than inode blocks
4533: */
4534: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4535: {
4536: PetscFunctionBegin;
4537: PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4538: PetscFunctionReturn(PETSC_SUCCESS);
4539: }
4541: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4542: {
4543: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4544: PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4545: const PetscInt *ridx, *cidx;
4546: PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4547: PetscInt nslim_col, *ns_col;
4548: IS ris = *rperm, cis = *cperm;
4550: PetscFunctionBegin;
4551: if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */
4552: if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4554: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4555: PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4556: PetscCall(PetscMalloc2(m, &permr, n, &permc));
4558: PetscCall(ISGetIndices(ris, &ridx));
4559: PetscCall(ISGetIndices(cis, &cidx));
4561: /* Form the inode structure for the rows of permuted matrix using inv perm*/
4562: for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4564: /* Construct the permutations for rows*/
4565: for (i = 0, row = 0; i < nslim_row; ++i) {
4566: indx = ridx[i];
4567: start_val = tns[indx];
4568: end_val = tns[indx + 1];
4569: for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4570: }
4572: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4573: for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4575: /* Construct permutations for columns */
4576: for (i = 0, col = 0; i < nslim_col; ++i) {
4577: indx = cidx[i];
4578: start_val = tns[indx];
4579: end_val = tns[indx + 1];
4580: for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4581: }
4583: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4584: PetscCall(ISSetPermutation(*rperm));
4585: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4586: PetscCall(ISSetPermutation(*cperm));
4588: PetscCall(ISRestoreIndices(ris, &ridx));
4589: PetscCall(ISRestoreIndices(cis, &cidx));
4591: PetscCall(PetscFree(ns_col));
4592: PetscCall(PetscFree2(permr, permc));
4593: PetscCall(ISDestroy(&cis));
4594: PetscCall(ISDestroy(&ris));
4595: PetscCall(PetscFree(tns));
4596: PetscFunctionReturn(PETSC_SUCCESS);
4597: }
4599: /*@C
4600: MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4602: Not Collective
4604: Input Parameter:
4605: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4607: Output Parameters:
4608: + node_count - no of inodes present in the matrix.
4609: . sizes - an array of size `node_count`, with the sizes of each inode.
4610: - limit - the max size used to generate the inodes.
4612: Level: advanced
4614: Note:
4615: It should be called after the matrix is assembled.
4616: The contents of the sizes[] array should not be changed.
4617: `NULL` may be passed for information not needed
4619: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4620: @*/
4621: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4622: {
4623: PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4625: PetscFunctionBegin;
4626: PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4627: PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4628: if (f) PetscCall((*f)(A, node_count, sizes, limit));
4629: PetscFunctionReturn(PETSC_SUCCESS);
4630: }
4632: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4633: {
4634: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4636: PetscFunctionBegin;
4637: if (node_count) *node_count = a->inode.node_count;
4638: if (sizes) *sizes = a->inode.size_csr;
4639: if (limit) *limit = a->inode.limit;
4640: PetscFunctionReturn(PETSC_SUCCESS);
4641: }