Actual source code: inode.c
1: /*
2: This file provides high performance routines for the Inode format (compressed sparse row)
3: by taking advantage of rows with identical nonzero structure (I-nodes).
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #if defined(PETSC_HAVE_XMMINTRIN_H)
7: #include <xmmintrin.h>
8: #endif
10: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11: {
12: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13: PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
15: PetscFunctionBegin;
16: n = A->cmap->n;
17: m = A->rmap->n;
18: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19: ns_row = a->inode.size_csr;
21: min_mn = (m < n) ? m : n;
22: if (!ns) {
23: for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24: for (; count + 1 < n; count++, i++);
25: if (count < n) i++;
26: *size = i;
27: PetscFunctionReturn(PETSC_SUCCESS);
28: }
29: PetscCall(PetscMalloc1(n + 1, &ns_col));
30: ns_col[0] = 0;
32: /* Use the same row structure wherever feasible. */
33: for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
35: /* if m < n; pad up the remainder with inode_limit */
36: for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37: /* The last node is the odd ball. pad it up with the remaining rows; */
38: if (count < n) {
39: ns_col[i + 1] = ns_col[i] + (n - count);
40: i++;
41: } else if (count > n) {
42: /* Adjust for the over estimation */
43: ns_col[i] += n - count;
44: }
45: *size = i;
46: *ns = ns_col;
47: PetscFunctionReturn(PETSC_SUCCESS);
48: }
50: /*
51: This builds symmetric version of nonzero structure,
52: */
53: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54: {
55: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
56: PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57: PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58: const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
60: PetscFunctionBegin;
61: nslim_row = a->inode.node_count;
62: m = A->rmap->n;
63: n = A->cmap->n;
64: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
67: /* Use the row_inode as column_inode */
68: nslim_col = nslim_row;
69: ns_col = ns_row;
71: /* allocate space for reformatted inode structure */
72: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
75: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76: nsz = ns_col[i1 + 1] - ns_col[i1];
77: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78: }
79: /* allocate space for row pointers */
80: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81: *iia = ia;
82: PetscCall(PetscMalloc1(nslim_row + 1, &work));
84: /* determine the number of columns in each row */
85: ia[0] = oshift;
86: for (i1 = 0; i1 < nslim_row; i1++) {
87: row = ns_row[i1];
88: j = aj + ai[row] + ishift;
89: jmax = aj + ai[row + 1] + ishift;
90: if (j == jmax) continue; /* empty row */
91: col = *j++ + ishift;
92: i2 = tvc[col];
93: while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94: ia[i1 + 1]++;
95: ia[i2 + 1]++;
96: i2++; /* Start col of next node */
97: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98: i2 = tvc[col];
99: }
100: if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101: }
103: /* shift ia[i] to point to next row */
104: for (i1 = 1; i1 < nslim_row + 1; i1++) {
105: row = ia[i1 - 1];
106: ia[i1] += row;
107: work[i1 - 1] = row - oshift;
108: }
110: /* allocate space for column pointers */
111: nz = ia[nslim_row] + (!ishift);
112: PetscCall(PetscMalloc1(nz, &ja));
113: *jja = ja;
115: /* loop over lower triangular part putting into ja */
116: for (i1 = 0; i1 < nslim_row; i1++) {
117: row = ns_row[i1];
118: j = aj + ai[row] + ishift;
119: jmax = aj + ai[row + 1] + ishift;
120: if (j == jmax) continue; /* empty row */
121: col = *j++ + ishift;
122: i2 = tvc[col];
123: while (i2 < i1 && j < jmax) {
124: ja[work[i2]++] = i1 + oshift;
125: ja[work[i1]++] = i2 + oshift;
126: ++i2;
127: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128: i2 = tvc[col];
129: }
130: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131: }
132: PetscCall(PetscFree(work));
133: PetscCall(PetscFree2(tns, tvc));
134: PetscFunctionReturn(PETSC_SUCCESS);
135: }
137: /*
138: This builds nonsymmetric version of nonzero structure,
139: */
140: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141: {
142: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
143: PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144: PetscInt *tns, *tvc, nsz, i1, i2;
145: const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
147: PetscFunctionBegin;
148: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149: nslim_row = a->inode.node_count;
150: n = A->cmap->n;
152: /* Create The column_inode for this matrix */
153: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
155: /* allocate space for reformatted column_inode structure */
156: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
159: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160: nsz = ns_col[i1 + 1] - ns_col[i1];
161: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162: }
163: /* allocate space for row pointers */
164: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165: *iia = ia;
166: PetscCall(PetscMalloc1(nslim_row + 1, &work));
168: /* determine the number of columns in each row */
169: ia[0] = oshift;
170: for (i1 = 0; i1 < nslim_row; i1++) {
171: row = ns_row[i1];
172: j = aj + ai[row] + ishift;
173: nz = ai[row + 1] - ai[row];
174: if (!nz) continue; /* empty row */
175: col = *j++ + ishift;
176: i2 = tvc[col];
177: while (nz-- > 0) { /* off-diagonal elements */
178: ia[i1 + 1]++;
179: i2++; /* Start col of next node */
180: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181: if (nz > 0) i2 = tvc[col];
182: }
183: }
185: /* shift ia[i] to point to next row */
186: for (i1 = 1; i1 < nslim_row + 1; i1++) {
187: row = ia[i1 - 1];
188: ia[i1] += row;
189: work[i1 - 1] = row - oshift;
190: }
192: /* allocate space for column pointers */
193: nz = ia[nslim_row] + (!ishift);
194: PetscCall(PetscMalloc1(nz, &ja));
195: *jja = ja;
197: /* loop over matrix putting into ja */
198: for (i1 = 0; i1 < nslim_row; i1++) {
199: row = ns_row[i1];
200: j = aj + ai[row] + ishift;
201: nz = ai[row + 1] - ai[row];
202: if (!nz) continue; /* empty row */
203: col = *j++ + ishift;
204: i2 = tvc[col];
205: while (nz-- > 0) {
206: ja[work[i1]++] = i2 + oshift;
207: ++i2;
208: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209: if (nz > 0) i2 = tvc[col];
210: }
211: }
212: PetscCall(PetscFree(ns_col));
213: PetscCall(PetscFree(work));
214: PetscCall(PetscFree2(tns, tvc));
215: PetscFunctionReturn(PETSC_SUCCESS);
216: }
218: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219: {
220: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
222: PetscFunctionBegin;
223: if (n) *n = a->inode.node_count;
224: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225: if (!blockcompressed) {
226: PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227: } else if (symmetric) {
228: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229: } else {
230: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231: }
232: PetscFunctionReturn(PETSC_SUCCESS);
233: }
235: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236: {
237: PetscFunctionBegin;
238: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
240: if (!blockcompressed) {
241: PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242: } else {
243: PetscCall(PetscFree(*ia));
244: PetscCall(PetscFree(*ja));
245: }
246: PetscFunctionReturn(PETSC_SUCCESS);
247: }
249: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250: {
251: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252: PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253: PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
255: PetscFunctionBegin;
256: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257: nslim_row = a->inode.node_count;
258: n = A->cmap->n;
260: /* Create The column_inode for this matrix */
261: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
263: /* allocate space for reformatted column_inode structure */
264: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
267: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268: nsz = ns_col[i1 + 1] - ns_col[i1];
269: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270: }
271: /* allocate space for column pointers */
272: PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273: *iia = ia;
274: PetscCall(PetscMalloc1(nslim_col + 1, &work));
276: /* determine the number of columns in each row */
277: ia[0] = oshift;
278: for (i1 = 0; i1 < nslim_row; i1++) {
279: row = ns_row[i1];
280: j = aj + ai[row] + ishift;
281: col = *j++ + ishift;
282: i2 = tvc[col];
283: nz = ai[row + 1] - ai[row];
284: while (nz-- > 0) { /* off-diagonal elements */
285: /* ia[i1+1]++; */
286: ia[i2 + 1]++;
287: i2++;
288: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289: if (nz > 0) i2 = tvc[col];
290: }
291: }
293: /* shift ia[i] to point to next col */
294: for (i1 = 1; i1 < nslim_col + 1; i1++) {
295: col = ia[i1 - 1];
296: ia[i1] += col;
297: work[i1 - 1] = col - oshift;
298: }
300: /* allocate space for column pointers */
301: nz = ia[nslim_col] + (!ishift);
302: PetscCall(PetscMalloc1(nz, &ja));
303: *jja = ja;
305: /* loop over matrix putting into ja */
306: for (i1 = 0; i1 < nslim_row; i1++) {
307: row = ns_row[i1];
308: j = aj + ai[row] + ishift;
309: col = *j++ + ishift;
310: i2 = tvc[col];
311: nz = ai[row + 1] - ai[row];
312: while (nz-- > 0) {
313: /* ja[work[i1]++] = i2 + oshift; */
314: ja[work[i2]++] = i1 + oshift;
315: i2++;
316: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317: if (nz > 0) i2 = tvc[col];
318: }
319: }
320: PetscCall(PetscFree(ns_col));
321: PetscCall(PetscFree(work));
322: PetscCall(PetscFree2(tns, tvc));
323: PetscFunctionReturn(PETSC_SUCCESS);
324: }
326: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327: {
328: PetscFunctionBegin;
329: PetscCall(MatCreateColInode_Private(A, n, NULL));
330: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
332: if (!blockcompressed) {
333: PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334: } else if (symmetric) {
335: /* Since the indices are symmetric it doesn't matter */
336: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337: } else {
338: PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339: }
340: PetscFunctionReturn(PETSC_SUCCESS);
341: }
343: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344: {
345: PetscFunctionBegin;
346: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347: if (!blockcompressed) {
348: PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349: } else {
350: PetscCall(PetscFree(*ia));
351: PetscCall(PetscFree(*ja));
352: }
353: PetscFunctionReturn(PETSC_SUCCESS);
354: }
356: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357: {
358: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
359: PetscScalar *y;
360: const PetscScalar *x;
361: PetscInt row, node_max, nonzerorow = 0;
362: PetscInt *ns;
364: PetscFunctionBegin;
365: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366: node_max = a->inode.node_count;
367: ns = a->inode.size_csr; /* Node Size array */
368: PetscCall(VecGetArrayRead(xx, &x));
369: PetscCall(VecGetArray(yy, &y));
371: PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372: for (PetscInt i = 0; i < node_max; ++i) {
373: PetscInt i1, i2, nsz, n, sz;
374: const MatScalar *v1, *v2, *v3, *v4, *v5;
375: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376: const PetscInt *idx;
378: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379: #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380: #endif
381: row = ns[i];
382: nsz = ns[i + 1] - ns[i];
383: n = a->i[row + 1] - a->i[row];
384: nonzerorow += (n > 0) * nsz;
386: idx = &a->j[a->i[row]];
387: v1 = &a->a[a->i[row]];
388: PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
389: PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
390: sz = n; /* No of non zeros in this row */
391: /* Switch on the size of Node */
392: switch (nsz) { /* Each loop in 'case' is unrolled */
393: case 1:
394: sum1 = 0.;
396: for (n = 0; n < sz - 1; n += 2) {
397: i1 = idx[0]; /* The instructions are ordered to */
398: i2 = idx[1]; /* make the compiler's job easy */
399: idx += 2;
400: tmp0 = x[i1];
401: tmp1 = x[i2];
402: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403: v1 += 2;
404: }
406: if (n == sz - 1) { /* Take care of the last nonzero */
407: tmp0 = x[*idx++];
408: sum1 += *v1++ * tmp0;
409: }
410: y[row++] = sum1;
411: break;
412: case 2:
413: sum1 = 0.;
414: sum2 = 0.;
415: v2 = v1 + n;
417: for (n = 0; n < sz - 1; n += 2) {
418: i1 = idx[0];
419: i2 = idx[1];
420: idx += 2;
421: tmp0 = x[i1];
422: tmp1 = x[i2];
423: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424: v1 += 2;
425: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426: v2 += 2;
427: }
428: if (n == sz - 1) {
429: tmp0 = x[*idx++];
430: sum1 += *v1++ * tmp0;
431: sum2 += *v2++ * tmp0;
432: }
433: y[row++] = sum1;
434: y[row++] = sum2;
435: v1 = v2; /* Since the next block to be processed starts there*/
436: idx += sz;
437: break;
438: case 3:
439: sum1 = 0.;
440: sum2 = 0.;
441: sum3 = 0.;
442: v2 = v1 + n;
443: v3 = v2 + n;
445: for (n = 0; n < sz - 1; n += 2) {
446: i1 = idx[0];
447: i2 = idx[1];
448: idx += 2;
449: tmp0 = x[i1];
450: tmp1 = x[i2];
451: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452: v1 += 2;
453: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454: v2 += 2;
455: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456: v3 += 2;
457: }
458: if (n == sz - 1) {
459: tmp0 = x[*idx++];
460: sum1 += *v1++ * tmp0;
461: sum2 += *v2++ * tmp0;
462: sum3 += *v3++ * tmp0;
463: }
464: y[row++] = sum1;
465: y[row++] = sum2;
466: y[row++] = sum3;
467: v1 = v3; /* Since the next block to be processed starts there*/
468: idx += 2 * sz;
469: break;
470: case 4:
471: sum1 = 0.;
472: sum2 = 0.;
473: sum3 = 0.;
474: sum4 = 0.;
475: v2 = v1 + n;
476: v3 = v2 + n;
477: v4 = v3 + n;
479: for (n = 0; n < sz - 1; n += 2) {
480: i1 = idx[0];
481: i2 = idx[1];
482: idx += 2;
483: tmp0 = x[i1];
484: tmp1 = x[i2];
485: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486: v1 += 2;
487: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488: v2 += 2;
489: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490: v3 += 2;
491: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492: v4 += 2;
493: }
494: if (n == sz - 1) {
495: tmp0 = x[*idx++];
496: sum1 += *v1++ * tmp0;
497: sum2 += *v2++ * tmp0;
498: sum3 += *v3++ * tmp0;
499: sum4 += *v4++ * tmp0;
500: }
501: y[row++] = sum1;
502: y[row++] = sum2;
503: y[row++] = sum3;
504: y[row++] = sum4;
505: v1 = v4; /* Since the next block to be processed starts there*/
506: idx += 3 * sz;
507: break;
508: case 5:
509: sum1 = 0.;
510: sum2 = 0.;
511: sum3 = 0.;
512: sum4 = 0.;
513: sum5 = 0.;
514: v2 = v1 + n;
515: v3 = v2 + n;
516: v4 = v3 + n;
517: v5 = v4 + n;
519: for (n = 0; n < sz - 1; n += 2) {
520: i1 = idx[0];
521: i2 = idx[1];
522: idx += 2;
523: tmp0 = x[i1];
524: tmp1 = x[i2];
525: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526: v1 += 2;
527: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528: v2 += 2;
529: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530: v3 += 2;
531: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532: v4 += 2;
533: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534: v5 += 2;
535: }
536: if (n == sz - 1) {
537: tmp0 = x[*idx++];
538: sum1 += *v1++ * tmp0;
539: sum2 += *v2++ * tmp0;
540: sum3 += *v3++ * tmp0;
541: sum4 += *v4++ * tmp0;
542: sum5 += *v5++ * tmp0;
543: }
544: y[row++] = sum1;
545: y[row++] = sum2;
546: y[row++] = sum3;
547: y[row++] = sum4;
548: y[row++] = sum5;
549: v1 = v5; /* Since the next block to be processed starts there */
550: idx += 4 * sz;
551: break;
552: default:
553: SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554: }
555: }
556: PetscCall(VecRestoreArrayRead(xx, &x));
557: PetscCall(VecRestoreArray(yy, &y));
558: PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559: PetscFunctionReturn(PETSC_SUCCESS);
560: }
562: /* Almost same code as the MatMult_SeqAIJ_Inode() */
563: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564: {
565: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
566: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567: const MatScalar *v1, *v2, *v3, *v4, *v5;
568: const PetscScalar *x;
569: PetscScalar *y, *z, *zt;
570: PetscInt i1, i2, n, i, row, node_max, nsz, sz;
571: const PetscInt *idx, *ns, *ii;
573: PetscFunctionBegin;
574: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575: node_max = a->inode.node_count;
576: ns = a->inode.size_csr; /* Node Size array */
578: PetscCall(VecGetArrayRead(xx, &x));
579: PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580: zt = z;
582: idx = a->j;
583: v1 = a->a;
584: ii = a->i;
586: for (i = 0; i < node_max; ++i) {
587: row = ns[i];
588: nsz = ns[i + 1] - ns[i];
589: n = ii[1] - ii[0];
590: ii += nsz;
591: sz = n; /* No of non zeros in this row */
592: /* Switch on the size of Node */
593: switch (nsz) { /* Each loop in 'case' is unrolled */
594: case 1:
595: sum1 = *zt++;
597: for (n = 0; n < sz - 1; n += 2) {
598: i1 = idx[0]; /* The instructions are ordered to */
599: i2 = idx[1]; /* make the compiler's job easy */
600: idx += 2;
601: tmp0 = x[i1];
602: tmp1 = x[i2];
603: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604: v1 += 2;
605: }
607: if (n == sz - 1) { /* Take care of the last nonzero */
608: tmp0 = x[*idx++];
609: sum1 += *v1++ * tmp0;
610: }
611: y[row++] = sum1;
612: break;
613: case 2:
614: sum1 = *zt++;
615: sum2 = *zt++;
616: v2 = v1 + n;
618: for (n = 0; n < sz - 1; n += 2) {
619: i1 = idx[0];
620: i2 = idx[1];
621: idx += 2;
622: tmp0 = x[i1];
623: tmp1 = x[i2];
624: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625: v1 += 2;
626: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627: v2 += 2;
628: }
629: if (n == sz - 1) {
630: tmp0 = x[*idx++];
631: sum1 += *v1++ * tmp0;
632: sum2 += *v2++ * tmp0;
633: }
634: y[row++] = sum1;
635: y[row++] = sum2;
636: v1 = v2; /* Since the next block to be processed starts there*/
637: idx += sz;
638: break;
639: case 3:
640: sum1 = *zt++;
641: sum2 = *zt++;
642: sum3 = *zt++;
643: v2 = v1 + n;
644: v3 = v2 + n;
646: for (n = 0; n < sz - 1; n += 2) {
647: i1 = idx[0];
648: i2 = idx[1];
649: idx += 2;
650: tmp0 = x[i1];
651: tmp1 = x[i2];
652: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653: v1 += 2;
654: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655: v2 += 2;
656: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657: v3 += 2;
658: }
659: if (n == sz - 1) {
660: tmp0 = x[*idx++];
661: sum1 += *v1++ * tmp0;
662: sum2 += *v2++ * tmp0;
663: sum3 += *v3++ * tmp0;
664: }
665: y[row++] = sum1;
666: y[row++] = sum2;
667: y[row++] = sum3;
668: v1 = v3; /* Since the next block to be processed starts there*/
669: idx += 2 * sz;
670: break;
671: case 4:
672: sum1 = *zt++;
673: sum2 = *zt++;
674: sum3 = *zt++;
675: sum4 = *zt++;
676: v2 = v1 + n;
677: v3 = v2 + n;
678: v4 = v3 + n;
680: for (n = 0; n < sz - 1; n += 2) {
681: i1 = idx[0];
682: i2 = idx[1];
683: idx += 2;
684: tmp0 = x[i1];
685: tmp1 = x[i2];
686: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687: v1 += 2;
688: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689: v2 += 2;
690: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691: v3 += 2;
692: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693: v4 += 2;
694: }
695: if (n == sz - 1) {
696: tmp0 = x[*idx++];
697: sum1 += *v1++ * tmp0;
698: sum2 += *v2++ * tmp0;
699: sum3 += *v3++ * tmp0;
700: sum4 += *v4++ * tmp0;
701: }
702: y[row++] = sum1;
703: y[row++] = sum2;
704: y[row++] = sum3;
705: y[row++] = sum4;
706: v1 = v4; /* Since the next block to be processed starts there*/
707: idx += 3 * sz;
708: break;
709: case 5:
710: sum1 = *zt++;
711: sum2 = *zt++;
712: sum3 = *zt++;
713: sum4 = *zt++;
714: sum5 = *zt++;
715: v2 = v1 + n;
716: v3 = v2 + n;
717: v4 = v3 + n;
718: v5 = v4 + n;
720: for (n = 0; n < sz - 1; n += 2) {
721: i1 = idx[0];
722: i2 = idx[1];
723: idx += 2;
724: tmp0 = x[i1];
725: tmp1 = x[i2];
726: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727: v1 += 2;
728: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729: v2 += 2;
730: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731: v3 += 2;
732: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733: v4 += 2;
734: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735: v5 += 2;
736: }
737: if (n == sz - 1) {
738: tmp0 = x[*idx++];
739: sum1 += *v1++ * tmp0;
740: sum2 += *v2++ * tmp0;
741: sum3 += *v3++ * tmp0;
742: sum4 += *v4++ * tmp0;
743: sum5 += *v5++ * tmp0;
744: }
745: y[row++] = sum1;
746: y[row++] = sum2;
747: y[row++] = sum3;
748: y[row++] = sum4;
749: y[row++] = sum5;
750: v1 = v5; /* Since the next block to be processed starts there */
751: idx += 4 * sz;
752: break;
753: default:
754: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755: }
756: }
757: PetscCall(VecRestoreArrayRead(xx, &x));
758: PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759: PetscCall(PetscLogFlops(2.0 * a->nz));
760: PetscFunctionReturn(PETSC_SUCCESS);
761: }
763: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764: {
765: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
766: IS iscol = a->col, isrow = a->row;
767: const PetscInt *r, *c, *rout, *cout;
768: PetscInt i, j, n = A->rmap->n, nz;
769: PetscInt node_max, *ns, row, nsz, aii, i0, i1;
770: const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
772: PetscScalar sum1, sum2, sum3, sum4, sum5;
773: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774: const PetscScalar *b;
776: PetscFunctionBegin;
777: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778: node_max = a->inode.node_count;
779: ns = a->inode.size_csr; /* Node Size array */
781: PetscCall(VecGetArrayRead(bb, &b));
782: PetscCall(VecGetArrayWrite(xx, &x));
783: tmp = a->solve_work;
785: PetscCall(ISGetIndices(isrow, &rout));
786: r = rout;
787: PetscCall(ISGetIndices(iscol, &cout));
788: c = cout + (n - 1);
790: /* forward solve the lower triangular */
791: tmps = tmp;
792: aa = a_a;
793: aj = a_j;
794: ad = a->diag;
796: for (i = 0, row = 0; i < node_max; ++i) {
797: row = ns[i];
798: nsz = ns[i + 1] - ns[i];
799: aii = ai[row];
800: v1 = aa + aii;
801: vi = aj + aii;
802: nz = ad[row] - aii;
803: if (i < node_max - 1) {
804: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805: * but our indexing to determine its size could. */
806: PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808: PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810: }
812: switch (nsz) { /* Each loop in 'case' is unrolled */
813: case 1:
814: sum1 = b[*r++];
815: for (j = 0; j < nz - 1; j += 2) {
816: i0 = vi[0];
817: i1 = vi[1];
818: vi += 2;
819: tmp0 = tmps[i0];
820: tmp1 = tmps[i1];
821: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822: v1 += 2;
823: }
824: if (j == nz - 1) {
825: tmp0 = tmps[*vi++];
826: sum1 -= *v1++ * tmp0;
827: }
828: tmp[row++] = sum1;
829: break;
830: case 2:
831: sum1 = b[*r++];
832: sum2 = b[*r++];
833: v2 = aa + ai[row + 1];
835: for (j = 0; j < nz - 1; j += 2) {
836: i0 = vi[0];
837: i1 = vi[1];
838: vi += 2;
839: tmp0 = tmps[i0];
840: tmp1 = tmps[i1];
841: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842: v1 += 2;
843: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844: v2 += 2;
845: }
846: if (j == nz - 1) {
847: tmp0 = tmps[*vi++];
848: sum1 -= *v1++ * tmp0;
849: sum2 -= *v2++ * tmp0;
850: }
851: sum2 -= *v2++ * sum1;
852: tmp[row++] = sum1;
853: tmp[row++] = sum2;
854: break;
855: case 3:
856: sum1 = b[*r++];
857: sum2 = b[*r++];
858: sum3 = b[*r++];
859: v2 = aa + ai[row + 1];
860: v3 = aa + ai[row + 2];
862: for (j = 0; j < nz - 1; j += 2) {
863: i0 = vi[0];
864: i1 = vi[1];
865: vi += 2;
866: tmp0 = tmps[i0];
867: tmp1 = tmps[i1];
868: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869: v1 += 2;
870: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871: v2 += 2;
872: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873: v3 += 2;
874: }
875: if (j == nz - 1) {
876: tmp0 = tmps[*vi++];
877: sum1 -= *v1++ * tmp0;
878: sum2 -= *v2++ * tmp0;
879: sum3 -= *v3++ * tmp0;
880: }
881: sum2 -= *v2++ * sum1;
882: sum3 -= *v3++ * sum1;
883: sum3 -= *v3++ * sum2;
885: tmp[row++] = sum1;
886: tmp[row++] = sum2;
887: tmp[row++] = sum3;
888: break;
890: case 4:
891: sum1 = b[*r++];
892: sum2 = b[*r++];
893: sum3 = b[*r++];
894: sum4 = b[*r++];
895: v2 = aa + ai[row + 1];
896: v3 = aa + ai[row + 2];
897: v4 = aa + ai[row + 3];
899: for (j = 0; j < nz - 1; j += 2) {
900: i0 = vi[0];
901: i1 = vi[1];
902: vi += 2;
903: tmp0 = tmps[i0];
904: tmp1 = tmps[i1];
905: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906: v1 += 2;
907: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908: v2 += 2;
909: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910: v3 += 2;
911: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912: v4 += 2;
913: }
914: if (j == nz - 1) {
915: tmp0 = tmps[*vi++];
916: sum1 -= *v1++ * tmp0;
917: sum2 -= *v2++ * tmp0;
918: sum3 -= *v3++ * tmp0;
919: sum4 -= *v4++ * tmp0;
920: }
921: sum2 -= *v2++ * sum1;
922: sum3 -= *v3++ * sum1;
923: sum4 -= *v4++ * sum1;
924: sum3 -= *v3++ * sum2;
925: sum4 -= *v4++ * sum2;
926: sum4 -= *v4++ * sum3;
928: tmp[row++] = sum1;
929: tmp[row++] = sum2;
930: tmp[row++] = sum3;
931: tmp[row++] = sum4;
932: break;
933: case 5:
934: sum1 = b[*r++];
935: sum2 = b[*r++];
936: sum3 = b[*r++];
937: sum4 = b[*r++];
938: sum5 = b[*r++];
939: v2 = aa + ai[row + 1];
940: v3 = aa + ai[row + 2];
941: v4 = aa + ai[row + 3];
942: v5 = aa + ai[row + 4];
944: for (j = 0; j < nz - 1; j += 2) {
945: i0 = vi[0];
946: i1 = vi[1];
947: vi += 2;
948: tmp0 = tmps[i0];
949: tmp1 = tmps[i1];
950: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951: v1 += 2;
952: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953: v2 += 2;
954: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955: v3 += 2;
956: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957: v4 += 2;
958: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959: v5 += 2;
960: }
961: if (j == nz - 1) {
962: tmp0 = tmps[*vi++];
963: sum1 -= *v1++ * tmp0;
964: sum2 -= *v2++ * tmp0;
965: sum3 -= *v3++ * tmp0;
966: sum4 -= *v4++ * tmp0;
967: sum5 -= *v5++ * tmp0;
968: }
970: sum2 -= *v2++ * sum1;
971: sum3 -= *v3++ * sum1;
972: sum4 -= *v4++ * sum1;
973: sum5 -= *v5++ * sum1;
974: sum3 -= *v3++ * sum2;
975: sum4 -= *v4++ * sum2;
976: sum5 -= *v5++ * sum2;
977: sum4 -= *v4++ * sum3;
978: sum5 -= *v5++ * sum3;
979: sum5 -= *v5++ * sum4;
981: tmp[row++] = sum1;
982: tmp[row++] = sum2;
983: tmp[row++] = sum3;
984: tmp[row++] = sum4;
985: tmp[row++] = sum5;
986: break;
987: default:
988: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989: }
990: }
991: /* backward solve the upper triangular */
992: for (i = node_max - 1; i >= 0; i--) {
993: row = ns[i + 1];
994: nsz = ns[i + 1] - ns[i];
995: aii = ai[row + 1] - 1;
996: v1 = aa + aii;
997: vi = aj + aii;
998: nz = aii - ad[row];
999: switch (nsz) { /* Each loop in 'case' is unrolled */
1000: case 1:
1001: sum1 = tmp[row];
1003: for (j = nz; j > 1; j -= 2) {
1004: vi -= 2;
1005: i0 = vi[2];
1006: i1 = vi[1];
1007: tmp0 = tmps[i0];
1008: tmp1 = tmps[i1];
1009: v1 -= 2;
1010: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011: }
1012: if (j == 1) {
1013: tmp0 = tmps[*vi--];
1014: sum1 -= *v1-- * tmp0;
1015: }
1016: x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017: row--;
1018: break;
1019: case 2:
1020: sum1 = tmp[row];
1021: sum2 = tmp[row - 1];
1022: v2 = aa + ai[row] - 1;
1023: for (j = nz; j > 1; j -= 2) {
1024: vi -= 2;
1025: i0 = vi[2];
1026: i1 = vi[1];
1027: tmp0 = tmps[i0];
1028: tmp1 = tmps[i1];
1029: v1 -= 2;
1030: v2 -= 2;
1031: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033: }
1034: if (j == 1) {
1035: tmp0 = tmps[*vi--];
1036: sum1 -= *v1-- * tmp0;
1037: sum2 -= *v2-- * tmp0;
1038: }
1040: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041: row--;
1042: sum2 -= *v2-- * tmp0;
1043: x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044: row--;
1045: break;
1046: case 3:
1047: sum1 = tmp[row];
1048: sum2 = tmp[row - 1];
1049: sum3 = tmp[row - 2];
1050: v2 = aa + ai[row] - 1;
1051: v3 = aa + ai[row - 1] - 1;
1052: for (j = nz; j > 1; j -= 2) {
1053: vi -= 2;
1054: i0 = vi[2];
1055: i1 = vi[1];
1056: tmp0 = tmps[i0];
1057: tmp1 = tmps[i1];
1058: v1 -= 2;
1059: v2 -= 2;
1060: v3 -= 2;
1061: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064: }
1065: if (j == 1) {
1066: tmp0 = tmps[*vi--];
1067: sum1 -= *v1-- * tmp0;
1068: sum2 -= *v2-- * tmp0;
1069: sum3 -= *v3-- * tmp0;
1070: }
1071: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072: row--;
1073: sum2 -= *v2-- * tmp0;
1074: sum3 -= *v3-- * tmp0;
1075: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076: row--;
1077: sum3 -= *v3-- * tmp0;
1078: x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079: row--;
1081: break;
1082: case 4:
1083: sum1 = tmp[row];
1084: sum2 = tmp[row - 1];
1085: sum3 = tmp[row - 2];
1086: sum4 = tmp[row - 3];
1087: v2 = aa + ai[row] - 1;
1088: v3 = aa + ai[row - 1] - 1;
1089: v4 = aa + ai[row - 2] - 1;
1091: for (j = nz; j > 1; j -= 2) {
1092: vi -= 2;
1093: i0 = vi[2];
1094: i1 = vi[1];
1095: tmp0 = tmps[i0];
1096: tmp1 = tmps[i1];
1097: v1 -= 2;
1098: v2 -= 2;
1099: v3 -= 2;
1100: v4 -= 2;
1101: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105: }
1106: if (j == 1) {
1107: tmp0 = tmps[*vi--];
1108: sum1 -= *v1-- * tmp0;
1109: sum2 -= *v2-- * tmp0;
1110: sum3 -= *v3-- * tmp0;
1111: sum4 -= *v4-- * tmp0;
1112: }
1114: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115: row--;
1116: sum2 -= *v2-- * tmp0;
1117: sum3 -= *v3-- * tmp0;
1118: sum4 -= *v4-- * tmp0;
1119: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120: row--;
1121: sum3 -= *v3-- * tmp0;
1122: sum4 -= *v4-- * tmp0;
1123: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124: row--;
1125: sum4 -= *v4-- * tmp0;
1126: x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127: row--;
1128: break;
1129: case 5:
1130: sum1 = tmp[row];
1131: sum2 = tmp[row - 1];
1132: sum3 = tmp[row - 2];
1133: sum4 = tmp[row - 3];
1134: sum5 = tmp[row - 4];
1135: v2 = aa + ai[row] - 1;
1136: v3 = aa + ai[row - 1] - 1;
1137: v4 = aa + ai[row - 2] - 1;
1138: v5 = aa + ai[row - 3] - 1;
1139: for (j = nz; j > 1; j -= 2) {
1140: vi -= 2;
1141: i0 = vi[2];
1142: i1 = vi[1];
1143: tmp0 = tmps[i0];
1144: tmp1 = tmps[i1];
1145: v1 -= 2;
1146: v2 -= 2;
1147: v3 -= 2;
1148: v4 -= 2;
1149: v5 -= 2;
1150: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155: }
1156: if (j == 1) {
1157: tmp0 = tmps[*vi--];
1158: sum1 -= *v1-- * tmp0;
1159: sum2 -= *v2-- * tmp0;
1160: sum3 -= *v3-- * tmp0;
1161: sum4 -= *v4-- * tmp0;
1162: sum5 -= *v5-- * tmp0;
1163: }
1165: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166: row--;
1167: sum2 -= *v2-- * tmp0;
1168: sum3 -= *v3-- * tmp0;
1169: sum4 -= *v4-- * tmp0;
1170: sum5 -= *v5-- * tmp0;
1171: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172: row--;
1173: sum3 -= *v3-- * tmp0;
1174: sum4 -= *v4-- * tmp0;
1175: sum5 -= *v5-- * tmp0;
1176: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177: row--;
1178: sum4 -= *v4-- * tmp0;
1179: sum5 -= *v5-- * tmp0;
1180: tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181: row--;
1182: sum5 -= *v5-- * tmp0;
1183: x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184: row--;
1185: break;
1186: default:
1187: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188: }
1189: }
1190: PetscCall(ISRestoreIndices(isrow, &rout));
1191: PetscCall(ISRestoreIndices(iscol, &cout));
1192: PetscCall(VecRestoreArrayRead(bb, &b));
1193: PetscCall(VecRestoreArrayWrite(xx, &x));
1194: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195: PetscFunctionReturn(PETSC_SUCCESS);
1196: }
1198: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199: {
1200: Mat C = B;
1201: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202: IS isrow = b->row, isicol = b->icol;
1203: const PetscInt *r, *ic, *ics;
1204: const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205: PetscInt i, j, k, nz, nzL, row, *pj;
1206: const PetscInt *ajtmp, *bjtmp;
1207: MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208: const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209: FactorShiftCtx sctx;
1210: const PetscInt *ddiag;
1211: PetscReal rs;
1212: MatScalar d;
1213: PetscInt inod, nodesz, node_max, col;
1214: const PetscInt *ns;
1215: PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
1217: PetscFunctionBegin;
1218: /* MatPivotSetUp(): initialize shift context sctx */
1219: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1221: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222: ddiag = a->diag;
1223: sctx.shift_top = info->zeropivot;
1224: for (i = 0; i < n; i++) {
1225: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226: d = (aa)[ddiag[i]];
1227: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228: v = aa + ai[i];
1229: nz = ai[i + 1] - ai[i];
1230: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231: if (rs > sctx.shift_top) sctx.shift_top = rs;
1232: }
1233: sctx.shift_top *= 1.1;
1234: sctx.nshift_max = 5;
1235: sctx.shift_lo = 0.;
1236: sctx.shift_hi = 1.;
1237: }
1239: PetscCall(ISGetIndices(isrow, &r));
1240: PetscCall(ISGetIndices(isicol, &ic));
1242: PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243: ics = ic;
1245: node_max = a->inode.node_count;
1246: ns = a->inode.size_csr;
1247: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1249: /* If max inode size > 4, split it into two inodes.*/
1250: /* also map the inode sizes according to the ordering */
1251: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252: for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253: nodesz = ns[i + 1] - ns[i];
1254: if (nodesz > 4) {
1255: tmp_vec1[j] = 4;
1256: ++j;
1257: tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258: } else {
1259: tmp_vec1[j] = nodesz;
1260: }
1261: }
1262: /* Use the correct node_max */
1263: node_max = j;
1265: /* Now reorder the inode info based on mat re-ordering info */
1266: /* First create a row -> inode_size_array_index map */
1267: PetscCall(PetscMalloc1(n + 1, &nsmap));
1268: PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269: tmp_vec2[0] = 0;
1270: for (i = 0, row = 0; i < node_max; i++) {
1271: nodesz = tmp_vec1[i];
1272: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273: }
1274: /* Using nsmap, create a reordered ns structure */
1275: for (i = 0, j = 0; i < node_max; i++) {
1276: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277: tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278: j += nodesz;
1279: }
1280: PetscCall(PetscFree(nsmap));
1281: PetscCall(PetscFree(tmp_vec1));
1283: /* Now use the correct ns */
1284: ns = tmp_vec2;
1286: do {
1287: sctx.newshift = PETSC_FALSE;
1288: /* Now loop over each block-row, and do the factorization */
1289: for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290: nodesz = ns[inod + 1] - ns[inod];
1292: switch (nodesz) {
1293: case 1:
1294: /* zero rtmp1 */
1295: /* L part */
1296: nz = bi[i + 1] - bi[i];
1297: bjtmp = bj + bi[i];
1298: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1300: /* U part */
1301: nz = bdiag[i] - bdiag[i + 1];
1302: bjtmp = bj + bdiag[i + 1] + 1;
1303: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1305: /* load in initial (unfactored row) */
1306: nz = ai[r[i] + 1] - ai[r[i]];
1307: ajtmp = aj + ai[r[i]];
1308: v = aa + ai[r[i]];
1309: for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1311: /* ZeropivotApply() */
1312: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1314: /* elimination */
1315: bjtmp = bj + bi[i];
1316: row = *bjtmp++;
1317: nzL = bi[i + 1] - bi[i];
1318: for (k = 0; k < nzL; k++) {
1319: pc = rtmp1 + row;
1320: if (*pc != 0.0) {
1321: pv = b->a + bdiag[row];
1322: mul1 = *pc * (*pv);
1323: *pc = mul1;
1324: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325: pv = b->a + bdiag[row + 1] + 1;
1326: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327: for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329: }
1330: row = *bjtmp++;
1331: }
1333: /* finished row so stick it into b->a */
1334: rs = 0.0;
1335: /* L part */
1336: pv = b->a + bi[i];
1337: pj = b->j + bi[i];
1338: nz = bi[i + 1] - bi[i];
1339: for (j = 0; j < nz; j++) {
1340: pv[j] = rtmp1[pj[j]];
1341: rs += PetscAbsScalar(pv[j]);
1342: }
1344: /* U part */
1345: pv = b->a + bdiag[i + 1] + 1;
1346: pj = b->j + bdiag[i + 1] + 1;
1347: nz = bdiag[i] - bdiag[i + 1] - 1;
1348: for (j = 0; j < nz; j++) {
1349: pv[j] = rtmp1[pj[j]];
1350: rs += PetscAbsScalar(pv[j]);
1351: }
1353: /* Check zero pivot */
1354: sctx.rs = rs;
1355: sctx.pv = rtmp1[i];
1356: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357: if (sctx.newshift) break;
1359: /* Mark diagonal and invert diagonal for simpler triangular solves */
1360: pv = b->a + bdiag[i];
1361: *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362: break;
1364: case 2:
1365: /* zero rtmp1 and rtmp2 */
1366: /* L part */
1367: nz = bi[i + 1] - bi[i];
1368: bjtmp = bj + bi[i];
1369: for (j = 0; j < nz; j++) {
1370: col = bjtmp[j];
1371: rtmp1[col] = 0.0;
1372: rtmp2[col] = 0.0;
1373: }
1375: /* U part */
1376: nz = bdiag[i] - bdiag[i + 1];
1377: bjtmp = bj + bdiag[i + 1] + 1;
1378: for (j = 0; j < nz; j++) {
1379: col = bjtmp[j];
1380: rtmp1[col] = 0.0;
1381: rtmp2[col] = 0.0;
1382: }
1384: /* load in initial (unfactored row) */
1385: nz = ai[r[i] + 1] - ai[r[i]];
1386: ajtmp = aj + ai[r[i]];
1387: v1 = aa + ai[r[i]];
1388: v2 = aa + ai[r[i + 1]];
1389: for (j = 0; j < nz; j++) {
1390: col = ics[ajtmp[j]];
1391: rtmp1[col] = v1[j];
1392: rtmp2[col] = v2[j];
1393: }
1394: /* ZeropivotApply(): shift the diagonal of the matrix */
1395: rtmp1[i] += sctx.shift_amount;
1396: rtmp2[i + 1] += sctx.shift_amount;
1398: /* elimination */
1399: bjtmp = bj + bi[i];
1400: row = *bjtmp++; /* pivot row */
1401: nzL = bi[i + 1] - bi[i];
1402: for (k = 0; k < nzL; k++) {
1403: pc1 = rtmp1 + row;
1404: pc2 = rtmp2 + row;
1405: if (*pc1 != 0.0 || *pc2 != 0.0) {
1406: pv = b->a + bdiag[row];
1407: mul1 = *pc1 * (*pv);
1408: mul2 = *pc2 * (*pv);
1409: *pc1 = mul1;
1410: *pc2 = mul2;
1412: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413: pv = b->a + bdiag[row + 1] + 1;
1414: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415: for (j = 0; j < nz; j++) {
1416: col = pj[j];
1417: rtmp1[col] -= mul1 * pv[j];
1418: rtmp2[col] -= mul2 * pv[j];
1419: }
1420: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421: }
1422: row = *bjtmp++;
1423: }
1425: /* finished row i; check zero pivot, then stick row i into b->a */
1426: rs = 0.0;
1427: /* L part */
1428: pc1 = b->a + bi[i];
1429: pj = b->j + bi[i];
1430: nz = bi[i + 1] - bi[i];
1431: for (j = 0; j < nz; j++) {
1432: col = pj[j];
1433: pc1[j] = rtmp1[col];
1434: rs += PetscAbsScalar(pc1[j]);
1435: }
1436: /* U part */
1437: pc1 = b->a + bdiag[i + 1] + 1;
1438: pj = b->j + bdiag[i + 1] + 1;
1439: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440: for (j = 0; j < nz; j++) {
1441: col = pj[j];
1442: pc1[j] = rtmp1[col];
1443: rs += PetscAbsScalar(pc1[j]);
1444: }
1446: sctx.rs = rs;
1447: sctx.pv = rtmp1[i];
1448: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449: if (sctx.newshift) break;
1450: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1451: *pc1 = 1.0 / sctx.pv;
1453: /* Now take care of diagonal 2x2 block. */
1454: pc2 = rtmp2 + i;
1455: if (*pc2 != 0.0) {
1456: mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1457: *pc2 = mul1; /* insert L entry */
1458: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1459: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460: for (j = 0; j < nz; j++) {
1461: col = pj[j];
1462: rtmp2[col] -= mul1 * rtmp1[col];
1463: }
1464: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465: }
1467: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468: rs = 0.0;
1469: /* L part */
1470: pc2 = b->a + bi[i + 1];
1471: pj = b->j + bi[i + 1];
1472: nz = bi[i + 2] - bi[i + 1];
1473: for (j = 0; j < nz; j++) {
1474: col = pj[j];
1475: pc2[j] = rtmp2[col];
1476: rs += PetscAbsScalar(pc2[j]);
1477: }
1478: /* U part */
1479: pc2 = b->a + bdiag[i + 2] + 1;
1480: pj = b->j + bdiag[i + 2] + 1;
1481: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482: for (j = 0; j < nz; j++) {
1483: col = pj[j];
1484: pc2[j] = rtmp2[col];
1485: rs += PetscAbsScalar(pc2[j]);
1486: }
1488: sctx.rs = rs;
1489: sctx.pv = rtmp2[i + 1];
1490: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491: if (sctx.newshift) break;
1492: pc2 = b->a + bdiag[i + 1];
1493: *pc2 = 1.0 / sctx.pv;
1494: break;
1496: case 3:
1497: /* zero rtmp */
1498: /* L part */
1499: nz = bi[i + 1] - bi[i];
1500: bjtmp = bj + bi[i];
1501: for (j = 0; j < nz; j++) {
1502: col = bjtmp[j];
1503: rtmp1[col] = 0.0;
1504: rtmp2[col] = 0.0;
1505: rtmp3[col] = 0.0;
1506: }
1508: /* U part */
1509: nz = bdiag[i] - bdiag[i + 1];
1510: bjtmp = bj + bdiag[i + 1] + 1;
1511: for (j = 0; j < nz; j++) {
1512: col = bjtmp[j];
1513: rtmp1[col] = 0.0;
1514: rtmp2[col] = 0.0;
1515: rtmp3[col] = 0.0;
1516: }
1518: /* load in initial (unfactored row) */
1519: nz = ai[r[i] + 1] - ai[r[i]];
1520: ajtmp = aj + ai[r[i]];
1521: v1 = aa + ai[r[i]];
1522: v2 = aa + ai[r[i + 1]];
1523: v3 = aa + ai[r[i + 2]];
1524: for (j = 0; j < nz; j++) {
1525: col = ics[ajtmp[j]];
1526: rtmp1[col] = v1[j];
1527: rtmp2[col] = v2[j];
1528: rtmp3[col] = v3[j];
1529: }
1530: /* ZeropivotApply(): shift the diagonal of the matrix */
1531: rtmp1[i] += sctx.shift_amount;
1532: rtmp2[i + 1] += sctx.shift_amount;
1533: rtmp3[i + 2] += sctx.shift_amount;
1535: /* elimination */
1536: bjtmp = bj + bi[i];
1537: row = *bjtmp++; /* pivot row */
1538: nzL = bi[i + 1] - bi[i];
1539: for (k = 0; k < nzL; k++) {
1540: pc1 = rtmp1 + row;
1541: pc2 = rtmp2 + row;
1542: pc3 = rtmp3 + row;
1543: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544: pv = b->a + bdiag[row];
1545: mul1 = *pc1 * (*pv);
1546: mul2 = *pc2 * (*pv);
1547: mul3 = *pc3 * (*pv);
1548: *pc1 = mul1;
1549: *pc2 = mul2;
1550: *pc3 = mul3;
1552: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553: pv = b->a + bdiag[row + 1] + 1;
1554: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555: for (j = 0; j < nz; j++) {
1556: col = pj[j];
1557: rtmp1[col] -= mul1 * pv[j];
1558: rtmp2[col] -= mul2 * pv[j];
1559: rtmp3[col] -= mul3 * pv[j];
1560: }
1561: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562: }
1563: row = *bjtmp++;
1564: }
1566: /* finished row i; check zero pivot, then stick row i into b->a */
1567: rs = 0.0;
1568: /* L part */
1569: pc1 = b->a + bi[i];
1570: pj = b->j + bi[i];
1571: nz = bi[i + 1] - bi[i];
1572: for (j = 0; j < nz; j++) {
1573: col = pj[j];
1574: pc1[j] = rtmp1[col];
1575: rs += PetscAbsScalar(pc1[j]);
1576: }
1577: /* U part */
1578: pc1 = b->a + bdiag[i + 1] + 1;
1579: pj = b->j + bdiag[i + 1] + 1;
1580: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581: for (j = 0; j < nz; j++) {
1582: col = pj[j];
1583: pc1[j] = rtmp1[col];
1584: rs += PetscAbsScalar(pc1[j]);
1585: }
1587: sctx.rs = rs;
1588: sctx.pv = rtmp1[i];
1589: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590: if (sctx.newshift) break;
1591: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1592: *pc1 = 1.0 / sctx.pv;
1594: /* Now take care of 1st column of diagonal 3x3 block. */
1595: pc2 = rtmp2 + i;
1596: pc3 = rtmp3 + i;
1597: if (*pc2 != 0.0 || *pc3 != 0.0) {
1598: mul2 = (*pc2) * (*pc1);
1599: *pc2 = mul2;
1600: mul3 = (*pc3) * (*pc1);
1601: *pc3 = mul3;
1602: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1603: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604: for (j = 0; j < nz; j++) {
1605: col = pj[j];
1606: rtmp2[col] -= mul2 * rtmp1[col];
1607: rtmp3[col] -= mul3 * rtmp1[col];
1608: }
1609: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610: }
1612: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613: rs = 0.0;
1614: /* L part */
1615: pc2 = b->a + bi[i + 1];
1616: pj = b->j + bi[i + 1];
1617: nz = bi[i + 2] - bi[i + 1];
1618: for (j = 0; j < nz; j++) {
1619: col = pj[j];
1620: pc2[j] = rtmp2[col];
1621: rs += PetscAbsScalar(pc2[j]);
1622: }
1623: /* U part */
1624: pc2 = b->a + bdiag[i + 2] + 1;
1625: pj = b->j + bdiag[i + 2] + 1;
1626: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627: for (j = 0; j < nz; j++) {
1628: col = pj[j];
1629: pc2[j] = rtmp2[col];
1630: rs += PetscAbsScalar(pc2[j]);
1631: }
1633: sctx.rs = rs;
1634: sctx.pv = rtmp2[i + 1];
1635: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636: if (sctx.newshift) break;
1637: pc2 = b->a + bdiag[i + 1];
1638: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1640: /* Now take care of 2nd column of diagonal 3x3 block. */
1641: pc3 = rtmp3 + i + 1;
1642: if (*pc3 != 0.0) {
1643: mul3 = (*pc3) * (*pc2);
1644: *pc3 = mul3;
1645: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1646: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647: for (j = 0; j < nz; j++) {
1648: col = pj[j];
1649: rtmp3[col] -= mul3 * rtmp2[col];
1650: }
1651: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652: }
1654: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655: rs = 0.0;
1656: /* L part */
1657: pc3 = b->a + bi[i + 2];
1658: pj = b->j + bi[i + 2];
1659: nz = bi[i + 3] - bi[i + 2];
1660: for (j = 0; j < nz; j++) {
1661: col = pj[j];
1662: pc3[j] = rtmp3[col];
1663: rs += PetscAbsScalar(pc3[j]);
1664: }
1665: /* U part */
1666: pc3 = b->a + bdiag[i + 3] + 1;
1667: pj = b->j + bdiag[i + 3] + 1;
1668: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669: for (j = 0; j < nz; j++) {
1670: col = pj[j];
1671: pc3[j] = rtmp3[col];
1672: rs += PetscAbsScalar(pc3[j]);
1673: }
1675: sctx.rs = rs;
1676: sctx.pv = rtmp3[i + 2];
1677: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678: if (sctx.newshift) break;
1679: pc3 = b->a + bdiag[i + 2];
1680: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681: break;
1682: case 4:
1683: /* zero rtmp */
1684: /* L part */
1685: nz = bi[i + 1] - bi[i];
1686: bjtmp = bj + bi[i];
1687: for (j = 0; j < nz; j++) {
1688: col = bjtmp[j];
1689: rtmp1[col] = 0.0;
1690: rtmp2[col] = 0.0;
1691: rtmp3[col] = 0.0;
1692: rtmp4[col] = 0.0;
1693: }
1695: /* U part */
1696: nz = bdiag[i] - bdiag[i + 1];
1697: bjtmp = bj + bdiag[i + 1] + 1;
1698: for (j = 0; j < nz; j++) {
1699: col = bjtmp[j];
1700: rtmp1[col] = 0.0;
1701: rtmp2[col] = 0.0;
1702: rtmp3[col] = 0.0;
1703: rtmp4[col] = 0.0;
1704: }
1706: /* load in initial (unfactored row) */
1707: nz = ai[r[i] + 1] - ai[r[i]];
1708: ajtmp = aj + ai[r[i]];
1709: v1 = aa + ai[r[i]];
1710: v2 = aa + ai[r[i + 1]];
1711: v3 = aa + ai[r[i + 2]];
1712: v4 = aa + ai[r[i + 3]];
1713: for (j = 0; j < nz; j++) {
1714: col = ics[ajtmp[j]];
1715: rtmp1[col] = v1[j];
1716: rtmp2[col] = v2[j];
1717: rtmp3[col] = v3[j];
1718: rtmp4[col] = v4[j];
1719: }
1720: /* ZeropivotApply(): shift the diagonal of the matrix */
1721: rtmp1[i] += sctx.shift_amount;
1722: rtmp2[i + 1] += sctx.shift_amount;
1723: rtmp3[i + 2] += sctx.shift_amount;
1724: rtmp4[i + 3] += sctx.shift_amount;
1726: /* elimination */
1727: bjtmp = bj + bi[i];
1728: row = *bjtmp++; /* pivot row */
1729: nzL = bi[i + 1] - bi[i];
1730: for (k = 0; k < nzL; k++) {
1731: pc1 = rtmp1 + row;
1732: pc2 = rtmp2 + row;
1733: pc3 = rtmp3 + row;
1734: pc4 = rtmp4 + row;
1735: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736: pv = b->a + bdiag[row];
1737: mul1 = *pc1 * (*pv);
1738: mul2 = *pc2 * (*pv);
1739: mul3 = *pc3 * (*pv);
1740: mul4 = *pc4 * (*pv);
1741: *pc1 = mul1;
1742: *pc2 = mul2;
1743: *pc3 = mul3;
1744: *pc4 = mul4;
1746: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747: pv = b->a + bdiag[row + 1] + 1;
1748: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749: for (j = 0; j < nz; j++) {
1750: col = pj[j];
1751: rtmp1[col] -= mul1 * pv[j];
1752: rtmp2[col] -= mul2 * pv[j];
1753: rtmp3[col] -= mul3 * pv[j];
1754: rtmp4[col] -= mul4 * pv[j];
1755: }
1756: PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757: }
1758: row = *bjtmp++;
1759: }
1761: /* finished row i; check zero pivot, then stick row i into b->a */
1762: rs = 0.0;
1763: /* L part */
1764: pc1 = b->a + bi[i];
1765: pj = b->j + bi[i];
1766: nz = bi[i + 1] - bi[i];
1767: for (j = 0; j < nz; j++) {
1768: col = pj[j];
1769: pc1[j] = rtmp1[col];
1770: rs += PetscAbsScalar(pc1[j]);
1771: }
1772: /* U part */
1773: pc1 = b->a + bdiag[i + 1] + 1;
1774: pj = b->j + bdiag[i + 1] + 1;
1775: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776: for (j = 0; j < nz; j++) {
1777: col = pj[j];
1778: pc1[j] = rtmp1[col];
1779: rs += PetscAbsScalar(pc1[j]);
1780: }
1782: sctx.rs = rs;
1783: sctx.pv = rtmp1[i];
1784: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785: if (sctx.newshift) break;
1786: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1787: *pc1 = 1.0 / sctx.pv;
1789: /* Now take care of 1st column of diagonal 4x4 block. */
1790: pc2 = rtmp2 + i;
1791: pc3 = rtmp3 + i;
1792: pc4 = rtmp4 + i;
1793: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794: mul2 = (*pc2) * (*pc1);
1795: *pc2 = mul2;
1796: mul3 = (*pc3) * (*pc1);
1797: *pc3 = mul3;
1798: mul4 = (*pc4) * (*pc1);
1799: *pc4 = mul4;
1800: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1801: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802: for (j = 0; j < nz; j++) {
1803: col = pj[j];
1804: rtmp2[col] -= mul2 * rtmp1[col];
1805: rtmp3[col] -= mul3 * rtmp1[col];
1806: rtmp4[col] -= mul4 * rtmp1[col];
1807: }
1808: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809: }
1811: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812: rs = 0.0;
1813: /* L part */
1814: pc2 = b->a + bi[i + 1];
1815: pj = b->j + bi[i + 1];
1816: nz = bi[i + 2] - bi[i + 1];
1817: for (j = 0; j < nz; j++) {
1818: col = pj[j];
1819: pc2[j] = rtmp2[col];
1820: rs += PetscAbsScalar(pc2[j]);
1821: }
1822: /* U part */
1823: pc2 = b->a + bdiag[i + 2] + 1;
1824: pj = b->j + bdiag[i + 2] + 1;
1825: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826: for (j = 0; j < nz; j++) {
1827: col = pj[j];
1828: pc2[j] = rtmp2[col];
1829: rs += PetscAbsScalar(pc2[j]);
1830: }
1832: sctx.rs = rs;
1833: sctx.pv = rtmp2[i + 1];
1834: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835: if (sctx.newshift) break;
1836: pc2 = b->a + bdiag[i + 1];
1837: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1839: /* Now take care of 2nd column of diagonal 4x4 block. */
1840: pc3 = rtmp3 + i + 1;
1841: pc4 = rtmp4 + i + 1;
1842: if (*pc3 != 0.0 || *pc4 != 0.0) {
1843: mul3 = (*pc3) * (*pc2);
1844: *pc3 = mul3;
1845: mul4 = (*pc4) * (*pc2);
1846: *pc4 = mul4;
1847: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1848: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849: for (j = 0; j < nz; j++) {
1850: col = pj[j];
1851: rtmp3[col] -= mul3 * rtmp2[col];
1852: rtmp4[col] -= mul4 * rtmp2[col];
1853: }
1854: PetscCall(PetscLogFlops(4.0 * nz));
1855: }
1857: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858: rs = 0.0;
1859: /* L part */
1860: pc3 = b->a + bi[i + 2];
1861: pj = b->j + bi[i + 2];
1862: nz = bi[i + 3] - bi[i + 2];
1863: for (j = 0; j < nz; j++) {
1864: col = pj[j];
1865: pc3[j] = rtmp3[col];
1866: rs += PetscAbsScalar(pc3[j]);
1867: }
1868: /* U part */
1869: pc3 = b->a + bdiag[i + 3] + 1;
1870: pj = b->j + bdiag[i + 3] + 1;
1871: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872: for (j = 0; j < nz; j++) {
1873: col = pj[j];
1874: pc3[j] = rtmp3[col];
1875: rs += PetscAbsScalar(pc3[j]);
1876: }
1878: sctx.rs = rs;
1879: sctx.pv = rtmp3[i + 2];
1880: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881: if (sctx.newshift) break;
1882: pc3 = b->a + bdiag[i + 2];
1883: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1885: /* Now take care of 3rd column of diagonal 4x4 block. */
1886: pc4 = rtmp4 + i + 2;
1887: if (*pc4 != 0.0) {
1888: mul4 = (*pc4) * (*pc3);
1889: *pc4 = mul4;
1890: pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
1891: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892: for (j = 0; j < nz; j++) {
1893: col = pj[j];
1894: rtmp4[col] -= mul4 * rtmp3[col];
1895: }
1896: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897: }
1899: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900: rs = 0.0;
1901: /* L part */
1902: pc4 = b->a + bi[i + 3];
1903: pj = b->j + bi[i + 3];
1904: nz = bi[i + 4] - bi[i + 3];
1905: for (j = 0; j < nz; j++) {
1906: col = pj[j];
1907: pc4[j] = rtmp4[col];
1908: rs += PetscAbsScalar(pc4[j]);
1909: }
1910: /* U part */
1911: pc4 = b->a + bdiag[i + 4] + 1;
1912: pj = b->j + bdiag[i + 4] + 1;
1913: nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914: for (j = 0; j < nz; j++) {
1915: col = pj[j];
1916: pc4[j] = rtmp4[col];
1917: rs += PetscAbsScalar(pc4[j]);
1918: }
1920: sctx.rs = rs;
1921: sctx.pv = rtmp4[i + 3];
1922: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923: if (sctx.newshift) break;
1924: pc4 = b->a + bdiag[i + 3];
1925: *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926: break;
1928: default:
1929: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930: }
1931: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932: i += nodesz; /* Update the row */
1933: }
1935: /* MatPivotRefine() */
1936: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937: /*
1938: * if no shift in this attempt & shifting & started shifting & can refine,
1939: * then try lower shift
1940: */
1941: sctx.shift_hi = sctx.shift_fraction;
1942: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1944: sctx.newshift = PETSC_TRUE;
1945: sctx.nshift++;
1946: }
1947: } while (sctx.newshift);
1949: PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950: PetscCall(PetscFree(tmp_vec2));
1951: PetscCall(ISRestoreIndices(isicol, &ic));
1952: PetscCall(ISRestoreIndices(isrow, &r));
1954: if (b->inode.size_csr) {
1955: C->ops->solve = MatSolve_SeqAIJ_Inode;
1956: } else {
1957: C->ops->solve = MatSolve_SeqAIJ;
1958: }
1959: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1960: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1961: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962: C->ops->matsolve = MatMatSolve_SeqAIJ;
1963: C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964: C->assembled = PETSC_TRUE;
1965: C->preallocated = PETSC_TRUE;
1967: PetscCall(PetscLogFlops(C->cmap->n));
1969: /* MatShiftView(A,info,&sctx) */
1970: if (sctx.nshift) {
1971: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977: }
1978: }
1979: PetscFunctionReturn(PETSC_SUCCESS);
1980: }
1982: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
1983: {
1984: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1985: IS iscol = a->col, isrow = a->row;
1986: const PetscInt *r, *c, *rout, *cout;
1987: PetscInt i, j;
1988: PetscInt node_max, row, nsz, aii, i0, i1, nz;
1989: const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
1990: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
1991: PetscScalar sum1, sum2, sum3, sum4, sum5;
1992: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
1993: const PetscScalar *b;
1995: PetscFunctionBegin;
1996: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1997: node_max = a->inode.node_count;
1998: ns = a->inode.size_csr; /* Node Size array */
2000: PetscCall(VecGetArrayRead(bb, &b));
2001: PetscCall(VecGetArrayWrite(xx, &x));
2002: tmp = a->solve_work;
2004: PetscCall(ISGetIndices(isrow, &rout));
2005: r = rout;
2006: PetscCall(ISGetIndices(iscol, &cout));
2007: c = cout;
2009: /* forward solve the lower triangular */
2010: tmps = tmp;
2011: aa = a_a;
2012: aj = a_j;
2013: ad = a->diag;
2015: for (i = 0; i < node_max; ++i) {
2016: row = ns[i];
2017: nsz = ns[i + 1] - ns[i];
2018: aii = ai[row];
2019: v1 = aa + aii;
2020: vi = aj + aii;
2021: nz = ai[row + 1] - ai[row];
2023: if (i < node_max - 1) {
2024: /* Prefetch the indices for the next block */
2025: PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2026: /* Prefetch the data for the next block */
2027: PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2028: }
2030: switch (nsz) { /* Each loop in 'case' is unrolled */
2031: case 1:
2032: sum1 = b[r[row]];
2033: for (j = 0; j < nz - 1; j += 2) {
2034: i0 = vi[j];
2035: i1 = vi[j + 1];
2036: tmp0 = tmps[i0];
2037: tmp1 = tmps[i1];
2038: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2039: }
2040: if (j == nz - 1) {
2041: tmp0 = tmps[vi[j]];
2042: sum1 -= v1[j] * tmp0;
2043: }
2044: tmp[row++] = sum1;
2045: break;
2046: case 2:
2047: sum1 = b[r[row]];
2048: sum2 = b[r[row + 1]];
2049: v2 = aa + ai[row + 1];
2051: for (j = 0; j < nz - 1; j += 2) {
2052: i0 = vi[j];
2053: i1 = vi[j + 1];
2054: tmp0 = tmps[i0];
2055: tmp1 = tmps[i1];
2056: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2057: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2058: }
2059: if (j == nz - 1) {
2060: tmp0 = tmps[vi[j]];
2061: sum1 -= v1[j] * tmp0;
2062: sum2 -= v2[j] * tmp0;
2063: }
2064: sum2 -= v2[nz] * sum1;
2065: tmp[row++] = sum1;
2066: tmp[row++] = sum2;
2067: break;
2068: case 3:
2069: sum1 = b[r[row]];
2070: sum2 = b[r[row + 1]];
2071: sum3 = b[r[row + 2]];
2072: v2 = aa + ai[row + 1];
2073: v3 = aa + ai[row + 2];
2075: for (j = 0; j < nz - 1; j += 2) {
2076: i0 = vi[j];
2077: i1 = vi[j + 1];
2078: tmp0 = tmps[i0];
2079: tmp1 = tmps[i1];
2080: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2081: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2082: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2083: }
2084: if (j == nz - 1) {
2085: tmp0 = tmps[vi[j]];
2086: sum1 -= v1[j] * tmp0;
2087: sum2 -= v2[j] * tmp0;
2088: sum3 -= v3[j] * tmp0;
2089: }
2090: sum2 -= v2[nz] * sum1;
2091: sum3 -= v3[nz] * sum1;
2092: sum3 -= v3[nz + 1] * sum2;
2093: tmp[row++] = sum1;
2094: tmp[row++] = sum2;
2095: tmp[row++] = sum3;
2096: break;
2098: case 4:
2099: sum1 = b[r[row]];
2100: sum2 = b[r[row + 1]];
2101: sum3 = b[r[row + 2]];
2102: sum4 = b[r[row + 3]];
2103: v2 = aa + ai[row + 1];
2104: v3 = aa + ai[row + 2];
2105: v4 = aa + ai[row + 3];
2107: for (j = 0; j < nz - 1; j += 2) {
2108: i0 = vi[j];
2109: i1 = vi[j + 1];
2110: tmp0 = tmps[i0];
2111: tmp1 = tmps[i1];
2112: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2113: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2114: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2115: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2116: }
2117: if (j == nz - 1) {
2118: tmp0 = tmps[vi[j]];
2119: sum1 -= v1[j] * tmp0;
2120: sum2 -= v2[j] * tmp0;
2121: sum3 -= v3[j] * tmp0;
2122: sum4 -= v4[j] * tmp0;
2123: }
2124: sum2 -= v2[nz] * sum1;
2125: sum3 -= v3[nz] * sum1;
2126: sum4 -= v4[nz] * sum1;
2127: sum3 -= v3[nz + 1] * sum2;
2128: sum4 -= v4[nz + 1] * sum2;
2129: sum4 -= v4[nz + 2] * sum3;
2131: tmp[row++] = sum1;
2132: tmp[row++] = sum2;
2133: tmp[row++] = sum3;
2134: tmp[row++] = sum4;
2135: break;
2136: case 5:
2137: sum1 = b[r[row]];
2138: sum2 = b[r[row + 1]];
2139: sum3 = b[r[row + 2]];
2140: sum4 = b[r[row + 3]];
2141: sum5 = b[r[row + 4]];
2142: v2 = aa + ai[row + 1];
2143: v3 = aa + ai[row + 2];
2144: v4 = aa + ai[row + 3];
2145: v5 = aa + ai[row + 4];
2147: for (j = 0; j < nz - 1; j += 2) {
2148: i0 = vi[j];
2149: i1 = vi[j + 1];
2150: tmp0 = tmps[i0];
2151: tmp1 = tmps[i1];
2152: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2153: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2154: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2155: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2156: sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2157: }
2158: if (j == nz - 1) {
2159: tmp0 = tmps[vi[j]];
2160: sum1 -= v1[j] * tmp0;
2161: sum2 -= v2[j] * tmp0;
2162: sum3 -= v3[j] * tmp0;
2163: sum4 -= v4[j] * tmp0;
2164: sum5 -= v5[j] * tmp0;
2165: }
2167: sum2 -= v2[nz] * sum1;
2168: sum3 -= v3[nz] * sum1;
2169: sum4 -= v4[nz] * sum1;
2170: sum5 -= v5[nz] * sum1;
2171: sum3 -= v3[nz + 1] * sum2;
2172: sum4 -= v4[nz + 1] * sum2;
2173: sum5 -= v5[nz + 1] * sum2;
2174: sum4 -= v4[nz + 2] * sum3;
2175: sum5 -= v5[nz + 2] * sum3;
2176: sum5 -= v5[nz + 3] * sum4;
2178: tmp[row++] = sum1;
2179: tmp[row++] = sum2;
2180: tmp[row++] = sum3;
2181: tmp[row++] = sum4;
2182: tmp[row++] = sum5;
2183: break;
2184: default:
2185: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2186: }
2187: }
2188: /* backward solve the upper triangular */
2189: for (i = node_max - 1; i >= 0; i--) {
2190: row = ns[i + 1] - 1;
2191: nsz = ns[i + 1] - ns[i];
2192: aii = ad[row + 1] + 1;
2193: v1 = aa + aii;
2194: vi = aj + aii;
2195: nz = ad[row] - ad[row + 1] - 1;
2197: if (i > 0) {
2198: /* Prefetch the indices for the next block */
2199: PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2200: /* Prefetch the data for the next block */
2201: PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2202: }
2204: switch (nsz) { /* Each loop in 'case' is unrolled */
2205: case 1:
2206: sum1 = tmp[row];
2208: for (j = 0; j < nz - 1; j += 2) {
2209: i0 = vi[j];
2210: i1 = vi[j + 1];
2211: tmp0 = tmps[i0];
2212: tmp1 = tmps[i1];
2213: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2214: }
2215: if (j == nz - 1) {
2216: tmp0 = tmps[vi[j]];
2217: sum1 -= v1[j] * tmp0;
2218: }
2219: x[c[row]] = tmp[row] = sum1 * v1[nz];
2220: row--;
2221: break;
2222: case 2:
2223: sum1 = tmp[row];
2224: sum2 = tmp[row - 1];
2225: v2 = aa + ad[row] + 1;
2226: for (j = 0; j < nz - 1; j += 2) {
2227: i0 = vi[j];
2228: i1 = vi[j + 1];
2229: tmp0 = tmps[i0];
2230: tmp1 = tmps[i1];
2231: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2232: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2233: }
2234: if (j == nz - 1) {
2235: tmp0 = tmps[vi[j]];
2236: sum1 -= v1[j] * tmp0;
2237: sum2 -= v2[j + 1] * tmp0;
2238: }
2240: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2241: row--;
2242: sum2 -= v2[0] * tmp0;
2243: x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2244: row--;
2245: break;
2246: case 3:
2247: sum1 = tmp[row];
2248: sum2 = tmp[row - 1];
2249: sum3 = tmp[row - 2];
2250: v2 = aa + ad[row] + 1;
2251: v3 = aa + ad[row - 1] + 1;
2252: for (j = 0; j < nz - 1; j += 2) {
2253: i0 = vi[j];
2254: i1 = vi[j + 1];
2255: tmp0 = tmps[i0];
2256: tmp1 = tmps[i1];
2257: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2258: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2259: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2260: }
2261: if (j == nz - 1) {
2262: tmp0 = tmps[vi[j]];
2263: sum1 -= v1[j] * tmp0;
2264: sum2 -= v2[j + 1] * tmp0;
2265: sum3 -= v3[j + 2] * tmp0;
2266: }
2267: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2268: row--;
2269: sum2 -= v2[0] * tmp0;
2270: sum3 -= v3[1] * tmp0;
2271: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2272: row--;
2273: sum3 -= v3[0] * tmp0;
2274: x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2275: row--;
2277: break;
2278: case 4:
2279: sum1 = tmp[row];
2280: sum2 = tmp[row - 1];
2281: sum3 = tmp[row - 2];
2282: sum4 = tmp[row - 3];
2283: v2 = aa + ad[row] + 1;
2284: v3 = aa + ad[row - 1] + 1;
2285: v4 = aa + ad[row - 2] + 1;
2287: for (j = 0; j < nz - 1; j += 2) {
2288: i0 = vi[j];
2289: i1 = vi[j + 1];
2290: tmp0 = tmps[i0];
2291: tmp1 = tmps[i1];
2292: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2293: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2294: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2295: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2296: }
2297: if (j == nz - 1) {
2298: tmp0 = tmps[vi[j]];
2299: sum1 -= v1[j] * tmp0;
2300: sum2 -= v2[j + 1] * tmp0;
2301: sum3 -= v3[j + 2] * tmp0;
2302: sum4 -= v4[j + 3] * tmp0;
2303: }
2305: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2306: row--;
2307: sum2 -= v2[0] * tmp0;
2308: sum3 -= v3[1] * tmp0;
2309: sum4 -= v4[2] * tmp0;
2310: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2311: row--;
2312: sum3 -= v3[0] * tmp0;
2313: sum4 -= v4[1] * tmp0;
2314: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2315: row--;
2316: sum4 -= v4[0] * tmp0;
2317: x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2318: row--;
2319: break;
2320: case 5:
2321: sum1 = tmp[row];
2322: sum2 = tmp[row - 1];
2323: sum3 = tmp[row - 2];
2324: sum4 = tmp[row - 3];
2325: sum5 = tmp[row - 4];
2326: v2 = aa + ad[row] + 1;
2327: v3 = aa + ad[row - 1] + 1;
2328: v4 = aa + ad[row - 2] + 1;
2329: v5 = aa + ad[row - 3] + 1;
2330: for (j = 0; j < nz - 1; j += 2) {
2331: i0 = vi[j];
2332: i1 = vi[j + 1];
2333: tmp0 = tmps[i0];
2334: tmp1 = tmps[i1];
2335: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2336: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2337: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2338: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2339: sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2340: }
2341: if (j == nz - 1) {
2342: tmp0 = tmps[vi[j]];
2343: sum1 -= v1[j] * tmp0;
2344: sum2 -= v2[j + 1] * tmp0;
2345: sum3 -= v3[j + 2] * tmp0;
2346: sum4 -= v4[j + 3] * tmp0;
2347: sum5 -= v5[j + 4] * tmp0;
2348: }
2350: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2351: row--;
2352: sum2 -= v2[0] * tmp0;
2353: sum3 -= v3[1] * tmp0;
2354: sum4 -= v4[2] * tmp0;
2355: sum5 -= v5[3] * tmp0;
2356: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2357: row--;
2358: sum3 -= v3[0] * tmp0;
2359: sum4 -= v4[1] * tmp0;
2360: sum5 -= v5[2] * tmp0;
2361: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2362: row--;
2363: sum4 -= v4[0] * tmp0;
2364: sum5 -= v5[1] * tmp0;
2365: tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2366: row--;
2367: sum5 -= v5[0] * tmp0;
2368: x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2369: row--;
2370: break;
2371: default:
2372: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2373: }
2374: }
2375: PetscCall(ISRestoreIndices(isrow, &rout));
2376: PetscCall(ISRestoreIndices(iscol, &cout));
2377: PetscCall(VecRestoreArrayRead(bb, &b));
2378: PetscCall(VecRestoreArrayWrite(xx, &x));
2379: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2380: PetscFunctionReturn(PETSC_SUCCESS);
2381: }
2383: /*
2384: Makes a longer coloring[] array and calls the usual code with that
2385: */
2386: static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2387: {
2388: Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
2389: PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2390: PetscInt *colorused, i;
2391: ISColoringValue *newcolor;
2393: PetscFunctionBegin;
2394: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2395: PetscCall(PetscMalloc1(n + 1, &newcolor));
2396: /* loop over inodes, marking a color for each column*/
2397: row = 0;
2398: for (i = 0; i < m; i++) {
2399: for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2400: }
2402: /* eliminate unneeded colors */
2403: PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2404: for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2406: for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2407: ncolors = colorused[5 * ncolors - 1];
2408: for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2409: PetscCall(PetscFree(colorused));
2410: PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2411: PetscCall(PetscFree(coloring));
2412: PetscFunctionReturn(PETSC_SUCCESS);
2413: }
2415: #include <petsc/private/kernels/blockinvert.h>
2417: /*
2418: Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
2419: */
2420: static PetscErrorCode MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A, PetscScalar omega, PetscScalar fshift)
2421: {
2422: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2423: MatScalar *ibdiag, *bdiag, work[25];
2424: const MatScalar *v = a->a;
2425: PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2426: PetscInt m = a->inode.node_count, cnt = 0, i, j, row, nodesz;
2427: PetscInt k, ipvt[5];
2428: PetscBool allowzeropivot = PetscNot(A->erroriffailure), zeropivotdetected;
2429: const PetscInt *sizes = a->inode.size_csr, *diag;
2431: PetscFunctionBegin;
2432: if (a->idiagState == ((PetscObject)A)->state) PetscFunctionReturn(PETSC_SUCCESS);
2433: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &diag, NULL));
2434: if (!a->inode.ibdiag) {
2435: /* calculate space needed for diagonal blocks */
2436: for (i = 0; i < m; i++) {
2437: nodesz = sizes[i + 1] - sizes[i];
2438: cnt += nodesz * nodesz;
2439: }
2440: a->inode.bdiagsize = cnt;
2441: PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2442: }
2444: /* copy over the diagonal blocks and invert them */
2445: ibdiag = a->inode.ibdiag;
2446: bdiag = a->inode.bdiag;
2447: cnt = 0;
2448: for (i = 0, row = 0; i < m; i++) {
2449: nodesz = sizes[i + 1] - sizes[i];
2450: for (j = 0; j < nodesz; j++) {
2451: for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2452: }
2453: PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2455: switch (nodesz) {
2456: case 1:
2457: /* Create matrix data structure */
2458: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2459: PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2460: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2461: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2462: A->factorerror_zeropivot_row = row;
2463: PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2464: }
2465: ibdiag[cnt] = 1.0 / ibdiag[cnt];
2466: break;
2467: case 2:
2468: PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2469: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2470: break;
2471: case 3:
2472: PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2473: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2474: break;
2475: case 4:
2476: PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2477: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2478: break;
2479: case 5:
2480: PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2481: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2482: break;
2483: default:
2484: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2485: }
2486: cnt += nodesz * nodesz;
2487: row += nodesz;
2488: }
2489: a->inode.ibdiagState = ((PetscObject)A)->state;
2490: PetscFunctionReturn(PETSC_SUCCESS);
2491: }
2493: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2494: {
2495: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2496: PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2497: MatScalar *ibdiag, *bdiag, *t;
2498: PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2499: const MatScalar *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2500: const PetscScalar *xb, *b;
2501: PetscInt n, m = a->inode.node_count, cnt = 0, i, row, i1, i2, nodesz;
2502: PetscInt sz;
2503: const PetscInt *sizes = a->inode.size_csr, *idx, *diag, *ii = a->i;
2505: PetscFunctionBegin;
2506: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2507: PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2508: PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2509: PetscCall(MatInvertDiagonalForSOR_SeqAIJ_Inode(A, omega, fshift));
2510: diag = a->diag;
2512: ibdiag = a->inode.ibdiag;
2513: bdiag = a->inode.bdiag;
2514: t = a->inode.ssor_work;
2516: PetscCall(VecGetArray(xx, &x));
2517: PetscCall(VecGetArrayRead(bb, &b));
2518: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2519: if (flag & SOR_ZERO_INITIAL_GUESS) {
2520: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2521: for (i = 0, row = 0; i < m; i++) {
2522: sz = diag[row] - ii[row];
2523: v1 = a->a + ii[row];
2524: idx = a->j + ii[row];
2526: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2527: nodesz = sizes[i + 1] - sizes[i];
2528: switch (nodesz) {
2529: case 1:
2531: sum1 = b[row];
2532: for (n = 0; n < sz - 1; n += 2) {
2533: i1 = idx[0];
2534: i2 = idx[1];
2535: idx += 2;
2536: tmp0 = x[i1];
2537: tmp1 = x[i2];
2538: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2539: v1 += 2;
2540: }
2542: if (n == sz - 1) {
2543: tmp0 = x[*idx];
2544: sum1 -= *v1 * tmp0;
2545: }
2546: t[row] = sum1;
2547: x[row++] = sum1 * (*ibdiag++);
2548: break;
2549: case 2:
2550: v2 = a->a + ii[row + 1];
2551: sum1 = b[row];
2552: sum2 = b[row + 1];
2553: for (n = 0; n < sz - 1; n += 2) {
2554: i1 = idx[0];
2555: i2 = idx[1];
2556: idx += 2;
2557: tmp0 = x[i1];
2558: tmp1 = x[i2];
2559: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2560: v1 += 2;
2561: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2562: v2 += 2;
2563: }
2565: if (n == sz - 1) {
2566: tmp0 = x[*idx];
2567: sum1 -= v1[0] * tmp0;
2568: sum2 -= v2[0] * tmp0;
2569: }
2570: t[row] = sum1;
2571: t[row + 1] = sum2;
2572: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2573: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2574: ibdiag += 4;
2575: break;
2576: case 3:
2577: v2 = a->a + ii[row + 1];
2578: v3 = a->a + ii[row + 2];
2579: sum1 = b[row];
2580: sum2 = b[row + 1];
2581: sum3 = b[row + 2];
2582: for (n = 0; n < sz - 1; n += 2) {
2583: i1 = idx[0];
2584: i2 = idx[1];
2585: idx += 2;
2586: tmp0 = x[i1];
2587: tmp1 = x[i2];
2588: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2589: v1 += 2;
2590: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2591: v2 += 2;
2592: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2593: v3 += 2;
2594: }
2596: if (n == sz - 1) {
2597: tmp0 = x[*idx];
2598: sum1 -= v1[0] * tmp0;
2599: sum2 -= v2[0] * tmp0;
2600: sum3 -= v3[0] * tmp0;
2601: }
2602: t[row] = sum1;
2603: t[row + 1] = sum2;
2604: t[row + 2] = sum3;
2605: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2606: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2607: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2608: ibdiag += 9;
2609: break;
2610: case 4:
2611: v2 = a->a + ii[row + 1];
2612: v3 = a->a + ii[row + 2];
2613: v4 = a->a + ii[row + 3];
2614: sum1 = b[row];
2615: sum2 = b[row + 1];
2616: sum3 = b[row + 2];
2617: sum4 = b[row + 3];
2618: for (n = 0; n < sz - 1; n += 2) {
2619: i1 = idx[0];
2620: i2 = idx[1];
2621: idx += 2;
2622: tmp0 = x[i1];
2623: tmp1 = x[i2];
2624: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2625: v1 += 2;
2626: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2627: v2 += 2;
2628: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2629: v3 += 2;
2630: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2631: v4 += 2;
2632: }
2634: if (n == sz - 1) {
2635: tmp0 = x[*idx];
2636: sum1 -= v1[0] * tmp0;
2637: sum2 -= v2[0] * tmp0;
2638: sum3 -= v3[0] * tmp0;
2639: sum4 -= v4[0] * tmp0;
2640: }
2641: t[row] = sum1;
2642: t[row + 1] = sum2;
2643: t[row + 2] = sum3;
2644: t[row + 3] = sum4;
2645: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
2646: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
2647: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
2648: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
2649: ibdiag += 16;
2650: break;
2651: case 5:
2652: v2 = a->a + ii[row + 1];
2653: v3 = a->a + ii[row + 2];
2654: v4 = a->a + ii[row + 3];
2655: v5 = a->a + ii[row + 4];
2656: sum1 = b[row];
2657: sum2 = b[row + 1];
2658: sum3 = b[row + 2];
2659: sum4 = b[row + 3];
2660: sum5 = b[row + 4];
2661: for (n = 0; n < sz - 1; n += 2) {
2662: i1 = idx[0];
2663: i2 = idx[1];
2664: idx += 2;
2665: tmp0 = x[i1];
2666: tmp1 = x[i2];
2667: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2668: v1 += 2;
2669: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2670: v2 += 2;
2671: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2672: v3 += 2;
2673: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2674: v4 += 2;
2675: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
2676: v5 += 2;
2677: }
2679: if (n == sz - 1) {
2680: tmp0 = x[*idx];
2681: sum1 -= v1[0] * tmp0;
2682: sum2 -= v2[0] * tmp0;
2683: sum3 -= v3[0] * tmp0;
2684: sum4 -= v4[0] * tmp0;
2685: sum5 -= v5[0] * tmp0;
2686: }
2687: t[row] = sum1;
2688: t[row + 1] = sum2;
2689: t[row + 2] = sum3;
2690: t[row + 3] = sum4;
2691: t[row + 4] = sum5;
2692: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
2693: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
2694: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
2695: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
2696: x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
2697: ibdiag += 25;
2698: break;
2699: default:
2700: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2701: }
2702: }
2704: xb = t;
2705: PetscCall(PetscLogFlops(a->nz));
2706: } else xb = b;
2707: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2708: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
2709: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
2710: nodesz = sizes[i + 1] - sizes[i];
2711: ibdiag -= nodesz * nodesz;
2712: sz = ii[row + 1] - diag[row] - 1;
2713: v1 = a->a + diag[row] + 1;
2714: idx = a->j + diag[row] + 1;
2716: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2717: switch (nodesz) {
2718: case 1:
2720: sum1 = xb[row];
2721: for (n = 0; n < sz - 1; n += 2) {
2722: i1 = idx[0];
2723: i2 = idx[1];
2724: idx += 2;
2725: tmp0 = x[i1];
2726: tmp1 = x[i2];
2727: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2728: v1 += 2;
2729: }
2731: if (n == sz - 1) {
2732: tmp0 = x[*idx];
2733: sum1 -= *v1 * tmp0;
2734: }
2735: x[row--] = sum1 * (*ibdiag);
2736: break;
2738: case 2:
2740: sum1 = xb[row];
2741: sum2 = xb[row - 1];
2742: /* note that sum1 is associated with the second of the two rows */
2743: v2 = a->a + diag[row - 1] + 2;
2744: for (n = 0; n < sz - 1; n += 2) {
2745: i1 = idx[0];
2746: i2 = idx[1];
2747: idx += 2;
2748: tmp0 = x[i1];
2749: tmp1 = x[i2];
2750: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2751: v1 += 2;
2752: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2753: v2 += 2;
2754: }
2756: if (n == sz - 1) {
2757: tmp0 = x[*idx];
2758: sum1 -= *v1 * tmp0;
2759: sum2 -= *v2 * tmp0;
2760: }
2761: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
2762: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
2763: break;
2764: case 3:
2766: sum1 = xb[row];
2767: sum2 = xb[row - 1];
2768: sum3 = xb[row - 2];
2769: v2 = a->a + diag[row - 1] + 2;
2770: v3 = a->a + diag[row - 2] + 3;
2771: for (n = 0; n < sz - 1; n += 2) {
2772: i1 = idx[0];
2773: i2 = idx[1];
2774: idx += 2;
2775: tmp0 = x[i1];
2776: tmp1 = x[i2];
2777: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2778: v1 += 2;
2779: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2780: v2 += 2;
2781: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2782: v3 += 2;
2783: }
2785: if (n == sz - 1) {
2786: tmp0 = x[*idx];
2787: sum1 -= *v1 * tmp0;
2788: sum2 -= *v2 * tmp0;
2789: sum3 -= *v3 * tmp0;
2790: }
2791: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
2792: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
2793: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
2794: break;
2795: case 4:
2797: sum1 = xb[row];
2798: sum2 = xb[row - 1];
2799: sum3 = xb[row - 2];
2800: sum4 = xb[row - 3];
2801: v2 = a->a + diag[row - 1] + 2;
2802: v3 = a->a + diag[row - 2] + 3;
2803: v4 = a->a + diag[row - 3] + 4;
2804: for (n = 0; n < sz - 1; n += 2) {
2805: i1 = idx[0];
2806: i2 = idx[1];
2807: idx += 2;
2808: tmp0 = x[i1];
2809: tmp1 = x[i2];
2810: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2811: v1 += 2;
2812: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2813: v2 += 2;
2814: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2815: v3 += 2;
2816: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2817: v4 += 2;
2818: }
2820: if (n == sz - 1) {
2821: tmp0 = x[*idx];
2822: sum1 -= *v1 * tmp0;
2823: sum2 -= *v2 * tmp0;
2824: sum3 -= *v3 * tmp0;
2825: sum4 -= *v4 * tmp0;
2826: }
2827: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
2828: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
2829: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
2830: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
2831: break;
2832: case 5:
2834: sum1 = xb[row];
2835: sum2 = xb[row - 1];
2836: sum3 = xb[row - 2];
2837: sum4 = xb[row - 3];
2838: sum5 = xb[row - 4];
2839: v2 = a->a + diag[row - 1] + 2;
2840: v3 = a->a + diag[row - 2] + 3;
2841: v4 = a->a + diag[row - 3] + 4;
2842: v5 = a->a + diag[row - 4] + 5;
2843: for (n = 0; n < sz - 1; n += 2) {
2844: i1 = idx[0];
2845: i2 = idx[1];
2846: idx += 2;
2847: tmp0 = x[i1];
2848: tmp1 = x[i2];
2849: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2850: v1 += 2;
2851: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2852: v2 += 2;
2853: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2854: v3 += 2;
2855: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2856: v4 += 2;
2857: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
2858: v5 += 2;
2859: }
2861: if (n == sz - 1) {
2862: tmp0 = x[*idx];
2863: sum1 -= *v1 * tmp0;
2864: sum2 -= *v2 * tmp0;
2865: sum3 -= *v3 * tmp0;
2866: sum4 -= *v4 * tmp0;
2867: sum5 -= *v5 * tmp0;
2868: }
2869: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
2870: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
2871: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
2872: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
2873: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
2874: break;
2875: default:
2876: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2877: }
2878: }
2880: PetscCall(PetscLogFlops(a->nz));
2881: }
2882: its--;
2883: }
2884: while (its--) {
2885: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2886: for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
2887: nodesz = sizes[i + 1] - sizes[i];
2888: sz = diag[row] - ii[row];
2889: v1 = a->a + ii[row];
2890: idx = a->j + ii[row];
2891: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2892: switch (nodesz) {
2893: case 1:
2894: sum1 = b[row];
2895: for (n = 0; n < sz - 1; n += 2) {
2896: i1 = idx[0];
2897: i2 = idx[1];
2898: idx += 2;
2899: tmp0 = x[i1];
2900: tmp1 = x[i2];
2901: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2902: v1 += 2;
2903: }
2904: if (n == sz - 1) {
2905: tmp0 = x[*idx++];
2906: sum1 -= *v1 * tmp0;
2907: v1++;
2908: }
2909: t[row] = sum1;
2910: sz = ii[row + 1] - diag[row] - 1;
2911: idx = a->j + diag[row] + 1;
2912: v1 += 1;
2913: for (n = 0; n < sz - 1; n += 2) {
2914: i1 = idx[0];
2915: i2 = idx[1];
2916: idx += 2;
2917: tmp0 = x[i1];
2918: tmp1 = x[i2];
2919: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2920: v1 += 2;
2921: }
2922: if (n == sz - 1) {
2923: tmp0 = x[*idx++];
2924: sum1 -= *v1 * tmp0;
2925: }
2926: /* in MatSOR_SeqAIJ this line would be
2927: *
2928: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
2929: *
2930: * but omega == 1, so this becomes
2931: *
2932: * x[row] = sum1*(*ibdiag++);
2933: *
2934: */
2935: x[row] = sum1 * (*ibdiag);
2936: break;
2937: case 2:
2938: v2 = a->a + ii[row + 1];
2939: sum1 = b[row];
2940: sum2 = b[row + 1];
2941: for (n = 0; n < sz - 1; n += 2) {
2942: i1 = idx[0];
2943: i2 = idx[1];
2944: idx += 2;
2945: tmp0 = x[i1];
2946: tmp1 = x[i2];
2947: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2948: v1 += 2;
2949: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2950: v2 += 2;
2951: }
2952: if (n == sz - 1) {
2953: tmp0 = x[*idx++];
2954: sum1 -= v1[0] * tmp0;
2955: sum2 -= v2[0] * tmp0;
2956: v1++;
2957: v2++;
2958: }
2959: t[row] = sum1;
2960: t[row + 1] = sum2;
2961: sz = ii[row + 1] - diag[row] - 2;
2962: idx = a->j + diag[row] + 2;
2963: v1 += 2;
2964: v2 += 2;
2965: for (n = 0; n < sz - 1; n += 2) {
2966: i1 = idx[0];
2967: i2 = idx[1];
2968: idx += 2;
2969: tmp0 = x[i1];
2970: tmp1 = x[i2];
2971: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2972: v1 += 2;
2973: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2974: v2 += 2;
2975: }
2976: if (n == sz - 1) {
2977: tmp0 = x[*idx];
2978: sum1 -= v1[0] * tmp0;
2979: sum2 -= v2[0] * tmp0;
2980: }
2981: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2982: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2983: break;
2984: case 3:
2985: v2 = a->a + ii[row + 1];
2986: v3 = a->a + ii[row + 2];
2987: sum1 = b[row];
2988: sum2 = b[row + 1];
2989: sum3 = b[row + 2];
2990: for (n = 0; n < sz - 1; n += 2) {
2991: i1 = idx[0];
2992: i2 = idx[1];
2993: idx += 2;
2994: tmp0 = x[i1];
2995: tmp1 = x[i2];
2996: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2997: v1 += 2;
2998: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2999: v2 += 2;
3000: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3001: v3 += 2;
3002: }
3003: if (n == sz - 1) {
3004: tmp0 = x[*idx++];
3005: sum1 -= v1[0] * tmp0;
3006: sum2 -= v2[0] * tmp0;
3007: sum3 -= v3[0] * tmp0;
3008: v1++;
3009: v2++;
3010: v3++;
3011: }
3012: t[row] = sum1;
3013: t[row + 1] = sum2;
3014: t[row + 2] = sum3;
3015: sz = ii[row + 1] - diag[row] - 3;
3016: idx = a->j + diag[row] + 3;
3017: v1 += 3;
3018: v2 += 3;
3019: v3 += 3;
3020: for (n = 0; n < sz - 1; n += 2) {
3021: i1 = idx[0];
3022: i2 = idx[1];
3023: idx += 2;
3024: tmp0 = x[i1];
3025: tmp1 = x[i2];
3026: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3027: v1 += 2;
3028: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3029: v2 += 2;
3030: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3031: v3 += 2;
3032: }
3033: if (n == sz - 1) {
3034: tmp0 = x[*idx];
3035: sum1 -= v1[0] * tmp0;
3036: sum2 -= v2[0] * tmp0;
3037: sum3 -= v3[0] * tmp0;
3038: }
3039: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3040: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3041: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3042: break;
3043: case 4:
3044: v2 = a->a + ii[row + 1];
3045: v3 = a->a + ii[row + 2];
3046: v4 = a->a + ii[row + 3];
3047: sum1 = b[row];
3048: sum2 = b[row + 1];
3049: sum3 = b[row + 2];
3050: sum4 = b[row + 3];
3051: for (n = 0; n < sz - 1; n += 2) {
3052: i1 = idx[0];
3053: i2 = idx[1];
3054: idx += 2;
3055: tmp0 = x[i1];
3056: tmp1 = x[i2];
3057: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3058: v1 += 2;
3059: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3060: v2 += 2;
3061: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3062: v3 += 2;
3063: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3064: v4 += 2;
3065: }
3066: if (n == sz - 1) {
3067: tmp0 = x[*idx++];
3068: sum1 -= v1[0] * tmp0;
3069: sum2 -= v2[0] * tmp0;
3070: sum3 -= v3[0] * tmp0;
3071: sum4 -= v4[0] * tmp0;
3072: v1++;
3073: v2++;
3074: v3++;
3075: v4++;
3076: }
3077: t[row] = sum1;
3078: t[row + 1] = sum2;
3079: t[row + 2] = sum3;
3080: t[row + 3] = sum4;
3081: sz = ii[row + 1] - diag[row] - 4;
3082: idx = a->j + diag[row] + 4;
3083: v1 += 4;
3084: v2 += 4;
3085: v3 += 4;
3086: v4 += 4;
3087: for (n = 0; n < sz - 1; n += 2) {
3088: i1 = idx[0];
3089: i2 = idx[1];
3090: idx += 2;
3091: tmp0 = x[i1];
3092: tmp1 = x[i2];
3093: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3094: v1 += 2;
3095: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3096: v2 += 2;
3097: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3098: v3 += 2;
3099: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3100: v4 += 2;
3101: }
3102: if (n == sz - 1) {
3103: tmp0 = x[*idx];
3104: sum1 -= v1[0] * tmp0;
3105: sum2 -= v2[0] * tmp0;
3106: sum3 -= v3[0] * tmp0;
3107: sum4 -= v4[0] * tmp0;
3108: }
3109: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3110: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3111: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3112: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3113: break;
3114: case 5:
3115: v2 = a->a + ii[row + 1];
3116: v3 = a->a + ii[row + 2];
3117: v4 = a->a + ii[row + 3];
3118: v5 = a->a + ii[row + 4];
3119: sum1 = b[row];
3120: sum2 = b[row + 1];
3121: sum3 = b[row + 2];
3122: sum4 = b[row + 3];
3123: sum5 = b[row + 4];
3124: for (n = 0; n < sz - 1; n += 2) {
3125: i1 = idx[0];
3126: i2 = idx[1];
3127: idx += 2;
3128: tmp0 = x[i1];
3129: tmp1 = x[i2];
3130: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3131: v1 += 2;
3132: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3133: v2 += 2;
3134: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3135: v3 += 2;
3136: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3137: v4 += 2;
3138: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3139: v5 += 2;
3140: }
3141: if (n == sz - 1) {
3142: tmp0 = x[*idx++];
3143: sum1 -= v1[0] * tmp0;
3144: sum2 -= v2[0] * tmp0;
3145: sum3 -= v3[0] * tmp0;
3146: sum4 -= v4[0] * tmp0;
3147: sum5 -= v5[0] * tmp0;
3148: v1++;
3149: v2++;
3150: v3++;
3151: v4++;
3152: v5++;
3153: }
3154: t[row] = sum1;
3155: t[row + 1] = sum2;
3156: t[row + 2] = sum3;
3157: t[row + 3] = sum4;
3158: t[row + 4] = sum5;
3159: sz = ii[row + 1] - diag[row] - 5;
3160: idx = a->j + diag[row] + 5;
3161: v1 += 5;
3162: v2 += 5;
3163: v3 += 5;
3164: v4 += 5;
3165: v5 += 5;
3166: for (n = 0; n < sz - 1; n += 2) {
3167: i1 = idx[0];
3168: i2 = idx[1];
3169: idx += 2;
3170: tmp0 = x[i1];
3171: tmp1 = x[i2];
3172: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3173: v1 += 2;
3174: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3175: v2 += 2;
3176: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3177: v3 += 2;
3178: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3179: v4 += 2;
3180: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3181: v5 += 2;
3182: }
3183: if (n == sz - 1) {
3184: tmp0 = x[*idx];
3185: sum1 -= v1[0] * tmp0;
3186: sum2 -= v2[0] * tmp0;
3187: sum3 -= v3[0] * tmp0;
3188: sum4 -= v4[0] * tmp0;
3189: sum5 -= v5[0] * tmp0;
3190: }
3191: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3192: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3193: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3194: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3195: x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3196: break;
3197: default:
3198: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3199: }
3200: }
3201: xb = t;
3202: PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3203: } else xb = b;
3205: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3206: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3207: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3208: nodesz = sizes[i + 1] - sizes[i];
3209: ibdiag -= nodesz * nodesz;
3211: /* set RHS */
3212: if (xb == b) {
3213: /* whole (old way) */
3214: sz = ii[row + 1] - ii[row];
3215: idx = a->j + ii[row];
3216: switch (nodesz) {
3217: case 5:
3218: v5 = a->a + ii[row - 4]; /* fall through */
3219: case 4:
3220: v4 = a->a + ii[row - 3]; /* fall through */
3221: case 3:
3222: v3 = a->a + ii[row - 2]; /* fall through */
3223: case 2:
3224: v2 = a->a + ii[row - 1]; /* fall through */
3225: case 1:
3226: v1 = a->a + ii[row];
3227: break;
3228: default:
3229: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3230: }
3231: } else {
3232: /* upper, no diag */
3233: sz = ii[row + 1] - diag[row] - 1;
3234: idx = a->j + diag[row] + 1;
3235: switch (nodesz) {
3236: case 5:
3237: v5 = a->a + diag[row - 4] + 5; /* fall through */
3238: case 4:
3239: v4 = a->a + diag[row - 3] + 4; /* fall through */
3240: case 3:
3241: v3 = a->a + diag[row - 2] + 3; /* fall through */
3242: case 2:
3243: v2 = a->a + diag[row - 1] + 2; /* fall through */
3244: case 1:
3245: v1 = a->a + diag[row] + 1;
3246: }
3247: }
3248: /* set sum */
3249: switch (nodesz) {
3250: case 5:
3251: sum5 = xb[row - 4]; /* fall through */
3252: case 4:
3253: sum4 = xb[row - 3]; /* fall through */
3254: case 3:
3255: sum3 = xb[row - 2]; /* fall through */
3256: case 2:
3257: sum2 = xb[row - 1]; /* fall through */
3258: case 1:
3259: /* note that sum1 is associated with the last row */
3260: sum1 = xb[row];
3261: }
3262: /* do sums */
3263: for (n = 0; n < sz - 1; n += 2) {
3264: i1 = idx[0];
3265: i2 = idx[1];
3266: idx += 2;
3267: tmp0 = x[i1];
3268: tmp1 = x[i2];
3269: switch (nodesz) {
3270: case 5:
3271: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3272: v5 += 2; /* fall through */
3273: case 4:
3274: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3275: v4 += 2; /* fall through */
3276: case 3:
3277: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3278: v3 += 2; /* fall through */
3279: case 2:
3280: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3281: v2 += 2; /* fall through */
3282: case 1:
3283: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3284: v1 += 2;
3285: }
3286: }
3287: /* ragged edge */
3288: if (n == sz - 1) {
3289: tmp0 = x[*idx];
3290: switch (nodesz) {
3291: case 5:
3292: sum5 -= *v5 * tmp0; /* fall through */
3293: case 4:
3294: sum4 -= *v4 * tmp0; /* fall through */
3295: case 3:
3296: sum3 -= *v3 * tmp0; /* fall through */
3297: case 2:
3298: sum2 -= *v2 * tmp0; /* fall through */
3299: case 1:
3300: sum1 -= *v1 * tmp0;
3301: }
3302: }
3303: /* update */
3304: if (xb == b) {
3305: /* whole (old way) w/ diag */
3306: switch (nodesz) {
3307: case 5:
3308: x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3309: x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3310: x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3311: x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3312: x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3313: break;
3314: case 4:
3315: x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3316: x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3317: x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3318: x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3319: break;
3320: case 3:
3321: x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3322: x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3323: x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3324: break;
3325: case 2:
3326: x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3327: x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3328: break;
3329: case 1:
3330: x[row--] += sum1 * (*ibdiag);
3331: break;
3332: }
3333: } else {
3334: /* no diag so set = */
3335: switch (nodesz) {
3336: case 5:
3337: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3338: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3339: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3340: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3341: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3342: break;
3343: case 4:
3344: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3345: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3346: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3347: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3348: break;
3349: case 3:
3350: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3351: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3352: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3353: break;
3354: case 2:
3355: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3356: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3357: break;
3358: case 1:
3359: x[row--] = sum1 * (*ibdiag);
3360: break;
3361: }
3362: }
3363: }
3364: if (xb == b) {
3365: PetscCall(PetscLogFlops(2.0 * a->nz));
3366: } else {
3367: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3368: }
3369: }
3370: }
3371: if (flag & SOR_EISENSTAT) {
3372: /*
3373: Apply (U + D)^-1 where D is now the block diagonal
3374: */
3375: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3376: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3377: nodesz = sizes[i + 1] - sizes[i];
3378: ibdiag -= nodesz * nodesz;
3379: sz = ii[row + 1] - diag[row] - 1;
3380: v1 = a->a + diag[row] + 1;
3381: idx = a->j + diag[row] + 1;
3382: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3383: switch (nodesz) {
3384: case 1:
3386: sum1 = b[row];
3387: for (n = 0; n < sz - 1; n += 2) {
3388: i1 = idx[0];
3389: i2 = idx[1];
3390: idx += 2;
3391: tmp0 = x[i1];
3392: tmp1 = x[i2];
3393: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3394: v1 += 2;
3395: }
3397: if (n == sz - 1) {
3398: tmp0 = x[*idx];
3399: sum1 -= *v1 * tmp0;
3400: }
3401: x[row] = sum1 * (*ibdiag);
3402: row--;
3403: break;
3405: case 2:
3407: sum1 = b[row];
3408: sum2 = b[row - 1];
3409: /* note that sum1 is associated with the second of the two rows */
3410: v2 = a->a + diag[row - 1] + 2;
3411: for (n = 0; n < sz - 1; n += 2) {
3412: i1 = idx[0];
3413: i2 = idx[1];
3414: idx += 2;
3415: tmp0 = x[i1];
3416: tmp1 = x[i2];
3417: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3418: v1 += 2;
3419: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3420: v2 += 2;
3421: }
3423: if (n == sz - 1) {
3424: tmp0 = x[*idx];
3425: sum1 -= *v1 * tmp0;
3426: sum2 -= *v2 * tmp0;
3427: }
3428: x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3429: x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3430: row -= 2;
3431: break;
3432: case 3:
3434: sum1 = b[row];
3435: sum2 = b[row - 1];
3436: sum3 = b[row - 2];
3437: v2 = a->a + diag[row - 1] + 2;
3438: v3 = a->a + diag[row - 2] + 3;
3439: for (n = 0; n < sz - 1; n += 2) {
3440: i1 = idx[0];
3441: i2 = idx[1];
3442: idx += 2;
3443: tmp0 = x[i1];
3444: tmp1 = x[i2];
3445: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3446: v1 += 2;
3447: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3448: v2 += 2;
3449: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3450: v3 += 2;
3451: }
3453: if (n == sz - 1) {
3454: tmp0 = x[*idx];
3455: sum1 -= *v1 * tmp0;
3456: sum2 -= *v2 * tmp0;
3457: sum3 -= *v3 * tmp0;
3458: }
3459: x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3460: x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3461: x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3462: row -= 3;
3463: break;
3464: case 4:
3466: sum1 = b[row];
3467: sum2 = b[row - 1];
3468: sum3 = b[row - 2];
3469: sum4 = b[row - 3];
3470: v2 = a->a + diag[row - 1] + 2;
3471: v3 = a->a + diag[row - 2] + 3;
3472: v4 = a->a + diag[row - 3] + 4;
3473: for (n = 0; n < sz - 1; n += 2) {
3474: i1 = idx[0];
3475: i2 = idx[1];
3476: idx += 2;
3477: tmp0 = x[i1];
3478: tmp1 = x[i2];
3479: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3480: v1 += 2;
3481: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3482: v2 += 2;
3483: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3484: v3 += 2;
3485: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3486: v4 += 2;
3487: }
3489: if (n == sz - 1) {
3490: tmp0 = x[*idx];
3491: sum1 -= *v1 * tmp0;
3492: sum2 -= *v2 * tmp0;
3493: sum3 -= *v3 * tmp0;
3494: sum4 -= *v4 * tmp0;
3495: }
3496: x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3497: x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3498: x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3499: x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3500: row -= 4;
3501: break;
3502: case 5:
3504: sum1 = b[row];
3505: sum2 = b[row - 1];
3506: sum3 = b[row - 2];
3507: sum4 = b[row - 3];
3508: sum5 = b[row - 4];
3509: v2 = a->a + diag[row - 1] + 2;
3510: v3 = a->a + diag[row - 2] + 3;
3511: v4 = a->a + diag[row - 3] + 4;
3512: v5 = a->a + diag[row - 4] + 5;
3513: for (n = 0; n < sz - 1; n += 2) {
3514: i1 = idx[0];
3515: i2 = idx[1];
3516: idx += 2;
3517: tmp0 = x[i1];
3518: tmp1 = x[i2];
3519: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3520: v1 += 2;
3521: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3522: v2 += 2;
3523: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3524: v3 += 2;
3525: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3526: v4 += 2;
3527: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3528: v5 += 2;
3529: }
3531: if (n == sz - 1) {
3532: tmp0 = x[*idx];
3533: sum1 -= *v1 * tmp0;
3534: sum2 -= *v2 * tmp0;
3535: sum3 -= *v3 * tmp0;
3536: sum4 -= *v4 * tmp0;
3537: sum5 -= *v5 * tmp0;
3538: }
3539: x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3540: x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3541: x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3542: x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3543: x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3544: row -= 5;
3545: break;
3546: default:
3547: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3548: }
3549: }
3550: PetscCall(PetscLogFlops(a->nz));
3552: /*
3553: t = b - D x where D is the block diagonal
3554: */
3555: cnt = 0;
3556: for (i = 0, row = 0; i < m; i++) {
3557: nodesz = sizes[i + 1] - sizes[i];
3558: switch (nodesz) {
3559: case 1:
3560: t[row] = b[row] - bdiag[cnt++] * x[row];
3561: row++;
3562: break;
3563: case 2:
3564: x1 = x[row];
3565: x2 = x[row + 1];
3566: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3567: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3568: t[row] = b[row] - tmp1;
3569: t[row + 1] = b[row + 1] - tmp2;
3570: row += 2;
3571: cnt += 4;
3572: break;
3573: case 3:
3574: x1 = x[row];
3575: x2 = x[row + 1];
3576: x3 = x[row + 2];
3577: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3578: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3579: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3580: t[row] = b[row] - tmp1;
3581: t[row + 1] = b[row + 1] - tmp2;
3582: t[row + 2] = b[row + 2] - tmp3;
3583: row += 3;
3584: cnt += 9;
3585: break;
3586: case 4:
3587: x1 = x[row];
3588: x2 = x[row + 1];
3589: x3 = x[row + 2];
3590: x4 = x[row + 3];
3591: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3592: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3593: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3594: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3595: t[row] = b[row] - tmp1;
3596: t[row + 1] = b[row + 1] - tmp2;
3597: t[row + 2] = b[row + 2] - tmp3;
3598: t[row + 3] = b[row + 3] - tmp4;
3599: row += 4;
3600: cnt += 16;
3601: break;
3602: case 5:
3603: x1 = x[row];
3604: x2 = x[row + 1];
3605: x3 = x[row + 2];
3606: x4 = x[row + 3];
3607: x5 = x[row + 4];
3608: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3609: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3610: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3611: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3612: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3613: t[row] = b[row] - tmp1;
3614: t[row + 1] = b[row + 1] - tmp2;
3615: t[row + 2] = b[row + 2] - tmp3;
3616: t[row + 3] = b[row + 3] - tmp4;
3617: t[row + 4] = b[row + 4] - tmp5;
3618: row += 5;
3619: cnt += 25;
3620: break;
3621: default:
3622: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3623: }
3624: }
3625: PetscCall(PetscLogFlops(m));
3627: /*
3628: Apply (L + D)^-1 where D is the block diagonal
3629: */
3630: for (i = 0, row = 0; i < m; i++) {
3631: nodesz = sizes[i + 1] - sizes[i];
3632: sz = diag[row] - ii[row];
3633: v1 = a->a + ii[row];
3634: idx = a->j + ii[row];
3635: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3636: switch (nodesz) {
3637: case 1:
3639: sum1 = t[row];
3640: for (n = 0; n < sz - 1; n += 2) {
3641: i1 = idx[0];
3642: i2 = idx[1];
3643: idx += 2;
3644: tmp0 = t[i1];
3645: tmp1 = t[i2];
3646: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3647: v1 += 2;
3648: }
3650: if (n == sz - 1) {
3651: tmp0 = t[*idx];
3652: sum1 -= *v1 * tmp0;
3653: }
3654: x[row] += t[row] = sum1 * (*ibdiag++);
3655: row++;
3656: break;
3657: case 2:
3658: v2 = a->a + ii[row + 1];
3659: sum1 = t[row];
3660: sum2 = t[row + 1];
3661: for (n = 0; n < sz - 1; n += 2) {
3662: i1 = idx[0];
3663: i2 = idx[1];
3664: idx += 2;
3665: tmp0 = t[i1];
3666: tmp1 = t[i2];
3667: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3668: v1 += 2;
3669: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3670: v2 += 2;
3671: }
3673: if (n == sz - 1) {
3674: tmp0 = t[*idx];
3675: sum1 -= v1[0] * tmp0;
3676: sum2 -= v2[0] * tmp0;
3677: }
3678: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3679: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3680: ibdiag += 4;
3681: row += 2;
3682: break;
3683: case 3:
3684: v2 = a->a + ii[row + 1];
3685: v3 = a->a + ii[row + 2];
3686: sum1 = t[row];
3687: sum2 = t[row + 1];
3688: sum3 = t[row + 2];
3689: for (n = 0; n < sz - 1; n += 2) {
3690: i1 = idx[0];
3691: i2 = idx[1];
3692: idx += 2;
3693: tmp0 = t[i1];
3694: tmp1 = t[i2];
3695: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3696: v1 += 2;
3697: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3698: v2 += 2;
3699: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3700: v3 += 2;
3701: }
3703: if (n == sz - 1) {
3704: tmp0 = t[*idx];
3705: sum1 -= v1[0] * tmp0;
3706: sum2 -= v2[0] * tmp0;
3707: sum3 -= v3[0] * tmp0;
3708: }
3709: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3710: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3711: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3712: ibdiag += 9;
3713: row += 3;
3714: break;
3715: case 4:
3716: v2 = a->a + ii[row + 1];
3717: v3 = a->a + ii[row + 2];
3718: v4 = a->a + ii[row + 3];
3719: sum1 = t[row];
3720: sum2 = t[row + 1];
3721: sum3 = t[row + 2];
3722: sum4 = t[row + 3];
3723: for (n = 0; n < sz - 1; n += 2) {
3724: i1 = idx[0];
3725: i2 = idx[1];
3726: idx += 2;
3727: tmp0 = t[i1];
3728: tmp1 = t[i2];
3729: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3730: v1 += 2;
3731: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3732: v2 += 2;
3733: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3734: v3 += 2;
3735: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3736: v4 += 2;
3737: }
3739: if (n == sz - 1) {
3740: tmp0 = t[*idx];
3741: sum1 -= v1[0] * tmp0;
3742: sum2 -= v2[0] * tmp0;
3743: sum3 -= v3[0] * tmp0;
3744: sum4 -= v4[0] * tmp0;
3745: }
3746: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3747: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3748: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3749: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3750: ibdiag += 16;
3751: row += 4;
3752: break;
3753: case 5:
3754: v2 = a->a + ii[row + 1];
3755: v3 = a->a + ii[row + 2];
3756: v4 = a->a + ii[row + 3];
3757: v5 = a->a + ii[row + 4];
3758: sum1 = t[row];
3759: sum2 = t[row + 1];
3760: sum3 = t[row + 2];
3761: sum4 = t[row + 3];
3762: sum5 = t[row + 4];
3763: for (n = 0; n < sz - 1; n += 2) {
3764: i1 = idx[0];
3765: i2 = idx[1];
3766: idx += 2;
3767: tmp0 = t[i1];
3768: tmp1 = t[i2];
3769: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3770: v1 += 2;
3771: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3772: v2 += 2;
3773: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3774: v3 += 2;
3775: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3776: v4 += 2;
3777: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3778: v5 += 2;
3779: }
3781: if (n == sz - 1) {
3782: tmp0 = t[*idx];
3783: sum1 -= v1[0] * tmp0;
3784: sum2 -= v2[0] * tmp0;
3785: sum3 -= v3[0] * tmp0;
3786: sum4 -= v4[0] * tmp0;
3787: sum5 -= v5[0] * tmp0;
3788: }
3789: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3790: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3791: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3792: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3793: x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3794: ibdiag += 25;
3795: row += 5;
3796: break;
3797: default:
3798: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3799: }
3800: }
3801: PetscCall(PetscLogFlops(a->nz));
3802: }
3803: PetscCall(VecRestoreArray(xx, &x));
3804: PetscCall(VecRestoreArrayRead(bb, &b));
3805: PetscFunctionReturn(PETSC_SUCCESS);
3806: }
3808: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
3809: {
3810: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3811: PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
3812: const MatScalar *bdiag = a->inode.bdiag;
3813: const PetscScalar *b;
3814: PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz;
3815: const PetscInt *sizes = a->inode.size_csr;
3817: PetscFunctionBegin;
3818: PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3819: PetscCall(VecGetArray(xx, &x));
3820: PetscCall(VecGetArrayRead(bb, &b));
3821: cnt = 0;
3822: for (i = 0, row = 0; i < m; i++) {
3823: nodesz = sizes[i + 1] - sizes[i];
3824: switch (nodesz) {
3825: case 1:
3826: x[row] = b[row] * bdiag[cnt++];
3827: row++;
3828: break;
3829: case 2:
3830: x1 = b[row];
3831: x2 = b[row + 1];
3832: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3833: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3834: x[row++] = tmp1;
3835: x[row++] = tmp2;
3836: cnt += 4;
3837: break;
3838: case 3:
3839: x1 = b[row];
3840: x2 = b[row + 1];
3841: x3 = b[row + 2];
3842: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3843: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3844: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3845: x[row++] = tmp1;
3846: x[row++] = tmp2;
3847: x[row++] = tmp3;
3848: cnt += 9;
3849: break;
3850: case 4:
3851: x1 = b[row];
3852: x2 = b[row + 1];
3853: x3 = b[row + 2];
3854: x4 = b[row + 3];
3855: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3856: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3857: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3858: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3859: x[row++] = tmp1;
3860: x[row++] = tmp2;
3861: x[row++] = tmp3;
3862: x[row++] = tmp4;
3863: cnt += 16;
3864: break;
3865: case 5:
3866: x1 = b[row];
3867: x2 = b[row + 1];
3868: x3 = b[row + 2];
3869: x4 = b[row + 3];
3870: x5 = b[row + 4];
3871: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3872: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3873: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3874: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3875: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3876: x[row++] = tmp1;
3877: x[row++] = tmp2;
3878: x[row++] = tmp3;
3879: x[row++] = tmp4;
3880: x[row++] = tmp5;
3881: cnt += 25;
3882: break;
3883: default:
3884: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3885: }
3886: }
3887: PetscCall(PetscLogFlops(2.0 * cnt));
3888: PetscCall(VecRestoreArray(xx, &x));
3889: PetscCall(VecRestoreArrayRead(bb, &b));
3890: PetscFunctionReturn(PETSC_SUCCESS);
3891: }
3893: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
3894: {
3895: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3897: PetscFunctionBegin;
3898: a->inode.node_count = 0;
3899: a->inode.use = PETSC_FALSE;
3900: a->inode.checked = PETSC_FALSE;
3901: a->inode.mat_nonzerostate = -1;
3902: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
3903: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
3904: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
3905: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
3906: A->ops->coloringpatch = NULL;
3907: A->ops->multdiagonalblock = NULL;
3908: if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
3909: PetscFunctionReturn(PETSC_SUCCESS);
3910: }
3912: /*
3913: samestructure indicates that the matrix has not changed its nonzero structure so we
3914: do not need to recompute the inodes
3915: */
3916: PetscErrorCode MatSeqAIJCheckInode(Mat A)
3917: {
3918: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3919: PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
3920: PetscBool flag;
3921: const PetscInt *idx, *idy, *ii;
3923: PetscFunctionBegin;
3924: if (!a->inode.use) {
3925: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
3926: PetscCall(PetscFree(a->inode.size_csr));
3927: PetscFunctionReturn(PETSC_SUCCESS);
3928: }
3929: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
3931: m = A->rmap->n;
3932: if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
3933: ns = a->inode.size_csr;
3934: ns[0] = 0;
3936: i = 0;
3937: node_count = 0;
3938: idx = a->j;
3939: ii = a->i;
3940: if (idx) {
3941: while (i < m) { /* For each row */
3942: nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
3943: /* Limits the number of elements in a node to 'a->inode.limit' */
3944: for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
3945: nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
3946: if (nzy != nzx) break;
3947: idy += nzx; /* Same nonzero pattern */
3948: PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
3949: if (!flag) break;
3950: }
3951: ns[node_count + 1] = ns[node_count] + blk_size;
3952: node_count++;
3953: idx += blk_size * nzx;
3954: i = j;
3955: }
3956: }
3957: /* If not enough inodes found,, do not use inode version of the routines */
3958: if (!m || !idx || node_count > .8 * m) {
3959: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
3960: PetscCall(PetscFree(a->inode.size_csr));
3961: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
3962: } else {
3963: if (!A->factortype) {
3964: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3965: if (A->rmap->n == A->cmap->n) {
3966: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
3967: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
3968: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
3969: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
3970: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
3971: }
3972: } else {
3973: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
3974: }
3975: a->inode.node_count = node_count;
3976: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
3977: }
3978: a->inode.checked = PETSC_TRUE;
3979: a->inode.mat_nonzerostate = A->nonzerostate;
3980: PetscFunctionReturn(PETSC_SUCCESS);
3981: }
3983: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
3984: {
3985: Mat B = *C;
3986: Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
3987: PetscInt m = A->rmap->n;
3989: PetscFunctionBegin;
3990: c->inode.use = a->inode.use;
3991: c->inode.limit = a->inode.limit;
3992: c->inode.max_limit = a->inode.max_limit;
3993: c->inode.checked = PETSC_FALSE;
3994: c->inode.size_csr = NULL;
3995: c->inode.node_count = 0;
3996: c->inode.ibdiag = NULL;
3997: c->inode.bdiag = NULL;
3998: c->inode.mat_nonzerostate = -1;
3999: if (a->inode.use) {
4000: if (a->inode.checked && a->inode.size_csr) {
4001: PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4002: PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4004: c->inode.checked = PETSC_TRUE;
4005: c->inode.node_count = a->inode.node_count;
4006: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4007: }
4008: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4009: if (!B->factortype) {
4010: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4011: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4012: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4013: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4014: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4015: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4016: } else {
4017: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4018: }
4019: }
4020: PetscFunctionReturn(PETSC_SUCCESS);
4021: }
4023: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4024: {
4025: PetscInt k;
4026: const PetscInt *vi;
4028: PetscFunctionBegin;
4029: vi = aj + ai[row];
4030: for (k = 0; k < nzl; k++) cols[k] = vi[k];
4031: vi = aj + adiag[row];
4032: cols[nzl] = vi[0];
4033: vi = aj + adiag[row + 1] + 1;
4034: for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4035: PetscFunctionReturn(PETSC_SUCCESS);
4036: }
4037: /*
4038: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4039: Modified from MatSeqAIJCheckInode().
4041: Input Parameters:
4042: . Mat A - ILU or LU matrix factor
4044: */
4045: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4046: {
4047: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4048: PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4049: PetscInt *cols1, *cols2, *ns;
4050: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4051: PetscBool flag;
4053: PetscFunctionBegin;
4054: if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4055: if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4057: m = A->rmap->n;
4058: if (a->inode.size_csr) ns = a->inode.size_csr;
4059: else PetscCall(PetscMalloc1(m + 1, &ns));
4060: ns[0] = 0;
4062: i = 0;
4063: node_count = 0;
4064: PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4065: while (i < m) { /* For each row */
4066: nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4067: nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4068: nzx = nzl1 + nzu1 + 1;
4069: PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4071: /* Limits the number of elements in a node to 'a->inode.limit' */
4072: for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4073: nzl2 = ai[j + 1] - ai[j];
4074: nzu2 = adiag[j] - adiag[j + 1] - 1;
4075: nzy = nzl2 + nzu2 + 1;
4076: if (nzy != nzx) break;
4077: PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4078: PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4079: if (!flag) break;
4080: }
4081: ns[node_count + 1] = ns[node_count] + blk_size;
4082: node_count++;
4083: i = j;
4084: }
4085: PetscCall(PetscFree2(cols1, cols2));
4086: /* If not enough inodes found,, do not use inode version of the routines */
4087: if (!m || node_count > .8 * m) {
4088: PetscCall(PetscFree(ns));
4090: a->inode.node_count = 0;
4091: a->inode.size_csr = NULL;
4092: a->inode.use = PETSC_FALSE;
4094: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4095: } else {
4096: A->ops->mult = NULL;
4097: A->ops->sor = NULL;
4098: A->ops->multadd = NULL;
4099: A->ops->getrowij = NULL;
4100: A->ops->restorerowij = NULL;
4101: A->ops->getcolumnij = NULL;
4102: A->ops->restorecolumnij = NULL;
4103: A->ops->coloringpatch = NULL;
4104: A->ops->multdiagonalblock = NULL;
4105: a->inode.node_count = node_count;
4106: a->inode.size_csr = ns;
4107: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4108: }
4109: a->inode.checked = PETSC_TRUE;
4110: PetscFunctionReturn(PETSC_SUCCESS);
4111: }
4113: /*
4114: This is really ugly. if inodes are used this replaces the
4115: permutations with ones that correspond to rows/cols of the matrix
4116: rather than inode blocks
4117: */
4118: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4119: {
4120: PetscFunctionBegin;
4121: PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4122: PetscFunctionReturn(PETSC_SUCCESS);
4123: }
4125: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4126: {
4127: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4128: PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4129: const PetscInt *ridx, *cidx;
4130: PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4131: PetscInt nslim_col, *ns_col;
4132: IS ris = *rperm, cis = *cperm;
4134: PetscFunctionBegin;
4135: if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */
4136: if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4138: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4139: PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4140: PetscCall(PetscMalloc2(m, &permr, n, &permc));
4142: PetscCall(ISGetIndices(ris, &ridx));
4143: PetscCall(ISGetIndices(cis, &cidx));
4145: /* Form the inode structure for the rows of permuted matrix using inv perm*/
4146: for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4148: /* Construct the permutations for rows*/
4149: for (i = 0, row = 0; i < nslim_row; ++i) {
4150: indx = ridx[i];
4151: start_val = tns[indx];
4152: end_val = tns[indx + 1];
4153: for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4154: }
4156: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4157: for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4159: /* Construct permutations for columns */
4160: for (i = 0, col = 0; i < nslim_col; ++i) {
4161: indx = cidx[i];
4162: start_val = tns[indx];
4163: end_val = tns[indx + 1];
4164: for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4165: }
4167: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4168: PetscCall(ISSetPermutation(*rperm));
4169: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4170: PetscCall(ISSetPermutation(*cperm));
4172: PetscCall(ISRestoreIndices(ris, &ridx));
4173: PetscCall(ISRestoreIndices(cis, &cidx));
4175: PetscCall(PetscFree(ns_col));
4176: PetscCall(PetscFree2(permr, permc));
4177: PetscCall(ISDestroy(&cis));
4178: PetscCall(ISDestroy(&ris));
4179: PetscCall(PetscFree(tns));
4180: PetscFunctionReturn(PETSC_SUCCESS);
4181: }
4183: /*@C
4184: MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4186: Not Collective
4188: Input Parameter:
4189: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4191: Output Parameters:
4192: + node_count - no of inodes present in the matrix.
4193: . sizes - an array of size `node_count`, with the sizes of each inode.
4194: - limit - the max size used to generate the inodes.
4196: Level: advanced
4198: Note:
4199: It should be called after the matrix is assembled.
4200: The contents of the sizes[] array should not be changed.
4201: `NULL` may be passed for information not needed
4203: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4204: @*/
4205: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4206: {
4207: PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4209: PetscFunctionBegin;
4210: PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4211: PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4212: if (f) PetscCall((*f)(A, node_count, sizes, limit));
4213: PetscFunctionReturn(PETSC_SUCCESS);
4214: }
4216: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4217: {
4218: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4220: PetscFunctionBegin;
4221: if (node_count) *node_count = a->inode.node_count;
4222: if (sizes) *sizes = a->inode.size_csr;
4223: if (limit) *limit = a->inode.limit;
4224: PetscFunctionReturn(PETSC_SUCCESS);
4225: }